File Coverage

Bio/Matrix/PSM/ProtMatrix.pm
Criterion Covered Total %
statement 146 231 63.2
branch 26 90 28.8
condition 0 12 0.0
subroutine 16 25 64.0
pod 18 18 100.0
total 206 376 54.7


line stmt bran cond sub pod time code
1             #---------------------------------------------------------
2              
3             =head1 NAME
4              
5             Bio::Matrix::PSM::ProtMatrix - SiteMatrixI implementation, holds a
6             position scoring matrix (or position weight matrix) with log-odds scoring
7             information.
8              
9             =head1 SYNOPSIS
10              
11             use Bio::Matrix::PSM::ProtMatrix;
12             # Create from memory by supplying probability matrix hash both as strings or
13             # arrays where the frequencies Hash entries of the form lN refer to an array
14             # of position-specific log-odds scores for amino acid N. Hash entries of the
15             # form pN represent the position-specific probability of finding amino acid N.
16              
17             my %param = (
18             'id' => 'A. thaliana protein atp1',
19             '-e_val' => $score,
20             'lS' => [ '-2', '3', '-3', '2', '-3', '1', '1', '3' ],
21             'lF' => [ '-1', '-4', '0', '-5', '0', '-5', '-4', '-4' ],
22             'lT' => [ '-1', '1', '0', '1', '-2', '-1', '0', '1' ],
23             'lN' => [ '-3', '-1', '-2', '3', '-5', '5', '-2', '0' ],
24             'lK' => [ '-2', '0', '-3', '2', '-3', '2', '-3', '-1' ],
25             'lY' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-4', '-4' ],
26             'lE' => [ '-3', '4', '-3', '2', '-4', '-2', '-3', '2' ],
27             'lV' => [ '0', '-2', '1', '-4', '1', '-4', '-1', '-3' ],
28             'lQ' => [ '-1', '0', '-2', '3', '-4', '1', '-3', '0' ],
29             'lM' => [ '8', '-3', '8', '-3', '1', '-3', '-3', '-3' ],
30             'lC' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-3', '-3' ],
31             'lL' => [ '1', '-3', '1', '-4', '3', '-4', '-2', '-4' ],
32             'lA' => [ '-2', '1', '-2', '0', '-2', '-2', '2', '2' ],
33             'lW' => [ '-2', '-4', '-3', '-5', '-4', '-5', '-5', '-5' ],
34             'lP' => [ '-3', '-2', '-4', '-3', '-1', '-3', '6', '-3' ],
35             'lH' => [ '-2', '-2', '-3', '-2', '-5', '-2', '-2', '-3' ],
36             'lD' => [ '-4', '-1', '-3', '1', '-3', '-1', '-3', '4' ],
37             'lR' => [ '-2', '-1', '-3', '0', '-4', '4', '-4', '-3' ],
38             'lI' => [ '0', '-3', '0', '-4', '6', '-4', '-2', '-2' ],
39             'lG' => [ '-4', '-2', '-4', '-2', '-5', '-3', '-1', '-2' ],
40             'pS' => [ '0', '33', '0', '16', '1', '12', '11', '25' ],
41             'pF' => [ '0', '0', '2', '0', '3', '0', '0', '0' ],
42             'pT' => [ '0', '8', '7', '10', '1', '2', '7', '8' ],
43             'pN' => [ '0', '0', '2', '13', '0', '36', '1', '4' ],
44             'pK' => [ '0', '5', '0', '13', '1', '15', '0', '2' ],
45             'pY' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
46             'pE' => [ '0', '41', '1', '12', '0', '0', '0', '15' ],
47             'pV' => [ '0', '3', '9', '0', '2', '0', '3', '1' ],
48             'pQ' => [ '0', '0', '0', '15', '0', '4', '0', '3' ],
49             'pM' => [ '100', '0', '66', '0', '2', '0', '0', '0' ],
50             'pC' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
51             'pL' => [ '0', '0', '8', '0', '25', '0', '4', '0' ],
52             'pA' => [ '0', '10', '1', '9', '2', '0', '22', '16' ],
53             'pW' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
54             'pP' => [ '0', '0', '0', '0', '3', '1', '45', '0' ],
55             'pH' => [ '0', '0', '0', '0', '0', '0', '1', '0' ],
56             'pD' => [ '0', '0', '1', '7', '2', '2', '0', '22' ],
57             'pR' => [ '0', '0', '0', '3', '0', '27', '0', '0' ],
58             'pI' => [ '0', '0', '3', '0', '59', '1', '2', '3' ],
59             'pG' => [ '0', '0', '0', '1', '0', '0', '4', '1' ],
60             );
61              
62             my $matrix = Bio::Matrix::PSM::ProtMatrix( %param );
63              
64              
65             my $site = Bio::Matrix::PSM::ProtMatrix->new(%param);
66             # Or get it from a file:
67             use Bio::Matrix::PSM::IO;
68             my $psmIO = Bio::Matrix::PSM::IO->new(-file => $file, -format => 'psi-blast');
69             while (my $psm = $psmIO->next_psm) {
70             #Now we have a Bio::Matrix::PSM::Psm object,
71             # see Bio::Matrix::PSM::PsmI for details
72             #This is a Bio::Matrix::PSM::ProtMatrix object now
73             my $matrix = $psm->matrix;
74             }
75              
76             # Get a simple consensus, where alphabet is:
77             # {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V,}
78             # choosing the highest probability or N if prob is too low
79             my $consensus = $site->consensus;
80              
81             # Retrieving and using regular expressions:
82             my $regexp = $site->regexp;
83             my $count = grep($regexp,$seq);
84             my $count = ($seq=~ s/$regexp/$1/eg);
85             print "Motif $mid is present $count times in this sequence\n";
86              
87             =head1 DESCRIPTION
88              
89             ProtMatrix is designed to provide some basic methods when working with
90             position scoring (weight) matrices related to protein sequences. A
91             protein PSM consists of 20 vectors with 20 frequencies (one per amino
92             acid per position). This is the minimum information you should
93             provide to construct a PSM object. The vectors can be provided as
94             strings with frequencies where the frequency is {0..a} and a=1. This
95             is the way MEME compressed representation of a matrix and it is quite
96             useful when working with relational DB. If arrays are provided as an
97             input (references to arrays actually) they can be any number, real or
98             integer (frequency or count).
99              
100             When creating the object the constructor will check for positions that
101             equal 0. If such is found it will increase the count for all
102             positions by one and recalculate the frequency. Potential bug - if
103             you are using frequencies and one of the positions is 0 it will change
104             significantly. However, you should never have frequency that equals
105             0.
106              
107             Throws an exception if: You mix as an input array and string (for
108             example A matrix is given as array, C - as string). The position
109             vector is (0,0,0,0). One of the probability vectors is shorter than
110             the rest.
111              
112             Summary of the methods I use most frequently (details below):
113              
114             iupac - return IUPAC compliant consensus as a string
115             score - Returns the score as a real number
116             IC - information content. Returns a real number
117             id - identifier. Returns a string
118             accession - accession number. Returns a string
119             next_pos - return the sequence probably for each letter, IUPAC
120             symbol, IUPAC probability and simple sequence
121             consenus letter for this position. Rewind at the end. Returns a hash.
122             pos - current position get/set. Returns an integer.
123             regexp - construct a regular expression based on IUPAC consensus.
124             For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
125             width - site width
126             get_string - gets the probability vector for a single base as a string.
127             get_array - gets the probability vector for a single base as an array.
128             get_logs_array - gets the log-odds vector for a single base as an array.
129              
130             New methods, which might be of interest to anyone who wants to store
131             PSM in a relational database without creating an entry for each
132             position is the ability to compress the PSM vector into a string with
133             losing usually less than 1% of the data. this can be done with:
134              
135             my $str=$matrix->get_compressed_freq('A');
136             or
137              
138             my $str=$matrix->get_compressed_logs('A');
139              
140             Loading from a database should be done with new, but is not yet implemented.
141             However you can still uncompress such string with:
142              
143             my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
144              
145             or
146              
147             my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
148              
149             =head1 FEEDBACK
150              
151             =head2 Mailing Lists
152              
153             User feedback is an integral part of the evolution of this and other
154             Bioperl modules. Send your comments and suggestions preferably to one
155             of the Bioperl mailing lists. Your participation is much appreciated.
156              
157             bioperl-l@bioperl.org - General discussion
158             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
159              
160             =head2 Support
161              
162             Please direct usage questions or support issues to the mailing list:
163              
164             I
165              
166             rather than to the module maintainer directly. Many experienced and
167             reponsive experts will be able look at the problem and quickly
168             address it. Please include a thorough description of the problem
169             with code and data examples if at all possible.
170              
171             =head2 Reporting Bugs
172              
173             Report bugs to the Bioperl bug tracking system to help us keep track
174             the bugs and their resolution. Bug reports can be submitted via the
175             web:
176              
177             https://github.com/bioperl/bioperl-live/issues
178              
179             =head1 AUTHOR - James Thompson
180              
181             Email tex@biosysadmin.com
182              
183             =head1 APPENDIX
184              
185             =cut
186              
187              
188             # Let the code begin...
189             package Bio::Matrix::PSM::ProtMatrix;
190 2     2   479 use strict;
  2         3  
  2         54  
191              
192 2     2   9 use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
  2         3  
  2         1963  
193              
194             =head2 new
195              
196             Title : new
197             Usage : my $site = Bio::Matrix::PSM::ProtMatrix->new(
198             %probs,
199             %logs,
200             -IC => $ic,
201             -e_val => $score,
202             -id => $mid
203             -model => \%model
204             );
205             Function : Creates a new Bio::Matrix::PSM::ProtMatrix object from memory
206             Throws : If inconsistent data for all vectors (all 20 amino acids) is
207             provided, if you mix input types (string vs array) or if a
208             position freq is 0.
209             Example :
210             Returns : Bio::Matrix::PSM::ProtMatrix object
211             Args : Hash references to log-odds scores and probabilities for
212             position-specific scoring info, e-value (optional), information
213             content (optional), id (optional), model for background distribution
214             of proteins (optional).
215              
216             =cut
217              
218             sub new {
219 2     2 1 209 my ($class, @args) = @_;
220 2         26 my $self = $class->SUPER::new(@args);
221 2         4 my $consensus;
222             #Too many things to rearrange, and I am creating simultanuously >500
223             # such objects routinely, so this becomes performance issue
224             my %input;
225 2         5 while( @args ) {
226 84         113 (my $key = shift @args) =~ s/-//gi; #deletes all dashes (only dashes)!
227 84         139 $input{$key} = shift @args;
228             }
229              
230             # get a protein alphabet for processing log-odds scores and probabilities
231             # maybe change this later on to allow for non-standard aa lists?
232 2         12 my @alphabet = qw/A R N D C Q E G H I L K M F P S T W Y V/;
233              
234 2         6 foreach my $aa (@alphabet) {
235 40 50       77 $self->{"log$aa"} = defined($input{"l$aa"}) ? $input{"l$aa"}
236             : $self->throw("Error: No log-odds information for $aa!");
237 40 50       91 $self->{"prob$aa"} = defined($input{"p$aa"}) ? $input{"p$aa"}
238             : $self->throw("Error: No probability information for $aa!");
239             }
240            
241 2         5 $self->{_position} = 0;
242 2         5 $self->{IC} = $input{IC};
243 2         5 $self->{e_val} = $input{e_val};
244 2         4 $self->{sites} = $input{sites};
245 2         4 $self->{width} = $input{width};
246 2         4 $self->{accession_number} = $input{accession_number};
247             $self->{_correction} = defined($input{correction}) ?
248 2 50       6 $input{correction} : 1 ; # Correction might be unwanted- supply your own
249             # No id provided, null for the sake of rel db
250 2 100       5 $self->{id} = defined($input{id}) ? $input{id} : 'null';
251 2         5 $self->{_alphabet} = \@alphabet;
252              
253             #Make consensus, throw if any one of the vectors is shorter
254 2         7 $self = _calculate_consensus($self,$input{model});
255 2         21 return $self;
256             }
257              
258             =head2 alphabet
259              
260             Title : Returns an array (or array reference if desired) to the alphabet
261             Usage :
262             Function : Returns an array (or array reference) containing all of the
263             allowable characters for this matrix.
264             Throws :
265             Example :
266             Returns : Array or arrary reference.
267             Args :
268              
269             =cut
270              
271             sub alphabet {
272 0     0 1 0 my $self = shift;
273 0 0       0 if ( wantarray ) {
274 0         0 return $self->{_alphabet};
275             } else {
276 0         0 return @{$self->{_alphabet}};
  0         0  
277             }
278             }
279              
280             =head2 _calculate_consensus
281              
282             Title : _calculate_consensus
283             Usage :
284             Function : Calculates the consensus sequence for this matrix.
285             Throws :
286             Example :
287             Returns :
288             Args :
289              
290             =cut
291              
292             sub _calculate_consensus {
293 2     2   6 my $self = shift;
294 2         6 my $thresh = shift;
295            
296             # verify that all of the array lengths in %probs are the same
297 2         4 my @lengths = map { scalar(@$_) } map {$self->{"prob$_"}} @{ $self->{_alphabet} };
  40         44  
  40         48  
  2         7  
298 2         5 my $len = shift @lengths;
299 2         4 for ( @lengths ) {
300 38 50       53 if ( $_ ne $len ) { $self->throw( "Probability matrix is damaged!\n" ) };
  0         0  
301             }
302              
303             # iterate over probs, generate the most likely sequence and put it into
304             # $self->{seq}. Put the probability of this sequence into $self->{seqp}.
305 2         7 for ( my $i = 0; $i < $len; $i++ ) {
306             # get a list of all the probabilities at position $i, ordered by $self->{_alphabet}
307 515         518 my @probs = map { ${$self->{"prob$_"}}[$i] } @{ $self->{_alphabet} };
  10300         9116  
  10300         14652  
  515         624  
308             # calculate the consensus of @probs, put sequence into seqp and probabilities into seqp
309 515         748 (${$self->{seq}}[$i],${$self->{seqp}}[$i]) = $self->_to_cons( @probs, $thresh );
  515         721  
  515         1665  
310             }
311              
312 2         9 return $self;
313             }
314              
315             =head2 next_pos
316              
317             Title : next_pos
318             Usage :
319             Function : Retrieves the next position features: frequencies for all 20 amino
320             acids, log-odds scores for all 20 amino acids at this position,
321             the main (consensus) letter at this position, the probability
322             for the consensus letter to occur at this position and the relative
323             current position as an integer.
324             Throws :
325             Example :
326             Returns : hash (or hash reference) (pA,pR,pN,pD,...,logA,logR,logN,logD,aa,prob,rel)
327             - pN entries represent the probability for amino acid N
328             to be at this position
329             - logN entries represent the log-odds score for having amino acid
330             N at this position
331             - aa is the consensus amino acid
332             - prob is the probability for the consensus amino acid to be at this
333             position
334             - rel is the relative index of the current position (integer)
335             Args : none
336              
337              
338             =cut
339              
340             sub next_pos {
341 1     1 1 1 my $self = shift;
342 1 50       3 $self->throw("instance method called on class") unless ref $self;
343              
344 1         2 my $len = @{$self->{seq}};
  1         2  
345 1         2 my $pos = $self->{_position};
346              
347             # return a PSM if we're still within range
348 1 50       3 if ($pos<$len) {
349              
350 1         2 my %probs = map { ("p$_", ${$self->{"prob$_"}}[$pos]) } @{$self->{_alphabet}};
  20         22  
  20         38  
  1         2  
351 1         4 my %logs = map { ("l$_", ${$self->{"log$_"}}[$pos]) } @{$self->{_alphabet}};
  20         21  
  20         34  
  1         2  
352 1         3 my $base = ${$self->{seq}}[$pos];
  1         2  
353 1         2 my $prob = ${$self->{seqp}}[$pos];
  1         1  
354              
355 1         2 $self->{_position}++;
356 1         13 my %hash = ( %probs, %logs, base => $base, rel => $pos, prob => $prob );
357            
358             # decide whether to return the hash or a reference to it
359 1 50       3 if ( wantarray ) {
360 1         18 return %hash;
361             } else {
362 0         0 return \%hash;
363             }
364             } else { # otherwise, reset $self->{_position} and return nothing
365 0         0 $self->{_position} = 0;
366 0         0 return;
367             }
368             }
369              
370              
371             =head2 curpos
372              
373             Title : curpos
374             Usage :
375             Function : Gets/sets the current position.
376             Throws :
377             Example :
378             Returns : Current position (integer).
379             Args : New position (integer).
380              
381             =cut
382              
383             sub curpos {
384 2     2 1 854 my $self = shift;
385 2 50       6 if (@_) { $self->{_position} = shift; }
  0         0  
386 2         7 return $self->{_position};
387             }
388              
389              
390             =head2 e_val
391              
392             Title : e_val
393             Usage :
394             Function : Gets/sets the e-value
395             Throws :
396             Example :
397             Returns :
398             Args : real number
399              
400             =cut
401              
402             sub e_val {
403 2     2 1 4 my $self = shift;
404 2 100       5 if (@_) { $self->{e_val} = shift; }
  1         2  
405 2         7 return $self->{e_val};
406             }
407              
408              
409             =head2 IC
410              
411             Title : IC
412             Usage :
413             Function : Position-specific information content.
414             Throws :
415             Example :
416             Returns : Information content for current position.
417             Args : Information content for current position.
418              
419             =cut
420              
421             sub IC {
422 0     0 1 0 my $self = shift;
423 0 0       0 if (@_) { $self->{IC} = shift; }
  0         0  
424 0         0 return $self->{IC};
425             }
426              
427             =head2 accession_number
428              
429             Title : accession_number
430             Usage :
431             Function: accession number, this will be unique id for the ProtMatrix object as
432             well for any other object, inheriting from ProtMatrix.
433             Throws :
434             Example :
435             Returns : New accession number (string)
436             Args : Accession number (string)
437              
438             =cut
439              
440             sub accession_number {
441 0     0 1 0 my $self = shift;
442 0 0       0 if (@_) { $self->{accession_number} = shift; }
  0         0  
443 0         0 return $self->{accession_number};
444             }
445              
446             =head2 consensus
447              
448             Title : consensus
449             Usage :
450             Function : Returns the consensus sequence for this PSM.
451             Throws : if supplied with thresold outisde 5..10 range
452             Example :
453             Returns : string
454             Args : (optional) threshold value 5 to 10 (corresponds to 50-100% at each position
455              
456             =cut
457              
458             sub consensus {
459 3     3 1 6 my $self = shift;
460 3         5 my $thresh=shift;
461 3 50       8 $self->_calculate_consensus($thresh) if ($thresh); #Change of threshold
462 3         5 my $consensus='';
463              
464 3         4 foreach my $letter (@{$self->{seq}}) {
  3         8  
465 523         462 $consensus .= $letter;
466             }
467              
468 3         14 return $consensus;
469             }
470              
471             sub IUPAC {
472 2     2 1 1331 my $self = shift;
473 2         5 return $self->consensus;
474             }
475              
476              
477             =head2 get_string
478              
479             Title : get_string
480             Usage :
481             Function: Returns given probability vector as a string. Useful if you want to
482             store things in a rel database, where arrays are not first choice
483             Throws : If the argument is outside {A,C,G,T}
484             Example :
485             Returns : string
486             Args : character {A,C,G,T}
487              
488             =cut
489              
490             sub get_string {
491 1     1 1 3 my $self = shift;
492 1         2 my $base = shift;
493 1         2 my $string = '';
494              
495 1         2 my @prob = @{$self->{"prob$base"}};
  1         4  
496 1 50       3 if ( ! @prob ) {
497 0         0 $self->throw( "No such base: $base\n");
498             }
499              
500 1         3 foreach my $prob (@prob) {
501 8         10 my $corrected = $prob*10;
502 8         12 my $next = sprintf("%.0f",$corrected);
503 8 100       12 $next = 'a' if ($next eq '10');
504 8         8 $string .= $next;
505             }
506 1         4 return $string;
507             }
508              
509              
510              
511             =head2 width
512              
513             Title : width
514             Usage :
515             Function : Returns the length of the site
516             Throws :
517             Example :
518             Returns : number
519             Args :
520              
521             =cut
522              
523             sub width {
524 4     4 1 5 my $self = shift;
525 4         5 my $width = @{$self->{probA}};
  4         6  
526 4         15 return $width;
527             }
528              
529             =head2 get_array
530              
531             Title : get_array
532             Usage :
533             Function : Returns an array with frequencies for a specified amino acid.
534             Throws :
535             Example :
536             Returns : Array representing frequencies for specified amino acid.
537             Args : Single amino acid (character).
538              
539             =cut
540              
541             sub get_array {
542 1     1 1 2 my $self = shift;
543 1         2 my $letter = uc(shift);
544              
545 1 50       2 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}};
  20         42  
  1         2  
546              
547 1         2 return @{$self->{"prob$letter"}};
  1         7  
548             }
549              
550              
551             =head2 get_logs_array
552              
553             Title : get_logs_array
554             Usage :
555             Function : Returns an array with log_odds for a specified base
556             Throws :
557             Example :
558             Returns : Array representing log-odds scores for specified amino acid.
559             Args : Single amino acid (character).
560              
561             =cut
562              
563             sub get_logs_array {
564 0     0 1 0 my $self = shift;
565 0         0 my $letter = uc(shift);
566              
567 0 0       0 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}};
  0         0  
  0         0  
568              
569 0         0 return @{$self->{"log$letter"}};
  0         0  
570             }
571              
572             =head2 id
573              
574             Title : id
575             Usage :
576             Function : Gets/sets the site id
577             Throws :
578             Example :
579             Returns : string
580             Args : string
581              
582             =cut
583              
584             sub id {
585 0     0 1 0 my $self = shift;
586 0 0       0 if (@_) { $self->{id} = shift; }
  0         0  
587 0         0 return $self->{id};
588             }
589              
590             =head2 regexp
591              
592             Title : regexp
593             Usage :
594             Function : Returns a case-insensitive regular expression which matches the
595             IUPAC convention. X's in consensus sequence will match anything.
596             Throws :
597             Example :
598             Returns : string
599             Args : Threshold for calculating consensus sequence (number in range 0-100
600             representing a percentage). Threshold defaults to 20.
601              
602             =cut
603              
604             sub regexp {
605 1     1 1 2 my $self = shift;
606 1         2 my $threshold = 20;
607 1 50       3 if ( @_ ) { my $threshold = shift };
  0         0  
608              
609 1         1 my @alphabet = @{$self->{_alphabet}};
  1         5  
610 1         3 my $width = $self->width;
611 1         2 my (@regexp, $i);
612 1         3 for ( $i = 0; $i < $width; $i++ ) {
613             # get an array of the residues at this position with p > $threshold
614 8         10 my @letters = map { uc($_).lc($_) } grep { $self->{"prob$_"}->[$i] >= $threshold } @alphabet;
  12         23  
  160         247  
615              
616 8         9 my $reg;
617 8 100       11 if ( scalar(@letters) == 0 ) {
618 1         1 $reg = '\.';
619             } else {
620 7         12 $reg = '['.join('',@letters).']';
621             }
622 8         17 push @regexp, $reg;
623             }
624              
625 1 50       2 if ( wantarray ) {
626 0         0 return @regexp;
627             } else {
628 1         7 return join '', @regexp;
629             }
630             }
631              
632              
633             =head2 regexp_array
634              
635             Title : regexp_array
636             Usage :
637             Function : Returns an array of position-specific regular expressions.
638             X's in consensus sequence will match anything.
639             Throws :
640             Example :
641             Returns : Array of position-specific regular expressions.
642             Args : Threshold for calculating consensus sequence (number in range 0-100
643             representing a percentage). Threshold defaults to 20.
644             Notes : Simply calls regexp method in list context.
645              
646             =cut
647              
648             sub regexp_array {
649 0     0 1 0 my $self = shift;
650            
651 0         0 return @{ $self->regexp };
  0         0  
652             }
653              
654              
655             =head2 _compress_array
656              
657             Title : _compress_array
658             Usage :
659             Function : Will compress an array of real signed numbers to a string (ie vector of bytes)
660             -127 to +127 for bi-directional(signed) and 0..255 for unsigned ;
661             Throws :
662             Example : Internal stuff
663             Returns : String
664             Args : array reference, followed by max value and direction (optional, defaults to 1),
665             direction of 1 is unsigned, anything else is signed.
666              
667             =cut
668              
669             sub _compress_array {
670 0     0   0 my ($array,$lm,$direct)=@_;
671 0         0 my $str;
672 0 0 0     0 return unless(($array) && ($lm));
673 0 0       0 $direct=1 unless ($direct);
674 0 0       0 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
675 0         0 foreach my $c (@{$array}) {
  0         0  
676 0 0       0 $c=$lm if ($c>$lm);
677 0 0 0     0 $c=-$lm if (($c<-$lm) && ($direct !=1));
678 0 0 0     0 $c=0 if (($c<0) && ($direct ==1));
679 0         0 my $byte=int($k1*$c);
680 0 0       0 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
681 0         0 my $char=chr($byte);
682 0         0 $str.=$char;
683             }
684 0         0 return $str;
685             }
686              
687             =head2 _uncompress_string
688              
689             Title : _uncompress_string
690             Usage :
691             Function : Will uncompress a string (vector of bytes) to create an array of real
692             signed numbers (opposite to_compress_array)
693             Throws :
694             Example : Internal stuff
695             Returns : string, followed by max value and direction (optional, defaults to 1),
696             direction of 1 is unsigned, anything else is signed.
697             Args : array
698              
699             =cut
700              
701             sub _uncompress_string {
702 0     0   0 my ($str,$lm,$direct)=@_;
703 0         0 my @array;
704 0 0 0     0 return unless(($str) && ($lm));
705 0 0       0 $direct=1 unless ($direct);
706 0 0       0 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
707 0         0 while (my $c=chop($str)) {
708 0         0 my $byte=ord($c);
709 0 0       0 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
710 0         0 my $num=$byte/$k1;
711 0         0 unshift @array,$num;
712             }
713              
714 0         0 return @array;
715             }
716              
717             =head2 get_compressed_freq
718              
719             Title : get_compressed_freq
720             Usage :
721             Function: A method to provide a compressed frequency vector. It uses one byte to
722             code the frequence for one of the probability vectors for one position.
723             Useful for relational database. Improvement of the previous 0..a coding.
724             Throws :
725             Example : my $strA=$self->get_compressed_freq('A');
726             Returns : String
727             Args : char
728              
729             =cut
730              
731             sub get_compressed_freq {
732 0     0 1 0 my $self=shift;
733 0         0 my $base=shift;
734 0         0 my $string='';
735 0         0 my @prob;
736             BASE: {
737 0 0       0 if ($base eq 'A') {
  0         0  
738 0 0       0 @prob = @{$self->{probA}} unless (!defined($self->{probA}));
  0         0  
739 0         0 last BASE;
740             }
741 0 0       0 if ($base eq 'G') {
742 0 0       0 @prob = @{$self->{probG}} unless (!defined($self->{probG}));
  0         0  
743 0         0 last BASE;
744             }
745 0 0       0 if ($base eq 'C') {
746 0 0       0 @prob = @{$self->{probC}} unless (!defined($self->{probC}));
  0         0  
747 0         0 last BASE;
748             }
749 0 0       0 if ($base eq 'T') {
750 0 0       0 @prob = @{$self->{probT}} unless (!defined($self->{probT}));
  0         0  
751 0         0 last BASE;
752             }
753 0         0 $self->throw ("No such base: $base!\n");
754             }
755 0         0 my $str= _compress_array(\@prob,1,1);
756 0         0 return $str;
757             }
758              
759             =head2 sequence_match_weight
760              
761             Title : sequence_match_weight
762             Usage :
763             Function : This method will calculate the score of a match, based on the PSM
764             if such is associated with the matrix object. Returns undef if no
765             PSM data is available.
766             Throws : if the length of the sequence is different from the matrix width
767             Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
768             Returns : Floating point
769             Args : string
770              
771             =cut
772              
773             sub sequence_match_weight {
774 1     1 1 3 my ($self,$seq)=@_;
775 1 50       3 return unless ($self->{logA});
776              
777 1         2 my $seqlen = length($seq);
778 1         3 my $width = $self->width;
779 1 50       2 $self->throw("Error: Input sequence size ($seqlen) not equal to PSM size ($width)!\n")
780             unless (length($seq) == $self->width);
781              
782 1         2 my ($score,$i) = (0,0);
783 1         4 foreach my $letter ( split //, $seq ) {
784             # add up the score for this position
785 8         13 $score += $self->{"log$letter"}->[$i];
786 8         8 $i++;
787             }
788 1         4 return $score;
789             }
790              
791              
792             =head2 _to_IUPAC
793              
794             Title : _to_IUPAC
795             Usage :
796             Function: Converts a single position to IUPAC compliant symbol and returns its probability.
797             Currently returns the most likely amino acid/probability combination.
798             Throws :
799             Example :
800             Returns : char, real number representing an amino acid and a probability.
801             Args : real numbers for all 20 amino acids (ordered by alphabet contained
802             in $self->{_alphabet}, minimum probability threshold.
803              
804             =cut
805              
806             sub _to_IUPAC {
807 515     515   1550 my ($self,@probs,$thresh) = @_;
808              
809             # provide a default threshold of 5, corresponds to 5% threshold for
810             # inferring that the aa at any position is the true aa
811 515 50       681 $thresh = 5 unless ( defined $thresh );
812              
813 515         547 my ($IUPAC_aa,$max_prob) = ('X',$thresh);
814 515         432 for my $aa ( @{$self->{_alphabet}} ) {
  515         658  
815 10300         9046 my $prob = shift @probs;
816 10300 100       12497 if ( $prob > $max_prob ) {
817 907         861 $IUPAC_aa = $aa;
818 907         934 $max_prob = $prob;
819             }
820             }
821            
822 515         720 return $IUPAC_aa, $max_prob;
823             }
824              
825             =head2 _to_cons
826              
827             Title : _to_cons
828             Usage :
829             Function: Converts a single position to simple consensus character and returns
830             its probability. Currently just calls the _to_IUPAC subroutine.
831             Throws :
832             Example :
833             Returns : char, real number
834             Args : real numbers for A,C,G,T (positional)
835              
836             =cut
837              
838             sub _to_cons {
839 515     515   602 return _to_IUPAC( @_ );
840             }
841              
842             =head2 get_all_vectors
843              
844             Title : get_all_vectors
845             Usage :
846             Function : returns all possible sequence vectors to satisfy the PFM under
847             a given threshold
848             Throws : If threshold outside of 0..1 (no sense to do that)
849             Example : my @vectors = $self->get_all_vectors(4);
850             Returns : Array of strings
851             Args : (optional) floating
852              
853             =cut
854              
855             #sub get_all_vectors {
856             # my $self = shift;
857             # my $thresh = shift;
858             #
859             # $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
860             #
861             # my @seq = split(//,$self->consensus($thresh*10));
862             # my @perm;
863             # for my $i (0..@{$self->{probA}}) {
864             # push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
865             # push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
866             # push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
867             # push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
868             # push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
869             # }
870             # my $fpos=shift @perm;
871             # my @strings=@$fpos;
872             # foreach my $pos (@perm) {
873             # my @newstr;
874             # foreach my $let (@$pos) {
875             # foreach my $string (@strings) {
876             # my $newstring = $string . $let;
877             # push @newstr,$newstring;
878             # }
879             # }
880             # @strings=@newstr;
881             # }
882             # return @strings;
883             #}
884              
885             1;