line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#--------------------------------------------------------- |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Bio::Matrix::PSM::ProtMatrix - SiteMatrixI implementation, holds a |
6
|
|
|
|
|
|
|
position scoring matrix (or position weight matrix) with log-odds scoring |
7
|
|
|
|
|
|
|
information. |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Bio::Matrix::PSM::ProtMatrix; |
12
|
|
|
|
|
|
|
# Create from memory by supplying probability matrix hash both as strings or |
13
|
|
|
|
|
|
|
# arrays where the frequencies Hash entries of the form lN refer to an array |
14
|
|
|
|
|
|
|
# of position-specific log-odds scores for amino acid N. Hash entries of the |
15
|
|
|
|
|
|
|
# form pN represent the position-specific probability of finding amino acid N. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my %param = ( |
18
|
|
|
|
|
|
|
'id' => 'A. thaliana protein atp1', |
19
|
|
|
|
|
|
|
'-e_val' => $score, |
20
|
|
|
|
|
|
|
'lS' => [ '-2', '3', '-3', '2', '-3', '1', '1', '3' ], |
21
|
|
|
|
|
|
|
'lF' => [ '-1', '-4', '0', '-5', '0', '-5', '-4', '-4' ], |
22
|
|
|
|
|
|
|
'lT' => [ '-1', '1', '0', '1', '-2', '-1', '0', '1' ], |
23
|
|
|
|
|
|
|
'lN' => [ '-3', '-1', '-2', '3', '-5', '5', '-2', '0' ], |
24
|
|
|
|
|
|
|
'lK' => [ '-2', '0', '-3', '2', '-3', '2', '-3', '-1' ], |
25
|
|
|
|
|
|
|
'lY' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-4', '-4' ], |
26
|
|
|
|
|
|
|
'lE' => [ '-3', '4', '-3', '2', '-4', '-2', '-3', '2' ], |
27
|
|
|
|
|
|
|
'lV' => [ '0', '-2', '1', '-4', '1', '-4', '-1', '-3' ], |
28
|
|
|
|
|
|
|
'lQ' => [ '-1', '0', '-2', '3', '-4', '1', '-3', '0' ], |
29
|
|
|
|
|
|
|
'lM' => [ '8', '-3', '8', '-3', '1', '-3', '-3', '-3' ], |
30
|
|
|
|
|
|
|
'lC' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-3', '-3' ], |
31
|
|
|
|
|
|
|
'lL' => [ '1', '-3', '1', '-4', '3', '-4', '-2', '-4' ], |
32
|
|
|
|
|
|
|
'lA' => [ '-2', '1', '-2', '0', '-2', '-2', '2', '2' ], |
33
|
|
|
|
|
|
|
'lW' => [ '-2', '-4', '-3', '-5', '-4', '-5', '-5', '-5' ], |
34
|
|
|
|
|
|
|
'lP' => [ '-3', '-2', '-4', '-3', '-1', '-3', '6', '-3' ], |
35
|
|
|
|
|
|
|
'lH' => [ '-2', '-2', '-3', '-2', '-5', '-2', '-2', '-3' ], |
36
|
|
|
|
|
|
|
'lD' => [ '-4', '-1', '-3', '1', '-3', '-1', '-3', '4' ], |
37
|
|
|
|
|
|
|
'lR' => [ '-2', '-1', '-3', '0', '-4', '4', '-4', '-3' ], |
38
|
|
|
|
|
|
|
'lI' => [ '0', '-3', '0', '-4', '6', '-4', '-2', '-2' ], |
39
|
|
|
|
|
|
|
'lG' => [ '-4', '-2', '-4', '-2', '-5', '-3', '-1', '-2' ], |
40
|
|
|
|
|
|
|
'pS' => [ '0', '33', '0', '16', '1', '12', '11', '25' ], |
41
|
|
|
|
|
|
|
'pF' => [ '0', '0', '2', '0', '3', '0', '0', '0' ], |
42
|
|
|
|
|
|
|
'pT' => [ '0', '8', '7', '10', '1', '2', '7', '8' ], |
43
|
|
|
|
|
|
|
'pN' => [ '0', '0', '2', '13', '0', '36', '1', '4' ], |
44
|
|
|
|
|
|
|
'pK' => [ '0', '5', '0', '13', '1', '15', '0', '2' ], |
45
|
|
|
|
|
|
|
'pY' => [ '0', '0', '0', '0', '0', '0', '0', '0' ], |
46
|
|
|
|
|
|
|
'pE' => [ '0', '41', '1', '12', '0', '0', '0', '15' ], |
47
|
|
|
|
|
|
|
'pV' => [ '0', '3', '9', '0', '2', '0', '3', '1' ], |
48
|
|
|
|
|
|
|
'pQ' => [ '0', '0', '0', '15', '0', '4', '0', '3' ], |
49
|
|
|
|
|
|
|
'pM' => [ '100', '0', '66', '0', '2', '0', '0', '0' ], |
50
|
|
|
|
|
|
|
'pC' => [ '0', '0', '0', '0', '0', '0', '0', '0' ], |
51
|
|
|
|
|
|
|
'pL' => [ '0', '0', '8', '0', '25', '0', '4', '0' ], |
52
|
|
|
|
|
|
|
'pA' => [ '0', '10', '1', '9', '2', '0', '22', '16' ], |
53
|
|
|
|
|
|
|
'pW' => [ '0', '0', '0', '0', '0', '0', '0', '0' ], |
54
|
|
|
|
|
|
|
'pP' => [ '0', '0', '0', '0', '3', '1', '45', '0' ], |
55
|
|
|
|
|
|
|
'pH' => [ '0', '0', '0', '0', '0', '0', '1', '0' ], |
56
|
|
|
|
|
|
|
'pD' => [ '0', '0', '1', '7', '2', '2', '0', '22' ], |
57
|
|
|
|
|
|
|
'pR' => [ '0', '0', '0', '3', '0', '27', '0', '0' ], |
58
|
|
|
|
|
|
|
'pI' => [ '0', '0', '3', '0', '59', '1', '2', '3' ], |
59
|
|
|
|
|
|
|
'pG' => [ '0', '0', '0', '1', '0', '0', '4', '1' ], |
60
|
|
|
|
|
|
|
); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
my $matrix = Bio::Matrix::PSM::ProtMatrix( %param ); |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
my $site = Bio::Matrix::PSM::ProtMatrix->new(%param); |
66
|
|
|
|
|
|
|
# Or get it from a file: |
67
|
|
|
|
|
|
|
use Bio::Matrix::PSM::IO; |
68
|
|
|
|
|
|
|
my $psmIO = Bio::Matrix::PSM::IO->new(-file => $file, -format => 'psi-blast'); |
69
|
|
|
|
|
|
|
while (my $psm = $psmIO->next_psm) { |
70
|
|
|
|
|
|
|
#Now we have a Bio::Matrix::PSM::Psm object, |
71
|
|
|
|
|
|
|
# see Bio::Matrix::PSM::PsmI for details |
72
|
|
|
|
|
|
|
#This is a Bio::Matrix::PSM::ProtMatrix object now |
73
|
|
|
|
|
|
|
my $matrix = $psm->matrix; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Get a simple consensus, where alphabet is: |
77
|
|
|
|
|
|
|
# {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V,} |
78
|
|
|
|
|
|
|
# choosing the highest probability or N if prob is too low |
79
|
|
|
|
|
|
|
my $consensus = $site->consensus; |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# Retrieving and using regular expressions: |
82
|
|
|
|
|
|
|
my $regexp = $site->regexp; |
83
|
|
|
|
|
|
|
my $count = grep($regexp,$seq); |
84
|
|
|
|
|
|
|
my $count = ($seq=~ s/$regexp/$1/eg); |
85
|
|
|
|
|
|
|
print "Motif $mid is present $count times in this sequence\n"; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head1 DESCRIPTION |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
ProtMatrix is designed to provide some basic methods when working with |
90
|
|
|
|
|
|
|
position scoring (weight) matrices related to protein sequences. A |
91
|
|
|
|
|
|
|
protein PSM consists of 20 vectors with 20 frequencies (one per amino |
92
|
|
|
|
|
|
|
acid per position). This is the minimum information you should |
93
|
|
|
|
|
|
|
provide to construct a PSM object. The vectors can be provided as |
94
|
|
|
|
|
|
|
strings with frequencies where the frequency is {0..a} and a=1. This |
95
|
|
|
|
|
|
|
is the way MEME compressed representation of a matrix and it is quite |
96
|
|
|
|
|
|
|
useful when working with relational DB. If arrays are provided as an |
97
|
|
|
|
|
|
|
input (references to arrays actually) they can be any number, real or |
98
|
|
|
|
|
|
|
integer (frequency or count). |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
When creating the object the constructor will check for positions that |
101
|
|
|
|
|
|
|
equal 0. If such is found it will increase the count for all |
102
|
|
|
|
|
|
|
positions by one and recalculate the frequency. Potential bug - if |
103
|
|
|
|
|
|
|
you are using frequencies and one of the positions is 0 it will change |
104
|
|
|
|
|
|
|
significantly. However, you should never have frequency that equals |
105
|
|
|
|
|
|
|
0. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Throws an exception if: You mix as an input array and string (for |
108
|
|
|
|
|
|
|
example A matrix is given as array, C - as string). The position |
109
|
|
|
|
|
|
|
vector is (0,0,0,0). One of the probability vectors is shorter than |
110
|
|
|
|
|
|
|
the rest. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Summary of the methods I use most frequently (details bellow): |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
iupac - return IUPAC compliant consensus as a string |
115
|
|
|
|
|
|
|
score - Returns the score as a real number |
116
|
|
|
|
|
|
|
IC - information content. Returns a real number |
117
|
|
|
|
|
|
|
id - identifier. Returns a string |
118
|
|
|
|
|
|
|
accession - accession number. Returns a string |
119
|
|
|
|
|
|
|
next_pos - return the sequence probably for each letter, IUPAC |
120
|
|
|
|
|
|
|
symbol, IUPAC probability and simple sequence |
121
|
|
|
|
|
|
|
consenus letter for this position. Rewind at the end. Returns a hash. |
122
|
|
|
|
|
|
|
pos - current position get/set. Returns an integer. |
123
|
|
|
|
|
|
|
regexp - construct a regular expression based on IUPAC consensus. |
124
|
|
|
|
|
|
|
For example AGWV will be [Aa][Gg][AaTt][AaCcGg] |
125
|
|
|
|
|
|
|
width - site width |
126
|
|
|
|
|
|
|
get_string - gets the probability vector for a single base as a string. |
127
|
|
|
|
|
|
|
get_array - gets the probability vector for a single base as an array. |
128
|
|
|
|
|
|
|
get_logs_array - gets the log-odds vector for a single base as an array. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
New methods, which might be of interest to anyone who wants to store |
131
|
|
|
|
|
|
|
PSM in a relational database without creating an entry for each |
132
|
|
|
|
|
|
|
position is the ability to compress the PSM vector into a string with |
133
|
|
|
|
|
|
|
losing usually less than 1% of the data. this can be done with: |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
my $str=$matrix->get_compressed_freq('A'); |
136
|
|
|
|
|
|
|
or |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
my $str=$matrix->get_compressed_logs('A'); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Loading from a database should be done with new, but is not yet implemented. |
141
|
|
|
|
|
|
|
However you can still uncompress such string with: |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
or |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=head1 FEEDBACK |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head2 Mailing Lists |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
154
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
155
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
158
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=head2 Support |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
I |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
167
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
168
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
169
|
|
|
|
|
|
|
with code and data examples if at all possible. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head2 Reporting Bugs |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
174
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
175
|
|
|
|
|
|
|
web: |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 AUTHOR - James Thompson |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Email tex@biosysadmin.com |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 APPENDIX |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# Let the code begin... |
189
|
|
|
|
|
|
|
package Bio::Matrix::PSM::ProtMatrix; |
190
|
2
|
|
|
2
|
|
572
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
54
|
|
191
|
|
|
|
|
|
|
|
192
|
2
|
|
|
2
|
|
7
|
use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
1930
|
|
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head2 new |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Title : new |
197
|
|
|
|
|
|
|
Usage : my $site = Bio::Matrix::PSM::ProtMatrix->new( |
198
|
|
|
|
|
|
|
%probs, |
199
|
|
|
|
|
|
|
%logs, |
200
|
|
|
|
|
|
|
-IC => $ic, |
201
|
|
|
|
|
|
|
-e_val => $score, |
202
|
|
|
|
|
|
|
-id => $mid |
203
|
|
|
|
|
|
|
-model => \%model |
204
|
|
|
|
|
|
|
); |
205
|
|
|
|
|
|
|
Function : Creates a new Bio::Matrix::PSM::ProtMatrix object from memory |
206
|
|
|
|
|
|
|
Throws : If inconsistent data for all vectors (all 20 amino acids) is |
207
|
|
|
|
|
|
|
provided, if you mix input types (string vs array) or if a |
208
|
|
|
|
|
|
|
position freq is 0. |
209
|
|
|
|
|
|
|
Example : |
210
|
|
|
|
|
|
|
Returns : Bio::Matrix::PSM::ProtMatrix object |
211
|
|
|
|
|
|
|
Args : Hash references to log-odds scores and probabilities for |
212
|
|
|
|
|
|
|
position-specific scoring info, e-value (optional), information |
213
|
|
|
|
|
|
|
content (optional), id (optional), model for background distribution |
214
|
|
|
|
|
|
|
of proteins (optional). |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=cut |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub new { |
219
|
2
|
|
|
2
|
1
|
266
|
my ($class, @args) = @_; |
220
|
2
|
|
|
|
|
30
|
my $self = $class->SUPER::new(@args); |
221
|
2
|
|
|
|
|
3
|
my $consensus; |
222
|
|
|
|
|
|
|
#Too many things to rearrange, and I am creating simultanuously >500 |
223
|
|
|
|
|
|
|
# such objects routinely, so this becomes performance issue |
224
|
|
|
|
|
|
|
my %input; |
225
|
2
|
|
|
|
|
8
|
while( @args ) { |
226
|
84
|
|
|
|
|
93
|
(my $key = shift @args) =~ s/-//gi; #deletes all dashes (only dashes)! |
227
|
84
|
|
|
|
|
149
|
$input{$key} = shift @args; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# get a protein alphabet for processing log-odds scores and probabilities |
231
|
|
|
|
|
|
|
# maybe change this later on to allow for non-standard aa lists? |
232
|
2
|
|
|
|
|
12
|
my @alphabet = qw/A R N D C Q E G H I L K M F P S T W Y V/; |
233
|
|
|
|
|
|
|
|
234
|
2
|
|
|
|
|
5
|
foreach my $aa (@alphabet) { |
235
|
40
|
50
|
|
|
|
84
|
$self->{"log$aa"} = defined($input{"l$aa"}) ? $input{"l$aa"} |
236
|
|
|
|
|
|
|
: $self->throw("Error: No log-odds information for $aa!"); |
237
|
40
|
50
|
|
|
|
72
|
$self->{"prob$aa"} = defined($input{"p$aa"}) ? $input{"p$aa"} |
238
|
|
|
|
|
|
|
: $self->throw("Error: No probability information for $aa!"); |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
2
|
|
|
|
|
9
|
$self->{_position} = 0; |
242
|
2
|
|
|
|
|
3
|
$self->{IC} = $input{IC}; |
243
|
2
|
|
|
|
|
4
|
$self->{e_val} = $input{e_val}; |
244
|
2
|
|
|
|
|
3
|
$self->{sites} = $input{sites}; |
245
|
2
|
|
|
|
|
3
|
$self->{width} = $input{width}; |
246
|
2
|
|
|
|
|
3
|
$self->{accession_number} = $input{accession_number}; |
247
|
|
|
|
|
|
|
$self->{_correction} = defined($input{correction}) ? |
248
|
2
|
50
|
|
|
|
6
|
$input{correction} : 1 ; # Correction might be unwanted- supply your own |
249
|
|
|
|
|
|
|
# No id provided, null for the sake of rel db |
250
|
2
|
100
|
|
|
|
6
|
$self->{id} = defined($input{id}) ? $input{id} : 'null'; |
251
|
2
|
|
|
|
|
4
|
$self->{_alphabet} = \@alphabet; |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
#Make consensus, throw if any one of the vectors is shorter |
254
|
2
|
|
|
|
|
19
|
$self = _calculate_consensus($self,$input{model}); |
255
|
2
|
|
|
|
|
26
|
return $self; |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head2 alphabet |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
Title : Returns an array (or array reference if desired) to the alphabet |
261
|
|
|
|
|
|
|
Usage : |
262
|
|
|
|
|
|
|
Function : Returns an array (or array reference) containing all of the |
263
|
|
|
|
|
|
|
allowable characters for this matrix. |
264
|
|
|
|
|
|
|
Throws : |
265
|
|
|
|
|
|
|
Example : |
266
|
|
|
|
|
|
|
Returns : Array or arrary reference. |
267
|
|
|
|
|
|
|
Args : |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=cut |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub alphabet { |
272
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
273
|
0
|
0
|
|
|
|
0
|
if ( wantarray ) { |
274
|
0
|
|
|
|
|
0
|
return $self->{_alphabet}; |
275
|
|
|
|
|
|
|
} else { |
276
|
0
|
|
|
|
|
0
|
return @{$self->{_alphabet}}; |
|
0
|
|
|
|
|
0
|
|
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head2 _calculate_consensus |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
Title : _calculate_consensus |
283
|
|
|
|
|
|
|
Usage : |
284
|
|
|
|
|
|
|
Function : Calculates the consensus sequence for this matrix. |
285
|
|
|
|
|
|
|
Throws : |
286
|
|
|
|
|
|
|
Example : |
287
|
|
|
|
|
|
|
Returns : |
288
|
|
|
|
|
|
|
Args : |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
=cut |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _calculate_consensus { |
293
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
294
|
2
|
|
|
|
|
6
|
my $thresh = shift; |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# verify that all of the array lengths in %probs are the same |
297
|
2
|
|
|
|
|
3
|
my @lengths = map { scalar(@$_) } map {$self->{"prob$_"}} @{ $self->{_alphabet} }; |
|
40
|
|
|
|
|
29
|
|
|
40
|
|
|
|
|
37
|
|
|
2
|
|
|
|
|
7
|
|
298
|
2
|
|
|
|
|
7
|
my $len = shift @lengths; |
299
|
2
|
|
|
|
|
4
|
for ( @lengths ) { |
300
|
38
|
50
|
|
|
|
57
|
if ( $_ ne $len ) { $self->throw( "Probability matrix is damaged!\n" ) }; |
|
0
|
|
|
|
|
0
|
|
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# iterate over probs, generate the most likely sequence and put it into |
304
|
|
|
|
|
|
|
# $self->{seq}. Put the probability of this sequence into $self->{seqp}. |
305
|
2
|
|
|
|
|
8
|
for ( my $i = 0; $i < $len; $i++ ) { |
306
|
|
|
|
|
|
|
# get a list of all the probabilities at position $i, ordered by $self->{_alphabet} |
307
|
515
|
|
|
|
|
298
|
my @probs = map { ${$self->{"prob$_"}}[$i] } @{ $self->{_alphabet} }; |
|
10300
|
|
|
|
|
5374
|
|
|
10300
|
|
|
|
|
11590
|
|
|
515
|
|
|
|
|
484
|
|
308
|
|
|
|
|
|
|
# calculate the consensus of @probs, put sequence into seqp and probabilities into seqp |
309
|
515
|
|
|
|
|
772
|
(${$self->{seq}}[$i],${$self->{seqp}}[$i]) = $self->_to_cons( @probs, $thresh ); |
|
515
|
|
|
|
|
507
|
|
|
515
|
|
|
|
|
1380
|
|
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
|
312
|
2
|
|
|
|
|
8
|
return $self; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=head2 next_pos |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Title : next_pos |
318
|
|
|
|
|
|
|
Usage : |
319
|
|
|
|
|
|
|
Function : Retrives the next position features: frequencies for all 20 amino |
320
|
|
|
|
|
|
|
acids, log-odds scores for all 20 amino acids at this position, |
321
|
|
|
|
|
|
|
the main (consensus) letter at this position, the probability |
322
|
|
|
|
|
|
|
for the consensus letter to occur at this position and the relative |
323
|
|
|
|
|
|
|
current position as an integer. |
324
|
|
|
|
|
|
|
Throws : |
325
|
|
|
|
|
|
|
Example : |
326
|
|
|
|
|
|
|
Returns : hash (or hash reference) (pA,pR,pN,pD,...,logA,logR,logN,logD,aa,prob,rel) |
327
|
|
|
|
|
|
|
- pN entries represent the probability for amino acid N |
328
|
|
|
|
|
|
|
to be at this position |
329
|
|
|
|
|
|
|
- logN entries represent the log-odds score for having amino acid |
330
|
|
|
|
|
|
|
N at this position |
331
|
|
|
|
|
|
|
- aa is the consensus amino acid |
332
|
|
|
|
|
|
|
- prob is the probability for the consensus amino acid to be at this |
333
|
|
|
|
|
|
|
position |
334
|
|
|
|
|
|
|
- rel is the relative index of the current position (integer) |
335
|
|
|
|
|
|
|
Args : none |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=cut |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
sub next_pos { |
341
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
342
|
1
|
50
|
|
|
|
5
|
$self->throw("instance method called on class") unless ref $self; |
343
|
|
|
|
|
|
|
|
344
|
1
|
|
|
|
|
2
|
my $len = @{$self->{seq}}; |
|
1
|
|
|
|
|
4
|
|
345
|
1
|
|
|
|
|
2
|
my $pos = $self->{_position}; |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
# return a PSM if we're still within range |
348
|
1
|
50
|
|
|
|
5
|
if ($pos<$len) { |
349
|
|
|
|
|
|
|
|
350
|
1
|
|
|
|
|
3
|
my %probs = map { ("p$_", ${$self->{"prob$_"}}[$pos]) } @{$self->{_alphabet}}; |
|
20
|
|
|
|
|
21
|
|
|
20
|
|
|
|
|
67
|
|
|
1
|
|
|
|
|
4
|
|
351
|
1
|
|
|
|
|
5
|
my %logs = map { ("l$_", ${$self->{"log$_"}}[$pos]) } @{$self->{_alphabet}}; |
|
20
|
|
|
|
|
20
|
|
|
20
|
|
|
|
|
56
|
|
|
1
|
|
|
|
|
3
|
|
352
|
1
|
|
|
|
|
5
|
my $base = ${$self->{seq}}[$pos]; |
|
1
|
|
|
|
|
3
|
|
353
|
1
|
|
|
|
|
1
|
my $prob = ${$self->{seqp}}[$pos]; |
|
1
|
|
|
|
|
3
|
|
354
|
|
|
|
|
|
|
|
355
|
1
|
|
|
|
|
2
|
$self->{_position}++; |
356
|
1
|
|
|
|
|
24
|
my %hash = ( %probs, %logs, base => $base, rel => $pos, prob => $prob ); |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
# decide whether to return the hash or a reference to it |
359
|
1
|
50
|
|
|
|
7
|
if ( wantarray ) { |
360
|
1
|
|
|
|
|
30
|
return %hash; |
361
|
|
|
|
|
|
|
} else { |
362
|
0
|
|
|
|
|
0
|
return \%hash; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
} else { # otherwise, reset $self->{_position} and return nothing |
365
|
0
|
|
|
|
|
0
|
$self->{_position} = 0; |
366
|
0
|
|
|
|
|
0
|
return; |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=head2 curpos |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Title : curpos |
374
|
|
|
|
|
|
|
Usage : |
375
|
|
|
|
|
|
|
Function : Gets/sets the current position. |
376
|
|
|
|
|
|
|
Throws : |
377
|
|
|
|
|
|
|
Example : |
378
|
|
|
|
|
|
|
Returns : Current position (integer). |
379
|
|
|
|
|
|
|
Args : New position (integer). |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=cut |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub curpos { |
384
|
2
|
|
|
2
|
1
|
837
|
my $self = shift; |
385
|
2
|
50
|
|
|
|
6
|
if (@_) { $self->{_position} = shift; } |
|
0
|
|
|
|
|
0
|
|
386
|
2
|
|
|
|
|
7
|
return $self->{_position}; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=head2 e_val |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
Title : e_val |
393
|
|
|
|
|
|
|
Usage : |
394
|
|
|
|
|
|
|
Function : Gets/sets the e-value |
395
|
|
|
|
|
|
|
Throws : |
396
|
|
|
|
|
|
|
Example : |
397
|
|
|
|
|
|
|
Returns : |
398
|
|
|
|
|
|
|
Args : real number |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=cut |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
sub e_val { |
403
|
2
|
|
|
2
|
1
|
3
|
my $self = shift; |
404
|
2
|
100
|
|
|
|
6
|
if (@_) { $self->{e_val} = shift; } |
|
1
|
|
|
|
|
2
|
|
405
|
2
|
|
|
|
|
6
|
return $self->{e_val}; |
406
|
|
|
|
|
|
|
} |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=head2 IC |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
Title : IC |
412
|
|
|
|
|
|
|
Usage : |
413
|
|
|
|
|
|
|
Function : Position-specific information content. |
414
|
|
|
|
|
|
|
Throws : |
415
|
|
|
|
|
|
|
Example : |
416
|
|
|
|
|
|
|
Returns : Information content for current position. |
417
|
|
|
|
|
|
|
Args : Information content for current position. |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=cut |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
sub IC { |
422
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
423
|
0
|
0
|
|
|
|
0
|
if (@_) { $self->{IC} = shift; } |
|
0
|
|
|
|
|
0
|
|
424
|
0
|
|
|
|
|
0
|
return $self->{IC}; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=head2 accession_number |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
Title : accession_number |
430
|
|
|
|
|
|
|
Usage : |
431
|
|
|
|
|
|
|
Function: accession number, this will be unique id for the ProtMatrix object as |
432
|
|
|
|
|
|
|
well for any other object, inheriting from ProtMatrix. |
433
|
|
|
|
|
|
|
Throws : |
434
|
|
|
|
|
|
|
Example : |
435
|
|
|
|
|
|
|
Returns : New accession number (string) |
436
|
|
|
|
|
|
|
Args : Accession number (string) |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=cut |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
sub accession_number { |
441
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
442
|
0
|
0
|
|
|
|
0
|
if (@_) { $self->{accession_number} = shift; } |
|
0
|
|
|
|
|
0
|
|
443
|
0
|
|
|
|
|
0
|
return $self->{accession_number}; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head2 consensus |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
Title : consensus |
449
|
|
|
|
|
|
|
Usage : |
450
|
|
|
|
|
|
|
Function : Returns the consensus sequence for this PSM. |
451
|
|
|
|
|
|
|
Throws : if supplied with thresold outisde 5..10 range |
452
|
|
|
|
|
|
|
Example : |
453
|
|
|
|
|
|
|
Returns : string |
454
|
|
|
|
|
|
|
Args : (optional) threshold value 5 to 10 (corresponds to 50-100% at each position |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=cut |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
sub consensus { |
459
|
3
|
|
|
3
|
1
|
4
|
my $self = shift; |
460
|
3
|
|
|
|
|
3
|
my $thresh=shift; |
461
|
3
|
50
|
|
|
|
7
|
$self->_calculate_consensus($thresh) if ($thresh); #Change of threshold |
462
|
3
|
|
|
|
|
3
|
my $consensus=''; |
463
|
|
|
|
|
|
|
|
464
|
3
|
|
|
|
|
5
|
foreach my $letter (@{$self->{seq}}) { |
|
3
|
|
|
|
|
6
|
|
465
|
523
|
|
|
|
|
319
|
$consensus .= $letter; |
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
|
468
|
3
|
|
|
|
|
10
|
return $consensus; |
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
sub IUPAC { |
472
|
2
|
|
|
2
|
1
|
979
|
my $self = shift; |
473
|
2
|
|
|
|
|
10
|
return $self->consensus; |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=head2 get_string |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Title : get_string |
480
|
|
|
|
|
|
|
Usage : |
481
|
|
|
|
|
|
|
Function: Returns given probability vector as a string. Useful if you want to |
482
|
|
|
|
|
|
|
store things in a rel database, where arrays are not first choice |
483
|
|
|
|
|
|
|
Throws : If the argument is outside {A,C,G,T} |
484
|
|
|
|
|
|
|
Example : |
485
|
|
|
|
|
|
|
Returns : string |
486
|
|
|
|
|
|
|
Args : character {A,C,G,T} |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
=cut |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub get_string { |
491
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
492
|
1
|
|
|
|
|
2
|
my $base = shift; |
493
|
1
|
|
|
|
|
3
|
my $string = ''; |
494
|
|
|
|
|
|
|
|
495
|
1
|
|
|
|
|
2
|
my @prob = @{$self->{"prob$base"}}; |
|
1
|
|
|
|
|
7
|
|
496
|
1
|
50
|
|
|
|
4
|
if ( ! @prob ) { |
497
|
0
|
|
|
|
|
0
|
$self->throw( "No such base: $base\n"); |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
|
500
|
1
|
|
|
|
|
3
|
foreach my $prob (@prob) { |
501
|
8
|
|
|
|
|
13
|
my $corrected = $prob*10; |
502
|
8
|
|
|
|
|
15
|
my $next = sprintf("%.0f",$corrected); |
503
|
8
|
100
|
|
|
|
12
|
$next = 'a' if ($next eq '10'); |
504
|
8
|
|
|
|
|
12
|
$string .= $next; |
505
|
|
|
|
|
|
|
} |
506
|
1
|
|
|
|
|
19
|
return $string; |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=head2 width |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
Title : width |
514
|
|
|
|
|
|
|
Usage : |
515
|
|
|
|
|
|
|
Function : Returns the length of the site |
516
|
|
|
|
|
|
|
Throws : |
517
|
|
|
|
|
|
|
Example : |
518
|
|
|
|
|
|
|
Returns : number |
519
|
|
|
|
|
|
|
Args : |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
=cut |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
sub width { |
524
|
4
|
|
|
4
|
1
|
5
|
my $self = shift; |
525
|
4
|
|
|
|
|
3
|
my $width = @{$self->{probA}}; |
|
4
|
|
|
|
|
6
|
|
526
|
4
|
|
|
|
|
42
|
return $width; |
527
|
|
|
|
|
|
|
} |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=head2 get_array |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
Title : get_array |
532
|
|
|
|
|
|
|
Usage : |
533
|
|
|
|
|
|
|
Function : Returns an array with frequencies for a specified amino acid. |
534
|
|
|
|
|
|
|
Throws : |
535
|
|
|
|
|
|
|
Example : |
536
|
|
|
|
|
|
|
Returns : Array representing frequencies for specified amino acid. |
537
|
|
|
|
|
|
|
Args : Single amino acid (character). |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=cut |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
sub get_array { |
542
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
543
|
1
|
|
|
|
|
3
|
my $letter = uc(shift); |
544
|
|
|
|
|
|
|
|
545
|
1
|
50
|
|
|
|
2
|
$self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}}; |
|
20
|
|
|
|
|
41
|
|
|
1
|
|
|
|
|
2
|
|
546
|
|
|
|
|
|
|
|
547
|
1
|
|
|
|
|
2
|
return @{$self->{"prob$letter"}}; |
|
1
|
|
|
|
|
7
|
|
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=head2 get_logs_array |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Title : get_logs_array |
554
|
|
|
|
|
|
|
Usage : |
555
|
|
|
|
|
|
|
Function : Returns an array with log_odds for a specified base |
556
|
|
|
|
|
|
|
Throws : |
557
|
|
|
|
|
|
|
Example : |
558
|
|
|
|
|
|
|
Returns : Array representing log-odds scores for specified amino acid. |
559
|
|
|
|
|
|
|
Args : Single amino acid (character). |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=cut |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
sub get_logs_array { |
564
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
565
|
0
|
|
|
|
|
0
|
my $letter = uc(shift); |
566
|
|
|
|
|
|
|
|
567
|
0
|
0
|
|
|
|
0
|
$self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
568
|
|
|
|
|
|
|
|
569
|
0
|
|
|
|
|
0
|
return @{$self->{"log$letter"}}; |
|
0
|
|
|
|
|
0
|
|
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=head2 id |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
Title : id |
575
|
|
|
|
|
|
|
Usage : |
576
|
|
|
|
|
|
|
Function : Gets/sets the site id |
577
|
|
|
|
|
|
|
Throws : |
578
|
|
|
|
|
|
|
Example : |
579
|
|
|
|
|
|
|
Returns : string |
580
|
|
|
|
|
|
|
Args : string |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=cut |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
sub id { |
585
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
586
|
0
|
0
|
|
|
|
0
|
if (@_) { $self->{id} = shift; } |
|
0
|
|
|
|
|
0
|
|
587
|
0
|
|
|
|
|
0
|
return $self->{id}; |
588
|
|
|
|
|
|
|
} |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=head2 regexp |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
Title : regexp |
593
|
|
|
|
|
|
|
Usage : |
594
|
|
|
|
|
|
|
Function : Returns a case-insensitive regular expression which matches the |
595
|
|
|
|
|
|
|
IUPAC convention. X's in consensus sequence will match anything. |
596
|
|
|
|
|
|
|
Throws : |
597
|
|
|
|
|
|
|
Example : |
598
|
|
|
|
|
|
|
Returns : string |
599
|
|
|
|
|
|
|
Args : Threshold for calculating consensus sequence (number in range 0-100 |
600
|
|
|
|
|
|
|
representing a percentage). Threshold defaults to 20. |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=cut |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
sub regexp { |
605
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
606
|
1
|
|
|
|
|
1
|
my $threshold = 20; |
607
|
1
|
50
|
|
|
|
4
|
if ( @_ ) { my $threshold = shift }; |
|
0
|
|
|
|
|
0
|
|
608
|
|
|
|
|
|
|
|
609
|
1
|
|
|
|
|
2
|
my @alphabet = @{$self->{_alphabet}}; |
|
1
|
|
|
|
|
4
|
|
610
|
1
|
|
|
|
|
3
|
my $width = $self->width; |
611
|
1
|
|
|
|
|
2
|
my (@regexp, $i); |
612
|
1
|
|
|
|
|
13
|
for ( $i = 0; $i < $width; $i++ ) { |
613
|
|
|
|
|
|
|
# get an array of the residues at this position with p > $threshold |
614
|
8
|
|
|
|
|
7
|
my @letters = map { uc($_).lc($_) } grep { $self->{"prob$_"}->[$i] >= $threshold } @alphabet; |
|
12
|
|
|
|
|
17
|
|
|
160
|
|
|
|
|
181
|
|
615
|
|
|
|
|
|
|
|
616
|
8
|
|
|
|
|
8
|
my $reg; |
617
|
8
|
100
|
|
|
|
10
|
if ( scalar(@letters) == 0 ) { |
618
|
1
|
|
|
|
|
2
|
$reg = '\.'; |
619
|
|
|
|
|
|
|
} else { |
620
|
7
|
|
|
|
|
10
|
$reg = '['.join('',@letters).']'; |
621
|
|
|
|
|
|
|
} |
622
|
8
|
|
|
|
|
17
|
push @regexp, $reg; |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
|
625
|
1
|
50
|
|
|
|
3
|
if ( wantarray ) { |
626
|
0
|
|
|
|
|
0
|
return @regexp; |
627
|
|
|
|
|
|
|
} else { |
628
|
1
|
|
|
|
|
7
|
return join '', @regexp; |
629
|
|
|
|
|
|
|
} |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
=head2 regexp_array |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
Title : regexp_array |
636
|
|
|
|
|
|
|
Usage : |
637
|
|
|
|
|
|
|
Function : Returns an array of position-specific regular expressions. |
638
|
|
|
|
|
|
|
X's in consensus sequence will match anything. |
639
|
|
|
|
|
|
|
Throws : |
640
|
|
|
|
|
|
|
Example : |
641
|
|
|
|
|
|
|
Returns : Array of position-specific regular expressions. |
642
|
|
|
|
|
|
|
Args : Threshold for calculating consensus sequence (number in range 0-100 |
643
|
|
|
|
|
|
|
representing a percentage). Threshold defaults to 20. |
644
|
|
|
|
|
|
|
Notes : Simply calls regexp method in list context. |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
=cut |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
sub regexp_array { |
649
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
650
|
|
|
|
|
|
|
|
651
|
0
|
|
|
|
|
0
|
return @{ $self->regexp }; |
|
0
|
|
|
|
|
0
|
|
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
=head2 _compress_array |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
Title : _compress_array |
658
|
|
|
|
|
|
|
Usage : |
659
|
|
|
|
|
|
|
Function : Will compress an array of real signed numbers to a string (ie vector of bytes) |
660
|
|
|
|
|
|
|
-127 to +127 for bi-directional(signed) and 0..255 for unsigned ; |
661
|
|
|
|
|
|
|
Throws : |
662
|
|
|
|
|
|
|
Example : Internal stuff |
663
|
|
|
|
|
|
|
Returns : String |
664
|
|
|
|
|
|
|
Args : array reference, followed by max value and direction (optional, defaults to 1), |
665
|
|
|
|
|
|
|
direction of 1 is unsigned, anything else is signed. |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=cut |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
sub _compress_array { |
670
|
0
|
|
|
0
|
|
0
|
my ($array,$lm,$direct)=@_; |
671
|
0
|
|
|
|
|
0
|
my $str; |
672
|
0
|
0
|
0
|
|
|
0
|
return unless(($array) && ($lm)); |
673
|
0
|
0
|
|
|
|
0
|
$direct=1 unless ($direct); |
674
|
0
|
0
|
|
|
|
0
|
my $k1= ($direct==1) ? (255/$lm) : (127/$lm); |
675
|
0
|
|
|
|
|
0
|
foreach my $c (@{$array}) { |
|
0
|
|
|
|
|
0
|
|
676
|
0
|
0
|
|
|
|
0
|
$c=$lm if ($c>$lm); |
677
|
0
|
0
|
0
|
|
|
0
|
$c=-$lm if (($c<-$lm) && ($direct !=1)); |
678
|
0
|
0
|
0
|
|
|
0
|
$c=0 if (($c<0) && ($direct ==1)); |
679
|
0
|
|
|
|
|
0
|
my $byte=int($k1*$c); |
680
|
0
|
0
|
|
|
|
0
|
$byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits |
681
|
0
|
|
|
|
|
0
|
my $char=chr($byte); |
682
|
0
|
|
|
|
|
0
|
$str.=$char; |
683
|
|
|
|
|
|
|
} |
684
|
0
|
|
|
|
|
0
|
return $str; |
685
|
|
|
|
|
|
|
} |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=head2 _uncompress_string |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
Title : _uncompress_string |
690
|
|
|
|
|
|
|
Usage : |
691
|
|
|
|
|
|
|
Function : Will uncompress a string (vector of bytes) to create an array of real |
692
|
|
|
|
|
|
|
signed numbers (opposite to_compress_array) |
693
|
|
|
|
|
|
|
Throws : |
694
|
|
|
|
|
|
|
Example : Internal stuff |
695
|
|
|
|
|
|
|
Returns : string, followed by max value and direction (optional, defaults to 1), |
696
|
|
|
|
|
|
|
direction of 1 is unsigned, anything else is signed. |
697
|
|
|
|
|
|
|
Args : array |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
=cut |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
sub _uncompress_string { |
702
|
0
|
|
|
0
|
|
0
|
my ($str,$lm,$direct)=@_; |
703
|
0
|
|
|
|
|
0
|
my @array; |
704
|
0
|
0
|
0
|
|
|
0
|
return unless(($str) && ($lm)); |
705
|
0
|
0
|
|
|
|
0
|
$direct=1 unless ($direct); |
706
|
0
|
0
|
|
|
|
0
|
my $k1= ($direct==1) ? (255/$lm) : (127/$lm); |
707
|
0
|
|
|
|
|
0
|
while (my $c=chop($str)) { |
708
|
0
|
|
|
|
|
0
|
my $byte=ord($c); |
709
|
0
|
0
|
|
|
|
0
|
$byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits |
710
|
0
|
|
|
|
|
0
|
my $num=$byte/$k1; |
711
|
0
|
|
|
|
|
0
|
unshift @array,$num; |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
|
714
|
0
|
|
|
|
|
0
|
return @array; |
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
=head2 get_compressed_freq |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
Title : get_compressed_freq |
720
|
|
|
|
|
|
|
Usage : |
721
|
|
|
|
|
|
|
Function: A method to provide a compressed frequency vector. It uses one byte to |
722
|
|
|
|
|
|
|
code the frequence for one of the probability vectors for one position. |
723
|
|
|
|
|
|
|
Useful for relational database. Improvment of the previous 0..a coding. |
724
|
|
|
|
|
|
|
Throws : |
725
|
|
|
|
|
|
|
Example : my $strA=$self->get_compressed_freq('A'); |
726
|
|
|
|
|
|
|
Returns : String |
727
|
|
|
|
|
|
|
Args : char |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
=cut |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
sub get_compressed_freq { |
732
|
0
|
|
|
0
|
1
|
0
|
my $self=shift; |
733
|
0
|
|
|
|
|
0
|
my $base=shift; |
734
|
0
|
|
|
|
|
0
|
my $string=''; |
735
|
0
|
|
|
|
|
0
|
my @prob; |
736
|
|
|
|
|
|
|
BASE: { |
737
|
0
|
0
|
|
|
|
0
|
if ($base eq 'A') { |
|
0
|
|
|
|
|
0
|
|
738
|
0
|
0
|
|
|
|
0
|
@prob = @{$self->{probA}} unless (!defined($self->{probA})); |
|
0
|
|
|
|
|
0
|
|
739
|
0
|
|
|
|
|
0
|
last BASE; |
740
|
|
|
|
|
|
|
} |
741
|
0
|
0
|
|
|
|
0
|
if ($base eq 'G') { |
742
|
0
|
0
|
|
|
|
0
|
@prob = @{$self->{probG}} unless (!defined($self->{probG})); |
|
0
|
|
|
|
|
0
|
|
743
|
0
|
|
|
|
|
0
|
last BASE; |
744
|
|
|
|
|
|
|
} |
745
|
0
|
0
|
|
|
|
0
|
if ($base eq 'C') { |
746
|
0
|
0
|
|
|
|
0
|
@prob = @{$self->{probC}} unless (!defined($self->{probC})); |
|
0
|
|
|
|
|
0
|
|
747
|
0
|
|
|
|
|
0
|
last BASE; |
748
|
|
|
|
|
|
|
} |
749
|
0
|
0
|
|
|
|
0
|
if ($base eq 'T') { |
750
|
0
|
0
|
|
|
|
0
|
@prob = @{$self->{probT}} unless (!defined($self->{probT})); |
|
0
|
|
|
|
|
0
|
|
751
|
0
|
|
|
|
|
0
|
last BASE; |
752
|
|
|
|
|
|
|
} |
753
|
0
|
|
|
|
|
0
|
$self->throw ("No such base: $base!\n"); |
754
|
|
|
|
|
|
|
} |
755
|
0
|
|
|
|
|
0
|
my $str= _compress_array(\@prob,1,1); |
756
|
0
|
|
|
|
|
0
|
return $str; |
757
|
|
|
|
|
|
|
} |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
=head2 sequence_match_weight |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
Title : sequence_match_weight |
762
|
|
|
|
|
|
|
Usage : |
763
|
|
|
|
|
|
|
Function : This method will calculate the score of a match, based on the PSM |
764
|
|
|
|
|
|
|
if such is associated with the matrix object. Returns undef if no |
765
|
|
|
|
|
|
|
PSM data is available. |
766
|
|
|
|
|
|
|
Throws : if the length of the sequence is different from the matrix width |
767
|
|
|
|
|
|
|
Example : my $score=$matrix->sequence_match_weight('ACGGATAG'); |
768
|
|
|
|
|
|
|
Returns : Floating point |
769
|
|
|
|
|
|
|
Args : string |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
=cut |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
sub sequence_match_weight { |
774
|
1
|
|
|
1
|
1
|
2
|
my ($self,$seq)=@_; |
775
|
1
|
50
|
|
|
|
5
|
return unless ($self->{logA}); |
776
|
|
|
|
|
|
|
|
777
|
1
|
|
|
|
|
2
|
my $seqlen = length($seq); |
778
|
1
|
|
|
|
|
2
|
my $width = $self->width; |
779
|
1
|
50
|
|
|
|
2
|
$self->throw("Error: Input sequence size ($seqlen) not equal to PSM size ($width)!\n") |
780
|
|
|
|
|
|
|
unless (length($seq) == $self->width); |
781
|
|
|
|
|
|
|
|
782
|
1
|
|
|
|
|
2
|
my ($score,$i) = (0,0); |
783
|
1
|
|
|
|
|
4
|
foreach my $letter ( split //, $seq ) { |
784
|
|
|
|
|
|
|
# add up the score for this position |
785
|
8
|
|
|
|
|
13
|
$score += $self->{"log$letter"}->[$i]; |
786
|
8
|
|
|
|
|
5
|
$i++; |
787
|
|
|
|
|
|
|
} |
788
|
1
|
|
|
|
|
4
|
return $score; |
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=head2 _to_IUPAC |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Title : _to_IUPAC |
795
|
|
|
|
|
|
|
Usage : |
796
|
|
|
|
|
|
|
Function: Converts a single position to IUPAC compliant symbol and returns its probability. |
797
|
|
|
|
|
|
|
Currently returns the most likely amino acid/probability combination. |
798
|
|
|
|
|
|
|
Throws : |
799
|
|
|
|
|
|
|
Example : |
800
|
|
|
|
|
|
|
Returns : char, real number representing an amino acid and a probability. |
801
|
|
|
|
|
|
|
Args : real numbers for all 20 amino acids (ordered by alphabet contained |
802
|
|
|
|
|
|
|
in $self->{_alphabet}, minimum probability threshold. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=cut |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
sub _to_IUPAC { |
807
|
515
|
|
|
515
|
|
1282
|
my ($self,@probs,$thresh) = @_; |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
# provide a default threshold of 5, corresponds to 5% threshold for |
810
|
|
|
|
|
|
|
# inferring that the aa at any position is the true aa |
811
|
515
|
50
|
|
|
|
650
|
$thresh = 5 unless ( defined $thresh ); |
812
|
|
|
|
|
|
|
|
813
|
515
|
|
|
|
|
344
|
my ($IUPAC_aa,$max_prob) = ('X',$thresh); |
814
|
515
|
|
|
|
|
274
|
for my $aa ( @{$self->{_alphabet}} ) { |
|
515
|
|
|
|
|
541
|
|
815
|
10300
|
|
|
|
|
6037
|
my $prob = shift @probs; |
816
|
10300
|
100
|
|
|
|
11607
|
if ( $prob > $max_prob ) { |
817
|
907
|
|
|
|
|
530
|
$IUPAC_aa = $aa; |
818
|
907
|
|
|
|
|
680
|
$max_prob = $prob; |
819
|
|
|
|
|
|
|
} |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
|
822
|
515
|
|
|
|
|
615
|
return $IUPAC_aa, $max_prob; |
823
|
|
|
|
|
|
|
} |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
=head2 _to_cons |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
Title : _to_cons |
828
|
|
|
|
|
|
|
Usage : |
829
|
|
|
|
|
|
|
Function: Converts a single position to simple consensus character and returns |
830
|
|
|
|
|
|
|
its probability. Currently just calls the _to_IUPAC subroutine. |
831
|
|
|
|
|
|
|
Throws : |
832
|
|
|
|
|
|
|
Example : |
833
|
|
|
|
|
|
|
Returns : char, real number |
834
|
|
|
|
|
|
|
Args : real numbers for A,C,G,T (positional) |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
=cut |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
sub _to_cons { |
839
|
515
|
|
|
515
|
|
493
|
return _to_IUPAC( @_ ); |
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
=head2 get_all_vectors |
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
Title : get_all_vectors |
845
|
|
|
|
|
|
|
Usage : |
846
|
|
|
|
|
|
|
Function : returns all possible sequence vectors to satisfy the PFM under |
847
|
|
|
|
|
|
|
a given threshold |
848
|
|
|
|
|
|
|
Throws : If threshold outside of 0..1 (no sense to do that) |
849
|
|
|
|
|
|
|
Example : my @vectors = $self->get_all_vectors(4); |
850
|
|
|
|
|
|
|
Returns : Array of strings |
851
|
|
|
|
|
|
|
Args : (optional) floating |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
=cut |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
#sub get_all_vectors { |
856
|
|
|
|
|
|
|
# my $self = shift; |
857
|
|
|
|
|
|
|
# my $thresh = shift; |
858
|
|
|
|
|
|
|
# |
859
|
|
|
|
|
|
|
# $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1)); |
860
|
|
|
|
|
|
|
# |
861
|
|
|
|
|
|
|
# my @seq = split(//,$self->consensus($thresh*10)); |
862
|
|
|
|
|
|
|
# my @perm; |
863
|
|
|
|
|
|
|
# for my $i (0..@{$self->{probA}}) { |
864
|
|
|
|
|
|
|
# push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh); |
865
|
|
|
|
|
|
|
# push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh); |
866
|
|
|
|
|
|
|
# push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh); |
867
|
|
|
|
|
|
|
# push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh); |
868
|
|
|
|
|
|
|
# push @{$perm[$i]},'N' if ($seq[$i] eq 'N'); |
869
|
|
|
|
|
|
|
# } |
870
|
|
|
|
|
|
|
# my $fpos=shift @perm; |
871
|
|
|
|
|
|
|
# my @strings=@$fpos; |
872
|
|
|
|
|
|
|
# foreach my $pos (@perm) { |
873
|
|
|
|
|
|
|
# my @newstr; |
874
|
|
|
|
|
|
|
# foreach my $let (@$pos) { |
875
|
|
|
|
|
|
|
# foreach my $string (@strings) { |
876
|
|
|
|
|
|
|
# my $newstring = $string . $let; |
877
|
|
|
|
|
|
|
# push @newstr,$newstring; |
878
|
|
|
|
|
|
|
# } |
879
|
|
|
|
|
|
|
# } |
880
|
|
|
|
|
|
|
# @strings=@newstr; |
881
|
|
|
|
|
|
|
# } |
882
|
|
|
|
|
|
|
# return @strings; |
883
|
|
|
|
|
|
|
#} |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
1; |