line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#--------------------------------------------------------- |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a |
6
|
|
|
|
|
|
|
position scoring matrix (or position weight matrix) and log-odds |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 SYNOPSIS |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
use Bio::Matrix::PSM::SiteMatrix; |
11
|
|
|
|
|
|
|
# Create from memory by supplying probability matrix hash |
12
|
|
|
|
|
|
|
# both as strings or arrays |
13
|
|
|
|
|
|
|
# where the frequencies $a,$c,$g and $t are supplied either as |
14
|
|
|
|
|
|
|
# arrayref or string. Accordingly, lA, lC, lG and lT are the log |
15
|
|
|
|
|
|
|
# odds (only as arrays, no checks done right now) |
16
|
|
|
|
|
|
|
my ($a,$c,$g,$t,$score,$ic, $mid)=@_; |
17
|
|
|
|
|
|
|
#or |
18
|
|
|
|
|
|
|
my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001', |
19
|
|
|
|
|
|
|
'100104',0.001,19.2,'CRE1'); |
20
|
|
|
|
|
|
|
#Where a stands for all (this frequency=1), see explanation below |
21
|
|
|
|
|
|
|
my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t, |
22
|
|
|
|
|
|
|
-lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l, |
23
|
|
|
|
|
|
|
-IC=>$ic,-e_val=>$score, -id=>$mid); |
24
|
|
|
|
|
|
|
my $site=Bio::Matrix::PSM::SiteMatrix->new(%param); |
25
|
|
|
|
|
|
|
#Or get it from a file: |
26
|
|
|
|
|
|
|
use Bio::Matrix::PSM::IO; |
27
|
|
|
|
|
|
|
my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac'); |
28
|
|
|
|
|
|
|
while (my $psm=$psmIO->next_psm) { |
29
|
|
|
|
|
|
|
#Now we have a Bio::Matrix::PSM::Psm object, |
30
|
|
|
|
|
|
|
# see Bio::Matrix::PSM::PsmI for details |
31
|
|
|
|
|
|
|
#This is a Bio::Matrix::PSM::SiteMatrix object now |
32
|
|
|
|
|
|
|
my $matrix=$psm->matrix; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Get a simple consensus, where alphabet is {A,C,G,T,N}, |
36
|
|
|
|
|
|
|
# choosing the character that both satisfies a supplied or default threshold |
37
|
|
|
|
|
|
|
# frequency and is the most frequenct character at each position, or N. |
38
|
|
|
|
|
|
|
# So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15, |
39
|
|
|
|
|
|
|
# the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it |
40
|
|
|
|
|
|
|
# would be 'N'. |
41
|
|
|
|
|
|
|
my $consensus=$site->consensus; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Get the IUPAC ambiguity code representation of the data in the matrix. |
44
|
|
|
|
|
|
|
# Because the frequencies may have been pseudo-count corrected, insignificant |
45
|
|
|
|
|
|
|
# frequences (below 0.05 by default) are ignored. So a position with |
46
|
|
|
|
|
|
|
# A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M', |
47
|
|
|
|
|
|
|
# while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and |
48
|
|
|
|
|
|
|
# 0.25, 0.25, 0.25, 0.25 would get 'N'. |
49
|
|
|
|
|
|
|
my $iupac=$site->IUPAC; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Getting/using regular expression (a representation of the IUPAC string) |
52
|
|
|
|
|
|
|
my $regexp=$site->regexp; |
53
|
|
|
|
|
|
|
my $count=grep($regexp,$seq); |
54
|
|
|
|
|
|
|
my $count=($seq=~ s/$regexp/$1/eg); |
55
|
|
|
|
|
|
|
print "Motif $mid is present $count times in this sequence\n"; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 DESCRIPTION |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
SiteMatrix is designed to provide some basic methods when working with position |
60
|
|
|
|
|
|
|
scoring (weight) matrices, such as transcription factor binding sites for |
61
|
|
|
|
|
|
|
example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is |
62
|
|
|
|
|
|
|
the minimum information you should provide to construct a PSM object. The |
63
|
|
|
|
|
|
|
vectors can be provided as strings with frequenciesx10 rounded to an int, going |
64
|
|
|
|
|
|
|
from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed |
65
|
|
|
|
|
|
|
representation of a matrix and it is quite useful when working with relational |
66
|
|
|
|
|
|
|
DB. If arrays are provided as an input (references to arrays actually) they can |
67
|
|
|
|
|
|
|
be any number, real or integer (frequency or count). |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
When creating the object you can ask the constructor to make a simple pseudo |
70
|
|
|
|
|
|
|
count correction by adding a number (typically 1) to all positions (with the |
71
|
|
|
|
|
|
|
-correction option). After adding the number the frequencies will be |
72
|
|
|
|
|
|
|
calculated. Only use correction when you supply counts, not frequencies. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Throws an exception if: You mix as an input array and string (for example A |
75
|
|
|
|
|
|
|
matrix is given as array, C - as string). The position vector is (0,0,0,0). One |
76
|
|
|
|
|
|
|
of the probability vectors is shorter than the rest. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
Summary of the methods I use most frequently (details below): |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
iupac - return IUPAC compliant consensus as a string |
81
|
|
|
|
|
|
|
score - Returns the score as a real number |
82
|
|
|
|
|
|
|
IC - information content. Returns a real number |
83
|
|
|
|
|
|
|
id - identifier. Returns a string |
84
|
|
|
|
|
|
|
accession - accession number. Returns a string |
85
|
|
|
|
|
|
|
next_pos - return the sequence probably for each letter, IUPAC |
86
|
|
|
|
|
|
|
symbol, IUPAC probability and simple sequence |
87
|
|
|
|
|
|
|
consenus letter for this position. Rewind at the end. Returns a hash. |
88
|
|
|
|
|
|
|
pos - current position get/set. Returns an integer. |
89
|
|
|
|
|
|
|
regexp - construct a regular expression based on IUPAC consensus. |
90
|
|
|
|
|
|
|
For example AGWV will be [Aa][Gg][AaTt][AaCcGg] |
91
|
|
|
|
|
|
|
width - site width |
92
|
|
|
|
|
|
|
get_string - gets the probability vector for a single base as a string. |
93
|
|
|
|
|
|
|
get_array - gets the probability vector for a single base as an array. |
94
|
|
|
|
|
|
|
get_logs_array - gets the log-odds vector for a single base as an array. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
New methods, which might be of interest to anyone who wants to store |
97
|
|
|
|
|
|
|
PSM in a relational database without creating an entry for each |
98
|
|
|
|
|
|
|
position is the ability to compress the PSM vector into a string with |
99
|
|
|
|
|
|
|
losing usually less than 1% of the data. this can be done with: |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
my $str=$matrix->get_compressed_freq('A'); |
102
|
|
|
|
|
|
|
or |
103
|
|
|
|
|
|
|
my $str=$matrix->get_compressed_logs('A'); |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
Loading from a database should be done with new, but is not yest implemented. |
106
|
|
|
|
|
|
|
However you can still uncompress such string with: |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM |
109
|
|
|
|
|
|
|
or |
110
|
|
|
|
|
|
|
my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head1 FEEDBACK |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head2 Mailing Lists |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
117
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
118
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
121
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 Support |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
I |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
130
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
131
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
132
|
|
|
|
|
|
|
with code and data examples if at all possible. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 Reporting Bugs |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
137
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
138
|
|
|
|
|
|
|
web: |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head1 AUTHOR - Stefan Kirov |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Email skirov@utk.edu |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Sendu Bala, bix@sendu.me.uk |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head1 APPENDIX |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
153
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=cut |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Let the code begin... |
158
|
|
|
|
|
|
|
package Bio::Matrix::PSM::SiteMatrix; |
159
|
5
|
|
|
5
|
|
471
|
use strict; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
139
|
|
160
|
|
|
|
|
|
|
|
161
|
5
|
|
|
5
|
|
46
|
use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI); |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
1509
|
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head2 new |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Title : new |
166
|
|
|
|
|
|
|
Usage : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c, |
167
|
|
|
|
|
|
|
-pG=>$g,-pT=>$t, |
168
|
|
|
|
|
|
|
-IC=>$ic, |
169
|
|
|
|
|
|
|
-e_val=>$score, |
170
|
|
|
|
|
|
|
-id=>$mid); |
171
|
|
|
|
|
|
|
Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory |
172
|
|
|
|
|
|
|
Throws : If inconsistent data for all vectors (A,C,G and T) is |
173
|
|
|
|
|
|
|
provided, if you mix input types (string vs array) or if a |
174
|
|
|
|
|
|
|
position freq is 0. |
175
|
|
|
|
|
|
|
Returns : Bio::Matrix::PSM::SiteMatrix object |
176
|
|
|
|
|
|
|
Args : -pA => vector with the frequencies or counts of A |
177
|
|
|
|
|
|
|
-pC => vector for C |
178
|
|
|
|
|
|
|
-pG => vector for G |
179
|
|
|
|
|
|
|
-pt => vector for T |
180
|
|
|
|
|
|
|
-lA => vector for the log of A |
181
|
|
|
|
|
|
|
-lC => vector for the log of C |
182
|
|
|
|
|
|
|
-lG => vector for the log of G |
183
|
|
|
|
|
|
|
-lT => vector for the log of T |
184
|
|
|
|
|
|
|
-IC => real number, the information content of this matrix |
185
|
|
|
|
|
|
|
-e_val => real number, the expect value |
186
|
|
|
|
|
|
|
-id => string, an identifier |
187
|
|
|
|
|
|
|
-width => int, width of the matrix in nucleotides |
188
|
|
|
|
|
|
|
-sites => int, the number of sites that went into this matrix |
189
|
|
|
|
|
|
|
-model => hash ref, background frequencies for A, C, G and T |
190
|
|
|
|
|
|
|
-correction => number, the number to add to all positions to achieve |
191
|
|
|
|
|
|
|
pseudo count correction (default 0: no correction) |
192
|
|
|
|
|
|
|
NB: do not use correction when your input is |
193
|
|
|
|
|
|
|
frequences! |
194
|
|
|
|
|
|
|
-accession_number => string, an accession number |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Vectors can be strings of the frequencies where the frequencies are |
197
|
|
|
|
|
|
|
multiplied by 10 and rounded to the nearest whole number, and where |
198
|
|
|
|
|
|
|
'a' is used to denote the maximal frequency 10. There should be no |
199
|
|
|
|
|
|
|
punctuation (spaces etc.) in the string. For example, 'a0501'. |
200
|
|
|
|
|
|
|
Alternatively frequencies or counts can be represented by an array |
201
|
|
|
|
|
|
|
ref containing the counts, frequencies or logs as any kind of |
202
|
|
|
|
|
|
|
number. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=cut |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub new { |
207
|
14
|
|
|
14
|
1
|
140
|
my ($class, @args) = @_; |
208
|
14
|
|
|
|
|
57
|
my $self = $class->SUPER::new(@args); |
209
|
14
|
|
|
|
|
26
|
my $consensus; |
210
|
|
|
|
|
|
|
# Too many things to rearrange, and I am creating simultanuously >500 |
211
|
|
|
|
|
|
|
# such objects routinely, so this becomes performance issue |
212
|
|
|
|
|
|
|
my %input; |
213
|
14
|
|
|
|
|
28
|
while (@args) { |
214
|
114
|
|
|
|
|
204
|
(my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)! |
215
|
114
|
|
|
|
|
221
|
$input{$key} = shift @args; |
216
|
|
|
|
|
|
|
} |
217
|
14
|
|
|
|
|
22
|
$self->{_position} = 0; |
218
|
14
|
|
|
|
|
27
|
$self->{IC} = $input{IC}; |
219
|
14
|
|
|
|
|
30
|
$self->{e_val} = $input{e_val}; |
220
|
14
|
|
|
|
|
17
|
$self->{width} = $input{width}; |
221
|
14
|
|
|
|
|
19
|
$self->{logA} = $input{lA}; |
222
|
14
|
|
|
|
|
21
|
$self->{logC} = $input{lC}; |
223
|
14
|
|
|
|
|
28
|
$self->{logG} = $input{lG}; |
224
|
14
|
|
|
|
|
17
|
$self->{logT} = $input{lT}; |
225
|
14
|
|
|
|
|
19
|
$self->{sites} = $input{sites}; |
226
|
14
|
|
100
|
|
|
33
|
$self->{id} = $input{id} || 'null'; |
227
|
14
|
|
50
|
|
|
41
|
$self->{correction} = $input{correction} || 0; |
228
|
14
|
|
|
|
|
21
|
$self->{accession_number} = $input{accession_number}; |
229
|
14
|
50
|
66
|
|
|
89
|
return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT})); |
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# This should go to _initialize? |
232
|
|
|
|
|
|
|
# Check for input type- no mixing alllowed, throw ex |
233
|
12
|
100
|
|
|
|
50
|
if (ref($input{pA}) =~ /ARRAY/i ) { |
234
|
11
|
50
|
|
|
|
23
|
$self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC})); |
235
|
11
|
50
|
|
|
|
21
|
$self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG})); |
236
|
11
|
50
|
|
|
|
16
|
$self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT})); |
237
|
11
|
|
|
|
|
15
|
$self->{probA} = $input{pA}; |
238
|
11
|
|
|
|
|
19
|
$self->{probC} = $input{pC}; |
239
|
11
|
|
|
|
|
21
|
$self->{probG} = $input{pG}; |
240
|
11
|
|
|
|
|
16
|
$self->{probT} = $input{pT}; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
else { |
243
|
1
|
50
|
|
|
|
3
|
$self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC})); |
244
|
1
|
50
|
|
|
|
2
|
$self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG})); |
245
|
1
|
50
|
|
|
|
2
|
$self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT})); |
246
|
1
|
|
|
|
|
5
|
$self->{probA} = [split(//,$input{pA})]; |
247
|
1
|
|
|
|
|
4
|
$self->{probC} = [split(//,$input{pC})]; |
248
|
1
|
|
|
|
|
5
|
$self->{probG} = [split(//,$input{pG})]; |
249
|
1
|
|
|
|
|
3
|
$self->{probT} = [split(//,$input{pT})]; |
250
|
1
|
|
|
|
|
3
|
for (my $i=0; $i<= @{$self->{probA}}+1; $i++) { |
|
8
|
|
|
|
|
16
|
|
251
|
|
|
|
|
|
|
# we implictely assume these are MEME-style frequencies x 10, so |
252
|
|
|
|
|
|
|
# 'a' represents the 'maximum', 10. Other positions can actually |
253
|
|
|
|
|
|
|
# add up to over 10 due to rounding, but I don't think that is a |
254
|
|
|
|
|
|
|
# problem? |
255
|
7
|
100
|
100
|
|
|
5
|
if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') { |
|
7
|
|
|
|
|
12
|
|
|
3
|
|
|
|
|
7
|
|
256
|
1
|
|
|
|
|
2
|
${$self->{probA}}[$i]='10'; |
|
1
|
|
|
|
|
2
|
|
257
|
|
|
|
|
|
|
} |
258
|
7
|
100
|
100
|
|
|
7
|
if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') { |
|
7
|
|
|
|
|
11
|
|
|
3
|
|
|
|
|
7
|
|
259
|
1
|
|
|
|
|
1
|
${$self->{probC}}[$i]='10'; |
|
1
|
|
|
|
|
2
|
|
260
|
|
|
|
|
|
|
} |
261
|
7
|
50
|
66
|
|
|
8
|
if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') { |
|
7
|
|
|
|
|
13
|
|
|
3
|
|
|
|
|
7
|
|
262
|
0
|
|
|
|
|
0
|
${$self->{probG}}[$i]='10'; |
|
0
|
|
|
|
|
0
|
|
263
|
|
|
|
|
|
|
} |
264
|
7
|
50
|
66
|
|
|
7
|
if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') { |
|
7
|
|
|
|
|
12
|
|
|
2
|
|
|
|
|
4
|
|
265
|
0
|
|
|
|
|
0
|
${$self->{probT}}[$i]='10'; |
|
0
|
|
|
|
|
0
|
|
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# Check for position with 0 for all bases, throw exception if so |
271
|
12
|
|
|
|
|
19
|
for (my $i=0;$i <= $#{$self->{probA}}; $i++) { |
|
279
|
|
|
|
|
376
|
|
272
|
267
|
50
|
|
|
|
238
|
if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) { |
|
267
|
|
|
|
|
269
|
|
|
267
|
|
|
|
|
345
|
|
|
267
|
|
|
|
|
301
|
|
|
267
|
|
|
|
|
392
|
|
273
|
0
|
|
|
|
|
0
|
$self->throw("Position meaningless-all frequencies are 0"); |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# apply pseudo-count correction to all values - this will result in |
277
|
|
|
|
|
|
|
# very bad frequencies if the input is already frequences and a |
278
|
|
|
|
|
|
|
# correction value as large as 1 is used! |
279
|
267
|
50
|
|
|
|
362
|
if ($self->{correction}) { |
280
|
0
|
|
|
|
|
0
|
${$self->{probA}}[$i] += $self->{correction}; |
|
0
|
|
|
|
|
0
|
|
281
|
0
|
|
|
|
|
0
|
${$self->{probC}}[$i] += $self->{correction}; |
|
0
|
|
|
|
|
0
|
|
282
|
0
|
|
|
|
|
0
|
${$self->{probG}}[$i] += $self->{correction}; |
|
0
|
|
|
|
|
0
|
|
283
|
0
|
|
|
|
|
0
|
${$self->{probT}}[$i] += $self->{correction}; |
|
0
|
|
|
|
|
0
|
|
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# (re)calculate frequencies |
287
|
267
|
|
|
|
|
233
|
my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]; |
|
267
|
|
|
|
|
259
|
|
|
267
|
|
|
|
|
264
|
|
|
267
|
|
|
|
|
271
|
|
|
267
|
|
|
|
|
285
|
|
288
|
267
|
|
|
|
|
242
|
${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div; |
|
267
|
|
|
|
|
260
|
|
|
267
|
|
|
|
|
303
|
|
289
|
267
|
|
|
|
|
232
|
${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div; |
|
267
|
|
|
|
|
254
|
|
|
267
|
|
|
|
|
287
|
|
290
|
267
|
|
|
|
|
245
|
${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div; |
|
267
|
|
|
|
|
262
|
|
|
267
|
|
|
|
|
269
|
|
291
|
267
|
|
|
|
|
227
|
${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div; |
|
267
|
|
|
|
|
309
|
|
|
267
|
|
|
|
|
262
|
|
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# Calculate the logs |
295
|
12
|
50
|
66
|
|
|
45
|
if ((!defined($self->{logA})) && ($input{model})) { |
296
|
0
|
|
|
|
|
0
|
$self->calc_weight($input{model}); |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# Make consensus, throw if any one of the vectors is shorter |
300
|
12
|
|
|
|
|
43
|
$self->_calculate_consensus; |
301
|
12
|
|
|
|
|
104
|
return $self; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=head2 _calculate_consensus |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
Title : _calculate_consensus |
307
|
|
|
|
|
|
|
Function: Internal stuff |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=cut |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
sub _calculate_consensus { |
312
|
12
|
|
|
12
|
|
15
|
my $self=shift; |
313
|
12
|
|
|
|
|
16
|
my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}}); |
|
12
|
|
|
|
|
25
|
|
|
12
|
|
|
|
|
20
|
|
|
12
|
|
|
|
|
23
|
|
314
|
12
|
|
|
|
|
16
|
my $len=$#{$self->{probA}}; |
|
12
|
|
|
|
|
17
|
|
315
|
12
|
50
|
|
|
|
29
|
$self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc); |
316
|
12
|
50
|
|
|
|
24
|
$self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt); |
317
|
12
|
50
|
|
|
|
19
|
$self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg); |
318
|
12
|
|
|
|
|
30
|
for (my $i=0; $i<$len+1; $i++) { |
319
|
|
|
|
|
|
|
#*** IUPACp values not actually used (eg. by next_pos) |
320
|
267
|
|
|
|
|
269
|
(${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]); |
|
267
|
|
|
|
|
415
|
|
|
267
|
|
|
|
|
420
|
|
|
267
|
|
|
|
|
295
|
|
|
267
|
|
|
|
|
278
|
|
|
267
|
|
|
|
|
270
|
|
|
267
|
|
|
|
|
333
|
|
321
|
267
|
|
|
|
|
320
|
(${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]); |
|
267
|
|
|
|
|
384
|
|
|
267
|
|
|
|
|
604
|
|
|
267
|
|
|
|
|
291
|
|
|
267
|
|
|
|
|
256
|
|
|
267
|
|
|
|
|
265
|
|
|
267
|
|
|
|
|
343
|
|
322
|
|
|
|
|
|
|
} |
323
|
12
|
|
|
|
|
19
|
return $self; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head2 calc_weight |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Title : calc_weight |
329
|
|
|
|
|
|
|
Usage : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568}); |
330
|
|
|
|
|
|
|
Function: Recalculates the PSM (or weights) based on the PFM (the frequency |
331
|
|
|
|
|
|
|
matrix) and user supplied background model. |
332
|
|
|
|
|
|
|
Throws : if no model is supplied |
333
|
|
|
|
|
|
|
Returns : n/a |
334
|
|
|
|
|
|
|
Args : reference to a hash with background frequencies for A,C,G and T |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=cut |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
sub calc_weight { |
339
|
0
|
|
|
0
|
1
|
0
|
my ($self, $model) = @_; |
340
|
0
|
|
|
|
|
0
|
my %model; |
341
|
0
|
|
|
|
|
0
|
$model{probA}=$model->{A}; |
342
|
0
|
|
|
|
|
0
|
$model{probC}=$model->{C}; |
343
|
0
|
|
|
|
|
0
|
$model{probG}=$model->{G}; |
344
|
0
|
|
|
|
|
0
|
$model{probT}=$model->{T}; |
345
|
0
|
|
|
|
|
0
|
foreach my $let (qw(probA probC probG probT)) { |
346
|
0
|
|
|
|
|
0
|
my @str; |
347
|
0
|
0
|
0
|
|
|
0
|
$self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1)); |
348
|
0
|
|
|
|
|
0
|
foreach my $f (@{$self->{$let}}) { |
|
0
|
|
|
|
|
0
|
|
349
|
0
|
|
|
|
|
0
|
my $w=log($f)-log($model{$let}); |
350
|
0
|
|
|
|
|
0
|
push @str,$w; |
351
|
|
|
|
|
|
|
} |
352
|
0
|
|
|
|
|
0
|
my $llet=$let; |
353
|
0
|
|
|
|
|
0
|
$llet=~s/prob/log/; |
354
|
0
|
|
|
|
|
0
|
$self->{$llet}=\@str; |
355
|
|
|
|
|
|
|
} |
356
|
0
|
|
|
|
|
0
|
return $self; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=head2 next_pos |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
Title : next_pos |
362
|
|
|
|
|
|
|
Usage : |
363
|
|
|
|
|
|
|
Function: Retrieves the next position features: frequencies for A,C,G,T, the |
364
|
|
|
|
|
|
|
main letter (as in consensus) and the probabilty for this letter to |
365
|
|
|
|
|
|
|
occur at this position and the current position |
366
|
|
|
|
|
|
|
Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel) |
367
|
|
|
|
|
|
|
Args : none |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub next_pos { |
372
|
104
|
|
|
104
|
1
|
101
|
my $self = shift; |
373
|
104
|
50
|
|
|
|
157
|
$self->throw("instance method called on class") unless ref $self; |
374
|
104
|
|
|
|
|
85
|
my $len=@{$self->{seq}}; |
|
104
|
|
|
|
|
123
|
|
375
|
104
|
|
|
|
|
98
|
my $pos=$self->{_position}; |
376
|
|
|
|
|
|
|
# End reached? |
377
|
104
|
100
|
|
|
|
126
|
if ($pos<$len) { |
378
|
101
|
|
|
|
|
79
|
my $pA=${$self->{probA}}[$pos]; |
|
101
|
|
|
|
|
125
|
|
379
|
101
|
|
|
|
|
88
|
my $pC=${$self->{probC}}[$pos]; |
|
101
|
|
|
|
|
105
|
|
380
|
101
|
|
|
|
|
93
|
my $pG=${$self->{probG}}[$pos]; |
|
101
|
|
|
|
|
127
|
|
381
|
101
|
|
|
|
|
84
|
my $pT=${$self->{probT}}[$pos]; |
|
101
|
|
|
|
|
100
|
|
382
|
101
|
|
|
|
|
94
|
my $lA=${$self->{logA}}[$pos]; |
|
101
|
|
|
|
|
105
|
|
383
|
101
|
|
|
|
|
88
|
my $lC=${$self->{logC}}[$pos]; |
|
101
|
|
|
|
|
101
|
|
384
|
101
|
|
|
|
|
88
|
my $lG=${$self->{logG}}[$pos]; |
|
101
|
|
|
|
|
102
|
|
385
|
101
|
|
|
|
|
108
|
my $lT=${$self->{logT}}[$pos]; |
|
101
|
|
|
|
|
109
|
|
386
|
101
|
|
|
|
|
85
|
my $base=${$self->{seq}}[$pos]; |
|
101
|
|
|
|
|
102
|
|
387
|
101
|
|
|
|
|
84
|
my $prob=${$self->{seqp}}[$pos]; |
|
101
|
|
|
|
|
98
|
|
388
|
101
|
|
|
|
|
92
|
$self->{_position}++; |
389
|
101
|
|
|
|
|
330
|
my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob); |
390
|
101
|
|
|
|
|
573
|
return %seq; |
391
|
|
|
|
|
|
|
} |
392
|
3
|
|
|
|
|
4
|
else {$self->{_position}=0; return;} |
|
3
|
|
|
|
|
8
|
|
393
|
|
|
|
|
|
|
} |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=head2 curpos |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Title : curpos |
398
|
|
|
|
|
|
|
Usage : |
399
|
|
|
|
|
|
|
Function: Gets/sets the current position. Converts to 0 if argument is minus |
400
|
|
|
|
|
|
|
and to width if greater than width |
401
|
|
|
|
|
|
|
Returns : integer |
402
|
|
|
|
|
|
|
Args : integer |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=cut |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
sub curpos { |
407
|
2
|
|
|
2
|
1
|
264
|
my $self = shift; |
408
|
2
|
|
|
|
|
3
|
my $prev = $self->{_position}; |
409
|
2
|
50
|
|
|
|
6
|
if (@_) { $self->{_position} = shift; } |
|
0
|
|
|
|
|
0
|
|
410
|
2
|
|
|
|
|
6
|
return $prev; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=head2 e_val |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
Title : e_val |
416
|
|
|
|
|
|
|
Usage : |
417
|
|
|
|
|
|
|
Function: Gets/sets the e-value |
418
|
|
|
|
|
|
|
Returns : real number |
419
|
|
|
|
|
|
|
Args : none to get, real number to set |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=cut |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub e_val { |
424
|
3
|
|
|
3
|
1
|
6
|
my $self = shift; |
425
|
3
|
|
|
|
|
5
|
my $prev = $self->{e_val}; |
426
|
3
|
100
|
|
|
|
7
|
if (@_) { $self->{e_val} = shift; } |
|
1
|
|
|
|
|
2
|
|
427
|
3
|
|
|
|
|
8
|
return $prev; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head2 IC |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
Title : IC |
433
|
|
|
|
|
|
|
Usage : |
434
|
|
|
|
|
|
|
Function: Get/set the Information Content |
435
|
|
|
|
|
|
|
Returns : real number |
436
|
|
|
|
|
|
|
Args : none to get, real number to set |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=cut |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
sub IC { |
441
|
1
|
|
|
1
|
1
|
1
|
my $self = shift; |
442
|
1
|
|
|
|
|
2
|
my $prev = $self->{IC}; |
443
|
1
|
50
|
|
|
|
3
|
if (@_) { $self->{IC} = shift; } |
|
0
|
|
|
|
|
0
|
|
444
|
1
|
|
|
|
|
2
|
return $prev; |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=head2 accession_number |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
Title : accession_number |
450
|
|
|
|
|
|
|
Function: Get/set the accession number, this will be unique id for the |
451
|
|
|
|
|
|
|
SiteMatrix object as well for any other object, inheriting from |
452
|
|
|
|
|
|
|
SiteMatrix |
453
|
|
|
|
|
|
|
Returns : string |
454
|
|
|
|
|
|
|
Args : none to get, string to set |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=cut |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
sub accession_number { |
459
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
460
|
0
|
|
|
|
|
0
|
my $prev = $self->{accession_number}; |
461
|
0
|
0
|
|
|
|
0
|
if (@_) { $self->{accession_number} = shift; } |
|
0
|
|
|
|
|
0
|
|
462
|
0
|
|
|
|
|
0
|
return $prev; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=head2 consensus |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
Title : consensus |
468
|
|
|
|
|
|
|
Usage : |
469
|
|
|
|
|
|
|
Function: Returns the consensus |
470
|
|
|
|
|
|
|
Returns : string |
471
|
|
|
|
|
|
|
Args : (optional) threshold value 1 to 10, default 5 |
472
|
|
|
|
|
|
|
'5' means the returned characters had a 50% or higher presence at |
473
|
|
|
|
|
|
|
their position |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=cut |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
sub consensus { |
478
|
8
|
|
|
8
|
1
|
1042
|
my ($self, $thresh) = @_; |
479
|
8
|
50
|
|
|
|
21
|
if ($thresh) { |
480
|
0
|
|
|
|
|
0
|
my $len=$#{$self->{probA}}; |
|
0
|
|
|
|
|
0
|
|
481
|
0
|
|
|
|
|
0
|
for (my $i=0; $i<$len+1; $i++) { |
482
|
0
|
|
|
|
|
0
|
(${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
483
|
|
|
|
|
|
|
} |
484
|
|
|
|
|
|
|
} |
485
|
8
|
|
|
|
|
12
|
my $consensus=''; |
486
|
8
|
|
|
|
|
10
|
foreach my $letter (@{$self->{seq}}) { |
|
8
|
|
|
|
|
17
|
|
487
|
180
|
|
|
|
|
177
|
$consensus .= $letter; |
488
|
|
|
|
|
|
|
} |
489
|
8
|
|
|
|
|
24
|
return $consensus; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 width |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
Title : width |
495
|
|
|
|
|
|
|
Usage : |
496
|
|
|
|
|
|
|
Function: Returns the length of the sites in used to make this matrix |
497
|
|
|
|
|
|
|
Returns : int |
498
|
|
|
|
|
|
|
Args : none |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=cut |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub width { |
503
|
2
|
|
|
2
|
1
|
4
|
my $self = shift; |
504
|
2
|
|
|
|
|
3
|
my $width=@{$self->{probA}}; |
|
2
|
|
|
|
|
4
|
|
505
|
2
|
|
|
|
|
6
|
return $width; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head2 sites |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Title : sites |
511
|
|
|
|
|
|
|
Usage : |
512
|
|
|
|
|
|
|
Function: Get/set the number of sites that were used to make this matrix |
513
|
|
|
|
|
|
|
Returns : int |
514
|
|
|
|
|
|
|
Args : none to get, int to set |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=cut |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
sub sites { |
519
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
520
|
0
|
0
|
|
|
|
0
|
if (@_) { $self->{sites} = shift } |
|
0
|
|
|
|
|
0
|
|
521
|
0
|
|
0
|
|
|
0
|
return $self->{sites} || return; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
=head2 IUPAC |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
Title : IUPAC |
527
|
|
|
|
|
|
|
Usage : |
528
|
|
|
|
|
|
|
Function: Returns IUPAC compliant consensus |
529
|
|
|
|
|
|
|
Returns : string |
530
|
|
|
|
|
|
|
Args : optionally, also supply a whole number (int) of 1 or higher to set |
531
|
|
|
|
|
|
|
the significance level when considering the frequencies. 1 (the |
532
|
|
|
|
|
|
|
default) means a 0.05 significance level: frequencies lower than |
533
|
|
|
|
|
|
|
0.05 will be ignored. 2 Means a 0.005 level, and so on. |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=cut |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
sub IUPAC { |
538
|
5
|
|
|
5
|
1
|
2624
|
my ($self, $thresh) = @_; |
539
|
5
|
50
|
|
|
|
12
|
if ($thresh) { |
540
|
0
|
|
|
|
|
0
|
my $len=$#{$self->{probA}}; |
|
0
|
|
|
|
|
0
|
|
541
|
0
|
|
|
|
|
0
|
for (my $i=0; $i<$len+1; $i++) { |
542
|
0
|
|
|
|
|
0
|
(${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
543
|
|
|
|
|
|
|
} |
544
|
|
|
|
|
|
|
} |
545
|
5
|
|
|
|
|
11
|
my $iu=$self->{IUPAC}; |
546
|
5
|
|
|
|
|
7
|
my $iupac=''; |
547
|
5
|
|
|
|
|
5
|
foreach my $let (@{$iu}) { |
|
5
|
|
|
|
|
8
|
|
548
|
92
|
|
|
|
|
91
|
$iupac .= $let; |
549
|
|
|
|
|
|
|
} |
550
|
5
|
|
|
|
|
18
|
return $iupac; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=head2 _to_IUPAC |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
Title : _to_IUPAC |
556
|
|
|
|
|
|
|
Usage : |
557
|
|
|
|
|
|
|
Function: Converts a single position to IUPAC compliant symbol. |
558
|
|
|
|
|
|
|
For rules see the implementation |
559
|
|
|
|
|
|
|
Returns : char, real number |
560
|
|
|
|
|
|
|
Args : real numbers for frequencies of A,C,G,T (positional) |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
optionally, also supply a whole number (int) of 1 or higher to set |
563
|
|
|
|
|
|
|
the significance level when considering the frequencies. 1 (the |
564
|
|
|
|
|
|
|
default) means a 0.05 significance level: frequencies lower than |
565
|
|
|
|
|
|
|
0.05 will be ignored. 2 Means a 0.005 level, and so on. |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=cut |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
sub _to_IUPAC { |
570
|
267
|
|
|
267
|
|
343
|
my ($a, $c, $g, $t, $thresh) = @_; |
571
|
267
|
|
50
|
|
|
623
|
$thresh ||= 1; |
572
|
267
|
|
|
|
|
239
|
$thresh = int($thresh); |
573
|
267
|
|
|
|
|
860
|
$a = sprintf ("%.${thresh}f", $a); |
574
|
267
|
|
|
|
|
551
|
$c = sprintf ("%.${thresh}f", $c); |
575
|
267
|
|
|
|
|
499
|
$g = sprintf ("%.${thresh}f", $g); |
576
|
267
|
|
|
|
|
494
|
$t = sprintf ("%.${thresh}f", $t); |
577
|
|
|
|
|
|
|
|
578
|
267
|
|
|
|
|
568
|
my $total = $a + $c + $g + $t; |
579
|
|
|
|
|
|
|
|
580
|
267
|
100
|
|
|
|
430
|
return 'A' if ($a == $total); |
581
|
199
|
100
|
|
|
|
258
|
return 'G' if ($g == $total); |
582
|
190
|
100
|
|
|
|
278
|
return 'C' if ($c == $total); |
583
|
108
|
100
|
|
|
|
178
|
return 'T' if ($t == $total); |
584
|
93
|
|
|
|
|
98
|
my $r=$g+$a; |
585
|
93
|
50
|
|
|
|
120
|
return 'R' if ($r == $total); |
586
|
93
|
|
|
|
|
91
|
my $y=$t+$c; |
587
|
93
|
100
|
|
|
|
138
|
return 'Y' if ($y == $total); |
588
|
86
|
|
|
|
|
89
|
my $m=$a+$c; |
589
|
86
|
100
|
|
|
|
135
|
return 'M' if ($m == $total); |
590
|
62
|
|
|
|
|
63
|
my $k=$g+$t; |
591
|
62
|
100
|
|
|
|
97
|
return 'K' if ($k == $total); |
592
|
49
|
|
|
|
|
62
|
my $s=$g+$c; |
593
|
49
|
100
|
|
|
|
72
|
return 'S' if ($s == $total); |
594
|
42
|
|
|
|
|
46
|
my $w=$a+$t; |
595
|
42
|
100
|
|
|
|
70
|
return 'W' if ($w == $total); |
596
|
20
|
|
|
|
|
22
|
my $d=$r+$t; |
597
|
20
|
100
|
|
|
|
36
|
return 'D' if ($d == $total); |
598
|
17
|
|
|
|
|
21
|
my $v=$r+$c; |
599
|
17
|
100
|
|
|
|
34
|
return 'V' if ($v == $total); |
600
|
9
|
|
|
|
|
13
|
my $b=$y+$g; |
601
|
9
|
100
|
|
|
|
19
|
return 'B' if ($b == $total); |
602
|
7
|
|
|
|
|
7
|
my $h=$y+$a; |
603
|
7
|
100
|
|
|
|
16
|
return 'H' if ($h == $total); |
604
|
1
|
|
|
|
|
1
|
return 'N'; |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=head2 _to_cons |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
Title : _to_cons |
610
|
|
|
|
|
|
|
Usage : |
611
|
|
|
|
|
|
|
Function: Converts a single position to simple consensus character and returns |
612
|
|
|
|
|
|
|
its probability. For rules see the implementation |
613
|
|
|
|
|
|
|
Returns : char, real number |
614
|
|
|
|
|
|
|
Args : real numbers for A,C,G,T (positional), and optional 5th argument of |
615
|
|
|
|
|
|
|
threshold (as a number between 1 and 10, where 5 is default and |
616
|
|
|
|
|
|
|
means the returned character had a 50% or higher presence at this |
617
|
|
|
|
|
|
|
position) |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
=cut |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
sub _to_cons { |
622
|
267
|
|
|
267
|
|
375
|
my ($A, $C, $G, $T, $thresh) = @_; |
623
|
267
|
|
50
|
|
|
610
|
$thresh ||= 5; |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
# this multiplication by 10 is just to satisfy the thresh range of 1-10 |
626
|
267
|
|
|
|
|
282
|
my $a = $A * 10; |
627
|
267
|
|
|
|
|
242
|
my $c = $C * 10; |
628
|
267
|
|
|
|
|
240
|
my $g = $G * 10; |
629
|
267
|
|
|
|
|
229
|
my $t = $T * 10; |
630
|
|
|
|
|
|
|
|
631
|
267
|
100
|
100
|
|
|
669
|
return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh)); |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
632
|
244
|
50
|
100
|
|
|
409
|
return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g)); |
|
|
|
66
|
|
|
|
|
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# threshold could be lower than 50%, so must check is not only over |
635
|
|
|
|
|
|
|
# threshold, but also the highest frequency |
636
|
244
|
50
|
100
|
|
|
642
|
return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g)); |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
637
|
146
|
50
|
100
|
|
|
518
|
return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g)); |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
638
|
45
|
50
|
66
|
|
|
149
|
return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a)); |
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
639
|
26
|
100
|
33
|
|
|
151
|
return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a)); |
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
640
|
|
|
|
|
|
|
|
641
|
2
|
|
|
|
|
4
|
return 'N',10; |
642
|
|
|
|
|
|
|
} |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
=head2 get_string |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
Title : get_string |
647
|
|
|
|
|
|
|
Usage : |
648
|
|
|
|
|
|
|
Function: Returns given probability vector as a string. Useful if you want to |
649
|
|
|
|
|
|
|
store things in a rel database, where arrays are not first choice |
650
|
|
|
|
|
|
|
Throws : If the argument is outside {A,C,G,T} |
651
|
|
|
|
|
|
|
Returns : string |
652
|
|
|
|
|
|
|
Args : character {A,C,G,T} |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=cut |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
sub get_string { |
657
|
1
|
|
|
1
|
1
|
2
|
my $self=shift; |
658
|
1
|
|
|
|
|
2
|
my $base=shift; |
659
|
1
|
|
|
|
|
1
|
my $string=''; |
660
|
1
|
|
|
|
|
2
|
my @prob; |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
BASE: { |
663
|
1
|
50
|
|
|
|
1
|
if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; } |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
2
|
|
664
|
0
|
0
|
|
|
|
0
|
if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
665
|
0
|
0
|
|
|
|
0
|
if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
666
|
0
|
0
|
|
|
|
0
|
if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
667
|
0
|
|
|
|
|
0
|
$self->throw ("No such base: $base!\n"); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
|
670
|
1
|
|
|
|
|
3
|
foreach my $prob (@prob) { |
671
|
5
|
|
|
|
|
7
|
my $corrected = $prob*10; |
672
|
5
|
|
|
|
|
8
|
my $next=sprintf("%.0f",$corrected); |
673
|
5
|
100
|
|
|
|
8
|
$next='a' if ($next eq '10'); |
674
|
5
|
|
|
|
|
6
|
$string .= $next; |
675
|
|
|
|
|
|
|
} |
676
|
1
|
|
|
|
|
4
|
return $string; |
677
|
|
|
|
|
|
|
} |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
=head2 get_array |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
Title : get_array |
682
|
|
|
|
|
|
|
Usage : |
683
|
|
|
|
|
|
|
Function: Returns an array with frequencies for a specified base |
684
|
|
|
|
|
|
|
Returns : array |
685
|
|
|
|
|
|
|
Args : char |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=cut |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
sub get_array { |
690
|
3
|
|
|
3
|
1
|
10
|
my $self=shift; |
691
|
3
|
|
|
|
|
7
|
my $base=uc(shift); |
692
|
3
|
50
|
|
|
|
7
|
return @{$self->{probA}} if ($base eq 'A'); |
|
3
|
|
|
|
|
16
|
|
693
|
0
|
0
|
|
|
|
0
|
return @{$self->{probC}} if ($base eq 'C'); |
|
0
|
|
|
|
|
0
|
|
694
|
0
|
0
|
|
|
|
0
|
return @{$self->{probG}} if ($base eq 'G'); |
|
0
|
|
|
|
|
0
|
|
695
|
0
|
0
|
|
|
|
0
|
return @{$self->{probT}} if ($base eq 'T'); |
|
0
|
|
|
|
|
0
|
|
696
|
0
|
|
|
|
|
0
|
$self->throw("No such base: $base!\n"); |
697
|
|
|
|
|
|
|
} |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
=head2 get_logs_array |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
Title : get_logs_array |
702
|
|
|
|
|
|
|
Usage : |
703
|
|
|
|
|
|
|
Function: Returns an array with log_odds for a specified base |
704
|
|
|
|
|
|
|
Returns : array |
705
|
|
|
|
|
|
|
Args : char |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
=cut |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
sub get_logs_array { |
710
|
1
|
|
|
1
|
1
|
5
|
my $self=shift; |
711
|
1
|
|
|
|
|
2
|
my $base=uc(shift); |
712
|
1
|
50
|
33
|
|
|
5
|
return @{$self->{logA}} if (($base eq 'A') && ($self->{logA})); |
|
1
|
|
|
|
|
6
|
|
713
|
0
|
0
|
0
|
|
|
0
|
return @{$self->{logC}} if (($base eq 'C') && ($self->{logC})); |
|
0
|
|
|
|
|
0
|
|
714
|
0
|
0
|
0
|
|
|
0
|
return @{$self->{logG}} if (($base eq 'G') && ($self->{logG})); |
|
0
|
|
|
|
|
0
|
|
715
|
0
|
0
|
0
|
|
|
0
|
return @{$self->{logT}} if (($base eq 'T') && ($self->{logT})); |
|
0
|
|
|
|
|
0
|
|
716
|
0
|
0
|
|
|
|
0
|
$self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T))); |
717
|
0
|
|
|
|
|
0
|
return; |
718
|
|
|
|
|
|
|
} |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=head2 id |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
Title : id |
723
|
|
|
|
|
|
|
Usage : |
724
|
|
|
|
|
|
|
Function: Gets/sets the site id |
725
|
|
|
|
|
|
|
Returns : string |
726
|
|
|
|
|
|
|
Args : string |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=cut |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
sub id { |
731
|
12
|
|
|
12
|
1
|
30
|
my $self = shift; |
732
|
12
|
|
|
|
|
16
|
my $prev = $self->{id}; |
733
|
12
|
100
|
|
|
|
19
|
if (@_) { $self->{id} = shift; } |
|
2
|
|
|
|
|
2
|
|
734
|
12
|
|
|
|
|
29
|
return $prev; |
735
|
|
|
|
|
|
|
} |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
=head2 regexp |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
Title : regexp |
740
|
|
|
|
|
|
|
Usage : |
741
|
|
|
|
|
|
|
Function: Returns a regular expression which matches the IUPAC convention. |
742
|
|
|
|
|
|
|
N will match X, N, - and . |
743
|
|
|
|
|
|
|
Returns : string |
744
|
|
|
|
|
|
|
Args : none (works at the threshold last used for making the IUPAC string) |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
=cut |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
sub regexp { |
749
|
1
|
|
|
1
|
1
|
2
|
my $self=shift; |
750
|
1
|
|
|
|
|
1
|
my $regexp; |
751
|
1
|
|
|
|
|
2
|
foreach my $letter (@{$self->{IUPAC}}) { |
|
1
|
|
|
|
|
3
|
|
752
|
5
|
|
|
|
|
4
|
my $reg; |
753
|
|
|
|
|
|
|
LETTER: { |
754
|
5
|
100
|
|
|
|
4
|
if ($letter eq 'A') { $reg='[Aa]'; last LETTER; } |
|
5
|
|
|
|
|
10
|
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2
|
|
755
|
4
|
100
|
|
|
|
6
|
if ($letter eq 'C') { $reg='[Cc]'; last LETTER; } |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2
|
|
756
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'G') { $reg='[Gg]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
757
|
3
|
50
|
|
|
|
6
|
if ($letter eq 'T') { $reg='[Tt]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
758
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
759
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
760
|
3
|
50
|
|
|
|
5
|
if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
761
|
3
|
50
|
|
|
|
5
|
if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
762
|
3
|
50
|
|
|
|
11
|
if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
763
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
764
|
3
|
100
|
|
|
|
5
|
if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; } |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1
|
|
765
|
2
|
50
|
|
|
|
5
|
if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
766
|
2
|
100
|
|
|
|
3
|
if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; } |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2
|
|
767
|
1
|
50
|
|
|
|
2
|
if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; } |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2
|
|
768
|
0
|
|
|
|
|
0
|
$reg='\S'; |
769
|
|
|
|
|
|
|
} |
770
|
5
|
|
|
|
|
8
|
$regexp .= $reg; |
771
|
|
|
|
|
|
|
} |
772
|
1
|
|
|
|
|
3
|
return $regexp; |
773
|
|
|
|
|
|
|
} |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=head2 regexp_array |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
Title : regexp_array |
778
|
|
|
|
|
|
|
Usage : |
779
|
|
|
|
|
|
|
Function: Returns a regular expression which matches the IUPAC convention. |
780
|
|
|
|
|
|
|
N will match X, N, - and . |
781
|
|
|
|
|
|
|
Returns : array |
782
|
|
|
|
|
|
|
Args : none (works at the threshold last used for making the IUPAC string) |
783
|
|
|
|
|
|
|
To do : I have separated regexp and regexp_array, but |
784
|
|
|
|
|
|
|
maybe they can be rewritten as one - just check what should be returned |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
=cut |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
sub regexp_array { |
789
|
1
|
|
|
1
|
1
|
2
|
my $self=shift; |
790
|
1
|
|
|
|
|
2
|
my @regexp; |
791
|
1
|
|
|
|
|
2
|
foreach my $letter (@{$self->{IUPAC}}) { |
|
1
|
|
|
|
|
2
|
|
792
|
5
|
|
|
|
|
6
|
my $reg; |
793
|
|
|
|
|
|
|
LETTER: { |
794
|
5
|
100
|
|
|
|
5
|
if ($letter eq 'A') { $reg='[Aa]'; last LETTER; } |
|
5
|
|
|
|
|
8
|
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2
|
|
795
|
4
|
100
|
|
|
|
5
|
if ($letter eq 'C') { $reg='[Cc]'; last LETTER; } |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1
|
|
796
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'G') { $reg='[Gg]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
797
|
3
|
50
|
|
|
|
5
|
if ($letter eq 'T') { $reg='[Tt]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
798
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
799
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
800
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
801
|
3
|
50
|
|
|
|
4
|
if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
802
|
3
|
50
|
|
|
|
5
|
if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
803
|
3
|
50
|
|
|
|
5
|
if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
804
|
3
|
100
|
|
|
|
4
|
if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; } |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1
|
|
805
|
2
|
50
|
|
|
|
3
|
if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
806
|
2
|
100
|
|
|
|
4
|
if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; } |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1
|
|
807
|
1
|
50
|
|
|
|
2
|
if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; } |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1
|
|
808
|
0
|
|
|
|
|
0
|
$reg='\S'; |
809
|
|
|
|
|
|
|
} |
810
|
5
|
|
|
|
|
8
|
push @regexp,$reg; |
811
|
|
|
|
|
|
|
} |
812
|
1
|
|
|
|
|
4
|
return @regexp; |
813
|
|
|
|
|
|
|
} |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
=head2 _compress_array |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
Title : _compress_array |
819
|
|
|
|
|
|
|
Usage : |
820
|
|
|
|
|
|
|
Function: Will compress an array of real signed numbers to a string (ie vector |
821
|
|
|
|
|
|
|
of bytes) -127 to +127 for bi-directional(signed) and 0..255 for |
822
|
|
|
|
|
|
|
unsigned |
823
|
|
|
|
|
|
|
Returns : String |
824
|
|
|
|
|
|
|
Args : array reference, followed by an max value and direction (optional, |
825
|
|
|
|
|
|
|
default 1-unsigned),1 unsigned, any other is signed. |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
=cut |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
sub _compress_array { |
830
|
3
|
|
|
3
|
|
5
|
my ($array,$lm,$direct)=@_; |
831
|
3
|
|
|
|
|
2
|
my $str; |
832
|
3
|
50
|
33
|
|
|
12
|
return unless(($array) && ($lm)); |
833
|
3
|
50
|
|
|
|
3
|
$direct=1 unless ($direct); |
834
|
3
|
100
|
|
|
|
9
|
my $k1= ($direct==1) ? (255/$lm) : (127/$lm); |
835
|
3
|
|
|
|
|
3
|
foreach my $c (@{$array}) { |
|
3
|
|
|
|
|
6
|
|
836
|
62
|
50
|
|
|
|
77
|
$c=$lm if ($c>$lm); |
837
|
62
|
50
|
33
|
|
|
81
|
$c=-$lm if (($c<-$lm) && ($direct !=1)); |
838
|
62
|
50
|
66
|
|
|
96
|
$c=0 if (($c<0) && ($direct ==1)); |
839
|
62
|
|
|
|
|
62
|
my $byte=int($k1*$c); |
840
|
62
|
100
|
|
|
|
75
|
$byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits |
841
|
62
|
|
|
|
|
55
|
my $char=chr($byte); |
842
|
62
|
|
|
|
|
67
|
$str.=$char; |
843
|
|
|
|
|
|
|
} |
844
|
3
|
|
|
|
|
10
|
return $str; |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
=head2 _uncompress_string |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
Title : _uncompress_string |
850
|
|
|
|
|
|
|
Usage : |
851
|
|
|
|
|
|
|
Function: Will uncompress a string (vector of bytes) to create an array of |
852
|
|
|
|
|
|
|
real signed numbers (opposite to_compress_array) |
853
|
|
|
|
|
|
|
Returns : string, followed by an max value and |
854
|
|
|
|
|
|
|
direction (optional, default 1-unsigned), 1 unsigned, any other is signed. |
855
|
|
|
|
|
|
|
Args : array |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
=cut |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
sub _uncompress_string { |
860
|
3
|
|
|
3
|
|
14
|
my ($str,$lm,$direct)=@_; |
861
|
3
|
|
|
|
|
3
|
my @array; |
862
|
3
|
50
|
33
|
|
|
9
|
return unless(($str) && ($lm)); |
863
|
3
|
50
|
|
|
|
4
|
$direct=1 unless ($direct); |
864
|
3
|
100
|
|
|
|
6
|
my $k1= ($direct==1) ? (255/$lm) : (127/$lm); |
865
|
3
|
|
|
|
|
12
|
foreach my $c (split(//,$str)) { |
866
|
62
|
|
|
|
|
52
|
my $byte=ord($c); |
867
|
62
|
100
|
|
|
|
76
|
$byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits |
868
|
62
|
|
|
|
|
52
|
my $num=$byte/$k1; |
869
|
62
|
|
|
|
|
65
|
push @array,$num; |
870
|
|
|
|
|
|
|
} |
871
|
3
|
|
|
|
|
13
|
return @array; |
872
|
|
|
|
|
|
|
} |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=head2 get_compressed_freq |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
Title : get_compressed_freq |
877
|
|
|
|
|
|
|
Usage : |
878
|
|
|
|
|
|
|
Function: A method to provide a compressed frequency vector. It uses one byte |
879
|
|
|
|
|
|
|
to code the frequence for one of the probability vectors for one |
880
|
|
|
|
|
|
|
position. Useful for relational database. Improvement of the previous |
881
|
|
|
|
|
|
|
0..a coding. |
882
|
|
|
|
|
|
|
Example : my $strA=$self->get_compressed_freq('A'); |
883
|
|
|
|
|
|
|
Returns : String |
884
|
|
|
|
|
|
|
Args : char |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
=cut |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
sub get_compressed_freq { |
889
|
2
|
|
|
2
|
1
|
516
|
my $self=shift; |
890
|
2
|
|
|
|
|
4
|
my $base=shift; |
891
|
2
|
|
|
|
|
3
|
my $string=''; |
892
|
2
|
|
|
|
|
2
|
my @prob; |
893
|
|
|
|
|
|
|
BASE: { |
894
|
2
|
50
|
|
|
|
3
|
if ($base eq 'A') { |
|
2
|
|
|
|
|
5
|
|
895
|
2
|
50
|
|
|
|
5
|
@prob= @{$self->{probA}} unless (!defined($self->{probA})); |
|
2
|
|
|
|
|
6
|
|
896
|
2
|
|
|
|
|
4
|
last BASE; |
897
|
|
|
|
|
|
|
} |
898
|
0
|
0
|
|
|
|
0
|
if ($base eq 'G') { |
899
|
0
|
0
|
|
|
|
0
|
@prob= @{$self->{probG}} unless (!defined($self->{probG})); |
|
0
|
|
|
|
|
0
|
|
900
|
0
|
|
|
|
|
0
|
last BASE; |
901
|
|
|
|
|
|
|
} |
902
|
0
|
0
|
|
|
|
0
|
if ($base eq 'C') { |
903
|
0
|
0
|
|
|
|
0
|
@prob= @{$self->{probC}} unless (!defined($self->{probC})); |
|
0
|
|
|
|
|
0
|
|
904
|
0
|
|
|
|
|
0
|
last BASE; |
905
|
|
|
|
|
|
|
} |
906
|
0
|
0
|
|
|
|
0
|
if ($base eq 'T') { |
907
|
0
|
0
|
|
|
|
0
|
@prob= @{$self->{probT}} unless (!defined($self->{probT})); |
|
0
|
|
|
|
|
0
|
|
908
|
0
|
|
|
|
|
0
|
last BASE; |
909
|
|
|
|
|
|
|
} |
910
|
0
|
|
|
|
|
0
|
$self->throw ("No such base: $base!\n"); |
911
|
|
|
|
|
|
|
} |
912
|
2
|
|
|
|
|
5
|
my $str= _compress_array(\@prob,1,1); |
913
|
2
|
|
|
|
|
5
|
return $str; |
914
|
|
|
|
|
|
|
} |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
=head2 get_compressed_logs |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
Title : get_compressed_logs |
919
|
|
|
|
|
|
|
Usage : |
920
|
|
|
|
|
|
|
Function: A method to provide a compressed log-odd vector. It uses one byte to |
921
|
|
|
|
|
|
|
code the log value for one of the log-odds vectors for one position. |
922
|
|
|
|
|
|
|
Example : my $strA=$self->get_compressed_logs('A'); |
923
|
|
|
|
|
|
|
Returns : String |
924
|
|
|
|
|
|
|
Args : char |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
=cut |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
sub get_compressed_logs { |
929
|
1
|
|
|
1
|
1
|
2
|
my $self=shift; |
930
|
1
|
|
|
|
|
1
|
my $base=shift; |
931
|
1
|
|
|
|
|
2
|
my $string=''; |
932
|
1
|
|
|
|
|
1
|
my @prob; |
933
|
|
|
|
|
|
|
BASE: { |
934
|
1
|
50
|
|
|
|
1
|
if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; } |
|
1
|
50
|
|
|
|
3
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
2
|
|
935
|
0
|
0
|
|
|
|
0
|
if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; } |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
936
|
0
|
0
|
|
|
|
0
|
if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; } |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
937
|
0
|
0
|
|
|
|
0
|
if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; } |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
938
|
0
|
|
|
|
|
0
|
$self->throw ("No such base: $base!\n"); |
939
|
|
|
|
|
|
|
} |
940
|
1
|
|
|
|
|
2
|
return _compress_array(\@prob,1000,2); |
941
|
|
|
|
|
|
|
} |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=head2 sequence_match_weight |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Title : sequence_match_weight |
946
|
|
|
|
|
|
|
Usage : |
947
|
|
|
|
|
|
|
Function: This method will calculate the score of a match, based on the PWM |
948
|
|
|
|
|
|
|
if such is associated with the matrix object. Returns undef if no |
949
|
|
|
|
|
|
|
PWM data is available. |
950
|
|
|
|
|
|
|
Throws : if the length of the sequence is different from the matrix width |
951
|
|
|
|
|
|
|
Example : my $score=$matrix->sequence_match_weight('ACGGATAG'); |
952
|
|
|
|
|
|
|
Returns : Floating point |
953
|
|
|
|
|
|
|
Args : string |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=cut |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
sub sequence_match_weight { |
958
|
1
|
|
|
1
|
1
|
504
|
my ($self,$seq)=@_; |
959
|
1
|
50
|
|
|
|
3
|
return unless ($self->{logA}); |
960
|
1
|
|
|
|
|
15
|
my $width=$self->width; |
961
|
1
|
50
|
|
|
|
1
|
$self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}}); |
|
1
|
|
|
|
|
3
|
|
962
|
1
|
|
|
|
|
2
|
$seq = uc($seq); |
963
|
1
|
|
|
|
|
6
|
my @seq=split(//,$seq); |
964
|
1
|
|
|
|
|
2
|
my $score = 0; |
965
|
1
|
|
|
|
|
1
|
my $i=0; |
966
|
1
|
|
|
|
|
2
|
foreach my $pos (@seq) { |
967
|
25
|
|
|
|
|
25
|
my $tv = 'log'.$pos; |
968
|
25
|
50
|
|
|
|
31
|
$self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv}; |
969
|
25
|
50
|
|
|
|
38
|
$score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0; |
970
|
25
|
|
|
|
|
25
|
$i++; |
971
|
|
|
|
|
|
|
} |
972
|
1
|
|
|
|
|
5
|
return $score; |
973
|
|
|
|
|
|
|
} |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
=head2 get_all_vectors |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
Title : get_all_vectors |
978
|
|
|
|
|
|
|
Usage : |
979
|
|
|
|
|
|
|
Function: returns all possible sequence vectors to satisfy the PFM under |
980
|
|
|
|
|
|
|
a given threshold |
981
|
|
|
|
|
|
|
Throws : If threshold outside of 0..1 (no sense to do that) |
982
|
|
|
|
|
|
|
Example : my @vectors=$self->get_all_vectors(4); |
983
|
|
|
|
|
|
|
Returns : Array of strings |
984
|
|
|
|
|
|
|
Args : (optional) floating |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
=cut |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
sub get_all_vectors { |
989
|
0
|
|
|
0
|
1
|
|
my $self=shift; |
990
|
0
|
|
|
|
|
|
my $thresh=shift; |
991
|
0
|
0
|
0
|
|
|
|
$self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1)); |
992
|
0
|
|
|
|
|
|
my @seq=split(//,$self->consensus($thresh*10)); |
993
|
0
|
|
|
|
|
|
my @perm; |
994
|
0
|
|
|
|
|
|
for my $i (0..@{$self->{probA}}) { |
|
0
|
|
|
|
|
|
|
995
|
0
|
0
|
|
|
|
|
push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh); |
|
0
|
|
|
|
|
|
|
996
|
0
|
0
|
|
|
|
|
push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh); |
|
0
|
|
|
|
|
|
|
997
|
0
|
0
|
|
|
|
|
push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh); |
|
0
|
|
|
|
|
|
|
998
|
0
|
0
|
|
|
|
|
push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh); |
|
0
|
|
|
|
|
|
|
999
|
0
|
0
|
|
|
|
|
push @{$perm[$i]},'N' if ($seq[$i] eq 'N'); |
|
0
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
} |
1001
|
0
|
|
|
|
|
|
my $fpos=shift @perm; |
1002
|
0
|
|
|
|
|
|
my @strings=@$fpos; |
1003
|
0
|
|
|
|
|
|
foreach my $pos (@perm) { |
1004
|
0
|
|
|
|
|
|
my @newstr; |
1005
|
0
|
|
|
|
|
|
foreach my $let (@$pos) { |
1006
|
0
|
|
|
|
|
|
foreach my $string (@strings) { |
1007
|
0
|
|
|
|
|
|
my $newstring = $string . $let; |
1008
|
0
|
|
|
|
|
|
push @newstr,$newstring; |
1009
|
|
|
|
|
|
|
} |
1010
|
|
|
|
|
|
|
} |
1011
|
0
|
|
|
|
|
|
@strings=@newstr; |
1012
|
|
|
|
|
|
|
} |
1013
|
0
|
|
|
|
|
|
return @strings; |
1014
|
|
|
|
|
|
|
} |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
1; |