File Coverage

Bio/Matrix/PSM/SiteMatrix.pm

Criterion	Covered	Total	%
statement	375	556	67.4
branch	131	256	51.1
condition	67	115	58.2
subroutine	24	28	85.7
pod	21	21	100.0
total	618	976	63.3

line	stmt	bran	cond	sub	pod	time	code
1							#---------------------------------------------------------
2
3							=head1 NAME
4
5							Bio::Matrix::PSM::SiteMatrix - SiteMatrixI implementation, holds a
6							position scoring matrix (or position weight matrix) and log-odds
7
8							=head1 SYNOPSIS
9
10							use Bio::Matrix::PSM::SiteMatrix;
11							# Create from memory by supplying probability matrix hash
12							# both as strings or arrays
13							# where the frequencies $a,$c,$g and $t are supplied either as
14							# arrayref or string. Accordingly, lA, lC, lG and lT are the log
15							# odds (only as arrays, no checks done right now)
16							my ($a,$c,$g,$t,$score,$ic, $mid)=@_;
17							#or
18							my ($a,$c,$g,$t,$score,$ic,$mid)=('05a011','110550','400001',
19							'100104',0.001,19.2,'CRE1');
20							#Where a stands for all (this frequency=1), see explanation below
21							my %param=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,
22							-lA=>$la, -lC=>$lc,-lG=>$lg,-lT=>$l,
23							-IC=>$ic,-e_val=>$score, -id=>$mid);
24							my $site=Bio::Matrix::PSM::SiteMatrix->new(%param);
25							#Or get it from a file:
26							use Bio::Matrix::PSM::IO;
27							my $psmIO= Bio::Matrix::PSM::IO->new(-file=>$file, -format=>'transfac');
28							while (my $psm=$psmIO->next_psm) {
29							#Now we have a Bio::Matrix::PSM::Psm object,
30							# see Bio::Matrix::PSM::PsmI for details
31							#This is a Bio::Matrix::PSM::SiteMatrix object now
32							my $matrix=$psm->matrix;
33							}
34
35							# Get a simple consensus, where alphabet is {A,C,G,T,N},
36							# choosing the character that both satisfies a supplied or default threshold
37							# frequency and is the most frequenct character at each position, or N.
38							# So for the position with A, C, G, T frequencies of 0.5, 0.25, 0.10, 0.15,
39							# the simple consensus character will be 'A', whilst for 0.5, 0.5, 0, 0 it
40							# would be 'N'.
41							my $consensus=$site->consensus;
42
43							# Get the IUPAC ambiguity code representation of the data in the matrix.
44							# Because the frequencies may have been pseudo-count corrected, insignificant
45							# frequences (below 0.05 by default) are ignored. So a position with
46							# A, C, G, T frequencies of 0.5, 0.5, 0.01, 0.01 will get the IUPAC code 'M',
47							# while 0.97, 0.01, 0.01, 0.01 will get the code 'A' and
48							# 0.25, 0.25, 0.25, 0.25 would get 'N'.
49							my $iupac=$site->IUPAC;
50
51							# Getting/using regular expression (a representation of the IUPAC string)
52							my $regexp=$site->regexp;
53							my $count=grep($regexp,$seq);
54							my $count=($seq=~ s/$regexp/$1/eg);
55							print "Motif $mid is present $count times in this sequence\n";
56
57							=head1 DESCRIPTION
58
59							SiteMatrix is designed to provide some basic methods when working with position
60							scoring (weight) matrices, such as transcription factor binding sites for
61							example. A DNA PSM consists of four vectors with frequencies {A,C,G,T}. This is
62							the minimum information you should provide to construct a PSM object. The
63							vectors can be provided as strings with frequenciesx10 rounded to an int, going
64							from {0..a} and 'a' represents the maximum (10). This is like MEME's compressed
65							representation of a matrix and it is quite useful when working with relational
66							DB. If arrays are provided as an input (references to arrays actually) they can
67							be any number, real or integer (frequency or count).
68
69							When creating the object you can ask the constructor to make a simple pseudo
70							count correction by adding a number (typically 1) to all positions (with the
71							-correction option). After adding the number the frequencies will be
72							calculated. Only use correction when you supply counts, not frequencies.
73
74							Throws an exception if: You mix as an input array and string (for example A
75							matrix is given as array, C - as string). The position vector is (0,0,0,0). One
76							of the probability vectors is shorter than the rest.
77
78							Summary of the methods I use most frequently (details below):
79
80							iupac - return IUPAC compliant consensus as a string
81							score - Returns the score as a real number
82							IC - information content. Returns a real number
83							id - identifier. Returns a string
84							accession - accession number. Returns a string
85							next_pos - return the sequence probably for each letter, IUPAC
86							symbol, IUPAC probability and simple sequence
87							consenus letter for this position. Rewind at the end. Returns a hash.
88							pos - current position get/set. Returns an integer.
89							regexp - construct a regular expression based on IUPAC consensus.
90							For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
91							width - site width
92							get_string - gets the probability vector for a single base as a string.
93							get_array - gets the probability vector for a single base as an array.
94							get_logs_array - gets the log-odds vector for a single base as an array.
95
96							New methods, which might be of interest to anyone who wants to store
97							PSM in a relational database without creating an entry for each
98							position is the ability to compress the PSM vector into a string with
99							losing usually less than 1% of the data. this can be done with:
100
101							my $str=$matrix->get_compressed_freq('A');
102							or
103							my $str=$matrix->get_compressed_logs('A');
104
105							Loading from a database should be done with new, but is not yest implemented.
106							However you can still uncompress such string with:
107
108							my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
109							or
110							my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
111
112							=head1 FEEDBACK
113
114							=head2 Mailing Lists
115
116							User feedback is an integral part of the evolution of this and other
117							Bioperl modules. Send your comments and suggestions preferably to one
118							of the Bioperl mailing lists. Your participation is much appreciated.
119
120							bioperl-l@bioperl.org - General discussion
121							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
122
123							=head2 Support
124
125							Please direct usage questions or support issues to the mailing list:
126
127							I
128
129							rather than to the module maintainer directly. Many experienced and
130							reponsive experts will be able look at the problem and quickly
131							address it. Please include a thorough description of the problem
132							with code and data examples if at all possible.
133
134							=head2 Reporting Bugs
135
136							Report bugs to the Bioperl bug tracking system to help us keep track
137							the bugs and their resolution. Bug reports can be submitted via the
138							web:
139
140							https://github.com/bioperl/bioperl-live/issues
141
142							=head1 AUTHOR - Stefan Kirov
143
144							Email skirov@utk.edu
145
146							=head1 CONTRIBUTORS
147
148							Sendu Bala, bix@sendu.me.uk
149
150							=head1 APPENDIX
151
152							The rest of the documentation details each of the object methods.
153							Internal methods are usually preceded with a _
154
155							=cut
156
157							# Let the code begin...
158							package Bio::Matrix::PSM::SiteMatrix;
159	5			5		471	use strict;
	5					8
	5					139
160
161	5			5		46	use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
	5					8
	5					1509
162
163							=head2 new
164
165							Title : new
166							Usage : my $site=Bio::Matrix::PSM::SiteMatrix->new(-pA=>$a,-pC=>$c,
167							-pG=>$g,-pT=>$t,
168							-IC=>$ic,
169							-e_val=>$score,
170							-id=>$mid);
171							Function: Creates a new Bio::Matrix::PSM::SiteMatrix object from memory
172							Throws : If inconsistent data for all vectors (A,C,G and T) is
173							provided, if you mix input types (string vs array) or if a
174							position freq is 0.
175							Returns : Bio::Matrix::PSM::SiteMatrix object
176							Args : -pA => vector with the frequencies or counts of A
177							-pC => vector for C
178							-pG => vector for G
179							-pt => vector for T
180							-lA => vector for the log of A
181							-lC => vector for the log of C
182							-lG => vector for the log of G
183							-lT => vector for the log of T
184							-IC => real number, the information content of this matrix
185							-e_val => real number, the expect value
186							-id => string, an identifier
187							-width => int, width of the matrix in nucleotides
188							-sites => int, the number of sites that went into this matrix
189							-model => hash ref, background frequencies for A, C, G and T
190							-correction => number, the number to add to all positions to achieve
191							pseudo count correction (default 0: no correction)
192							NB: do not use correction when your input is
193							frequences!
194							-accession_number => string, an accession number
195
196							Vectors can be strings of the frequencies where the frequencies are
197							multiplied by 10 and rounded to the nearest whole number, and where
198							'a' is used to denote the maximal frequency 10. There should be no
199							punctuation (spaces etc.) in the string. For example, 'a0501'.
200							Alternatively frequencies or counts can be represented by an array
201							ref containing the counts, frequencies or logs as any kind of
202							number.
203
204							=cut
205
206							sub new {
207	14			14	1	140	my ($class, @args) = @_;
208	14					57	my $self = $class->SUPER::new(@args);
209	14					26	my $consensus;
210							# Too many things to rearrange, and I am creating simultanuously >500
211							# such objects routinely, so this becomes performance issue
212							my %input;
213	14					28	while (@args) {
214	114					204	(my $key = shift @args) =~ s/-//g; #deletes all dashes (only dashes)!
215	114					221	$input{$key} = shift @args;
216							}
217	14					22	$self->{_position} = 0;
218	14					27	$self->{IC} = $input{IC};
219	14					30	$self->{e_val} = $input{e_val};
220	14					17	$self->{width} = $input{width};
221	14					19	$self->{logA} = $input{lA};
222	14					21	$self->{logC} = $input{lC};
223	14					28	$self->{logG} = $input{lG};
224	14					17	$self->{logT} = $input{lT};
225	14					19	$self->{sites} = $input{sites};
226	14		100			33	$self->{id} = $input{id} \|\| 'null';
227	14		50			41	$self->{correction} = $input{correction} \|\| 0;
228	14					21	$self->{accession_number} = $input{accession_number};
229	14	50	66			89	return $self unless (defined($input{pA}) && defined($input{pC}) && defined($input{pG}) && defined($input{pT}));
			66
			33
230
231							# This should go to _initialize?
232							# Check for input type- no mixing alllowed, throw ex
233	12	100				50	if (ref($input{pA}) =~ /ARRAY/i ) {
234	11	50				23	$self->throw("Mixing matrix data types not allowed: C is not reference") unless(ref($input{pC}));
235	11	50				21	$self->throw("Mixing matrix data types not allowed: G is not reference") unless (ref($input{pG}));
236	11	50				16	$self->throw("Mixing matrix data types not allowed: T is not reference") unless (ref($input{pT}));
237	11					15	$self->{probA} = $input{pA};
238	11					19	$self->{probC} = $input{pC};
239	11					21	$self->{probG} = $input{pG};
240	11					16	$self->{probT} = $input{pT};
241							}
242							else {
243	1	50				3	$self->throw("Mixing matrix data types not allowed: C is reference") if (ref($input{pC}));
244	1	50				2	$self->throw("Mixing matrix data types not allowed: G is reference") if (ref($input{pG}));
245	1	50				2	$self->throw("Mixing matrix data types not allowed: T is reference") if (ref($input{pT}));
246	1					5	$self->{probA} = [split(//,$input{pA})];
247	1					4	$self->{probC} = [split(//,$input{pC})];
248	1					5	$self->{probG} = [split(//,$input{pG})];
249	1					3	$self->{probT} = [split(//,$input{pT})];
250	1					3	for (my $i=0; $i<= @{$self->{probA}}+1; $i++) {
	8					16
251							# we implictely assume these are MEME-style frequencies x 10, so
252							# 'a' represents the 'maximum', 10. Other positions can actually
253							# add up to over 10 due to rounding, but I don't think that is a
254							# problem?
255	7	100	100			5	if (${$self->{probA}}[$i] and ${$self->{probA}}[$i] eq 'a') {
	7					12
	3					7
256	1					2	${$self->{probA}}[$i]='10';
	1					2
257							}
258	7	100	100			7	if (${$self->{probC}}[$i] and ${$self->{probC}}[$i] eq 'a') {
	7					11
	3					7
259	1					1	${$self->{probC}}[$i]='10';
	1					2
260							}
261	7	50	66			8	if (${$self->{probG}}[$i] and ${$self->{probG}}[$i] eq 'a') {
	7					13
	3					7
262	0					0	${$self->{probG}}[$i]='10';
	0					0
263							}
264	7	50	66			7	if (${$self->{probT}}[$i] and ${$self->{probT}}[$i] eq 'a') {
	7					12
	2					4
265	0					0	${$self->{probT}}[$i]='10';
	0					0
266							}
267							}
268							}
269
270							# Check for position with 0 for all bases, throw exception if so
271	12					19	for (my $i=0;$i <= $#{$self->{probA}}; $i++) {
	279					376
272	267	50				238	if ((${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i]) == 0) {
	267					269
	267					345
	267					301
	267					392
273	0					0	$self->throw("Position meaningless-all frequencies are 0");
274							}
275
276							# apply pseudo-count correction to all values - this will result in
277							# very bad frequencies if the input is already frequences and a
278							# correction value as large as 1 is used!
279	267	50				362	if ($self->{correction}) {
280	0					0	${$self->{probA}}[$i] += $self->{correction};
	0					0
281	0					0	${$self->{probC}}[$i] += $self->{correction};
	0					0
282	0					0	${$self->{probG}}[$i] += $self->{correction};
	0					0
283	0					0	${$self->{probT}}[$i] += $self->{correction};
	0					0
284							}
285
286							# (re)calculate frequencies
287	267					233	my $div= ${$self->{probA}}[$i] + ${$self->{probC}}[$i] + ${$self->{probG}}[$i] + ${$self->{probT}}[$i];
	267					259
	267					264
	267					271
	267					285
288	267					242	${$self->{probA}}[$i]=${$self->{probA}}[$i]/$div;
	267					260
	267					303
289	267					232	${$self->{probC}}[$i]=${$self->{probC}}[$i]/$div;
	267					254
	267					287
290	267					245	${$self->{probG}}[$i]=${$self->{probG}}[$i]/$div;
	267					262
	267					269
291	267					227	${$self->{probT}}[$i]=${$self->{probT}}[$i]/$div;
	267					309
	267					262
292							}
293
294							# Calculate the logs
295	12	50	66			45	if ((!defined($self->{logA})) && ($input{model})) {
296	0					0	$self->calc_weight($input{model});
297							}
298
299							# Make consensus, throw if any one of the vectors is shorter
300	12					43	$self->_calculate_consensus;
301	12					104	return $self;
302							}
303
304							=head2 _calculate_consensus
305
306							Title : _calculate_consensus
307							Function: Internal stuff
308
309							=cut
310
311							sub _calculate_consensus {
312	12			12		15	my $self=shift;
313	12					16	my ($lc,$lt,$lg)=($#{$self->{probC}},$#{$self->{probT}},$#{$self->{probG}});
	12					25
	12					20
	12					23
314	12					16	my $len=$#{$self->{probA}};
	12					17
315	12	50				29	$self->throw("Probability matrix is damaged for C: $len vs $lc") if ($len != $lc);
316	12	50				24	$self->throw("Probability matrix is damaged for T: $len vs $lt") if ($len != $lt);
317	12	50				19	$self->throw("Probability matrix is damaged for G: $len vs $lg") if ($len != $lg);
318	12					30	for (my $i=0; $i<$len+1; $i++) {
319							#*** IUPACp values not actually used (eg. by next_pos)
320	267					269	(${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
	267					415
	267					420
	267					295
	267					278
	267					270
	267					333
321	267					320	(${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i]);
	267					384
	267					604
	267					291
	267					256
	267					265
	267					343
322							}
323	12					19	return $self;
324							}
325
326							=head2 calc_weight
327
328							Title : calc_weight
329							Usage : $obj->calc_weight({A=>0.2562, C=>0.2438, G=>0.2432, T=>0.2568});
330							Function: Recalculates the PSM (or weights) based on the PFM (the frequency
331							matrix) and user supplied background model.
332							Throws : if no model is supplied
333							Returns : n/a
334							Args : reference to a hash with background frequencies for A,C,G and T
335
336							=cut
337
338							sub calc_weight {
339	0			0	1	0	my ($self, $model) = @_;
340	0					0	my %model;
341	0					0	$model{probA}=$model->{A};
342	0					0	$model{probC}=$model->{C};
343	0					0	$model{probG}=$model->{G};
344	0					0	$model{probT}=$model->{T};
345	0					0	foreach my $let (qw(probA probC probG probT)) {
346	0					0	my @str;
347	0	0	0			0	$self->throw('You did not provide valid model\n') unless (($model{$let}>0) && ($model{$let}<1));
348	0					0	foreach my $f (@{$self->{$let}}) {
	0					0
349	0					0	my $w=log($f)-log($model{$let});
350	0					0	push @str,$w;
351							}
352	0					0	my $llet=$let;
353	0					0	$llet=~s/prob/log/;
354	0					0	$self->{$llet}=\@str;
355							}
356	0					0	return $self;
357							}
358
359							=head2 next_pos
360
361							Title : next_pos
362							Usage :
363							Function: Retrieves the next position features: frequencies for A,C,G,T, the
364							main letter (as in consensus) and the probabilty for this letter to
365							occur at this position and the current position
366							Returns : hash (pA,pC,pG,pT,logA,logC,logG,logT,base,prob,rel)
367							Args : none
368
369							=cut
370
371							sub next_pos {
372	104			104	1	101	my $self = shift;
373	104	50				157	$self->throw("instance method called on class") unless ref $self;
374	104					85	my $len=@{$self->{seq}};
	104					123
375	104					98	my $pos=$self->{_position};
376							# End reached?
377	104	100				126	if ($pos<$len) {
378	101					79	my $pA=${$self->{probA}}[$pos];
	101					125
379	101					88	my $pC=${$self->{probC}}[$pos];
	101					105
380	101					93	my $pG=${$self->{probG}}[$pos];
	101					127
381	101					84	my $pT=${$self->{probT}}[$pos];
	101					100
382	101					94	my $lA=${$self->{logA}}[$pos];
	101					105
383	101					88	my $lC=${$self->{logC}}[$pos];
	101					101
384	101					88	my $lG=${$self->{logG}}[$pos];
	101					102
385	101					108	my $lT=${$self->{logT}}[$pos];
	101					109
386	101					85	my $base=${$self->{seq}}[$pos];
	101					102
387	101					84	my $prob=${$self->{seqp}}[$pos];
	101					98
388	101					92	$self->{_position}++;
389	101					330	my %seq=(pA=>$pA,pT=>$pT,pC=>$pC,pG=>$pG, lA=>$lA,lT=>$lT,lC=>$lC,lG=>$lG,base=>$base,rel=>$pos, prob=>$prob);
390	101					573	return %seq;
391							}
392	3					4	else {$self->{_position}=0; return;}
	3					8
393							}
394
395							=head2 curpos
396
397							Title : curpos
398							Usage :
399							Function: Gets/sets the current position. Converts to 0 if argument is minus
400							and to width if greater than width
401							Returns : integer
402							Args : integer
403
404							=cut
405
406							sub curpos {
407	2			2	1	264	my $self = shift;
408	2					3	my $prev = $self->{_position};
409	2	50				6	if (@_) { $self->{_position} = shift; }
	0					0
410	2					6	return $prev;
411							}
412
413							=head2 e_val
414
415							Title : e_val
416							Usage :
417							Function: Gets/sets the e-value
418							Returns : real number
419							Args : none to get, real number to set
420
421							=cut
422
423							sub e_val {
424	3			3	1	6	my $self = shift;
425	3					5	my $prev = $self->{e_val};
426	3	100				7	if (@_) { $self->{e_val} = shift; }
	1					2
427	3					8	return $prev;
428							}
429
430							=head2 IC
431
432							Title : IC
433							Usage :
434							Function: Get/set the Information Content
435							Returns : real number
436							Args : none to get, real number to set
437
438							=cut
439
440							sub IC {
441	1			1	1	1	my $self = shift;
442	1					2	my $prev = $self->{IC};
443	1	50				3	if (@_) { $self->{IC} = shift; }
	0					0
444	1					2	return $prev;
445							}
446
447							=head2 accession_number
448
449							Title : accession_number
450							Function: Get/set the accession number, this will be unique id for the
451							SiteMatrix object as well for any other object, inheriting from
452							SiteMatrix
453							Returns : string
454							Args : none to get, string to set
455
456							=cut
457
458							sub accession_number {
459	0			0	1	0	my $self = shift;
460	0					0	my $prev = $self->{accession_number};
461	0	0				0	if (@_) { $self->{accession_number} = shift; }
	0					0
462	0					0	return $prev;
463							}
464
465							=head2 consensus
466
467							Title : consensus
468							Usage :
469							Function: Returns the consensus
470							Returns : string
471							Args : (optional) threshold value 1 to 10, default 5
472							'5' means the returned characters had a 50% or higher presence at
473							their position
474
475							=cut
476
477							sub consensus {
478	8			8	1	1042	my ($self, $thresh) = @_;
479	8	50				21	if ($thresh) {
480	0					0	my $len=$#{$self->{probA}};
	0					0
481	0					0	for (my $i=0; $i<$len+1; $i++) {
482	0					0	(${$self->{seq}}[$i], ${$self->{seqp}}[$i]) = _to_cons(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
	0					0
	0					0
	0					0
	0					0
	0					0
	0					0
483							}
484							}
485	8					12	my $consensus='';
486	8					10	foreach my $letter (@{$self->{seq}}) {
	8					17
487	180					177	$consensus .= $letter;
488							}
489	8					24	return $consensus;
490							}
491
492							=head2 width
493
494							Title : width
495							Usage :
496							Function: Returns the length of the sites in used to make this matrix
497							Returns : int
498							Args : none
499
500							=cut
501
502							sub width {
503	2			2	1	4	my $self = shift;
504	2					3	my $width=@{$self->{probA}};
	2					4
505	2					6	return $width;
506							}
507
508							=head2 sites
509
510							Title : sites
511							Usage :
512							Function: Get/set the number of sites that were used to make this matrix
513							Returns : int
514							Args : none to get, int to set
515
516							=cut
517
518							sub sites {
519	0			0	1	0	my $self = shift;
520	0	0				0	if (@_) { $self->{sites} = shift }
	0					0
521	0		0			0	return $self->{sites} \|\| return;
522							}
523
524							=head2 IUPAC
525
526							Title : IUPAC
527							Usage :
528							Function: Returns IUPAC compliant consensus
529							Returns : string
530							Args : optionally, also supply a whole number (int) of 1 or higher to set
531							the significance level when considering the frequencies. 1 (the
532							default) means a 0.05 significance level: frequencies lower than
533							0.05 will be ignored. 2 Means a 0.005 level, and so on.
534
535							=cut
536
537							sub IUPAC {
538	5			5	1	2624	my ($self, $thresh) = @_;
539	5	50				12	if ($thresh) {
540	0					0	my $len=$#{$self->{probA}};
	0					0
541	0					0	for (my $i=0; $i<$len+1; $i++) {
542	0					0	(${$self->{IUPAC}}[$i],${$self->{IUPACp}}[$i])=_to_IUPAC(${$self->{probA}}[$i], ${$self->{probC}}[$i], ${$self->{probG}}[$i], ${$self->{probT}}[$i], $thresh);
	0					0
	0					0
	0					0
	0					0
	0					0
	0					0
543							}
544							}
545	5					11	my $iu=$self->{IUPAC};
546	5					7	my $iupac='';
547	5					5	foreach my $let (@{$iu}) {
	5					8
548	92					91	$iupac .= $let;
549							}
550	5					18	return $iupac;
551							}
552
553							=head2 _to_IUPAC
554
555							Title : _to_IUPAC
556							Usage :
557							Function: Converts a single position to IUPAC compliant symbol.
558							For rules see the implementation
559							Returns : char, real number
560							Args : real numbers for frequencies of A,C,G,T (positional)
561
562							optionally, also supply a whole number (int) of 1 or higher to set
563							the significance level when considering the frequencies. 1 (the
564							default) means a 0.05 significance level: frequencies lower than
565							0.05 will be ignored. 2 Means a 0.005 level, and so on.
566
567							=cut
568
569							sub _to_IUPAC {
570	267			267		343	my ($a, $c, $g, $t, $thresh) = @_;
571	267		50			623	$thresh \|\|= 1;
572	267					239	$thresh = int($thresh);
573	267					860	$a = sprintf ("%.${thresh}f", $a);
574	267					551	$c = sprintf ("%.${thresh}f", $c);
575	267					499	$g = sprintf ("%.${thresh}f", $g);
576	267					494	$t = sprintf ("%.${thresh}f", $t);
577
578	267					568	my $total = $a + $c + $g + $t;
579
580	267	100				430	return 'A' if ($a == $total);
581	199	100				258	return 'G' if ($g == $total);
582	190	100				278	return 'C' if ($c == $total);
583	108	100				178	return 'T' if ($t == $total);
584	93					98	my $r=$g+$a;
585	93	50				120	return 'R' if ($r == $total);
586	93					91	my $y=$t+$c;
587	93	100				138	return 'Y' if ($y == $total);
588	86					89	my $m=$a+$c;
589	86	100				135	return 'M' if ($m == $total);
590	62					63	my $k=$g+$t;
591	62	100				97	return 'K' if ($k == $total);
592	49					62	my $s=$g+$c;
593	49	100				72	return 'S' if ($s == $total);
594	42					46	my $w=$a+$t;
595	42	100				70	return 'W' if ($w == $total);
596	20					22	my $d=$r+$t;
597	20	100				36	return 'D' if ($d == $total);
598	17					21	my $v=$r+$c;
599	17	100				34	return 'V' if ($v == $total);
600	9					13	my $b=$y+$g;
601	9	100				19	return 'B' if ($b == $total);
602	7					7	my $h=$y+$a;
603	7	100				16	return 'H' if ($h == $total);
604	1					1	return 'N';
605							}
606
607							=head2 _to_cons
608
609							Title : _to_cons
610							Usage :
611							Function: Converts a single position to simple consensus character and returns
612							its probability. For rules see the implementation
613							Returns : char, real number
614							Args : real numbers for A,C,G,T (positional), and optional 5th argument of
615							threshold (as a number between 1 and 10, where 5 is default and
616							means the returned character had a 50% or higher presence at this
617							position)
618
619							=cut
620
621							sub _to_cons {
622	267			267		375	my ($A, $C, $G, $T, $thresh) = @_;
623	267		50			610	$thresh \|\|= 5;
624
625							# this multiplication by 10 is just to satisfy the thresh range of 1-10
626	267					282	my $a = $A * 10;
627	267					242	my $c = $C * 10;
628	267					240	my $g = $G * 10;
629	267					229	my $t = $T * 10;
630
631	267	100	100			669	return 'N',10 if (($a<$thresh) && ($c<$thresh) && ($g<$thresh) && ($t<$thresh));
			100
			100
632	244	50	100			409	return 'N',10 if (($a==$t) && ($a==$c) && ($a==$g));
			66
633
634							# threshold could be lower than 50%, so must check is not only over
635							# threshold, but also the highest frequency
636	244	50	100			642	return 'A',$a if (($a>=$thresh) && ($a>$t) && ($a>$c) && ($a>$g));
			66
			66
637	146	50	100			518	return 'C',$c if (($c>=$thresh) && ($c>$t) && ($c>$a) && ($c>$g));
			66
			66
638	45	50	66			149	return 'G',$g if (($g>=$thresh) && ($g>$t) && ($g>$c) && ($g>$a));
			66
			33
639	26	100	33			151	return 'T',$t if (($t>=$thresh) && ($t>$g) && ($t>$c) && ($t>$a));
			66
			100
640
641	2					4	return 'N',10;
642							}
643
644							=head2 get_string
645
646							Title : get_string
647							Usage :
648							Function: Returns given probability vector as a string. Useful if you want to
649							store things in a rel database, where arrays are not first choice
650							Throws : If the argument is outside {A,C,G,T}
651							Returns : string
652							Args : character {A,C,G,T}
653
654							=cut
655
656							sub get_string {
657	1			1	1	2	my $self=shift;
658	1					2	my $base=shift;
659	1					1	my $string='';
660	1					2	my @prob;
661
662							BASE: {
663	1	50				1	if ($base eq 'A') {@prob= @{$self->{probA}}; last BASE; }
	1					4
	1					2
	1					3
	1					2
664	0	0				0	if ($base eq 'C') {@prob= @{$self->{probC}}; last BASE; }
	0					0
	0					0
	0					0
665	0	0				0	if ($base eq 'G') {@prob= @{$self->{probG}}; last BASE; }
	0					0
	0					0
	0					0
666	0	0				0	if ($base eq 'T') {@prob= @{$self->{probT}}; last BASE; }
	0					0
	0					0
	0					0
667	0					0	$self->throw ("No such base: $base!\n");
668							}
669
670	1					3	foreach my $prob (@prob) {
671	5					7	my $corrected = $prob*10;
672	5					8	my $next=sprintf("%.0f",$corrected);
673	5	100				8	$next='a' if ($next eq '10');
674	5					6	$string .= $next;
675							}
676	1					4	return $string;
677							}
678
679							=head2 get_array
680
681							Title : get_array
682							Usage :
683							Function: Returns an array with frequencies for a specified base
684							Returns : array
685							Args : char
686
687							=cut
688
689							sub get_array {
690	3			3	1	10	my $self=shift;
691	3					7	my $base=uc(shift);
692	3	50				7	return @{$self->{probA}} if ($base eq 'A');
	3					16
693	0	0				0	return @{$self->{probC}} if ($base eq 'C');
	0					0
694	0	0				0	return @{$self->{probG}} if ($base eq 'G');
	0					0
695	0	0				0	return @{$self->{probT}} if ($base eq 'T');
	0					0
696	0					0	$self->throw("No such base: $base!\n");
697							}
698
699							=head2 get_logs_array
700
701							Title : get_logs_array
702							Usage :
703							Function: Returns an array with log_odds for a specified base
704							Returns : array
705							Args : char
706
707							=cut
708
709							sub get_logs_array {
710	1			1	1	5	my $self=shift;
711	1					2	my $base=uc(shift);
712	1	50	33			5	return @{$self->{logA}} if (($base eq 'A') && ($self->{logA}));
	1					6
713	0	0	0			0	return @{$self->{logC}} if (($base eq 'C') && ($self->{logC}));
	0					0
714	0	0	0			0	return @{$self->{logG}} if (($base eq 'G') && ($self->{logG}));
	0					0
715	0	0	0			0	return @{$self->{logT}} if (($base eq 'T') && ($self->{logT}));
	0					0
716	0	0				0	$self->throw ("No such base: $base!\n") if (!grep(/$base/,qw(A C G T)));
717	0					0	return;
718							}
719
720							=head2 id
721
722							Title : id
723							Usage :
724							Function: Gets/sets the site id
725							Returns : string
726							Args : string
727
728							=cut
729
730							sub id {
731	12			12	1	30	my $self = shift;
732	12					16	my $prev = $self->{id};
733	12	100				19	if (@_) { $self->{id} = shift; }
	2					2
734	12					29	return $prev;
735							}
736
737							=head2 regexp
738
739							Title : regexp
740							Usage :
741							Function: Returns a regular expression which matches the IUPAC convention.
742							N will match X, N, - and .
743							Returns : string
744							Args : none (works at the threshold last used for making the IUPAC string)
745
746							=cut
747
748							sub regexp {
749	1			1	1	2	my $self=shift;
750	1					1	my $regexp;
751	1					2	foreach my $letter (@{$self->{IUPAC}}) {
	1					3
752	5					4	my $reg;
753							LETTER: {
754	5	100				4	if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
	5					10
	1					1
	1					2
755	4	100				6	if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
	1					1
	1					2
756	3	50				4	if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
	0					0
	0					0
757	3	50				6	if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
	0					0
	0					0
758	3	50				4	if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
	0					0
	0					0
759	3	50				4	if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
	0					0
	0					0
760	3	50				5	if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
	0					0
	0					0
761	3	50				5	if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
	0					0
	0					0
762	3	50				11	if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
	0					0
	0					0
763	3	50				4	if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
	0					0
	0					0
764	3	100				5	if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
	1					1
	1					1
765	2	50				5	if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
	0					0
	0					0
766	2	100				3	if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
	1					1
	1					2
767	1	50				2	if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
	1					1
	1					2
768	0					0	$reg='\S';
769							}
770	5					8	$regexp .= $reg;
771							}
772	1					3	return $regexp;
773							}
774
775							=head2 regexp_array
776
777							Title : regexp_array
778							Usage :
779							Function: Returns a regular expression which matches the IUPAC convention.
780							N will match X, N, - and .
781							Returns : array
782							Args : none (works at the threshold last used for making the IUPAC string)
783							To do : I have separated regexp and regexp_array, but
784							maybe they can be rewritten as one - just check what should be returned
785
786							=cut
787
788							sub regexp_array {
789	1			1	1	2	my $self=shift;
790	1					2	my @regexp;
791	1					2	foreach my $letter (@{$self->{IUPAC}}) {
	1					2
792	5					6	my $reg;
793							LETTER: {
794	5	100				5	if ($letter eq 'A') { $reg='[Aa]'; last LETTER; }
	5					8
	1					1
	1					2
795	4	100				5	if ($letter eq 'C') { $reg='[Cc]'; last LETTER; }
	1					2
	1					1
796	3	50				4	if ($letter eq 'G') { $reg='[Gg]'; last LETTER; }
	0					0
	0					0
797	3	50				5	if ($letter eq 'T') { $reg='[Tt]'; last LETTER; }
	0					0
	0					0
798	3	50				4	if ($letter eq 'M') { $reg='[AaCcMm]'; last LETTER; }
	0					0
	0					0
799	3	50				4	if ($letter eq 'R') { $reg='[AaGgRr]'; last LETTER; }
	0					0
	0					0
800	3	50				4	if ($letter eq 'W') { $reg='[AaTtWw]'; last LETTER; }
	0					0
	0					0
801	3	50				4	if ($letter eq 'S') { $reg='[CcGgSs]'; last LETTER; }
	0					0
	0					0
802	3	50				5	if ($letter eq 'Y') { $reg='[CcTtYy]'; last LETTER; }
	0					0
	0					0
803	3	50				5	if ($letter eq 'K') { $reg='[GgTtKk]'; last LETTER; }
	0					0
	0					0
804	3	100				4	if ($letter eq 'V') { $reg='[AaCcGgVv]'; last LETTER; }
	1					2
	1					1
805	2	50				3	if ($letter eq 'H') { $reg='[AaCcTtHh]'; last LETTER; }
	0					0
	0					0
806	2	100				4	if ($letter eq 'D') { $reg='[AaGgTtDd]'; last LETTER; }
	1					1
	1					1
807	1	50				2	if ($letter eq 'B') { $reg='[CcGgTtBb]'; last LETTER; }
	1					1
	1					1
808	0					0	$reg='\S';
809							}
810	5					8	push @regexp,$reg;
811							}
812	1					4	return @regexp;
813							}
814
815
816							=head2 _compress_array
817
818							Title : _compress_array
819							Usage :
820							Function: Will compress an array of real signed numbers to a string (ie vector
821							of bytes) -127 to +127 for bi-directional(signed) and 0..255 for
822							unsigned
823							Returns : String
824							Args : array reference, followed by an max value and direction (optional,
825							default 1-unsigned),1 unsigned, any other is signed.
826
827							=cut
828
829							sub _compress_array {
830	3			3		5	my ($array,$lm,$direct)=@_;
831	3					2	my $str;
832	3	50	33			12	return unless(($array) && ($lm));
833	3	50				3	$direct=1 unless ($direct);
834	3	100				9	my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
835	3					3	foreach my $c (@{$array}) {
	3					6
836	62	50				77	$c=$lm if ($c>$lm);
837	62	50	33			81	$c=-$lm if (($c<-$lm) && ($direct !=1));
838	62	50	66			96	$c=0 if (($c<0) && ($direct ==1));
839	62					62	my $byte=int($k1*$c);
840	62	100				75	$byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
841	62					55	my $char=chr($byte);
842	62					67	$str.=$char;
843							}
844	3					10	return $str;
845							}
846
847							=head2 _uncompress_string
848
849							Title : _uncompress_string
850							Usage :
851							Function: Will uncompress a string (vector of bytes) to create an array of
852							real signed numbers (opposite to_compress_array)
853							Returns : string, followed by an max value and
854							direction (optional, default 1-unsigned), 1 unsigned, any other is signed.
855							Args : array
856
857							=cut
858
859							sub _uncompress_string {
860	3			3		14	my ($str,$lm,$direct)=@_;
861	3					3	my @array;
862	3	50	33			9	return unless(($str) && ($lm));
863	3	50				4	$direct=1 unless ($direct);
864	3	100				6	my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
865	3					12	foreach my $c (split(//,$str)) {
866	62					52	my $byte=ord($c);
867	62	100				76	$byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
868	62					52	my $num=$byte/$k1;
869	62					65	push @array,$num;
870							}
871	3					13	return @array;
872							}
873
874							=head2 get_compressed_freq
875
876							Title : get_compressed_freq
877							Usage :
878							Function: A method to provide a compressed frequency vector. It uses one byte
879							to code the frequence for one of the probability vectors for one
880							position. Useful for relational database. Improvement of the previous
881							0..a coding.
882							Example : my $strA=$self->get_compressed_freq('A');
883							Returns : String
884							Args : char
885
886							=cut
887
888							sub get_compressed_freq {
889	2			2	1	516	my $self=shift;
890	2					4	my $base=shift;
891	2					3	my $string='';
892	2					2	my @prob;
893							BASE: {
894	2	50				3	if ($base eq 'A') {
	2					5
895	2	50				5	@prob= @{$self->{probA}} unless (!defined($self->{probA}));
	2					6
896	2					4	last BASE;
897							}
898	0	0				0	if ($base eq 'G') {
899	0	0				0	@prob= @{$self->{probG}} unless (!defined($self->{probG}));
	0					0
900	0					0	last BASE;
901							}
902	0	0				0	if ($base eq 'C') {
903	0	0				0	@prob= @{$self->{probC}} unless (!defined($self->{probC}));
	0					0
904	0					0	last BASE;
905							}
906	0	0				0	if ($base eq 'T') {
907	0	0				0	@prob= @{$self->{probT}} unless (!defined($self->{probT}));
	0					0
908	0					0	last BASE;
909							}
910	0					0	$self->throw ("No such base: $base!\n");
911							}
912	2					5	my $str= _compress_array(\@prob,1,1);
913	2					5	return $str;
914							}
915
916							=head2 get_compressed_logs
917
918							Title : get_compressed_logs
919							Usage :
920							Function: A method to provide a compressed log-odd vector. It uses one byte to
921							code the log value for one of the log-odds vectors for one position.
922							Example : my $strA=$self->get_compressed_logs('A');
923							Returns : String
924							Args : char
925
926							=cut
927
928							sub get_compressed_logs {
929	1			1	1	2	my $self=shift;
930	1					1	my $base=shift;
931	1					2	my $string='';
932	1					1	my @prob;
933							BASE: {
934	1	50				1	if ($base eq 'A') {@prob= @{$self->{logA}} unless (!defined($self->{logA})); last BASE; }
	1	50				3
	1					3
	1					5
	1					2
935	0	0				0	if ($base eq 'C') {@prob= @{$self->{logC}} unless (!defined($self->{logC})); last BASE; }
	0	0				0
	0					0
	0					0
936	0	0				0	if ($base eq 'G') {@prob= @{$self->{logG}} unless (!defined($self->{logG})); last BASE; }
	0	0				0
	0					0
	0					0
937	0	0				0	if ($base eq 'T') {@prob= @{$self->{logT}} unless (!defined($self->{logT})); last BASE; }
	0	0				0
	0					0
	0					0
938	0					0	$self->throw ("No such base: $base!\n");
939							}
940	1					2	return _compress_array(\@prob,1000,2);
941							}
942
943							=head2 sequence_match_weight
944
945							Title : sequence_match_weight
946							Usage :
947							Function: This method will calculate the score of a match, based on the PWM
948							if such is associated with the matrix object. Returns undef if no
949							PWM data is available.
950							Throws : if the length of the sequence is different from the matrix width
951							Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
952							Returns : Floating point
953							Args : string
954
955							=cut
956
957							sub sequence_match_weight {
958	1			1	1	504	my ($self,$seq)=@_;
959	1	50				3	return unless ($self->{logA});
960	1					15	my $width=$self->width;
961	1	50				1	$self->throw ("I can calculate the score only for sequence which are exactly my size for $seq, my width is $width\n") unless (length($seq)==@{$self->{logA}});
	1					3
962	1					2	$seq = uc($seq);
963	1					6	my @seq=split(//,$seq);
964	1					2	my $score = 0;
965	1					1	my $i=0;
966	1					2	foreach my $pos (@seq) {
967	25					25	my $tv = 'log'.$pos;
968	25	50				31	$self->warn("Position ".($i+1)." of input sequence has unknown (ambiguity?) character '$pos': scores will be wrong") unless defined $self->{$tv};
969	25	50				38	$score += defined $self->{$tv} ? $self->{$tv}->[$i] : 0;
970	25					25	$i++;
971							}
972	1					5	return $score;
973							}
974
975							=head2 get_all_vectors
976
977							Title : get_all_vectors
978							Usage :
979							Function: returns all possible sequence vectors to satisfy the PFM under
980							a given threshold
981							Throws : If threshold outside of 0..1 (no sense to do that)
982							Example : my @vectors=$self->get_all_vectors(4);
983							Returns : Array of strings
984							Args : (optional) floating
985
986							=cut
987
988							sub get_all_vectors {
989	0			0	1		my $self=shift;
990	0						my $thresh=shift;
991	0	0	0				$self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) \|\| ($thresh>1));
992	0						my @seq=split(//,$self->consensus($thresh*10));
993	0						my @perm;
994	0						for my $i (0..@{$self->{probA}}) {
	0
995	0	0					push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
	0
996	0	0					push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
	0
997	0	0					push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
	0
998	0	0					push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
	0
999	0	0					push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
	0
1000							}
1001	0						my $fpos=shift @perm;
1002	0						my @strings=@$fpos;
1003	0						foreach my $pos (@perm) {
1004	0						my @newstr;
1005	0						foreach my $let (@$pos) {
1006	0						foreach my $string (@strings) {
1007	0						my $newstring = $string . $let;
1008	0						push @newstr,$newstring;
1009							}
1010							}
1011	0						@strings=@newstr;
1012							}
1013	0						return @strings;
1014							}
1015
1016							1;