File Coverage

blib/lib/Bio/CUA/Summarizer.pm

Criterion	Covered	Total	%
statement	94	127	74.0
branch	28	74	37.8
condition	1	7	14.2
subroutine	19	23	82.6
pod	9	11	81.8
total	151	242	62.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::CUA::Summarizer;
2
3	4			4		29792	use 5.006;
	4					10
4	4			4		22	use strict;
	4					4
	4					93
5	4			4		18	use warnings;
	4					5
	4					142
6	4			4		1044	use parent qw/Bio::CUA/;
	4					608
	4					31
7	4			4		2138	use Bio::CUA::CodonTable;
	4					10
	4					448
8
9							my $codonPkg = 'Bio::CUA::CodonTable';
10							my $pkg = __PACKAGE__;
11							my @bases = qw/A T C G/;
12
13							# determine which class is used for sequence processing
14							my $seq_io_pkg;
15							BEGIN{
16							# set version which might be checked during compilation
17							#our $VERSION = 1.01;
18
19							# determine sequence processing module
20	4			4		8	eval { require Bio::SeqIO; };
	4					1016
21	4	50				24	if($@) # Bio::SeqIO is not available
22							{
23	4					6	$seq_io_pkg = 'Bio::CUA::SeqIO';
24	4					2674	require Bio::CUA::SeqIO;
25							}else # otherwise use Bio::SeqIO
26							{
27	0					0	$seq_io_pkg = 'Bio::SeqIO';
28							}
29							}
30
31							=pod
32
33							=head1 NAME
34
35							Bio::CUA::Summarizer - a class to summarize features of sequences.
36
37							=head1 SYNOPSIS
38
39							This class provides convenience for its child classes with methods
40							summarizing sequence features, such
41							as counting and listing amino acids and codons, retrieving amino acids
42							with certain degree degeneracy in a genetic table. Refer to the
43							L section for more details.
44
45							use Bio::CUA::Summarizer;
46
47							my $summarizer = Bio::CUA::Summarizer->new(
48							codon_table => 1 ); # using stardard genetic code
49
50							# get codons in a sequence file
51							my $codonList = $summarizer->tabulate_codons('seqs.fa');
52							# get the codon table object of this summarizer
53							my $table = $summarizer->codon_table;
54							# get all sense codons in the genetic codon table
55							my @senseCodons = $summarizer->all_sense_codons;
56							# get codons encoding an amino acid
57							my @codons = $summarizer->codons_of_AA('Ser');
58
59							=cut
60
61
62							=head2 new
63
64							Title : new
65							Usage : $obj=Bio::CUA::Summarizer->new(%args);
66							Function: create an object which can be used to summarizing sequence
67							features.
68							Returns : an object of this or child class
69							Args : a hash with a key 'codon_table', acceptable values are
70							codon_table => id of genetic codon table # 1
71							codon_table => Bio::CUA::CodonTable object # 2
72							codon_table => 'map-file' # 3
73
74							=over 3
75
76							=item 1
77
78							id of genetic codon table can be found from L
79							codes\|http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=t>.
80							A valid id is an integer.
81
82							=item 2
83
84							an object of L. One can directly provide an
85							object to the method.
86
87							=item 3
88
89							If genetic code in analyzed sequences is not included in NCBI, one can
90							also provide its own genetic code in a map-file, in the format of
91							codon1AA1
92							codon2AA2,
93							... ... ....
94
95							=back
96
97							Note all the analyzed sequences will use this provided genetic codon
98							table to map between amino acids and codons.
99
100							=cut
101
102							sub new
103							{
104	3			3	1	772	my ($caller, @args) = @_;
105	3					26	my $self = $caller->SUPER::new(@args);
106
107	3					13	my $hashRef = $self->_array_to_hash(\@args);
108
109							# only process its own argument
110	3					4	my $codonTable;
111	3					23	while(my ($tag, $val) = each %$hashRef)
112							{
113	3	50				11	next unless($tag eq 'codon_table');
114	3	50				25	if(ref($val))
		50
115							{
116	0	0				0	$self->throw("$val is not an object of $codonPkg")
117							unless($val->isa($codonPkg));
118	0					0	$codonTable = $val;
119							}elsif($val =~ /^\d+$/) # genetic code id
120							{
121	3	50				28	$codonTable = $codonPkg->new(-id => $val) or
122							$self->throw("Invalid genetic code ID '$val'");
123							}else # a map file
124							{
125	0	0				0	$codonTable = $codonPkg->new(-map_file => $val) or
126							$self->throw("Can not construct codon table with file '$val'");
127							}
128	3					9	last;
129							}
130	3	50	0			13	$self->warn("option 'codon_table' is missing in the method",
131							"'new' of $pkg") and return undef unless($codonTable);
132							# store the result
133	3					26	$self->{'_codon_table'} = $codonTable;
134
135	3					17	return $self;
136							}
137
138							=head2 codon_table
139
140							Title : codon_table
141							Usage : $table = $self->codon_table;
142							Function: get associated codon table of this object
143							Returns : an object of L
144							Args : None
145
146							=cut
147
148							sub codon_table
149							{
150	249	50		249	1	1135	my $table = $_[0]->{'_codon_table'} or
151							$_[0]->warn("No codon table associated with this object $_[0]");
152	249					499	return $table;
153							}
154
155							=head2 bases
156
157							get the 4 nucleotides A,T,C,G always in this order, to keep
158							consistency among different classes
159
160							=cut
161
162							# get all the nucleotide bases in a certain order
163							sub bases
164							{
165	1	50		1	1	481	return wantarray? @bases : \@bases;
166							}
167
168							# sequence-level functions
169							=head2 get_codon_list
170
171							Title : get_codon_list
172							Usage : $codonList = $self->get_codon_list($input)
173							Function: get codons and their counts in input
174							Returns : reference to a hash containing codons as keys and counts
175							as values.
176							Args : seq string, seq object, seq file, or another codon list
177
178							=cut
179
180							# the main interface to preprocess input to most methods
181							# return a codon list with its count
182							# acceptable parameters: seq string, seq object, seq file, codon list
183							sub get_codon_list
184							{
185	57			57	0	67	my ($self, $input) = @_;
186
187	57					83	my $ref = ref($input);
188	57	100				100	unless($ref) # a scalar variable
189							{
190							# a sequence string
191	5	50	33			31	if($input =~ /^[ATGCUN]+$/ and (! -f $input))
192							{
193	0					0	return $self->_catalog_codons($input);
194							}else # a sequence file
195							{
196	5					20	return $self->tabulate_codons($input);
197							}
198							}
199
200	52	50				88	if($ref eq 'HASH') # codon list
201							{
202	0					0	return $input;
203							}else # an seq object
204							{
205	52					89	return $self->_catalog_codons($input);
206							}
207							}
208
209							=head2 tabulate_codons
210
211							Title : tabulate_codons
212							Usage : $codonList = $self->tabulate_codons($input,[$each]);
213							Function: count codons in the input sequences
214							Returns : reference to a hash in which codon is the key and counts as
215							values. If $each is true, then each sequence is separately processed
216							and stored in a larger hash. The count of a codon in a sequence can
217							be retrieved like this: $codonList->{'seqId'}->{'codon'}.
218							Args : accepted arguments are as follows:
219							I = name of a file containing fasta sequences
220							I = optional, if TRUE (i.e., non-zero values), each sequence is
221							separately processed.
222
223							This is a companionate method of L for situations
224							when one want to get codon counts for each sequence separately.
225
226							=cut
227
228							sub tabulate_codons
229							{
230	6			6	1	13	my ($self, $input, $each) = @_;
231
232	6	50				33	my $seqIO = $self->_get_seq_io($input) or return;
233	6					11	my %list;
234
235	6	50				16	if($each)
236							{
237	0					0	while(my $seq = $seqIO->next_seq())
238							{
239	0					0	my $codons = $self->_catalog_codons($seq->seq);
240	0					0	$list{$seq->id} = $codons;
241							}
242
243							}else # otherwise process together
244							{
245	6					31	while(my $seq = $seqIO->next_seq())
246							{
247	78					188	my $codons = $self->_catalog_codons($seq->seq);
248							# merge all codons together
249	78					413	while(my ($c, $v) = each %$codons)
250							{
251	4758					9979	$list{$c} += $v;
252							}
253							}
254							}
255
256	6	50				33	return undef unless(keys %list);
257	6					41	return \%list;
258							}
259
260							=head2 tabulate_AAs
261
262							Title : tabulate_AAs
263							Usage : $AAList = $self->tabulate_AAs($input,[$each]);
264							Function: similar to L, but for counting amino acids
265							Returns : the same as L, but for amino acids
266							Args : refer to L.
267
268							=cut
269
270							sub tabulate_AAs
271							{
272	0			0	1	0	my ($self, $input, $each) = @_;
273
274	0	0				0	my $codonList = $self->tabulate_codons($input) or return;
275
276	0					0	my %AAs;
277	0	0				0	if($each)
278							{
279	0					0	while(my ($id, $hashRef) = each %$codonList)
280							{
281	0					0	while(my ($codon, $count) = each %$codonList)
282							{
283	0	0				0	my $AA = $self->_codon_to_aa($codon) or next;
284	0					0	$AAs{$id}->{$AA} += $count;
285							}
286							}
287							}else
288							{
289	0					0	while(my ($codon, $count) = each %$codonList)
290							{
291	0	0				0	my $AA = $self->_codon_to_aa($codon) or next;
292	0					0	$AAs{$AA} += $count;
293							}
294							}
295
296	0					0	return \%AAs;
297							}
298
299							# get the sequence IO and return it
300							sub _get_seq_io
301							{
302	6			6		12	my ($self, $input) = @_;
303
304	6	50	0			19	$self->warn("input fasta file is needed to obtain seq IO") and return
305							unless($input);
306							# at present, use Bio::SeqIO
307	6					79	my $io = $seq_io_pkg->new(-file => $input, -format => 'fasta');
308	6					29	return $io;
309							}
310
311
312							# codon table functions
313							=head2 all_sense_codons
314
315							get all sense codons in the genetic codon table of this object
316
317							=cut
318
319							sub all_sense_codons
320							{
321	2			2	0	5	my ($self) = @_;
322
323	2	50				3	my $codonTable = $self->codon_table() or return;
324
325	2					9	return $codonTable->all_sense_codons;
326							}
327
328							=head2 all_AAs_in_table
329
330							get all the amino acids coded in the genetic code table of this object
331
332							=cut
333
334							sub all_AAs_in_table
335							{
336	8			8	1	13	my ($self) = @_;
337
338	8	50				27	my $codonTable = $self->codon_table() or return;
339
340	8					35	$codonTable->all_amino_acids();
341							}
342
343							=head2 codons_of_AA
344
345							get codons coding the given amino acid, I,
346
347							my @codons = $self->codons_of_AA('Ser');
348
349							=cut
350
351							sub codons_of_AA
352							{
353	160			160	1	149	my ($self, $AA) = @_;
354
355	160	50				197	my $codonTable = $self->codon_table() or return;
356
357	160					292	return $codonTable->codons_of_AA($AA);
358							}
359
360							=head2 aa_degeneracy_classes
361
362							Title : aa_degeneracy_classes
363							Usage : $hashRef = $self->aa_degeneracy_classes;
364							Function: get amino acid degeneracy classes according to the
365							associated genetic code table
366							Returns : a hash reference in which first level key is degeneracy
367							degrees such as 1,2,3,4,6, second level is amino acid, the value is
368							reference to the corresponding codon array. For example:
369
370							{ 2 => { D => [GAU, GAC],
371							C => [UGU, UGC],
372							... ...
373							},
374							4 => { A => [GCU, GCC, GCA, GCG],
375							... ...
376							},
377							... ... ...
378							}
379
380							Args : None
381
382							=cut
383
384							sub aa_degeneracy_classes
385							{
386	52	50		52	1	97	my $codonTable = $_[0]->codon_table or return;
387	52					178	return $codonTable->codon_degeneracy;
388							}
389
390							=head2 codons_by_degeneracy
391
392							Title : codons_by_degeneracy
393							Usage : @codons = $self->codons_by_degeneracy(2);
394							Function: get all the codons of AAs which have the specified degree
395							of degeneracy, for example, codons of amino acids with degenracy
396							degree 2.
397							Returns : an array of codons, or its reference in scalar context
398							Args : an integer for degeneracy degree
399
400							=cut
401
402							sub codons_by_degeneracy
403							{
404	26			26	1	42	my ($self, $deg) = @_;
405	26	50				49	my $degHash = $self->aa_degeneracy_classes or return;
406	26	50				72	my $aaClass = $degHash->{$deg} or return;
407	26					31	my @codons;
408	26					84	while(my ($aa, $codonRef) = each %$aaClass)
409							{
410	52					147	push @codons, @$codonRef;
411							}
412	26	50				127	return wantarray? @codons : \@codons;
413							}
414
415							# other misc functions
416							#############################################
417							# Other methods used internally
418							#############################################
419
420							# check whether a codon is valid and also not stop codon
421							sub _is_sense_codon
422							{
423	0			0		0	my ($self, $codon) = @_;
424
425	0	0				0	my $codonTable = $self->codon_table() or return;
426	0	0				0	return 0 unless($codonTable->is_valid_codon($codon));
427	0	0				0	return 0 if($codonTable->is_stop_codon($codon));
428
429	0					0	return 1;
430							}
431
432							# check whether a codon is a stop codon
433							sub _is_stop_codon
434							{
435	0			0		0	my ($self, $codon) = @_;
436	0	0				0	my $codonTable = $self->codon_table() or return;
437	0					0	$codonTable->is_stop_codon($codon);
438							}
439
440							# get the corresponding AA of a codon
441							sub _codon_to_aa
442							{
443	0			0		0	my ($self, $codon) = @_;
444
445	0	0				0	my $codonTable = $self->codon_table() or return;
446	0					0	$codonTable->translate($codon);
447							}
448
449							# get all the codons in the sequence, that is, split into nucleotide
450							# triplet
451							sub _catalog_codons
452							{
453	130			130		228	my ($self, $seq) = @_;
454
455	130					271	$seq = $self->_get_seq_str($seq);
456	130					166	my %codons;
457	130					136	my $accuLen = 0;
458	130					198	my $seqLen = length($seq);
459	130	50				369	$self->warn("sequence [$seq] is not multiple of 3 long") unless($seqLen %
460							3 == 0);
461	130					125	my $codon;
462	130					293	while($accuLen + 3 <= $seqLen)
463							{
464	88570					114341	$codon = substr($seq,$accuLen,3);
465	88570					88053	$codons{$codon}++;
466	88570					165545	$accuLen += 3;
467							}
468
469	130	50				284	return undef unless($accuLen);
470	130					694	return \%codons;
471							}
472
473							# get and preprocess sequence
474							# get a seq string or seq object and return sequence string
475							sub _get_seq_str
476							{
477	130			130		186	my ($self, $seq) = @_;
478	130	100				315	$seq = ref($seq)? $seq->seq : $seq;
479	130					979	$seq = uc($seq);
480	130					531	$seq =~ tr/U/T/;
481	130					1373	$seq =~ s/[^A-Z]+//g; # remove all non-nucleotide characters
482
483	130					239	my $len = length($seq);
484	130	50				311	if($len > 0)
485							{
486	130					397	return $seq;
487							}else
488							{
489	0						return undef;
490							}
491							}
492
493							=head1 AUTHOR
494
495							Zhenguo Zhang, C<< >>
496
497							=head1 BUGS
498
499							Please report any bugs or feature requests to C, or through
500							the web interface at L. I will be notified, and then you'll
501							automatically be notified of progress on your bug as I make changes.
502
503							=head1 SUPPORT
504
505							You can find documentation for this module with the perldoc command.
506
507							perldoc Bio::CUA::Summarizer
508
509
510							You can also look for information at:
511
512							=over 4
513
514							=item * RT: CPAN's request tracker (report bugs here)
515
516							L
517
518							=item * AnnoCPAN: Annotated CPAN documentation
519
520							L
521
522							=item * CPAN Ratings
523
524							L
525
526							=item * Search CPAN
527
528							L
529
530							=back
531
532
533							=head1 ACKNOWLEDGEMENTS
534
535
536							=head1 LICENSE AND COPYRIGHT
537
538							Copyright 2015 Zhenguo Zhang.
539
540							This program is free software: you can redistribute it and/or modify
541							it under the terms of the GNU General Public License as published by
542							the Free Software Foundation, either version 3 of the License, or
543							(at your option) any later version.
544
545							This program is distributed in the hope that it will be useful,
546							but WITHOUT ANY WARRANTY; without even the implied warranty of
547							MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
548							GNU General Public License for more details.
549
550							You should have received a copy of the GNU General Public License
551							along with this program. If not, see L.
552
553
554							=cut
555
556							1; # End of Bio::CUA::Summarizer
557