File Coverage

Bio/Tools/SeqWords.pm

Criterion	Covered	Total	%
statement	48	59	81.3
branch	12	28	42.8
condition	4	12	33.3
subroutine	6	6	100.0
pod	3	3	100.0
total	73	108	67.5

line	stmt	bran	cond	sub	pod	time	code
1							#---------------------------------------------------------------------------
2							# PACKAGE : Bio::Tools::SeqWords
3							# PURPOSE : To count n-mers in any sequence of characters
4							# AUTHOR : Derek Gatherer (d.gatherer@vir.gla.ac.uk)
5							# SOURCE :
6							# CREATED : 21st March 2000
7							# MODIFIED : 11th November 2003 (DG - new method, count_overlap_words)
8							# LICENCE : You may distribute this module under the same terms
9							# : as the rest of BioPerl.
10							#---------------------------------------------------------------------------
11
12							=head1 NAME
13
14							Bio::Tools::SeqWords - Object holding n-mer statistics for a sequence
15
16							=head1 SYNOPSIS
17
18							# Create the SeqWords object, e.g.:
19
20							my $inputstream = Bio::SeqIO->new(-file => "seqfile",
21							-format => 'Fasta');
22							my $seqobj = $inputstream->next_seq();
23							my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
24
25							# Or:
26							my $seqobj = Bio::PrimarySeq->new(-seq => "agggtttccc",
27							-alphabet => 'dna',
28							-id => 'test');
29							my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
30
31							# obtain a hash of word counts, eg:
32							my $hash_ref = $seq_stats->count_words($word_length);
33
34							# display hash table, eg:
35							my %hash = %$hash_ref;
36							foreach my $key(sort keys %hash)
37							{
38							print "\n$key\t$hash{$key}";
39							}
40
41							# Or:
42
43							my $hash_ref =
44							Bio::Tools::SeqWords->count_words($seqobj,$word_length);
45
46							=head1 DESCRIPTION
47
48							L is a featherweight object for the calculation
49							of n-mer word occurrences in a single sequence. It is envisaged that
50							the object will be useful for construction of scripts which use n-mer
51							word tables as the raw material for statistical calculations; for
52							instance, hexamer frequency for the calculation of coding protential,
53							or the calculation of periodicity in repetitive DNA. Triplet
54							frequency is already handled by L (author: Peter
55							Schattner).
56
57							There are a few possible applications for protein, e.g. hypothesised
58							amino acid 7-mers in heat shock proteins, or proteins with multiple
59							simple motifs. Sometimes these protein periodicities are best seen
60							when the amino acid alphabet is truncated, e.g. Shulman alphabet.
61							Since there are quite a few of these shortened alphabets, this module
62							does not specify any particular alphabet.
63
64							See Synopsis above for object creation code.
65
66							=head2 Rationale
67
68							Take a sequence object and create an object for the purposes of
69							holding n-mer word statistics about that sequence. The sequence can be
70							nucleic acid or protein.
71
72							In count_words() the words are counted in a non-overlapping manner,
73							ie. in the style of a codon table, but with any word length.
74
75							In count_overlap_words() the words are counted in an overlapping
76							manner.
77
78							For counts on opposite strand (DNA/RNA), a reverse complement method
79							should be performed, and then the count repeated.
80
81							=head1 FEEDBACK
82
83							=head2 Mailing Lists
84
85							User feedback is an integral part of the evolution of this and other
86							Bioperl modules. Send your comments and suggestions preferably to one
87							of the Bioperl mailing lists. Your participation is much appreciated.
88
89							bioperl-l@bioperl.org - General discussion
90							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91
92							=head2 Support
93
94							Please direct usage questions or support issues to the mailing list:
95
96							I
97
98							rather than to the module maintainer directly. Many experienced and
99							reponsive experts will be able look at the problem and quickly
100							address it. Please include a thorough description of the problem
101							with code and data examples if at all possible.
102
103							=head2 Reporting Bugs
104
105							Report bugs to the Bioperl bug tracking system to help us keep track
106							the bugs and their resolution. Bug reports can be submitted via the
107							web:
108
109							https://github.com/bioperl/bioperl-live/issues
110
111							=head1 AUTHOR
112
113							Derek Gatherer, in the loosest sense of the word 'author'. The
114							general shape of the module is lifted directly from the SeqStat module
115							of Peter Schattner. The central subroutine to count the words is
116							adapted from original code provided by Dave Shivak, in response to a
117							query on the bioperl mailing list. At least 2 other people provided
118							alternative means (equally good but not used in the end) of performing
119							the same calculation. Thanks to all for your assistance.
120
121							=head1 CONTRIBUTORS
122
123							Jason Stajich, jason-at-bioperl.org
124
125							=head1 APPENDIX
126
127							The rest of the documentation details each of the object methods.
128							Internal methods are usually preceded with a _
129
130							=cut
131
132							package Bio::Tools::SeqWords;
133	1			1		751	use strict;
	1					1
	1					27
134
135
136	1			1		2	use base qw(Bio::Root::Root);
	1					1
	1					601
137
138							sub new {
139	3			3	1	353	my($class,@args) = @_;
140							# our new standard way of instantiation
141	3					9	my $self = $class->SUPER::new(@args);
142
143	3					9	my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
144	3	0	33			8	if((! defined($seqobj)) && @args && ref($args[0])) {
			33
145							# parameter not passed as named parameter?
146	0					0	$seqobj = $args[0];
147							}
148
149	3	50				15	if(! $seqobj->isa("Bio::PrimarySeqI")) {
150	0					0	$self->throw(ref($self) . " works only on PrimarySeqI objects\n");
151							}
152
153	3					5	$self->{'_seqref'} = $seqobj;
154	3					8	return $self;
155							}
156
157
158							=head2 count_words
159
160							Title : count_words
161							Usage : $word_count = $seq_stats->count_words($word_length)
162							or
163							$word_count = $seq_stats->Bio::Tools::SeqWords->($seqobj,$word_length);
164							Function: Counts non-overlapping words within a string, any alphabet is
165							used
166							Example : a sequence ACCGTCCGT, counted at word length 4, will give the hash
167							{ACCG => 1, TCCG => 1}
168							Returns : Reference to a hash in which keys are words (any length) of the
169							alphabet used and values are number of occurrences of the word
170							in the sequence.
171							Args : Word length as scalar and, reference to sequence object if
172							required
173
174							Throws an exception word length is not a positive integer
175							or if word length is longer than the sequence.
176
177							=cut
178
179							sub count_words
180							{
181	3			3	1	6	my ($self,$seqobj,$word_length) = @_;
182
183							# check how we were called, and if necessary rearrange arguments
184	3	50				4	if(ref($seqobj)) {
185							# call as SeqWords->count_words($seq, $wordlen)
186	0	0				0	if(! $seqobj->isa("Bio::PrimarySeqI")) {
187	0					0	$self->throw("SeqWords works only on PrimarySeqI objects\n");
188							}
189							} else {
190							# call as $obj->count_words($wordlen)
191	3					3	$word_length = $seqobj;
192	3					3	$seqobj = undef;
193							}
194
195	3	50				6	if(! defined($seqobj)){
196	3					4	$seqobj = $self->{'_seqref'};
197							}
198
199	3	50	33			24	if($word_length eq "" \|\| $word_length =~ /[a-z]/i){
		50	33
200	0					0	$self->throw("SeqWords cannot accept non-numeric characters".
201							" or a null value in the \$word_length variable\n");
202							}elsif ($word_length <1 \|\| ($word_length - int($word_length)) >0){
203	0					0	$self->throw("SeqWords requires the word length to be a ".
204							"positive integer\n");
205							}
206
207	3					6	my $seqstring = uc $seqobj->seq();
208
209	3	50				6	if($word_length > length($seqstring)){
210	0					0	$self->throw("die in _count, \$word_length is bigger ".
211							"than sequence length\n");
212							}
213
214	3					3	my $type = "non-overlap";
215	3					4	my $words = _count($seqobj, $word_length, $type);
216	3					11	return $words; # ref. to a hash
217							}
218
219							=head2 count_overlap_words
220
221							Title : count_overlap_words
222							Usage : $word_count = $word_obj->count_overlap_words($word_length);
223							Function: Counts overlapping words within a string, any alphabet is used
224							Example : A sequence ACCAACCA, counted at word length 4, will give the hash
225							{ACCA=>2, CCAA=>1, CAAC=>1, AACC=>1}
226							Returns : Reference to a hash in which keys are words (any length) of the
227							alphabet used and values are number of occurrences of the word in
228							the sequence.
229							Args : Word length as scalar
230
231							Throws an exception if word length is not a positive integer
232							or if word length is longer than the sequence.
233
234							=cut
235
236							sub count_overlap_words
237							{
238	2			2	1	1959	my ($self,$seqobj,$word_length) = @_;
239							# check how we were called, and if necessary rearrange arguments
240	2	50				4	if(ref($seqobj)){
241							# call as SeqWords->count_words($seq, $wordlen)
242	0	0				0	if(! $seqobj->isa("Bio::PrimarySeqI")){
243	0					0	$self->throw("SeqWords works only on PrimarySeqI objects\n");
244							}
245							}else{
246							# call as $obj->count_words($wordlen)
247	2					2	$word_length = $seqobj;
248	2					3	$seqobj = undef;
249							}
250
251	2	50				4	if(! defined($seqobj)) {
252	2					3	$seqobj = $self->{'_seqref'};
253							}
254	2					15	my $seqstring = uc $seqobj->seq();
255
256	2	50				6	if($word_length > length($seqstring)){
257	0					0	$self->throw("die in _count, \$word_length is bigger ".
258							"than sequence length\n");
259							}
260
261	2					2	my $type = "overlap";
262	2					4	my $words = _count($seqobj, $word_length, $type);
263	2					4	return $words; # ref. to a hash
264							}
265
266							# the actual counting routine
267							# used by both count_words and count_overlap_words
268							sub _count {
269	5			5		6	my ($seqobj, $word_length, $type) = @_;
270	5					6	my %codon = ();
271
272							# now the real business
273							# JS - remove DNA assumption
274
275	5					7	my $seqstring = uc $seqobj->seq();
276	5	100				10	if($type eq "non-overlap")
		50
277							{
278	3					34	while($seqstring =~ /((\w){$word_length})/gim){
279	66					165	$codon{uc($1)}++;
280							}
281							} elsif($type eq "overlap"){
282	2					3	my $seqlen = $seqobj->length(); # measure length
283	2					5	for (my $frame = 1; $frame <= $word_length; $frame++) {
284							# run through frames
285	8					12	my $seqstring = uc($seqobj->subseq($frame,$seqlen));
286							# take the relevant substring
287	8					40	while($seqstring =~ /((\w){$word_length})/gim){
288	49					154	$codon{uc($1)}++; # keep adding to hash
289							}
290							}
291							} else {
292	0					0	Bio::Root::Root->throw("\nSomething badly wrong here. \$type: $type can only be overlap or non-overlap");
293							}
294	5					9	return \%codon;
295							}
296
297							1;