File Coverage

Bio/Tools/SeqWords.pm
Criterion Covered Total %
statement 48 59 81.3
branch 12 28 42.8
condition 4 12 33.3
subroutine 6 6 100.0
pod 3 3 100.0
total 73 108 67.5


line stmt bran cond sub pod time code
1             #---------------------------------------------------------------------------
2             # PACKAGE : Bio::Tools::SeqWords
3             # PURPOSE : To count n-mers in any sequence of characters
4             # AUTHOR : Derek Gatherer (d.gatherer@vir.gla.ac.uk)
5             # SOURCE :
6             # CREATED : 21st March 2000
7             # MODIFIED : 11th November 2003 (DG - new method, count_overlap_words)
8             # LICENCE : You may distribute this module under the same terms
9             # : as the rest of BioPerl.
10             #---------------------------------------------------------------------------
11              
12             =head1 NAME
13              
14             Bio::Tools::SeqWords - Object holding n-mer statistics for a sequence
15              
16             =head1 SYNOPSIS
17              
18             # Create the SeqWords object, e.g.:
19              
20             my $inputstream = Bio::SeqIO->new(-file => "seqfile",
21             -format => 'Fasta');
22             my $seqobj = $inputstream->next_seq();
23             my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
24              
25             # Or:
26             my $seqobj = Bio::PrimarySeq->new(-seq => "agggtttccc",
27             -alphabet => 'dna',
28             -id => 'test');
29             my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
30              
31             # obtain a hash of word counts, eg:
32             my $hash_ref = $seq_stats->count_words($word_length);
33              
34             # display hash table, eg:
35             my %hash = %$hash_ref;
36             foreach my $key(sort keys %hash)
37             {
38             print "\n$key\t$hash{$key}";
39             }
40              
41             # Or:
42              
43             my $hash_ref =
44             Bio::Tools::SeqWords->count_words($seqobj,$word_length);
45              
46             =head1 DESCRIPTION
47              
48             L is a featherweight object for the calculation
49             of n-mer word occurrences in a single sequence. It is envisaged that
50             the object will be useful for construction of scripts which use n-mer
51             word tables as the raw material for statistical calculations; for
52             instance, hexamer frequency for the calculation of coding protential,
53             or the calculation of periodicity in repetitive DNA. Triplet
54             frequency is already handled by L (author: Peter
55             Schattner).
56              
57             There are a few possible applications for protein, e.g. hypothesised
58             amino acid 7-mers in heat shock proteins, or proteins with multiple
59             simple motifs. Sometimes these protein periodicities are best seen
60             when the amino acid alphabet is truncated, e.g. Shulman alphabet.
61             Since there are quite a few of these shortened alphabets, this module
62             does not specify any particular alphabet.
63              
64             See Synopsis above for object creation code.
65              
66             =head2 Rationale
67              
68             Take a sequence object and create an object for the purposes of
69             holding n-mer word statistics about that sequence. The sequence can be
70             nucleic acid or protein.
71              
72             In count_words() the words are counted in a non-overlapping manner,
73             ie. in the style of a codon table, but with any word length.
74              
75             In count_overlap_words() the words are counted in an overlapping
76             manner.
77              
78             For counts on opposite strand (DNA/RNA), a reverse complement method
79             should be performed, and then the count repeated.
80              
81             =head1 FEEDBACK
82              
83             =head2 Mailing Lists
84              
85             User feedback is an integral part of the evolution of this and other
86             Bioperl modules. Send your comments and suggestions preferably to one
87             of the Bioperl mailing lists. Your participation is much appreciated.
88              
89             bioperl-l@bioperl.org - General discussion
90             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
91              
92             =head2 Support
93              
94             Please direct usage questions or support issues to the mailing list:
95              
96             I
97              
98             rather than to the module maintainer directly. Many experienced and
99             reponsive experts will be able look at the problem and quickly
100             address it. Please include a thorough description of the problem
101             with code and data examples if at all possible.
102              
103             =head2 Reporting Bugs
104              
105             Report bugs to the Bioperl bug tracking system to help us keep track
106             the bugs and their resolution. Bug reports can be submitted via the
107             web:
108              
109             https://github.com/bioperl/bioperl-live/issues
110              
111             =head1 AUTHOR
112              
113             Derek Gatherer, in the loosest sense of the word 'author'. The
114             general shape of the module is lifted directly from the SeqStat module
115             of Peter Schattner. The central subroutine to count the words is
116             adapted from original code provided by Dave Shivak, in response to a
117             query on the bioperl mailing list. At least 2 other people provided
118             alternative means (equally good but not used in the end) of performing
119             the same calculation. Thanks to all for your assistance.
120              
121             =head1 CONTRIBUTORS
122              
123             Jason Stajich, jason-at-bioperl.org
124              
125             =head1 APPENDIX
126              
127             The rest of the documentation details each of the object methods.
128             Internal methods are usually preceded with a _
129              
130             =cut
131              
132             package Bio::Tools::SeqWords;
133 1     1   1002 use strict;
  1         2  
  1         27  
134              
135              
136 1     1   6 use base qw(Bio::Root::Root);
  1         2  
  1         571  
137              
138             sub new {
139 3     3 1 650 my($class,@args) = @_;
140             # our new standard way of instantiation
141 3         20 my $self = $class->SUPER::new(@args);
142              
143 3         15 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
144 3 0 33     12 if((! defined($seqobj)) && @args && ref($args[0])) {
      33        
145             # parameter not passed as named parameter?
146 0         0 $seqobj = $args[0];
147             }
148            
149 3 50       24 if(! $seqobj->isa("Bio::PrimarySeqI")) {
150 0         0 $self->throw(ref($self) . " works only on PrimarySeqI objects\n");
151             }
152            
153 3         7 $self->{'_seqref'} = $seqobj;
154 3         16 return $self;
155             }
156              
157              
158             =head2 count_words
159              
160             Title : count_words
161             Usage : $word_count = $seq_stats->count_words($word_length)
162             or
163             $word_count = $seq_stats->Bio::Tools::SeqWords->($seqobj,$word_length);
164             Function: Counts non-overlapping words within a string, any alphabet is
165             used
166             Example : a sequence ACCGTCCGT, counted at word length 4, will give the hash
167             {ACCG => 1, TCCG => 1}
168             Returns : Reference to a hash in which keys are words (any length) of the
169             alphabet used and values are number of occurrences of the word
170             in the sequence.
171             Args : Word length as scalar and, reference to sequence object if
172             required
173              
174             Throws an exception word length is not a positive integer
175             or if word length is longer than the sequence.
176              
177             =cut
178              
179             sub count_words
180             {
181 3     3 1 10 my ($self,$seqobj,$word_length) = @_;
182              
183             # check how we were called, and if necessary rearrange arguments
184 3 50       7 if(ref($seqobj)) {
185             # call as SeqWords->count_words($seq, $wordlen)
186 0 0       0 if(! $seqobj->isa("Bio::PrimarySeqI")) {
187 0         0 $self->throw("SeqWords works only on PrimarySeqI objects\n");
188             }
189             } else {
190             # call as $obj->count_words($wordlen)
191 3         5 $word_length = $seqobj;
192 3         5 $seqobj = undef;
193             }
194              
195 3 50       8 if(! defined($seqobj)){
196 3         5 $seqobj = $self->{'_seqref'};
197             }
198            
199 3 50 33     28 if($word_length eq "" || $word_length =~ /[a-z]/i){
    50 33        
200 0         0 $self->throw("SeqWords cannot accept non-numeric characters".
201             " or a null value in the \$word_length variable\n");
202             }elsif ($word_length <1 || ($word_length - int($word_length)) >0){
203 0         0 $self->throw("SeqWords requires the word length to be a ".
204             "positive integer\n");
205             }
206              
207 3         11 my $seqstring = uc $seqobj->seq();
208              
209 3 50       9 if($word_length > length($seqstring)){
210 0         0 $self->throw("die in _count, \$word_length is bigger ".
211             "than sequence length\n");
212             }
213              
214 3         4 my $type = "non-overlap";
215 3         9 my $words = _count($seqobj, $word_length, $type);
216 3         10 return $words; # ref. to a hash
217             }
218              
219             =head2 count_overlap_words
220              
221             Title : count_overlap_words
222             Usage : $word_count = $word_obj->count_overlap_words($word_length);
223             Function: Counts overlapping words within a string, any alphabet is used
224             Example : A sequence ACCAACCA, counted at word length 4, will give the hash
225             {ACCA=>2, CCAA=>1, CAAC=>1, AACC=>1}
226             Returns : Reference to a hash in which keys are words (any length) of the
227             alphabet used and values are number of occurrences of the word in
228             the sequence.
229             Args : Word length as scalar
230              
231             Throws an exception if word length is not a positive integer
232             or if word length is longer than the sequence.
233              
234             =cut
235              
236             sub count_overlap_words
237             {
238 2     2 1 3015 my ($self,$seqobj,$word_length) = @_;
239             # check how we were called, and if necessary rearrange arguments
240 2 50       7 if(ref($seqobj)){
241             # call as SeqWords->count_words($seq, $wordlen)
242 0 0       0 if(! $seqobj->isa("Bio::PrimarySeqI")){
243 0         0 $self->throw("SeqWords works only on PrimarySeqI objects\n");
244             }
245             }else{
246             # call as $obj->count_words($wordlen)
247 2         4 $word_length = $seqobj;
248 2         2 $seqobj = undef;
249             }
250              
251 2 50       7 if(! defined($seqobj)) {
252 2         4 $seqobj = $self->{'_seqref'};
253             }
254 2         25 my $seqstring = uc $seqobj->seq();
255              
256 2 50       7 if($word_length > length($seqstring)){
257 0         0 $self->throw("die in _count, \$word_length is bigger ".
258             "than sequence length\n");
259             }
260            
261 2         4 my $type = "overlap";
262 2         4 my $words = _count($seqobj, $word_length, $type);
263 2         8 return $words; # ref. to a hash
264             }
265              
266             # the actual counting routine
267             # used by both count_words and count_overlap_words
268             sub _count {
269 5     5   10 my ($seqobj, $word_length, $type) = @_;
270 5         8 my %codon = ();
271              
272             # now the real business
273             # JS - remove DNA assumption
274              
275 5         10 my $seqstring = uc $seqobj->seq();
276 5 100       14 if($type eq "non-overlap")
    50          
277             {
278 3         56 while($seqstring =~ /((\w){$word_length})/gim){
279 66         250 $codon{uc($1)}++;
280             }
281             } elsif($type eq "overlap"){
282 2         8 my $seqlen = $seqobj->length(); # measure length
283 2         7 for (my $frame = 1; $frame <= $word_length; $frame++) {
284             # run through frames
285 8         18 my $seqstring = uc($seqobj->subseq($frame,$seqlen));
286             # take the relevant substring
287 8         45 while($seqstring =~ /((\w){$word_length})/gim){
288 49         176 $codon{uc($1)}++; # keep adding to hash
289             }
290             }
291             } else {
292 0         0 Bio::Root::Root->throw("\nSomething badly wrong here. \$type: $type can only be overlap or non-overlap");
293             }
294 5         10 return \%codon;
295             }
296              
297             1;