File Coverage

Bio/PopGen/Utilities.pm
Criterion Covered Total %
statement 66 75 88.0
branch 10 18 55.5
condition 8 15 53.3
subroutine 7 7 100.0
pod 1 1 100.0
total 92 116 79.3


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::PopGen::Utilities
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PopGen::Utilities - Utilities for working with PopGen data and objects
17              
18             =head1 SYNOPSIS
19              
20             use Bio::PopGen::Utilities;
21             use Bio::AlignIO;
22              
23             my $in = Bio::AlignIO->new(-file => 't/data/t7.aln',
24             -format => 'clustalw');
25             my $aln = $in->next_aln;
26             # get a population, each sequence is an individual and
27             # for the default case, every site which is not monomorphic
28             # is a 'marker'. Each individual will have a 'genotype' for the
29             # site which will be the specific base in the alignment at that
30             # site
31             my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
32              
33             # get the synonymous sites from the alignemt only as the 'genotypes'
34             # for the population
35             my $synpop = Bio::PopGen::Utilities->aln_to_population(-site_model => 'cod',
36             -alignment => $aln);
37              
38              
39             =head1 DESCRIPTION
40              
41             This object provides some convience function to turn sequence
42             alignments into usable objects for the Population genetics modules
43             (Bio::PopGen).
44              
45             =head1 FEEDBACK
46              
47             =head2 Mailing Lists
48              
49             User feedback is an integral part of the evolution of this and other
50             Bioperl modules. Send your comments and suggestions preferably to
51             the Bioperl mailing list. Your participation is much appreciated.
52              
53             bioperl-l@bioperl.org - General discussion
54             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55              
56             =head2 Support
57              
58             Please direct usage questions or support issues to the mailing list:
59              
60             I
61              
62             rather than to the module maintainer directly. Many experienced and
63             reponsive experts will be able look at the problem and quickly
64             address it. Please include a thorough description of the problem
65             with code and data examples if at all possible.
66              
67             =head2 Reporting Bugs
68              
69             Report bugs to the Bioperl bug tracking system to help us keep track
70             of the bugs and their resolution. Bug reports can be submitted via
71             the web:
72              
73             https://github.com/bioperl/bioperl-live/issues
74              
75             =head1 AUTHOR - Jason Stajich
76              
77             Email jason-at-bioperl-dot-org
78              
79             =head1 APPENDIX
80              
81             The rest of the documentation details each of the object methods.
82             Internal methods are usually preceded with a _
83              
84             =cut
85              
86              
87             # Let the code begin...
88              
89              
90             package Bio::PopGen::Utilities;
91 2     2   1850 use strict;
  2         4  
  2         53  
92              
93 2     2   871 use Bio::Align::DNAStatistics;
  2         4  
  2         159  
94 2     2   631 use Bio::PopGen::Population;
  2         3  
  2         48  
95 2     2   306 use Bio::PopGen::Individual;
  2         3  
  2         46  
96              
97 2     2   9 use base qw(Bio::Root::Root);
  2         2  
  2         178  
98 2     2   11 use constant CodonLen => 3;
  2         3  
  2         1251  
99              
100              
101             =head2 aln_to_population
102              
103             Title : aln_to_population
104             Usage : my $pop = Bio::PopGen::Utilities->aln_to_population($aln);
105             Function: Turn and alignment into a set of L
106             objects grouped in a L object
107              
108             Sites are treated as 'Markers' in the Bioperl PopGen object
109             model in the sense that a site is a unique location for which
110             an individual will have a genotype (a set of alleles).
111             In this implementation we are assuming that each individual
112             has a single entry in the alignment file.
113              
114             Specify a site model as one of those listed
115             'all' -- every base in the alignment is considered a site
116             'cod' -- codons
117              
118             The option -site_model
119             for All sites : 'all'
120             Codon sites : 'cod' or 'codon'
121              
122             To see all sites, including those which are fixed in the population
123             add -include_monomorphic => 1
124             to the arguments
125             Returns :
126             Args : -include_monomorphic => 1 to specify all sites,
127             even those which are monomorphic
128             in the population
129             (useful for HKA test mostly)
130             [default is false]
131             -phase => specify a phase for the data, this is only
132             used if the site_mode is codon
133             [default is 0]
134             -site_model => one-of 'all', 'codon'
135             to specify a site model for the data extraction
136             from the alignment
137             [default is all]
138             -alignment => provide a L object [required]
139              
140             =cut
141              
142             sub aln_to_population{
143 3     3 1 350 my ($self,@args) = @_;
144 3         23 my ($aln,
145             $sitemodel,$phase,
146             $includefixed,$checkisa) = $self->_rearrange([qw(ALIGNMENT
147             SITE_MODEL
148             PHASE
149             INCLUDE_MONOMORPHIC
150             CHECKISA)],
151             @args);
152              
153 3         50 my %ambig_code = ('?' => ['?','?'],
154             'N' => ['?','?'],
155             '-' => ['?','?'],
156             'G' => ['G','G'],
157             'A' => ['A','A'],
158             'T' => ['T','T'],
159             'C' => ['C','C'],
160             'R' => ['A','G'],
161             'Y' => ['C','T'],
162             'W' => ['T','A'],
163             'M' => ['C','A'],
164             'S' => ['C','G'],
165             'K' => ['G','T']);
166            
167 3 50       12 if( ! defined $aln ) {
168 0         0 $self->warn("Must provide a valid Bio::SimpleAlign object to run aln_to_population");
169 0         0 return;
170             }
171              
172 3 50       13 if( ! $aln->is_flush ) {
173 0         0 $self->warn("Must provide a Bio::SimpleAlign object with aligned sequences to aln_to_population!");
174 0         0 return;
175             }
176 3 50       9 $phase = 0 unless defined $phase;
177 3 0 33     19 if( $phase != 0 && $phase != 1 && $phase != 2 ) {
      33        
178 0         0 warn("phase must be 0,1, or 2");
179 0         0 return;
180             }
181 3         10 my $alength = $aln->length;
182 3         5 my @inds;
183 3 100 66     29 if( ! defined $sitemodel || $sitemodel =~ /all/i ) {
    50          
184 1         2 my $ct = 0;
185 1         1 my @seqs;
186 1         2 for my $seq ( $aln->each_seq ) {
187 9         13 push @seqs, $seq->seq;
188 9         17 push @inds, Bio::PopGen::Individual->new(-unique_id => $seq->display_id);
189             }
190              
191 1         6 for( my $i = 0; $i < $alength; $i++ ) {
192 1662         957 my (@genotypes,%set);
193            
194             # do we skip indels?
195             # slicing vertically
196 1662         1294 for my $seq ( @seqs ) {
197 14958         9868 my $site = uc(substr($seq,$i,1));
198 14958         9537 push @genotypes, $ambig_code{$site};
199 14958         10367 $set{$site}++;
200             }
201 1662 100 66     5195 if( keys %set > 1 || $includefixed ) {
202 181         141 my $genoct = scalar @genotypes;
203 181         234 for( my $j = 0; $j < $genoct; $j++ ) {
204 1629         3659 $inds[$j]->add_Genotype(Bio::PopGen::Genotype->new
205             (-marker_name => ($i+1),
206             -individual_id=> $inds[$j]->unique_id,
207             -alleles => $genotypes[$j]));
208             }
209             }
210             }
211             } elsif( $sitemodel =~ /cod(on)?/i ) {
212 2         3 my $ct = 0;
213 2         1 my @seqs;
214 2         5 for my $seq ( $aln->each_seq ) {
215 13         18 push @seqs, $seq->seq;
216 13         27 push @inds, Bio::PopGen::Individual->new(-unique_id => $seq->display_id);
217             }
218 2         5 my $codonct = 0;
219 2         9 for( my $i = $phase; $i < $alength; $i += CodonLen ) {
220 812         553 my (@genotypes,%set,$genoct);
221            
222 812         874 for my $seq ( @seqs ) {
223 5250         3044 my @unambig_site;
224 5250         4457 my $site = uc(substr($seq,$i,CodonLen));
225 5250 50       5998 if( length($site) < CodonLen ) {
226             # at end of alignment and this is not in phase
227 0         0 $self->debug("phase was $phase, but got to end of alignment with overhang of $site");
228 0         0 next;
229             }
230             # do we check for gaps/indels here?
231 5250         6084 for (my $pos=0; $pos
232             {
233 15750         14090 $unambig_site[0] .= $ambig_code{substr($site, $pos, 1)}[0];
234 15750         21179 $unambig_site[1] .= $ambig_code{substr($site, $pos, 1)}[1];
235             }
236 5250         5630 push @genotypes, [@unambig_site];
237 5250         6426 $set{$site}++;
238             }
239 812         642 $genoct = scalar @genotypes;
240            
241             # do we include fixed sites? I think we should leave it to the user.
242 812 50 66     1598 if( keys %set > 1 || $includefixed ) {
243 812         1150 for( my $j = 0; $j < $genoct; $j++ ) {
244 5250         14253 $inds[$j]->add_Genotype(Bio::PopGen::Genotype->new
245             (-marker_name => ($i/CodonLen),
246             -individual_id=> $inds[$j]->unique_id,
247             -alleles => $genotypes[$j]));
248             }
249 812         3559 $codonct++;
250             }
251             }
252             } else {
253 0         0 $self->throw("Can only build sites based on all the data right now!");
254             }
255 3         55 return Bio::PopGen::Population->new(-checkisa => 0,
256             -source => 'alignment',
257             -individuals=> \@inds);
258             }
259              
260             1;