| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::Align::Utilities; |
|
2
|
3
|
|
|
3
|
|
1571
|
use strict; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
70
|
|
|
3
|
3
|
|
|
3
|
|
9
|
use warnings; |
|
|
3
|
|
|
|
|
2
|
|
|
|
3
|
|
|
|
|
62
|
|
|
4
|
3
|
|
|
3
|
|
10
|
use Carp; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
140
|
|
|
5
|
3
|
|
|
3
|
|
428
|
use Bio::Root::Version; |
|
|
3
|
|
|
|
|
21
|
|
|
|
3
|
|
|
|
|
16
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
3
|
|
|
3
|
|
76
|
use Exporter 'import'; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
190
|
|
|
8
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
|
9
|
|
|
|
|
|
|
aa_to_dna_aln |
|
10
|
|
|
|
|
|
|
bootstrap_replicates |
|
11
|
|
|
|
|
|
|
cat |
|
12
|
|
|
|
|
|
|
bootstrap_replicates_codons |
|
13
|
|
|
|
|
|
|
dna_to_aa_aln |
|
14
|
|
|
|
|
|
|
most_common_sequences |
|
15
|
|
|
|
|
|
|
); |
|
16
|
|
|
|
|
|
|
our %EXPORT_TAGS = (all => \@EXPORT_OK); |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# |
|
19
|
|
|
|
|
|
|
# BioPerl module for Bio::Align::Utilities |
|
20
|
|
|
|
|
|
|
# |
|
21
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
22
|
|
|
|
|
|
|
# |
|
23
|
|
|
|
|
|
|
# Cared for by Jason Stajich and Brian Osborne |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# Copyright Jason Stajich |
|
26
|
|
|
|
|
|
|
# |
|
27
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 NAME |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Bio::Align::Utilities - A collection of utilities regarding converting |
|
34
|
|
|
|
|
|
|
and manipulating alignment objects |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
use Bio::Align::Utilities qw(:all); |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Even if the protein alignments are local make sure the start/end |
|
41
|
|
|
|
|
|
|
# stored in the LocatableSeq objects are to the full length protein. |
|
42
|
|
|
|
|
|
|
# The coding sequence that is passed in should still be the full |
|
43
|
|
|
|
|
|
|
# length CDS as the nt alignment will be generated. |
|
44
|
|
|
|
|
|
|
# %dnaseqs is a hash of CDS sequences (spliced) |
|
45
|
|
|
|
|
|
|
my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs); |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# The reverse, which is simpler. The input alignment has to be |
|
48
|
|
|
|
|
|
|
# translate-able, with gap lengths and an overall length divisible by 3 |
|
49
|
|
|
|
|
|
|
my $aa_aln = dna_to_aa_aln($dna_al); |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Generate bootstraps |
|
52
|
|
|
|
|
|
|
my $replicates = bootstrap_replicates($aln,$count); |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
This module contains utility methods for manipulating sequence |
|
57
|
|
|
|
|
|
|
alignments (L) objects. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The B utility is essentially the same as the B |
|
60
|
|
|
|
|
|
|
program by Bill Pearson available at |
|
61
|
|
|
|
|
|
|
ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this |
|
62
|
|
|
|
|
|
|
is a pure-Perl implementation, but just to mention that if anything |
|
63
|
|
|
|
|
|
|
seems odd you can check the alignments generated against Bill's |
|
64
|
|
|
|
|
|
|
program. |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
71
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
|
72
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
75
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 Support |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
I |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
84
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
85
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
86
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
91
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via the |
|
92
|
|
|
|
|
|
|
web: |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 AUTHOR - Jason Stajich |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Email jason-at-bioperl-dot-org |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 APPENDIX |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
|
103
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
|
106
|
|
|
|
|
|
|
|
|
107
|
3
|
|
|
3
|
|
10
|
use constant CODONSIZE => 3; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
4714
|
|
|
108
|
|
|
|
|
|
|
our $GAP = '-'; |
|
109
|
|
|
|
|
|
|
our $CODONGAP = $GAP x CODONSIZE; |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 aa_to_dna_aln |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
Title : aa_to_dna_aln |
|
114
|
|
|
|
|
|
|
Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs); |
|
115
|
|
|
|
|
|
|
Function: Will convert an AA alignment to DNA space given the |
|
116
|
|
|
|
|
|
|
corresponding DNA sequences. Note that this method expects |
|
117
|
|
|
|
|
|
|
the DNA sequences to be in frame +1 (GFF frame 0) as it will |
|
118
|
|
|
|
|
|
|
start to project into coordinates starting at the first base of |
|
119
|
|
|
|
|
|
|
the DNA sequence, if this alignment represents a different |
|
120
|
|
|
|
|
|
|
frame for the cDNA you will need to edit the DNA sequences |
|
121
|
|
|
|
|
|
|
to remove the 1st or 2nd bases (and revcom if things should be). |
|
122
|
|
|
|
|
|
|
Returns : Bio::Align::AlignI object |
|
123
|
|
|
|
|
|
|
Args : 2 arguments, the alignment and a hashref. |
|
124
|
|
|
|
|
|
|
Alignment is a Bio::Align::AlignI of amino acid sequences. |
|
125
|
|
|
|
|
|
|
The hash reference should have keys which are |
|
126
|
|
|
|
|
|
|
the display_ids for the aa |
|
127
|
|
|
|
|
|
|
sequences in the alignment and the values are a |
|
128
|
|
|
|
|
|
|
Bio::PrimarySeqI object for the corresponding |
|
129
|
|
|
|
|
|
|
spliced cDNA sequence. |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
See also: L, L, L |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=cut |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub aa_to_dna_aln { |
|
136
|
2
|
|
|
2
|
1
|
11
|
my ( $aln, $dnaseqs ) = @_; |
|
137
|
2
|
50
|
33
|
|
|
23
|
unless ( defined $aln |
|
|
|
|
33
|
|
|
|
|
|
138
|
|
|
|
|
|
|
&& ref($aln) |
|
139
|
|
|
|
|
|
|
&& $aln->isa('Bio::Align::AlignI') ) |
|
140
|
|
|
|
|
|
|
{ |
|
141
|
0
|
|
|
|
|
0
|
croak( |
|
142
|
|
|
|
|
|
|
'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature' |
|
143
|
|
|
|
|
|
|
); |
|
144
|
|
|
|
|
|
|
} |
|
145
|
2
|
|
|
|
|
8
|
my $alnlen = $aln->length; |
|
146
|
2
|
|
|
|
|
10
|
my $dnaalign = Bio::SimpleAlign->new(); |
|
147
|
2
|
|
|
|
|
8
|
$aln->map_chars( '\.', $GAP ); |
|
148
|
|
|
|
|
|
|
|
|
149
|
2
|
|
|
|
|
5
|
foreach my $seq ( $aln->each_seq ) { |
|
150
|
17
|
|
|
|
|
27
|
my $aa_seqstr = $seq->seq(); |
|
151
|
17
|
|
|
|
|
25
|
my $pepid = $seq->display_id; |
|
152
|
17
|
|
33
|
|
|
37
|
my $dnaseq = $dnaseqs->{$pepid} || $aln->throw( "cannot find " . $seq->display_id ); |
|
153
|
17
|
|
|
|
|
23
|
my $start_offset = ( $seq->start - 1 ) * CODONSIZE; |
|
154
|
17
|
|
|
|
|
50
|
$dnaseq = $dnaseq->seq(); |
|
155
|
17
|
|
|
|
|
30
|
my $dnalen = $dnaseqs->{$pepid}->length; |
|
156
|
17
|
|
33
|
|
|
27
|
my $dnaid = $dnaseqs->{$pepid}->display_id || $pepid; # try to use DNAseq obj ID (issue #137) |
|
157
|
17
|
|
|
|
|
14
|
my $nt_seqstr; |
|
158
|
17
|
|
|
|
|
15
|
my $j = 0; |
|
159
|
17
|
|
|
|
|
29
|
for ( my $i = 0 ; $i < $alnlen ; $i++ ) { |
|
160
|
5552
|
|
|
|
|
3956
|
my $char = substr( $aa_seqstr, $i + $start_offset, 1 ); |
|
161
|
5552
|
100
|
66
|
|
|
10811
|
if ( $char eq $GAP || $j >= $dnalen ) { |
|
162
|
765
|
|
|
|
|
902
|
$nt_seqstr .= $CODONGAP; |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
else { |
|
165
|
4787
|
|
|
|
|
3149
|
$nt_seqstr .= substr( $dnaseq, $j, CODONSIZE ); |
|
166
|
4787
|
|
|
|
|
5937
|
$j += CODONSIZE; |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
} |
|
169
|
17
|
|
|
|
|
33
|
$nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) ); |
|
170
|
|
|
|
|
|
|
|
|
171
|
17
|
|
|
|
|
80
|
my $newdna = Bio::LocatableSeq->new( |
|
172
|
|
|
|
|
|
|
-display_id => $dnaid, |
|
173
|
|
|
|
|
|
|
-alphabet => 'dna', |
|
174
|
|
|
|
|
|
|
-start => $start_offset + 1, |
|
175
|
|
|
|
|
|
|
-end => ( $seq->end * CODONSIZE ), |
|
176
|
|
|
|
|
|
|
-strand => 1, |
|
177
|
|
|
|
|
|
|
-seq => $nt_seqstr |
|
178
|
|
|
|
|
|
|
); |
|
179
|
17
|
|
|
|
|
48
|
$dnaalign->add_seq($newdna); |
|
180
|
|
|
|
|
|
|
} |
|
181
|
2
|
|
|
|
|
10
|
return $dnaalign; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head2 dna_to_aa_aln |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Title : dna_to_aa_aln |
|
187
|
|
|
|
|
|
|
Usage : my $aa_aln = dna_to_aa_aln($dna_aln); |
|
188
|
|
|
|
|
|
|
Function: Convert a DNA alignment to an amino acid alignment where |
|
189
|
|
|
|
|
|
|
the length of all alignment strings and the lengths of any |
|
190
|
|
|
|
|
|
|
gaps must be divisible by 3 |
|
191
|
|
|
|
|
|
|
Returns : Bio::Align::AlignI object |
|
192
|
|
|
|
|
|
|
Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
See also: L, L, L |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=cut |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub dna_to_aa_aln { |
|
199
|
1
|
|
|
1
|
1
|
7
|
my $dna_aln = shift; |
|
200
|
1
|
50
|
33
|
|
|
12
|
unless ( defined $dna_aln |
|
|
|
|
33
|
|
|
|
|
|
201
|
|
|
|
|
|
|
&& ref($dna_aln) |
|
202
|
|
|
|
|
|
|
&& $dna_aln->isa('Bio::Align::AlignI') ) { |
|
203
|
0
|
|
|
|
|
0
|
croak( |
|
204
|
|
|
|
|
|
|
'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln' |
|
205
|
|
|
|
|
|
|
); |
|
206
|
|
|
|
|
|
|
} |
|
207
|
1
|
|
|
|
|
3
|
my $codon_table = Bio::Tools::CodonTable->new; |
|
208
|
1
|
|
|
|
|
4
|
my $aa_aln = Bio::SimpleAlign->new; |
|
209
|
|
|
|
|
|
|
|
|
210
|
1
|
|
|
|
|
3
|
for my $seq ( $dna_aln->each_seq ) { |
|
211
|
14
|
|
|
|
|
10
|
my ($aa_str, $aa_len); |
|
212
|
14
|
|
|
|
|
27
|
my @aln_str = split '', $seq->seq; |
|
213
|
14
|
50
|
|
|
|
26
|
croak("All lines in the alignment must have lengths divisible by 3") |
|
214
|
|
|
|
|
|
|
if ( scalar(@aln_str) % CODONSIZE ); |
|
215
|
|
|
|
|
|
|
|
|
216
|
14
|
|
|
|
|
22
|
while ( @aln_str ) { |
|
217
|
5516
|
|
|
|
|
5882
|
my $triplet = join '', (splice( @aln_str, 0, CODONSIZE )); |
|
218
|
|
|
|
|
|
|
|
|
219
|
5516
|
100
|
|
|
|
10466
|
if ( $triplet =~ /^[GATC]+$/i ) { |
|
|
|
50
|
|
|
|
|
|
|
220
|
4754
|
|
|
|
|
5831
|
$aa_str .= $codon_table->translate($triplet); |
|
221
|
4754
|
|
|
|
|
6760
|
$aa_len++; |
|
222
|
|
|
|
|
|
|
} |
|
223
|
|
|
|
|
|
|
elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) { |
|
224
|
762
|
|
|
|
|
1133
|
$aa_str .= $GAP; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
else { |
|
227
|
0
|
|
|
|
|
0
|
croak("The triplet '$triplet' is neither a valid codon nor all gaps"); |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
} |
|
230
|
14
|
|
|
|
|
63
|
my $new_aa = Bio::LocatableSeq->new( |
|
231
|
|
|
|
|
|
|
-display_id => $seq->display_id, |
|
232
|
|
|
|
|
|
|
-alphabet => 'protein', |
|
233
|
|
|
|
|
|
|
-start => 1, |
|
234
|
|
|
|
|
|
|
-end => $aa_len, |
|
235
|
|
|
|
|
|
|
-strand => 1, |
|
236
|
|
|
|
|
|
|
-seq => $aa_str |
|
237
|
|
|
|
|
|
|
); |
|
238
|
|
|
|
|
|
|
|
|
239
|
14
|
|
|
|
|
48
|
$aa_aln->add_seq($new_aa); |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
1
|
|
|
|
|
8
|
$aa_aln; |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head2 bootstrap_replicates |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Title : bootstrap_replicates |
|
248
|
|
|
|
|
|
|
Usage : my $alns = &bootstrap_replicates($aln,100); |
|
249
|
|
|
|
|
|
|
Function: Generate a pseudo-replicate of the data by randomly |
|
250
|
|
|
|
|
|
|
sampling, with replacement, the columns from an alignment for |
|
251
|
|
|
|
|
|
|
the non-parametric bootstrap. |
|
252
|
|
|
|
|
|
|
Returns : Arrayref of L objects |
|
253
|
|
|
|
|
|
|
Args : L object |
|
254
|
|
|
|
|
|
|
Number of replicates to generate |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub bootstrap_replicates { |
|
259
|
3
|
|
|
3
|
1
|
11
|
my ( $aln, $count ) = @_; |
|
260
|
3
|
|
50
|
|
|
10
|
$count ||= 1; |
|
261
|
3
|
|
|
|
|
11
|
my $alen = $aln->length; |
|
262
|
3
|
|
|
|
|
4
|
my ( @seqs, @nm ); |
|
263
|
3
|
|
|
|
|
15
|
$aln->set_displayname_flat(1); |
|
264
|
3
|
|
|
|
|
8
|
for my $s ( $aln->each_seq ) { |
|
265
|
31
|
|
|
|
|
41
|
push @seqs, $s->seq(); |
|
266
|
31
|
|
|
|
|
43
|
push @nm, $s->id; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
3
|
|
|
|
|
7
|
my ( @alns, $i ); |
|
269
|
3
|
|
|
|
|
12
|
while ( $count-- > 0 ) { |
|
270
|
23
|
|
|
|
|
23
|
my @newseqs; |
|
271
|
23
|
|
|
|
|
49
|
for ( $i = 0 ; $i < $alen ; $i++ ) { |
|
272
|
7988
|
|
|
|
|
5943
|
my $index = int( rand($alen) ); |
|
273
|
7988
|
|
|
|
|
4343
|
my $c = 0; |
|
274
|
7988
|
|
|
|
|
6426
|
for (@seqs) { |
|
275
|
110644
|
|
|
|
|
92266
|
$newseqs[ $c++ ] .= substr( $_, $index, 1 ); |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
} |
|
278
|
23
|
|
|
|
|
98
|
my $newaln = Bio::SimpleAlign->new(); |
|
279
|
23
|
|
|
|
|
27
|
my $i = 0; |
|
280
|
23
|
|
|
|
|
34
|
for my $s (@newseqs) { |
|
281
|
289
|
|
|
|
|
5539
|
( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g; |
|
282
|
289
|
|
|
|
|
1168
|
$newaln->add_seq( |
|
283
|
|
|
|
|
|
|
Bio::LocatableSeq->new( |
|
284
|
|
|
|
|
|
|
-start => 1, |
|
285
|
|
|
|
|
|
|
-end => length($tmp), |
|
286
|
|
|
|
|
|
|
-display_id => $nm[ $i++ ], |
|
287
|
|
|
|
|
|
|
-seq => $s |
|
288
|
|
|
|
|
|
|
) |
|
289
|
|
|
|
|
|
|
); |
|
290
|
|
|
|
|
|
|
} |
|
291
|
23
|
|
|
|
|
78
|
push @alns, $newaln; |
|
292
|
|
|
|
|
|
|
} |
|
293
|
3
|
|
|
|
|
20
|
return \@alns; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=head2 bootstrap_replicates_codons |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Title : bootstrap_replicates_codons |
|
299
|
|
|
|
|
|
|
Usage : my $alns = &bootstrap_replicates_codons($aln,100); |
|
300
|
|
|
|
|
|
|
Function: Generate a pseudo-replicate of the data by randomly |
|
301
|
|
|
|
|
|
|
sampling, with replacement, the columns from a codon alignment for |
|
302
|
|
|
|
|
|
|
the non-parametric bootstrap. The alignment is assumed to start on |
|
303
|
|
|
|
|
|
|
the first position of a codon. |
|
304
|
|
|
|
|
|
|
Returns : Arrayref of L objects |
|
305
|
|
|
|
|
|
|
Args : L object |
|
306
|
|
|
|
|
|
|
Number of replicates to generate |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=cut |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub bootstrap_replicates_codons { |
|
311
|
0
|
|
|
0
|
1
|
0
|
my ( $aln, $count ) = @_; |
|
312
|
0
|
|
0
|
|
|
0
|
$count ||= 1; |
|
313
|
0
|
|
|
|
|
0
|
my $alen = $aln->length; |
|
314
|
0
|
|
|
|
|
0
|
my $ncodon = int( $alen / 3 ); |
|
315
|
0
|
|
|
|
|
0
|
my ( @seqs, @nm ); |
|
316
|
0
|
|
|
|
|
0
|
$aln->set_displayname_flat(1); |
|
317
|
0
|
|
|
|
|
0
|
for my $s ( $aln->each_seq ) { |
|
318
|
0
|
|
|
|
|
0
|
push @seqs, $s->seq(); |
|
319
|
0
|
|
|
|
|
0
|
push @nm, $s->id; |
|
320
|
|
|
|
|
|
|
} |
|
321
|
0
|
|
|
|
|
0
|
my ( @alns, $i ); |
|
322
|
0
|
|
|
|
|
0
|
while ( $count-- > 0 ) { |
|
323
|
0
|
|
|
|
|
0
|
my @newseqs; |
|
324
|
0
|
|
|
|
|
0
|
for ( $i = 0 ; $i < $ncodon ; $i++ ) { |
|
325
|
0
|
|
|
|
|
0
|
my $index = int( rand($ncodon) ); |
|
326
|
0
|
|
|
|
|
0
|
my $seqpos = $index * 3; |
|
327
|
0
|
|
|
|
|
0
|
my $c = 0; |
|
328
|
0
|
|
|
|
|
0
|
for (@seqs) { |
|
329
|
0
|
|
|
|
|
0
|
$newseqs[ $c++ ] .= substr( $_, $seqpos, 3 ); |
|
330
|
|
|
|
|
|
|
} |
|
331
|
|
|
|
|
|
|
} |
|
332
|
0
|
|
|
|
|
0
|
my $newaln = Bio::SimpleAlign->new(); |
|
333
|
0
|
|
|
|
|
0
|
my $i = 0; |
|
334
|
0
|
|
|
|
|
0
|
for my $s (@newseqs) { |
|
335
|
0
|
|
|
|
|
0
|
( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g; |
|
336
|
0
|
|
|
|
|
0
|
$newaln->add_seq( |
|
337
|
|
|
|
|
|
|
Bio::LocatableSeq->new( |
|
338
|
|
|
|
|
|
|
-start => 1, |
|
339
|
|
|
|
|
|
|
-end => length($tmp), |
|
340
|
|
|
|
|
|
|
-display_id => $nm[ $i++ ], |
|
341
|
|
|
|
|
|
|
-seq => $s |
|
342
|
|
|
|
|
|
|
) |
|
343
|
|
|
|
|
|
|
); |
|
344
|
|
|
|
|
|
|
} |
|
345
|
0
|
|
|
|
|
0
|
push @alns, $newaln; |
|
346
|
|
|
|
|
|
|
} |
|
347
|
0
|
|
|
|
|
0
|
return \@alns; |
|
348
|
|
|
|
|
|
|
} |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head2 cat |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
Title : cat |
|
353
|
|
|
|
|
|
|
Usage : $aln123 = cat($aln1, $aln2, $aln3) |
|
354
|
|
|
|
|
|
|
Function : Concatenates alignment objects. Sequences are identified by id. |
|
355
|
|
|
|
|
|
|
An error will be thrown if the sequence ids are not unique in the |
|
356
|
|
|
|
|
|
|
first alignment. If any ids are not present or not unique in any |
|
357
|
|
|
|
|
|
|
of the additional alignments then those sequences are omitted from |
|
358
|
|
|
|
|
|
|
the concatenated alignment, and a warning is issued. An error will |
|
359
|
|
|
|
|
|
|
be thrown if any of the alignments are not flush, since |
|
360
|
|
|
|
|
|
|
concatenating such alignments is unlikely to make biological |
|
361
|
|
|
|
|
|
|
sense. |
|
362
|
|
|
|
|
|
|
Returns : A new Bio::SimpleAlign object |
|
363
|
|
|
|
|
|
|
Args : A list of Bio::SimpleAlign objects |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=cut |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub cat { |
|
368
|
1
|
|
|
1
|
1
|
6
|
my ( $self, @aln ) = @_; |
|
369
|
1
|
50
|
|
|
|
4
|
$self->throw("cat method called with no arguments") unless $self; |
|
370
|
1
|
|
|
|
|
4
|
for ( $self, @aln ) { |
|
371
|
2
|
50
|
|
|
|
8
|
$self->throw( $_->id . " is not a Bio::Align::AlignI object" ) |
|
372
|
|
|
|
|
|
|
unless $_->isa('Bio::Align::AlignI'); |
|
373
|
2
|
50
|
|
|
|
6
|
$self->throw( $_->id . " is not flush" ) unless $_->is_flush; |
|
374
|
|
|
|
|
|
|
} |
|
375
|
1
|
|
|
|
|
3
|
my $aln = $self->new; |
|
376
|
1
|
|
|
|
|
3
|
$aln->id( $self->id ); |
|
377
|
1
|
|
|
|
|
2
|
$aln->annotation( $self->annotation ); |
|
378
|
1
|
|
|
|
|
1
|
my %unique; |
|
379
|
1
|
|
|
|
|
3
|
SEQ: foreach my $seq ( $self->each_seq() ) { |
|
380
|
|
|
|
|
|
|
throw( "ID: ", $seq->id, " is not unique in initial alignment." ) |
|
381
|
14
|
50
|
|
|
|
25
|
if exists $unique{ $seq->id }; |
|
382
|
14
|
|
|
|
|
19
|
$unique{ $seq->id } = 1; |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array |
|
385
|
14
|
|
|
|
|
24
|
my $new_seq = $seq->new( |
|
386
|
|
|
|
|
|
|
-id => $seq->id, |
|
387
|
|
|
|
|
|
|
-strand => $seq->strand, |
|
388
|
|
|
|
|
|
|
-verbose => $self->verbose |
|
389
|
|
|
|
|
|
|
); |
|
390
|
14
|
|
|
|
|
31
|
$new_seq->seq( $seq->seq ); |
|
391
|
14
|
|
|
|
|
27
|
$new_seq->start( $seq->start ); |
|
392
|
14
|
|
|
|
|
22
|
$new_seq->end( $seq->end ); |
|
393
|
14
|
50
|
|
|
|
60
|
if ( $new_seq->isa('Bio::Seq::MetaI') ) { |
|
394
|
0
|
|
|
|
|
0
|
for my $meta_name ( $seq->meta_names ) { |
|
395
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $new_seq->start, |
|
396
|
|
|
|
|
|
|
$new_seq->end, $seq->named_meta($meta_name) ); |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
} |
|
399
|
14
|
|
|
|
|
18
|
for my $cat_aln (@aln) { |
|
400
|
14
|
|
|
|
|
17
|
my @cat_seq = $cat_aln->each_seq_with_id( $seq->id ); |
|
401
|
14
|
50
|
|
|
|
24
|
if ( @cat_seq == 0 ) { |
|
402
|
0
|
|
|
|
|
0
|
$self->warn( $seq->id |
|
403
|
|
|
|
|
|
|
. " not found in alignment " |
|
404
|
|
|
|
|
|
|
. $cat_aln->id |
|
405
|
|
|
|
|
|
|
. ", skipping this sequence." ); |
|
406
|
0
|
|
|
|
|
0
|
next SEQ; |
|
407
|
|
|
|
|
|
|
} |
|
408
|
14
|
50
|
|
|
|
19
|
if ( @cat_seq > 1 ) { |
|
409
|
0
|
|
|
|
|
0
|
$self->warn( $seq->id |
|
410
|
|
|
|
|
|
|
. " found multiple times in alignment " |
|
411
|
|
|
|
|
|
|
. $cat_aln->id |
|
412
|
|
|
|
|
|
|
. ", skipping this sequence." ); |
|
413
|
0
|
|
|
|
|
0
|
next SEQ; |
|
414
|
|
|
|
|
|
|
} |
|
415
|
14
|
|
|
|
|
12
|
my $cat_seq = $cat_seq[0]; |
|
416
|
14
|
|
|
|
|
15
|
my $old_end = $new_seq->end; |
|
417
|
14
|
|
|
|
|
18
|
$new_seq->seq( $new_seq->seq . $cat_seq->seq ); |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# Not sure if this is a sensible way to deal with end coordinates |
|
420
|
14
|
|
|
|
|
22
|
$new_seq->end( |
|
421
|
|
|
|
|
|
|
$new_seq->end + $cat_seq->end + 1 - $cat_seq->start ); |
|
422
|
|
|
|
|
|
|
|
|
423
|
14
|
50
|
|
|
|
90
|
if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) { |
|
|
|
50
|
|
|
|
|
|
|
424
|
0
|
0
|
|
|
|
0
|
unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) { |
|
425
|
0
|
|
|
|
|
0
|
my $meta_seq = Bio::Seq::Meta::Array->new; |
|
426
|
0
|
|
|
|
|
0
|
$meta_seq->seq( $new_seq->seq ); |
|
427
|
0
|
|
|
|
|
0
|
$meta_seq->start( $new_seq->start ); |
|
428
|
0
|
|
|
|
|
0
|
$meta_seq->end( $new_seq->end ); |
|
429
|
0
|
0
|
|
|
|
0
|
if ( $new_seq->isa('Bio::Seq::Meta') ) { |
|
430
|
0
|
|
|
|
|
0
|
for my $meta_name ( $new_seq->meta_names ) { |
|
431
|
0
|
|
|
|
|
0
|
$meta_seq->named_submeta( |
|
432
|
|
|
|
|
|
|
$meta_name, |
|
433
|
|
|
|
|
|
|
$new_seq->start, |
|
434
|
|
|
|
|
|
|
$old_end, |
|
435
|
|
|
|
|
|
|
[ |
|
436
|
|
|
|
|
|
|
split( |
|
437
|
|
|
|
|
|
|
//, $new_seq->named_meta($meta_name) |
|
438
|
|
|
|
|
|
|
) |
|
439
|
|
|
|
|
|
|
] |
|
440
|
|
|
|
|
|
|
); |
|
441
|
|
|
|
|
|
|
} |
|
442
|
|
|
|
|
|
|
} |
|
443
|
0
|
|
|
|
|
0
|
$new_seq = $meta_seq; |
|
444
|
|
|
|
|
|
|
} |
|
445
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_seq->meta_names ) { |
|
446
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $old_end + 1, |
|
447
|
|
|
|
|
|
|
$new_seq->end, $cat_seq->named_meta($meta_name) ); |
|
448
|
|
|
|
|
|
|
} |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
elsif ( $cat_seq->isa('Bio::Seq::Meta') ) { |
|
451
|
0
|
0
|
|
|
|
0
|
if ( $new_seq->isa('Bio::Seq::Meta::Array') ) { |
|
452
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_seq->meta_names ) { |
|
453
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $old_end + 1, |
|
454
|
|
|
|
|
|
|
$new_seq->end, |
|
455
|
|
|
|
|
|
|
[ split( //, $cat_seq->named_meta($meta_name) ) ] ); |
|
456
|
|
|
|
|
|
|
} |
|
457
|
|
|
|
|
|
|
} |
|
458
|
|
|
|
|
|
|
else { |
|
459
|
0
|
0
|
|
|
|
0
|
unless ( $new_seq->isa('Bio::Seq::Meta') ) { |
|
460
|
0
|
|
|
|
|
0
|
my $meta_seq = Bio::Seq::Meta::Array->new; |
|
461
|
0
|
|
|
|
|
0
|
$meta_seq->seq( $new_seq->seq ); |
|
462
|
0
|
|
|
|
|
0
|
$meta_seq->start( $new_seq->start ); |
|
463
|
0
|
|
|
|
|
0
|
$meta_seq->end( $new_seq->end ); |
|
464
|
0
|
|
|
|
|
0
|
$new_seq = $meta_seq; |
|
465
|
|
|
|
|
|
|
} |
|
466
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_seq->meta_names ) { |
|
467
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $old_end + 1, |
|
468
|
|
|
|
|
|
|
$new_seq->end, $cat_seq->named_meta($meta_name) ); |
|
469
|
|
|
|
|
|
|
} |
|
470
|
|
|
|
|
|
|
} |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
} |
|
473
|
14
|
|
|
|
|
29
|
$aln->add_seq($new_seq); |
|
474
|
|
|
|
|
|
|
} |
|
475
|
1
|
|
|
|
|
6
|
my $cons_meta = $self->consensus_meta; |
|
476
|
1
|
|
|
|
|
1
|
my $new_cons_meta; |
|
477
|
1
|
50
|
|
|
|
3
|
if ($cons_meta) { |
|
478
|
0
|
|
|
|
|
0
|
$new_cons_meta = Bio::Seq::Meta->new(); |
|
479
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cons_meta->meta_names ) { |
|
480
|
0
|
|
|
|
|
0
|
$new_cons_meta->named_submeta( $meta_name, 1, $self->length, |
|
481
|
|
|
|
|
|
|
$cons_meta->$meta_name ); |
|
482
|
|
|
|
|
|
|
} |
|
483
|
|
|
|
|
|
|
} |
|
484
|
1
|
|
|
|
|
4
|
my $end = $self->length; |
|
485
|
1
|
|
|
|
|
2
|
for my $cat_aln (@aln) { |
|
486
|
1
|
|
|
|
|
3
|
my $cat_cons_meta = $cat_aln->consensus_meta; |
|
487
|
1
|
50
|
|
|
|
3
|
if ($cat_cons_meta) { |
|
488
|
0
|
0
|
|
|
|
0
|
$new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta; |
|
489
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_cons_meta->meta_names ) { |
|
490
|
0
|
|
|
|
|
0
|
$new_cons_meta->named_submeta( |
|
491
|
|
|
|
|
|
|
$meta_name, $end + 1, |
|
492
|
|
|
|
|
|
|
$end + $cat_aln->length, |
|
493
|
|
|
|
|
|
|
$cat_cons_meta->$meta_name |
|
494
|
|
|
|
|
|
|
); |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
} |
|
497
|
1
|
|
|
|
|
3
|
$end += $cat_aln->length; |
|
498
|
|
|
|
|
|
|
} |
|
499
|
1
|
50
|
|
|
|
2
|
$aln->consensus_meta($new_cons_meta) if $new_cons_meta; |
|
500
|
1
|
|
|
|
|
4
|
return $aln; |
|
501
|
|
|
|
|
|
|
} |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head2 most_common_sequences |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
Title : most_common_sequences |
|
507
|
|
|
|
|
|
|
Usage : @common = most_common_sequences ($align, $case_sensitivity) |
|
508
|
|
|
|
|
|
|
Function : Returns an array of the sequences that appear most often in the |
|
509
|
|
|
|
|
|
|
alignment (although this probably makes more sense when there is |
|
510
|
|
|
|
|
|
|
only a single most common sequence). Sequences are compared after |
|
511
|
|
|
|
|
|
|
removing any "-" (gap characters), and ambiguous units (e.g., R |
|
512
|
|
|
|
|
|
|
for purines) are only compared to themselves. The returned |
|
513
|
|
|
|
|
|
|
sequence is also missing the "-" since they don't actually make |
|
514
|
|
|
|
|
|
|
part of the sequence. |
|
515
|
|
|
|
|
|
|
Returns : Array of text strings. |
|
516
|
|
|
|
|
|
|
Arguments : Optional argument defining whether the comparison between sequences |
|
517
|
|
|
|
|
|
|
to find the most common should be case sensitive. Defaults to |
|
518
|
|
|
|
|
|
|
false, i.e, not case sensitive. |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=cut |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
sub most_common_sequences { |
|
523
|
0
|
0
|
|
0
|
1
|
|
my $align = shift |
|
524
|
|
|
|
|
|
|
or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences"); |
|
525
|
0
|
|
|
|
|
|
my $case_sensitive = shift; # defaults to false (we get undef if nothing) |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
## We keep track of the max on this loop. Saves us having to |
|
528
|
|
|
|
|
|
|
## transverse the hash table later to find the maximum value. |
|
529
|
0
|
|
|
|
|
|
my $max = 0; |
|
530
|
0
|
|
|
|
|
|
my %counts; |
|
531
|
0
|
|
|
|
|
|
foreach ($align->each_seq) { |
|
532
|
0
|
|
|
|
|
|
(my $seq = $_->seq) =~ tr/-//d; |
|
533
|
0
|
0
|
|
|
|
|
$seq = uc ($seq) unless $case_sensitive; |
|
534
|
0
|
0
|
|
|
|
|
$max++ if (++$counts{$seq} > $max); |
|
535
|
|
|
|
|
|
|
} |
|
536
|
0
|
|
|
|
|
|
my @common = grep ($counts{$_} == $max, keys %counts); |
|
537
|
0
|
|
|
|
|
|
return @common; |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
1; |