line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Align::Utilities; |
2
|
3
|
|
|
3
|
|
1571
|
use strict; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
70
|
|
3
|
3
|
|
|
3
|
|
9
|
use warnings; |
|
3
|
|
|
|
|
2
|
|
|
3
|
|
|
|
|
62
|
|
4
|
3
|
|
|
3
|
|
10
|
use Carp; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
140
|
|
5
|
3
|
|
|
3
|
|
428
|
use Bio::Root::Version; |
|
3
|
|
|
|
|
21
|
|
|
3
|
|
|
|
|
16
|
|
6
|
|
|
|
|
|
|
|
7
|
3
|
|
|
3
|
|
76
|
use Exporter 'import'; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
190
|
|
8
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
9
|
|
|
|
|
|
|
aa_to_dna_aln |
10
|
|
|
|
|
|
|
bootstrap_replicates |
11
|
|
|
|
|
|
|
cat |
12
|
|
|
|
|
|
|
bootstrap_replicates_codons |
13
|
|
|
|
|
|
|
dna_to_aa_aln |
14
|
|
|
|
|
|
|
most_common_sequences |
15
|
|
|
|
|
|
|
); |
16
|
|
|
|
|
|
|
our %EXPORT_TAGS = (all => \@EXPORT_OK); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# |
19
|
|
|
|
|
|
|
# BioPerl module for Bio::Align::Utilities |
20
|
|
|
|
|
|
|
# |
21
|
|
|
|
|
|
|
# Please direct questions and support issues to |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# Cared for by Jason Stajich and Brian Osborne |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# Copyright Jason Stajich |
26
|
|
|
|
|
|
|
# |
27
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 NAME |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Bio::Align::Utilities - A collection of utilities regarding converting |
34
|
|
|
|
|
|
|
and manipulating alignment objects |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 SYNOPSIS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
use Bio::Align::Utilities qw(:all); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Even if the protein alignments are local make sure the start/end |
41
|
|
|
|
|
|
|
# stored in the LocatableSeq objects are to the full length protein. |
42
|
|
|
|
|
|
|
# The coding sequence that is passed in should still be the full |
43
|
|
|
|
|
|
|
# length CDS as the nt alignment will be generated. |
44
|
|
|
|
|
|
|
# %dnaseqs is a hash of CDS sequences (spliced) |
45
|
|
|
|
|
|
|
my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# The reverse, which is simpler. The input alignment has to be |
48
|
|
|
|
|
|
|
# translate-able, with gap lengths and an overall length divisible by 3 |
49
|
|
|
|
|
|
|
my $aa_aln = dna_to_aa_aln($dna_al); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Generate bootstraps |
52
|
|
|
|
|
|
|
my $replicates = bootstrap_replicates($aln,$count); |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 DESCRIPTION |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
This module contains utility methods for manipulating sequence |
57
|
|
|
|
|
|
|
alignments (L) objects. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The B utility is essentially the same as the B |
60
|
|
|
|
|
|
|
program by Bill Pearson available at |
61
|
|
|
|
|
|
|
ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this |
62
|
|
|
|
|
|
|
is a pure-Perl implementation, but just to mention that if anything |
63
|
|
|
|
|
|
|
seems odd you can check the alignments generated against Bill's |
64
|
|
|
|
|
|
|
program. |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 FEEDBACK |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head2 Mailing Lists |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
71
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
72
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
75
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 Support |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
I |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
84
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
85
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
86
|
|
|
|
|
|
|
with code and data examples if at all possible. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 Reporting Bugs |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
91
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via the |
92
|
|
|
|
|
|
|
web: |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 AUTHOR - Jason Stajich |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Email jason-at-bioperl-dot-org |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 APPENDIX |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
103
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
106
|
|
|
|
|
|
|
|
107
|
3
|
|
|
3
|
|
10
|
use constant CODONSIZE => 3; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
4714
|
|
108
|
|
|
|
|
|
|
our $GAP = '-'; |
109
|
|
|
|
|
|
|
our $CODONGAP = $GAP x CODONSIZE; |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 aa_to_dna_aln |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
Title : aa_to_dna_aln |
114
|
|
|
|
|
|
|
Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs); |
115
|
|
|
|
|
|
|
Function: Will convert an AA alignment to DNA space given the |
116
|
|
|
|
|
|
|
corresponding DNA sequences. Note that this method expects |
117
|
|
|
|
|
|
|
the DNA sequences to be in frame +1 (GFF frame 0) as it will |
118
|
|
|
|
|
|
|
start to project into coordinates starting at the first base of |
119
|
|
|
|
|
|
|
the DNA sequence, if this alignment represents a different |
120
|
|
|
|
|
|
|
frame for the cDNA you will need to edit the DNA sequences |
121
|
|
|
|
|
|
|
to remove the 1st or 2nd bases (and revcom if things should be). |
122
|
|
|
|
|
|
|
Returns : Bio::Align::AlignI object |
123
|
|
|
|
|
|
|
Args : 2 arguments, the alignment and a hashref. |
124
|
|
|
|
|
|
|
Alignment is a Bio::Align::AlignI of amino acid sequences. |
125
|
|
|
|
|
|
|
The hash reference should have keys which are |
126
|
|
|
|
|
|
|
the display_ids for the aa |
127
|
|
|
|
|
|
|
sequences in the alignment and the values are a |
128
|
|
|
|
|
|
|
Bio::PrimarySeqI object for the corresponding |
129
|
|
|
|
|
|
|
spliced cDNA sequence. |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
See also: L, L, L |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=cut |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub aa_to_dna_aln { |
136
|
2
|
|
|
2
|
1
|
11
|
my ( $aln, $dnaseqs ) = @_; |
137
|
2
|
50
|
33
|
|
|
23
|
unless ( defined $aln |
|
|
|
33
|
|
|
|
|
138
|
|
|
|
|
|
|
&& ref($aln) |
139
|
|
|
|
|
|
|
&& $aln->isa('Bio::Align::AlignI') ) |
140
|
|
|
|
|
|
|
{ |
141
|
0
|
|
|
|
|
0
|
croak( |
142
|
|
|
|
|
|
|
'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature' |
143
|
|
|
|
|
|
|
); |
144
|
|
|
|
|
|
|
} |
145
|
2
|
|
|
|
|
8
|
my $alnlen = $aln->length; |
146
|
2
|
|
|
|
|
10
|
my $dnaalign = Bio::SimpleAlign->new(); |
147
|
2
|
|
|
|
|
8
|
$aln->map_chars( '\.', $GAP ); |
148
|
|
|
|
|
|
|
|
149
|
2
|
|
|
|
|
5
|
foreach my $seq ( $aln->each_seq ) { |
150
|
17
|
|
|
|
|
27
|
my $aa_seqstr = $seq->seq(); |
151
|
17
|
|
|
|
|
25
|
my $pepid = $seq->display_id; |
152
|
17
|
|
33
|
|
|
37
|
my $dnaseq = $dnaseqs->{$pepid} || $aln->throw( "cannot find " . $seq->display_id ); |
153
|
17
|
|
|
|
|
23
|
my $start_offset = ( $seq->start - 1 ) * CODONSIZE; |
154
|
17
|
|
|
|
|
50
|
$dnaseq = $dnaseq->seq(); |
155
|
17
|
|
|
|
|
30
|
my $dnalen = $dnaseqs->{$pepid}->length; |
156
|
17
|
|
33
|
|
|
27
|
my $dnaid = $dnaseqs->{$pepid}->display_id || $pepid; # try to use DNAseq obj ID (issue #137) |
157
|
17
|
|
|
|
|
14
|
my $nt_seqstr; |
158
|
17
|
|
|
|
|
15
|
my $j = 0; |
159
|
17
|
|
|
|
|
29
|
for ( my $i = 0 ; $i < $alnlen ; $i++ ) { |
160
|
5552
|
|
|
|
|
3956
|
my $char = substr( $aa_seqstr, $i + $start_offset, 1 ); |
161
|
5552
|
100
|
66
|
|
|
10811
|
if ( $char eq $GAP || $j >= $dnalen ) { |
162
|
765
|
|
|
|
|
902
|
$nt_seqstr .= $CODONGAP; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
else { |
165
|
4787
|
|
|
|
|
3149
|
$nt_seqstr .= substr( $dnaseq, $j, CODONSIZE ); |
166
|
4787
|
|
|
|
|
5937
|
$j += CODONSIZE; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
} |
169
|
17
|
|
|
|
|
33
|
$nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) ); |
170
|
|
|
|
|
|
|
|
171
|
17
|
|
|
|
|
80
|
my $newdna = Bio::LocatableSeq->new( |
172
|
|
|
|
|
|
|
-display_id => $dnaid, |
173
|
|
|
|
|
|
|
-alphabet => 'dna', |
174
|
|
|
|
|
|
|
-start => $start_offset + 1, |
175
|
|
|
|
|
|
|
-end => ( $seq->end * CODONSIZE ), |
176
|
|
|
|
|
|
|
-strand => 1, |
177
|
|
|
|
|
|
|
-seq => $nt_seqstr |
178
|
|
|
|
|
|
|
); |
179
|
17
|
|
|
|
|
48
|
$dnaalign->add_seq($newdna); |
180
|
|
|
|
|
|
|
} |
181
|
2
|
|
|
|
|
10
|
return $dnaalign; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head2 dna_to_aa_aln |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Title : dna_to_aa_aln |
187
|
|
|
|
|
|
|
Usage : my $aa_aln = dna_to_aa_aln($dna_aln); |
188
|
|
|
|
|
|
|
Function: Convert a DNA alignment to an amino acid alignment where |
189
|
|
|
|
|
|
|
the length of all alignment strings and the lengths of any |
190
|
|
|
|
|
|
|
gaps must be divisible by 3 |
191
|
|
|
|
|
|
|
Returns : Bio::Align::AlignI object |
192
|
|
|
|
|
|
|
Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
See also: L, L, L |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=cut |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub dna_to_aa_aln { |
199
|
1
|
|
|
1
|
1
|
7
|
my $dna_aln = shift; |
200
|
1
|
50
|
33
|
|
|
12
|
unless ( defined $dna_aln |
|
|
|
33
|
|
|
|
|
201
|
|
|
|
|
|
|
&& ref($dna_aln) |
202
|
|
|
|
|
|
|
&& $dna_aln->isa('Bio::Align::AlignI') ) { |
203
|
0
|
|
|
|
|
0
|
croak( |
204
|
|
|
|
|
|
|
'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln' |
205
|
|
|
|
|
|
|
); |
206
|
|
|
|
|
|
|
} |
207
|
1
|
|
|
|
|
3
|
my $codon_table = Bio::Tools::CodonTable->new; |
208
|
1
|
|
|
|
|
4
|
my $aa_aln = Bio::SimpleAlign->new; |
209
|
|
|
|
|
|
|
|
210
|
1
|
|
|
|
|
3
|
for my $seq ( $dna_aln->each_seq ) { |
211
|
14
|
|
|
|
|
10
|
my ($aa_str, $aa_len); |
212
|
14
|
|
|
|
|
27
|
my @aln_str = split '', $seq->seq; |
213
|
14
|
50
|
|
|
|
26
|
croak("All lines in the alignment must have lengths divisible by 3") |
214
|
|
|
|
|
|
|
if ( scalar(@aln_str) % CODONSIZE ); |
215
|
|
|
|
|
|
|
|
216
|
14
|
|
|
|
|
22
|
while ( @aln_str ) { |
217
|
5516
|
|
|
|
|
5882
|
my $triplet = join '', (splice( @aln_str, 0, CODONSIZE )); |
218
|
|
|
|
|
|
|
|
219
|
5516
|
100
|
|
|
|
10466
|
if ( $triplet =~ /^[GATC]+$/i ) { |
|
|
50
|
|
|
|
|
|
220
|
4754
|
|
|
|
|
5831
|
$aa_str .= $codon_table->translate($triplet); |
221
|
4754
|
|
|
|
|
6760
|
$aa_len++; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) { |
224
|
762
|
|
|
|
|
1133
|
$aa_str .= $GAP; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
else { |
227
|
0
|
|
|
|
|
0
|
croak("The triplet '$triplet' is neither a valid codon nor all gaps"); |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
} |
230
|
14
|
|
|
|
|
63
|
my $new_aa = Bio::LocatableSeq->new( |
231
|
|
|
|
|
|
|
-display_id => $seq->display_id, |
232
|
|
|
|
|
|
|
-alphabet => 'protein', |
233
|
|
|
|
|
|
|
-start => 1, |
234
|
|
|
|
|
|
|
-end => $aa_len, |
235
|
|
|
|
|
|
|
-strand => 1, |
236
|
|
|
|
|
|
|
-seq => $aa_str |
237
|
|
|
|
|
|
|
); |
238
|
|
|
|
|
|
|
|
239
|
14
|
|
|
|
|
48
|
$aa_aln->add_seq($new_aa); |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
1
|
|
|
|
|
8
|
$aa_aln; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head2 bootstrap_replicates |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Title : bootstrap_replicates |
248
|
|
|
|
|
|
|
Usage : my $alns = &bootstrap_replicates($aln,100); |
249
|
|
|
|
|
|
|
Function: Generate a pseudo-replicate of the data by randomly |
250
|
|
|
|
|
|
|
sampling, with replacement, the columns from an alignment for |
251
|
|
|
|
|
|
|
the non-parametric bootstrap. |
252
|
|
|
|
|
|
|
Returns : Arrayref of L objects |
253
|
|
|
|
|
|
|
Args : L object |
254
|
|
|
|
|
|
|
Number of replicates to generate |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub bootstrap_replicates { |
259
|
3
|
|
|
3
|
1
|
11
|
my ( $aln, $count ) = @_; |
260
|
3
|
|
50
|
|
|
10
|
$count ||= 1; |
261
|
3
|
|
|
|
|
11
|
my $alen = $aln->length; |
262
|
3
|
|
|
|
|
4
|
my ( @seqs, @nm ); |
263
|
3
|
|
|
|
|
15
|
$aln->set_displayname_flat(1); |
264
|
3
|
|
|
|
|
8
|
for my $s ( $aln->each_seq ) { |
265
|
31
|
|
|
|
|
41
|
push @seqs, $s->seq(); |
266
|
31
|
|
|
|
|
43
|
push @nm, $s->id; |
267
|
|
|
|
|
|
|
} |
268
|
3
|
|
|
|
|
7
|
my ( @alns, $i ); |
269
|
3
|
|
|
|
|
12
|
while ( $count-- > 0 ) { |
270
|
23
|
|
|
|
|
23
|
my @newseqs; |
271
|
23
|
|
|
|
|
49
|
for ( $i = 0 ; $i < $alen ; $i++ ) { |
272
|
7988
|
|
|
|
|
5943
|
my $index = int( rand($alen) ); |
273
|
7988
|
|
|
|
|
4343
|
my $c = 0; |
274
|
7988
|
|
|
|
|
6426
|
for (@seqs) { |
275
|
110644
|
|
|
|
|
92266
|
$newseqs[ $c++ ] .= substr( $_, $index, 1 ); |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
} |
278
|
23
|
|
|
|
|
98
|
my $newaln = Bio::SimpleAlign->new(); |
279
|
23
|
|
|
|
|
27
|
my $i = 0; |
280
|
23
|
|
|
|
|
34
|
for my $s (@newseqs) { |
281
|
289
|
|
|
|
|
5539
|
( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g; |
282
|
289
|
|
|
|
|
1168
|
$newaln->add_seq( |
283
|
|
|
|
|
|
|
Bio::LocatableSeq->new( |
284
|
|
|
|
|
|
|
-start => 1, |
285
|
|
|
|
|
|
|
-end => length($tmp), |
286
|
|
|
|
|
|
|
-display_id => $nm[ $i++ ], |
287
|
|
|
|
|
|
|
-seq => $s |
288
|
|
|
|
|
|
|
) |
289
|
|
|
|
|
|
|
); |
290
|
|
|
|
|
|
|
} |
291
|
23
|
|
|
|
|
78
|
push @alns, $newaln; |
292
|
|
|
|
|
|
|
} |
293
|
3
|
|
|
|
|
20
|
return \@alns; |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=head2 bootstrap_replicates_codons |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Title : bootstrap_replicates_codons |
299
|
|
|
|
|
|
|
Usage : my $alns = &bootstrap_replicates_codons($aln,100); |
300
|
|
|
|
|
|
|
Function: Generate a pseudo-replicate of the data by randomly |
301
|
|
|
|
|
|
|
sampling, with replacement, the columns from a codon alignment for |
302
|
|
|
|
|
|
|
the non-parametric bootstrap. The alignment is assumed to start on |
303
|
|
|
|
|
|
|
the first position of a codon. |
304
|
|
|
|
|
|
|
Returns : Arrayref of L objects |
305
|
|
|
|
|
|
|
Args : L object |
306
|
|
|
|
|
|
|
Number of replicates to generate |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=cut |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub bootstrap_replicates_codons { |
311
|
0
|
|
|
0
|
1
|
0
|
my ( $aln, $count ) = @_; |
312
|
0
|
|
0
|
|
|
0
|
$count ||= 1; |
313
|
0
|
|
|
|
|
0
|
my $alen = $aln->length; |
314
|
0
|
|
|
|
|
0
|
my $ncodon = int( $alen / 3 ); |
315
|
0
|
|
|
|
|
0
|
my ( @seqs, @nm ); |
316
|
0
|
|
|
|
|
0
|
$aln->set_displayname_flat(1); |
317
|
0
|
|
|
|
|
0
|
for my $s ( $aln->each_seq ) { |
318
|
0
|
|
|
|
|
0
|
push @seqs, $s->seq(); |
319
|
0
|
|
|
|
|
0
|
push @nm, $s->id; |
320
|
|
|
|
|
|
|
} |
321
|
0
|
|
|
|
|
0
|
my ( @alns, $i ); |
322
|
0
|
|
|
|
|
0
|
while ( $count-- > 0 ) { |
323
|
0
|
|
|
|
|
0
|
my @newseqs; |
324
|
0
|
|
|
|
|
0
|
for ( $i = 0 ; $i < $ncodon ; $i++ ) { |
325
|
0
|
|
|
|
|
0
|
my $index = int( rand($ncodon) ); |
326
|
0
|
|
|
|
|
0
|
my $seqpos = $index * 3; |
327
|
0
|
|
|
|
|
0
|
my $c = 0; |
328
|
0
|
|
|
|
|
0
|
for (@seqs) { |
329
|
0
|
|
|
|
|
0
|
$newseqs[ $c++ ] .= substr( $_, $seqpos, 3 ); |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
} |
332
|
0
|
|
|
|
|
0
|
my $newaln = Bio::SimpleAlign->new(); |
333
|
0
|
|
|
|
|
0
|
my $i = 0; |
334
|
0
|
|
|
|
|
0
|
for my $s (@newseqs) { |
335
|
0
|
|
|
|
|
0
|
( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g; |
336
|
0
|
|
|
|
|
0
|
$newaln->add_seq( |
337
|
|
|
|
|
|
|
Bio::LocatableSeq->new( |
338
|
|
|
|
|
|
|
-start => 1, |
339
|
|
|
|
|
|
|
-end => length($tmp), |
340
|
|
|
|
|
|
|
-display_id => $nm[ $i++ ], |
341
|
|
|
|
|
|
|
-seq => $s |
342
|
|
|
|
|
|
|
) |
343
|
|
|
|
|
|
|
); |
344
|
|
|
|
|
|
|
} |
345
|
0
|
|
|
|
|
0
|
push @alns, $newaln; |
346
|
|
|
|
|
|
|
} |
347
|
0
|
|
|
|
|
0
|
return \@alns; |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=head2 cat |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
Title : cat |
353
|
|
|
|
|
|
|
Usage : $aln123 = cat($aln1, $aln2, $aln3) |
354
|
|
|
|
|
|
|
Function : Concatenates alignment objects. Sequences are identified by id. |
355
|
|
|
|
|
|
|
An error will be thrown if the sequence ids are not unique in the |
356
|
|
|
|
|
|
|
first alignment. If any ids are not present or not unique in any |
357
|
|
|
|
|
|
|
of the additional alignments then those sequences are omitted from |
358
|
|
|
|
|
|
|
the concatenated alignment, and a warning is issued. An error will |
359
|
|
|
|
|
|
|
be thrown if any of the alignments are not flush, since |
360
|
|
|
|
|
|
|
concatenating such alignments is unlikely to make biological |
361
|
|
|
|
|
|
|
sense. |
362
|
|
|
|
|
|
|
Returns : A new Bio::SimpleAlign object |
363
|
|
|
|
|
|
|
Args : A list of Bio::SimpleAlign objects |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=cut |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub cat { |
368
|
1
|
|
|
1
|
1
|
6
|
my ( $self, @aln ) = @_; |
369
|
1
|
50
|
|
|
|
4
|
$self->throw("cat method called with no arguments") unless $self; |
370
|
1
|
|
|
|
|
4
|
for ( $self, @aln ) { |
371
|
2
|
50
|
|
|
|
8
|
$self->throw( $_->id . " is not a Bio::Align::AlignI object" ) |
372
|
|
|
|
|
|
|
unless $_->isa('Bio::Align::AlignI'); |
373
|
2
|
50
|
|
|
|
6
|
$self->throw( $_->id . " is not flush" ) unless $_->is_flush; |
374
|
|
|
|
|
|
|
} |
375
|
1
|
|
|
|
|
3
|
my $aln = $self->new; |
376
|
1
|
|
|
|
|
3
|
$aln->id( $self->id ); |
377
|
1
|
|
|
|
|
2
|
$aln->annotation( $self->annotation ); |
378
|
1
|
|
|
|
|
1
|
my %unique; |
379
|
1
|
|
|
|
|
3
|
SEQ: foreach my $seq ( $self->each_seq() ) { |
380
|
|
|
|
|
|
|
throw( "ID: ", $seq->id, " is not unique in initial alignment." ) |
381
|
14
|
50
|
|
|
|
25
|
if exists $unique{ $seq->id }; |
382
|
14
|
|
|
|
|
19
|
$unique{ $seq->id } = 1; |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array |
385
|
14
|
|
|
|
|
24
|
my $new_seq = $seq->new( |
386
|
|
|
|
|
|
|
-id => $seq->id, |
387
|
|
|
|
|
|
|
-strand => $seq->strand, |
388
|
|
|
|
|
|
|
-verbose => $self->verbose |
389
|
|
|
|
|
|
|
); |
390
|
14
|
|
|
|
|
31
|
$new_seq->seq( $seq->seq ); |
391
|
14
|
|
|
|
|
27
|
$new_seq->start( $seq->start ); |
392
|
14
|
|
|
|
|
22
|
$new_seq->end( $seq->end ); |
393
|
14
|
50
|
|
|
|
60
|
if ( $new_seq->isa('Bio::Seq::MetaI') ) { |
394
|
0
|
|
|
|
|
0
|
for my $meta_name ( $seq->meta_names ) { |
395
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $new_seq->start, |
396
|
|
|
|
|
|
|
$new_seq->end, $seq->named_meta($meta_name) ); |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
} |
399
|
14
|
|
|
|
|
18
|
for my $cat_aln (@aln) { |
400
|
14
|
|
|
|
|
17
|
my @cat_seq = $cat_aln->each_seq_with_id( $seq->id ); |
401
|
14
|
50
|
|
|
|
24
|
if ( @cat_seq == 0 ) { |
402
|
0
|
|
|
|
|
0
|
$self->warn( $seq->id |
403
|
|
|
|
|
|
|
. " not found in alignment " |
404
|
|
|
|
|
|
|
. $cat_aln->id |
405
|
|
|
|
|
|
|
. ", skipping this sequence." ); |
406
|
0
|
|
|
|
|
0
|
next SEQ; |
407
|
|
|
|
|
|
|
} |
408
|
14
|
50
|
|
|
|
19
|
if ( @cat_seq > 1 ) { |
409
|
0
|
|
|
|
|
0
|
$self->warn( $seq->id |
410
|
|
|
|
|
|
|
. " found multiple times in alignment " |
411
|
|
|
|
|
|
|
. $cat_aln->id |
412
|
|
|
|
|
|
|
. ", skipping this sequence." ); |
413
|
0
|
|
|
|
|
0
|
next SEQ; |
414
|
|
|
|
|
|
|
} |
415
|
14
|
|
|
|
|
12
|
my $cat_seq = $cat_seq[0]; |
416
|
14
|
|
|
|
|
15
|
my $old_end = $new_seq->end; |
417
|
14
|
|
|
|
|
18
|
$new_seq->seq( $new_seq->seq . $cat_seq->seq ); |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# Not sure if this is a sensible way to deal with end coordinates |
420
|
14
|
|
|
|
|
22
|
$new_seq->end( |
421
|
|
|
|
|
|
|
$new_seq->end + $cat_seq->end + 1 - $cat_seq->start ); |
422
|
|
|
|
|
|
|
|
423
|
14
|
50
|
|
|
|
90
|
if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) { |
|
|
50
|
|
|
|
|
|
424
|
0
|
0
|
|
|
|
0
|
unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) { |
425
|
0
|
|
|
|
|
0
|
my $meta_seq = Bio::Seq::Meta::Array->new; |
426
|
0
|
|
|
|
|
0
|
$meta_seq->seq( $new_seq->seq ); |
427
|
0
|
|
|
|
|
0
|
$meta_seq->start( $new_seq->start ); |
428
|
0
|
|
|
|
|
0
|
$meta_seq->end( $new_seq->end ); |
429
|
0
|
0
|
|
|
|
0
|
if ( $new_seq->isa('Bio::Seq::Meta') ) { |
430
|
0
|
|
|
|
|
0
|
for my $meta_name ( $new_seq->meta_names ) { |
431
|
0
|
|
|
|
|
0
|
$meta_seq->named_submeta( |
432
|
|
|
|
|
|
|
$meta_name, |
433
|
|
|
|
|
|
|
$new_seq->start, |
434
|
|
|
|
|
|
|
$old_end, |
435
|
|
|
|
|
|
|
[ |
436
|
|
|
|
|
|
|
split( |
437
|
|
|
|
|
|
|
//, $new_seq->named_meta($meta_name) |
438
|
|
|
|
|
|
|
) |
439
|
|
|
|
|
|
|
] |
440
|
|
|
|
|
|
|
); |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
} |
443
|
0
|
|
|
|
|
0
|
$new_seq = $meta_seq; |
444
|
|
|
|
|
|
|
} |
445
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_seq->meta_names ) { |
446
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $old_end + 1, |
447
|
|
|
|
|
|
|
$new_seq->end, $cat_seq->named_meta($meta_name) ); |
448
|
|
|
|
|
|
|
} |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
elsif ( $cat_seq->isa('Bio::Seq::Meta') ) { |
451
|
0
|
0
|
|
|
|
0
|
if ( $new_seq->isa('Bio::Seq::Meta::Array') ) { |
452
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_seq->meta_names ) { |
453
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $old_end + 1, |
454
|
|
|
|
|
|
|
$new_seq->end, |
455
|
|
|
|
|
|
|
[ split( //, $cat_seq->named_meta($meta_name) ) ] ); |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
else { |
459
|
0
|
0
|
|
|
|
0
|
unless ( $new_seq->isa('Bio::Seq::Meta') ) { |
460
|
0
|
|
|
|
|
0
|
my $meta_seq = Bio::Seq::Meta::Array->new; |
461
|
0
|
|
|
|
|
0
|
$meta_seq->seq( $new_seq->seq ); |
462
|
0
|
|
|
|
|
0
|
$meta_seq->start( $new_seq->start ); |
463
|
0
|
|
|
|
|
0
|
$meta_seq->end( $new_seq->end ); |
464
|
0
|
|
|
|
|
0
|
$new_seq = $meta_seq; |
465
|
|
|
|
|
|
|
} |
466
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_seq->meta_names ) { |
467
|
0
|
|
|
|
|
0
|
$new_seq->named_submeta( $meta_name, $old_end + 1, |
468
|
|
|
|
|
|
|
$new_seq->end, $cat_seq->named_meta($meta_name) ); |
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
} |
473
|
14
|
|
|
|
|
29
|
$aln->add_seq($new_seq); |
474
|
|
|
|
|
|
|
} |
475
|
1
|
|
|
|
|
6
|
my $cons_meta = $self->consensus_meta; |
476
|
1
|
|
|
|
|
1
|
my $new_cons_meta; |
477
|
1
|
50
|
|
|
|
3
|
if ($cons_meta) { |
478
|
0
|
|
|
|
|
0
|
$new_cons_meta = Bio::Seq::Meta->new(); |
479
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cons_meta->meta_names ) { |
480
|
0
|
|
|
|
|
0
|
$new_cons_meta->named_submeta( $meta_name, 1, $self->length, |
481
|
|
|
|
|
|
|
$cons_meta->$meta_name ); |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
} |
484
|
1
|
|
|
|
|
4
|
my $end = $self->length; |
485
|
1
|
|
|
|
|
2
|
for my $cat_aln (@aln) { |
486
|
1
|
|
|
|
|
3
|
my $cat_cons_meta = $cat_aln->consensus_meta; |
487
|
1
|
50
|
|
|
|
3
|
if ($cat_cons_meta) { |
488
|
0
|
0
|
|
|
|
0
|
$new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta; |
489
|
0
|
|
|
|
|
0
|
for my $meta_name ( $cat_cons_meta->meta_names ) { |
490
|
0
|
|
|
|
|
0
|
$new_cons_meta->named_submeta( |
491
|
|
|
|
|
|
|
$meta_name, $end + 1, |
492
|
|
|
|
|
|
|
$end + $cat_aln->length, |
493
|
|
|
|
|
|
|
$cat_cons_meta->$meta_name |
494
|
|
|
|
|
|
|
); |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
} |
497
|
1
|
|
|
|
|
3
|
$end += $cat_aln->length; |
498
|
|
|
|
|
|
|
} |
499
|
1
|
50
|
|
|
|
2
|
$aln->consensus_meta($new_cons_meta) if $new_cons_meta; |
500
|
1
|
|
|
|
|
4
|
return $aln; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head2 most_common_sequences |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
Title : most_common_sequences |
507
|
|
|
|
|
|
|
Usage : @common = most_common_sequences ($align, $case_sensitivity) |
508
|
|
|
|
|
|
|
Function : Returns an array of the sequences that appear most often in the |
509
|
|
|
|
|
|
|
alignment (although this probably makes more sense when there is |
510
|
|
|
|
|
|
|
only a single most common sequence). Sequences are compared after |
511
|
|
|
|
|
|
|
removing any "-" (gap characters), and ambiguous units (e.g., R |
512
|
|
|
|
|
|
|
for purines) are only compared to themselves. The returned |
513
|
|
|
|
|
|
|
sequence is also missing the "-" since they don't actually make |
514
|
|
|
|
|
|
|
part of the sequence. |
515
|
|
|
|
|
|
|
Returns : Array of text strings. |
516
|
|
|
|
|
|
|
Arguments : Optional argument defining whether the comparison between sequences |
517
|
|
|
|
|
|
|
to find the most common should be case sensitive. Defaults to |
518
|
|
|
|
|
|
|
false, i.e, not case sensitive. |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=cut |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
sub most_common_sequences { |
523
|
0
|
0
|
|
0
|
1
|
|
my $align = shift |
524
|
|
|
|
|
|
|
or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences"); |
525
|
0
|
|
|
|
|
|
my $case_sensitive = shift; # defaults to false (we get undef if nothing) |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
## We keep track of the max on this loop. Saves us having to |
528
|
|
|
|
|
|
|
## transverse the hash table later to find the maximum value. |
529
|
0
|
|
|
|
|
|
my $max = 0; |
530
|
0
|
|
|
|
|
|
my %counts; |
531
|
0
|
|
|
|
|
|
foreach ($align->each_seq) { |
532
|
0
|
|
|
|
|
|
(my $seq = $_->seq) =~ tr/-//d; |
533
|
0
|
0
|
|
|
|
|
$seq = uc ($seq) unless $case_sensitive; |
534
|
0
|
0
|
|
|
|
|
$max++ if (++$counts{$seq} > $max); |
535
|
|
|
|
|
|
|
} |
536
|
0
|
|
|
|
|
|
my @common = grep ($counts{$_} == $max, keys %counts); |
537
|
0
|
|
|
|
|
|
return @common; |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
1; |