| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
=encoding utf8 |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Bio::BPWrapper::SeqManipulations - Functions for L |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Bio::BPWrapper::SeqManipulations; |
|
10
|
|
|
|
|
|
|
# Set options hash ... |
|
11
|
|
|
|
|
|
|
initialize(\%opts); |
|
12
|
|
|
|
|
|
|
write_out(\%opts); |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=cut |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
package Bio::BPWrapper::SeqManipulations; |
|
17
|
|
|
|
|
|
|
|
|
18
|
19
|
|
|
19
|
|
139
|
use strict; # Still on 5.10, so need this for strict |
|
|
19
|
|
|
|
|
63
|
|
|
|
19
|
|
|
|
|
837
|
|
|
19
|
19
|
|
|
19
|
|
87
|
use warnings; |
|
|
19
|
|
|
|
|
32
|
|
|
|
19
|
|
|
|
|
1280
|
|
|
20
|
19
|
|
|
19
|
|
342
|
use 5.010; |
|
|
19
|
|
|
|
|
62
|
|
|
21
|
19
|
|
|
19
|
|
12879
|
use Bio::Seq; |
|
|
19
|
|
|
|
|
1825405
|
|
|
|
19
|
|
|
|
|
804
|
|
|
22
|
19
|
|
|
19
|
|
11441
|
use Bio::SeqIO; |
|
|
19
|
|
|
|
|
581375
|
|
|
|
19
|
|
|
|
|
1006
|
|
|
23
|
19
|
|
|
19
|
|
175
|
use File::Basename; |
|
|
19
|
|
|
|
|
38
|
|
|
|
19
|
|
|
|
|
2108
|
|
|
24
|
19
|
|
|
19
|
|
145
|
use Bio::Tools::CodonTable; |
|
|
19
|
|
|
|
|
72
|
|
|
|
19
|
|
|
|
|
683
|
|
|
25
|
19
|
|
|
19
|
|
10114
|
use Bio::DB::RefSeq; |
|
|
19
|
|
|
|
|
1677129
|
|
|
|
19
|
|
|
|
|
966
|
|
|
26
|
19
|
|
|
19
|
|
13048
|
use Bio::Tools::SeqStats; |
|
|
19
|
|
|
|
|
78056
|
|
|
|
19
|
|
|
|
|
830
|
|
|
27
|
19
|
|
|
19
|
|
185
|
use Bio::SeqUtils; |
|
|
19
|
|
|
|
|
37
|
|
|
|
19
|
|
|
|
|
629
|
|
|
28
|
19
|
|
|
19
|
|
125
|
use Scalar::Util; |
|
|
19
|
|
|
|
|
38
|
|
|
|
19
|
|
|
|
|
1017
|
|
|
29
|
19
|
|
|
19
|
|
103
|
use Exporter (); |
|
|
19
|
|
|
|
|
34
|
|
|
|
19
|
|
|
|
|
353
|
|
|
30
|
19
|
|
|
19
|
|
11277
|
use Bio::CodonUsage::IO; |
|
|
19
|
|
|
|
|
120814
|
|
|
|
19
|
|
|
|
|
889
|
|
|
31
|
19
|
|
|
19
|
|
14626
|
use Bio::Tools::pICalculator; |
|
|
19
|
|
|
|
|
26403
|
|
|
|
19
|
|
|
|
|
1104
|
|
|
32
|
|
|
|
|
|
|
# use Bio::Tools::GuessSeqFormat; |
|
33
|
|
|
|
|
|
|
|
|
34
|
19
|
|
|
19
|
|
12264
|
if ($ENV{'DEBUG'}) { use Data::Dumper } |
|
|
19
|
|
|
|
|
168854
|
|
|
|
19
|
|
|
|
|
1872
|
|
|
35
|
|
|
|
|
|
|
|
|
36
|
19
|
|
|
19
|
|
157
|
use vars qw(@ISA @EXPORT @EXPORT_OK); |
|
|
19
|
|
|
|
|
44
|
|
|
|
19
|
|
|
|
|
2410
|
|
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
#restrict_coord restrict_digest |
|
41
|
|
|
|
|
|
|
@EXPORT = qw(initialize can_handle handle_opt write_out |
|
42
|
|
|
|
|
|
|
print_composition filter_seqs retrieve_seqs remove_gaps |
|
43
|
|
|
|
|
|
|
print_lengths print_seq_count make_revcom |
|
44
|
|
|
|
|
|
|
print_subseq |
|
45
|
|
|
|
|
|
|
anonymize |
|
46
|
|
|
|
|
|
|
shred_seq count_codons print_gb_gene_feats |
|
47
|
|
|
|
|
|
|
count_leading_gaps hydroB linearize reloop_at |
|
48
|
|
|
|
|
|
|
remove_stop parse_orders find_by_order |
|
49
|
|
|
|
|
|
|
pick_by_order del_by_order find_by_id |
|
50
|
|
|
|
|
|
|
pick_by_id del_by_id find_by_re |
|
51
|
|
|
|
|
|
|
pick_by_re del_by_re |
|
52
|
|
|
|
|
|
|
pick_by_file del_by_file |
|
53
|
|
|
|
|
|
|
find_by_ambig pick_by_ambig del_by_ambig find_by_length |
|
54
|
|
|
|
|
|
|
del_by_length codon_sim codon_info); |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Package global variables |
|
57
|
|
|
|
|
|
|
my ($in, $out, $seq, %opts, $filename, $in_format, $out_format, $guesser); |
|
58
|
19
|
|
|
19
|
|
8965
|
use Bio::BPWrapper; |
|
|
19
|
|
|
|
|
72
|
|
|
|
19
|
|
|
|
|
163375
|
|
|
59
|
|
|
|
|
|
|
my $VERSION = '1.0'; |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
## For new options, just add an entry into this table with the same key as in |
|
62
|
|
|
|
|
|
|
## the GetOpts function in the main program. Make the key be a reference to the handler subroutine (defined below), and test that it works. |
|
63
|
|
|
|
|
|
|
my %opt_dispatch = ( |
|
64
|
|
|
|
|
|
|
'codon-table' => \&codon_table, |
|
65
|
|
|
|
|
|
|
# 'codon-sim' => \&codon_sim, |
|
66
|
|
|
|
|
|
|
'codon-info' => \&codon_info, |
|
67
|
|
|
|
|
|
|
'iep' => \&iso_electric_point, |
|
68
|
|
|
|
|
|
|
'composition' => \&print_composition, |
|
69
|
|
|
|
|
|
|
'mol-wt' => \&print_weight, |
|
70
|
|
|
|
|
|
|
'delete' => \&filter_seqs, |
|
71
|
|
|
|
|
|
|
'fetch' => \&retrieve_seqs, |
|
72
|
|
|
|
|
|
|
'no-gaps' => \&remove_gaps, |
|
73
|
|
|
|
|
|
|
'length' => \&print_lengths, |
|
74
|
|
|
|
|
|
|
'longest-orf' => \&update_longest_orf, |
|
75
|
|
|
|
|
|
|
'num-seq' => \&print_seq_count, |
|
76
|
|
|
|
|
|
|
'num-gaps-dna' => \&count_gaps_dna, |
|
77
|
|
|
|
|
|
|
'num-gaps-aa' => \&count_gaps_aa, |
|
78
|
|
|
|
|
|
|
'pick' => \&filter_seqs, |
|
79
|
|
|
|
|
|
|
'revcom' => \&make_revcom, |
|
80
|
|
|
|
|
|
|
'subseq' => \&print_subseq, |
|
81
|
|
|
|
|
|
|
'translate' => \&reading_frame_ops, |
|
82
|
|
|
|
|
|
|
# 'restrict-coord' => \&restrict_coord, |
|
83
|
|
|
|
|
|
|
# 'restrict' => \&restrict_digest, |
|
84
|
|
|
|
|
|
|
'anonymize' => \&anonymize, |
|
85
|
|
|
|
|
|
|
'break' => \&shred_seq, |
|
86
|
|
|
|
|
|
|
'count-codons' => \&count_codons, |
|
87
|
|
|
|
|
|
|
'feat2fas' => \&print_gb_gene_feats, |
|
88
|
|
|
|
|
|
|
'lead-gaps' => \&count_leading_gaps, |
|
89
|
|
|
|
|
|
|
'hydroB' => \&hydroB, |
|
90
|
|
|
|
|
|
|
'linearize' => \&linearize, |
|
91
|
|
|
|
|
|
|
'reloop' => \&reloop_at, |
|
92
|
|
|
|
|
|
|
'remove-stop' => \&remove_stop, |
|
93
|
|
|
|
|
|
|
'sort' => \&sort_by, |
|
94
|
|
|
|
|
|
|
'split-cdhit' => \&split_cdhit, |
|
95
|
|
|
|
|
|
|
'syn-code' => \&synon_codons, |
|
96
|
|
|
|
|
|
|
# 'dotplot' => \&draw_dotplot, |
|
97
|
|
|
|
|
|
|
# 'extract' => \&reading_frame_ops, |
|
98
|
|
|
|
|
|
|
# 'longest-orf' => \&reading_frame_ops, |
|
99
|
|
|
|
|
|
|
# 'prefix' => \&anonymize, |
|
100
|
|
|
|
|
|
|
'rename' => \&rename_id, |
|
101
|
|
|
|
|
|
|
# 'slidingwindow' => \&sliding_window, |
|
102
|
|
|
|
|
|
|
# 'split' => \&split_seqs, |
|
103
|
|
|
|
|
|
|
); |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
my %filter_dispatch = ( |
|
106
|
|
|
|
|
|
|
'find_by_order' => \&find_by_order, |
|
107
|
|
|
|
|
|
|
'pick_by_order' => \&pick_by_order, |
|
108
|
|
|
|
|
|
|
'delete_by_order' => \&del_by_order, |
|
109
|
|
|
|
|
|
|
'find_by_id' => \&find_by_id, |
|
110
|
|
|
|
|
|
|
'pick_by_id' => \&pick_by_id, |
|
111
|
|
|
|
|
|
|
'delete_by_id' => \&del_by_id, |
|
112
|
|
|
|
|
|
|
'find_by_re' => \&find_by_re, |
|
113
|
|
|
|
|
|
|
'pick_by_re' => \&pick_by_re, |
|
114
|
|
|
|
|
|
|
'find_by_file' => \&find_by_file, |
|
115
|
|
|
|
|
|
|
'pick_by_file' => \&pick_by_file, |
|
116
|
|
|
|
|
|
|
'delete_by_file' => \&del_by_file, |
|
117
|
|
|
|
|
|
|
'find_by_ambig' => \&find_by_ambig, |
|
118
|
|
|
|
|
|
|
'pick_by_ambig' => \&pick_by_ambig, |
|
119
|
|
|
|
|
|
|
'delete_by_ambig' => \&del_by_ambig, |
|
120
|
|
|
|
|
|
|
'find_by_length' => \&find_by_length, |
|
121
|
|
|
|
|
|
|
'pick_by_length' => \&pick_by_length, |
|
122
|
|
|
|
|
|
|
'delete_by_length' => \&del_by_length, |
|
123
|
|
|
|
|
|
|
); |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
##################### initializer & option handlers ################### |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head1 SUBROUTINES |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head2 initialize() |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
Sets up most of the actions to be performed on sequences. |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Call this right after setting up an options hash. |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Sets package variables: C<$in>, C<$in_format>, C<$filename>, C<$out_format>, and C<$out>. |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=cut |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub initialize { |
|
141
|
19
|
|
|
19
|
1
|
57
|
my $opts_ref = shift; |
|
142
|
19
|
|
|
|
|
117
|
Bio::BPWrapper::common_opts($opts_ref); |
|
143
|
19
|
|
|
|
|
37
|
%opts = %{$opts_ref}; |
|
|
19
|
|
|
|
|
80
|
|
|
144
|
|
|
|
|
|
|
|
|
145
|
19
|
50
|
33
|
|
|
93
|
die "Option 'prefix' requires a value\n" if defined $opts{"prefix"} && $opts{"prefix"} =~ /^$/; |
|
146
|
|
|
|
|
|
|
|
|
147
|
19
|
|
50
|
|
|
102
|
$filename = shift @ARGV || "STDIN"; # If no more arguments were given on the command line, assume we're getting input from standard input |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# guess format won't work for piped input; remove |
|
150
|
|
|
|
|
|
|
# if ($filename eq "STDIN") { |
|
151
|
|
|
|
|
|
|
# my $lines; |
|
152
|
|
|
|
|
|
|
# my $line_ct = 0; |
|
153
|
|
|
|
|
|
|
# while(<>) { $lines .= $_; $line_ct++; last if $line_ct >= 100 } # read the first 100 lines |
|
154
|
|
|
|
|
|
|
# $guesser = Bio::Tools::GuessSeqFormat->new( -text => $lines ); |
|
155
|
|
|
|
|
|
|
# } else { |
|
156
|
|
|
|
|
|
|
# $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename); |
|
157
|
|
|
|
|
|
|
# } |
|
158
|
|
|
|
|
|
|
# $in_format = $guesser->guess() unless $opts{'input'}; |
|
159
|
|
|
|
|
|
|
|
|
160
|
19
|
|
100
|
|
|
122
|
$in_format = $opts{"input"} // 'fasta'; |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta|fastq|embl|genbank/; |
|
163
|
19
|
50
|
|
|
|
295
|
$in = Bio::SeqIO->new(-format => $in_format, ($filename eq "STDIN")? (-fh => \*STDIN) : (-file => $filename)); |
|
164
|
|
|
|
|
|
|
|
|
165
|
19
|
|
100
|
|
|
290425
|
$out_format = $opts{"output"} // 'fasta'; |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# A change in SeqIO, commit 0e04486ca4cc2e61fd72, means -fh or -file is required |
|
168
|
19
|
|
|
|
|
395
|
$out = Bio::SeqIO->new(-format => $out_format, -fh => \*STDOUT) |
|
169
|
|
|
|
|
|
|
} |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub can_handle { |
|
172
|
18
|
|
|
18
|
0
|
82
|
my $option = shift; |
|
173
|
18
|
|
|
|
|
118
|
return defined($opt_dispatch{$option}) |
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub handle_opt { |
|
177
|
18
|
|
|
18
|
0
|
41
|
my $option = shift; |
|
178
|
18
|
|
|
|
|
88
|
$opt_dispatch{$option}->($option) # This passes option name to all functions |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head2 write_out() |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
Writes out the sequence file. |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Call this after calling C<#initialize(\%opts)> and processing those options. |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=cut |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub write_out { |
|
190
|
1
|
|
|
1
|
1
|
7
|
while ($seq = $in->next_seq()) { $out->write_seq($seq) } |
|
|
22
|
|
|
|
|
3766343
|
|
|
191
|
|
|
|
|
|
|
} |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
################### subroutines ######################## |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub codon_sim { |
|
197
|
|
|
|
|
|
|
# my $seq = $in->next_seq(); # only the first sequence used |
|
198
|
|
|
|
|
|
|
# if (&_internal_stop_or_x($seq->translate()->seq())) { |
|
199
|
|
|
|
|
|
|
# die "internal stop or non-standard base:\t" . $seq->id . "\texit.\n"; |
|
200
|
|
|
|
|
|
|
# } |
|
201
|
|
|
|
|
|
|
# use Algorithm::Numerical::Sample qw /sample/; |
|
202
|
|
|
|
|
|
|
# use Math::Random qw /random_permutation/; |
|
203
|
|
|
|
|
|
|
######################## |
|
204
|
|
|
|
|
|
|
# Read CUTG and make a random codon set for each AA |
|
205
|
|
|
|
|
|
|
######################## |
|
206
|
|
|
|
|
|
|
# my $cutg_file = $opts{'codon-sim'}; |
|
207
|
|
|
|
|
|
|
# my $io = Bio::CodonUsage::IO->new(-file => $cutg_file); |
|
208
|
|
|
|
|
|
|
# my $cdtable = $io->next_data(); |
|
209
|
0
|
|
|
0
|
0
|
0
|
my @bases = qw(A T C G); |
|
210
|
0
|
|
|
|
|
0
|
my @codons; |
|
211
|
0
|
|
|
|
|
0
|
for (my $i=0; $i<=3; $i++) { |
|
212
|
0
|
|
|
|
|
0
|
my $first = $bases[$i]; |
|
213
|
0
|
|
|
|
|
0
|
for (my $j=0; $j<=3; $j++) { |
|
214
|
0
|
|
|
|
|
0
|
my $second = $bases[$j]; |
|
215
|
0
|
|
|
|
|
0
|
for (my $k=0; $k<=3; $k++) { |
|
216
|
0
|
|
|
|
|
0
|
my $third = $bases[$k]; |
|
217
|
0
|
|
|
|
|
0
|
push @codons, $first . $second . $third; |
|
218
|
|
|
|
|
|
|
} |
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
|
|
222
|
0
|
|
|
|
|
0
|
my $myCodonTable = Bio::Tools::CodonTable->new( -id => 11 ); # bacterial codon table |
|
223
|
|
|
|
|
|
|
|
|
224
|
0
|
|
|
|
|
0
|
my (@cd_cts, %aas, %aa_cds); |
|
225
|
0
|
|
|
|
|
0
|
foreach my $cd (@codons) { |
|
226
|
0
|
|
|
|
|
0
|
my $aa = $myCodonTable->translate($cd); |
|
227
|
0
|
|
|
|
|
0
|
$aas{$aa}++; |
|
228
|
|
|
|
|
|
|
# push @cd_cts, {codon => $cd, aa => $aa, cts => $cdtable->codon_count($cd)}; |
|
229
|
0
|
|
|
|
|
0
|
push @cd_cts, {codon => $cd, aa => $aa}; |
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
|
|
232
|
0
|
|
|
|
|
0
|
foreach my $aa (keys %aas) { |
|
233
|
0
|
|
|
|
|
0
|
my @cds = grep {$_->{aa} eq $aa} @cd_cts; |
|
|
0
|
|
|
|
|
0
|
|
|
234
|
0
|
|
|
|
|
0
|
$aa_cds{$aa} = \@cds; |
|
235
|
|
|
|
|
|
|
# my @cd_sets; |
|
236
|
|
|
|
|
|
|
# foreach (@cds) { |
|
237
|
|
|
|
|
|
|
# for (my $i=1; $i<=$_->{cts}; $i++) { |
|
238
|
|
|
|
|
|
|
# push @cd_sets, $_->{codon}; |
|
239
|
|
|
|
|
|
|
# } |
|
240
|
|
|
|
|
|
|
# } |
|
241
|
|
|
|
|
|
|
# @cd_sets = random_permutation(@cd_sets); |
|
242
|
|
|
|
|
|
|
# $aa_cds{$aa} = \@cd_sets; |
|
243
|
|
|
|
|
|
|
# $aa_cds{$aa} = \@cd_sets; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
|
|
|
|
|
|
# print Dumper(\%aa_cds); |
|
246
|
0
|
|
|
|
|
0
|
return(\%aa_cds); |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# fisher_yates_shuffle( \@array ) : generate a random permutation |
|
250
|
|
|
|
|
|
|
# of @array in place |
|
251
|
|
|
|
|
|
|
sub fisher_yates_shuffle { |
|
252
|
0
|
|
|
0
|
0
|
0
|
my $array = shift; |
|
253
|
0
|
|
|
|
|
0
|
my $i; |
|
254
|
0
|
|
|
|
|
0
|
for ($i = @$array; --$i; ) { |
|
255
|
0
|
|
|
|
|
0
|
my $j = int rand ($i+1); |
|
256
|
0
|
0
|
|
|
|
0
|
next if $i == $j; |
|
257
|
0
|
|
|
|
|
0
|
@$array[$i,$j] = @$array[$j,$i]; |
|
258
|
|
|
|
|
|
|
} |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub synon_codons { |
|
262
|
0
|
|
|
0
|
0
|
0
|
my %codon_set = %{&codon_sim}; |
|
|
0
|
|
|
|
|
0
|
|
|
263
|
0
|
|
|
|
|
0
|
while( my $seq = $in->next_seq() ) { |
|
264
|
|
|
|
|
|
|
############################## |
|
265
|
|
|
|
|
|
|
# generate a random CDS with the same AA sequence |
|
266
|
|
|
|
|
|
|
############################### |
|
267
|
0
|
|
|
|
|
0
|
my $pep = $seq->translate()->seq(); |
|
268
|
0
|
0
|
|
|
|
0
|
if (&_internal_stop_or_x($pep)) { |
|
269
|
0
|
|
|
|
|
0
|
warn "internal stop or non-standard base:\t" . $seq->id . "\tskip.\n"; |
|
270
|
0
|
|
|
|
|
0
|
next; |
|
271
|
|
|
|
|
|
|
} |
|
272
|
0
|
|
|
|
|
0
|
my @aas = split //, $pep; |
|
273
|
0
|
|
|
|
|
0
|
my $sim_cds = ""; |
|
274
|
|
|
|
|
|
|
#print $pep, "\n"; |
|
275
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i <= $#aas; $i++) { |
|
276
|
0
|
|
|
|
|
0
|
my $cd_set = $codon_set{$aas[$i]}; |
|
277
|
|
|
|
|
|
|
# print $aas[$i], "\t", $cd_set->[0]->{codon}, "\n"; |
|
278
|
0
|
0
|
|
|
|
0
|
&fisher_yates_shuffle( $cd_set ) if scalar(@$cd_set) > 1; # permutes @array in place, for each codon |
|
279
|
|
|
|
|
|
|
# next if $aas[$i] eq '*'; |
|
280
|
|
|
|
|
|
|
# my @sampled_cds = sample(-set => $aa_cds{$aas[$i]}); # sample 1 by default |
|
281
|
0
|
|
|
|
|
0
|
my $cd_sim = $cd_set->[0]; |
|
282
|
0
|
|
|
|
|
0
|
$sim_cds .= $cd_sim->{codon}; |
|
283
|
|
|
|
|
|
|
} |
|
284
|
0
|
|
|
|
|
0
|
my $sim_obj = Bio::Seq->new(-id => $seq->id() . "|sim", -seq => $sim_cds); |
|
285
|
0
|
|
|
|
|
0
|
$out->write_seq($sim_obj); |
|
286
|
|
|
|
|
|
|
} |
|
287
|
0
|
|
|
|
|
0
|
exit; |
|
288
|
|
|
|
|
|
|
} |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
sub count_gaps_dna { |
|
292
|
0
|
|
|
0
|
0
|
0
|
while( my $seqobj = $in->next_seq() ) { |
|
293
|
0
|
|
|
|
|
0
|
print $seqobj->id(), "\t"; |
|
294
|
0
|
|
|
|
|
0
|
my ($num, $ref) = &_count_gap($seqobj->seq(), 'dna'); |
|
295
|
0
|
|
|
|
|
0
|
print $num, "\n"; |
|
296
|
0
|
0
|
|
|
|
0
|
if ($num) { |
|
297
|
0
|
|
|
|
|
0
|
my @pos; |
|
298
|
0
|
|
|
|
|
0
|
foreach my $mono (keys %$ref) { |
|
299
|
0
|
|
|
|
|
0
|
foreach (@{$ref->{$mono}}) { push @pos, $_} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
300
|
|
|
|
|
|
|
} |
|
301
|
|
|
|
|
|
|
# print STDERR join "\t", sort {$a <=> $b} @pos; |
|
302
|
|
|
|
|
|
|
# print STDERR "\n"; |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
} |
|
305
|
0
|
|
|
|
|
0
|
exit; |
|
306
|
|
|
|
|
|
|
} |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
sub count_gaps_aa { |
|
309
|
0
|
|
|
0
|
0
|
0
|
while( my $seqobj = $in->next_seq() ) { |
|
310
|
0
|
|
|
|
|
0
|
print $seqobj->id(), "\t"; |
|
311
|
0
|
|
|
|
|
0
|
my ($num, $ref) = &_count_gap($seqobj->seq(), 'protein'); |
|
312
|
0
|
|
|
|
|
0
|
print $num, "\n"; |
|
313
|
0
|
0
|
|
|
|
0
|
if ($num) { |
|
314
|
0
|
|
|
|
|
0
|
my @pos; |
|
315
|
0
|
|
|
|
|
0
|
foreach my $mono (keys %$ref) { |
|
316
|
0
|
|
|
|
|
0
|
foreach (@{$ref->{$mono}}) { push @pos, $_} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
317
|
|
|
|
|
|
|
} |
|
318
|
|
|
|
|
|
|
# print STDERR join "\t", sort {$a <=> $b} @pos; |
|
319
|
|
|
|
|
|
|
# print STDERR "\n"; |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
} |
|
322
|
0
|
|
|
|
|
0
|
exit; |
|
323
|
|
|
|
|
|
|
} |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
sub _count_gap { |
|
326
|
0
|
|
|
0
|
|
0
|
my ($str, $type) = @_; |
|
327
|
0
|
|
|
|
|
0
|
my @mono = split('', $str); |
|
328
|
0
|
|
|
|
|
0
|
my %seen_gaps; |
|
329
|
0
|
|
|
|
|
0
|
my $num_gaps = 0; |
|
330
|
0
|
|
|
|
|
0
|
my $ct = 0; |
|
331
|
0
|
|
|
|
|
0
|
foreach my $mon (@mono) { |
|
332
|
0
|
|
|
|
|
0
|
$ct++; |
|
333
|
0
|
0
|
0
|
|
|
0
|
next if ($type eq 'dna' && $mon =~ /[atcg]/i) or ($type eq 'protein' && $mon =~ /[ACDEFGHIKLMNPQRSTVWY]/i) or ($type eq 'protein' && $mon =~ /\*\s*$/); |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
334
|
0
|
|
|
|
|
0
|
$num_gaps ++; |
|
335
|
0
|
0
|
|
|
|
0
|
if ($seen_gaps{$mon}) { |
|
336
|
0
|
|
|
|
|
0
|
push @{$seen_gaps{$mon}}, $ct; |
|
|
0
|
|
|
|
|
0
|
|
|
337
|
|
|
|
|
|
|
} else { |
|
338
|
0
|
|
|
|
|
0
|
$seen_gaps{$mon} = [$ct] |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
# print Dumper(\%seen_gaps) if $num_gaps; |
|
342
|
0
|
|
|
|
|
0
|
return ($num_gaps, \%seen_gaps); |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
sub rename_id { |
|
346
|
0
|
|
|
0
|
0
|
0
|
my $optStr = $opts{rename}; |
|
347
|
0
|
|
|
|
|
0
|
my %names; |
|
348
|
|
|
|
|
|
|
|
|
349
|
0
|
0
|
|
|
|
0
|
if ($optStr =~ /^id:(\S+);(\S+)$/) { |
|
350
|
0
|
|
|
|
|
0
|
$names{$1} = $2; |
|
351
|
|
|
|
|
|
|
} else { |
|
352
|
0
|
|
0
|
|
|
0
|
open NAME, "<", $optStr || die "a file with old-tab-new needed\n"; |
|
353
|
0
|
|
|
|
|
0
|
while() { |
|
354
|
0
|
|
|
|
|
0
|
chomp; |
|
355
|
0
|
|
|
|
|
0
|
my ($oldN, $newN) = split; |
|
356
|
0
|
|
|
|
|
0
|
$names{$oldN} = $newN; |
|
357
|
|
|
|
|
|
|
} |
|
358
|
0
|
|
|
|
|
0
|
close NAME; |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
|
|
361
|
0
|
|
|
|
|
0
|
while( my $seqobj = $in->next_seq() ) { |
|
362
|
0
|
|
|
|
|
0
|
my $id = $seqobj->display_id(); |
|
363
|
0
|
0
|
|
|
|
0
|
if ($names{$id}) { |
|
364
|
|
|
|
|
|
|
# $seqobj->id($id . "|" . $names{$id}); |
|
365
|
|
|
|
|
|
|
# warn "$id appended by $names{$id}\n"; |
|
366
|
0
|
|
|
|
|
0
|
$seqobj->id($names{$id}); |
|
367
|
0
|
|
|
|
|
0
|
warn "$id replaced by $names{$id}\n"; |
|
368
|
|
|
|
|
|
|
} else { |
|
369
|
0
|
|
|
|
|
0
|
warn "$id not changed\n"; |
|
370
|
|
|
|
|
|
|
} |
|
371
|
0
|
|
|
|
|
0
|
$out->write_seq($seqobj); |
|
372
|
|
|
|
|
|
|
} |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
sub update_longest_orf { |
|
377
|
0
|
|
|
0
|
0
|
0
|
while( my $seqobj = $in->next_seq() ) { |
|
378
|
0
|
|
|
|
|
0
|
my $pep_string = $seqobj->translate( undef, undef, 0 )->seq(); |
|
379
|
0
|
0
|
|
|
|
0
|
unless ($pep_string =~ /\*[A-Z]/) { # no internal stop; don't proceed |
|
380
|
0
|
|
|
|
|
0
|
my $id = $seqobj->id(); |
|
381
|
0
|
|
|
|
|
0
|
$seqobj->id($id . "|+1"); |
|
382
|
0
|
|
|
|
|
0
|
$seqobj = &_trim_end_to_frame($seqobj); |
|
383
|
0
|
|
|
|
|
0
|
$out->write_seq($seqobj); |
|
384
|
|
|
|
|
|
|
# warn $seqobj->id, ": +1 ok\n"; |
|
385
|
0
|
|
|
|
|
0
|
next; |
|
386
|
|
|
|
|
|
|
} |
|
387
|
|
|
|
|
|
|
|
|
388
|
0
|
0
|
|
|
|
0
|
unless ($opts{"no-revcom"}) { # do not search in revcom |
|
389
|
0
|
|
|
|
|
0
|
my $pep_rev = $seqobj->revcom()->translate( undef, undef, 0 )->seq(); |
|
390
|
0
|
0
|
|
|
|
0
|
unless ($pep_rev =~ /\*[A-Z]/) { # no internal stop for revcom |
|
391
|
0
|
|
|
|
|
0
|
my $id = $seqobj->id(); |
|
392
|
0
|
|
|
|
|
0
|
$seqobj->id($id . "|-1"); |
|
393
|
0
|
|
|
|
|
0
|
my $rev = $seqobj->revcom(); |
|
394
|
0
|
|
|
|
|
0
|
$rev = &_trim_end_to_frame($rev); |
|
395
|
0
|
|
|
|
|
0
|
$out->write_seq($rev); |
|
396
|
|
|
|
|
|
|
# warn $seqobj->id(), ": -1 ok\n"; |
|
397
|
0
|
|
|
|
|
0
|
next; |
|
398
|
|
|
|
|
|
|
} |
|
399
|
|
|
|
|
|
|
} |
|
400
|
|
|
|
|
|
|
|
|
401
|
0
|
|
|
|
|
0
|
my $longest = { |
|
402
|
|
|
|
|
|
|
'aa_start' => 1, |
|
403
|
|
|
|
|
|
|
'aa_end' => 1, |
|
404
|
|
|
|
|
|
|
'aa_length' => 1, |
|
405
|
|
|
|
|
|
|
'nt_start' => 1, |
|
406
|
|
|
|
|
|
|
'nt_end' => 1, |
|
407
|
|
|
|
|
|
|
'nt_seq' => $seqobj->seq(), |
|
408
|
|
|
|
|
|
|
'frame' => 1, |
|
409
|
|
|
|
|
|
|
}; |
|
410
|
|
|
|
|
|
|
|
|
411
|
0
|
|
|
|
|
0
|
foreach my $fm (1, 2, 3, -1, -2, -3 ) { |
|
412
|
|
|
|
|
|
|
# warn "checking frame $fm ...\n"; |
|
413
|
0
|
0
|
0
|
|
|
0
|
next if $opts{"no-revcom"} && $fm < 0; # do not search in revcom |
|
414
|
0
|
0
|
|
|
|
0
|
my $new_seqobj = Bio::Seq->new( |
|
415
|
|
|
|
|
|
|
-id => $seqobj->id() . "|$fm", |
|
416
|
|
|
|
|
|
|
-seq => $fm > 0 ? $seqobj->subseq( $fm, $seqobj->length() ) : $seqobj->revcom()->subseq( abs($fm), $seqobj->length() ) |
|
417
|
|
|
|
|
|
|
); # chop seq to frame first |
|
418
|
|
|
|
|
|
|
|
|
419
|
0
|
|
|
|
|
0
|
&_get_longest($new_seqobj, $longest, $fm); |
|
420
|
|
|
|
|
|
|
# warn "longest ORF:", $longest->{aa_length}, "\n"; |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
# warn "start codon not M/V/L:", $seqobj->id() unless substr( $longest->{nt_seq}, 0, 3 ) =~ /[atg|gt[atcg]|ct[atcg]|tt[ag]/i; |
|
424
|
|
|
|
|
|
|
# print ">", $seqobj->id, "|f", $longest->{frame}, "|longest-orf\n", $longest->{nt_seq}, "\n"; |
|
425
|
0
|
0
|
|
|
|
0
|
my $fid = $longest->{frame} > 0 ? "+" . $longest->{frame} : $longest->{frame}; |
|
426
|
0
|
|
|
|
|
0
|
my $longest_seq = Bio::Seq->new(-id => $seqobj->id . "|" . $fid, -seq => $longest->{nt_seq} ); |
|
427
|
0
|
|
|
|
|
0
|
$longest_seq = &_trim_end_to_frame($longest_seq); |
|
428
|
0
|
|
|
|
|
0
|
$out->write_seq($longest_seq); |
|
429
|
|
|
|
|
|
|
} |
|
430
|
|
|
|
|
|
|
} |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
sub _trim_end_to_frame { |
|
433
|
0
|
|
|
0
|
|
0
|
my $seqobj = shift; |
|
434
|
0
|
|
|
|
|
0
|
my $seq_len = $seqobj->length(); |
|
435
|
0
|
|
|
|
|
0
|
my $remainder = $seq_len % 3; |
|
436
|
0
|
|
|
|
|
0
|
my $seqstr = $seqobj->subseq(1, $seq_len - $remainder); |
|
437
|
0
|
|
|
|
|
0
|
$seqobj->seq($seqstr); |
|
438
|
0
|
|
|
|
|
0
|
return $seqobj; |
|
439
|
|
|
|
|
|
|
} |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
sub _get_longest { # for each frame |
|
442
|
0
|
|
|
0
|
|
0
|
my ($seq, $ref, $fm) = @_; |
|
443
|
0
|
|
|
|
|
0
|
my $pep_string = $seq->translate( undef, undef, 0 )->seq(); |
|
444
|
0
|
0
|
|
|
|
0
|
unless ($pep_string =~ /\*[A-Z]/) { # no internal stops, found the longest |
|
445
|
0
|
|
|
|
|
0
|
$ref->{nt_seq} = $seq->seq(); |
|
446
|
0
|
|
|
|
|
0
|
$ref->{frame} = $fm; |
|
447
|
0
|
|
|
|
|
0
|
$ref->{aa_length} = $seq->length()/3; |
|
448
|
|
|
|
|
|
|
} else { # has internal stops |
|
449
|
0
|
0
|
|
|
|
0
|
die $seq->id(), " contains ambiguous aa (X)\n" if $pep_string =~ /X/; |
|
450
|
0
|
|
|
|
|
0
|
my $three_prime = $seq->length(); |
|
451
|
|
|
|
|
|
|
|
|
452
|
0
|
|
|
|
|
0
|
my $start = 1; |
|
453
|
0
|
|
|
|
|
0
|
my @aas = split '', $pep_string; |
|
454
|
0
|
|
|
|
|
0
|
for ( my $i = 0; $i <= $#aas; $i++ ) { |
|
455
|
0
|
0
|
0
|
|
|
0
|
next unless $aas[$i] eq '*' || $i == $#aas; # hit a stop codon or end of sequence |
|
456
|
0
|
0
|
|
|
|
0
|
if ($i - $start + 2 > $ref->{aa_length}) { # if longer than the last longest |
|
457
|
0
|
|
|
|
|
0
|
$ref->{aa_start} = $start; |
|
458
|
0
|
|
|
|
|
0
|
$ref->{aa_end} = $i + 1; |
|
459
|
0
|
|
|
|
|
0
|
$ref->{aa_length} = $i - $start + 2; |
|
460
|
0
|
|
|
|
|
0
|
$ref->{nt_start} = 3 * ( $start - 1 ) + 1; |
|
461
|
0
|
|
|
|
|
0
|
my $end = 3 * $i + 3; |
|
462
|
0
|
0
|
|
|
|
0
|
$end |
|
463
|
|
|
|
|
|
|
= ( $end > $three_prime ) |
|
464
|
|
|
|
|
|
|
? $three_prime |
|
465
|
|
|
|
|
|
|
: $end; # guranteed not to go beyond 3' |
|
466
|
0
|
|
|
|
|
0
|
$ref->{nt_end} = $end; |
|
467
|
0
|
|
|
|
|
0
|
$ref->{frame} = $fm; |
|
468
|
0
|
|
|
|
|
0
|
$ref->{nt_seq} = $seq->subseq( 3 * ( $start - 1 ) + 1, $end ); |
|
469
|
|
|
|
|
|
|
} |
|
470
|
0
|
|
|
|
|
0
|
$start = $i + 2; # re-start |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
} |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub iso_electric_point { |
|
477
|
0
|
|
|
0
|
0
|
0
|
my $calc = Bio::Tools::pICalculator->new(-places => 2, -pKset => 'EMBOSS'); |
|
478
|
0
|
|
|
|
|
0
|
while ( my $seq = $in->next_seq ) { |
|
479
|
0
|
|
|
|
|
0
|
$calc->seq($seq); |
|
480
|
0
|
|
|
|
|
0
|
print $seq->id(), "\t", $calc->iep; |
|
481
|
0
|
|
|
|
|
0
|
for(my $i = 0; $i <= 14; $i += 0.5 ){ |
|
482
|
0
|
|
|
|
|
0
|
print "\t", $i, "|", sprintf("%.2f", $calc->charge_at_pH($i)); |
|
483
|
|
|
|
|
|
|
} |
|
484
|
0
|
|
|
|
|
0
|
print "\n"; |
|
485
|
|
|
|
|
|
|
} |
|
486
|
|
|
|
|
|
|
} |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
sub print_weight { |
|
489
|
0
|
|
|
0
|
0
|
0
|
while ($seq = $in->next_seq()) { |
|
490
|
0
|
|
|
|
|
0
|
my $ref_weight = Bio::Tools::SeqStats->get_mol_wt($seq); |
|
491
|
0
|
|
|
|
|
0
|
print join "\t", $seq->id(), $ref_weight->[0], $ref_weight->[1]; |
|
492
|
0
|
|
|
|
|
0
|
print "\n"; |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
} |
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
sub codon_info { |
|
497
|
0
|
|
0
|
0
|
0
|
0
|
my $cutg_file = $opts{'codon-info'} || "need a codon usage file in CUTG GCG format"; |
|
498
|
0
|
|
|
|
|
0
|
my $io = Bio::CodonUsage::IO->new(-file => $cutg_file); |
|
499
|
0
|
|
|
|
|
0
|
my $cdtable = $io->next_data(); |
|
500
|
0
|
|
|
|
|
0
|
my @bases = qw(A T C G); |
|
501
|
0
|
|
|
|
|
0
|
my @codons; |
|
502
|
0
|
|
|
|
|
0
|
for (my $i=0; $i<=3; $i++) { |
|
503
|
0
|
|
|
|
|
0
|
my $first = $bases[$i]; |
|
504
|
0
|
|
|
|
|
0
|
for (my $j=0; $j<=3; $j++) { |
|
505
|
0
|
|
|
|
|
0
|
my $second = $bases[$j]; |
|
506
|
0
|
|
|
|
|
0
|
for (my $k=0; $k<=3; $k++) { |
|
507
|
0
|
|
|
|
|
0
|
my $third = $bases[$k]; |
|
508
|
0
|
|
|
|
|
0
|
push @codons, $first . $second . $third; |
|
509
|
|
|
|
|
|
|
} |
|
510
|
|
|
|
|
|
|
} |
|
511
|
|
|
|
|
|
|
} |
|
512
|
|
|
|
|
|
|
|
|
513
|
0
|
|
|
|
|
0
|
my $myCodonTable = Bio::Tools::CodonTable->new( -id => 1 ); |
|
514
|
|
|
|
|
|
|
#print Dumper($cdtable->all_aa_frequencies); |
|
515
|
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
0
|
my (@cd_cts, %aas); |
|
517
|
0
|
|
|
|
|
0
|
foreach my $cd (@codons) { |
|
518
|
0
|
|
|
|
|
0
|
my $aa = $myCodonTable->translate($cd); |
|
519
|
0
|
|
|
|
|
0
|
$aas{$aa}++; |
|
520
|
0
|
|
|
|
|
0
|
push @cd_cts, {codon => $cd, aa => $aa, cts => $cdtable->codon_count($cd)}; |
|
521
|
|
|
|
|
|
|
} |
|
522
|
|
|
|
|
|
|
|
|
523
|
0
|
|
|
|
|
0
|
my $h_genome = 0; |
|
524
|
0
|
|
|
|
|
0
|
foreach my $aa (keys %aas) { |
|
525
|
0
|
|
|
|
|
0
|
my @cds = grep {$_->{aa} eq $aa} @cd_cts; |
|
|
0
|
|
|
|
|
0
|
|
|
526
|
0
|
|
|
|
|
0
|
$h_genome += &__cd_entropy(\@cds); |
|
527
|
|
|
|
|
|
|
} |
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=begin |
|
530
|
|
|
|
|
|
|
use Bio::Tools::CodonOptTable; |
|
531
|
|
|
|
|
|
|
while (my $seq = $in->next_seq()) { |
|
532
|
|
|
|
|
|
|
my $seqobj = Bio::Tools::CodonOptTable->new( |
|
533
|
|
|
|
|
|
|
-seq => $seq->seq(), |
|
534
|
|
|
|
|
|
|
-genetic_code => 1, |
|
535
|
|
|
|
|
|
|
-alphabet => 'dna', |
|
536
|
|
|
|
|
|
|
-is_circular => 0, |
|
537
|
|
|
|
|
|
|
-id => $seq->id(), |
|
538
|
|
|
|
|
|
|
); |
|
539
|
|
|
|
|
|
|
my $myCodons = $seqobj->rscu_rac_table(); |
|
540
|
|
|
|
|
|
|
my %oneLetterAA; |
|
541
|
|
|
|
|
|
|
my @cdCTs; |
|
542
|
|
|
|
|
|
|
my $numCodons = 0; |
|
543
|
|
|
|
|
|
|
foreach my $rec (@$myCodons) { |
|
544
|
|
|
|
|
|
|
$numCodons += $rec->{frequency}; |
|
545
|
|
|
|
|
|
|
my $codon = $rec->{codon}; |
|
546
|
|
|
|
|
|
|
my $aa = $myCodonTable->translate($codon); |
|
547
|
|
|
|
|
|
|
$oneLetterAA{$aa}++; |
|
548
|
|
|
|
|
|
|
push @cdCTs, {codon => $codon, aa => $aa, cts => $rec->{frequency}}; |
|
549
|
|
|
|
|
|
|
} |
|
550
|
|
|
|
|
|
|
=cut |
|
551
|
0
|
|
|
|
|
0
|
while (my $seq = $in->next_seq()) { |
|
552
|
0
|
0
|
|
|
|
0
|
if (&_internal_stop_or_x($seq->translate()->seq())) { |
|
553
|
0
|
|
|
|
|
0
|
warn "internal stop or non-standard base:\t" . $seq->id . "\tskip.\n"; |
|
554
|
0
|
|
|
|
|
0
|
next; |
|
555
|
|
|
|
|
|
|
} |
|
556
|
0
|
|
|
|
|
0
|
my %oneLetterAA; |
|
557
|
|
|
|
|
|
|
my %codons; |
|
558
|
0
|
|
|
|
|
0
|
my @cdCTs; |
|
559
|
0
|
|
|
|
|
0
|
my $numCodons = 0; |
|
560
|
0
|
|
|
|
|
0
|
for (my $i=0; $i<=$seq->length()-3; $i+=3) { |
|
561
|
0
|
|
|
|
|
0
|
$codons{substr($seq->seq(), $i, 3)}++; |
|
562
|
0
|
|
|
|
|
0
|
$numCodons++; |
|
563
|
|
|
|
|
|
|
} |
|
564
|
|
|
|
|
|
|
|
|
565
|
0
|
|
|
|
|
0
|
foreach my $cd (keys %codons) { |
|
566
|
0
|
|
|
|
|
0
|
my $aa = $myCodonTable->translate($cd); |
|
567
|
0
|
|
|
|
|
0
|
$oneLetterAA{$aa}++; |
|
568
|
0
|
|
|
|
|
0
|
push @cdCTs, {codon => $cd, aa => $aa, cts => $codons{$cd}}; |
|
569
|
|
|
|
|
|
|
} |
|
570
|
|
|
|
|
|
|
|
|
571
|
0
|
|
|
|
|
0
|
my $h_cds = 0; |
|
572
|
0
|
|
|
|
|
0
|
foreach my $aa (keys %oneLetterAA) { |
|
573
|
0
|
|
|
|
|
0
|
my @cds = grep {$_->{aa} eq $aa} @cdCTs; |
|
|
0
|
|
|
|
|
0
|
|
|
574
|
0
|
|
|
|
|
0
|
$h_cds += &__cd_entropy(\@cds); |
|
575
|
|
|
|
|
|
|
} |
|
576
|
0
|
|
|
|
|
0
|
print $seq->id, "\t", $seq->length(), "\t", $numCodons, "\t"; |
|
577
|
0
|
|
|
|
|
0
|
printf "%.6f\n", $h_genome - $h_cds; |
|
578
|
|
|
|
|
|
|
} |
|
579
|
|
|
|
|
|
|
} |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sub __cd_entropy { |
|
582
|
0
|
|
|
0
|
|
0
|
my $ref = shift; |
|
583
|
0
|
|
|
|
|
0
|
my @cd_obj = @$ref; |
|
584
|
0
|
0
|
|
|
|
0
|
return 0 if @cd_obj <= 1; # single codons (M & W) |
|
585
|
0
|
|
|
|
|
0
|
my $sum = 0; |
|
586
|
0
|
|
|
|
|
0
|
my $h = 0; |
|
587
|
0
|
|
|
|
|
0
|
foreach (@cd_obj) { |
|
588
|
0
|
|
|
|
|
0
|
$sum += $_->{cts}; |
|
589
|
|
|
|
|
|
|
} |
|
590
|
0
|
0
|
|
|
|
0
|
return 0 unless $sum > 0; |
|
591
|
0
|
|
|
|
|
0
|
foreach (@cd_obj) { |
|
592
|
0
|
|
|
|
|
0
|
$_->{rel_freq} = $_->{cts}/$sum; |
|
593
|
|
|
|
|
|
|
} |
|
594
|
|
|
|
|
|
|
|
|
595
|
0
|
|
|
|
|
0
|
foreach (@cd_obj) { |
|
596
|
0
|
0
|
|
|
|
0
|
next unless $_->{rel_freq} > 0; |
|
597
|
0
|
|
|
|
|
0
|
$h -= $_->{rel_freq} * log($_->{rel_freq})/log(2); |
|
598
|
|
|
|
|
|
|
} |
|
599
|
0
|
|
|
|
|
0
|
return $h; |
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
sub sort_by { |
|
603
|
0
|
|
|
0
|
0
|
0
|
my $match = $opts{'sort'}; |
|
604
|
0
|
0
|
|
|
|
0
|
$match =~ /length|id|file/ || die "Enter 'length', 'id', or 'file:filename' to sort.\n"; |
|
605
|
0
|
|
|
|
|
0
|
my @seqs; |
|
606
|
|
|
|
|
|
|
|
|
607
|
0
|
|
|
|
|
0
|
while (my $seq = $in->next_seq()) { |
|
608
|
0
|
|
|
|
|
0
|
push @seqs, {id => $seq->display_id, len => $seq->length, seqob => $seq}; |
|
609
|
|
|
|
|
|
|
} |
|
610
|
|
|
|
|
|
|
|
|
611
|
0
|
0
|
|
|
|
0
|
if ($match eq 'id'){ |
|
612
|
0
|
|
|
|
|
0
|
@seqs = sort {$a->{id} cmp $b->{id}} @seqs; |
|
|
0
|
|
|
|
|
0
|
|
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
|
|
615
|
0
|
0
|
|
|
|
0
|
if ($match eq 'length'){ |
|
616
|
0
|
|
|
|
|
0
|
@seqs = sort {$a->{len} <=> $b->{len}} @seqs; |
|
|
0
|
|
|
|
|
0
|
|
|
617
|
|
|
|
|
|
|
} |
|
618
|
|
|
|
|
|
|
|
|
619
|
0
|
0
|
|
|
|
0
|
if ($match =~ /file:(\S+)/){ |
|
620
|
0
|
|
|
|
|
0
|
my $filename = $1; |
|
621
|
0
|
|
0
|
|
|
0
|
open FL, "<", $filename || die "file not found: $filename\n"; |
|
622
|
0
|
|
|
|
|
0
|
my %order; |
|
623
|
0
|
|
|
|
|
0
|
my $ct = 1; |
|
624
|
0
|
|
|
|
|
0
|
while() { |
|
625
|
0
|
|
|
|
|
0
|
chomp; |
|
626
|
0
|
0
|
|
|
|
0
|
next unless /(\S+)/; |
|
627
|
0
|
|
|
|
|
0
|
$order{$1} = $ct++; |
|
628
|
|
|
|
|
|
|
} |
|
629
|
0
|
|
|
|
|
0
|
close FL; |
|
630
|
0
|
|
|
|
|
0
|
foreach (@seqs) { |
|
631
|
0
|
0
|
|
|
|
0
|
die "id not in order file: ", $_->{id}, "\n" unless $order{ $_->{id} }; |
|
632
|
|
|
|
|
|
|
} |
|
633
|
0
|
|
|
|
|
0
|
@seqs = sort { $order{ $a->{id} } <=> $order{ $b->{id} } } @seqs; |
|
|
0
|
|
|
|
|
0
|
|
|
634
|
|
|
|
|
|
|
} |
|
635
|
|
|
|
|
|
|
|
|
636
|
0
|
|
|
|
|
0
|
foreach (@seqs) { |
|
637
|
0
|
|
|
|
|
0
|
$out->write_seq($_->{seqob}); |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
} |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
sub split_cdhit { |
|
642
|
0
|
|
|
0
|
0
|
0
|
my $cls_file = $opts{'split-cdhit'}; |
|
643
|
0
|
|
0
|
|
|
0
|
open IN, "<" . $cls_file || die "cdhit clstr file not found: $cls_file\n"; |
|
644
|
0
|
|
|
|
|
0
|
my %clusters; |
|
645
|
|
|
|
|
|
|
my $cl_id; |
|
646
|
0
|
|
|
|
|
0
|
my @mem; |
|
647
|
0
|
|
|
|
|
0
|
while () { |
|
648
|
0
|
|
|
|
|
0
|
my $line = $_; |
|
649
|
0
|
|
|
|
|
0
|
chomp $line; |
|
650
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^>(\S+)\s+(\d+)/) { |
|
651
|
0
|
|
|
|
|
0
|
$cl_id = $1 . "_" . $2; |
|
652
|
0
|
|
|
|
|
0
|
my @mem = (); |
|
653
|
0
|
|
|
|
|
0
|
$clusters{$cl_id} = \@mem; |
|
654
|
|
|
|
|
|
|
} else { |
|
655
|
0
|
|
|
|
|
0
|
my $ref = $clusters{$cl_id}; |
|
656
|
0
|
|
|
|
|
0
|
my @mems = @$ref; |
|
657
|
0
|
|
|
|
|
0
|
my @els = split /\s+/, $line; |
|
658
|
0
|
|
|
|
|
0
|
my $seq_id = $els[2]; |
|
659
|
0
|
|
|
|
|
0
|
$seq_id =~ s/>//; |
|
660
|
0
|
|
|
|
|
0
|
$seq_id =~ s/\.\.\.$//; |
|
661
|
0
|
|
|
|
|
0
|
push @mems, $seq_id; |
|
662
|
0
|
|
|
|
|
0
|
$clusters{$cl_id} = \@mems; |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
} |
|
665
|
|
|
|
|
|
|
# print Dumper(\%clusters); |
|
666
|
0
|
|
|
|
|
0
|
my %seqs; |
|
667
|
0
|
|
|
|
|
0
|
while ($seq = $in->next_seq()) { |
|
668
|
0
|
|
|
|
|
0
|
my $id = $seq->display_id(); |
|
669
|
0
|
|
|
|
|
0
|
$seqs{$id} = $seq; |
|
670
|
|
|
|
|
|
|
} |
|
671
|
|
|
|
|
|
|
|
|
672
|
0
|
|
|
|
|
0
|
foreach my $id (keys %clusters) { |
|
673
|
0
|
|
|
|
|
0
|
my $out = Bio::SeqIO->new( -file => ">" . $filename . "-". $id . ".fas", -format => 'fasta'); |
|
674
|
0
|
|
|
|
|
0
|
my @seqids = @{ $clusters{$id} }; |
|
|
0
|
|
|
|
|
0
|
|
|
675
|
0
|
|
|
|
|
0
|
foreach my $seq_id (@seqids) { |
|
676
|
0
|
|
|
|
|
0
|
$out->write_seq($seqs{$seq_id}); |
|
677
|
|
|
|
|
|
|
} |
|
678
|
|
|
|
|
|
|
} |
|
679
|
0
|
|
|
|
|
0
|
exit; |
|
680
|
|
|
|
|
|
|
} |
|
681
|
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
sub print_composition { |
|
684
|
1
|
|
|
1
|
0
|
6
|
while ($seq = $in->next_seq()) { |
|
685
|
7
|
|
|
|
|
1768
|
my $hash_ref = Bio::Tools::SeqStats->count_monomers($seq); |
|
686
|
7
|
|
|
|
|
1793
|
my $count; |
|
687
|
7
|
|
|
|
|
39
|
$count += $hash_ref->{$_} foreach keys %$hash_ref; |
|
688
|
7
|
|
|
|
|
31
|
print $seq->id(); |
|
689
|
7
|
|
|
|
|
169
|
foreach (sort keys %$hash_ref) { |
|
690
|
28
|
|
|
|
|
69
|
print "\t", $_, ":", $hash_ref->{$_}, "("; |
|
691
|
28
|
|
|
|
|
134
|
printf "%.2f", $hash_ref->{$_}/$count*100; |
|
692
|
28
|
|
|
|
|
56
|
print "%)" |
|
693
|
|
|
|
|
|
|
} |
|
694
|
7
|
|
|
|
|
42
|
print "\n" |
|
695
|
|
|
|
|
|
|
} |
|
696
|
|
|
|
|
|
|
} |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
# This sub calls all the del/pick subs above. Any option to filter input sequences by some criterion goes through here, and the appropriate filter subroutine is called. |
|
699
|
|
|
|
|
|
|
sub filter_seqs { |
|
700
|
4
|
|
|
4
|
0
|
8
|
my $action = shift; |
|
701
|
4
|
|
|
|
|
11
|
my $match = $opts{$action}; |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
# matching to stop at 1st ':' so that ids using ':' as field delimiters are handled properly |
|
704
|
4
|
50
|
|
|
|
31
|
$match =~ /^([^:]+):(\S+)$/ || die "Bad search format. Expecting a pattern of the form: tag:value.\n"; |
|
705
|
|
|
|
|
|
|
|
|
706
|
4
|
|
|
|
|
18
|
my ($tag, $value) = ($1, $2); |
|
707
|
4
|
|
|
|
|
25
|
my @selected = split(/,/, $value); |
|
708
|
4
|
|
|
|
|
13
|
my $callsub = "find_by_" . "$tag"; |
|
709
|
|
|
|
|
|
|
|
|
710
|
4
|
50
|
|
|
|
18
|
die "Bad tag or function not implemented. Tag was: $tag\n" if !defined($filter_dispatch{$callsub}); |
|
711
|
|
|
|
|
|
|
|
|
712
|
4
|
50
|
|
|
|
14
|
if ($tag eq 'order') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
713
|
4
|
|
|
|
|
8
|
my $ct = 0; |
|
714
|
4
|
|
|
|
|
17
|
my %order_list = parse_orders(\@selected); # Parse selected orders and create a hash |
|
715
|
4
|
|
|
|
|
19
|
while (my $currseq = $in->next_seq) { $ct++; $filter_dispatch{$callsub}->($action, $ct, $currseq, \%order_list) } |
|
|
28
|
|
|
|
|
7246
|
|
|
|
28
|
|
|
|
|
110
|
|
|
716
|
4
|
50
|
|
|
|
420
|
foreach (keys %order_list) { print STDERR "No matches found for order number $_\n" if $_ > $ct } |
|
|
7
|
|
|
|
|
56
|
|
|
717
|
|
|
|
|
|
|
} |
|
718
|
|
|
|
|
|
|
elsif ($tag eq 'id') { |
|
719
|
0
|
|
|
|
|
0
|
my %id_list = map { $_ => 1 } @selected; # create a hash from @selected |
|
|
0
|
|
|
|
|
0
|
|
|
720
|
0
|
|
|
|
|
0
|
while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $match, $currseq, \%id_list) } |
|
|
0
|
|
|
|
|
0
|
|
|
721
|
0
|
0
|
|
|
|
0
|
foreach (keys %id_list) { warn "No matches found for '$_'\n" if $id_list{$_} == 1 } |
|
|
0
|
|
|
|
|
0
|
|
|
722
|
|
|
|
|
|
|
} |
|
723
|
|
|
|
|
|
|
elsif ($tag eq 'file') { |
|
724
|
0
|
|
0
|
|
|
0
|
open LIST, "<", $value || die "can't find file $value\n"; |
|
725
|
0
|
|
|
|
|
0
|
my %list; |
|
726
|
0
|
|
|
|
|
0
|
while() { chomp; $list{$_}++ } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
727
|
0
|
|
|
|
|
0
|
close LIST; |
|
728
|
0
|
|
|
|
|
0
|
while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $match, $currseq, \%list) } |
|
|
0
|
|
|
|
|
0
|
|
|
729
|
0
|
0
|
|
|
|
0
|
foreach (keys %list) { warn "No matches found for '$_'\n" if $list{$_} == 1 } |
|
|
0
|
|
|
|
|
0
|
|
|
730
|
|
|
|
|
|
|
} else { |
|
731
|
0
|
|
|
|
|
0
|
while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $currseq, $value) } |
|
|
0
|
|
|
|
|
0
|
|
|
732
|
|
|
|
|
|
|
} |
|
733
|
|
|
|
|
|
|
} |
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=head2 retrieve_seqs() |
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
Retrieves a sequence from RefSeq using the provided accession |
|
738
|
|
|
|
|
|
|
number. A wrapper for CE#get_Seq_by_acc>. |
|
739
|
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
=cut |
|
741
|
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# To do: add fetch by gi |
|
743
|
|
|
|
|
|
|
sub retrieve_seqs { |
|
744
|
0
|
|
|
0
|
1
|
0
|
my $gb = Bio::DB::RefSeq->new(); |
|
745
|
0
|
|
|
|
|
0
|
my $seq = $gb->get_Seq_by_acc($opts{'fetch'}); # Retrieve sequence with Accession Number |
|
746
|
0
|
|
|
|
|
0
|
$out->write_seq($seq) |
|
747
|
|
|
|
|
|
|
} |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=head2 remove_gaps() |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
Remove gaps |
|
752
|
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=cut |
|
754
|
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
sub remove_gaps { |
|
756
|
1
|
|
|
1
|
1
|
6
|
while ($seq = $in->next_seq()) { |
|
757
|
7
|
|
|
|
|
3616
|
my $string = $seq->seq(); |
|
758
|
7
|
|
|
|
|
125
|
$string =~ s/-//g; |
|
759
|
7
|
|
|
|
|
14
|
$string =~ s/\.//g; |
|
760
|
7
|
|
|
|
|
20
|
my $new_seq = Bio::Seq->new(-id => $seq->id(), -seq => $string); |
|
761
|
7
|
|
|
|
|
2346
|
$out->write_seq($new_seq) |
|
762
|
|
|
|
|
|
|
} |
|
763
|
|
|
|
|
|
|
} |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=head2 print_lengths() |
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Print all sequence lengths. Wraps |
|
768
|
|
|
|
|
|
|
Llength|https://metacpan.org/pod/Bio::Seq#length>. |
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
=cut |
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
sub print_lengths { |
|
773
|
1
|
|
|
1
|
1
|
3
|
while ($seq = $in->next_seq()) { print $seq->id(), "\t", $seq->length(), "\n" } |
|
|
7
|
|
|
|
|
1021
|
|
|
774
|
|
|
|
|
|
|
} |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=head2 print_seq_count() |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Print all sequence lengths. Wraps |
|
780
|
|
|
|
|
|
|
Llength|https://metacpan.org/pod/Bio::Seq#length>. |
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=cut |
|
783
|
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
sub print_seq_count { |
|
785
|
1
|
|
|
1
|
1
|
2
|
my $count; |
|
786
|
1
|
|
|
|
|
4
|
while ($seq = $in->next_seq()) { $count++ } |
|
|
7
|
|
|
|
|
923
|
|
|
787
|
1
|
|
|
|
|
46
|
print $count, "\n" |
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=head2 make_revcom() |
|
791
|
|
|
|
|
|
|
Reverse complement. Wraps |
|
792
|
|
|
|
|
|
|
Lrevcom()|https://metacpan.org/pod/Bio::Seq#revcom>. |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=cut |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
sub make_revcom { # reverse-complement a sequence |
|
798
|
1
|
|
|
1
|
1
|
27
|
while ($seq = $in->next_seq()) { |
|
799
|
7
|
|
|
|
|
2353
|
my $new = Bio::Seq->new(-id => $seq->id() . ":revcom", -seq => $seq->revcom()->seq()); |
|
800
|
7
|
|
|
|
|
2576
|
$out->write_seq($new) |
|
801
|
|
|
|
|
|
|
} |
|
802
|
|
|
|
|
|
|
} |
|
803
|
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=head2 print_subseq() |
|
805
|
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
Select substring (of the 1st sequence). Wraps |
|
807
|
|
|
|
|
|
|
Lsubseq()|https://metacpan.org/pod/Bio::Seq#subseq>. |
|
808
|
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
=cut |
|
810
|
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
sub print_subseq { |
|
813
|
1
|
|
|
1
|
1
|
22
|
while ($seq = $in->next_seq()) { |
|
814
|
7
|
|
|
|
|
3546
|
my $id = $seq->id(); |
|
815
|
7
|
|
|
|
|
186
|
my ($start, $end) = split /\s*,\s*/, $opts{"subseq"}; |
|
816
|
7
|
50
|
|
|
|
29
|
$end = $seq->length() if $end eq '-'; # allow shorthand: -s'2,-' |
|
817
|
7
|
50
|
|
|
|
22
|
die "end out of bound: $id\n" if $end > $seq->length(); |
|
818
|
7
|
|
|
|
|
119
|
my $new = Bio::Seq->new(-id => $seq->id() . ":$start-$end", -seq => $seq->subseq($start, $end)); |
|
819
|
7
|
|
|
|
|
2903
|
$out->write_seq($new) |
|
820
|
|
|
|
|
|
|
} |
|
821
|
|
|
|
|
|
|
} |
|
822
|
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
sub _internal_stop_or_x { |
|
824
|
7
|
|
|
7
|
|
2459
|
my $str = shift; |
|
825
|
7
|
|
|
|
|
12
|
$str =~ s/\*$//; # remove last stop |
|
826
|
7
|
|
|
|
|
10
|
my @internalStops; |
|
827
|
|
|
|
|
|
|
my @nonStandardAA; |
|
828
|
7
|
|
|
|
|
19
|
while ($str =~ m/\*/g) { # internal stop # previously missed double ** |
|
829
|
14
|
|
|
|
|
24
|
push @internalStops, pos($str); |
|
830
|
|
|
|
|
|
|
} |
|
831
|
|
|
|
|
|
|
|
|
832
|
7
|
|
|
|
|
13
|
while ($str =~ m/X/g) { # non-standard AA |
|
833
|
0
|
|
|
|
|
0
|
push @nonStandardAA, pos($str); |
|
834
|
|
|
|
|
|
|
} |
|
835
|
|
|
|
|
|
|
|
|
836
|
7
|
50
|
|
|
|
11
|
if (@nonStandardAA) { |
|
837
|
0
|
|
|
|
|
0
|
warn "Presence of X at postions:\t", join(";", @nonStandardAA), "\n"; |
|
838
|
|
|
|
|
|
|
} |
|
839
|
|
|
|
|
|
|
|
|
840
|
7
|
50
|
|
|
|
11
|
if (@internalStops) { |
|
841
|
7
|
|
|
|
|
217
|
warn "Presence of internal stops at postions:\t", join(";", @internalStops), "\n"; |
|
842
|
|
|
|
|
|
|
} |
|
843
|
|
|
|
|
|
|
|
|
844
|
7
|
50
|
|
|
|
32
|
return @internalStops ? 1 : 0; |
|
845
|
|
|
|
|
|
|
} |
|
846
|
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
=head2 reading_frame_ops |
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
Translate in 1, 3, or 6 frames based on the value of C<$opts> set via |
|
850
|
|
|
|
|
|
|
L|/initialize>. Wraps |
|
851
|
|
|
|
|
|
|
Ltranslate()|https://metacpan.org/pod/Bio::Seq#translate>, |
|
852
|
|
|
|
|
|
|
Ltranslate_3frames()|https://metacpan.org/pod/Bio::SeqUtils#translate_3frames>, |
|
853
|
|
|
|
|
|
|
and |
|
854
|
|
|
|
|
|
|
Ltranslate_6frames()|https://metacpan.org/pod/Bio::SeqUtils#translate_6frames>. |
|
855
|
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=cut |
|
857
|
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
sub reading_frame_ops { |
|
860
|
1
|
|
|
1
|
1
|
3
|
my $frame = $opts{"translate"}; |
|
861
|
1
|
|
|
|
|
3
|
while ($seq = $in->next_seq()) { |
|
862
|
7
|
50
|
|
|
|
1250
|
if ($frame == 1) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
863
|
7
|
50
|
|
|
|
23
|
if (&_internal_stop_or_x($seq->translate()->seq())) { |
|
864
|
7
|
|
|
|
|
15
|
warn "internal stop:\t" . $seq->id . "\tskip.\n" |
|
865
|
|
|
|
|
|
|
} else { |
|
866
|
0
|
|
|
|
|
0
|
$out->write_seq($seq->translate()) |
|
867
|
|
|
|
|
|
|
} |
|
868
|
|
|
|
|
|
|
} |
|
869
|
|
|
|
|
|
|
elsif ($frame == 3) { |
|
870
|
0
|
|
|
|
|
0
|
my @prots = Bio::SeqUtils->translate_3frames($seq); |
|
871
|
0
|
|
|
|
|
0
|
foreach (@prots) { |
|
872
|
0
|
|
|
|
|
0
|
my $id = $_->id(); |
|
873
|
0
|
|
|
|
|
0
|
$id =~ /^(\S+)-(\d)F$/; |
|
874
|
0
|
|
|
|
|
0
|
my ($oriId, $fm) = ($1, $2); |
|
875
|
0
|
|
|
|
|
0
|
$_->id($oriId . "|+" . ($fm+1)); |
|
876
|
0
|
|
|
|
|
0
|
$out->write_seq($_); |
|
877
|
|
|
|
|
|
|
} |
|
878
|
|
|
|
|
|
|
} elsif ($frame == 6) { |
|
879
|
0
|
|
|
|
|
0
|
my @prots = Bio::SeqUtils->translate_6frames($seq); |
|
880
|
0
|
|
|
|
|
0
|
foreach (@prots) { |
|
881
|
0
|
|
|
|
|
0
|
my $id = $_->id(); |
|
882
|
0
|
|
|
|
|
0
|
$id =~ /^(\S+)-(\d)([RF])$/; |
|
883
|
0
|
|
|
|
|
0
|
my ($oriId, $fm, $dir) = ($1, $2, $3); |
|
884
|
0
|
0
|
|
|
|
0
|
if ($dir eq 'F') { |
|
885
|
0
|
|
|
|
|
0
|
$_->id($oriId . "|+" . ($fm+1)); |
|
886
|
|
|
|
|
|
|
} else { |
|
887
|
0
|
|
|
|
|
0
|
$_->id($oriId . "|-" . ($fm+1)); |
|
888
|
|
|
|
|
|
|
} |
|
889
|
0
|
|
|
|
|
0
|
$out->write_seq($_); |
|
890
|
|
|
|
|
|
|
} |
|
891
|
0
|
|
|
|
|
0
|
} else { warn "Accepted frame arguments: 1, 3, and 6\n"} |
|
892
|
|
|
|
|
|
|
} |
|
893
|
|
|
|
|
|
|
} |
|
894
|
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
=head2 restrict_coord() |
|
896
|
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
Note: This function is currently DEPRECATED. |
|
898
|
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
Finds digestion coordinates by a specified restriction enzyme |
|
900
|
|
|
|
|
|
|
specified in C<$opts{restrinct}> set via L|/initialize>. |
|
901
|
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
An input file with sequences is expected. Wraps |
|
903
|
|
|
|
|
|
|
Lcut()|https://github.com/bioperl/Bio-Restriction>. |
|
904
|
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
Outputs coordinates of overhangs in BED format. |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
sub restrict_coord { |
|
909
|
|
|
|
|
|
|
use Bio::Restriction::Analysis; |
|
910
|
|
|
|
|
|
|
use Bio::Restriction::EnzymeCollection; |
|
911
|
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
my $enz = $opts{"restrict-coord"}; |
|
913
|
|
|
|
|
|
|
my $re = Bio::Restriction::EnzymeCollection->new()->get_enzyme($enz); |
|
914
|
|
|
|
|
|
|
my $len = length($re->overhang_seq()); |
|
915
|
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
while ( $seq = $in->next_seq() ) { |
|
917
|
|
|
|
|
|
|
my $seq_str = $seq->seq(); |
|
918
|
|
|
|
|
|
|
die "Not a DNA sequence\n" unless $seq_str =~ /^[ATCGRYSWKMBDHVN]+$/i; |
|
919
|
|
|
|
|
|
|
my $ra = Bio::Restriction::Analysis->new(-seq=>$seq); |
|
920
|
|
|
|
|
|
|
foreach my $pos ($ra->positions($enz)) { |
|
921
|
|
|
|
|
|
|
print $seq->id()."\t".($pos-$len)."\t".$pos."\n"; |
|
922
|
|
|
|
|
|
|
} |
|
923
|
|
|
|
|
|
|
} |
|
924
|
|
|
|
|
|
|
} |
|
925
|
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
=head2 restrict_digest() |
|
927
|
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
Note: This function is currently DEPRECATED. |
|
929
|
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
Predicted fragments from digestion by a specified restriction enzyme |
|
931
|
|
|
|
|
|
|
specified in C<$opts{restrinct}> set via L|/initialize>. |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
An input file with sequences is expected. Wraps |
|
934
|
|
|
|
|
|
|
Lcut()|https://metacpan.org/pod/Bio::Restriction::Analysis#cut>. |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
sub restrict_digest { |
|
938
|
|
|
|
|
|
|
my $enz = $opts{"restrict"}; |
|
939
|
|
|
|
|
|
|
use Bio::Restriction::Analysis; |
|
940
|
|
|
|
|
|
|
while ( $seq = $in->next_seq() ) { |
|
941
|
|
|
|
|
|
|
my $seq_str = $seq->seq(); |
|
942
|
|
|
|
|
|
|
die "Not a DNA sequence\n" unless $seq_str =~ /^[ATCGRYSWKMBDHVN]+$/i; |
|
943
|
|
|
|
|
|
|
my $ra = Bio::Restriction::Analysis->new(-seq=>$seq); |
|
944
|
|
|
|
|
|
|
foreach my $frag ($ra->fragment_maps($enz)) { |
|
945
|
|
|
|
|
|
|
my $seq_obj = Bio::Seq->new( |
|
946
|
|
|
|
|
|
|
-id=>$seq->id().'|'.$frag->{start}.'-'.$frag->{end}.'|'.($frag->{end}-$frag->{start}+1), |
|
947
|
|
|
|
|
|
|
-seq=>$frag->{seq}); |
|
948
|
|
|
|
|
|
|
$out->write_seq($seq_obj) |
|
949
|
|
|
|
|
|
|
} |
|
950
|
|
|
|
|
|
|
} |
|
951
|
|
|
|
|
|
|
} |
|
952
|
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=cut |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=head2 anonymize() |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
Replace sequence IDs with serial IDs I characters long, as specified in |
|
958
|
|
|
|
|
|
|
C<$opts{'anonymize'}> set via L|/initialize>. |
|
959
|
|
|
|
|
|
|
For example if C<$opts{'anonymize'}>, the first ID will be C. |
|
960
|
|
|
|
|
|
|
leading 'S' The length of the serial idea |
|
961
|
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
A sed script file is produced with a F<.sed> suffix that may be used |
|
963
|
|
|
|
|
|
|
with sed's C<'-f'> argument. If the filename is F<'-'>, the sed file |
|
964
|
|
|
|
|
|
|
is named C instead. A message containing the sed filename is |
|
965
|
|
|
|
|
|
|
written to C. |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
=cut |
|
968
|
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
sub anonymize { |
|
970
|
1
|
|
50
|
1
|
1
|
5
|
my $char_len = $opts{"anonymize"} // die "Tried to use option 'prefix' without using option 'anonymize'. Exiting...\n"; |
|
971
|
1
|
50
|
|
|
|
3
|
my $prefix = (defined($opts{"prefix"})) ? $opts{"prefix"} : "S"; |
|
972
|
|
|
|
|
|
|
|
|
973
|
1
|
50
|
|
|
|
3
|
pod2usage(1) if $char_len < 1; |
|
974
|
|
|
|
|
|
|
|
|
975
|
1
|
|
|
|
|
2
|
my $ct = 1; |
|
976
|
1
|
|
|
|
|
1
|
my %serial_name; |
|
977
|
1
|
|
|
|
|
2
|
my $length_warn = 0; |
|
978
|
1
|
|
|
|
|
4
|
while ($seq = $in->next_seq()) { |
|
979
|
7
|
|
|
|
|
1126
|
my $serial = $prefix . sprintf "%0" . ($char_len - length($prefix)) . "s", $ct; |
|
980
|
7
|
50
|
|
|
|
11
|
$length_warn = 1 if length($serial) > $char_len; |
|
981
|
7
|
|
|
|
|
17
|
$serial_name{$serial} = $seq->id(); |
|
982
|
7
|
|
|
|
|
93
|
$seq->id($serial); |
|
983
|
7
|
|
|
|
|
72
|
$out->write_seq($seq); |
|
984
|
7
|
|
|
|
|
1197
|
$ct++ |
|
985
|
|
|
|
|
|
|
} |
|
986
|
|
|
|
|
|
|
|
|
987
|
1
|
|
|
|
|
73
|
_make_sed_file($filename, %serial_name); |
|
988
|
1
|
|
|
|
|
23
|
warn "Anonymization map:\n"; |
|
989
|
1
|
|
|
|
|
6
|
while (my ($k, $v) = each %serial_name) { warn "$k => $v\n" } |
|
|
7
|
|
|
|
|
62
|
|
|
990
|
|
|
|
|
|
|
|
|
991
|
1
|
50
|
|
|
|
7
|
warn "WARNING: Anonymized ID length exceeded requested length: try a different length or prefix.\n" if $length_warn |
|
992
|
|
|
|
|
|
|
} |
|
993
|
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
=head2 shred_seq() |
|
995
|
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
Break into individual sequences writing a FASTA file for each sequence. |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
=cut |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
sub shred_seq { |
|
1002
|
1
|
|
|
1
|
1
|
3
|
while ($seq = $in->next_seq()) { |
|
1003
|
7
|
|
|
|
|
2668
|
my $newid = $seq->id(); |
|
1004
|
7
|
|
|
|
|
112
|
$newid =~ s/[\s\|]/_/g; |
|
1005
|
7
|
|
|
|
|
22
|
print $newid, "\n"; |
|
1006
|
7
|
|
50
|
|
|
16
|
my $suffix = $out_format || "fas"; |
|
1007
|
7
|
|
|
|
|
26
|
my $newout = Bio::SeqIO->new(-format => $out_format, -file => ">" . $newid . ".$suffix"); |
|
1008
|
7
|
|
|
|
|
4314
|
$newout->write_seq($seq) |
|
1009
|
|
|
|
|
|
|
} |
|
1010
|
|
|
|
|
|
|
exit |
|
1011
|
1
|
|
|
|
|
347
|
} |
|
1012
|
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
=head2 count_codons() |
|
1014
|
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
Count codons for coding sequences (e.g., a genome file consisting of |
|
1016
|
|
|
|
|
|
|
CDS sequences). Wraps |
|
1017
|
|
|
|
|
|
|
Lcount_codons()|https://metacpan.org/pod/Bio::Tools::SeqStats#count_codons>. |
|
1018
|
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
=cut |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
sub codon_table { |
|
1022
|
0
|
|
|
0
|
0
|
0
|
my $string = $opts{'codon-table'}; |
|
1023
|
|
|
|
|
|
|
# die "bioseq --codon-table atg|M\n" unless $string; |
|
1024
|
0
|
|
|
|
|
0
|
my $myCodonTable = Bio::Tools::CodonTable->new(); |
|
1025
|
19
|
|
|
19
|
|
197
|
use Bio::Tools::IUPAC; |
|
|
19
|
|
|
|
|
35
|
|
|
|
19
|
|
|
|
|
57645
|
|
|
1026
|
0
|
|
|
|
|
0
|
my $iupac = Bio::Tools::IUPAC->new(); |
|
1027
|
0
|
|
|
|
|
0
|
my %aas = $iupac->iupac_iup(); #print Dumper(\%aas); # T=>T; Z=>E/Q; etc |
|
1028
|
|
|
|
|
|
|
# my %nts = $iupac->iupac(); #print Dumper(\%nts); # M=>A/C; A=>A; etc |
|
1029
|
0
|
0
|
|
|
|
0
|
if ($string =~ /^[atcg]{3}$/i) { |
|
|
|
0
|
|
|
|
|
|
|
1030
|
0
|
|
|
|
|
0
|
print $string, "\t", $myCodonTable->translate($string), "\n"; |
|
1031
|
|
|
|
|
|
|
} elsif (&_in_list($string, [keys %aas])) { |
|
1032
|
0
|
|
|
|
|
0
|
print $string, "\t", join "\t", $myCodonTable->revtranslate($string), "\n"; |
|
1033
|
|
|
|
|
|
|
} else { |
|
1034
|
0
|
|
|
|
|
0
|
print $string, "Currently only takes a 3-letter DNA-base codon or a 1-letter uppercase IUPAC aa code"; |
|
1035
|
|
|
|
|
|
|
} |
|
1036
|
|
|
|
|
|
|
} |
|
1037
|
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
sub count_codons { |
|
1039
|
0
|
|
|
0
|
1
|
0
|
my $new_seq; |
|
1040
|
0
|
|
|
|
|
0
|
my $myCodonTable = Bio::Tools::CodonTable->new(); |
|
1041
|
0
|
|
|
|
|
0
|
while ($seq = $in->next_seq()) { $new_seq .= $seq->seq() } |
|
|
0
|
|
|
|
|
0
|
|
|
1042
|
0
|
|
|
|
|
0
|
my $hash_ref = Bio::Tools::SeqStats->count_codons(Bio::Seq->new(-seq=>$new_seq, -id=>'concat')); |
|
1043
|
0
|
|
|
|
|
0
|
my $count; |
|
1044
|
0
|
|
|
|
|
0
|
$count += $hash_ref->{$_} foreach keys %$hash_ref; |
|
1045
|
0
|
|
|
|
|
0
|
foreach (sort keys %$hash_ref) { |
|
1046
|
0
|
|
|
|
|
0
|
print $_, ":\t", $myCodonTable->translate($_), "\t", $hash_ref->{$_}, "\t"; |
|
1047
|
0
|
|
|
|
|
0
|
printf "%.2f", $hash_ref->{$_}/$count*100; |
|
1048
|
0
|
|
|
|
|
0
|
print "%\n" |
|
1049
|
|
|
|
|
|
|
} |
|
1050
|
|
|
|
|
|
|
} |
|
1051
|
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
=head2 print_gb_gene_feats() |
|
1053
|
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
print gene sequences in FASTA from a GenBank file of bacterial |
|
1055
|
|
|
|
|
|
|
genome. Won't work for a eukaryote genbank file. |
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
=cut |
|
1058
|
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
sub print_gb_gene_feats { # works only for prokaryote genome |
|
1060
|
1
|
|
|
1
|
1
|
6
|
$seq = $in->next_seq(); |
|
1061
|
1
|
50
|
|
|
|
104468
|
die "$filename: Not a GenBank file. Quit\n" unless $in_format eq 'genbank'; |
|
1062
|
1
|
|
|
|
|
3
|
my $gene_count = 0; |
|
1063
|
1
|
|
|
|
|
12
|
foreach my $feat ($seq->get_SeqFeatures()) { |
|
1064
|
66
|
100
|
|
|
|
19780
|
if ($feat->primary_tag eq 'CDS') { |
|
1065
|
28
|
|
|
|
|
252
|
my $location = $feat->location(); |
|
1066
|
28
|
50
|
|
|
|
542
|
next if $location->isa('Bio::Location::Split'); |
|
1067
|
28
|
|
|
|
|
69
|
my $gene_tag = "gene_" . $gene_count++; |
|
1068
|
28
|
|
|
|
|
49
|
my $gene_symbol = 'na'; |
|
1069
|
28
|
|
|
|
|
46
|
my $product = 'na'; |
|
1070
|
28
|
|
|
|
|
84
|
foreach my $tag ($feat->get_all_tags()) { |
|
1071
|
181
|
100
|
|
|
|
770
|
($gene_tag) = $feat->get_tag_values($tag) if $tag eq 'locus_tag'; |
|
1072
|
181
|
100
|
|
|
|
614
|
($gene_symbol) = $feat->get_tag_values($tag) if $tag eq 'gene'; |
|
1073
|
181
|
100
|
|
|
|
364
|
($product) = $feat->get_tag_values($tag) if $tag eq 'product'; |
|
1074
|
181
|
|
|
|
|
757
|
$product =~ s/\s/_/g; |
|
1075
|
|
|
|
|
|
|
} |
|
1076
|
28
|
|
|
|
|
139
|
my $gene = Bio::Seq->new(-id => (join "|", ($gene_tag, $feat->start, $feat->end, $feat->strand, $gene_symbol, $product)), |
|
1077
|
|
|
|
|
|
|
-seq=>$seq->subseq($feat->start, $feat->end)); |
|
1078
|
28
|
100
|
|
|
|
14526
|
if ($feat->strand() > 0) { $out->write_seq($gene) } else { $out->write_seq($gene->revcom())} |
|
|
14
|
|
|
|
|
314
|
|
|
|
14
|
|
|
|
|
300
|
|
|
1079
|
|
|
|
|
|
|
# print join "\t", |
|
1080
|
|
|
|
|
|
|
# ($feat->gff_string, $feat->start, $feat->end, |
|
1081
|
|
|
|
|
|
|
# $feat->strand); |
|
1082
|
|
|
|
|
|
|
# print "\n"; |
|
1083
|
|
|
|
|
|
|
} |
|
1084
|
|
|
|
|
|
|
} |
|
1085
|
|
|
|
|
|
|
} |
|
1086
|
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
=head2 count_leading_gaps() |
|
1088
|
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
Count and print the number of leading gaps in each sequence. |
|
1090
|
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
=cut |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
sub count_leading_gaps { |
|
1094
|
0
|
|
|
0
|
1
|
0
|
while ($seq = $in->next_seq()) { |
|
1095
|
0
|
|
|
|
|
0
|
my $lead_gap = 0; |
|
1096
|
0
|
|
|
|
|
0
|
my $see_aa = 0; # status variable |
|
1097
|
0
|
|
|
|
|
0
|
my @mono = split //, $seq->seq(); |
|
1098
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < $seq->length(); $i++) { |
|
1099
|
0
|
0
|
|
|
|
0
|
$see_aa = 1 if $mono[$i] ne '-'; |
|
1100
|
0
|
0
|
0
|
|
|
0
|
$lead_gap++ if !$see_aa && $mono[$i] eq '-' |
|
1101
|
|
|
|
|
|
|
} |
|
1102
|
0
|
|
|
|
|
0
|
print $seq->id(), "\t", $lead_gap, "\n" |
|
1103
|
|
|
|
|
|
|
} |
|
1104
|
|
|
|
|
|
|
} |
|
1105
|
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
=head2 hydroB() |
|
1107
|
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
Return the mean Kyte-Doolittle hydropathicity for protein |
|
1109
|
|
|
|
|
|
|
sequences. Wraps |
|
1110
|
|
|
|
|
|
|
Lhydropathicity()|https://metacpan.org/pod/Bio::Tools::SeqStats#hydropathicity>. |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
=cut |
|
1113
|
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
sub hydroB { |
|
1116
|
1
|
|
|
1
|
1
|
3
|
while ($seq = $in->next_seq()) { |
|
1117
|
24
|
|
|
|
|
4051
|
my $pep_str = $seq->seq(); |
|
1118
|
24
|
|
|
|
|
308
|
$pep_str =~ s/\*//g; |
|
1119
|
24
|
|
|
|
|
51
|
$seq->seq($pep_str); |
|
1120
|
24
|
|
|
|
|
1444
|
my $gravy = Bio::Tools::SeqStats->hydropathicity($seq); |
|
1121
|
24
|
|
|
|
|
17496
|
printf "%s\t%.4f\n", $seq->id(), $gravy |
|
1122
|
|
|
|
|
|
|
} |
|
1123
|
|
|
|
|
|
|
} |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
=head2 linearize() |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
Linearize FASTA, print one sequence per line. |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
=cut |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
sub linearize { |
|
1133
|
1
|
|
|
1
|
1
|
6
|
while ($seq = $in->next_seq()) { print $seq->id(), "\t", $seq->seq(), "\n" } |
|
|
7
|
|
|
|
|
1818
|
|
|
1134
|
|
|
|
|
|
|
} |
|
1135
|
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=head2 reloop_at() |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
Re-circularize a bacterial genome by starting at a specified position |
|
1139
|
|
|
|
|
|
|
given in the C<$opts{"reloop"> set via |
|
1140
|
|
|
|
|
|
|
L|/initialize>. |
|
1141
|
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
For example for sequence "ABCDE". C |
|
1143
|
|
|
|
|
|
|
would generate"'BCDEA". |
|
1144
|
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
=cut |
|
1146
|
|
|
|
|
|
|
sub reloop_at { |
|
1147
|
1
|
|
|
1
|
1
|
5
|
my $seq = $in->next_seq; # only the first sequence |
|
1148
|
1
|
|
|
|
|
408
|
my $break = $opts{"reloop"}; |
|
1149
|
1
|
|
|
|
|
9
|
my $new_seq = Bio::Seq->new(-id => $seq->id().":relooped_at_".$break, -seq => $seq->subseq($break, $seq->length()) . $seq->subseq(1, $break-1)); |
|
1150
|
1
|
|
|
|
|
551
|
$out->write_seq($new_seq) |
|
1151
|
|
|
|
|
|
|
} |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
=head2 remove_stop() |
|
1154
|
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
Remove stop codons. |
|
1156
|
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
=cut |
|
1158
|
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
sub remove_stop { |
|
1160
|
1
|
|
|
1
|
1
|
11
|
my $myCodonTable = Bio::Tools::CodonTable->new(); |
|
1161
|
1
|
|
|
|
|
38
|
while ($seq = $in->next_seq()) { |
|
1162
|
7
|
|
|
|
|
2282
|
my $newstr = ""; |
|
1163
|
7
|
|
|
|
|
19
|
for (my $i = 1; $i <= $seq->length() / 3; $i++) { |
|
1164
|
206
|
|
|
|
|
1816
|
my $codon = $seq->subseq(3 * ($i - 1) + 1, 3 * ($i - 1) + 3); |
|
1165
|
206
|
100
|
|
|
|
4932
|
if ($myCodonTable->is_ter_codon($codon)) { warn "Found and removed stop codon\n"; next } |
|
|
14
|
|
|
|
|
706
|
|
|
|
14
|
|
|
|
|
43
|
|
|
1166
|
192
|
|
|
|
|
5751
|
$newstr .= $codon |
|
1167
|
|
|
|
|
|
|
} |
|
1168
|
7
|
|
|
|
|
69
|
my $new = Bio::Seq->new(-id => $seq->id(), -seq => $newstr); |
|
1169
|
7
|
|
|
|
|
1490
|
$out->write_seq($new) |
|
1170
|
|
|
|
|
|
|
} |
|
1171
|
|
|
|
|
|
|
} |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
####################### internal subroutine ########################### |
|
1175
|
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
sub _in_list { |
|
1177
|
0
|
|
|
0
|
|
0
|
my $scalar = shift; |
|
1178
|
0
|
|
|
|
|
0
|
my $ref_to_array = shift; |
|
1179
|
0
|
|
|
|
|
0
|
foreach (@{$ref_to_array}) { |
|
|
0
|
|
|
|
|
0
|
|
|
1180
|
0
|
0
|
|
|
|
0
|
return 1 if $_ eq $scalar; |
|
1181
|
|
|
|
|
|
|
} |
|
1182
|
0
|
|
|
|
|
0
|
return 0; |
|
1183
|
|
|
|
|
|
|
} |
|
1184
|
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
sub parse_orders { |
|
1186
|
4
|
|
|
4
|
0
|
7
|
my @selected = @{shift()}; |
|
|
4
|
|
|
|
|
14
|
|
|
1187
|
|
|
|
|
|
|
|
|
1188
|
4
|
|
|
|
|
8
|
my @orders; |
|
1189
|
|
|
|
|
|
|
# Parse if $value contains ranges: allows mixing ranges and lists |
|
1190
|
4
|
|
|
|
|
11
|
foreach my $val (@selected) { |
|
1191
|
5
|
100
|
|
|
|
76
|
if ($val =~ /^(\d+)-(\d+)$/) { # A numerical range |
|
1192
|
1
|
|
|
|
|
3
|
my ($first, $last) = ($1, $2); |
|
1193
|
1
|
50
|
|
|
|
4
|
die "Invalid seq range: $first, $last\n" unless $last > $first; |
|
1194
|
1
|
|
|
|
|
3
|
push @orders, ($first .. $last) |
|
1195
|
4
|
|
|
|
|
12
|
} else { push @orders, $val } # Single value |
|
1196
|
|
|
|
|
|
|
} |
|
1197
|
4
|
|
|
|
|
11
|
return map { $_ => 1 } @orders |
|
|
7
|
|
|
|
|
28
|
|
|
1198
|
|
|
|
|
|
|
} |
|
1199
|
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
sub _make_sed_file { |
|
1201
|
1
|
|
|
1
|
|
4
|
my $filename = shift @_; |
|
1202
|
1
|
|
|
|
|
4
|
my (%serial_names) = @_; |
|
1203
|
|
|
|
|
|
|
|
|
1204
|
1
|
50
|
|
|
|
4
|
$filename = "STDOUT" if $filename eq '-'; |
|
1205
|
|
|
|
|
|
|
|
|
1206
|
1
|
|
|
|
|
51
|
my $sedfile = basename($filename) . ".sed"; |
|
1207
|
1
|
50
|
|
|
|
200
|
open(my $sedout, ">", $sedfile) or die $!; |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
1
|
|
|
|
|
16
|
print $sedout "# usage: sed -f $filename.sed \n"; |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
1
|
|
|
|
|
5
|
foreach (keys %serial_names) { |
|
1212
|
7
|
|
|
|
|
9
|
my $real_name = $serial_names{$_}; |
|
1213
|
7
|
|
|
|
|
10
|
my $sed_cmd = "s/$_/" . $real_name . "/g;\n"; |
|
1214
|
7
|
|
|
|
|
11
|
print $sedout $sed_cmd |
|
1215
|
|
|
|
|
|
|
} |
|
1216
|
1
|
|
|
|
|
42
|
close $sedout; |
|
1217
|
|
|
|
|
|
|
|
|
1218
|
1
|
|
|
|
|
60
|
print STDERR "\nCreated $filename.sed\tusage: sed -f $filename.sed \n\n" |
|
1219
|
|
|
|
|
|
|
} |
|
1220
|
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
################### pick/delete filters ##################### |
|
1222
|
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
sub find_by_file { |
|
1224
|
0
|
|
|
0
|
0
|
0
|
my ($action, $match, $currseq, $id_list) = @_; |
|
1225
|
0
|
|
|
|
|
0
|
my $seq_id = $currseq->id(); |
|
1226
|
0
|
|
|
|
|
0
|
$filter_dispatch{$action . "_by_id"}->($match, $currseq, $id_list, $seq_id) |
|
1227
|
|
|
|
|
|
|
} |
|
1228
|
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
sub pick_by_file { |
|
1230
|
0
|
|
|
0
|
0
|
0
|
my ($match, $currseq, $id_list, $seq_id) = @_; |
|
1231
|
0
|
0
|
|
|
|
0
|
if ($id_list->{$seq_id}) { |
|
1232
|
0
|
|
|
|
|
0
|
$id_list->{$seq_id}++; |
|
1233
|
0
|
0
|
|
|
|
0
|
die "Multiple matches (" . $seq_id . ") for $match found\n" if $id_list->{$seq_id} > 2; |
|
1234
|
0
|
|
|
|
|
0
|
$out->write_seq($currseq) |
|
1235
|
|
|
|
|
|
|
} |
|
1236
|
|
|
|
|
|
|
} |
|
1237
|
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
sub del_by_file { |
|
1239
|
0
|
|
|
0
|
0
|
0
|
my ($match, $currseq, $id_list, $seq_id) = @_; |
|
1240
|
0
|
0
|
|
|
|
0
|
if ($id_list->{$seq_id}) { |
|
1241
|
0
|
|
|
|
|
0
|
$id_list->{$seq_id}++; |
|
1242
|
0
|
|
|
|
|
0
|
warn "Deleted sequence: ", $currseq->id(), "\n" |
|
1243
|
0
|
|
|
|
|
0
|
} else { $out->write_seq($currseq) } |
|
1244
|
|
|
|
|
|
|
} |
|
1245
|
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
sub find_by_order { |
|
1248
|
28
|
|
|
28
|
0
|
76
|
my ($action, $ct, $currseq, $order_list) = @_; # say join "\t", @_; |
|
1249
|
28
|
|
|
|
|
72
|
$filter_dispatch{ $action . "_by_order" }->($ct, $currseq, $order_list) |
|
1250
|
|
|
|
|
|
|
} |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
sub pick_by_order { |
|
1253
|
21
|
|
|
21
|
0
|
28
|
my ($ct, $currseq, $order_list) = @_; |
|
1254
|
21
|
100
|
|
|
|
96
|
$out->write_seq($currseq) if $order_list->{$ct} |
|
1255
|
|
|
|
|
|
|
} |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
sub del_by_order { |
|
1258
|
7
|
|
|
7
|
0
|
18
|
my ($ct, $currseq, $order_list) = @_; # say join "\t", @_; |
|
1259
|
7
|
100
|
|
|
|
27
|
if ($order_list->{$ct}) { warn "Deleted sequence: ", $currseq->id(), "\n" } |
|
|
1
|
|
|
|
|
21
|
|
|
1260
|
6
|
|
|
|
|
30
|
else { $out->write_seq($currseq) } |
|
1261
|
|
|
|
|
|
|
} |
|
1262
|
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
sub find_by_id { |
|
1264
|
0
|
|
|
0
|
0
|
|
my ($action, $match, $currseq, $id_list) = @_; |
|
1265
|
0
|
|
|
|
|
|
my $seq_id = $currseq->id(); |
|
1266
|
0
|
|
|
|
|
|
$filter_dispatch{$action . "_by_id"}->($match, $currseq, $id_list, $seq_id) |
|
1267
|
|
|
|
|
|
|
} |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
sub pick_by_id { |
|
1270
|
0
|
|
|
0
|
0
|
|
my ($match, $currseq, $id_list, $seq_id) = @_; |
|
1271
|
|
|
|
|
|
|
|
|
1272
|
0
|
0
|
|
|
|
|
if ($id_list->{$seq_id}) { |
|
1273
|
0
|
|
|
|
|
|
$id_list->{$seq_id}++; |
|
1274
|
0
|
0
|
|
|
|
|
die "Multiple matches (" . $seq_id . ":" . $id_list->{$seq_id} - 1 . ") for $match found\n" if $id_list->{$seq_id} > 2; |
|
1275
|
0
|
|
|
|
|
|
$out->write_seq($currseq) |
|
1276
|
|
|
|
|
|
|
} |
|
1277
|
|
|
|
|
|
|
} |
|
1278
|
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
sub del_by_id { |
|
1280
|
0
|
|
|
0
|
0
|
|
my ($match, $currseq, $id_list, $seq_id) = @_; |
|
1281
|
|
|
|
|
|
|
|
|
1282
|
0
|
0
|
|
|
|
|
if ($id_list->{$seq_id}) { |
|
1283
|
0
|
|
|
|
|
|
$id_list->{$seq_id}++; |
|
1284
|
0
|
|
|
|
|
|
warn "Deleted sequence: ", $currseq->id(), "\n" |
|
1285
|
0
|
|
|
|
|
|
} else { $out->write_seq($currseq) } |
|
1286
|
|
|
|
|
|
|
} |
|
1287
|
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
sub find_by_re { |
|
1289
|
0
|
|
|
0
|
0
|
|
my ($action, $currseq, $value) = @_; |
|
1290
|
0
|
|
|
|
|
|
my $regex = qr/$value/; |
|
1291
|
0
|
|
|
|
|
|
my $seq_id = $currseq->id(); |
|
1292
|
0
|
|
|
|
|
|
$filter_dispatch{ $action . "_by_re" }->($currseq, $regex, $seq_id) |
|
1293
|
|
|
|
|
|
|
} |
|
1294
|
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
sub pick_by_re { |
|
1296
|
0
|
|
|
0
|
0
|
|
my ($currseq, $regex, $seq_id) = @_; |
|
1297
|
0
|
0
|
|
|
|
|
$out->write_seq($currseq) if $seq_id =~ /$regex/ |
|
1298
|
|
|
|
|
|
|
} |
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
sub del_by_re { |
|
1301
|
0
|
|
|
0
|
0
|
|
my ($currseq, $regex, $seq_id) = @_; |
|
1302
|
|
|
|
|
|
|
|
|
1303
|
0
|
0
|
|
|
|
|
if ($seq_id =~ /$regex/) { warn "Deleted sequence: $seq_id\n" } |
|
|
0
|
|
|
|
|
|
|
|
1304
|
0
|
|
|
|
|
|
else { $out->write_seq($currseq) } |
|
1305
|
|
|
|
|
|
|
} |
|
1306
|
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
# TODO This needs better documentation |
|
1308
|
|
|
|
|
|
|
sub find_by_ambig { |
|
1309
|
0
|
|
|
0
|
0
|
|
my ($action, $currseq, $cutoff) = @_; |
|
1310
|
0
|
|
|
|
|
|
my $string = $currseq->seq(); |
|
1311
|
0
|
|
|
|
|
|
my $ct = ($string =~ s/([^ATCG])/$1/gi); # won't work for AA seqs |
|
1312
|
0
|
|
|
|
|
|
my $percent_ambig = $ct / $currseq->length(); |
|
1313
|
0
|
|
|
|
|
|
$filter_dispatch{"$action" . "_by_ambig"}->($currseq, $cutoff, $ct, $percent_ambig) |
|
1314
|
|
|
|
|
|
|
} |
|
1315
|
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
# TODO Probably better to change behavior when 'picking'? |
|
1317
|
|
|
|
|
|
|
sub pick_by_ambig { |
|
1318
|
0
|
|
|
0
|
0
|
|
my ($currseq, $cutoff, $ct, $percent_ambig) = @_; |
|
1319
|
0
|
0
|
|
|
|
|
$out->write_seq($currseq) if $percent_ambig > $cutoff |
|
1320
|
|
|
|
|
|
|
} |
|
1321
|
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
sub del_by_ambig { |
|
1323
|
0
|
|
|
0
|
0
|
|
my ($currseq, $cutoff, $ct, $percent_ambig) = @_; |
|
1324
|
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
# if ($percent_ambig > $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of N: ", $ct, "\n" } |
|
1326
|
0
|
0
|
|
|
|
|
if ($ct >= $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of bad monomers: ", $ct, "\n" } |
|
|
0
|
|
|
|
|
|
|
|
1327
|
0
|
|
|
|
|
|
else { $out->write_seq($currseq) } |
|
1328
|
|
|
|
|
|
|
} |
|
1329
|
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
sub find_by_length { |
|
1331
|
0
|
|
|
0
|
0
|
|
my ($action, $currseq, $value) = @_; |
|
1332
|
0
|
|
|
|
|
|
$filter_dispatch{$action . "_by_length"}->($currseq, $value) |
|
1333
|
|
|
|
|
|
|
} |
|
1334
|
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
sub pick_by_length { |
|
1336
|
0
|
|
|
0
|
0
|
|
my ($currseq, $value) = @_; |
|
1337
|
0
|
0
|
|
|
|
|
$out->write_seq($currseq) if $currseq->length() <= $value |
|
1338
|
|
|
|
|
|
|
} |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
sub del_by_length { |
|
1341
|
0
|
|
|
0
|
0
|
|
my ($currseq, $value) = @_; |
|
1342
|
|
|
|
|
|
|
|
|
1343
|
0
|
0
|
|
|
|
|
if ($currseq->length() <= $value) { warn "Deleted sequence: ", $currseq->id(), " length: ", $currseq->length(), "\n" } |
|
|
0
|
|
|
|
|
|
|
|
1344
|
0
|
|
|
|
|
|
else { $out->write_seq($currseq) } |
|
1345
|
|
|
|
|
|
|
} |
|
1346
|
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
1; |
|
1348
|
|
|
|
|
|
|
__END__ |