File Coverage

lib/Bio/BPWrapper/SeqManipulations.pm
Criterion Covered Total %
statement 230 636 36.1
branch 40 172 23.2
condition 8 47 17.0
subroutine 43 79 54.4
pod 18 54 33.3
total 339 988 34.3


line stmt bran cond sub pod time code
1             =encoding utf8
2              
3             =head1 NAME
4              
5             Bio::BPWrapper::SeqManipulations - Functions for L
6              
7             =head1 SYNOPSIS
8              
9             use Bio::BPWrapper::SeqManipulations;
10             # Set options hash ...
11             initialize(\%opts);
12             write_out(\%opts);
13              
14             =cut
15              
16             package Bio::BPWrapper::SeqManipulations;
17              
18 19     19   139 use strict; # Still on 5.10, so need this for strict
  19         63  
  19         837  
19 19     19   87 use warnings;
  19         32  
  19         1280  
20 19     19   342 use 5.010;
  19         62  
21 19     19   12879 use Bio::Seq;
  19         1825405  
  19         804  
22 19     19   11441 use Bio::SeqIO;
  19         581375  
  19         1006  
23 19     19   175 use File::Basename;
  19         38  
  19         2108  
24 19     19   145 use Bio::Tools::CodonTable;
  19         72  
  19         683  
25 19     19   10114 use Bio::DB::RefSeq;
  19         1677129  
  19         966  
26 19     19   13048 use Bio::Tools::SeqStats;
  19         78056  
  19         830  
27 19     19   185 use Bio::SeqUtils;
  19         37  
  19         629  
28 19     19   125 use Scalar::Util;
  19         38  
  19         1017  
29 19     19   103 use Exporter ();
  19         34  
  19         353  
30 19     19   11277 use Bio::CodonUsage::IO;
  19         120814  
  19         889  
31 19     19   14626 use Bio::Tools::pICalculator;
  19         26403  
  19         1104  
32             # use Bio::Tools::GuessSeqFormat;
33              
34 19     19   12264 if ($ENV{'DEBUG'}) { use Data::Dumper }
  19         168854  
  19         1872  
35              
36 19     19   157 use vars qw(@ISA @EXPORT @EXPORT_OK);
  19         44  
  19         2410  
37              
38             @ISA = qw(Exporter);
39              
40             #restrict_coord restrict_digest
41             @EXPORT = qw(initialize can_handle handle_opt write_out
42             print_composition filter_seqs retrieve_seqs remove_gaps
43             print_lengths print_seq_count make_revcom
44             print_subseq
45             anonymize
46             shred_seq count_codons print_gb_gene_feats
47             count_leading_gaps hydroB linearize reloop_at
48             remove_stop parse_orders find_by_order
49             pick_by_order del_by_order find_by_id
50             pick_by_id del_by_id find_by_re
51             pick_by_re del_by_re
52             pick_by_file del_by_file
53             find_by_ambig pick_by_ambig del_by_ambig find_by_length
54             del_by_length codon_sim codon_info);
55              
56             # Package global variables
57             my ($in, $out, $seq, %opts, $filename, $in_format, $out_format, $guesser);
58 19     19   8965 use Bio::BPWrapper;
  19         72  
  19         163375  
59             my $VERSION = '1.0';
60              
61             ## For new options, just add an entry into this table with the same key as in
62             ## the GetOpts function in the main program. Make the key be a reference to the handler subroutine (defined below), and test that it works.
63             my %opt_dispatch = (
64             'codon-table' => \&codon_table,
65             # 'codon-sim' => \&codon_sim,
66             'codon-info' => \&codon_info,
67             'iep' => \&iso_electric_point,
68             'composition' => \&print_composition,
69             'mol-wt' => \&print_weight,
70             'delete' => \&filter_seqs,
71             'fetch' => \&retrieve_seqs,
72             'no-gaps' => \&remove_gaps,
73             'length' => \&print_lengths,
74             'longest-orf' => \&update_longest_orf,
75             'num-seq' => \&print_seq_count,
76             'num-gaps-dna' => \&count_gaps_dna,
77             'num-gaps-aa' => \&count_gaps_aa,
78             'pick' => \&filter_seqs,
79             'revcom' => \&make_revcom,
80             'subseq' => \&print_subseq,
81             'translate' => \&reading_frame_ops,
82             # 'restrict-coord' => \&restrict_coord,
83             # 'restrict' => \&restrict_digest,
84             'anonymize' => \&anonymize,
85             'break' => \&shred_seq,
86             'count-codons' => \&count_codons,
87             'feat2fas' => \&print_gb_gene_feats,
88             'lead-gaps' => \&count_leading_gaps,
89             'hydroB' => \&hydroB,
90             'linearize' => \&linearize,
91             'reloop' => \&reloop_at,
92             'remove-stop' => \&remove_stop,
93             'sort' => \&sort_by,
94             'split-cdhit' => \&split_cdhit,
95             'syn-code' => \&synon_codons,
96             # 'dotplot' => \&draw_dotplot,
97             # 'extract' => \&reading_frame_ops,
98             # 'longest-orf' => \&reading_frame_ops,
99             # 'prefix' => \&anonymize,
100             'rename' => \&rename_id,
101             # 'slidingwindow' => \&sliding_window,
102             # 'split' => \&split_seqs,
103             );
104              
105             my %filter_dispatch = (
106             'find_by_order' => \&find_by_order,
107             'pick_by_order' => \&pick_by_order,
108             'delete_by_order' => \&del_by_order,
109             'find_by_id' => \&find_by_id,
110             'pick_by_id' => \&pick_by_id,
111             'delete_by_id' => \&del_by_id,
112             'find_by_re' => \&find_by_re,
113             'pick_by_re' => \&pick_by_re,
114             'find_by_file' => \&find_by_file,
115             'pick_by_file' => \&pick_by_file,
116             'delete_by_file' => \&del_by_file,
117             'find_by_ambig' => \&find_by_ambig,
118             'pick_by_ambig' => \&pick_by_ambig,
119             'delete_by_ambig' => \&del_by_ambig,
120             'find_by_length' => \&find_by_length,
121             'pick_by_length' => \&pick_by_length,
122             'delete_by_length' => \&del_by_length,
123             );
124              
125             ##################### initializer & option handlers ###################
126              
127             =head1 SUBROUTINES
128              
129             =head2 initialize()
130              
131             Sets up most of the actions to be performed on sequences.
132              
133             Call this right after setting up an options hash.
134              
135             Sets package variables: C<$in>, C<$in_format>, C<$filename>, C<$out_format>, and C<$out>.
136              
137              
138             =cut
139              
140             sub initialize {
141 19     19 1 57 my $opts_ref = shift;
142 19         117 Bio::BPWrapper::common_opts($opts_ref);
143 19         37 %opts = %{$opts_ref};
  19         80  
144              
145 19 50 33     93 die "Option 'prefix' requires a value\n" if defined $opts{"prefix"} && $opts{"prefix"} =~ /^$/;
146              
147 19   50     102 $filename = shift @ARGV || "STDIN"; # If no more arguments were given on the command line, assume we're getting input from standard input
148              
149             # guess format won't work for piped input; remove
150             # if ($filename eq "STDIN") {
151             # my $lines;
152             # my $line_ct = 0;
153             # while(<>) { $lines .= $_; $line_ct++; last if $line_ct >= 100 } # read the first 100 lines
154             # $guesser = Bio::Tools::GuessSeqFormat->new( -text => $lines );
155             # } else {
156             # $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename);
157             # }
158             # $in_format = $guesser->guess() unless $opts{'input'};
159              
160 19   100     122 $in_format = $opts{"input"} // 'fasta';
161              
162             # die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta|fastq|embl|genbank/;
163 19 50       295 $in = Bio::SeqIO->new(-format => $in_format, ($filename eq "STDIN")? (-fh => \*STDIN) : (-file => $filename));
164              
165 19   100     290425 $out_format = $opts{"output"} // 'fasta';
166              
167             # A change in SeqIO, commit 0e04486ca4cc2e61fd72, means -fh or -file is required
168 19         395 $out = Bio::SeqIO->new(-format => $out_format, -fh => \*STDOUT)
169             }
170              
171             sub can_handle {
172 18     18 0 82 my $option = shift;
173 18         118 return defined($opt_dispatch{$option})
174             }
175              
176             sub handle_opt {
177 18     18 0 41 my $option = shift;
178 18         88 $opt_dispatch{$option}->($option) # This passes option name to all functions
179             }
180              
181             =head2 write_out()
182              
183             Writes out the sequence file.
184              
185             Call this after calling C<#initialize(\%opts)> and processing those options.
186              
187             =cut
188              
189             sub write_out {
190 1     1 1 7 while ($seq = $in->next_seq()) { $out->write_seq($seq) }
  22         3766343  
191             }
192              
193              
194             ################### subroutines ########################
195              
196             sub codon_sim {
197             # my $seq = $in->next_seq(); # only the first sequence used
198             # if (&_internal_stop_or_x($seq->translate()->seq())) {
199             # die "internal stop or non-standard base:\t" . $seq->id . "\texit.\n";
200             # }
201             # use Algorithm::Numerical::Sample qw /sample/;
202             # use Math::Random qw /random_permutation/;
203             ########################
204             # Read CUTG and make a random codon set for each AA
205             ########################
206             # my $cutg_file = $opts{'codon-sim'};
207             # my $io = Bio::CodonUsage::IO->new(-file => $cutg_file);
208             # my $cdtable = $io->next_data();
209 0     0 0 0 my @bases = qw(A T C G);
210 0         0 my @codons;
211 0         0 for (my $i=0; $i<=3; $i++) {
212 0         0 my $first = $bases[$i];
213 0         0 for (my $j=0; $j<=3; $j++) {
214 0         0 my $second = $bases[$j];
215 0         0 for (my $k=0; $k<=3; $k++) {
216 0         0 my $third = $bases[$k];
217 0         0 push @codons, $first . $second . $third;
218             }
219             }
220             }
221            
222 0         0 my $myCodonTable = Bio::Tools::CodonTable->new( -id => 11 ); # bacterial codon table
223              
224 0         0 my (@cd_cts, %aas, %aa_cds);
225 0         0 foreach my $cd (@codons) {
226 0         0 my $aa = $myCodonTable->translate($cd);
227 0         0 $aas{$aa}++;
228             # push @cd_cts, {codon => $cd, aa => $aa, cts => $cdtable->codon_count($cd)};
229 0         0 push @cd_cts, {codon => $cd, aa => $aa};
230             }
231              
232 0         0 foreach my $aa (keys %aas) {
233 0         0 my @cds = grep {$_->{aa} eq $aa} @cd_cts;
  0         0  
234 0         0 $aa_cds{$aa} = \@cds;
235             # my @cd_sets;
236             # foreach (@cds) {
237             # for (my $i=1; $i<=$_->{cts}; $i++) {
238             # push @cd_sets, $_->{codon};
239             # }
240             # }
241             # @cd_sets = random_permutation(@cd_sets);
242             # $aa_cds{$aa} = \@cd_sets;
243             # $aa_cds{$aa} = \@cd_sets;
244             }
245             # print Dumper(\%aa_cds);
246 0         0 return(\%aa_cds);
247             }
248              
249             # fisher_yates_shuffle( \@array ) : generate a random permutation
250             # of @array in place
251             sub fisher_yates_shuffle {
252 0     0 0 0 my $array = shift;
253 0         0 my $i;
254 0         0 for ($i = @$array; --$i; ) {
255 0         0 my $j = int rand ($i+1);
256 0 0       0 next if $i == $j;
257 0         0 @$array[$i,$j] = @$array[$j,$i];
258             }
259             }
260              
261             sub synon_codons {
262 0     0 0 0 my %codon_set = %{&codon_sim};
  0         0  
263 0         0 while( my $seq = $in->next_seq() ) {
264             ##############################
265             # generate a random CDS with the same AA sequence
266             ###############################
267 0         0 my $pep = $seq->translate()->seq();
268 0 0       0 if (&_internal_stop_or_x($pep)) {
269 0         0 warn "internal stop or non-standard base:\t" . $seq->id . "\tskip.\n";
270 0         0 next;
271             }
272 0         0 my @aas = split //, $pep;
273 0         0 my $sim_cds = "";
274             #print $pep, "\n";
275 0         0 for (my $i = 0; $i <= $#aas; $i++) {
276 0         0 my $cd_set = $codon_set{$aas[$i]};
277             # print $aas[$i], "\t", $cd_set->[0]->{codon}, "\n";
278 0 0       0 &fisher_yates_shuffle( $cd_set ) if scalar(@$cd_set) > 1; # permutes @array in place, for each codon
279             # next if $aas[$i] eq '*';
280             # my @sampled_cds = sample(-set => $aa_cds{$aas[$i]}); # sample 1 by default
281 0         0 my $cd_sim = $cd_set->[0];
282 0         0 $sim_cds .= $cd_sim->{codon};
283             }
284 0         0 my $sim_obj = Bio::Seq->new(-id => $seq->id() . "|sim", -seq => $sim_cds);
285 0         0 $out->write_seq($sim_obj);
286             }
287 0         0 exit;
288             }
289            
290              
291             sub count_gaps_dna {
292 0     0 0 0 while( my $seqobj = $in->next_seq() ) {
293 0         0 print $seqobj->id(), "\t";
294 0         0 my ($num, $ref) = &_count_gap($seqobj->seq(), 'dna');
295 0         0 print $num, "\n";
296 0 0       0 if ($num) {
297 0         0 my @pos;
298 0         0 foreach my $mono (keys %$ref) {
299 0         0 foreach (@{$ref->{$mono}}) { push @pos, $_}
  0         0  
  0         0  
300             }
301             # print STDERR join "\t", sort {$a <=> $b} @pos;
302             # print STDERR "\n";
303             }
304             }
305 0         0 exit;
306             }
307              
308             sub count_gaps_aa {
309 0     0 0 0 while( my $seqobj = $in->next_seq() ) {
310 0         0 print $seqobj->id(), "\t";
311 0         0 my ($num, $ref) = &_count_gap($seqobj->seq(), 'protein');
312 0         0 print $num, "\n";
313 0 0       0 if ($num) {
314 0         0 my @pos;
315 0         0 foreach my $mono (keys %$ref) {
316 0         0 foreach (@{$ref->{$mono}}) { push @pos, $_}
  0         0  
  0         0  
317             }
318             # print STDERR join "\t", sort {$a <=> $b} @pos;
319             # print STDERR "\n";
320             }
321             }
322 0         0 exit;
323             }
324              
325             sub _count_gap {
326 0     0   0 my ($str, $type) = @_;
327 0         0 my @mono = split('', $str);
328 0         0 my %seen_gaps;
329 0         0 my $num_gaps = 0;
330 0         0 my $ct = 0;
331 0         0 foreach my $mon (@mono) {
332 0         0 $ct++;
333 0 0 0     0 next if ($type eq 'dna' && $mon =~ /[atcg]/i) or ($type eq 'protein' && $mon =~ /[ACDEFGHIKLMNPQRSTVWY]/i) or ($type eq 'protein' && $mon =~ /\*\s*$/);
      0        
      0        
      0        
      0        
334 0         0 $num_gaps ++;
335 0 0       0 if ($seen_gaps{$mon}) {
336 0         0 push @{$seen_gaps{$mon}}, $ct;
  0         0  
337             } else {
338 0         0 $seen_gaps{$mon} = [$ct]
339             }
340             }
341             # print Dumper(\%seen_gaps) if $num_gaps;
342 0         0 return ($num_gaps, \%seen_gaps);
343             }
344              
345             sub rename_id {
346 0     0 0 0 my $optStr = $opts{rename};
347 0         0 my %names;
348              
349 0 0       0 if ($optStr =~ /^id:(\S+);(\S+)$/) {
350 0         0 $names{$1} = $2;
351             } else {
352 0   0     0 open NAME, "<", $optStr || die "a file with old-tab-new needed\n";
353 0         0 while() {
354 0         0 chomp;
355 0         0 my ($oldN, $newN) = split;
356 0         0 $names{$oldN} = $newN;
357             }
358 0         0 close NAME;
359             }
360              
361 0         0 while( my $seqobj = $in->next_seq() ) {
362 0         0 my $id = $seqobj->display_id();
363 0 0       0 if ($names{$id}) {
364             # $seqobj->id($id . "|" . $names{$id});
365             # warn "$id appended by $names{$id}\n";
366 0         0 $seqobj->id($names{$id});
367 0         0 warn "$id replaced by $names{$id}\n";
368             } else {
369 0         0 warn "$id not changed\n";
370             }
371 0         0 $out->write_seq($seqobj);
372             }
373             }
374              
375              
376             sub update_longest_orf {
377 0     0 0 0 while( my $seqobj = $in->next_seq() ) {
378 0         0 my $pep_string = $seqobj->translate( undef, undef, 0 )->seq();
379 0 0       0 unless ($pep_string =~ /\*[A-Z]/) { # no internal stop; don't proceed
380 0         0 my $id = $seqobj->id();
381 0         0 $seqobj->id($id . "|+1");
382 0         0 $seqobj = &_trim_end_to_frame($seqobj);
383 0         0 $out->write_seq($seqobj);
384             # warn $seqobj->id, ": +1 ok\n";
385 0         0 next;
386             }
387              
388 0 0       0 unless ($opts{"no-revcom"}) { # do not search in revcom
389 0         0 my $pep_rev = $seqobj->revcom()->translate( undef, undef, 0 )->seq();
390 0 0       0 unless ($pep_rev =~ /\*[A-Z]/) { # no internal stop for revcom
391 0         0 my $id = $seqobj->id();
392 0         0 $seqobj->id($id . "|-1");
393 0         0 my $rev = $seqobj->revcom();
394 0         0 $rev = &_trim_end_to_frame($rev);
395 0         0 $out->write_seq($rev);
396             # warn $seqobj->id(), ": -1 ok\n";
397 0         0 next;
398             }
399             }
400              
401 0         0 my $longest = {
402             'aa_start' => 1,
403             'aa_end' => 1,
404             'aa_length' => 1,
405             'nt_start' => 1,
406             'nt_end' => 1,
407             'nt_seq' => $seqobj->seq(),
408             'frame' => 1,
409             };
410              
411 0         0 foreach my $fm (1, 2, 3, -1, -2, -3 ) {
412             # warn "checking frame $fm ...\n";
413 0 0 0     0 next if $opts{"no-revcom"} && $fm < 0; # do not search in revcom
414 0 0       0 my $new_seqobj = Bio::Seq->new(
415             -id => $seqobj->id() . "|$fm",
416             -seq => $fm > 0 ? $seqobj->subseq( $fm, $seqobj->length() ) : $seqobj->revcom()->subseq( abs($fm), $seqobj->length() )
417             ); # chop seq to frame first
418            
419 0         0 &_get_longest($new_seqobj, $longest, $fm);
420             # warn "longest ORF:", $longest->{aa_length}, "\n";
421              
422             }
423             # warn "start codon not M/V/L:", $seqobj->id() unless substr( $longest->{nt_seq}, 0, 3 ) =~ /[atg|gt[atcg]|ct[atcg]|tt[ag]/i;
424             # print ">", $seqobj->id, "|f", $longest->{frame}, "|longest-orf\n", $longest->{nt_seq}, "\n";
425 0 0       0 my $fid = $longest->{frame} > 0 ? "+" . $longest->{frame} : $longest->{frame};
426 0         0 my $longest_seq = Bio::Seq->new(-id => $seqobj->id . "|" . $fid, -seq => $longest->{nt_seq} );
427 0         0 $longest_seq = &_trim_end_to_frame($longest_seq);
428 0         0 $out->write_seq($longest_seq);
429             }
430             }
431              
432             sub _trim_end_to_frame {
433 0     0   0 my $seqobj = shift;
434 0         0 my $seq_len = $seqobj->length();
435 0         0 my $remainder = $seq_len % 3;
436 0         0 my $seqstr = $seqobj->subseq(1, $seq_len - $remainder);
437 0         0 $seqobj->seq($seqstr);
438 0         0 return $seqobj;
439             }
440              
441             sub _get_longest { # for each frame
442 0     0   0 my ($seq, $ref, $fm) = @_;
443 0         0 my $pep_string = $seq->translate( undef, undef, 0 )->seq();
444 0 0       0 unless ($pep_string =~ /\*[A-Z]/) { # no internal stops, found the longest
445 0         0 $ref->{nt_seq} = $seq->seq();
446 0         0 $ref->{frame} = $fm;
447 0         0 $ref->{aa_length} = $seq->length()/3;
448             } else { # has internal stops
449 0 0       0 die $seq->id(), " contains ambiguous aa (X)\n" if $pep_string =~ /X/;
450 0         0 my $three_prime = $seq->length();
451            
452 0         0 my $start = 1;
453 0         0 my @aas = split '', $pep_string;
454 0         0 for ( my $i = 0; $i <= $#aas; $i++ ) {
455 0 0 0     0 next unless $aas[$i] eq '*' || $i == $#aas; # hit a stop codon or end of sequence
456 0 0       0 if ($i - $start + 2 > $ref->{aa_length}) { # if longer than the last longest
457 0         0 $ref->{aa_start} = $start;
458 0         0 $ref->{aa_end} = $i + 1;
459 0         0 $ref->{aa_length} = $i - $start + 2;
460 0         0 $ref->{nt_start} = 3 * ( $start - 1 ) + 1;
461 0         0 my $end = 3 * $i + 3;
462 0 0       0 $end
463             = ( $end > $three_prime )
464             ? $three_prime
465             : $end; # guranteed not to go beyond 3'
466 0         0 $ref->{nt_end} = $end;
467 0         0 $ref->{frame} = $fm;
468 0         0 $ref->{nt_seq} = $seq->subseq( 3 * ( $start - 1 ) + 1, $end );
469             }
470 0         0 $start = $i + 2; # re-start
471             }
472             }
473             }
474              
475              
476             sub iso_electric_point {
477 0     0 0 0 my $calc = Bio::Tools::pICalculator->new(-places => 2, -pKset => 'EMBOSS');
478 0         0 while ( my $seq = $in->next_seq ) {
479 0         0 $calc->seq($seq);
480 0         0 print $seq->id(), "\t", $calc->iep;
481 0         0 for(my $i = 0; $i <= 14; $i += 0.5 ){
482 0         0 print "\t", $i, "|", sprintf("%.2f", $calc->charge_at_pH($i));
483             }
484 0         0 print "\n";
485             }
486             }
487              
488             sub print_weight {
489 0     0 0 0 while ($seq = $in->next_seq()) {
490 0         0 my $ref_weight = Bio::Tools::SeqStats->get_mol_wt($seq);
491 0         0 print join "\t", $seq->id(), $ref_weight->[0], $ref_weight->[1];
492 0         0 print "\n";
493             }
494             }
495              
496             sub codon_info {
497 0   0 0 0 0 my $cutg_file = $opts{'codon-info'} || "need a codon usage file in CUTG GCG format";
498 0         0 my $io = Bio::CodonUsage::IO->new(-file => $cutg_file);
499 0         0 my $cdtable = $io->next_data();
500 0         0 my @bases = qw(A T C G);
501 0         0 my @codons;
502 0         0 for (my $i=0; $i<=3; $i++) {
503 0         0 my $first = $bases[$i];
504 0         0 for (my $j=0; $j<=3; $j++) {
505 0         0 my $second = $bases[$j];
506 0         0 for (my $k=0; $k<=3; $k++) {
507 0         0 my $third = $bases[$k];
508 0         0 push @codons, $first . $second . $third;
509             }
510             }
511             }
512              
513 0         0 my $myCodonTable = Bio::Tools::CodonTable->new( -id => 1 );
514             #print Dumper($cdtable->all_aa_frequencies);
515              
516 0         0 my (@cd_cts, %aas);
517 0         0 foreach my $cd (@codons) {
518 0         0 my $aa = $myCodonTable->translate($cd);
519 0         0 $aas{$aa}++;
520 0         0 push @cd_cts, {codon => $cd, aa => $aa, cts => $cdtable->codon_count($cd)};
521             }
522              
523 0         0 my $h_genome = 0;
524 0         0 foreach my $aa (keys %aas) {
525 0         0 my @cds = grep {$_->{aa} eq $aa} @cd_cts;
  0         0  
526 0         0 $h_genome += &__cd_entropy(\@cds);
527             }
528              
529             =begin
530             use Bio::Tools::CodonOptTable;
531             while (my $seq = $in->next_seq()) {
532             my $seqobj = Bio::Tools::CodonOptTable->new(
533             -seq => $seq->seq(),
534             -genetic_code => 1,
535             -alphabet => 'dna',
536             -is_circular => 0,
537             -id => $seq->id(),
538             );
539             my $myCodons = $seqobj->rscu_rac_table();
540             my %oneLetterAA;
541             my @cdCTs;
542             my $numCodons = 0;
543             foreach my $rec (@$myCodons) {
544             $numCodons += $rec->{frequency};
545             my $codon = $rec->{codon};
546             my $aa = $myCodonTable->translate($codon);
547             $oneLetterAA{$aa}++;
548             push @cdCTs, {codon => $codon, aa => $aa, cts => $rec->{frequency}};
549             }
550             =cut
551 0         0 while (my $seq = $in->next_seq()) {
552 0 0       0 if (&_internal_stop_or_x($seq->translate()->seq())) {
553 0         0 warn "internal stop or non-standard base:\t" . $seq->id . "\tskip.\n";
554 0         0 next;
555             }
556 0         0 my %oneLetterAA;
557             my %codons;
558 0         0 my @cdCTs;
559 0         0 my $numCodons = 0;
560 0         0 for (my $i=0; $i<=$seq->length()-3; $i+=3) {
561 0         0 $codons{substr($seq->seq(), $i, 3)}++;
562 0         0 $numCodons++;
563             }
564              
565 0         0 foreach my $cd (keys %codons) {
566 0         0 my $aa = $myCodonTable->translate($cd);
567 0         0 $oneLetterAA{$aa}++;
568 0         0 push @cdCTs, {codon => $cd, aa => $aa, cts => $codons{$cd}};
569             }
570              
571 0         0 my $h_cds = 0;
572 0         0 foreach my $aa (keys %oneLetterAA) {
573 0         0 my @cds = grep {$_->{aa} eq $aa} @cdCTs;
  0         0  
574 0         0 $h_cds += &__cd_entropy(\@cds);
575             }
576 0         0 print $seq->id, "\t", $seq->length(), "\t", $numCodons, "\t";
577 0         0 printf "%.6f\n", $h_genome - $h_cds;
578             }
579             }
580              
581             sub __cd_entropy {
582 0     0   0 my $ref = shift;
583 0         0 my @cd_obj = @$ref;
584 0 0       0 return 0 if @cd_obj <= 1; # single codons (M & W)
585 0         0 my $sum = 0;
586 0         0 my $h = 0;
587 0         0 foreach (@cd_obj) {
588 0         0 $sum += $_->{cts};
589             }
590 0 0       0 return 0 unless $sum > 0;
591 0         0 foreach (@cd_obj) {
592 0         0 $_->{rel_freq} = $_->{cts}/$sum;
593             }
594              
595 0         0 foreach (@cd_obj) {
596 0 0       0 next unless $_->{rel_freq} > 0;
597 0         0 $h -= $_->{rel_freq} * log($_->{rel_freq})/log(2);
598             }
599 0         0 return $h;
600             }
601              
602             sub sort_by {
603 0     0 0 0 my $match = $opts{'sort'};
604 0 0       0 $match =~ /length|id|file/ || die "Enter 'length', 'id', or 'file:filename' to sort.\n";
605 0         0 my @seqs;
606              
607 0         0 while (my $seq = $in->next_seq()) {
608 0         0 push @seqs, {id => $seq->display_id, len => $seq->length, seqob => $seq};
609             }
610              
611 0 0       0 if ($match eq 'id'){
612 0         0 @seqs = sort {$a->{id} cmp $b->{id}} @seqs;
  0         0  
613             }
614              
615 0 0       0 if ($match eq 'length'){
616 0         0 @seqs = sort {$a->{len} <=> $b->{len}} @seqs;
  0         0  
617             }
618              
619 0 0       0 if ($match =~ /file:(\S+)/){
620 0         0 my $filename = $1;
621 0   0     0 open FL, "<", $filename || die "file not found: $filename\n";
622 0         0 my %order;
623 0         0 my $ct = 1;
624 0         0 while() {
625 0         0 chomp;
626 0 0       0 next unless /(\S+)/;
627 0         0 $order{$1} = $ct++;
628             }
629 0         0 close FL;
630 0         0 foreach (@seqs) {
631 0 0       0 die "id not in order file: ", $_->{id}, "\n" unless $order{ $_->{id} };
632             }
633 0         0 @seqs = sort { $order{ $a->{id} } <=> $order{ $b->{id} } } @seqs;
  0         0  
634             }
635              
636 0         0 foreach (@seqs) {
637 0         0 $out->write_seq($_->{seqob});
638             }
639             }
640              
641             sub split_cdhit {
642 0     0 0 0 my $cls_file = $opts{'split-cdhit'};
643 0   0     0 open IN, "<" . $cls_file || die "cdhit clstr file not found: $cls_file\n";
644 0         0 my %clusters;
645             my $cl_id;
646 0         0 my @mem;
647 0         0 while () {
648 0         0 my $line = $_;
649 0         0 chomp $line;
650 0 0       0 if ($line =~ /^>(\S+)\s+(\d+)/) {
651 0         0 $cl_id = $1 . "_" . $2;
652 0         0 my @mem = ();
653 0         0 $clusters{$cl_id} = \@mem;
654             } else {
655 0         0 my $ref = $clusters{$cl_id};
656 0         0 my @mems = @$ref;
657 0         0 my @els = split /\s+/, $line;
658 0         0 my $seq_id = $els[2];
659 0         0 $seq_id =~ s/>//;
660 0         0 $seq_id =~ s/\.\.\.$//;
661 0         0 push @mems, $seq_id;
662 0         0 $clusters{$cl_id} = \@mems;
663             }
664             }
665             # print Dumper(\%clusters);
666 0         0 my %seqs;
667 0         0 while ($seq = $in->next_seq()) {
668 0         0 my $id = $seq->display_id();
669 0         0 $seqs{$id} = $seq;
670             }
671              
672 0         0 foreach my $id (keys %clusters) {
673 0         0 my $out = Bio::SeqIO->new( -file => ">" . $filename . "-". $id . ".fas", -format => 'fasta');
674 0         0 my @seqids = @{ $clusters{$id} };
  0         0  
675 0         0 foreach my $seq_id (@seqids) {
676 0         0 $out->write_seq($seqs{$seq_id});
677             }
678             }
679 0         0 exit;
680             }
681              
682              
683             sub print_composition {
684 1     1 0 6 while ($seq = $in->next_seq()) {
685 7         1768 my $hash_ref = Bio::Tools::SeqStats->count_monomers($seq);
686 7         1793 my $count;
687 7         39 $count += $hash_ref->{$_} foreach keys %$hash_ref;
688 7         31 print $seq->id();
689 7         169 foreach (sort keys %$hash_ref) {
690 28         69 print "\t", $_, ":", $hash_ref->{$_}, "(";
691 28         134 printf "%.2f", $hash_ref->{$_}/$count*100;
692 28         56 print "%)"
693             }
694 7         42 print "\n"
695             }
696             }
697              
698             # This sub calls all the del/pick subs above. Any option to filter input sequences by some criterion goes through here, and the appropriate filter subroutine is called.
699             sub filter_seqs {
700 4     4 0 8 my $action = shift;
701 4         11 my $match = $opts{$action};
702              
703             # matching to stop at 1st ':' so that ids using ':' as field delimiters are handled properly
704 4 50       31 $match =~ /^([^:]+):(\S+)$/ || die "Bad search format. Expecting a pattern of the form: tag:value.\n";
705              
706 4         18 my ($tag, $value) = ($1, $2);
707 4         25 my @selected = split(/,/, $value);
708 4         13 my $callsub = "find_by_" . "$tag";
709              
710 4 50       18 die "Bad tag or function not implemented. Tag was: $tag\n" if !defined($filter_dispatch{$callsub});
711              
712 4 50       14 if ($tag eq 'order') {
    0          
    0          
713 4         8 my $ct = 0;
714 4         17 my %order_list = parse_orders(\@selected); # Parse selected orders and create a hash
715 4         19 while (my $currseq = $in->next_seq) { $ct++; $filter_dispatch{$callsub}->($action, $ct, $currseq, \%order_list) }
  28         7246  
  28         110  
716 4 50       420 foreach (keys %order_list) { print STDERR "No matches found for order number $_\n" if $_ > $ct }
  7         56  
717             }
718             elsif ($tag eq 'id') {
719 0         0 my %id_list = map { $_ => 1 } @selected; # create a hash from @selected
  0         0  
720 0         0 while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $match, $currseq, \%id_list) }
  0         0  
721 0 0       0 foreach (keys %id_list) { warn "No matches found for '$_'\n" if $id_list{$_} == 1 }
  0         0  
722             }
723             elsif ($tag eq 'file') {
724 0   0     0 open LIST, "<", $value || die "can't find file $value\n";
725 0         0 my %list;
726 0         0 while() { chomp; $list{$_}++ }
  0         0  
  0         0  
727 0         0 close LIST;
728 0         0 while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $match, $currseq, \%list) }
  0         0  
729 0 0       0 foreach (keys %list) { warn "No matches found for '$_'\n" if $list{$_} == 1 }
  0         0  
730             } else {
731 0         0 while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $currseq, $value) }
  0         0  
732             }
733             }
734              
735             =head2 retrieve_seqs()
736              
737             Retrieves a sequence from RefSeq using the provided accession
738             number. A wrapper for CE#get_Seq_by_acc>.
739              
740             =cut
741              
742             # To do: add fetch by gi
743             sub retrieve_seqs {
744 0     0 1 0 my $gb = Bio::DB::RefSeq->new();
745 0         0 my $seq = $gb->get_Seq_by_acc($opts{'fetch'}); # Retrieve sequence with Accession Number
746 0         0 $out->write_seq($seq)
747             }
748              
749             =head2 remove_gaps()
750              
751             Remove gaps
752              
753             =cut
754              
755             sub remove_gaps {
756 1     1 1 6 while ($seq = $in->next_seq()) {
757 7         3616 my $string = $seq->seq();
758 7         125 $string =~ s/-//g;
759 7         14 $string =~ s/\.//g;
760 7         20 my $new_seq = Bio::Seq->new(-id => $seq->id(), -seq => $string);
761 7         2346 $out->write_seq($new_seq)
762             }
763             }
764              
765             =head2 print_lengths()
766              
767             Print all sequence lengths. Wraps
768             Llength|https://metacpan.org/pod/Bio::Seq#length>.
769              
770             =cut
771              
772             sub print_lengths {
773 1     1 1 3 while ($seq = $in->next_seq()) { print $seq->id(), "\t", $seq->length(), "\n" }
  7         1021  
774             }
775              
776              
777             =head2 print_seq_count()
778              
779             Print all sequence lengths. Wraps
780             Llength|https://metacpan.org/pod/Bio::Seq#length>.
781              
782             =cut
783              
784             sub print_seq_count {
785 1     1 1 2 my $count;
786 1         4 while ($seq = $in->next_seq()) { $count++ }
  7         923  
787 1         46 print $count, "\n"
788             }
789              
790             =head2 make_revcom()
791             Reverse complement. Wraps
792             Lrevcom()|https://metacpan.org/pod/Bio::Seq#revcom>.
793              
794             =cut
795              
796              
797             sub make_revcom { # reverse-complement a sequence
798 1     1 1 27 while ($seq = $in->next_seq()) {
799 7         2353 my $new = Bio::Seq->new(-id => $seq->id() . ":revcom", -seq => $seq->revcom()->seq());
800 7         2576 $out->write_seq($new)
801             }
802             }
803              
804             =head2 print_subseq()
805              
806             Select substring (of the 1st sequence). Wraps
807             Lsubseq()|https://metacpan.org/pod/Bio::Seq#subseq>.
808              
809             =cut
810              
811              
812             sub print_subseq {
813 1     1 1 22 while ($seq = $in->next_seq()) {
814 7         3546 my $id = $seq->id();
815 7         186 my ($start, $end) = split /\s*,\s*/, $opts{"subseq"};
816 7 50       29 $end = $seq->length() if $end eq '-'; # allow shorthand: -s'2,-'
817 7 50       22 die "end out of bound: $id\n" if $end > $seq->length();
818 7         119 my $new = Bio::Seq->new(-id => $seq->id() . ":$start-$end", -seq => $seq->subseq($start, $end));
819 7         2903 $out->write_seq($new)
820             }
821             }
822              
823             sub _internal_stop_or_x {
824 7     7   2459 my $str = shift;
825 7         12 $str =~ s/\*$//; # remove last stop
826 7         10 my @internalStops;
827             my @nonStandardAA;
828 7         19 while ($str =~ m/\*/g) { # internal stop # previously missed double **
829 14         24 push @internalStops, pos($str);
830             }
831              
832 7         13 while ($str =~ m/X/g) { # non-standard AA
833 0         0 push @nonStandardAA, pos($str);
834             }
835              
836 7 50       11 if (@nonStandardAA) {
837 0         0 warn "Presence of X at postions:\t", join(";", @nonStandardAA), "\n";
838             }
839              
840 7 50       11 if (@internalStops) {
841 7         217 warn "Presence of internal stops at postions:\t", join(";", @internalStops), "\n";
842             }
843              
844 7 50       32 return @internalStops ? 1 : 0;
845             }
846              
847             =head2 reading_frame_ops
848              
849             Translate in 1, 3, or 6 frames based on the value of C<$opts> set via
850             L|/initialize>. Wraps
851             Ltranslate()|https://metacpan.org/pod/Bio::Seq#translate>,
852             Ltranslate_3frames()|https://metacpan.org/pod/Bio::SeqUtils#translate_3frames>,
853             and
854             Ltranslate_6frames()|https://metacpan.org/pod/Bio::SeqUtils#translate_6frames>.
855              
856             =cut
857              
858              
859             sub reading_frame_ops {
860 1     1 1 3 my $frame = $opts{"translate"};
861 1         3 while ($seq = $in->next_seq()) {
862 7 50       1250 if ($frame == 1) {
    0          
    0          
863 7 50       23 if (&_internal_stop_or_x($seq->translate()->seq())) {
864 7         15 warn "internal stop:\t" . $seq->id . "\tskip.\n"
865             } else {
866 0         0 $out->write_seq($seq->translate())
867             }
868             }
869             elsif ($frame == 3) {
870 0         0 my @prots = Bio::SeqUtils->translate_3frames($seq);
871 0         0 foreach (@prots) {
872 0         0 my $id = $_->id();
873 0         0 $id =~ /^(\S+)-(\d)F$/;
874 0         0 my ($oriId, $fm) = ($1, $2);
875 0         0 $_->id($oriId . "|+" . ($fm+1));
876 0         0 $out->write_seq($_);
877             }
878             } elsif ($frame == 6) {
879 0         0 my @prots = Bio::SeqUtils->translate_6frames($seq);
880 0         0 foreach (@prots) {
881 0         0 my $id = $_->id();
882 0         0 $id =~ /^(\S+)-(\d)([RF])$/;
883 0         0 my ($oriId, $fm, $dir) = ($1, $2, $3);
884 0 0       0 if ($dir eq 'F') {
885 0         0 $_->id($oriId . "|+" . ($fm+1));
886             } else {
887 0         0 $_->id($oriId . "|-" . ($fm+1));
888             }
889 0         0 $out->write_seq($_);
890             }
891 0         0 } else { warn "Accepted frame arguments: 1, 3, and 6\n"}
892             }
893             }
894              
895             =head2 restrict_coord()
896              
897             Note: This function is currently DEPRECATED.
898              
899             Finds digestion coordinates by a specified restriction enzyme
900             specified in C<$opts{restrinct}> set via L|/initialize>.
901              
902             An input file with sequences is expected. Wraps
903             Lcut()|https://github.com/bioperl/Bio-Restriction>.
904              
905             Outputs coordinates of overhangs in BED format.
906              
907              
908             sub restrict_coord {
909             use Bio::Restriction::Analysis;
910             use Bio::Restriction::EnzymeCollection;
911              
912             my $enz = $opts{"restrict-coord"};
913             my $re = Bio::Restriction::EnzymeCollection->new()->get_enzyme($enz);
914             my $len = length($re->overhang_seq());
915              
916             while ( $seq = $in->next_seq() ) {
917             my $seq_str = $seq->seq();
918             die "Not a DNA sequence\n" unless $seq_str =~ /^[ATCGRYSWKMBDHVN]+$/i;
919             my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
920             foreach my $pos ($ra->positions($enz)) {
921             print $seq->id()."\t".($pos-$len)."\t".$pos."\n";
922             }
923             }
924             }
925              
926             =head2 restrict_digest()
927              
928             Note: This function is currently DEPRECATED.
929              
930             Predicted fragments from digestion by a specified restriction enzyme
931             specified in C<$opts{restrinct}> set via L|/initialize>.
932              
933             An input file with sequences is expected. Wraps
934             Lcut()|https://metacpan.org/pod/Bio::Restriction::Analysis#cut>.
935              
936              
937             sub restrict_digest {
938             my $enz = $opts{"restrict"};
939             use Bio::Restriction::Analysis;
940             while ( $seq = $in->next_seq() ) {
941             my $seq_str = $seq->seq();
942             die "Not a DNA sequence\n" unless $seq_str =~ /^[ATCGRYSWKMBDHVN]+$/i;
943             my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
944             foreach my $frag ($ra->fragment_maps($enz)) {
945             my $seq_obj = Bio::Seq->new(
946             -id=>$seq->id().'|'.$frag->{start}.'-'.$frag->{end}.'|'.($frag->{end}-$frag->{start}+1),
947             -seq=>$frag->{seq});
948             $out->write_seq($seq_obj)
949             }
950             }
951             }
952              
953             =cut
954              
955             =head2 anonymize()
956              
957             Replace sequence IDs with serial IDs I characters long, as specified in
958             C<$opts{'anonymize'}> set via L|/initialize>.
959             For example if C<$opts{'anonymize'}>, the first ID will be C.
960             leading 'S' The length of the serial idea
961              
962             A sed script file is produced with a F<.sed> suffix that may be used
963             with sed's C<'-f'> argument. If the filename is F<'-'>, the sed file
964             is named C instead. A message containing the sed filename is
965             written to C.
966              
967             =cut
968              
969             sub anonymize {
970 1   50 1 1 5 my $char_len = $opts{"anonymize"} // die "Tried to use option 'prefix' without using option 'anonymize'. Exiting...\n";
971 1 50       3 my $prefix = (defined($opts{"prefix"})) ? $opts{"prefix"} : "S";
972              
973 1 50       3 pod2usage(1) if $char_len < 1;
974              
975 1         2 my $ct = 1;
976 1         1 my %serial_name;
977 1         2 my $length_warn = 0;
978 1         4 while ($seq = $in->next_seq()) {
979 7         1126 my $serial = $prefix . sprintf "%0" . ($char_len - length($prefix)) . "s", $ct;
980 7 50       11 $length_warn = 1 if length($serial) > $char_len;
981 7         17 $serial_name{$serial} = $seq->id();
982 7         93 $seq->id($serial);
983 7         72 $out->write_seq($seq);
984 7         1197 $ct++
985             }
986              
987 1         73 _make_sed_file($filename, %serial_name);
988 1         23 warn "Anonymization map:\n";
989 1         6 while (my ($k, $v) = each %serial_name) { warn "$k => $v\n" }
  7         62  
990              
991 1 50       7 warn "WARNING: Anonymized ID length exceeded requested length: try a different length or prefix.\n" if $length_warn
992             }
993              
994             =head2 shred_seq()
995              
996             Break into individual sequences writing a FASTA file for each sequence.
997              
998             =cut
999              
1000              
1001             sub shred_seq {
1002 1     1 1 3 while ($seq = $in->next_seq()) {
1003 7         2668 my $newid = $seq->id();
1004 7         112 $newid =~ s/[\s\|]/_/g;
1005 7         22 print $newid, "\n";
1006 7   50     16 my $suffix = $out_format || "fas";
1007 7         26 my $newout = Bio::SeqIO->new(-format => $out_format, -file => ">" . $newid . ".$suffix");
1008 7         4314 $newout->write_seq($seq)
1009             }
1010             exit
1011 1         347 }
1012              
1013             =head2 count_codons()
1014              
1015             Count codons for coding sequences (e.g., a genome file consisting of
1016             CDS sequences). Wraps
1017             Lcount_codons()|https://metacpan.org/pod/Bio::Tools::SeqStats#count_codons>.
1018              
1019             =cut
1020              
1021             sub codon_table {
1022 0     0 0 0 my $string = $opts{'codon-table'};
1023             # die "bioseq --codon-table atg|M\n" unless $string;
1024 0         0 my $myCodonTable = Bio::Tools::CodonTable->new();
1025 19     19   197 use Bio::Tools::IUPAC;
  19         35  
  19         57645  
1026 0         0 my $iupac = Bio::Tools::IUPAC->new();
1027 0         0 my %aas = $iupac->iupac_iup(); #print Dumper(\%aas); # T=>T; Z=>E/Q; etc
1028             # my %nts = $iupac->iupac(); #print Dumper(\%nts); # M=>A/C; A=>A; etc
1029 0 0       0 if ($string =~ /^[atcg]{3}$/i) {
    0          
1030 0         0 print $string, "\t", $myCodonTable->translate($string), "\n";
1031             } elsif (&_in_list($string, [keys %aas])) {
1032 0         0 print $string, "\t", join "\t", $myCodonTable->revtranslate($string), "\n";
1033             } else {
1034 0         0 print $string, "Currently only takes a 3-letter DNA-base codon or a 1-letter uppercase IUPAC aa code";
1035             }
1036             }
1037              
1038             sub count_codons {
1039 0     0 1 0 my $new_seq;
1040 0         0 my $myCodonTable = Bio::Tools::CodonTable->new();
1041 0         0 while ($seq = $in->next_seq()) { $new_seq .= $seq->seq() }
  0         0  
1042 0         0 my $hash_ref = Bio::Tools::SeqStats->count_codons(Bio::Seq->new(-seq=>$new_seq, -id=>'concat'));
1043 0         0 my $count;
1044 0         0 $count += $hash_ref->{$_} foreach keys %$hash_ref;
1045 0         0 foreach (sort keys %$hash_ref) {
1046 0         0 print $_, ":\t", $myCodonTable->translate($_), "\t", $hash_ref->{$_}, "\t";
1047 0         0 printf "%.2f", $hash_ref->{$_}/$count*100;
1048 0         0 print "%\n"
1049             }
1050             }
1051              
1052             =head2 print_gb_gene_feats()
1053              
1054             print gene sequences in FASTA from a GenBank file of bacterial
1055             genome. Won't work for a eukaryote genbank file.
1056              
1057             =cut
1058              
1059             sub print_gb_gene_feats { # works only for prokaryote genome
1060 1     1 1 6 $seq = $in->next_seq();
1061 1 50       104468 die "$filename: Not a GenBank file. Quit\n" unless $in_format eq 'genbank';
1062 1         3 my $gene_count = 0;
1063 1         12 foreach my $feat ($seq->get_SeqFeatures()) {
1064 66 100       19780 if ($feat->primary_tag eq 'CDS') {
1065 28         252 my $location = $feat->location();
1066 28 50       542 next if $location->isa('Bio::Location::Split');
1067 28         69 my $gene_tag = "gene_" . $gene_count++;
1068 28         49 my $gene_symbol = 'na';
1069 28         46 my $product = 'na';
1070 28         84 foreach my $tag ($feat->get_all_tags()) {
1071 181 100       770 ($gene_tag) = $feat->get_tag_values($tag) if $tag eq 'locus_tag';
1072 181 100       614 ($gene_symbol) = $feat->get_tag_values($tag) if $tag eq 'gene';
1073 181 100       364 ($product) = $feat->get_tag_values($tag) if $tag eq 'product';
1074 181         757 $product =~ s/\s/_/g;
1075             }
1076 28         139 my $gene = Bio::Seq->new(-id => (join "|", ($gene_tag, $feat->start, $feat->end, $feat->strand, $gene_symbol, $product)),
1077             -seq=>$seq->subseq($feat->start, $feat->end));
1078 28 100       14526 if ($feat->strand() > 0) { $out->write_seq($gene) } else { $out->write_seq($gene->revcom())}
  14         314  
  14         300  
1079             # print join "\t",
1080             # ($feat->gff_string, $feat->start, $feat->end,
1081             # $feat->strand);
1082             # print "\n";
1083             }
1084             }
1085             }
1086              
1087             =head2 count_leading_gaps()
1088              
1089             Count and print the number of leading gaps in each sequence.
1090              
1091             =cut
1092              
1093             sub count_leading_gaps {
1094 0     0 1 0 while ($seq = $in->next_seq()) {
1095 0         0 my $lead_gap = 0;
1096 0         0 my $see_aa = 0; # status variable
1097 0         0 my @mono = split //, $seq->seq();
1098 0         0 for (my $i = 0; $i < $seq->length(); $i++) {
1099 0 0       0 $see_aa = 1 if $mono[$i] ne '-';
1100 0 0 0     0 $lead_gap++ if !$see_aa && $mono[$i] eq '-'
1101             }
1102 0         0 print $seq->id(), "\t", $lead_gap, "\n"
1103             }
1104             }
1105              
1106             =head2 hydroB()
1107              
1108             Return the mean Kyte-Doolittle hydropathicity for protein
1109             sequences. Wraps
1110             Lhydropathicity()|https://metacpan.org/pod/Bio::Tools::SeqStats#hydropathicity>.
1111              
1112             =cut
1113              
1114              
1115             sub hydroB {
1116 1     1 1 3 while ($seq = $in->next_seq()) {
1117 24         4051 my $pep_str = $seq->seq();
1118 24         308 $pep_str =~ s/\*//g;
1119 24         51 $seq->seq($pep_str);
1120 24         1444 my $gravy = Bio::Tools::SeqStats->hydropathicity($seq);
1121 24         17496 printf "%s\t%.4f\n", $seq->id(), $gravy
1122             }
1123             }
1124              
1125             =head2 linearize()
1126              
1127             Linearize FASTA, print one sequence per line.
1128              
1129             =cut
1130              
1131              
1132             sub linearize {
1133 1     1 1 6 while ($seq = $in->next_seq()) { print $seq->id(), "\t", $seq->seq(), "\n" }
  7         1818  
1134             }
1135              
1136             =head2 reloop_at()
1137              
1138             Re-circularize a bacterial genome by starting at a specified position
1139             given in the C<$opts{"reloop"> set via
1140             L|/initialize>.
1141              
1142             For example for sequence "ABCDE". C
1143             would generate"'BCDEA".
1144              
1145             =cut
1146             sub reloop_at {
1147 1     1 1 5 my $seq = $in->next_seq; # only the first sequence
1148 1         408 my $break = $opts{"reloop"};
1149 1         9 my $new_seq = Bio::Seq->new(-id => $seq->id().":relooped_at_".$break, -seq => $seq->subseq($break, $seq->length()) . $seq->subseq(1, $break-1));
1150 1         551 $out->write_seq($new_seq)
1151             }
1152              
1153             =head2 remove_stop()
1154              
1155             Remove stop codons.
1156              
1157             =cut
1158              
1159             sub remove_stop {
1160 1     1 1 11 my $myCodonTable = Bio::Tools::CodonTable->new();
1161 1         38 while ($seq = $in->next_seq()) {
1162 7         2282 my $newstr = "";
1163 7         19 for (my $i = 1; $i <= $seq->length() / 3; $i++) {
1164 206         1816 my $codon = $seq->subseq(3 * ($i - 1) + 1, 3 * ($i - 1) + 3);
1165 206 100       4932 if ($myCodonTable->is_ter_codon($codon)) { warn "Found and removed stop codon\n"; next }
  14         706  
  14         43  
1166 192         5751 $newstr .= $codon
1167             }
1168 7         69 my $new = Bio::Seq->new(-id => $seq->id(), -seq => $newstr);
1169 7         1490 $out->write_seq($new)
1170             }
1171             }
1172              
1173              
1174             ####################### internal subroutine ###########################
1175              
1176             sub _in_list {
1177 0     0   0 my $scalar = shift;
1178 0         0 my $ref_to_array = shift;
1179 0         0 foreach (@{$ref_to_array}) {
  0         0  
1180 0 0       0 return 1 if $_ eq $scalar;
1181             }
1182 0         0 return 0;
1183             }
1184              
1185             sub parse_orders {
1186 4     4 0 7 my @selected = @{shift()};
  4         14  
1187              
1188 4         8 my @orders;
1189             # Parse if $value contains ranges: allows mixing ranges and lists
1190 4         11 foreach my $val (@selected) {
1191 5 100       76 if ($val =~ /^(\d+)-(\d+)$/) { # A numerical range
1192 1         3 my ($first, $last) = ($1, $2);
1193 1 50       4 die "Invalid seq range: $first, $last\n" unless $last > $first;
1194 1         3 push @orders, ($first .. $last)
1195 4         12 } else { push @orders, $val } # Single value
1196             }
1197 4         11 return map { $_ => 1 } @orders
  7         28  
1198             }
1199              
1200             sub _make_sed_file {
1201 1     1   4 my $filename = shift @_;
1202 1         4 my (%serial_names) = @_;
1203              
1204 1 50       4 $filename = "STDOUT" if $filename eq '-';
1205              
1206 1         51 my $sedfile = basename($filename) . ".sed";
1207 1 50       200 open(my $sedout, ">", $sedfile) or die $!;
1208              
1209 1         16 print $sedout "# usage: sed -f $filename.sed \n";
1210              
1211 1         5 foreach (keys %serial_names) {
1212 7         9 my $real_name = $serial_names{$_};
1213 7         10 my $sed_cmd = "s/$_/" . $real_name . "/g;\n";
1214 7         11 print $sedout $sed_cmd
1215             }
1216 1         42 close $sedout;
1217              
1218 1         60 print STDERR "\nCreated $filename.sed\tusage: sed -f $filename.sed \n\n"
1219             }
1220              
1221             ################### pick/delete filters #####################
1222              
1223             sub find_by_file {
1224 0     0 0 0 my ($action, $match, $currseq, $id_list) = @_;
1225 0         0 my $seq_id = $currseq->id();
1226 0         0 $filter_dispatch{$action . "_by_id"}->($match, $currseq, $id_list, $seq_id)
1227             }
1228              
1229             sub pick_by_file {
1230 0     0 0 0 my ($match, $currseq, $id_list, $seq_id) = @_;
1231 0 0       0 if ($id_list->{$seq_id}) {
1232 0         0 $id_list->{$seq_id}++;
1233 0 0       0 die "Multiple matches (" . $seq_id . ") for $match found\n" if $id_list->{$seq_id} > 2;
1234 0         0 $out->write_seq($currseq)
1235             }
1236             }
1237              
1238             sub del_by_file {
1239 0     0 0 0 my ($match, $currseq, $id_list, $seq_id) = @_;
1240 0 0       0 if ($id_list->{$seq_id}) {
1241 0         0 $id_list->{$seq_id}++;
1242 0         0 warn "Deleted sequence: ", $currseq->id(), "\n"
1243 0         0 } else { $out->write_seq($currseq) }
1244             }
1245              
1246              
1247             sub find_by_order {
1248 28     28 0 76 my ($action, $ct, $currseq, $order_list) = @_; # say join "\t", @_;
1249 28         72 $filter_dispatch{ $action . "_by_order" }->($ct, $currseq, $order_list)
1250             }
1251              
1252             sub pick_by_order {
1253 21     21 0 28 my ($ct, $currseq, $order_list) = @_;
1254 21 100       96 $out->write_seq($currseq) if $order_list->{$ct}
1255             }
1256              
1257             sub del_by_order {
1258 7     7 0 18 my ($ct, $currseq, $order_list) = @_; # say join "\t", @_;
1259 7 100       27 if ($order_list->{$ct}) { warn "Deleted sequence: ", $currseq->id(), "\n" }
  1         21  
1260 6         30 else { $out->write_seq($currseq) }
1261             }
1262              
1263             sub find_by_id {
1264 0     0 0   my ($action, $match, $currseq, $id_list) = @_;
1265 0           my $seq_id = $currseq->id();
1266 0           $filter_dispatch{$action . "_by_id"}->($match, $currseq, $id_list, $seq_id)
1267             }
1268              
1269             sub pick_by_id {
1270 0     0 0   my ($match, $currseq, $id_list, $seq_id) = @_;
1271              
1272 0 0         if ($id_list->{$seq_id}) {
1273 0           $id_list->{$seq_id}++;
1274 0 0         die "Multiple matches (" . $seq_id . ":" . $id_list->{$seq_id} - 1 . ") for $match found\n" if $id_list->{$seq_id} > 2;
1275 0           $out->write_seq($currseq)
1276             }
1277             }
1278              
1279             sub del_by_id {
1280 0     0 0   my ($match, $currseq, $id_list, $seq_id) = @_;
1281              
1282 0 0         if ($id_list->{$seq_id}) {
1283 0           $id_list->{$seq_id}++;
1284 0           warn "Deleted sequence: ", $currseq->id(), "\n"
1285 0           } else { $out->write_seq($currseq) }
1286             }
1287              
1288             sub find_by_re {
1289 0     0 0   my ($action, $currseq, $value) = @_;
1290 0           my $regex = qr/$value/;
1291 0           my $seq_id = $currseq->id();
1292 0           $filter_dispatch{ $action . "_by_re" }->($currseq, $regex, $seq_id)
1293             }
1294              
1295             sub pick_by_re {
1296 0     0 0   my ($currseq, $regex, $seq_id) = @_;
1297 0 0         $out->write_seq($currseq) if $seq_id =~ /$regex/
1298             }
1299              
1300             sub del_by_re {
1301 0     0 0   my ($currseq, $regex, $seq_id) = @_;
1302              
1303 0 0         if ($seq_id =~ /$regex/) { warn "Deleted sequence: $seq_id\n" }
  0            
1304 0           else { $out->write_seq($currseq) }
1305             }
1306              
1307             # TODO This needs better documentation
1308             sub find_by_ambig {
1309 0     0 0   my ($action, $currseq, $cutoff) = @_;
1310 0           my $string = $currseq->seq();
1311 0           my $ct = ($string =~ s/([^ATCG])/$1/gi); # won't work for AA seqs
1312 0           my $percent_ambig = $ct / $currseq->length();
1313 0           $filter_dispatch{"$action" . "_by_ambig"}->($currseq, $cutoff, $ct, $percent_ambig)
1314             }
1315              
1316             # TODO Probably better to change behavior when 'picking'?
1317             sub pick_by_ambig {
1318 0     0 0   my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
1319 0 0         $out->write_seq($currseq) if $percent_ambig > $cutoff
1320             }
1321              
1322             sub del_by_ambig {
1323 0     0 0   my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
1324              
1325             # if ($percent_ambig > $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of N: ", $ct, "\n" }
1326 0 0         if ($ct >= $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of bad monomers: ", $ct, "\n" }
  0            
1327 0           else { $out->write_seq($currseq) }
1328             }
1329              
1330             sub find_by_length {
1331 0     0 0   my ($action, $currseq, $value) = @_;
1332 0           $filter_dispatch{$action . "_by_length"}->($currseq, $value)
1333             }
1334              
1335             sub pick_by_length {
1336 0     0 0   my ($currseq, $value) = @_;
1337 0 0         $out->write_seq($currseq) if $currseq->length() <= $value
1338             }
1339              
1340             sub del_by_length {
1341 0     0 0   my ($currseq, $value) = @_;
1342              
1343 0 0         if ($currseq->length() <= $value) { warn "Deleted sequence: ", $currseq->id(), " length: ", $currseq->length(), "\n" }
  0            
1344 0           else { $out->write_seq($currseq) }
1345             }
1346              
1347             1;
1348             __END__