File Coverage

lib/Bio/BPWrapper/SeqManipulations.pm

Criterion	Covered	Total	%
statement	230	636	36.1
branch	40	172	23.2
condition	8	47	17.0
subroutine	43	79	54.4
pod	18	54	33.3
total	339	988	34.3

line	stmt	bran	cond	sub	pod	time	code
1							=encoding utf8
2
3							=head1 NAME
4
5							Bio::BPWrapper::SeqManipulations - Functions for L
6
7							=head1 SYNOPSIS
8
9							use Bio::BPWrapper::SeqManipulations;
10							# Set options hash ...
11							initialize(\%opts);
12							write_out(\%opts);
13
14							=cut
15
16							package Bio::BPWrapper::SeqManipulations;
17
18	19			19		139	use strict; # Still on 5.10, so need this for strict
	19					63
	19					837
19	19			19		87	use warnings;
	19					32
	19					1280
20	19			19		342	use 5.010;
	19					62
21	19			19		12879	use Bio::Seq;
	19					1825405
	19					804
22	19			19		11441	use Bio::SeqIO;
	19					581375
	19					1006
23	19			19		175	use File::Basename;
	19					38
	19					2108
24	19			19		145	use Bio::Tools::CodonTable;
	19					72
	19					683
25	19			19		10114	use Bio::DB::RefSeq;
	19					1677129
	19					966
26	19			19		13048	use Bio::Tools::SeqStats;
	19					78056
	19					830
27	19			19		185	use Bio::SeqUtils;
	19					37
	19					629
28	19			19		125	use Scalar::Util;
	19					38
	19					1017
29	19			19		103	use Exporter ();
	19					34
	19					353
30	19			19		11277	use Bio::CodonUsage::IO;
	19					120814
	19					889
31	19			19		14626	use Bio::Tools::pICalculator;
	19					26403
	19					1104
32							# use Bio::Tools::GuessSeqFormat;
33
34	19			19		12264	if ($ENV{'DEBUG'}) { use Data::Dumper }
	19					168854
	19					1872
35
36	19			19		157	use vars qw(@ISA @EXPORT @EXPORT_OK);
	19					44
	19					2410
37
38							@ISA = qw(Exporter);
39
40							#restrict_coord restrict_digest
41							@EXPORT = qw(initialize can_handle handle_opt write_out
42							print_composition filter_seqs retrieve_seqs remove_gaps
43							print_lengths print_seq_count make_revcom
44							print_subseq
45							anonymize
46							shred_seq count_codons print_gb_gene_feats
47							count_leading_gaps hydroB linearize reloop_at
48							remove_stop parse_orders find_by_order
49							pick_by_order del_by_order find_by_id
50							pick_by_id del_by_id find_by_re
51							pick_by_re del_by_re
52							pick_by_file del_by_file
53							find_by_ambig pick_by_ambig del_by_ambig find_by_length
54							del_by_length codon_sim codon_info);
55
56							# Package global variables
57							my ($in, $out, $seq, %opts, $filename, $in_format, $out_format, $guesser);
58	19			19		8965	use Bio::BPWrapper;
	19					72
	19					163375
59							my $VERSION = '1.0';
60
61							## For new options, just add an entry into this table with the same key as in
62							## the GetOpts function in the main program. Make the key be a reference to the handler subroutine (defined below), and test that it works.
63							my %opt_dispatch = (
64							'codon-table' => \&codon_table,
65							# 'codon-sim' => \&codon_sim,
66							'codon-info' => \&codon_info,
67							'iep' => \&iso_electric_point,
68							'composition' => \&print_composition,
69							'mol-wt' => \&print_weight,
70							'delete' => \&filter_seqs,
71							'fetch' => \&retrieve_seqs,
72							'no-gaps' => \&remove_gaps,
73							'length' => \&print_lengths,
74							'longest-orf' => \&update_longest_orf,
75							'num-seq' => \&print_seq_count,
76							'num-gaps-dna' => \&count_gaps_dna,
77							'num-gaps-aa' => \&count_gaps_aa,
78							'pick' => \&filter_seqs,
79							'revcom' => \&make_revcom,
80							'subseq' => \&print_subseq,
81							'translate' => \&reading_frame_ops,
82							# 'restrict-coord' => \&restrict_coord,
83							# 'restrict' => \&restrict_digest,
84							'anonymize' => \&anonymize,
85							'break' => \&shred_seq,
86							'count-codons' => \&count_codons,
87							'feat2fas' => \&print_gb_gene_feats,
88							'lead-gaps' => \&count_leading_gaps,
89							'hydroB' => \&hydroB,
90							'linearize' => \&linearize,
91							'reloop' => \&reloop_at,
92							'remove-stop' => \&remove_stop,
93							'sort' => \&sort_by,
94							'split-cdhit' => \&split_cdhit,
95							'syn-code' => \&synon_codons,
96							# 'dotplot' => \&draw_dotplot,
97							# 'extract' => \&reading_frame_ops,
98							# 'longest-orf' => \&reading_frame_ops,
99							# 'prefix' => \&anonymize,
100							'rename' => \&rename_id,
101							# 'slidingwindow' => \&sliding_window,
102							# 'split' => \&split_seqs,
103							);
104
105							my %filter_dispatch = (
106							'find_by_order' => \&find_by_order,
107							'pick_by_order' => \&pick_by_order,
108							'delete_by_order' => \&del_by_order,
109							'find_by_id' => \&find_by_id,
110							'pick_by_id' => \&pick_by_id,
111							'delete_by_id' => \&del_by_id,
112							'find_by_re' => \&find_by_re,
113							'pick_by_re' => \&pick_by_re,
114							'find_by_file' => \&find_by_file,
115							'pick_by_file' => \&pick_by_file,
116							'delete_by_file' => \&del_by_file,
117							'find_by_ambig' => \&find_by_ambig,
118							'pick_by_ambig' => \&pick_by_ambig,
119							'delete_by_ambig' => \&del_by_ambig,
120							'find_by_length' => \&find_by_length,
121							'pick_by_length' => \&pick_by_length,
122							'delete_by_length' => \&del_by_length,
123							);
124
125							##################### initializer & option handlers ###################
126
127							=head1 SUBROUTINES
128
129							=head2 initialize()
130
131							Sets up most of the actions to be performed on sequences.
132
133							Call this right after setting up an options hash.
134
135							Sets package variables: C<$in>, C<$in_format>, C<$filename>, C<$out_format>, and C<$out>.
136
137
138							=cut
139
140							sub initialize {
141	19			19	1	57	my $opts_ref = shift;
142	19					117	Bio::BPWrapper::common_opts($opts_ref);
143	19					37	%opts = %{$opts_ref};
	19					80
144
145	19	50	33			93	die "Option 'prefix' requires a value\n" if defined $opts{"prefix"} && $opts{"prefix"} =~ /^$/;
146
147	19		50			102	$filename = shift @ARGV \|\| "STDIN"; # If no more arguments were given on the command line, assume we're getting input from standard input
148
149							# guess format won't work for piped input; remove
150							# if ($filename eq "STDIN") {
151							# my $lines;
152							# my $line_ct = 0;
153							# while(<>) { $lines .= $_; $line_ct++; last if $line_ct >= 100 } # read the first 100 lines
154							# $guesser = Bio::Tools::GuessSeqFormat->new( -text => $lines );
155							# } else {
156							# $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename);
157							# }
158							# $in_format = $guesser->guess() unless $opts{'input'};
159
160	19		100			122	$in_format = $opts{"input"} // 'fasta';
161
162							# die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta\|fastq\|embl\|genbank/;
163	19	50				295	$in = Bio::SeqIO->new(-format => $in_format, ($filename eq "STDIN")? (-fh => \*STDIN) : (-file => $filename));
164
165	19		100			290425	$out_format = $opts{"output"} // 'fasta';
166
167							# A change in SeqIO, commit 0e04486ca4cc2e61fd72, means -fh or -file is required
168	19					395	$out = Bio::SeqIO->new(-format => $out_format, -fh => \*STDOUT)
169							}
170
171							sub can_handle {
172	18			18	0	82	my $option = shift;
173	18					118	return defined($opt_dispatch{$option})
174							}
175
176							sub handle_opt {
177	18			18	0	41	my $option = shift;
178	18					88	$opt_dispatch{$option}->($option) # This passes option name to all functions
179							}
180
181							=head2 write_out()
182
183							Writes out the sequence file.
184
185							Call this after calling C<#initialize(\%opts)> and processing those options.
186
187							=cut
188
189							sub write_out {
190	1			1	1	7	while ($seq = $in->next_seq()) { $out->write_seq($seq) }
	22					3766343
191							}
192
193
194							################### subroutines ########################
195
196							sub codon_sim {
197							# my $seq = $in->next_seq(); # only the first sequence used
198							# if (&_internal_stop_or_x($seq->translate()->seq())) {
199							# die "internal stop or non-standard base:\t" . $seq->id . "\texit.\n";
200							# }
201							# use Algorithm::Numerical::Sample qw /sample/;
202							# use Math::Random qw /random_permutation/;
203							########################
204							# Read CUTG and make a random codon set for each AA
205							########################
206							# my $cutg_file = $opts{'codon-sim'};
207							# my $io = Bio::CodonUsage::IO->new(-file => $cutg_file);
208							# my $cdtable = $io->next_data();
209	0			0	0	0	my @bases = qw(A T C G);
210	0					0	my @codons;
211	0					0	for (my $i=0; $i<=3; $i++) {
212	0					0	my $first = $bases[$i];
213	0					0	for (my $j=0; $j<=3; $j++) {
214	0					0	my $second = $bases[$j];
215	0					0	for (my $k=0; $k<=3; $k++) {
216	0					0	my $third = $bases[$k];
217	0					0	push @codons, $first . $second . $third;
218							}
219							}
220							}
221
222	0					0	my $myCodonTable = Bio::Tools::CodonTable->new( -id => 11 ); # bacterial codon table
223
224	0					0	my (@cd_cts, %aas, %aa_cds);
225	0					0	foreach my $cd (@codons) {
226	0					0	my $aa = $myCodonTable->translate($cd);
227	0					0	$aas{$aa}++;
228							# push @cd_cts, {codon => $cd, aa => $aa, cts => $cdtable->codon_count($cd)};
229	0					0	push @cd_cts, {codon => $cd, aa => $aa};
230							}
231
232	0					0	foreach my $aa (keys %aas) {
233	0					0	my @cds = grep {$_->{aa} eq $aa} @cd_cts;
	0					0
234	0					0	$aa_cds{$aa} = \@cds;
235							# my @cd_sets;
236							# foreach (@cds) {
237							# for (my $i=1; $i<=$_->{cts}; $i++) {
238							# push @cd_sets, $_->{codon};
239							# }
240							# }
241							# @cd_sets = random_permutation(@cd_sets);
242							# $aa_cds{$aa} = \@cd_sets;
243							# $aa_cds{$aa} = \@cd_sets;
244							}
245							# print Dumper(\%aa_cds);
246	0					0	return(\%aa_cds);
247							}
248
249							# fisher_yates_shuffle( \@array ) : generate a random permutation
250							# of @array in place
251							sub fisher_yates_shuffle {
252	0			0	0	0	my $array = shift;
253	0					0	my $i;
254	0					0	for ($i = @$array; --$i; ) {
255	0					0	my $j = int rand ($i+1);
256	0	0				0	next if $i == $j;
257	0					0	@$array[$i,$j] = @$array[$j,$i];
258							}
259							}
260
261							sub synon_codons {
262	0			0	0	0	my %codon_set = %{&codon_sim};
	0					0
263	0					0	while( my $seq = $in->next_seq() ) {
264							##############################
265							# generate a random CDS with the same AA sequence
266							###############################
267	0					0	my $pep = $seq->translate()->seq();
268	0	0				0	if (&_internal_stop_or_x($pep)) {
269	0					0	warn "internal stop or non-standard base:\t" . $seq->id . "\tskip.\n";
270	0					0	next;
271							}
272	0					0	my @aas = split //, $pep;
273	0					0	my $sim_cds = "";
274							#print $pep, "\n";
275	0					0	for (my $i = 0; $i <= $#aas; $i++) {
276	0					0	my $cd_set = $codon_set{$aas[$i]};
277							# print $aas[$i], "\t", $cd_set->[0]->{codon}, "\n";
278	0	0				0	&fisher_yates_shuffle( $cd_set ) if scalar(@$cd_set) > 1; # permutes @array in place, for each codon
279							# next if $aas[$i] eq '*';
280							# my @sampled_cds = sample(-set => $aa_cds{$aas[$i]}); # sample 1 by default
281	0					0	my $cd_sim = $cd_set->[0];
282	0					0	$sim_cds .= $cd_sim->{codon};
283							}
284	0					0	my $sim_obj = Bio::Seq->new(-id => $seq->id() . "\|sim", -seq => $sim_cds);
285	0					0	$out->write_seq($sim_obj);
286							}
287	0					0	exit;
288							}
289
290
291							sub count_gaps_dna {
292	0			0	0	0	while( my $seqobj = $in->next_seq() ) {
293	0					0	print $seqobj->id(), "\t";
294	0					0	my ($num, $ref) = &_count_gap($seqobj->seq(), 'dna');
295	0					0	print $num, "\n";
296	0	0				0	if ($num) {
297	0					0	my @pos;
298	0					0	foreach my $mono (keys %$ref) {
299	0					0	foreach (@{$ref->{$mono}}) { push @pos, $_}
	0					0
	0					0
300							}
301							# print STDERR join "\t", sort {$a <=> $b} @pos;
302							# print STDERR "\n";
303							}
304							}
305	0					0	exit;
306							}
307
308							sub count_gaps_aa {
309	0			0	0	0	while( my $seqobj = $in->next_seq() ) {
310	0					0	print $seqobj->id(), "\t";
311	0					0	my ($num, $ref) = &_count_gap($seqobj->seq(), 'protein');
312	0					0	print $num, "\n";
313	0	0				0	if ($num) {
314	0					0	my @pos;
315	0					0	foreach my $mono (keys %$ref) {
316	0					0	foreach (@{$ref->{$mono}}) { push @pos, $_}
	0					0
	0					0
317							}
318							# print STDERR join "\t", sort {$a <=> $b} @pos;
319							# print STDERR "\n";
320							}
321							}
322	0					0	exit;
323							}
324
325							sub _count_gap {
326	0			0		0	my ($str, $type) = @_;
327	0					0	my @mono = split('', $str);
328	0					0	my %seen_gaps;
329	0					0	my $num_gaps = 0;
330	0					0	my $ct = 0;
331	0					0	foreach my $mon (@mono) {
332	0					0	$ct++;
333	0	0	0			0	next if ($type eq 'dna' && $mon =~ /[atcg]/i) or ($type eq 'protein' && $mon =~ /[ACDEFGHIKLMNPQRSTVWY]/i) or ($type eq 'protein' && $mon =~ /\\s$/);
			0
			0
			0
			0
334	0					0	$num_gaps ++;
335	0	0				0	if ($seen_gaps{$mon}) {
336	0					0	push @{$seen_gaps{$mon}}, $ct;
	0					0
337							} else {
338	0					0	$seen_gaps{$mon} = [$ct]
339							}
340							}
341							# print Dumper(\%seen_gaps) if $num_gaps;
342	0					0	return ($num_gaps, \%seen_gaps);
343							}
344
345							sub rename_id {
346	0			0	0	0	my $optStr = $opts{rename};
347	0					0	my %names;
348
349	0	0				0	if ($optStr =~ /^id:(\S+);(\S+)$/) {
350	0					0	$names{$1} = $2;
351							} else {
352	0		0			0	open NAME, "<", $optStr \|\| die "a file with old-tab-new needed\n";
353	0					0	while() {
354	0					0	chomp;
355	0					0	my ($oldN, $newN) = split;
356	0					0	$names{$oldN} = $newN;
357							}
358	0					0	close NAME;
359							}
360
361	0					0	while( my $seqobj = $in->next_seq() ) {
362	0					0	my $id = $seqobj->display_id();
363	0	0				0	if ($names{$id}) {
364							# $seqobj->id($id . "\|" . $names{$id});
365							# warn "$id appended by $names{$id}\n";
366	0					0	$seqobj->id($names{$id});
367	0					0	warn "$id replaced by $names{$id}\n";
368							} else {
369	0					0	warn "$id not changed\n";
370							}
371	0					0	$out->write_seq($seqobj);
372							}
373							}
374
375
376							sub update_longest_orf {
377	0			0	0	0	while( my $seqobj = $in->next_seq() ) {
378	0					0	my $pep_string = $seqobj->translate( undef, undef, 0 )->seq();
379	0	0				0	unless ($pep_string =~ /\*[A-Z]/) { # no internal stop; don't proceed
380	0					0	my $id = $seqobj->id();
381	0					0	$seqobj->id($id . "\|+1");
382	0					0	$seqobj = &_trim_end_to_frame($seqobj);
383	0					0	$out->write_seq($seqobj);
384							# warn $seqobj->id, ": +1 ok\n";
385	0					0	next;
386							}
387
388	0	0				0	unless ($opts{"no-revcom"}) { # do not search in revcom
389	0					0	my $pep_rev = $seqobj->revcom()->translate( undef, undef, 0 )->seq();
390	0	0				0	unless ($pep_rev =~ /\*[A-Z]/) { # no internal stop for revcom
391	0					0	my $id = $seqobj->id();
392	0					0	$seqobj->id($id . "\|-1");
393	0					0	my $rev = $seqobj->revcom();
394	0					0	$rev = &_trim_end_to_frame($rev);
395	0					0	$out->write_seq($rev);
396							# warn $seqobj->id(), ": -1 ok\n";
397	0					0	next;
398							}
399							}
400
401	0					0	my $longest = {
402							'aa_start' => 1,
403							'aa_end' => 1,
404							'aa_length' => 1,
405							'nt_start' => 1,
406							'nt_end' => 1,
407							'nt_seq' => $seqobj->seq(),
408							'frame' => 1,
409							};
410
411	0					0	foreach my $fm (1, 2, 3, -1, -2, -3 ) {
412							# warn "checking frame $fm ...\n";
413	0	0	0			0	next if $opts{"no-revcom"} && $fm < 0; # do not search in revcom
414	0	0				0	my $new_seqobj = Bio::Seq->new(
415							-id => $seqobj->id() . "\|$fm",
416							-seq => $fm > 0 ? $seqobj->subseq( $fm, $seqobj->length() ) : $seqobj->revcom()->subseq( abs($fm), $seqobj->length() )
417							); # chop seq to frame first
418
419	0					0	&_get_longest($new_seqobj, $longest, $fm);
420							# warn "longest ORF:", $longest->{aa_length}, "\n";
421
422							}
423							# warn "start codon not M/V/L:", $seqobj->id() unless substr( $longest->{nt_seq}, 0, 3 ) =~ /[atg\|gt[atcg]\|ct[atcg]\|tt[ag]/i;
424							# print ">", $seqobj->id, "\|f", $longest->{frame}, "\|longest-orf\n", $longest->{nt_seq}, "\n";
425	0	0				0	my $fid = $longest->{frame} > 0 ? "+" . $longest->{frame} : $longest->{frame};
426	0					0	my $longest_seq = Bio::Seq->new(-id => $seqobj->id . "\|" . $fid, -seq => $longest->{nt_seq} );
427	0					0	$longest_seq = &_trim_end_to_frame($longest_seq);
428	0					0	$out->write_seq($longest_seq);
429							}
430							}
431
432							sub _trim_end_to_frame {
433	0			0		0	my $seqobj = shift;
434	0					0	my $seq_len = $seqobj->length();
435	0					0	my $remainder = $seq_len % 3;
436	0					0	my $seqstr = $seqobj->subseq(1, $seq_len - $remainder);
437	0					0	$seqobj->seq($seqstr);
438	0					0	return $seqobj;
439							}
440
441							sub _get_longest { # for each frame
442	0			0		0	my ($seq, $ref, $fm) = @_;
443	0					0	my $pep_string = $seq->translate( undef, undef, 0 )->seq();
444	0	0				0	unless ($pep_string =~ /\*[A-Z]/) { # no internal stops, found the longest
445	0					0	$ref->{nt_seq} = $seq->seq();
446	0					0	$ref->{frame} = $fm;
447	0					0	$ref->{aa_length} = $seq->length()/3;
448							} else { # has internal stops
449	0	0				0	die $seq->id(), " contains ambiguous aa (X)\n" if $pep_string =~ /X/;
450	0					0	my $three_prime = $seq->length();
451
452	0					0	my $start = 1;
453	0					0	my @aas = split '', $pep_string;
454	0					0	for ( my $i = 0; $i <= $#aas; $i++ ) {
455	0	0	0			0	next unless $aas[$i] eq '*' \|\| $i == $#aas; # hit a stop codon or end of sequence
456	0	0				0	if ($i - $start + 2 > $ref->{aa_length}) { # if longer than the last longest
457	0					0	$ref->{aa_start} = $start;
458	0					0	$ref->{aa_end} = $i + 1;
459	0					0	$ref->{aa_length} = $i - $start + 2;
460	0					0	$ref->{nt_start} = 3 * ( $start - 1 ) + 1;
461	0					0	my $end = 3 * $i + 3;
462	0	0				0	$end
463							= ( $end > $three_prime )
464							? $three_prime
465							: $end; # guranteed not to go beyond 3'
466	0					0	$ref->{nt_end} = $end;
467	0					0	$ref->{frame} = $fm;
468	0					0	$ref->{nt_seq} = $seq->subseq( 3 * ( $start - 1 ) + 1, $end );
469							}
470	0					0	$start = $i + 2; # re-start
471							}
472							}
473							}
474
475
476							sub iso_electric_point {
477	0			0	0	0	my $calc = Bio::Tools::pICalculator->new(-places => 2, -pKset => 'EMBOSS');
478	0					0	while ( my $seq = $in->next_seq ) {
479	0					0	$calc->seq($seq);
480	0					0	print $seq->id(), "\t", $calc->iep;
481	0					0	for(my $i = 0; $i <= 14; $i += 0.5 ){
482	0					0	print "\t", $i, "\|", sprintf("%.2f", $calc->charge_at_pH($i));
483							}
484	0					0	print "\n";
485							}
486							}
487
488							sub print_weight {
489	0			0	0	0	while ($seq = $in->next_seq()) {
490	0					0	my $ref_weight = Bio::Tools::SeqStats->get_mol_wt($seq);
491	0					0	print join "\t", $seq->id(), $ref_weight->[0], $ref_weight->[1];
492	0					0	print "\n";
493							}
494							}
495
496							sub codon_info {
497	0		0	0	0	0	my $cutg_file = $opts{'codon-info'} \|\| "need a codon usage file in CUTG GCG format";
498	0					0	my $io = Bio::CodonUsage::IO->new(-file => $cutg_file);
499	0					0	my $cdtable = $io->next_data();
500	0					0	my @bases = qw(A T C G);
501	0					0	my @codons;
502	0					0	for (my $i=0; $i<=3; $i++) {
503	0					0	my $first = $bases[$i];
504	0					0	for (my $j=0; $j<=3; $j++) {
505	0					0	my $second = $bases[$j];
506	0					0	for (my $k=0; $k<=3; $k++) {
507	0					0	my $third = $bases[$k];
508	0					0	push @codons, $first . $second . $third;
509							}
510							}
511							}
512
513	0					0	my $myCodonTable = Bio::Tools::CodonTable->new( -id => 1 );
514							#print Dumper($cdtable->all_aa_frequencies);
515
516	0					0	my (@cd_cts, %aas);
517	0					0	foreach my $cd (@codons) {
518	0					0	my $aa = $myCodonTable->translate($cd);
519	0					0	$aas{$aa}++;
520	0					0	push @cd_cts, {codon => $cd, aa => $aa, cts => $cdtable->codon_count($cd)};
521							}
522
523	0					0	my $h_genome = 0;
524	0					0	foreach my $aa (keys %aas) {
525	0					0	my @cds = grep {$_->{aa} eq $aa} @cd_cts;
	0					0
526	0					0	$h_genome += &__cd_entropy(\@cds);
527							}
528
529							=begin
530							use Bio::Tools::CodonOptTable;
531							while (my $seq = $in->next_seq()) {
532							my $seqobj = Bio::Tools::CodonOptTable->new(
533							-seq => $seq->seq(),
534							-genetic_code => 1,
535							-alphabet => 'dna',
536							-is_circular => 0,
537							-id => $seq->id(),
538							);
539							my $myCodons = $seqobj->rscu_rac_table();
540							my %oneLetterAA;
541							my @cdCTs;
542							my $numCodons = 0;
543							foreach my $rec (@$myCodons) {
544							$numCodons += $rec->{frequency};
545							my $codon = $rec->{codon};
546							my $aa = $myCodonTable->translate($codon);
547							$oneLetterAA{$aa}++;
548							push @cdCTs, {codon => $codon, aa => $aa, cts => $rec->{frequency}};
549							}
550							=cut
551	0					0	while (my $seq = $in->next_seq()) {
552	0	0				0	if (&_internal_stop_or_x($seq->translate()->seq())) {
553	0					0	warn "internal stop or non-standard base:\t" . $seq->id . "\tskip.\n";
554	0					0	next;
555							}
556	0					0	my %oneLetterAA;
557							my %codons;
558	0					0	my @cdCTs;
559	0					0	my $numCodons = 0;
560	0					0	for (my $i=0; $i<=$seq->length()-3; $i+=3) {
561	0					0	$codons{substr($seq->seq(), $i, 3)}++;
562	0					0	$numCodons++;
563							}
564
565	0					0	foreach my $cd (keys %codons) {
566	0					0	my $aa = $myCodonTable->translate($cd);
567	0					0	$oneLetterAA{$aa}++;
568	0					0	push @cdCTs, {codon => $cd, aa => $aa, cts => $codons{$cd}};
569							}
570
571	0					0	my $h_cds = 0;
572	0					0	foreach my $aa (keys %oneLetterAA) {
573	0					0	my @cds = grep {$_->{aa} eq $aa} @cdCTs;
	0					0
574	0					0	$h_cds += &__cd_entropy(\@cds);
575							}
576	0					0	print $seq->id, "\t", $seq->length(), "\t", $numCodons, "\t";
577	0					0	printf "%.6f\n", $h_genome - $h_cds;
578							}
579							}
580
581							sub __cd_entropy {
582	0			0		0	my $ref = shift;
583	0					0	my @cd_obj = @$ref;
584	0	0				0	return 0 if @cd_obj <= 1; # single codons (M & W)
585	0					0	my $sum = 0;
586	0					0	my $h = 0;
587	0					0	foreach (@cd_obj) {
588	0					0	$sum += $_->{cts};
589							}
590	0	0				0	return 0 unless $sum > 0;
591	0					0	foreach (@cd_obj) {
592	0					0	$_->{rel_freq} = $_->{cts}/$sum;
593							}
594
595	0					0	foreach (@cd_obj) {
596	0	0				0	next unless $_->{rel_freq} > 0;
597	0					0	$h -= $_->{rel_freq} * log($_->{rel_freq})/log(2);
598							}
599	0					0	return $h;
600							}
601
602							sub sort_by {
603	0			0	0	0	my $match = $opts{'sort'};
604	0	0				0	$match =~ /length\|id\|file/ \|\| die "Enter 'length', 'id', or 'file:filename' to sort.\n";
605	0					0	my @seqs;
606
607	0					0	while (my $seq = $in->next_seq()) {
608	0					0	push @seqs, {id => $seq->display_id, len => $seq->length, seqob => $seq};
609							}
610
611	0	0				0	if ($match eq 'id'){
612	0					0	@seqs = sort {$a->{id} cmp $b->{id}} @seqs;
	0					0
613							}
614
615	0	0				0	if ($match eq 'length'){
616	0					0	@seqs = sort {$a->{len} <=> $b->{len}} @seqs;
	0					0
617							}
618
619	0	0				0	if ($match =~ /file:(\S+)/){
620	0					0	my $filename = $1;
621	0		0			0	open FL, "<", $filename \|\| die "file not found: $filename\n";
622	0					0	my %order;
623	0					0	my $ct = 1;
624	0					0	while() {
625	0					0	chomp;
626	0	0				0	next unless /(\S+)/;
627	0					0	$order{$1} = $ct++;
628							}
629	0					0	close FL;
630	0					0	foreach (@seqs) {
631	0	0				0	die "id not in order file: ", $_->{id}, "\n" unless $order{ $_->{id} };
632							}
633	0					0	@seqs = sort { $order{ $a->{id} } <=> $order{ $b->{id} } } @seqs;
	0					0
634							}
635
636	0					0	foreach (@seqs) {
637	0					0	$out->write_seq($_->{seqob});
638							}
639							}
640
641							sub split_cdhit {
642	0			0	0	0	my $cls_file = $opts{'split-cdhit'};
643	0		0			0	open IN, "<" . $cls_file \|\| die "cdhit clstr file not found: $cls_file\n";
644	0					0	my %clusters;
645							my $cl_id;
646	0					0	my @mem;
647	0					0	while () {
648	0					0	my $line = $_;
649	0					0	chomp $line;
650	0	0				0	if ($line =~ /^>(\S+)\s+(\d+)/) {
651	0					0	$cl_id = $1 . "_" . $2;
652	0					0	my @mem = ();
653	0					0	$clusters{$cl_id} = \@mem;
654							} else {
655	0					0	my $ref = $clusters{$cl_id};
656	0					0	my @mems = @$ref;
657	0					0	my @els = split /\s+/, $line;
658	0					0	my $seq_id = $els[2];
659	0					0	$seq_id =~ s/>//;
660	0					0	$seq_id =~ s/\.\.\.$//;
661	0					0	push @mems, $seq_id;
662	0					0	$clusters{$cl_id} = \@mems;
663							}
664							}
665							# print Dumper(\%clusters);
666	0					0	my %seqs;
667	0					0	while ($seq = $in->next_seq()) {
668	0					0	my $id = $seq->display_id();
669	0					0	$seqs{$id} = $seq;
670							}
671
672	0					0	foreach my $id (keys %clusters) {
673	0					0	my $out = Bio::SeqIO->new( -file => ">" . $filename . "-". $id . ".fas", -format => 'fasta');
674	0					0	my @seqids = @{ $clusters{$id} };
	0					0
675	0					0	foreach my $seq_id (@seqids) {
676	0					0	$out->write_seq($seqs{$seq_id});
677							}
678							}
679	0					0	exit;
680							}
681
682
683							sub print_composition {
684	1			1	0	6	while ($seq = $in->next_seq()) {
685	7					1768	my $hash_ref = Bio::Tools::SeqStats->count_monomers($seq);
686	7					1793	my $count;
687	7					39	$count += $hash_ref->{$_} foreach keys %$hash_ref;
688	7					31	print $seq->id();
689	7					169	foreach (sort keys %$hash_ref) {
690	28					69	print "\t", $_, ":", $hash_ref->{$_}, "(";
691	28					134	printf "%.2f", $hash_ref->{$_}/$count*100;
692	28					56	print "%)"
693							}
694	7					42	print "\n"
695							}
696							}
697
698							# This sub calls all the del/pick subs above. Any option to filter input sequences by some criterion goes through here, and the appropriate filter subroutine is called.
699							sub filter_seqs {
700	4			4	0	8	my $action = shift;
701	4					11	my $match = $opts{$action};
702
703							# matching to stop at 1st ':' so that ids using ':' as field delimiters are handled properly
704	4	50				31	$match =~ /^([^:]+):(\S+)$/ \|\| die "Bad search format. Expecting a pattern of the form: tag:value.\n";
705
706	4					18	my ($tag, $value) = ($1, $2);
707	4					25	my @selected = split(/,/, $value);
708	4					13	my $callsub = "find_by_" . "$tag";
709
710	4	50				18	die "Bad tag or function not implemented. Tag was: $tag\n" if !defined($filter_dispatch{$callsub});
711
712	4	50				14	if ($tag eq 'order') {
		0
		0
713	4					8	my $ct = 0;
714	4					17	my %order_list = parse_orders(\@selected); # Parse selected orders and create a hash
715	4					19	while (my $currseq = $in->next_seq) { $ct++; $filter_dispatch{$callsub}->($action, $ct, $currseq, \%order_list) }
	28					7246
	28					110
716	4	50				420	foreach (keys %order_list) { print STDERR "No matches found for order number $_\n" if $_ > $ct }
	7					56
717							}
718							elsif ($tag eq 'id') {
719	0					0	my %id_list = map { $_ => 1 } @selected; # create a hash from @selected
	0					0
720	0					0	while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $match, $currseq, \%id_list) }
	0					0
721	0	0				0	foreach (keys %id_list) { warn "No matches found for '$_'\n" if $id_list{$_} == 1 }
	0					0
722							}
723							elsif ($tag eq 'file') {
724	0		0			0	open LIST, "<", $value \|\| die "can't find file $value\n";
725	0					0	my %list;
726	0					0	while() { chomp; $list{$_}++ }
	0					0
	0					0
727	0					0	close LIST;
728	0					0	while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $match, $currseq, \%list) }
	0					0
729	0	0				0	foreach (keys %list) { warn "No matches found for '$_'\n" if $list{$_} == 1 }
	0					0
730							} else {
731	0					0	while (my $currseq = $in->next_seq) { $filter_dispatch{$callsub}->($action, $currseq, $value) }
	0					0
732							}
733							}
734
735							=head2 retrieve_seqs()
736
737							Retrieves a sequence from RefSeq using the provided accession
738							number. A wrapper for CE#get_Seq_by_acc>.
739
740							=cut
741
742							# To do: add fetch by gi
743							sub retrieve_seqs {
744	0			0	1	0	my $gb = Bio::DB::RefSeq->new();
745	0					0	my $seq = $gb->get_Seq_by_acc($opts{'fetch'}); # Retrieve sequence with Accession Number
746	0					0	$out->write_seq($seq)
747							}
748
749							=head2 remove_gaps()
750
751							Remove gaps
752
753							=cut
754
755							sub remove_gaps {
756	1			1	1	6	while ($seq = $in->next_seq()) {
757	7					3616	my $string = $seq->seq();
758	7					125	$string =~ s/-//g;
759	7					14	$string =~ s/\.//g;
760	7					20	my $new_seq = Bio::Seq->new(-id => $seq->id(), -seq => $string);
761	7					2346	$out->write_seq($new_seq)
762							}
763							}
764
765							=head2 print_lengths()
766
767							Print all sequence lengths. Wraps
768							Llength\|https://metacpan.org/pod/Bio::Seq#length>.
769
770							=cut
771
772							sub print_lengths {
773	1			1	1	3	while ($seq = $in->next_seq()) { print $seq->id(), "\t", $seq->length(), "\n" }
	7					1021
774							}
775
776
777							=head2 print_seq_count()
778
779							Print all sequence lengths. Wraps
780							Llength\|https://metacpan.org/pod/Bio::Seq#length>.
781
782							=cut
783
784							sub print_seq_count {
785	1			1	1	2	my $count;
786	1					4	while ($seq = $in->next_seq()) { $count++ }
	7					923
787	1					46	print $count, "\n"
788							}
789
790							=head2 make_revcom()
791							Reverse complement. Wraps
792							Lrevcom()\|https://metacpan.org/pod/Bio::Seq#revcom>.
793
794							=cut
795
796
797							sub make_revcom { # reverse-complement a sequence
798	1			1	1	27	while ($seq = $in->next_seq()) {
799	7					2353	my $new = Bio::Seq->new(-id => $seq->id() . ":revcom", -seq => $seq->revcom()->seq());
800	7					2576	$out->write_seq($new)
801							}
802							}
803
804							=head2 print_subseq()
805
806							Select substring (of the 1st sequence). Wraps
807							Lsubseq()\|https://metacpan.org/pod/Bio::Seq#subseq>.
808
809							=cut
810
811
812							sub print_subseq {
813	1			1	1	22	while ($seq = $in->next_seq()) {
814	7					3546	my $id = $seq->id();
815	7					186	my ($start, $end) = split /\s,\s/, $opts{"subseq"};
816	7	50				29	$end = $seq->length() if $end eq '-'; # allow shorthand: -s'2,-'
817	7	50				22	die "end out of bound: $id\n" if $end > $seq->length();
818	7					119	my $new = Bio::Seq->new(-id => $seq->id() . ":$start-$end", -seq => $seq->subseq($start, $end));
819	7					2903	$out->write_seq($new)
820							}
821							}
822
823							sub _internal_stop_or_x {
824	7			7		2459	my $str = shift;
825	7					12	$str =~ s/\*$//; # remove last stop
826	7					10	my @internalStops;
827							my @nonStandardAA;
828	7					19	while ($str =~ m/\/g) { # internal stop # previously missed double *
829	14					24	push @internalStops, pos($str);
830							}
831
832	7					13	while ($str =~ m/X/g) { # non-standard AA
833	0					0	push @nonStandardAA, pos($str);
834							}
835
836	7	50				11	if (@nonStandardAA) {
837	0					0	warn "Presence of X at postions:\t", join(";", @nonStandardAA), "\n";
838							}
839
840	7	50				11	if (@internalStops) {
841	7					217	warn "Presence of internal stops at postions:\t", join(";", @internalStops), "\n";
842							}
843
844	7	50				32	return @internalStops ? 1 : 0;
845							}
846
847							=head2 reading_frame_ops
848
849							Translate in 1, 3, or 6 frames based on the value of C<$opts> set via
850							L\|/initialize>. Wraps
851							Ltranslate()\|https://metacpan.org/pod/Bio::Seq#translate>,
852							Ltranslate_3frames()\|https://metacpan.org/pod/Bio::SeqUtils#translate_3frames>,
853							and
854							Ltranslate_6frames()\|https://metacpan.org/pod/Bio::SeqUtils#translate_6frames>.
855
856							=cut
857
858
859							sub reading_frame_ops {
860	1			1	1	3	my $frame = $opts{"translate"};
861	1					3	while ($seq = $in->next_seq()) {
862	7	50				1250	if ($frame == 1) {
		0
		0
863	7	50				23	if (&_internal_stop_or_x($seq->translate()->seq())) {
864	7					15	warn "internal stop:\t" . $seq->id . "\tskip.\n"
865							} else {
866	0					0	$out->write_seq($seq->translate())
867							}
868							}
869							elsif ($frame == 3) {
870	0					0	my @prots = Bio::SeqUtils->translate_3frames($seq);
871	0					0	foreach (@prots) {
872	0					0	my $id = $_->id();
873	0					0	$id =~ /^(\S+)-(\d)F$/;
874	0					0	my ($oriId, $fm) = ($1, $2);
875	0					0	$_->id($oriId . "\|+" . ($fm+1));
876	0					0	$out->write_seq($_);
877							}
878							} elsif ($frame == 6) {
879	0					0	my @prots = Bio::SeqUtils->translate_6frames($seq);
880	0					0	foreach (@prots) {
881	0					0	my $id = $_->id();
882	0					0	$id =~ /^(\S+)-(\d)([RF])$/;
883	0					0	my ($oriId, $fm, $dir) = ($1, $2, $3);
884	0	0				0	if ($dir eq 'F') {
885	0					0	$_->id($oriId . "\|+" . ($fm+1));
886							} else {
887	0					0	$_->id($oriId . "\|-" . ($fm+1));
888							}
889	0					0	$out->write_seq($_);
890							}
891	0					0	} else { warn "Accepted frame arguments: 1, 3, and 6\n"}
892							}
893							}
894
895							=head2 restrict_coord()
896
897							Note: This function is currently DEPRECATED.
898
899							Finds digestion coordinates by a specified restriction enzyme
900							specified in C<$opts{restrinct}> set via L\|/initialize>.
901
902							An input file with sequences is expected. Wraps
903							Lcut()\|https://github.com/bioperl/Bio-Restriction>.
904
905							Outputs coordinates of overhangs in BED format.
906
907
908							sub restrict_coord {
909							use Bio::Restriction::Analysis;
910							use Bio::Restriction::EnzymeCollection;
911
912							my $enz = $opts{"restrict-coord"};
913							my $re = Bio::Restriction::EnzymeCollection->new()->get_enzyme($enz);
914							my $len = length($re->overhang_seq());
915
916							while ( $seq = $in->next_seq() ) {
917							my $seq_str = $seq->seq();
918							die "Not a DNA sequence\n" unless $seq_str =~ /^[ATCGRYSWKMBDHVN]+$/i;
919							my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
920							foreach my $pos ($ra->positions($enz)) {
921							print $seq->id()."\t".($pos-$len)."\t".$pos."\n";
922							}
923							}
924							}
925
926							=head2 restrict_digest()
927
928							Note: This function is currently DEPRECATED.
929
930							Predicted fragments from digestion by a specified restriction enzyme
931							specified in C<$opts{restrinct}> set via L\|/initialize>.
932
933							An input file with sequences is expected. Wraps
934							Lcut()\|https://metacpan.org/pod/Bio::Restriction::Analysis#cut>.
935
936
937							sub restrict_digest {
938							my $enz = $opts{"restrict"};
939							use Bio::Restriction::Analysis;
940							while ( $seq = $in->next_seq() ) {
941							my $seq_str = $seq->seq();
942							die "Not a DNA sequence\n" unless $seq_str =~ /^[ATCGRYSWKMBDHVN]+$/i;
943							my $ra = Bio::Restriction::Analysis->new(-seq=>$seq);
944							foreach my $frag ($ra->fragment_maps($enz)) {
945							my $seq_obj = Bio::Seq->new(
946							-id=>$seq->id().'\|'.$frag->{start}.'-'.$frag->{end}.'\|'.($frag->{end}-$frag->{start}+1),
947							-seq=>$frag->{seq});
948							$out->write_seq($seq_obj)
949							}
950							}
951							}
952
953							=cut
954
955							=head2 anonymize()
956
957							Replace sequence IDs with serial IDs I characters long, as specified in
958							C<$opts{'anonymize'}> set via L\|/initialize>.
959							For example if C<$opts{'anonymize'}>, the first ID will be C.
960							leading 'S' The length of the serial idea
961
962							A sed script file is produced with a F<.sed> suffix that may be used
963							with sed's C<'-f'> argument. If the filename is F<'-'>, the sed file
964							is named C instead. A message containing the sed filename is
965							written to C.
966
967							=cut
968
969							sub anonymize {
970	1		50	1	1	5	my $char_len = $opts{"anonymize"} // die "Tried to use option 'prefix' without using option 'anonymize'. Exiting...\n";
971	1	50				3	my $prefix = (defined($opts{"prefix"})) ? $opts{"prefix"} : "S";
972
973	1	50				3	pod2usage(1) if $char_len < 1;
974
975	1					2	my $ct = 1;
976	1					1	my %serial_name;
977	1					2	my $length_warn = 0;
978	1					4	while ($seq = $in->next_seq()) {
979	7					1126	my $serial = $prefix . sprintf "%0" . ($char_len - length($prefix)) . "s", $ct;
980	7	50				11	$length_warn = 1 if length($serial) > $char_len;
981	7					17	$serial_name{$serial} = $seq->id();
982	7					93	$seq->id($serial);
983	7					72	$out->write_seq($seq);
984	7					1197	$ct++
985							}
986
987	1					73	_make_sed_file($filename, %serial_name);
988	1					23	warn "Anonymization map:\n";
989	1					6	while (my ($k, $v) = each %serial_name) { warn "$k => $v\n" }
	7					62
990
991	1	50				7	warn "WARNING: Anonymized ID length exceeded requested length: try a different length or prefix.\n" if $length_warn
992							}
993
994							=head2 shred_seq()
995
996							Break into individual sequences writing a FASTA file for each sequence.
997
998							=cut
999
1000
1001							sub shred_seq {
1002	1			1	1	3	while ($seq = $in->next_seq()) {
1003	7					2668	my $newid = $seq->id();
1004	7					112	$newid =~ s/[\s\\|]/_/g;
1005	7					22	print $newid, "\n";
1006	7		50			16	my $suffix = $out_format \|\| "fas";
1007	7					26	my $newout = Bio::SeqIO->new(-format => $out_format, -file => ">" . $newid . ".$suffix");
1008	7					4314	$newout->write_seq($seq)
1009							}
1010							exit
1011	1					347	}
1012
1013							=head2 count_codons()
1014
1015							Count codons for coding sequences (e.g., a genome file consisting of
1016							CDS sequences). Wraps
1017							Lcount_codons()\|https://metacpan.org/pod/Bio::Tools::SeqStats#count_codons>.
1018
1019							=cut
1020
1021							sub codon_table {
1022	0			0	0	0	my $string = $opts{'codon-table'};
1023							# die "bioseq --codon-table atg\|M\n" unless $string;
1024	0					0	my $myCodonTable = Bio::Tools::CodonTable->new();
1025	19			19		197	use Bio::Tools::IUPAC;
	19					35
	19					57645
1026	0					0	my $iupac = Bio::Tools::IUPAC->new();
1027	0					0	my %aas = $iupac->iupac_iup(); #print Dumper(\%aas); # T=>T; Z=>E/Q; etc
1028							# my %nts = $iupac->iupac(); #print Dumper(\%nts); # M=>A/C; A=>A; etc
1029	0	0				0	if ($string =~ /^[atcg]{3}$/i) {
		0
1030	0					0	print $string, "\t", $myCodonTable->translate($string), "\n";
1031							} elsif (&_in_list($string, [keys %aas])) {
1032	0					0	print $string, "\t", join "\t", $myCodonTable->revtranslate($string), "\n";
1033							} else {
1034	0					0	print $string, "Currently only takes a 3-letter DNA-base codon or a 1-letter uppercase IUPAC aa code";
1035							}
1036							}
1037
1038							sub count_codons {
1039	0			0	1	0	my $new_seq;
1040	0					0	my $myCodonTable = Bio::Tools::CodonTable->new();
1041	0					0	while ($seq = $in->next_seq()) { $new_seq .= $seq->seq() }
	0					0
1042	0					0	my $hash_ref = Bio::Tools::SeqStats->count_codons(Bio::Seq->new(-seq=>$new_seq, -id=>'concat'));
1043	0					0	my $count;
1044	0					0	$count += $hash_ref->{$_} foreach keys %$hash_ref;
1045	0					0	foreach (sort keys %$hash_ref) {
1046	0					0	print $_, ":\t", $myCodonTable->translate($_), "\t", $hash_ref->{$_}, "\t";
1047	0					0	printf "%.2f", $hash_ref->{$_}/$count*100;
1048	0					0	print "%\n"
1049							}
1050							}
1051
1052							=head2 print_gb_gene_feats()
1053
1054							print gene sequences in FASTA from a GenBank file of bacterial
1055							genome. Won't work for a eukaryote genbank file.
1056
1057							=cut
1058
1059							sub print_gb_gene_feats { # works only for prokaryote genome
1060	1			1	1	6	$seq = $in->next_seq();
1061	1	50				104468	die "$filename: Not a GenBank file. Quit\n" unless $in_format eq 'genbank';
1062	1					3	my $gene_count = 0;
1063	1					12	foreach my $feat ($seq->get_SeqFeatures()) {
1064	66	100				19780	if ($feat->primary_tag eq 'CDS') {
1065	28					252	my $location = $feat->location();
1066	28	50				542	next if $location->isa('Bio::Location::Split');
1067	28					69	my $gene_tag = "gene_" . $gene_count++;
1068	28					49	my $gene_symbol = 'na';
1069	28					46	my $product = 'na';
1070	28					84	foreach my $tag ($feat->get_all_tags()) {
1071	181	100				770	($gene_tag) = $feat->get_tag_values($tag) if $tag eq 'locus_tag';
1072	181	100				614	($gene_symbol) = $feat->get_tag_values($tag) if $tag eq 'gene';
1073	181	100				364	($product) = $feat->get_tag_values($tag) if $tag eq 'product';
1074	181					757	$product =~ s/\s/_/g;
1075							}
1076	28					139	my $gene = Bio::Seq->new(-id => (join "\|", ($gene_tag, $feat->start, $feat->end, $feat->strand, $gene_symbol, $product)),
1077							-seq=>$seq->subseq($feat->start, $feat->end));
1078	28	100				14526	if ($feat->strand() > 0) { $out->write_seq($gene) } else { $out->write_seq($gene->revcom())}
	14					314
	14					300
1079							# print join "\t",
1080							# ($feat->gff_string, $feat->start, $feat->end,
1081							# $feat->strand);
1082							# print "\n";
1083							}
1084							}
1085							}
1086
1087							=head2 count_leading_gaps()
1088
1089							Count and print the number of leading gaps in each sequence.
1090
1091							=cut
1092
1093							sub count_leading_gaps {
1094	0			0	1	0	while ($seq = $in->next_seq()) {
1095	0					0	my $lead_gap = 0;
1096	0					0	my $see_aa = 0; # status variable
1097	0					0	my @mono = split //, $seq->seq();
1098	0					0	for (my $i = 0; $i < $seq->length(); $i++) {
1099	0	0				0	$see_aa = 1 if $mono[$i] ne '-';
1100	0	0	0			0	$lead_gap++ if !$see_aa && $mono[$i] eq '-'
1101							}
1102	0					0	print $seq->id(), "\t", $lead_gap, "\n"
1103							}
1104							}
1105
1106							=head2 hydroB()
1107
1108							Return the mean Kyte-Doolittle hydropathicity for protein
1109							sequences. Wraps
1110							Lhydropathicity()\|https://metacpan.org/pod/Bio::Tools::SeqStats#hydropathicity>.
1111
1112							=cut
1113
1114
1115							sub hydroB {
1116	1			1	1	3	while ($seq = $in->next_seq()) {
1117	24					4051	my $pep_str = $seq->seq();
1118	24					308	$pep_str =~ s/\*//g;
1119	24					51	$seq->seq($pep_str);
1120	24					1444	my $gravy = Bio::Tools::SeqStats->hydropathicity($seq);
1121	24					17496	printf "%s\t%.4f\n", $seq->id(), $gravy
1122							}
1123							}
1124
1125							=head2 linearize()
1126
1127							Linearize FASTA, print one sequence per line.
1128
1129							=cut
1130
1131
1132							sub linearize {
1133	1			1	1	6	while ($seq = $in->next_seq()) { print $seq->id(), "\t", $seq->seq(), "\n" }
	7					1818
1134							}
1135
1136							=head2 reloop_at()
1137
1138							Re-circularize a bacterial genome by starting at a specified position
1139							given in the C<$opts{"reloop"> set via
1140							L\|/initialize>.
1141
1142							For example for sequence "ABCDE". C
1143							would generate"'BCDEA".
1144
1145							=cut
1146							sub reloop_at {
1147	1			1	1	5	my $seq = $in->next_seq; # only the first sequence
1148	1					408	my $break = $opts{"reloop"};
1149	1					9	my $new_seq = Bio::Seq->new(-id => $seq->id().":relooped_at_".$break, -seq => $seq->subseq($break, $seq->length()) . $seq->subseq(1, $break-1));
1150	1					551	$out->write_seq($new_seq)
1151							}
1152
1153							=head2 remove_stop()
1154
1155							Remove stop codons.
1156
1157							=cut
1158
1159							sub remove_stop {
1160	1			1	1	11	my $myCodonTable = Bio::Tools::CodonTable->new();
1161	1					38	while ($seq = $in->next_seq()) {
1162	7					2282	my $newstr = "";
1163	7					19	for (my $i = 1; $i <= $seq->length() / 3; $i++) {
1164	206					1816	my $codon = $seq->subseq(3 * ($i - 1) + 1, 3 * ($i - 1) + 3);
1165	206	100				4932	if ($myCodonTable->is_ter_codon($codon)) { warn "Found and removed stop codon\n"; next }
	14					706
	14					43
1166	192					5751	$newstr .= $codon
1167							}
1168	7					69	my $new = Bio::Seq->new(-id => $seq->id(), -seq => $newstr);
1169	7					1490	$out->write_seq($new)
1170							}
1171							}
1172
1173
1174							####################### internal subroutine ###########################
1175
1176							sub _in_list {
1177	0			0		0	my $scalar = shift;
1178	0					0	my $ref_to_array = shift;
1179	0					0	foreach (@{$ref_to_array}) {
	0					0
1180	0	0				0	return 1 if $_ eq $scalar;
1181							}
1182	0					0	return 0;
1183							}
1184
1185							sub parse_orders {
1186	4			4	0	7	my @selected = @{shift()};
	4					14
1187
1188	4					8	my @orders;
1189							# Parse if $value contains ranges: allows mixing ranges and lists
1190	4					11	foreach my $val (@selected) {
1191	5	100				76	if ($val =~ /^(\d+)-(\d+)$/) { # A numerical range
1192	1					3	my ($first, $last) = ($1, $2);
1193	1	50				4	die "Invalid seq range: $first, $last\n" unless $last > $first;
1194	1					3	push @orders, ($first .. $last)
1195	4					12	} else { push @orders, $val } # Single value
1196							}
1197	4					11	return map { $_ => 1 } @orders
	7					28
1198							}
1199
1200							sub _make_sed_file {
1201	1			1		4	my $filename = shift @_;
1202	1					4	my (%serial_names) = @_;
1203
1204	1	50				4	$filename = "STDOUT" if $filename eq '-';
1205
1206	1					51	my $sedfile = basename($filename) . ".sed";
1207	1	50				200	open(my $sedout, ">", $sedfile) or die $!;
1208
1209	1					16	print $sedout "# usage: sed -f $filename.sed \n";
1210
1211	1					5	foreach (keys %serial_names) {
1212	7					9	my $real_name = $serial_names{$_};
1213	7					10	my $sed_cmd = "s/$_/" . $real_name . "/g;\n";
1214	7					11	print $sedout $sed_cmd
1215							}
1216	1					42	close $sedout;
1217
1218	1					60	print STDERR "\nCreated $filename.sed\tusage: sed -f $filename.sed \n\n"
1219							}
1220
1221							################### pick/delete filters #####################
1222
1223							sub find_by_file {
1224	0			0	0	0	my ($action, $match, $currseq, $id_list) = @_;
1225	0					0	my $seq_id = $currseq->id();
1226	0					0	$filter_dispatch{$action . "_by_id"}->($match, $currseq, $id_list, $seq_id)
1227							}
1228
1229							sub pick_by_file {
1230	0			0	0	0	my ($match, $currseq, $id_list, $seq_id) = @_;
1231	0	0				0	if ($id_list->{$seq_id}) {
1232	0					0	$id_list->{$seq_id}++;
1233	0	0				0	die "Multiple matches (" . $seq_id . ") for $match found\n" if $id_list->{$seq_id} > 2;
1234	0					0	$out->write_seq($currseq)
1235							}
1236							}
1237
1238							sub del_by_file {
1239	0			0	0	0	my ($match, $currseq, $id_list, $seq_id) = @_;
1240	0	0				0	if ($id_list->{$seq_id}) {
1241	0					0	$id_list->{$seq_id}++;
1242	0					0	warn "Deleted sequence: ", $currseq->id(), "\n"
1243	0					0	} else { $out->write_seq($currseq) }
1244							}
1245
1246
1247							sub find_by_order {
1248	28			28	0	76	my ($action, $ct, $currseq, $order_list) = @_; # say join "\t", @_;
1249	28					72	$filter_dispatch{ $action . "_by_order" }->($ct, $currseq, $order_list)
1250							}
1251
1252							sub pick_by_order {
1253	21			21	0	28	my ($ct, $currseq, $order_list) = @_;
1254	21	100				96	$out->write_seq($currseq) if $order_list->{$ct}
1255							}
1256
1257							sub del_by_order {
1258	7			7	0	18	my ($ct, $currseq, $order_list) = @_; # say join "\t", @_;
1259	7	100				27	if ($order_list->{$ct}) { warn "Deleted sequence: ", $currseq->id(), "\n" }
	1					21
1260	6					30	else { $out->write_seq($currseq) }
1261							}
1262
1263							sub find_by_id {
1264	0			0	0		my ($action, $match, $currseq, $id_list) = @_;
1265	0						my $seq_id = $currseq->id();
1266	0						$filter_dispatch{$action . "_by_id"}->($match, $currseq, $id_list, $seq_id)
1267							}
1268
1269							sub pick_by_id {
1270	0			0	0		my ($match, $currseq, $id_list, $seq_id) = @_;
1271
1272	0	0					if ($id_list->{$seq_id}) {
1273	0						$id_list->{$seq_id}++;
1274	0	0					die "Multiple matches (" . $seq_id . ":" . $id_list->{$seq_id} - 1 . ") for $match found\n" if $id_list->{$seq_id} > 2;
1275	0						$out->write_seq($currseq)
1276							}
1277							}
1278
1279							sub del_by_id {
1280	0			0	0		my ($match, $currseq, $id_list, $seq_id) = @_;
1281
1282	0	0					if ($id_list->{$seq_id}) {
1283	0						$id_list->{$seq_id}++;
1284	0						warn "Deleted sequence: ", $currseq->id(), "\n"
1285	0						} else { $out->write_seq($currseq) }
1286							}
1287
1288							sub find_by_re {
1289	0			0	0		my ($action, $currseq, $value) = @_;
1290	0						my $regex = qr/$value/;
1291	0						my $seq_id = $currseq->id();
1292	0						$filter_dispatch{ $action . "_by_re" }->($currseq, $regex, $seq_id)
1293							}
1294
1295							sub pick_by_re {
1296	0			0	0		my ($currseq, $regex, $seq_id) = @_;
1297	0	0					$out->write_seq($currseq) if $seq_id =~ /$regex/
1298							}
1299
1300							sub del_by_re {
1301	0			0	0		my ($currseq, $regex, $seq_id) = @_;
1302
1303	0	0					if ($seq_id =~ /$regex/) { warn "Deleted sequence: $seq_id\n" }
	0
1304	0						else { $out->write_seq($currseq) }
1305							}
1306
1307							# TODO This needs better documentation
1308							sub find_by_ambig {
1309	0			0	0		my ($action, $currseq, $cutoff) = @_;
1310	0						my $string = $currseq->seq();
1311	0						my $ct = ($string =~ s/([^ATCG])/$1/gi); # won't work for AA seqs
1312	0						my $percent_ambig = $ct / $currseq->length();
1313	0						$filter_dispatch{"$action" . "_by_ambig"}->($currseq, $cutoff, $ct, $percent_ambig)
1314							}
1315
1316							# TODO Probably better to change behavior when 'picking'?
1317							sub pick_by_ambig {
1318	0			0	0		my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
1319	0	0					$out->write_seq($currseq) if $percent_ambig > $cutoff
1320							}
1321
1322							sub del_by_ambig {
1323	0			0	0		my ($currseq, $cutoff, $ct, $percent_ambig) = @_;
1324
1325							# if ($percent_ambig > $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of N: ", $ct, "\n" }
1326	0	0					if ($ct >= $cutoff) { warn "Deleted sequence: ", $currseq->id(), " number of bad monomers: ", $ct, "\n" }
	0
1327	0						else { $out->write_seq($currseq) }
1328							}
1329
1330							sub find_by_length {
1331	0			0	0		my ($action, $currseq, $value) = @_;
1332	0						$filter_dispatch{$action . "_by_length"}->($currseq, $value)
1333							}
1334
1335							sub pick_by_length {
1336	0			0	0		my ($currseq, $value) = @_;
1337	0	0					$out->write_seq($currseq) if $currseq->length() <= $value
1338							}
1339
1340							sub del_by_length {
1341	0			0	0		my ($currseq, $value) = @_;
1342
1343	0	0					if ($currseq->length() <= $value) { warn "Deleted sequence: ", $currseq->id(), " length: ", $currseq->length(), "\n" }
	0
1344	0						else { $out->write_seq($currseq) }
1345							}
1346
1347							1;
1348							__END__