File Coverage

lib/Bio/BPWrapper/AlnManipulations.pm

Criterion	Covered	Total	%
statement	402	804	50.0
branch	81	224	36.1
condition	23	73	31.5
subroutine	55	73	75.3
pod	23	43	53.4
total	584	1217	47.9

line	stmt	bran	cond	sub	pod	time	code
1							=encoding utf8
2
3							=head1 NAME
4
5							Bio::BPWrapper::AlnManipulations - Functions for L
6
7							=head1 SYNOPSIS
8
9							use Bio::BPWrapper::AlnManipulations;
10							# Set options hash ...
11							initialize(\%opts);
12							write_out(\%opts);
13
14							=cut
15
16							package Bio::BPWrapper::AlnManipulations;
17
18	29			29		212	use strict;
	29					67
	29					994
19	29			29		121	use warnings;
	29					47
	29					1241
20	29			29		424	use 5.010;
	29					91
21	29			29		17528	use Bio::AlignIO;
	29					4894171
	29					1330
22	29			29		307	use Bio::SimpleAlign;
	29					97
	29					740
23	29			29		195	use Bio::LocatableSeq;
	29					63
	29					640
24	29			29		22810	use Data::Dumper;
	29					275311
	29					3048
25	29			29		277	use List::Util qw(shuffle);
	29					64
	29					4257
26	29			29		19404	use Bio::Align::Utilities qw(:all);
	29					94102
	29					5946
27	29			29		279	use Exporter ();
	29					60
	29					678
28	29			29		19804	use Bio::SearchIO;
	29					392487
	29					1635
29							#use Bio::Tools::GuessSeqFormat;
30
31	29			29		246	if ($ENV{'DEBUG'}) { use Data::Dumper }
	29					98
	29					2164
32
33	29			29		185	use vars qw(@ISA @EXPORT @EXPORT_OK);
	29					62
	29					3536
34
35							@ISA = qw(Exporter);
36
37							# FIXME: some of these have too generic names like
38							# "upper_case" or "concat".
39							@EXPORT = qw(initialize can_handle handle_opt write_out write_out_paml
40							phylip_non_interleaved split_cdhit trim_ends gap_states
41							gap_states_matrix print_avp_id bootstrap draw_codon_view del_seqs
42							remove_gaps print_length print_match print_num_seq pick_seq
43							change_ref aln_slice get_uniq binary_informative variables_sites
44							avg_id_by_win concat conserve_blocks get_consensus dns_to_protein
45							remove_gapped_cols_in_one_seq colnum_from_residue_pos
46							list_ids premute_states protein_to_dna sample_seqs
47							shuffle_sites random_slice select_third_sites remove_third_sites
48							upper_case );
49
50	29			29		382	use Bio::BPWrapper;
	29					166
	29					282291
51							# Package global variables
52							my ($in, $out, $aln, %opts, $file, $in_format, $out_format, @alns, $binary);
53
54							my $VERSION = $Bio::BPWrapper::VERSION;
55
56							## For new options, just add an entry into this table with the same
57							## key as in the GetOpts function in the main program. Make the key be
58							## a reference to the handler subroutine (defined below), and test
59							## that it works.
60							my %opt_dispatch = (
61							"avg-pid" => \&print_avp_id,
62							"boot" => \&bootstrap,
63							"codon-view" => \&draw_codon_view,
64							"delete" => \&del_seqs,
65							"gap-char" => \&gap_char,
66							"no-gaps" => \&remove_gaps,
67							"length" => \&print_length,
68							"match" => \&print_match,
69							"num-seq" => \&print_num_seq,
70							"pick" => \&pick_seqs,
71							"ref-seq" => \&change_ref,
72							"slice" => \&aln_slice,
73							"split-cdhit" => \&split_cdhit,
74							"uniq" => \&get_unique,
75							"var-sites" => \&variable_sites,
76							"window" => \&avg_id_by_win,
77							"concat" => \&concat,
78							"con-blocks" => \&conserved_blocks,
79							"consensus" => \&get_consensus,
80							"dna2pep" => \&dna_to_protein,
81							"rm-col" => \&remove_gapped_cols_in_one_seq,
82							"aln-index" => \&colnum_from_residue_pos,
83							"list-ids" => \&list_ids,
84							"pair-diff" => \&pair_diff,
85							"pair-diff-ref" => \&pair_diff_ref,
86							"permute-states" => \&permute_states,
87							"pep2dna" => \&protein_to_dna,
88							"resample" => \&sample_seqs,
89							"shuffle-sites" => \&shuffle_sites,
90							"select-third" => \&select_third_sites,
91							"remove-third" => \&remove_third_sites,
92							"random-slice" => \&random_slice,
93							"upper" => \&upper_case,
94							"gap-states" => \&gap_states,
95							"gap-states2" => \&gap_states_matrix,
96							"trim-ends" => \&trim_ends,
97							"bin-inform" => \&binary_informative,
98							"bin-ref" => \&binary_ref,
99							"phy-nonint" => \&phylip_non_interleaved
100							);
101
102
103							##################### initializer & option handlers ###################
104
105							## TODO Formal testing!
106
107							=head1 SUBROUTINES
108
109							=head2 initialize()
110
111							Sets up most of the actions to be performed on an alignment.
112
113							Call this right after setting up an options hash.
114
115							Sets package variables: C<$in_format>, C<$binary>, C<$out_format>, and C<$out>.
116
117							=cut
118
119							sub initialize {
120	29			29	1	74	my $opts_ref = shift;
121	29					197	Bio::BPWrapper::common_opts($opts_ref);
122	29					57	%opts = %{$opts_ref};
	29					154
123
124							# This is the format that aln-manipulations expects by default
125	29					80	my $default_format = "clustalw";
126
127							# assume we're getting input from standard input
128
129							# my $in_format = $opts{"input"} \|\| $default_format;
130							# my $in_format;
131							# use IO::Scalar;
132							# my $s;
133							# my ($guesser);
134							# if ($file eq "STDIN") {
135							# my $line_ct = 0;
136							# my $lines;
137							# while(<>) { $lines .= $_; $line_ct++; last if $line_ct >= 100 } # read the first 100 lines
138							# $guesser = Bio::Tools::GuessSeqFormat->new( -text => $lines );
139							# } else {
140							# open $ifh, "<", $file or die $!;
141							# $guesser = Bio::Tools::GuessSeqFormat->new( -file => $file );
142							# }
143							# $in_format = $guesser->guess();
144							# die "unknown file format. Try specify with -i flag.\n" unless $in_format;
145							# seek (STDIN, 0, 0);
146							# warn "$in_format\n";
147
148	29		100			189	my $in_format = $opts{'input'} \|\| 'clustalw';
149	29	100				136	if ($opts{"concat"}) {
150							# foreach my $file (glob @ARGV) {
151	1					5	while ($file = shift @ARGV) {
152							# warn "reading $file\n";
153							# $guesser = Bio::Tools::GuessSeqFormat->new( -file => $file);
154							# $in_format = $guesser->guess;
155	1					16	$in = Bio::AlignIO->new(-file => $file, -format => $in_format);
156	1					3200	while ($aln=$in->next_aln()) { push @alns, $aln }
	1					5941
157							}
158							} else {
159	28		50			106	$file = shift @ARGV \|\| "STDIN"; # If no more arguments were given on the command line
160	28	50	33			208	if ($in_format && $in_format =~ /blast/) { # guess blastoutput as "phylip", so -i 'blast' is needed
161							# if ($opts{"input"} && $opts{"input"} =~ /blast/) { # "blastxml" (-outfmt 5 ) preferred
162	0	0				0	my $searchio = Bio::SearchIO->new( -format => 'blast', ($file eq "STDIN")? (-fh => \*STDIN) : (-file => $file)); # works for regular blast output
163							# my $searchio = Bio::SearchIO->new( -format => 'blast', -fh => $ifh);
164	0					0	while ( my $result = $searchio->next_result() ) {
165	0					0	while( my $hit = $result->next_hit ) {
166	0					0	my $hsp = $hit->next_hsp; # get first hit; others ignored
167	0					0	$aln = $hsp->get_aln();
168							}
169							}
170							} else { # would throw error if format guessed wrong
171							# $in = Bio::AlignIO->new(-format => $in_format, ($file eq "STDIN")? (-fh => \*STDIN) : (-file => $file));
172							# $in = Bio::AlignIO->new(-format => $in_format, -fh => $ifh);
173	28	50				447	$in = Bio::AlignIO->new(-format=>$in_format, ($file eq "STDIN")? (-fh => \*STDIN) : (-file => $file) );
174	28					91784	$aln = $in->next_aln()
175							}
176							}
177
178	29	50				228179	$binary = $opts{"binary"} ? 1 : 0;
179
180							#### Options which require an output FH go after this ####
181	29		66			232	$out_format = $opts{"output"} \|\| $default_format;
182	29	50				282	$out = Bio::AlignIO->new(-format => $out_format, -fh => \*STDOUT) unless $out_format eq 'paml'
183							}
184
185							sub can_handle {
186	27			27	0	176	my $option = shift;
187	27					198	return defined($opt_dispatch{$option})
188							}
189
190							sub handle_opt {
191	26			26	0	80	my $option = shift;
192							# This passes option name to all functions
193	26					158	$opt_dispatch{$option}->($option)
194							}
195
196							=head2 write_out_paml()
197
198							Writes output in PAML format.
199
200							=cut
201
202							sub write_out_paml() {
203	0			0	1	0	my @seq;
204	0					0	my $ct=0;
205
206	0					0	foreach my $seq ($aln->each_seq()) {
207	0					0	my $id = $seq->display_id();
208	0	0				0	if ($seq->seq() =~ /^-+$/) { print STDERR "all gaps: $file\t$id\n"; next }
	0					0
	0					0
209	0					0	$ct++;
210	0					0	push @seq, $seq
211							}
212
213	0	0				0	die "No computable sequences: less than 2 seq.\n" unless $ct >= 2;
214	0					0	print $ct, "\t", $aln->length(), "\n";
215	0					0	foreach (@seq) {
216	0					0	print $_->display_id(), "\n";
217	0					0	print $_->seq(), "\n"
218							}
219							}
220
221							=head2 write_out()
222
223							Performs the bulk of the alignment actions actions set via
224							L\|/initialize> and calls
225							Lwrite_aln()>\|https://metacpan.org/pod/Bio::AlignIO#write_aln>
226							or L\|/write_out_paml>.
227
228							Call this after calling C<#initialize(\%opts)>.
229
230							=cut
231
232							sub write_out($) {
233
234	29			29	1	80	my $opts = shift;
235	29					93	for my $option (keys %{$opts}) {
	29					123
236	29	50	100			476	next if ($option eq 'input') \|\| ($option eq 'output') \|\| ($option eq 'noflatname') \|\| ($option eq 'binary'); # Don't process these options: they are for AlignIO
			66
			66
237
238	27	100				829	if (can_handle($option)) { handle_opt($option) } # If there is a function to handle the current option, execute it
	26					91
239	1					66	else { warn "Missing handler for: $option\n" }
240							}
241
242
243	21	100				87304	$aln->set_displayname_flat() unless $opts{"no-flat"};
244	21	50				18995	if ($out_format eq 'paml') { &write_out_paml($aln) }
	0					0
245	21					165	else { $out->write_aln($aln) }
246							}
247
248							sub phylip_non_interleaved {
249	0			0	0	0	my @seq;
250	0					0	my $ct=0;
251
252	0					0	foreach my $seq ($aln->each_seq()) {
253	0					0	my $id = $seq->display_id();
254	0	0				0	if ($seq->seq() =~ /^-+$/) { print STDERR "all gaps: $file\t$id\n"; next }
	0					0
	0					0
255	0					0	$ct++;
256	0					0	push @seq, $seq
257							}
258
259	0	0				0	die "No computable sequences: less than 2 seq.\n" unless $ct >= 2;
260	0					0	print "\t", $ct, "\t", $aln->length(), "\n";
261	0					0	foreach (@seq) {
262	0					0	printf "%-50s", $_->display_id();
263	0					0	print $_->seq(), "\n"
264							}
265	0					0	exit;
266							}
267
268							###################### subroutine ######################
269
270							sub gap_char {
271	0			0	0	0	my $char = $opts{'gap-char'};
272	0	0				0	die "gap-char takes a single character\n" unless length($char) == 1;
273	0					0	foreach my $seq ($aln->each_seq()) {
274	0					0	my $seq_str = $seq->seq();
275	0					0	$seq_str =~ s/[\.-]/$char/g;
276	0					0	$seq->seq($seq_str);
277							}
278							}
279
280							sub pair_diff_ref {
281	0			0	0	0	my $refId = $opts{'pair-diff-ref'};
282	0					0	my (@seqs, $refSeq);
283	0					0	foreach my $seq ($aln->each_seq()) {
284	0	0				0	if ($seq->id eq $refId) {
285	0					0	$refSeq = $seq;
286							} else {
287	0					0	push @seqs, $seq;
288							}
289							}
290
291	0	0				0	die "ref seq $refId not found\n" unless $refSeq;
292
293	0					0	@seqs = sort { $a->id() cmp $b->id() } @seqs;
	0					0
294	0					0	for (my $i=0; $i < $#seqs; $i++) {
295	0					0	my $idB = $seqs[$i]->id();
296	0					0	my $seqB = $seqs[$i];
297	0					0	my $pair = new Bio::SimpleAlign;
298	0					0	$pair->add_seq($refSeq);
299	0					0	$pair->add_seq($seqB);
300							# $pair = $pair->remove_gaps(); # not relialbe, e.g., n/N for DNA seqs
301	0					0	my $ct_diff = 0;
302	0					0	my $ct_valid = 0;
303							# my $matchLine = $pair->match_line();
304							# my @match_symbols = split //, $matchLine;
305	0					0	for (my $j = 1; $j <= $pair->length; $j++) {
306	0					0	my $mA = $refSeq->subseq($j,$j);
307	0					0	my $mB = $seqB->subseq($j,$j);
308	0	0	0			0	next if $refSeq->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i \|\| $mB !~ /^[ATCG]$/i);
			0
			0
309							# next if $match_symbols[$i] eq '*';
310	0	0	0			0	next if $mA eq '-' \|\| $mB eq '-';
311	0					0	$ct_valid++;
312	0	0				0	$ct_diff++ unless $mA eq $mB;
313							# next if $match_symbols[$i] eq '*';
314							}
315	0					0	my $pairdiff = $pair->percentage_identity();
316	0					0	print join "\t", ($refId, $idB, $ct_diff, $ct_valid, $pair->length());
317	0					0	printf "\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100;
318							}
319	0					0	exit;
320							}
321
322							sub pair_diff {
323	0			0	0	0	my $alnBack = $aln;
324	0					0	my $lenTotal = $alnBack->length();
325							# $alnBack = $alnBack->remove_gaps();
326	0					0	my $matchLineFull = $alnBack->match_line();
327	0					0	my @match_symbols_full = split //, $matchLineFull;
328	0					0	my $num_var = 0; # contain gaps
329							# my $num_var = 0; # de-gapped variable sites
330	0					0	for (my $i = 0; $i < $alnBack->length; $i++) {
331	0	0				0	next if $match_symbols_full[$i] eq '*';
332	0					0	$num_var++;
333							}
334
335	0					0	my (@seqs);
336	0					0	print join "\t", qw(seq_1 seq_2 len_gapless len_variable diff_gapless diff_variable len_aln identitty fac_diff frac_gapless frac_variable);
337	0					0	print "\n";
338	0					0	foreach my $seq ($aln->each_seq()) { push @seqs, $seq }
	0					0
339	0					0	@seqs = sort { $a->id() cmp $b->id() } @seqs;
	0					0
340	0					0	for (my $i=0; $i < $#seqs; $i++) {
341	0					0	my $idA = $seqs[$i]->id();
342	0					0	my $seqA = $seqs[$i];
343	0					0	for (my $j=$i+1; $j <= $#seqs; $j++) {
344	0					0	my $idB = $seqs[$j]->id();
345	0					0	my $seqB = $seqs[$j];
346	0					0	my $pair = new Bio::SimpleAlign;
347	0					0	$pair->add_seq($seqA);
348	0					0	$pair->add_seq($seqB);
349							# $pair = $pair->remove_gaps(); # unreliable: e.g., "n", "N" in DNA seqs
350							# my $mask = $seqA->seq ^ $seqB->seq; # (exclusive or) operator: returns "\0" if same
351	0					0	my $ct_diff = 0;
352	0					0	my $ct_valid = 0;
353	0					0	my $gap_included_diff = 0;
354							# my $matchLine = $pair->match_line();
355							# my @match_symbols = split //, $matchLine;
356	0					0	for (my $j = 1; $j <= $pair->length; $j++) {
357	0					0	my $mA = $seqA->subseq($j,$j);
358	0					0	my $mB = $seqB->subseq($j,$j);
359	0	0	0			0	next if $seqA->alphabet eq 'dna' && $seqB->alphabet eq 'dna' && ($mA !~ /^[ATCG]$/i \|\| $mB !~ /^[ATCG]$/i);
			0
			0
360	0	0				0	$gap_included_diff++ unless $mA eq $mB;
361							# next if $match_symbols[$i] eq '*';
362	0	0	0			0	next if $mA eq '-' \|\| $mB eq '-';
363	0					0	$ct_valid++;
364	0	0				0	$ct_diff++ unless $mA eq $mB;
365							}
366							# while ($mask =~ /[^\0]/g) { $ct_diff++ }
367	0					0	my $pairdiff = $pair->percentage_identity();
368	0					0	print join "\t", ($idA, $idB, $ct_valid, $num_var, $ct_diff, $gap_included_diff, $pair->length());
369	0					0	printf "\t%.4f\t%.4f\t%.4f\t%.4f\n", $pairdiff, 1-$pairdiff/100, $ct_diff/$ct_valid, $gap_included_diff/$num_var;
370							}
371							}
372	0					0	exit;
373							}
374
375
376							sub split_cdhit {
377	0			0	0	0	my $cls_file = $opts{'split-cdhit'};
378	0		0			0	open IN, "<" . $cls_file \|\| die "cdhit clstr file not found: $cls_file\n";
379	0					0	my %clusters;
380							my $cl_id;
381	0					0	my @mem;
382	0					0	while () {
383	0					0	my $line = $_;
384	0					0	chomp $line;
385	0	0				0	if ($line =~ /^>(\S+)\s+(\d+)/) {
386	0					0	$cl_id = $1 . "_" . $2;
387	0					0	my @mem = ();
388	0					0	$clusters{$cl_id} = \@mem;
389							} else {
390	0					0	my $ref = $clusters{$cl_id};
391	0					0	my @mems = @$ref;
392	0					0	my @els = split /\s+/, $line;
393	0					0	my $seq_id = $els[2];
394	0					0	$seq_id =~ s/>//;
395	0					0	$seq_id =~ s/\.\.\.$//;
396	0					0	push @mems, $seq_id;
397	0					0	$clusters{$cl_id} = \@mems;
398							}
399							}
400							# print Dumper(\%clusters);
401	0					0	my %seqs;
402	0					0	foreach my $seq ($aln->each_seq()) {
403	0					0	my $id = $seq->display_id();
404	0					0	$seqs{$id} = $seq;
405							}
406
407	0					0	foreach my $id (keys %clusters) {
408	0					0	my $out = Bio::AlignIO->new( -file => ">" . $file . "-". $id . ".aln", -format => 'clustalw');
409	0					0	my $new_aln = Bio::SimpleAlign->new();
410	0					0	my @seqids = @{ $clusters{$id} };
	0					0
411	0					0	foreach my $seq_id (@seqids) {
412	0					0	$new_aln->add_seq($seqs{$seq_id});
413							}
414	0					0	$new_aln->set_displayname_flat();
415							# $new_aln = &_remove_common_gaps($new_aln);
416	0					0	$out->write_aln($new_aln);
417							}
418	0					0	exit;
419							}
420
421							#sub _remove_common_gaps {
422							#}
423
424							sub trim_ends {
425	0			0	0	0	my (@seqs, @gaps);
426	0					0	foreach my $seq ($aln->each_seq()) {
427	0					0	my $id = $seq->display_id();
428	0					0	my @nts = split //, $seq->seq();
429	0					0	my $gap_start = 0;
430	0					0	my $new;
431	0					0	for (my $i=0; $i< $aln->length(); $i++) {
432	0	0				0	if ($nts[$i] eq '-') {
433	0	0				0	if ($gap_start) { # gap -> gap
434	0					0	$new->{end}++;
435							} else { # nt -> gap
436	0					0	$gap_start = 1;
437	0					0	$new = { 'start' => $i+1, 'end' => $i+1, 'seq_name' => $id }
438							}
439							} else {
440	0	0				0	if ($gap_start) { # gap -> nt
441	0					0	$gap_start = 0;
442	0					0	push @gaps, $new;
443							} else { # nt -> nt
444	0					0	next;
445							}
446							}
447							}
448	0	0				0	push @gaps, $new if $gap_start;
449							}
450
451	0					0	my (@three_end_gaps, @five_end_gaps);
452
453	0					0	foreach my $gap (@gaps) {
454	0					0	$gap->{length} = $gap->{end} - $gap->{start} + 1;
455	0	0				0	push @three_end_gaps, $gap if $gap->{start} == 1;
456	0	0				0	push @five_end_gaps, $gap if $gap->{end} == $aln->length;
457							}
458
459	0	0	0			0	return unless @three_end_gaps or @five_end_gaps;
460
461	0					0	my $longest_three_end = 0;
462	0					0	my $longest_five_start = 0;
463	0					0	my $longest_three_length = 0;
464	0					0	my $longest_five_length = 0;
465
466	0					0	foreach my $gap (@three_end_gaps) {
467	0	0				0	if ($gap->{length} > $longest_three_length) {
468	0					0	$longest_three_end = $gap->{end};
469	0					0	$longest_three_length = $gap->{length};
470							}
471							}
472
473	0					0	foreach my $gap (@five_end_gaps) {
474	0	0				0	if ($gap->{length} > $longest_five_length) {
475	0					0	$longest_five_start = $gap->{start};
476	0					0	$longest_five_length = $gap->{length};
477							}
478							}
479
480							# print STDERR $longest_three, "\t", $longest_five, "\n";
481	0	0				0	if (@three_end_gaps) {
482	0					0	print STDERR Dumper(\@three_end_gaps);
483	0					0	print STDERR $longest_three_end, "\n";
484	0					0	$aln = $aln->slice($longest_three_end + 1, $aln->length);
485							}
486
487	0	0				0	if (@five_end_gaps) {
488	0					0	print STDERR Dumper(\@five_end_gaps);
489	0					0	print STDERR $longest_five_start, "\n";
490	0					0	$aln = $aln->slice(1, $longest_five_start - $longest_three_end - 1);
491							}
492							}
493
494							sub gap_states {
495	0			0	0	0	my (@seqs, @gaps);
496	0					0	foreach my $seq ($aln->each_seq()) {
497	0					0	my $id = $seq->display_id();
498	0					0	my @nts = split //, $seq->seq();
499	0					0	my $gap_start = 0;
500	0					0	my $new;
501	0					0	for (my $i=0; $i< $aln->length(); $i++) {
502	0	0				0	if ($nts[$i] eq '-') {
503	0	0				0	if ($gap_start) { # gap -> gap
504	0					0	$new->{end}++;
505							} else { # nt -> gap
506	0					0	$gap_start = 1;
507	0					0	$new = { 'start' => $i+1, 'end' => $i+1, 'seq_name' => $id }
508							}
509							} else {
510	0	0				0	if ($gap_start) { # gap -> nt
511	0					0	$gap_start = 0;
512	0					0	push @gaps, $new;
513							} else { # nt -> nt
514	0					0	next;
515							}
516							}
517							}
518	0	0				0	push @gaps, $new if $gap_start;
519							}
520	0					0	my (%gap_freqs, @uniq_gaps);
521	0					0	foreach my $gap (@gaps) {
522	0					0	my $id = $gap->{start} . "-" . $gap->{end};
523	0					0	$gap->{id} = $id;
524	0					0	$gap_freqs{$id}++;
525							}
526
527	0					0	foreach my $id (keys %gap_freqs) {
528	0					0	my ($start, $end) = split /-/, $id;
529							push @uniq_gaps, {
530							'start' => $start,
531							'end' => $end,
532							'is_edge' => ($start == 1 \|\| $end == $aln->length) ? 1 : 0,
533							'in_frame' => ($end - $start + 1) % 3 ? 0 : 1,
534	0	0	0			0	'counts' => $gap_freqs{$id},
		0
535							};
536							}
537
538	0					0	foreach my $gap (@uniq_gaps) { say join "\t", ($file, $gap->{start}, $gap->{end}, $gap->{is_edge}, $gap->{in_frame}, $gap->{counts}, $aln->length()) }
	0					0
539							# print Dumper(\@uniq_gaps);
540	0					0	exit;
541							}
542
543							sub gap_states_matrix {
544	0			0	0	0	my (@seq_ids, @gaps);
545	0					0	foreach my $seq ($aln->each_seq()) {
546	0					0	my $id = $seq->display_id();
547	0					0	push @seq_ids, $id;
548	0					0	my @nts = split //, $seq->seq();
549	0					0	my $gap_start = 0;
550	0					0	my $new;
551	0					0	for (my $i=0; $i< $aln->length(); $i++) {
552	0	0				0	if ($nts[$i] eq '-') {
553	0	0				0	if ($gap_start) { # gap -> gap
554	0					0	$new->{end}++;
555							} else { # nt -> gap
556	0					0	$gap_start = 1;
557	0					0	$new = { 'start' => $i+1, 'end' => $i+1, 'seq_name' => $id }
558							}
559							} else {
560	0	0				0	if ($gap_start) { # gap -> nt
561	0					0	$gap_start = 0;
562	0					0	push @gaps, $new;
563							} else { # nt -> nt
564	0					0	next;
565							}
566							}
567							}
568	0	0				0	push @gaps, $new if $gap_start;
569							}
570	0					0	my (%gap_freqs, @uniq_gaps, %gap_presence);
571	0					0	foreach my $gap (@gaps) {
572	0					0	my $id = $gap->{start} . "-" . $gap->{end};
573	0					0	$gap->{id} = $id;
574	0					0	$gap_freqs{$id}++;
575	0					0	$gap_presence{$id}->{$gap->{seq_name}} = 1;
576							}
577
578	0					0	foreach my $id (keys %gap_freqs) {
579	0					0	my ($start, $end) = split /-/, $id;
580							push @uniq_gaps, {
581							'start' => $start,
582							'end' => $end,
583							'is_edge' => ($start == 1 \|\| $end == $aln->length) ? 1 : 0,
584							'in_frame' => ($end - $start + 1) % 3 ? 0 : 1,
585	0	0	0			0	'counts' => $gap_freqs{$id},
		0
586							'id' => $id
587							};
588							}
589
590	0	0				0	my @gaps_sorted = sort {$a->{start} <=> $b->{start} \|\| $a->{end} <=> $b->{end}} @uniq_gaps;
	0					0
591	0					0	foreach (@gaps_sorted) { print "\t", $_->{id}}
	0					0
592	0					0	print "\n";
593	0					0	foreach my $sid (sort @seq_ids) {
594	0					0	print $sid;
595	0					0	foreach my $u_gap (@gaps_sorted) {
596	0		0			0	print "\t", $gap_presence{$u_gap->{id}}->{$sid} \|\| 0;
597							}
598	0					0	print "\n";
599							}
600
601							# foreach my $gap (@uniq_gaps) { say join "\t", ($file, $gap->{start}, $gap->{end}, $gap->{is_edge}, $gap->{in_frame}, $gap->{counts}, $aln->length()) }
602							# print Dumper(\@uniq_gaps);
603	0					0	exit;
604							}
605
606							=head2 print_avp_id
607
608							Print the average percent identity of an alignment.
609
610							Wraps
611							Laverage_percentage_identity()\|https://metacpan.org/pod/Bio::SimpleAlign#average_percentage_identity>.
612
613
614							=cut
615
616							sub print_avp_id {
617	1			1	1	5	printf "%.4f\n", $aln->average_percentage_identity();
618							exit
619	1					15520	}
620
621							=head2 boostrap()
622
623							Produce a bootstrapped
624							alignment. Lbootstrap()\|https://metacpan.org/pod/Bio::Align::Utilities#bootstrap_replicates>.
625
626							=cut
627
628							sub bootstrap {
629	1			1		7	my $replicates = bootstrap_replicates($aln,1);
630	1					13210	$aln = shift @$replicates
631							}
632
633							=head2 draw_codon_view()
634
635							Print a CLUSTALW-like alignment, but separated by codons. Intended for
636							use with DNA sequences. Block-final position numbers are printed at
637							the end of every alignment block at the point of wrapping, and
638							block-initial counts appear over first nucleotide in a block.
639
640							=cut
641
642							sub draw_codon_view {
643							# my $aln = shift;
644							# Is 20 by default. Blocks are measured in CODONS, so mult by 3
645	1			1	1	4	my $block_length = 3 * $opts{"codon-view"};
646	1					6	my $aln_length = $aln->length();
647	1					142	my $num_seqs = $aln->num_sequences();
648	1					58	my $min_pad = 4; # Minimum padding between sequence and ID
649	1					3	my $seq_matrix;
650	1					2	my @seqs = ($aln->each_seq);
651	1					86	my @display_ids;
652
653							# Find longest id length, add id/sequence padding
654	1					5	my $max_id_len = _find_max_id_len(\@seqs);
655
656							# id length includes padding
657	1					6	$max_id_len += $min_pad;
658
659							# Extract display_ids and sequences from AlignIO object.
660	1					3	foreach my $seq (@seqs) {
661	13					21	my @seq_str = split '', $seq->seq();
662	13					484	push @$seq_matrix, \@seq_str;
663	13					24	push @display_ids, $seq->display_id;
664
665							# Pad display ids so that space between them and sequence is consistent
666	13					61	$display_ids[-1] = _pad_display_id($display_ids[-1], $max_id_len)
667							}
668
669	1					2	my $nuc_count = 0;
670
671							# Loop over each sequence.
672	1					4	for (my $i = 0; $i < $num_seqs; $i++) {
673
674							# Print count at end of block when we are starting out a new block
675	26	100				49	_print_positions($nuc_count, $aln_length, $max_id_len) if $i == 0;
676
677							# Loop over nucleotides
678	26					41	for (my $j = $nuc_count; $j < $aln_length; $j++) {
679
680							# When we're starting, or starting a new block, print the display id's.
681	1560	100				1988	print $display_ids[$i] if $j % $block_length == 0;
682
683	1560					1834	print "$$seq_matrix[$i]->[$j]";
684	1560	100				1990	print " " if ((($j + 1) % 3) == 0);
685
686							# When we've reached the end of the alignment or a block
687	1560	100	100			3634	if ($j + 1 == $aln_length \|\| (($j + 1) % $block_length) == 0) {
688	26	100				39	if ($i + 1 == $num_seqs) { $nuc_count = $j + 1 } # If this is the last sequence, save the ending (next) position.
	2					5
689	24					27	else { print "\n" } # Otherwise, start on the next line.
690							last # In either case, need to exit this loop.
691	26					33	}
692							} # END for LOOP OVER NUCLEOTIDES
693
694							# Finish if we've reached the end of the alignment, and the last sequence
695	26	100	100			97	if (($i + 1 == $num_seqs) && ($nuc_count == $aln_length)) { print "\n"; last }
	1	100	66			6
	1					3
696
697							# If we haven't reached the end of the alignment, but we've run through
698							# all sequences, print final block position and start at first sequence.
699							elsif (($i + 1 == $num_seqs) && ($nuc_count < $aln_length)) {
700	1					3	$i = -1; # Always increments after a loop; next increment sets to 0.
701	1					3	print "\n\n"
702							}
703							} # END for LOOP OVER SEQUENCES
704
705							# Can't let script terminate normally: produces traditional alignment output
706	1					195	exit 0
707							}
708
709							=head2 del_seqs()
710
711							Delete sequences based on their id. Option takes a comma-separated list of ids.
712							The list of sequences to delete is in C<$opts{"delete"}> which is set via
713							L\|/initialize>
714
715							=cut
716
717							sub del_seqs {
718	1			1	1	5	_del_or_pick($opts{"delete"}, "remove_seq", 0)
719							}
720
721							=head2 remove_gaps()
722
723							Remove gaps (and returns an de-gapped alignment). Wraps
724							Lremove_gaps()\|https://metacpan.org/pod/Bio::SimpleAlign#remove_gaps>.
725
726							=cut
727
728							sub remove_gaps {
729	1			1	1	8	$aln = $aln->remove_gaps()
730							}
731
732							=head2 print_length()
733
734							Print alignment length. Wraps Llength()\|https://metacpan.org/pod/Bio::SimpleAlign#length>.
735
736							=cut
737
738							sub print_length {
739	1			1	1	10	say $aln->length();
740							exit
741	1					370	}
742
743							=head2 print_match()
744
745							Go through all columns and change residues identical to the reference
746							sequence to be the match character, '.' Wraps
747							Lmatch()\|https://metacpan.org/pod/Bio::SimpleAlign#match>.
748
749							=cut
750
751							sub print_match {
752	1			1	1	7	$aln->match()
753							}
754
755							=head2 print_num_seq()
756
757							Print number of sequences in alignment.
758
759							=cut
760
761							sub print_num_seq {
762	1			1	1	4	say $aln->num_sequences();
763							exit
764	1					163	}
765
766							=head2 pick_seqs()
767
768							Pick sequences based on their id. Option takes a comma-separated list
769							of ids. The sequences to pick set in C<$opts{"pick"}> which is set via
770							L\|/initialize>.
771
772							=cut
773
774							sub pick_seqs {
775	1			1	1	6	_del_or_pick($opts{"pick"}, "add_seq", 1)
776							}
777
778							=head2 change_ref()
779
780							Change the reference sequence to be what is in C<$opts{"refseq"}>
781							which is set via L\|/initialize>. Wraps
782							Lset_new_reference()\|https://metacpan.org/pod/Bio::SimpleAlign#set_new_reference>.
783
784							=cut
785
786							sub change_ref {
787	1			1	1	1	my @newAlns;
788	1	50				3	if ($opts{'concat'}) {
789	0					0	foreach (@alns) {
790	0					0	push @newAlns, $_->set_new_reference($opts{"ref-seq"})
791							}
792	0					0	@alns = @newAlns;
793							} else {
794	1					5	$aln = $aln->set_new_reference($opts{"ref-seq"})
795							}
796							}
797
798
799							=head2 aln_slice()
800
801							Get a slice of the alignment. The slice is specified
802							C<$opts{"slice"}> which is set via L\|/initialize>.
803
804							Wraps
805							Lslice()\|https://metacpan.org/pod/Bio::SimpleAlign#slice>
806							with improvements.
807
808							=cut
809
810
811							sub aln_slice { # get alignment slice
812	1			1	1	10	my ($begin, $end) = split(/\s,\s/, $opts{"slice"});
813
814							# Allow for one parameter to be omitted. Default $begin to the
815							# beginning of the alignment, and $end to the end.
816	1	50				5	$begin = 1 if $begin eq "-";
817	1	50				4	$end = $aln->length if $end eq "-";
818	1					8	$aln = $aln->slice($begin, $end)
819							}
820
821							=head2 get_unique()
822
823							Extract the alignment of unique sequences. Wraps
824							Luniq_seq()\|https://metacpan.org/pod/Bio::SimpleAlign#uniq_seq>.
825
826							=cut
827
828
829							sub get_unique() {
830	1			1	1	6	$aln->verbose(1);
831	1					17	$aln = $aln->uniq_seq();
832							}
833
834							sub _has_gap {
835	0			0		0	my $ref = shift;
836	0					0	foreach (@$ref) {
837	0	0				0	return 1 if $_ eq '-';
838							}
839	0					0	return 0;
840							}
841
842							sub _has_singleton {
843	0			0		0	my $ref = shift;
844	0					0	foreach my $key (keys %$ref) {
845	0	0				0	return 1 if $ref->{$key} == 1;
846							}
847	0					0	return 0;
848							}
849
850							=head2 binary_informative
851
852							extract binary and informative sites (for clique): discard constant,
853							3/4-states, non-informative
854
855							=cut
856
857							sub binary_informative {
858	0			0	1	0	my $new_aln = Bio::SimpleAlign->new();
859	0					0	my $len=$aln->length();
860	0					0	my (@seq_ids, @inf_sites, %bin_chars);
861
862							# Go through each column and save variable sites
863	0					0	my $ref_bases = &_get_a_site_v2(); #print Dumper($ref_bases); exit;
864	0					0	foreach (sort keys %$ref_bases) { push @seq_ids, $_ }
	0					0
865	0					0	for (my $i=1; $i<=$len; $i++) {
866	0					0	my (%seen, @bases);
867	0					0	foreach my $id (@seq_ids) { push @bases, $ref_bases->{$id}->{$i}; }
	0					0
868	0					0	%seen = %{&_seen_bases(\@bases)};
	0					0
869	0	0				0	next if &_has_gap( [ values %seen ] );
870	0	0				0	next if keys %seen != 2;
871	0	0				0	next if &_has_singleton(\%seen);
872	0					0	my ($base1, $base2) = sort keys %seen;
873	0					0	$bin_chars{$i}{$base1} = 0;
874	0					0	$bin_chars{$i}{$base2} = 1;
875	0					0	push @inf_sites, $i;
876							}
877
878	0	0				0	die "informative sites not found\n" unless @inf_sites;
879	0					0	foreach (@inf_sites) { warn $_, "\n" }
	0					0
880
881							# Recreate the object for output
882	0					0	foreach my $id (@seq_ids) {
883	0					0	my $seq_str;
884	0					0	foreach my $i (@inf_sites) {
885	0	0				0	$seq_str .= $binary ? $bin_chars{$i}->{$ref_bases->{$id}->{$i}} : $ref_bases->{$id}->{$i};
886							}
887	0					0	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
888	0					0	my $end = $loc_seq->end;
889	0					0	$loc_seq->end($end);
890	0					0	$new_aln->add_seq($loc_seq)
891							}
892
893	0					0	$aln = $new_aln
894							}
895
896							sub binary_ref {
897	0			0	0	0	my $new_aln = Bio::SimpleAlign->new();
898	0					0	my $len=$aln->length();
899	0					0	my (@seq_ids, @inf_sites, %bin_chars, @ref_states);
900	0		0			0	my $refId = $opts{'bin-ref'} \|\| die "need ref id as an argument\n";
901							# Go through each column and save variable sites
902	0					0	my $ref_bases = &_get_a_site_v2(); #print Dumper($ref_bases); exit;
903	0					0	my $seenRef = 0;
904	0					0	foreach (sort keys %$ref_bases) {
905	0					0	push @seq_ids, $_;
906	0	0				0	next unless $_ eq $refId;
907	0					0	$seenRef++;
908							}
909	0	0				0	die "ref seq not found: $refId\n" unless $seenRef;
910
911	0					0	for (my $i=1; $i<=$len; $i++) {
912	0					0	my (%seen, @bases);
913	0					0	foreach my $id (@seq_ids) { push @bases, $ref_bases->{$id}->{$i}; }
	0					0
914	0					0	%seen = %{&_seen_bases(\@bases)};
	0					0
915	0	0				0	next if &_has_gap( [ values %seen ] ); # skip gaps
916	0	0				0	next if keys %seen != 2; # skip multi-states or constant sites
917	0					0	my ($base1, $base2) = sort keys %seen;
918	0	0				0	if ($base1 eq $ref_bases->{$refId}->{$i}) { # base1 is ref
919	0					0	$bin_chars{$i}{$base1} = 1;
920	0					0	$bin_chars{$i}{$base2} = 0;
921							} else { # base2 is ref
922	0					0	$bin_chars{$i}{$base1} = 0;
923	0					0	$bin_chars{$i}{$base2} = 1;
924							}
925	0					0	push @inf_sites, $i;
926							}
927
928	0	0				0	die "no binary sites\n" unless @inf_sites;
929	0					0	foreach (@inf_sites) { warn $_, "\n" }
	0					0
930
931							# Recreate the object for output
932	0					0	foreach my $id (@seq_ids) {
933	0					0	my $seq_str;
934	0					0	foreach my $i (@inf_sites) {
935	0					0	$seq_str .= $bin_chars{$i}->{$ref_bases->{$id}->{$i}};
936							}
937	0					0	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
938	0					0	my $end = $loc_seq->end;
939	0					0	$loc_seq->end($end);
940	0					0	$new_aln->add_seq($loc_seq)
941							}
942	0					0	$aln = $new_aln
943							}
944
945
946							=head2 variable_sites()
947
948							Extracts variable sites.
949
950							=cut
951							sub variable_sites {
952	1			1	1	5	$aln = $aln->remove_gaps();
953	1					9022	my $new_aln = Bio::SimpleAlign->new();
954	1					33	my $len=$aln->length();
955	1					151	my (%seq_ids, @sites, @var_sites);
956
957							# Go through each column and save variable sites
958	1					4	for (my $i=1; $i<=$len; $i++) {
959	90					272	my ($ref_bases, $ref_ids) = &_get_a_site($i);
960	90					178	%seq_ids = %{$ref_ids};
	90					763
961	90					314	my $is_constant = &_is_constant(&_paste_nt($ref_bases));
962	90	100				933	if ($is_constant < 1) { push @sites, $ref_bases; push @var_sites, $i }
	2					4
	2					10
963							}
964
965	1					67	foreach (@var_sites) { warn $_, "\n" }
	2					128
966
967							# Recreate the object for output
968	1					13	foreach my $id (sort keys %seq_ids) {
969	13					1772	my $seq_str;
970	13					33	foreach my $aln_site (@sites) {
971	26	100				58	foreach (@$aln_site) { $seq_str .= $_->{nt} if $_->{id} eq $id }
	338					903
972							}
973
974	13					70	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
975	13					4602	my $end = $loc_seq->end;
976	13					723	$loc_seq->end($end);
977	13					1028	$new_aln->add_seq($loc_seq)
978							}
979
980	1					179	$aln = $new_aln
981							}
982
983							=head2 avg_id_by_win()
984
985							Calculate pairwise average sequence difference by windows (overlapping
986							windows with fixed step of 1). The window size is set in
987							C<$opts{"window"}> which is set via L\|/initialize>.
988
989							=cut
990
991
992							sub avg_id_by_win {
993	1			1	1	3	my $window_sz = $opts{"window"};
994	1					8	for my $i (1 .. ($aln->length() - $window_sz + 1)) {
995	61					10717	my $slice = $aln->slice($i, $i + $window_sz - 1);
996	61					607427	my $pi = (100 - $slice->average_percentage_identity()) / 100;
997	61					775528	printf "%d\t%d\t%.4f\n", $i, $i + $window_sz - 1, $pi
998							}
999							exit
1000	1					271	}
1001
1002							=head2 concat()
1003
1004							Concatenate multiple alignments sharing the same set of unique
1005							IDs. This is normally used for concatenating individual gene
1006							alignments of the same set of samples to a single one for making a
1007							"supertree". Wraps
1008							Lcat()\|https://metacpan.org/pod/Bio::Align::Utilities#cat>.
1009
1010							=cut
1011
1012							##################################################################
1013							# 2/26/2021: add position map (for rate4site applications)
1014							##############################################################
1015							sub concat {
1016	1			1	1	10	$aln = cat(@alns);
1017	1					5422	warn "Alignment concated. Getting position maps...\n";
1018	1	50				19	my $refSeq = $opts{"ref-seq"} ? $aln->get_seq_by_id($opts{"ref-seq"}) : $aln->get_seq_by_pos(1);
1019							# my $refSeq = $aln->get_seq_by_pos(1);
1020	1					90	my @refGenesInOrder = map { $_ -> get_seq_by_id($refSeq->id) } @alns;
	1					3
1021							# remap start & end for individual gene alignments, sync pos with concatenated aln:
1022	1					58	my $pos = 0;
1023	1					1	my %geneRange;
1024	1					6	for (my $i = 0; $i <= $#refGenesInOrder; $i++) {
1025	1					3	my $start = $pos + 1;
1026	1					3	$pos += $refGenesInOrder[$i]->length();
1027	1					6	my $end = $pos;
1028	1					7	$geneRange{$i+1} = {'start' => $start,
1029							'end' => $end,
1030							};
1031							}
1032
1033							# warn Dumper(\%geneRange);
1034
1035	1					2	my @locTable;
1036	1					16	for (my $i = 1; $i <= $refSeq->length(); $i++) {
1037	120	100				40549	next if $refSeq->subseq($i, $i) eq '-';
1038	90					2304	my ($inGene, $posGene) = &__gene_order($i, \%geneRange);
1039	90					232	push @locTable, {
1040							'pos_concat' => $i,
1041							'pos_unaligned' => $refSeq->location_from_column($i)->start(),
1042							'gene_order' => $inGene,
1043							'pos_gene_aln' => $posGene,
1044							'pos_gene_unaligned' => $refGenesInOrder[$inGene-1]->location_from_column($posGene)->start()
1045							};
1046							}
1047	1					706	open LOG, ">concat.log";
1048	1					20	print LOG join "\t", ("seq_id", "pos_concat", "pos_residue", "gene", "pos_gene_aligned", "pos_gene");
1049	1					4	print LOG "\n";
1050	1					4	foreach (@locTable) {
1051							print LOG join "\t", (
1052							$refSeq->id(),
1053							$_->{pos_concat},
1054							$_->{pos_unaligned},
1055							$_->{gene_order},
1056							$_->{pos_gene_aln},
1057							$_->{pos_gene_unaligned}
1058	90					119	);
1059	90					589	print LOG "\n";
1060							}
1061	1					54	close LOG;
1062	1					66	warn "Position map of reference seq is saved in file concat.log\n";
1063							}
1064
1065							sub __gene_order {
1066	90			90		108	my $pos = shift;
1067	90					94	my $ref = shift;
1068	90					224	my %range = %$ref;
1069	90					185	foreach my $gene (keys %range) {
1070	90	50	33			356	next unless $pos >= $range{$gene}->{start} && $pos <= $range{$gene}->{end};
1071	90					225	return ($gene, $pos - $range{$gene}->{'start'} + 1)
1072							}
1073	0					0	die "Not found in any genes: position $pos\n";
1074							}
1075
1076							#######################################
1077
1078							sub conserved_blocks {
1079	1			1	0	5	my $len=$aln->length();
1080	1					149	my $nseq = $aln->num_sequences();
1081	1					60	my $min_block_size = $opts{"con-blocks"};
1082	1					2	my %seq_ids;
1083
1084	1	50				2	die "Alignment contains only one sequence: $file\n" if $nseq < 2;
1085
1086	1					2	my (@blocks, $block);
1087	1					2	my $in_block=0;
1088	1					3	for (my $i=1; $i<=$len; $i++) {
1089	120					376	my ($ref_bases, $ref_ids) = &_get_a_site($i);
1090	120					215	%seq_ids = %{$ref_ids};
	120					1319
1091	120					479	my $is_constant = &_is_constant(&_paste_nt($ref_bases));
1092	120	100				444	if ($in_block) { # previous site is a contant one
1093	87	100				215	if ($is_constant) {
1094	85					273	$block->{length} ++;
1095	85					185	my @sites = @{$block->{sites}};
	85					479
1096	85					237	push @sites, $ref_bases;
1097	85					303	$block->{sites} = \@sites;
1098	85	100				622	if ($i == $len) {
1099	1					46	warn "Leaving a constant block at the end of alignment: $i\n";
1100	1	50				13	push @blocks, $block if $block->{length} >= $min_block_size
1101							}
1102							} else {
1103	2					7	$in_block = 0;
1104	2	50				12	push @blocks, $block if $block->{length} >= $min_block_size;
1105	2					185	warn "Leaving a constant block at $i\n"
1106							}
1107							} else { # previous site not a constant one
1108	33	100				174	if ($is_constant) { # entering a block
1109	3					169	warn "Entering a constant block at site $i ...\n";
1110	3					10	$in_block=1;
1111	3					33	$block = {start => $i, length => 1, num_seq => $nseq, sites => [($ref_bases)]} # start a new block
1112							}
1113							}
1114							}
1115
1116	1					3	foreach my $bl (@blocks) {
1117	3					7693	my $out = Bio::AlignIO->new(-file=> ">$file" . ".slice-". $bl->{start} . ".aln" , -format=>'clustalw');
1118	3					1903	my $block_aln = Bio::SimpleAlign->new();
1119	3					165	foreach my $id (sort keys %seq_ids) {
1120	39					3042	my ($seq_str, $ungapped_start, $ungapped_end);
1121	39					45	my @sites = @{$bl->{sites}};
	39					143
1122	39					109	for (my $i = 0; $i <= $#sites; $i++) {
1123	1144					1133	my $ref_chars = $sites[$i];
1124	1144					1227	foreach (@$ref_chars) {
1125	14872	100				20014	next unless $_->{id} eq $id;
1126	1144	100				1408	$ungapped_start = $_->{ungapped_pos} if $i == 0;
1127	1144	100				1430	$ungapped_end = $_->{ungapped_pos} if $i == $#sites;
1128							$seq_str .= $_->{nt}
1129	1144					1434	}
1130							}
1131
1132	39					151	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => $ungapped_start, -end => $ungapped_end);
1133	39					9462	$block_aln->add_seq($loc_seq)
1134							}
1135	3					319	$out->write_aln($block_aln)
1136							}
1137							exit
1138	1					3943	}
1139
1140							sub get_consensus {
1141	1			1	0	3	my $percent_threshold = $opts{"consensus"};
1142	1					6	my $consense = Bio::LocatableSeq->new(
1143							-seq => $aln->consensus_string($percent_threshold),
1144							-id => "Consensus_$percent_threshold",
1145							-start => 1,
1146							-end => $aln->length()
1147							);
1148	1					42673	$aln->add_seq($consense)
1149							}
1150
1151							=head2 dna_to_protein()
1152
1153							Align CDS sequences according to their corresponding protein
1154							alignment. Wraps
1155							Laa_to_dna_aln()\|https://metacpan.org/pod/Bio::Align::Utilities#aa_to_dna_aln>.
1156
1157							=cut
1158
1159							sub dna_to_protein {
1160	1			1	1	5	$aln = dna_to_aa_aln($aln)
1161							}
1162
1163							sub remove_gapped_cols_in_one_seq {
1164	1			1	0	3	my $id = $opts{"rm-col"};
1165	1					3	my $nmatch=0;
1166	1					2	my $ref_seq;
1167	1					6	foreach ($aln->each_seq) {
1168	13	100				242	if ($_->id() =~ /$id/) { $nmatch++; $ref_seq = $_ }
	1					11
	1					3
1169							}
1170	1	50	33			15	die "Quit. No ref seq found or more than one ref seq!\n" if !$nmatch \|\| $nmatch > 1;
1171	1					21	my ($ct_gap, $ref) = &_get_gaps($ref_seq);
1172	1					10	warn "Original length: " . $aln->length() . "\n";
1173	1	50				303	if ($ct_gap) {
1174	1					2	my @args;
1175	1					81	push @args, [$_, $_] foreach @$ref;
1176	1					18	$aln = $aln->remove_columns(@args);
1177	1					26896	warn "New length: " . $aln->length() . "\n"
1178							} else {
1179	0					0	warn "No gap: " . $aln->length() . "\n"
1180							}
1181							}
1182
1183							sub colnum_from_residue_pos {
1184	1			1	0	13	my ($id, $pos) = split /\s,\s/, $opts{"aln-index"};
1185	1					8	print $aln->column_from_residue_number($id, $pos), "\n";
1186							exit
1187	1					349	}
1188
1189							=head2 list_ids()
1190
1191							List all sequence ids.
1192
1193							=cut
1194
1195							sub list_ids {
1196	1			1	1	3	my @ids;
1197	1					7	foreach ($aln->each_seq) { push @ids, $_->display_id() }
	13					206
1198	1					28	say join "\n", @ids;
1199							exit
1200	1					122	}
1201
1202							sub permute_states {
1203	1			1	0	8	my $new_aln = Bio::SimpleAlign->new();
1204	1					60	my $len=$aln->length();
1205	1					296	my $nseq = $aln->num_sequences();
1206	1					116	my @seq_ids;
1207
1208	1	50				6	die "Alignment contains only one sequence: $file\n" if $nseq < 2;
1209
1210	1					2	my @sites;
1211	1					5	my $ref_bases = &_get_a_site_v2();
1212	1					10	foreach (sort keys %$ref_bases) { push @seq_ids, $_ }
	13					22
1213	1					5	for (my $i=1; $i<=$len; $i++) {
1214	120					113	my @bases;
1215	120					218	foreach (keys %$ref_bases) { push @bases, $ref_bases->{$_}->{$i} }
	1560					1939
1216	120					519	@bases = shuffle(@bases);
1217	120					156	for (my $j=0; $j<$nseq; $j++) { $ref_bases->{$seq_ids[$j]}->{$i} = $bases[$j] }
	1560					2340
1218							}
1219
1220	1					3	foreach my $id (@seq_ids) {
1221	13					1625	my $seq_str;
1222	13					39	for (my $i=1; $i<=$len; $i++) { $seq_str .= $ref_bases->{$id}->{$i} }
	1560					3613
1223
1224	13					95	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
1225	13					4555	my $end = $loc_seq->end;
1226	13					776	$loc_seq->end($end);
1227	13					964	$new_aln->add_seq($loc_seq)
1228							}
1229	1					153	$aln = $new_aln
1230							}
1231
1232							=head2 protein_to_dna()
1233
1234							Align CDS sequences according to their corresponding protein
1235							alignment. Wraps
1236							Laa_to_dna_aln()https://metacpan.org/pod/Bio::Align::Utilities#aa_to_dna_aln>.
1237
1238							=cut
1239
1240							sub protein_to_dna {
1241	29			29		21403	use Bio::SeqIO;
	29					311446
	29					80878
1242	1			1	1	63	my $cds_in = Bio::SeqIO->new(-file=>$opts{pep2dna}, -format=>'fasta');
1243	1					7823	my %CDSs;
1244	1					5	while (my $seq = $cds_in->next_seq()) { $CDSs{$seq->display_id()} = $seq }
	13					3005
1245	1					93	$aln = aa_to_dna_aln($aln, \%CDSs);
1246							}
1247
1248							=head2 sample_seqs()
1249
1250							Picks I random sequences from input alignment and produces a new
1251							alignment consisting of those sequences.
1252
1253							If n is not given, default is the number of sequences in alignment
1254							divided by 2, rounded down.
1255
1256							This functionality uses an implementation of Reservoir Sampling, based
1257							on the algorithm found here:
1258							http://blogs.msdn.com/b/spt/archive/2008/02/05/reservoir-sampling.aspx
1259
1260							=cut
1261
1262							sub sample_seqs {
1263							# If option was given with no number, take the integer part of num_sequences/2
1264							# Its OK to use int() here (especially since we want to round towards 0)
1265	1			1	1	3	my $num_seqs = $aln->num_sequences;
1266	1	50				86	my $sample_size = ($opts{"resample"} == 0) ? int($num_seqs / 2) : $opts{"resample"};
1267
1268	1	50				3	die "Error: sample size ($sample_size) exceeds number of sequences in alignment: ($num_seqs)" if $sample_size > $num_seqs;
1269
1270							# Use Reservoir Sampling to pick random sequences.
1271	1					3	my @sampled = (1 .. $sample_size);
1272	1					3	for my $j ($sample_size + 1 .. $num_seqs) {
1273	7	100				49	$sampled[ rand(@sampled) ] = $j if rand() <= ($sample_size / $j)
1274							}
1275
1276	1					57	warn "Sampled the following sequences: @sampled\n\n";
1277	1					9	my $tmp_aln = $aln->select_noncont(@sampled);
1278	1					1043	$aln = $tmp_aln
1279							}
1280
1281							=head2 shuffle_sites()
1282
1283							Make a shuffled (not bootstrapped) alignment. This operation randomizes
1284							alignment columns. It is used for testing the significance of long-runs
1285							of conserved sites in an alignment (e.g., conserved intergenic spacers
1286							[IGSs]).
1287
1288							=cut
1289
1290							sub shuffle_sites {
1291	0			0	1	0	my $new_aln = Bio::SimpleAlign->new();
1292	0					0	my $len = $aln->length();
1293	0					0	my $nseq = $aln->num_sequences();
1294	0					0	my %seq_ids;
1295
1296	0	0				0	die "Alignment contains only one sequence: $file\n" if $nseq < 2;
1297
1298	0					0	my @sites;
1299	0					0	for (my $i=1; $i<=$len; $i++) {
1300	0					0	my ($ref_bases, $ref_ids) = &_get_a_site($i);
1301	0					0	%seq_ids = %{$ref_ids};
	0					0
1302	0					0	push @sites, $ref_bases
1303							}
1304
1305	0					0	@sites = shuffle(@sites);
1306
1307	0					0	my @order;
1308	0					0	push @order, $_->[0]->{pos} foreach @sites;
1309	0					0	print STDERR "Shuffled site order:\t", join(",", @order);
1310	0					0	print STDERR "\n";
1311
1312	0					0	foreach my $id (sort keys %seq_ids) {
1313	0					0	my $seq_str;
1314	0					0	foreach my $aln_site (@sites) {
1315	0	0				0	foreach (@$aln_site) { $seq_str .= $_->{nt} if $_->{id} eq $id }
	0					0
1316							}
1317
1318	0					0	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
1319	0					0	my $end = $loc_seq->end;
1320	0					0	$loc_seq->end($end);
1321	0					0	$new_aln->add_seq($loc_seq);
1322							}
1323	0					0	$aln = $new_aln;
1324							}
1325
1326							sub random_slice {
1327	0			0	0	0	my $slice_length = $opts{'random-slice'};
1328	0					0	my $len=$aln->length();
1329	0					0	my $start = int(rand($len - $slice_length+1));
1330	0					0	my $end = $start + $slice_length - 1;
1331	0					0	$aln = $aln->slice($start, $end);
1332							}
1333
1334							sub select_third_sites {
1335	1			1	0	4	my $new_aln = Bio::SimpleAlign->new();
1336	1					34	my $len=$aln->length();
1337	1					135	my $nseq = $aln->num_sequences();
1338	1					79	my @seq_ids;
1339
1340	1	50				4	die "Alignment contains only one sequence: $file\n" if $nseq < 2;
1341
1342	1					4	my $ref_bases = &_get_a_site_v2();
1343	1					10	foreach (sort keys %$ref_bases) { push @seq_ids, $_ }
	13					17
1344
1345	1					2	my @sites;
1346	1					9	for (my $i=3; $i<=$len; $i+=3) { push @sites, $i }
	40					51
1347
1348	1					4	foreach my $id (sort @seq_ids) {
1349	13					991	my $seq_str;
1350	13					213	$seq_str .= $ref_bases->{$id}->{$_} foreach @sites;
1351
1352	13					39	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
1353	13					2491	my $end = $loc_seq->end;
1354	13					430	$loc_seq->end($end);
1355
1356	13					562	$new_aln->add_seq($loc_seq)
1357							}
1358	1					80	$aln = $new_aln
1359							}
1360
1361							sub remove_third_sites {
1362	0			0	0	0	my $new_aln = Bio::SimpleAlign->new();
1363	0					0	my $len=$aln->length();
1364	0					0	my $nseq = $aln->num_sequences();
1365	0					0	my @seq_ids;
1366
1367	0	0				0	die "Alignment contains only one sequence: $file\n" if $nseq < 2;
1368
1369	0					0	my $ref_bases = &_get_a_site_v2();
1370	0					0	foreach (sort keys %$ref_bases) { push @seq_ids, $_ }
	0					0
1371
1372	0					0	my @sites;
1373	0	0				0	for (my $i=1; $i<=$len; $i++) { push @sites, $i if $i % 3 }
	0					0
1374
1375	0					0	foreach my $id (sort @seq_ids) {
1376	0					0	my $seq_str;
1377	0					0	$seq_str .= $ref_bases->{$id}->{$_} foreach @sites;
1378
1379	0					0	my $loc_seq = Bio::LocatableSeq->new(-seq => $seq_str, -id => $id, -start => 1);
1380	0					0	my $end = $loc_seq->end;
1381	0					0	$loc_seq->end($end);
1382
1383	0					0	$new_aln->add_seq($loc_seq)
1384							}
1385	0					0	$aln = $new_aln
1386							}
1387
1388							sub upper_case {
1389	1			1	0	7	$aln->uppercase()
1390							}
1391
1392							########################## internal subroutine #######################
1393
1394							# For use in draw_codon_view
1395							# Pad display ids with a minimum of 4 spaces using the longest display id as a reference point for length. Pass-by-reference, so don't return array.
1396							# Return length of longest id plus padding.
1397							sub _pad_display_id {
1398	13			13		14	my $display_id = shift;
1399	13					13	my $max_len = shift;
1400	13					15	my $padding = ($max_len - length($display_id));
1401	13					27	$display_id .= " " x $padding;
1402	13					18	return $display_id
1403							}
1404
1405							# Used by draw_codon_view. Calculates position of final position in alinged block, prints the current position there.
1406							sub _print_positions {
1407	2			2		4	my $nuc_count = shift;
1408	2					3	my $aln_length = shift;
1409	2					7	my $block_length = 3 * $opts{"codon-view"};
1410	2					4	my $max_id_len = shift;
1411	2					3	my $num_spaces = 0;
1412
1413	2					5	my $start_pos = $nuc_count + 1;
1414	2					2	my $last_pos = 0;
1415	2					4	my $offset = 0;
1416	2	100				6	if (($nuc_count + $block_length) >= $aln_length) {
1417	1					2	$last_pos = $aln_length;
1418	1					3	my $diff = $aln_length - $nuc_count;
1419	1					4	$offset = $diff + ($diff) / 3 + ($diff % 3) - 2 # $diff % 3 gives the number of extra non-codon nucleotides
1420							} else {
1421	1					2	$last_pos = $nuc_count + $block_length;
1422	1					35	$offset = $block_length + ($block_length) / 3 - 2
1423							}
1424
1425							# -1 since we are also printing the starting position.
1426	2					10	$num_spaces += $offset - 1;
1427
1428							# $last_pos_len = length of last_pos treated as a string (ie length(335) = 3)
1429	2					7	my $last_pos_len = length($last_pos);
1430
1431							# Pad $start_pos with $num_blanks blanks if it is shorter than $last_pos
1432	2					5	my $num_blanks = $last_pos_len - length($start_pos);
1433	2	50				25	$start_pos = " " x $num_blanks . "$start_pos" if length($start_pos) < $last_pos_len;
1434
1435	2					9	for (my $i = 0; $i < $last_pos_len; $i++) {
1436	5					28	print " " x $max_id_len . substr($start_pos, $i, 1) . " " x ($num_spaces) . substr($last_pos, $i, 1) . "\n"
1437							}
1438							}
1439
1440							# Function: _del_or_pick
1441							# Desc: Internal function. Generic code for either picking or deleting a sequence from an alignment. Used by del_seqs and pick_seqs.
1442							# Input:
1443							# $id_list, a user-supplied string consisting of comma-separated seq id values
1444							# $method, the name of the Bio::SimpleAlign method to use (remove_seq or add_seq)
1445							# $need_new, a flag indicating whether a new Bio::SimpleAlign object is needed
1446							# Returns: Nothing; uses the $aln global variable
1447
1448							sub _del_or_pick {
1449	2			2		6	my ($id_list, $method, $need_new) = @_;
1450	2	100				9	my $new_aln = ($need_new) ? Bio::SimpleAlign->new() : $aln;
1451
1452	2					69	my @selected = split(/\s,\s/, $id_list);
1453	2					7	foreach my $seq ($aln->each_seq) {
1454	26					562	my $seqid = $seq->display_id();
1455	26					133	foreach (@selected) {
1456	65	100				582	next unless $seqid eq $_;
1457	5					17	$new_aln->$method($seq)
1458							}
1459							}
1460	2	100				16	$aln = $new_aln if $need_new == 1
1461							}
1462
1463							sub _get_gaps {
1464	1			1		3	my $seq = shift;
1465	1					32	my $seq_str = $seq->seq();
1466	1					46	my @chars = split //, $seq_str;
1467	1					4	my $cts = 0;
1468	1					2	my @pos=();
1469	1					6	for (my $i=0; $i<=$#chars; $i++) {
1470	120	100				316	if ($chars[$i] eq '-') { push @pos, $i; $cts++ }
	30					51
	30					67
1471							}
1472	1					28	warn "Found " . scalar(@pos) ." gaps at (@pos) on " . $seq->id() . "\n";
1473	1					121	return ($cts, \@pos)
1474							}
1475
1476							sub _paste_nt {
1477	210			210		352	my $ref = shift;
1478	210					323	my @nts;
1479	210					2565	push @nts, $_->{nt} foreach @$ref;
1480							return \@nts
1481	210					701	}
1482
1483							sub _get_a_site {
1484	210			210		339	my $pos = shift;
1485	210					368	my (@chars, %seq_ids);
1486
1487	210					1179	foreach my $seq ($aln->each_seq) {
1488	2730					106906	my $ungapped = 0;
1489	2730					5458	$seq_ids{$seq->id()}++;
1490	2730					19140	my $state;
1491	2730					5216	for (my $i = 1; $i <= $pos; $i++) {
1492	147615					265485	$state = $seq->subseq($i, $i);
1493	147615	100				4033713	$ungapped++ unless $state eq '-'
1494							}
1495
1496	2730	100				5352	push @chars, {
1497							nt => $seq->subseq($pos, $pos),
1498							ungapped_pos => ($state eq '-') ? "gap" : $ungapped++,
1499							id => $seq->id(),
1500							pos => $pos,
1501							}
1502							}
1503	210					8192	return (\@chars, \%seq_ids)
1504							}
1505
1506							sub _seen_bases {
1507	0			0		0	my %count;
1508	0					0	my $ref = shift;
1509	0					0	my @array = @$ref;
1510	0					0	my $constant = 1;
1511
1512	0					0	$count{$_}++ foreach @array;
1513	0					0	return \%count;
1514							}
1515
1516							sub _is_constant {
1517	210			210		341	my %count;
1518	210					415	my $ref = shift;
1519	210					1155	my @array = @$ref;
1520	210					346	my $constant = 1;
1521
1522	210					1169	$count{$_}++ foreach @array;
1523	210					493	my @keys = keys %count;
1524	210	100				632	$constant = 0 if @keys > 1;
1525	210					759	return $constant
1526							}
1527
1528							sub _column_status {
1529	0			0		0	my %count;
1530	0					0	my $ref = shift;
1531	0					0	my @array = @$ref;
1532	0					0	my $st = { gap => 0, informative => 1, constant => 1 };
1533
1534	0					0	foreach (@array) {
1535	0					0	$count{$_}++;
1536	0	0				0	$st->{gap} = 1 if $_ =~ /[\-\?]/
1537							}
1538
1539	0					0	my @keys = keys %count;
1540	0					0	foreach (values %count) {
1541	0	0				0	if ($_ < 2) { $st->{informative} = 0; last }
	0					0
	0					0
1542							}
1543	0	0				0	$st->{constant} = 0 if @keys > 1;
1544	0					0	return $st
1545							}
1546
1547							sub _get_a_site_v2 {
1548	2			2		4	my %seq_ids;
1549	2					7	my $len = $aln->length();
1550	2					386	foreach my $seq ($aln->each_seq) {
1551	26					772	my $id = $seq->id();
1552	26					188	for (my $i = 1; $i <= $len; $i++) { $seq_ids{$id}{$i} = $seq->subseq($i, $i) }
	3120					70198
1553							}
1554	2					45	return \%seq_ids
1555							}
1556
1557							sub _find_max_id_len {
1558	1			1		1	my $seqs = shift;
1559	1					5	my @sorted_by_length = sort {length $a->display_id <=> length $b->display_id} @$seqs;
	32					188
1560	1					9	return length $sorted_by_length[-1]->display_id
1561							}
1562
1563							1;
1564							__END__