File Coverage

Bio/SimpleAlign.pm

Criterion	Covered	Total	%
statement	1013	1175	86.2
branch	315	456	69.0
condition	136	218	62.3
subroutine	80	96	83.3
pod	70	78	89.7
total	1614	2023	79.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::SimpleAlign;
2	48			48		2001	use strict;
	48					128
	48					1295
3	48			48		207	use warnings;
	48					71
	48					1311
4	48			48		2775	use Bio::LocatableSeq; # uses Seq's as list
	48					112
	48					915
5	48			48		4105	use Bio::Seq;
	48					1080
	48					909
6	48			48		10046	use Bio::SeqFeature::Generic;
	48					121
	48					1652
7
8	48			48		304	use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
	48					82
	48					390
9
10							# BioPerl module for SimpleAlign
11							#
12							# Please direct questions and support issues to
13							#
14							# Cared for by Heikki Lehvaslaiho
15							#
16							# Copyright Ewan Birney
17							#
18							# You may distribute this module under the same terms as perl itself
19
20							# POD documentation - main docs before the code
21							#
22							# History:
23							# 11/3/00 Added threshold feature to consensus and consensus_aa - PS
24							# May 2001 major rewrite - Heikki Lehvaslaiho
25
26							=head1 NAME
27
28							Bio::SimpleAlign - Multiple alignments held as a set of sequences
29
30							=head1 SYNOPSIS
31
32							# Use Bio::AlignIO to read in the alignment
33							$str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
34							$aln = $str->next_aln();
35
36							# Describe
37							print $aln->length;
38							print $aln->num_residues;
39							print $aln->is_flush;
40							print $aln->num_sequences;
41							print $aln->score;
42							print $aln->percentage_identity;
43							print $aln->consensus_string(50);
44
45							# Find the position in the alignment for a sequence location
46							$pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
47
48							# Extract sequences and check values for the alignment column $pos
49							foreach $seq ($aln->each_seq) {
50							$res = $seq->subseq($pos, $pos);
51							$count{$res}++;
52							}
53							foreach $res (keys %count) {
54							printf "Res: %s Count: %2d\n", $res, $count{$res};
55							}
56
57							# Manipulate
58							$aln->remove_seq($seq);
59							$mini_aln = $aln->slice(20,30); # get a block of columns
60							$mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
61							$new_aln = $aln->remove_columns([20,30]); # remove by position
62							$new_aln = $aln->remove_columns(['mismatch']); # remove by property
63
64							# Analyze
65							$str = $aln->consensus_string($threshold_percent);
66							$str = $aln->match_line();
67							$str = $aln->cigar_line();
68							$id = $aln->percentage_identity;
69
70							# See the module documentation for details and more methods.
71
72							=head1 DESCRIPTION
73
74							SimpleAlign is an object that handles a multiple sequence alignment
75							(MSA). It is very permissive of types (it does not insist on sequences
76							being all same length, for example). Think of it as a set of sequences
77							with a whole series of built-in manipulations and methods for reading and
78							writing alignments.
79
80							SimpleAlign uses L, a subclass of L,
81							to store its sequences. These are subsequences with a start and end
82							positions in the parent reference sequence. Each sequence in the
83							SimpleAlign object is a Bio::LocatableSeq.
84
85							SimpleAlign expects the combination of name, start, and end for a
86							given sequence to be unique in the alignment, and this is the key for the
87							internal hashes (name, start, end are abbreviated C in the code).
88							However, in some cases people do not want the name/start-end to be displayed:
89							either multiple names in an alignment or names specific to the alignment
90							(ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
91							C, and generally is what is used to print out the
92							alignment. They default to name/start-end.
93
94							The SimpleAlign Module is derived from the Align module by Ewan Birney.
95
96							=head1 FEEDBACK
97
98							=head2 Mailing Lists
99
100							User feedback is an integral part of the evolution of this and other
101							Bioperl modules. Send your comments and suggestions preferably to one
102							of the Bioperl mailing lists. Your participation is much appreciated.
103
104							bioperl-l@bioperl.org - General discussion
105							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
106
107							=head2 Support
108
109							Please direct usage questions or support issues to the mailing list:
110
111							I
112
113							rather than to the module maintainer directly. Many experienced and
114							reponsive experts will be able look at the problem and quickly
115							address it. Please include a thorough description of the problem
116							with code and data examples if at all possible.
117
118							=head2 Reporting Bugs
119
120							Report bugs to the Bioperl bug tracking system to help us keep track
121							the bugs and their resolution. Bug reports can be submitted via the
122							web:
123
124							https://github.com/bioperl/bioperl-live/issues
125
126							=head1 AUTHOR
127
128							Ewan Birney, birney@ebi.ac.uk
129
130							=head1 CONTRIBUTORS
131
132							Allen Day, allenday-at-ucla.edu,
133							Richard Adams, Richard.Adams-at-ed.ac.uk,
134							David J. Evans, David.Evans-at-vir.gla.ac.uk,
135							Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
136							Allen Smith, allens-at-cpan.org,
137							Jason Stajich, jason-at-bioperl.org,
138							Anthony Underwood, aunderwood-at-phls.org.uk,
139							Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
140							Brian Osborne, bosborne at alum.mit.edu
141							Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
142							Hongyu Zhang, forward at hongyu.org
143							Jay Hannah, jay at jays.net
144							Alexandr Bezginov, albezg at gmail.com
145
146							=head1 SEE ALSO
147
148							L
149
150							=head1 APPENDIX
151
152							The rest of the documentation details each of the object
153							methods. Internal methods are usually preceded with a _
154
155							=cut
156
157							## This data should probably be in a more centralized module...
158							## it is taken from Clustalw documentation.
159							## These are all the positively scoring groups that occur in the
160							## Gonnet Pam250 matrix. The strong and weak groups are
161							## defined as strong score >0.5 and weak score =<0.5 respectively.
162							our %CONSERVATION_GROUPS = (
163							'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
164							'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
165							);
166
167
168							=head2 new
169
170							Title : new
171							Usage : my $aln = Bio::SimpleAlign->new();
172							Function : Creates a new simple align object
173							Returns : Bio::SimpleAlign
174							Args : -source => string representing the source program
175							where this alignment came from
176							-annotation => Bio::AnnotationCollectionI
177							-seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
178							-seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
179							-consensus => consensus string
180							-consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
181
182							=cut
183
184
185							sub new {
186	291			291	1	1101	my($class,@args) = @_;
187
188	291					1294	my $self = $class->SUPER::new(@args);
189
190	291					1667	my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
191							SOURCE
192							SCORE
193							ID
194							ACCESSION
195							DESCRIPTION
196							SEQS
197							FEATURES
198							ANNOTATION
199							SEQ_ANNOTATION
200							CONSENSUS
201							CONSENSUS_META
202							)], @args);
203	291	100				1260	$src && $self->source($src);
204	291	100				695	defined $score && $self->score($score);
205							# we need to set up internal hashs first!
206
207	291					832	$self->{'_seq'} = {};
208	291					554	$self->{'_order'} = {};
209	291					523	$self->{'_start_end_lists'} = {};
210	291					532	$self->{'_dis_name'} = {};
211	291					507	$self->{'_id'} = 'NoName';
212							# maybe we should automatically read in from args. Hmmm...
213	291	100				651	$id && $self->id($id);
214	291	100				643	$acc && $self->accession($acc);
215	291	100				601	$desc && $self->description($desc);
216	291	100				564	$coll && $self->annotation($coll);
217							# sequence annotation is layered into a provided annotation collection (or dies)
218	291	50				577	if ($sa) {
219	0	0				0	$self->throw("Must supply an alignment-based annotation collection (-annotation) ".
220							"with a sequence annotation collection")
221							if !$coll;
222	0					0	$coll->add_Annotation('seq_annotation', $sa);
223							}
224	291	100	66			690	if ($feats && ref $feats eq 'ARRAY') {
225	1					4	for my $feat (@$feats) {
226	6					9	$self->add_SeqFeature($feat);
227							}
228							}
229	291	50				647	$con && $self->consensus($con);
230	291	100				643	$cmeta && $self->consensus_meta($cmeta);
231							# assumes these are in correct alignment order
232	291	100	66			831	if ($seqs && ref($seqs) eq 'ARRAY') {
233	9					25	for my $seq (@$seqs) {
234	168					256	$self->add_seq($seq);
235							}
236							}
237
238	291					845	return $self; # success - we hope!
239							}
240
241							=head1 Modifier methods
242
243							These methods modify the MSA by adding, removing or shuffling complete
244							sequences.
245
246							=head2 add_seq
247
248							Title : add_seq
249							Usage : $myalign->add_seq($newseq);
250							$myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
251							Function : Adds another sequence to the alignment. Does not align
252							it - just adds it to the hashes.
253							If -ORDER is specified, the sequence is inserted at the
254							the position spec'd by -ORDER, and existing sequences
255							are pushed down the storage array.
256							Returns : nothing
257							Args : A Bio::LocatableSeq object
258							Positive integer for the sequence position (optional)
259
260							See L for more information
261
262							=cut
263
264							sub addSeq {
265	0			0	0	0	my $self = shift;
266	0					0	$self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
267	0					0	$self->add_seq(@_);
268							}
269
270							sub add_seq {
271	2420			2420	1	3290	my $self = shift;
272	2420					3548	my @args = @_;
273	2420					6462	my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
274	2420					3927	my ($name,$id,$start,$end);
275
276	2420	50				3896	unless ($seq) {
277	0					0	$self->throw("LocatableSeq argument required");
278							}
279	2420	50	33			9745	if( ! ref $seq \|\| ! $seq->isa('Bio::LocatableSeq') ) {
280	0					0	$self->throw("Unable to process non locatable sequences [". ref($seq). "]");
281							}
282	2420	100				4152	!defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
	2417					4285
283	2420					2838	$order--; # jay's patch (user-specified order is 1-origin)
284
285	2420	100				4007	if ($order < 0) {
286	1					6	$self->throw("User-specified value for ORDER must be >= 1");
287							}
288
289	2419		66			4181	$id = $seq->id() \|\|$seq->display_id \|\| $seq->primary_id;
290
291							# build the symbol list for this sequence,
292							# will prune out the gap and missing/match chars
293							# when actually asked for the symbol list in the
294							# symbol_chars
295							# map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
296
297	2419					4768	$name = $seq->get_nse;
298
299	2419	50				4700	if( $self->{'_seq'}->{$name} ) {
300	0	0				0	$self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
301							}
302							else {
303	2419					8405	$self->debug( "Assigning $name to $order\n");
304
305	2419					3463	my $ordh = $self->{'_order'};
306	2419	50				4135	if ($ordh->{$order}) {
307							# make space to insert
308							# $c->() returns (in reverse order) the first subsequence
309							# of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
310							# (3,2,1), and $c->(2,4,5) returns (2).
311	0					0	my $c;
312	0	0		0		0	$c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
	0					0
313							map {
314	0					0	$ordh->{$_+1} = $ordh->{$_}
315	0					0	} $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
	0					0
	0					0
	0					0
316
317							}
318	2419					4341	$ordh->{$order} = $name;
319
320	2419	100				4247	unless( exists( $self->{'_start_end_lists'}->{$id})) {
321	2400					4544	$self->{'_start_end_lists'}->{$id} = [];
322							}
323	2419					2601	push @{$self->{'_start_end_lists'}->{$id}}, $seq;
	2419					5160
324							}
325
326	2419					7918	$self->{'_seq'}->{$name} = $seq;
327
328							}
329
330
331							=head2 remove_seq
332
333							Title : remove_seq
334							Usage : $aln->remove_seq($seq);
335							Function : Removes a single sequence from an alignment
336							Returns :
337							Argument : a Bio::LocatableSeq object
338
339							=cut
340
341							sub removeSeq {
342	0			0	0	0	my $self = shift;
343	0					0	$self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
344	0					0	$self->remove_seq(@_);
345							}
346
347							sub remove_seq {
348	13			13	1	17	my $self = shift;
349	13					14	my $seq = shift;
350	13					16	my ($name,$id);
351
352	13	50	33			65	$self->throw("Need Bio::Locatable seq argument ")
353							unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
354
355	13					28	$id = $seq->id();
356	13					25	$name = $seq->get_nse;
357
358	13	50				28	if( !exists $self->{'_seq'}->{$name} ) {
359	0					0	$self->throw("Sequence $name does not exist in the alignment to remove!");
360							}
361
362	13					19	delete $self->{'_seq'}->{$name};
363
364							# we need to remove this seq from the start_end_lists hash
365
366	13	50				22	if (exists $self->{'_start_end_lists'}->{$id}) {
367							# we need to find the sequence in the array.
368
369	13					13	my ($i, $found);;
370	13					15	for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
	13					26
371	13	50				13	if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
	13					33
372	13					14	$found = 1;
373	13					15	last;
374							}
375							}
376	13	50				21	if ($found) {
377	13					12	splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
	13					35
378							}
379							else {
380	0					0	$self->throw("Could not find the sequence to remoce from the start-end list");
381							}
382							}
383							else {
384	0					0	$self->throw("There is no seq list for the name $id");
385							}
386							# we need to shift order hash
387	13					15	my %rev_order = reverse %{$self->{'_order'}};
	13					70
388	13					23	my $no = $rev_order{$name};
389	13					24	my $num_sequences = $self->num_sequences;
390	13					21	for (; $no < $num_sequences; $no++) {
391	94					151	$self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
392							}
393	13					16	delete $self->{'_order'}->{$no};
394	13					37	return 1;
395							}
396
397
398							=head2 purge
399
400							Title : purge
401							Usage : $aln->purge(0.7);
402							Function: Removes sequences above given sequence similarity
403							This function will grind on large alignments. Beware!
404							Example :
405							Returns : An array of the removed sequences
406							Args : float, threshold for similarity
407
408							=cut
409
410							sub purge {
411	1			1	1	2	my ($self,$perc) = @_;
412	1					2	my (%duplicate, @dups);
413
414	1					4	my @seqs = $self->each_seq();
415
416	1					4	for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
417	15					19	my $seq = $seqs[$i];
418
419							#skip if already in duplicate hash
420	15	100				24	next if exists $duplicate{$seq->display_id} ;
421	4					12	my $one = $seq->seq();
422
423	4					77	my @one = split '', $one; #split to get 1aa per array element
424
425	4					11	for (my $j=$i+1;$j < @seqs;$j++) {
426	28					36	my $seq2 = $seqs[$j];
427
428							#skip if already in duplicate hash
429	28	100				54	next if exists $duplicate{$seq2->display_id} ;
430
431	27					54	my $two = $seq2->seq();
432	27					454	my @two = split '', $two;
433
434	27					28	my $count = 0;
435	27					24	my $res = 0;
436	27					39	for (my $k=0;$k<@one;$k++) {
437	6534	100	66			21282	if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
			66
			100
438							$one[$k] eq $two[$k]) {
439	4329					3497	$count++;
440							}
441	6534	100	66			25816	if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
			66
			66
			66
442							$two[$k] ne '.' && $two[$k] ne '-' ) {
443	6320					7397	$res++;
444							}
445							}
446
447	27					28	my $ratio = 0;
448	27	50				45	$ratio = $count/$res unless $res == 0;
449
450							# if above threshold put in duplicate hash and push onto
451							# duplicate array for returning to get_unique
452	27	100				190	if ( $ratio > $perc ) {
453	12	50				39	$self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
454	12					26	$duplicate{$seq2->display_id} = 1;
455	12					142	push @dups, $seq2;
456							}
457							}
458							}
459	1					5	foreach my $seq (@dups) {
460	12					17	$self->remove_seq($seq);
461							}
462	1					13	return @dups;
463							}
464
465							=head2 sort_alphabetically
466
467							Title : sort_alphabetically
468							Usage : $ali->sort_alphabetically
469							Function : Changes the order of the alignment to alphabetical on name
470							followed by numerical by number.
471							Returns :
472							Argument :
473
474							=cut
475
476							sub sort_alphabetically {
477	1			1	1	3	my $self = shift;
478	1					2	my ($seq,$nse,@arr,%hash,$count);
479
480	1					2	foreach $seq ( $self->each_seq() ) {
481	3					5	$nse = $seq->get_nse;
482	3					6	$hash{$nse} = $seq;
483							}
484
485	1					2	$count = 0;
486
487	1					1	%{$self->{'_order'}} = (); # reset the hash;
	1					4
488
489	1					5	foreach $nse ( sort _alpha_startend keys %hash) {
490	3					6	$self->{'_order'}->{$count} = $nse;
491
492	3					3	$count++;
493							}
494	1					5	1;
495							}
496
497							=head2 sort_by_list
498
499							Title : sort_by_list
500							Usage : $aln_ordered=$aln->sort_by_list($list_file)
501							Function : Arbitrarily order sequences in an alignment
502							Returns : A new Bio::SimpleAlign object
503							Argument : a file listing sequence names in intended order (one name per line)
504
505							=cut
506
507							sub sort_by_list {
508	1			1	1	6	my ($self, $list) = @_;
509	1					1	my (@seq, @ids, %order);
510
511	1					4	foreach my $seq ( $self->each_seq() ) {
512	6					9	push @seq, $seq;
513	6					8	push @ids, $seq->display_id;
514							}
515
516	1					2	my $ct=1;
517	1	50				30	open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
518	1					20	while (<$listfh>) {
519	6					9	chomp;
520	6					6	my $name=$_;
521	6	50				9	$self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
522	6					15	$order{$name}=$ct++;
523							}
524	1					6	close($listfh);
525
526							# use the map-sort-map idiom:
527	1					2	my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
	6					7
	7					9
	6					12
528	1					3	my $aln = $self->new;
529	1					2	foreach (@sorted) { $aln->add_seq($_) }
	6					9
530	1					7	return $aln;
531							}
532
533							=head2 set_new_reference
534
535							Title : set_new_reference
536							Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
537							the sequence whoes name is "B31" (full, exact, and case-sensitive),
538							as the reference (1st) sequence
539							Function : Change/Set a new reference (i.e., the first) sequence
540							Returns : a new Bio::SimpleAlign object.
541							Throws an exception if designated sequence not found
542							Argument : a positive integer of sequence order, or a sequence name
543							in the original alignment
544
545							=cut
546
547							sub set_new_reference {
548	2			2	1	8	my ($self, $seqid) = @_;
549	2					5	my $aln = $self->new;
550	2					3	my (@seq, @ids, @new_seq);
551	2					2	my $is_num=0;
552	2					4	foreach my $seq ( $self->each_seq() ) {
553	12					13	push @seq, $seq;
554	12					18	push @ids, $seq->display_id;
555							}
556
557	2	100				11	if ($seqid =~ /^\d+$/) { # argument is seq position
558	1					2	$is_num=1;
559	1	50	33			5	$self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 \|\| $seqid > $self->num_sequences);
560							} else { # argument is a seq name
561	1	50				4	$self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
562							}
563
564	2					6	for (my $i=0; $i<=$#seq; $i++) {
565	12					14	my $pos=$i+1;
566	12	100	100			32	if ( ($is_num && $pos == $seqid) \|\| ($seqid eq $seq[$i]->display_id) ) {
			100
567	2					5	unshift @new_seq, $seq[$i];
568							} else {
569	10					19	push @new_seq, $seq[$i];
570							}
571							}
572	2					3	foreach (@new_seq) { $aln->add_seq($_); }
	12					21
573	2					7	return $aln;
574							}
575
576							sub _in_aln { # check if input name exists in the alignment
577	7			7		11	my ($str, $ref) = @_;
578	7					9	foreach (@$ref) {
579	24	100				39	return 1 if $str eq $_;
580							}
581	0					0	return 0;
582							}
583
584
585							=head2 uniq_seq
586
587							Title : uniq_seq
588							Usage : $aln->uniq_seq(): Remove identical sequences in
589							in the alignment. Ambiguous base ("N", "n") and
590							leading and ending gaps ("-") are NOT counted as
591							differences.
592							Function : Make a new alignment of unique sequence types (STs)
593							Returns : 1a. if called in a scalar context,
594							a new Bio::SimpleAlign object (all sequences renamed as "ST")
595							1b. if called in an array context,
596							a new Bio::SimpleAlign object, and a hashref whose keys
597							are sequence types, and whose values are arrayrefs to
598							lists of sequence ids within the corresponding sequence type
599							2. if $aln->verbose > 0, ST of each sequence is sent to
600							STDERR (in a tabular format)
601							Argument : None
602
603							=cut
604
605							sub uniq_seq {
606	1			1	1	6	my ($self, $seqid) = @_;
607	1					3	my $aln = $self->new;
608	1					3	my (%member, %order, @seq, @uniq_str, $st);
609	1					2	my $order=0;
610	1					2	my $len = $self->length();
611	1					2	$st = {};
612	1					2	foreach my $seq ( $self->each_seq() ) {
613	15					27	my $str = $seq->seq();
614
615							# it's necessary to ignore "n", "N", leading gaps and ending gaps in
616							# comparing two sequence strings
617
618							# 1st, convert "n", "N" to "?" (for DNA sequence only):
619	15	50				46	$str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
620							# 2nd, convert leading and ending gaps to "?":
621	15					24	$str = &_convert_leading_ending_gaps($str, '-', '?');
622							# Note that '?' also can mean unknown residue.
623							# I don't like making global class member changes like this, too
624							# prone to errors... -- cjfields 08-11-18
625	15					21	local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
626	15					26	my $new = Bio::LocatableSeq->new(
627							-id => $seq->id(),
628							-alphabet=> $seq->alphabet,
629							-seq => $str,
630							-start => $seq->start,
631							-end => $seq->end
632							);
633	15					35	push @seq, $new;
634							}
635
636	1					4	foreach my $seq (@seq) {
637	15					48	my $str = $seq->seq();
638	15					30	my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
639	15	100				31	if ($seen) { # seen before
640	4					5	my @memb = @{$member{$key}};
	4					10
641	4					6	push @memb, $seq;
642	4					8	$member{$key} = \@memb;
643							} else { # not seen
644	11					28	push @uniq_str, $key;
645	11					12	$order++;
646	11					55	$member{$key} = [ ($seq) ];
647	11					26	$order{$key} = $order;
648							}
649							}
650
651	1					17	foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
	26					31
652							# convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
653	11					35	my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
654							# convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
655	11	50				53	$str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
656	11					13	my $gap='-';
657	11					14	my $end= CORE::length($str2);
658	11					339	$end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
659	11					55	my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
660							-seq =>$str2,
661							-start=>1,
662							-end =>$end
663							);
664	11					29	$aln->add_seq($new);
665	11					12	foreach (@{$member{$str}}) {
	11					24
666	15					16	push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
	15					39
667	15					24	$self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
668							}
669							}
670	1	50				16	return wantarray ? ($aln, $st) : $aln;
671							}
672
673							sub _check_uniq { # check if same seq exists in the alignment
674	15			15		20	my ($str1, $ref, $length) = @_;
675	15					424	my @char1=split //, $str1;
676	15					47	my @array=@$ref;
677
678	15	100				42	return (0, $str1) if @array==0; # not seen (1st sequence)
679
680	14					17	foreach my $str2 (@array) {
681	59					59	my $diff=0;
682	59					1642	my @char2=split //, $str2;
683	59					97	for (my $i=0; $i<=$length-1; $i++) {
684	24249	100				26646	next if $char1[$i] eq '?';
685	22382	100				23830	next if $char2[$i] eq '?';
686	21740	100				30883	$diff++ if $char1[$i] ne $char2[$i];
687							}
688	59	100				944	return (1, $str2) if $diff == 0; # seen before
689							}
690
691	10					179	return (0, $str1); # not seen
692							}
693
694							sub _convert_leading_ending_gaps {
695	26			26		30	my $s=shift;
696	26					23	my $sym1=shift;
697	26					27	my $sym2=shift;
698	26					815	my @array=split //, $s;
699							# convert leading char:
700	26					52	for (my $i=0; $i<=$#array; $i++) {
701	369	100				562	($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
702							}
703							# convert ending char:
704	26					42	for (my $i = $#array; $i>= 0; $i--) {
705	552	100				738	($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
706							}
707	26					250	my $s_new=join '', @array;
708	26					412	return $s_new;
709							}
710
711							=head1 Sequence selection methods
712
713							Methods returning one or more sequences objects.
714
715							=head2 each_seq
716
717							Title : each_seq
718							Usage : foreach $seq ( $align->each_seq() )
719							Function : Gets a Seq object from the alignment
720							Returns : Seq object
721							Argument :
722
723							=cut
724
725							sub eachSeq {
726	0			0	0	0	my $self = shift;
727	0					0	$self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
728	0					0	$self->each_seq();
729							}
730
731							sub each_seq {
732	7635			7635	1	8247	my $self = shift;
733	7635					7467	my (@arr,$order);
734
735	7635					7182	foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
	127161					113316
	7635					21482
736	56120	100				79964	if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
737	56107					69533	push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
738							}
739							}
740	7635					17268	return @arr;
741							}
742
743
744							=head2 each_alphabetically
745
746							Title : each_alphabetically
747							Usage : foreach $seq ( $ali->each_alphabetically() )
748							Function : Returns a sequence object, but the objects are returned
749							in alphabetically sorted order.
750							Does not change the order of the alignment.
751							Returns : Seq object
752							Argument :
753
754							=cut
755
756							sub each_alphabetically {
757	2			2	1	5	my $self = shift;
758	2					4	my ($seq,$nse,@arr,%hash,$count);
759
760	2					5	foreach $seq ( $self->each_seq() ) {
761	32					46	$nse = $seq->get_nse;
762	32					48	$hash{$nse} = $seq;
763							}
764
765	2					14	foreach $nse ( sort _alpha_startend keys %hash) {
766	32					34	push(@arr,$hash{$nse});
767							}
768	2					9	return @arr;
769							}
770
771							sub _alpha_startend {
772	97			97		93	my ($aname,$astart,$bname,$bstart);
773	97					130	($aname,$astart) = split (/-/,$a);
774	97					131	($bname,$bstart) = split (/-/,$b);
775
776	97	50				112	if( $aname eq $bname ) {
777	0					0	return $astart <=> $bstart;
778							}
779							else {
780	97					103	return $aname cmp $bname;
781							}
782							}
783
784							=head2 each_seq_with_id
785
786							Title : each_seq_with_id
787							Usage : foreach $seq ( $align->each_seq_with_id() )
788							Function : Gets a Seq objects from the alignment, the contents
789							being those sequences with the given name (there may be
790							more than one)
791							Returns : Seq object
792							Argument : a seq name
793
794							=cut
795
796							sub eachSeqWithId {
797	0			0	0	0	my $self = shift;
798	0					0	$self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
799	0					0	$self->each_seq_with_id(@_);
800							}
801
802							sub each_seq_with_id {
803	89			89	1	860	my $self = shift;
804	89					104	my $id = shift;
805
806	89	50				142	$self->throw("Method each_seq_with_id needs a sequence name argument")
807							unless defined $id;
808
809	89					106	my (@arr, $seq);
810
811	89	50				173	if (exists($self->{'_start_end_lists'}->{$id})) {
812	89					99	@arr = @{$self->{'_start_end_lists'}->{$id}};
	89					157
813							}
814	89					178	return @arr;
815							}
816
817							=head2 get_seq_by_pos
818
819							Title : get_seq_by_pos
820							Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
821							Function : Gets a sequence based on its position in the alignment.
822							Numbering starts from 1. Sequence positions larger than
823							num_sequences() will throw an error.
824							Returns : a Bio::LocatableSeq object
825							Args : positive integer for the sequence position
826
827							=cut
828
829							sub get_seq_by_pos {
830
831	258			258	1	15533	my $self = shift;
832	258					422	my ($pos) = @_;
833
834	258	50	33			1737	$self->throw("Sequence position has to be a positive integer, not [$pos]")
835							unless $pos =~ /^\d+$/ and $pos > 0;
836	258	50				635	$self->throw("No sequence at position [$pos]")
837							unless $pos <= $self->num_sequences ;
838
839	258					666	my $nse = $self->{'_order'}->{--$pos};
840	258					908	return $self->{'_seq'}->{$nse};
841							}
842
843							=head2 get_seq_by_id
844
845							Title : get_seq_by_id
846							Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
847							Function : Gets a sequence based on its name.
848							Sequences that do not exist will warn and return undef
849							Returns : a Bio::LocatableSeq object
850							Args : string for sequence name
851
852							=cut
853
854							sub get_seq_by_id {
855	18			18	1	2510	my ($self,$name) = @_;
856	18	50				37	unless( defined $name ) {
857	0					0	$self->warn("Must provide a sequence name");
858	0					0	return;
859							}
860	18					21	for my $seq ( values %{$self->{'_seq'}} ) {
	18					68
861	44	100				83	if ( $seq->id eq $name) {
862	18					238	return $seq;
863							}
864							}
865	0					0	return;
866							}
867
868							=head2 seq_with_features
869
870							Title : seq_with_features
871							Usage : $seq = $aln->seq_with_features(-pos => 1,
872							-consensus => 60
873							-mask =>
874							sub { my $consensus = shift;
875
876							for my $i (1..5){
877							my $n = 'N' x $i;
878							my $q = '\?' x $i;
879							while($consensus =~ /[^?]$q[^?]/){
880							$consensus =~ s/([^?])$q([^?])/$1$n$2/;
881							}
882							}
883							return $consensus;
884							}
885							);
886							Function: produces a Bio::Seq object by first splicing gaps from -pos
887							(by means of a splice_by_seq_pos() call), then creating
888							features using non-? chars (by means of a consensus_string()
889							call with stringency -consensus).
890							Returns : a Bio::Seq object
891							Args : -pos : required. sequence from which to build the Bio::Seq
892							object
893							-consensus : optional, defaults to consensus_string()'s
894							default cutoff value
895							-mask : optional, a coderef to apply to consensus_string()'s
896							output before building features. this may be useful for
897							closing gaps of 1 bp by masking over them with N, for
898							instance
899
900							=cut
901
902							sub seq_with_features{
903	0			0	1	0	my ($self,%arg) = @_;
904
905							#first do the preparatory splice
906	0	0				0	$self->throw("must provide a -pos argument") unless $arg{-pos};
907	0					0	$self->splice_by_seq_pos($arg{-pos});
908
909	0					0	my $consensus_string = $self->consensus_string($arg{-consensus});
910							$consensus_string = $arg{-mask}->($consensus_string)
911	0	0				0	if defined($arg{-mask});
912
913	0					0	my(@bs,@es);
914
915	0	0				0	push @bs, 1 if $consensus_string =~ /^[^?]/;
916
917	0					0	while($consensus_string =~ /\?[^?]/g){
918	0					0	push @bs, pos($consensus_string);
919							}
920	0					0	while($consensus_string =~ /[^?]\?/g){
921	0					0	push @es, pos($consensus_string);
922							}
923
924	0	0				0	push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
925
926	0					0	my $seq = Bio::Seq->new();
927
928							# my $rootfeature = Bio::SeqFeature::Generic->new(
929							# -source_tag => 'location',
930							# -start => $self->get_seq_by_pos($arg{-pos})->start,
931							# -end => $self->get_seq_by_pos($arg{-pos})->end,
932							# );
933							# $seq->add_SeqFeature($rootfeature);
934
935	0					0	while(my $b = shift @bs){
936	0					0	my $e = shift @es;
937							$seq->add_SeqFeature(
938							Bio::SeqFeature::Generic->new(
939							-start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
940	0		0			0	-end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
941							-source_tag => $self->source \|\| 'MSA',
942							)
943							);
944							}
945
946	0					0	return $seq;
947							}
948
949
950							=head1 Create new alignments
951
952							The result of these methods are horizontal or vertical subsets of the
953							current MSA.
954
955							=head2 select
956
957							Title : select
958							Usage : $aln2 = $aln->select(1, 3) # three first sequences
959							Function : Creates a new alignment from a continuous subset of
960							sequences. Numbering starts from 1. Sequence positions
961							larger than num_sequences() will throw an error.
962							Returns : a Bio::SimpleAlign object
963							Args : positive integer for the first sequence
964							positive integer for the last sequence to include (optional)
965
966							=cut
967
968							sub select {
969	1			1	1	3	my $self = shift;
970	1					3	my ($start, $end) = @_;
971
972	1	50	33			30	$self->throw("Select start has to be a positive integer, not [$start]")
973							unless $start =~ /^\d+$/ and $start > 0;
974	1	50	33			8	$self->throw("Select end has to be a positive integer, not [$end]")
975							unless $end =~ /^\d+$/ and $end > 0;
976	1	50				3	$self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
977							unless $start <= $end;
978
979	1					5	my $aln = $self->new;
980	1					4	foreach my $pos ($start .. $end) {
981	3					5	$aln->add_seq($self->get_seq_by_pos($pos));
982							}
983	1					5	$aln->id($self->id);
984							# fix for meta, sf, ann
985	1					2	return $aln;
986							}
987
988							=head2 select_noncont
989
990							Title : select_noncont
991							Usage : # 1st and 3rd sequences, sorted
992							$aln2 = $aln->select_noncont(1, 3)
993
994							# 1st and 3rd sequences, sorted (same as first)
995							$aln2 = $aln->select_noncont(3, 1)
996
997							# 1st and 3rd sequences, unsorted
998							$aln2 = $aln->select_noncont('nosort',3, 1)
999
1000							Function : Creates a new alignment from a subset of sequences. Numbering
1001							starts from 1. Sequence positions larger than num_sequences() will
1002							throw an error. Sorts the order added to new alignment by default,
1003							to prevent sorting pass 'nosort' as the first argument in the list.
1004							Returns : a Bio::SimpleAlign object
1005							Args : array of integers for the sequences. If the string 'nosort' is
1006							passed as the first argument, the sequences will not be sorted
1007							in the new alignment but will appear in the order listed.
1008
1009							=cut
1010
1011							sub select_noncont {
1012	8			8	1	24	my $self = shift;
1013	8					13	my $nosort = 0;
1014	8					22	my (@pos) = @_;
1015	8	100				52	if ($pos[0] !~ m{^\d+$}) {
1016	1					3	my $sortcmd = shift @pos;
1017	1	50				8	if ($sortcmd eq 'nosort') {
1018	1					2	$nosort = 1;
1019							} else {
1020	0					0	$self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1021							}
1022							}
1023
1024	8					25	my $end = $self->num_sequences;
1025	8					20	foreach ( @pos ) {
1026	32	50	33			193	$self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
			33
1027							unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1028							}
1029
1030	8	100				31	@pos = sort {$a <=> $b} @pos unless $nosort;
	25					34
1031
1032	8					31	my $aln = $self->new;
1033	8					18	foreach my $p (@pos) {
1034	32					59	$aln->add_seq($self->get_seq_by_pos($p));
1035							}
1036	8					29	$aln->id($self->id);
1037							# fix for meta, sf, ann
1038	8					30	return $aln;
1039							}
1040
1041							=head2 select_noncont_by_name
1042
1043							Title : select_noncont_by_name
1044							Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1045							Function : Creates a new alignment from a subset of sequences which are
1046							selected by name (sequence ID).
1047							Returns : a Bio::SimpleAlign object
1048							Args : array of names (i.e., identifiers) for the sequences.
1049
1050							=cut
1051
1052							sub select_noncont_by_name {
1053	1			1	1	4	my ($self, @names) = @_;
1054
1055	1					3	my $aln = $self->new;
1056	1					2	foreach my $name (@names) {
1057	3					6	$aln->add_seq($self->get_seq_by_id($name));
1058							}
1059	1					3	$aln->id($self->id);
1060
1061	1					3	return $aln;
1062							}
1063
1064							=head2 slice
1065
1066							Title : slice
1067							Usage : $aln2 = $aln->slice(20,30)
1068							Function : Creates a slice from the alignment inclusive of start and
1069							end columns, and the first column in the alignment is denoted 1.
1070							Sequences with no residues in the slice are excluded from the
1071							new alignment and a warning is printed. Slice beyond the length of
1072							the sequence does not do padding.
1073							Returns : A Bio::SimpleAlign object
1074							Args : Positive integer for start column, positive integer for end column,
1075							optional boolean which if true will keep gap-only columns in the newly
1076							created slice. Example:
1077
1078							$aln2 = $aln->slice(20,30,1)
1079
1080							=cut
1081
1082							sub slice {
1083	31			31	1	2642	my $self = shift;
1084	31					66	my ($start, $end, $keep_gap_only) = @_;
1085
1086	31	50	33			214	$self->throw("Slice start has to be a positive integer, not [$start]")
1087							unless $start =~ /^\d+$/ and $start > 0;
1088	31	50	33			175	$self->throw("Slice end has to be a positive integer, not [$end]")
1089							unless $end =~ /^\d+$/ and $end > 0;
1090	31	50				81	$self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1091							unless $start <= $end;
1092	31	100				90	$self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1093							"[$start] is too big.") if $start > $self->length;
1094	30					83	my $cons_meta = $self->consensus_meta;
1095	30					69	my $aln = $self->new;
1096	30					80	$aln->id($self->id);
1097	30					59	foreach my $seq ( $self->each_seq() ) {
1098	107	50				669	my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1099							Bio::Seq::Meta->new
1100							(-id => $seq->id,
1101							-alphabet => $seq->alphabet,
1102							-strand => $seq->strand,
1103							-verbose => $self->verbose) :
1104							Bio::LocatableSeq->new
1105							(-id => $seq->id,
1106							-alphabet => $seq->alphabet,
1107							-strand => $seq->strand,
1108							-verbose => $self->verbose);
1109
1110							# seq
1111	107					202	my $seq_end = $end;
1112	107	50				212	$seq_end = $seq->length if( $end > $seq->length );
1113
1114	107					212	my $slice_seq = $seq->subseq($start, $seq_end);
1115	107					224	$new_seq->seq( $slice_seq );
1116
1117							# Allowed extra characters in string
1118	107					127	my $allowed_chars = '';
1119	107	100				415	if (exists $self->{_mask_char}) {
1120	24					26	$allowed_chars = $self->{_mask_char};
1121	24					29	$allowed_chars = quotemeta $allowed_chars;
1122							}
1123	107					739	$slice_seq =~ s/[^\w$allowed_chars]//g;
1124
1125	107	100				206	if ($start > 1) {
1126	52					126	my $pre_start_seq = $seq->subseq(1, $start - 1);
1127	52					379	$pre_start_seq =~ s/[^\w$allowed_chars]//g;
1128	52	100				122	if (!defined($seq->strand)) {
		100
1129	38					66	$new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130							} elsif ($seq->strand < 0){
1131	1					3	$new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132							} else {
1133	13					41	$new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134							}
1135							} else {
1136	55	100	100			108	if ((defined $seq->strand)&&($seq->strand < 0)){
1137	2					4	$new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138							} else {
1139	53					111	$new_seq->start( $seq->start);
1140							}
1141							}
1142	107	50				520	if ($new_seq->isa('Bio::Seq::MetaI')) {
1143	0					0	for my $meta_name ($seq->meta_names) {
1144	0					0	$new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1145							}
1146							}
1147	107					207	$new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148
1149	107	100	66			202	if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150	106					221	$aln->add_seq($new_seq);
1151							} else {
1152	1	50				3	if( $keep_gap_only ) {
1153	1					3	$aln->add_seq($new_seq);
1154							} else {
1155	0					0	my $nse = $seq->get_nse();
1156	0					0	$self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157							" Sequence excluded from the new alignment.");
1158							}
1159							}
1160							}
1161	30	50				82	if ($cons_meta) {
1162	0					0	my $new = Bio::Seq::Meta->new();
1163	0					0	for my $meta_name ($cons_meta->meta_names) {
1164	0					0	$new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165							}
1166	0					0	$aln->consensus_meta($new);
1167							}
1168	30					81	$aln->annotation($self->annotation);
1169							# fix for meta, sf, ann
1170	30					118	return $aln;
1171							}
1172
1173							=head2 remove_columns
1174
1175							Title : remove_columns
1176							Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177							$aln2 = $aln->remove_columns([0,0],[6,8])
1178							Function : Creates an aligment with columns removed corresponding to
1179							the specified type or by specifying the columns by number.
1180							Returns : Bio::SimpleAlign object
1181							Args : Array ref of types ('match'\|'weak'\|'strong'\|'mismatch'\|'gaps'\|
1182							'all_gaps_columns') or array ref where the referenced array
1183							contains a pair of integers that specify a range.
1184							The first column is 0
1185
1186							=cut
1187
1188							sub remove_columns {
1189	4			4	1	954	my ($self,@args) = @_;
1190	4	50				10	@args \|\| $self->throw("Must supply column ranges or column types");
1191	4					7	my $aln;
1192
1193	4	100				27	if ($args[0][0] =~ /^[a-z_]+$/i) {
		50
1194	1					4	$aln = $self->_remove_columns_by_type($args[0]);
1195							} elsif ($args[0][0] =~ /^\d+$/) {
1196	3					13	$aln = $self->_remove_columns_by_num(\@args);
1197							} else {
1198	0					0	$self->throw("You must pass array references to remove_columns(), not @args");
1199							}
1200							# fix for meta, sf, ann
1201	4					14	$aln;
1202							}
1203
1204
1205							=head2 remove_gaps
1206
1207							Title : remove_gaps
1208							Usage : $aln2 = $aln->remove_gaps
1209							Function : Creates an aligment with gaps removed
1210							Returns : a Bio::SimpleAlign object
1211							Args : a gap character(optional) if none specified taken
1212							from $self->gap_char,
1213							[optional] $all_gaps_columns flag (1 or 0, default is 0)
1214							indicates that only all-gaps columns should be deleted
1215
1216							Used from method L in most cases. Set gap character
1217							using L.
1218
1219							=cut
1220
1221							sub remove_gaps {
1222	6			6	1	22	my ($self,$gapchar,$all_gaps_columns) = @_;
1223	6					8	my $gap_line;
1224	6	100				12	if ($all_gaps_columns) {
1225	2					9	$gap_line = $self->all_gap_line($gapchar);
1226							} else {
1227	4					11	$gap_line = $self->gap_line($gapchar);
1228							}
1229	6					13	my $aln = $self->new;
1230
1231	6					8	my @remove;
1232	6					8	my $length = 0;
1233	6		66			18	my $del_char = $gapchar \|\| $self->gap_char;
1234							# Do the matching to get the segments to remove
1235	6					42	while ($gap_line =~ m/[$del_char]/g) {
1236	14					18	my $start = pos($gap_line)-1;
1237	14					34	$gap_line =~ m/\G[$del_char]+/gc;
1238	14					15	my $end = pos($gap_line)-1;
1239
1240							#have to offset the start and end for subsequent removes
1241	14					13	$start-=$length;
1242	14					13	$end -=$length;
1243	14					13	$length += ($end-$start+1);
1244	14					38	push @remove, [$start,$end];
1245							}
1246
1247							#remove the segments
1248	6	100				22	$aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249							# fix for meta, sf, ann
1250	6					28	return $aln;
1251							}
1252
1253
1254							sub _remove_col {
1255	9			9		16	my ($self,$aln,$remove) = @_;
1256	9					9	my @new;
1257
1258	9					19	my $gap = $self->gap_char;
1259
1260							# splice out the segments and create new seq
1261	9					17	foreach my $seq($self->each_seq){
1262	42					77	my $new_seq = Bio::LocatableSeq->new(
1263							-id => $seq->id,
1264							-alphabet=> $seq->alphabet,
1265							-strand => $seq->strand,
1266							-verbose => $self->verbose);
1267	42					91	my $sequence = $seq->seq;
1268	42					48	foreach my $pair(@{$remove}){
	42					58
1269	658					684	my $start = $pair->[0];
1270	658					564	my $end = $pair->[1];
1271	658	50				711	$sequence = $seq->seq unless $sequence;
1272	658					611	my $orig = $sequence;
1273	658	100				961	my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1274	658	100				948	my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275	658					711	$sequence = $head.$tail;
1276							# start
1277	658	100				844	unless (defined $new_seq->start) {
1278	42	100				63	if ($start == 0) {
1279	31					99	my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280	31					55	$new_seq->start($seq->start + $end + 1 - $start_adjust);
1281							}
1282							else {
1283	11					49	my $start_adjust = $orig =~ /^$gap+/;
1284	11	100				21	if ($start_adjust) {
1285	5					15	$start_adjust = $+[0] == $start;
1286							}
1287	11					22	$new_seq->start($seq->start + $start_adjust);
1288							}
1289							}
1290							# end
1291	658	100				855	if (($end + 1) >= CORE::length($orig)) {
1292	32					90	my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293	32					59	$new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294							}
1295							else {
1296	626					766	$new_seq->end($seq->end);
1297							}
1298							}
1299
1300	42	100				62	if ($new_seq->end < $new_seq->start) {
1301							# we removed all columns except for gaps: set to 0 to indicate no
1302							# sequence
1303	1					3	$new_seq->start(0);
1304	1					2	$new_seq->end(0);
1305							}
1306
1307	42	50				96	$new_seq->seq($sequence) if $sequence;
1308	42					71	push @new, $new_seq;
1309							}
1310							# add the new seqs to the alignment
1311	9					15	foreach my $new(@new){
1312	42					63	$aln->add_seq($new);
1313							}
1314							# fix for meta, sf, ann
1315	9					18	return $aln;
1316							}
1317
1318							sub _remove_columns_by_type {
1319	1			1		3	my ($self,$type) = @_;
1320	1					2	my $aln = $self->new;
1321	1					2	my @remove;
1322
1323	1	50				1	my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
	1					4
	1					2
1324	1	50				1	my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
	1					3
1325	1					5	my %matchchars = ( 'match' => '\*',
1326							'weak' => '\.',
1327							'strong' => ':',
1328							'mismatch' => ' ',
1329							'gaps' => '',
1330							'all_gaps_columns' => ''
1331							);
1332							# get the characters to delete against
1333	1					2	my $del_char;
1334	1					1	foreach my $type (@{$type}){
	1					1
1335	1					3	$del_char.= $matchchars{$type};
1336							}
1337
1338	1					2	my $length = 0;
1339	1					4	my $match_line = $self->match_line;
1340							# do the matching to get the segments to remove
1341	1	50				5	if($del_char){
1342	1					33	while($match_line =~ m/[$del_char]/g ){
1343	37					42	my $start = pos($match_line)-1;
1344	37					78	$match_line=~/\G[$del_char]+/gc;
1345	37					36	my $end = pos($match_line)-1;
1346
1347							#have to offset the start and end for subsequent removes
1348	37					27	$start-=$length;
1349	37					28	$end -=$length;
1350	37					31	$length += ($end-$start+1);
1351	37					86	push @remove, [$start,$end];
1352							}
1353							}
1354
1355							# remove the segments
1356	1	50				8	$aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357	1	50				3	$aln = $aln->remove_gaps() if $gap;
1358	1	50				3	$aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359							# fix for meta, sf, ann
1360	1					11	$aln;
1361							}
1362
1363
1364							sub _remove_columns_by_num {
1365	3			3		6	my ($self,$positions) = @_;
1366	3					5	my $aln = $self->new;
1367
1368							# sort the positions
1369	3					9	@$positions = sort { $a->[0] <=> $b->[0] } @$positions;
	3					5
1370
1371	3					3	my @remove;
1372	3					5	my $length = 0;
1373	3					3	foreach my $pos (@{$positions}) {
	3					7
1374	5					5	my ($start, $end) = @{$pos};
	5					8
1375
1376							#have to offset the start and end for subsequent removes
1377	5					6	$start-=$length;
1378	5					6	$end -=$length;
1379	5					6	$length += ($end-$start+1);
1380	5					11	push @remove, [$start,$end];
1381							}
1382
1383							#remove the segments
1384	3	50				15	$aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385							# fix for meta, sf, ann
1386	3					9	$aln;
1387							}
1388
1389
1390							=head1 Change sequences within the MSA
1391
1392							These methods affect characters in all sequences without changing the
1393							alignment.
1394
1395							=head2 splice_by_seq_pos
1396
1397							Title : splice_by_seq_pos
1398							Usage : $status = splice_by_seq_pos(1);
1399							Function: splices all aligned sequences where the specified sequence
1400							has gaps.
1401							Example :
1402							Returns : 1 on success
1403							Args : position of sequence to splice by
1404
1405
1406							=cut
1407
1408							sub splice_by_seq_pos{
1409	0			0	1	0	my ($self,$pos) = @_;
1410
1411	0					0	my $guide = $self->get_seq_by_pos($pos);
1412	0					0	my $guide_seq = $guide->seq;
1413
1414	0					0	$guide_seq =~ s/\./\-/g;
1415
1416	0					0	my @gaps = ();
1417	0					0	$pos = -1;
1418	0					0	while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419	0					0	unshift @gaps, $pos;
1420	0					0	$pos++;
1421							}
1422
1423	0					0	foreach my $seq ($self->each_seq){
1424	0					0	my @bases = split '', $seq->seq;
1425
1426	0					0	splice(@bases, $_, 1) foreach @gaps;
1427	0					0	$seq->seq(join('', @bases));
1428							}
1429
1430	0					0	1;
1431							}
1432
1433							=head2 map_chars
1434
1435							Title : map_chars
1436							Usage : $ali->map_chars('\.','-')
1437							Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438							characters.
1439
1440							Note that the first argument is interpreted as a regexp
1441							so be careful and escape any wild card characters (e.g.
1442							do $ali->map_chars('\.','-') to replace periods with dashes.
1443							Returns : 1 on success
1444							Argument : A regexp and a string
1445
1446							=cut
1447
1448							sub map_chars {
1449	4			4	1	13	my $self = shift;
1450	4					6	my $from = shift;
1451	4					8	my $to = shift;
1452	4					8	my ( $seq, $temp );
1453
1454	4	50	33			23	$self->throw("Need two arguments: a regexp and a string")
1455							unless defined $from and defined $to;
1456
1457	4					13	foreach $seq ( $self->each_seq() ) {
1458	39					66	$temp = $seq->seq();
1459	39					168	$temp =~ s/$from/$to/g;
1460	39					70	$seq->seq($temp);
1461							}
1462	4					28	return 1;
1463							}
1464
1465
1466							=head2 uppercase
1467
1468							Title : uppercase()
1469							Usage : $ali->uppercase()
1470							Function : Sets all the sequences to uppercase
1471							Returns : 1 on success
1472							Argument :
1473
1474							=cut
1475
1476							sub uppercase {
1477	1			1	1	3	my $self = shift;
1478	1					2	my $seq;
1479							my $temp;
1480
1481	1					3	foreach $seq ( $self->each_seq() ) {
1482	16					29	$temp = $seq->seq();
1483	16					30	$temp =~ tr/[a-z]/[A-Z]/;
1484
1485	16					20	$seq->seq($temp);
1486							}
1487	1					5	return 1;
1488							}
1489
1490							=head2 cigar_line
1491
1492							Title : cigar_line()
1493							Usage : %cigars = $align->cigar_line()
1494							Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495							Report) line for each sequence in the alignment. Examples are
1496							"1,60" or "5,10:12,58", where the numbers refer to conserved
1497							positions within the alignment. The keys of the hash are the
1498							NSEs (name/start/end) assigned to each sequence.
1499							Args : threshold (optional, defaults to 100)
1500							Returns : Hash of strings (cigar lines)
1501
1502							=cut
1503
1504							sub cigar_line {
1505	1			1	1	466	my $self = shift;
1506	1		50			5	my $thr=shift\|\|100;
1507	1					2	my %cigars;
1508
1509	1					4	my @consensus = split "",($self->consensus_string($thr));
1510	1					142	my $len = $self->length;
1511	1					2	my $gapchar = $self->gap_char;
1512
1513							# create a precursor, something like (1,4,5,6,7,33,45),
1514							# where each number corresponds to a conserved position
1515	1					3	foreach my $seq ( $self->each_seq ) {
1516	4					6	my @seq = split "", uc ($seq->seq);
1517	4					4	my $pos = 1;
1518	4					9	for (my $x = 0 ; $x < $len ; $x++ ) {
1519	132	100				189	if ($seq[$x] eq $consensus[$x]) {
		100
1520	28					24	push @{$cigars{$seq->get_nse}},$pos;
	28					34
1521	28					53	$pos++;
1522							} elsif ($seq[$x] ne $gapchar) {
1523	84					93	$pos++;
1524							}
1525							}
1526							}
1527							# duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528	1					95	for my $name (keys %cigars) {
1529	4					6	splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
	4					6
1530	4	50				6	( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
	4					6
	4					7
1531	4					5	push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
	4					5
	4					5
1532	4					5	( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
	4					6
1533	4	50				4	${$cigars{$name}}[$#{$cigars{$name}}] );
	4					6
	4					5
1534	4					5	for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
	32					41
1535	28	100	100			21	if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
	28					27
	28					44
1536	8					8	${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
	8					16
1537	4					4	splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
	4					5
	4					5
1538							}
1539							}
1540							}
1541							# collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542	1					4	for my $name (keys %cigars) {
1543	4					4	my @remove;
1544	4					3	for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
	40					50
1545	36	100	100			29	if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
	36					34
	36					49
1546	12					12	${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
	12					24
1547	8					11	unshift @remove,$x;
1548							}
1549							}
1550	4					5	for my $pos (@remove) {
1551	8					6	splice @{$cigars{$name}}, $pos, 1;
	8					11
1552							}
1553							}
1554							# join and punctuate
1555	1					3	for my $name (keys %cigars) {
1556	4					6	my ($start,$end,$str) = "";
1557	4					4	while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
	20					30
1558	16					24	$str .= ($start . "," . $end . ":");
1559							}
1560	4					14	$str =~ s/:$//;
1561	4					7	$cigars{$name} = $str;
1562							}
1563	1					8	%cigars;
1564							}
1565
1566
1567							=head2 match_line
1568
1569							Title : match_line()
1570							Usage : $line = $align->match_line()
1571							Function : Generates a match line - much like consensus string
1572							except that a line indicating the '*' for a match.
1573							Args : (optional) Match line characters ('*' by default)
1574							(optional) Strong match char (':' by default)
1575							(optional) Weak match char ('.' by default)
1576							Returns : String
1577
1578							=cut
1579
1580							sub match_line {
1581	6			6	1	22	my ($self,$matchlinechar, $strong, $weak) = @_;
1582	6		50			73	my %matchchars = ('match' => $matchlinechar \|\| '*',
			50
			50
1583							'weak' => $weak \|\| '.',
1584							'strong' => $strong \|\| ':',
1585							'mismatch' => ' ',
1586							);
1587
1588	6					14	my @seqchars;
1589							my $alphabet;
1590	6					21	foreach my $seq ( $self->each_seq ) {
1591	69					128	push @seqchars, [ split(//, uc ($seq->seq)) ];
1592	69	100				1203	$alphabet = $seq->alphabet unless defined $alphabet;
1593							}
1594	6					17	my $refseq = shift @seqchars;
1595							# let's just march down the columns
1596	6					8	my $matchline;
1597							POS:
1598	6					25	foreach my $pos ( 0..$self->length ) {
1599	3138					3556	my $refchar = $refseq->[$pos];
1600	3138					2974	my $char = $matchchars{'mismatch'};
1601	3138	100				3749	unless( defined $refchar ) {
1602	6	50				47	last if $pos == $self->length; # short circuit on last residue
1603							# this in place to handle jason's soon-to-be-committed
1604							# intron mapping code
1605	0					0	goto bottom;
1606							}
1607	3132					3677	my %col = ($refchar => 1);
1608	3132		66			8125	my $dash = ($refchar eq '-' \|\| $refchar eq '.' \|\| $refchar eq ' ');
1609	3132					3149	foreach my $seq ( @seqchars ) {
1610	26906	50				29985	next if $pos >= scalar @$seq;
1611	26906	100	100			68731	$dash = 1 if( $seq->[$pos] eq '-' \|\| $seq->[$pos] eq '.' \|\|
			66
1612							$seq->[$pos] eq ' ' );
1613	26906	50				36958	$col{$seq->[$pos]}++ if defined $seq->[$pos];
1614							}
1615	3132					6195	my @colresidues = sort keys %col;
1616
1617							# if all the values are the same
1618	3132	100				5157	if( $dash ) { $char = $matchchars{'mismatch'} }
	758	100				804
		100
1619	1721					1921	elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620							elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621							# matches for protein seqs
1622							TYPE:
1623	472					509	foreach my $type ( qw(strong weak) ) {
1624							# iterate through categories
1625	804					694	my %groups;
1626							# iterate through each of the aa in the col
1627							# look to see which groups it is in
1628	804					798	foreach my $c ( @colresidues ) {
1629	3075					2563	foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
	30455					35302
	3075					3649
1630	6808					5661	push @{$groups{$f}},$c;
	6808					9905
1631							}
1632							}
1633							GRP:
1634	804					1096	foreach my $cols ( values %groups ) {
1635	3940					5045	@$cols = sort @$cols;
1636							# now we are just testing to see if two arrays
1637							# are identical w/o changing either one
1638							# have to be same len
1639	3940	100				6076	next if( scalar @$cols != scalar @colresidues );
1640							# walk down the length and check each slot
1641	196					310	for($_=0;$_ < (scalar @$cols);$_++ ) {
1642	453	50				766	next GRP if( $cols->[$_] ne $colresidues[$_] );
1643							}
1644	196					234	$char = $matchchars{$type};
1645	196					391	last TYPE;
1646							}
1647							}
1648							}
1649							bottom:
1650	3132					4963	$matchline .= $char;
1651							}
1652	6					1211	return $matchline;
1653							}
1654
1655
1656							=head2 gap_line
1657
1658							Title : gap_line()
1659							Usage : $line = $align->gap_line()
1660							Function : Generates a gap line - much like consensus string
1661							except that a line where '-' represents gap
1662							Args : (optional) gap line characters ('-' by default)
1663							Returns : string
1664
1665							=cut
1666
1667							sub gap_line {
1668	14			14	1	29	my ($self,$gapchar) = @_;
1669	14		33			54	$gapchar = $gapchar \|\| $self->gap_char;
1670	14					18	my %gap_hsh; # column gaps vector
1671	14					36	foreach my $seq ( $self->each_seq ) {
1672	33					49	my $i = 0;
1673	223					920	map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
	4020					6809
1674	33					95	map {[$i++, $_]} split(//, uc ($seq->seq));
	4020					5015
1675							}
1676	14					29	my $gap_line;
1677	14					38	foreach my $pos ( 0..$self->length-1 ) {
1678	1973	100				2545	$gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1679							}
1680	14					66	return $gap_line;
1681							}
1682
1683							=head2 all_gap_line
1684
1685							Title : all_gap_line()
1686							Usage : $line = $align->all_gap_line()
1687							Function : Generates a gap line - much like consensus string
1688							except that a line where '-' represents all-gap column
1689							Args : (optional) gap line characters ('-' by default)
1690							Returns : string
1691
1692							=cut
1693
1694							sub all_gap_line {
1695	2			2	1	4	my ($self,$gapchar) = @_;
1696	2		66			9	$gapchar = $gapchar \|\| $self->gap_char;
1697	2					4	my %gap_hsh; # column gaps counter hash
1698	2					7	my @seqs = $self->each_seq;
1699	2					4	foreach my $seq ( @seqs ) {
1700	5					7	my $i = 0;
1701	7					18	map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
	54					140
1702	5					12	map {[$i++, $_]} split(//, uc ($seq->seq));
	54					80
1703							}
1704	2					3	my $gap_line;
1705	2					6	foreach my $pos ( 0..$self->length-1 ) {
1706	22	100	100			47	if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707							# gaps column
1708	1					2	$gap_line .= $self->gap_char;
1709							} else {
1710	21					23	$gap_line .= '.';
1711							}
1712							}
1713	2					7	return $gap_line;
1714							}
1715
1716							=head2 gap_col_matrix
1717
1718							Title : gap_col_matrix()
1719							Usage : my $cols = $align->gap_col_matrix()
1720							Function : Generates an array where each element in the array is a
1721							hash reference with a key of the sequence name and a
1722							value of 1 if the sequence has a gap at that column
1723							Returns : Reference to an array
1724							Args : Optional: gap line character ($aln->gap_char or '-' by default)
1725
1726							=cut
1727
1728							sub gap_col_matrix {
1729	0			0	1	0	my ( $self, $gapchar ) = @_;
1730	0		0			0	$gapchar = $gapchar \|\| $self->gap_char;
1731	0					0	my %gap_hsh; # column gaps vector
1732							my @cols;
1733	0					0	foreach my $seq ( $self->each_seq ) {
1734	0					0	my $i = 0;
1735	0					0	my $str = $seq->seq;
1736	0					0	my $len = $seq->length;
1737	0					0	my $ch;
1738	0					0	my $id = $seq->display_id;
1739	0					0	while ( $i < $len ) {
1740	0					0	$ch = substr( $str, $i, 1 );
1741	0					0	$cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1742							}
1743							}
1744	0					0	return \@cols;
1745							}
1746
1747							=head2 match
1748
1749							Title : match()
1750							Usage : $ali->match()
1751							Function : Goes through all columns and changes residues that are
1752							identical to residue in first sequence to match '.'
1753							character. Sets match_char.
1754
1755							USE WITH CARE: Most MSA formats do not support match
1756							characters in sequences, so this is mostly for output
1757							only. NEXUS format (Bio::AlignIO::nexus) can handle
1758							it.
1759							Returns : 1 on success
1760							Argument : a match character, optional, defaults to '.'
1761
1762							=cut
1763
1764							sub match {
1765	1			1	1	2	my ( $self, $match ) = @_;
1766
1767	1		50			6	$match \|\|= '.';
1768	1					1	my ($matching_char) = $match;
1769	1	50				7	$matching_char = "\\$match" if $match =~ /[\^.$\|()\[\]]/; #';
1770	1					5	$self->map_chars( $matching_char, '-' );
1771
1772	1					3	my @seqs = $self->each_seq();
1773	1	50				4	return 1 unless scalar @seqs > 1;
1774
1775	1					2	my $refseq = shift @seqs;
1776	1					3	my @refseq = split //, $refseq->seq;
1777	1					5	my $gapchar = $self->gap_char;
1778
1779	1					2	foreach my $seq (@seqs) {
1780	5					12	my @varseq = split //, $seq->seq();
1781	5					13	for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1782	720	100	66			3172	$varseq[$i] = $match
			33
			66
1783							if defined $refseq[$i]
1784							&& ( $refseq[$i] =~ /[A-Za-z\*]/
1785							\|\| $refseq[$i] =~ /$gapchar/ )
1786							&& $refseq[$i] eq $varseq[$i];
1787							}
1788	5					44	$seq->seq( join '', @varseq );
1789							}
1790	1					5	$self->match_char($match);
1791	1					8	return 1;
1792							}
1793
1794
1795
1796							=head2 unmatch
1797
1798							Title : unmatch()
1799							Usage : $ali->unmatch()
1800							Function : Undoes the effect of method match. Unsets match_char.
1801							Returns : 1 on success
1802							Argument : a match character, optional, defaults to '.'
1803
1804							See L and L
1805
1806							=cut
1807
1808							sub unmatch {
1809	5			5	1	12	my ( $self, $match ) = @_;
1810
1811	5		100			21	$match \|\|= '.';
1812
1813	5					20	my @seqs = $self->each_seq();
1814	5	50				16	return 1 unless scalar @seqs > 1;
1815
1816	5					9	my $refseq = shift @seqs;
1817	5					15	my @refseq = split //, $refseq->seq;
1818	5					24	my $gapchar = $self->gap_char;
1819	5					13	foreach my $seq (@seqs) {
1820	114					228	my @varseq = split //, $seq->seq();
1821	114					256	for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1822	19782	100	100			77289	$varseq[$i] = $refseq[$i]
			66
			100
1823							if defined $refseq[$i]
1824							&& ( $refseq[$i] =~ /[A-Za-z\*]/
1825							\|\| $refseq[$i] =~ /$gapchar/ )
1826							&& $varseq[$i] eq $match;
1827							}
1828	114					962	$seq->seq( join '', @varseq );
1829							}
1830	5					32	$self->match_char('');
1831	5					101	return 1;
1832							}
1833
1834
1835							=head1 MSA attributes
1836
1837							Methods for setting and reading the MSA attributes.
1838
1839							Note that the methods defining character semantics depend on the user
1840							to set them sensibly. They are needed only by certain input/output
1841							methods. Unset them by setting to an empty string ('').
1842
1843							=head2 id
1844
1845							Title : id
1846							Usage : $myalign->id("Ig")
1847							Function : Gets/sets the id field of the alignment
1848							Returns : An id string
1849							Argument : An id string (optional)
1850
1851							=cut
1852
1853							sub id {
1854	142			142	1	262	my ( $self, $name ) = @_;
1855
1856	142	100				349	if ( defined($name) ) {
1857	72					140	$self->{'_id'} = $name;
1858							}
1859
1860	142					380	return $self->{'_id'};
1861							}
1862
1863							=head2 accession
1864
1865							Title : accession
1866							Usage : $myalign->accession("PF00244")
1867							Function : Gets/sets the accession field of the alignment
1868							Returns : An acc string
1869							Argument : An acc string (optional)
1870
1871							=cut
1872
1873							sub accession {
1874	16			16	1	36	my ( $self, $acc ) = @_;
1875
1876	16	100				44	if ( defined($acc) ) {
1877	8					22	$self->{'_accession'} = $acc;
1878							}
1879
1880	16					49	return $self->{'_accession'};
1881							}
1882
1883							=head2 description
1884
1885							Title : description
1886							Usage : $myalign->description("14-3-3 proteins")
1887							Function : Gets/sets the description field of the alignment
1888							Returns : An description string
1889							Argument : An description string (optional)
1890
1891							=cut
1892
1893							sub description {
1894	25			25	1	56	my ( $self, $name ) = @_;
1895
1896	25	100				68	if ( defined($name) ) {
1897	13					33	$self->{'_description'} = $name;
1898							}
1899
1900	25					75	return $self->{'_description'};
1901							}
1902
1903							=head2 missing_char
1904
1905							Title : missing_char
1906							Usage : $myalign->missing_char("?")
1907							Function : Gets/sets the missing_char attribute of the alignment
1908							It is generally recommended to set it to 'n' or 'N'
1909							for nucleotides and to 'X' for protein.
1910							Returns : An missing_char string,
1911							Argument : An missing_char string (optional)
1912
1913							=cut
1914
1915							sub missing_char {
1916	32			32	1	80	my ( $self, $char ) = @_;
1917
1918	32	100				98	if ( defined $char ) {
1919	22	50				50	$self->throw("Single missing character, not [$char]!")
1920							if CORE::length($char) > 1;
1921	22					52	$self->{'_missing_char'} = $char;
1922							}
1923
1924	32					70	return $self->{'_missing_char'};
1925							}
1926
1927
1928							=head2 match_char
1929
1930							Title : match_char
1931							Usage : $myalign->match_char('.')
1932							Function : Gets/sets the match_char attribute of the alignment
1933							Returns : An match_char string,
1934							Argument : An match_char string (optional)
1935
1936							=cut
1937
1938							sub match_char {
1939	18			18	1	39	my ( $self, $char ) = @_;
1940
1941	18	100				48	if ( defined $char ) {
1942	9	50				25	$self->throw("Single match character, not [$char]!")
1943							if CORE::length($char) > 1;
1944	9					22	$self->{'_match_char'} = $char;
1945							}
1946
1947	18					40	return $self->{'_match_char'};
1948							}
1949
1950							=head2 gap_char
1951
1952							Title : gap_char
1953							Usage : $myalign->gap_char('-')
1954							Function : Gets/sets the gap_char attribute of the alignment
1955							Returns : An gap_char string, defaults to '-'
1956							Argument : An gap_char string (optional)
1957
1958							=cut
1959
1960							sub gap_char {
1961	11242			11242	1	12863	my ( $self, $char ) = @_;
1962
1963	11242	100	100			25423	if ( defined $char \|\| !defined $self->{'_gap_char'} ) {
1964	51	100				134	$char = '-' unless defined $char;
1965	51	50				124	$self->throw("Single gap character, not [$char]!")
1966							if CORE::length($char) > 1;
1967	51					155	$self->{'_gap_char'} = $char;
1968							}
1969	11242					19402	return $self->{'_gap_char'};
1970							}
1971
1972
1973							=head2 symbol_chars
1974
1975							Title : symbol_chars
1976							Usage : my @symbolchars = $aln->symbol_chars;
1977							Function: Returns all the seen symbols (other than gaps)
1978							Returns : array of characters that are the seen symbols
1979							Args : boolean to include the gap/missing/match characters
1980
1981							=cut
1982
1983							sub symbol_chars{
1984	5			5	1	13	my ($self,$includeextra) = @_;
1985
1986	5	100				13	unless ($self->{'_symbols'}) {
1987	3					9	foreach my $seq ($self->each_seq) {
1988	12					30	map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
	2778					2780
1989							}
1990							}
1991	5					7	my %copy = %{$self->{'_symbols'}};
	5					33
1992	5	50				15	if( ! $includeextra ) {
1993	5					12	foreach my $char ( $self->gap_char, $self->match_char,
1994							$self->missing_char) {
1995	15	100				28	delete $copy{$char} if( defined $char );
1996							}
1997							}
1998	5					35	return keys %copy;
1999							}
2000
2001							=head1 Alignment descriptors
2002
2003							These read only methods describe the MSA in various ways.
2004
2005
2006							=head2 score
2007
2008							Title : score
2009							Usage : $str = $ali->score()
2010							Function : get/set a score of the alignment
2011							Returns : a score for the alignment
2012							Argument : an optional score to set
2013
2014							=cut
2015
2016							sub score {
2017	26			26	1	1690	my $self = shift;
2018	26	100				80	$self->{score} = shift if @_;
2019	26					55	return $self->{score};
2020							}
2021
2022							=head2 consensus_string
2023
2024							Title : consensus_string
2025							Usage : $str = $ali->consensus_string($threshold_percent)
2026							Function : Makes a strict consensus
2027							Returns : Consensus string
2028							Argument : Optional threshold ranging from 0 to 100.
2029							The consensus residue has to appear at least threshold %
2030							of the sequences at a given location, otherwise a '?'
2031							character will be placed at that location.
2032							(Default value = 0%)
2033
2034							=cut
2035
2036							sub consensus_string {
2037	11			11	1	2897	my $self = shift;
2038	11					19	my $threshold = shift;
2039
2040	11					16	my $out = "";
2041	11					35	my $len = $self->length - 1;
2042
2043	11					32	foreach ( 0 .. $len ) {
2044	2100					3149	$out .= $self->_consensus_aa( $_, $threshold );
2045							}
2046	11					106	return $out;
2047							}
2048
2049
2050							=head2 consensus_conservation
2051
2052							Title : consensus_conservation
2053							Usage : @conservation = $ali->consensus_conservation();
2054							Function : Conservation (as a percent) of each position of alignment
2055							Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2056							Argument :
2057
2058							=cut
2059
2060							sub consensus_conservation {
2061	1			1	1	2	my $self = shift;
2062	1					2	my @cons;
2063	1					3	my $num_sequences = $self->num_sequences;
2064	1					4	foreach my $point (0..$self->length-1) {
2065	422					521	my %hash = $self->_consensus_counts($point);
2066							# max frequency of a non-gap letter
2067	422					724	my $max = (sort {$b<=>$a} values %hash )[0];
	572					663
2068	422					719	push @cons, 100 * $max / $num_sequences;
2069							}
2070	1					41	return @cons;
2071							}
2072
2073							sub _consensus_aa {
2074	2100			2100		1989	my $self = shift;
2075	2100					2019	my $point = shift;
2076	2100		100			3561	my $threshold_percent = shift \|\| -1 ;
2077	2100					2156	my ($seq,%hash,$count,$letter,$key);
2078	2100					2655	my $gapchar = $self->gap_char;
2079	2100					2875	%hash = $self->_consensus_counts($point);
2080	2100					3475	my $number_of_sequences = $self->num_sequences();
2081	2100					2851	my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2082	2100					1930	$count = -1;
2083	2100					1935	$letter = '?';
2084
2085	2100					3886	foreach $key ( sort keys %hash ) {
2086							# print "Now at $key $hash{$key}\n";
2087	4898	100	100			10478	if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2088	2472					2481	$letter = $key;
2089	2472					2748	$count = $hash{$key};
2090							}
2091							}
2092	2100					4550	return $letter;
2093							}
2094
2095							# Frequency of each letter in one column
2096							sub _consensus_counts {
2097	2522			2522		2348	my $self = shift;
2098	2522					2187	my $point = shift;
2099	2522					2065	my %hash;
2100	2522					2659	my $gapchar = $self->gap_char;
2101	2522					3253	foreach my $seq ( $self->each_seq() ) {
2102	22078					28967	my $letter = substr($seq->seq,$point,1);
2103	22078	50				28185	$self->throw("--$point-----------") if $letter eq '';
2104	22078	100	100			44778	($letter eq $gapchar \|\| $letter =~ /\./) && next;
2105	17334					20215	$hash{$letter}++;
2106							}
2107	2522					6835	return %hash;
2108							}
2109
2110
2111							=head2 consensus_iupac
2112
2113							Title : consensus_iupac
2114							Usage : $str = $ali->consensus_iupac()
2115							Function : Makes a consensus using IUPAC ambiguity codes from DNA
2116							and RNA. The output is in upper case except when gaps in
2117							a column force output to be in lower case.
2118
2119							Note that if your alignment sequences contain a lot of
2120							IUPAC ambiquity codes you often have to manually set
2121							alphabet. Bio::PrimarySeq::_guess_type thinks they
2122							indicate a protein sequence.
2123							Returns : consensus string
2124							Argument : none
2125							Throws : on protein sequences
2126
2127							=cut
2128
2129							sub consensus_iupac {
2130	1			1	1	6	my $self = shift;
2131	1					2	my $out = "";
2132	1					32	my $len = $self->length - 1;
2133
2134							# only DNA and RNA sequences are valid
2135	1					3	foreach my $seq ( $self->each_seq() ) {
2136	2	50				5	$self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2137							if $seq->alphabet eq 'protein';
2138							}
2139
2140							# loop over the alignment columns
2141	1					2	foreach my $count ( 0 .. $len ) {
2142	10					15	$out .= $self->_consensus_iupac($count);
2143							}
2144	1					4	return $out;
2145							}
2146
2147							sub _consensus_iupac {
2148	10			10		13	my ($self, $column) = @_;
2149	10					10	my ($string, $char, $rna);
2150
2151							#determine all residues in a column
2152	10					12	foreach my $seq ( $self->each_seq() ) {
2153	20					34	$string .= substr($seq->seq, $column, 1);
2154							}
2155	10					12	$string = uc $string;
2156
2157							# quick exit if there's an N in the string
2158	10	100				17	if ($string =~ /N/) {
2159	1	50				4	$string =~ /\W/ ? return 'n' : return 'N';
2160							}
2161							# ... or if there are only gap characters
2162	9	100				19	return '-' if $string =~ /^\W+$/;
2163
2164							# treat RNA as DNA in regexps
2165	7	50				11	if ($string =~ /U/) {
2166	0					0	$string =~ s/U/T/;
2167	0					0	$rna = 1;
2168							}
2169
2170							# the following s///'s only need to be done to the _first_ ambiguity code
2171							# as we only need to see the _range_ of characters in $string
2172
2173	7	50				10	if ($string =~ /[VDHB]/) {
2174	0					0	$string =~ s/V/AGC/;
2175	0					0	$string =~ s/D/AGT/;
2176	0					0	$string =~ s/H/ACT/;
2177	0					0	$string =~ s/B/CTG/;
2178							}
2179
2180	7	100				11	if ($string =~ /[SKYRWM]/) {
2181	1					2	$string =~ s/S/GC/;
2182	1					2	$string =~ s/K/GT/;
2183	1					2	$string =~ s/Y/CT/;
2184	1					2	$string =~ s/R/AG/;
2185	1					3	$string =~ s/W/AT/;
2186	1					2	$string =~ s/M/AC/;
2187							}
2188
2189							# and now the guts of the thing
2190
2191	7	100				18	if ($string =~ /A/) {
		50
		50
		50
2192	5					5	$char = 'A'; # A A
2193	5	50				14	if ($string =~ /G/) {
		50
		100
2194	0					0	$char = 'R'; # A and G (purines) R
2195	0	0				0	if ($string =~ /C/) {
		0
2196	0					0	$char = 'V'; # A and G and C V
2197	0	0				0	if ($string =~ /T/) {
2198	0					0	$char = 'N'; # A and G and C and T N
2199							}
2200							} elsif ($string =~ /T/) {
2201	0					0	$char = 'D'; # A and G and T D
2202							}
2203							} elsif ($string =~ /C/) {
2204	0					0	$char = 'M'; # A and C M
2205	0	0				0	if ($string =~ /T/) {
2206	0					0	$char = 'H'; # A and C and T H
2207							}
2208							} elsif ($string =~ /T/) {
2209	2					2	$char = 'W'; # A and T W
2210							}
2211							} elsif ($string =~ /C/) {
2212	0					0	$char = 'C'; # C C
2213	0	0				0	if ($string =~ /T/) {
		0
2214	0					0	$char = 'Y'; # C and T (pyrimidines) Y
2215	0	0				0	if ($string =~ /G/) {
2216	0					0	$char = 'B'; # C and T and G B
2217							}
2218							} elsif ($string =~ /G/) {
2219	0					0	$char = 'S'; # C and G S
2220							}
2221							} elsif ($string =~ /G/) {
2222	0					0	$char = 'G'; # G G
2223	0	0				0	if ($string =~ /C/) {
		0
2224	0					0	$char = 'S'; # G and C S
2225							} elsif ($string =~ /T/) {
2226	0					0	$char = 'K'; # G and T K
2227							}
2228							} elsif ($string =~ /T/) {
2229	2					3	$char = 'T'; # T T
2230							}
2231
2232	7	50	33			12	$char = 'U' if $rna and $char eq 'T';
2233	7	100				12	$char = lc $char if $string =~ /\W/;
2234
2235	7					14	return $char;
2236							}
2237
2238
2239							=head2 consensus_meta
2240
2241							Title : consensus_meta
2242							Usage : $seqmeta = $ali->consensus_meta()
2243							Function : Returns a Bio::Seq::Meta object containing the consensus
2244							strings derived from meta data analysis.
2245							Returns : Bio::Seq::Meta
2246							Argument : Bio::Seq::Meta
2247							Throws : non-MetaI object
2248
2249							=cut
2250
2251							sub consensus_meta {
2252	56			56	1	118	my ($self, $meta) = @_;
2253	56	50	33			285	if ($meta && (!ref $meta \|\| !$meta->isa('Bio::Seq::MetaI'))) {
			66
2254	0					0	$self->throw('Not a Bio::Seq::MetaI object');
2255							}
2256	56	100				136	return $self->{'_aln_meta'} = $meta if $meta;
2257	39					91	return $self->{'_aln_meta'}
2258							}
2259
2260							=head2 is_flush
2261
2262							Title : is_flush
2263							Usage : if ( $ali->is_flush() )
2264							Function : Tells you whether the alignment
2265							: is flush, i.e. all of the same length
2266							Returns : 1 or 0
2267							Argument :
2268
2269							=cut
2270
2271							sub is_flush {
2272	58			58	1	118	my ( $self, $report ) = @_;
2273	58					71	my $seq;
2274	58					73	my $length = (-1);
2275	58					72	my $temp;
2276
2277	58					136	foreach $seq ( $self->each_seq() ) {
2278	258	100				380	if ( $length == (-1) ) {
2279	58					166	$length = CORE::length( $seq->seq() );
2280	58					103	next;
2281							}
2282
2283	200					330	$temp = CORE::length( $seq->seq() );
2284	200	50				354	if ( $temp != $length ) {
2285	0	0				0	$self->warn(
2286							"expecting $length not $temp from " . $seq->display_id )
2287							if ($report);
2288	0					0	$self->debug(
2289							"expecting $length not $temp from " . $seq->display_id );
2290	0					0	$self->debug( $seq->seq() . "\n" );
2291	0					0	return 0;
2292							}
2293							}
2294
2295	58					170	return 1;
2296							}
2297
2298
2299							=head2 length
2300
2301							Title : length()
2302							Usage : $len = $ali->length()
2303							Function : Returns the maximum length of the alignment.
2304							To be sure the alignment is a block, use is_flush
2305							Returns : Integer
2306							Argument :
2307
2308							=cut
2309
2310							sub length_aln {
2311	0			0	0	0	my $self = shift;
2312	0					0	$self->deprecated("length_aln - deprecated method. Use length() instead.");
2313	0					0	$self->length(@_);
2314							}
2315
2316							sub length {
2317	2221			2221	1	3851	my $self = shift;
2318	2221					2014	my $seq;
2319	2221					2111	my $length = -1;
2320	2221					2016	my $temp;
2321
2322	2221					3029	foreach $seq ( $self->each_seq() ) {
2323	8534					12122	$temp = $seq->length();
2324	8534	100				11950	if( $temp > $length ) {
2325	2231					2501	$length = $temp;
2326							}
2327							}
2328
2329	2221					4284	return $length;
2330							}
2331
2332
2333							=head2 maxdisplayname_length
2334
2335							Title : maxdisplayname_length
2336							Usage : $ali->maxdisplayname_length()
2337							Function : Gets the maximum length of the displayname in the
2338							alignment. Used in writing out various MSA formats.
2339							Returns : integer
2340							Argument :
2341
2342							=cut
2343
2344							sub maxname_length {
2345	0			0	1	0	my $self = shift;
2346	0					0	$self->deprecated("maxname_length - deprecated method.".
2347							" Use maxdisplayname_length() instead.");
2348	0					0	$self->maxdisplayname_length();
2349							}
2350
2351							sub maxnse_length {
2352	0			0	0	0	my $self = shift;
2353	0					0	$self->deprecated("maxnse_length - deprecated method.".
2354							" Use maxnse_length() instead.");
2355	0					0	$self->maxdisplayname_length();
2356							}
2357
2358							sub maxdisplayname_length {
2359	21			21	1	32	my $self = shift;
2360	21					33	my $maxname = (-1);
2361	21					27	my ( $seq, $len );
2362
2363	21					53	foreach $seq ( $self->each_seq() ) {
2364	122					207	$len = CORE::length $self->displayname( $seq->get_nse() );
2365
2366	122	100				227	if ( $len > $maxname ) {
2367	52					71	$maxname = $len;
2368							}
2369							}
2370
2371	21					53	return $maxname;
2372							}
2373
2374							=head2 max_metaname_length
2375
2376							Title : max_metaname_length
2377							Usage : $ali->max_metaname_length()
2378							Function : Gets the maximum length of the meta name tags in the
2379							alignment for the sequences and for the alignment.
2380							Used in writing out various MSA formats.
2381							Returns : integer
2382							Argument : None
2383
2384							=cut
2385
2386							sub max_metaname_length {
2387	2			2	1	3	my $self = shift;
2388	2					4	my $maxname = (-1);
2389	2					4	my ($seq,$len);
2390
2391							# check seq meta first
2392	2					6	for $seq ( $self->each_seq() ) {
2393	17	100				42	next if !$seq->isa('Bio::Seq::MetaI' \|\| !$seq->meta_names);
2394	11					16	for my $mtag ($seq->meta_names) {
2395	0					0	$len = CORE::length $mtag;
2396	0	0				0	if( $len > $maxname ) {
2397	0					0	$maxname = $len;
2398							}
2399							}
2400							}
2401
2402							# alignment meta
2403	2					7	for my $meta ($self->consensus_meta) {
2404	2	100				6	next unless $meta;
2405	1					3	for my $name ($meta->meta_names) {
2406	1					11	$len = CORE::length $name;
2407	1	50				3	if( $len > $maxname ) {
2408	1					3	$maxname = $len;
2409							}
2410							}
2411							}
2412
2413	2					8	return $maxname;
2414							}
2415
2416							=head2 num_residues
2417
2418							Title : num_residues
2419							Usage : $no = $ali->num_residues
2420							Function : number of residues in total in the alignment
2421							Returns : integer
2422							Argument :
2423							Note : replaces no_residues()
2424
2425							=cut
2426
2427							sub num_residues {
2428	2			2	1	5	my $self = shift;
2429	2					4	my $count = 0;
2430
2431	2					6	foreach my $seq ( $self->each_seq ) {
2432	19					35	my $str = $seq->seq();
2433
2434	19					851	$count += ( $str =~ s/[A-Za-z]//g );
2435							}
2436
2437	2					10	return $count;
2438							}
2439
2440							=head2 num_sequences
2441
2442							Title : num_sequences
2443							Usage : $depth = $ali->num_sequences
2444							Function : number of sequence in the sequence alignment
2445							Returns : integer
2446							Argument : none
2447							Note : replaces no_sequences()
2448
2449							=cut
2450
2451							sub num_sequences {
2452	2553			2553	1	4026	my $self = shift;
2453	2553					3384	return scalar($self->each_seq);
2454							}
2455
2456							=head2 average_percentage_identity
2457
2458							Title : average_percentage_identity
2459							Usage : $id = $align->average_percentage_identity
2460							Function: The function uses a fast method to calculate the average
2461							percentage identity of the alignment
2462							Returns : The average percentage identity of the alignment
2463							Args : None
2464							Notes : This method implemented by Kevin Howe calculates a figure that is
2465							designed to be similar to the average pairwise identity of the
2466							alignment (identical in the absence of gaps), without having to
2467							explicitly calculate pairwise identities proposed by Richard Durbin.
2468							Validated by Ewan Birney ad Alex Bateman.
2469
2470							=cut
2471
2472							sub average_percentage_identity{
2473	9			9	1	23	my ($self,@args) = @_;
2474
2475	9					56	my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2476							'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2477
2478	9					16	my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2479
2480	9	50				25	if (! $self->is_flush()) {
2481	0					0	$self->throw("All sequences in the alignment must be the same length");
2482							}
2483
2484	9					25	@seqs = $self->each_seq();
2485	9					31	$len = $self->length();
2486
2487							# load the each hash with correct keys for existence checks
2488
2489	9					33	for( my $index=0; $index < $len; $index++) {
2490	2096					2210	foreach my $letter (@alphabet) {
2491	54496					61088	$countHashes[$index]->{$letter} = 0;
2492							}
2493							}
2494	9					33	foreach my $seq (@seqs) {
2495	32					154	my @seqChars = split //, $seq->seq();
2496	32					94	for( my $column=0; $column < @seqChars; $column++ ) {
2497	7580					7320	my $char = uc($seqChars[$column]);
2498	7580	100				10245	if (exists $countHashes[$column]->{$char}) {
2499	7077					10253	$countHashes[$column]->{$char}++;
2500							}
2501							}
2502							}
2503
2504	9					27	$total = 0;
2505	9					20	$divisor = 0;
2506	9					39	for(my $column =0; $column < $len; $column++) {
2507	2096					2196	my %hash = %{$countHashes[$column]};
	2096					14337
2508	2096					4148	$subdivisor = 0;
2509	2096					5970	foreach my $res (keys %hash) {
2510	54496					56985	$total += $hash{$res}*($hash{$res} - 1);
2511	54496					56075	$subdivisor += $hash{$res};
2512							}
2513	2096					7414	$divisor += $subdivisor * ($subdivisor - 1);
2514							}
2515	9	50				2968	return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2516							}
2517
2518							=head2 percentage_identity
2519
2520							Title : percentage_identity
2521							Usage : $id = $align->percentage_identity
2522							Function: The function calculates the average percentage identity
2523							(aliased to average_percentage_identity)
2524							Returns : The average percentage identity
2525							Args : None
2526
2527							=cut
2528
2529							sub percentage_identity {
2530	5			5	1	10	my $self = shift;
2531	5					18	return $self->average_percentage_identity();
2532							}
2533
2534							=head2 overall_percentage_identity
2535
2536							Title : overall_percentage_identity
2537							Usage : $id = $align->overall_percentage_identity
2538							$id = $align->overall_percentage_identity('short')
2539							Function: The function calculates the percentage identity of
2540							the conserved columns
2541							Returns : The percentage identity of the conserved columns
2542							Args : length value to use, optional defaults to alignment length
2543							possible values: 'align', 'short', 'long'
2544
2545							The argument values 'short' and 'long' refer to shortest and longest
2546							sequence in the alignment. Method modification code by Hongyu Zhang.
2547
2548							=cut
2549
2550							sub overall_percentage_identity{
2551	22			22	1	55	my ($self, $length_measure) = @_;
2552
2553	22					48	my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
	572					757
2554
2555	22					54	my %enum = map {$_ => undef} qw (align short long);
	66					112
2556
2557							$self->throw("Unknown argument [$length_measure]")
2558	22	50	66			67	if $length_measure and not exists $enum{$length_measure};
2559	22		100			83	$length_measure \|\|= 'align';
2560
2561	22	50				53	if (! $self->is_flush()) {
2562	0					0	$self->throw("All sequences in the alignment must be the same length");
2563							}
2564
2565							# Count the residues seen at each position
2566	22					31	my $len;
2567	22					28	my $total = 0; # number of positions with identical residues
2568	22					28	my @countHashes;
2569	22					43	my @seqs = $self->each_seq;
2570	22					30	my $nof_seqs = scalar @seqs;
2571	22					44	my $aln_len = $self->length();
2572	22					31	for my $seq (@seqs) {
2573	100					223	my $seqstr = $seq->seq;
2574
2575							# Count residues for given sequence
2576	100					168	for my $column (0 .. $aln_len-1) {
2577	17908					17372	my $char = uc( substr($seqstr, $column, 1) );
2578	17908	100				21086	if ( exists $alphabet{$char} ) {
2579
2580							# This is a valid char
2581	17139	100				18512	if ( defined $countHashes[$column]->{$char} ) {
2582	12795					10696	$countHashes[$column]->{$char}++;
2583							} else {
2584	4344					4492	$countHashes[$column]->{$char} = 1;
2585							}
2586
2587	17139	100				21689	if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2588							# All sequences have this same residue
2589	851					818	$total++;
2590							}
2591
2592							}
2593							}
2594
2595							# Sequence length
2596	100	100	100			293	if ($length_measure eq 'short' \|\| $length_measure eq 'long') {
2597	32					50	my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2598	32	100				49	if ($length_measure eq 'short') {
		50
2599	16	100	100			45	if ( (not defined $len) \|\| ($seq_len < $len) ) {
2600	5					8	$len = $seq_len;
2601							}
2602							} elsif ($length_measure eq 'long') {
2603	16	100	100			42	if ( (not defined $len) \|\| ($seq_len > $len) ) {
2604	2					4	$len = $seq_len;
2605							}
2606							}
2607							}
2608
2609							}
2610
2611	22	100				51	if ($length_measure eq 'align') {
2612	20					23	$len = $aln_len;
2613							}
2614
2615	22					584	return ($total / $len ) * 100.0;
2616							}
2617
2618
2619
2620							=head1 Alignment positions
2621
2622							Methods to map a sequence position into an alignment column and back.
2623							column_from_residue_number() does the former. The latter is really a
2624							property of the sequence object and can done using
2625							L:
2626
2627							# select somehow a sequence from the alignment, e.g.
2628							my $seq = $aln->get_seq_by_pos(1);
2629							#$loc is undef or Bio::LocationI object
2630							my $loc = $seq->location_from_column(5);
2631
2632							=head2 column_from_residue_number
2633
2634							Title : column_from_residue_number
2635							Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2636							Function: This function gives the position in the alignment
2637							(i.e. column number) of the given residue number in the
2638							sequence with the given name. For example, for the
2639							alignment
2640
2641							Seq1/91-97 AC..DEF.GH.
2642							Seq2/24-30 ACGG.RTY...
2643							Seq3/43-51 AC.DDEF.GHI
2644
2645							column_from_residue_number( "Seq1", 94 ) returns 6.
2646							column_from_residue_number( "Seq2", 25 ) returns 2.
2647							column_from_residue_number( "Seq3", 50 ) returns 10.
2648
2649							An exception is thrown if the residue number would lie
2650							outside the length of the aligment
2651							(e.g. column_from_residue_number( "Seq2", 22 )
2652
2653							Note: If the the parent sequence is represented by more than
2654							one alignment sequence and the residue number is present in
2655							them, this method finds only the first one.
2656
2657							Returns : A column number for the position in the alignment of the
2658							given residue in the given sequence (1 = first column)
2659							Args : A sequence id/name (not a name/start-end)
2660							A residue number in the whole sequence (not just that
2661							segment of it in the alignment)
2662
2663							=cut
2664
2665							sub column_from_residue_number {
2666	27			27	1	506	my ( $self, $name, $resnumber ) = @_;
2667
2668							$self->throw("No sequence with name [$name]")
2669	27	50				71	unless $self->{'_start_end_lists'}->{$name};
2670	27	50				50	$self->throw("Second argument residue number missing") unless $resnumber;
2671
2672	27					72	foreach my $seq ( $self->each_seq_with_id($name) ) {
2673	27					37	my $col;
2674	27					39	eval { $col = $seq->column_from_residue_number($resnumber); };
	27					75
2675	27	50				62	next if $@;
2676	27					67	return $col;
2677							}
2678
2679	0					0	$self->throw( "Could not find a sequence segment in $name "
2680							. "containing residue number $resnumber" );
2681
2682							}
2683
2684							=head1 Sequence names
2685
2686							Methods to manipulate the display name. The default name based on the
2687							sequence id and subsequence positions can be overridden in various
2688							ways.
2689
2690							=head2 displayname
2691
2692							Title : displayname
2693							Usage : $myalign->displayname("Ig", "IgA")
2694							Function : Gets/sets the display name of a sequence in the alignment
2695							Returns : A display name string
2696							Argument : name of the sequence
2697							displayname of the sequence (optional)
2698
2699							=cut
2700
2701							sub displayname {
2702	550			550	1	779	my ( $self, $name, $disname ) = @_;
2703
2704							$self->throw("No sequence with name [$name]")
2705	550	50				919	unless defined $self->{'_seq'}->{$name};
2706
2707	550	100	66			1206	if ( $disname and $name ) {
		100
2708	112					164	$self->{'_dis_name'}->{$name} = $disname;
2709	112					153	return $disname;
2710							}
2711							elsif ( defined $self->{'_dis_name'}->{$name} ) {
2712	48					165	return $self->{'_dis_name'}->{$name};
2713							}
2714							else {
2715	390					794	return $name;
2716							}
2717							}
2718
2719							sub get_displayname {
2720	0			0	0	0	my $self = shift;
2721	0					0	$self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2722	0					0	$self->displayname(@_);
2723							}
2724
2725							sub set_displayname {
2726	0			0	0	0	my $self = shift;
2727	0					0	$self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2728	0					0	$self->displayname(@_);
2729							}
2730
2731
2732							=head2 set_displayname_count
2733
2734							Title : set_displayname_count
2735							Usage : $ali->set_displayname_count
2736							Function : Sets the names to be name_# where # is the number of
2737							times this name has been used.
2738							Returns : 1, on success
2739							Argument :
2740
2741							=cut
2742
2743							sub set_displayname_count {
2744	1			1	1	3	my $self= shift;
2745	1					2	my (@arr,$name,$seq,$count,$temp,$nse);
2746
2747	1					4	foreach $seq ( $self->each_alphabetically() ) {
2748	16					25	$nse = $seq->get_nse();
2749
2750							#name will be set when this is the second
2751							#time (or greater) is has been seen
2752
2753	16	50	66			34	if( defined $name and $name eq ($seq->id()) ) {
2754	0					0	$temp = sprintf("%s_%s",$name,$count);
2755	0					0	$self->displayname($nse,$temp);
2756	0					0	$count++;
2757							} else {
2758	16					18	$count = 1;
2759	16					23	$name = $seq->id();
2760	16					36	$temp = sprintf("%s_%s",$name,$count);
2761	16					24	$self->displayname($nse,$temp);
2762	16					19	$count++;
2763							}
2764							}
2765	1					6	return 1;
2766							}
2767
2768							=head2 set_displayname_flat
2769
2770							Title : set_displayname_flat
2771							Usage : $ali->set_displayname_flat()
2772							Function : Makes all the sequences be displayed as just their name,
2773							not name/start-end (NSE)
2774							Returns : 1
2775							Argument :
2776
2777							=cut
2778
2779							sub set_displayname_flat {
2780	11			11	1	26	my $self = shift;
2781	11					20	my ( $nse, $seq );
2782
2783	11					33	foreach $seq ( $self->each_seq() ) {
2784	79					143	$nse = $seq->get_nse();
2785	79					151	$self->displayname( $nse, $seq->id() );
2786							}
2787	11					30	return 1;
2788							}
2789
2790
2791							=head2 set_displayname_normal
2792
2793							Title : set_displayname_normal
2794							Usage : $ali->set_displayname_normal()
2795							Function : Makes all the sequences be displayed as name/start-end (NSE)
2796							Returns : 1, on success
2797							Argument :
2798
2799							=cut
2800
2801							sub set_displayname_normal {
2802	1			1	1	3	my $self = shift;
2803	1					3	my ( $nse, $seq );
2804
2805	1					4	foreach $seq ( $self->each_seq() ) {
2806	16					23	$nse = $seq->get_nse();
2807	16					24	$self->displayname( $nse, $nse );
2808							}
2809	1					5	return 1;
2810							}
2811
2812							=head2 source
2813
2814							Title : source
2815							Usage : $obj->source($newval)
2816							Function: sets the Alignment source program
2817							Example :
2818							Returns : value of source
2819							Args : newvalue (optional)
2820
2821							=cut
2822
2823							sub source {
2824	123			123	1	2059	my ( $self, $value ) = @_;
2825	123	100				366	if ( defined $value ) {
2826	111					325	$self->{'_source'} = $value;
2827							}
2828	123					302	return $self->{'_source'};
2829							}
2830
2831
2832							=head2 set_displayname_safe
2833
2834							Title : set_displayname_safe
2835							Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2836							Function : Assign machine-generated serial names to sequences in input order.
2837							Designed to protect names during PHYLIP runs. Assign 10-char string
2838							in the form of "S000000001" to "S999999999". Restore the original
2839							names using "restore_displayname".
2840							Returns : 1. a new $aln with system names;
2841							2. a hash ref for restoring names
2842							Argument : Number for id length (default 10)
2843
2844							=cut
2845
2846							sub set_displayname_safe {
2847	1			1	1	2	my $self = shift;
2848	1		50			6	my $idlength = shift \|\| 10;
2849	1					2	my ( $seq, %phylip_name );
2850	1					1	my $ct = 0;
2851	1					3	my $new = Bio::SimpleAlign->new();
2852	1					3	foreach $seq ( $self->each_seq() ) {
2853	14					15	$ct++;
2854	14					48	my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2855	14					31	$phylip_name{$pname} = $seq->id();
2856							my $new_seq = Bio::LocatableSeq->new(
2857							-id => $pname,
2858							-seq => $seq->seq(),
2859							-alphabet => $seq->alphabet,
2860							-start => $seq->{_start},
2861							-end => $seq->{_end}
2862	14					28	);
2863	14					42	$new->add_seq($new_seq);
2864							}
2865
2866							$self->debug(
2867	1					6	"$ct seq names changed. Restore names by using restore_displayname.");
2868	1					4	return ( $new, \%phylip_name );
2869							}
2870
2871
2872							=head2 restore_displayname
2873
2874							Title : restore_displayname
2875							Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2876							Function : Restore original sequence names (after running
2877							$ali->set_displayname_safe)
2878							Returns : a new $aln with names restored.
2879							Argument : a hash reference of names from "set_displayname_safe".
2880
2881							=cut
2882
2883							sub restore_displayname {
2884	1			1	1	2	my $self = shift;
2885	1					2	my $ref=shift;
2886	1					9	my %name=%$ref;
2887	1					3	my $new=Bio::SimpleAlign->new();
2888	1					3	foreach my $seq ( $self->each_seq() ) {
2889	14	50				24	$self->throw("No sequence with name") unless defined $name{$seq->id()};
2890							my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2891							-seq => $seq->seq(),
2892							-alphabet => $seq->alphabet,
2893							-start => $seq->{_start},
2894							-end => $seq->{_end}
2895	14					23	);
2896	14					51	$new->add_seq($new_seq);
2897							}
2898	1					5	return $new;
2899							}
2900
2901							=head2 sort_by_start
2902
2903							Title : sort_by_start
2904							Usage : $ali->sort_by_start
2905							Function : Changes the order of the alignment to the start position of each
2906							subalignment
2907							Returns : 1 on success
2908							Argument :
2909
2910							=cut
2911
2912							sub sort_by_start {
2913	1			1	1	2	my $self = shift;
2914	1					2	my ($seq,$nse,@arr,%hash,$count);
2915	1					2	foreach $seq ( $self->each_seq() ) {
2916	3					6	$nse = $seq->get_nse;
2917	3					7	$hash{$nse} = $seq;
2918							}
2919	1					1	$count = 0;
2920	1					3	%{$self->{'_order'}} = (); # reset the hash;
	1					3
2921	1					5	foreach $nse ( sort _startend keys %hash) {
2922	3					5	$self->{'_order'}->{$count} = $nse;
2923	3					4	$count++;
2924							}
2925	1					3	1;
2926							}
2927
2928							sub _startend {
2929	3			3		8	my ($aname,$arange) = split (/[\/]/,$a);
2930	3					6	my ($bname,$brange) = split (/[\/]/,$b);
2931	3					4	my ($astart,$aend) = split(/\-/,$arange);
2932	3					6	my ($bstart,$bend) = split(/\-/,$brange);
2933	3					6	return $astart <=> $bstart;
2934							}
2935
2936							=head2 bracket_string
2937
2938							Title : bracket_string
2939							Usage : my @params = (-refseq => 'testseq',
2940							-allele1 => 'allele1',
2941							-allele2 => 'allele2',
2942							-delimiters => '{}',
2943							-separator => '/');
2944							$str = $aln->bracket_string(@params)
2945
2946							Function : When supplied with a list of parameters (see below), returns a
2947							string in BIC format. This is used for allelic comparisons.
2948							Briefly, if either allele contains a base change when compared to
2949							the refseq, the base or gap for each allele is represented in
2950							brackets in the order present in the 'alleles' parameter.
2951
2952							For the following data:
2953
2954							>testseq
2955							GGATCCATTGCTACT
2956							>allele1
2957							GGATCCATTCCTACT
2958							>allele2
2959							GGAT--ATTCCTCCT
2960
2961							the returned string with parameters 'refseq => testseq' and
2962							'alleles => [qw(allele1 allele2)]' would be:
2963
2964							GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2965							Returns : BIC-formatted string
2966							Argument : Required args
2967							refseq : string (ID) of the reference sequence used
2968							as basis for comparison
2969							allele1 : string (ID) of the first allele
2970							allele2 : string (ID) of the second allele
2971							Optional args
2972							delimiters: two symbol string of left and right delimiters.
2973							Only the first two symbols are used
2974							default = '[]'
2975							separator : string used as a separator. Only the first
2976							symbol is used
2977							default = '/'
2978							Throws : On no refseq/alleles, or invalid refseq/alleles.
2979
2980							=cut
2981
2982							sub bracket_string {
2983	14			14	1	5832	my ($self, @args) = @_;
2984	14					53	my ($ref, $a1, $a2, $delim, $sep) =
2985							$self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2986	14	50	33			61	$self->throw('Missing refseq/allele1/allele2') if (!$a1 \|\| !$a2 \|\| !$ref);
			33
2987	14					16	my ($ld, $rd);
2988	14	100				41	($ld, $rd) = split('', $delim, 2) if $delim;
2989	14		100			33	$ld \|\|= '[';
2990	14		100			25	$rd \|\|= ']';
2991	14		100			30	$sep \|\|= '/';
2992							my ($refseq, $allele1, $allele2) =
2993	14					18	map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
	42					59
2994	14	50	33			57	if (!$refseq \|\| !$allele1 \|\| !$allele2) {
			33
2995	0					0	$self->throw("One of your refseq/allele IDs is invalid!");
2996							}
2997	14					22	my $len = $self->length-1;
2998	14					15	my $bic = '';
2999							# loop over the alignment columns
3000	14					25	for my $column ( 0 .. $len ) {
3001	210					169	my $string;
3002							my ($compres, $res1, $res2) =
3003	210					206	map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
	630					799
3004							# are any of the allele symbols different from the refseq?
3005	210	100	100			468	$string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3006							$ld.$res1.$sep.$res2.$rd;
3007	210					258	$bic .= $string;
3008							}
3009	14					40	return $bic;
3010							}
3011
3012
3013							=head2 methods implementing Bio::FeatureHolderI
3014
3015							FeatureHolderI implementation to support labeled character sets like one
3016							would get from NEXUS represented data.
3017
3018							=head2 get_SeqFeatures
3019
3020							Usage : @features = $aln->get_SeqFeatures
3021							Function: Get the feature objects held by this feature holder.
3022							Example :
3023							Returns : an array of Bio::SeqFeatureI implementing objects
3024							Args : optional filter coderef, taking a Bio::SeqFeatureI
3025							: as argument, returning TRUE if wanted, FALSE if
3026							: unwanted
3027
3028							=cut
3029
3030							sub get_SeqFeatures {
3031	4			4	1	19	my $self = shift;
3032	4					5	my $filter_cb = shift;
3033	4	50	33			19	$self->throw("Arg (filter callback) must be a coderef")
3034							unless !defined($filter_cb)
3035							or ref($filter_cb) eq 'CODE';
3036	4	100				14	if ( !defined $self->{'_as_feat'} ) {
3037	2					5	$self->{'_as_feat'} = [];
3038							}
3039	4	50				13	if ($filter_cb) {
3040	0					0	return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
	0					0
	0					0
3041							}
3042	4					6	return @{ $self->{'_as_feat'} };
	4					15
3043							}
3044
3045
3046							=head2 add_SeqFeature
3047
3048							Usage : $aln->add_SeqFeature($subfeat);
3049							Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3050							(see L) is supported, but has no effect.
3051							Example :
3052							Returns : 1 on success
3053							Args : a Bio::SeqFeatureI object
3054
3055							=cut
3056
3057							sub add_SeqFeature {
3058	9			9	1	16	my ($self, @feat) = @_;
3059
3060	9	100				19	$self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3061
3062	9	50				15	if (scalar @feat > 1) {
3063	0					0	$self->deprecated(
3064							-message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3065							' is deprecated and will be removed in a future version. '.
3066							'Add a single feature at a time instead.',
3067							-warn_version => 1.007,
3068							-throw_version => 1.009,
3069							);
3070							}
3071
3072	9					14	for my $feat ( @feat ) {
3073
3074	9	50				18	next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3075
3076	9	50				42	if( !$feat->isa("Bio::SeqFeatureI") ) {
3077	0					0	$self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3078							}
3079
3080	9					8	push @{$self->{'_as_feat'}}, $feat;
	9					19
3081							}
3082	9					12	return 1;
3083							}
3084
3085
3086							=head2 remove_SeqFeatures
3087
3088							Usage : $obj->remove_SeqFeatures
3089							Function: Removes all SeqFeatures. If you want to remove only a subset,
3090							remove that subset from the returned array, and add back the rest.
3091							Returns : The array of Bio::SeqFeatureI features that was
3092							deleted from this alignment.
3093							Args : none
3094
3095							=cut
3096
3097							sub remove_SeqFeatures {
3098	0			0	1	0	my $self = shift;
3099
3100	0	0				0	return () unless $self->{'_as_feat'};
3101	0					0	my @feats = @{$self->{'_as_feat'}};
	0					0
3102	0					0	$self->{'_as_feat'} = [];
3103	0					0	return @feats;
3104							}
3105
3106							=head2 feature_count
3107
3108							Title : feature_count
3109							Usage : $obj->feature_count()
3110							Function: Return the number of SeqFeatures attached to the alignment
3111							Returns : integer representing the number of SeqFeatures
3112							Args : None
3113
3114							=cut
3115
3116							sub feature_count {
3117	2			2	1	3	my ($self) = @_;
3118
3119	2	50				5	if (defined($self->{'_as_feat'})) {
3120	2					2	return ($#{$self->{'_as_feat'}} + 1);
	2					9
3121							} else {
3122	0					0	return 0;
3123							}
3124							}
3125
3126							=head2 get_all_SeqFeatures
3127
3128							Title : get_all_SeqFeatures
3129							Usage :
3130							Function: Get all SeqFeatures.
3131							Example :
3132							Returns : an array of Bio::SeqFeatureI implementing objects
3133							Args : none
3134							Note : Falls through to Bio::FeatureHolderI implementation.
3135
3136							=cut
3137
3138							=head2 methods for Bio::AnnotatableI
3139
3140							AnnotatableI implementation to support sequence alignments which
3141							contain annotation (NEXUS, Stockholm).
3142
3143							=head2 annotation
3144
3145							Title : annotation
3146							Usage : $ann = $aln->annotation or
3147							$aln->annotation($ann)
3148							Function: Gets or sets the annotation
3149							Returns : Bio::AnnotationCollectionI object
3150							Args : None or Bio::AnnotationCollectionI object
3151
3152							See L and L
3153							for more information
3154
3155							=cut
3156
3157							sub annotation {
3158	87			87	1	146	my ($obj,$value) = @_;
3159	87	100				219	if( defined $value ) {
		100
3160	46	50				197	$obj->throw("object of class ".ref($value)." does not implement ".
3161							"Bio::AnnotationCollectionI. Too bad.")
3162							unless $value->isa("Bio::AnnotationCollectionI");
3163	46					87	$obj->{'_annotation'} = $value;
3164							} elsif( ! defined $obj->{'_annotation'}) {
3165	21					114	$obj->{'_annotation'} = Bio::Annotation::Collection->new();
3166							}
3167	87					179	return $obj->{'_annotation'};
3168							}
3169
3170							=head1 Deprecated methods
3171
3172							=cut
3173
3174							=head2 no_residues
3175
3176							Title : no_residues
3177							Usage : $no = $ali->no_residues
3178							Function : number of residues in total in the alignment
3179							Returns : integer
3180							Argument :
3181							Note : deprecated in favor of num_residues()
3182
3183							=cut
3184
3185							sub no_residues {
3186	0			0	1	0	my $self = shift;
3187	0					0	$self->deprecated(-warn_version => 1.0069,
3188							-throw_version => 1.0075,
3189							-message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3190	0					0	$self->num_residues(@_);
3191							}
3192
3193							=head2 no_sequences
3194
3195							Title : no_sequences
3196							Usage : $depth = $ali->no_sequences
3197							Function : number of sequence in the sequence alignment
3198							Returns : integer
3199							Argument :
3200							Note : deprecated in favor of num_sequences()
3201
3202							=cut
3203
3204							sub no_sequences {
3205	0			0	1	0	my $self = shift;
3206	0					0	$self->deprecated(-warn_version => 1.0069,
3207							-throw_version => 1.0075,
3208							-message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3209	0					0	$self->num_sequences(@_);
3210							}
3211
3212							=head2 mask_columns
3213
3214							Title : mask_columns
3215							Usage : $aln2 = $aln->mask_columns(20,30)
3216							Function : Masks a slice of the alignment inclusive of start and
3217							end columns, and the first column in the alignment is denoted 1.
3218							Mask beyond the length of the sequence does not do padding.
3219							Returns : A Bio::SimpleAlign object
3220							Args : Positive integer for start column, positive integer for end column,
3221							optional string value use for the mask. Example:
3222
3223							$aln2 = $aln->mask_columns(20,30,'?')
3224							Note : Masking must use a character that is not used for gaps or
3225							frameshifts. These can be adjusted using the relevant global
3226							variables, but be aware these may be (uncontrollably) modified
3227							elsewhere within BioPerl (see bug 2715)
3228
3229							=cut
3230
3231							sub mask_columns {
3232							#based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3233	6			6	1	1373	my $self = shift;
3234
3235	6					12	my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3236							$Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3237
3238							# coordinates are alignment-based, not sequence-based
3239	6					16	my ($start, $end, $mask_char) = @_;
3240	6	50				16	unless (defined $mask_char) { $mask_char = 'N' }
	0					0
3241
3242	6	50	33			74	$self->throw("Mask start has to be a positive integer and less than ".
			33
3243							"alignment length, not [$start]")
3244							unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3245	6	50	33			40	$self->throw("Mask end has to be a positive integer and less than ".
			33
3246							"alignment length, not [$end]")
3247							unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3248	6	50				14	$self->throw("Mask start [$start] has to be smaller than or equal to ".
3249							"end [$end]") unless $start <= $end;
3250	6	50	33			39	$self->throw("Mask character $mask_char has to be a single character ".
3251							"and not a gap or frameshift symbol")
3252							unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3253
3254	6					16	my $aln = $self->new;
3255	6					19	$aln->id($self->id);
3256	6					10	foreach my $seq ( $self->each_seq() ) {
3257	21					42	my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3258							-alphabet => $seq->alphabet,
3259							-strand => $seq->strand,
3260							-verbose => $self->verbose);
3261
3262							# convert from 1-based alignment coords!
3263	21					47	my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3264	21					154	$masked_string =~ s{[^$nonres]}{$mask_char}g;
3265	21					45	my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3266	21					45	$new_seq->seq($new_dna_string);
3267	21					40	$aln->add_seq($new_seq);
3268							}
3269							# Preserve chosen mask character, it may be need later (like in 'slice')
3270	6					16	$aln->{_mask_char} = $mask_char;
3271	6					18	return $aln;
3272							}
3273
3274							1;