File Coverage

blib/lib/Bio/MUST/Core/Ali.pm

Criterion	Covered	Total	%
statement	278	279	99.6
branch	65	76	85.5
condition	14	18	77.7
subroutine	56	57	98.2
pod	33	33	100.0
total	446	463	96.3

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Core::Ali;
2							# ABSTRACT: Multiple sequence alignment
3							# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
4							# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
5							$Bio::MUST::Core::Ali::VERSION = '0.212530';
6	17			17		13523	use Moose;
	17					45
	17					138
7	17			17		92305	use namespace::autoclean;
	17					49
	17					254
8
9	17			17		1696	use autodie;
	17					48
	17					146
10	17			17		90732	use feature qw(say);
	17					45
	17					1626
11
12							# use Smart::Comments;
13
14	17			17		135	use Carp;
	17					58
	17					1370
15	17			17		144	use File::Temp;
	17					40
	17					1573
16	17			17		129	use List::AllUtils qw(uniq indexes sum0);
	17					37
	17					1064
17	17			17		120	use Path::Class qw(file);
	17					52
	17					1261
18	17			17		120	use POSIX qw(ceil floor);
	17					38
	17					144
19	17			17		12536	use Statistics::Descriptive;
	17					388150
	17					693
20	17			17		10001	use Tie::IxHash;
	17					43254
	17					756
21
22	17			17		167	use Bio::MUST::Core::Types;
	17					51
	17					519
23	17			17		100	use Bio::MUST::Core::Constants qw(:gaps :files);
	17					40
	17					4109
24	17			17		145	use aliased 'Bio::MUST::Core::Seq';
	17					43
	17					184
25	17			17		3858	use aliased 'Bio::MUST::Core::SeqMask';
	17					44
	17					76
26
27							# TODO: add information about methods available in Ali-like objects
28
29							# ATTRIBUTES
30
31
32							has 'seqs' => (
33							traits => ['Array'],
34							is => 'ro',
35							isa => 'ArrayRef[Bio::MUST::Core::Seq]',
36							default => sub { [] },
37							writer => '_set_seqs',
38							handles => {
39							add_seq => 'push',
40							get_seq => 'get',
41							set_seq => 'set',
42							delete_seq => 'delete',
43							insert_seq => 'insert',
44							count_seqs => 'count',
45							all_seqs => 'elements',
46							first_seq => 'first',
47							filter_seqs => 'grep',
48							},
49							);
50
51
52							has 'file' => (
53							is => 'ro',
54							isa => 'Bio::MUST::Core::Types::File',
55							default => 'untitled.ali',
56							coerce => 1,
57							handles => {
58							filename => 'stringify',
59							},
60							);
61
62
63							has 'guessing' => (
64							traits => ['Bool'],
65							is => 'ro',
66							isa => 'Bool',
67							default => 1,
68							handles => {
69							dont_guess => 'unset',
70							guess => 'set',
71							},
72							);
73
74
75							with 'Bio::MUST::Core::Roles::Commentable',
76							'Bio::MUST::Core::Roles::Listable';
77							with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths)
78
79							# CONSTRUCTORS
80
81
82
83							sub clone {
84	1			1	1	3	my $self = shift;
85
86							return $self->new(
87							comments => [ $self->all_comments ],
88	1					51	seqs => [ map { $_->clone } $self->all_seqs ],
	250					789
89							file => file( $self->filename ),
90							guessing => $self->guessing,
91							);
92							}
93
94							# ACCESSORS
95
96
97							sub get_seq_with_id {
98	4			4	1	2613	my $self = shift;
99	4					10	my $id = shift;
100
101	4			11		209	my $seq = $self->first_seq( sub { $_->full_id eq $id } );
	11					46
102	4	100				53	carp "[BMC] Warning: cannot find seq with id: $id; returning undef!"
103							unless $seq;
104
105	4					450	return $seq;
106							}
107
108
109							sub all_new_seqs {
110	6			6	1	31	return shift->filter_seqs( sub { $_->is_new } );
	2			2		97
111							}
112
113
114							sub all_but_new_seqs {
115	3			3	1	11	return shift->filter_seqs( sub { not $_->is_new } );
	1			1		47
116							}
117
118
119							sub all_seq_ids {
120	105			105	1	3689	my $self = shift;
121	105					4403	return map { $_->seq_id } $self->all_seqs;
	1510					37004
122							}
123
124							# PROPERTIES
125
126
127							sub has_uniq_ids {
128	64			64	1	1591	my $self = shift;
129
130	64					307	my @ids = map { $_->full_id } $self->all_seq_ids;
	941					23577
131	64	100				2431	return 1 if $self->count_seqs == uniq @ids;
132	7					234	return 0;
133							}
134
135
136							sub is_protein {
137	77			77	1	2649	my $self = shift;
138	77	100		96		3574	return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs;
	96					555
139	5					37	return 0;
140							}
141
142
143							sub is_aligned {
144	310			310	1	681	my $self = shift;
145	310	100				11726	return 0 if not $self->guessing;
146	307	100		8588		11654	return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs;
	8588					18926
147	116					1062	return 0;
148							}
149
150
151							sub width {
152	189			189	1	495	my $self = shift;
153	189	100				871	$self->uniformize if $self->is_aligned; # pad seqs for robustness
154	189					1092	return $self->_max_seq_len;
155							}
156
157							sub _max_seq_len {
158	365			365		746	my $self = shift;
159	365					12495	my @lengths = map { $_->seq_len } $self->all_seqs;
	12841					402763
160	365		50			5637	return (List::AllUtils::max @lengths) // 0; # to avoid warnings
161							}
162
163
164							sub seq_len_stats {
165	1			1	1	10	my $self = shift;
166
167	1					41	my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
	10					27
168	1					15	my $stat = Statistics::Descriptive::Full->new;
169	1					138	$stat->add_data( \@lengths );
170	1					163	my @quantiles = map { $stat->quantile($_) } 0..4;
	5					266
171
172	1					48	return @quantiles;
173							}
174
175
176							sub perc_miss {
177	2			2	1	7	my $self = shift;
178
179	2					93	my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs;
	255					592
180	2	50				22	return 0.0 unless $n;
181
182	2					13	my $total = $self->width * $self->count_seqs;
183	2					28	return 100.0 * ($total - $n) / $total;
184							}
185
186							# MUTATORS
187
188
189							sub uc_seqs {
190	1			1	1	37	my $self = shift;
191
192	1					50	$_->uc for $self->all_seqs;
193	1					4	return $self;
194							}
195
196
197							sub recode_seqs { ## no critic (RequireArgUnpacking)
198	1			1	1	17	my $self = shift;
199
200	1					42	$_->recode(@_) for $self->all_seqs;
201	1					4	return $self;
202							}
203
204
205							sub degap_seqs {
206	1			1	1	12	my $self = shift;
207
208	1					46	$_->degap for $self->all_seqs;
209	1					5	return $self;
210							}
211
212
213							sub spacify_seqs {
214	176			176	1	381	my $self = shift;
215
216	176					5576	$_->spacify for $self->all_seqs;
217	176					571	return $self;
218							}
219
220
221							sub gapify_seqs { ## no critic (RequireArgUnpacking)
222	2			2	1	16	my $self = shift;
223	2					80	$_->gapify(@_) for $self->all_seqs;
224	2					6	return $self;
225							}
226
227
228							sub trim_seqs {
229	176			176	1	378	my $self = shift;
230
231	176					5574	$_->trim for $self->all_seqs;
232	176					615	return $self;
233							}
234
235
236							sub pad_seqs {
237	176			176	1	373	my $self = shift;
238
239	176					633	my $bound = $self->_max_seq_len;
240	176					5518	$_->pad_to($bound) for $self->all_seqs;
241	176					598	return $self;
242							}
243
244
245							sub uniformize {
246	175			175	1	400	my $self = shift;
247
248							# TODO: profile code (triggers?) to avoid useless multiple calls
249
250	175					663	$self->spacify_seqs;
251	175					749	$self->trim_seqs;
252	175					792	$self->pad_seqs;
253
254	175					395	return $self;
255							}
256
257
258							sub clear_new_tags {
259	1			1	1	3	my $self = shift;
260
261	1					47	$_->clear_new_tag for $self->all_seqs;
262	1					6	return $self;
263							}
264
265
266							sub shorten_ids { ## no critic (RequireArgUnpacking)
267	12			12	1	66	return shift->_change_ids_(1, @_);
268							}
269
270
271							sub restore_ids { ## no critic (RequireArgUnpacking)
272	10			10	1	42	return shift->_change_ids_(0, @_);
273							}
274
275							sub _change_ids_ {
276	22			22		47	my $self = shift;
277	22					42	my $abbr = shift;
278	22					39	my $mapper = shift;
279
280	22					861	for my $seq ($self->all_seqs) {
281	133	100				669	my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id )
282							: $mapper->long_id_for( $seq->full_id );
283	133	50				4166	$seq->set_seq_id($new_id) if $new_id;
284							} # Note: leave id alone if not found
285
286	22					109	return $self;
287							}
288
289
290							sub apply_list {
291	1			1	1	11	my $self = shift;
292	1					4	my $list = shift;
293
294							# loop through seqs from bottom to top
295							# ... and delete seq from Ali if not listed in list
296	1					49	my $seq_n = $self->count_seqs;
297
298	1					7	for (my $i = $seq_n-1; $i >= 0; $i--) {
299	250	100				8210	$self->delete_seq($i)
300							unless $list->is_listed( $self->get_seq($i)->full_id );
301							}
302
303	1					5	return $self;
304							}
305
306
307							sub _premask_check {
308	40			40		564	my $self = shift;
309	40					76	my $mask = shift;
310
311							# warn of unaligned Ali
312	40	50				155	carp '[BMC] Note: Ali does not look aligned!'
313							. ' This might not be an issue if seqs are ultra-conserved!'
314							unless $self->is_aligned;
315	40					243	$self->uniformize;
316
317							# warn of empty SeqMask
318							carp '[BMC] Note: applying this mask will result in a zero-width Ali!'
319	40	50		1845		1636	unless List::AllUtils::any { $_ } $mask->all_states;
	1845					2274
320
321							# check that SeqMask is compatible with Ali
322							# potential bugs could come from constant sites etc
323	40					13304	my $a_width = $self->width;
324	40					1493	my $m_width = $mask->mask_len;
325	40	100				181	carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!"
326							. ' This might not be an issue if the mask results from blocks.'
327							unless $a_width == $m_width;
328
329	40					1014	return; # return values of before subs are ignored
330							}
331
332							before 'apply_mask' => \&_premask_check;
333
334							sub apply_mask {
335							my $self = shift;
336							my $mask = shift;
337
338							# select sites for each seq using a precomputed array slice
339							my @indexes = indexes { $_ } $mask->all_states;
340							$_->_set_seq( join q{}, ( $_->all_states )[@indexes] )
341							for $self->all_seqs;
342
343							return $self;
344							}
345
346
347							sub idealize { ## no critic (RequireArgUnpacking)
348	13			13	1	121	my $self = shift;
349	13					224	return $self->apply_mask( SeqMask->ideal_mask($self, @_) );
350							}
351
352							# MISC METHODS
353
354
355							sub gapmiss_regex {
356	73	100		73	1	1200	return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS;
357							}
358
359
360							sub map_coords {
361	1			1	1	12	my $self = shift;
362	1					3	my $id = shift;
363	1					2	my $coords_in = shift;
364
365	1					6	my $seq = $self->get_seq_with_id($id);
366	1					5	my @states = $seq->all_states;
367
368	1					3	my $count = 0;
369	1					2	my @index;
370	1					3	for my $state (@states) {
371	130	100				331	$count++ if $state !~ $GAP;
372	130					215	push @index, $count;
373							}
374
375	1					2	my @coords_out = map { $index[$_-1] } @{$coords_in};
	6					13
	1					3
376
377	1					12	return \@coords_out;
378							}
379
380							# ALIASES
381
382
383							sub height {
384	0			0	1	0	return shift->count_seqs;
385							}
386
387							# I/O METHODS
388
389
390							sub load {
391	69			69	1	31703	my $class = shift;
392	69					221	my $infile = shift;
393
394	69					423	open my $in, '<', $infile;
395
396	69					33338	my $ali = $class->new( file => $infile );
397	69					411	my $seq_id;
398							my $seq;
399
400							LINE:
401	69					14756	while (my $line = <$in>) {
402	4091					7944	chomp $line;
403
404							# skip empty lines and process comments
405	4091	100	100			25606	next LINE if $line =~ $EMPTY_LINE
406							\|\| $ali->is_comment($line);
407
408							# at each '>' char...
409	3944					13635	my ($defline) = $line =~ $DEF_LINE;
410	3944	100				8400	if ($defline) {
411
412							# add current seq to ali (if any)
413	1662	100				3482	if ($seq) {
414	1593					48351	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
415	1593					54597	$ali->add_seq($new_seq);
416	1593					2987	$seq = q{};
417							}
418
419	1662					2726	$seq_id = $defline;
420	1662					26369	next LINE;
421							}
422
423							# elongate current seq (seqs can be broken on several lines)
424	2282					11246	$seq .= $line;
425							}
426
427							# add last seq to ali (if any)
428	69	50				328	if ($seq) {
429	69					2504	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
430	69					2392	$ali->add_seq($new_seq);
431							}
432
433	69					2021	return $ali;
434							}
435
436
437							before qr{\Astore}xms => sub {
438							my $self = shift;
439
440							# perform pre-storage duties
441							carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids;
442							$self->uniformize if $self->is_aligned;
443
444							return;
445							};
446
447
448							sub store { ## no critic (RequireArgUnpacking)
449							my $self = shift;
450							my $outfile = shift;
451
452							# automatically redirect to store_fasta for non-Ali outfile names
453							return $self->store_fasta($outfile, @_) # note the currying
454							unless $outfile =~ $ALI_SUFFIX;
455
456							open my $out, '>', $outfile;
457
458							print {$out} $self->header;
459							for my $seq ($self->all_seqs) {
460							say {$out} '>' . $seq->full_id;
461							say {$out} $seq->seq;
462							}
463
464							return;
465							}
466
467
468							sub store_fasta {
469							my $self = shift;
470							my $outfile = shift;
471							my $args = shift // {}; # HashRef (should not be empty...)
472
473							my $degap = $args->{degap} // 0;
474							my $clean = $args->{clean} // 0;
475							my $gap = $args->{gapify} // 'X';
476							my $chunk = $args->{chunk} // 60;
477							my $nowrap = $chunk < 0 ? 1 : 0;
478							my $is_aligned = $self->is_aligned;
479
480							open my $out, '>', $outfile;
481
482							for my $seq ($self->all_seqs) {
483							say {$out} '>' . $seq->foreign_id;
484
485							# optionally clean and/or degap seq
486							$seq = $seq->clone if $clean \|\| $degap; # clone seq only if needed
487							$seq->gapify($gap) if $clean;
488							$seq->degap if $degap;
489
490							my $width = $seq->seq_len;
491							$chunk = $width if $nowrap; # optionally disable wrap
492
493							my $str = $seq->wrapped_str($chunk);
494							$str =~ s{$GAP}{-}xmsg if $is_aligned; # restore '-' when aligned
495							print {$out} $str;
496							}
497
498							return;
499							}
500
501
502							sub temp_fasta {
503	9			9	1	23	my $self = shift;
504	9		50			26	my $args = shift // {}; # HashRef (should not be empty...)
505
506							# abbreviate ids (possibly using a custom prefix)
507	9					22	my $prefix = $args->{id_prefix};
508	9					18	delete $args->{id_prefix}; # not necessarily required
509	9					47	my $mapper = $self->std_mapper($prefix);
510	9					54	$self->shorten_ids($mapper);
511
512							# write temporary .fasta file using standard abbr_ids
513							# ...and restore long_ids afterwards
514	9					88	my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta');
515	9					5089	$self->store_fasta($out->filename, $args);
516	9					74	$self->restore_ids($mapper);
517							### filename: $out->filename
518
519	9	100				56	return wantarray ? ($out->filename, $mapper) : $out->filename;
520							}
521
522
523							# TODO: add an option to remove trailing '_' in ids?
524
525							sub load_phylip {
526	105			105	1	8962	my $class = shift;
527	105					297	my $infile = shift;
528
529	105					838	open my $in, '<', $infile;
530
531	105					50325	my $ali = $class->new( file => $infile );
532	105					282	my $seq_n;
533							my $site_n;
534	105					330	my $n = 0;
535
536							LINE:
537	105					39303	while (my $line = <$in>) {
538	8557					21245	chomp $line;
539
540							# skip empty lines
541	8557	100				51994	next LINE if $line =~ $EMPTY_LINE;
542
543							# get matrix dimensions
544	8524	100				37214	if ($line =~ $DIM_LINE) {
545	105					695	($seq_n, $site_n) = ($1, $2);
546	105					2209	next LINE;
547							}
548
549							# process regular line (identifier is optional)
550	8419					49650	my ($seq_id, $seq) = $line =~ $PHY_LINE;
551
552							# should never happen...
553	8419	50				22049	croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!"
554							unless $seq;
555
556							# delete optional spaces
557	8419					20555	$seq =~ tr/ //d;
558
559							# process first seq block
560	8419	100				313196	if ($ali->count_seqs < $seq_n) {
561
562							# seq_id are mandatory in first block
563	8089	50				18313	croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!"
564							unless $seq_id;
565
566							# store first seq chunk along with seq_id
567	8089					225725	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
568	8089					279190	$ali->add_seq($new_seq);
569							}
570
571							# process remaining seq blocks
572							else {
573	330					11285	my $curr_seq = $ali->get_seq($n);
574
575							# elongate current with partial seq chunk mistaken as seq_id
576	330	100	100			1087	if ($seq_id && $seq_id ne $curr_seq->full_id) {
577	110					3664	$curr_seq->append_seq($seq_id);
578							}
579
580							# elongate current seq with new seq chunk
581	330					10759	$curr_seq->append_seq($seq);
582							}
583
584							# prepare reading of next seq or next block
585	8419	100				102891	$n = 0 if ++$n == $seq_n;
586							}
587
588	105					919	my $width = $ali->width;
589	105	50				696	croak "[BMC] Error: unexpected site number in PHYLIP file: $width;"
590							. ' aborting!' if $width != $site_n;
591
592	105					4069	return $ali;
593							}
594
595							# PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html
596							# The information for each species follows, starting with a ten-character
597							# species name (which can include blanks and some punctuation marks), and
598							# continuing with the characters for that species. The name should be on the
599							# same line as the first character of the data for that species. (...) The
600							# name should be ten characters in length, filled out to the full ten
601							# characters by blanks if shorter. Any printable ASCII/ISO character is
602							# allowed in the name, except for parentheses ("(" and ")"), square brackets
603							# ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that
604							# in these sequences we have a blank every ten sites to make them easier to
605							# read: any such blanks are allowed. The blank line which separates the two
606							# groups of lines (the ones containing sites 1-20 and ones containing sites
607							# 21-39) may or may not be present.
608
609							# PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command
610							# The input sequence file is a standard PHYLIP file of aligned DNA or
611							# amino-acids sequences. (...) The maximum number of characters in species
612							# name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed
613							# within sequence names because the Newick tree format makes special use of
614							# these symbols. However, blanks (one or more) MUST appear at the end of each
615							# species name.
616
617							# TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf
618							# A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-",
619							# dots ".", underscores "_", or some of "/", "?", "*", "+". No space is
620							# allowed inside the names. The first name character must be a letter or an
621							# underscore. A sequence fragment may start behind position 10 after a
622							# sequence name, or anywhere in a line without a name.
623
624							# TREE-PUZZLE [data/globin.a]
625							# 7 128
626							# HBB_HUMAN HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN
627							# HBB_HORSE QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN
628							# HBA_HUMAN VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS
629							# HBA_HORSE VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS
630							# MYG_PHYCA VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS
631							# GLB5_PETMA PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS
632							# LGB2_LUPLU ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN
633
634							# RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf
635							# The input alignment format of RAxML is relaxed interleaved or sequential
636							# PHYLIP. "Relaxed" means that sequence names can be of variable length
637							# between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names
638							# taxon names that contain any form of whitespace character, like blanks,
639							# tabulators, and carriage returns, as well as one of the following prohibited
640							# characters: :,();[].
641
642							# PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf
643							# Taxon names may contain more than 10 characters. Avoid special characters
644							# such as ';' or ')', which will create problems when parsing trees. The best
645							# is to only use letters, digits and '_'. Sequences can be interrupted by
646							# space and tab, but not by return characters. Be sure that the lengths of the
647							# sequences are all the same, and identical to the lengths indicated in the
648							# header. Sequences can be interleaved, in which case the taxon names may or
649							# may not be repeated in each block.
650
651
652							sub store_phylip {
653							my $self = shift;
654							my $outfile = shift;
655							my $args = shift // {}; # HashRef (should not be empty...)
656
657							my $short = $args->{short} // 1;
658							my $clean = $args->{clean} // 0;
659							my $chunk = $args->{chunk} // 60;
660
661							open my $out, '>', $outfile;
662
663							# print data matrix dimensions
664							my $height = $self->count_seqs;
665							my $width = $self->width;
666							say {$out} $height . q{ } . $width;
667
668							# setup id format (this will also affect block-like structure)
669							my $format = $short ? "%-10.10s %s\n" : "%s %s\n";
670							my $method = $short ? 'full_id' : 'foreign_id';
671							my $sep = $short ? q{ } : q{};
672
673							# optionally disable wrapping
674							$chunk = $width if $chunk < 0;
675
676							# output Ali in sequential or interleaved format
677							for (my $site = 0; $site < $width; $site += $chunk) {
678
679							# leave empty line between chunks (but not after last chunk)
680							print {$out} "\n" if $site;
681
682							for my $seq ($self->all_seqs) {
683
684							# print 10-chars ids only once
685							# Note: full_ids are only truncated (use IdMapper for more)
686							my $id = $site == 0 ? $seq->$method : q{};
687
688							# optionally clean seq
689							$seq = $seq->clone->gapify('X') if $clean;
690
691							# print seq chunks in 10-state blocks
692							# Note: We insert a space in the 11th column to ensure that more
693							# software packages can read the written files. PHYLIP itself will
694							# ignore this spurious space in the 'sequences'.
695							( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg;
696							printf {$out} $format, $id, join $sep,
697							map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1;
698							}
699							}
700
701							return;
702							}
703
704
705							sub load_stockholm {
706	2			2	1	31	my $class = shift;
707	2					4	my $infile = shift;
708
709	2					9	open my $in, '<', $infile;
710
711	2					430	my $ali = $class->new( file => $infile );
712	2					55	tie my %seq_for, 'Tie::IxHash';
713
714							LINE:
715	2					1111	while (my $line = <$in>) {
716	44					288	chomp $line;
717
718							# process GF comments
719	44	100				105	next LINE if $ali->is_comment($line, $STK_COMMENT);
720
721							# skip empty lines and other comment lines
722	27	100	100			214	next LINE if $line =~ $EMPTY_LINE
723							\|\| $line =~ $COMMENT_LINE;
724	11	100				38	last LINE if $line =~ $END_LINE;
725
726							# parse stockholm seq
727	9					42	my ($seq_id, $seq) = $line =~ $STK_SEQ;
728
729							# replace dots and gaps by stars
730	9					27	$seq =~ s/[\.\-]/*/xmsg;
731
732							# elongate seq
733	9					45	$seq_for{$seq_id} .= $seq;
734							}
735
736							# populate Ali object
737	2					11	while (my ($seq_id, $seq) = each %seq_for) {
738	9					409	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
739	9					307	$ali->add_seq($new_seq);
740							}
741
742	2					72	return $ali;
743							}
744
745
746							sub instant_store {
747	1			1	1	5	my $class = shift;
748	1					3	my $outfile = shift;
749	1		50			4	my $args = shift // {}; # HashRef (should not be empty...)
750
751	1					4	my $infile = $args->{infile};
752	1	50				5	croak '[BMC] Error: no infile specified for instant_store; aborting!'
753							unless $infile;
754
755	1					8	my $coderef = $args->{coderef};
756	1	50				4	croak '[BMC] Error: no coderef specified for instant_store; aborting!'
757							unless $coderef;
758
759	1					5	open my $in, '<', $infile;
760	1					270	open my $out, '>', $outfile;
761
762	1					273	my $seq_id;
763							my $seq;
764
765							LINE:
766	1					611	while (my $line = <$in>) {
767	92					134	chomp $line;
768
769							# skip empty lines and process comments
770	92	100	66			458	next LINE if $line =~ $EMPTY_LINE
771							\|\| $line =~ $COMMENT_LINE;
772
773							# at each '>' char...
774	90					213	my ($defline) = $line =~ $DEF_LINE;
775	90	100				159	if ($defline) {
776
777							# process current seq (if any)
778	3	100				8	if ($seq) {
779	2					74	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
780	2					5	print {$out} $coderef->($new_seq);
	2					9
781	2					74	$seq = q{};
782							}
783
784	3					6	$seq_id = $defline;
785	3					18	next LINE;
786							}
787
788							# elongate current seq (seqs can be broken on several lines)
789	87					242	$seq .= $line;
790							}
791
792							# process last seq (if any)
793	1	50				6	if ($seq) {
794	1					38	my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
795	1					3	print {$out} $coderef->($new_seq);
	1					5
796							}
797
798	1					65	return;
799							}
800
801
802							sub instant_count {
803	1			1	1	449	my $class = shift;
804	1					2	my $infile = shift;
805
806	1					2	my $seq_n = 0;
807
808	1					7	open my $in, '<', $infile;
809	1	100				2865	while (<$in>) { $seq_n++ if m/^>/xms }
	502					2586
810
811	1					23	return $seq_n;
812							}
813
814							__PACKAGE__->meta->make_immutable;
815							1;
816
817							__END__
818
819							=pod
820
821							=head1 NAME
822
823							Bio::MUST::Core::Ali - Multiple sequence alignment
824
825							=head1 VERSION
826
827							version 0.212530
828
829							=head1 SYNOPSIS
830
831							#!/usr/bin/env perl
832
833							use Modern::Perl '2011';
834							# same as:
835							# use strict;
836							# use warnings;
837							# use feature qw(say);
838
839							use Bio::MUST::Core;
840							use aliased 'Bio::MUST::Core::Ali';
841
842							# read Ali form disk
843							my $ali = Ali->load('example.ali');
844
845							# get some properties
846							say 'height: ' . $ali->height; # number of seqs
847							say 'width: ' . $ali->width; # number of sites
848							say '% miss: ' . $ali->perc_miss; # fraction of missing chars (%)
849							say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl';
850
851							# turn seqs to uppercase
852							$ali->uc_seqs;
853
854							# filter out seqs with no species associated
855							my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } );
856							use aliased 'Bio::MUST::Core::IdList';
857							my $list = IdList->new( ids => \@seqs );
858							$ali->apply_list($list);
859
860							# alternatively:
861							# $ali = Ali->new( seqs => \@seqs );
862
863							# filter out gap-rich sites
864							$ali->idealize(0.2); # min 20% non-gaps per site
865
866							# filter out non-informative sites
867							use aliased 'Bio::MUST::Core::SeqMask';
868							my $mask = SeqMask->parsimony_mask($ali);
869							$ali->apply_mask($mask);
870
871							# write down reduced Ali to disk
872							$ali->store('example-uc-genus-sp-20.ali');
873							$ali->store_fasta('example-uc-genus-sp-20.fasta');
874
875							# see below for additional methods
876							# ...
877
878							=head1 DESCRIPTION
879
880							This module implements the multiple sequence alignment (MSA) class and its
881							methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects.
882							Consequently, sequence ids do not absolutely need to be unique for it to
883							work (though id uniqueness helps a lot for sequence access and filtering).
884
885							An Ali knows whether it contains nucleotide or protein sequences, whether
886							those are aligned or not, as well as its Seq count and exact width. All
887							these properties are introspected on the fly, which means that, while they
888							can be expensive to compute, they are always accurate.
889
890							This is important because an Ali provides methods for inserting, deleting
891							and editing its Seq objects. Further, it does its best to maintain a semantic
892							distinction between true gap states (encoded as '*') and missing regions of
893							undetermined length (encoded as pure whitespace, for example at the end of a
894							short sequence).
895
896							It also has methods for mapping its sequence ids (for example, before export
897							to the PHYLIP format), as well as methods to selectively retain sequences
898							and sites based on L<Bio::MUST::Core::IdList> and
899							L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method
900							discards shared gaps and optionally removes gap-rich sites only due to a
901							tiny number of sequences.
902
903							Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles
904							meaningful whitespace and allows comment lines in the header) or be
905							imported/exported from/to several popular MSA formats, such as plain FASTA,
906							STOCKHOLM and PHYLIP.
907
908							=head1 ATTRIBUTES
909
910							=head2 seqs
911
912							ArrayRef of L<Bio::MUST::Core::Seq> objects (optional)
913
914							Most of the accessor methods described below are implemented by delegation
915							to this public attribute using L<Moose::Meta::Attribute::Native/Moose native
916							delegation>. Their documentation thus heavily borrows from the corresponding
917							help pages.
918
919							=head2 file
920
921							L<Path::Class::File> object (optional)
922
923							This optional attribute is initialized by class methods that C<load> an Ali
924							from disk. It is meant to improve the introspection capabilities of the Ali.
925							For now, this attribute is not used by the C<store> methods, though it might
926							provide them with a default value in the future.
927
928							=head2 guessing
929
930							Boolean (optional)
931
932							By default, an Ali object tries to guess whether it is aligned or not by
933							looking for gap-like characters in any of its Seq objects (see
934							L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
935
936							When this smart behavior causes issues, one can disable it by unsetting this
937							boolean attribute (see C<dont_guess> and C<guess> accessor methods).
938
939							=head2 comments
940
941							ArrayRef of strings (optional)
942
943							An Ali object is commentable, which means that it supports all the methods
944							pertaining to comment lines described in
945							L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
946
947							=head1 CONSTRUCTORS
948
949							=head2 new
950
951							Default constructor (class method) returning a new Ali.
952
953							use aliased 'Bio::MUST::Core::Ali';
954							my $ali1 = Ali->new();
955							my @seqs = $ali->all_seqs;
956							my $ali2 = Ali->new( seqs => \@seqs );
957
958							This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
959							C<file>, C<guessing> and C<comments>.
960
961							=head2 clone
962
963							Creates a deep copy (a clone) of the Ali. Returns the copy.
964
965							use aliased 'Bio::MUST::Core::Ali';
966							my $ali = Ali->load('input.ali');
967							my $ali_copy = $ali->clone;
968							# you can now mess with $ali_copy without affecting $ali
969
970							This method does not accept any arguments.
971
972							=head1 ACCESSORS
973
974							=head2 add_seq
975
976							Adds one (or more) new sequence(s) at the end of the Ali. Returns the new
977							number of sequences of the Ali.
978
979							use aliased 'Bio::MUST::Core::Seq';
980							my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
981							$ali->add_seq($new_seq);
982
983							This method accepts any number of arguments.
984
985							=head2 get_seq
986
987							Returns a sequence of the Ali by its index. You can also use negative index
988							numbers, just as with Perl's core array handling. If the specified sequence
989							does not exist, this method will return C<undef>.
990
991							my $seq = $ali->get_seq($index);
992							croak "Seq $index not found in Ali!" unless defined $seq;
993
994							This method accepts just one argument (and not an array slice).
995
996							=head2 get_seq_with_id
997
998							Returns a sequence of the Ali by its id. If the specified id is not unique,
999							only the first matching sequence will be returned, whereas if no sequence
1000							exists for the specified id, this method will return C<undef>.
1001
1002							my $id = 'Pyrus malus_3750@658052655';
1003							my $seq = $ali->get_seq_with_id($id);
1004							croak "Seq $id not found in Ali!" unless defined $seq;
1005
1006							This method accepts just one argument.
1007
1008							=head2 set_seq
1009
1010							Given an index and a sequence, sets the specified Ali element to the
1011							sequence. This method returns the new sequence at C<$index>.
1012
1013							use aliased 'Bio::MUST::Core::Seq';
1014							my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1015							$ali->set_seq($index, $new_seq);
1016
1017							This method requires two arguments.
1018
1019							=head2 delete_seq
1020
1021							Removes the sequence at the given index from the Ali. This method returns
1022							the deleted sequence. If the specified sequence does not exist, it will
1023							return C<undef> instead.
1024
1025							$ali->delete_seq($index);
1026
1027							This method requires one argument.
1028
1029							=head2 insert_seq
1030
1031							Inserts a new sequence into the Ali at the given index. This method returns
1032							the new sequence at C<$index>.
1033
1034							use aliased 'Bio::MUST::Core::Seq';
1035							my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
1036							$ali->insert_seq($index, $new_seq);
1037
1038							This method requires two arguments.
1039
1040							=head2 all_seqs
1041
1042							Returns all the sequences of the Ali (not an array reference).
1043
1044							my @seqs = $ali->all_seqs;
1045
1046							This method does not accept any arguments.
1047
1048							=head2 first_seq
1049
1050							Returns the first sequence of the Ali matching a given criterion, just like
1051							L<List::Util>'s C<first> function. This method requires a subroutine
1052							implementing the matching logic.
1053
1054							# emulate get_seq_with_id method
1055							my $id2find = 'seq13';
1056							my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } );
1057
1058							This method requires a single argument.
1059
1060							=head2 filter_seqs
1061
1062							Returns every sequence of the Ali matching a given criterion, just like
1063							Perl's core C<grep> function. This method requires a subroutine implementing
1064							the matching logic.
1065
1066							# keep only long sequences (ignoring gaps and missing states)
1067							my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } );
1068
1069							This method requires a single argument.
1070
1071							=head2 all_new_seqs
1072
1073							Returns all the sequences of the Ali tagged as #NEW# (not an array reference).
1074
1075							my @new_seqs = $ali->all_new_seqs;
1076
1077							This method does not accept any arguments.
1078
1079							=head2 all_but_new_seqs
1080
1081							Returns all the sequences of the Ali except those tagged as #NEW# (not an array
1082							reference).
1083
1084							my @preexisting_seqs = $ali->all_but_new_seqs;
1085
1086							This method does not accept any arguments.
1087
1088							=head2 all_seq_ids
1089
1090							Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali
1091							(not an array reference). This is only a convenience method.
1092
1093							use Test::Deeply;
1094							my @ids1 = $ali->all_seq_ids;
1095							my @ids2 = map { $_->seq_id } $ali->all_seqs;
1096							is_deeply \@ids1, \@ids2, 'should be true';
1097							my @orgs = map { $_->org } @ids1;
1098
1099							This method does not accept any arguments.
1100
1101							=head2 filename
1102
1103							Returns the stringified filename of the Ali.
1104
1105							This method does not accept any arguments.
1106
1107							=head2 guess
1108
1109							Turn on the smart detection of gaps (see C<guessing> attribute above).
1110
1111							This method does not accept any arguments.
1112
1113							=head2 dont_guess
1114
1115							Turn off the smart detection of gaps (see C<guessing> attribute above).
1116
1117							use aliased 'Bio::MUST::Core::Ali';
1118							my $ali = Ali->load('ensembl.fasta');
1119							$ali->dont_guess;
1120
1121							This method does not accept any arguments.
1122
1123							=head1 PROPERTIES
1124
1125							=head2 has_uniq_ids
1126
1127							Returns true if all the sequence ids are unique.
1128
1129							carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
1130
1131							This method does not accept any arguments.
1132
1133							=head2 is_protein
1134
1135							Returns true if any sequence of the Ali looks like a protein. See
1136							L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1137
1138							say 'Your file includes nucleotide sequences' unless $ali->is_protein;
1139
1140							This method does not accept any arguments.
1141
1142							=head2 is_aligned
1143
1144							Returns true if any sequence of the Ali appears to be aligned. See
1145							L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
1146
1147							If the boolean attribute guessing is not set, always returns false.
1148
1149							carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
1150
1151							This method does not accept any arguments.
1152
1153							=head2 count_seqs
1154
1155							Returns the number of sequences of the Ali. The alias method C<height> is
1156							provided for convenience.
1157
1158							my $height = $ali->count_seqs;
1159
1160							This method does not accept any arguments.
1161
1162							=head2 width
1163
1164							Returns the width of the Ali (in characters). If the Ali is not aligned,
1165							returns the length of the longest sequence instead.
1166
1167							To avoid potential bugs due to caching, this method dynamically computes the
1168							Ali width at each call. Moreover, the Ali is always uniformized (see below)
1169							beforehands to ensure accurate width value. Therefore, this method is
1170							expensive and should not be called repeatedly (e.g., in a loop condition).
1171
1172							# you'd better looping through sites like this...
1173							my $width = $ali->width;
1174							for my $site (0..$width-1) {
1175							...
1176							}
1177
1178							This method does not accept any arguments.
1179
1180							=head2 seq_len_stats
1181
1182							Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
1183							The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
1184
1185							This method does not accept any arguments.
1186
1187							=head2 perc_miss
1188
1189							Returns the percentage of missing (and gap-like) character states in the Ali.
1190
1191							As this method internally calls C<Ali::width>, the remarks above also apply.
1192
1193							my $miss_level = $ali->perc_miss;
1194
1195							This method does not accept any arguments.
1196
1197							=head1 MUTATORS
1198
1199							=head2 uc_seqs
1200
1201							Turn all the sequences of the Ali to uppercase and returns it.
1202
1203							This method does not accept any arguments.
1204
1205							=head2 recode_seqs
1206
1207							Recode all the sequences of the Ali and returns it.
1208
1209							use aliased 'Bio::MUST::Core::Ali';
1210							my $ali = Ali->load('biased.ali');
1211
1212							# set up RY recoding for suppressing codon bias
1213							my %base_for = (
1214							A => 'A', G => 'A', # purines
1215							C => 'C', T => 'C', # pyrimidines
1216							);
1217
1218							my $ali_rec = $ali->recode_seqs( \%base_for );
1219							$ali_rec->store('biased_ry.ali');
1220
1221							This method requires one argument.
1222
1223							=head2 degap_seqs
1224
1225							Remove the gaps in all the sequences of the Ali and returns it.
1226
1227							This method does not accept any arguments.
1228
1229							=head2 spacify_seqs
1230
1231							Spacifies all the sequences of the Ali and returns it. See the corresponding
1232							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1233							operation.
1234
1235							This method does not accept any arguments.
1236
1237							=head2 gapify_seqs
1238
1239							Gapifies all the sequences of the Ali and returns it. See the corresponding
1240							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1241							operation.
1242
1243							This method accepts an optional argument.
1244
1245							=head2 trim_seqs
1246
1247							Trims all the sequences of the Ali and returns it. See the corresponding
1248							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1249							operation.
1250
1251							This method does not accept any arguments.
1252
1253							=head2 pad_seqs
1254
1255							Pads all the sequences of the Ali and returns it. See the corresponding
1256							method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
1257							operation.
1258
1259							This method does not accept any arguments.
1260
1261							=head2 uniformize
1262
1263							Performs the three gap-cleaning operations in turn on all the sequences of
1264							the Ali and returns it, which ensures that it is semantically clean and
1265							rectangular.
1266
1267							This is only a convenience method called internally by the Ali object before
1268							selected methods (such as storage-like methods). However, it might prove
1269							useful in some circumstances, hence it is not defined as private.
1270
1271							use aliased 'Bio::MUST::Core::Ali';
1272							my $ali = Ali->load('input.ali');
1273							$ali->add_seq(...);
1274							# more editing of the Ali sequences
1275							$ali->uniformize;
1276
1277							This method does not accept any arguments.
1278
1279							=head2 clear_new_tags
1280
1281							Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it.
1282
1283							use aliased 'Bio::MUST::Core::Ali';
1284							my $ali = Ali->load('input-42.ali');
1285							$ali->clear_new_tags;
1286							my @new_seqs = $ali->all_new_seqs;
1287							# array should be empty
1288
1289							This method does not accept any arguments.
1290
1291							=head2 shorten_ids
1292
1293							Replaces all the sequence ids of the Ali by their abbreviated forms as
1294							specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1295
1296							Note that this method will work only if the sequence ids have not been
1297							already shortened or modified in any way since the creation of the IdMapper.
1298							Long ids without abbreviated forms in the IdMapper are left untouched.
1299
1300							use aliased 'Bio::MUST::Core::Ali';
1301							use aliased 'Bio::MUST::Core::IdMapper';
1302							my $ali = Ali->load('input.ali');
1303							my $mapper = IdMapper->std_mapper($ali, 'lcl\|seq');
1304							$ali->shorten_ids($mapper);
1305							$ali->store_fasta('input.4blast.fasta');
1306							# makeblastdb
1307
1308							# Note: the temp_fasta method does exactly that
1309
1310							This method requires one argument.
1311
1312							=head2 restore_ids
1313
1314							Replaces all the sequence ids of the Ali by their long forms as specified by
1315							the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
1316
1317							Note that this method will work only if the sequence ids have been previously
1318							abbreviated (see above) and have not been modified in any way since then.
1319							Again, abbreviated ids without long forms in the IdMapper are left untouched.
1320
1321							use aliased 'Bio::MUST::Core::IdMapper';
1322							my $mapper = IdMapper->gi_mapper($ali);
1323							$ali->shorten_ids($mapper);
1324							...
1325							$ali->restore_ids($mapper);
1326
1327							This method requires one argument.
1328
1329							=head2 apply_list
1330
1331							Selectively retains or discards sequences from the Ali based on the content
1332							of the passed L<Bio::MUST::Core::IdList> object and returns the Ali.
1333
1334							use aliased 'Bio::MUST::Core::IdList';
1335							my $list = IdList->load('interesting_seqs.idl');
1336							$ali->apply_list($list); # discard non-interesting seqs
1337
1338							This method requires one argument.
1339
1340							=head2 apply_mask
1341
1342							Selectively retains or discards sites from the Ali based on the content of
1343							the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali.
1344
1345							use aliased 'Bio::MUST::Core::SeqMask';
1346							my $variable_sites = SeqMask->variable_mask($ali);
1347							$ali->apply_mask($variable_sites); # discard constant sites
1348
1349							This method requires one argument.
1350
1351							=head2 idealize
1352
1353							Computes and applies an ideal sequence mask to the Ali and returns it. This
1354							is only a convenience method.
1355
1356							When invoked without arguments, it will discard the gaps that are
1357							universally shared by all the sequences. Otherwise, the provided argument
1358							corresponds to the threshold of the C<ideal_mask> method described in
1359							L<Bio::MUST::Core::SeqMask>.
1360
1361							use aliased 'Bio::MUST::Core::IdList';
1362							my $fast_seqs = IdList->load('fast_evolving_seqs.idl');
1363							my $seqs2keep = $fast_seqs->negative_list($ali);
1364							$ali->apply_list($seqs2keep); # discard fast-evolving seqs
1365							$ali->idealize; # discard newly shared gaps caused by fast seqs
1366
1367							use aliased 'Bio::MUST::Core::Ali';
1368							my $ali = Ali->load('hmm_based.ali');
1369							$ali->idealize(0.05); # discard insertions due to <5% of the seqs
1370
1371							This method accepts an optional argument.
1372
1373							=head1 MISC METHODS
1374
1375							=head2 gapmiss_regex
1376
1377							Returns a regular expression matching gaps and ambiguous or missing states.
1378							The exact regex returned depends on the type of sequences in the Ali (nucl. or
1379							proteins).
1380
1381							my $regex = $ali->gapmiss_regex;
1382							my $first_seq = $ali->get_seq(0)->seq;
1383							my $gapmiss_n = $first_seq =~ m/($regex)/xmsg;
1384							say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites";
1385
1386							This method does not accept any arguments.
1387
1388							=head2 map_coords
1389
1390							Converts a set of site positions from Ali coordinates to coordinates of the
1391							specified sequence (thereby ignoring positions due to gaps). Returns the
1392							converted sites in sequence coordinates as an array refrence.
1393
1394							use aliased 'Bio::MUST::Core::Ali';
1395							my $ali = Ali->load('input.ali');
1396							my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125';
1397							my $ali_coords = [ 4, 25, 73, 89, 104, 116 ];
1398							my $seq_coords = $ali->map_coords($id, $ali_coords);
1399							# $seq_coords is [ 3, 23, 59, 71, 71, 74 ]
1400
1401							This method requires two arguments: the id of a sequence and an array
1402							reference of input sites in Ali coordinates.
1403
1404							=head1 I/O METHODS
1405
1406							=head2 load
1407
1408							Class method (constructor) returning a new Ali read from disk. This method
1409							will transparently import plain FASTA files in addition to the MUST
1410							pseudo-FASTA format (ALI files).
1411
1412							use Test::Deeply;
1413							use aliased 'Bio::MUST::Core::Ali';
1414							my $ali1 = Ali->load('example.ali');
1415							my $ali2 = Ali->load('example.fasta');
1416							my @seqs1 = $ali1->all_seqs;
1417							my @seqs2 = $ali2->all_seqs;
1418							is_deeply, \@seqs1, \@seqs2, 'should be true';
1419
1420							This method requires one argument.
1421
1422							=head2 store
1423
1424							Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files).
1425
1426							Note that the ALI format is only used when the suffix of the outfile name is
1427							'.ali'. In all other cases (including lack of suffix), this method
1428							automatically forwards the call to C<store_fasta>.
1429
1430							$ali->store('output.ali');
1431							# output.ali is written in ALI format
1432							$ali->store('output.fasta');
1433							# output.fasta is written in FASTA format
1434
1435							This method requires one argument (but see C<store_fasta> in case of automatic
1436							forwarding of the method call).
1437
1438							=head2 store_fasta
1439
1440							Writes the Ali to disk in the plain FASTA format.
1441
1442							For compatibility purposes, this method automatically fetches sequence ids
1443							using the C<foreign_id> method instead of the native C<full_id> method, both
1444							described in L<Bio::MUST::Core::SeqId>.
1445
1446							$ali->store_fasta( 'output.fasta' );
1447							$ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} );
1448
1449							This method requires one argument and accepts a second optional argument
1450							controlling the output format. It is a hash reference that may contain one
1451							or more of the following keys:
1452
1453							- clean: replace all ambiguous and missing states by C<X> (default: false)
1454							- degap: boolean value controlling degapping (default: false)
1455							- chunk: line width (default is 60 chars; negative values means no wrap)
1456
1457							Finally, it is possible to fine-tune the behavior of the C<clean> option by
1458							providing another character than C<X> through the C<gapify> key. This can be
1459							useful to replace all ambiguous and missing states by gaps, as shown below:
1460
1461							$ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } );
1462
1463							=head2 temp_fasta
1464
1465							Writes a temporary copy of the Ali to disk in the plain FASTA format using
1466							numeric sequence ids and returns the name of the temporary file. This is
1467							only a convenience method.
1468
1469							In list context, returns the IdMapper object along with temporary filename.
1470
1471							my $infile = $ali->temp_fasta( { degap => 1 } );
1472							my $output = `script.sh $infile`;
1473							...
1474
1475							This method accepts the same optional argument hash as C<store_fasta>.
1476							However, an additional option (C<id_prefix>) is available to control the way
1477							abbreviated sequence ids are prefixed by the C<std_mapper> method (see
1478							L<Bio::MUST::Core::Listable>).
1479
1480							my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } );
1481							my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } );
1482
1483							=head2 load_phylip
1484
1485							Class method (constructor) returning a new Ali read from a file in the
1486							PHYLIP format. Both sequential and interleaved formats are supported.
1487
1488							The only constraint is that sequence ids MUST NOT contain whitespace and be
1489							followed by at least one whitespace character. This means that some old-school
1490							PHYLIP files not following this convention will not be processed correctly.
1491
1492							When using the interleaved format, sequence ids may or may not be repeated in
1493							each block.
1494
1495							use aliased 'Bio::MUST::Core::Ali';
1496							my $ali = Ali->load_phylip('phylip.phy');
1497							say $ali->count_seqs;
1498							say $ali->width;
1499
1500							# outputs:
1501							# 10
1502							# 709
1503
1504							This method requires one argument.
1505
1506							=head2 store_phylip
1507
1508							Writes the Ali to disk in the interleaved (or sequential) PHYLIP format.
1509
1510							To ensure maximal flexibility, this method fetches sequence ids using the
1511							native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but
1512							truncates them to 10 characters, as expected by the original PHYLIP software
1513							package. No other tinkering is carried out on the ids. Thus, if the ids
1514							contain whitespace or are not unique in their 10 first characters, it is
1515							advised to first map them using one of the constructors in
1516							L<Bio::MUST::Core::IdMapper>.
1517
1518							use aliased 'Bio::MUST::Core::Ali';
1519							use aliased 'Bio::MUST::Core::IdMapper';
1520							my $ali = Ali->load('input.ali');
1521							my $mapper = IdMapper->std_mapper($ali);
1522							$ali->shorten_ids($mapper);
1523							$ali->store_phylip( 'input.phy', { chunk => 50 } );
1524
1525							This method requires one argument and accepts a second optional argument
1526							controlling the output format. It is a hash reference that may contain one
1527							or more of the following keys:
1528
1529							- short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes)
1530							- clean: replace all ambiguous and missing states by 'X' (default: false)
1531							- chunk: line width (default: 60 chars; negative values means no wrap)
1532
1533							To store the Ali in PHYLIP sequential format, specify a negative chunk (-1).
1534
1535							=head2 load_stockholm
1536
1537							Class method (constructor) returning a new Ali read from a file in the
1538							STOCKHOLM format. =GF comments are retained (see above) but not the other
1539							comment classes (=GS, =GR and =GC).
1540
1541							use aliased 'Bio::MUST::Core::Ali';
1542							my $ali = Ali->load('upsk.stockholm');
1543							say $ali->header;
1544
1545							# outputs:
1546							# ID UPSK
1547							# SE Predicted; Infernal
1548							# SS Published; PMID 9223489
1549							# RN [1]
1550							# RM 9223489
1551							# RT The role of the pseudoknot at the 3' end of turnip yellow mosaic
1552							# RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA
1553							# RT polymerase.
1554							# RA Deiman BA, Kortlever RM, Pleij CW;
1555							# RL J Virol 1997;71:5990-5996.
1556
1557							This method requires one argument.
1558
1559							=head2 instant_store
1560
1561							Class method intended to transform a large sequence file read from disk
1562							without loading it in memory. This method will transparently process plain
1563							FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1564
1565							my $chunk = 200;
1566
1567							my $split = sub {
1568							my $seq = shift;
1569							my $base_id = ( split /\s+/xms, $seq->full_id )[0];
1570							my $max_pos = $seq->seq_len - $chunk;
1571							my $n = 0;
1572							my $out_str;
1573							for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) {
1574							$out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos,
1575							$pos + $chunk <= $max_pos ? $chunk : 2 * $chunk
1576							) . "\n";
1577							}
1578							return $out_str;
1579							};
1580
1581							use aliased 'Bio::MUST::Core::Ali';
1582
1583							Ali->instant_store(
1584							'outfile.fasta', { infile => 'infile.fasta', coderef => $split }
1585							);
1586
1587							This method requires two arguments. The sercond is a hash reference that must
1588							contain the following keys:
1589							- infile: input sequence file
1590							- coderef: subroutine implementing the transforming logic
1591
1592							=head2 instant_count
1593
1594							Class method returning the number of seqs in any sequence file read from disk
1595							without loading it in memory. This method will transparently process plain
1596							FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
1597
1598							use aliased 'Bio::MUST::Core::Ali';
1599							my $seq_n = Ali->instant_count('input.ali');
1600							say $seq_n;
1601
1602							=head1 ALIASES
1603
1604							=head2 height
1605
1606							Alias for C<count_seqs> method. For API consistency.
1607
1608							=head1 AUTHOR
1609
1610							Denis BAURAIN <denis.baurain@uliege.be>
1611
1612							=head1 CONTRIBUTORS
1613
1614							=for stopwords Catherine COLSON Arnaud DI FRANCO
1615
1616							=over 4
1617
1618							=item *
1619
1620							Catherine COLSON <ccolson@doct.uliege.be>
1621
1622							=item *
1623
1624							Arnaud DI FRANCO <arnaud.difranco@gmail.com>
1625
1626							=back
1627
1628							=head1 COPYRIGHT AND LICENSE
1629
1630							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
1631
1632							This is free software; you can redistribute it and/or modify it under
1633							the same terms as the Perl 5 programming language system itself.
1634
1635							=cut