| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Bio::MUST::Core::Ali; | 
| 2 |  |  |  |  |  |  | # ABSTRACT: Multiple sequence alignment | 
| 3 |  |  |  |  |  |  | # CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be> | 
| 4 |  |  |  |  |  |  | # CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com> | 
| 5 |  |  |  |  |  |  | $Bio::MUST::Core::Ali::VERSION = '0.212530'; | 
| 6 | 17 |  |  | 17 |  | 13523 | use Moose; | 
|  | 17 |  |  |  |  | 45 |  | 
|  | 17 |  |  |  |  | 138 |  | 
| 7 | 17 |  |  | 17 |  | 92305 | use namespace::autoclean; | 
|  | 17 |  |  |  |  | 49 |  | 
|  | 17 |  |  |  |  | 254 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 | 17 |  |  | 17 |  | 1696 | use autodie; | 
|  | 17 |  |  |  |  | 48 |  | 
|  | 17 |  |  |  |  | 146 |  | 
| 10 | 17 |  |  | 17 |  | 90732 | use feature qw(say); | 
|  | 17 |  |  |  |  | 45 |  | 
|  | 17 |  |  |  |  | 1626 |  | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | # use Smart::Comments; | 
| 13 |  |  |  |  |  |  |  | 
| 14 | 17 |  |  | 17 |  | 135 | use Carp; | 
|  | 17 |  |  |  |  | 58 |  | 
|  | 17 |  |  |  |  | 1370 |  | 
| 15 | 17 |  |  | 17 |  | 144 | use File::Temp; | 
|  | 17 |  |  |  |  | 40 |  | 
|  | 17 |  |  |  |  | 1573 |  | 
| 16 | 17 |  |  | 17 |  | 129 | use List::AllUtils qw(uniq indexes sum0); | 
|  | 17 |  |  |  |  | 37 |  | 
|  | 17 |  |  |  |  | 1064 |  | 
| 17 | 17 |  |  | 17 |  | 120 | use Path::Class qw(file); | 
|  | 17 |  |  |  |  | 52 |  | 
|  | 17 |  |  |  |  | 1261 |  | 
| 18 | 17 |  |  | 17 |  | 120 | use POSIX qw(ceil floor); | 
|  | 17 |  |  |  |  | 38 |  | 
|  | 17 |  |  |  |  | 144 |  | 
| 19 | 17 |  |  | 17 |  | 12536 | use Statistics::Descriptive; | 
|  | 17 |  |  |  |  | 388150 |  | 
|  | 17 |  |  |  |  | 693 |  | 
| 20 | 17 |  |  | 17 |  | 10001 | use Tie::IxHash; | 
|  | 17 |  |  |  |  | 43254 |  | 
|  | 17 |  |  |  |  | 756 |  | 
| 21 |  |  |  |  |  |  |  | 
| 22 | 17 |  |  | 17 |  | 167 | use Bio::MUST::Core::Types; | 
|  | 17 |  |  |  |  | 51 |  | 
|  | 17 |  |  |  |  | 519 |  | 
| 23 | 17 |  |  | 17 |  | 100 | use Bio::MUST::Core::Constants qw(:gaps :files); | 
|  | 17 |  |  |  |  | 40 |  | 
|  | 17 |  |  |  |  | 4109 |  | 
| 24 | 17 |  |  | 17 |  | 145 | use aliased 'Bio::MUST::Core::Seq'; | 
|  | 17 |  |  |  |  | 43 |  | 
|  | 17 |  |  |  |  | 184 |  | 
| 25 | 17 |  |  | 17 |  | 3858 | use aliased 'Bio::MUST::Core::SeqMask'; | 
|  | 17 |  |  |  |  | 44 |  | 
|  | 17 |  |  |  |  | 76 |  | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | # TODO: add information about methods available in Ali-like objects | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | # ATTRIBUTES | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | has 'seqs' => ( | 
| 33 |  |  |  |  |  |  | traits   => ['Array'], | 
| 34 |  |  |  |  |  |  | is       => 'ro', | 
| 35 |  |  |  |  |  |  | isa      => 'ArrayRef[Bio::MUST::Core::Seq]', | 
| 36 |  |  |  |  |  |  | default  => sub { [] }, | 
| 37 |  |  |  |  |  |  | writer   => '_set_seqs', | 
| 38 |  |  |  |  |  |  | handles  => { | 
| 39 |  |  |  |  |  |  | add_seq  => 'push', | 
| 40 |  |  |  |  |  |  | get_seq  => 'get', | 
| 41 |  |  |  |  |  |  | set_seq  => 'set', | 
| 42 |  |  |  |  |  |  | delete_seq  => 'delete', | 
| 43 |  |  |  |  |  |  | insert_seq  => 'insert', | 
| 44 |  |  |  |  |  |  | count_seqs => 'count', | 
| 45 |  |  |  |  |  |  | all_seqs => 'elements', | 
| 46 |  |  |  |  |  |  | first_seq  => 'first', | 
| 47 |  |  |  |  |  |  | filter_seqs => 'grep', | 
| 48 |  |  |  |  |  |  | }, | 
| 49 |  |  |  |  |  |  | ); | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | has 'file' => ( | 
| 53 |  |  |  |  |  |  | is       => 'ro', | 
| 54 |  |  |  |  |  |  | isa      => 'Bio::MUST::Core::Types::File', | 
| 55 |  |  |  |  |  |  | default  => 'untitled.ali', | 
| 56 |  |  |  |  |  |  | coerce   => 1, | 
| 57 |  |  |  |  |  |  | handles  => { | 
| 58 |  |  |  |  |  |  | filename => 'stringify', | 
| 59 |  |  |  |  |  |  | }, | 
| 60 |  |  |  |  |  |  | ); | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | has 'guessing' => ( | 
| 64 |  |  |  |  |  |  | traits   => ['Bool'], | 
| 65 |  |  |  |  |  |  | is       => 'ro', | 
| 66 |  |  |  |  |  |  | isa      => 'Bool', | 
| 67 |  |  |  |  |  |  | default  => 1, | 
| 68 |  |  |  |  |  |  | handles  => { | 
| 69 |  |  |  |  |  |  | dont_guess => 'unset', | 
| 70 |  |  |  |  |  |  | guess      => 'set', | 
| 71 |  |  |  |  |  |  | }, | 
| 72 |  |  |  |  |  |  | ); | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | with 'Bio::MUST::Core::Roles::Commentable', | 
| 76 |  |  |  |  |  |  | 'Bio::MUST::Core::Roles::Listable'; | 
| 77 |  |  |  |  |  |  | with 'Bio::MUST::Core::Roles::Aliable';     ## no critic (ProhibitMultipleWiths) | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | # CONSTRUCTORS | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | sub clone { | 
| 84 | 1 |  |  | 1 | 1 | 3 | my $self = shift; | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | return $self->new( | 
| 87 |  |  |  |  |  |  | comments => [ $self->all_comments ], | 
| 88 | 1 |  |  |  |  | 51 | seqs     => [ map { $_->clone } $self->all_seqs ], | 
|  | 250 |  |  |  |  | 789 |  | 
| 89 |  |  |  |  |  |  | file     => file( $self->filename ), | 
| 90 |  |  |  |  |  |  | guessing => $self->guessing, | 
| 91 |  |  |  |  |  |  | ); | 
| 92 |  |  |  |  |  |  | } | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | # ACCESSORS | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | sub get_seq_with_id { | 
| 98 | 4 |  |  | 4 | 1 | 2613 | my $self = shift; | 
| 99 | 4 |  |  |  |  | 10 | my $id   = shift; | 
| 100 |  |  |  |  |  |  |  | 
| 101 | 4 |  |  | 11 |  | 209 | my $seq = $self->first_seq( sub { $_->full_id eq $id } ); | 
|  | 11 |  |  |  |  | 46 |  | 
| 102 | 4 | 100 |  |  |  | 53 | carp "[BMC] Warning: cannot find seq with id: $id; returning undef!" | 
| 103 |  |  |  |  |  |  | unless $seq; | 
| 104 |  |  |  |  |  |  |  | 
| 105 | 4 |  |  |  |  | 450 | return $seq; | 
| 106 |  |  |  |  |  |  | } | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | sub all_new_seqs { | 
| 110 | 6 |  |  | 6 | 1 | 31 | return shift->filter_seqs( sub {     $_->is_new } ); | 
|  | 2 |  |  | 2 |  | 97 |  | 
| 111 |  |  |  |  |  |  | } | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | sub all_but_new_seqs { | 
| 115 | 3 |  |  | 3 | 1 | 11 | return shift->filter_seqs( sub { not $_->is_new } ); | 
|  | 1 |  |  | 1 |  | 47 |  | 
| 116 |  |  |  |  |  |  | } | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | sub all_seq_ids { | 
| 120 | 105 |  |  | 105 | 1 | 3689 | my $self = shift; | 
| 121 | 105 |  |  |  |  | 4403 | return map { $_->seq_id } $self->all_seqs; | 
|  | 1510 |  |  |  |  | 37004 |  | 
| 122 |  |  |  |  |  |  | } | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | # PROPERTIES | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | sub has_uniq_ids { | 
| 128 | 64 |  |  | 64 | 1 | 1591 | my $self = shift; | 
| 129 |  |  |  |  |  |  |  | 
| 130 | 64 |  |  |  |  | 307 | my @ids = map { $_->full_id } $self->all_seq_ids; | 
|  | 941 |  |  |  |  | 23577 |  | 
| 131 | 64 | 100 |  |  |  | 2431 | return 1 if $self->count_seqs == uniq @ids; | 
| 132 | 7 |  |  |  |  | 234 | return 0; | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | sub is_protein { | 
| 137 | 77 |  |  | 77 | 1 | 2649 | my $self = shift; | 
| 138 | 77 | 100 |  | 96 |  | 3574 | return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs; | 
|  | 96 |  |  |  |  | 555 |  | 
| 139 | 5 |  |  |  |  | 37 | return 0; | 
| 140 |  |  |  |  |  |  | } | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  |  | 
| 143 |  |  |  |  |  |  | sub is_aligned { | 
| 144 | 310 |  |  | 310 | 1 | 681 | my $self = shift; | 
| 145 | 310 | 100 |  |  |  | 11726 | return 0 if not $self->guessing; | 
| 146 | 307 | 100 |  | 8588 |  | 11654 | return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs; | 
|  | 8588 |  |  |  |  | 18926 |  | 
| 147 | 116 |  |  |  |  | 1062 | return 0; | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | sub width { | 
| 152 | 189 |  |  | 189 | 1 | 495 | my $self = shift; | 
| 153 | 189 | 100 |  |  |  | 871 | $self->uniformize if $self->is_aligned;     # pad seqs for robustness | 
| 154 | 189 |  |  |  |  | 1092 | return $self->_max_seq_len; | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | sub _max_seq_len { | 
| 158 | 365 |  |  | 365 |  | 746 | my $self = shift; | 
| 159 | 365 |  |  |  |  | 12495 | my @lengths = map { $_->seq_len } $self->all_seqs; | 
|  | 12841 |  |  |  |  | 402763 |  | 
| 160 | 365 |  | 50 |  |  | 5637 | return (List::AllUtils::max @lengths) // 0;     # to avoid warnings | 
| 161 |  |  |  |  |  |  | } | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  |  | 
| 164 |  |  |  |  |  |  | sub seq_len_stats { | 
| 165 | 1 |  |  | 1 | 1 | 10 | my $self = shift; | 
| 166 |  |  |  |  |  |  |  | 
| 167 | 1 |  |  |  |  | 41 | my @lengths = map { $_->nomiss_seq_len } $self->all_seqs; | 
|  | 10 |  |  |  |  | 27 |  | 
| 168 | 1 |  |  |  |  | 15 | my $stat = Statistics::Descriptive::Full->new; | 
| 169 | 1 |  |  |  |  | 138 | $stat->add_data( \@lengths ); | 
| 170 | 1 |  |  |  |  | 163 | my @quantiles = map { $stat->quantile($_) } 0..4; | 
|  | 5 |  |  |  |  | 266 |  | 
| 171 |  |  |  |  |  |  |  | 
| 172 | 1 |  |  |  |  | 48 | return @quantiles; | 
| 173 |  |  |  |  |  |  | } | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | sub perc_miss { | 
| 177 | 2 |  |  | 2 | 1 | 7 | my $self = shift; | 
| 178 |  |  |  |  |  |  |  | 
| 179 | 2 |  |  |  |  | 93 | my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs; | 
|  | 255 |  |  |  |  | 592 |  | 
| 180 | 2 | 50 |  |  |  | 22 | return 0.0 unless $n; | 
| 181 |  |  |  |  |  |  |  | 
| 182 | 2 |  |  |  |  | 13 | my $total = $self->width * $self->count_seqs; | 
| 183 | 2 |  |  |  |  | 28 | return 100.0 * ($total - $n) / $total; | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | # MUTATORS | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | sub uc_seqs { | 
| 190 | 1 |  |  | 1 | 1 | 37 | my $self = shift; | 
| 191 |  |  |  |  |  |  |  | 
| 192 | 1 |  |  |  |  | 50 | $_->uc for $self->all_seqs; | 
| 193 | 1 |  |  |  |  | 4 | return $self; | 
| 194 |  |  |  |  |  |  | } | 
| 195 |  |  |  |  |  |  |  | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | sub recode_seqs {                           ## no critic (RequireArgUnpacking) | 
| 198 | 1 |  |  | 1 | 1 | 17 | my $self = shift; | 
| 199 |  |  |  |  |  |  |  | 
| 200 | 1 |  |  |  |  | 42 | $_->recode(@_) for $self->all_seqs; | 
| 201 | 1 |  |  |  |  | 4 | return $self; | 
| 202 |  |  |  |  |  |  | } | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | sub degap_seqs { | 
| 206 | 1 |  |  | 1 | 1 | 12 | my $self = shift; | 
| 207 |  |  |  |  |  |  |  | 
| 208 | 1 |  |  |  |  | 46 | $_->degap for $self->all_seqs; | 
| 209 | 1 |  |  |  |  | 5 | return $self; | 
| 210 |  |  |  |  |  |  | } | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | sub spacify_seqs { | 
| 214 | 176 |  |  | 176 | 1 | 381 | my $self = shift; | 
| 215 |  |  |  |  |  |  |  | 
| 216 | 176 |  |  |  |  | 5576 | $_->spacify for $self->all_seqs; | 
| 217 | 176 |  |  |  |  | 571 | return $self; | 
| 218 |  |  |  |  |  |  | } | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | sub gapify_seqs {                           ## no critic (RequireArgUnpacking) | 
| 222 | 2 |  |  | 2 | 1 | 16 | my $self = shift; | 
| 223 | 2 |  |  |  |  | 80 | $_->gapify(@_) for $self->all_seqs; | 
| 224 | 2 |  |  |  |  | 6 | return $self; | 
| 225 |  |  |  |  |  |  | } | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  |  | 
| 228 |  |  |  |  |  |  | sub trim_seqs { | 
| 229 | 176 |  |  | 176 | 1 | 378 | my $self = shift; | 
| 230 |  |  |  |  |  |  |  | 
| 231 | 176 |  |  |  |  | 5574 | $_->trim for $self->all_seqs; | 
| 232 | 176 |  |  |  |  | 615 | return $self; | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | sub pad_seqs { | 
| 237 | 176 |  |  | 176 | 1 | 373 | my $self = shift; | 
| 238 |  |  |  |  |  |  |  | 
| 239 | 176 |  |  |  |  | 633 | my $bound = $self->_max_seq_len; | 
| 240 | 176 |  |  |  |  | 5518 | $_->pad_to($bound) for $self->all_seqs; | 
| 241 | 176 |  |  |  |  | 598 | return $self; | 
| 242 |  |  |  |  |  |  | } | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  |  | 
| 245 |  |  |  |  |  |  | sub uniformize { | 
| 246 | 175 |  |  | 175 | 1 | 400 | my $self = shift; | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | # TODO: profile code (triggers?) to avoid useless multiple calls | 
| 249 |  |  |  |  |  |  |  | 
| 250 | 175 |  |  |  |  | 663 | $self->spacify_seqs; | 
| 251 | 175 |  |  |  |  | 749 | $self->trim_seqs; | 
| 252 | 175 |  |  |  |  | 792 | $self->pad_seqs; | 
| 253 |  |  |  |  |  |  |  | 
| 254 | 175 |  |  |  |  | 395 | return $self; | 
| 255 |  |  |  |  |  |  | } | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | sub clear_new_tags { | 
| 259 | 1 |  |  | 1 | 1 | 3 | my $self = shift; | 
| 260 |  |  |  |  |  |  |  | 
| 261 | 1 |  |  |  |  | 47 | $_->clear_new_tag for $self->all_seqs; | 
| 262 | 1 |  |  |  |  | 6 | return $self; | 
| 263 |  |  |  |  |  |  | } | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  | sub shorten_ids {                           ## no critic (RequireArgUnpacking) | 
| 267 | 12 |  |  | 12 | 1 | 66 | return shift->_change_ids_(1, @_); | 
| 268 |  |  |  |  |  |  | } | 
| 269 |  |  |  |  |  |  |  | 
| 270 |  |  |  |  |  |  |  | 
| 271 |  |  |  |  |  |  | sub restore_ids {                           ## no critic (RequireArgUnpacking) | 
| 272 | 10 |  |  | 10 | 1 | 42 | return shift->_change_ids_(0, @_); | 
| 273 |  |  |  |  |  |  | } | 
| 274 |  |  |  |  |  |  |  | 
| 275 |  |  |  |  |  |  | sub _change_ids_ { | 
| 276 | 22 |  |  | 22 |  | 47 | my $self   = shift; | 
| 277 | 22 |  |  |  |  | 42 | my $abbr   = shift; | 
| 278 | 22 |  |  |  |  | 39 | my $mapper = shift; | 
| 279 |  |  |  |  |  |  |  | 
| 280 | 22 |  |  |  |  | 861 | for my $seq ($self->all_seqs) { | 
| 281 | 133 | 100 |  |  |  | 669 | my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id ) | 
| 282 |  |  |  |  |  |  | : $mapper->long_id_for( $seq->full_id ); | 
| 283 | 133 | 50 |  |  |  | 4166 | $seq->set_seq_id($new_id) if $new_id; | 
| 284 |  |  |  |  |  |  | }                       # Note: leave id alone if not found | 
| 285 |  |  |  |  |  |  |  | 
| 286 | 22 |  |  |  |  | 109 | return $self; | 
| 287 |  |  |  |  |  |  | } | 
| 288 |  |  |  |  |  |  |  | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  | sub apply_list { | 
| 291 | 1 |  |  | 1 | 1 | 11 | my $self = shift; | 
| 292 | 1 |  |  |  |  | 4 | my $list = shift; | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | # loop through seqs from bottom to top | 
| 295 |  |  |  |  |  |  | # ... and delete seq from Ali if not listed in list | 
| 296 | 1 |  |  |  |  | 49 | my $seq_n = $self->count_seqs; | 
| 297 |  |  |  |  |  |  |  | 
| 298 | 1 |  |  |  |  | 7 | for (my $i = $seq_n-1; $i >= 0; $i--) { | 
| 299 | 250 | 100 |  |  |  | 8210 | $self->delete_seq($i) | 
| 300 |  |  |  |  |  |  | unless $list->is_listed( $self->get_seq($i)->full_id ); | 
| 301 |  |  |  |  |  |  | } | 
| 302 |  |  |  |  |  |  |  | 
| 303 | 1 |  |  |  |  | 5 | return $self; | 
| 304 |  |  |  |  |  |  | } | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  |  | 
| 307 |  |  |  |  |  |  | sub _premask_check { | 
| 308 | 40 |  |  | 40 |  | 564 | my $self = shift; | 
| 309 | 40 |  |  |  |  | 76 | my $mask = shift; | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | # warn of unaligned Ali | 
| 312 | 40 | 50 |  |  |  | 155 | carp '[BMC] Note: Ali does not look aligned!' | 
| 313 |  |  |  |  |  |  | . ' This might not be an issue if seqs are ultra-conserved!' | 
| 314 |  |  |  |  |  |  | unless $self->is_aligned; | 
| 315 | 40 |  |  |  |  | 243 | $self->uniformize; | 
| 316 |  |  |  |  |  |  |  | 
| 317 |  |  |  |  |  |  | # warn of empty SeqMask | 
| 318 |  |  |  |  |  |  | carp '[BMC] Note: applying this mask will result in a zero-width Ali!' | 
| 319 | 40 | 50 |  | 1845 |  | 1636 | unless List::AllUtils::any { $_ } $mask->all_states; | 
|  | 1845 |  |  |  |  | 2274 |  | 
| 320 |  |  |  |  |  |  |  | 
| 321 |  |  |  |  |  |  | # check that SeqMask is compatible with Ali | 
| 322 |  |  |  |  |  |  | # potential bugs could come from constant sites etc | 
| 323 | 40 |  |  |  |  | 13304 | my $a_width = $self->width; | 
| 324 | 40 |  |  |  |  | 1493 | my $m_width = $mask->mask_len; | 
| 325 | 40 | 100 |  |  |  | 181 | carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!" | 
| 326 |  |  |  |  |  |  | . ' This might not be an issue if the mask results from blocks.' | 
| 327 |  |  |  |  |  |  | unless $a_width == $m_width; | 
| 328 |  |  |  |  |  |  |  | 
| 329 | 40 |  |  |  |  | 1014 | return;                 # return values of before subs are ignored | 
| 330 |  |  |  |  |  |  | } | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | before 'apply_mask' => \&_premask_check; | 
| 333 |  |  |  |  |  |  |  | 
| 334 |  |  |  |  |  |  | sub apply_mask { | 
| 335 |  |  |  |  |  |  | my $self = shift; | 
| 336 |  |  |  |  |  |  | my $mask = shift; | 
| 337 |  |  |  |  |  |  |  | 
| 338 |  |  |  |  |  |  | # select sites for each seq using a precomputed array slice | 
| 339 |  |  |  |  |  |  | my @indexes = indexes { $_ } $mask->all_states; | 
| 340 |  |  |  |  |  |  | $_->_set_seq( join q{}, ( $_->all_states )[@indexes] ) | 
| 341 |  |  |  |  |  |  | for $self->all_seqs; | 
| 342 |  |  |  |  |  |  |  | 
| 343 |  |  |  |  |  |  | return $self; | 
| 344 |  |  |  |  |  |  | } | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | sub idealize {                              ## no critic (RequireArgUnpacking) | 
| 348 | 13 |  |  | 13 | 1 | 121 | my $self = shift; | 
| 349 | 13 |  |  |  |  | 224 | return $self->apply_mask( SeqMask->ideal_mask($self, @_) ); | 
| 350 |  |  |  |  |  |  | } | 
| 351 |  |  |  |  |  |  |  | 
| 352 |  |  |  |  |  |  | # MISC METHODS | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | sub gapmiss_regex { | 
| 356 | 73 | 100 |  | 73 | 1 | 1200 | return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS; | 
| 357 |  |  |  |  |  |  | } | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  |  | 
| 360 |  |  |  |  |  |  | sub map_coords { | 
| 361 | 1 |  |  | 1 | 1 | 12 | my $self      = shift; | 
| 362 | 1 |  |  |  |  | 3 | my $id        = shift; | 
| 363 | 1 |  |  |  |  | 2 | my $coords_in = shift; | 
| 364 |  |  |  |  |  |  |  | 
| 365 | 1 |  |  |  |  | 6 | my $seq = $self->get_seq_with_id($id); | 
| 366 | 1 |  |  |  |  | 5 | my @states = $seq->all_states; | 
| 367 |  |  |  |  |  |  |  | 
| 368 | 1 |  |  |  |  | 3 | my $count = 0; | 
| 369 | 1 |  |  |  |  | 2 | my @index; | 
| 370 | 1 |  |  |  |  | 3 | for my $state (@states) { | 
| 371 | 130 | 100 |  |  |  | 331 | $count++ if $state !~ $GAP; | 
| 372 | 130 |  |  |  |  | 215 | push @index, $count; | 
| 373 |  |  |  |  |  |  | } | 
| 374 |  |  |  |  |  |  |  | 
| 375 | 1 |  |  |  |  | 2 | my @coords_out = map { $index[$_-1] } @{$coords_in}; | 
|  | 6 |  |  |  |  | 13 |  | 
|  | 1 |  |  |  |  | 3 |  | 
| 376 |  |  |  |  |  |  |  | 
| 377 | 1 |  |  |  |  | 12 | return \@coords_out; | 
| 378 |  |  |  |  |  |  | } | 
| 379 |  |  |  |  |  |  |  | 
| 380 |  |  |  |  |  |  | # ALIASES | 
| 381 |  |  |  |  |  |  |  | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  | sub height { | 
| 384 | 0 |  |  | 0 | 1 | 0 | return shift->count_seqs; | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  |  | 
| 387 |  |  |  |  |  |  | # I/O METHODS | 
| 388 |  |  |  |  |  |  |  | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | sub load { | 
| 391 | 69 |  |  | 69 | 1 | 31703 | my $class  = shift; | 
| 392 | 69 |  |  |  |  | 221 | my $infile = shift; | 
| 393 |  |  |  |  |  |  |  | 
| 394 | 69 |  |  |  |  | 423 | open my $in, '<', $infile; | 
| 395 |  |  |  |  |  |  |  | 
| 396 | 69 |  |  |  |  | 33338 | my $ali = $class->new( file => $infile ); | 
| 397 | 69 |  |  |  |  | 411 | my $seq_id; | 
| 398 |  |  |  |  |  |  | my $seq; | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | LINE: | 
| 401 | 69 |  |  |  |  | 14756 | while (my $line = <$in>) { | 
| 402 | 4091 |  |  |  |  | 7944 | chomp $line; | 
| 403 |  |  |  |  |  |  |  | 
| 404 |  |  |  |  |  |  | # skip empty lines and process comments | 
| 405 | 4091 | 100 | 100 |  |  | 25606 | next LINE if $line =~ $EMPTY_LINE | 
| 406 |  |  |  |  |  |  | || $ali->is_comment($line); | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | # at each '>' char... | 
| 409 | 3944 |  |  |  |  | 13635 | my ($defline) = $line =~ $DEF_LINE; | 
| 410 | 3944 | 100 |  |  |  | 8400 | if ($defline) { | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  | # add current seq to ali (if any) | 
| 413 | 1662 | 100 |  |  |  | 3482 | if ($seq) { | 
| 414 | 1593 |  |  |  |  | 48351 | my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); | 
| 415 | 1593 |  |  |  |  | 54597 | $ali->add_seq($new_seq); | 
| 416 | 1593 |  |  |  |  | 2987 | $seq = q{}; | 
| 417 |  |  |  |  |  |  | } | 
| 418 |  |  |  |  |  |  |  | 
| 419 | 1662 |  |  |  |  | 2726 | $seq_id = $defline; | 
| 420 | 1662 |  |  |  |  | 26369 | next LINE; | 
| 421 |  |  |  |  |  |  | } | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | # elongate current seq (seqs can be broken on several lines) | 
| 424 | 2282 |  |  |  |  | 11246 | $seq .= $line; | 
| 425 |  |  |  |  |  |  | } | 
| 426 |  |  |  |  |  |  |  | 
| 427 |  |  |  |  |  |  | # add last seq to ali (if any) | 
| 428 | 69 | 50 |  |  |  | 328 | if ($seq) { | 
| 429 | 69 |  |  |  |  | 2504 | my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); | 
| 430 | 69 |  |  |  |  | 2392 | $ali->add_seq($new_seq); | 
| 431 |  |  |  |  |  |  | } | 
| 432 |  |  |  |  |  |  |  | 
| 433 | 69 |  |  |  |  | 2021 | return $ali; | 
| 434 |  |  |  |  |  |  | } | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  |  | 
| 437 |  |  |  |  |  |  | before qr{\Astore}xms => sub { | 
| 438 |  |  |  |  |  |  | my $self = shift; | 
| 439 |  |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | # perform pre-storage duties | 
| 441 |  |  |  |  |  |  | carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids; | 
| 442 |  |  |  |  |  |  | $self->uniformize if $self->is_aligned; | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | return; | 
| 445 |  |  |  |  |  |  | }; | 
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | sub store {                                 ## no critic (RequireArgUnpacking) | 
| 449 |  |  |  |  |  |  | my $self = shift; | 
| 450 |  |  |  |  |  |  | my $outfile = shift; | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | # automatically redirect to store_fasta for non-Ali outfile names | 
| 453 |  |  |  |  |  |  | return $self->store_fasta($outfile, @_)     # note the currying | 
| 454 |  |  |  |  |  |  | unless $outfile =~ $ALI_SUFFIX; | 
| 455 |  |  |  |  |  |  |  | 
| 456 |  |  |  |  |  |  | open my $out, '>', $outfile; | 
| 457 |  |  |  |  |  |  |  | 
| 458 |  |  |  |  |  |  | print {$out} $self->header; | 
| 459 |  |  |  |  |  |  | for my $seq ($self->all_seqs) { | 
| 460 |  |  |  |  |  |  | say {$out} '>' . $seq->full_id; | 
| 461 |  |  |  |  |  |  | say {$out} $seq->seq; | 
| 462 |  |  |  |  |  |  | } | 
| 463 |  |  |  |  |  |  |  | 
| 464 |  |  |  |  |  |  | return; | 
| 465 |  |  |  |  |  |  | } | 
| 466 |  |  |  |  |  |  |  | 
| 467 |  |  |  |  |  |  |  | 
| 468 |  |  |  |  |  |  | sub store_fasta { | 
| 469 |  |  |  |  |  |  | my $self    = shift; | 
| 470 |  |  |  |  |  |  | my $outfile = shift; | 
| 471 |  |  |  |  |  |  | my $args    = shift // {};          # HashRef (should not be empty...) | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  | my $degap = $args->{degap}  //  0; | 
| 474 |  |  |  |  |  |  | my $clean = $args->{clean}  //  0; | 
| 475 |  |  |  |  |  |  | my $gap   = $args->{gapify} // 'X'; | 
| 476 |  |  |  |  |  |  | my $chunk = $args->{chunk}  // 60; | 
| 477 |  |  |  |  |  |  | my $nowrap = $chunk < 0 ? 1 : 0; | 
| 478 |  |  |  |  |  |  | my $is_aligned = $self->is_aligned; | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  | open my $out, '>', $outfile; | 
| 481 |  |  |  |  |  |  |  | 
| 482 |  |  |  |  |  |  | for my $seq ($self->all_seqs) { | 
| 483 |  |  |  |  |  |  | say {$out} '>' . $seq->foreign_id; | 
| 484 |  |  |  |  |  |  |  | 
| 485 |  |  |  |  |  |  | # optionally clean and/or degap seq | 
| 486 |  |  |  |  |  |  | $seq = $seq->clone  if $clean || $degap;    # clone seq only if needed | 
| 487 |  |  |  |  |  |  | $seq->gapify($gap)  if $clean; | 
| 488 |  |  |  |  |  |  | $seq->degap         if $degap; | 
| 489 |  |  |  |  |  |  |  | 
| 490 |  |  |  |  |  |  | my $width = $seq->seq_len; | 
| 491 |  |  |  |  |  |  | $chunk = $width     if $nowrap;             # optionally disable wrap | 
| 492 |  |  |  |  |  |  |  | 
| 493 |  |  |  |  |  |  | my $str = $seq->wrapped_str($chunk); | 
| 494 |  |  |  |  |  |  | $str =~ s{$GAP}{-}xmsg if $is_aligned;      # restore '-' when aligned | 
| 495 |  |  |  |  |  |  | print {$out} $str; | 
| 496 |  |  |  |  |  |  | } | 
| 497 |  |  |  |  |  |  |  | 
| 498 |  |  |  |  |  |  | return; | 
| 499 |  |  |  |  |  |  | } | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  |  | 
| 502 |  |  |  |  |  |  | sub temp_fasta { | 
| 503 | 9 |  |  | 9 | 1 | 23 | my $self = shift; | 
| 504 | 9 |  | 50 |  |  | 26 | my $args = shift // {};             # HashRef (should not be empty...) | 
| 505 |  |  |  |  |  |  |  | 
| 506 |  |  |  |  |  |  | # abbreviate ids (possibly using a custom prefix) | 
| 507 | 9 |  |  |  |  | 22 | my $prefix = $args->{id_prefix}; | 
| 508 | 9 |  |  |  |  | 18 | delete $args->{id_prefix};          # not necessarily required | 
| 509 | 9 |  |  |  |  | 47 | my $mapper = $self->std_mapper($prefix); | 
| 510 | 9 |  |  |  |  | 54 | $self->shorten_ids($mapper); | 
| 511 |  |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  | # write temporary .fasta file using standard abbr_ids | 
| 513 |  |  |  |  |  |  | # ...and restore long_ids afterwards | 
| 514 | 9 |  |  |  |  | 88 | my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta'); | 
| 515 | 9 |  |  |  |  | 5089 | $self->store_fasta($out->filename, $args); | 
| 516 | 9 |  |  |  |  | 74 | $self->restore_ids($mapper); | 
| 517 |  |  |  |  |  |  | ### filename: $out->filename | 
| 518 |  |  |  |  |  |  |  | 
| 519 | 9 | 100 |  |  |  | 56 | return wantarray ? ($out->filename, $mapper) : $out->filename; | 
| 520 |  |  |  |  |  |  | } | 
| 521 |  |  |  |  |  |  |  | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  | # TODO: add an option to remove trailing '_' in ids? | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | sub load_phylip { | 
| 526 | 105 |  |  | 105 | 1 | 8962 | my $class  = shift; | 
| 527 | 105 |  |  |  |  | 297 | my $infile = shift; | 
| 528 |  |  |  |  |  |  |  | 
| 529 | 105 |  |  |  |  | 838 | open my $in, '<', $infile; | 
| 530 |  |  |  |  |  |  |  | 
| 531 | 105 |  |  |  |  | 50325 | my $ali = $class->new( file => $infile ); | 
| 532 | 105 |  |  |  |  | 282 | my $seq_n; | 
| 533 |  |  |  |  |  |  | my $site_n; | 
| 534 | 105 |  |  |  |  | 330 | my $n = 0; | 
| 535 |  |  |  |  |  |  |  | 
| 536 |  |  |  |  |  |  | LINE: | 
| 537 | 105 |  |  |  |  | 39303 | while (my $line = <$in>) { | 
| 538 | 8557 |  |  |  |  | 21245 | chomp $line; | 
| 539 |  |  |  |  |  |  |  | 
| 540 |  |  |  |  |  |  | # skip empty lines | 
| 541 | 8557 | 100 |  |  |  | 51994 | next LINE if $line =~ $EMPTY_LINE; | 
| 542 |  |  |  |  |  |  |  | 
| 543 |  |  |  |  |  |  | # get matrix dimensions | 
| 544 | 8524 | 100 |  |  |  | 37214 | if ($line =~ $DIM_LINE) { | 
| 545 | 105 |  |  |  |  | 695 | ($seq_n, $site_n) = ($1, $2); | 
| 546 | 105 |  |  |  |  | 2209 | next LINE; | 
| 547 |  |  |  |  |  |  | } | 
| 548 |  |  |  |  |  |  |  | 
| 549 |  |  |  |  |  |  | # process regular line (identifier is optional) | 
| 550 | 8419 |  |  |  |  | 49650 | my ($seq_id, $seq) = $line =~ $PHY_LINE; | 
| 551 |  |  |  |  |  |  |  | 
| 552 |  |  |  |  |  |  | # should never happen... | 
| 553 | 8419 | 50 |  |  |  | 22049 | croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!" | 
| 554 |  |  |  |  |  |  | unless $seq; | 
| 555 |  |  |  |  |  |  |  | 
| 556 |  |  |  |  |  |  | # delete optional spaces | 
| 557 | 8419 |  |  |  |  | 20555 | $seq =~ tr/ //d; | 
| 558 |  |  |  |  |  |  |  | 
| 559 |  |  |  |  |  |  | # process first seq block | 
| 560 | 8419 | 100 |  |  |  | 313196 | if ($ali->count_seqs < $seq_n) { | 
| 561 |  |  |  |  |  |  |  | 
| 562 |  |  |  |  |  |  | # seq_id are mandatory in first block | 
| 563 | 8089 | 50 |  |  |  | 18313 | croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!" | 
| 564 |  |  |  |  |  |  | unless $seq_id; | 
| 565 |  |  |  |  |  |  |  | 
| 566 |  |  |  |  |  |  | # store first seq chunk along with seq_id | 
| 567 | 8089 |  |  |  |  | 225725 | my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); | 
| 568 | 8089 |  |  |  |  | 279190 | $ali->add_seq($new_seq); | 
| 569 |  |  |  |  |  |  | } | 
| 570 |  |  |  |  |  |  |  | 
| 571 |  |  |  |  |  |  | # process remaining seq blocks | 
| 572 |  |  |  |  |  |  | else { | 
| 573 | 330 |  |  |  |  | 11285 | my $curr_seq = $ali->get_seq($n); | 
| 574 |  |  |  |  |  |  |  | 
| 575 |  |  |  |  |  |  | # elongate current with partial seq chunk mistaken as seq_id | 
| 576 | 330 | 100 | 100 |  |  | 1087 | if ($seq_id && $seq_id ne $curr_seq->full_id) { | 
| 577 | 110 |  |  |  |  | 3664 | $curr_seq->append_seq($seq_id); | 
| 578 |  |  |  |  |  |  | } | 
| 579 |  |  |  |  |  |  |  | 
| 580 |  |  |  |  |  |  | # elongate current seq with new seq chunk | 
| 581 | 330 |  |  |  |  | 10759 | $curr_seq->append_seq($seq); | 
| 582 |  |  |  |  |  |  | } | 
| 583 |  |  |  |  |  |  |  | 
| 584 |  |  |  |  |  |  | # prepare reading of next seq or next block | 
| 585 | 8419 | 100 |  |  |  | 102891 | $n = 0 if ++$n == $seq_n; | 
| 586 |  |  |  |  |  |  | } | 
| 587 |  |  |  |  |  |  |  | 
| 588 | 105 |  |  |  |  | 919 | my $width = $ali->width; | 
| 589 | 105 | 50 |  |  |  | 696 | croak "[BMC] Error: unexpected site number in PHYLIP file: $width;" | 
| 590 |  |  |  |  |  |  | . ' aborting!' if $width != $site_n; | 
| 591 |  |  |  |  |  |  |  | 
| 592 | 105 |  |  |  |  | 4069 | return $ali; | 
| 593 |  |  |  |  |  |  | } | 
| 594 |  |  |  |  |  |  |  | 
| 595 |  |  |  |  |  |  | # PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html | 
| 596 |  |  |  |  |  |  | # The information for each species follows, starting with a ten-character | 
| 597 |  |  |  |  |  |  | # species name (which can include blanks and some punctuation marks), and | 
| 598 |  |  |  |  |  |  | # continuing with the characters for that species. The name should be on the | 
| 599 |  |  |  |  |  |  | # same line as the first character of the data for that species. (...) The | 
| 600 |  |  |  |  |  |  | # name should be ten characters in length, filled out to the full ten | 
| 601 |  |  |  |  |  |  | # characters by blanks if shorter. Any printable ASCII/ISO character is | 
| 602 |  |  |  |  |  |  | # allowed in the name, except for parentheses ("(" and ")"), square brackets | 
| 603 |  |  |  |  |  |  | # ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that | 
| 604 |  |  |  |  |  |  | # in these sequences we have a blank every ten sites to make them easier to | 
| 605 |  |  |  |  |  |  | # read: any such blanks are allowed. The blank line which separates the two | 
| 606 |  |  |  |  |  |  | # groups of lines (the ones containing sites 1-20 and ones containing sites | 
| 607 |  |  |  |  |  |  | # 21-39) may or may not be present. | 
| 608 |  |  |  |  |  |  |  | 
| 609 |  |  |  |  |  |  | # PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command | 
| 610 |  |  |  |  |  |  | # The input sequence file is a standard PHYLIP file of aligned DNA or | 
| 611 |  |  |  |  |  |  | # amino-acids sequences. (...) The maximum number of characters in species | 
| 612 |  |  |  |  |  |  | # name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed | 
| 613 |  |  |  |  |  |  | # within sequence names because the Newick tree format makes special use of | 
| 614 |  |  |  |  |  |  | # these symbols. However, blanks (one or more) MUST appear at the end of each | 
| 615 |  |  |  |  |  |  | # species name. | 
| 616 |  |  |  |  |  |  |  | 
| 617 |  |  |  |  |  |  | # TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf | 
| 618 |  |  |  |  |  |  | # A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-", | 
| 619 |  |  |  |  |  |  | # dots ".", underscores "_", or some of "/", "?", "*", "+". No space is | 
| 620 |  |  |  |  |  |  | # allowed inside the names. The first name character must be a letter or an | 
| 621 |  |  |  |  |  |  | # underscore. A sequence fragment may start behind position 10 after a | 
| 622 |  |  |  |  |  |  | # sequence name, or anywhere in a line without a name. | 
| 623 |  |  |  |  |  |  |  | 
| 624 |  |  |  |  |  |  | # TREE-PUZZLE [data/globin.a] | 
| 625 |  |  |  |  |  |  | #   7 128 | 
| 626 |  |  |  |  |  |  | # HBB_HUMAN      HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN | 
| 627 |  |  |  |  |  |  | # HBB_HORSE      QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN | 
| 628 |  |  |  |  |  |  | # HBA_HUMAN      VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS | 
| 629 |  |  |  |  |  |  | # HBA_HORSE      VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS | 
| 630 |  |  |  |  |  |  | # MYG_PHYCA      VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS | 
| 631 |  |  |  |  |  |  | # GLB5_PETMA     PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS | 
| 632 |  |  |  |  |  |  | # LGB2_LUPLU     ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN | 
| 633 |  |  |  |  |  |  |  | 
| 634 |  |  |  |  |  |  | # RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf | 
| 635 |  |  |  |  |  |  | # The input alignment format of RAxML is relaxed interleaved or sequential | 
| 636 |  |  |  |  |  |  | # PHYLIP. "Relaxed" means that sequence names can be of variable length | 
| 637 |  |  |  |  |  |  | # between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names | 
| 638 |  |  |  |  |  |  | # taxon names that contain any form of whitespace character, like blanks, | 
| 639 |  |  |  |  |  |  | # tabulators, and carriage returns, as well as one of the following prohibited | 
| 640 |  |  |  |  |  |  | # characters: :,();[]. | 
| 641 |  |  |  |  |  |  |  | 
| 642 |  |  |  |  |  |  | # PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf | 
| 643 |  |  |  |  |  |  | # Taxon names may contain more than 10 characters. Avoid special characters | 
| 644 |  |  |  |  |  |  | # such as ';' or ')', which will create problems when parsing trees. The best | 
| 645 |  |  |  |  |  |  | # is to only use letters, digits and '_'. Sequences can be interrupted by | 
| 646 |  |  |  |  |  |  | # space and tab, but not by return characters. Be sure that the lengths of the | 
| 647 |  |  |  |  |  |  | # sequences are all the same, and identical to the lengths indicated in the | 
| 648 |  |  |  |  |  |  | # header. Sequences can be interleaved, in which case the taxon names may or | 
| 649 |  |  |  |  |  |  | # may not be repeated in each block. | 
| 650 |  |  |  |  |  |  |  | 
| 651 |  |  |  |  |  |  |  | 
| 652 |  |  |  |  |  |  | sub store_phylip { | 
| 653 |  |  |  |  |  |  | my $self    = shift; | 
| 654 |  |  |  |  |  |  | my $outfile = shift; | 
| 655 |  |  |  |  |  |  | my $args    = shift // {};          # HashRef (should not be empty...) | 
| 656 |  |  |  |  |  |  |  | 
| 657 |  |  |  |  |  |  | my $short = $args->{short} // 1; | 
| 658 |  |  |  |  |  |  | my $clean = $args->{clean} // 0; | 
| 659 |  |  |  |  |  |  | my $chunk = $args->{chunk} // 60; | 
| 660 |  |  |  |  |  |  |  | 
| 661 |  |  |  |  |  |  | open my $out, '>', $outfile; | 
| 662 |  |  |  |  |  |  |  | 
| 663 |  |  |  |  |  |  | # print data matrix dimensions | 
| 664 |  |  |  |  |  |  | my $height = $self->count_seqs; | 
| 665 |  |  |  |  |  |  | my $width  = $self->width; | 
| 666 |  |  |  |  |  |  | say {$out} $height . q{ } . $width; | 
| 667 |  |  |  |  |  |  |  | 
| 668 |  |  |  |  |  |  | # setup id format (this will also affect block-like structure) | 
| 669 |  |  |  |  |  |  | my $format = $short ? "%-10.10s %s\n" : "%s %s\n"; | 
| 670 |  |  |  |  |  |  | my $method = $short ? 'full_id'       : 'foreign_id'; | 
| 671 |  |  |  |  |  |  | my $sep    = $short ? q{ }            : q{}; | 
| 672 |  |  |  |  |  |  |  | 
| 673 |  |  |  |  |  |  | # optionally disable wrapping | 
| 674 |  |  |  |  |  |  | $chunk = $width if $chunk < 0; | 
| 675 |  |  |  |  |  |  |  | 
| 676 |  |  |  |  |  |  | # output Ali in sequential or interleaved format | 
| 677 |  |  |  |  |  |  | for (my $site = 0; $site < $width; $site += $chunk) { | 
| 678 |  |  |  |  |  |  |  | 
| 679 |  |  |  |  |  |  | # leave empty line between chunks (but not after last chunk) | 
| 680 |  |  |  |  |  |  | print {$out} "\n" if $site; | 
| 681 |  |  |  |  |  |  |  | 
| 682 |  |  |  |  |  |  | for my $seq ($self->all_seqs) { | 
| 683 |  |  |  |  |  |  |  | 
| 684 |  |  |  |  |  |  | # print 10-chars ids only once | 
| 685 |  |  |  |  |  |  | # Note: full_ids are only truncated (use IdMapper for more) | 
| 686 |  |  |  |  |  |  | my $id = $site == 0 ? $seq->$method : q{}; | 
| 687 |  |  |  |  |  |  |  | 
| 688 |  |  |  |  |  |  | # optionally clean seq | 
| 689 |  |  |  |  |  |  | $seq = $seq->clone->gapify('X') if $clean; | 
| 690 |  |  |  |  |  |  |  | 
| 691 |  |  |  |  |  |  | # print seq chunks in 10-state blocks | 
| 692 |  |  |  |  |  |  | # Note: We insert a space in the 11th column to ensure that more | 
| 693 |  |  |  |  |  |  | # software packages can read the written files. PHYLIP itself will | 
| 694 |  |  |  |  |  |  | # ignore this spurious space in the 'sequences'. | 
| 695 |  |  |  |  |  |  | ( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg; | 
| 696 |  |  |  |  |  |  | printf {$out} $format, $id, join $sep, | 
| 697 |  |  |  |  |  |  | map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1; | 
| 698 |  |  |  |  |  |  | } | 
| 699 |  |  |  |  |  |  | } | 
| 700 |  |  |  |  |  |  |  | 
| 701 |  |  |  |  |  |  | return; | 
| 702 |  |  |  |  |  |  | } | 
| 703 |  |  |  |  |  |  |  | 
| 704 |  |  |  |  |  |  |  | 
| 705 |  |  |  |  |  |  | sub load_stockholm { | 
| 706 | 2 |  |  | 2 | 1 | 31 | my $class  = shift; | 
| 707 | 2 |  |  |  |  | 4 | my $infile = shift; | 
| 708 |  |  |  |  |  |  |  | 
| 709 | 2 |  |  |  |  | 9 | open my $in, '<', $infile; | 
| 710 |  |  |  |  |  |  |  | 
| 711 | 2 |  |  |  |  | 430 | my $ali = $class->new( file => $infile ); | 
| 712 | 2 |  |  |  |  | 55 | tie my %seq_for, 'Tie::IxHash'; | 
| 713 |  |  |  |  |  |  |  | 
| 714 |  |  |  |  |  |  | LINE: | 
| 715 | 2 |  |  |  |  | 1111 | while (my $line = <$in>) { | 
| 716 | 44 |  |  |  |  | 288 | chomp $line; | 
| 717 |  |  |  |  |  |  |  | 
| 718 |  |  |  |  |  |  | # process GF comments | 
| 719 | 44 | 100 |  |  |  | 105 | next LINE if $ali->is_comment($line, $STK_COMMENT); | 
| 720 |  |  |  |  |  |  |  | 
| 721 |  |  |  |  |  |  | # skip empty lines and other comment lines | 
| 722 | 27 | 100 | 100 |  |  | 214 | next LINE if $line =~ $EMPTY_LINE | 
| 723 |  |  |  |  |  |  | || $line =~ $COMMENT_LINE; | 
| 724 | 11 | 100 |  |  |  | 38 | last LINE if $line =~ $END_LINE; | 
| 725 |  |  |  |  |  |  |  | 
| 726 |  |  |  |  |  |  | # parse stockholm seq | 
| 727 | 9 |  |  |  |  | 42 | my ($seq_id, $seq) = $line =~ $STK_SEQ; | 
| 728 |  |  |  |  |  |  |  | 
| 729 |  |  |  |  |  |  | # replace dots and gaps by stars | 
| 730 | 9 |  |  |  |  | 27 | $seq =~ s/[\.\-]/*/xmsg; | 
| 731 |  |  |  |  |  |  |  | 
| 732 |  |  |  |  |  |  | # elongate seq | 
| 733 | 9 |  |  |  |  | 45 | $seq_for{$seq_id} .= $seq; | 
| 734 |  |  |  |  |  |  | } | 
| 735 |  |  |  |  |  |  |  | 
| 736 |  |  |  |  |  |  | # populate Ali object | 
| 737 | 2 |  |  |  |  | 11 | while (my ($seq_id, $seq) = each %seq_for) { | 
| 738 | 9 |  |  |  |  | 409 | my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); | 
| 739 | 9 |  |  |  |  | 307 | $ali->add_seq($new_seq); | 
| 740 |  |  |  |  |  |  | } | 
| 741 |  |  |  |  |  |  |  | 
| 742 | 2 |  |  |  |  | 72 | return $ali; | 
| 743 |  |  |  |  |  |  | } | 
| 744 |  |  |  |  |  |  |  | 
| 745 |  |  |  |  |  |  |  | 
| 746 |  |  |  |  |  |  | sub instant_store { | 
| 747 | 1 |  |  | 1 | 1 | 5 | my $class   = shift; | 
| 748 | 1 |  |  |  |  | 3 | my $outfile = shift; | 
| 749 | 1 |  | 50 |  |  | 4 | my $args    = shift // {};          # HashRef (should not be empty...) | 
| 750 |  |  |  |  |  |  |  | 
| 751 | 1 |  |  |  |  | 4 | my $infile  = $args->{infile}; | 
| 752 | 1 | 50 |  |  |  | 5 | croak '[BMC] Error: no infile specified for instant_store; aborting!' | 
| 753 |  |  |  |  |  |  | unless $infile; | 
| 754 |  |  |  |  |  |  |  | 
| 755 | 1 |  |  |  |  | 8 | my $coderef = $args->{coderef}; | 
| 756 | 1 | 50 |  |  |  | 4 | croak '[BMC] Error: no coderef specified for instant_store; aborting!' | 
| 757 |  |  |  |  |  |  | unless $coderef; | 
| 758 |  |  |  |  |  |  |  | 
| 759 | 1 |  |  |  |  | 5 | open my $in,  '<', $infile; | 
| 760 | 1 |  |  |  |  | 270 | open my $out, '>', $outfile; | 
| 761 |  |  |  |  |  |  |  | 
| 762 | 1 |  |  |  |  | 273 | my $seq_id; | 
| 763 |  |  |  |  |  |  | my $seq; | 
| 764 |  |  |  |  |  |  |  | 
| 765 |  |  |  |  |  |  | LINE: | 
| 766 | 1 |  |  |  |  | 611 | while (my $line = <$in>) { | 
| 767 | 92 |  |  |  |  | 134 | chomp $line; | 
| 768 |  |  |  |  |  |  |  | 
| 769 |  |  |  |  |  |  | # skip empty lines and process comments | 
| 770 | 92 | 100 | 66 |  |  | 458 | next LINE if $line =~ $EMPTY_LINE | 
| 771 |  |  |  |  |  |  | || $line =~ $COMMENT_LINE; | 
| 772 |  |  |  |  |  |  |  | 
| 773 |  |  |  |  |  |  | # at each '>' char... | 
| 774 | 90 |  |  |  |  | 213 | my ($defline) = $line =~ $DEF_LINE; | 
| 775 | 90 | 100 |  |  |  | 159 | if ($defline) { | 
| 776 |  |  |  |  |  |  |  | 
| 777 |  |  |  |  |  |  | # process current seq (if any) | 
| 778 | 3 | 100 |  |  |  | 8 | if ($seq) { | 
| 779 | 2 |  |  |  |  | 74 | my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); | 
| 780 | 2 |  |  |  |  | 5 | print {$out} $coderef->($new_seq); | 
|  | 2 |  |  |  |  | 9 |  | 
| 781 | 2 |  |  |  |  | 74 | $seq = q{}; | 
| 782 |  |  |  |  |  |  | } | 
| 783 |  |  |  |  |  |  |  | 
| 784 | 3 |  |  |  |  | 6 | $seq_id = $defline; | 
| 785 | 3 |  |  |  |  | 18 | next LINE; | 
| 786 |  |  |  |  |  |  | } | 
| 787 |  |  |  |  |  |  |  | 
| 788 |  |  |  |  |  |  | # elongate current seq (seqs can be broken on several lines) | 
| 789 | 87 |  |  |  |  | 242 | $seq .= $line; | 
| 790 |  |  |  |  |  |  | } | 
| 791 |  |  |  |  |  |  |  | 
| 792 |  |  |  |  |  |  | # process last seq (if any) | 
| 793 | 1 | 50 |  |  |  | 6 | if ($seq) { | 
| 794 | 1 |  |  |  |  | 38 | my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); | 
| 795 | 1 |  |  |  |  | 3 | print {$out} $coderef->($new_seq); | 
|  | 1 |  |  |  |  | 5 |  | 
| 796 |  |  |  |  |  |  | } | 
| 797 |  |  |  |  |  |  |  | 
| 798 | 1 |  |  |  |  | 65 | return; | 
| 799 |  |  |  |  |  |  | } | 
| 800 |  |  |  |  |  |  |  | 
| 801 |  |  |  |  |  |  |  | 
| 802 |  |  |  |  |  |  | sub instant_count { | 
| 803 | 1 |  |  | 1 | 1 | 449 | my $class  = shift; | 
| 804 | 1 |  |  |  |  | 2 | my $infile = shift; | 
| 805 |  |  |  |  |  |  |  | 
| 806 | 1 |  |  |  |  | 2 | my $seq_n = 0; | 
| 807 |  |  |  |  |  |  |  | 
| 808 | 1 |  |  |  |  | 7 | open my $in, '<', $infile; | 
| 809 | 1 | 100 |  |  |  | 2865 | while (<$in>) { $seq_n++ if m/^>/xms } | 
|  | 502 |  |  |  |  | 2586 |  | 
| 810 |  |  |  |  |  |  |  | 
| 811 | 1 |  |  |  |  | 23 | return $seq_n; | 
| 812 |  |  |  |  |  |  | } | 
| 813 |  |  |  |  |  |  |  | 
| 814 |  |  |  |  |  |  | __PACKAGE__->meta->make_immutable; | 
| 815 |  |  |  |  |  |  | 1; | 
| 816 |  |  |  |  |  |  |  | 
| 817 |  |  |  |  |  |  | __END__ | 
| 818 |  |  |  |  |  |  |  | 
| 819 |  |  |  |  |  |  | =pod | 
| 820 |  |  |  |  |  |  |  | 
| 821 |  |  |  |  |  |  | =head1 NAME | 
| 822 |  |  |  |  |  |  |  | 
| 823 |  |  |  |  |  |  | Bio::MUST::Core::Ali - Multiple sequence alignment | 
| 824 |  |  |  |  |  |  |  | 
| 825 |  |  |  |  |  |  | =head1 VERSION | 
| 826 |  |  |  |  |  |  |  | 
| 827 |  |  |  |  |  |  | version 0.212530 | 
| 828 |  |  |  |  |  |  |  | 
| 829 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 830 |  |  |  |  |  |  |  | 
| 831 |  |  |  |  |  |  | #!/usr/bin/env perl | 
| 832 |  |  |  |  |  |  |  | 
| 833 |  |  |  |  |  |  | use Modern::Perl '2011'; | 
| 834 |  |  |  |  |  |  | # same as: | 
| 835 |  |  |  |  |  |  | # use strict; | 
| 836 |  |  |  |  |  |  | # use warnings; | 
| 837 |  |  |  |  |  |  | # use feature qw(say); | 
| 838 |  |  |  |  |  |  |  | 
| 839 |  |  |  |  |  |  | use Bio::MUST::Core; | 
| 840 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 841 |  |  |  |  |  |  |  | 
| 842 |  |  |  |  |  |  | # read Ali form disk | 
| 843 |  |  |  |  |  |  | my $ali = Ali->load('example.ali'); | 
| 844 |  |  |  |  |  |  |  | 
| 845 |  |  |  |  |  |  | # get some properties | 
| 846 |  |  |  |  |  |  | say 'height:  ' . $ali->height;         # number of seqs | 
| 847 |  |  |  |  |  |  | say 'width:   ' . $ali->width;          # number of sites | 
| 848 |  |  |  |  |  |  | say '% miss:  ' . $ali->perc_miss;      # fraction of missing chars (%) | 
| 849 |  |  |  |  |  |  | say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl'; | 
| 850 |  |  |  |  |  |  |  | 
| 851 |  |  |  |  |  |  | # turn seqs to uppercase | 
| 852 |  |  |  |  |  |  | $ali->uc_seqs; | 
| 853 |  |  |  |  |  |  |  | 
| 854 |  |  |  |  |  |  | # filter out seqs with no species associated | 
| 855 |  |  |  |  |  |  | my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } ); | 
| 856 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::IdList'; | 
| 857 |  |  |  |  |  |  | my $list = IdList->new( ids => \@seqs ); | 
| 858 |  |  |  |  |  |  | $ali->apply_list($list); | 
| 859 |  |  |  |  |  |  |  | 
| 860 |  |  |  |  |  |  | # alternatively: | 
| 861 |  |  |  |  |  |  | # $ali = Ali->new( seqs => \@seqs ); | 
| 862 |  |  |  |  |  |  |  | 
| 863 |  |  |  |  |  |  | # filter out gap-rich sites | 
| 864 |  |  |  |  |  |  | $ali->idealize(0.2);                    # min 20% non-gaps per site | 
| 865 |  |  |  |  |  |  |  | 
| 866 |  |  |  |  |  |  | # filter out non-informative sites | 
| 867 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::SeqMask'; | 
| 868 |  |  |  |  |  |  | my $mask = SeqMask->parsimony_mask($ali); | 
| 869 |  |  |  |  |  |  | $ali->apply_mask($mask); | 
| 870 |  |  |  |  |  |  |  | 
| 871 |  |  |  |  |  |  | # write down reduced Ali to disk | 
| 872 |  |  |  |  |  |  | $ali->store('example-uc-genus-sp-20.ali'); | 
| 873 |  |  |  |  |  |  | $ali->store_fasta('example-uc-genus-sp-20.fasta'); | 
| 874 |  |  |  |  |  |  |  | 
| 875 |  |  |  |  |  |  | # see below for additional methods | 
| 876 |  |  |  |  |  |  | # ... | 
| 877 |  |  |  |  |  |  |  | 
| 878 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 879 |  |  |  |  |  |  |  | 
| 880 |  |  |  |  |  |  | This module implements the multiple sequence alignment (MSA) class and its | 
| 881 |  |  |  |  |  |  | methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects. | 
| 882 |  |  |  |  |  |  | Consequently, sequence ids do not absolutely need to be unique for it to | 
| 883 |  |  |  |  |  |  | work (though id uniqueness helps a lot for sequence access and filtering). | 
| 884 |  |  |  |  |  |  |  | 
| 885 |  |  |  |  |  |  | An Ali knows whether it contains nucleotide or protein sequences, whether | 
| 886 |  |  |  |  |  |  | those are aligned or not, as well as its Seq count and exact width. All | 
| 887 |  |  |  |  |  |  | these properties are introspected on the fly, which means that, while they | 
| 888 |  |  |  |  |  |  | can be expensive to compute, they are always accurate. | 
| 889 |  |  |  |  |  |  |  | 
| 890 |  |  |  |  |  |  | This is important because an Ali provides methods for inserting, deleting | 
| 891 |  |  |  |  |  |  | and editing its Seq objects. Further, it does its best to maintain a semantic | 
| 892 |  |  |  |  |  |  | distinction between true gap states (encoded as '*') and missing regions of | 
| 893 |  |  |  |  |  |  | undetermined length (encoded as pure whitespace, for example at the end of a | 
| 894 |  |  |  |  |  |  | short sequence). | 
| 895 |  |  |  |  |  |  |  | 
| 896 |  |  |  |  |  |  | It also has methods for mapping its sequence ids (for example, before export | 
| 897 |  |  |  |  |  |  | to the PHYLIP format), as well as methods to selectively retain sequences | 
| 898 |  |  |  |  |  |  | and sites based on L<Bio::MUST::Core::IdList> and | 
| 899 |  |  |  |  |  |  | L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method | 
| 900 |  |  |  |  |  |  | discards shared gaps and optionally removes gap-rich sites only due to a | 
| 901 |  |  |  |  |  |  | tiny number of sequences. | 
| 902 |  |  |  |  |  |  |  | 
| 903 |  |  |  |  |  |  | Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles | 
| 904 |  |  |  |  |  |  | meaningful whitespace and allows comment lines in the header) or be | 
| 905 |  |  |  |  |  |  | imported/exported from/to several popular MSA formats, such as plain FASTA, | 
| 906 |  |  |  |  |  |  | STOCKHOLM and PHYLIP. | 
| 907 |  |  |  |  |  |  |  | 
| 908 |  |  |  |  |  |  | =head1 ATTRIBUTES | 
| 909 |  |  |  |  |  |  |  | 
| 910 |  |  |  |  |  |  | =head2 seqs | 
| 911 |  |  |  |  |  |  |  | 
| 912 |  |  |  |  |  |  | ArrayRef of L<Bio::MUST::Core::Seq> objects (optional) | 
| 913 |  |  |  |  |  |  |  | 
| 914 |  |  |  |  |  |  | Most of the accessor methods described below are implemented by delegation | 
| 915 |  |  |  |  |  |  | to this public attribute using L<Moose::Meta::Attribute::Native/Moose native | 
| 916 |  |  |  |  |  |  | delegation>. Their documentation thus heavily borrows from the corresponding | 
| 917 |  |  |  |  |  |  | help pages. | 
| 918 |  |  |  |  |  |  |  | 
| 919 |  |  |  |  |  |  | =head2 file | 
| 920 |  |  |  |  |  |  |  | 
| 921 |  |  |  |  |  |  | L<Path::Class::File> object (optional) | 
| 922 |  |  |  |  |  |  |  | 
| 923 |  |  |  |  |  |  | This optional attribute is initialized by class methods that C<load> an Ali | 
| 924 |  |  |  |  |  |  | from disk. It is meant to improve the introspection capabilities of the Ali. | 
| 925 |  |  |  |  |  |  | For now, this attribute is not used by the C<store> methods, though it might | 
| 926 |  |  |  |  |  |  | provide them with a default value in the future. | 
| 927 |  |  |  |  |  |  |  | 
| 928 |  |  |  |  |  |  | =head2 guessing | 
| 929 |  |  |  |  |  |  |  | 
| 930 |  |  |  |  |  |  | Boolean (optional) | 
| 931 |  |  |  |  |  |  |  | 
| 932 |  |  |  |  |  |  | By default, an Ali object tries to guess whether it is aligned or not by | 
| 933 |  |  |  |  |  |  | looking for gap-like characters in any of its Seq objects (see | 
| 934 |  |  |  |  |  |  | L<Bio::MUST::Core::Seq> for the exact test performed on each sequence). | 
| 935 |  |  |  |  |  |  |  | 
| 936 |  |  |  |  |  |  | When this smart behavior causes issues, one can disable it by unsetting this | 
| 937 |  |  |  |  |  |  | boolean attribute (see C<dont_guess> and C<guess> accessor methods). | 
| 938 |  |  |  |  |  |  |  | 
| 939 |  |  |  |  |  |  | =head2 comments | 
| 940 |  |  |  |  |  |  |  | 
| 941 |  |  |  |  |  |  | ArrayRef of strings (optional) | 
| 942 |  |  |  |  |  |  |  | 
| 943 |  |  |  |  |  |  | An Ali object is commentable, which means that it supports all the methods | 
| 944 |  |  |  |  |  |  | pertaining to comment lines described in | 
| 945 |  |  |  |  |  |  | L<Bio::MUST::Core::Roles::Commentable> (such as C<header>). | 
| 946 |  |  |  |  |  |  |  | 
| 947 |  |  |  |  |  |  | =head1 CONSTRUCTORS | 
| 948 |  |  |  |  |  |  |  | 
| 949 |  |  |  |  |  |  | =head2 new | 
| 950 |  |  |  |  |  |  |  | 
| 951 |  |  |  |  |  |  | Default constructor (class method) returning a new Ali. | 
| 952 |  |  |  |  |  |  |  | 
| 953 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 954 |  |  |  |  |  |  | my $ali1 = Ali->new(); | 
| 955 |  |  |  |  |  |  | my @seqs = $ali->all_seqs; | 
| 956 |  |  |  |  |  |  | my $ali2 = Ali->new( seqs => \@seqs ); | 
| 957 |  |  |  |  |  |  |  | 
| 958 |  |  |  |  |  |  | This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>, | 
| 959 |  |  |  |  |  |  | C<file>, C<guessing> and C<comments>. | 
| 960 |  |  |  |  |  |  |  | 
| 961 |  |  |  |  |  |  | =head2 clone | 
| 962 |  |  |  |  |  |  |  | 
| 963 |  |  |  |  |  |  | Creates a deep copy (a clone) of the Ali. Returns the copy. | 
| 964 |  |  |  |  |  |  |  | 
| 965 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 966 |  |  |  |  |  |  | my $ali = Ali->load('input.ali'); | 
| 967 |  |  |  |  |  |  | my $ali_copy = $ali->clone; | 
| 968 |  |  |  |  |  |  | # you can now mess with $ali_copy without affecting $ali | 
| 969 |  |  |  |  |  |  |  | 
| 970 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 971 |  |  |  |  |  |  |  | 
| 972 |  |  |  |  |  |  | =head1 ACCESSORS | 
| 973 |  |  |  |  |  |  |  | 
| 974 |  |  |  |  |  |  | =head2 add_seq | 
| 975 |  |  |  |  |  |  |  | 
| 976 |  |  |  |  |  |  | Adds one (or more) new sequence(s) at the end of the Ali. Returns the new | 
| 977 |  |  |  |  |  |  | number of sequences of the Ali. | 
| 978 |  |  |  |  |  |  |  | 
| 979 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Seq'; | 
| 980 |  |  |  |  |  |  | my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); | 
| 981 |  |  |  |  |  |  | $ali->add_seq($new_seq); | 
| 982 |  |  |  |  |  |  |  | 
| 983 |  |  |  |  |  |  | This method accepts any number of arguments. | 
| 984 |  |  |  |  |  |  |  | 
| 985 |  |  |  |  |  |  | =head2 get_seq | 
| 986 |  |  |  |  |  |  |  | 
| 987 |  |  |  |  |  |  | Returns a sequence of the Ali by its index. You can also use negative index | 
| 988 |  |  |  |  |  |  | numbers, just as with Perl's core array handling. If the specified sequence | 
| 989 |  |  |  |  |  |  | does not exist, this method will return C<undef>. | 
| 990 |  |  |  |  |  |  |  | 
| 991 |  |  |  |  |  |  | my $seq = $ali->get_seq($index); | 
| 992 |  |  |  |  |  |  | croak "Seq $index not found in Ali!" unless defined $seq; | 
| 993 |  |  |  |  |  |  |  | 
| 994 |  |  |  |  |  |  | This method accepts just one argument (and not an array slice). | 
| 995 |  |  |  |  |  |  |  | 
| 996 |  |  |  |  |  |  | =head2 get_seq_with_id | 
| 997 |  |  |  |  |  |  |  | 
| 998 |  |  |  |  |  |  | Returns a sequence of the Ali by its id. If the specified id is not unique, | 
| 999 |  |  |  |  |  |  | only the first matching sequence will be returned, whereas if no sequence | 
| 1000 |  |  |  |  |  |  | exists for the specified id, this method will return C<undef>. | 
| 1001 |  |  |  |  |  |  |  | 
| 1002 |  |  |  |  |  |  | my $id = 'Pyrus malus_3750@658052655'; | 
| 1003 |  |  |  |  |  |  | my $seq = $ali->get_seq_with_id($id); | 
| 1004 |  |  |  |  |  |  | croak "Seq $id not found in Ali!" unless defined $seq; | 
| 1005 |  |  |  |  |  |  |  | 
| 1006 |  |  |  |  |  |  | This method accepts just one argument. | 
| 1007 |  |  |  |  |  |  |  | 
| 1008 |  |  |  |  |  |  | =head2 set_seq | 
| 1009 |  |  |  |  |  |  |  | 
| 1010 |  |  |  |  |  |  | Given an index and a sequence, sets the specified Ali element to the | 
| 1011 |  |  |  |  |  |  | sequence. This method returns the new sequence at C<$index>. | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Seq'; | 
| 1014 |  |  |  |  |  |  | my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); | 
| 1015 |  |  |  |  |  |  | $ali->set_seq($index, $new_seq); | 
| 1016 |  |  |  |  |  |  |  | 
| 1017 |  |  |  |  |  |  | This method requires two arguments. | 
| 1018 |  |  |  |  |  |  |  | 
| 1019 |  |  |  |  |  |  | =head2 delete_seq | 
| 1020 |  |  |  |  |  |  |  | 
| 1021 |  |  |  |  |  |  | Removes the sequence at the given index from the Ali. This method returns | 
| 1022 |  |  |  |  |  |  | the deleted sequence. If the specified sequence does not exist, it will | 
| 1023 |  |  |  |  |  |  | return C<undef> instead. | 
| 1024 |  |  |  |  |  |  |  | 
| 1025 |  |  |  |  |  |  | $ali->delete_seq($index); | 
| 1026 |  |  |  |  |  |  |  | 
| 1027 |  |  |  |  |  |  | This method requires one argument. | 
| 1028 |  |  |  |  |  |  |  | 
| 1029 |  |  |  |  |  |  | =head2 insert_seq | 
| 1030 |  |  |  |  |  |  |  | 
| 1031 |  |  |  |  |  |  | Inserts a new sequence into the Ali at the given index. This method returns | 
| 1032 |  |  |  |  |  |  | the new sequence at C<$index>. | 
| 1033 |  |  |  |  |  |  |  | 
| 1034 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Seq'; | 
| 1035 |  |  |  |  |  |  | my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); | 
| 1036 |  |  |  |  |  |  | $ali->insert_seq($index, $new_seq); | 
| 1037 |  |  |  |  |  |  |  | 
| 1038 |  |  |  |  |  |  | This method requires two arguments. | 
| 1039 |  |  |  |  |  |  |  | 
| 1040 |  |  |  |  |  |  | =head2 all_seqs | 
| 1041 |  |  |  |  |  |  |  | 
| 1042 |  |  |  |  |  |  | Returns all the sequences of the Ali (not an array reference). | 
| 1043 |  |  |  |  |  |  |  | 
| 1044 |  |  |  |  |  |  | my @seqs = $ali->all_seqs; | 
| 1045 |  |  |  |  |  |  |  | 
| 1046 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1047 |  |  |  |  |  |  |  | 
| 1048 |  |  |  |  |  |  | =head2 first_seq | 
| 1049 |  |  |  |  |  |  |  | 
| 1050 |  |  |  |  |  |  | Returns the first sequence of the Ali matching a given criterion, just like | 
| 1051 |  |  |  |  |  |  | L<List::Util>'s C<first> function. This method requires a subroutine | 
| 1052 |  |  |  |  |  |  | implementing the matching logic. | 
| 1053 |  |  |  |  |  |  |  | 
| 1054 |  |  |  |  |  |  | # emulate get_seq_with_id method | 
| 1055 |  |  |  |  |  |  | my $id2find = 'seq13'; | 
| 1056 |  |  |  |  |  |  | my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } ); | 
| 1057 |  |  |  |  |  |  |  | 
| 1058 |  |  |  |  |  |  | This method requires a single argument. | 
| 1059 |  |  |  |  |  |  |  | 
| 1060 |  |  |  |  |  |  | =head2 filter_seqs | 
| 1061 |  |  |  |  |  |  |  | 
| 1062 |  |  |  |  |  |  | Returns every sequence of the Ali matching a given criterion, just like | 
| 1063 |  |  |  |  |  |  | Perl's core C<grep> function. This method requires a subroutine implementing | 
| 1064 |  |  |  |  |  |  | the matching logic. | 
| 1065 |  |  |  |  |  |  |  | 
| 1066 |  |  |  |  |  |  | # keep only long sequences (ignoring gaps and missing states) | 
| 1067 |  |  |  |  |  |  | my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } ); | 
| 1068 |  |  |  |  |  |  |  | 
| 1069 |  |  |  |  |  |  | This method requires a single argument. | 
| 1070 |  |  |  |  |  |  |  | 
| 1071 |  |  |  |  |  |  | =head2 all_new_seqs | 
| 1072 |  |  |  |  |  |  |  | 
| 1073 |  |  |  |  |  |  | Returns all the sequences of the Ali tagged as #NEW# (not an array reference). | 
| 1074 |  |  |  |  |  |  |  | 
| 1075 |  |  |  |  |  |  | my @new_seqs = $ali->all_new_seqs; | 
| 1076 |  |  |  |  |  |  |  | 
| 1077 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1078 |  |  |  |  |  |  |  | 
| 1079 |  |  |  |  |  |  | =head2 all_but_new_seqs | 
| 1080 |  |  |  |  |  |  |  | 
| 1081 |  |  |  |  |  |  | Returns all the sequences of the Ali except those tagged as #NEW# (not an array | 
| 1082 |  |  |  |  |  |  | reference). | 
| 1083 |  |  |  |  |  |  |  | 
| 1084 |  |  |  |  |  |  | my @preexisting_seqs = $ali->all_but_new_seqs; | 
| 1085 |  |  |  |  |  |  |  | 
| 1086 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1087 |  |  |  |  |  |  |  | 
| 1088 |  |  |  |  |  |  | =head2 all_seq_ids | 
| 1089 |  |  |  |  |  |  |  | 
| 1090 |  |  |  |  |  |  | Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali | 
| 1091 |  |  |  |  |  |  | (not an array reference). This is only a convenience method. | 
| 1092 |  |  |  |  |  |  |  | 
| 1093 |  |  |  |  |  |  | use Test::Deeply; | 
| 1094 |  |  |  |  |  |  | my @ids1 = $ali->all_seq_ids; | 
| 1095 |  |  |  |  |  |  | my @ids2 = map { $_->seq_id } $ali->all_seqs; | 
| 1096 |  |  |  |  |  |  | is_deeply \@ids1, \@ids2, 'should be true'; | 
| 1097 |  |  |  |  |  |  | my @orgs = map { $_->org } @ids1; | 
| 1098 |  |  |  |  |  |  |  | 
| 1099 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1100 |  |  |  |  |  |  |  | 
| 1101 |  |  |  |  |  |  | =head2 filename | 
| 1102 |  |  |  |  |  |  |  | 
| 1103 |  |  |  |  |  |  | Returns the stringified filename of the Ali. | 
| 1104 |  |  |  |  |  |  |  | 
| 1105 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1106 |  |  |  |  |  |  |  | 
| 1107 |  |  |  |  |  |  | =head2 guess | 
| 1108 |  |  |  |  |  |  |  | 
| 1109 |  |  |  |  |  |  | Turn on the smart detection of gaps (see C<guessing> attribute above). | 
| 1110 |  |  |  |  |  |  |  | 
| 1111 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1112 |  |  |  |  |  |  |  | 
| 1113 |  |  |  |  |  |  | =head2 dont_guess | 
| 1114 |  |  |  |  |  |  |  | 
| 1115 |  |  |  |  |  |  | Turn off the smart detection of gaps (see C<guessing> attribute above). | 
| 1116 |  |  |  |  |  |  |  | 
| 1117 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1118 |  |  |  |  |  |  | my $ali = Ali->load('ensembl.fasta'); | 
| 1119 |  |  |  |  |  |  | $ali->dont_guess; | 
| 1120 |  |  |  |  |  |  |  | 
| 1121 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1122 |  |  |  |  |  |  |  | 
| 1123 |  |  |  |  |  |  | =head1 PROPERTIES | 
| 1124 |  |  |  |  |  |  |  | 
| 1125 |  |  |  |  |  |  | =head2 has_uniq_ids | 
| 1126 |  |  |  |  |  |  |  | 
| 1127 |  |  |  |  |  |  | Returns true if all the sequence ids are unique. | 
| 1128 |  |  |  |  |  |  |  | 
| 1129 |  |  |  |  |  |  | carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids; | 
| 1130 |  |  |  |  |  |  |  | 
| 1131 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1132 |  |  |  |  |  |  |  | 
| 1133 |  |  |  |  |  |  | =head2 is_protein | 
| 1134 |  |  |  |  |  |  |  | 
| 1135 |  |  |  |  |  |  | Returns true if any sequence of the Ali looks like a protein. See | 
| 1136 |  |  |  |  |  |  | L<Bio::MUST::Core::Seq> for the exact test performed on each sequence. | 
| 1137 |  |  |  |  |  |  |  | 
| 1138 |  |  |  |  |  |  | say 'Your file includes nucleotide sequences' unless $ali->is_protein; | 
| 1139 |  |  |  |  |  |  |  | 
| 1140 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1141 |  |  |  |  |  |  |  | 
| 1142 |  |  |  |  |  |  | =head2 is_aligned | 
| 1143 |  |  |  |  |  |  |  | 
| 1144 |  |  |  |  |  |  | Returns true if any sequence of the Ali appears to be aligned. See | 
| 1145 |  |  |  |  |  |  | L<Bio::MUST::Core::Seq> for the exact test performed on each sequence. | 
| 1146 |  |  |  |  |  |  |  | 
| 1147 |  |  |  |  |  |  | If the boolean attribute guessing is not set, always returns false. | 
| 1148 |  |  |  |  |  |  |  | 
| 1149 |  |  |  |  |  |  | carp 'Warning: file does not look aligned!' unless $ali->is_aligned; | 
| 1150 |  |  |  |  |  |  |  | 
| 1151 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1152 |  |  |  |  |  |  |  | 
| 1153 |  |  |  |  |  |  | =head2 count_seqs | 
| 1154 |  |  |  |  |  |  |  | 
| 1155 |  |  |  |  |  |  | Returns the number of sequences of the Ali. The alias method C<height> is | 
| 1156 |  |  |  |  |  |  | provided for convenience. | 
| 1157 |  |  |  |  |  |  |  | 
| 1158 |  |  |  |  |  |  | my $height = $ali->count_seqs; | 
| 1159 |  |  |  |  |  |  |  | 
| 1160 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1161 |  |  |  |  |  |  |  | 
| 1162 |  |  |  |  |  |  | =head2 width | 
| 1163 |  |  |  |  |  |  |  | 
| 1164 |  |  |  |  |  |  | Returns the width of the Ali (in characters). If the Ali is not aligned, | 
| 1165 |  |  |  |  |  |  | returns the length of the longest sequence instead. | 
| 1166 |  |  |  |  |  |  |  | 
| 1167 |  |  |  |  |  |  | To avoid potential bugs due to caching, this method dynamically computes the | 
| 1168 |  |  |  |  |  |  | Ali width at each call. Moreover, the Ali is always uniformized (see below) | 
| 1169 |  |  |  |  |  |  | beforehands to ensure accurate width value. Therefore, this method is | 
| 1170 |  |  |  |  |  |  | expensive and should not be called repeatedly (e.g., in a loop condition). | 
| 1171 |  |  |  |  |  |  |  | 
| 1172 |  |  |  |  |  |  | # you'd better looping through sites like this... | 
| 1173 |  |  |  |  |  |  | my $width = $ali->width; | 
| 1174 |  |  |  |  |  |  | for my $site (0..$width-1) { | 
| 1175 |  |  |  |  |  |  | ... | 
| 1176 |  |  |  |  |  |  | } | 
| 1177 |  |  |  |  |  |  |  | 
| 1178 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1179 |  |  |  |  |  |  |  | 
| 1180 |  |  |  |  |  |  | =head2 seq_len_stats | 
| 1181 |  |  |  |  |  |  |  | 
| 1182 |  |  |  |  |  |  | Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps). | 
| 1183 |  |  |  |  |  |  | The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max). | 
| 1184 |  |  |  |  |  |  |  | 
| 1185 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1186 |  |  |  |  |  |  |  | 
| 1187 |  |  |  |  |  |  | =head2 perc_miss | 
| 1188 |  |  |  |  |  |  |  | 
| 1189 |  |  |  |  |  |  | Returns the percentage of missing (and gap-like) character states in the Ali. | 
| 1190 |  |  |  |  |  |  |  | 
| 1191 |  |  |  |  |  |  | As this method internally calls C<Ali::width>, the remarks above also apply. | 
| 1192 |  |  |  |  |  |  |  | 
| 1193 |  |  |  |  |  |  | my $miss_level = $ali->perc_miss; | 
| 1194 |  |  |  |  |  |  |  | 
| 1195 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1196 |  |  |  |  |  |  |  | 
| 1197 |  |  |  |  |  |  | =head1 MUTATORS | 
| 1198 |  |  |  |  |  |  |  | 
| 1199 |  |  |  |  |  |  | =head2 uc_seqs | 
| 1200 |  |  |  |  |  |  |  | 
| 1201 |  |  |  |  |  |  | Turn all the sequences of the Ali to uppercase and returns it. | 
| 1202 |  |  |  |  |  |  |  | 
| 1203 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1204 |  |  |  |  |  |  |  | 
| 1205 |  |  |  |  |  |  | =head2 recode_seqs | 
| 1206 |  |  |  |  |  |  |  | 
| 1207 |  |  |  |  |  |  | Recode all the sequences of the Ali and returns it. | 
| 1208 |  |  |  |  |  |  |  | 
| 1209 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1210 |  |  |  |  |  |  | my $ali = Ali->load('biased.ali'); | 
| 1211 |  |  |  |  |  |  |  | 
| 1212 |  |  |  |  |  |  | # set up RY recoding for suppressing codon bias | 
| 1213 |  |  |  |  |  |  | my %base_for = ( | 
| 1214 |  |  |  |  |  |  | A => 'A',   G => 'A',       # purines | 
| 1215 |  |  |  |  |  |  | C => 'C',   T => 'C',       # pyrimidines | 
| 1216 |  |  |  |  |  |  | ); | 
| 1217 |  |  |  |  |  |  |  | 
| 1218 |  |  |  |  |  |  | my $ali_rec = $ali->recode_seqs( \%base_for ); | 
| 1219 |  |  |  |  |  |  | $ali_rec->store('biased_ry.ali'); | 
| 1220 |  |  |  |  |  |  |  | 
| 1221 |  |  |  |  |  |  | This method requires one argument. | 
| 1222 |  |  |  |  |  |  |  | 
| 1223 |  |  |  |  |  |  | =head2 degap_seqs | 
| 1224 |  |  |  |  |  |  |  | 
| 1225 |  |  |  |  |  |  | Remove the gaps in all the sequences of the Ali and returns it. | 
| 1226 |  |  |  |  |  |  |  | 
| 1227 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1228 |  |  |  |  |  |  |  | 
| 1229 |  |  |  |  |  |  | =head2 spacify_seqs | 
| 1230 |  |  |  |  |  |  |  | 
| 1231 |  |  |  |  |  |  | Spacifies all the sequences of the Ali and returns it. See the corresponding | 
| 1232 |  |  |  |  |  |  | method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning | 
| 1233 |  |  |  |  |  |  | operation. | 
| 1234 |  |  |  |  |  |  |  | 
| 1235 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1236 |  |  |  |  |  |  |  | 
| 1237 |  |  |  |  |  |  | =head2 gapify_seqs | 
| 1238 |  |  |  |  |  |  |  | 
| 1239 |  |  |  |  |  |  | Gapifies all the sequences of the Ali and returns it. See the corresponding | 
| 1240 |  |  |  |  |  |  | method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning | 
| 1241 |  |  |  |  |  |  | operation. | 
| 1242 |  |  |  |  |  |  |  | 
| 1243 |  |  |  |  |  |  | This method accepts an optional argument. | 
| 1244 |  |  |  |  |  |  |  | 
| 1245 |  |  |  |  |  |  | =head2 trim_seqs | 
| 1246 |  |  |  |  |  |  |  | 
| 1247 |  |  |  |  |  |  | Trims all the sequences of the Ali and returns it. See the corresponding | 
| 1248 |  |  |  |  |  |  | method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning | 
| 1249 |  |  |  |  |  |  | operation. | 
| 1250 |  |  |  |  |  |  |  | 
| 1251 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1252 |  |  |  |  |  |  |  | 
| 1253 |  |  |  |  |  |  | =head2 pad_seqs | 
| 1254 |  |  |  |  |  |  |  | 
| 1255 |  |  |  |  |  |  | Pads all the sequences of the Ali and returns it. See the corresponding | 
| 1256 |  |  |  |  |  |  | method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning | 
| 1257 |  |  |  |  |  |  | operation. | 
| 1258 |  |  |  |  |  |  |  | 
| 1259 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1260 |  |  |  |  |  |  |  | 
| 1261 |  |  |  |  |  |  | =head2 uniformize | 
| 1262 |  |  |  |  |  |  |  | 
| 1263 |  |  |  |  |  |  | Performs the three gap-cleaning operations in turn on all the sequences of | 
| 1264 |  |  |  |  |  |  | the Ali and returns it, which ensures that it is semantically clean and | 
| 1265 |  |  |  |  |  |  | rectangular. | 
| 1266 |  |  |  |  |  |  |  | 
| 1267 |  |  |  |  |  |  | This is only a convenience method called internally by the Ali object before | 
| 1268 |  |  |  |  |  |  | selected methods (such as storage-like methods). However, it might prove | 
| 1269 |  |  |  |  |  |  | useful in some circumstances, hence it is not defined as private. | 
| 1270 |  |  |  |  |  |  |  | 
| 1271 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1272 |  |  |  |  |  |  | my $ali = Ali->load('input.ali'); | 
| 1273 |  |  |  |  |  |  | $ali->add_seq(...); | 
| 1274 |  |  |  |  |  |  | # more editing of the Ali sequences | 
| 1275 |  |  |  |  |  |  | $ali->uniformize; | 
| 1276 |  |  |  |  |  |  |  | 
| 1277 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1278 |  |  |  |  |  |  |  | 
| 1279 |  |  |  |  |  |  | =head2 clear_new_tags | 
| 1280 |  |  |  |  |  |  |  | 
| 1281 |  |  |  |  |  |  | Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it. | 
| 1282 |  |  |  |  |  |  |  | 
| 1283 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1284 |  |  |  |  |  |  | my $ali = Ali->load('input-42.ali'); | 
| 1285 |  |  |  |  |  |  | $ali->clear_new_tags; | 
| 1286 |  |  |  |  |  |  | my @new_seqs = $ali->all_new_seqs; | 
| 1287 |  |  |  |  |  |  | # array should be empty | 
| 1288 |  |  |  |  |  |  |  | 
| 1289 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1290 |  |  |  |  |  |  |  | 
| 1291 |  |  |  |  |  |  | =head2 shorten_ids | 
| 1292 |  |  |  |  |  |  |  | 
| 1293 |  |  |  |  |  |  | Replaces all the sequence ids of the Ali by their abbreviated forms as | 
| 1294 |  |  |  |  |  |  | specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali. | 
| 1295 |  |  |  |  |  |  |  | 
| 1296 |  |  |  |  |  |  | Note that this method will work only if the sequence ids have not been | 
| 1297 |  |  |  |  |  |  | already shortened or modified in any way since the creation of the IdMapper. | 
| 1298 |  |  |  |  |  |  | Long ids without abbreviated forms in the IdMapper are left untouched. | 
| 1299 |  |  |  |  |  |  |  | 
| 1300 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1301 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::IdMapper'; | 
| 1302 |  |  |  |  |  |  | my $ali = Ali->load('input.ali'); | 
| 1303 |  |  |  |  |  |  | my $mapper = IdMapper->std_mapper($ali, 'lcl|seq'); | 
| 1304 |  |  |  |  |  |  | $ali->shorten_ids($mapper); | 
| 1305 |  |  |  |  |  |  | $ali->store_fasta('input.4blast.fasta'); | 
| 1306 |  |  |  |  |  |  | # makeblastdb | 
| 1307 |  |  |  |  |  |  |  | 
| 1308 |  |  |  |  |  |  | # Note: the temp_fasta method does exactly that | 
| 1309 |  |  |  |  |  |  |  | 
| 1310 |  |  |  |  |  |  | This method requires one argument. | 
| 1311 |  |  |  |  |  |  |  | 
| 1312 |  |  |  |  |  |  | =head2 restore_ids | 
| 1313 |  |  |  |  |  |  |  | 
| 1314 |  |  |  |  |  |  | Replaces all the sequence ids of the Ali by their long forms as specified by | 
| 1315 |  |  |  |  |  |  | the passed L<Bio::MUST::Core::IdMapper> and returns the Ali. | 
| 1316 |  |  |  |  |  |  |  | 
| 1317 |  |  |  |  |  |  | Note that this method will work only if the sequence ids have been previously | 
| 1318 |  |  |  |  |  |  | abbreviated (see above) and have not been modified in any way since then. | 
| 1319 |  |  |  |  |  |  | Again, abbreviated ids without long forms in the IdMapper are left untouched. | 
| 1320 |  |  |  |  |  |  |  | 
| 1321 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::IdMapper'; | 
| 1322 |  |  |  |  |  |  | my $mapper = IdMapper->gi_mapper($ali); | 
| 1323 |  |  |  |  |  |  | $ali->shorten_ids($mapper); | 
| 1324 |  |  |  |  |  |  | ... | 
| 1325 |  |  |  |  |  |  | $ali->restore_ids($mapper); | 
| 1326 |  |  |  |  |  |  |  | 
| 1327 |  |  |  |  |  |  | This method requires one argument. | 
| 1328 |  |  |  |  |  |  |  | 
| 1329 |  |  |  |  |  |  | =head2 apply_list | 
| 1330 |  |  |  |  |  |  |  | 
| 1331 |  |  |  |  |  |  | Selectively retains or discards sequences from the Ali based on the content | 
| 1332 |  |  |  |  |  |  | of the passed L<Bio::MUST::Core::IdList> object and returns the Ali. | 
| 1333 |  |  |  |  |  |  |  | 
| 1334 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::IdList'; | 
| 1335 |  |  |  |  |  |  | my $list = IdList->load('interesting_seqs.idl'); | 
| 1336 |  |  |  |  |  |  | $ali->apply_list($list);                # discard non-interesting seqs | 
| 1337 |  |  |  |  |  |  |  | 
| 1338 |  |  |  |  |  |  | This method requires one argument. | 
| 1339 |  |  |  |  |  |  |  | 
| 1340 |  |  |  |  |  |  | =head2 apply_mask | 
| 1341 |  |  |  |  |  |  |  | 
| 1342 |  |  |  |  |  |  | Selectively retains or discards sites from the Ali based on the content of | 
| 1343 |  |  |  |  |  |  | the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali. | 
| 1344 |  |  |  |  |  |  |  | 
| 1345 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::SeqMask'; | 
| 1346 |  |  |  |  |  |  | my $variable_sites = SeqMask->variable_mask($ali); | 
| 1347 |  |  |  |  |  |  | $ali->apply_mask($variable_sites);      # discard constant sites | 
| 1348 |  |  |  |  |  |  |  | 
| 1349 |  |  |  |  |  |  | This method requires one argument. | 
| 1350 |  |  |  |  |  |  |  | 
| 1351 |  |  |  |  |  |  | =head2 idealize | 
| 1352 |  |  |  |  |  |  |  | 
| 1353 |  |  |  |  |  |  | Computes and applies an ideal sequence mask to the Ali and returns it. This | 
| 1354 |  |  |  |  |  |  | is only a convenience method. | 
| 1355 |  |  |  |  |  |  |  | 
| 1356 |  |  |  |  |  |  | When invoked without arguments, it will discard the gaps that are | 
| 1357 |  |  |  |  |  |  | universally shared by all the sequences. Otherwise, the provided argument | 
| 1358 |  |  |  |  |  |  | corresponds to the threshold of the C<ideal_mask> method described in | 
| 1359 |  |  |  |  |  |  | L<Bio::MUST::Core::SeqMask>. | 
| 1360 |  |  |  |  |  |  |  | 
| 1361 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::IdList'; | 
| 1362 |  |  |  |  |  |  | my $fast_seqs = IdList->load('fast_evolving_seqs.idl'); | 
| 1363 |  |  |  |  |  |  | my $seqs2keep = $fast_seqs->negative_list($ali); | 
| 1364 |  |  |  |  |  |  | $ali->apply_list($seqs2keep);           # discard fast-evolving seqs | 
| 1365 |  |  |  |  |  |  | $ali->idealize;      # discard newly shared gaps caused by fast seqs | 
| 1366 |  |  |  |  |  |  |  | 
| 1367 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1368 |  |  |  |  |  |  | my $ali = Ali->load('hmm_based.ali'); | 
| 1369 |  |  |  |  |  |  | $ali->idealize(0.05);    # discard insertions due to <5% of the seqs | 
| 1370 |  |  |  |  |  |  |  | 
| 1371 |  |  |  |  |  |  | This method accepts an optional argument. | 
| 1372 |  |  |  |  |  |  |  | 
| 1373 |  |  |  |  |  |  | =head1 MISC METHODS | 
| 1374 |  |  |  |  |  |  |  | 
| 1375 |  |  |  |  |  |  | =head2 gapmiss_regex | 
| 1376 |  |  |  |  |  |  |  | 
| 1377 |  |  |  |  |  |  | Returns a regular expression matching gaps and ambiguous or missing states. | 
| 1378 |  |  |  |  |  |  | The exact regex returned depends on the type of sequences in the Ali (nucl. or | 
| 1379 |  |  |  |  |  |  | proteins). | 
| 1380 |  |  |  |  |  |  |  | 
| 1381 |  |  |  |  |  |  | my $regex = $ali->gapmiss_regex; | 
| 1382 |  |  |  |  |  |  | my $first_seq = $ali->get_seq(0)->seq; | 
| 1383 |  |  |  |  |  |  | my $gapmiss_n = $first_seq =~ m/($regex)/xmsg; | 
| 1384 |  |  |  |  |  |  | say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites"; | 
| 1385 |  |  |  |  |  |  |  | 
| 1386 |  |  |  |  |  |  | This method does not accept any arguments. | 
| 1387 |  |  |  |  |  |  |  | 
| 1388 |  |  |  |  |  |  | =head2 map_coords | 
| 1389 |  |  |  |  |  |  |  | 
| 1390 |  |  |  |  |  |  | Converts a set of site positions from Ali coordinates to coordinates of the | 
| 1391 |  |  |  |  |  |  | specified sequence (thereby ignoring positions due to gaps). Returns the | 
| 1392 |  |  |  |  |  |  | converted sites in sequence coordinates as an array refrence. | 
| 1393 |  |  |  |  |  |  |  | 
| 1394 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1395 |  |  |  |  |  |  | my $ali = Ali->load('input.ali'); | 
| 1396 |  |  |  |  |  |  | my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125'; | 
| 1397 |  |  |  |  |  |  | my $ali_coords = [ 4, 25, 73, 89, 104, 116 ]; | 
| 1398 |  |  |  |  |  |  | my $seq_coords = $ali->map_coords($id, $ali_coords); | 
| 1399 |  |  |  |  |  |  | # $seq_coords is [ 3, 23, 59, 71,  71,  74 ] | 
| 1400 |  |  |  |  |  |  |  | 
| 1401 |  |  |  |  |  |  | This method requires two arguments: the id of a sequence and an array | 
| 1402 |  |  |  |  |  |  | reference of input sites in Ali coordinates. | 
| 1403 |  |  |  |  |  |  |  | 
| 1404 |  |  |  |  |  |  | =head1 I/O METHODS | 
| 1405 |  |  |  |  |  |  |  | 
| 1406 |  |  |  |  |  |  | =head2 load | 
| 1407 |  |  |  |  |  |  |  | 
| 1408 |  |  |  |  |  |  | Class method (constructor) returning a new Ali read from disk. This method | 
| 1409 |  |  |  |  |  |  | will transparently import plain FASTA files in addition to the MUST | 
| 1410 |  |  |  |  |  |  | pseudo-FASTA format (ALI files). | 
| 1411 |  |  |  |  |  |  |  | 
| 1412 |  |  |  |  |  |  | use Test::Deeply; | 
| 1413 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1414 |  |  |  |  |  |  | my $ali1 = Ali->load('example.ali'); | 
| 1415 |  |  |  |  |  |  | my $ali2 = Ali->load('example.fasta'); | 
| 1416 |  |  |  |  |  |  | my @seqs1 = $ali1->all_seqs; | 
| 1417 |  |  |  |  |  |  | my @seqs2 = $ali2->all_seqs; | 
| 1418 |  |  |  |  |  |  | is_deeply, \@seqs1, \@seqs2, 'should be true'; | 
| 1419 |  |  |  |  |  |  |  | 
| 1420 |  |  |  |  |  |  | This method requires one argument. | 
| 1421 |  |  |  |  |  |  |  | 
| 1422 |  |  |  |  |  |  | =head2 store | 
| 1423 |  |  |  |  |  |  |  | 
| 1424 |  |  |  |  |  |  | Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files). | 
| 1425 |  |  |  |  |  |  |  | 
| 1426 |  |  |  |  |  |  | Note that the ALI format is only used when the suffix of the outfile name is | 
| 1427 |  |  |  |  |  |  | '.ali'. In all other cases (including lack of suffix), this method | 
| 1428 |  |  |  |  |  |  | automatically forwards the call to C<store_fasta>. | 
| 1429 |  |  |  |  |  |  |  | 
| 1430 |  |  |  |  |  |  | $ali->store('output.ali'); | 
| 1431 |  |  |  |  |  |  | # output.ali is written in ALI format | 
| 1432 |  |  |  |  |  |  | $ali->store('output.fasta'); | 
| 1433 |  |  |  |  |  |  | # output.fasta is written in FASTA format | 
| 1434 |  |  |  |  |  |  |  | 
| 1435 |  |  |  |  |  |  | This method requires one argument (but see C<store_fasta> in case of automatic | 
| 1436 |  |  |  |  |  |  | forwarding of the method call). | 
| 1437 |  |  |  |  |  |  |  | 
| 1438 |  |  |  |  |  |  | =head2 store_fasta | 
| 1439 |  |  |  |  |  |  |  | 
| 1440 |  |  |  |  |  |  | Writes the Ali to disk in the plain FASTA format. | 
| 1441 |  |  |  |  |  |  |  | 
| 1442 |  |  |  |  |  |  | For compatibility purposes, this method automatically fetches sequence ids | 
| 1443 |  |  |  |  |  |  | using the C<foreign_id> method instead of the native C<full_id> method, both | 
| 1444 |  |  |  |  |  |  | described in L<Bio::MUST::Core::SeqId>. | 
| 1445 |  |  |  |  |  |  |  | 
| 1446 |  |  |  |  |  |  | $ali->store_fasta( 'output.fasta' ); | 
| 1447 |  |  |  |  |  |  | $ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} ); | 
| 1448 |  |  |  |  |  |  |  | 
| 1449 |  |  |  |  |  |  | This method requires one argument and accepts a second optional argument | 
| 1450 |  |  |  |  |  |  | controlling the output format. It is a hash reference that may contain one | 
| 1451 |  |  |  |  |  |  | or more of the following keys: | 
| 1452 |  |  |  |  |  |  |  | 
| 1453 |  |  |  |  |  |  | - clean: replace all ambiguous and missing states by C<X> (default: false) | 
| 1454 |  |  |  |  |  |  | - degap: boolean value controlling degapping (default: false) | 
| 1455 |  |  |  |  |  |  | - chunk: line width (default is 60 chars; negative values means no wrap) | 
| 1456 |  |  |  |  |  |  |  | 
| 1457 |  |  |  |  |  |  | Finally, it is possible to fine-tune the behavior of the C<clean> option by | 
| 1458 |  |  |  |  |  |  | providing another character than C<X> through the C<gapify> key. This can be | 
| 1459 |  |  |  |  |  |  | useful to replace all ambiguous and missing states by gaps, as shown below: | 
| 1460 |  |  |  |  |  |  |  | 
| 1461 |  |  |  |  |  |  | $ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } ); | 
| 1462 |  |  |  |  |  |  |  | 
| 1463 |  |  |  |  |  |  | =head2 temp_fasta | 
| 1464 |  |  |  |  |  |  |  | 
| 1465 |  |  |  |  |  |  | Writes a temporary copy of the Ali to disk in the plain FASTA format using | 
| 1466 |  |  |  |  |  |  | numeric sequence ids and returns the name of the temporary file. This is | 
| 1467 |  |  |  |  |  |  | only a convenience method. | 
| 1468 |  |  |  |  |  |  |  | 
| 1469 |  |  |  |  |  |  | In list context, returns the IdMapper object along with temporary filename. | 
| 1470 |  |  |  |  |  |  |  | 
| 1471 |  |  |  |  |  |  | my $infile = $ali->temp_fasta( { degap => 1 } ); | 
| 1472 |  |  |  |  |  |  | my $output = `script.sh $infile`; | 
| 1473 |  |  |  |  |  |  | ... | 
| 1474 |  |  |  |  |  |  |  | 
| 1475 |  |  |  |  |  |  | This method accepts the same optional argument hash as C<store_fasta>. | 
| 1476 |  |  |  |  |  |  | However, an additional option (C<id_prefix>) is available to control the way | 
| 1477 |  |  |  |  |  |  | abbreviated sequence ids are prefixed by the C<std_mapper> method (see | 
| 1478 |  |  |  |  |  |  | L<Bio::MUST::Core::Listable>). | 
| 1479 |  |  |  |  |  |  |  | 
| 1480 |  |  |  |  |  |  | my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } ); | 
| 1481 |  |  |  |  |  |  | my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } ); | 
| 1482 |  |  |  |  |  |  |  | 
| 1483 |  |  |  |  |  |  | =head2 load_phylip | 
| 1484 |  |  |  |  |  |  |  | 
| 1485 |  |  |  |  |  |  | Class method (constructor) returning a new Ali read from a file in the | 
| 1486 |  |  |  |  |  |  | PHYLIP format. Both sequential and interleaved formats are supported. | 
| 1487 |  |  |  |  |  |  |  | 
| 1488 |  |  |  |  |  |  | The only constraint is that sequence ids MUST NOT contain whitespace and be | 
| 1489 |  |  |  |  |  |  | followed by at least one whitespace character. This means that some old-school | 
| 1490 |  |  |  |  |  |  | PHYLIP files not following this convention will not be processed correctly. | 
| 1491 |  |  |  |  |  |  |  | 
| 1492 |  |  |  |  |  |  | When using the interleaved format, sequence ids may or may not be repeated in | 
| 1493 |  |  |  |  |  |  | each block. | 
| 1494 |  |  |  |  |  |  |  | 
| 1495 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1496 |  |  |  |  |  |  | my $ali = Ali->load_phylip('phylip.phy'); | 
| 1497 |  |  |  |  |  |  | say $ali->count_seqs; | 
| 1498 |  |  |  |  |  |  | say $ali->width; | 
| 1499 |  |  |  |  |  |  |  | 
| 1500 |  |  |  |  |  |  | # outputs: | 
| 1501 |  |  |  |  |  |  | # 10 | 
| 1502 |  |  |  |  |  |  | # 709 | 
| 1503 |  |  |  |  |  |  |  | 
| 1504 |  |  |  |  |  |  | This method requires one argument. | 
| 1505 |  |  |  |  |  |  |  | 
| 1506 |  |  |  |  |  |  | =head2 store_phylip | 
| 1507 |  |  |  |  |  |  |  | 
| 1508 |  |  |  |  |  |  | Writes the Ali to disk in the interleaved (or sequential) PHYLIP format. | 
| 1509 |  |  |  |  |  |  |  | 
| 1510 |  |  |  |  |  |  | To ensure maximal flexibility, this method fetches sequence ids using the | 
| 1511 |  |  |  |  |  |  | native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but | 
| 1512 |  |  |  |  |  |  | truncates them to 10 characters, as expected by the original PHYLIP software | 
| 1513 |  |  |  |  |  |  | package. No other tinkering is carried out on the ids. Thus, if the ids | 
| 1514 |  |  |  |  |  |  | contain whitespace or are not unique in their 10 first characters, it is | 
| 1515 |  |  |  |  |  |  | advised to first map them using one of the constructors in | 
| 1516 |  |  |  |  |  |  | L<Bio::MUST::Core::IdMapper>. | 
| 1517 |  |  |  |  |  |  |  | 
| 1518 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1519 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::IdMapper'; | 
| 1520 |  |  |  |  |  |  | my $ali = Ali->load('input.ali'); | 
| 1521 |  |  |  |  |  |  | my $mapper = IdMapper->std_mapper($ali); | 
| 1522 |  |  |  |  |  |  | $ali->shorten_ids($mapper); | 
| 1523 |  |  |  |  |  |  | $ali->store_phylip( 'input.phy', { chunk => 50 } ); | 
| 1524 |  |  |  |  |  |  |  | 
| 1525 |  |  |  |  |  |  | This method requires one argument and accepts a second optional argument | 
| 1526 |  |  |  |  |  |  | controlling the output format. It is a hash reference that may contain one | 
| 1527 |  |  |  |  |  |  | or more of the following keys: | 
| 1528 |  |  |  |  |  |  |  | 
| 1529 |  |  |  |  |  |  | - short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes) | 
| 1530 |  |  |  |  |  |  | - clean: replace all ambiguous and missing states by 'X' (default: false) | 
| 1531 |  |  |  |  |  |  | - chunk: line width (default: 60 chars; negative values means no wrap) | 
| 1532 |  |  |  |  |  |  |  | 
| 1533 |  |  |  |  |  |  | To store the Ali in PHYLIP sequential format, specify a negative chunk (-1). | 
| 1534 |  |  |  |  |  |  |  | 
| 1535 |  |  |  |  |  |  | =head2 load_stockholm | 
| 1536 |  |  |  |  |  |  |  | 
| 1537 |  |  |  |  |  |  | Class method (constructor) returning a new Ali read from a file in the | 
| 1538 |  |  |  |  |  |  | STOCKHOLM format. =GF comments are retained (see above) but not the other | 
| 1539 |  |  |  |  |  |  | comment classes (=GS, =GR and =GC). | 
| 1540 |  |  |  |  |  |  |  | 
| 1541 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1542 |  |  |  |  |  |  | my $ali = Ali->load('upsk.stockholm'); | 
| 1543 |  |  |  |  |  |  | say $ali->header; | 
| 1544 |  |  |  |  |  |  |  | 
| 1545 |  |  |  |  |  |  | # outputs: | 
| 1546 |  |  |  |  |  |  | # ID    UPSK | 
| 1547 |  |  |  |  |  |  | # SE    Predicted; Infernal | 
| 1548 |  |  |  |  |  |  | # SS    Published; PMID 9223489 | 
| 1549 |  |  |  |  |  |  | # RN    [1] | 
| 1550 |  |  |  |  |  |  | # RM    9223489 | 
| 1551 |  |  |  |  |  |  | # RT    The role of the pseudoknot at the 3' end of turnip yellow mosaic | 
| 1552 |  |  |  |  |  |  | # RT    virus RNA in minus-strand synthesis by the viral RNA-dependent RNA | 
| 1553 |  |  |  |  |  |  | # RT    polymerase. | 
| 1554 |  |  |  |  |  |  | # RA    Deiman BA, Kortlever RM, Pleij CW; | 
| 1555 |  |  |  |  |  |  | # RL    J Virol 1997;71:5990-5996. | 
| 1556 |  |  |  |  |  |  |  | 
| 1557 |  |  |  |  |  |  | This method requires one argument. | 
| 1558 |  |  |  |  |  |  |  | 
| 1559 |  |  |  |  |  |  | =head2 instant_store | 
| 1560 |  |  |  |  |  |  |  | 
| 1561 |  |  |  |  |  |  | Class method intended to transform a large sequence file read from disk | 
| 1562 |  |  |  |  |  |  | without loading it in memory. This method will transparently process plain | 
| 1563 |  |  |  |  |  |  | FASTA files in addition to the MUST pseudo-FASTA format (ALI files). | 
| 1564 |  |  |  |  |  |  |  | 
| 1565 |  |  |  |  |  |  | my $chunk = 200; | 
| 1566 |  |  |  |  |  |  |  | 
| 1567 |  |  |  |  |  |  | my $split = sub { | 
| 1568 |  |  |  |  |  |  | my $seq = shift; | 
| 1569 |  |  |  |  |  |  | my $base_id = ( split /\s+/xms, $seq->full_id )[0]; | 
| 1570 |  |  |  |  |  |  | my $max_pos = $seq->seq_len - $chunk; | 
| 1571 |  |  |  |  |  |  | my $n = 0; | 
| 1572 |  |  |  |  |  |  | my $out_str; | 
| 1573 |  |  |  |  |  |  | for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) { | 
| 1574 |  |  |  |  |  |  | $out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos, | 
| 1575 |  |  |  |  |  |  | $pos + $chunk <= $max_pos ? $chunk : 2 * $chunk | 
| 1576 |  |  |  |  |  |  | ) . "\n"; | 
| 1577 |  |  |  |  |  |  | } | 
| 1578 |  |  |  |  |  |  | return $out_str; | 
| 1579 |  |  |  |  |  |  | }; | 
| 1580 |  |  |  |  |  |  |  | 
| 1581 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1582 |  |  |  |  |  |  |  | 
| 1583 |  |  |  |  |  |  | Ali->instant_store( | 
| 1584 |  |  |  |  |  |  | 'outfile.fasta', { infile => 'infile.fasta', coderef => $split } | 
| 1585 |  |  |  |  |  |  | ); | 
| 1586 |  |  |  |  |  |  |  | 
| 1587 |  |  |  |  |  |  | This method requires two arguments. The sercond is a hash reference that must | 
| 1588 |  |  |  |  |  |  | contain the following keys: | 
| 1589 |  |  |  |  |  |  | - infile:  input sequence file | 
| 1590 |  |  |  |  |  |  | - coderef: subroutine implementing the transforming logic | 
| 1591 |  |  |  |  |  |  |  | 
| 1592 |  |  |  |  |  |  | =head2 instant_count | 
| 1593 |  |  |  |  |  |  |  | 
| 1594 |  |  |  |  |  |  | Class method returning the number of seqs in any sequence file read from disk | 
| 1595 |  |  |  |  |  |  | without loading it in memory. This method will transparently process plain | 
| 1596 |  |  |  |  |  |  | FASTA files in addition to the MUST pseudo-FASTA format (ALI files). | 
| 1597 |  |  |  |  |  |  |  | 
| 1598 |  |  |  |  |  |  | use aliased 'Bio::MUST::Core::Ali'; | 
| 1599 |  |  |  |  |  |  | my $seq_n = Ali->instant_count('input.ali'); | 
| 1600 |  |  |  |  |  |  | say $seq_n; | 
| 1601 |  |  |  |  |  |  |  | 
| 1602 |  |  |  |  |  |  | =head1 ALIASES | 
| 1603 |  |  |  |  |  |  |  | 
| 1604 |  |  |  |  |  |  | =head2 height | 
| 1605 |  |  |  |  |  |  |  | 
| 1606 |  |  |  |  |  |  | Alias for C<count_seqs> method. For API consistency. | 
| 1607 |  |  |  |  |  |  |  | 
| 1608 |  |  |  |  |  |  | =head1 AUTHOR | 
| 1609 |  |  |  |  |  |  |  | 
| 1610 |  |  |  |  |  |  | Denis BAURAIN <denis.baurain@uliege.be> | 
| 1611 |  |  |  |  |  |  |  | 
| 1612 |  |  |  |  |  |  | =head1 CONTRIBUTORS | 
| 1613 |  |  |  |  |  |  |  | 
| 1614 |  |  |  |  |  |  | =for stopwords Catherine COLSON Arnaud DI FRANCO | 
| 1615 |  |  |  |  |  |  |  | 
| 1616 |  |  |  |  |  |  | =over 4 | 
| 1617 |  |  |  |  |  |  |  | 
| 1618 |  |  |  |  |  |  | =item * | 
| 1619 |  |  |  |  |  |  |  | 
| 1620 |  |  |  |  |  |  | Catherine COLSON <ccolson@doct.uliege.be> | 
| 1621 |  |  |  |  |  |  |  | 
| 1622 |  |  |  |  |  |  | =item * | 
| 1623 |  |  |  |  |  |  |  | 
| 1624 |  |  |  |  |  |  | Arnaud DI FRANCO <arnaud.difranco@gmail.com> | 
| 1625 |  |  |  |  |  |  |  | 
| 1626 |  |  |  |  |  |  | =back | 
| 1627 |  |  |  |  |  |  |  | 
| 1628 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 1629 |  |  |  |  |  |  |  | 
| 1630 |  |  |  |  |  |  | This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. | 
| 1631 |  |  |  |  |  |  |  | 
| 1632 |  |  |  |  |  |  | This is free software; you can redistribute it and/or modify it under | 
| 1633 |  |  |  |  |  |  | the same terms as the Perl 5 programming language system itself. | 
| 1634 |  |  |  |  |  |  |  | 
| 1635 |  |  |  |  |  |  | =cut |