| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # | 
| 2 |  |  |  |  |  |  | # BioPerl module for Bio::AlignIO::mega | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | # Please direct questions and support issues to | 
| 5 |  |  |  |  |  |  | # | 
| 6 |  |  |  |  |  |  | # Cared for by Jason Stajich | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # Copyright Jason Stajich | 
| 9 |  |  |  |  |  |  | # | 
| 10 |  |  |  |  |  |  | # You may distribute this module under the same terms as perl itself | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | # POD documentation - main docs before the code | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | =head1 NAME | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | Bio::AlignIO::mega - Parse and Create MEGA format data files | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | use Bio::AlignIO; | 
| 21 |  |  |  |  |  |  | my $alignio = Bio::AlignIO->new(-format => 'mega', | 
| 22 |  |  |  |  |  |  | -file   => 't/data/hemoglobinA.meg'); | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | while( my $aln = $alignio->next_aln ) { | 
| 25 |  |  |  |  |  |  | # process each alignment or convert to another format like NEXUS | 
| 26 |  |  |  |  |  |  | } | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | This object handles reading and writing data streams in the MEGA | 
| 31 |  |  |  |  |  |  | format (Kumar and Nei). | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | =head1 FEEDBACK | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | =head2 Mailing Lists | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | User feedback is an integral part of the evolution of this and other | 
| 39 |  |  |  |  |  |  | Bioperl modules. Send your comments and suggestions preferably to | 
| 40 |  |  |  |  |  |  | the Bioperl mailing list.  Your participation is much appreciated. | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | bioperl-l@bioperl.org                  - General discussion | 
| 43 |  |  |  |  |  |  | http://bioperl.org/wiki/Mailing_lists  - About the mailing lists | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | =head2 Support | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | Please direct usage questions or support issues to the mailing list: | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | I | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | rather than to the module maintainer directly. Many experienced and | 
| 52 |  |  |  |  |  |  | reponsive experts will be able look at the problem and quickly | 
| 53 |  |  |  |  |  |  | address it. Please include a thorough description of the problem | 
| 54 |  |  |  |  |  |  | with code and data examples if at all possible. | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | =head2 Reporting Bugs | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | Report bugs to the Bioperl bug tracking system to help us keep track | 
| 59 |  |  |  |  |  |  | of the bugs and their resolution. Bug reports can be submitted the | 
| 60 |  |  |  |  |  |  | web: | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  | https://github.com/bioperl/bioperl-live/issues | 
| 63 |  |  |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | =head1 AUTHOR - Jason Stajich | 
| 65 |  |  |  |  |  |  |  | 
| 66 |  |  |  |  |  |  | Email jason-at-bioperl.org | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | =head1 APPENDIX | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | The rest of the documentation details each of the object methods. | 
| 71 |  |  |  |  |  |  | Internal methods are usually preceded with a _ | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | =cut | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | # Let the code begin... | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | package Bio::AlignIO::mega; | 
| 80 | 2 |  |  | 2 |  | 385 | use vars qw($MEGANAMELEN %VALID_TYPES $LINELEN $BLOCKLEN); | 
|  | 2 |  |  |  |  | 2 |  | 
|  | 2 |  |  |  |  | 113 |  | 
| 81 | 2 |  |  | 2 |  | 6 | use strict; | 
|  | 2 |  |  |  |  | 2 |  | 
|  | 2 |  |  |  |  | 37 |  | 
| 82 |  |  |  |  |  |  |  | 
| 83 | 2 |  |  | 2 |  | 478 | use Bio::SimpleAlign; | 
|  | 2 |  |  |  |  | 2 |  | 
|  | 2 |  |  |  |  | 66 |  | 
| 84 | 2 |  |  | 2 |  | 11 | use Bio::LocatableSeq; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 123 |  | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | # symbols are changed due to MEGA's use of '.' for redundant sequences | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | BEGIN { | 
| 89 | 2 |  |  | 2 |  | 3 | $MEGANAMELEN = 10; | 
| 90 | 2 |  |  |  |  | 5 | $LINELEN = 60; | 
| 91 | 2 |  |  |  |  | 3 | $BLOCKLEN = 10; | 
| 92 | 2 |  |  |  |  | 3 | %VALID_TYPES =  map {$_, 1} qw( dna rna protein standard); | 
|  | 8 |  |  |  |  | 57 |  | 
| 93 |  |  |  |  |  |  | } | 
| 94 | 2 |  |  | 2 |  | 10 | use base qw(Bio::AlignIO); | 
|  | 2 |  |  |  |  | 3 |  | 
|  | 2 |  |  |  |  | 1154 |  | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | =head2 next_aln | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | Title   : next_aln | 
| 100 |  |  |  |  |  |  | Usage   : $aln = $stream->next_aln() | 
| 101 |  |  |  |  |  |  | Function: returns the next alignment in the stream. | 
| 102 |  |  |  |  |  |  | Supports the following MEGA format features: | 
| 103 |  |  |  |  |  |  | - The file has to start with '#mega' | 
| 104 |  |  |  |  |  |  | - Reads in the name of the alignment from a comment | 
| 105 |  |  |  |  |  |  | (anything after '!TITLE: ') . | 
| 106 |  |  |  |  |  |  | - Reads in the format parameters datatype | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | Returns : L object - returns 0 on end of file | 
| 109 |  |  |  |  |  |  | or on error | 
| 110 |  |  |  |  |  |  | Args    : NONE | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | =cut | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | sub next_aln{ | 
| 116 | 2 |  |  | 2 | 1 | 6 | my ($self) = @_; | 
| 117 | 2 |  |  |  |  | 1 | my $entry; | 
| 118 | 2 |  |  |  |  | 2 | my ($alphabet,%seqs); | 
| 119 | 2 |  |  |  |  | 5 | local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.'; | 
| 120 | 2 |  |  |  |  | 3 | local $Bio::LocatableSeq::GAP_SYMBOLS = '\-'; | 
| 121 | 2 |  |  |  |  | 14 | my $aln = Bio::SimpleAlign->new(-source => 'mega'); | 
| 122 |  |  |  |  |  |  |  | 
| 123 | 2 |  | 33 |  |  | 12 | while( defined($entry = $self->_readline()) && ($entry =~ /^\s+$/) ) {} | 
| 124 |  |  |  |  |  |  |  | 
| 125 | 2 | 50 |  |  |  | 9 | $self->throw("Not a valid MEGA file! [#mega] not starting the file!") | 
| 126 |  |  |  |  |  |  | unless $entry =~ /^#mega/i; | 
| 127 |  |  |  |  |  |  |  | 
| 128 | 2 |  |  |  |  | 4 | while( defined($entry = $self->_readline() ) ) { | 
| 129 | 24 |  |  |  |  | 22 | local($_) = $entry; | 
| 130 | 24 | 100 |  |  |  | 87 | if(/\!Title:\s*([^\;]+)\s*/i) { $aln->id($1)} | 
|  | 2 | 100 |  |  |  | 9 |  | 
|  |  | 100 |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | elsif( s/\!Format\s+([^\;]+)\s*/$1/ ) { | 
| 132 | 2 |  |  |  |  | 10 | my (@fields) = split(/\s+/,$1); | 
| 133 | 2 |  |  |  |  | 4 | foreach my $f ( @fields ) { | 
| 134 | 6 |  |  |  |  | 12 | my ($name,$value) = split(/\=/,$f); | 
| 135 | 6 | 100 |  |  |  | 15 | if( $name eq 'datatype' ) { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 136 | 2 |  |  |  |  | 3 | $alphabet = $value; | 
| 137 |  |  |  |  |  |  | } elsif( $name eq 'identical' ) { | 
| 138 | 2 |  |  |  |  | 7 | $aln->match_char($value); | 
| 139 |  |  |  |  |  |  | } elsif( $name eq 'indel' ) { | 
| 140 | 2 |  |  |  |  | 8 | $aln->gap_char($value); | 
| 141 |  |  |  |  |  |  | } | 
| 142 |  |  |  |  |  |  | } | 
| 143 |  |  |  |  |  |  | } elsif( /^\#/ ) { | 
| 144 | 2 |  |  |  |  | 3 | last; | 
| 145 |  |  |  |  |  |  | } | 
| 146 |  |  |  |  |  |  | } | 
| 147 | 2 |  |  |  |  | 3 | my @order; | 
| 148 | 2 |  |  |  |  | 5 | while( defined($entry) ) { | 
| 149 | 42 | 100 |  |  |  | 91 | if( $entry !~ /^\s+$/ ) { | 
| 150 |  |  |  |  |  |  | # this is to skip the leading '#' | 
| 151 | 36 |  |  |  |  | 39 | my $seqname = substr($entry,1,$MEGANAMELEN-1); | 
| 152 | 36 |  |  |  |  | 79 | $seqname =~ s/(\S+)\s+$/$1/g; | 
| 153 | 36 |  |  |  |  | 42 | my $line = substr($entry,$MEGANAMELEN); | 
| 154 | 36 |  |  |  |  | 103 | $line =~ s/\s+//g; | 
| 155 | 36 | 100 |  |  |  | 54 | if( ! defined $seqs{$seqname} ) {push @order, $seqname; } | 
|  | 12 |  |  |  |  | 12 |  | 
| 156 | 36 |  |  |  |  | 53 | $seqs{$seqname} .= $line; | 
| 157 |  |  |  |  |  |  | } | 
| 158 | 42 |  |  |  |  | 59 | $entry = $self->_readline(); | 
| 159 |  |  |  |  |  |  | } | 
| 160 |  |  |  |  |  |  |  | 
| 161 | 2 |  |  |  |  | 5 | foreach my $seqname ( @order ) { | 
| 162 | 12 |  |  |  |  | 13 | my $s = $seqs{$seqname}; | 
| 163 | 12 |  |  |  |  | 65 | $s =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g; | 
| 164 | 12 |  |  |  |  | 13 | my $end = length($s); | 
| 165 |  |  |  |  |  |  | my $seq = Bio::LocatableSeq->new('-alphabet'   => $alphabet, | 
| 166 |  |  |  |  |  |  | '-display_id' => $seqname, | 
| 167 | 12 |  |  |  |  | 36 | '-seq'        => $seqs{$seqname}, | 
| 168 |  |  |  |  |  |  | '-start'      => 1, | 
| 169 |  |  |  |  |  |  | '-end'        => $end); | 
| 170 |  |  |  |  |  |  |  | 
| 171 | 12 |  |  |  |  | 25 | $aln->add_seq($seq); | 
| 172 |  |  |  |  |  |  | } | 
| 173 | 2 |  |  |  |  | 8 | $aln->unmatch; | 
| 174 | 2 | 50 |  |  |  | 6 | return $aln if $aln->num_sequences; | 
| 175 | 0 |  |  |  |  | 0 | return; | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | =head2 write_aln | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | Title   : write_aln | 
| 181 |  |  |  |  |  |  | Usage   : $stream->write_aln(@aln) | 
| 182 |  |  |  |  |  |  | Function: writes the $aln object into the stream in MEGA format | 
| 183 |  |  |  |  |  |  | Returns : 1 for success and 0 for error | 
| 184 |  |  |  |  |  |  | Args    : L object | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | =cut | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | sub write_aln{ | 
| 189 | 1 |  |  | 1 | 1 | 6 | my ($self,@aln) = @_; | 
| 190 | 1 |  |  |  |  | 2 | my $count = 0; | 
| 191 | 1 |  |  |  |  | 1 | my $wrapped = 0; | 
| 192 | 1 |  |  |  |  | 2 | my $maxname; | 
| 193 |  |  |  |  |  |  |  | 
| 194 | 1 |  |  |  |  | 2 | foreach my $aln ( @aln ) { | 
| 195 | 1 | 50 | 33 |  |  | 13 | if( ! $aln || ! $aln->isa('Bio::Align::AlignI')  ) { | 
|  |  | 50 |  |  |  |  |  | 
| 196 | 0 |  |  |  |  | 0 | $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); | 
| 197 | 0 |  |  |  |  | 0 | return 0; | 
| 198 |  |  |  |  |  |  | } elsif( ! $aln->is_flush($self->verbose) ) { | 
| 199 | 0 |  |  |  |  | 0 | $self->warn("All Sequences in the alignment must be the same length"); | 
| 200 | 0 |  |  |  |  | 0 | return 0; | 
| 201 |  |  |  |  |  |  | } | 
| 202 | 1 |  |  |  |  | 5 | $aln->match(); | 
| 203 | 1 |  |  |  |  | 3 | my $len = $aln->length(); | 
| 204 | 1 |  |  |  |  | 3 | my $format = sprintf('datatype=%s identical=%s indel=%s;', | 
| 205 |  |  |  |  |  |  | $aln->get_seq_by_pos(1)->alphabet(), | 
| 206 |  |  |  |  |  |  | $aln->match_char, $aln->gap_char); | 
| 207 |  |  |  |  |  |  |  | 
| 208 | 1 |  |  |  |  | 5 | $self->_print(sprintf("#mega\n!Title: %s;\n!Format %s\n\n\n", | 
| 209 |  |  |  |  |  |  | $aln->id, $format)); | 
| 210 |  |  |  |  |  |  |  | 
| 211 | 1 |  |  |  |  | 3 | my ($count, $blockcount,$length) = ( 0,0,$aln->length()); | 
| 212 | 1 |  |  |  |  | 5 | $aln->set_displayname_flat(); | 
| 213 | 1 |  |  |  |  | 2 | while( $count < $length ) { | 
| 214 | 3 |  |  |  |  | 5 | foreach my $seq ( $aln->each_seq ) { | 
| 215 | 18 |  |  |  |  | 21 | my $seqchars = $seq->seq(); | 
| 216 | 18 |  |  |  |  | 15 | $blockcount = 0; | 
| 217 | 18 |  |  |  |  | 19 | my $substring = substr($seqchars, $count, $LINELEN); | 
| 218 | 18 |  |  |  |  | 8 | my @blocks; | 
| 219 | 18 |  |  |  |  | 26 | while( $blockcount < length($substring) ) { | 
| 220 | 90 |  |  |  |  | 89 | push @blocks, substr($substring, $blockcount,$BLOCKLEN); | 
| 221 | 90 |  |  |  |  | 105 | $blockcount += $BLOCKLEN; | 
| 222 |  |  |  |  |  |  | } | 
| 223 | 18 |  |  |  |  | 34 | $self->_print(sprintf("#%-".($MEGANAMELEN-1)."s%s\n", | 
| 224 |  |  |  |  |  |  | substr($aln->displayname($seq->get_nse()), | 
| 225 |  |  |  |  |  |  | 0,$MEGANAMELEN-2), | 
| 226 |  |  |  |  |  |  | join(' ', @blocks))); | 
| 227 |  |  |  |  |  |  | } | 
| 228 | 3 |  |  |  |  | 4 | $self->_print("\n"); | 
| 229 | 3 |  |  |  |  | 5 | $count += $LINELEN; | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  | } | 
| 232 | 1 | 50 | 33 |  |  | 3 | $self->flush if $self->_flush_on_write && defined $self->_fh; | 
| 233 | 1 |  |  |  |  | 3 | return 1; | 
| 234 |  |  |  |  |  |  | } | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | 1; |