File Coverage

Bio/AlignIO/po.pm
Criterion Covered Total %
statement 148 154 96.1
branch 39 50 78.0
condition 6 17 35.2
subroutine 5 5 100.0
pod 2 2 100.0
total 200 228 87.7


line stmt bran cond sub pod time code
1             # $Id: po.pm
2             #
3             # BioPerl module for Bio::AlignIO::po
4              
5             # based on the Bio::AlignIO::fasta module
6             # by Peter Schattner (and others?)
7             #
8             # and the SimpleAlign.pm module of Ewan Birney
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::AlignIO::po - po MSA Sequence input/output stream
17              
18             =head1 SYNOPSIS
19              
20             Do not use this module directly. Use it via the L class.
21              
22             =head1 DESCRIPTION
23              
24             This object can transform L objects to and from
25             'po' format flat file databases. 'po' format is the native format of
26             the POA alignment program (Lee C, Grasso C, Sharlow MF, 'Multiple
27             sequence alignment using partial order graphs', Bioinformatics (2002),
28             18(3):452-64).
29              
30             =head1 FEEDBACK
31              
32             =head2 Support
33              
34             Please direct usage questions or support issues to the mailing list:
35              
36             I
37              
38             rather than to the module maintainer directly. Many experienced and
39             reponsive experts will be able look at the problem and quickly
40             address it. Please include a thorough description of the problem
41             with code and data examples if at all possible.
42              
43             =head2 Reporting Bugs
44              
45             Report bugs to the Bioperl bug tracking system to help us keep track
46             the bugs and their resolution. Bug reports can be submitted via the
47             web:
48              
49             https://github.com/bioperl/bioperl-live/issues
50              
51             =head1 AUTHORS - Matthew Betts
52              
53             Email: matthew.betts@ii.uib.no
54              
55             =head1 APPENDIX
56              
57             The rest of the documentation details each of the object
58             methods. Internal methods are usually preceded with a _
59              
60             =cut
61              
62             # Let the code begin...
63              
64             package Bio::AlignIO::po;
65 2     2   396 use strict;
  2         2  
  2         50  
66              
67 2     2   472 use Bio::SimpleAlign;
  2         4  
  2         53  
68              
69 2     2   9 use base qw(Bio::AlignIO);
  2         4  
  2         1423  
70              
71              
72             =head2 next_aln
73              
74             Title : next_aln
75             Usage : $aln = $stream->next_aln()
76             Function: returns the next alignment in the stream.
77             Returns : L object - returns undef on end of file
78             or on error
79             Args : NONE
80              
81             =cut
82              
83             sub next_aln {
84 3     3 1 756 my $self = shift;
85              
86 3         23 my $aln;
87             my $entry;
88 3         0 my $name;
89 3         0 my $seqs;
90 3         0 my $seq;
91 3         0 my $nodes;
92 3         0 my $list;
93 3         0 my $node;
94 3         0 my @chars;
95 3         0 my $s;
96 3         0 my $a;
97              
98 3         16 $aln = Bio::SimpleAlign->new();
99              
100             # get to the first 'VERSION' line
101 3         18 while(defined($entry = $self->_readline)) {
102 3 50       14 if($entry =~ /^VERSION=(\S+)/) {
103 3         12 $aln->source($1);
104              
105 3 50 33     6 if(defined($entry = $self->_readline) and $entry =~ /^NAME=(\S+)/) {
106 3         12 $aln->id($1);
107             }
108              
109 3         8 last;
110             }
111             }
112              
113             # read in the sequence names and node data, up to the end of
114             # the file or the next 'VERSION' line, whichever comes first
115 3         4 $seqs = [];
116 3         4 $nodes = [];
117 3         6 while(defined($entry = $self->_readline)) {
118 2682 50       9369 if($entry =~ /^VERSION/) {
    100          
    100          
    100          
119             # start of a new alignment, so...
120 0         0 $self->_pushback($entry);
121 0         0 last;
122             }
123             elsif($entry =~ /^SOURCENAME=(\S+)/) {
124 18         27 $name = $1;
125              
126 18 50       32 if($name =~ /(\S+)\/(\d+)-(\d+)/) {
127 0         0 $seq = Bio::LocatableSeq->new(
128             '-display_id' => $1,
129             '-start' => $2,
130             '-end' => $3,
131             '-alphabet' => $self->alphabet,
132             );
133              
134             } else {
135 18         37 $seq = Bio::LocatableSeq->new('-display_id'=> $name,
136             '-alphabet' => $self->alphabet);
137             }
138              
139             # store sequences in a list initially, because can't guarantee
140             # that will get them back from SimpleAlign object in the order
141             # they were read, and also can't add them to the SimpleAlign
142             # object here because their sequences are currently unknown
143 18         20 push @{$seqs}, {
  18         67  
144             'seq' => $seq,
145             'str' => '',
146             };
147             }
148             elsif($entry =~ /^SOURCEINFO=(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)/) {
149 18         41 $seq->desc($5);
150             }
151             elsif($entry =~ /^(\S):(\S+)/) {
152 2637         8013 $node = {
153             'aa' => $1,
154             'L' => [],
155             'S' => [],
156             'A' => [],
157             'status' => 'unvisited',
158             };
159              
160 2637         3465 $list = $2;
161 2637 50       5088 if($list =~ /^([L\d]*)([S\d]*)([A\d]*)/) {
162 2637         2401 push(@{$node->{'L'}}, split(/L/, $1));
  2637         5781  
163 2637         2774 push(@{$node->{'S'}}, split(/S/, $2));
  2637         5098  
164 2637         2724 push(@{$node->{'A'}}, split(/A/, $3));
  2637         4312  
165              
166 2637 100       2743 (@{$node->{'L'}} > 0) and shift @{$node->{'L'}};
  2625         2879  
  2637         3744  
167 2637 50       2492 (@{$node->{'S'}} > 0) and shift @{$node->{'S'}};
  2637         2574  
  2637         3274  
168 2637 100       2402 (@{$node->{'A'}} > 0) and shift @{$node->{'A'}};
  2313         2190  
  2637         3256  
169             }
170              
171 2637         2299 push @{$nodes}, $node;
  2637         4432  
172             }
173             }
174              
175             # process the nodes
176 3         6 foreach $node (@{$nodes}) {
  3         9  
177 2637 100       3295 ($node->{'status'} ne 'unvisited') and next;
178              
179 1266         1650 @chars = ($aln->gap_char) x @{$seqs}; # char for each seq defaults to a gap
  1266         1951  
180              
181             # set the character for each sequence represented by this node
182 1266         1174 foreach $s (@{$node->{'S'}}) {
  1266         1517  
183 2706         3675 $chars[$s] = $node->{'aa'};
184             }
185 1266         1264 $node->{'status'} = 'visited';
186              
187             # do the same for each node in the same align ring
188 1266         1732 while(defined($a = $node->{'A'}->[0])) {
189 2313         3099 $node = $nodes->[$a];
190 2313 100       2989 ($node->{'status'} ne 'unvisited') and last;
191              
192 1371         1090 foreach $s (@{$node->{'S'}}) {
  1371         1570  
193 1590         2128 $chars[$s] = $node->{'aa'};
194             }
195              
196 1371         2113 $node->{'status'} = 'visited';
197             }
198              
199             # update the sequences
200 1266         1011 foreach $seq (@{$seqs}) {
  1266         1281  
201 7596         7886 $seq->{'str'} .= shift @chars;
202             }
203             }
204              
205             # set the sequences of the bioperl objects
206             # and add them to the alignment
207 3         6 foreach $seq (@{$seqs}) {
  3         7  
208 18         61 $seq->{'seq'}->seq($seq->{'str'});
209 18         42 $aln->add_seq($seq->{'seq'});
210             }
211              
212             # has an alignment been read?...
213              
214 3 50       12 return $aln if $aln->num_sequences;
215 0         0 return;
216             }
217              
218             =head2 write_aln
219              
220             Title : write_aln
221             Usage : $stream->write_aln(@aln)
222             Function: writes the $aln object into the stream in po format
223             Returns : 1 for success and 0 for error
224             Args : L object
225              
226             =cut
227              
228             sub write_aln {
229 2     2 1 7 my $self = shift;
230 2         4 my @alns = @_;
231              
232 2         17 my $aln;
233             my $seqs;
234 2         0 my $nodes;
235 2         0 my $seq;
236 2         0 my $node;
237 2         0 my $col;
238 2         0 my $ring;
239 2         0 my $i;
240 2         0 my $char;
241              
242 2         5 foreach $aln (@alns) {
243 2 50 33     13 if(!$aln or !$aln->isa('Bio::Align::AlignI')) {
244 0         0 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
245 0         0 next;
246             }
247              
248             # store the seqs on a list, because po format
249             # refers to them by position on this list
250 2         6 $seqs = [];
251 2         6 foreach $seq ($aln->each_seq()) {
252 12         12 push @{$seqs}, {
  12         27  
253             'seq' => $seq,
254             'n_nodes' => 0,
255             'first' => undef,
256             'previous' => undef,
257             };
258             }
259              
260             # go through each column in the alignment and construct
261             # the nodes for the equivalent poa alignment ring
262 2         4 $nodes = [];
263 2         8 for($col = 0; $col < $aln->length; $col++) {
264             $ring = {
265             'nodes' => {},
266 844         953 'first' => scalar @{$nodes},
267 844         1346 'last' => scalar @{$nodes},
  844         2488  
268             };
269              
270 844         1125 for($i = 0; $i < @{$seqs}; $i++) {
  5908         8374  
271 5064         5497 $seq = $seqs->[$i];
272              
273 5064         9785 $char = $seq->{'seq'}->subseq($col + 1, $col + 1);
274              
275 5064 100       8194 ($char eq $aln->gap_char) and next;
276              
277 2864 100       5121 if(!defined($node = $ring->{'nodes'}->{$char})) {
278             $node = {
279 1758         1683 'n' => scalar @{$nodes},
  1758         5187  
280             'aa' => $char,
281             'L' => {},
282             'S' => [],
283             'A' => [],
284             };
285              
286             # update the ring
287 1758         2620 $ring->{'nodes'}->{$char} = $node;
288 1758         1927 $ring->{'last'} = $node->{'n'};
289              
290             # add the node to the node list
291 1758         1598 push @{$nodes}, $node;
  1758         2121  
292             }
293              
294             # add the sequence to the node
295 2864         2647 push @{$node->{'S'}}, $i;
  2864         3657  
296              
297             # add the node to the sequence
298 2864 100       4000 defined($seq->{'first'}) or ($seq->{'first'} = $node);
299 2864         2723 $seq->{'n_nodes'}++;
300              
301             # add an edge from the previous node in the sequence to this one.
302             # Then set the previous node to the current one, ready for the next
303             # residue in this sequence
304 2864 100       6442 defined($seq->{'previous'}) and ($node->{'L'}->{$seq->{'previous'}->{'n'}} = $seq->{'previous'});
305 2864         3416 $seq->{'previous'} = $node;
306             }
307              
308             # set the 'next node in ring' field for each node in the ring
309 844 100       1521 if($ring->{'first'} < $ring->{'last'}) {
310 628         1084 for($i = $ring->{'first'}; $i < $ring->{'last'}; $i++) {
311 914         846 push @{$nodes->[$i]->{'A'}}, $i + 1;
  914         1771  
312             }
313 628         642 push @{$nodes->[$ring->{'last'}]->{'A'}}, $ring->{'first'};
  628         1642  
314             }
315             }
316              
317             # print header information
318             $self->_print(
319             'VERSION=', ($aln->source and ($aln->source !~ /\A\s*\Z/)) ? $aln->source : 'bioperl', "\n",
320             'NAME=', $aln->id, "\n",
321             'TITLE=', ($seqs->[0]->{'seq'}->description or $aln->id), "\n",
322 2         4 'LENGTH=', scalar @{$nodes}, "\n",
323 2 50 33     10 'SOURCECOUNT=', scalar @{$seqs}, "\n",
  2   33     51  
324             );
325              
326             # print sequence information
327 2         5 foreach $seq (@{$seqs}) {
  2         6  
328             $self->_print(
329             'SOURCENAME=', $seq->{'seq'}->display_id, "\n",
330             'SOURCEINFO=',
331             $seq->{'n_nodes'}, ' ', # number of nodes in the sequence
332             $seq->{'first'}->{'n'}, ' ', # index of first node containing the sequence
333             0, ' ', # FIXME - sequence weight?
334             -1, ' ', # FIXME - index of bundle containing sequence?
335 12   50     23 ($seq->{'seq'}->description or 'untitled'), "\n",
336             );
337             }
338              
339             # print node information
340 2         5 foreach $node (@{$nodes}) {
  2         5  
341 1758         2809 $self->_print($node->{'aa'}, ':');
342 1758 100       1413 (keys %{$node->{'L'}} > 0) and $self->_print('L', join('L', sort {$a <=> $b} keys %{$node->{'L'}}));
  165         520  
  1750         4126  
  1758         3474  
343 1758 50       1813 (@{$node->{'S'}} > 0) and $self->_print('S', join('S', @{$node->{'S'}}));
  1758         4068  
  1758         2546  
344 1758 100       1704 (@{$node->{'A'}} > 0) and $self->_print('A', join('A', @{$node->{'A'}}));
  1542         3358  
  1758         2446  
345 1758         2349 $self->_print("\n");
346             }
347             }
348 2 50 33     23 $self->flush if $self->_flush_on_write && defined $self->_fh;
349              
350 2         2038 return 1;
351             }
352              
353             1;