File Coverage

blib/lib/Bio/GFF3/LowLevel/Parser.pm
Criterion Covered Total %
statement 155 179 86.5
branch 38 44 86.3
condition 25 39 64.1
subroutine 27 28 96.4
pod 4 4 100.0
total 249 294 84.6


line stmt bran cond sub pod time code
1             package Bio::GFF3::LowLevel::Parser;
2             BEGIN {
3 2     2   133908 $Bio::GFF3::LowLevel::Parser::AUTHORITY = 'cpan:RBUELS';
4             }
5             {
6             $Bio::GFF3::LowLevel::Parser::VERSION = '2.0';
7             }
8             # ABSTRACT: a fast, low-level gff3 parser
9              
10 2     2   18 use strict;
  2         3  
  2         51  
11 2     2   53 use warnings;
  2         3  
  2         61  
12 2     2   9 use Carp;
  2         4  
  2         111  
13              
14 2     2   9 use IO::Handle ();
  2         4  
  2         27  
15 2     2   9 use Scalar::Util ();
  2         2  
  2         41  
16              
17 2     2   1771 use List::MoreUtils ();
  2         2595  
  2         38  
18              
19 2     2   1188 use Bio::GFF3::LowLevel ();
  2         8  
  2         2084  
20              
21              
22              
23             sub open {
24 40     40 1 145154 my $class = shift;
25 40         214 return bless {
26              
27             filethings => \@_,
28             filehandles => [ map $class->_open($_), @_ ],
29              
30             # features that are ready to go out and be flushed
31             item_buffer => [],
32              
33             # features that we have to keep on hand for now because they
34             # might be referenced by something else
35             under_construction_top_level => [],
36             # index of the above by ID
37             under_construction_by_id => {},
38              
39             # features that reference something we have not seen yet
40             # structured as:
41             # { 'some_id' => {
42             # 'Parent' => [ orphans that have a Parent attr referencing it ],
43             # 'Derives_from' => [ orphans that have a Derives_from attr referencing it ],
44             # }
45             under_construction_orphans => {},
46             }, $class;
47             }
48             sub _open {
49 40     40   94 my ( $class, $thing ) = @_;
50 40 100 33     373 return $thing if ref $thing eq 'GLOB' || Scalar::Util::blessed( $thing ) && $thing->can('getline');
      66        
51 2 50   2   165 CORE::open my $f, '<', $thing or croak "$! opening '$thing' for reading";
  2         4  
  2         16  
  38         2531  
52 38         3935 return $f;
53             }
54              
55              
56             sub max_lookback {
57 17     17 1 102 my ( $self, $count ) = @_;
58 17         49 $self->{max_lookback} = $count
59             }
60              
61              
62             sub new {
63 16     16 1 72400 my $class = shift;
64 16         1368 require Bio::GFF3::LowLevel::Parser::1_0_backcompat;
65 16         126 return Bio::GFF3::LowLevel::Parser::1_0_backcompat->new( @_ );
66             }
67              
68              
69             sub next_item {
70 2330     2330 1 8490 my ( $self ) = @_;
71              
72             # try to get more items if the buffer is empty
73 2330 100       3873 $self->_buffer_items unless $self->_buffered_items_count;
74              
75             # return the next item if we have some
76 2328 100       4996 return shift @{ $self->{item_buffer}} if $self->_buffered_items_count;
  2294         6476  
77              
78             # if we were not able to get any more items, return nothing
79 34         144 return;
80             }
81              
82             sub _buffer_item {
83 2100     2100   2453 push @{$_[0]->{item_buffer}}, $_[1];
  2100         5336  
84             }
85              
86             sub _buffered_items_count {
87 9923     9923   12673 scalar @{ $_[0]->{item_buffer} }
  9923         42321  
88             }
89              
90             ## get and parse lines from the files(s) to add at least one item to
91             ## the buffer
92             sub _buffer_items {
93 2251     2251   2533 my ( $self ) = @_;
94              
95             # File position buffer, needed for correct parsing of
96             # implicit beginning of a FASTA section
97 2251         2434 my $pos_buffer = 0;
98 2251         9450 while( my $line = $self->_next_line ) {
99 5267 100       21275 if( $line =~ /^ \s* [^#\s>] /x ) { #< feature line, most common case
    100          
    100          
    50          
100             # Remember current position
101 5100         6372 $pos_buffer = tell;
102              
103 5100         19227 my $f = Bio::GFF3::LowLevel::gff3_parse_feature( $line );
104 5100         11807 $self->_buffer_feature( $f );
105             }
106             # directive or comment
107             elsif( my ( $hashsigns, $contents ) = $line =~ /^ \s* (\#+) (.*) /x ) {
108             # Remember current position
109 133         202 $pos_buffer = tell;
110              
111 133 100       387 if( length $hashsigns == 3 ) { #< sync directive, all forward-references are resolved.
    100          
112 63         143 $self->_buffer_all_under_construction_features;
113             }
114             elsif( length $hashsigns == 2 ) {
115 64         248 my $directive = Bio::GFF3::LowLevel::gff3_parse_directive( $line );
116 64 100       171 if( $directive->{directive} eq 'FASTA' ) {
117 11         30 $self->_buffer_all_under_construction_features;
118 11         22 $self->_buffer_item( { directive => 'FASTA', filehandle => shift @{$self->{filehandles} } });
  11         55  
119 11         15 shift @{$self->{filethings}};
  11         34  
120             } else {
121 53         118 $self->_buffer_item( $directive );
122             }
123             }
124             else {
125 6         73 $contents =~ s/\s*$//;
126 6         27 $self->_buffer_item( { comment => $contents } );
127             }
128             }
129             elsif( $line =~ /^ \s* $/x ) {
130             # blank line, do nothing
131              
132             # Remember current position
133 30         48 $pos_buffer = tell;
134             }
135             elsif( $line =~ /^ \s* > /x ) {
136             # implicit beginning of a FASTA section. a very stupid
137             # idea to include this in the format spec. increases
138             # implementation complexity by a lot.
139 4         12 $self->_buffer_all_under_construction_features;
140              
141             # rewind to previous file position to include the fasta header
142 4         7 my $fh = shift @{$self->{filehandles}};
  4         10  
143 4         40 seek $fh, $pos_buffer, 0;
144              
145 4         21 $self->_buffer_item( { directive => 'FASTA', filehandle => $fh } );
146 4         7 shift @{$self->{filethings}};
  4         9  
147              
148             # update file position
149 4         9 $pos_buffer = tell;
150             }
151             else { # it's a parse error
152             # Remember current position
153 0         0 $pos_buffer = tell;
154              
155 0         0 chomp $line;
156 0         0 $self->_parse_error("parse error. Cannot parse '$line'.");
157             }
158              
159             # buffer old features if we are starting to approach our mem limit
160 5265         15584 $self->_ensure_lookback_limit;
161              
162             # return now if we were able to find some things to put in the
163             # output buffer
164 5265 100       9260 return if $self->_buffered_items_count;
165             }
166              
167             # if we are out of lines, buffer all under-construction features
168 52         143 $self->_buffer_all_under_construction_features;
169             }
170              
171             sub _parse_error {
172 2     2   3 my ( $self, $error ) = @_;
173 2         417 croak "$self->{filethings}[0]:$.: $error";
174             }
175              
176             ## take all under-construction features and put them in the
177             ## item_buffer to be output
178             sub _buffer_all_under_construction_features {
179 130     130   176 my ( $self ) = @_;
180              
181             # since the under_construction_top_level buffer is likely to be
182             # much larger than the item_buffer, we swap them and unshift the
183             # existing buffer onto it to avoid a big copy.
184 130         245 my $old_buffer = $self->{item_buffer};
185 130         233 $self->{item_buffer} = $self->{under_construction_top_level};
186 130         151 unshift @{$self->{item_buffer}}, @$old_buffer;
  130         256  
187 130         184 undef $old_buffer;
188              
189 130         236 $self->{under_construction_top_level} = [];
190 130         210 $self->{under_construction_by_id} = {};
191 130         1095 $self->{completed_references} = {};
192              
193             # if we have any orphans hanging around still, this is a problem. die with a parse error
194 130 50       988 if( grep %$_, values %{$self->{under_construction_orphans}} ) {
  130         512  
195 0         0 die <_unresolved_references_report;
196              
197             GFF3 parse error: some features reference other features that do not exist in the file (or in the same '###' scope). A list of them:
198             EOM
199             }
200             }
201              
202             sub _ensure_lookback_limit {
203 5265     5265   6421 my ( $self ) = @_;
204              
205 5265 100       12771 return unless defined $self->{max_lookback};
206              
207 2562         3487 my $toplevel = $self->{under_construction_top_level};
208 2562         3400 my $byid = $self->{under_construction_by_id};
209 2562         3340 my $out_buffer = $self->{item_buffer};
210 2     2   12 no warnings 'uninitialized';
  2         3  
  2         356  
211 2562         8505 while( @$toplevel > $self->{max_lookback} ) {
212 40         75 my $f = shift @$toplevel;
213 40         173 delete $byid->{$_} for map @{$_->{attributes}{ID}}, @$f;
  40         294  
214 40         178 push @$out_buffer, $f;
215             }
216             }
217              
218             # makes a string with a tabular report of all the currently unresolved
219             # references: Parent, Derives_from, etc.
220             sub _unresolved_references_report {
221 0     0   0 my ( $self ) = @_;
222              
223 0         0 require IO::Handle;
224              
225 0         0 our $id;
226 0         0 our $reference_string;
227              
228 0         0 my $report = '';
229 0         0 CORE::open my $fh, '>', \$report;
230 0         0 $fh->print(<<'');
231             ID | Cannot Find
232             ----------------------------------------------------------------------
233              
234 2     2   10 no strict 'subs';
  2         4  
  2         622  
235             format UNRESOLVED_REFERENCES_REPORT =
236             @<<<<<<<<<<<<<<<<<< | @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
237             $id, $reference_string
238             .
239              
240              
241              
242 0         0 for my $refid ( keys %{$self->{under_construction_orphans}} ) {
  0         0  
243 0         0 my $escaped_refid = Bio::GFF3::LowLevel::gff3_escape( $refid );
244 0         0 my $references = $self->{under_construction_orphans}{$refid};
245 0         0 for my $reftype ( keys %$references ) {
246 0         0 my $references = $references->{$reftype};
247 0         0 for my $feature (@$references) {
248 0         0 for my $location ( @$feature ) {
249 0   0     0 $id = (($location->{attributes}||{})->{ID}||[])->[0]; #< avoid autovivification
250 0 0       0 $id = '(no id)' unless defined $id;
251 0         0 $reference_string = "$reftype=$escaped_refid";
252 0         0 $fh->format_write( Bio::GFF3::LowLevel::Parser::UNRESOLVED_REFERENCES_REPORT );
253             }
254             }
255             }
256             }
257 0         0 return $report;
258             }
259              
260              
261             ## get the next line from our file(s), returning nothing if we are out
262             ## of lines and files
263             sub _next_line {
264 2     2   12 no warnings;
  2         3  
  2         1511  
265             # fast code path for reading a line from the first filehandle,
266 5319     5319   9260 my $first_fh = $_[0]->{filehandles}[0];
267 5319   66     34470 return <$first_fh> || do {
268             # slower case where we are at the end, or need to change
269             # filehandles
270             my ( $self ) = @_;
271             my $filehandles = $self->{filehandles};
272             while ( @$filehandles ) {
273             my $line = $filehandles->[0]->getline;
274             return $line if $line;
275             shift @$filehandles;
276             shift @{$self->{filethings}};
277             }
278             return;
279             };
280             }
281              
282             my %container_attributes = ( Parent => 'child_features', Derives_from => 'derived_features' );
283             ## do the right thing with a newly-parsed feature line
284             sub _buffer_feature {
285 5100     5100   6441 my ( $self, $feature_line ) = @_;
286              
287 5100         11167 $feature_line->{'child_features'} = [];
288 5100         8686 $feature_line->{'derived_features'} = [];
289              
290             # NOTE: a feature is an arrayref of one or more feature lines.
291 5100   100     20417 my $ids = $feature_line->{attributes}{ID} || [];
292 5100   100     15692 my $parents = $feature_line->{attributes}{Parent} || [];
293 5100   100     19142 my $derives = $feature_line->{attributes}{Derives_from} || [];
294              
295 5100 50 66     23200 if( !@$ids && !@$parents && !@$derives ) {
      66        
296             # if it has no IDs and does not refer to anything, we can just
297             # output it
298 2026         6017 $self->_buffer_item( [ $feature_line ] );
299 2026         4762 return;
300             }
301              
302 3074         2901 my $feature;
303 3074         4642 for my $id ( @$ids ) {
304 1097 100       2836 if( my $existing = $self->{under_construction_by_id}{$id} ) {
305             # another location of the same feature
306 27 100       89 unless( $existing->[-1]->{type} eq $feature_line->{type} ) {
307 2         13 $self->_parse_error("type ".$feature_line->{type}." is not the same as previous type ".$existing->[-1]->{type}." for ID '".$id."'");
308             }
309 25         42 push @$existing, $feature_line;
310 25         66 $feature = $existing;
311             }
312             else {
313             # haven't seen it yet
314 1070         1807 $feature = [ $feature_line ];
315 1070 100 66     4956 if( !@$parents && !@$derives ) {
316 204         218 push @{ $self->{under_construction_top_level} }, $feature;
  204         839  
317             }
318 1070         6534 $self->{under_construction_by_id}{$id} = $feature;
319              
320             # see if we have anything buffered that refers to it
321 1070         2078 $self->_resolve_references_to( $feature, $id );
322             }
323             }
324              
325             # try to resolve all its references
326 3072   100     22872 $self->_resolve_references_from( $feature || [ $feature_line ], { Parent => $parents, Derives_from => $derives }, $ids );
327             }
328              
329             sub _resolve_references_to {
330 1070     1070   1951 my ( $self, $feature, $id ) = @_;
331 1070 100       4658 my $references = $self->{under_construction_orphans}{$id}
332             or return;
333 2         6 for my $attrname ( keys %$references ) {
334 3   33     11 my $pname = $container_attributes{$attrname} || lc $attrname;
335 3         7 for my $loc ( @$feature ) {
336 3         6 push @{ $loc->{$pname} },
  3         19  
337 3         4 @{ delete $references->{$attrname} };
338             }
339             }
340             }
341             sub _resolve_references_from {
342 3072     3072   4478 my ( $self, $feature, $references, $ids ) = @_;
343             # go through our references
344             # if we have the feature under construction, put this feature in the right place
345             # otherwise, put this feature in the right slot in the orphans
346              
347 3072         7767 for my $attrname ( keys %$references ) {
348 6144         6399 my $pname;
349 6144         5956 for my $to_id ( @{ $references->{ $attrname } } ) {
  6144         14987  
350 2884 100       7274 if( my $other_feature = $self->{under_construction_by_id}{ $to_id } ) {
351 2879   33     12423 $pname ||= $container_attributes{$attrname} || lc $attrname;
      66        
352 2879 100       18305 unless( grep $self->{completed_references}{$_}{$attrname}{$to_id}++, @$ids ) {
353 2861         4087 for my $loc ( @$other_feature ) {
354 2873         2828 push @{ $loc->{ $pname } }, $feature;
  2873         15844  
355             }
356             }
357             }
358             else {
359 5   100     7 push @{ $self->{under_construction_orphans}{$to_id}{$attrname} ||= [] }, $feature;
  5         41  
360             }
361             }
362             }
363             }
364              
365              
366             1;
367              
368             __END__