File Coverage

blib/lib/Bio/AGP/LowLevel.pm
Criterion Covered Total %
statement 120 137 87.5
branch 44 80 55.0
condition 12 23 52.1
subroutine 17 18 94.4
pod 6 7 85.7
total 199 265 75.0


line stmt bran cond sub pod time code
1             package Bio::AGP::LowLevel;
2 1     1   986 use strict;
  1         1  
  1         34  
3 1     1   4 use warnings;
  1         2  
  1         31  
4 1     1   781 use English;
  1         837  
  1         6  
5 1     1   670 use Carp;
  1         2  
  1         63  
6              
7 1     1   6 use File::Basename;
  1         1  
  1         74  
8 1     1   1066 use UNIVERSAL qw/isa/;
  1         13  
  1         5  
9 1     1   468 use List::Util qw/first/;
  1         3  
  1         103  
10              
11             =head1 NAME
12              
13             Bio::AGP::LowLevel - functions for dealing with AGP files
14              
15             =head1 SYNOPSIS
16            
17              
18             $lines_arrayref = agp_parse('my_agp_file.agp');
19              
20             agp_write( $lines => 'my_agp_file.agp');
21              
22              
23              
24             =head1 DESCRIPTION
25              
26             functions for working with AGP files.
27              
28             =head1 FUNCTIONS
29              
30             All functions below are EXPORT_OK.
31              
32             =cut
33              
34 1     1   5 use base qw/ Exporter /;
  1         2  
  1         142  
35              
36             our @EXPORT_OK;
37             BEGIN {
38 1     1   401 @EXPORT_OK = qw(
39             agp_parse
40             agp_write
41             agp_format_part
42             agp_contigs
43             );
44             }
45              
46              
47             =head2 str_in
48              
49             Usage: print "it's valid" if str_in($thingy,qw/foo bar baz/);
50             Desc : return 1 if the first argument is string equal to at least one of the
51             subsequent arguments
52             Ret : 1 or 0
53             Args : string to search for, array of strings to search in
54             Side Effects: none
55              
56             I kept writing this over and over in validation code and got sick of it.
57              
58             =cut
59              
60             sub str_in {
61 1138     1138 1 1742 my $needle = shift;
62 1138 50   2402   3946 return defined(first {$needle eq $_} @_) ? 1 : 0;
  2402         4928  
63             }
64              
65             =head2 is_filehandle
66              
67             Usage: print "it's a filehandle" if is_filehandle($my_thing);
68             Desc : check whether the given thing is usable as a filehandle.
69             I put this in a module cause a filehandle might be either
70             a GLOB or isa IO::Handle or isa Apache::Upload
71             Ret : true if it is a filehandle, false otherwise
72             Args : a single thing
73             Side Effects: none
74              
75             =cut
76              
77             sub is_filehandle {
78 9     9 1 17 my ($thing) = @_;
79 9   66     161 return isa($thing,'IO::Handle') || isa($thing,'Apache::Upload') || ref($thing) eq 'GLOB';
80             }
81              
82              
83             =head2 agp_parse
84              
85             Usage: my $lines = agp_parse('~/myagp.agp',validate_syntax => 1, validate_identifiers => 1);
86             Desc : parse an agp file
87             Args : filename or filehandle, hash-style list of options as
88             validate_syntax => if true, error
89             if there are any syntax errors,
90             validate_identifiers => if true, error
91             if there are any identifiers that
92             CXGN::Tools::Identifiers doesn't recognize
93             IMPLIES validate_syntax
94             error_array => an arrayref. if given, will push
95             error descriptions onto this array instead of
96             using warn to print them to stderr
97             Ret : undef if error, otherwise return an
98             arrayref containing line records, each of which is like:
99             { comment => 'text' } if a comment,
100             or if a data line:
101             { objname => the name of the object being assembled
102             (same for every record),
103             ostart => start coordinate for this component (object),
104             oend => end coordinate for this component (object),
105             partnum => the part number appearing in the 4th column,
106             linenum => the line number in the file,
107             type => letter type present in the file (/[ADFGNOPUW]/),
108             typedesc => description of the type, one of:
109             - (A) active_finishing
110             - (D) draft
111             - (F) finished
112             - (G) wgs_finishing
113             - (N) known_gap
114             - (O) other
115             - (P) predraft
116             - (U) unknown_gap
117             - (W) wgs_contig
118             ident => identifier of the component, if any,
119             length => length of the component,
120             is_gap => 1 if the line is some kind of gap, 0 if it
121             is covered by a component,
122             gap_type => one of:
123             fragment: gap between two sequence contigs (also
124             called a "sequence gap"),
125             clone: a gap between two clones that do not overlap.
126             contig: a gap between clone contigs (also called a
127             "layout gap").
128             centromere: a gap inserted for the centromere.
129             short_arm: a gap inserted at the start of an
130             acrocentric chromosome.
131             heterochromatin: a gap inserted for an especially
132             large region of heterochromatic sequence (may also
133             include the centromere).
134             telomere: a gap inserted for the telomere.
135             repeat: an unresolvable repeat.
136             cstart => start coordinate relative to the component,
137             cend => end coordinate relative to the component,
138             linkage => 'yes' or 'no', only set for type of 'N',
139             orient => '+', '-', 0, or 'na'
140             orientation of the component
141             relative to the object,
142             }
143              
144             Side Effects: unless error_array is given, will print error
145             descriptions to STDERR with warn()
146             Example:
147              
148             =cut
149              
150             sub agp_parse {
151 6     6 1 4161 my $agpfile = shift;
152 6         28 our %opt = @_;
153              
154 6 50       23 $agpfile or croak 'must provide an AGP filename';
155              
156 6 50       21 if($opt{validate_identifiers}) {
157 0         0 $opt{validate_syntax} = 1;
158             }
159              
160             #if the argument is a filehandle, use it, otherwise try to use it as
161             #a filename
162 6         11 my $agp_in; #< filehandle for reading AGP
163 6         7 our $bn; #< basename of file we're parsing
164 6         10 ($agp_in,$bn) = do {
165 6 50       60 if( is_filehandle($agpfile) ) {
166 0         0 ($agpfile,'')
167             } else {
168 6 50       430 open my $f,$agpfile
169             or die "$! opening '$agpfile'\n";
170 6         23 ($f,$agpfile)
171             }
172             };
173              
174 6         13 our $parse_error_flag = 0;
175             sub parse_error(@) {
176              
177 0 0   0 0 0 return unless $opt{validate_syntax};
178              
179 0         0 $parse_error_flag = 1;
180 0         0 my $errstr = "$bn:$.: ".join('',@_)."\n";
181             #if we're pushing errors onto an error_array, do that
182 0 0       0 if ($opt{error_array}) {
183 0         0 push @{$opt{error_array}},$errstr;
  0         0  
184             } else { # otherwise just warn
185 0         0 warn $errstr;
186             }
187             }
188              
189 6         9 my @records;
190              
191             my $last_end;
192 0         0 my $last_partnum;
193 0         0 my $last_objname;
194              
195 6         11 my $assembled_sequence = '';
196 6         294 while (my $line = <$agp_in>) {
197 1     1   5 no warnings 'uninitialized';
  1         1  
  1         1248  
198             # warn "parsing $line";
199 646         698 chomp $line;
200 646         851 $line =~ s/\r//g; #remove windows \r chars
201              
202             #deal with comments
203 646 100       1277 if($line =~ /#/) {
204 4 50       25 if( $line =~ s/^#// ) {
205 4         12 push @records, { comment => $line };
206 4         16 next;
207             }
208 0         0 parse_error("not a valid comment line, # must be first character on line");
209 0         0 next;
210             }
211              
212 642         4429 my @fields = split /\t/,$line,10;
213 642 50       1440 @fields == 9
214             or parse_error "This line contains ".scalar(@fields)." columns. All lines must have 9 columns.";
215              
216             #if there just really aren't many columns, this probably isn't a valid AGP line
217 642 50 33     2382 next unless @fields >= 5 && @fields <= 10;
218              
219 642         1740 my %r = (linenum => $.); #< the record we're building for this line, starting with line number
220              
221             #parse and check the first 5 cols
222 642         1761 @r{qw( objname ostart oend partnum type )} = splice @fields,0,5;
223 642 50       1318 $r{objname}
224             or parse_error "'$r{obj_name}' is a valid object name";
225             #end
226 642 50 66     3626 if ( defined $last_end && defined $last_objname && $r{objname} eq $last_objname ) {
      66        
227 636 50       1782 $r{ostart} == $last_end+1
228             or parse_error "start coordinate not contiguous with previous line's end";
229             }
230 642         850 $last_end = $r{oend};
231 642         749 $last_objname = $r{objname};
232              
233             #start
234 642 50       1443 $r{oend} >= $r{ostart} or parse_error("end must be >= start");
235              
236             #part num
237 642   100     1011 $last_partnum ||= 0;
238 642 50       1336 $r{partnum} == $last_partnum + 1
239             or parse_error("part numbers not sequential");
240              
241 642         871 $last_partnum = $r{partnum};
242              
243             #type
244 642 100       1841 if ( $r{type} =~ /^[NU]$/ ) {
    50          
245 496         1990 (@r{qw( length gap_type linkage)}, my $empty, my $undefined) = @fields;
246 496         817 @fields = ();
247 496         1099 my %descmap = qw/ U unknown_gap N known_gap /;
248 496 50       1344 $r{typedesc} = $descmap{$r{type}}
249             or parse_error("unregistered type $r{type}");
250 496         669 $r{is_gap} = 1;
251              
252 496   33     1703 my $gap_size_to_use = $opt{gap_length} || $r{length};
253              
254 496 50       1338 $r{length} == $r{oend} - $r{ostart} + 1
255             or parse_error("gap size of '$r{length}' does not agree with ostart, oend of ($r{ostart},$r{oend})");
256              
257 496 50       1037 str_in($r{gap_type},qw/fragment clone contig centromere short_arm heterochromatin telomere repeat/)
258             or parse_error("invalid gap type '$r{gap_type}'");
259              
260 496 50       1496 str_in($r{linkage},qw/yes no/)
261             or parse_error("linkage (column 8) should be 'yes' or 'no'\n");
262              
263 496 50 33     2264 defined $empty && $empty eq ''
264             or parse_error("9th column should be present and empty\n");
265              
266 496         2456 push @records,\%r;
267              
268             } elsif ( $r{type} =~ /^[ADFGOPW]$/ ) {
269 146         731 my %descmap = qw/A active_finishing D draft F finished G wgs_finishing N known_gap O other P predraft U unknown_gap W wgs_contig/;
270 146 50       380 $r{typedesc} = $descmap{$r{type}}
271             or parse_error("unregistered type $r{type}");
272 146         322 $r{is_gap} = 0;
273              
274 146         437 @r{qw(ident cstart cend orient)} = @fields;
275 146 50       355 if($opt{validate_identifiers}) {
276 0 0       0 my $comp_type = identifier_namespace($r{ident})
277             or parse_error("cannot guess type of '$r{ident}'");
278             } else {
279 146 50       263 $r{ident} or parse_error("invalid identifier '$r{ident}'");
280             }
281              
282 146 50       311 str_in($r{orient},qw/+ - 0 na/)
283             or parse_error("orientation must be one of +,-,0,na");
284              
285 146 50 33     892 $r{cstart} >= 1 && $r{cend} > $r{cstart}
286             or parse_error("invalid component start and/or end ($r{cstart},$r{cend})");
287              
288 146         378 $r{length} = $r{cend}-$r{cstart}+1;
289              
290 146 50       315 $r{length} == $r{oend} - $r{ostart} + 1
291             or parse_error("distance between object start, end ($r{ostart},$r{oend}) does not agree with distance between component start, end ($r{cstart},$r{cend})");
292              
293 146         932 push @records, \%r;
294             } else {
295 0         0 parse_error("invalid component type '$r{type}', it should be one of {A D F G N O P U W}");
296             }
297             }
298              
299 6 50       15 return if $parse_error_flag;
300              
301             #otherwise, everything was well
302 6         214 return \@records;
303             }
304              
305              
306             =head2 agp_write
307              
308             Usage: agp_write($lines,$file);
309             Desc : writes a properly formatted AGP file
310             Args : arrayref of line records to write, with the line records being
311             in the same format as those returned by agp_parse above,
312             filename or filehandle to write to,
313             Ret : nothing meaningful
314              
315             Side Effects: dies on failure. if you gave it a filehandle, does
316             not close it
317             Example:
318              
319             =cut
320              
321             sub agp_write {
322 3     3 1 61459 my ($lines,$file) = @_;
323 3 50       18 $file or confess "must provide file to write to!\n";
324              
325             my $out_fh = is_filehandle($file) ? $file
326 3 50       10 : do {
327 0 0       0 open my $f,">$file" or croak "$! opening '$file' for writing";
328 0         0 $f
329             };
330              
331 3         12 foreach my $line (@$lines) {
332 323         607 print $out_fh agp_format_part( $line );
333             }
334              
335 3         12 return;
336             }
337              
338             =head2 agp_format_part( $record )
339              
340             Format a single AGP part line (string terminated with a newline) from
341             the given record hashref.
342              
343             =cut
344              
345             sub agp_format_part {
346 323     323 1 348 my ( $line ) = @_;
347              
348 323 100       710 return "#$line->{comment}\n" if $line->{comment};
349              
350             #and all other lines
351 321         324 my @fields = @{$line}{qw(objname ostart oend partnum type)};
  321         1048  
352 321 100       952 if( $line->{type} =~ /^[NU]$/ ) {
353 248         235 push @fields, @{$line}{qw(length gap_type linkage)},'';
  248         644  
354             } else {
355 73         73 push @fields, @{$line}{qw(ident cstart cend orient)};
  73         210  
356             }
357              
358 321         1683 return join("\t", @fields)."\n";
359             }
360              
361              
362             =head2 agp_contigs
363              
364             Usage: my @contigs = agp_contigs( agp_parse($agp_filename) );
365             Desc : extract and number contigs from a parsed AGP file
366             Args : arrayref of AGP lines, like those returned by agp_parse() above
367             Ret : list of contigs, in the same order as they occur in the
368             file, formatted as:
369             [ agp_line_hashref, agp_line_hashref, ... ],
370             [ agp_line_hashref, agp_line_hashref, ... ],
371             ...
372              
373             =cut
374              
375             sub agp_contigs {
376 3     3 1 88365 my $lines = shift;
377              
378 3         12 my @contigs = ([]);
379 3         9 foreach my $l (@$lines) {
380 323 100       620 next if $l->{comment};
381 321 100       895 if( $l->{typedesc} =~ /_gap$/ ) {
382 248 100       221 push @contigs,[] if @{$contigs[-1]};
  248         803  
383             } else {
384 73         67 push @{$contigs[-1]},$l;
  73         141  
385             }
386             }
387 3 100       6 pop @contigs if @{$contigs[-1]} == 0;
  3         13  
388 3         24 return @contigs;
389             }
390              
391             =head1 AUTHOR(S)
392              
393             Robert Buels
394              
395             Sheena Scroggins
396              
397             =cut
398              
399             ###
400             1;#do not remove
401             ###