File Coverage

blib/lib/Bio/ToolBox/parser/bed.pm
Criterion Covered Total %
statement 221 306 72.2
branch 75 144 52.0
condition 21 60 35.0
subroutine 26 32 81.2
pod 18 22 81.8
total 361 564 64.0


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::bed;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::parser::bed - Parser for BED-style formats
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::parser::bed;
11            
12             ### Quick-n-easy bed parser
13             my $bed = Bio::ToolBox::parser::bed->new('file.bed');
14            
15             ### Full-powered bed parser, mostly for bed12 functionality
16             my $bed = Bio::ToolBox::parser::bed->new(
17             file => 'regions.bed',
18             do_exon => 1,
19             do_cds => 1,
20             do_codon => 1,
21             );
22            
23             # what type of bed file is being parsed, determined when opening file
24             my $type = $bed->version; # returns narrowPeak, bedGraph, bed12, bed6, etc
25            
26             # Retrieve one feature or line at a time
27             my $feature = $bed->next_feature;
28              
29             # Retrieve array of all features
30             my @genes = $bed->top_features;
31            
32             # each returned feature is a SeqFeature object
33             foreach my $f ($bed->next_top_feature) {
34             printf "%s:%d-%d\n", $f->seq_id, $f->start, $f->end;
35             }
36            
37              
38             =head1 DESCRIPTION
39              
40             This is a parser for converting BED-style and related formats into SeqFeature objects.
41             File formats include the following.
42              
43             =over 4
44              
45             =item Bed
46              
47             L files may have 3-12 columns,
48             where the first 3-6 columns are basic information about the feature itself, and
49             columns 7-12 are usually for defining subfeatures of a transcript model, including
50             exons, UTRs (thin portions), and CDS (thick portions) subfeatures. This parser will
51             parse these extra fields as appropriate into subfeature SeqFeature objects. Bed files
52             are recognized with the file extension F<.bed>.
53              
54             =item Bedgraph
55              
56             L files are a type of
57             wiggle format in Bed format, where the 4th column is a score instead of a name. BedGraph
58             files are recognized by the file extension F<.bedgraph> or F<.bdg>.
59              
60             =item narrowPeak
61              
62             L files are a specialized
63             Encode variant of bed files with 10 columns (typically denoted as bed6+4), where the
64             extra 4 fields represent score attributes to a narrow ChIPSeq peak. These files are
65             parsed as a typical bed6 file, and the extra four fields are assigned to SeqFeature
66             attribute tags C, C, C, and C, respectively.
67             NarrowPeak files are recognized by the file extension F<.narrowPeak>.
68              
69             =item broadPeak
70              
71             L files, like narrowPeak,
72             are an Encode variant with 9 columns (bed6+3) representing a broad or extended interval
73             of ChIP enrichment without a single "peak". The extra three fields are assigned to
74             SeqFeature attribute tags C, C, and C, respectively.
75             BroadPeak files are recognized by the file extension F<.broadPeak>.
76              
77             =back
78              
79             C and C lines are generally ignored, although a C definition
80             line containing a C key will be interpreted if it matches one of the above file
81             types.
82              
83             =head2 SeqFeature default values
84              
85             The SeqFeature objects built from the bed file intervals will have some inferred defaults.
86              
87             =over 4
88              
89             =item Coordinate system
90              
91             SeqFeature objects use the 1-based coordinate system, per the specification of
92             L, so the 0-based start coordinates of bed files will always be
93             parsed into 1-based coordinates.
94              
95             =item C
96              
97             SeqFeature objects will use the name field (4th column in bed files), if present, as the
98             C. The SeqFeature object should default to the C if a name was
99             not provided.
100              
101             =item C
102              
103             It will use a concatenation of the sequence ID, start (original 0-based), and
104             stop coordinates as the C, for example 'chr1:0-100'.
105              
106             =item C
107              
108             Bed files don't have a concept of feature type (they're all the same type), so a
109             default C of 'region' is set. For bed12 files with transcript models,
110             the transcripts will be set to either 'mRNA' or 'ncRNA', depending on the presence of
111             interpreted CDS start and stop (thick coordinates).
112              
113             =item C
114              
115             Bed files don't have a concept of a source. The basename of the provided file is
116             therefore used to set the C.
117              
118             =item attribute tags
119              
120             Extra columns in the narrowPeak and broadPeak formats are assigned to attribute tags
121             as described above. The C values set in bed12 files are also set to an attribute tag.
122              
123             =back
124              
125             =head1 METHODS
126              
127             =head2 Initializing the parser object
128              
129             =over 4
130              
131             =item new
132              
133             Initiate a new Bed file parser object. Pass a single value (the bed file name) to
134             open the file for parsing. Alternatively, pass an array of key
135             value pairs to control how the table is parsed. These options are primarily for
136             parsing bed12 files with subfeatures. Options include the following.
137              
138             =over 4
139              
140             =item file
141              
142             Provide the path and file name for a Bed file. The file may be gzip compressed.
143              
144             =item source
145              
146             Pass a string to be added as the source tag value of the SeqFeature objects.
147             The default value is the basename of the file to be parsed.
148              
149             =item do_exon
150              
151             =item do_cds
152              
153             =item do_utr
154              
155             =item do_codon
156              
157             Pass a boolean (1 or 0) value to parse certain subfeatures, including C,
158             C, C, C, C, and C
159             features. Default is false.
160              
161             =item class
162              
163             Pass the name of a L compliant class that will be used to
164             create the SeqFeature objects. The default is to use L.
165              
166             =back
167              
168             =back
169              
170             =head2 Modify the parser object
171              
172             These methods set or retrieve parameters that modify parser functionality.
173              
174             =over 4
175              
176             =item source
177              
178             =item do_exon
179              
180             =item do_cds
181              
182             =item do_utr
183              
184             =item do_codon
185              
186             These methods retrieve or set parameters to the parsing engine, same as
187             the options to the new method.
188              
189             =item open_file
190              
191             Pass the name of a file to parse. This function is called automatically by the
192             L method if a filename was passed. This will open the file, check its format,
193             and set the parsers appropriately.
194              
195             =back
196              
197             =head2 Parser or file attributes
198              
199             These retrieve attributes for the parser or file.
200              
201             =over 4
202              
203             =item version
204              
205             This returns a string representation of the opened bed file format. For standard
206             bed files, it returns 'bed' followed by the number columns, e.g. C or C.
207             For recognized special bed variants, it will return C, C, or
208             C.
209              
210             =item fh
211              
212             Retrieves the file handle of the current file. This module uses
213             L objects. Be careful manipulating file handles of open files!
214              
215             =item typelist
216              
217             Returns a string representation of the type of SeqFeature types to be encountered in
218             the file. Currently this returns generic strings, 'mRNA,ncRNA,exon,CDS' for bed12
219             and 'region' for everything else.
220              
221             =back
222              
223             =head2 Feature retrieval
224              
225             The following methods parse the table lines into SeqFeature objects.
226             It is best if methods are not mixed; unexpected results may occur.
227              
228             For bed12 files, it will return a transcript model SeqFeature with appropriate subfeatures.
229              
230             =over 4
231              
232             =item next_feature
233              
234             This will read the next line of the table, parse it into a feature object, and
235             immediately return it.
236              
237             =item next_top_feature
238              
239             This method will first parse the entire file into memory. It will then return each
240             feature one at a time. Call this method repeatedly using a while loop to get all features.
241              
242             =item top_features
243              
244             This method is similar to L, but instead returns an array
245             of all the top features.
246              
247             =back
248              
249             =head2 Other methods
250              
251             Additional methods for working with the parser object and the parsed
252             SeqFeature objects.
253              
254             =over 4
255              
256             =item parse_file
257              
258             Parses the entire file into memory without returning any objects.
259              
260             =item find_gene
261              
262             my $gene = $bed->find_gene(
263             display_name => 'ABC1',
264             primary_id => 'chr1:123-456',
265             );
266              
267             Pass a feature name, or an array of key =E values (name, display_name,
268             ID, primary_ID, and/or coordinate information), that can be used
269             to find a feature already loaded into memory. Only really successful if the
270             entire table is loaded into memory. Features with a matching name are
271             confirmed by a matching ID or overlapping coordinates, if available.
272             Otherwise the first match is returned.
273              
274             =item comments
275              
276             This method will return an array of the comment, track, or browser lines that may have
277             been in the parsed file. These may or may not be useful.
278              
279             =item seq_ids
280              
281             Returns an array or array reference of the names of the chromosomes or
282             reference sequences present in the file. Must parse the entire file before using.
283              
284             =item seq_id_lengths
285              
286             Returns a hash reference to the chromosomes or reference sequences and
287             their corresponding lengths. In this case, the length is inferred by the
288             greatest feature end position. Must parse the entire file before using.
289              
290             =back
291              
292             =head1 SEE ALSO
293              
294             L, L, L,
295              
296              
297             =cut
298              
299 1     1   845 use strict;
  1         1  
  1         27  
300 1     1   4 use Carp qw(carp cluck croak confess);
  1         1  
  1         39  
301 1     1   478 use Bio::ToolBox::Data::Stream;
  1         3  
  1         2860  
302              
303             my $SFCLASS = 'Bio::ToolBox::SeqFeature';
304             eval "require $SFCLASS" or croak $@;
305              
306             1;
307              
308             sub new {
309 20     20 1 2759 my $class = shift;
310 20         240 my $self = {
311             'stream' => undef,
312             'fh' => undef,
313             'version' => undef,
314             'bed' => undef,
315             'convertor_sub' => undef,
316             'source' => '.',
317             'top_features' => [],
318             'do_gene' => 0,
319             'do_exon' => 0,
320             'do_cds' => 0,
321             'do_utr' => 0,
322             'do_codon' => 0,
323             'share' => 0,
324             'eof' => 0,
325             'line_count' => 0,
326             'seq_ids' => {},
327             };
328 20         50 bless $self, $class;
329            
330             # check for options
331 20 100       60 if (@_) {
332 12 50       47 if (scalar(@_) == 1) {
333             # short and sweet, just a file, we assume
334 0         0 my $file = shift @_;
335 0         0 $self->open_file($file);
336             }
337             else {
338 12         47 my %options = @_;
339 12 50 33     55 if (exists $options{file} or $options{table}) {
340 12   33     31 $options{file} ||= $options{table};
341 12         50 $self->open_file( $options{file} );
342             }
343 12 100       30 if (exists $options{do_exon}) {
344 4         12 $self->do_exon($options{do_exon});
345             }
346 12 50       26 if (exists $options{do_cds}) {
347 0         0 $self->do_cds($options{do_cds});
348             }
349 12 50       27 if (exists $options{do_utr}) {
350 0         0 $self->do_utr($options{do_utr});
351             }
352 12 50       24 if (exists $options{do_codon}) {
353 0         0 $self->do_codon($options{do_codon});
354             }
355 12 50       21 if (exists $options{source}) {
356 0         0 $self->source($options{source});
357             }
358 12 100       32 if (exists $options{class}) {
359 8         25 my $class = $options{class};
360 8 50       360 if (eval "require $class; 1") {
361 8         21 $SFCLASS = $class;
362             }
363             else {
364 0         0 croak $@;
365             }
366             }
367             }
368             }
369            
370             # done
371 20         63 return $self;
372             }
373              
374             sub version {
375 28     28 1 1133 return shift->{version};
376             }
377              
378             sub source {
379 106     106 1 136 my $self = shift;
380 106 100       178 if (@_) {
381 20         30 $self->{'source'} = shift;
382             }
383 106         231 return $self->{'source'};
384             }
385              
386             sub simplify {
387             # this doesn't do anything, for now, but maintain compatibility with gff parser
388 8     8 0 14 return 0;
389             }
390              
391             sub do_gene {
392             # this does nothing
393 12     12 0 1132 return 0;
394             }
395              
396             sub do_exon {
397             # this only pertains to bed-12 files
398 98     98 1 122 my $self = shift;
399 98 100       165 if (@_) {
400 8         16 $self->{'do_exon'} = shift;
401             }
402 98         211 return $self->{'do_exon'};
403             }
404              
405             sub do_cds {
406             # this only pertains to bed-12 files
407 62     62 1 76 my $self = shift;
408 62 50       111 if (@_) {
409 0         0 $self->{'do_cds'} = shift;
410             }
411 62         127 return $self->{'do_cds'};
412             }
413              
414             sub do_utr {
415             # this only pertains to bed-12 files
416 58     58 1 74 my $self = shift;
417 58 50       96 if (@_) {
418 0         0 $self->{'do_utr'} = shift;
419             }
420 58         113 return $self->{'do_utr'};
421             }
422              
423             sub do_codon {
424             # this only pertains to bed-12 files
425 62     62 1 78 my $self = shift;
426 62 50       108 if (@_) {
427 0         0 $self->{'do_codon'} = shift;
428             }
429 62         122 return $self->{'do_codon'};
430             }
431              
432             sub do_name {
433             # this does nothing
434 229     229 0 387 return 0;
435             }
436              
437             sub share {
438             # this does nothing as we do not parse genes, so no opportunity for sharing
439 229     229 0 413 return 0;
440             }
441              
442             sub fh {
443 346     346 1 2430 my $self = shift;
444 346         3023 return $self->{'fh'};
445             }
446              
447             sub open_file {
448 20     20 1 36 my $self = shift;
449            
450 20 50       55 if ($self->{stream}) {
451 0         0 confess " Cannot open a new file with the same Bed parser object!";
452             }
453            
454             # check file
455 20         37 my $filename = shift;
456 20 50       51 unless ($filename) {
457 0         0 cluck("no file name passed!");
458 0         0 return;
459             }
460            
461             # Open file
462             # we are using a Stream object for simplicity to handle comment lines,
463             # identify columns and structures, etc
464 20 50       150 my $Stream = Bio::ToolBox::Data::Stream->new(in => $filename) or
465             croak " cannot open file '$filename'!\n";
466 20         51 my $bed = $Stream->bed; # this is the number of bed columns observed
467 20 50       44 unless ($bed) {
468 0         0 croak " file '$filename' doesn't appear to be proper Bed format!\n";
469             }
470 20         32 $self->{bed} = $bed;
471            
472             # check for special formats
473 20 100       54 my $peak = $Stream->extension =~ /peak/i ? $bed : 0;
474 20 50 33     66 my $bdg = ($bed == 4 and $Stream->extension =~ /bdg|bedgraph/i) ? 1 : 0;
475            
476             # check for a track or browser definition line
477 20         54 foreach my $c ($Stream->comments) {
478             # these may or may not be present
479             # if so, we can check for a track type which may hint at what the file is
480 8 100       52 if ($c =~ /^track.+type=gappedpeak/i) {
    50          
    50          
    50          
    50          
    50          
481             # looks like we have a gappedPeak file
482 4         9 $peak = 15;
483             }
484             elsif ($c =~ /^track.+type=narrowpeak/i) {
485             # looks like we have a narrowPeak file
486 0         0 $peak = 10;
487             }
488             elsif ($c =~ /^track.+type=broadpeak/i) {
489             # looks like we have a broadPeak file
490 0         0 $peak = 9;
491             }
492             elsif ($c =~ /^track.+type=bedgraph/i) {
493             # this is unlikely to occur, but you never know
494 0         0 $bdg = 1;
495             }
496             elsif ($c =~ /^track.+type=bed\s/i) {
497             # obviously, but just in case
498             }
499             elsif ($c =~ /^track.+type=(\w+)\s/i) {
500             # something weird
501 0         0 printf " file track definition type of '%s' is unrecognized, proceeding as bed file\n", $1;
502             }
503             }
504            
505             # assign source
506 20         63 $self->source( $Stream->basename );
507            
508             # assign the parsing subroutine
509 20 100       85 if ($peak == 15) {
    100          
    50          
    50          
    100          
510             # gappedPeak file
511 4         10 $self->{version} = 'gappedPeak';
512 4         9 $self->{convertor_sub} = \&_parse_gappedPeak;
513 4         15 $self->do_exon(1); # always parse sub peaks as exons
514            
515             # gappedPeak is essentially bed12 with extra columns
516             # we will use existing code from the ucsc parser to convert bed12 to seqfeatures
517             # we need more object stuff that the ucsc parser expects
518 4         224 eval "require Bio::ToolBox::parser::ucsc;";
519 4         15 $self->{id2count} = {};
520 4         10 $self->{refseqsum} = {};
521 4         10 $self->{refseqstat} = {};
522 4         8 $self->{kgxref} = {};
523 4         9 $self->{ensembldata} = {};
524 4         7 $self->{gene2seqf} = {};
525 4         10 $self->{sfclass} = $SFCLASS;
526             }
527             elsif ($peak == 10) {
528             # narrowPeak file
529 4         8 $self->{version} = 'narrowPeak';
530 4         9 $self->{convertor_sub} = \&_parse_narrowPeak;
531             }
532             elsif ($peak == 9) {
533             # broadPeak file
534 0         0 $self->{version} = 'broadPeak';
535 0         0 $self->{convertor_sub} = \&_parse_broadPeak;
536             }
537             elsif ($bdg) {
538             # bedGraph file
539 0         0 $self->{version} = 'bedGraph';
540 0         0 $self->{convertor_sub} = \&_parse_bedGraph;
541             }
542             elsif ($bed > 6) {
543             # a gene table bed 12 format
544 6         14 $self->{version} = 'bed12';
545 6         13 $self->{convertor_sub} = \&_parse_bed12;
546            
547             # we will use existing code from the ucsc parser to convert bed12 to seqfeatures
548             # we need more object stuff that the ucsc parser expects
549 6         311 eval "require Bio::ToolBox::parser::ucsc;";
550 6         21 $self->{id2count} = {};
551 6         15 $self->{refseqsum} = {};
552 6         12 $self->{refseqstat} = {};
553 6         12 $self->{kgxref} = {};
554 6         12 $self->{ensembldata} = {};
555 6         13 $self->{gene2seqf} = {};
556 6         12 $self->{sfclass} = $SFCLASS;
557             }
558             else {
559             # an ordinary bed file
560 6         26 $self->{version} = sprintf("bed%d", $bed);
561 6         14 $self->{convertor_sub} = \&_parse_bed;
562             }
563            
564             # store stuff
565 20         38 $self->{stream} = $Stream; # keep this around, but probably won't use it....
566 20         37 $self->{fh} = $Stream->{fh};
567 20         49 return 1;
568             }
569              
570             sub typelist {
571 8     8 1 14 my $self = shift;
572 8 50       21 return unless $self->{stream};
573            
574 8 100       22 if ($self->version eq 'bed12') {
575             # return generic list based on what I could expect
576 2         6 return 'mRNA,ncRNA,exon,CDS';
577             }
578             else {
579             # return generic
580 6         16 return 'region';
581             }
582             }
583              
584             sub next_feature {
585 138     138 1 361 my $self = shift;
586            
587             # check that we have an open filehandle
588 138 50       216 unless ($self->fh) {
589 0         0 croak("no Bed file loaded to parse!");
590             }
591            
592             # loop through the file
593 138         216 while (my $line = $self->fh->getline) {
594 124         2445 chomp $line;
595 124 50 33     789 if ($line =~ /^#/ or $line =~ /^(?:track|browser)/ or $line !~ /\w+/) {
      33        
596 0         0 $self->{line_count}++;
597 0         0 next;
598             }
599 124         160 my $feature = &{$self->{convertor_sub}}($self, $line);
  124         243  
600 124         419 $self->{line_count}++;
601 124 50       201 unless ($feature) {
602 0         0 printf STDERR "unable to make feature for line %d!\n", $self->{line_count};
603 0         0 next;
604             }
605            
606             # return the object, we do this while loop once per valid line
607 124         302 return $feature;
608             }
609            
610             # presumed end of file
611 14         472 $self->{'eof'} = 1;
612 14         54 $self->fh->close;
613 14         263 return;
614             }
615              
616             sub next_top_feature {
617 36     36 1 46 my $self = shift;
618             # check that we have an open filehandle
619 36 50       44 unless ($self->fh) {
620 0         0 croak("no Bed file loaded to parse!");
621             }
622 36 50       85 unless ($self->{'eof'}) {
623 0 0       0 $self->parse_file or croak "unable to parse file!";
624             }
625 36         35 return shift @{ $self->{top_features} };
  36         76  
626             }
627              
628             sub top_features {
629 6     6 1 27 my $self = shift;
630 6 50       16 unless ($self->{'eof'}) {
631 6         20 $self->parse_file;
632             }
633 6         10 my @features = @{ $self->{top_features} };
  6         20  
634 6 50       26 return wantarray ? @features : \@features;
635             }
636              
637             *parse_table = \&parse_file;
638              
639             sub parse_file {
640 14     14 1 30 my $self = shift;
641             # check that we have an open filehandle
642 14 50       39 unless ($self->fh) {
643 0         0 confess("no file loaded to parse!");
644             }
645 14 50       43 return 1 if $self->{'eof'};
646            
647 14         36 printf " Parsing %s format file....\n", $self->version;
648            
649 14         91 while (my $feature = $self->next_feature) {
650             # there are possibly lots and lots of features here
651             # we are not going to bother checking IDs for uniqueness or sort them
652             # especially since we don't have to assign subfeatures to them
653 118         125 push @{ $self->{top_features} }, $feature;
  118         199  
654            
655             # check chromosome
656 118         235 my $s = $feature->seq_id;
657 118 100       356 unless (exists $self->{seq_ids}{$s}) {
658 18         53 $self->{seq_ids}{$s} = $feature->end;
659             }
660 118 100       221 $self->{seq_ids}{$s} = $feature->end if $feature->end > $self->{seq_ids}{$s};
661             }
662 14         43 return 1;
663             }
664              
665              
666             sub _parse_narrowPeak {
667 16     16   34 my ($self, $line) = @_;
668 16         79 my @data = split /\t/, $line;
669 16 50       34 unless (scalar(@data) == 10) {
670             croak sprintf("narrowPeak line %d '%s' doesn't have 10 elements!",
671 0         0 $self->{line_count}, $line);
672             }
673            
674             # generate the basic SeqFeature
675             my $feature = $SFCLASS->new(
676             -seq_id => $data[0],
677             -start => $data[1] + 1,
678             -end => $data[2],
679             -name => $data[3],
680             -score => $data[4],
681             -strand => $data[5],
682             -primary_tag => 'region',
683             -source => $self->{source},
684 16         170 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
685             );
686            
687             # add extra columns
688 16         53 $feature->add_tag_value('signalValue', $data[6]);
689 16         37 $feature->add_tag_value('pValue', $data[7]);
690 16         28 $feature->add_tag_value('qValue', $data[8]);
691 16         30 $feature->add_tag_value('peak', $data[9]);
692            
693 16         32 return $feature;
694             }
695              
696             sub _parse_broadPeak {
697 0     0   0 my ($self, $line) = @_;
698 0         0 my @data = split /\t/, $line;
699 0 0       0 unless (scalar(@data) == 9) {
700             croak sprintf("broadPeak line %d '%s' doesn't have 9 elements!",
701 0         0 $self->{line_count}, $line);
702             }
703            
704             # generate the basic SeqFeature
705             my $feature = $SFCLASS->new(
706             -seq_id => $data[0],
707             -start => $data[1] + 1,
708             -end => $data[2],
709             -name => $data[3],
710             -score => $data[4],
711             -strand => $data[5],
712             -primary_tag => 'region',
713             -source => $self->{source},
714 0         0 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
715             );
716            
717             # add extra columns
718 0         0 $feature->add_tag_value('signalValue', $data[6]);
719 0         0 $feature->add_tag_value('pValue', $data[7]);
720 0         0 $feature->add_tag_value('qValue', $data[8]);
721            
722 0         0 return $feature;
723             }
724              
725             sub _parse_bedGraph {
726 0     0   0 my ($self, $line) = @_;
727 0         0 my @data = split /\t/, $line;
728 0 0       0 unless (scalar(@data) == 9) {
729             croak sprintf("bedGraph line %d '%s' doesn't have 4 elements!",
730 0         0 $self->{line_count}, $line);
731             }
732            
733             # generate the basic SeqFeature
734             return $SFCLASS->new(
735             -seq_id => $data[0],
736             -start => $data[1] + 1,
737             -end => $data[2],
738             -score => $data[3],
739             -primary_tag => 'region',
740             -source => $self->{source},
741 0         0 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
742             );
743             }
744              
745             sub _parse_bed {
746 22     22   42 my ($self, $line) = @_;
747 22         62 my @data = split /\t/, $line;
748 22 50       51 unless (scalar(@data) == $self->{bed}) {
749             croak sprintf("Bed line %d '%s' doesn't have %d elements!",
750 0         0 $self->{line_count}, $line, $self->{bed});
751             }
752            
753             # generate the basic SeqFeature
754             return $SFCLASS->new(
755             -seq_id => $data[0],
756             -start => $data[1] + 1,
757             -end => $data[2],
758             -name => $data[3] || undef,
759             -score => $data[4] || undef,
760             -strand => $data[5] || undef,
761             -primary_tag => 'region',
762             -source => $self->{source},
763 22   50     240 -primary_id => sprintf("%s:%d-%d", $data[0], $data[1], $data[2]),
      50        
      50        
764             );
765             }
766              
767             sub _parse_bed12 {
768 70     70   117 my ($self, $line) = @_;
769 70         235 my @data = split /\t/, $line;
770 70 50       149 unless (scalar(@data) == $self->{bed}) {
771             croak sprintf("Bed line %d '%s' doesn't have %d elements!",
772 0         0 $self->{line_count}, $line, $self->{bed});
773             }
774            
775             # we will take advantage of pre-existing code in the UCSC parser to convert
776             # a bed12 line to a fully-fledged processed transcript
777             # we just have to go through a genePred format first
778             # fortunately, the two are pretty similar in structure
779            
780             # now convert from bed12
781             # Chromosome Start End Name Score Strand thickStart thickEnd itemRGB blockCount blockSizes blockStarts
782             # to genePred
783             # name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds
784            
785             # add missing data
786 70   33     119 $data[6] ||= $data[2]; # cdsStart
787 70   33     102 $data[7] ||= $data[2]; # cdsEnd
788 70   50     191 $data[8] ||= 0; # rgb values
789 70   50     102 $data[9] ||= 1; # exonCount
790 70   33     92 $data[10] ||= $data[7] - $data[6]; # block size
791 70   100     105 $data[11] ||= 0; # block starts
792            
793             # calculate exons
794 70         124 my @exonSizes = split(',', $data[10]);
795 70         154 my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
  366         508  
796 70         115 my @exonEnds;
797 70         136 for (my $i = 0; $i < $data[9]; $i++) {
798 366         551 push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
799             }
800            
801             # calculate new genePred elements
802 70         319 my @new = (
803             $data[3], # name
804             $data[0], # chrom
805             $data[5], # strand
806             $data[1], # txStart
807             $data[2], # txStop
808             $data[6], # cdsStart
809             $data[7], # cdsEnd
810             $data[9], # exonCount
811             join(',', @exonStarts), # exonStarts
812             join(',', @exonEnds), # exonEnds
813             );
814            
815             # create builder
816 70         274 my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
817 70         209 my $feature = $builder->build_transcript;
818 70         184 $feature->add_tag_value('itemRGB', $data[8]);
819 70         248 $feature->score($data[4]);
820 70         461 $feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
821             # change the primary ID to match other bed file behavior, not UCSC files'
822 70         493 return $feature;
823             }
824              
825             sub _parse_gappedPeak {
826 16     16   32 my ($self, $line) = @_;
827 16         64 my @data = split /\t/, $line;
828 16 50       41 unless (scalar(@data) == 15) {
829             croak sprintf("GappedPeak line %d '%s' doesn't have 15 elements!",
830 0         0 $self->{line_count}, $line);
831             }
832            
833             # we will take advantage of pre-existing code in the UCSC parser to convert
834             # a gappedPeak line into main peak with subpeaks.
835             # we just have to go through a genePred format first
836             # fortunately, the two are pretty similar in structure
837            
838             # calculate exons, er, sub peaks
839 16         34 my @exonSizes = split(',', $data[10]);
840 16         34 my @exonStarts = map {$data[1] + $_} split(',', $data[11]);
  41         83  
841 16         22 my @exonEnds;
842 16         38 for (my $i = 0; $i < $data[9]; $i++) {
843 41         75 push @exonEnds, $exonStarts[$i] + $exonSizes[$i];
844             }
845            
846             # calculate new genePred elements
847 16         71 my @new = (
848             $data[3], # name
849             $data[0], # chrom
850             $data[5], # strand
851             $data[1], # txStart
852             $data[2], # txStop
853             $data[6], # cdsStart
854             $data[7], # cdsEnd
855             $data[9], # exonCount
856             join(',', @exonStarts), # exonStarts
857             join(',', @exonEnds), # exonEnds
858             );
859            
860             # create builder and process
861 16         85 my $builder = Bio::ToolBox::parser::ucsc::builder->new(join("\t", @new), $self);
862 16         46 my $feature = $builder->build_transcript;
863            
864             # clean up feature and add extra values
865 16         45 $feature->add_tag_value('itemRGB', $data[8]);
866 16         37 $feature->score($data[4]);
867 16         39 $feature->primary_tag('region'); # it is not a RNA
868 16         85 $feature->primary_id(sprintf("%s:%d-%d", $data[0], $data[1], $data[2]));
869             # change the primary ID to match other bed file behavior, not UCSC files'
870 16         37 $feature->add_tag_value('signalValue', $data[12]);
871 16         56 $feature->add_tag_value('pValue', $data[13]);
872 16         32 $feature->add_tag_value('qValue', $data[14]);
873 16         99 return $feature;
874             }
875              
876             sub seq_ids {
877 0     0 1 0 my $self = shift;
878 0 0       0 unless (scalar keys %{$self->{seq_ids}}) {
  0         0  
879 0         0 $self->_get_seq_ids;
880             }
881 0         0 my @s = keys %{$self->{seq_ids}};
  0         0  
882 0 0       0 return wantarray ? @s : \@s;
883             }
884              
885              
886             sub seq_id_lengths {
887 0     0 1 0 my $self = shift;
888 0 0       0 unless (scalar keys %{$self->{seq_ids}}) {
  0         0  
889 0         0 $self->_get_seq_ids;
890             }
891 0         0 return $self->{seq_ids};
892             }
893              
894             sub _get_seq_ids {
895 0     0   0 my $self = shift;
896 0 0       0 return unless $self->{'eof'};
897 0         0 foreach (@{ $self->{top_features} }) {
  0         0  
898 0         0 my $s = $_->seq_id;
899 0 0       0 unless (exists $self->{seq_ids}{$s}) {
900 0         0 $self->{seq_ids}{$s} = 1;
901             }
902 0 0       0 $self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
903             }
904             }
905              
906             sub comments {
907 0     0 1 0 my $self = shift;
908 0 0       0 return unless $self->{stream};
909 0         0 return $self->{stream}->comments;
910             }
911              
912              
913             sub find_gene {
914 38     38 1 6768 my $self = shift;
915            
916             # check that we have id2seqf table
917             # we lazy load this as it might not be needed every time
918 38 100       73 unless (exists $self->{id2seqf}) {
919 10 50       29 croak "must parse file first!" unless $self->{'eof'};
920 10         25 $self->{id2seqf} = {};
921 10         14 foreach (@{ $self->{top_features} }) {
  10         26  
922 86         138 my $name = lc $_->display_name;
923 86 50       216 if (exists $self->{id2seqf}->{$name}) {
924 0         0 push @{ $self->{id2seqf}->{$name} }, $_;
  0         0  
925             }
926             else {
927 86         145 $self->{id2seqf}->{$name} = [$_];
928             }
929             }
930             }
931            
932             # get the name and coordinates from arguments
933 38         52 my ($name, $id, $chrom, $start, $end, $strand);
934 38 50       77 if (scalar @_ == 0) {
    100          
935 0         0 carp "must provide information to find_gene method!";
936 0         0 return;
937             }
938             elsif (scalar @_ == 1) {
939 6         9 $name = $_[0];
940             }
941             else {
942 32         53 my %opt = @_;
943 32   0     54 $name = $opt{name} || $opt{display_name} || undef;
944 32   0     47 $id = $opt{id} || $opt{primary_id} || undef;
945 32   50     95 $chrom = $opt{chrom} || $opt{seq_id} || undef;
946 32   50     98 $start = $opt{start} || undef;
947 32   50     92 $end = $opt{stop} || $opt{end} || undef;
948 32   50     69 $strand = $opt{strand} || 0;
949             }
950 38 50       62 unless ($name) {
951 0         0 carp "name is required for find_gene!";
952 0         0 return;
953             }
954            
955             # check if a gene with this name exists
956 38 50       102 if (exists $self->{id2seqf}->{lc $name} ) {
957             # we found a matching gene
958            
959             # pull out the gene seqfeature(s) array reference
960             # there may be more than one gene
961 38         67 my $genes = $self->{id2seqf}->{ lc $name };
962            
963             # go through a series of checks to find the appropriate
964 38 100       54 if ($id) {
965 32         41 foreach my $g (@$genes) {
966 32 50       72 if ($g->primary_id eq $id) {
967 32         51 return $g;
968             }
969             }
970 0         0 return; # none of these matched despite having an ID
971             }
972 6 0 33     14 if ($chrom and $start and $end) {
      33        
973 0         0 foreach my $g (@$genes) {
974 0 0 0     0 if (
      0        
975             # overlap method borrowed from Bio::RangeI
976             ($g->strand == $strand) and not (
977             $g->start > $end or
978             $g->end < $start
979             )
980             ) {
981             # gene and transcript overlap on the same strand
982             # we found the intersecting gene
983 0         0 return $g;
984             }
985             }
986 0         0 return; # none of these matched despite having coordinate info
987             }
988 6 50       14 if (scalar @$genes == 1) {
    0          
989             # going on trust here that this is the one
990 6         15 return $genes->[0];
991             }
992             elsif (scalar @$genes > 1) {
993 0           carp "more than one gene named $name found!";
994 0           return $genes->[0];
995             }
996            
997             # nothing suitable found
998 0           return;
999             }
1000             }
1001              
1002              
1003             __END__