File Coverage

blib/lib/Bio/ToolBox/Data/file.pm
Criterion Covered Total %
statement 365 638 57.2
branch 192 486 39.5
condition 65 211 30.8
subroutine 25 27 92.5
pod 19 21 90.4
total 666 1383 48.1


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::file;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::file - File functions to Bio:ToolBox::Data family
7              
8             =head1 DESCRIPTION
9              
10             File methods for reading and writing data files for both L
11             and L objects. This module should not be used
12             directly. See the respective modules for more information.
13              
14             =cut
15              
16 4     4   22 use strict;
  4         5  
  4         107  
17 4     4   15 use Carp qw(carp cluck croak confess);
  4         5  
  4         141  
18 4     4   16 use File::Basename qw(fileparse);
  4         6  
  4         159  
19 4     4   1389 use File::Which;
  4         3190  
  4         157  
20 4     4   1013 use IO::File;
  4         13190  
  4         23640  
21              
22             # List of acceptable filename extensions
23             our $SUFFIX = qr/\.(?:txt|gff3?|gtf|bed|bg|bdg|bedgraph|sgr|kgg|cdt|vcf|narrowpeak|broadpeak|gappedpeak|reff?lat|genepred|ucsc|maf)(?:\.gz|\.bz2)?/i;
24              
25             # gzip application
26             our $gzip_app;
27             our $bgzip_app;
28              
29             ### The True Statement
30             1;
31              
32              
33              
34             ### Load new version data table from file
35              
36             sub load_file {
37 10     10 1 18 my $self = shift;
38 10         15 my $file = shift;
39 10   50     32 my $noheader = shift || 0; # may not be present
40            
41             # check that we have an empty table
42 10 50 33     31 if ($self->last_row != 0 or $self->number_columns != 0 or $self->filename) {
      33        
43 0         0 carp "Cannot load file onto an existing data table!";
44 0         0 return;
45             }
46            
47             # open the file and load metadata
48 10         48 my $filename = $self->check_file($file);
49 10 50       33 unless ($filename) {
50 0         0 print " file '$file' does not exist!\n";
51 0         0 return;
52             }
53 10         40 $self->add_file_metadata($filename);
54 10 50       35 $self->open_to_read_fh or return;
55 10         53 $self->parse_headers($noheader);
56            
57             # Load the data table
58 10         142 while (my $line = $self->{fh}->getline) {
59             # the current file position should be at the beginning of the
60             # data table information
61 198 50       3376 next if $line !~ m/\w+/;
62            
63             # skip comment and empty lines
64 198 50       323 if (substr($line,0,1) eq '#') {
65 0         0 $self->add_comment($line);
66 0         0 next;
67             }
68            
69             # process the line
70 198         302 $self->add_data_line($line);
71             }
72            
73             # completed loading the file
74 10         288 $self->close_fh;
75 10         175 delete $self->{fh};
76            
77             # verify the structure
78 10 50       44 return unless $self->verify;
79            
80             # finished
81 10         31 return 1;
82             }
83              
84             sub taste_file {
85 24     24 1 1752 my $self = shift;
86 24         37 my $file = shift;
87 24 50       57 my $filename = $self->check_file($file) or return;
88 24         74 my $Taste = $self->new;
89 24         58 $Taste->add_file_metadata($filename);
90 24 50       49 $Taste->open_to_read_fh or return;
91 24         75 $Taste->parse_headers;
92            
93             # load first 10 data lines
94 24         45 $Taste->{data_table}->[0] = $Taste->{'column_names'}; # set table header names
95 24         59 for (my $i = 1; $i <= 10; $i++) {
96 192 100       303 my $line = $Taste->fh->getline or last;
97 180 50       4929 next if $line !~ m/\w+/;
98 180 50       253 next if $line =~ /^#/;
99 180         234 $Taste->add_data_line($line);
100             }
101 24         366 $Taste->close_fh;
102 24         418 $Taste->verify(1); # silently check the integrity of the file
103            
104             # check existing metadata
105 24 100       47 if ($Taste->gff) {
    100          
    50          
106 4         10 return ('gff', $Taste->format);
107             }
108             elsif ($Taste->bed) {
109 12         23 return ('bed', $Taste->format);
110             }
111             elsif ($Taste->ucsc) {
112 0         0 return ('ucsc', $Taste->format);
113             }
114            
115            
116             # check if the number of columns match a known format, then verify
117 8         18 my $number = $Taste->number_columns;
118 8 50       61 if ($number == 9) {
    50          
    50          
    50          
    50          
    100          
119             # possibly a GFF file
120 0         0 $Taste->gff(2);
121 0         0 $Taste->verify(1);
122 0 0       0 if ($Taste->gff == 2) {
123 0         0 return ('gff', $Taste->format);
124             }
125             }
126             elsif ($number == 10) {
127             # possibly a genePred file
128 0         0 $Taste->ucsc(10);
129 0         0 $Taste->verify(1);
130 0 0       0 return 'ucsc' if $Taste->ucsc == 10;
131 0         0 $Taste->add_ucsc_metadata(10,1); # force metadata
132 0         0 $Taste->verify(1);
133 0 0       0 if ($Taste->ucsc == 10) {
134 0         0 return ('ucsc', $Taste->format);
135             }
136             }
137             elsif ($number == 11) {
138             # possibly a refFlat file
139 0         0 $Taste->ucsc(11);
140 0         0 $Taste->verify(1);
141 0 0       0 return 'ucsc' if $Taste->ucsc == 11;
142 0         0 $Taste->add_ucsc_metadata(11,1); # force metadata
143 0         0 $Taste->verify(1);
144 0 0       0 if ($Taste->ucsc == 11) {
145 0         0 return ('ucsc', $Taste->format);
146             }
147             }
148             elsif ($number == 12) {
149             # possibly a knownGene or BED12 file
150 0         0 $Taste->ucsc(12);
151 0         0 $Taste->verify(1);
152 0 0       0 if ($Taste->ucsc == 12) {
153 0         0 return ('ucsc', $Taste->format);
154             }
155 0         0 $Taste->bed(12);
156 0         0 $Taste->verify(1);
157 0 0       0 if ($Taste->bed == 12) {
158 0         0 return ('bed', $Taste->format);
159             }
160 0         0 $Taste->add_ucsc_metadata(12,1); # force metadata
161 0         0 $Taste->verify(1);
162 0 0       0 if ($Taste->ucsc == 12) {
163 0         0 return ('ucsc', $Taste->format);
164             }
165 0         0 $Taste->add_bed_metadata(12,1); # force metadata
166 0         0 $Taste->verify(1);
167 0 0       0 if ($Taste->bed == 12) {
168 0         0 return ('bed', $Taste->format);
169             }
170             }
171             elsif ($number == 15) {
172             # possibly a genePredExt file
173 0         0 $Taste->ucsc(15);
174 0         0 $Taste->verify(1);
175 0 0       0 if ($Taste->ucsc == 15) {
176 0         0 return ('ucsc', $Taste->format);
177             }
178 0         0 $Taste->add_ucsc_metadata(15,1); # force metadata
179 0         0 $Taste->verify(1);
180 0 0       0 if ($Taste->ucsc == 15) {
181 0         0 return ('ucsc', $Taste->format);
182             }
183             }
184             elsif ($number == 16) {
185             # possibly a genePredExt file
186 3         7 $Taste->ucsc(16);
187 3         6 $Taste->verify(1);
188 3 50       7 if ($Taste->ucsc == 16) {
189 0         0 return ('ucsc', $Taste->format);
190             }
191 3 50       7 return 'ucsc' if $Taste->ucsc == 16;
192 3         15 $Taste->add_ucsc_metadata(16,1); # force metadata
193 3         9 $Taste->verify(1);
194 3 50       7 if ($Taste->ucsc == 16) {
195 3         7 return ('ucsc', $Taste->format);
196             }
197             }
198 5         49 return;
199             }
200              
201             sub sample_gff_type_list {
202 11     11 1 23 my ($self, $file) = @_;
203 11 50       66 return unless $file =~ /\.g[tf]f\d?(?:\.gz)?$/i; # assume extension is accurate
204 11 50       33 my $fh = $self->open_to_read_fh($file) or die "can't open $file!\n";
205 11         19 my %types;
206 11         14 my $count = 0;
207 11         32 while ($count < 1000) {
208 1288 100       14087 my $line = $fh->getline or last;
209 1277 50       21394 next if $line !~ m/\w+/;
210 1277 100       1716 next if $line =~ /^#/;
211 1241         2729 my @fields = split('\t', $line);
212 1241         1469 $types{ $fields[2] } += 1;
213 1241         2174 $count++;
214             }
215 11         310 $fh->close;
216 11         301 return join(',', keys %types);
217             }
218              
219              
220             sub parse_headers {
221 60     60 1 81 my $self = shift;
222 60   50     178 my $noheader = shift || 0; # boolean to indicate no headers are present
223            
224             # filehandle
225 60         62 my $fh;
226 60 50 33     206 if (exists $self->{fh} and $self->{fh}) {
    0          
227 60         72 $fh = $self->{fh};
228             }
229             elsif ($self->filename) {
230 0         0 $fh = $self->open_to_read_fh($self->filename);
231             }
232             else {
233 0         0 confess " file metadata and/or open filehandle must be set before parsing headers!";
234             }
235            
236             # check that we have an empty table
237 60 50 33     145 if ($self->last_row != 0 or $self->number_columns != 0) {
238 0         0 cluck "Cannot parse file headers onto an existing data table!";
239 0         0 return;
240             }
241            
242             # read and parse the file
243             # we will ONLY parse the header lines prefixed with a #, as well as the
244             # first data row which contains the column names
245 60         158 $self->program(undef); # reset this to blank, it will be filled by file metadata
246 60         72 my $header_line_count = 0;
247 60         1207 my $line = $fh->getline; # first line
248            
249             # check the first line for proper line endings
250             {
251 60         4884 my $line2 = $line;
  60         89  
252 60         109 chomp($line2);
253 60 50       190 if ($line2 =~ /[\r\n]+/) {
254 0         0 my $filename = $self->filename;
255 0         0 die "File '$filename' does not have expected line endings!\n" .
256             " Try converting to native line endings and try again\n";
257             }
258             }
259            
260 60         114 while ($line) {
261             # we are not chomping the line here because of possible side effects
262             # with UCSC tables where we have to count elements in the first line
263             # and potential header lines, and the first line has a null value at
264             # the end
265            
266             # Parse the datafile metadata headers
267            
268             # no real line, just empty space
269 137 50       2152 if ($line !~ m/\w+/) {
    50          
    100          
    100          
    50          
    100          
    50          
    100          
    100          
270 0         0 $header_line_count++;
271 0         0 next;
272             }
273            
274             # the generating program
275             elsif ($line =~ m/^# Program (.+)$/) {
276 0         0 my $p = $1;
277 0         0 $self->program($p);
278 0         0 $header_line_count++;
279             }
280            
281             # the source database
282             elsif ($line =~ m/^# Database (.+)$/) {
283 10         24 my $d = $1;
284 10         29 $self->database($d);
285 10         12 $header_line_count++;
286             }
287            
288             # the type of feature in this datafile
289             elsif ($line =~ m/^# Feature (.+)$/) {
290 10         19 my $f = $1;
291 10         23 $self->feature($f);
292 10         12 $header_line_count++;
293             }
294            
295             # column or dataset specific information
296             elsif ($line =~ m/^# Column_(\d+)/) {
297             # the column number will become the index number
298 0         0 my $index = $1;
299 0         0 $self->add_column_metadata($line, $index);
300 0         0 $header_line_count++;
301             }
302            
303             # gff version header
304             elsif ($line =~ /^##gff-version\s+([\d\.]+)$/) {
305             # store the gff version in the hash
306             # this may or may not be present in the gff file, but want to keep
307             # it if it is
308 5         13 my $g = $1;
309 5         28 $self->gff($g);
310 5         7 $header_line_count++;
311             }
312            
313             # VCF version header
314             elsif ($line =~ /^##fileformat=VCFv([\d\.]+)$/) {
315             # store the VCF version in the hash
316             # this may or may not be present in the vcf file, but want to keep
317             # it if it is
318 0         0 my $v = $1;
319 0         0 $self->vcf($v);
320 0         0 $self->add_comment($line); # so that it is written properly
321 0         0 $header_line_count++;
322             }
323            
324             # any other nonstandard header
325             elsif ($line =~ /^#/) {
326 44         156 $self->add_comment($line);
327 44         47 $header_line_count++;
328             }
329            
330             # a track or browser line
331             elsif ($line =~ /^(?:track|browser)\s+/i) {
332             # common with wig, bed, or bedgraph files for use with UCSC genome browser
333             # treat as a comment line, there's not that much useful info here
334 8 50       28 if ($line =~ /type=(\w+)/i) {
335 8         30 $self->format($1);
336             }
337 8         21 $self->add_comment($line);
338 8         8 $header_line_count++;
339             }
340            
341             # the remainder is the data table itself
342             else {
343             # the first row in the data table are (usually) the column names
344             # we only want the names, not the rest of the table
345            
346             # specific file formats have implicit predefined column formats
347             # these file formats do NOT have column headers
348             # we will first check for those file formats and process accordingly
349            
350             ### Data tables with a commented header line
351 60 50       205 if ( $self->_commented_header_line($line) ) {
352             # these will have one comment line marked with #
353             # that really contains the column headers
354            
355             # process the real header line
356 0         0 $self->add_standard_metadata( pop @{ $self->{'comments'} } );
  0         0  
357             }
358             # we will continue here in case the commented header line was part of a
359             # formatted file type, which will be checked by extension or possibly
360             # the format (if determined) below
361 60   66     198 my $format = $self->format || $self->extension;
362            
363             ### a GFF file
364 60 100       320 if ($format =~ /g[tf]f/i) {
    100          
    100          
    50          
    50          
365 5         17 $self->add_gff_metadata;
366             }
367            
368             ### a peak file
369             elsif ($format =~ /peak/i) {
370 16         32 my $count = scalar(split '\t', $line);
371 16         70 $self->add_peak_metadata($count);
372             }
373            
374             ### a Bed or BedGraph file
375             elsif ($format =~ /bg|bdg|bed/i) {
376 26         69 my $count = scalar(split '\t', $line);
377 26         92 $self->add_bed_metadata($count);
378             }
379            
380             ### a UCSC gene table
381             elsif ($format =~ /ref+lat|genepred|ucsc/i) {
382 0         0 my $count = scalar(split '\t', $line);
383 0         0 $self->add_ucsc_metadata($count);
384             }
385            
386             ### a SGR file
387             elsif ($format =~ /sgr/i) {
388 0         0 $self->add_sgr_metadata;
389             }
390            
391             ### standard text file with headers, i.e. everything else
392 60 100       138 unless ($self->number_columns) {
393             # we have not yet parsed the row of data column names
394             # we will do so now
395 13         36 $self->add_standard_metadata($line);
396            
397             # count as a header line
398 13         19 $header_line_count++;
399             }
400             }
401            
402             # keep processing by going to the next line until we have identified columns
403 137 100       253 if ($self->number_columns == 0) {
404 77         1117 $line = $fh->getline;
405             }
406             else {
407 60         126 undef $line;
408             }
409             }
410            
411             # if we didn't find columns, check that it wasn't actually commented
412             # for example, an empty vcf file
413 60 0 50     99 if ($self->number_columns == 0 and
      0        
414 0         0 scalar(@{ $self->{comments} }) and
415             $self->{comments}->[-1] =~ /\t/
416             ) {
417             # process the real header line
418 0         0 $self->add_standard_metadata( pop @{ $self->{'comments'} } );
  0         0  
419             }
420            
421            
422             # No header was requested
423 60 0 33     115 if ($noheader and not $self->bed and not $self->gff and
      33        
      0        
      0        
424             not $self->vcf and not $self->ucsc
425             ) {
426             # which means that what we used as a header is actually the first data row
427            
428             # fix the column names
429 0         0 for (my $i = 0; $i < $self->number_columns; $i++) {
430 0         0 my $name = $self->name($i);
431 0         0 $self->name($i, "Column$i ($name)");
432 0         0 $self->{$i}{'AUTO'} = 3;
433             }
434             # adjust metadata
435 0         0 $header_line_count -= 1;
436 0         0 $self->{'headers'} = -1; # special case, we never write headers here
437             }
438            
439             # Header sanity check
440             # some exported file formats, such as from R, do not include a proper
441             # header for the first column, as these are assumed to be row names
442             # this will result in incorrectly parsed files where the last columns
443             # will be merged into one column with an internal tab - not good
444             # need to handle these
445 60         984 my $nextline = $fh->getline;
446 60 50       1032 if ($nextline) {
447 60         213 my @nextdata = split '\t', $nextline;
448 60 50       150 if (scalar(@nextdata) - 1 == $self->number_columns) {
449             # whoops! we caught a off-by-one discrepancy between header and data row
450 0         0 my $old_last = $self->last_column;
451 0         0 $fh->close; # having this open complicates changing columns....
452            
453             # add a new "column" (just metadata for now) and move it to the beginning
454 0         0 my $i = $self->add_column('Column1');
455 0         0 $self->reorder_column($i, 0 .. $old_last);
456             }
457 60 100       148 if (ref($self) eq 'Bio::ToolBox::Data::Stream') {
458             # store an example first line for Stream objects
459 26         36 chomp $nextdata[-1];
460 26         50 $self->{example} = \@nextdata;
461             }
462             }
463            
464             # re-open the file
465             # I tried using seek functions - but they don't work with binary gzip
466             # files, and I can't get the seek function to return the same position
467             # as simply advancing through the file like below
468             # so I'll just do it the old way and close/open and advance
469 60 50       210 $fh->close if $fh; # may have been closed from the sanity check above
470 60         1231 $fh = $self->open_to_read_fh;
471 60         191 for (1 .. $header_line_count) {
472 90         2383 my $line = $fh->getline;
473             }
474 60         870 $self->{header_line_count} = $header_line_count;
475 60         93 $self->{fh} = $fh;
476 60         96 return 1;
477             }
478              
479              
480             sub add_data_line {
481 574     574 1 691 my ($self, $line) = @_;
482            
483             # do not chomp the line yet, just split into an array
484 574         2010 my @linedata = split '\t', $line, $self->{number_columns};
485            
486             # chomp the last element
487             # we do this here to ensure the tab split above gets all of the values
488             # otherwise trailing null values aren't included in @linedata
489 574         743 chomp $linedata[-1];
490            
491             # check for extra remaining tabs
492 574 50       876 if (index($linedata[-1], "\t") != -1) {
493 0         0 die "FILE INCONSISTENCY ERRORS! line has additional tabs (columns) than headers!\n $line";
494             }
495            
496             # add the line of data
497 574         632 push @{ $self->{data_table} }, \@linedata;
  574         917  
498 574         656 $self->{last_row} += 1;
499 574         3152 return 1;
500             }
501              
502              
503             ### Parse the filename using the list suffix list
504             sub add_file_metadata {
505 70     70 1 143 my ($self, $filename) = @_;
506 70 50       134 confess "no valid filename!" unless defined $filename;
507 70         2033 my ($basename, $path, $extension) = fileparse($filename, $SUFFIX);
508 70 50       173 unless ($extension) {
509             # look for a nonstandard extension, allowing for .gz extension
510 0 0       0 if ($filename =~ /(\.\w+(?:\.gz)?)$/i) {
511 0         0 $extension = $1;
512 0         0 $basename =~ s/$extension\Z//;
513             }
514             }
515 70         131 $self->{filename} = $filename;
516 70         95 $self->{basename} = $basename;
517 70         107 $self->{path} = $path;
518 70         114 $self->{extension} = $extension;
519             }
520              
521              
522             ### Write out a data file
523             sub write_file {
524 8     8 1 15 my $self = shift;
525            
526             # collect passed arguments
527 8         13 my %args;
528 8 100       21 if (scalar(@_) == 1) {
529 7         16 $args{'filename'} = $_[0];
530             }
531             else {
532 1         4 %args = @_;
533             }
534 8   0     22 $args{'filename'} ||= $args{'file'} || undef;
      33        
535 8   50     36 $args{'format'} ||= undef;
536 8 50       19 unless (exists $args{'gz'}) {$args{'gz'} = undef}
  8         14  
537            
538             # check the data
539 8 50       29 unless ($self->verify) {
540 0         0 cluck "bad data structure!";
541 0         0 return;
542             }
543            
544             # check filename
545 8 50 33     25 unless ($args{'filename'} or $self->filename) {
546 0         0 cluck "no filename given!\n";
547 0         0 return;
548             }
549            
550             # split filename into its base components
551             my ($name, $path, $extension) =
552 8   33     217 fileparse($args{'filename'} || $self->filename, $SUFFIX);
553            
554             # Adjust filename extension if necessary
555 8 50       96 if ($extension =~ /(g[tf]f)/i) {
    100          
    50          
    50          
    50          
    50          
    0          
556 0 0       0 if (not $self->gff) {
557             # let's set it to true and see if it passes verification
558 0 0       0 $self->{'gff'} = $extension =~ /gtf/i ? 2.5 : 3; # default
559 0 0 0     0 unless ($self->verify and $self->gff) {
560 0         0 warn " GFF structure changed, re-setting extension from $extension to .txt\n";
561 0         0 $extension =~ s/g[tf]f3?/txt/i;
562             }
563             }
564             }
565             elsif ($extension =~ /bedgraph|bed|bdg|narrowpeak|broadpeak/i) {
566 3 100       11 if (not $self->bed) {
567             # let's set it to true and see if it passes verification
568 1         2 $self->{'bed'} = 1; # a fake true
569 1 50 33     2 unless ($self->verify and $self->bed) {
570 0         0 warn " BED structure changed, re-setting extension from $extension to .txt\n";
571 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
572             }
573             }
574             }
575             elsif ($extension =~ /vcf/i) {
576 0 0       0 if (not $self->vcf) {
577             # let's set it to true and see if it passes verification
578 0         0 $self->{'vcf'} = 1; # a fake true
579 0 0 0     0 unless ($self->verify and $self->vcf) {
580 0         0 warn " VCF structure changed, re-setting extension from $extension to .txt\n";
581 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
582             }
583             }
584             }
585             elsif ($extension =~ /sgr/i) {
586 0 0       0 unless ($self->{'extension'} =~ /sgr/i) {
587             # original file was not SGR
588             # let's pretend it was and see if still passes
589             # the sgr verification relies on the recorded extension
590 0         0 $self->{'extension'} = '.sgr';
591 0         0 $self->verify;
592 0 0       0 if ($self->extension =~ /txt/) {
593 0         0 warn " SGR structure changed, re-setting extension from $extension to .txt\n";
594             }
595 0         0 $extension = $self->{'extension'};
596             }
597             }
598             elsif ($extension =~ /reff?lat|genepred|ucsc/i) {
599 0 0       0 if ($self->ucsc != $self->number_columns) {
600             # it's not set as a ucsc data
601             # let's set it to true and see if it passes verification
602 0         0 $self->ucsc($self->number_columns);
603 0 0 0     0 unless ($self->verify and $self->ucsc) {
604 0         0 warn " UCSC structure changed, re-setting extension from $extension to .txt\n";
605 0 0       0 $extension = $extension =~ /gz$/i ? '.txt.gz' : '.txt';
606             }
607             }
608             }
609             elsif ($extension =~ /txt/i) {
610             # plain old text file, sounds good to me
611             # make sure headers are enabled
612 5 50       23 $self->{'headers'} = 1 unless $self->{'headers'} == -1; # original noheader
613             }
614             elsif (not $extension) {
615             # no extension was available
616             # try and determine one from metadata
617            
618 0 0       0 if ($self->gff) {
    0          
    0          
    0          
    0          
    0          
619 0 0       0 $extension = $self->gff == 3 ? '.gff3' : $self->gff == 2.5 ? '.gtf' : '.gff';
    0          
620             }
621             elsif ($self->bed) {
622 0 0 0     0 if ($self->format) {
    0          
623             # re-use the format value as the extension
624 0         0 $extension = sprintf(".%s", $self->format);
625             }
626             elsif (
627             $self->number_columns == 4 and
628             $self->name(3) =~ /score/i
629             ) {
630 0         0 $extension = '.bdg'; # looks like a bedGraph file
631             }
632             else {
633 0         0 $extension = '.bed'; # a regular bed file
634             }
635             }
636             elsif ($self->ucsc) {
637 0 0       0 if ($self->format) {
638             # re-use the format value as the extension
639 0         0 $extension = sprintf(".%s", $self->format);
640             }
641             else {
642             # use a generic ucsc format
643 0         0 $extension = '.ucsc';
644             }
645             }
646             elsif ($self->vcf) {
647 0         0 $extension = '.vcf';
648             }
649             elsif ($name =~ /(\.\w{3}(?:\.gz)?)$/i) {
650             # a non-standard 3 letter file extension
651             # anything else might be construed as part of the filename, so run the
652             # risk of adding a default extension below
653 0         0 $extension = $1;
654 0         0 $name =~ s/$extension\Z//;
655             }
656             elsif ($self->extension) {
657             # original file had an extension, re-use it if appropriate
658             # why wouldn't this get picked up above???? probably old cruft,
659             # or a non-standard or unknown file extension
660             # leave it in for the time being, shouldn't hurt anything
661 0 0       0 if ($self->extension =~ /g[tf]f/i) {
    0          
662 0 0       0 $extension = $self->gff ? $self->extension : '.txt';
663             }
664             elsif ($self->extension =~ /bed|bdg|peak/i) {
665 0 0       0 $extension = $self->bed ? $self->extension : '.txt';
666             }
667             else {
668             # an unstructured format
669 0         0 $extension = $self->extension;
670             }
671             }
672             else {
673             # normal data text file
674 0         0 $extension = '.txt';
675             }
676             }
677             # otherwise the extension must be good, hope for the best
678            
679             # determine format
680             # this is an arcane specification of whether we want a "simple" no metadata
681             # format, or an ordinary text format that may or may not have metadata
682             # it's currently not hurting much, so leave it in for now?
683 8 50       17 unless ($args{'format'}) {
684 8 50       20 if (defined $args{'simple'}) {
    50          
685             # an old method of specifying simple
686 0         0 $args{'format'} = 'simple';
687             }
688             elsif ($extension) {
689             # check extension from the parsed filename, if present
690 8 50       27 if ($extension =~ /sgr|cdt/i) {
691             # sgr is simple format, no headers
692 0         0 $args{'format'} = 'simple';
693             }
694             else {
695             # everything else is text
696 8         17 $args{'format'} = 'text';
697             }
698             }
699             else {
700             # somehow we got this far without defining? use default text
701 0         0 $args{'format'} = 'text';
702             }
703             }
704            
705             # check zip status if necessary
706 8 50       18 unless (defined $args{'gz'}) {
707             # look at filename extension as a clue
708             # in case we're overwriting the input file, keep the zip status
709 8 50       26 if ($extension =~ m/\.vcf\.gz/i) {
    50          
710             # vcf requires bgzip
711 0         0 $args{'gz'} = 2;
712             }
713             elsif ($extension =~ m/\.gz$/i) {
714 0         0 $args{'gz'} = 1;
715             }
716             else {
717 8         12 $args{'gz'} = 0; # default
718             }
719             }
720            
721             # adjust gzip extension as necessary
722 8 50 33     46 if ($args{'gz'} and $extension !~ m/\.gz$/i) {
    50 33        
723 0         0 $extension .= '.gz';
724             }
725             elsif (not $args{'gz'} and $extension =~ /\.gz$/i) {
726 0         0 $extension =~ s/\.gz$//i;
727             }
728            
729             # check filename length
730             # assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
731             # don't even get me started on Windows NTFS path length limitation
732 8 50       25 if (length($name . $extension) > 255) {
733 0         0 my $limit = 253 - length($extension);
734 0         0 $name = substr($name, 0, $limit) . '..';
735 0         0 warn " filename too long! Truncating to $limit characters\n";
736             }
737            
738             # generate the new filename
739 8         17 my $newname = $path . $name . $extension;
740            
741            
742             # Convert strand information
743 8         31 my $strand_i = $self->strand_column;
744 8 0 0     18 if (defined $strand_i and ($self->gff or $self->bed or $self->ucsc) ) {
      33        
745             # convert to +/-/. nomenclature as necessary
746 0 0 0     0 if ($self->gff) {
    0          
747 0         0 for my $row (1 .. $self->last_row) {
748 0         0 my $s = $self->{'data_table'}->[$row][$strand_i];
749 0 0       0 if ($s =~ /\d/) {
750 0 0       0 $s = $s == 1 ? '+' : $s == -1 ? '-' : '.';
    0          
751             }
752 0         0 $self->{'data_table'}->[$row][$strand_i] = $s;
753             }
754             }
755             elsif ($self->bed or $self->ucsc) {
756 0         0 for my $row (1 .. $self->last_row) {
757 0         0 my $s = $self->{'data_table'}->[$row][$strand_i];
758 0 0       0 if ($s =~ /\d/) {
759 0 0       0 $s = $s >= 0 ? '+' : '-';
760             }
761 0         0 $self->{'data_table'}->[$row][$strand_i] = $s;
762             }
763             }
764             }
765            
766            
767             # Open file for writing
768 8         41 my $fh = $self->open_to_write_fh($newname, $args{'gz'});
769 8 50       30 return unless defined $fh;
770            
771            
772             # Write the headers
773 8 50       27 if ($args{'format'} eq 'text') {
774             # default text format has metadata headers
775            
776             # write gff statement if gff format
777 8 50       49 if ($self->gff) {
778 0         0 $fh->print('##gff-version ' . $self->gff . "\n");
779             }
780            
781             # Write the primary headers
782 8 50 66     19 unless (
      66        
      66        
      33        
783             $self->gff or $self->bed or $self->ucsc or $self->vcf or
784             $extension =~ m/sgr|kgg|cdt|peak/i
785             ) {
786             # we only write these for normal text files, not defined format files
787            
788 5 50       18 if ($self->program) {
789 0         0 $fh->print('# Program ' . $self->program . "\n");
790             }
791 5 50       17 if ($self->database) {
792 5         18 $fh->print('# Database ' . $self->database . "\n");
793             }
794 5 50       89 if ($self->feature) {
795 5         13 $fh->print('# Feature ' . $self->feature . "\n");
796             }
797             }
798            
799             # Write the miscellaneous headers
800 8         26 foreach ( @{ $self->{'comments'} } ) {
  8         23  
801             # write remaining miscellaneous header lines if present
802             # we do this for all files
803 6 100       34 unless (/\n$/s) {
804             # append newline if not present
805 4         7 $_ .= "\n";
806             }
807             # check for comment character at beginning
808 6 100       12 if (/^#/) {
809 5         16 $fh->print($_);
810             }
811             else {
812 1         4 $fh->print("# " . $_);
813             }
814             }
815            
816             # Write the column metadata headers
817 8         32 for (my $i = 0; $i < $self->number_columns; $i++) {
818             # each column metadata in the hash is referenced by the column's
819             # index number as the key
820             # we will take each index one at a time in increasing order
821            
822             # some files do not need or tolerate metadata lines, for those
823             # known files the metadata lines will be skipped
824            
825             # these column metadata lines do not need to be written if they
826             # only have two values, presumably name and index, for files
827             # that don't normally have column headers, e.g. gff
828 22 100 66     47 if (
    50 0        
    0 0        
829             exists $self->{$i}{'AUTO'} and
830 8         19 scalar( keys %{ $self->{$i} } ) == $self->{$i}{'AUTO'}
831             ) {
832             # some of the metadata values were autogenerated and
833             # we have the same number of keys as were autogenerated
834             # no need to write these
835 8         16 next;
836             }
837 14         31 elsif (scalar( keys %{ $self->{$i} } ) == 2) {
838             # only two metadata keys exist, name and index
839             # these are so simple it's not worth writing them
840 14         30 next;
841             }
842             elsif ($extension =~ /sgr|kgg|cdt/i or $self->ucsc or $self->vcf) {
843             # these do not support metadata lines
844 0         0 next;
845             }
846            
847             # we will put each key=value pair into @pairs, listed asciibetically
848 0         0 my @pairs; # an array of the key value pairs from the metadata hash
849             # put name first
850             # we are no longer writing the index number
851 0         0 push @pairs, 'name=' . $self->{$i}{'name'};
852             # put remainder in alphabetical order
853 0         0 foreach (sort {$a cmp $b} keys %{ $self->{$i} } ) {
  0         0  
  0         0  
854 0 0       0 next if $_ eq 'name'; # already written
855 0 0       0 next if $_ eq 'index'; # internal use only
856 0 0       0 next if $_ eq 'AUTO'; # internal use only
857 0         0 push @pairs, $_ . '=' . $self->{$i}{$_};
858             }
859            
860             # Finally write the header line, joining the pairs with a
861             # semi-colon into a single string.
862             # The column identifier is comprised of the word 'Column'
863             # and the index number joined by '_'.
864 0         0 $fh->print("# Column_$i ", join(";", @pairs), "\n");
865             }
866             }
867            
868            
869             # Write the table column headers
870 8 100       18 if ($self->{'headers'} == 1) {
871 5         6 $fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[0] }));
  5         22  
872             }
873            
874            
875             # Write the data table
876 8 50       49 if ($args{'format'} eq 'simple') {
877            
878             # the simple format will strip the non-value '.' from the table
879 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
880             # we will step though the data_table array one row at a time
881             # convert the non-value '.' to undefined
882             # and print using a tab-delimited format
883 0         0 my @linedata;
884 0         0 foreach ( @{ $self->{'data_table'}[$i] }) {
  0         0  
885 0 0       0 if ($_ eq '.') {
886 0         0 push @linedata, undef;
887             } else {
888 0         0 push @linedata, $_;
889             }
890             }
891 0         0 $fh->printf("%s\n", join("\t", @linedata));
892             }
893             }
894            
895             else {
896             # normal data files
897 8         23 for (my $i = 1; $i <= $self->last_row; $i++) {
898             # we will step though the data_table array one row at a time
899             # we will join each row's array of elements into a string to print
900             # using a tab-delimited format
901 104         107 $fh->printf("%s\n", join("\t", @{ $self->{'data_table'}[$i] }));
  104         201  
902             }
903             }
904            
905             # done writing
906 8         33 $fh->close;
907            
908             # if we made it this far, it should've been a success!
909             # return the new file name as indication of success
910 8         1287 return $newname;
911             }
912              
913             sub save {
914 6     6 1 31 return shift->write_file(@_);
915             }
916              
917              
918             #### Open a file for reading
919             sub open_to_read_fh {
920 161     161 1 254 my $self = shift;
921 161   100     433 my $file = shift || undef;
922 161 100       470 my $obj = ref($self) =~ /^Bio::ToolBox/ ? 1 : 0;
923            
924             # check file
925 161 100 66     417 if (not $file and $obj) {
926 120   50     299 $file = $self->filename || undef;
927             }
928 161 50       273 unless ($file) {
929 0         0 carp " no filename provided or associated with object!";
930 0         0 return;
931             }
932            
933             # Open filehandle object as appropriate
934 161         177 my $fh;
935 161 50       471 if ($file =~ /\.gz$/i) {
    50          
936             # the file is compressed with gzip
937 0 0       0 $fh = IO::File->new("gzip -dc $file |") or
938             carp "unable to read '$file' $!\n";
939             }
940             elsif ($file =~ /\.bz2$/i) {
941             # the file is compressed with bzip2
942 0 0       0 $fh = IO::File->new("bzip2 -dc $file |") or
943             carp "unable to read '$file' $!\n";
944             }
945             else {
946             # the file is uncompressed and space hogging
947 161 50       660 $fh = IO::File->new($file, 'r') or
948             carp "unable to read '$file' $!\n";
949             }
950            
951 161 100       13251 if ($obj) {
952 120         235 $self->{fh} = $fh;
953             }
954 161         465 return $fh;
955             }
956              
957              
958             #### Open a file for writing
959             sub open_to_write_fh {
960 10     10 1 20 my ($self, $filename, $gz, $append) = @_;
961            
962             # check filename
963 10 50       21 unless ($filename) {
964 0         0 carp " no filename to write!";
965 0         0 return;
966             }
967            
968             # check filename length
969             # assuming a maximum of 256, at least on Mac with HFS+, don't know about Linux
970 10         101 my $name = fileparse($filename);
971 10 50       24 if (length $name > 255) {
972 0         0 carp " filename is too long! please shorten\n";
973 0         0 return;
974             }
975            
976             # check zip status if necessary
977 10 100       22 unless (defined $gz) {
978             # look at filename extension as a clue
979             # in case we're overwriting the input file, keep the zip status
980 2 50       11 if ($filename =~ m/\.vcf(\.gz)?$/i) {
    50          
981 0         0 $gz = 2; # bgzip
982             }
983             elsif ($filename =~ m/\.gz$/i) {
984 0         0 $gz = 1; # regular gzip
985             }
986             else {
987 2         3 $gz = 0; # default
988             }
989             }
990            
991             # gzip compression application
992 10 50 33     51 if ($gz == 1 and not $gzip_app) {
    50 33        
993             # use parallel gzip if possible
994             # this is stored in a global variable so we only have to look once
995 0         0 $gzip_app = which('pigz');
996 0 0       0 if ($gzip_app) {
997 0         0 $gzip_app .= ' -p 3'; # use a conservative 3 processes, plus perl, so 4 total
998             }
999             else {
1000             # default is the standard gzip application
1001             # should be available in any application
1002 0         0 $gzip_app = which('gzip');
1003             }
1004 0 0       0 unless ($gzip_app) {
1005 0         0 croak "no gzip application in PATH to open compressed file handle output!\n";
1006             }
1007             }
1008             elsif ($gz == 2 and not $bgzip_app) {
1009             # use parallel bgzip if possible
1010             # this is stored in a global variable so we only have to look once
1011 0         0 $bgzip_app = which('bgzip');
1012 0 0       0 if ($bgzip_app) {
1013             # I'm going to assume this is a recent bgzip with multi-threading
1014             # use 3 threads, same as with pigz
1015 0         0 $bgzip_app .= ' -@ 3 -c';
1016             }
1017 0 0       0 unless ($bgzip_app) {
1018 0         0 croak "no bgzip application in PATH to open compressed file handle output!\n";
1019             }
1020             }
1021 10 50       25 my $gzipper = $gz == 1 ? $gzip_app : $gz == 2 ? $bgzip_app : undef;
    50          
1022            
1023             # check file append mode
1024 10 100       17 unless (defined $append) {
1025             # default is not to append
1026 8         11 $append = 0;
1027             }
1028            
1029            
1030             # Generate appropriate filehandle object
1031 10         20 my $fh;
1032 10 100 66     45 if (not $gzipper and not $append) {
    50 33        
    50 33        
    0 0        
1033 8 50       41 $fh = IO::File->new($filename, 'w') or
1034             carp "cannot write to file '$filename' $!\n";
1035             }
1036             elsif ($gzipper and !$append) {
1037 0 0       0 $fh = IO::File->new("| $gzipper >$filename") or
1038             carp "cannot write to compressed file '$filename' $!\n";
1039             }
1040             elsif (not $gzipper and $append) {
1041 2 50       23 $fh = IO::File->new(">> $filename") or
1042             carp "cannot append to file '$filename' $!\n";
1043             }
1044             elsif ($gzipper and $append) {
1045 0 0       0 $fh = IO::File->new("| $gzipper >>$filename") or
1046             carp "cannot append to compressed file '$filename' $!\n";
1047             }
1048 10 50       1421 return $fh if defined $fh;
1049             }
1050              
1051              
1052             ### Subroutine to check for file existance
1053             sub check_file {
1054 60     60 1 107 my ($self, $filename) = @_;
1055            
1056             # check for file existance
1057 60 50       1168 if (-e $filename) {
1058             # confirmed full filename and path
1059 60         263 return $filename;
1060             }
1061             else {
1062             # file name is either incomplete or non-existent
1063             # try adding some common file extensions in case those are missing
1064 0         0 my $new_filename;
1065 0         0 foreach my $ext (qw(.gz .txt .txt.gz .bed .bed.gz)) {
1066 0 0       0 if (-e $filename . $ext) {
1067 0         0 $new_filename = $filename . $ext;
1068 0         0 last;
1069             }
1070             }
1071 0         0 return $new_filename;
1072             }
1073             }
1074              
1075              
1076             sub fh {
1077 192     192 0 242 my $self = shift;
1078 192 50       2579 return $self->{fh} if exists $self->{fh};
1079 0         0 return;
1080             }
1081              
1082             sub close_fh {
1083 70     70 0 214 my $self = shift;
1084 70 50 33     376 $self->{fh}->close if (exists $self->{fh} and $self->{fh});
1085             }
1086              
1087              
1088             ### Internal subroutine to check if a comment line contains headers
1089             sub _commented_header_line {
1090 60     60   114 my ($data, $line) = @_;
1091            
1092             # prepare arrays from the other lines and current line
1093 60         76 my @commentdata;
1094 60 100       64 if ( scalar @{ $data->{'comments'} } >= 1 ) {
  60         136  
1095             # take the last line in the other array
1096 30         122 @commentdata = split '\t', $data->{'comments'}->[-1];
1097             }
1098 60         205 my @linedata = split '\t', $line;
1099            
1100             # check if the counts are equal
1101 60 50       117 if (scalar @commentdata == scalar @linedata) {
1102 0         0 return 1;
1103             }
1104             else {
1105 60         182 return 0;
1106             }
1107             }
1108              
1109              
1110             ### Internal subroutine to process metadata for standard columns
1111             sub add_column_metadata {
1112 0     0 1 0 my ($data, $line, $index) = @_;
1113            
1114             # strip the Column metadata identifier
1115 0         0 chomp $line;
1116 0         0 $line =~ s/^# Column_\d+ //;
1117            
1118             # break up the column metadata
1119 0         0 my %temphash; # a temporary hash to put the column metadata into
1120 0         0 foreach (split ';', $line) {
1121 0         0 my ($key, $value) = split '=';
1122 0 0       0 if ($key eq 'index') {
1123 0 0       0 if ($index != $value) {
1124             # the value from the metadata index key should be
1125             # correct, so we will use that
1126 0         0 $index = $value;
1127             }
1128             }
1129             # store the key & value
1130 0         0 $temphash{$key} = $value;
1131             }
1132            
1133             # create a index metadata key if not already present
1134             # the rest of biotoolbox may expect this to be present
1135 0 0       0 unless (exists $temphash{'index'}) {
1136 0         0 $temphash{'index'} = $index;
1137             }
1138            
1139             # store the column metadata hash into the main data hash
1140             # use the index as the key
1141 0 0       0 if (exists $data->{$index}) {
1142             # we will simply overwrite the previous metadata hash
1143             # harsh, I know, but what to do?
1144             # if it was canned metadata for a gff file, that's ok
1145 0         0 warn "Warning: more than one metadata line exists for index $index!\n";
1146 0         0 $data->{$index} = \%temphash;
1147             }
1148             else {
1149             # metadata hash doesn't exist, so we will add it
1150 0         0 $data->{$index} = \%temphash;
1151             }
1152 0         0 return 1;
1153             }
1154              
1155             ### Subroutine to generate metadata for gff files
1156             # gff files have nine defined columns
1157             # there are different specifications and variants:
1158             # gff (v.1), gff v.2, gff v.2.5 (aka gtf), gff v.3 (gff3)
1159             # however, the columns are the same in all versions
1160             # more info on gff can be found http://gmod.org/wiki/GFF3
1161             sub add_gff_metadata {
1162 5     5 1 7 my $self = shift;
1163 5   50     16 my $version = shift || undef;
1164 5   50     14 my $force = shift || 0;
1165            
1166             # set the gff version based on the extension if it isn't already
1167 5 50 33     12 if (not($self->gff) or $force) {
1168 0 0       0 if (defined $version) {
    0          
    0          
1169 0         0 $self->gff($version);
1170             }
1171             elsif ($self->extension =~ /gtf/i) {
1172 0         0 $self->gff(2.5);
1173             }
1174             elsif ($self->extension =~ /gff3/i) {
1175 0         0 $self->gff(3);
1176             }
1177             else {
1178 0         0 $self->gff(2); # hope for the best
1179             }
1180             }
1181             # set format based on version
1182 5 50       17 if (not $self->format) {
1183 5         15 my $v = $self->gff;
1184 5 0       17 my $f = $v == 3 ? 'gff3' : $v > 2 ? 'gtf' : 'gff';
    50          
1185 5         20 $self->format($f);
1186             }
1187            
1188             # set the metadata for the each column
1189             # some of these may already be defined if there was a
1190             # column metadata specific column in the file
1191 5         30 my $column_names = $self->standard_column_names('gff');
1192 5         14 for (my $i = 0; $i < 9; $i++) {
1193             # loop for each column
1194             # set metadata unless it's already loaded
1195 45 50 33     112 if ($force or not exists $self->{$i}) {
1196 45         84 $self->{$i}{'name'} = $column_names->[$i];
1197 45         54 $self->{$i}{'index'} = $i;
1198 45         49 $self->{$i}{'AUTO'} = 3;
1199             }
1200             # assign the name to the column header
1201 45 50 33     97 if ($force or not defined $self->{'column_names'}->[$i]) {
1202 45         78 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1203             }
1204             }
1205 5         16 $self->{data_table}->[0] = $self->{'column_names'};
1206            
1207             # set column number always to 9
1208 5 50 33     18 if ($force or $self->{'number_columns'} == 0) {
1209 5         7 $self->{'number_columns'} = 9;
1210             }
1211            
1212             # set headers flag to false
1213 5 50       16 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1214            
1215             # set the feature type
1216 5 50       9 unless (defined $self->{'feature'}) {
1217 5         14 $self->{'feature'} = 'region';
1218             }
1219 5         11 return 1;
1220             }
1221              
1222              
1223             ### Subroutine to generate metadata for BED files
1224             # bed files have a loose format
1225             # they require a minimum of 3 columns, and have a max of 12
1226             # 3, 6, and 12 column files are the most common
1227             # there are also something called paired bed files floating around
1228             # The official details and specifications may be found at
1229             # http://genome.ucsc.edu/FAQ/FAQformat#format1
1230              
1231             # a special type of bed file is the bedgraph, using
1232             # either a bdg, bedgraph, or simply bed extension
1233             # these only have four columns, no more, no less
1234             # the fourth column is score, not name
1235             sub add_bed_metadata {
1236 27     27 1 49 my ($self, $column_count) = @_;
1237 27   50     56 my $force = shift || 0;
1238            
1239             # check bed type and set metadata appropriately
1240 27         36 my $bed_names;
1241 27 50 33     68 if ($self->format =~ /bedgraph/i or $self->extension =~ /bg|bdg|graph/i) {
1242 0         0 $self->format('bedGraph'); # possibly redundant
1243 0         0 $self->bed($column_count);
1244 0         0 $bed_names = $self->standard_column_names('bdg');
1245             }
1246             else {
1247 27         63 $self->format('bed');
1248 27         84 $self->bed($column_count);
1249 27         66 $bed_names = $self->standard_column_names('bed12');
1250             }
1251 27         51 $self->{'number_columns'} = $column_count;
1252 27         36 $self->{'zerostart'} = 1;
1253            
1254             # set the metadata for each column
1255             # some of these may already be defined if there was a
1256             # column metadata specific column in the file
1257 27         70 for (my $i = 0; $i < $column_count; $i++) {
1258             # loop for each column
1259             # set name unless it already has one from metadata
1260 200 50 33     276 if ($force or not exists $self->{$i}) {
1261 200   50     401 $self->{$i}{'name'} = $bed_names->[$i] || 'extraColumn';
1262 200         260 $self->{$i}{'index'} = $i;
1263 200         237 $self->{$i}{'AUTO'} = 3;
1264             }
1265             # assign the name to the column header
1266 200 50 33     277 if ($force or not defined $self->{'column_names'}->[$i]) {
1267 200         347 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1268             }
1269             }
1270 27         49 $self->{data_table}->[0] = $self->{'column_names'};
1271              
1272             # set the feature type
1273 27 50       50 unless (defined $self->{'feature'}) {
1274 27         35 $self->{'feature'} = 'region';
1275             }
1276            
1277             # set headers flag to false
1278 27 50       75 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1279 27         57 return 1;
1280             }
1281              
1282              
1283             ### Subroutine to generate metadata for broadpeak and narrowpeak files
1284             # three different types of peak files are available
1285             # see http://genome.ucsc.edu/FAQ/FAQformat.html
1286             sub add_peak_metadata {
1287 16     16 1 38 my ($self, $column_count) = @_;
1288 16   50     28 my $force = shift || 0;
1289              
1290             # check bed type and set metadata appropriately
1291             # most of these are bed6 plus extra columns
1292 16         19 my $column_names;
1293 16 100 66     31 if ($self->format =~ /narrow/i or $self->extension =~ /narrow/i) {
    50 33        
    50 33        
1294 8         19 $self->format('narrowPeak');
1295 8         20 $self->bed($column_count);
1296 8         17 $column_names = $self->standard_column_names('narrowpeak');
1297             }
1298             elsif ($self->format =~ /broad/i or $self->extension =~ /broad/i) {
1299 0         0 $self->format('broadPeak'); # possibly redundant
1300 0         0 $self->bed($column_count);
1301 0         0 $column_names = $self->standard_column_names('broadpeak');
1302             }
1303             elsif ($self->format =~ /gapped/i or $self->extension =~ /gapped/i) {
1304 8         22 $self->format('gappedPeak'); # possibly redundant
1305 8         19 $self->bed($column_count);
1306 8         18 $column_names = $self->standard_column_names('gappedpeak');
1307             }
1308             else {
1309             # how did we get here???? Hope for the best.....
1310 0         0 $self->bed($column_count);
1311 0         0 $column_names = $self->standard_column_names('bed12');
1312             }
1313 16         27 $self->{'number_columns'} = $column_count;
1314 16         26 $self->{'zerostart'} = 1;
1315            
1316             # add metadata
1317 16         30 for (my $i = 0; $i < $column_count; $i++) {
1318 200 50 33     278 if ($force or not exists $self->{$i}) {
1319 200   50     378 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1320 200         297 $self->{$i}{'index'} = $i;
1321 200         256 $self->{$i}{'AUTO'} = 3;
1322             }
1323             # assign the name to the column header
1324 200 50 33     264 if ($force or not defined $self->{'column_names'}->[$i]) {
1325 200         322 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1326             }
1327             }
1328 16         28 $self->{data_table}->[0] = $self->{'column_names'};
1329            
1330             # set the feature type
1331 16 50       33 unless (defined $self->{'feature'}) {
1332 16         18 $self->{'feature'} = 'region';
1333             }
1334            
1335             # set headers flag to false
1336 16 50       46 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1337 16         42 return 1;
1338             }
1339              
1340              
1341             ### Subroutine to generate metadata for various UCSC gene files
1342             # these are tricky, as we will try to determine contents by counting
1343             # the columns, which may not be accurate
1344             # not only that, but this presumes the extension even makes it recognizable
1345             # see http://genome.ucsc.edu/FAQ/FAQformat.html#format9 for details
1346             # also biotoolbox script ucsc_table2gff3.pl
1347             sub add_ucsc_metadata {
1348 3     3 1 6 my ($self, $column_count) = @_;
1349 3   50     8 my $force = shift || 0;
1350            
1351             # set metadata
1352 3         4 $self->{'number_columns'} = $column_count;
1353 3         4 $self->{'ucsc'} = $column_count;
1354            
1355             # set format and determine column names;
1356 3         5 my $column_names;
1357 3 50       5 if ($column_count == 16) {
    0          
    0          
    0          
    0          
1358 3         7 $self->format('genePredExt');
1359 3         6 $column_names = $self->standard_column_names('ucsc16');
1360             }
1361             elsif ($column_count == 15) {
1362 0         0 $self->format('genePredExt');
1363 0         0 $column_names = $self->standard_column_names('ucsc15');
1364             }
1365             elsif ($column_count == 12) {
1366 0         0 $self->format('knownGene');
1367 0         0 $column_names = $self->standard_column_names('ucsc12');
1368             }
1369             elsif ($column_count == 11) {
1370 0         0 $self->format('refFlat');
1371 0         0 $column_names = $self->standard_column_names('ucsc11');
1372             }
1373             elsif ($column_count == 10) {
1374 0         0 $self->format('genePred');
1375 0         0 $column_names = $self->standard_column_names('ucsc10');
1376             }
1377 3         5 $self->{'zerostart'} = 1;
1378            
1379             # assign the column names and metadata
1380 3         6 for (my $i = 0; $i < $column_count; $i++) {
1381             # loop for each column
1382             # set name unless it already has one from metadata
1383 48 50 33     65 if ($force or not exists $self->{$i}) {
1384 48   50     73 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1385 48         48 $self->{$i}{'index'} = $i;
1386 48         49 $self->{$i}{'AUTO'} = 3;
1387             }
1388             # assign the name to the column header
1389 48 50 33     58 if ($force or not defined $self->{'column_names'}->[$i]) {
1390 48         67 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1391             }
1392             }
1393 3         5 $self->{data_table}->[0] = $self->{'column_names'};
1394            
1395             # set the feature type
1396 3 50       6 unless (defined $self->{'feature'}) {
1397 3         38 $self->{'feature'} = 'gene';
1398             }
1399            
1400             # set headers flag to false
1401 3 50       9 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1402 3         6 return 1;
1403             }
1404              
1405              
1406             ### Subroutine to generate metadata for SGR files
1407             # a sgr file contains three columns: chromo, position, score
1408             # this is a very simple file format, useful in exporting and
1409             # importing to binary BAR files used in T2, USeq, and IGB
1410             sub add_sgr_metadata {
1411 0     0 1 0 my $self = shift;
1412 0   0     0 my $force = shift || 0;
1413            
1414             # set column metadata
1415 0         0 my $column_names = $self->standard_column_names('sgr');
1416 0         0 for (my $i = 0; $i < 3; $i++) {
1417             # loop for each column
1418             # set name unless it already has one from metadata
1419 0 0 0     0 if ($force or not exists $self->{$i}) {
1420 0   0     0 $self->{$i}{'name'} = $column_names->[$i] || 'extraColumn';
1421 0         0 $self->{$i}{'index'} = $i;
1422 0         0 $self->{$i}{'AUTO'} = 3;
1423             }
1424             # assign the name to the column header
1425 0 0 0     0 if ($force or not defined $self->{'column_names'}->[$i]) {
1426 0         0 $self->{'column_names'}->[$i] = $self->{$i}{'name'};
1427             }
1428             }
1429 0         0 $self->{data_table}->[0] = $self->{'column_names'};
1430 0         0 $self->{'number_columns'} = 3;
1431              
1432              
1433             # set headers flag to false
1434 0 0       0 $self->{'headers'} = 0 unless $self->{0}{'name'} =~ /^#/;
1435            
1436             # set the feature type
1437 0 0       0 unless (defined $self->{'feature'}) {
1438 0         0 $self->{'feature'} = 'region';
1439             }
1440 0         0 return 1;
1441             }
1442              
1443              
1444             ### Internal subroutine to generate metadata for standard files
1445             sub add_standard_metadata {
1446 13     13 1 22 my ($self, $line) = @_;
1447            
1448 13         34 my @namelist = split '\t', $line;
1449 13         23 chomp $namelist[-1];
1450            
1451             # we will define the columns based on
1452 13         32 for my $i (0..$#namelist) {
1453             # make up a name if one doesn't exist
1454 68   66     100 $namelist[$i] ||= "Column_$i";
1455            
1456             # confirm that a file metadata exists for this column
1457 68 50       100 if (exists $self->{$i}) {
1458 0 0       0 unless ($namelist[$i] eq $self->{$i}->{'name'}) {
1459 0         0 warn "metadata and header names for column $i do not match!";
1460             # set the name to match the actual column name
1461 0         0 $self->{$i}->{'name'} = $namelist[$i];
1462             }
1463             }
1464            
1465             # otherwise be nice and generate it here
1466             else {
1467 68         166 $self->{$i} = {
1468             'name' => $namelist[$i],
1469             'index' => $i,
1470             'AUTO' => 3,
1471             };
1472             }
1473             }
1474            
1475             # check the number of columns
1476 13 50       62 if (scalar @namelist != $self->{'number_columns'} ) {
1477             # adjust to match actual content
1478 13         17 $self->{'number_columns'} = scalar @namelist;
1479             }
1480            
1481             # put the column names in the metadata
1482 13         24 $self->{'column_names'} = \@namelist;
1483 13         22 $self->{data_table}->[0] = $self->{'column_names'};
1484            
1485             # set headers flag to true
1486 13         16 $self->{'headers'} = 1;
1487 13         16 return 1;
1488             }
1489              
1490              
1491             ### Internal subroutine to generate hash of standard file format column names
1492             sub standard_column_names {
1493 51     51 1 150 my ($self, $type) = @_;
1494            
1495 51 100 0     202 if ($type eq 'gff') {
    100 0        
    50 0        
    50 0        
    100          
    50          
    100          
    50          
    50          
    0          
    0          
    0          
    0          
1496 5         16 return [qw(Chromosome Source Type Start Stop Score Strand Phase Group)];
1497             }
1498             elsif ($type eq 'bed12') {
1499 27         108 return [qw(Chromosome Start0 End Name Score Strand
1500             thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0)];
1501             }
1502             elsif ($type eq 'bed6') {
1503 0         0 return [qw(Chromosome Start0 End Name Score Strand)];
1504             }
1505             elsif ($type eq 'bdg') {
1506 0         0 return [qw(Chromosome Start0 End Score)];
1507             }
1508             elsif ($type eq 'narrowpeak') {
1509 8         37 return [qw(Chromosome Start0 End Name Score Strand signalValue
1510             pValue qValue peak)];
1511             }
1512             elsif ($type eq 'broadpeak') {
1513 0         0 return [qw(Chromosome Start0 End Name Score Strand signalValue
1514             pValue qValue)];
1515             }
1516             elsif ($type eq 'gappedpeak') {
1517 8         27 return [qw(Chromosome Start0 End Name Score Strand
1518             thickStart0 thickEnd itemRGB blockCount blockSizes blockStarts0
1519             signalValue pValue qValue)];
1520             }
1521             elsif ($type eq 'sgr') {
1522 0         0 return [qw(Chromo Start Score)];
1523             }
1524             elsif ($type eq 'ucsc16') {
1525 3         12 return [qw(bin name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1526             exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1527             }
1528             elsif ($type eq 'ucsc15' or $type eq 'genepredext') {
1529 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1530             exonStarts0 exonEnds score name2 cdsStartSt cdsEndStat exonFrames)];
1531             }
1532             elsif ($type eq 'ucsc12' or $type eq 'knowngene') {
1533 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1534             exonStarts0 exonEnds proteinID alignID)];
1535             }
1536             elsif ($type eq 'ucsc11' or $type eq 'refflat') {
1537 0           return [qw(geneName transcriptName chrom strand txStart0 txEnd cdsStart0
1538             cdsEnd exonCount exonStarts0 exonEnds)];
1539             }
1540             elsif ($type eq 'ucsc10' or $type eq 'genepred') {
1541 0           return [qw(name chrom strand txStart0 txEnd cdsStart0 cdsEnd exonCount
1542             exonStarts exonEnds)];
1543             }
1544             else {
1545 0           confess "unrecognized standard column name format '$type'!";
1546             }
1547             }
1548              
1549              
1550             __END__