File Coverage

blib/lib/Bio/ToolBox/Data/core.pm
Criterion Covered Total %
statement 414 757 54.6
branch 216 502 43.0
condition 120 239 50.2
subroutine 50 59 84.7
pod 48 49 97.9
total 848 1606 52.8


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::core;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::core - Common functions to Bio:ToolBox::Data family
7              
8             =head1 DESCRIPTION
9              
10             Common methods for metadata and manipulation in a L
11             data table and L file stream. This module
12             should not be used directly. See the respective modules for more information.
13              
14             =cut
15              
16 3     3   19 use strict;
  3         4  
  3         73  
17 3     3   11 use Carp qw(carp cluck croak confess);
  3         4  
  3         108  
18 3     3   12 use base 'Bio::ToolBox::Data::file';
  3         5  
  3         1195  
19 3         179 use Bio::ToolBox::db_helper qw(
20             open_db_connection
21             verify_or_request_feature_types
22             use_bam_adapter
23             use_big_adapter
24 3     3   2047 );
  3         14  
25 3     3   16 use Module::Load;
  3         4  
  3         12  
26              
27             1;
28              
29             #### Initialization and verification ###############################################
30              
31             sub new {
32 73     73 1 109 my $class = shift;
33            
34             # in case someone calls this from an established object
35 73 100       176 if (ref($class)) {
36 1         1 $class = ref($class);
37             }
38            
39             # Initialize the hash structure
40 73         840 my %data = (
41             'program' => undef,
42             'feature' => undef,
43             'feature_type' => undef,
44             'db' => undef,
45             'format' => '',
46             'gff' => 0,
47             'bed' => 0,
48             'ucsc' => 0,
49             'vcf' => 0,
50             'number_columns' => 0,
51             'last_row' => 0,
52             'headers' => 1,
53             'column_names' => [],
54             'filename' => undef,
55             'basename' => undef,
56             'extension' => undef,
57             'path' => undef,
58             'comments' => [],
59             'data_table' => [],
60             'header_line_count' => 0,
61             'zerostart' => 0,
62             );
63            
64             # Finished
65 73         217 return bless \%data, $class;
66             }
67              
68              
69             sub verify {
70             # this function does not rely on any self functions for two reasons
71             # this is a low level integrity checker
72             # this is very old code from before the days of an OO API of Bio::ToolBox
73             # although a lot of things have been added and changed since then....
74 52     52 1 692 my $self = shift;
75 52   100     128 my $silence = shift || 0; # default is to yell
76            
77             # check for data table
78 52 50 33     247 unless (
79             defined $self->{'data_table'} and
80             ref $self->{'data_table'} eq 'ARRAY'
81             ) {
82 0 0       0 carp sprintf "\n DATA INTEGRITY ERROR: No data table in %s object!", ref $self
83             unless $silence;
84 0         0 return;
85             }
86            
87             # check for last row index
88 52 50       104 if (defined $self->{'last_row'}) {
89 52         55 my $number = scalar( @{ $self->{'data_table'} } ) - 1;
  52         104  
90 52 50       116 if ($self->{'last_row'} != $number) {
91             # carp sprintf "TABLE INTEGRITY ERROR: data table last_row index [%d] doesn't match " .
92             # "metadata value [%d]!\n", $number, $self->{'last_row'};
93             # fix it for them
94 0         0 $self->{'last_row'} = $number;
95             }
96             }
97             else {
98             # define it for them
99             $self->{'last_row'} =
100 0         0 scalar( @{ $self->{'data_table'} } ) - 1;
  0         0  
101             }
102            
103             # check for consistent number of columns
104 52 50       101 if (defined $self->{'number_columns'}) {
105 52         82 my $number = $self->{'number_columns'};
106 52         78 my @problems;
107 52         74 my $too_low = 0;
108 52         69 my $too_high = 0;
109 52         132 for (my $row = 0; $row <= $self->{'last_row'}; $row++) {
110 945         786 my $count = scalar @{ $self->{'data_table'}->[$row] };
  945         940  
111 945 50       1451 if ($count != $number) {
112 0         0 push @problems, $row;
113 0 0       0 $too_low++ if $count < $number;
114 0 0       0 $too_high++ if $count > $number;
115 0         0 while ($count < $number) {
116             # we can sort-of-fix this problem
117 0         0 $self->{'data_table'}->[$row][$count] = '.';
118 0         0 $count++;
119             }
120             }
121             }
122            
123             # we found errors
124 52 50       109 if (@problems) {
125             # collapse problem list into compact string
126             # from http://www.perlmonks.org/?node_id=87538
127 0         0 my $problem = join(',', @problems);
128 0         0 $problem =~ s/(?
  0         0  
129              
130 0 0       0 if ($too_low) {
131 0 0       0 print "\n COLUMN INCONSISTENCY ERRORS: $too_low rows had fewer than expected " .
132             "columns!\n padded rows $problem with null values\n"
133             unless $silence;
134             }
135 0 0       0 if ($too_high) {
136 0 0       0 print "\n COLUMN INCONSISTENCY ERRORS: $too_high rows had more columns than " .
137             "expected!\n Problem rows: $problem\n"
138             unless $silence;
139 0         0 return;
140             }
141             }
142             }
143             else {
144             # this wasn't set???? then set it
145             $self->{'number_columns'} =
146 0         0 scalar @{ $self->{'data_table'}->[0] };
  0         0  
147             }
148            
149             # check metadata and table names
150 52         74 my $mdcheck = 0;
151 52         125 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
152 415 50       653 unless (
153             $self->{$i}{'name'} eq
154             $self->{'data_table'}->[0][$i]
155             ) {
156             printf( "\n TABLE/METADATA MISMATCH ERROR: Column header names don't" .
157             " match metadata name values for index $i!" .
158             "\n compare '%s' with '%s'\n", $self->{'data_table'}->[0][$i],
159 0 0       0 $self->{$i}{'name'} ) unless $silence;
160 0         0 $mdcheck++;
161             }
162 415 50       765 unless ($self->{$i}{'index'} == $i) {
163             printf( "\n METADATA INDEX ERROR: index $i metadata doesn't match its " .
164 0 0       0 "index value %s\n", $self->{$i}{'index'} ) unless $silence;
165 0         0 $mdcheck++;
166             }
167             }
168 52 50       102 return if $mdcheck;
169            
170             ### Defined file format structure integrity
171 52         60 my $error;
172            
173             # check for proper gff structure
174 52 100       136 if ($self->{'gff'}) {
175             # if any of these checks fail, we will reset the gff version to
176             # the default of 0, or no gff
177 6         9 my $gff_check = 1; # start with assumption it is true
178            
179             # check number of columns
180 6 100       16 if ($self->{'number_columns'} != 9) {
181 1         2 $gff_check = 0;
182 1         2 $error .= " Number of columns not 9.";
183             }
184            
185             # check column indices
186 6 50 33     49 if (
187             # column 0 should look like chromosome
188             exists $self->{0} and
189             $self->{0}{'name'} !~
190             m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i
191             ) {
192 0         0 $gff_check = 0;
193 0         0 $error .= " Column 0 name not chromosome-like.";
194             }
195 6 50 33     46 if (
196             # column 3 should look like start
197             exists $self->{3} and
198             $self->{3}{'name'} !~ m/start|pos|position/i
199             ) {
200 0         0 $gff_check = 0;
201 0         0 $error .= " Column 3 name not start-like.";
202             }
203 6 50 33     46 if (
204             # column 4 should look like end
205             exists $self->{4} and
206             $self->{4}{'name'} !~ m/stop|end|pos|position/i
207             ) {
208 0         0 $gff_check = 0;
209 0         0 $error .= " Column 4 name not stop-like.";
210             }
211 6 50 33     26 if (
212             # column 6 should look like strand
213             exists $self->{6} and
214             $self->{6}{'name'} !~ m/strand/i
215             ) {
216 0         0 $gff_check = 0;
217 0         0 $error .= " Column 6 name not strand-like.";
218             }
219            
220             # check column data
221 6 50       24 unless ($self->_column_is_integers(3,4)) {
222 0         0 $gff_check = 0;
223 0         0 $error .= " Columns 3,4 not integers.";
224             }
225 6 50       22 unless ($self->_column_is_numeric(5)) {
226 0         0 $gff_check = 0;
227 0         0 $error .= " Column 5 not numeric.";
228             }
229 6 50       21 unless ($self->_column_is_stranded(6)) {
230 0         0 $gff_check = 0;
231 0         0 $error .= " Column 6 not strand values.";
232             }
233            
234             # update gff value as necessary
235 6 100       16 if ($gff_check == 0) {
236             # reset metadata
237 1         2 $self->{'gff'} = 0;
238 1         2 $self->{'headers'} = 1;
239            
240             # remove the AUTO key from the metadata
241 1         4 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
242 8 50       11 if (exists $self->{$i}{'AUTO'}) {
243 8         12 delete $self->{$i}{'AUTO'};
244             }
245             }
246 1 50       3 print "\n GFF FILE FORMAT ERROR: $error\n" unless $silence;
247             }
248             }
249            
250             # check for proper BED structure
251 52 100       104 if ($self->{'bed'}) {
252             # if any of these checks fail, we will reset the bed flag to 0
253             # to make it not a bed file format
254 21         35 my $bed_check = 1; # start with assumption it is correct
255            
256             # check number of columns
257 21 50       61 if ($self->{'number_columns'} < 3) {
258 0         0 $bed_check = 0;
259 0         0 $error .= " Number of columns not at least 3.";
260             }
261            
262             # check column index names
263 21 50 33     166 if (
264             exists $self->{0} and
265             $self->{0}{'name'} !~
266             m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i
267             ) {
268 0         0 $bed_check = 0;
269 0         0 $error .= " Column 0 name not chromosome-like.";
270             }
271 21 50 33     156 if (
272             exists $self->{1} and
273             $self->{1}{'name'} !~ m/start|pos|position/i
274             ) {
275 0         0 $bed_check = 0;
276 0         0 $error .= " Column 1 name not start-like.";
277             }
278 21 50 33     121 if (
279             exists $self->{2} and
280             $self->{2}{'name'} !~ m/stop|end|pos|position/i
281             ) {
282 0         0 $bed_check = 0;
283 0         0 $error .= " Column 2 name not stop-like.";
284             }
285 21 50 66     95 if (
286             exists $self->{5} and
287             $self->{5}{'name'} !~ m/strand/i
288             ) {
289 0         0 $bed_check = 0;
290 0         0 $error .= " Column 5 name not strand-like.";
291             }
292 21 50 100     129 if (
      66        
293             exists $self->{6} and
294             $self->{'format'} !~ /narrow|broad/i and
295             $self->{6}{'name'} !~ m/start|thick|cds/i
296             ) {
297 0         0 $bed_check = 0;
298 0         0 $error .= " Column 6 name not thickStart-like.";
299             }
300 21 50 100     129 if (
      66        
301             exists $self->{7} and
302             $self->{'format'} !~ /narrow|broad/i and
303             $self->{7}{'name'} !~ m/end|stop|thick|cds/i
304             ) {
305 0         0 $bed_check = 0;
306 0         0 $error .= " Column 7 name not thickEnd-like.";
307             }
308 21 50 100     104 if (
      66        
309             exists $self->{8} and
310             $self->{'format'} !~ /narrow|broad/i and
311             $self->{8}{'name'} !~ m/item|rgb|color/i
312             ) {
313 0         0 $bed_check = 0;
314 0         0 $error .= " Column 8 name not itemRGB-like.";
315             }
316 21 50 100     129 if (
      66        
317             exists $self->{9} and
318             $self->{'format'} !~ /narrow|broad/i and
319             $self->{9}{'name'} !~ m/count|number|block|exon/i
320             ) {
321 0         0 $bed_check = 0;
322 0         0 $error .= " Column 9 name not blockCount-like.";
323             }
324 21 50 66     75 if (
325             exists $self->{10} and
326             $self->{10}{'name'} !~ m/size|length|block|exon/i
327             ) {
328 0         0 $bed_check = 0;
329 0         0 $error .= " Column 10 name not blockSizes-like.";
330             }
331 21 50 66     72 if (
332             exists $self->{11} and
333             $self->{11}{'name'} !~ m/start|block|exon/i
334             ) {
335 0         0 $bed_check = 0;
336 0         0 $error .= " Column 11 name not blockStarts-like.";
337             }
338            
339             # check column data
340 21 50       66 unless ($self->_column_is_integers(1,2)) {
341 0         0 $bed_check = 0;
342 0         0 $error .= " Columns 1,2 not integers.";
343             }
344 21 100       54 if ($self->{'number_columns'} >= 5) {
345             # only check if it is actually present, since could be optional
346 15 50       43 unless ($self->_column_is_numeric(4) ) {
347 0         0 $bed_check = 0;
348 0         0 $error .= " Column 4 not numeric.";
349             }
350             }
351 21 100       55 if ($self->{'number_columns'} >= 6) {
352             # only check if it is actually present, since could be optional
353 15 50       49 unless ($self->_column_is_stranded(5) ) {
354 0         0 $bed_check = 0;
355 0         0 $error .= " Column 5 not strand values.";
356             }
357             }
358 21 100 100     93 if ($self->{'format'} and
359             $self->{'format'} =~ /narrow|broad/i) {
360 4 50       13 unless ($self->_column_is_numeric(6,7,8) ) {
361 0         0 $bed_check = 0;
362 0         0 $error .= " Columns 6,7,8 not numeric.";
363             }
364             }
365 21 100       50 if ($self->{'number_columns'} == 12) {
366             # bed12 has extra special limitations
367 3 50       7 unless ($self->_column_is_integers(6,7,9) ) {
368 0         0 $bed_check = 0;
369 0         0 $error .= " Column 6,7,9 not integers.";
370             }
371 3 50       8 unless ($self->_column_is_comma_integers(10,11) ) {
372 0         0 $bed_check = 0;
373 0         0 $error .= " Column 10,11 not comma-delimited integers.";
374             }
375             }
376 21 100 66     715 if (
377             $self->{'number_columns'} == 15 and
378             $self->{'format'} =~ /gapped/i
379             ) {
380             # gappedPeak has extra special limitations
381 4 50       13 unless ($self->_column_is_integers(6,7,9) ) {
382 0         0 $bed_check = 0;
383 0         0 $error .= " Column 6,7,9 not integers.";
384             }
385 4 50       18 unless ($self->_column_is_comma_integers(10,11) ) {
386 0         0 $bed_check = 0;
387 0         0 $error .= " Column 10,11 not comma-delimited integers.";
388             }
389 4 50       9 unless ($self->_column_is_numeric(12,13,14) ) {
390 0         0 $bed_check = 0;
391 0         0 $error .= " Columns 12,13,14 not numeric.";
392             }
393             }
394            
395             # peak file format
396 21 50 100     108 if ($self->{'format'} and
      66        
397             $self->{'format'} =~ /narrowpeak/i and
398             $self->{'number_columns'} != 10
399             ) {
400 0         0 $bed_check = 0;
401 0         0 $error .= " NarrowPeak has 10 columns only.";
402             }
403 21 50 66     85 if ($self->{'format'} and
      33        
404             $self->{'format'} =~ /broadpeak/i and
405             $self->{'number_columns'} != 9
406             ) {
407 0         0 $bed_check = 0;
408 0         0 $error .= " BroadPeak has 9 columns only.";
409             }
410 21 50 100     91 if ($self->{'format'} and
      66        
411             $self->{'format'} =~ /gappedpeak/i and
412             $self->{'number_columns'} != 15
413             ) {
414 0         0 $bed_check = 0;
415 0         0 $error .= " GappeddPeak has 15 columns only.";
416             }
417            
418             # reset the BED tag value as appropriate
419 21 50       45 if ($bed_check) {
420 21         46 $self->{'bed'} = $self->{'number_columns'}; # in case we had a fake true
421             }
422             else {
423             # reset metadata
424 0         0 $self->{'bed'} = 0;
425 0         0 $self->{'headers'} = 1;
426 0         0 my $ext = $self->{'extension'};
427 0         0 $self->{'filename'} =~ s/$ext/.txt/;
428 0         0 $self->{'extension'} = '.txt';
429 0         0 $self->{'format'} = '';
430            
431             # remove the AUTO key from the metadata
432 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
433 0 0       0 if (exists $self->{$i}{'AUTO'}) {
434 0         0 delete $self->{$i}{'AUTO'};
435             }
436             }
437 0 0       0 print "\n BED FILE FORMAT ERROR: $error\n" unless $silence;
438             }
439             }
440            
441             # check refFlat or genePred gene structure
442 52 100       94 if ($self->{'ucsc'}) {
443             # if any of these checks fail, we will reset the extension
444 6         6 my $ucsc_check = 1; # start with assumption it is correct
445 6         9 my $ucsc_type;
446            
447             # check number of columns
448 6         12 my $colnumber = $self->{number_columns};
449 6 50 0     11 if ($colnumber == 16) {
    0          
    0          
    0          
450 6         9 $ucsc_type = 'genePredExtBin';
451             # genePredExt with bin
452             # bin name chrom strand txStart txEnd cdsStart cdsEnd
453             # exonCount exonStarts exonEnds score name2 cdsStartSt
454             # cdsEndStat exonFrames
455 6 50       27 unless($self->{2}{name} =~
456             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
457 0         0 $ucsc_check = 0;
458 0         0 $error .= " Column 2 name not chromosome-like.";
459             }
460 6 100       26 unless($self->{4}{name} =~ /start|position/i) {
461 3         5 $ucsc_check = 0;
462 3         8 $error .= " Column 4 name not start-like.";
463             }
464 6 100       22 unless($self->{5}{name} =~ /stop|end|position/i) {
465 3         6 $ucsc_check = 0;
466 3         5 $error .= " Column 5 name not stop-like.";
467             }
468 6 100       21 unless($self->{6}{name} =~ /start|position/i) {
469 3         4 $ucsc_check = 0;
470 3         4 $error .= " Column 6 name not start-like.";
471             }
472 6 100       19 unless($self->{7}{name} =~ /stop|end|position/i) {
473 3         5 $ucsc_check = 0;
474 3         5 $error .= " Column 7 name not stop-like.";
475             }
476 6 50       17 unless($self->_column_is_integers(4,5,6,7,8)) {
477 0         0 $ucsc_check = 0;
478 0         0 $error .= " Columns 4,5,6,7,8 not integers.";
479             }
480 6 50       26 unless ($self->_column_is_comma_integers(9,10)) {
481 0         0 $ucsc_check = 0;
482 0         0 $error .= " Columns 9,10 not comma-delimited integers.";
483             }
484 6 50       14 unless($self->_column_is_stranded(3)) {
485 0         0 $ucsc_check = 0;
486 0         0 $error .= " Column 3 not strand values.";
487             }
488             }
489             elsif ($colnumber == 15 or $colnumber == 12) {
490 0 0       0 $ucsc_type = $colnumber == 15 ? 'genePredExt' : 'knownGene';
491             # GenePredExt
492             # name chrom strand txStart txEnd cdsStart cdsEnd
493             # exonCount exonStarts exonEnds score name2 cdsStartSt
494             # cdsEndStat exonFrames
495             # or knownGene
496             # name chrom strand txStart txEnd cdsStart cdsEnd
497             # exonCount exonStarts exonEnds proteinID alignID
498 0 0       0 unless($self->{1}{name} =~
499             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
500 0         0 $ucsc_check = 0;
501 0         0 $error .= " Column 1 name not chromosome-like.";
502             }
503 0 0       0 unless($self->{3}{name} =~ /start|position/i) {
504 0         0 $ucsc_check = 0;
505 0         0 $error .= " Column 3 name not start-like.";
506             }
507 0 0       0 unless($self->{4}{name} =~ /stop|end|position/i) {
508 0         0 $ucsc_check = 0;
509 0         0 $error .= " Column 4 name not stop-like.";
510             }
511 0 0       0 unless($self->{5}{name} =~ /start|position/i) {
512 0         0 $ucsc_check = 0;
513 0         0 $error .= " Column 5 name not start-like.";
514             }
515 0 0       0 unless($self->{6}{name} =~ /stop|end|position/i) {
516 0         0 $ucsc_check = 0;
517 0         0 $error .= " Column 6 name not stop-like.";
518             }
519 0 0       0 unless($self->_column_is_integers(3,4,5,6,7)) {
520 0         0 $ucsc_check = 0;
521 0         0 $error .= " Columns 3,4,5,6,7 not integers.";
522             }
523 0 0       0 unless ($self->_column_is_comma_integers(8,9)) {
524 0         0 $ucsc_check = 0;
525 0         0 $error .= " Columns 8,9 not comma-delimited integers.";
526             }
527 0 0       0 unless($self->_column_is_stranded(2)) {
528 0         0 $ucsc_check = 0;
529 0         0 $error .= " Column 2 not strand values.";
530             }
531             }
532             elsif ($colnumber == 11) {
533 0         0 $ucsc_type = 'refFlat';
534             # geneName transcriptName chrom strand txStart txEnd
535             # cdsStart cdsEnd exonCount exonStarts exonEnds
536 0 0       0 unless($self->{2}{name} =~
537             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
538 0         0 $ucsc_check = 0;
539 0         0 $error .= " Column 2 name not chromosome-like.";
540             }
541 0 0       0 unless($self->{4}{name} =~ /start|position/i) {
542 0         0 $ucsc_check = 0;
543 0         0 $error .= " Column 4 name not start-like.";
544             }
545 0 0       0 unless($self->{5}{name} =~ /stop|end|position/i) {
546 0         0 $ucsc_check = 0;
547 0         0 $error .= " Column 5 name not stop-like.";
548             }
549 0 0       0 unless($self->{6}{name} =~ /start|position/i) {
550 0         0 $ucsc_check = 0;
551 0         0 $error .= " Column 6 name not start-like.";
552             }
553 0 0       0 unless($self->{7}{name} =~ /stop|end|position/i) {
554 0         0 $ucsc_check = 0;
555 0         0 $error .= " Column 7 name not stop-like.";
556             }
557 0 0       0 unless($self->_column_is_integers(4,5,6,7,8)) {
558 0         0 $ucsc_check = 0;
559 0         0 $error .= " Columns 4,5,6,7,8 not integers.";
560             }
561 0 0       0 unless ($self->_column_is_comma_integers(9,10)) {
562 0         0 $ucsc_check = 0;
563 0         0 $error .= " Columns 9,10 not comma-delimited integers.";
564             }
565 0 0       0 unless($self->_column_is_stranded(3)) {
566 0         0 $ucsc_check = 0;
567 0         0 $error .= " Column 3 not strand values.";
568             }
569             }
570             elsif ($colnumber == 10) {
571 0         0 $ucsc_type = 'genePred';
572             # name chrom strand txStart txEnd cdsStart cdsEnd
573             # exonCount exonStarts exonEnds
574 0 0       0 unless($self->{1}{name} =~
575             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
576 0         0 $ucsc_check = 0;
577 0         0 $error .= " Column 1 name not chromosome-like.";
578             }
579 0 0       0 unless($self->{3}{name} =~ /start|position/i) {
580 0         0 $ucsc_check = 0;
581 0         0 $error .= " Column 3 name not start-like.";
582             }
583 0 0       0 unless($self->{4}{name} =~ /stop|end|position/i) {
584 0         0 $ucsc_check = 0;
585 0         0 $error .= " Column 4 name not stop-like.";
586             }
587 0 0       0 unless($self->{5}{name} =~ /start|position/i) {
588 0         0 $ucsc_check = 0;
589 0         0 $error .= " Column 5 name not start-like.";
590             }
591 0 0       0 unless($self->{6}{name} =~ /stop|end|position/i) {
592 0         0 $ucsc_check = 0;
593 0         0 $error .= " Column 6 name not stop-like.";
594             }
595 0 0       0 unless($self->_column_is_integers(3,4,5,6,7)) {
596 0         0 $ucsc_check = 0;
597 0         0 $error .= " Columns 3,4,5,6,7 not integers.";
598             }
599 0 0       0 unless ($self->_column_is_comma_integers(8,9)) {
600 0         0 $ucsc_check = 0;
601 0         0 $error .= " Columns 8,9 not comma-delimited integers.";
602             }
603 0 0       0 unless($self->_column_is_stranded(2)) {
604 0         0 $ucsc_check = 0;
605 0         0 $error .= " Column 2 not strand values.";
606             }
607             }
608             else {
609 0         0 $ucsc_type = 'UCSC';
610 0         0 $ucsc_check = 0;
611 0         0 $error .= " Wrong # of columns.";
612             }
613              
614 6 100       16 if ($ucsc_check == 0) {
615             # failed the check
616 3         7 my $ext = $self->{'extension'};
617 3         27 $self->{'filename'} =~ s/$ext/.txt/;
618 3         8 $self->{'extension'} = '.txt';
619 3         5 $self->{'ucsc'} = 0;
620 3         8 $self->{'format'} = '';
621            
622             # remove the AUTO key
623 3         9 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
624 48 50       66 if (exists $self->{$i}{'AUTO'}) {
625 48         65 delete $self->{$i}{'AUTO'};
626             }
627             }
628 3 50       8 print "\n $ucsc_type FILE FORMAT ERROR: $error\n" unless $silence;
629             }
630             }
631            
632             # check VCF format
633 52 50       100 if ($self->{vcf}) {
634             # if any of these checks fail, we will reset the vcf flag to 0
635             # to make it not a vcf file format
636 0         0 my $vcf_check = 1; # start with assumption it is correct
637            
638             # check number of columns
639 0 0       0 if ($self->{'number_columns'} < 8) {
640 0         0 $vcf_check = 0;
641 0         0 $error .= " Number of Columns is too few.";
642             }
643            
644             # check column index names
645 0 0       0 if ($self->{0}{'name'} !~ m/chrom/i) {
646 0         0 $vcf_check = 0;
647 0         0 $error .= " Column 0 name not chromosome.";
648             }
649 0 0 0     0 if (
650             exists $self->{1} and
651             $self->{1}{'name'} !~ m/^pos|start/i
652             ) {
653 0         0 $vcf_check = 0;
654 0         0 $error .= " Column 1 name not position.";
655             }
656            
657             # check column data
658 0 0       0 unless ($self->_column_is_integers(1)) {
659 0         0 $vcf_check = 0;
660 0         0 $error .= " Columns 1 not integers.";
661             }
662            
663             # reset the vcf tag value as appropriate
664 0 0       0 if ($vcf_check) {
665             # in case we had a fake true set it to a more reasonable value?
666 0 0       0 $self->{'vcf'} = 4 if $self->{'vcf'} == 1;
667             }
668             else {
669             # reset metadata
670 0         0 $self->{'vcf'} = 0;
671 0         0 $self->{'headers'} = 1;
672 0         0 $self->{'format'} = '';
673            
674             # remove the AUTO key from the metadata
675 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
676 0 0       0 if (exists $self->{$i}{'AUTO'}) {
677 0         0 delete $self->{$i}{'AUTO'};
678             }
679             }
680 0 0       0 print "\n VCF FILE FORMAT ERROR: $error\n" unless $silence;
681             }
682             }
683            
684             # check proper SGR file structure
685 52 50 66     295 if (exists $self->{'extension'} and
      66        
686             defined $self->{'extension'} and
687             $self->{'extension'} =~ /sgr/i
688             ) {
689             # there is no sgr field in the data structure
690             # so we're just checking for the extension
691             # we will change the extension as necessary if it doesn't conform
692 0         0 my $sgr_check = 1;
693 0 0       0 if ($self->{'number_columns'} != 3) {
694 0         0 $sgr_check = 0;
695 0         0 $error .= " Column number is not 3.";
696             }
697 0 0       0 if ($self->{0}{'name'} !~ m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
698 0         0 $sgr_check = 0;
699 0         0 $error .= " Column 0 name not chromosome-like.";
700             }
701 0 0       0 if ($self->{1}{'name'} !~ /start|position/i) {
702 0         0 $sgr_check = 0;
703 0         0 $error .= " Column 1 name not start-like.";
704             }
705 0 0       0 unless ($self->_column_is_integers(1)) {
706 0         0 $sgr_check = 0;
707 0         0 $error .= " Columns 1 not integers.";
708             }
709 0 0       0 if ($sgr_check == 0) {
710             # doesn't smell like a SGR file
711             # change the extension so the write subroutine won't think it is
712             # make it a text file
713 0         0 $self->{'extension'} =~ s/sgr/txt/i;
714 0         0 $self->{'filename'} =~ s/sgr/txt/i;
715 0         0 $self->{'headers'} = 1;
716 0         0 $self->{'format'} = '';
717            
718             # remove the AUTO key from the metadata
719 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
720 0 0       0 if (exists $self->{$i}{'AUTO'}) {
721 0         0 delete $self->{$i}{'AUTO'};
722             }
723             }
724 0 0       0 print "\n SGR FILE FORMAT ERROR: $error\n" unless $silence;
725             }
726             }
727            
728             # check file headers value because this may have changed
729             # this can happen when we reset bed/gff/vcf flags when we add columns
730 52 100 100     442 if (
    100 100        
      66        
      66        
      33        
      33        
      66        
      33        
731             $self->{'bed'} or
732             $self->{'gff'} or
733             $self->{'ucsc'} or
734             ($self->{'extension'} and $self->{'extension'} =~ /sgr/i)
735             ) {
736 29         62 $self->{'headers'} = 0;
737             }
738             elsif (
739             $self->{'bed'} == 0 and
740             $self->{'gff'} == 0 and
741             $self->{'ucsc'} == 0 and
742             ($self->{'extension'} and $self->{'extension'} !~ /sgr/i)
743             ) {
744 18 50       47 $self->{'headers'} = 1 unless $self->{'headers'} == -1;
745             }
746            
747             # if we have made it here, then there were no major structural problems
748             # file is verified, any minor issues should have been fixed
749 52         120 return 1;
750             }
751              
752             # internal method to check if a column is nothing but integers, i.e. start, stop
753             sub _column_is_integers {
754 40     40   51 my $self = shift;
755 40         82 my @index = @_;
756 40 100       98 return 1 if ($self->{last_row} == 0); # can't check if table is empty
757 38         68 foreach (@index) {
758 101 50       201 return 0 unless exists $self->{$_};
759             }
760 38         80 for my $row (1 .. $self->{last_row}) {
761 709         723 for my $i (@index) {
762 1648 50       2801 return 0 unless ($self->{data_table}->[$row][$i] =~ /^\d+$/);
763             }
764             }
765 38         92 return 1;
766             }
767              
768             # internal method to check if a column appears numeric, i.e. scores
769             sub _column_is_numeric {
770 29     29   39 my $self = shift;
771 29         44 my @index = @_;
772 29 50       68 return 1 if ($self->{last_row} == 0); # can't check if table is empty
773 29         54 foreach (@index) {
774 45 50       91 return 0 unless exists $self->{$_};
775             }
776 29         52 for my $row (1 .. $self->{last_row}) {
777 327         348 for my $i (@index) {
778             # we have a very loose definition of numeric: exponents, signs, commas
779 407 50       748 return 0 unless ($self->{data_table}->[$row][$i] =~ /^[\d\-\+\.,eE]+$/);
780             }
781             }
782 29         63 return 1;
783             }
784              
785              
786              
787             # internal method to check if a column is nothing but comma delimited integers
788             sub _column_is_comma_integers {
789 13     13   17 my $self = shift;
790 13         22 my @index = @_;
791 13 50       34 return 1 if ($self->{last_row} == 0); # can't check if table is empty
792 13         23 foreach (@index) {
793 26 50       53 return 0 unless exists $self->{$_};
794             }
795 13         24 for my $row (1 .. $self->{last_row}) {
796 110         121 for my $i (@index) {
797 220 50       413 return 0 unless ($self->{data_table}->[$row][$i] =~ /^[\d,]+$/);
798             }
799             }
800 13         37 return 1;
801             }
802              
803             # internal method to check if a column looks like a strand column
804             sub _column_is_stranded {
805 27     27   53 my ($self, $index) = @_;
806 27 50       65 return unless exists $self->{$index};
807 27         63 for my $row (1 .. $self->{last_row}) {
808 347 50       690 return 0 if ($self->{data_table}->[$row][$index] !~ /^(?:\-1|0|1|\+|\-|\.)$/);
809             }
810 27         62 return 1;
811             }
812              
813              
814              
815              
816              
817             #### Database methods ##############################################################
818              
819             sub open_database {
820 1     1 1 2 my $self = shift;
821 1 50 0     3 if (not defined $_[0]) {
    0          
    0          
822 1         4 return $self->open_meta_database;
823             }
824             elsif ($_[0] eq '0' or $_[0] eq '1') {
825             # likely a boolean value to indicate force
826 0         0 return $self->open_meta_database($_[0]);
827             }
828             elsif ($_[0] =~ /[a-zA-Z]+/) {
829             # likely the name of a database
830 0         0 return $self->open_new_database(@_);
831             }
832             else {
833             # original default
834 0         0 return $self->open_meta_database(@_);
835             }
836             }
837              
838             sub open_meta_database {
839 15     15 1 18 my $self = shift;
840 15   50     37 my $force = shift || 0;
841 15 50       25 return unless $self->{db};
842 15 50       31 return if $self->{db} =~ /^Parsed:/; # we don't open parsed annotation files
843 15 100       23 if (exists $self->{db_connection}) {
844 14 50       50 return $self->{db_connection} unless $force;
845             }
846 1         5 my $db = open_db_connection($self->{db}, $force);
847 1 50       2 return unless $db;
848 1         2 $self->{db_connection} = $db;
849 1         2 return $db;
850             }
851              
852             sub open_new_database {
853 0     0 1 0 my $self = shift;
854 0         0 my $database = shift;
855 0   0     0 my $force = shift || 0;
856 0         0 return open_db_connection($database, $force);
857             }
858              
859             sub verify_dataset {
860 12     12 1 21 my ($self, $dataset, $database) = @_;
861 12 50       23 return unless $dataset;
862 12 100       48 if (exists $self->{verfied_dataset}{$dataset}) {
863 11         32 return $self->{verfied_dataset}{$dataset};
864             }
865             else {
866 1 50       5 if ($dataset =~ /^(?:file|http|ftp)/) {
867             # local or remote file already verified?
868 0         0 $self->{verfied_dataset}{$dataset} = $dataset;
869 0         0 return $dataset;
870             }
871 1   33     2 $database ||= $self->open_meta_database;
872 1         5 my ($verified) = verify_or_request_feature_types(
873             # normally returns an array of verified features, we're only checking one
874             db => $database,
875             feature => $dataset,
876             );
877 1 50       3 if ($verified) {
878 1         2 $self->{verfied_dataset}{$dataset} = $verified;
879 1         3 return $verified;
880             }
881             }
882 0         0 return;
883             }
884              
885              
886              
887             #### Column Manipulation ####
888              
889             sub delete_column {
890 2     2 1 4 my $self = shift;
891            
892             # check for Stream
893 2 50       5 if (ref $self eq 'Bio::ToolBox::Data::Stream') {
894 0 0       0 unless ($self->mode) {
895 0         0 cluck "We have a read-only Stream object, cannot add columns";
896 0         0 return;
897             }
898 0 0       0 if (defined $self->{fh}) {
899             # Stream file handle is opened
900 0         0 cluck "Cannot modify columns when a Stream file handle is opened!";
901 0         0 return;
902             }
903             }
904 2 50       4 unless (@_) {
905 0         0 cluck "must provide a list";
906 0         0 return;
907             }
908            
909 2         5 my @deletion_list = sort {$a <=> $b} @_;
  0         0  
910 2         3 my @retain_list;
911 2         5 for (my $i = 0; $i < $self->number_columns; $i++) {
912             # compare each current index with the first one in the list of
913             # deleted indices. if it matches, delete. if not, keep
914 18 100       20 if ( $i == $deletion_list[0] ) {
915             # this particular index should be deleted
916 2         4 shift @deletion_list;
917             }
918             else {
919             # this particular index should be kept
920 16         21 push @retain_list, $i;
921             }
922             }
923 2         7 return $self->reorder_column(@retain_list);
924             }
925              
926             sub reorder_column {
927 3     3 1 5 my $self = shift;
928            
929             # check for Stream
930 3 50       6 if (ref $self eq 'Bio::ToolBox::Data::Stream') {
931 0 0       0 unless ($self->mode) {
932 0         0 cluck "We have a read-only Stream object, cannot add columns";
933 0         0 return;
934             }
935 0 0       0 if (defined $self->{fh}) {
936             # Stream file handle is opened
937 0         0 cluck "Cannot modify columns when a Stream file handle is opened!";
938 0         0 return;
939             }
940             }
941            
942             # reorder data table
943 3 50       5 unless (@_) {
944 0         0 carp "must provide a list";
945 0         0 return;
946             }
947 3         6 my @order = @_;
948 3         7 for (my $row = 0; $row <= $self->last_row; $row++) {
949 237         303 my @old = $self->row_values($row);
950 237         252 my @new = map { $old[$_] } @order;
  1580         1660  
951 237         215 splice( @{ $self->{data_table} }, $row, 1, \@new);
  237         527  
952             }
953            
954             # reorder metadata
955 3         5 my %old_metadata;
956 3         6 for (my $i = 0; $i < $self->number_columns; $i++) {
957             # copy the metadata info hash into a temporary hash
958 28         38 $old_metadata{$i} = $self->{$i};
959 28         37 delete $self->{$i}; # delete original
960             }
961 3         6 for (my $i = 0; $i < scalar(@order); $i++) {
962             # now copy back from the old_metadata into the main data hash
963             # using the new index number in the @order array
964             # must regenerate the hash, not just link to the old anonymous hash, in
965             # case we're duplicating columns
966 20         25 $self->{$i} = {};
967 20         18 foreach my $k (keys %{ $old_metadata{$order[$i]} }) {
  20         32  
968 50         73 $self->{$i}{$k} = $old_metadata{$order[$i]}{$k};
969             }
970             # assign new index number
971 20         38 $self->{$i}{'index'} = $i;
972             }
973 3         3 $self->{'number_columns'} = scalar @order;
974 3 100       9 delete $self->{column_indices} if exists $self->{column_indices};
975 3 50 66     6 if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
      66        
      33        
976             # check if we maintain integrity, at least insofar what we test
977 1         2 $self->verify(1); # silence so user doesn't get these messages
978             }
979 3         18 return 1;
980             }
981              
982              
983              
984             #### General Metadata ####
985              
986             sub feature {
987 54     54 1 404 my $self = shift;
988 54 100       128 if (@_) {
989 20         36 $self->{feature} = shift;
990             }
991 54         135 return $self->{feature};
992             }
993              
994             sub feature_type {
995 37     37 1 49 my $self = shift;
996 37 50       64 carp "feature_type is a read only method" if @_;
997 37 100       69 if (defined $self->{feature_type}) {
998 24         70 return $self->{feature_type};
999             }
1000 13         47 my $feature_type;
1001 13 100 66     40 if (defined $self->chromo_column and defined $self->start_column) {
    50 0        
      33        
      0        
      0        
1002 8         12 $feature_type = 'coordinate';
1003             }
1004             elsif (defined $self->id_column or
1005             ( defined $self->type_column and defined $self->name_column ) or
1006             ( defined $self->feature and defined $self->name_column )
1007             ) {
1008 5         8 $feature_type = 'named';
1009             }
1010             else {
1011 0         0 $feature_type = 'unknown';
1012             }
1013 13         26 $self->{feature_type} = $feature_type;
1014 13         56 return $feature_type;
1015             }
1016              
1017             sub program {
1018 68     68 1 90 my $self = shift;
1019 68 100       133 if (@_) {
1020 60         88 $self->{program} = shift;
1021             }
1022 68         166 return $self->{program};
1023             }
1024              
1025             sub database {
1026 46     46 1 396 my $self = shift;
1027 46 100       116 if (@_) {
1028 20         41 $self->{db} = shift;
1029 20 0 33     43 if (exists $self->{db_connection} and $self->{db_connection}) {
1030 0         0 $self->open_meta_database(1);
1031             }
1032             }
1033 46         194 return $self->{db};
1034             }
1035              
1036             sub bam_adapter {
1037 0     0 1 0 my $self = shift;
1038 0         0 return use_bam_adapter(@_);
1039             }
1040              
1041             sub big_adapter {
1042 0     0 1 0 my $self = shift;
1043 0         0 return use_big_adapter(@_);
1044             }
1045              
1046             sub format {
1047 222     222 1 255 my $self = shift;
1048 222 100       406 if (defined $_[0]) {
1049 72         117 $self->{format} = $_[0];
1050             }
1051 222         1014 return $self->{format};
1052             }
1053              
1054             sub gff {
1055 102     102 1 1672 my $self = shift;
1056 102 100 66     270 if (defined $_[0] and $_[0] =~ /^(?:0|1|2|2\.[2|5]|3)$/) {
1057 5         12 $self->{gff} = $_[0];
1058             }
1059 102         383 return $self->{gff};
1060             }
1061              
1062             sub bed {
1063 137     137 1 508 my $self = shift;
1064 137 100 66     449 if (defined $_[0] and $_[0] =~ /^\d+$/) {
1065 43         86 $self->{bed} = $_[0];
1066             }
1067 137         363 return $self->{bed};
1068             }
1069              
1070             sub ucsc {
1071 62     62 1 75 my $self = shift;
1072 62 100 66     166 if (defined $_[0] and $_[0] =~ /^\d+$/) {
1073 3         7 $self->{ucsc} = $_[0];
1074             }
1075 62         212 return $self->{ucsc};
1076             }
1077              
1078             sub vcf {
1079 44     44 1 60 my $self = shift;
1080 44 50 33     78 if (defined $_[0] and $_[0] =~ /^[\d\.]+$/) {
1081 0         0 $self->{vcf} = $_[0];
1082             }
1083 44         127 return $self->{vcf};
1084             }
1085              
1086             sub number_columns {
1087 553     553 1 7066 my $self = shift;
1088 553 50       794 carp "number_columns is a read only method" if @_;
1089 553         1210 return $self->{number_columns};
1090             }
1091              
1092             sub number_rows {
1093 3     3 1 7 my $self = shift;
1094 3 50       10 carp "number_rows is a read only method" if @_;
1095 3         10 return $self->{last_row};
1096             }
1097              
1098             sub last_column {
1099 0     0 1 0 my $self = shift;
1100 0 0       0 carp "last_column is a read only method" if @_;
1101 0         0 return $self->{number_columns} - 1;
1102             }
1103              
1104             sub last_row {
1105 892     892 1 2803 my $self = shift;
1106 892 50       1221 carp "last_row is a read only method" if @_;
1107 892         1680 return $self->{last_row};
1108             }
1109              
1110             sub filename {
1111 138     138 1 181 my $self = shift;
1112 138 50       250 carp "filename is a read only method. Use add_file_metadata()." if @_;
1113 138         382 return $self->{filename};
1114             }
1115              
1116             sub basename {
1117 29     29 1 2782 my $self = shift;
1118 29 50       74 carp "basename is a read only method. Use add_file_metadata()." if @_;
1119 29         117 return $self->{basename};
1120             }
1121              
1122             sub path {
1123 0     0 1 0 my $self = shift;
1124 0 0       0 carp "path is a read only method. Use add_file_metadata()." if @_;
1125 0         0 return $self->{path};
1126             }
1127              
1128             sub extension {
1129 129     129 1 174 my $self = shift;
1130 129 50       203 carp "extension() is a read only method. Use add_file_metadata()." if @_;
1131 129         471 return $self->{extension};
1132             }
1133              
1134              
1135              
1136             #### General Comments ####
1137              
1138             sub comments {
1139 26     26 1 40 my $self = shift;
1140 26         36 my @comments = @{ $self->{comments} };
  26         64  
1141 26         56 foreach (@comments) {s/[\r\n]+//g}
  25         92  
1142             # comments are not chomped when loading
1143             # side effect of dealing with rare commented header lines with null values at end
1144 26         68 return @comments;
1145             }
1146              
1147             sub add_comment {
1148 53     53 1 516 my $self = shift;
1149 53 50       118 my $comment = shift or return;
1150             # comment is not required to be prefixed with "# ", it will be added when saving
1151 53         78 push @{ $self->{comments} }, $comment;
  53         110  
1152 53         93 return 1;
1153             }
1154              
1155             sub delete_comment {
1156 1     1 1 1153 my $self = shift;
1157 1         1 my $index = shift;
1158 1 50       3 if (defined $index) {
1159 1         1 eval {splice @{$self->{comments}}, $index, 1};
  1         2  
  1         3  
1160             }
1161             else {
1162 0         0 $self->{comments} = [];
1163             }
1164             }
1165              
1166             sub vcf_headers {
1167 0     0 1 0 my $self = shift;
1168 0 0       0 return unless $self->vcf;
1169 0 0       0 return $self->{vcf_headers} if exists $self->{vcf_headers};
1170 0         0 my $headers = {};
1171 0         0 foreach my $comment ($self->comments) {
1172 0         0 my ($key, $value);
1173 0 0       0 if ($comment =~ /^##([\w\-\.]+)=(.+)$/) {
1174 0         0 $key = $1;
1175 0         0 $value = $2;
1176             }
1177             else {
1178             # invalid vcf header format!?
1179 0         0 next;
1180             }
1181 0 0       0 if ($value !~ /^<.+>$/) {
1182             # simple value
1183 0         0 $headers->{$key} = $value;
1184             }
1185             else {
1186             # process complex values
1187             # extract ID with regex which should have
1188 0         0 my $id = ($value =~ /ID=([\w\-\.:]+)/)[0];
1189 0         0 $headers->{$key}{$id} = $value;
1190             }
1191             }
1192            
1193             # store and return
1194 0         0 $self->{vcf_headers} = $headers;
1195 0         0 return $headers;
1196             }
1197              
1198             sub rewrite_vcf_headers {
1199 0     0 1 0 my $self = shift;
1200 0 0       0 return unless $self->vcf;
1201 0 0       0 return unless exists $self->{vcf_headers};
1202 0         0 my @newComments;
1203            
1204             # file format always comes first
1205             push @newComments, sprintf("##fileformat=%s\n",
1206 0         0 $self->{vcf_headers}{fileformat});
1207            
1208             # common attributes
1209 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{vcf_headers} } ) {
  0         0  
  0         0  
1210 0 0       0 next if $key eq 'fileformat';
1211 0 0       0 if (ref $self->{vcf_headers}{$key} eq 'HASH') {
1212             # we have a complex VCF header with multiple keys
1213             # we will rewrite for each ID
1214 0         0 foreach my $id (sort {$a cmp $b}
  0         0  
1215 0         0 keys %{ $self->{vcf_headers}{$key} }
1216             ) {
1217             # to avoid complexity of writing correct formatting
1218             push @newComments, sprintf("##%s=%s\n", $key,
1219 0         0 $self->{vcf_headers}{$key}{$id} );
1220             }
1221             }
1222             else {
1223             # a simple value
1224             push @newComments, sprintf("##%s=%s\n", $key,
1225 0         0 $self->{vcf_headers}{$key} );
1226             }
1227             }
1228            
1229             # replace the headers
1230 0         0 $self->{comments} = \@newComments;
1231             }
1232              
1233              
1234              
1235             #### Column Metadata ####
1236              
1237             sub list_columns {
1238 4     4 1 17 my $self = shift;
1239 4 50       10 carp "list_columns is a read only method" if @_;
1240 4         5 my @list;
1241 4         9 for (my $i = 0; $i < $self->number_columns; $i++) {
1242 26         48 push @list, $self->{$i}{'name'};
1243             }
1244 4 100       15 return wantarray ? @list : \@list;
1245             }
1246              
1247             sub name {
1248 24     24 1 1845 my $self = shift;
1249 24         44 my ($index, $new_name) = @_;
1250 24 50       49 return unless defined $index;
1251 24 50       68 return unless exists $self->{$index}{name};
1252 24 100       46 if (defined $new_name) {
1253 1         2 $self->{$index}{name} = $new_name;
1254 1 50       3 if (exists $self->{data_table}) {
    0          
1255 1         3 $self->{data_table}->[0][$index] = $new_name;
1256             }
1257             elsif (exists $self->{column_names}) {
1258 0         0 $self->{column_names}->[$index] = $new_name;
1259             }
1260             }
1261 24         138 return $self->{$index}{name};
1262             }
1263              
1264             sub metadata {
1265 26     26 1 26 my $self = shift;
1266 26         35 my ($index, $key, $value) = @_;
1267 26 50       36 return unless defined $index;
1268 26 50       44 return unless exists $self->{$index};
1269 26 100 100     51 if ($key and $key eq 'name') {
1270 2         4 return $self->name($index, $value);
1271             }
1272 24 100 66     54 if ($key and defined $value) {
    50 33        
1273             # we are setting a new value
1274 3         5 $self->{$index}{$key} = $value;
1275 3         6 return $value;
1276             }
1277             elsif ($key and not defined $value) {
1278 0 0       0 if (exists $self->{$index}{$key}) {
1279             # retrieve a value
1280 0         0 return $self->{$index}{$key};
1281             }
1282             else {
1283             # key does not exist
1284 0         0 return;
1285             }
1286             }
1287             else {
1288 21         19 my %hash = %{ $self->{$index} };
  21         64  
1289 21 100       83 return wantarray ? %hash : \%hash;
1290             }
1291             }
1292              
1293             sub delete_metadata {
1294 0     0 1 0 my $self = shift;
1295 0         0 my ($index, $key) = @_;
1296 0 0       0 return unless defined $index;
1297 0 0       0 if (defined $key) {
1298 0 0       0 if (exists $self->{$index}{$key}) {
1299 0         0 return delete $self->{$index}{$key};
1300             }
1301             }
1302             else {
1303             # user wants to delete the metadata
1304             # but we need to keep the basics name and index
1305 0         0 foreach my $key (keys %{ $self->{$index} }) {
  0         0  
1306 0 0       0 next if $key eq 'name';
1307 0 0       0 next if $key eq 'index';
1308 0         0 delete $self->{$index}{$key};
1309             }
1310             }
1311             }
1312              
1313             sub copy_metadata {
1314 2     2 1 4 my ($self, $source, $target) = @_;
1315 2 50 33     9 return unless (exists $self->{$source}{name} and exists $self->{$target}{name});
1316 2         4 my $md = $self->metadata($source);
1317 2         4 delete $md->{name};
1318 2         3 delete $md->{'index'};
1319 2 50       20 delete $md->{'AUTO'} if exists $md->{'AUTO'}; # presume this is no longer auto index
1320 2         6 foreach (keys %$md) {
1321 1         2 $self->{$target}{$_} = $md->{$_};
1322             }
1323 2         5 return 1;
1324             }
1325              
1326              
1327              
1328             #### Column Indices ####
1329              
1330             sub find_column {
1331 220     220 1 340 my ($self, $name) = @_;
1332 220 50       305 return unless $name;
1333            
1334             # the $name variable will be used as a regex in identifying the name
1335             # fix it so that it will possible accept a # character at the beginning
1336             # without a following space, in case the first column has a # prefix
1337             # also place the remainder of the text in a non-capturing parentheses for
1338             # grouping purposes while maintaining the anchors
1339 220         1050 $name =~ s/ \A (\^?) (.+) (\$?)\Z /$1#?(?:$2)$3/x;
1340            
1341             # walk through each column index
1342 220         276 my $index;
1343 220         369 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
1344             # check the names of each column
1345 736 100       5613 if ($self->{$i}{'name'} =~ /$name/i) {
1346 88         129 $index = $i;
1347 88         96 last;
1348             }
1349             }
1350 220         443 return $index;
1351             }
1352              
1353             sub _find_column_indices {
1354 24     24   34 my $self = shift;
1355             # these are hard coded index name regex to accomodate different possibilities
1356             # these do not include parentheses for grouping
1357             # non-capturing parentheses will be added later in the sub for proper
1358             # anchoring and grouping - long story why, don't ask
1359 24         69 my $name = $self->find_column('^name|gene.?name|transcript.?name|geneid|id|gene|alias');
1360 24         53 my $type = $self->find_column('^type|class|primary_tag|biotype');
1361 24         63 my $id = $self->find_column('^primary_id');
1362 24         53 my $chromo = $self->find_column('^chr|seq|ref|ref.?seq');
1363 24         48 my $start = $self->find_column('^start|position|pos|txStart');
1364 24         46 my $stop = $self->find_column('^stop|end|txEnd');
1365 24         52 my $strand = $self->find_column('^strand');
1366 24         229 my $score = $self->find_column('^score$');
1367            
1368             # check for coordinate string
1369 24         44 my $coord = $self->find_column('^coordinate$');
1370 24 50 66     120 if (not defined $coord and not defined $chromo and not defined $start) {
      66        
1371             # check if ID or name id looks like a coordinate string
1372 12 50 33     171 if (defined $id and defined $self->{data_table}->[1]) {
1373 12 100       61 if ($self->{data_table}->[1][$id] =~ /^[\w\-\.]+:\d+(?:[\-\.]{1,2}\d+)?$/) {
1374             # first value looks like a coordinate string
1375 8         15 $coord = $id;
1376             }
1377             }
1378 12 50 66     53 if (
      66        
1379             not defined $coord and defined $name and
1380             defined defined $self->{data_table}->[1]
1381             ) {
1382 4 50       14 if ($self->{data_table}->[1][$name] =~ /^[\w\-\.]+:\d+(?:[\-\.]{1,2}\d+)?$/) {
1383             # first value looks like a coordinate string
1384 0         0 $coord = $name;
1385             }
1386             }
1387             }
1388            
1389             # determine zero-based start coordinates
1390 24 100 100     115 if (
      100        
1391             $self->{zerostart} == 0 and
1392             defined $start and
1393             substr($self->name($start), -1) eq '0'
1394             ) {
1395 1         3 $self->{zerostart} = 1;
1396             }
1397            
1398             # cache the indexes
1399             $self->{column_indices} = {
1400 24         209 'name' => $name,
1401             'type' => $type,
1402             'id' => $id,
1403             'seq_id' => $chromo,
1404             'chromo' => $chromo,
1405             'start' => $start,
1406             'stop' => $stop,
1407             'end' => $stop,
1408             'strand' => $strand,
1409             'score' => $score,
1410             'coord' => $coord,
1411             };
1412 24         40 return 1;
1413             }
1414              
1415             sub chromo_column {
1416 26     26 1 36 my $self = shift;
1417 26 100       102 $self->_find_column_indices unless exists $self->{column_indices};
1418 26 100       52 if (defined $_[0]) {
1419 2 50 33     14 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1420 2         3 $self->{column_indices}{chromo} = $_[0];
1421             }
1422             else {
1423 0         0 carp sprintf("chromo_column %s is not a valid index!", $_[0]);
1424             }
1425             }
1426 26         103 return $self->{column_indices}{chromo};
1427             }
1428              
1429             sub start_column {
1430 117     117 1 131 my $self = shift;
1431 117 100       173 $self->_find_column_indices unless exists $self->{column_indices};
1432 117 100       156 if (defined $_[0]) {
1433 2 50 33     11 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1434 2         4 $self->{column_indices}{start} = $_[0];
1435             }
1436             else {
1437 0         0 carp sprintf("start_column %s is not a valid index!", $_[0]);
1438             }
1439             }
1440 117         226 return $self->{column_indices}{start};
1441             }
1442              
1443             sub stop_column {
1444 21     21 1 33 my $self = shift;
1445 21 50       47 $self->_find_column_indices unless exists $self->{column_indices};
1446 21 100       40 if (defined $_[0]) {
1447 2 50 33     11 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1448 2         3 $self->{column_indices}{stop} = $_[0];
1449             }
1450             else {
1451 0         0 carp sprintf("stop_column %s is not a valid index!", $_[0]);
1452             }
1453             }
1454 21         51 return $self->{column_indices}{stop};
1455             }
1456             *end_column = \&stop_column;
1457              
1458             sub coord_column {
1459 109     109 0 105 my $self = shift;
1460 109 50       169 $self->_find_column_indices unless exists $self->{column_indices};
1461 109 50       140 if (defined $_[0]) {
1462 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1463 0         0 $self->{column_indices}{coord} = $_[0];
1464             }
1465             else {
1466 0         0 carp sprintf("coord_column %s is not a valid index!", $_[0]);
1467             }
1468             }
1469 109         162 return $self->{column_indices}{coord};
1470             }
1471              
1472             sub strand_column {
1473 20     20 1 32 my $self = shift;
1474 20 100       55 $self->_find_column_indices unless exists $self->{column_indices};
1475 20 100       44 if (defined $_[0]) {
1476 2 50 33     11 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1477 2         4 $self->{column_indices}{strand} = $_[0];
1478             }
1479             else {
1480 0         0 carp sprintf("strand_column %s is not a valid index!", $_[0]);
1481             }
1482             }
1483 20         64 return $self->{column_indices}{strand};
1484             }
1485              
1486             sub name_column {
1487 72     72 1 76 my $self = shift;
1488 72 100       109 $self->_find_column_indices unless exists $self->{column_indices};
1489 72 50       107 if (defined $_[0]) {
1490 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1491 0         0 $self->{column_indices}{name} = $_[0];
1492             }
1493             else {
1494 0         0 carp sprintf("name_column %s is not a valid index!", $_[0]);
1495             }
1496             }
1497 72         152 return $self->{column_indices}{name};
1498             }
1499              
1500             sub type_column {
1501 14     14 1 20 my $self = shift;
1502 14 100       49 $self->_find_column_indices unless exists $self->{column_indices};
1503 14 100       26 if (defined $_[0]) {
1504 2 50 33     12 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1505 2         3 $self->{column_indices}{type} = $_[0];
1506             }
1507             else {
1508 0         0 carp sprintf("type_column %s is not a valid index!", $_[0]);
1509             }
1510             }
1511 14         34 return $self->{column_indices}{type};
1512             }
1513              
1514             sub id_column {
1515 71     71 1 83 my $self = shift;
1516 71 100       111 $self->_find_column_indices unless exists $self->{column_indices};
1517 71 50       103 if (defined $_[0]) {
1518 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1519 0         0 $self->{column_indices}{id} = $_[0];
1520             }
1521             else {
1522 0         0 carp sprintf("id_column %s is not a valid index!", $_[0]);
1523             }
1524             }
1525 71         133 return $self->{column_indices}{id};
1526             }
1527              
1528             sub score_column {
1529 0     0 1 0 my $self = shift;
1530 0 0       0 $self->_find_column_indices unless exists $self->{column_indices};
1531 0 0       0 if (defined $_[0]) {
1532 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1533 0         0 $self->{column_indices}{score} = $_[0];
1534             }
1535             else {
1536 0         0 carp sprintf("score_column %s is not a valid index!", $_[0]);
1537             }
1538             }
1539 0         0 return $self->{column_indices}{score};
1540             }
1541              
1542             *zero_start = \&interbase;
1543             sub interbase {
1544 90     90 1 97 my $self = shift;
1545 90 50       116 if (@_) {
1546 0         0 my $i = $self->start_column;
1547 0         0 my $n = $self->name($i);
1548 0 0 0     0 if ($_[0] eq '1' and $n =~ /^start$/i) {
    0 0        
1549 0         0 $self->{zerostart} = 1;
1550 0         0 $self->name($i, 'Start0');
1551             }
1552             elsif ($_[0] eq '0' and $n =~ /^start0$/i) {
1553 0         0 $self->{zerostart} = 0;
1554 0         0 $self->name($i, 'Start');
1555             }
1556             else {
1557 0         0 carp "use 1 (true) or 0 (false) to set interbase mode";
1558             }
1559             }
1560 90         157 return $self->{zerostart};
1561             }
1562              
1563             #### Special Row methods ####
1564              
1565             # Why is this in core and not in Data? I keep asking myself.
1566             # Because this can get called from a Data::Feature object, which might be
1567             # associated with a Data::Stream object. No, Stream objects don't have stored
1568             # SeqFeatures, but I don't want the entire program to crash because of an
1569             # undefined method because some doofus forgot. Since both Data and Stream
1570             # objects inherit from Data::core, this is in here.
1571             sub get_seqfeature {
1572 13     13 1 34 my ($self, $row) = @_;
1573 13 50       51 return unless (ref($self) eq 'Bio::ToolBox::Data');
1574 13 50 33     78 return unless ($row and $row <= $self->{last_row});
1575 13 50       40 return unless exists $self->{SeqFeatureObjects};
1576 13   50     48 return $self->{SeqFeatureObjects}->[$row] || undef;
1577             }
1578              
1579              
1580              
1581             __END__