package Bio::ToolBox::Data;

our $VERSION = '1.68';

=head1 NAME

Bio::ToolBox::Data - Reading, writing, and manipulating data structures

=head1 SYNOPSIS

  use Bio::ToolBox::Data;
  
  ### Open a pre-existing file
  # a data table with the same columns as the input file
  my $Data = Bio::ToolBox::Data->new(
        file => 'coordinates.bed',
  );
  
  ### Parse a GTF, GFF3, refFlat, or genePred gene table
  # a data table of names plus references to fully parsed
  # SeqFeature transcript objects with exon SeqFeature subfeatures
  my $Data = Bio::ToolBox::Data->new(
        file       => 'annotation.gtf.gz',
        parse      => 1,
        feature    => 'transcript',
        subfeature => 'exon',
  );
  
  ### New gene list from a Bio::DB::SeqFeature::Store database
  # a data table with names and reference IDs to database SeqFeature objects
  my $Data = Bio::ToolBox::Data->new(
        db      => 'hg19.sqlite',
        feature => 'gene:ensGene',
  );
  
  ### Get a specific value
  my $value = $Data->value($row, $column);
  
  ### Replace or add a value
  $Data->value($row, $column, $new_value);
  
  ### Add a column
  my $new_index = $Data->add_column('Data2');
  
  ### Find a column
  my $old_index = $Data->find_column('Data1');
  
  ### Return a specific row as an object
  my $row = $Data->get_row($i); # Bio::ToolBox::Data::Feature
  
  ### Iterate through a Data structure one row at a time
  my $stream = $Data->row_stream;
  while (my $row = $stream->next_row) {
      # each row is returned as a Bio::ToolBox::Data::Feature object
      # get the positional information from the file data,
      # assuming that the input file had these identifiable columns
      my $seq_id = $row->seq_id;
      my $start  = $row->start;
      my $stop   = $row->end;
      
      # work with the referenced SeqFeature object
      my $seqf = $row->seqfeature;
      
      # generate a new Bio::Seq object from the database using
      # these coordinates
      my $region = $db->segment($seq_id, $start, $stop);
      
      # modify a row value
      my $value     = $row->value($old_index);
      my $new_value = $value + 1;
      $row->value($new_index, $new_value);
  }
  
  ### Iterate through a Data table with a code reference
  $Data->iterate(\&my_code);
  
  ### Write the data to file
  my $success = $Data->write_file(
        filename => 'new_data.txt',
        gz       => 1,
  );
  print "wrote new file $success\n"; # file is new_data.txt.gz

=head1 DESCRIPTION

This module works with the primary Bio::ToolBox Data structure. Simply put, it
is a complex data structure representing a tab-delimited table (an array
of arrays), with plenty of options for metadata. Many common bioinformatic
file formats are simply tab-delimited text files (think BED, GFF, VCF).
Each row is a feature or genomic interval, and each column is a piece of
information about that feature, such as name, type, and/or coordinates.
We can append additional columns of information to that file, perhaps
scores from genomic data sets. We can record metadata regarding how
and where we obtained that data. Finally, we can write the updated
table to a new file.

=head1 METHODS

=head2 Initializing the structure

=over 4

=item new

Initialize a new Data structure. This generally requires options,
provided as an array of key =E<gt> value pairs. A new list of features
may be obtained from an annotation database, or an existing file
may be loaded. If you do not pass any options, a new empty
structure will be generated for you to populate.

These are the options available.

=over 4

=item file

=item in

  my $Data = Bio::ToolBox::Data->new(file => $file);

Provide the path and name of an existing tab-delimited text
file from which to load the contents. This is a shortcut to the
load_file() method. See that method for more details.

=item stream

  my $Data = Bio::ToolBox::Data->new(file => $file, stream => 1);

Boolean option indicating that the file should be opened as a file
stream. A L<Bio::ToolBox::Data::Stream> object will be returned. This
is a convenience option.

=item noheader

  my $Data = Bio::ToolBox::Data->new(file => $file, noheader => 1);

Boolean option indicating that the file does not have file headers,
in which case dummy headers are provided. This is not necessary for
defined file types that don't normally have file headers, such as
BED, GFF, or UCSC files.

=item parse

  my $Data = Bio::ToolBox::Data->new(file => $file, parse => 1);

Boolean option indicating that a gene annotation table or file should
be parsed into SeqFeature objects, and that a general table of names and IDs
representing those objects should be generated. The annotation file may
be specified in one of two ways: through the file option above,
or in the database metadata of an existing table file representing
previously parsed objects.

=item db

  my $Data = Bio::ToolBox::Data->new(db => 'hg19', feature => 'gene');

Provide the name of the database from which to collect the
features. It may be a short name, whereupon it is checked in
the F<.biotoolbox.cfg> configuration file for
connection information. Alternatively, a path to a database
file or directory may be given.

If you already have an opened L<Bio::DB::SeqFeature::Store> database
object, you can simply pass that. See L<Bio::ToolBox::db_helper> for
more information. In general, however, this should be discouraged,
since the name of the database will not be properly recorded when
saving to file. It is better to simply pass the name of the database
again; multiple connections to the same database are smartly handled
in the background.

=item feature

  my $Data = Bio::ToolBox::Data->new(file => $filename, feature => 'gene');

For de novo lists from an annotation database, provide the GFF
type or type:source (columns 3 and 2) for collection. A comma-delimited
string may be accepted for databases. For parsed files,
only a simple string is accepted.

For a list of genomic intervals across the genome, specify a
feature of 'genome' with a database object.

=item subfeature

When parsing annotation files such as GTF, one or more subfeature
types, e.g. C<exon> or C<cds>, may be specified as a comma-delimited
string. This ensures that the subfeatures will be parsed into
SeqFeature objects. Otherwise, only the top-level features will be
parsed. This expedites parsing by skipping unwanted features.

=item win

=item step

  my $Data = Bio::ToolBox::Data->new(db => $dbname, win => $integer);

If generating a list of genomic intervals, optionally provide
the window and step values. The default is 500 bp.

=item chrskip

Provide a regular expression compatible string or C<qr> string for skipping or
excluding specific chromosomes or classes of chromosomes, for example the mitochondrial
chromosome or unmapped contigs. This works with both feature collection
and genomic intervals. The default is to take all chromosomes.

=back

When neither a file to load nor a database to search is given, a new,
empty Data object is returned. In this case, you may optionally
specify the names of the columns or indicate a specific file
format structure to assign column names. The following options can
then be provided.

=over 4

=item columns

=item datasets

  my $Data = Bio::ToolBox::Data->new(columns => [qw(Column1 Column2 ...)] );

Provide the column names in advance as an anonymous array.

=item gff

Pass the GFF version of the file: 1, 2, 2.5 (GTF), or 3.

=item bed

Pass the number of BED columns (3-12).

=item ucsc

Pass the number of columns to indicate the type of UCSC format. These
include 10 (refFlat without gene names), 11 (refFlat with gene names),
12 (knownGene gene prediction table), and 15
(an extended gene prediction or genePredExt table).

=back

If successful, the method will return a Bio::ToolBox::Data object.
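
As a minimal sketch of the empty-object case, a table can be built from
scratch with hypothetical column names, populated with add_row(), and
written out; the column names and file name below are arbitrary examples.

  use Bio::ToolBox::Data;
  
  # a new, empty table with predefined (hypothetical) column names
  my $Data = Bio::ToolBox::Data->new(
      columns => [qw(Chromosome Start Stop Score)],
  );
  
  # add rows one at a time as array references
  $Data->add_row( [ 'chr1', 100, 200, 0.5 ] );
  $Data->add_row( [ 'chr1', 300, 400, 1.2 ] );
  
  # write the table as a plain tab-delimited text file
  my $written = $Data->save(filename => 'example.txt');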

=item duplicate

This will create a new Data object containing the same column
headers and metadata, but lacking the table content, i.e. no
rows of data. File name metadata, if present in the original, is
not preserved. The purpose here, for example, is to allow one
to selectively copy rows from one Data object to another.
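
For example, a short sketch of selective row copying might look like the
following; the score column index and cutoff are arbitrary assumptions.

  my $Copy   = $Data->duplicate;
  my $stream = $Data->row_stream;
  while (my $row = $stream->next_row) {
      # keep only rows whose (assumed) score column passes a cutoff
      $Copy->add_row($row) if $row->value(4) > 0.9;
  }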

=item parse_table

  $Data->parse_table($file);
  $Data->parse_table( {
      file       => $file,
      feature    => 'gene',
      subfeature => 'exon',
      chrskip    => 'chrM|contig',
  } );

This will parse a gene annotation table into SeqFeature objects.
If this is called from an empty Data object, then the table will
be filled with the SeqFeature object names and IDs. If this is
called from a non-empty Data object, then the table's contents
will be associated with the SeqFeature objects using their name and
ID. The stored SeqFeature objects can be retrieved using the
get_seqfeature() method.

Pass the method a single argument. This may be either a simple
scalar with the filename to be parsed, or a reference to a hash of
one or more argument options. Possible options include:

=over 4

=item file

The file name of the gene table to be parsed. This may
be a GFF, GFF3, GTF, or any of the UCSC gene table formats.
These will be parsed using Bio::ToolBox::parser::* adapters.

=item feature

A regular expression compatible string or C<qr> string to match
the C<primary_tag> of the top features to keep. The C<source_tag> is not
checked. The default is 'gene', although a transcript type could
alternatively be specified (in which case transcripts are not
parsed from within gene features).

=item subfeature

A regular expression compatible string or C<qr> string to match
the C<primary_tag> of any subfeatures to parse. The C<source_tag> is not checked.
Typically these include exon, CDS, or UTR. The default is nothing.

=item chrskip

A regular expression compatible string or C<qr> string to match
chromosomes to be skipped or excluded. Any feature with a matching
C<seq_id> chromosome name will be skipped.

=back
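
As a sketch, parsing a hypothetical GTF file into an empty object and then
retrieving the stored SeqFeature object for the first data row might look
like this.

  my $Data = Bio::ToolBox::Data->new;
  $Data->parse_table( {
      file       => 'annotation.gtf.gz',   # hypothetical file name
      feature    => 'transcript',
      subfeature => 'exon',
      chrskip    => 'chrM',
  } );
  my $transcript = $Data->get_seqfeature(1);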

=back

=head2 General Metadata

There is a variety of general metadata regarding the Data
structure. The following methods may be used to access or set these
metadata properties. Some of these are stored as comment lines at
the beginning of the file, and will be read or set when the file is
loaded.

=over 4

=item feature

  $Data->feature('something');
  my $feature = $Data->feature;

Returns or sets the name of the features used to collect
the list of features. The actual feature types are listed
in the table, so this metadata is merely descriptive.

=item feature_type

Returns one of three specific values describing the contents
of the data table, inferred by the presence of specific column
names. This provides a clue as to whether the table features
represent genomic regions (defined by coordinate positions) or
named database features. The return values include:

=over 4

=item * coordinate

Table includes at least chromosome and start columns.

=item * named

Table includes name, type, and/or Primary_ID columns, possibly
referring to named database features.

=item * unknown

Table is in an unrecognized format.

=back

=item program

Returns or sets the name of the program that generated the list.

=item database

Returns or sets the name or path of the database from which the
features were derived.

=item gff

Returns or sets the version of loaded GFF files. Supported versions
include 1, 2, 2.5 (GTF), and 3.

=item bed

Returns or sets the BED file version. Here, the BED version is simply
the number of columns.

=item ucsc

Returns or sets the UCSC file format version. Here, the version is
simply the number of columns. Supported versions include 10 (gene
prediction), 11 (refFlat, or gene prediction with gene name), 12
(knownGene table), 15 (extended gene prediction), or 16 (extended
gene prediction with bin).

=item vcf

Returns or sets the VCF file version number. VCF support is limited.

=back
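
For example, the general metadata of a previously loaded table could be
inspected as in this minimal sketch (a loaded C<$Data> object is assumed).

  printf "feature:      %s\n", $Data->feature  || 'none';
  printf "feature type: %s\n", $Data->feature_type;  # coordinate, named, or unknown
  printf "program:      %s\n", $Data->program  || 'none';
  printf "database:     %s\n", $Data->database || 'none';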

=head2 File information

These methods provide information about the file from which the
data table was loaded. This does not include parsed annotation tables.

=over 4

=item filename

=item path

=item basename

=item extension

Returns the filename, full path, basename, and extension of
the filename, respectively. Concatenating the last three values will
reconstitute the original filename.

=item add_file_metadata

  $Data->add_file_metadata('/path/to/file.txt');

Add filename metadata. This will automatically parse the path,
basename, and recognized extension from the passed filename and
set the appropriate metadata attributes.

=back
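
For example, the relationship between these values can be seen in the
following sketch, which assumes the file name shown.

  $Data->add_file_metadata('/path/to/file.txt');
  # path + basename + extension reconstitute the original filename
  my $full = $Data->path . $Data->basename . $Data->extension;
  # $full and $Data->filename are both '/path/to/file.txt'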

=head2 Metadata comments

Comments are any other commented lines from a text file (lines
beginning with a #) that were not parsed as general metadata.

=over 4

=item comments

Returns a copy of the array containing commented lines. Each
comment line becomes an element in the array.

=item add_comment

Appends the text string to the comment array.

=item delete_comment

Deletes a comment. Provide the array index of the comment to
delete. If an index is not provided, ALL comments will be deleted!

=item vcf_headers

For VCF files, this will partially parse the VCF headers into a
hash structure that can be queried or manipulated. Each header
line is parsed for the primary key, being the first word after the
## prefix, e.g. INFO, FORMAT, FILTER, contig, etc. Simple
values are stored directly as the value. For complex entries, such as
with INFO and FORMAT, a second-level hash is created with the ID
extracted and used as the second-level key. The value is always the
remainder of the string.

For example, the following would be a simple parsed vcf header in
code representation.

  $vcf_header = {
      FORMAT => {
          GT => q(ID=GT,Number=1,Type=String,Description="Genotype"),
          AD => q(ID=AD,Number=.,Type=Integer,Description="ref,alt Allelic depths"),
      },
      fileDate => 20150715,
  }

=item rewrite_vcf_headers

If you have altered the vcf headers exported by the vcf_headers()
method, then this method will rewrite the hash structure as new
comment lines. Do this prior to writing or saving the Data structure,
or else you will lose your changed VCF header metadata.

=back
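
As a small sketch, a comment can be recorded and, for a VCF file, a header
value adjusted and written back before saving; the comment text and date
below are arbitrary examples.

  $Data->add_comment('scores were collected on 2024-01-01');
  my @comments = $Data->comments;
  
  # for a loaded VCF file only
  my $headers = $Data->vcf_headers;
  $headers->{fileDate} = 20240101;
  $Data->rewrite_vcf_headers;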

=head2 The Data table

The Data table is the array of arrays containing all of the
actual information. Rows and columns are indexed using 0-based
indexing as with all Perl arrays. Row 0 is always the column
header row containing the column names, regardless of whether an
actual header name existed in the original file format (e.g.
BED or GFF formats). Any individual table "cell" can be
specified as C<[$row][$column]>.

=over 4

=item list_columns

Returns an array or array reference of the column names
in ascending (left to right) order.

=item number_columns

Returns the number of columns in the Data table.

=item last_column

Returns the array index number of the last (rightmost)
column. This number is always 1 less than the value
returned by number_columns() due to 0-based indexing.

=item last_row

Returns the array index number of the last row.
Since the header row is index 0, this is also the
number of actual content rows.

=item column_values

  my $values = $Data->column_values($i);

Returns an array or array reference representing the values
in the specified column. This includes the column header as
the first element. Pass the method the column index.

=item add_column

  # add an empty column with a name
  my $i = $Data->add_column($name);
  
  # add an entire column
  my $i = $Data->add_column(\@array);

Appends a new column to the Data table at the
rightmost position (highest index). It adds the column
header name and creates a new column metadata hash.
Pass the method one of two possibilities. Pass a text
string representing the new column name, in which case
no data will be added to the column. Alternatively, pass
an array reference, and the contents of the array will
become the column data. If the Data table already has
rows, then the passed array reference must have the same
number of elements.

It returns the new column index if successful.

=item copy_column

  my $j = $Data->copy_column($i);

This will copy a column, appending the duplicate column at
the rightmost position (highest index). It will duplicate
column metadata as well. It will return the new index
position.

=item delete_column

  $Data->delete_column($i, $j);

Deletes one or more specified columns. Any remaining
columns to the right will have their indices shifted
down appropriately. If you had identified one of the
shifted columns, you may need to re-find or recalculate
its new index.

=item reorder_column

  $Data->reorder_column($c, $b, $a, $a);

Reorders columns into the specified order. Provide the
new desired order of indices. Columns may be duplicated
or deleted using this method. The columns will adopt their
new index numbers.

=item add_row

  $Data->add_row(\@values);
  $Data->add_row($Row); # Bio::ToolBox::Data::Feature object

Add a new row of data values to the end of the Data table.
Optionally provide either a reference to an array of values
to put in the row, or pass a L<Bio::ToolBox::Data::Feature>
Row object, such as one obtained from another Data object.
If the number of columns does not match, the array is filled
out with null values for missing columns, or excess values
are dropped.

=item get_row

  $Data->get_row($i); # Bio::ToolBox::Data::Feature object

Returns the specified row as a L<Bio::ToolBox::Data::Feature>
object.

=item delete_row

  $Data->delete_row($i, $j);

Deletes one or more specified rows. Rows are spliced out
from highest to lowest index to avoid issues. Be very careful
deleting rows while simultaneously iterating through the
table!

=item row_values

  my $row_values = $Data->row_values($i);

Returns a copy of an array for the specified row index.
Modifying this returned array does not migrate back to the
Data table; use the value() method below instead.

=item value

  my $value = $Data->value($row, $column);
  $Data->value($row, $column, $new_value);

Returns or sets the value at a specific row and column index.
Index positions are 0-based (the header row is index 0).

=back
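
For example, a new column can be appended and filled row by row; this sketch
assumes the table has recognizable coordinate columns.

  my $len_i = $Data->add_column('Length');
  $Data->iterate( sub {
      my $row = shift;
      $row->value( $len_i, $row->end - $row->start + 1 );
  } );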

=head2 Column Metadata

Each column has metadata. Each metadata entry is a series of key =E<gt>
value pairs. The minimum keys are 'index' (the 0-based index
of the column) and 'name' (the column header name). Additional
keys and values may be queried or set as appropriate. When the
file is written, these are stored as commented metadata lines at
the beginning of the file.

=over 4

=item name

  $Data->name($index, $new_name);
  my $name = $Data->name($i);

Convenient method to return the name of the column given the
index number. A column may also be renamed by passing a new name.

=item metadata

  $Data->metadata($index, $key, $new_value);
  my $value = $Data->metadata($index, $key);

Returns or sets the metadata value for a specific $key for a
specific column $index.

This may also be used to add a new metadata key. Simply provide
the name of a new $key that is not already present.

If no key is provided, then a hash or hash reference is returned
representing the entire metadata for that column.

=item copy_metadata

  $Data->copy_metadata($source, $target);

This method will copy the metadata (everything except name and
index) between the source column and target column. Returns 1 if
successful.

=item delete_metadata

  $Data->delete_metadata($index, $key);

Deletes a column-specific metadata $key and value for a specific
column $index. If a $key is not provided, then all metadata keys
for that index will be deleted.

=item find_column

  my $i = $Data->find_column('Gene');
  my $i = $Data->find_column('^Gene$');

Searches the column names for the specified column name. This
employs a case-insensitive grep search, so simple substitutions
may be made.

=item chromo_column

=item start_column

=item stop_column

=item strand_column

=item name_column

=item type_column

=item id_column

These methods will return the column index best matching
the description. Returns C<undef> if that column is not present.
These use the find_column() method with a predefined list of
aliases.

=back
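
As a sketch, column metadata can be queried and set like this; the metadata
key and value are arbitrary examples.

  my $score_i = $Data->find_column('score');
  if (defined $score_i) {
      printf "column %d is named %s\n", $score_i, $Data->name($score_i);
      # record where the scores came from as column metadata
      $Data->metadata($score_i, 'dataset', 'rnaseq.bw');
  }
  
  # coordinate columns are found through predefined aliases
  my $chr_i = $Data->chromo_column;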

=head2 Working with databases

These are methods for working primarily with annotation databases.

=over 4

=item open_database

This is a wrapper method that tries to do the right thing and passes
on to either the open_meta_database() or open_new_database() methods.
It is basically a legacy method for open_meta_database().

=item open_meta_database

Open the database that is listed in the metadata. Returns the
database connection. Pass a true value to force a new database
connection to be opened, rather than returning a cached connection
object (useful when forking).

=item open_new_database

Convenience method for opening a second or new database that is
not specified in the metadata, useful for data collection. This
is a shortcut to the open_db_connection() function in
L<Bio::ToolBox::db_helper>. Pass the database name.

=back
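
For example, database connections might be handled as in this sketch; the
database file name is a hypothetical example.

  # re-open the database recorded in the metadata, e.g. after forking
  my $db = $Data->open_meta_database(1);   # true value forces a new connection
  
  # open an additional database not listed in the metadata
  my $db2 = $Data->open_new_database('hg19.sqlite');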

=head2 SeqFeature Objects

SeqFeature objects corresponding to data rows can be stored in the Data
object. This can be useful if the SeqFeature object is not readily
available from a database or is processor intensive to generate or
parse. Note that storing large numbers of objects will increase memory
usage.

SeqFeature objects are usually either L<Bio::ToolBox::SeqFeature>,
L<Bio::SeqFeature::Lite>, or L<Bio::DB::SeqFeature> objects, depending
upon their source. More information can be obtained from the
L<Bio::SeqFeatureI> abstract API.

=over 4

=item store_seqfeature

  $Data->store_seqfeature($row_index, $seqfeature);

Stores the SeqFeature object for the given row index. Only one SeqFeature
object can be stored per row.

=item get_seqfeature

  my $feature = $Data->get_seqfeature($row_index);

Retrieves the SeqFeature object for the given row index.

=item delete_seqfeature

  $Data->delete_seqfeature($row_index);

Removes the SeqFeature object for the given row index.

=item collapse_gene_transcripts

  my $success = $Data->collapse_gene_transcripts;

This method will iterate through a Data table and collapse multiple alternative
transcript subfeatures of stored gene SeqFeature objects in the table. Exons
of multiple transcripts will be merged, maximizing exon size and minimizing
intron size. Genes with only one transcript will not be affected. Stored
SeqFeature objects that do not have a C<primary_tag> of "gene" are silently
skipped. Refer to the corresponding method in L<Bio::ToolBox::GeneTools>
for more details. The number of rows successfully collapsed is returned.

=item add_transcript_length

  my $length_index = $Data->add_transcript_length;
  my $length_index = $Data->add_transcript_length('cds');

This method will generate a new column in the Data table representing the
length of a transcript, i.e. the sum of the lengths of subfeatures for
the stored SeqFeature object in the Data table. The default subfeature is
C<exon>; however, alternative subfeature types may be passed to the method
and used. These include C<cds>, C<5p_utr>, and C<3p_utr> for the CDS, the
5' UTR, and the 3' UTR, respectively. See the corresponding transcript length
methods in L<Bio::ToolBox::GeneTools> for more information. If a length
is not calculated, for example when the feature's C<primary_tag> is "gene",
then the simple length of the feature is recorded.

The name of the new column is one of "Merged_Transcript_Length" for exon,
"Transcript_CDS_Length", "Transcript_5p_UTR_Length", or "Transcript_3p_UTR_Length".
The index of the new length column is returned.

=back
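
As a sketch, a parsed gene table can have its stored gene objects collapsed
and a CDS length column added; the annotation file name is hypothetical.

  my $Data = Bio::ToolBox::Data->new(
      file       => 'annotation.gtf.gz',
      parse      => 1,
      feature    => 'gene',
      subfeature => 'exon,cds',
  );
  my $collapsed = $Data->collapse_gene_transcripts;
  my $length_i  = $Data->add_transcript_length('cds');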

=head2 Data Table Functions

These methods alter the Data table en masse.

=over 4

=item verify

This method will verify the Data structure, including the metadata and the
Data table. It ensures that the table has the correct number of rows and
columns as described in the metadata, and that each column has the basic
metadata.

If the Data structure is marked as a GFF or BED structure, then the table
is checked to confirm that it matches the proper format. If not, for
example when additional columns have been added, then the GFF or BED value
is set to null.

This method is automatically called prior to writing the Data table to file.

=item sort_data

  $Data->sort_data($index, 'i'); # increasing sort

This method will sort the Data table by the values in the indicated column.
It will automatically determine whether the contents of the column are
numbers or alphanumeric, and will sort accordingly, either numerically or
asciibetically. The first non-null value in the column is used to decide.
The sort may fail if the values are not consistent. Pass a second optional
value to indicate the direction of the sort. The value should be either
'i' for 'increasing' or 'd' for 'decreasing'. The default order is
increasing. (See also the short example following this list.)

=item gsort_data

This method will sort the Data table by increasing genomic coordinates. It
requires the presence of chromosome and start (or position) columns,
identified by their column names. These are automatically identified.
Failure to find these columns means a failure to sort the table. Chromosome
names are sorted first by their digits (e.g. chr2 before chr10), and then
alphanumerically. Base coordinates are sorted by increasing value.
Identical positions are kept in their original order.

=item splice_data

  my $Data = Bio::ToolBox::Data->new(file => $file);
  my $pm = Parallel::ForkManager->new(4);
  for my $i (1..4) {
      $pm->start and next;
      ### in child ###
      $Data->splice_data($i, 4);
      # do something with this portion
      # then save to a temporary unique file
      $Data->save("${file}_$i");
      $pm->finish;
  }
  $pm->wait_all_children;
  # reload children files
  $Data->reload_children(glob "${file}_*");

This method will splice the Data table into C<$total_parts> number of pieces,
retaining the C<$current_part> piece. The other parts are discarded. This
method is intended to be used when a program is forked into separate
processes, allowing each child process to work on a subset of the original
Data table.

Two values are passed to the method. The first is the current part number,
1-based. The second value is the total number of parts into which the table
should be divided, corresponding to the number of concurrent processes.
One easy approach to forking is to use L<Parallel::ForkManager>. The
above example shows how to fork into four concurrent processes.

Since each forked child process is separate from its parent process,
its contents must be reloaded into the current Data object. The
L<Parallel::ForkManager> documentation recommends going through a disk
file intermediate. Therefore, write each child Data object to file using
a unique name. Once all children have been reaped, they can be reloaded
into the current Data object using the reload_children() method.

Remember that if you fork your script into child processes, any database
connections must be re-opened; they are typically not clone safe. If you
have an existing database connection opened with the open_meta_database()
method, it should be automatically re-opened for you when you use the
splice_data() method, but you will need to call open_meta_database() again
in the child process to obtain the new database object.

=item reload_children

Discards the current data table in memory and reloads two or more files
written from forked children processes. Provide the names of the child
files in the order you want them loaded. The files will be automatically
deleted if successfully loaded. Returns the number of lines reloaded on
success.

=back
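
The sorting methods referenced above might be used as in this short sketch,
assuming a column named "score" exists.

  # sort rows by the score column, highest value first
  my $score_i = $Data->find_column('score');
  $Data->sort_data($score_i, 'd') if defined $score_i;
  
  # or restore genomic order by chromosome and start position
  $Data->gsort_data;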

=head2 File Functions

The Data table may be read in from a file or written out as a file. In
all cases, it is a tab-delimited text file, whether as an ad hoc table
or a specific bioinformatic format, e.g. BED, GFF, etc. Multiple common
file formats are supported. Column headers are assumed, except in those
cases where they are not, e.g. BED, GFF, etc. Metadata may be included as
commented lines at the beginning of the file, prefixed with a C<#> symbol.
Reading and writing gzip compressed files is fully supported.

=over 4

=item load_file

  $Data->load_file($filename);

This will load a file into a new, empty Data table. This function is
called automatically when a filename is provided to the new() function.
The existence of the file is first checked (appending common missing
extensions as necessary), metadata and column headers are processed and/or
generated from default settings, the content is loaded into the table, and
the structure is verified. Error messages may be printed if the structure or
format is inconsistent or doesn't match the expected format, e.g. a file
with a F<.bed> extension that doesn't match the UCSC specification.
Pass the name of the file.

=item taste_file

  my $flavor = $Data->taste_file($filename);
  # returns gff, bed, ucsc, or undef

Tastes, or checks, a file for a certain flavor, or known gene file formats.
This is based on the file extension, metadata headers, and/or file content
in the first 10 lines or so. Returns a string based on the file format.
Values include gff, bed, ucsc, or undefined. This is useful for determining
whether the file represents a known gene table format that lacks a defined
file extension, e.g. UCSC formats.

=item save

=item write_file

  my $success = $Data->save;
  my $success = $Data->save('my_file.txt');
  my $success = $Data->save(filename => $file, gz => 1);
  print "file $success was saved!\n";

Pass the file name to be written. If no file name is passed, then
the filename and path stored in the metadata are used, if present.

These methods will write the Data structure out to file. It will
first be verified for proper structure. Opened BED and GFF files
are checked to see if their structure is maintained. If so, they
are written in the same format; if not, they are written as regular
tab-delimited text files.

You may pass additional options.

=over 4

=item filename

Optionally pass a new filename. Required for new objects; previously
opened files may be overwritten if a new name is not provided. If
necessary, the file extension may be changed; for example, BED files
that no longer match the defined format lose the .bed and gain a .txt
extension. Compression may add or strip .gz as appropriate. If
a path is not provided, the current working directory is used.

=item gz

Boolean value to change the compression status of the output file. If
overwriting an input file, the default is to maintain the compression
status; otherwise no compression. Pass a 0 for no compression, 1 for
standard gzip compression, or 2 for block gzip (bgzip) compression for
tabix compatibility.

=back

If the file save is successful, it will return the full path and
name of the saved file, complete with any changes to the file extension.

=item summary_file

Write a separate file summarizing columns of data (mean values).
The mean value of each column becomes a row value, and each column
header becomes a row identifier (i.e. the table is transposed). The
best use of this is to summarize the mean profile of windowed data
collected across a feature; see the included Bio::ToolBox data collection
scripts for examples. For data where the columns are expressed
as percentile bins, the reported midpoint column is automatically
converted based on a length of 1000 bp.

You may pass these options. They are optional.

=over 4

=item filename

Pass an optional new filename. The default is to take the basename
and append "_summed" to it.

=item startcolumn

=item stopcolumn

Provide the starting and ending columns to summarize. The default
start is the leftmost column without a recognized standard name.
The default ending column is the last rightmost column. Indexes are
0-based.

=item dataset

Pass a string that is the name of the dataset. This could be collected
from the metadata, if present. This will become the name of the score
column if defined.

=back

The name of the summarized column is either the provided dataset name,
the defined basename in the metadata of the Data structure, or a generic
name. If successful, it will return the name of the file saved.

=back
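
For example, an extension-less downloaded table might be checked, loaded,
and re-saved as in this sketch; the file names are hypothetical.

  my $Data   = Bio::ToolBox::Data->new;
  my $flavor = $Data->taste_file('downloaded_table');
  printf "table flavor: %s\n", $flavor || 'plain text';
  
  # load it, then save a gzip-compressed copy under a new name
  $Data->load_file('downloaded_table');
  my $saved = $Data->save(filename => 'downloaded_table.txt', gz => 1);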

=head2 Verifying Datasets

When working with row Features and collecting scores, the dataset
from which you are collecting must be verified prior to collection.
This ensures that the proper database adaptor is available and loaded,
and that the dataset is correctly specified (otherwise nothing would be
collected). This verification is normally performed transparently when
you call the score collection methods in L<Bio::ToolBox::Data::Feature>.
However, datasets may be explicitly verified prior to calling the score
methods.

=over 4

=item verify_dataset

  my $dataset = $Data->verify_dataset($dataset, $database);

Pass the name of the dataset (GFF type or type:source) for a GFF3-based
database, or the path and file name for a data file, e.g. a Bam, BigWig,
BigBed, or USeq file. If a separate database is being used, pass the name
or opened database object as a second parameter. For more advanced options,
see L<Bio::ToolBox::db_helper>.

The name of the verified dataset, with a prefix if necessary, is returned.

=back
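
As a sketch, a bigWig data file might be verified before score collection;
the file name is a hypothetical example.

  my $dataset = $Data->verify_dataset('scores.bw');
  die "unusable dataset!\n" unless $dataset;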

=head2 Efficient Data Access

Most of the time we need to iterate over the Data table, one row
at a time, collecting data or processing information. These methods
simplify the process.

=over 4

=item iterate

  $Data->iterate( sub {
      my $row = shift;
      my $number = $row->value($index);
      my $log_number = log($number);
      $row->value($index, $log_number);
  } );

This method will process a code reference on every row in the data
table. Pass a subroutine or code reference. The subroutine will
receive the row as a L<Bio::ToolBox::Data::Feature> object. With this
object, you can retrieve values, set values, and add new values.

=item row_stream

This returns a C<Bio::ToolBox::Data::Iterator> object, which has one
method, C<next_row>. Call this method repeatedly until it returns
C<undef> to work through each row of data.

Users of the C<Bio::DB> family of database adaptors may recognize the
analogy to the C<seq_stream> method.

=item next_row

  my $stream = $Data->row_stream;
  while (my $row = $stream->next_row) {
      # each $row is a Bio::ToolBox::Data::Feature object
      # representing the row in the data table
      my $value = $row->value($index);
      # do something with $value
  }

Called from a C<Bio::ToolBox::Data::Iterator> object, it returns a
L<Bio::ToolBox::Data::Feature> row object. If SeqFeature objects are
associated with the row, perhaps from a parsed input annotation file,
then they are automatically associated with the row object. (Previous
versions required separately calling the seqfeature() row method to
do this.)

=back

=head1 SEE ALSO

L<Bio::ToolBox::Data::Feature>, L<Bio::ToolBox::Data::Stream>,
L<Bio::ToolBox::SeqFeature>, L<Bio::ToolBox::db_helper>,
L<Bio::ToolBox::GeneTools>, L<Bio::DB::SeqFeature::Store>,
L<Parallel::ForkManager>

=cut

use strict;
use Carp qw(carp cluck croak confess);
use List::Util qw(sum0);
use base 'Bio::ToolBox::Data::core';
use Bio::ToolBox::db_helper qw(
    get_new_feature_list
    get_new_genome_list
    get_db_feature
);
use Bio::ToolBox::utility qw(simplify_dataset_name);
use Module::Load;

1;

### Initialize

sub new {
    my $class = shift;
    my %args  = @_;

    # check for important arguments
    $args{features} ||= $args{feature} || 'gene';
    $args{stream}   ||= $args{Stream} || 0;
    $args{file}     ||= $args{in} || undef;
    $args{parse}    ||= 0;
    $args{noheader} ||= 0;

    # check for stream
    if ($args{stream}) {
        $class = "Bio::ToolBox::Data::Stream";
        load($class);
        return $class->new(@_);
    }

    # initialize
    my $self = $class->SUPER::new();

    # prepare a new table based on the provided arguments
    if ($args{file} and $args{parse}) {
        # parse from file
        $args{subfeature} ||= '';
        unless ( $self->parse_table(\%args) ) {
            my $l = $self->load_file($args{file});
            return unless $l;
            if ($self->database =~ /^Parsed:(.+)$/ and $self->feature_type eq 'named') {
                # looks like the loaded file was from a previously parsed table
                # let's try this again
                # this may die if it doesn't work
                $args{file}    = $1;
                $args{feature} = $self->feature;
                $self->parse_table(\%args);
            }
        }
    }
    elsif ($args{file}) {
        # load from file
        unless ( $self->load_file($args{file}, $args{noheader}) ) {
            return;
        }
    }
    elsif (exists $args{db} and $args{features}) {
        # generate new list
        $self->feature($args{features});
        $self->database($args{db});
        $args{data} = $self;
        my $result;
        if ($args{features} eq 'genome') {
            $result = get_new_genome_list(%args);
        }
        else {
            $result = get_new_feature_list(%args);
        }
        unless ($result) {
            carp " Cannot generate new $args{features} list!";
            return;
        }
    }
    else {
        # a new empty structure

        # check to see if user provided column names
        $args{columns} ||= $args{datasets} || undef;
        if (defined $args{columns}) {
            foreach my $c ( @{ $args{columns} } ) {
                $self->add_column($c);
            }
            $self->{feature} = $args{feature} if exists $args{feature};
        }

        # or possibly a specified format structure
        elsif (exists $args{gff} and $args{gff}) {
            # use standard names for the number of columns indicated
            # we trust that the user knows the subtle difference between gff versions
|
|
|
0
|
$self->add_gff_metadata($args{gff}); |
1180
|
0
|
0
|
|
|
|
0
|
unless ($self->extension =~ /g[tf]f/) { |
1181
|
|
|
|
|
|
|
$self->{extension} = $args{gff} == 2.5 ? '.gtf' : |
1182
|
0
|
0
|
|
|
|
0
|
$args{gff} == 3 ? '.gff3' : '.gff'; |
|
|
0
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
} |
1184
|
|
|
|
|
|
|
} |
1185
|
|
|
|
|
|
|
elsif (exists $args{bed} and $args{bed}) { |
1186
|
|
|
|
|
|
|
# use standard names for the number of columns indicated |
1187
|
0
|
0
|
0
|
|
|
0
|
unless ($args{bed} =~ /^\d{1,2}$/ and $args{bed} >= 3) { |
1188
|
0
|
|
|
|
|
0
|
carp "bed parameter must be an integer 3-12!"; |
1189
|
0
|
|
|
|
|
0
|
return; |
1190
|
|
|
|
|
|
|
} |
1191
|
0
|
|
|
|
|
0
|
$self->add_bed_metadata($args{bed}); |
1192
|
0
|
0
|
|
|
|
0
|
unless ($self->extension =~ /bed|peak/) { |
1193
|
0
|
|
|
|
|
0
|
$self->{extension} = '.bed'; |
1194
|
|
|
|
|
|
|
} |
1195
|
|
|
|
|
|
|
} |
1196
|
|
|
|
|
|
|
elsif (exists $args{ucsc} and $args{ucsc}) { |
1197
|
|
|
|
|
|
|
# a ucsc format such as refFlat, genePred, or genePredExt |
1198
|
0
|
|
|
|
|
0
|
my $u = $self->add_ucsc_metadata($args{ucsc}); |
1199
|
0
|
0
|
|
|
|
0
|
unless ($u) { |
1200
|
0
|
|
|
|
|
0
|
carp "unrecognized number of columns for ucsc format!"; |
1201
|
0
|
|
|
|
|
0
|
return; |
1202
|
|
|
|
|
|
|
}; |
1203
|
0
|
0
|
|
|
|
0
|
unless ($self->extension =~ /ucsc|ref+lat|genepred/) { |
1204
|
0
|
|
|
|
|
0
|
$self->{extension} = '.ucsc'; |
1205
|
|
|
|
|
|
|
} |
1206
|
|
|
|
|
|
|
} |
1207
|
|
|
|
|
|
|
} |
1208
|
|
|
|
|
|
|
|
1209
|
44
|
|
|
|
|
264
|
return $self; |
1210
|
|
|
|
|
|
|
} |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
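# Illustrative only -- a minimal sketch of building a new, empty table with
# user-supplied column names; the column names and feature value here are
# hypothetical, not taken from the distribution.
# my $Data = Bio::ToolBox::Data->new(
#     columns => [ qw(Chromosome Start Stop Score) ],
#     feature => 'region',
# );
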
sub duplicate {
    my $self = shift;

    # duplicate the data structure
    my $columns = $self->list_columns;
    my $Dupe = $self->new(
        'columns' => $columns,
    ) or return;

    # copy the metadata
    for (my $i = 0; $i < $self->number_columns; $i++) {
        # column metadata
        my %md = $self->metadata($i);
        $Dupe->{$i} = \%md;
    }
    foreach (qw(feature program db bed gff ucsc vcf headers)) {
        # various keys
        $Dupe->{$_} = $self->{$_};
    }
    my @comments = $self->comments;
    push @{$Dupe->{comments}}, @comments;

    return $Dupe;
}

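# Illustrative only -- duplicate() copies the column structure, metadata, and
# comments but not the data rows, so rows are added to the copy separately.
# The row values below are hypothetical.
# my $Copy = $Data->duplicate;
# $Copy->add_row( [ 'chr1', 100, 200 ] );
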
sub parse_table {
    my $self = shift;
    my $args = shift;
    my ($file, $feature, $subfeature, $simplify);
    if (ref $args) {
        $file = $args->{file} || '';
        $feature = $args->{feature} || '';
        $subfeature = $args->{subfeature} || '';
        $simplify = (exists $args->{simplify} and defined $args->{simplify}) ?
            $args->{simplify} : 1; # default is to simplify
    }
    else {
        # no hash reference, assume just a file name
        $file = $args;
        undef $args;
        $feature = undef;
        $subfeature = '';
        $simplify = 1;
    }
    unless ($file) {
        carp "no annotation file provided to parse!";
        return;
    }

    # the file format determines the parser class
    my $flavor = $self->taste_file($file) or return;
    my $class = 'Bio::ToolBox::parser::' . $flavor;
    eval {load $class};
    if ($@) {
        carp "unable to load $class! cannot parse $flavor!";
        return;
    }

    # open parser
    my $parser = $class->new() or return;
    $parser->open_file($file) or return;
    my $typelist = $parser->typelist;

    # set feature based on the type list from the parser
    unless ($feature) {
        if ($typelist =~ /gene/i) {
            $feature = 'gene';
        }
        elsif ($typelist eq 'region') {
            $feature = 'region';
        }
        else {
            $feature = 'rna'; # generic RNA
        }
    }

    # set parser parameters
    $parser->simplify($simplify);
    if ($subfeature) {
        $parser->do_exon(1) if $subfeature =~ /exon/i;
        $parser->do_cds(1) if $subfeature =~ /cds/i;
        $parser->do_utr(1) if $subfeature =~ /utr|untranslated/i;
        $parser->do_codon(1) if $subfeature =~ /codon/i;
    }
    if ($feature =~ /gene$/i) {
        $parser->do_gene(1);
    }
    else {
        $parser->do_gene(0);
    }
    my $mrna_check = 0;
    if (lc($feature) eq 'mrna' and $parser->typelist !~ /mrna/i and not $self->last_row) {
        # user requested mRNA for a new data file but it's not present in the type list
        # look for it the hard way by parsing CDS too - sigh
        load('Bio::ToolBox::GeneTools', 'is_coding');
        $parser->do_cds(1);
        $mrna_check = 1;
    }

    # parse the table
    $parser->parse_file or return;

    # store the SeqFeature objects
    if ($self->last_row > 0) {
        # we already have a table, presumably representing the features
        my $count = 0;
        $self->iterate( sub {
            my $row = shift;
            my $f = $parser->find_gene(
                name => $row->name,
                id => $row->id,
            );
            if ($f) {
                $self->store_seqfeature($row->row_index, $f);
                $count++;
            }
        } );
        unless ($count == $self->last_row) {
            die <<PARSEFAIL;
Not all features in the input file could be matched to a corresponding SeqFeature
object in the annotation file $file.
Double check your input and annotation files. You can create a new table by just
providing your annotation file.
PARSEFAIL
        }
    }
    else {
        # create a new table
        $self->add_column('Primary_ID');
        $self->add_column('Name');

        # check for chromosome exclude
        my $chr_exclude;
        if ($args) {
            $chr_exclude = $args->{chrskip} || undef;
        }

        # fill table with features
        while (my $f = $parser->next_top_feature) {
            if ($chr_exclude) {
                next if $f->seq_id =~ /$chr_exclude/i;
            }
            my $type = $f->type;
            if ($f->type =~ /$feature/i or ($mrna_check and is_coding($f)) ) {
                my $index = $self->add_row([ $f->primary_id, $f->display_name ]);
                $self->store_seqfeature($index, $f);
            }
        }
        unless ($self->last_row) {
            printf " Zero '%s' features found!\n Check your feature or try generic features like gene, mRNA, or transcript\n",
                $feature;
        }
        $self->database("Parsed:$file");
        $self->feature($feature);
        $self->add_comment("Chromosomes excluded: $chr_exclude") if $chr_exclude;

        # add input parsed file metadata
        $self->add_file_metadata($file);
        # but delete some stuff, just want basename
        undef $self->{extension};
        undef $self->{filename};
        undef $self->{path};
    }
    return 1;
}

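# Illustrative only -- a minimal sketch of calling parse_table() directly on an
# empty object; the file name and option values are hypothetical.
# my $Data = Bio::ToolBox::Data->new();
# $Data->parse_table( {
#     file       => 'annotation.gff3',
#     feature    => 'gene',
#     subfeature => 'exon',
#     simplify   => 1,
# } );
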
### Column manipulation

sub column_values {
    my ($self, $column) = @_;
    return unless defined $column;
    return unless exists $self->{$column}{name};
    my @values = map {$self->value($_, $column)} (0 .. $self->last_row);
    return wantarray ? @values : \@values;
}

sub add_column {
    my ($self, $name) = @_;
    return unless $name;
    my $column = $self->{number_columns};

    # check for array of column data
    my $name_ref = ref $name;
    if ($name_ref eq 'ARRAY') {
        if ($self->last_row > 1) {
            # table has existing data beyond column headers
            if ($self->last_row == (scalar @$name - 1)) {
                # same number of elements, add it the table
                $self->{$column} = {
                    'name' => $name->[0],
                    'index' => $column,
                };
                for (my $r = 0; $r <= $self->last_row; $r++) {
                    $self->{data_table}->[$r][$column] = $name->[$r];
                }
            }
            else {
                # different number of elements
                cluck "array has different number of elements than Data table!\n";
                return;
            }
        }
        else {
            # table has no existing data
            $self->{$column} = {
                'name' => $name->[0],
                'index' => $column,
            };
            for (my $i = 0; $i < scalar @$name; $i++) {
                $self->value($i, $column, $name->[$i]);
            }
            $self->{last_row} = scalar @$name - 1;
            $self->{headers} = 1; # boolean to indicate the table now has headers
        }
    }
    elsif ($name_ref eq 'Bio::DB::GFF::Typename') {
        # a Typename object that was selected from a SeqFeature::Store database
        $self->{$column} = {
            'name' => $name->asString,
            'index' => $column,
        };
        $self->{data_table}->[0][$column] = $name->asString;
    }
    elsif ($name_ref eq '') {
        # just a name
        $self->{$column} = {
            'name' => $name,
            'index' => $column,
        };
        $self->{data_table}->[0][$column] = $name;
    }
    else {
        cluck "unrecognized reference '$name_ref'! pass a scalar value or array reference";
        return;
    }

    $self->{number_columns}++;
    delete $self->{column_indices} if exists $self->{column_indices};
    if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
        # check if we maintain integrity, at least insofar what we test
        $self->verify(1); # silence so user doesn't get these messages
    }
    return $column;
}

sub copy_column {
    my $self = shift;
    my $index = shift;
    return unless defined $index;
    my $data = $self->column_values($index);
    my $new_index = $self->add_column($data);
    $self->copy_metadata($index, $new_index);
    return $new_index;
}

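# Illustrative only -- add_column() accepts a plain name or an array reference
# whose length matches the table (header value first, then one value per row);
# the names and values below are hypothetical.
# my $i = $Data->add_column('Score');
# my $j = $Data->add_column( [ 'Count', 3, 7, 2 ] );
# my $k = $Data->copy_column($i);
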
### Rows and Data access

sub add_row {
    my $self = shift;
    my @row_data;
    if ($_[0] and ref $_[0] eq 'ARRAY') {
        @row_data = @{ $_[0] };
    }
    elsif ($_[0] and ref $_[0] eq 'Bio::ToolBox::Data::Feature') {
        @row_data = $_[0]->row_values;
    }
    elsif ($_[0] and $_[0] =~ /\t/) {
        @row_data = split /\t/, $_[0];
    }
    else {
        @row_data = map {'.'} (1 .. $self->{number_columns});
    }
    if (scalar @row_data > $self->{number_columns}) {
        cluck "row added has more elements than table columns! truncating row elements\n";
        splice @row_data, 0, $self->{number_columns};
    }
    until (scalar @row_data == $self->{number_columns}) {
        push @row_data, '.';
    }
    my $row_index = $self->{last_row} + 1;
    $self->{data_table}->[$row_index] = \@row_data;
    $self->{last_row}++;
    return $row_index;
}

sub get_row {
    my ($self, $row_number) = @_;
    return unless defined $self->{data_table}->[$row_number];
    my @options = (
        'data' => $self,
        'index' => $row_number,
    );
    if (
        exists $self->{SeqFeatureObjects} and
        defined $self->{SeqFeatureObjects}->[$row_number]
    ) {
        push @options, 'feature', $self->{SeqFeatureObjects}->[$row_number];
    }
    return Bio::ToolBox::Data::Feature->new(@options);
}

sub delete_row {
    my $self = shift;
    my @deleted = sort {$b <=> $a} @_;
    while (@deleted) {
        my $d = shift @deleted;
        splice( @{ $self->{data_table} }, $d, 1);
        $self->{last_row}--;
        if (exists $self->{SeqFeatureObjects}) {
            splice( @{ $self->{SeqFeatureObjects} }, $d, 1);
        }
    }
    return 1;
}

sub row_values {
    my ($self, $row) = @_;
    my @data = @{ $self->{data_table}->[$row] };
    return wantarray ? @data : \@data;
}

sub value {
    my ($self, $row, $column, $value) = @_;
    return unless (defined $row and defined $column);
    if (defined $value) {
        $self->{data_table}->[$row][$column] = $value;
    }
    return $self->{data_table}->[$row][$column];
}

sub row_stream {
    my $self = shift;
    return Bio::ToolBox::Data::Iterator->new($self);
}

sub iterate {
    my $self = shift;
    my $code = shift;
    unless (ref $code eq 'CODE') {
        confess "iterate_function() method requires a code reference!";
    }
    my $stream = $self->row_stream;
    while (my $row = $stream->next_row) {
        &$code($row);
    }
    return 1;
}

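# Illustrative only -- iterate() wraps row_stream() and passes each
# Bio::ToolBox::Data::Feature row object to the supplied code reference;
# the column index $score_i is hypothetical.
# $Data->iterate( sub {
#     my $row = shift;
#     my $v = $row->value($score_i);
#     $row->value($score_i, $v * 2) if defined $v and $v ne '.';
# } );
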
#### Stored SeqFeature manipulation

sub store_seqfeature {
    my ($self, $row_i, $seqfeature) = @_;
    unless (defined $row_i and ref($seqfeature)) {
        confess "must provide a row index and SeqFeature object!";
    }
    confess "invalid row index" unless ($row_i <= $self->last_row);
    $self->{SeqFeatureObjects} ||= [];
    $self->{SeqFeatureObjects}->[$row_i] = $seqfeature;
    return 1;
}

sub delete_seqfeature {
    my ($self, $row_i) = @_;
    confess "invalid row index" unless ($row_i <= $self->last_row);
    return unless $self->{SeqFeatureObjects};
    undef $self->{SeqFeatureObjects}->[$row_i];
}

sub collapse_gene_transcripts {
    my $self = shift;
    unless ($self->feature_type eq 'named') {
        carp "Table does not contain named features!";
        return;
    }

    # load module
    my $class = "Bio::ToolBox::GeneTools";
    eval {load $class, qw(collapse_transcripts)};
    if ($@) {
        carp "unable to load $class! cannot collapse transcripts!";
        return;
    }

    # collapse the transcripts
    my $success = 0;
    if (exists $self->{SeqFeatureObjects}) {
        # we should have stored SeqFeature objects, probably from parsed table
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = $self->{SeqFeatureObjects}->[$i] or next;
            my $collSeqFeat = collapse_transcripts($feature) or next;
            $success += $self->store_seqfeature($i, $collSeqFeat);
        }
    }
    else {
        # no stored SeqFeature objects, probably names pointing to a database
        # we will have to fetch the feature from a database
        my $db = $self->open_meta_database(1) or # force open a new db connection
            confess "No SeqFeature objects stored and no database connection!";
        my $name_i = $self->name_column;
        my $id_i = $self->id_column;
        my $type_i = $self->type_column;
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = get_db_feature(
                db => $db,
                name => $self->value($i, $name_i) || undef,
                type => $self->value($i, $type_i) || undef,
                id => $self->value($i, $id_i) || undef,
            ) or next;
            my $collSeqFeat = collapse_transcripts($feature) or next;
            $success += $self->store_seqfeature($i, $collSeqFeat);
        }
    }

    return $success;
}

sub add_transcript_length {
    my $self = shift;
    my $subfeature = shift || 'exon';
    unless (exists $self->{SeqFeatureObjects}) {
        carp "no SeqFeature objects stored for collapsing!";
        return;
    }

    # load module
    eval {load "Bio::ToolBox::GeneTools", qw(get_transcript_length get_transcript_cds_length
        get_transcript_5p_utr_length get_transcript_3p_utr_length)};
    if ($@) {
        carp "unable to load Bio::ToolBox::GeneTools! cannot collapse transcripts!";
        return;
    }

    # determine name and calculation method
    my $length_calculator;
    my $length_name;
    if ($subfeature eq 'exon') {
        $length_calculator = \&get_transcript_length;
        $length_name = 'Merged_Transcript_Length';
    }
    elsif ($subfeature eq 'cds') {
        $length_calculator = \&get_transcript_cds_length;
        $length_name = 'Transcript_CDS_Length';
    }
    elsif ($subfeature eq '5p_utr') {
        $length_calculator = \&get_transcript_5p_utr_length;
        $length_name = 'Transcript_5p_UTR_Length';
    }
    elsif ($subfeature eq '3p_utr') {
        $length_calculator = \&get_transcript_3p_utr_length;
        $length_name = 'Transcript_3p_UTR_Length';
    }
    else {
        carp "unrecognized subfeature type '$subfeature'! No length calculated";
        return;
    }

    # add new column and calculate
    my $length_i = $self->add_column($length_name);
    if (exists $self->{SeqFeatureObjects}) {
        # we should have stored SeqFeature objects, probably from parsed table
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = $self->{SeqFeatureObjects}->[$i] or next;
            my $length = &$length_calculator($feature);
            $self->{data_table}->[$i][$length_i] = $length || $feature->length;
        }
    }
    else {
        # no stored SeqFeature objects, probably names pointing to a database
        # we will have to fetch the feature from a database
        my $db = $self->open_meta_database(1) or # force open a new db connection
            confess "No SeqFeature objects stored and no database connection!";
        my $name_i = $self->name_column;
        my $id_i = $self->id_column;
        my $type_i = $self->type_column;
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = get_db_feature(
                db => $db,
                name => $self->value($i, $name_i) || undef,
                type => $self->value($i, $type_i) || undef,
                id => $self->value($i, $id_i) || undef,
            ) or next;
            my $length = &$length_calculator($feature);
            $self->{data_table}->[$i][$length_i] = $length || $feature->length;
            $self->store_seqfeature($i, $feature);
            # to store or not to store? May explode memory, but could save from
            # expensive db calls later
        }
    }

    return $length_i;
}

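# Illustrative only -- both helpers expect stored SeqFeature objects, for
# example from a table built by parsing an annotation file.
# my $collapsed = $Data->collapse_gene_transcripts;
# my $cds_len_i = $Data->add_transcript_length('cds');
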
### Data structure manipulation

sub sort_data {
    my $self = shift;
    my $index = shift;
    my $direction = shift || 'i';

    # confirm options
    return unless exists $self->{$index}{name};
    unless ($direction =~ /^[id]/i) {
        carp "unrecognized sort order '$direction'! Must be i or d";
        return;
    }

    # Sample the dataset values
    # this will be used to guess the sort method, below
    my $example; # an example of the dataset
    foreach (my $i = 1; $i <= $self->last_row; $i++) {
        # we want to avoid null values, so keep trying
        # null being . or any variation of N/A, NaN, inf
        my $v = $self->value($i, $index);
        if (defined $v and $v !~ /^(?:\.|n\/?a|nan|\-?inf)$/i) {
            # a non-null value, take it
            $example = $v;
            last;
        }
    }

    # Determine sort method, either numerical or alphabetical
    my $sortmethod;
    if ($example =~ /[a-z]/i) {
        # there are detectable letters
        $sortmethod = 'ascii';
    }
    elsif ($example =~ /^\-?\d+\.?\d*$/) {
        # there are only digits, allowing for minus sign and a decimal point
        # I don't think this allows for exponents, though
        $sortmethod = 'numeric';
    }
    else {
        # unable to determine (probably alphanumeric), sort asciibetical
        $sortmethod = 'ascii';
    }

    # Re-order the datasets
    # Directly sorting the @data array is proving difficult. It keeps giving me
    # a segmentation fault. So I'm using a different approach by copying the
    # @data_table into a temporary hash.
    # put data_table array into a temporary hash
    # the hash key will the be dataset value,
    # the hash value will be the reference the row data
    my %datahash;

    # reorder numerically
    if ($sortmethod eq 'numeric') {
        for my $row (1..$self->last_row) {
            my $value = $self->value($row, $index);

            # check to see whether this value exists or not
            while (exists $datahash{$value}) {
                # add a really small number to bump it up and make it unique
                # this, of course, presumes that none of the dataset values
                # are really this small - this may be an entirely bad
                # assumption!!!!! I suppose we could somehow calculate an
                # appropriate value.... nah.
                # don't worry, we're only modifying the value used for sorting,
                # not the actual value
                $value += 0.00000001;
            }
            # store the row data reference
            $datahash{$value} = $self->{data_table}->[$row];
        }

        # re-fill the array based on the sort direction
        if ($direction =~ /^i/i) {
            # increasing sort
            my $i = 1; # keep track of the row, skip the header
            foreach (sort {$a <=> $b} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }
        else {
            # decreasing sort
            my $i = 1; # keep track of the row, skip the header
            foreach (sort {$b <=> $a} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }

        # summary prompt
        printf " Data table sorted numerically by the contents of %s\n",
            $self->name($index);
    }

    # reorder asciibetically
    elsif ($sortmethod eq 'ascii') {
        for my $row (1..$self->last_row) {
            # get the value to sort by
            my $value = $self->value($row, $index);
            if (exists $datahash{$value}) {
                # not unique
                my $n = 1;
                my $lookup = $value . sprintf("03%d", $n);
                # we'll try to make a unique value by appending
                # a number to the original value
                while (exists $datahash{$lookup}) {
                    # keep bumping up the number till it's unique
                    $n++;
                    $lookup = $value . sprintf("03%d", $n);
                }
                $datahash{$lookup} = $self->{data_table}->[$row];
            }
            else {
                # unique
                $datahash{$value} = $self->{data_table}->[$row];
            }
        }

        # re-fill the array based on the sort direction
        if ($direction eq 'i' or $direction eq 'I') {
            # increasing
            my $i = 1; # keep track of the row
            foreach (sort {$a cmp $b} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }
        elsif ($direction eq 'd' or $direction eq 'D') {
            # decreasing
            my $i = 1; # keep track of the row
            foreach (sort {$b cmp $a} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }

        # summary prompt
        printf " Data table sorted asciibetically by the contents of '%s'\n",
            $self->name($index);
    }

    return 1;
}

sub gsort_data {
    my $self = shift;

    # identify indices
    unless ($self->feature_type eq 'coordinate') {
        carp "no chromosome and start/position columns to sort!\n";
        return;
    }
    my $chromo_i = $self->chromo_column;
    my $start_i = $self->start_column;

    # Load the data into a temporary hash
    # The datalines will be put into a hash of hashes: The first key will be
    # the chromosome name, the second hash will be the start value.
    #
    # To deal with some chromosomes that don't have numbers (e.g. chrM), we'll
    # use two separate hashes: one is for numbers, the other for strings
    # when it comes time to sort, we'll put the numbers first, then strings

    my %num_datahash;
    my %str_datahash;
    for my $row (1 .. $self->last_row) {

        my $startvalue = $self->value($row, $start_i);

        # put the dataline into the appropriate temporary hash
        if ($self->value($row, $chromo_i) =~ /^(?:chr)?(\d+)$/) {
            # dealing with a numeric chromosome name
            # restricting to either chr2 or just 2 but not 2-micron
            my $chromovalue = $1;
            while (exists $num_datahash{$chromovalue}{$startvalue}) {
                # if another item already exists at this location
                # add a really small number to bump it up and make it unique
                $startvalue += 0.001;
            }
            $num_datahash{$chromovalue}{$startvalue} = $self->{data_table}->[$row];
        }
        else {
            # dealing with a non-numeric chromosome name
            my $chromovalue = $self->value($row, $chromo_i);
            # use the entire chromosome name as key
            while (exists $str_datahash{$chromovalue}{$startvalue}) {
                # if another item already exists at this location
                # add a really small number to bump it up and make it unique
                $startvalue += 0.001;
            }
            $str_datahash{$chromovalue}{$startvalue} = $self->{data_table}->[$row];
        }
    }

    # Now re-load the data array with sorted data
    # put the numeric chromosome data back first
    my $i = 1; # keep track of the row
    foreach my $chromovalue (sort {$a <=> $b} keys %num_datahash) {
        # first, numeric sort on increasing chromosome number
        foreach my $startvalue (
            sort {$a <=> $b} keys %{ $num_datahash{$chromovalue} }
        ) {
            # second, numeric sort on increasing position value
            $self->{data_table}->[$i] = $num_datahash{$chromovalue}{$startvalue};
            $i++; # increment for next row
        }
    }
    # next put the string chromosome data back
    foreach my $chromovalue (sort {$a cmp $b} keys %str_datahash) {
        # first, ascii sort on increasing chromosome name
        foreach my $startvalue (
            sort {$a <=> $b} keys %{ $str_datahash{$chromovalue} }
        ) {
            # second, numeric sort on increasing position value
            $self->{data_table}->[$i] = $str_datahash{$chromovalue}{$startvalue};
            $i++; # increment for next row
        }
    }

    return 1;
}

sub splice_data {
    my ($self, $part, $total_parts) = @_;

    unless ($part and $total_parts) {
        confess "ordinal part and total number of parts not passed\n";
    }
    my $part_length = int($self->last_row / $total_parts);

    # check for SeqFeatureObjects array
    if (exists $self->{SeqFeatureObjects}) {
        # it needs to be the same length as the data table, it should be
        while (scalar @{$self->{SeqFeatureObjects}} < scalar @{$self->{data_table}}) {
            push @{$self->{SeqFeatureObjects}}, undef;
        }
    }

    # splicing based on which part we do
    if ($part == 1) {
        # remove all but the first part
        splice(
            @{$self->{'data_table'}},
            $part_length + 1
        );
        if (exists $self->{SeqFeatureObjects}) {
            splice(
                @{$self->{SeqFeatureObjects}},
                $part_length + 1
            );
        }
    }
    elsif ($part == $total_parts) {
        # remove all but the last part
        splice(
            @{$self->{'data_table'}},
            1,
            $part_length * ($total_parts - 1)
        );
        if (exists $self->{SeqFeatureObjects}) {
            splice(
                @{$self->{SeqFeatureObjects}},
                1,
                $part_length * ($total_parts - 1)
            );
        }
    }
    else {
        # splicing in the middle requires two rounds

        # remove the last parts
        splice(
            @{$self->{'data_table'}},
            ($part * $part_length) + 1
        );

        # remove the first parts
        splice(
            @{$self->{'data_table'}},
            1,
            $part_length * ($part - 1)
        );

        if (exists $self->{SeqFeatureObjects}) {
            splice(
                @{$self->{SeqFeatureObjects}},
                ($part * $part_length) + 1
            );
            splice(
                @{$self->{SeqFeatureObjects}},
                1,
                $part_length * ($part - 1)
            );
        }
    }

    # update last row metadata
    $self->{'last_row'} = scalar(@{$self->{'data_table'}}) - 1;

    # re-open a new un-cached database connection
    if (exists $self->{db_connection}) {
        delete $self->{db_connection};
    }
    return 1;
}

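# Illustrative only -- sorting sketches; $score_i is a hypothetical column index.
# $Data->sort_data($score_i, 'd'); # decreasing sort on one column
# $Data->gsort_data;               # genomic sort by chromosome, then start
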
### File functions

sub reload_children {
    my $self = shift;
    my @files = @_;
    return unless @files;

    # prepare Stream
    my $class = "Bio::ToolBox::Data::Stream";
    eval {load $class};
    if ($@) {
        carp "unable to load $class! can't reload children!";
        return;
    }

    # open first stream
    my $first = shift @files;
    my $Stream = $class->new(in => $first);
    unless ($Stream) {
        carp "unable to load first child file $first";
        return;
    }

    # destroy current data table and metadata
    $self->{data_table} = [];
    for (my $i = 0; $i < $self->number_columns; $i++) {
        delete $self->{$i};
    }

    # copy the metadata
    foreach (qw(program feature db bed gff ucsc headers number_columns last_row)) {
        # various keys
        $self->{$_} = $Stream->{$_};
    }
    for (my $i = 0; $i < $Stream->number_columns; $i++) {
        # column metadata
        my %md = $Stream->metadata($i);
        $self->{$i} = \%md;
    }
    my @comments = $Stream->comments;
    push @{$self->{other}}, @comments;

    # first row column headers
    $self->{data_table}->[0] = [ @{ $Stream->{data_table}->[0] } ];

    # load the data
    while (my $row = $Stream->next_row) {
        $self->add_row($row);
    }
    $Stream->close_fh;

    # load remaining files
    foreach my $file (@files) {
        my $Stream = $class->new(in => $file);
        if ($Stream->number_columns != $self->number_columns) {
            confess "fatal error: child file $file has a different number of columns!";
        }
        while (my $row = $Stream->next_row) {
            my $a = $row->row_values;
            $self->add_row($a);
        }
        $Stream->close_fh;
    }
    $self->verify;

    # clean up
    unlink($first, @files);
    return $self->last_row;
}

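# Illustrative only -- reload_children() reassembles a table from child files,
# typically written by forked processes working on spliced portions; the file
# names are hypothetical. Note that the child files are deleted afterwards.
# $Data->reload_children('part1.txt', 'part2.txt', 'part3.txt');
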
### Export summary data file |
2101
|
|
|
|
|
|
|
sub summary_file { |
2102
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2103
|
|
|
|
|
|
|
|
2104
|
|
|
|
|
|
|
# Collect passed arguments |
2105
|
0
|
|
|
|
|
0
|
my %args = @_; |
2106
|
|
|
|
|
|
|
|
2107
|
|
|
|
|
|
|
# parameters |
2108
|
|
|
|
|
|
|
# either one or more datasets can be summarized |
2109
|
0
|
|
0
|
|
|
0
|
my $outfile = $args{'filename'} || undef; |
2110
|
0
|
|
|
|
|
0
|
my @datasets; |
2111
|
0
|
0
|
0
|
|
|
0
|
if ($args{dataset} and ref $args{dataset} eq 'ARRAY') { |
|
|
0
|
0
|
|
|
|
|
2112
|
0
|
|
|
|
|
0
|
@datasets = @{ $args{dataset} }; |
|
0
|
|
|
|
|
0
|
|
2113
|
|
|
|
|
|
|
} |
2114
|
|
|
|
|
|
|
elsif ($args{dataset} and ref $args{dataset} eq 'SCALAR') { |
2115
|
0
|
|
|
|
|
0
|
push @datasets, $args{dataset}; |
2116
|
|
|
|
|
|
|
} |
2117
|
0
|
|
|
|
|
0
|
my @startcolumns; |
2118
|
0
|
0
|
0
|
|
|
0
|
if ($args{startcolumn} and ref $args{startcolumn} eq 'ARRAY') { |
|
|
0
|
0
|
|
|
|
|
2119
|
0
|
|
|
|
|
0
|
@startcolumns = @{ $args{startcolumn} }; |
|
0
|
|
|
|
|
0
|
|
2120
|
|
|
|
|
|
|
} |
2121
|
|
|
|
|
|
|
elsif ($args{startcolumn} and ref $args{startcolumn} eq 'SCALAR') { |
2122
|
0
|
|
|
|
|
0
|
push @startcolumns, $args{startcolumn}; |
2123
|
|
|
|
|
|
|
} |
2124
|
0
|
|
|
|
|
0
|
my @endcolumns; |
2125
|
0
|
|
0
|
|
|
0
|
$args{endcolumn} ||= $args{stopcolumn}; |
2126
|
0
|
0
|
0
|
|
|
0
|
if ($args{endcolumn} and ref $args{endcolumn} eq 'ARRAY') { |
|
|
0
|
0
|
|
|
|
|
2127
|
0
|
|
|
|
|
0
|
@endcolumns = @{ $args{endcolumn} }; |
|
0
|
|
|
|
|
0
|
|
2128
|
|
|
|
|
|
|
} |
2129
|
|
|
|
|
|
|
elsif ($args{endcolumn} and ref $args{endcolumn} eq 'SCALAR') { |
2130
|
0
|
|
|
|
|
0
|
push @endcolumns, $args{endcolumn}; |
2131
|
|
|
|
|
|
|
} |
2132
|
|
|
|
|
|
|
|
2133
|
|
|
|
|
|
|
|
2134
|
|
|
|
|
|
|
# Check required values |
2135
|
0
|
0
|
|
|
|
0
|
unless ($self->verify) { |
2136
|
0
|
|
|
|
|
0
|
cluck "bad data structure!"; |
2137
|
0
|
|
|
|
|
0
|
return; |
2138
|
|
|
|
|
|
|
} |
2139
|
0
|
0
|
|
|
|
0
|
unless (defined $outfile) { |
2140
|
0
|
0
|
|
|
|
0
|
if ($self->basename) { |
2141
|
|
|
|
|
|
|
# use the opened file's filename if it exists |
2142
|
|
|
|
|
|
|
# prepend the path if it exists |
2143
|
|
|
|
|
|
|
# the extension will be added later |
2144
|
0
|
|
|
|
|
0
|
$outfile = $self->path . $self->basename; |
2145
|
|
|
|
|
|
|
} |
2146
|
|
|
|
|
|
|
else { |
2147
|
0
|
|
|
|
|
0
|
cluck "no filename passed to write_summary_data!\n"; |
2148
|
0
|
|
|
|
|
0
|
return; |
2149
|
|
|
|
|
|
|
} |
2150
|
|
|
|
|
|
|
} |
2151
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
# Identify possible dataset columns |
2153
|
0
|
|
|
|
|
0
|
my %possibles; |
2154
|
0
|
|
|
|
|
0
|
my %skip = map {$_ => 1} qw (systematicname name id alias aliases type class |
|
0
|
|
|
|
|
0
|
|
2155
|
|
|
|
|
|
|
geneclass chromosome chromo seq_id seqid start stop end gene strand |
2156
|
|
|
|
|
|
|
length primary_id merged_transcript_length transcript_cds_length |
2157
|
|
|
|
|
|
|
transcript_5p_utr_length transcript_3p_utr_length); |
2158
|
|
|
|
|
|
|
# walk through the dataset names |
2159
|
0
|
|
|
|
|
0
|
$possibles{unknown} = []; |
2160
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < $self->number_columns; $i++) { |
2161
|
0
|
0
|
|
|
|
0
|
if (not exists $skip{ lc $self->{$i}{'name'} }) { |
2162
|
0
|
|
0
|
|
|
0
|
my $d = $self->metadata($i, 'dataset') || undef; |
2163
|
0
|
0
|
|
|
|
0
|
if (defined $d) { |
2164
|
|
|
|
|
|
|
# we have what appears to be a dataset column |
2165
|
0
|
|
0
|
|
|
0
|
$possibles{$d} ||= []; |
2166
|
0
|
|
|
|
|
0
|
push @{$possibles{$d}}, $i; |
|
0
|
|
|
|
|
0
|
|
2167
|
|
|
|
|
|
|
} |
2168
|
|
|
|
|
|
|
else { |
2169
|
|
|
|
|
|
|
# still an unknown possibility |
2170
|
0
|
|
|
|
|
0
|
push @{$possibles{unknown}}, $i; |
|
0
|
|
|
|
|
0
|
|
2171
|
|
|
|
|
|
|
} |
2172
|
|
|
|
|
|
|
} |
2173
|
|
|
|
|
|
|
} |
2174
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
# check datasets |
2176
|
0
|
0
|
|
|
|
0
|
unless (@datasets) { |
2177
|
0
|
0
|
|
|
|
0
|
if (scalar keys %possibles > 1) { |
2178
|
|
|
|
|
|
|
# we will always have the unknown category, so anything more than one |
2179
|
|
|
|
|
|
|
# means we found legitimate dataset columns |
2180
|
0
|
|
|
|
|
0
|
delete $possibles{unknown}; |
2181
|
|
|
|
|
|
|
} |
2182
|
0
|
|
|
|
|
0
|
@datasets = sort {$a cmp $b} keys %possibles; |
|
0
|
|
|
|
|
0
|
|
2183
|
|
|
|
|
|
|
} |
2184
|
|
|
|
|
|
|
|
2185
|
|
|
|
|
|
|
# check starts |
2186
|
0
|
0
|
|
|
|
0
|
if (scalar @startcolumns != scalar @datasets) { |
2187
|
0
|
|
|
|
|
0
|
@startcolumns = (); # ignore what we were given? |
2188
|
0
|
|
|
|
|
0
|
foreach my $d (@datasets) { |
2189
|
|
|
|
|
|
|
# take the first column with this dataset |
2190
|
0
|
|
|
|
|
0
|
push @startcolumns, $possibles{$d}->[0]; |
2191
|
|
|
|
|
|
|
} |
2192
|
|
|
|
|
|
|
} |
2193
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
# check stops |
2195
|
0
|
0
|
|
|
|
0
|
if (scalar @endcolumns != scalar @datasets) { |
2196
|
0
|
|
|
|
|
0
|
@endcolumns = (); # ignore what we were given? |
2197
|
0
|
|
|
|
|
0
|
foreach my $d (@datasets) { |
2198
|
|
|
|
|
|
|
# take the last column with this dataset |
2199
|
0
|
|
|
|
|
0
|
push @endcolumns, $possibles{$d}->[-1]; |
2200
|
|
|
|
|
|
|
} |
2201
|
|
|
|
|
|
|
} |
2202
|
|
|
|
|
|
|
|
2203
|
|
|
|
|
|
|
|
2204
|
|
|
|
|
|
|
# Prepare Data object to store the summed data |
2205
|
0
|
|
|
|
|
0
|
my $summed_data = $self->new( |
2206
|
|
|
|
|
|
|
feature => 'averaged_windows', |
2207
|
|
|
|
|
|
|
columns => ['Window','Midpoint'], |
2208
|
|
|
|
|
|
|
); |
2209
|
|
|
|
|
|
|
|
2210
|
|
|
|
|
|
|
# Go through each dataset |
2211
|
0
|
|
|
|
|
0
|
foreach my $d (0 .. $#datasets) { |
2212
|
|
|
|
|
|
|
|
2213
|
|
|
|
|
|
|
# Prepare score column name |
2214
|
0
|
|
|
|
|
0
|
my $data_name = simplify_dataset_name($datasets[$d]); |
2215
|
|
|
|
|
|
|
|
2216
|
|
|
|
|
|
|
# add column |
2217
|
0
|
|
|
|
|
0
|
my $i = $summed_data->add_column($data_name); |
2218
|
0
|
|
|
|
|
0
|
$summed_data->metadata($i, 'dataset', $datasets[$d]); |
2219
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
# tag for remembering we're working with percentile bins |
2221
|
0
|
|
|
|
|
0
|
my $do_percentile = 0; |
2222
|
|
|
|
|
|
|
|
2223
|
|
|
|
|
|
|
# remember the row |
2224
|
0
|
|
|
|
|
0
|
my $row = 1; |
2225
|
|
|
|
|
|
|
|
2226
|
|
|
|
|
|
|
# Collect summarized data |
2227
|
0
|
|
|
|
|
0
|
for ( |
2228
|
|
|
|
|
|
|
my $column = $startcolumns[$d]; |
2229
|
|
|
|
|
|
|
$column <= $endcolumns[$d]; |
2230
|
|
|
|
|
|
|
$column++ |
2231
|
|
|
|
|
|
|
) { |
2232
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
# determine the midpoint position of the window |
2234
|
|
|
|
|
|
|
# this assumes the column metadata has start and stop |
2235
|
0
|
|
|
|
|
0
|
my $midpoint = int(sum0($self->metadata($column, 'start'), |
2236
|
|
|
|
|
|
|
$self->metadata($column, 'stop')) / 2); |
2237
|
|
|
|
|
|
|
|
2238
|
|
|
|
|
|
|
# convert midpoint to fraction of 1000 for plotting if necessary |
2239
|
0
|
0
|
|
|
|
0
|
if (substr($self->name($column), -1) eq '%') { |
2240
|
0
|
|
|
|
|
0
|
$midpoint *= 10; # midpoint * 0.01 * 1000 bp |
2241
|
0
|
|
|
|
|
0
|
$do_percentile++; |
2242
|
|
|
|
|
|
|
} |
2243
|
0
|
0
|
0
|
|
|
0
|
if ($do_percentile and substr($self->name($column), -2) eq 'bp') { |
2244
|
|
|
|
|
|
|
# working on the extension after the percentile bins |
2245
|
0
|
|
|
|
|
0
|
$midpoint += 1000; |
2246
|
|
|
|
|
|
|
} |

            # collect the values in the column
            my @values;
            for my $row (1 .. $self->last_row) {
                my $v = $self->value($row, $column);
                push @values, $v eq '.' ? 0 : $v; # treat nulls as zero
            }

            # adjust if log value
            my $log = $self->metadata($column, 'log2') || 0;
            if ($log) {
                @values = map { 2 ** $_ } @values;
            }

            # determine mean value
            my $window_mean = sum0(@values) / scalar(@values);
            if ($log) {
                $window_mean = log($window_mean) / log(2);
            }
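
            # Note on the log2 handling above: scores stored as log2 values
            # are averaged in linear space; each value is de-logged with
            # 2 ** $_, the arithmetic mean is taken, and the mean is then
            # converted back to log2 with log($window_mean) / log(2).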

            # push to summed output
            if ($d == 0) {
                # this is the first dataset, so we need to add a row
                $summed_data->add_row(
                    [ $self->{$column}{'name'}, $midpoint, $window_mean ] );
            }
            else {
                # we're summarizing multiple datasets; the name and midpoint
                # are already present, so first do a sanity check
                if ($summed_data->value($row, 1) != $midpoint) {
                    carp("unable to summarize multiple datasets with nonequal columns of data!");
                    return;
                }
                $summed_data->value($row, $i, $window_mean);
            }
            $row++;
        }
    }

    # Write summed data
    $outfile =~ s/\.txt(\.gz)?$//i; # strip any .txt or .gz extensions if present
    my $written_file = $summed_data->write_file(
        'filename' => $outfile . '_summary.txt',
        'gz'       => 0,
    );
    return $written_file;
}
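
# A minimal usage sketch for the routine above (an assumption: this is the
# summary_file() method documented elsewhere in this module, and the argument
# shown is illustrative rather than exhaustive):
#
#   my $summary = $Data->summary_file(
#       filename => 'windows_out',   # '_summary.txt' is appended by the code above
#   );
#   print "wrote $summary\n" if $summary;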


####################################################

package Bio::ToolBox::Data::Iterator;
use Bio::ToolBox::Data::Feature;
use Carp;

sub new {
    my ($class, $data) = @_;
    my %iterator = (
        'index' => 1,
        'data'  => $data,
    );
    return bless \%iterator, $class;
}
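
# The iterator starts at index 1 because row 0 of the parent data table holds
# the column headers; data rows therefore run from 1 through last_row. This is
# an internal class that is normally obtained from the parent Data object
# rather than constructed directly.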

sub next_row {
    my $self = shift;
    return if $self->{'index'} > $self->{data}->{last_row}; # no more
    my $i = $self->{'index'};
    $self->{'index'}++;
    my @options = (
        'data'  => $self->{data},
        'index' => $i,
    );
    if (
        exists $self->{data}->{SeqFeatureObjects} and
        defined $self->{data}->{SeqFeatureObjects}->[$i]
    ) {
        push @options, 'feature', $self->{data}->{SeqFeatureObjects}->[$i];
    }
    return Bio::ToolBox::Data::Feature->new(@options);
}
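
# When the parent Data object carries cached SeqFeature objects (for example
# from a parsed annotation file), the cached object for this row is handed to
# the Feature object up front so it does not have to be looked up again; the
# cache is simply skipped when it is absent.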

sub row_index {
    my $self = shift;
    carp "row_index is a read only method" if @_;
    return $self->{'index'};
}
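
# Illustrative use of row_index (hypothetical variable names; note that the
# internal index has already been advanced by next_row, so the row just
# returned corresponds to row_index - 1):
#
#   my $stream = Bio::ToolBox::Data::Iterator->new($Data);
#   while (my $row = $stream->next_row) {
#       printf "finished row %d of %d\n", $stream->row_index - 1, $Data->last_row;
#   }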


####################################################

__END__