package Bio::ToolBox::Data;

our $VERSION = '1.69';

=head1 NAME

Bio::ToolBox::Data - Reading, writing, and manipulating data structure

=head1 SYNOPSIS

  use Bio::ToolBox::Data;
  
  ### Open a pre-existing file
  # a data table with same columns as input file
  my $Data = Bio::ToolBox::Data->new(
        file => 'coordinates.bed',
  );
  
  ### Parse a GTF, GFF3, refFlat, or genePred gene table
  # data table with names and references to fully parsed
  # SeqFeature transcript objects with exon SeqFeature subfeatures
  my $Data = Bio::ToolBox::Data->new(
        file       => 'annotation.gtf.gz',
        parse      => 1,
        feature    => 'transcript',
        subfeature => 'exon'
  );
  
  ### New gene list from a Bio::DB::SeqFeature::Store database
  # data table with name and reference ID to database SeqFeature objects
  my $Data = Bio::ToolBox::Data->new(
        db      => 'hg19.sqlite',
        feature => 'gene:ensGene',
  );
  
  ### Get a specific value
  my $value = $Data->value($row, $column);
  
  ### Replace or add a value
  $Data->value($row, $column, $new_value);
  
  ### Add a column
  my $new_index = $Data->add_column('Data2');
  
  ### Find a column
  my $old_index = $Data->find_column('Data1');
  
  ### Return a specific row as an object
  my $row = $Data->get_row($i); # Bio::ToolBox::Data::Feature
  
  ### Iterate through a Data structure one row at a time
  my $stream = $Data->row_stream;
  while (my $row = $stream->next_row) {
     # each row is returned as Bio::ToolBox::Data::Feature object
     # get the positional information from the file data
     # assuming that the input file had these identifiable columns
     my $seq_id = $row->seq_id;
     my $start  = $row->start;
     my $stop   = $row->end;
     
     # work with the referenced SeqFeature object
     my $seqf = $row->seqfeature;
     
     # generate a new Bio::Seq object from the database using
     # these coordinates
     my $region = $db->segment($seq_id, $start, $stop);
     
     # modify a row value
     my $value = $row->value($old_index);
     my $new_value = $value + 1;
     $row->value($new_index, $new_value);
  }
  
  ### Iterate through a Data table with a code reference
  $Data->iterate(\&my_code);
  
  ### write the data to file
  my $success = $Data->write_file(
     filename => 'new_data.txt',
     gz       => 1,
  );
  print "wrote new file $success\n"; # file is new_data.txt.gz

=head1 DESCRIPTION

This module works with the primary Bio::ToolBox Data structure. Simply, it
is a complex data structure representing a tab-delimited table (an array
of arrays), with plenty of options for metadata. Many common bioinformatic
file formats are simply tab-delimited text files (think BED, GFF, VCF).
Each row is a feature or genomic interval, and each column is a piece of
information about that feature, such as name, type, and/or coordinates.
We can append additional columns of information to that table, perhaps
scores from genomic data sets. We can record metadata regarding how
and where we obtained that data. Finally, we can write the updated
table to a new file.

=head1 METHODS

=head2 Initializing the structure

=over 4

=item new

Initialize a new Data structure. This generally requires options,
provided as an array of key =E<gt> value pairs. A new list of features
may be obtained from an annotation database, or an existing file
may be loaded. If you do not pass any options, a new empty
structure will be generated for you to populate.

These are the options available.

=over 4

=item file

=item in

  my $Data = Bio::ToolBox::Data->new(file => $file);

Provide the path and name of an existing tab-delimited text
file from which to load the contents. This is a shortcut to the
load_file() method. See that method for more details.

=item stream

  my $Data = Bio::ToolBox::Data->new(file => $file, stream => 1);

Boolean option indicating that the file should be opened as a file
stream. A L<Bio::ToolBox::Data::Stream> object will be returned. This
is a convenience method.

=item noheader

  my $Data = Bio::ToolBox::Data->new(file => $file, noheader => 1);

Boolean option indicating that the file does not have file headers,
in which case dummy headers are provided. This is not necessary for
defined file types that don't normally have file headers, such as
BED, GFF, or UCSC files.

=item parse

  my $Data = Bio::ToolBox::Data->new(file => $file, parse => 1);

Boolean option indicating that a gene annotation table or file should
be parsed into SeqFeature objects, and that a general table of names and
IDs representing those objects should be generated. The annotation file may
be specified in one of two ways: through the file option above,
or in the database metadata of an existing table file representing
previously parsed objects.

=item db

  my $Data = Bio::ToolBox::Data->new(db => 'hg19', feature => 'gene');

Provide the name of the database from which to collect the
features. It may be a short name, whereupon it is checked in
the Bio::ToolBox configuration file F<.biotoolbox.cfg> for
connection information. Alternatively, a path to a database
file or directory may be given.

If you already have an opened database object (for example a
L<Bio::DB::SeqFeature::Store> database), you can simply pass that; see
L<Bio::ToolBox::db_helper> for more information. However, this in general
should be discouraged, since the name of the database will not be properly
recorded when saving to file. It is better to simply pass the name of the
database again; multiple connections to the same database are smartly
handled in the background.

=item feature

  my $Data = Bio::ToolBox::Data->new(file => $filename, feature => 'gene');

For de novo lists from an annotation database, provide the GFF
type or type:source (columns 3 and 2) for collection. A comma-delimited
string may be accepted for databases. For parsed files,
only a simple string is accepted.

For a list of genomic intervals across the genome, specify a
feature of 'genome' with a database object.

=item subfeature

When parsing annotation files such as GTF, one or more subfeature
types, e.g. C<exon> or C<cds>, may be specified as a comma-delimited
string. This ensures that the subfeatures will be parsed into
SeqFeature objects. Otherwise, only the top-level features will be
parsed. This expedites parsing by skipping unwanted features.

=item win

=item step

  my $Data = Bio::ToolBox::Data->new(db => $dbname, win => $integer);

If generating a list of genomic intervals, optionally provide
the window and step values. The default is 500 bp.

=item chrskip

Provide a regular expression compatible or C<qr> string for skipping or
excluding specific chromosomes or classes of chromosomes, for example the
mitochondrial chromosome or unmapped contigs. This works with both feature
collection and genomic intervals. The default is to take all chromosomes.

=back

When no file is given and no database is given to search, then a new,
empty Data object is returned. In this case, you may optionally
specify the names of the columns or indicate a specific file
format structure to assign column names. The following options can
then be provided.

=over 4

=item columns

=item datasets

  my $Data = Bio::ToolBox::Data->new(columns => [qw(Column1 Column2 ...)] );

Provide the column names in advance as an anonymous array.

=item gff

Pass the GFF version of the file: 1, 2, 2.5 (GTF), or 3.

=item bed

Pass the number of BED columns (3-12).

=item ucsc

Pass the number of columns to indicate the type of UCSC format. These
include 10 (refFlat without gene names), 11 (refFlat with gene names),
12 (knownGene gene prediction table), and 15
(an extended gene prediction or genePredExt table).

=back

If successful, the method will return a Bio::ToolBox::Data object.
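
As a minimal illustrative sketch (the column names and values here are
hypothetical, not part of the API), an empty table can be created and
populated like this:

  my $Data = Bio::ToolBox::Data->new(
     columns => [ qw(Name Chromosome Start Stop) ],
  );
  # add rows as array references matching the column order
  $Data->add_row( [ 'feature1', 'chr1', 1000, 2000 ] );
  $Data->add_row( [ 'feature2', 'chr2', 5000, 5500 ] );
  my $saved = $Data->save('my_features.txt');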

=item duplicate

This will create a new Data object containing the same column
headers and metadata, but lacking the table content, i.e. no
rows of data. File name metadata, if present in the original, is
not preserved. The purpose here, for example, is to allow one
to selectively copy rows from one Data object to another.
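
For example, a sketch of selectively copying rows into the duplicate (the
'Score' column and the threshold are hypothetical):

  my $Copy    = $Data->duplicate;
  my $score_i = $Data->find_column('Score');
  my $stream  = $Data->row_stream;
  while (my $row = $stream->next_row) {
     # keep only rows passing an arbitrary threshold
     $Copy->add_row($row) if $row->value($score_i) >= 10;
  }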

=item parse_table

  $Data->parse_table($file);
  $Data->parse_table( {
      file       => $file,
      feature    => 'gene',
      subfeature => 'exon',
      chrskip    => 'chrM|contig',
  } );

This will parse a gene annotation table into SeqFeature objects.
If this is called from an empty Data object, then the table will
be filled with the SeqFeature object names and IDs. If this is
called from a non-empty Data object, then the table's contents
will be associated with the SeqFeature objects using their name and
ID. The stored SeqFeature objects can be retrieved using the
get_seqfeature() method.

Pass the method a single argument. This may be either a simple
scalar with the filename to be parsed, or a reference to a hash of
one or more argument options. Possible options include:

=over 4

=item file

The file name of the gene table to be parsed. This may
be a GFF, GFF3, GTF, or any of the UCSC gene table formats.
These will be parsed using Bio::ToolBox::parser::* adapters.

=item feature

A regular expression compatible string or C<qr> string to match
the C<primary_tag> of the top features to keep. The C<source_tag> is not
checked. The default is 'gene', although a transcript type could
alternatively be specified (in which case transcripts are not
parsed in gene features).

=item subfeature

A regular expression compatible string or C<qr> string to match
the C<primary_tag> of any subfeatures to parse. The C<source_tag> is not
checked. Typically these include exon, CDS, or UTR. The default is nothing.

=item chrskip

A regular expression compatible string or C<qr> string to match
chromosomes to be skipped or excluded. Any feature with a matching
C<seq_id> chromosome name will be skipped.

=back

=back

=head2 General Metadata

There is a variety of general metadata regarding the Data
structure. The following methods may be used to access or set these
metadata properties. Some of these are stored as comment lines at
the beginning of the file, and will be read or set when the file is
loaded.

=over

=item feature

  $Data->feature('something');
  my $feature = $Data->feature;

Returns or sets the name of the features used to collect
the list of features. The actual feature types are listed
in the table, so this metadata is merely descriptive.

=item feature_type

Returns one of three specific values describing the contents
of the data table, inferred from the presence of specific column
names. This provides a clue as to whether the table features
represent genomic regions (defined by coordinate positions) or
named database features. The return values, with a short example
following the list, include:

=over 4

=item * coordinate

Table includes at least chromosome and start columns.

=item * named

Table includes name, type, and/or Primary_ID, possibly
referring to named database features.

=item * unknown

Table is an unrecognized format.

=back
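
A minimal sketch of acting on this value (the handling in each branch is
only illustrative):

  my $type = $Data->feature_type;
  if ($type eq 'coordinate') {
     # work directly with the chromosome, start, and stop columns
  }
  elsif ($type eq 'named') {
     # look up the named features in an annotation database
  }
  else {
     warn "unrecognized table format\n";
  }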

=item program

Returns or sets the name of the program generating the list.

=item database

Returns or sets the name or path of the database from which the
features were derived.

=item gff

Returns or sets the version of loaded GFF files. Supported versions
include 1, 2, 2.5 (GTF), and 3.

=item bed

Returns or sets the BED file version. Here, the BED version is simply
the number of columns.

=item ucsc

Returns or sets the UCSC file format version. Here, the version is
simply the number of columns. Supported versions include 10 (gene
prediction), 11 (refFlat, or gene prediction with gene name), 12
(knownGene table), 15 (extended gene prediction), or 16 (extended
gene prediction with bin).

=item vcf

Returns or sets the VCF file version number. VCF support is limited.

=back

=head2 File information

These methods provide information about the file from which the
data table was loaded. This does not include parsed annotation tables.

=over 4

=item filename

=item path

=item basename

=item extension

Returns the filename, full path, basename, and extension of
the filename. Concatenating the last three values will reconstitute
the original filename.
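
A small sketch, assuming the object was loaded from F</path/to/file.txt>:

  my $original = $Data->path . $Data->basename . $Data->extension;
  # $original is '/path/to/file.txt', the same as $Data->filename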

=item add_file_metadata

  $Data->add_file_metadata('/path/to/file.txt');

Add filename metadata. This will automatically parse the path,
basename, and recognized extension from the passed filename and
set the appropriate metadata attributes.

=back

=head2 Metadata comments

Comments are any other commented lines from a text file (lines
beginning with a #) that were not parsed as general metadata.

=over 4

=item comments

Returns a copy of the array containing commented lines. Each
comment line becomes an element in the array.

=item add_comment

Appends the text string to the comment array.

=item delete_comment

Deletes a comment. Provide the array index of the comment to
delete. If an index is not provided, ALL comments will be deleted!
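
A short sketch of working with comments (the comment text is arbitrary, and
whether a leading '#' is added automatically is not specified here):

  $Data->add_comment("scores normalized to reads per million");
  my @comments = $Data->comments;
  printf "table has %d comment lines\n", scalar @comments;
  $Data->delete_comment(0);   # remove the first comment line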

=item vcf_headers

For VCF files, this will partially parse the VCF headers into a
hash structure that can be queried or manipulated. Each header
line is parsed for the primary key, being the first word after the
## prefix, e.g. INFO, FORMAT, FILTER, contig, etc. Simple
values are stored directly as the value. For complex entries, such as
with INFO and FORMAT, a second-level hash is created with the ID
extracted and used as the second-level key. The value is always the
remainder of the string.

For example, the following would be a simple parsed vcf header in
code representation.

  $vcf_header = {
     FORMAT => {
        GT => q(ID=GT,Number=1,Type=String,Description="Genotype"),
        AD => q(ID=AD,Number=.,Type=Integer,Description="ref,alt Allelic depths"),
     },
     fileDate => 20150715,
  }

=item rewrite_vcf_headers

If you have altered the vcf headers exported by the vcf_headers()
method, then this method will rewrite the hash structure as new
comment lines. Do this prior to writing or saving the Data structure,
or else you will lose your changed VCF header metadata.

=back

=head2 The Data table

The Data table is the array of arrays containing all of the
actual information. Rows and columns are indexed using 0-based
indexing as with all Perl arrays. Row 0 is always the column
header row containing the column names, regardless of whether an
actual header name existed in the original file format (e.g.
BED or GFF formats). Any individual table "cell" can be
specified as C<[$row][$column]>.

=over 4

=item list_columns

Returns an array or array reference of the column names
in ascending (left to right) order.

=item number_columns

Returns the number of columns in the Data table.

=item number_rows

Returns the number of rows in the Data table.

=item last_column

Returns the array index number for the last (rightmost)
column. This number is always 1 less than the value
returned by number_columns() due to 0-based indexing.

=item last_row

Returns the array index number of the last row.
Since the header row is index 0, this is also the
number of actual content rows.

=item column_values

  my $values = $Data->column_values($i);

Returns an array or array reference representing the values
in the specified column. This includes the column header as
the first element. Pass the method the column index.

=item add_column

  # add empty column with name
  my $i = $Data->add_column($name);
  
  # add entire column
  my $i = $Data->add_column(\@array);

Appends a new column to the Data table at the
rightmost position (highest index). It adds the column
header name and creates a new column metadata hash.
Pass the method one of two possibilities. Pass a text
string representing the new column name, in which case
no data will be added to the column. Alternatively, pass
an array reference, and the contents of the array will
become the column data. If the Data table already has
rows, then the passed array reference must have the same
number of elements.

It returns the new column index if successful.
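
A sketch of adding and filling a new column (the column name and the
computed value are arbitrary examples):

  my $new_i = $Data->add_column('Length');
  $Data->iterate( sub {
     my $row = shift;
     # derive a value from the existing coordinate columns
     $row->value($new_i, $row->end - $row->start + 1);
  } );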

=item copy_column

  my $j = $Data->copy_column($i);

This will copy a column, appending the duplicate column at
the rightmost position (highest index). It will duplicate
column metadata as well. It will return the new index
position.

=item delete_column

  $Data->delete_column($i, $j);

Deletes one or more specified columns. Any remaining
columns to the right will have their indices shifted
down appropriately. If you had identified one of the
shifted columns, you may need to re-find or recalculate
its new index.

=item reorder_column

  $Data->reorder_column($c,$b,$a,$a);

Reorders columns into the specified order. Provide the
new desired order of indices. Columns may be duplicated
or deleted using this method. The columns will adopt their
new index numbers.

=item add_row

  $Data->add_row(\@values);
  $Data->add_row($Row); # Bio::ToolBox::Data::Feature object

Add a new row of data values to the end of the Data table.
Optionally provide either a reference to an array of values
to put in the row, or pass a L<Bio::ToolBox::Data::Feature>
Row object, such as one obtained from another Data object.
If the number of columns does not match, the array is filled
up with null values for missing columns, or excess values
are dropped.

=item get_row

  $Data->get_row($i); # Bio::ToolBox::Data::Feature object

Returns the specified row as a L<Bio::ToolBox::Data::Feature>
object.

=item delete_row

  $Data->delete_row($i, $j);

Deletes one or more specified rows. Rows are spliced out
highest to lowest index to avoid issues. Be very careful
deleting rows while simultaneously iterating through the
table!

=item row_values

  my $row_values = $Data->row_values($i);

Returns a copy of an array for the specified row index.
Modifying this returned array does not migrate back to the
Data table; use the value() method below instead.

=item value

  my $value = $Data->value($row, $column);
  $Data->value($row, $column, $new_value);

Returns or sets the value at a specific row and column index.
Index positions are 0-based (header row is index 0).

=back

=head2 Column Metadata

Each column has metadata. Each metadata is a series of key =E<gt>
value pairs. The minimum keys are 'index' (the 0-based index
of the column) and 'name' (the column header name). Additional
keys and values may be queried or set as appropriate. When the
file is written, these are stored as commented metadata lines at
the beginning of the file.

=over 4

=item name

  $Data->name($index, $new_name);
  my $name = $Data->name($i);

Convenient method to return the name of the column given the
index number. A column may also be renamed by passing a new name.

=item metadata

  $Data->metadata($index, $key, $new_value);
  my $value = $Data->metadata($index, $key);

Returns or sets the metadata value for a specific $key for a
specific column $index.

This may also be used to add a new metadata key. Simply provide
the name of a new $key that is not present.

If no key is provided, then a hash or hash reference is returned
representing the entire metadata for that column.
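
A brief sketch of recording column metadata (the key and value here are
hypothetical, not reserved names):

  # note how this score column was generated
  $Data->metadata($score_i, 'dataset', 'scores.bw');
  # retrieve it later
  my $source = $Data->metadata($score_i, 'dataset');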

=item copy_metadata

  $Data->copy_metadata($source, $target);

This method will copy the metadata (everything except name and
index) between the source column and target column. Returns 1 if
successful.

=item delete_metadata

  $Data->delete_metadata($index, $key);

Deletes a column-specific metadata $key and value for a specific
column $index. If a $key is not provided, then all metadata keys
for that index will be deleted.

=item find_column

  my $i = $Data->find_column('Gene');
  my $i = $Data->find_column('^Gene$');

Searches the column names for the specified column name. This
employs a case-insensitive grep search, so simple substitutions
may be made.

=item chromo_column

=item start_column

=item stop_column

=item strand_column

=item name_column

=item type_column

=item id_column

These methods will return the identified column best matching
the description. Returns C<undef> if that column is not present.
These use the find_column() method with a predefined list of
aliases.
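
For example, a sketch of building a coordinate string without hard-coding
column positions:

  my $chr_i   = $Data->chromo_column;
  my $start_i = $Data->start_column;
  my $stop_i  = $Data->stop_column;
  if (defined $chr_i and defined $start_i and defined $stop_i) {
     # e.g. a coordinate string for the first data row (row 1)
     my $coord = sprintf "%s:%d-%d",
        $Data->value(1, $chr_i),
        $Data->value(1, $start_i),
        $Data->value(1, $stop_i);
  }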

=back

=head2 Working with databases

These are methods for working primarily with annotation databases.

=over 4

=item open_database

This is a wrapper method that tries to do the right thing and passes
on to either the open_meta_database() or open_new_database() methods.
Basically a legacy method for open_meta_database().

=item open_meta_database

Open the database that is listed in the metadata. Returns the
database connection. Pass a true value to force a new database
connection to be opened, rather than returning a cached connection
object (useful when forking).
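
For example, a sketch of re-opening the metadata database inside a forked
child process:

  # in the child, force a fresh, fork-safe connection
  my $db = $Data->open_meta_database(1);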

=item open_new_database

Convenience method for opening a second or new database that is
not specified in the metadata, useful for data collection. This
is a shortcut to the C<open_db_connection> function in
L<Bio::ToolBox::db_helper>. Pass the database name.

=back

=head2 SeqFeature Objects

SeqFeature objects corresponding to data rows can be stored in the Data
object. This can be useful if the SeqFeature object is not readily
available from a database or is processor intensive to generate or
parse. Note that storing large numbers of objects will increase memory
usage.

SeqFeature objects are usually L<Bio::ToolBox::SeqFeature> or
L<Bio::DB::SeqFeature> objects (or similar), depending
upon their source. More information can be obtained from the
L<Bio::SeqFeatureI> abstract API.

=over 4

=item store_seqfeature

  $Data->store_seqfeature($row_index, $seqfeature);

Stores the SeqFeature object for the given row index. Only one SeqFeature
object can be stored per row.

=item get_seqfeature

  my $feature = $Data->get_seqfeature($row_index);

Retrieves the SeqFeature object for the given row index.

=item delete_seqfeature

  $Data->delete_seqfeature($row_index);

Removes the SeqFeature object for the given row index.

=item collapse_gene_transcripts

  my $success = $Data->collapse_gene_transcripts;

This method will iterate through a Data table and collapse multiple alternative
transcript subfeatures of stored gene SeqFeature objects in the table. Exons
of multiple transcripts will be merged, maximizing exon size and minimizing
intron size. Genes with only one transcript will not be affected. Stored
SeqFeature objects that do not have a C<primary_tag> of "gene" are silently
skipped. Refer to L<Bio::ToolBox::GeneTools> for more details. The number of
rows successfully collapsed is returned.

=item add_transcript_length

  my $length_index = $Data->add_transcript_length;
  my $length_index = $Data->add_transcript_length('cds');

This method will generate a new column in the Data table representing the
length of a transcript, i.e. the sum of the lengths of subfeatures for
the stored SeqFeature object in the Data table. The default subfeature is
C<exon>; however, alternative subfeature types may be passed to the method
and used. These include C<cds>, C<5p_utr>, and C<3p_utr> for the CDS, the
5' UTR, and the 3' UTR, respectively. See the corresponding transcript length
methods in L<Bio::ToolBox::GeneTools> for more information. If a length
is not calculated, for example when the feature's C<primary_tag> is "gene",
then the simple length of the feature is recorded.

The name of the new column is one of "Merged_Transcript_Length" for exon,
"Transcript_CDS_Length", "Transcript_5p_UTR_Length", or "Transcript_3p_UTR_Length".
The index of the new length column is returned.

=back

=head2 Data Table Functions

These methods alter the Data table en masse.

=over 4

=item verify

This method will verify the Data structure, including the metadata and the
Data table. It ensures that the table has the correct number of rows and
columns as described in the metadata, and that each column has the basic
metadata.

If the Data structure is marked as a GFF or BED structure, then the table
is checked that the structure matches the proper format. If not, for
example when additional columns have been added, then the GFF or BED value
is set to null.

This method is automatically called prior to writing the Data table to file.

=item sort_data

  $Data->sort_data($index, 'i'); # increasing sort

This method will sort the Data table by the values in the indicated column.
It will automatically determine whether the contents of the column are
numbers or alphanumeric, and will sort accordingly, either numerically or
asciibetically. The first non-null value in the column is used to determine
this. The sort may fail if the values are not consistent. Pass a second
optional value to indicate the direction of the sort. The value should be
either 'i' for 'increasing' or 'd' for 'decreasing'. The default order is
increasing.

=item gsort_data

This method will sort the Data table by increasing genomic coordinates. It
requires the presence of chromosome and start (or position) columns,
identified by their column names. These are automatically identified.
Failure to find these columns means a failure to sort the table. Chromosome
names are sorted first by their digits (e.g. chr2 before chr10), and then
alphanumerically. Base coordinates are sorted by increasing value.
Identical positions are kept in their original order.
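
For example, a short sketch of sorting before writing a coordinate file
(the output name is arbitrary):

  $Data->gsort_data;    # sort by chromosome and start position
  my $saved = $Data->save('sorted_intervals.bed');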

=item splice_data

  my $Data = Bio::ToolBox::Data->new(file => $file);
  my $pm = Parallel::ForkManager->new(4);
  for my $i (1..4) {
     $pm->start and next;
     ### in child ###
     $Data->splice_data($i, 4);
     # do something with this portion
     # then save to a temporary unique file
     $Data->save("${file}_$i");
     $pm->finish;
  }
  $pm->wait_all_children;
  # reload children files
  $Data->reload_children(glob "${file}_*");

This method will splice the Data table into C<$total_parts> number of pieces,
retaining the C<$current_part> piece. The other parts are discarded. This
method is intended to be used when a program is forked into separate
processes, allowing each child process to work on a subset of the original
Data table.

Two values are passed to the method. The first is the current part number,
1-based. The second value is the total number of parts into which the table
should be divided, corresponding to the number of concurrent processes.
One easy approach to forking is to use L<Parallel::ForkManager>. The
above example shows how to fork into four concurrent processes.

Since each forked child process is separate from its parent process,
its contents must be reloaded into the current Data object. The
L<Parallel::ForkManager> documentation recommends going through a disk
file intermediate. Therefore, write each child Data object to file using
a unique name. Once all children have been reaped, they can be reloaded
into the current Data object using the reload_children() method.

Remember that if you fork your script into child processes, any database
connections must be re-opened; they are typically not clone safe. If you
have an existing database connection opened with the open_meta_database()
method, it should be automatically re-opened for you when you use the
splice_data() method, but you will need to call open_meta_database() again
in the child process to obtain the new database object.

=item reload_children

Discards the current data table in memory and reloads two or more files
written from forked children processes. Provide the names of the child
files in the order you want them loaded. The files will be automatically
deleted if successfully loaded. Returns the number of lines reloaded on
success.

=back

=head2 File Functions

The Data table may be read in from a file or written out as a file. In
all cases, it is a tab-delimited text file, whether as an ad hoc table
or a specific bioinformatic format, e.g. BED, GFF, etc. Multiple common
file formats are supported. Column headers are assumed, except in those
cases where they are not, e.g. BED, GFF, etc. Metadata may be included as
commented lines at the beginning of the file, prefixed with a C<#> symbol.
Reading and writing gzip compressed files is fully supported.

=over 4

=item load_file

  $Data->load_file($filename);

This will load a file into a new, empty Data table. This function is
called automatically when a filename is provided to the new() function.
The existence of the file is first checked (appending common missing
extensions as necessary), metadata and column headers are processed and/or
generated from default settings, the content is loaded into the table, and
the structure is verified. Error messages may be printed if the structure
or format is inconsistent or doesn't match the expected format, e.g. a file
with a F<.bed> extension that doesn't match the UCSC specification.
Pass the name of the file.

=item taste_file

  my ($flavor, $format) = $Data->taste_file($filename);
  # $flavor is a generic term: gff, bed, ucsc, or undef
  # $format is a specific term: gtf, gff3, narrowPeak, genePred, etc.

Tastes, or checks, a file for a certain flavor, or known gene file formats.
Useful for determining if the file represents a known gene table format
that lacks a defined file extension, e.g. UCSC formats. This can be based
on the file extension, metadata headers, and/or file contents from the
first 10 lines. Returns two strings: the first is a generic flavor, and the
second is a more specific format, if applicable. Generic flavor values will
be one of C<gff>, C<bed>, C<ucsc>, or C<undefined>. These correlate to
specific parser adapters. Specific formats could be any number of
possibilities, for example C<undefined>, C<gtf>, C<gff3>, C<narrowPeak>,
C<genePred>, etc.

=item save

=item write_file

  my $success = $Data->save;
  my $success = $Data->save('my_file.txt');
  my $success = $Data->save(filename => $file, gz => 1);
  print "file $success was saved!\n";

Pass the file name to be written. If no file name is passed, then
the filename and path stored in the metadata are used, if present.

These methods will write the Data structure out to file. It will
first be verified as to proper structure. Opened BED and GFF files
are checked to see if their structure is maintained. If so, they
are written in the same format; if not, they are written as regular
tab-delimited text files.

You may pass additional options.

=over 4

=item filename

Optionally pass a new filename. Required for new objects; previously
opened files may be overwritten if a new name is not provided. If
necessary, the file extension may be changed; for example, BED files
that no longer match the defined format lose the .bed and gain a .txt
extension. Compression may add or strip .gz as appropriate. If
a path is not provided, the current working directory is used.

=item gz

Boolean value to change the compression status of the output file. If
overwriting an input file, the default is to maintain the compression
status; otherwise no compression. Pass a 0 for no compression, 1 for
standard gzip compression, or 2 for block gzip (bgzip) compression for
tabix compatibility.

=back

If the file save is successful, it will return the full path and
name of the saved file, complete with any changes to the file extension.

=item summary_file

Write a separate file summarizing columns of data (mean values).
The mean value of each column becomes a row value, and each column
header becomes a row identifier (i.e. the table is transposed). The
best use of this is to summarize the mean profile of windowed data
collected across a feature. See the Bio::ToolBox data collection scripts
for examples. For data where the columns are expressed
as percentile bins, the reported midpoint column is automatically
converted based on a length of 1000 bp.

You may pass these options. They are optional.

=over 4

=item filename

Pass an optional new filename. The default is to take the basename
and append "_summed" to it.

=item startcolumn

=item stopcolumn

Provide the starting and ending columns to summarize. The default
start is the leftmost column without a recognized standard name.
The default ending column is the last rightmost column. Indexes are
0-based.

=item dataset

Pass a string that is the name of the dataset. This could be collected
from the metadata, if present. This will become the name of the score
column if defined.

=back

The name of the summarized column is either the provided dataset name,
the defined basename in the metadata of the Data structure, or a generic
name. If successful, it will return the name of the file saved.
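
For example, a sketch of summarizing a windowed profile table (the output
name is arbitrary):

  my $summary = $Data->summary_file(
     filename => 'profile_summary.txt',
  );
  print "wrote summary $summary\n" if $summary;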

=back

=head2 Verifying Datasets

When working with row Features and collecting scores, the dataset
from which you are collecting must be verified prior to collection.
This ensures that the proper database adaptor is available and loaded,
and that the dataset is correctly specified (otherwise nothing would be
collected). This verification is normally performed transparently when
you call the score collection methods in L<Bio::ToolBox::Data::Feature>.
However, datasets may be explicitly verified prior to calling the score
methods.

=over 4

=item verify_dataset

  my $dataset = $Data->verify_dataset($dataset, $database);

Pass the name of the dataset (GFF type or type:source) for a GFF3-based
database, or the path and file name for a data file, e.g. a Bam, BigWig,
BigBed, or USeq file. If a separate database is being used, pass the name
or opened database object as a second parameter. For more advanced
options, see L<Bio::ToolBox::db_helper>.

The name of the verified dataset, with a prefix if necessary, is returned.

=back
|
1036
|
|
|
|
|
|
|
=head2 Efficient Data Access |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
Most of the time we need to iterate over the Data table, one row |
1039
|
|
|
|
|
|
|
at a time, collecting data or processing information. These methods |
1040
|
|
|
|
|
|
|
simplify the process. |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
=over 4 |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
=item iterate |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
$Data->iterate( sub { |
1047
|
|
|
|
|
|
|
my $row = shift; |
1048
|
|
|
|
|
|
|
my $number = $row->value($index); |
1049
|
|
|
|
|
|
|
my $log_number = log($number); |
1050
|
|
|
|
|
|
|
$row->value($index, $log_number); |
1051
|
|
|
|
|
|
|
} ); |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
This method will process a code reference on every row in the data |
1054
|
|
|
|
|
|
|
table. Pass a subroutine or code reference. The subroutine will |
1055
|
|
|
|
|
|
|
receive the row as a L object. With this |
1056
|
|
|
|
|
|
|
object, you can retrieve values, set values, and add new values. |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
=item row_stream |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
This returns an C object, which has one |
1061
|
|
|
|
|
|
|
method, C. Call this method repeatedly until it returns |
1062
|
|
|
|
|
|
|
C to work through each row of data. |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
Users of the C family of database adaptors may recognize the |
1065
|
|
|
|
|
|
|
analogy to the C method. |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
=item next_row |
1068
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
my $stream = $Data->row_stream; |
1070
|
|
|
|
|
|
|
while (my $row = $stream->next_row) { |
1071
|
|
|
|
|
|
|
# each $row is a Bio::ToolBox::Data::Feature object |
1072
|
|
|
|
|
|
|
# representing the row in the data table |
1073
|
|
|
|
|
|
|
my $value = $row->value($index); |
1074
|
|
|
|
|
|
|
# do something with $value |
1075
|
|
|
|
|
|
|
} |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
Called from a C object, it returns a |
1078
|
|
|
|
|
|
|
L row object. If SeqFeature objects are |
1079
|
|
|
|
|
|
|
associated with the row, perhaps from a parsed input annotation file, |
1080
|
|
|
|
|
|
|
then they are automatically associated with the row object. (Previous |
1081
|
|
|
|
|
|
|
versions required separately calling the seqfeature() row method to |
1082
|
|
|
|
|
|
|
perform this.) |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
=back |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
=head1 SEE ALSO |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
L, L, L, |
1089
|
|
|
|
|
|
|
L, L, L, L, |
1090
|
|
|
|
|
|
|
L, L |
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
=cut |
1093
|
|
|
|
|
|
|
|
1094
|
3
|
|
|
3
|
|
3001
|
use strict;
use Carp qw(carp cluck croak confess);
use List::Util qw(sum0);
use base 'Bio::ToolBox::Data::core';
use Bio::ToolBox::db_helper qw(
    get_new_feature_list
    get_new_genome_list
    get_db_feature
);
use Bio::ToolBox::utility qw(simplify_dataset_name sane_chromo_sort);
use Module::Load;

1;


### Initialize

sub new {
    my $class = shift;
    my %args  = @_;
    if (ref($class)) {
        $class = ref($class);
    }

    # check for important arguments
    $args{features} ||= $args{feature} || 'gene';
    $args{stream}   ||= $args{Stream} || 0;
    $args{file}     ||= $args{in} || undef;
    $args{parse}    ||= 0;
    $args{noheader} ||= 0;

    # check for stream
    if ($args{stream}) {
        $class = "Bio::ToolBox::Data::Stream";
        load($class);
        return $class->new(@_);
    }

    # initialize
    my $self = $class->SUPER::new();

    # prepare a new table based on the provided arguments
    if ($args{file} and $args{parse}) {
        # parse from file
        $args{subfeature} ||= '';
        unless ( $self->parse_table(\%args) ) {
            my $l = $self->load_file($args{file});
            return unless $l;
            if ($self->database =~ /^Parsed:(.+)$/ and $self->feature_type eq 'named') {
                # looks like the loaded file was from a previously parsed table
                # let's try this again
                # this may die if it doesn't work
                $args{file}    = $1;
                $args{feature} = $self->feature;
                $self->parse_table(\%args);
            }
        }
    }
    elsif ($args{file}) {
        # load from file
        unless ( $self->load_file($args{file}, $args{noheader}) ) {
            return;
        }
    }
    elsif (exists $args{db} and $args{features}) {
        # generate new list
        $self->feature($args{features});
        $self->database($args{db});
        $args{data} = $self;
        my $result;
        if ($args{features} eq 'genome') {
            # create a genomic interval list
            # first must parse any exclusion list provided as a new Data object
            $args{blacklist} ||= $args{exclude} || undef;
            if (defined $args{blacklist}) {
                my $exclusion_Data = $self->new(
                    file  => $args{blacklist},
                    parse => 0   # we don't need to parse
                );
                if ($exclusion_Data and $exclusion_Data->feature_type eq 'coordinate') {
                    printf " Loaded %d exclusion list items\n",
                        $exclusion_Data->number_rows;
                    delete $args{blacklist};
                    delete $args{exclude};
                    $args{exclude} = $exclusion_Data;
                }
                else {
                    carp " Cannot load exclusion coordinate list file $args{blacklist}!";
                    return;
                }
            }
            $result = get_new_genome_list(%args);
        }
        else {
            $result = get_new_feature_list(%args);
        }
        unless ($result) {
            carp " Cannot generate new $args{features} list!";
            return;
        }
    }
    else {
        # a new empty structure

        # check to see if user provided column names
        $args{columns} ||= $args{datasets} || undef;
        if (defined $args{columns}) {
            foreach my $c ( @{ $args{columns} } ) {
                $self->add_column($c);
            }
            $self->{feature} = $args{feature} if exists $args{feature};
        }

        # or possibly a specified format structure
        elsif (exists $args{gff} and $args{gff}) {
            # use standard names for the number of columns indicated
            # we trust that the user knows the subtle difference between gff versions
            $self->add_gff_metadata($args{gff});
            unless ($self->extension =~ /g[tf]f/) {
                $self->{extension} = $args{gff} == 2.5 ? '.gtf' :
                    $args{gff} == 3 ? '.gff3' : '.gff';
            }
        }
        elsif (exists $args{bed} and $args{bed}) {
            # use standard names for the number of columns indicated
            unless ($args{bed} =~ /^\d{1,2}$/ and $args{bed} >= 3) {
                carp "bed parameter must be an integer 3-12!";
                return;
            }
            $self->add_bed_metadata($args{bed});
            unless ($self->extension =~ /bed|peak/) {
                $self->{extension} = '.bed';
            }
        }
        elsif (exists $args{ucsc} and $args{ucsc}) {
            # a ucsc format such as refFlat, genePred, or genePredExt
            my $u = $self->add_ucsc_metadata($args{ucsc});
            unless ($u) {
                carp "unrecognized number of columns for ucsc format!";
                return;
            };
            unless ($self->extension =~ /ucsc|ref+lat|genepred/) {
                $self->{extension} = '.ucsc';
            }
        }
    }

    return $self;
}

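# Illustrative usage sketch (not executed; the file name is hypothetical): the
# same new() constructor dispatches to Bio::ToolBox::Data::Stream when stream => 1
# is passed, otherwise it builds an in-memory table as above.
#
#   my $Data   = Bio::ToolBox::Data->new( file => 'regions.bed' );
#   my $Stream = Bio::ToolBox::Data->new( file => 'regions.bed', stream => 1 );
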
sub duplicate {
    my $self = shift;

    # duplicate the data structure
    my $columns = $self->list_columns;
    my $Dupe = $self->new(
        'columns' => $columns,
    ) or return;

    # copy the metadata
    for (my $i = 0; $i < $self->number_columns; $i++) {
        # column metadata
        my %md = $self->metadata($i);
        $Dupe->{$i} = \%md;
    }
    foreach (qw(feature program db bed gff ucsc vcf headers)) {
        # various keys
        $Dupe->{$_} = $self->{$_};
    }
    my @comments = $self->comments;
    push @{$Dupe->{comments}}, @comments;

    return $Dupe;
}

sub parse_table {
    my $self = shift;
    my $args = shift;
    my ($file, $feature, $subfeature, $simplify);
    if (ref $args) {
        $file       = $args->{file} || '';
        $feature    = $args->{feature} || '';
        $subfeature = $args->{subfeature} || '';
        $simplify   = (exists $args->{simplify} and defined $args->{simplify}) ?
            $args->{simplify} : 1; # default is to simplify
    }
    else {
        # no hash reference, assume just a file name
        $file = $args;
        undef $args;
        $feature    = undef;
        $subfeature = '';
        $simplify   = 1;
    }
    unless ($file) {
        carp "no annotation file provided to parse!";
        return;
    }

    # the file format determines the parser class
    my ($flavor, $format) = $self->taste_file($file) or return;
    my $class = 'Bio::ToolBox::parser::' . $flavor;
    eval {load $class};
    if ($@) {
        carp "unable to load $class! cannot parse $flavor!";
        return;
    }
    $self->format($format); # specify the specific format

    # open parser
    my $parser = $class->new() or return;
    $parser->open_file($file) or return;
    my $typelist = $parser->typelist;

    # set feature based on the type list from the parser
    unless ($feature) {
        if ($typelist =~ /gene/i) {
            $feature = 'gene';
        }
        elsif ($typelist eq 'region') {
            $feature = 'region';
        }
        else {
            $feature = 'rna'; # generic RNA
        }
    }

    # set parser parameters
    $parser->simplify($simplify);
    if ($subfeature) {
        $parser->do_exon(1) if $subfeature =~ /exon/i;
        $parser->do_cds(1) if $subfeature =~ /cds/i;
        $parser->do_utr(1) if $subfeature =~ /utr|untranslated/i;
        $parser->do_codon(1) if $subfeature =~ /codon/i;
    }
    if ($feature =~ /gene$/i) {
        $parser->do_gene(1);
    }
    else {
        $parser->do_gene(0);
    }
    my $mrna_check = 0;
    if (lc($feature) eq 'mrna' and $parser->typelist !~ /mrna/i and not $self->last_row) {
        # user requested mRNA for a new data file but it's not present in the type list
        # look for it the hard way by parsing CDS too - sigh
        load('Bio::ToolBox::GeneTools', 'is_coding');
        $parser->do_cds(1);
        $mrna_check = 1;
    }

    # parse the table
    $parser->parse_file or return;

    # store the SeqFeature objects
    if ($self->last_row > 0) {
        # we already have a table, presumably representing the features
        my $count = 0;
        $self->iterate( sub {
            my $row = shift;
            my $f = $parser->find_gene(
                name => $row->name,
                id   => $row->id,
            );
            if ($f) {
                $self->store_seqfeature($row->row_index, $f);
                $count++;
            }
        } );
        unless ($count == $self->last_row) {
            die <<PARSEFAIL;
Not all features in the input file could be matched to a corresponding SeqFeature
object in the annotation file $file.
Double check your input and annotation files. You can create a new table by just
providing your annotation file.
PARSEFAIL
        }
    }
    else {
        # create a new table
        $self->add_column('Primary_ID');
        $self->add_column('Name');

        # check for chromosome exclude
        my $chr_exclude;
        if ($args) {
            $chr_exclude = $args->{chrskip} || undef;
        }

        # fill table with features
        while (my $f = $parser->next_top_feature) {
            if ($chr_exclude) {
                next if $f->seq_id =~ /$chr_exclude/i;
            }
            my $type = $f->type;
            if ($f->type =~ /$feature/i or ($mrna_check and is_coding($f)) ) {
                my $index = $self->add_row([ $f->primary_id, $f->display_name ]);
                $self->store_seqfeature($index, $f);
            }
        }
        unless ($self->last_row) {
            printf " Zero '%s' features found!\n Check your feature or try generic features like gene, mRNA, or transcript\n",
                $feature;
        }
        $self->database("Parsed:$file");
        $self->feature($feature);
        $self->add_comment("Chromosomes excluded: $chr_exclude") if $chr_exclude;

        # add input parsed file metadata
        $self->add_file_metadata($file);
        # but delete some stuff, just want basename
        undef $self->{extension};
        undef $self->{filename};
        undef $self->{path};
    }
    return 1;
}




### Column manipulation

sub column_values {
    my ($self, $column) = @_;
    return unless defined $column;
    return unless exists $self->{$column}{name};
    my @values = map {$self->value($_, $column)} (0 .. $self->last_row);
    return wantarray ? @values : \@values;
}

sub add_column {
    my ($self, $name) = @_;
    return unless $name;
    my $column = $self->{number_columns};

    # check for array of column data
    my $name_ref = ref $name;
    if ($name_ref eq 'ARRAY') {
        if ($self->last_row > 1) {
            # table has existing data beyond column headers
            if ($self->last_row == (scalar @$name - 1)) {
                # same number of elements, add it the table
                $self->{$column} = {
                    'name'  => $name->[0],
                    'index' => $column,
                };
                for (my $r = 0; $r <= $self->last_row; $r++) {
                    $self->{data_table}->[$r][$column] = $name->[$r];
                }
            }
            else {
                # different number of elements
                cluck "array has different number of elements than Data table!\n";
                return;
            }
        }
        else {
            # table has no existing data
            $self->{$column} = {
                'name'  => $name->[0],
                'index' => $column,
            };
            for (my $i = 0; $i < scalar @$name; $i++) {
                $self->value($i, $column, $name->[$i]);
            }
            $self->{last_row} = scalar @$name - 1;
            $self->{headers} = 1; # boolean to indicate the table now has headers
        }
    }
    elsif ($name_ref eq 'Bio::DB::GFF::Typename') {
        # a Typename object that was selected from a SeqFeature::Store database
        $self->{$column} = {
            'name'  => $name->asString,
            'index' => $column,
        };
        $self->{data_table}->[0][$column] = $name->asString;
    }
    elsif ($name_ref eq '') {
        # just a name
        $self->{$column} = {
            'name'  => $name,
            'index' => $column,
        };
        $self->{data_table}->[0][$column] = $name;
    }
    else {
        cluck "unrecognized reference '$name_ref'! pass a scalar value or array reference";
        return;
    }

    $self->{number_columns}++;
    delete $self->{column_indices} if exists $self->{column_indices};
    if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
        # check if we maintain integrity, at least insofar what we test
        $self->verify(1); # silence so user doesn't get these messages
    }
    return $column;
}

sub copy_column {
    my $self = shift;
    my $index = shift;
    return unless defined $index;
    my $data = $self->column_values($index);
    my $new_index = $self->add_column($data);
    $self->copy_metadata($index, $new_index);
    return $new_index;
}


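# Illustrative usage sketch (not executed; @names is hypothetical): add_column()
# accepts either a plain column name or an array reference whose first element is
# the header, and copy_column() duplicates values together with column metadata.
#
#   my $score_i = $Data->add_column('Score');              # empty named column
#   my $dupe_i  = $Data->copy_column($score_i);            # copy values and metadata
#   my $name_i  = $Data->add_column( [ 'Name', @names ] ); # header plus one value per row
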
### Rows and Data access

sub add_row {
    my $self = shift;
    my @row_data;
    if ($_[0] and ref $_[0] eq 'ARRAY') {
        @row_data = @{ $_[0] };
    }
    elsif ($_[0] and ref $_[0] eq 'Bio::ToolBox::Data::Feature') {
        @row_data = $_[0]->row_values;
    }
    elsif ($_[0] and $_[0] =~ /\t/) {
        @row_data = split /\t/, $_[0];
    }
    else {
        @row_data = map {'.'} (1 .. $self->{number_columns});
    }
    if (scalar @row_data > $self->{number_columns}) {
        cluck "row added has more elements than table columns! truncating row elements\n";
        splice @row_data, 0, $self->{number_columns};
    }
    until (scalar @row_data == $self->{number_columns}) {
        push @row_data, '.';
    }
    my $row_index = $self->{last_row} + 1;
    $self->{data_table}->[$row_index] = \@row_data;
    $self->{last_row}++;
    return $row_index;
}

sub get_row {
    my ($self, $row_number) = @_;
    return unless defined $self->{data_table}->[$row_number];
    my @options = (
        'data'  => $self,
        'index' => $row_number,
    );
    if (
        exists $self->{SeqFeatureObjects} and
        defined $self->{SeqFeatureObjects}->[$row_number]
    ) {
        push @options, 'feature', $self->{SeqFeatureObjects}->[$row_number];
    }
    return Bio::ToolBox::Data::Feature->new(@options);
}

sub delete_row {
    my $self = shift;
    my @deleted = sort {$b <=> $a} @_;
    while (@deleted) {
        my $d = shift @deleted;
        splice( @{ $self->{data_table} }, $d, 1);
        $self->{last_row}--;
        if (exists $self->{SeqFeatureObjects}) {
            splice( @{ $self->{SeqFeatureObjects} }, $d, 1);
        }
    }
    return 1;
}

sub row_values {
    my ($self, $row) = @_;
    my @data = @{ $self->{data_table}->[$row] };
    return wantarray ? @data : \@data;
}

sub value {
    my ($self, $row, $column, $value) = @_;
    return unless (defined $row and defined $column);
    if (defined $value) {
        $self->{data_table}->[$row][$column] = $value;
    }
    return $self->{data_table}->[$row][$column];
}

sub row_stream {
    my $self = shift;
    return Bio::ToolBox::Data::Iterator->new($self);
}

sub iterate {
    my $self = shift;
    my $code = shift;
    unless (ref $code eq 'CODE') {
        confess "iterate_function() method requires a code reference!";
    }
    my $stream = $self->row_stream;
    while (my $row = $stream->next_row) {
        &$code($row);
    }
    return 1;
}


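# Illustrative usage sketch (not executed; column indices are hypothetical):
# iterate() wraps row_stream()/next_row() and passes each row object to the
# supplied code reference, where value() can both read and write table cells.
#
#   $Data->iterate( sub {
#       my $row = shift;
#       my $v = $row->value(4);                              # read column 4
#       $row->value(5, $v * 2) if defined $v and $v ne '.';  # write column 5
#   } );
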
#### Stored SeqFeature manipulation

sub store_seqfeature {
    my ($self, $row_i, $seqfeature) = @_;
    unless (defined $row_i and ref($seqfeature)) {
        confess "must provide a row index and SeqFeature object!";
    }
    confess "invalid row index" unless ($row_i <= $self->last_row);
    $self->{SeqFeatureObjects} ||= [];
    $self->{SeqFeatureObjects}->[$row_i] = $seqfeature;
    return 1;
}

sub delete_seqfeature {
    my ($self, $row_i) = @_;
    confess "invalid row index" unless ($row_i <= $self->last_row);
    return unless $self->{SeqFeatureObjects};
    undef $self->{SeqFeatureObjects}->[$row_i];
}

sub collapse_gene_transcripts {
    my $self = shift;
    unless ($self->feature_type eq 'named') {
        carp "Table does not contain named features!";
        return;
    }

    # load module
    my $class = "Bio::ToolBox::GeneTools";
    eval {load $class, qw(collapse_transcripts)};
    if ($@) {
        carp "unable to load $class! cannot collapse transcripts!";
        return;
    }

    # collapse the transcripts
    my $success = 0;
    if (exists $self->{SeqFeatureObjects}) {
        # we should have stored SeqFeature objects, probably from parsed table
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = $self->{SeqFeatureObjects}->[$i] or next;
            my $collSeqFeat = collapse_transcripts($feature) or next;
            $success += $self->store_seqfeature($i, $collSeqFeat);
        }
    }
    else {
        # no stored SeqFeature objects, probably names pointing to a database
        # we will have to fetch the feature from a database
        my $db = $self->open_meta_database(1) or # force open a new db connection
            confess "No SeqFeature objects stored and no database connection!";
        my $name_i = $self->name_column;
        my $id_i   = $self->id_column;
        my $type_i = $self->type_column;
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = get_db_feature(
                db   => $db,
                name => $self->value($i, $name_i) || undef,
                type => $self->value($i, $type_i) || undef,
                id   => $self->value($i, $id_i) || undef,
            ) or next;
            my $collSeqFeat = collapse_transcripts($feature) or next;
            $success += $self->store_seqfeature($i, $collSeqFeat);
        }
    }

    return $success;
}

sub add_transcript_length {
    my $self = shift;
    my $subfeature = shift || 'exon';
    unless (exists $self->{SeqFeatureObjects}) {
        carp "no SeqFeature objects stored for collapsing!";
        return;
    }

    # load module
    eval {load "Bio::ToolBox::GeneTools", qw(get_transcript_length get_transcript_cds_length
        get_transcript_5p_utr_length get_transcript_3p_utr_length)};
    if ($@) {
        carp "unable to load Bio::ToolBox::GeneTools! cannot collapse transcripts!";
        return;
    }

    # determine name and calculation method
    my $length_calculator;
    my $length_name;
    if ($subfeature eq 'exon') {
        $length_calculator = \&get_transcript_length;
        $length_name = 'Merged_Transcript_Length';
    }
    elsif ($subfeature eq 'cds') {
        $length_calculator = \&get_transcript_cds_length;
        $length_name = 'Transcript_CDS_Length';
    }
    elsif ($subfeature eq '5p_utr') {
        $length_calculator = \&get_transcript_5p_utr_length;
        $length_name = 'Transcript_5p_UTR_Length';
    }
    elsif ($subfeature eq '3p_utr') {
        $length_calculator = \&get_transcript_3p_utr_length;
        $length_name = 'Transcript_3p_UTR_Length';
    }
    else {
        carp "unrecognized subfeature type '$subfeature'! No length calculated";
        return;
    }

    # add new column and calculate
    my $length_i = $self->add_column($length_name);
    if (exists $self->{SeqFeatureObjects}) {
        # we should have stored SeqFeature objects, probably from parsed table
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = $self->{SeqFeatureObjects}->[$i] or next;
            my $length = &$length_calculator($feature);
            $self->{data_table}->[$i][$length_i] = $length || $feature->length;
        }
    }
    else {
        # no stored SeqFeature objects, probably names pointing to a database
        # we will have to fetch the feature from a database
        my $db = $self->open_meta_database(1) or # force open a new db connection
            confess "No SeqFeature objects stored and no database connection!";
        my $name_i = $self->name_column;
        my $id_i   = $self->id_column;
        my $type_i = $self->type_column;
        for (my $i = 1; $i <= $self->last_row; $i++) {
            my $feature = get_db_feature(
                db   => $db,
                name => $self->value($i, $name_i) || undef,
                type => $self->value($i, $type_i) || undef,
                id   => $self->value($i, $id_i) || undef,
            ) or next;
            my $length = &$length_calculator($feature);
            $self->{data_table}->[$i][$length_i] = $length || $feature->length;
            $self->store_seqfeature($i, $feature);
            # to store or not to store? May explode memory, but could save from
            # expensive db calls later
        }
    }

    return $length_i;
}


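# Illustrative usage sketch (not executed): for a table of parsed, named gene
# features, transcripts can be collapsed into a single merged transcript per gene
# and a subfeature-based length column added.
#
#   $Data->collapse_gene_transcripts;
#   my $len_i = $Data->add_transcript_length('cds');  # or 'exon', '5p_utr', '3p_utr'
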
### Data structure manipulation

sub sort_data {
    my $self = shift;
    my $index = shift;
    my $direction = shift || 'i';

    # confirm options
    return unless exists $self->{$index}{name};
    unless ($direction =~ /^[id]/i) {
        carp "unrecognized sort order '$direction'! Must be i or d";
        return;
    }

    # Sample the dataset values
    # this will be used to guess the sort method, below
    my $example; # an example of the dataset
    foreach (my $i = 1; $i <= $self->last_row; $i++) {
        # we want to avoid null values, so keep trying
        # null being . or any variation of N/A, NaN, inf
        my $v = $self->value($i, $index);
        if (defined $v and $v !~ /^(?:\.|n\/?a|nan|\-?inf)$/i) {
            # a non-null value, take it
            $example = $v;
            last;
        }
    }

    # Determine sort method, either numerical or alphabetical
    my $sortmethod;
    if ($example =~ /[a-z]/i) {
        # there are detectable letters
        $sortmethod = 'ascii';
    }
    elsif ($example =~ /^\-?\d+\.?\d*$/) {
        # there are only digits, allowing for minus sign and a decimal point
        # I don't think this allows for exponents, though
        $sortmethod = 'numeric';
    }
    else {
        # unable to determine (probably alphanumeric), sort asciibetical
        $sortmethod = 'ascii';
    }

    # Re-order the datasets
    # Directly sorting the @data array is proving difficult. It keeps giving me
    # a segmentation fault. So I'm using a different approach by copying the
    # @data_table into a temporary hash.
    # put data_table array into a temporary hash
    # the hash key will the be dataset value,
    # the hash value will be the reference the row data
    my %datahash;

    # reorder numerically
    if ($sortmethod eq 'numeric') {
        for my $row (1..$self->last_row) {
            my $value = $self->value($row, $index);

            # check to see whether this value exists or not
            while (exists $datahash{$value}) {
                # add a really small number to bump it up and make it unique
                # this, of course, presumes that none of the dataset values
                # are really this small - this may be an entirely bad
                # assumption!!!!! I suppose we could somehow calculate an
                # appropriate value.... nah.
                # don't worry, we're only modifying the value used for sorting,
                # not the actual value
                $value += 0.00000001;
            }
            # store the row data reference
            $datahash{$value} = $self->{data_table}->[$row];
        }

        # re-fill the array based on the sort direction
        if ($direction =~ /^i/i) {
            # increasing sort
            my $i = 1; # keep track of the row, skip the header
            foreach (sort {$a <=> $b} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }
        else {
            # decreasing sort
            my $i = 1; # keep track of the row, skip the header
            foreach (sort {$b <=> $a} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }

        # summary prompt
        printf " Data table sorted numerically by the contents of %s\n",
            $self->name($index);
    }

    # reorder asciibetically
    elsif ($sortmethod eq 'ascii') {
        for my $row (1..$self->last_row) {
            # get the value to sort by
            my $value = $self->value($row, $index);
            if (exists $datahash{$value}) {
                # not unique
                my $n = 1;
                my $lookup = $value . sprintf("03%d", $n);
                # we'll try to make a unique value by appending
                # a number to the original value
                while (exists $datahash{$lookup}) {
                    # keep bumping up the number till it's unique
                    $n++;
                    $lookup = $value . sprintf("03%d", $n);
                }
                $datahash{$lookup} = $self->{data_table}->[$row];
            }
            else {
                # unique
                $datahash{$value} = $self->{data_table}->[$row];
            }
        }

        # re-fill the array based on the sort direction
        if ($direction eq 'i' or $direction eq 'I') {
            # increasing
            my $i = 1; # keep track of the row
            foreach (sort {$a cmp $b} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }
        elsif ($direction eq 'd' or $direction eq 'D') {
            # decreasing
            my $i = 1; # keep track of the row
            foreach (sort {$b cmp $a} keys %datahash) {
                # put back the reference to the anonymous array of row data
                $self->{data_table}->[$i] = $datahash{$_};
                $i++; # increment for next row
            }
        }

        # summary prompt
        printf " Data table sorted asciibetically by the contents of '%s'\n",
            $self->name($index);
    }

    return 1;
}

sub gsort_data {
    my $self = shift;

    # identify indices
    unless ($self->feature_type eq 'coordinate') {
        carp "no chromosome and start/position columns to sort!\n";
        return;
    }
    my $chromo_i = $self->chromo_column;
    my $start_i  = $self->start_column;
    my $stop_i   = $self->stop_column || $start_i;

    # collect the data by chromosome: start and end
    my %chrom2row;
    for my $row (1 .. $self->last_row) {
        my $c = $self->{data_table}->[$row]->[$chromo_i];
        $chrom2row{$c} ||= [];
        push @{ $chrom2row{$c} }, [
            $self->{data_table}->[$row]->[$start_i],
            $self->{data_table}->[$row]->[$stop_i],
            $self->{data_table}->[$row]
        ];
    }

    # get sane chromosome order
    my @chroms = sane_chromo_sort(keys %chrom2row);

    # sort the table
    my @sorted_data;
    push @sorted_data, $self->{data_table}->[0];
    if ($self->gff) {
        # sort by increasing start and decreasing end, i.e. decreasing length
        # this should mostly handle putting genes/transcripts before exons
        foreach my $chr (@chroms) {
            push @sorted_data,
                map { $_->[2] }
                sort { $a->[0] <=> $b->[0] or $b->[1] <=> $a->[1] }
                @{ $chrom2row{$chr} };
        }
    }
    else {
        # sort by increasing start followed by increasing end, i.e. increasing length
        foreach my $chr (@chroms) {
            push @sorted_data,
                map { $_->[2] }
                sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
                @{ $chrom2row{$chr} };
        }
    }
    $self->{data_table} = \@sorted_data;
    return 1;
}

sub splice_data {
    my ($self, $part, $total_parts) = @_;

    unless ($part and $total_parts) {
        confess "ordinal part and total number of parts not passed\n";
    }
    my $part_length = int($self->last_row / $total_parts);

    # check for SeqFeatureObjects array
    if (exists $self->{SeqFeatureObjects}) {
        # it needs to be the same length as the data table, it should be
        while (scalar @{$self->{SeqFeatureObjects}} < scalar @{$self->{data_table}}) {
            push @{$self->{SeqFeatureObjects}}, undef;
        }
    }

    # splicing based on which part we do
    if ($part == 1) {
        # remove all but the first part
        splice(
            @{$self->{'data_table'}},
            $part_length + 1
        );
        if (exists $self->{SeqFeatureObjects}) {
            splice(
                @{$self->{SeqFeatureObjects}},
                $part_length + 1
            );
        }
    }
    elsif ($part == $total_parts) {
        # remove all but the last part
        splice(
            @{$self->{'data_table'}},
            1,
            $part_length * ($total_parts - 1)
        );
        if (exists $self->{SeqFeatureObjects}) {
            splice(
                @{$self->{SeqFeatureObjects}},
                1,
                $part_length * ($total_parts - 1)
            );
        }
    }
    else {
        # splicing in the middle requires two rounds

        # remove the last parts
        splice(
            @{$self->{'data_table'}},
            ($part * $part_length) + 1
        );

        # remove the first parts
        splice(
            @{$self->{'data_table'}},
            1,
            $part_length * ($part - 1)
        );

        if (exists $self->{SeqFeatureObjects}) {
            splice(
                @{$self->{SeqFeatureObjects}},
                ($part * $part_length) + 1
            );
            splice(
                @{$self->{SeqFeatureObjects}},
                1,
                $part_length * ($part - 1)
            );
        }
    }

    # update last row metadata
    $self->{'last_row'} = scalar(@{$self->{'data_table'}}) - 1;

    # re-open a new un-cached database connection
    if (exists $self->{db_connection}) {
        delete $self->{db_connection};
    }
    return 1;
}



### File functions

sub reload_children {
    my $self = shift;
    my @files = @_;
    return unless @files;

    # prepare Stream
    my $class = "Bio::ToolBox::Data::Stream";
    eval {load $class};
    if ($@) {
        carp "unable to load $class! can't reload children!";
        return;
    }

    # open first stream
    my $first = shift @files;
    my $Stream = $class->new(in => $first);
    unless ($Stream) {
        carp "unable to load first child file $first";
        return;
    }

    # destroy current data table and metadata
    $self->{data_table} = [];
    for (my $i = 0; $i < $self->number_columns; $i++) {
        delete $self->{$i};
    }

    # copy the metadata
    foreach (qw(program feature db bed gff ucsc headers number_columns last_row)) {
        # various keys
        $self->{$_} = $Stream->{$_};
    }
    for (my $i = 0; $i < $Stream->number_columns; $i++) {
        # column metadata
        my %md = $Stream->metadata($i);
        $self->{$i} = \%md;
    }
    my @comments = $Stream->comments;
    push @{$self->{other}}, @comments;

    # first row column headers
    $self->{data_table}->[0] = [ @{ $Stream->{data_table}->[0] } ];

    # load the data
    while (my $row = $Stream->next_row) {
        $self->add_row($row);
    }
    $Stream->close_fh;

    # load remaining files
    foreach my $file (@files) {
        my $Stream = $class->new(in => $file);
        if ($Stream->number_columns != $self->number_columns) {
            confess "fatal error: child file $file has a different number of columns!";
        }
        while (my $row = $Stream->next_row) {
            my $a = $row->row_values;
            $self->add_row($a);
        }
        $Stream->close_fh;
    }
    $self->verify;

    # clean up
    unlink($first, @files);
    return $self->last_row;
}



### Export summary data file |
2108
|
|
|
|
|
|
|
sub summary_file { |
2109
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2110
|
|
|
|
|
|
|
|
2111
|
|
|
|
|
|
|
# Collect passed arguments |
2112
|
0
|
|
|
|
|
0
|
my %args = @_; |
2113
|
|
|
|
|
|
|
|
2114
|
|
|
|
|
|
|
# parameters |
2115
|
|
|
|
|
|
|
# either one or more datasets can be summarized |
2116
|
0
|
|
0
|
|
|
0
|
my $outfile = $args{'filename'} || undef; |
2117
|
0
|
|
|
|
|
0
|
my @datasets; |
2118
|
0
|
0
|
0
|
|
|
0
|
if ($args{dataset} and ref $args{dataset} eq 'ARRAY') { |
|
|
0
|
0
|
|
|
|
|
2119
|
0
|
|
|
|
|
0
|
@datasets = @{ $args{dataset} }; |
|
0
|
|
|
|
|
0
|
|
2120
|
|
|
|
|
|
|
} |
2121
|
|
|
|
|
|
|
elsif ($args{dataset} and ref $args{dataset} eq 'SCALAR') { |
2122
|
0
|
|
|
|
|
0
|
push @datasets, $args{dataset}; |
2123
|
|
|
|
|
|
|
} |
2124
|
0
|
|
|
|
|
0
|
my @startcolumns; |
2125
|
0
|
0
|
0
|
|
|
0
|
if ($args{startcolumn} and ref $args{startcolumn} eq 'ARRAY') { |
|
|
0
|
0
|
|
|
|
|
2126
|
0
|
|
|
|
|
0
|
@startcolumns = @{ $args{startcolumn} }; |
|
0
|
|
|
|
|
0
|
|
2127
|
|
|
|
|
|
|
} |
2128
|
|
|
|
|
|
|
elsif ($args{startcolumn} and ref $args{startcolumn} eq 'SCALAR') { |
2129
|
0
|
|
|
|
|
0
|
push @startcolumns, $args{startcolumn}; |
2130
|
|
|
|
|
|
|
} |
2131
|
0
|
|
|
|
|
0
|
my @endcolumns; |
2132
|
0
|
|
0
|
|
|
0
|
$args{endcolumn} ||= $args{stopcolumn}; |
2133
|
0
|
0
|
0
|
|
|
0
|
if ($args{endcolumn} and ref $args{endcolumn} eq 'ARRAY') { |
|
|
0
|
0
|
|
|
|
|
2134
|
0
|
|
|
|
|
0
|
@endcolumns = @{ $args{endcolumn} }; |
|
0
|
|
|
|
|
0
|
|
2135
|
|
|
|
|
|
|
} |
2136
|
|
|
|
|
|
|
elsif ($args{endcolumn} and ref $args{endcolumn} eq 'SCALAR') { |
2137
|
0
|
|
|
|
|
0
|
push @endcolumns, $args{endcolumn}; |
2138
|
|
|
|
|
|
|
} |
2139
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
|
2141
|
|
|
|
|
|
|
# Check required values |
2142
|
0
|
0
|
|
|
|
0
|
unless ($self->verify) { |
2143
|
0
|
|
|
|
|
0
|
cluck "bad data structure!"; |
2144
|
0
|
|
|
|
|
0
|
return; |
2145
|
|
|
|
|
|
|
} |
2146
|
0
|
0
|
|
|
|
0
|
unless (defined $outfile) { |
2147
|
0
|
0
|
|
|
|
0
|
if ($self->basename) { |
2148
|
|
|
|
|
|
|
# use the opened file's filename if it exists |
2149
|
|
|
|
|
|
|
# prepend the path if it exists |
2150
|
|
|
|
|
|
|
# the extension will be added later |
2151
|
0
|
|
|
|
|
0
|
$outfile = $self->path . $self->basename; |
2152
|
|
|
|
|
|
|
} |
2153
|
|
|
|
|
|
|
else { |
2154
|
0
|
|
|
|
|
0
|
cluck "no filename passed to write_summary_data!\n"; |
2155
|
0
|
|
|
|
|
0
|
return; |
2156
|
|
|
|
|
|
|
} |
2157
|
|
|
|
|
|
|
} |
2158
|
|
|
|
|
|
|
|
2159
|
|
|
|
|
|
|
# Identify possible dataset columns |
2160
|
0
|
|
|
|
|
0
|
my %possibles; |
2161
|
0
|
|
|
|
|
0
|
my %skip = map {$_ => 1} qw (systematicname name id alias aliases type class |
|
0
|
|
|
|
|
0
|
|
2162
|
|
|
|
|
|
|
geneclass chromosome chromo seq_id seqid start stop end gene strand |
2163
|
|
|
|
|
|
|
length primary_id merged_transcript_length transcript_cds_length |
2164
|
|
|
|
|
|
|
transcript_5p_utr_length transcript_3p_utr_length); |
2165
|
|
|
|
|
|
|
# walk through the dataset names |
2166
|
0
|
|
|
|
|
0
|
$possibles{unknown} = []; |
2167
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < $self->number_columns; $i++) { |
2168
|
0
|
0
|
|
|
|
0
|
if (not exists $skip{ lc $self->{$i}{'name'} }) { |
2169
|
0
|
|
0
|
|
|
0
|
my $d = $self->metadata($i, 'dataset') || undef; |
2170
|
0
|
0
|
|
|
|
0
|
if (defined $d) { |
2171
|
|
|
|
|
|
|
# we have what appears to be a dataset column |
2172
|
0
|
|
0
|
|
|
0
|
$possibles{$d} ||= []; |
2173
|
0
|
|
|
|
|
0
|
push @{$possibles{$d}}, $i; |
|
0
|
|
|
|
|
0
|
|
2174
|
|
|
|
|
|
|
} |
2175
|
|
|
|
|
|
|
else { |
2176
|
|
|
|
|
|
|
# still an unknown possibility |
2177
|
0
|
|
|
|
|
0
|
push @{$possibles{unknown}}, $i; |
|
0
|
|
|
|
|
0
|
|
2178
|
|
|
|
|
|
|
} |
2179
|
|
|
|
|
|
|
} |
2180
|
|
|
|
|
|
|
} |
2181
|
|
|
|
|
|
|
|
2182
|
|
|
|
|
|
|
# check datasets |
2183
|
0
|
0
|
|
|
|
0
|
unless (@datasets) { |
2184
|
0
|
0
|
|
|
|
0
|
if (scalar keys %possibles > 1) { |
2185
|
|
|
|
|
|
|
# we will always have the unknown category, so anything more than one |
2186
|
|
|
|
|
|
|
# means we found legitimate dataset columns |
2187
|
0
|
|
|
|
|
0
|
delete $possibles{unknown}; |
2188
|
|
|
|
|
|
|
} |
2189
|
0
|
|
|
|
|
0
|
@datasets = sort {$a cmp $b} keys %possibles; |
|
0
|
|
|
|
|
0
|
|
2190
|
|
|
|
|
|
|
} |
2191
|
|
|
|
|
|
|
|
2192
|
|
|
|
|
|
|
# check starts |
2193
|
0
|
0
|
|
|
|
0
|
if (scalar @startcolumns != scalar @datasets) { |
2194
|
0
|
|
|
|
|
0
|
@startcolumns = (); # ignore what we were given? |
2195
|
0
|
|
|
|
|
0
|
foreach my $d (@datasets) { |
2196
|
|
|
|
|
|
|
# take the first column with this dataset |
2197
|
0
|
|
|
|
|
0
|
push @startcolumns, $possibles{$d}->[0]; |
2198
|
|
|
|
|
|
|
} |
2199
|
|
|
|
|
|
|
} |
2200
|
|
|
|
|
|
|
|
2201
|
|
|
|
|
|
|
# check stops |
2202
|
0
|
0
|
|
|
|
0
|
if (scalar @endcolumns != scalar @datasets) { |
2203
|
0
|
|
|
|
|
0
|
@endcolumns = (); # ignore what we were given? |
2204
|
0
|
|
|
|
|
0
|
foreach my $d (@datasets) { |
2205
|
|
|
|
|
|
|
# take the last column with this dataset |
2206
|
0
|
|
|
|
|
0
|
push @endcolumns, $possibles{$d}->[-1]; |
2207
|
|
|
|
|
|
|
} |
2208
|
|
|
|
|
|
|
} |
2209
|
|
|
|
|
|
|
|
2210
|
|
|
|
|
|
|
|
2211
|
|
|
|
|
|
|
# Prepare Data object to store the summed data |
2212
|
0
|
|
|
|
|
0
|
my $summed_data = $self->new( |
2213
|
|
|
|
|
|
|
feature => 'averaged_windows', |
2214
|
|
|
|
|
|
|
columns => ['Window','Midpoint'], |
2215
|
|
|
|
|
|
|
); |
2216
|
|
|
|
|
|
|
|
2217
|
|
|
|
|
|
|
# Go through each dataset |
2218
|
0
|
|
|
|
|
0
|
foreach my $d (0 .. $#datasets) { |
2219
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
# Prepare score column name |
2221
|
0
|
|
|
|
|
0
|
my $data_name = simplify_dataset_name($datasets[$d]); |
2222
|
|
|
|
|
|
|
|
2223
|
|
|
|
|
|
|
# add column |
2224
|
0
|
|
|
|
|
0
|
my $i = $summed_data->add_column($data_name); |
2225
|
0
|
|
|
|
|
0
|
$summed_data->metadata($i, 'dataset', $datasets[$d]); |
2226
|
|
|
|
|
|
|
|
2227
|
|
|
|
|
|
|
# tag for remembering we're working with percentile bins |
2228
|
0
|
|
|
|
|
0
|
my $do_percentile = 0; |
2229
|
|
|
|
|
|
|
|
2230
|
|
|
|
|
|
|
# remember the row |
2231
|
0
|
|
|
|
|
0
|
my $row = 1; |
2232
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
# Collect summarized data |
2234
|
0
|
|
|
|
|
0
|
for ( |
2235
|
|
|
|
|
|
|
my $column = $startcolumns[$d]; |
2236
|
|
|
|
|
|
|
$column <= $endcolumns[$d]; |
2237
|
|
|
|
|
|
|
$column++ |
2238
|
|
|
|
|
|
|
) { |
2239
|
|
|
|
|
|
|
|
2240
|
|
|
|
|
|
|
# determine the midpoint position of the window |
2241
|
|
|
|
|
|
|
# this assumes the column metadata has start and stop |
2242
|
0
|
|
|
|
|
0
|
my $midpoint = int(sum0($self->metadata($column, 'start'), |
2243
|
|
|
|
|
|
|
$self->metadata($column, 'stop')) / 2); |
2244
|
|
|
|
|
|
|
|
2245
|
|
|
|
|
|
|
# convert midpoint to fraction of 1000 for plotting if necessary |
2246
|
0
|
0
|
|
|
|
0
|
if (substr($self->name($column), -1) eq '%') { |
2247
|
0
|
|
|
|
|
0
|
$midpoint *= 10; # midpoint * 0.01 * 1000 bp |
2248
|
0
|
|
|
|
|
0
|
$do_percentile++; |
2249
|
|
|
|
|
|
|
} |
2250
|
0
|
0
|
0
|
|
|
0
|
if ($do_percentile and substr($self->name($column), -2) eq 'bp') { |
2251
|
|
|
|
|
|
|
# working on the extension after the percentile bins |
2252
|
0
|
|
|
|
|
0
|
$midpoint += 1000; |
2253
|
|
|
|
|
|
|
} |

            # collect the values in the column
            my @values;
            for my $row (1 .. $self->last_row) {
                my $v = $self->value($row, $column);
                push @values, $v eq '.' ? 0 : $v; # treat nulls as zero
            }

            # adjust if log value
            my $log = $self->metadata($column, 'log2') || 0;
            if ($log) {
                @values = map { 2 ** $_ } @values;
            }

            # determine mean value
            my $window_mean = sum0(@values) / scalar(@values);
            if ($log) {
                $window_mean = log($window_mean) / log(2);
            }
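
            # For example (hypothetical scores): log2 values of -1, 0, and 1
            # decode to 0.5, 1, and 2; their linear mean of ~1.167 re-encodes
            # as log2(1.167), roughly 0.22, rather than the naive mean of 0
            # taken directly on the log2 values.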

            # push to summed output
            if ($d == 0) {
                # this is the first dataset, so we need to add a row
                $summed_data->add_row(
                    [ $self->{$column}{'name'}, $midpoint, $window_mean ] );
            }
            else {
                # we're summarizing multiple datasets, so the name and midpoint
                # are already present; first do a sanity check
                if ($summed_data->value($row, 1) != $midpoint) {
                    carp("unable to summarize multiple datasets with nonequal columns of data!");
                    return;
                }
                $summed_data->value($row, $i, $window_mean);
            }
            $row++;
        }
    }
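
    # At this point the summary table holds one row per original window
    # column, with the window name, its (possibly rescaled) midpoint, and one
    # mean-score column per requested dataset.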

    # Write summed data
    $outfile =~ s/\.txt(\.gz)?$//i; # strip any .txt or .gz extensions if present
    my $written_file = $summed_data->write_file(
        'filename' => $outfile . '_summary.txt',
        'gz'       => 0,
    );
    return $written_file;
}



####################################################

package Bio::ToolBox::Data::Iterator;
use Bio::ToolBox::Data::Feature;
use Carp;

sub new {
    my ($class, $data) = @_;
    my %iterator = (
        'index' => 1,
        'data'  => $data,
    );
    return bless \%iterator, $class;
}

sub next_row {
    my $self = shift;
    return if $self->{'index'} > $self->{data}->{last_row}; # no more
    my $i = $self->{'index'};
    $self->{'index'}++;
    my @options = (
        'data'  => $self->{data},
        'index' => $i,
    );
    if (
        exists $self->{data}->{SeqFeatureObjects} and
        defined $self->{data}->{SeqFeatureObjects}->[$i]
    ) {
        push @options, 'feature', $self->{data}->{SeqFeatureObjects}->[$i];
    }
    return Bio::ToolBox::Data::Feature->new(@options);
}

sub row_index {
    my $self = shift;
    carp "row_index is a read only method" if @_;
    return $self->{'index'};
}
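
# A minimal usage sketch (illustrative only; $Data stands in for an existing
# Bio::ToolBox::Data object): next_row advances the internal index before
# returning, so the read-only row_index method reports the index of the row
# that the next call will deliver, not the row that was just returned.
#
#     my $stream = Bio::ToolBox::Data::Iterator->new($Data);
#     my $row    = $stream->next_row;   # Feature object for table row 1
#     my $i      = $stream->row_index;  # now 2, the index of the upcoming row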


####################################################

__END__