line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::DB::USeq; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.25'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Bio::DB::USeq - Read USeq archive database files |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Bio::DB::USeq; |
12
|
|
|
|
|
|
|
my $useq = Bio::DB::USeq->new('file.useq') or |
13
|
|
|
|
|
|
|
die "unable to open file.useq!\n"; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# sequence IDs |
16
|
|
|
|
|
|
|
my @seq_ids = $useq->seq_ids; |
17
|
|
|
|
|
|
|
my $length = $useq->length($chr); # approximate, not exact |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
### Retrieving features |
20
|
|
|
|
|
|
|
# all features or observations across chromosome I |
21
|
|
|
|
|
|
|
my @features = $useq->features( |
22
|
|
|
|
|
|
|
-seq_id => 'chrI', |
23
|
|
|
|
|
|
|
-type => 'region', |
24
|
|
|
|
|
|
|
); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# same thing using a simple form |
27
|
|
|
|
|
|
|
# use an array of (chromosome, start, stop, strand) |
28
|
|
|
|
|
|
|
my @features = $useq->features('chrI'); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# same thing with a memory efficient iterator |
31
|
|
|
|
|
|
|
my $iterator = $useq->get_seq_stream( |
32
|
|
|
|
|
|
|
-seq_id => 'chrI', |
33
|
|
|
|
|
|
|
); |
34
|
|
|
|
|
|
|
while (my $f = $iterator->next_seq) { |
35
|
|
|
|
|
|
|
# each feature $f supports most SeqFeatureI methods |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
### Retrieving simple scores |
40
|
|
|
|
|
|
|
my @scores = $useq->scores( |
41
|
|
|
|
|
|
|
-seq_id => 'chrI', |
42
|
|
|
|
|
|
|
-start => 1000, |
43
|
|
|
|
|
|
|
-end => 2000 |
44
|
|
|
|
|
|
|
); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
### Same methods as above after defining an interval first |
48
|
|
|
|
|
|
|
my $segment = $useq->segment( |
49
|
|
|
|
|
|
|
-seq_id => 'chrI', |
50
|
|
|
|
|
|
|
-start => 1000, |
51
|
|
|
|
|
|
|
-end => 2000, |
52
|
|
|
|
|
|
|
); |
53
|
|
|
|
|
|
|
my @scores = $segment->scores; |
54
|
|
|
|
|
|
|
my @features = $segment->features; |
55
|
|
|
|
|
|
|
my $iterator = $segment->get_seq_stream; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
### Efficient retrieval of positioned scores in 100 bins |
59
|
|
|
|
|
|
|
# compatible with Bio::Graphics |
60
|
|
|
|
|
|
|
my ($wig) = $useq->features( |
61
|
|
|
|
|
|
|
# assuming unstranded data here, otherwise two wiggle objects |
62
|
|
|
|
|
|
|
# would be returned, one for each strand |
63
|
|
|
|
|
|
|
-seq_id => 'chrI', |
64
|
|
|
|
|
|
|
-start => 1000, |
65
|
|
|
|
|
|
|
-end => 2000, |
66
|
|
|
|
|
|
|
-type => 'wiggle:100', |
67
|
|
|
|
|
|
|
); |
68
|
|
|
|
|
|
|
my @bins = $wig->wiggle; |
69
|
|
|
|
|
|
|
my @bins = $wig->coverage; # same thing |
70
|
|
|
|
|
|
|
my ($bins) = $wig->get_tag_values('coverage'); # same thing |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
### Statistical summaries of intervals |
74
|
|
|
|
|
|
|
# compatible with Bio::Graphics |
75
|
|
|
|
|
|
|
my ($summary) = $useq->features( |
76
|
|
|
|
|
|
|
# assuming unstranded data here, otherwise two summaries |
77
|
|
|
|
|
|
|
# would be returned, one for each strand |
78
|
|
|
|
|
|
|
-seq_id => 'chrI', |
79
|
|
|
|
|
|
|
-start => 1000, |
80
|
|
|
|
|
|
|
-end => 2000, |
81
|
|
|
|
|
|
|
-type => 'summary', |
82
|
|
|
|
|
|
|
); |
83
|
|
|
|
|
|
|
my $stats = $summary->statistical_summary(10); |
84
|
|
|
|
|
|
|
foreach (@$stats) { |
85
|
|
|
|
|
|
|
my $max = $_->{maxVal}; |
86
|
|
|
|
|
|
|
my $mean = Bio::DB::USeq::binMean($_); |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
### Stranded data using an iterator |
91
|
|
|
|
|
|
|
# could be used with either wiggle or summary features |
92
|
|
|
|
|
|
|
my $stream = $useq->get_seq_stream( |
93
|
|
|
|
|
|
|
-seq_id => 'chrI', |
94
|
|
|
|
|
|
|
-start => 1000, |
95
|
|
|
|
|
|
|
-end => 2000, |
96
|
|
|
|
|
|
|
-type => 'wiggle:100', |
97
|
|
|
|
|
|
|
); |
98
|
|
|
|
|
|
|
my ($forward, $reverse); |
99
|
|
|
|
|
|
|
if ($useq->stranded) { |
100
|
|
|
|
|
|
|
$forward = $stream->next_seq; |
101
|
|
|
|
|
|
|
$reverse = $stream->next_seq; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
else { |
104
|
|
|
|
|
|
|
$forward = $stream->next_seq; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head1 DESCRIPTION |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Bio::DB::USeq is a B<BioPerl> style adaptor for reading USeq files. USeq files |
111
|
|
|
|
|
|
|
are compressed, indexed data files supporting modern bioinformatic datasets, |
112
|
|
|
|
|
|
|
including genomic points, scores, and intervals. More information about the |
113
|
|
|
|
|
|
|
USeq software package can be found at L<https://github.com/HuntsmanCancerInstitute/USeq>, |
114
|
|
|
|
|
|
|
including a description of the USeq data |
115
|
|
|
|
|
|
|
L<archive format|https://github.com/HuntsmanCancerInstitute/USeq/blob/master/Documentation/USeqDocumentation/useqArchiveFormat.html>. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
USeq files are typically half the size of corresponding bigBed and bigWig files, |
118
|
|
|
|
|
|
|
due to a compact internal format and lack of internal zoom data. This adaptor, |
119
|
|
|
|
|
|
|
however, can still return statistics across different zoom levels in the same |
120
|
|
|
|
|
|
|
manner as big files, albeit at a cost of calculating these in realtime. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 Generating useq files |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
USeq files may be generated using tools from the USeq package, available at |
125
|
|
|
|
|
|
|
L<https://github.com/HuntsmanCancerInstitute/USeq>. They may be generated from |
126
|
|
|
|
|
|
|
native Bar files, text Wig files, text Bed files, and UCSC bigWig and bigBed file |
127
|
|
|
|
|
|
|
formats. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head2 Compatibility |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
The adaptor follows most conventions of other B<BioPerl>-style Bio::DB |
132
|
|
|
|
|
|
|
adaptors. Observations or features in the useq file archive are |
133
|
|
|
|
|
|
|
returned as SeqFeatureI compatible objects. |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Coordinates consumed and returned by the adaptor are 1-based, consistent |
136
|
|
|
|
|
|
|
with B<BioPerl> convention. This is not true of the useq file itself, which |
137
|
|
|
|
|
|
|
uses the interbase coordinate system. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Unlike wig and bigWig files, useq file archives support stranded data, |
140
|
|
|
|
|
|
|
which can make data collection much simpler for complex experiments. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
See below for GBrowse compatibility. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head2 Limitations |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
This adaptor is read only. USeq files, in general, are not modified |
147
|
|
|
|
|
|
|
or written with data. The exceptions are chromosome or global statistics |
148
|
|
|
|
|
|
|
are written to the F<archiveReadMe.txt> file inside the zip archive to |
149
|
|
|
|
|
|
|
cache for future queries. |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
No support for genomic sequence is included. Users who need access to |
152
|
|
|
|
|
|
|
genomic sequence should seek an alternative B<BioPerl> adaptor, such as |
153
|
|
|
|
|
|
|
L<Bio::DB::Fasta>. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Useq files do not have a native concept of type, primary_tag, or source |
156
|
|
|
|
|
|
|
attributes, as expected with GFF-based database adaptors. The features |
157
|
|
|
|
|
|
|
method does support special types (see below). |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head2 Requirements |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
The adaptor is a Perl-only implementation. It only requires the |
162
|
|
|
|
|
|
|
L<Archive::Zip> module for opening and reading the useq file archive. |
163
|
|
|
|
|
|
|
B<BioPerl> is required for working with SeqFeatures objects generated |
164
|
|
|
|
|
|
|
from useq file observations. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head1 METHODS |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head2 Initializing the Bio::DB::USeq object |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
These are class methods for creating and working with the Bio::DB::USeq |
171
|
|
|
|
|
|
|
object itself. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=over 4 |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item new |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=item new($file) |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=item new(-useq => $file) |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
This will create a new Bio::DB::USeq object. Optionally pass the path |
182
|
|
|
|
|
|
|
to a useq file archive to open immediately. There is not much to do |
183
|
|
|
|
|
|
|
unless you open a file. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Named arguments may be used to specify the file. Either -useq or -file |
186
|
|
|
|
|
|
|
may be used. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item open($file) |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Open a useq file archive. Useq files typically use the F<.useq> file |
191
|
|
|
|
|
|
|
extension. Returns true if successful. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
DO NOT open a subsequent useq file archive when one has already been |
194
|
|
|
|
|
|
|
opened. Please create a new object to open a new file. |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=item clone |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
Force the object to re-open the useq archive file under a new file |
199
|
|
|
|
|
|
|
handle to make it clone safe. Do this in the child process before |
200
|
|
|
|
|
|
|
collecting data. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=item zip |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Return the L<Archive::Zip> object representing the useq file archive. |
205
|
|
|
|
|
|
|
Generally not recommended unless you know what you are doing. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=back |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head2 General information about the useq file |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
These are class methods for obtaining general information or |
212
|
|
|
|
|
|
|
metadata attributes regarding the contents of the file. |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=over 4 |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item stranded |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Return true or false (1 or 0) indicating whether the contents of the |
219
|
|
|
|
|
|
|
useq file archive are recorded in a stranded fashion. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item attributes |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Return an array of available attribute keys that were recorded in the |
224
|
|
|
|
|
|
|
useq file F<archiveReadMe.txt> member. These are key = value pairs |
225
|
|
|
|
|
|
|
representing metadata for the useq file. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=item attribute($key) |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Return the metadata attribute value for the specified key. These |
230
|
|
|
|
|
|
|
are recorded in the useq file F<archiveReadMe.txt> member. Returns |
231
|
|
|
|
|
|
|
undefined if the key does not exist. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=item type |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
Return the useq file metadata C<dataType> value. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=item genome |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Return the useq file metadata C<versionedGenome> value. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=item version |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
Return the useq file metadata C<useqArchiveVersion> value. |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=back |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head2 Chromosome Information |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
These are methods to obtain information about the chromosomes or reference |
250
|
|
|
|
|
|
|
sequences represented in the file archive. |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
B<Note> that generating score statistics across one or more chromosomes may |
253
|
|
|
|
|
|
|
be computationally expensive. Therefore, chromosome statistics, once |
254
|
|
|
|
|
|
|
calculated, are cached in the useq file metadata for future reference. |
255
|
|
|
|
|
|
|
This necessitates writing to the useq zip archive. This is currently the |
256
|
|
|
|
|
|
|
only supported method for modifying the useq zip archive. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=over 4 |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=item seq_ids |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Return an array of the chromosome or sequence identifiers represented |
263
|
|
|
|
|
|
|
in the useq file archive. The names are sorted ASCIIbetically before |
264
|
|
|
|
|
|
|
they are returned. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item length($seq_id) |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
Return the length of the provided chromosome or sequence identifier. |
269
|
|
|
|
|
|
|
Note that this may not be the actual length in the reference genome, but |
270
|
|
|
|
|
|
|
rather the last recorded position of an observation for that chromosome. |
271
|
|
|
|
|
|
|
Hence, it should be used only as an approximation. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=item chr_mean($seq_id) |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Return the mean score value across the entire chromosome. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=item chr_stdev($seq_id) |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Return the score standard deviation across the entire chromosome. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=item chr_stats($seq_id) |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Return a statistical summary across the entire chromosome. This |
284
|
|
|
|
|
|
|
is an anonymous hash with five keys: validCount, sumData, |
285
|
|
|
|
|
|
|
sumSquares, minVal, and maxVal. These are equivalent to the |
286
|
|
|
|
|
|
|
statistical summaries generated by the Bio::DB::BigWig adaptor. |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=item global_mean |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
Return the mean score value across all chromosomes. |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=item global_stdev |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Return the mean score value across all chromosomes. |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=item global_stats |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Return a statistical summary across all chromosomes. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=back |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head2 Data accession |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
These are the primary methods for working with data contained in |
305
|
|
|
|
|
|
|
useq file archive. These should be familiar to most B<BioPerl> users. |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=over 4 |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=item features |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
Returns an array or array reference of SeqFeatureI compatible |
312
|
|
|
|
|
|
|
objects overlapping a given genomic interval. |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Coordinates of the interrogated regions must be supplied. At a |
315
|
|
|
|
|
|
|
minimum, the seq_id must be supplied. A start of 1 and an end |
316
|
|
|
|
|
|
|
corresponding to the length of the seq_id is used if not directly |
317
|
|
|
|
|
|
|
provided. Coordinates may be specified in two different manners, |
318
|
|
|
|
|
|
|
either as a list of (seq_id, start, end, strand) or as one or |
319
|
|
|
|
|
|
|
more keyed values. |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
my @features = $useq->features($seq_id, $start, $end); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
@features = $useq->features( |
324
|
|
|
|
|
|
|
-seq_id => $seq_id, |
325
|
|
|
|
|
|
|
-start => $start, |
326
|
|
|
|
|
|
|
-end => $end, |
327
|
|
|
|
|
|
|
); |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
If the -iterator argument is supplied with a true value, then an |
330
|
|
|
|
|
|
|
iterator object is returned. See get_seq_stream() for details. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
Bio::DB::USeq supports four different feature types. Feature |
333
|
|
|
|
|
|
|
types may be specified using the -type argument. |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=over 4 |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=item region |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
=item interval |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=item observation |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
The default feature type if the type argument is not specified. |
344
|
|
|
|
|
|
|
These are SeqFeatureI compatible objects representing observations |
345
|
|
|
|
|
|
|
in the useq file archive. These are compatible with the iterator. |
346
|
|
|
|
|
|
|
SeqFeature observations are returned in genomic order. |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item chromosome |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Returns SeqFeatureI compatible objects representing the |
351
|
|
|
|
|
|
|
reference sequences (chromosomes) listed in the useq file |
352
|
|
|
|
|
|
|
archive. These are not compatibile with the iterator. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=item wiggle |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=item wiggle:$bins |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=item coverage |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=item coverage:$bins |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
Returns an array of SeqFeatureI compatible objects for each |
363
|
|
|
|
|
|
|
strand requested. If the useq file contains stranded data, |
364
|
|
|
|
|
|
|
and no strand is requested, then two objects will be |
365
|
|
|
|
|
|
|
returned representing each strand. |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Each object contains an array representing scores across |
368
|
|
|
|
|
|
|
the requested coordinates. This object is designed to be |
369
|
|
|
|
|
|
|
backwards compatible with coverage features from the |
370
|
|
|
|
|
|
|
L<Bio::DB::Sam> adaptor for use with L<Bio::Graphics> and GBrowse. |
371
|
|
|
|
|
|
|
Note that only scores are returned, not a true depth coverage |
372
|
|
|
|
|
|
|
in the sense of the L<Bio::DB::Sam> coverage types. |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
By default, the wiggle or coverage array is provided at 1 |
375
|
|
|
|
|
|
|
bp resolution. To improve efficiency with large regions, |
376
|
|
|
|
|
|
|
the wiggle array may be limited by using a bin option, |
377
|
|
|
|
|
|
|
where the interrogated interval is divided into the number |
378
|
|
|
|
|
|
|
of bins requested. |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
To retrieve the scores, call the wiggle() or coverage() method. |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
For example, to request wiggle scores in 100 equal bins across |
383
|
|
|
|
|
|
|
the interval, see the following example. The wiggle and |
384
|
|
|
|
|
|
|
coverage types are synonymous. |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
my ($wiggle) = $useq->features( |
387
|
|
|
|
|
|
|
-seq_id => $chromosome, |
388
|
|
|
|
|
|
|
-start => $start, |
389
|
|
|
|
|
|
|
-end => $end, |
390
|
|
|
|
|
|
|
-type => 'wiggle:100', |
391
|
|
|
|
|
|
|
); |
392
|
|
|
|
|
|
|
my @scores = $wiggle->wiggle; |
393
|
|
|
|
|
|
|
@scores = $wiggle->coverage; |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
Wiggle objects may also be obtained with a get_seq_stream() |
396
|
|
|
|
|
|
|
iterator objects. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=item summary |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=item summary:$bins |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
Returns an array of SeqFeatureI compatibile Summary objects |
403
|
|
|
|
|
|
|
for each strand requested. If the useq file contains stranded |
404
|
|
|
|
|
|
|
data, and no strand is requested, then two objects will be |
405
|
|
|
|
|
|
|
returned representing each strand. |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Each Summary object can then be used to call statistical |
408
|
|
|
|
|
|
|
summaries for one or more bins across the interval. Each |
409
|
|
|
|
|
|
|
statistical summary is an anonymous hash with five keys: |
410
|
|
|
|
|
|
|
validCount, sumData, sumSquares, minVal, and maxVal. From |
411
|
|
|
|
|
|
|
these values, a mean and standard deviation may also be |
412
|
|
|
|
|
|
|
calculated. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
my ($summary) = $useq->features( |
415
|
|
|
|
|
|
|
-seq_id => $chromosome, |
416
|
|
|
|
|
|
|
-start => $start, |
417
|
|
|
|
|
|
|
-end => $end, |
418
|
|
|
|
|
|
|
-type => 'summary', |
419
|
|
|
|
|
|
|
); |
420
|
|
|
|
|
|
|
my @stats = $summary->statistical_summary(100); |
421
|
|
|
|
|
|
|
foreach my $stat (@stats) { |
422
|
|
|
|
|
|
|
my $count = $stat->{validCount}; |
423
|
|
|
|
|
|
|
my $sum = $stat->{sumData}; |
424
|
|
|
|
|
|
|
my $mean = $sum / $count; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
Statistical summaries are equivalent to those generated by the |
428
|
|
|
|
|
|
|
L<Bio::DB::BigWig> adaptor and may be used interchangeably. They |
429
|
|
|
|
|
|
|
are compatible with the L<Bio::Graphics> modules. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Summary objects may also be obtained with a get_seq_stream() |
432
|
|
|
|
|
|
|
iterator object. |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=back |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item get_seq_stream |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
This is a memory efficient data accessor. An iterator object is |
439
|
|
|
|
|
|
|
returned for an interval specified using coordinate values in the |
440
|
|
|
|
|
|
|
same manner as features(). Call the method next_seq() on the |
441
|
|
|
|
|
|
|
iterator object to retrieve the observation SeqFeature objects |
442
|
|
|
|
|
|
|
one at a time. The iterator is compatible with region, wiggle, |
443
|
|
|
|
|
|
|
and summary feature types. |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# establish the iterator object |
446
|
|
|
|
|
|
|
my $iterator = $useq->get_seq_stream( |
447
|
|
|
|
|
|
|
-seq_id => $seq_id, |
448
|
|
|
|
|
|
|
-start => $start, |
449
|
|
|
|
|
|
|
-end => $end, |
450
|
|
|
|
|
|
|
-type => 'region', |
451
|
|
|
|
|
|
|
); |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# retrieve the features one at a time |
454
|
|
|
|
|
|
|
while (my $f = $iterator->next_seq) { |
455
|
|
|
|
|
|
|
# each feature $f is either a |
456
|
|
|
|
|
|
|
# Bio::DB::USeq::Feature, |
457
|
|
|
|
|
|
|
# a Bio::DB::USeq::Wiggle, or |
458
|
|
|
|
|
|
|
# a Bio::DB::USeq::Summary object |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=item scores |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
This is a simplified data accessor that only returns the score |
464
|
|
|
|
|
|
|
values overlapping an interrogated interval, rather than |
465
|
|
|
|
|
|
|
assembling each observation into a SeqFeature object. The scores |
466
|
|
|
|
|
|
|
are not associated with genomic coordinates, and are not guaranteed |
467
|
|
|
|
|
|
|
to be returned in original genomic order. |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Provide the interval coordinates in the same manner as the |
470
|
|
|
|
|
|
|
features() method. Stranded data collection is supported. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
my @scores = $useq->scores( |
473
|
|
|
|
|
|
|
-seq_id => $seq_id, |
474
|
|
|
|
|
|
|
-start => $start, |
475
|
|
|
|
|
|
|
-end => $end, |
476
|
|
|
|
|
|
|
); |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=item observations |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
This is a low-level data accessor, similar to features() but returning |
481
|
|
|
|
|
|
|
an array of references to the original USeq observations. Each USeq |
482
|
|
|
|
|
|
|
observation is an anonymous array reference of 2-4 elements: start, stop, |
483
|
|
|
|
|
|
|
score, and text, depending on the stored data type. All coordinates are in |
484
|
|
|
|
|
|
|
0-based coordinates, unlike high level accessors. Note that while strand is |
485
|
|
|
|
|
|
|
supported, it is not reported for each observation. If observations exist on |
486
|
|
|
|
|
|
|
both strands, then each strand should be searched separately. Observations |
487
|
|
|
|
|
|
|
are not guaranteed to be returned in genomic order. |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
my @observations = $useq->observations( |
490
|
|
|
|
|
|
|
-seq_id => $seq_id, |
491
|
|
|
|
|
|
|
-start => $start, |
492
|
|
|
|
|
|
|
-end => $end, |
493
|
|
|
|
|
|
|
); |
494
|
|
|
|
|
|
|
foreach my $obs (@observations) { |
495
|
|
|
|
|
|
|
my $start = $obs->[0] + 1; # convert to 1-based coordinate |
496
|
|
|
|
|
|
|
my $stop = $obs->[1]; |
497
|
|
|
|
|
|
|
my $score = $obs->[2]; # may not be present |
498
|
|
|
|
|
|
|
my $text = $obs->[3]; # may not be present |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item segment |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
This returns a L<Bio::DB::USeq::Segment> object, which is a SeqFeatureI |
504
|
|
|
|
|
|
|
compatible segment object corresponding to the specified coordinates. |
505
|
|
|
|
|
|
|
From this object, one can call the features(), scores(), or |
506
|
|
|
|
|
|
|
get_seq_stream() methods directly. Keyed options or location |
507
|
|
|
|
|
|
|
information need not be provided. Stranded segments are |
508
|
|
|
|
|
|
|
supported. |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
my $segment = $useq->segment( |
511
|
|
|
|
|
|
|
-seq_id => $seq_id, |
512
|
|
|
|
|
|
|
-start => $start, |
513
|
|
|
|
|
|
|
-end => $end, |
514
|
|
|
|
|
|
|
); |
515
|
|
|
|
|
|
|
my @scores = $segment->scores; |
516
|
|
|
|
|
|
|
my @features = $segment->features('wiggle:100'); |
517
|
|
|
|
|
|
|
my $iterator = $segment->get_seq_stream('region'); |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=item get_features_by_location |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
Convenience method for returning features restricted by location. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item get_feature_by_id |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=item get_feature_by_name |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
Compatibility methods for returning a specific feature or |
528
|
|
|
|
|
|
|
observation in the USeq file. Text fields, if present, are not |
529
|
|
|
|
|
|
|
indexed in the USeq file, preventing efficient searching of names. |
530
|
|
|
|
|
|
|
As a workaround, an ID or name comprised of "$seq_id:$start:$end" |
531
|
|
|
|
|
|
|
may be used, although a direct search of coordinates would be |
532
|
|
|
|
|
|
|
more efficient. |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
my $feature = $useq->get_feature_by_id("$seq_id:$start:$end"); |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=back |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=head1 ADDITIONAL CLASSES |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
These are additional class object that may be returned by various |
542
|
|
|
|
|
|
|
methods above. |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Feature objects |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
These are SeqFeatureI compliant objects returned by the features() |
547
|
|
|
|
|
|
|
or next_seq() methods. They support the following methods. |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
seq_id |
550
|
|
|
|
|
|
|
start |
551
|
|
|
|
|
|
|
end |
552
|
|
|
|
|
|
|
strand |
553
|
|
|
|
|
|
|
score |
554
|
|
|
|
|
|
|
type |
555
|
|
|
|
|
|
|
source (returns the useq archive filename) |
556
|
|
|
|
|
|
|
name (chromosome:start..stop) |
557
|
|
|
|
|
|
|
Bio::RangeI methods |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
Additionally, chromosome and global statistics are also available |
560
|
|
|
|
|
|
|
from any feature, as well as from Segment, Wiggle, Iterator, and |
561
|
|
|
|
|
|
|
Summary objects. See the corresponding USeq methods for details. |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Segment objects |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
This is a SeqFeatureI compliant object representing a genomic segment |
566
|
|
|
|
|
|
|
or interval. These support the following methods. |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=over 4 |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=item features |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=item features($type) |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=item get_seq_stream |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=item get_seq_stream($type) |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=item scores |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=item observations |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
Direct methods for returning features or scores. Coordinate information |
583
|
|
|
|
|
|
|
need not be provided. See the corresponding Bio::DB::USeq methods for |
584
|
|
|
|
|
|
|
more information. |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
=item wiggle |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=item wiggle($bins) |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=item coverage |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=item coverage($bins) |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=item statistical_summary |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=item statistical_summary($bins) |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
Convenience methods for returning wiggle (coverage) or summary features |
599
|
|
|
|
|
|
|
over the segment. If desired, the number of bins may be specified. See |
600
|
|
|
|
|
|
|
the features() method for more information. |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=item slices |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
Returns an array of splice member names that overlap this segment. |
605
|
|
|
|
|
|
|
See L<USEQ SLICES> for more information. |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=back |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Iterator objects |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
This is an iterator object for retrieving useq observation SeqFeatureI |
612
|
|
|
|
|
|
|
objects in a memory efficient manner by returning one feature at a time. |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
=over 4 |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=item next_seq |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=item next_feature |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
Returns the next feature present in the interrogated interval. Features |
621
|
|
|
|
|
|
|
are generally returned in ascending coordinate order. Returns undefined |
622
|
|
|
|
|
|
|
when no features are remaining in the interval. Features may include |
623
|
|
|
|
|
|
|
either region or wiggle types, depending on how the iterator object was |
624
|
|
|
|
|
|
|
established. See features() and get_seq_stream() methods for more |
625
|
|
|
|
|
|
|
information. |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=back |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Wiggle objects |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
These are SeqFeatureI compliant objects for backwards compatibility with |
632
|
|
|
|
|
|
|
L<Bio::Graphics> and GBrowse. They support the wiggle() and coverage() |
633
|
|
|
|
|
|
|
methods, which returns an array of scores over the interrogated region. By |
634
|
|
|
|
|
|
|
default, the array is equal to the length of the region (1 bp resolution), |
635
|
|
|
|
|
|
|
or may be limited to a specified number of bins for efficiency. See the |
636
|
|
|
|
|
|
|
features() method for more information. |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=over 4 |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=item wiggle |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=item coverage |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
The scores are stored as an array in the coverage attribute. For |
645
|
|
|
|
|
|
|
convenience, the wiggle() and coverage() methods may be used to retrieve |
646
|
|
|
|
|
|
|
the array or array reference of scores. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=item statistical_summary |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
Generate a statistical summary hash for the collected wiggle scores |
651
|
|
|
|
|
|
|
(not the original data). This method is not entirely that useful; best |
652
|
|
|
|
|
|
|
to use the summary feature type in the first place. |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=item chromosome and global statistics |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
Chromosome and global statistics, including mean and standard deviation, |
657
|
|
|
|
|
|
|
are available from wiggle objects. See the corresponding USeq methods |
658
|
|
|
|
|
|
|
for details. |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=back |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Summary objects |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
These are SeqFeatureI compliant Summary objects, similar to those |
665
|
|
|
|
|
|
|
generated by the L<Bio::DB::BigWig> database adaptor. As such, they are |
666
|
|
|
|
|
|
|
compatible with L<Bio::Graphics> and GBrowse. |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
Summary objects can generate statistical summaries over a specified |
669
|
|
|
|
|
|
|
number of bins (default is 1 bin, or the entire requested region). |
670
|
|
|
|
|
|
|
Each statistical summary is an anonymous hash consisting of five |
671
|
|
|
|
|
|
|
keys: validCount, sumData, sumSquares, minVal, and maxVal. From |
672
|
|
|
|
|
|
|
these values, a mean and standard deviation may be calculated. |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
For convenience, three exported functions are available for calculating |
675
|
|
|
|
|
|
|
the mean and standard deviation from a statistical summary hash. |
676
|
|
|
|
|
|
|
See L<EXPORTED FUNCTIONS> for more information. |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
Use statistical summaries in the following manner. |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
my $stats = $summary->statistical_summary(10); |
681
|
|
|
|
|
|
|
my $stat = shift @$stats; |
682
|
|
|
|
|
|
|
my $min = $stat->{minVal}; |
683
|
|
|
|
|
|
|
my $max = $stat->{maxVal}; |
684
|
|
|
|
|
|
|
my $mean = $stat->{sumData} / $stat->{validCount}; |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
=over 4 |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
=item statistical_summary |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
=item statistical_summary($bins) |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
Generate a statistical summary hash for one or more bins across the |
693
|
|
|
|
|
|
|
interrogated region. Provide the number of bins desired. If a feature |
694
|
|
|
|
|
|
|
type of "summary:$bins" is requested through the features() or |
695
|
|
|
|
|
|
|
get_seq_stream() method, then $bins number of bins will be used. |
696
|
|
|
|
|
|
|
The default number of bins is 1. |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
=item score |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
Generate a single statistical summary over the entire region. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=item chromosome and global statistics |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
Chromosome and global statistics, including mean and standard deviation, |
705
|
|
|
|
|
|
|
are available from summary objects. See the corresponding USeq methods |
706
|
|
|
|
|
|
|
for details. |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=back |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head1 EXPORTED FUNCTIONS |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
Three subroutine functions are available for export to assist |
713
|
|
|
|
|
|
|
in calculating the mean, variance, and standard deviation from |
714
|
|
|
|
|
|
|
statistical summaries. These functions are identical to those |
715
|
|
|
|
|
|
|
from the L<Bio::DB::BigWig> adaptor and may be used interchangeably. |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
They are not exported by default; they must explicitly listed. |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
use Bio::DB::USeq qw(binMean binStdev); |
720
|
|
|
|
|
|
|
my $stats = $summary->statistical_summary(10); |
721
|
|
|
|
|
|
|
my $stat = shift @$stats; |
722
|
|
|
|
|
|
|
my $mean = binMean($stat); |
723
|
|
|
|
|
|
|
my $stdev = binStdev($stat); |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
=over 4 |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=item binMean( $stat ) |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
Calculate the mean from a statistical summary anonymous hash. |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item binVariance( $stat ) |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
Calculate the variance from a statistical summary anonymous hash. |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=item binStdev( $stat ) |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
Calculate the standard deviation from a statistical summary anonymous hash. |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
=back |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=head1 USEQ SLICES |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
Genomic observations are recorded in groups, called slices, of |
744
|
|
|
|
|
|
|
usually 10000 observations at a time. Each slice is a separate |
745
|
|
|
|
|
|
|
zip file member in the useq file archive. These methods are for |
746
|
|
|
|
|
|
|
accessing information about each slice. In general, accessing |
747
|
|
|
|
|
|
|
data through slices is a lower level operation. Users should |
748
|
|
|
|
|
|
|
preferentially use the main data accessors. |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
The following are Bio::DB::USeq methods available for working |
751
|
|
|
|
|
|
|
with slices. |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=over 4 |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=item slices |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
Returns an array of all the slice member names present in the |
758
|
|
|
|
|
|
|
useq archive file. |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=item slice_feature($slice) |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
Return a L<Bio::DB::USeq::Segment> object representing the slice interval. |
763
|
|
|
|
|
|
|
The features(), get_seq_stream(), and scores() methods are supported. |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item slice_seq_id($slice) |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Return the chromosome or sequence identifier associated with a slice. |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=item slice_start($slice) |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
Return the start position of the slice interval. |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=item slice_end($slice) |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
Return the end position of the slice interval. |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=item slice_strand($slice) |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Return the strand of the slice interval. |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
=item slice_type($slice) |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
Return the file type of the slice member. This corresponds to the |
784
|
|
|
|
|
|
|
file extension of the slice zip member and indicates how to parse |
785
|
|
|
|
|
|
|
the binary member. Each letter in the type corresponds to a data |
786
|
|
|
|
|
|
|
type, be it integer, short, floating-point, or text. See the |
787
|
|
|
|
|
|
|
useq file documentation for more details. |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
=item slice_obs_number($slice) |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
Return the number of observations recorded in the slice interval. |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
=back |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
=head1 GBROWSE COMPATIBILITY |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
The USeq adaptor has support for L<Bio::Graphics> and GBrowse. |
798
|
|
|
|
|
|
|
It will work with the segments glyph for intervals, |
799
|
|
|
|
|
|
|
the wiggle_xyplot glyph for displaying mean scores, and the |
800
|
|
|
|
|
|
|
wiggle_whiskers glyph for displaying detailed statistics. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Initialize the USeq database adaptor. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
[data1:database] |
805
|
|
|
|
|
|
|
db_adaptor = Bio::DB::USeq |
806
|
|
|
|
|
|
|
db_args = -file /path/to/data1.useq |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
Displaying simple intervals with the segments glyph. |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
[data1_segments] |
811
|
|
|
|
|
|
|
database = data1 |
812
|
|
|
|
|
|
|
feature = region |
813
|
|
|
|
|
|
|
glyph = segments |
814
|
|
|
|
|
|
|
stranded = 1 |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
Displaying scores using the wiggle_xyplot glyph. |
817
|
|
|
|
|
|
|
You may set the bins to whatever number is appropriate (in |
818
|
|
|
|
|
|
|
this example, 1000), or leave blank (not recommended, |
819
|
|
|
|
|
|
|
defaults to 1 bp resolution). |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
[data1_xyplot] |
822
|
|
|
|
|
|
|
database = data1 |
823
|
|
|
|
|
|
|
feature = wiggle:1000 |
824
|
|
|
|
|
|
|
glyph = wiggle_xyplot |
825
|
|
|
|
|
|
|
graph_type = histogram |
826
|
|
|
|
|
|
|
autoscale = chromosome |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
Displaying scores using the wiggle_whiskers glyph. Note that |
829
|
|
|
|
|
|
|
generating statistical summaries are computationally more expensive |
830
|
|
|
|
|
|
|
than simple bins of mean values as with the wiggle feature type. |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
[data1_whiskers] |
833
|
|
|
|
|
|
|
database = data1 |
834
|
|
|
|
|
|
|
feature = summary |
835
|
|
|
|
|
|
|
glyph = wiggle_whiskers |
836
|
|
|
|
|
|
|
graph_type = histogram |
837
|
|
|
|
|
|
|
autoscale = chromosome |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=head1 PERFORMANCE |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
Because the Bio::DB::USeq is implemented as a Perl-only module, |
842
|
|
|
|
|
|
|
performance is subject to the limitations of Perl execution itself and |
843
|
|
|
|
|
|
|
the size of the data that needs to be parsed. In general when collecting |
844
|
|
|
|
|
|
|
score data, requesting scores is the fastest mode of operation, followed |
845
|
|
|
|
|
|
|
by wiggle feature types, and finally summary feature types. |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
In comparison to UCSC bigWig files, the USeq format is typically much |
848
|
|
|
|
|
|
|
faster when viewing intervals where the entire interval is represented by |
849
|
|
|
|
|
|
|
one or a few internal slices. This is especially true for repeated queries |
850
|
|
|
|
|
|
|
over the same or neighboring intervals, as the slice contents are retained |
851
|
|
|
|
|
|
|
in memory. As the number of internal slices that must be loaded into memory |
852
|
|
|
|
|
|
|
increases, for example querying intervals of hundreds of kilobases in size, |
853
|
|
|
|
|
|
|
performance will begin to lag as each internal slice must be parsed into |
854
|
|
|
|
|
|
|
memory. This is where the UCSC bigWig file format with internal zoom levels |
855
|
|
|
|
|
|
|
of summary statistics can outperform, at the cost of file complexity and size. |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
=cut |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
require 5.010000; |
861
|
1
|
|
|
1
|
|
8795
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
862
|
1
|
|
|
1
|
|
6
|
use Carp qw(carp cluck croak confess); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
74
|
|
863
|
1
|
|
|
1
|
|
672
|
use Archive::Zip qw( :ERROR_CODES ); |
|
1
|
|
|
|
|
95370
|
|
|
1
|
|
|
|
|
106
|
|
864
|
1
|
|
|
1
|
|
448
|
use Set::IntervalTree; |
|
1
|
|
|
|
|
8057
|
|
|
1
|
|
|
|
|
74
|
|
865
|
1
|
|
|
1
|
|
9
|
use File::Spec; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
866
|
|
|
|
|
|
|
|
867
|
1
|
|
|
1
|
|
5
|
use base 'Exporter'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
9010
|
|
868
|
|
|
|
|
|
|
our @EXPORT_OK = qw(binMean binVariance binStdev); |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
1; |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
#### USeq initialization #### |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
sub new { |
875
|
2
|
|
|
2
|
1
|
221
|
my $class = shift; |
876
|
2
|
|
|
|
|
22
|
my $self = { |
877
|
|
|
|
|
|
|
'name' => undef, |
878
|
|
|
|
|
|
|
'zip' => undef, |
879
|
|
|
|
|
|
|
'stranded' => undef, |
880
|
|
|
|
|
|
|
'seq_ids' => {}, |
881
|
|
|
|
|
|
|
'metadata' => {}, |
882
|
|
|
|
|
|
|
'coord2file' => {}, |
883
|
|
|
|
|
|
|
'file2attribute' => {}, |
884
|
|
|
|
|
|
|
'buffer' => {}, |
885
|
|
|
|
|
|
|
}; |
886
|
2
|
|
|
|
|
6
|
bless $self, $class; |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
# check for arguments |
889
|
2
|
|
|
|
|
4
|
my %args; |
890
|
2
|
50
|
|
|
|
8
|
if (@_) { |
891
|
2
|
100
|
|
|
|
13
|
if ($_[0] =~ /^-/) { |
892
|
1
|
|
|
|
|
5
|
%args = @_; |
893
|
|
|
|
|
|
|
} |
894
|
|
|
|
|
|
|
else { |
895
|
1
|
|
|
|
|
15
|
$args{-file} = shift; |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
} |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
# open file |
900
|
2
|
|
0
|
|
|
7
|
my $file = $args{-file} || $args{-useq} || undef; |
901
|
2
|
50
|
|
|
|
8
|
if ($file) { |
902
|
2
|
50
|
|
|
|
11
|
return unless ($self->open($file)); # open must return true |
903
|
|
|
|
|
|
|
} |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
# done |
906
|
2
|
|
|
|
|
8
|
return $self; |
907
|
|
|
|
|
|
|
} |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
sub open { |
910
|
2
|
|
|
2
|
1
|
4
|
my $self = shift; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
# check file |
913
|
2
|
|
|
|
|
5
|
my $filename = shift; |
914
|
2
|
50
|
|
|
|
6
|
unless ($filename) { |
915
|
0
|
|
|
|
|
0
|
cluck("no file name passed!\n"); |
916
|
0
|
|
|
|
|
0
|
return; |
917
|
|
|
|
|
|
|
} |
918
|
2
|
50
|
|
|
|
11
|
unless ($filename =~ /\.useq$/i) { |
919
|
0
|
|
|
|
|
0
|
carp "'$filename' is not a .useq archive file!\n"; |
920
|
0
|
|
|
|
|
0
|
return; |
921
|
|
|
|
|
|
|
} |
922
|
2
|
50
|
|
|
|
9
|
if ($self->slices) { |
923
|
0
|
|
|
|
|
0
|
cluck "Only load one useq file per object!\n"; |
924
|
0
|
|
|
|
|
0
|
return; |
925
|
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
# open the archive |
928
|
2
|
|
|
|
|
17
|
my $zip = Archive::Zip->new(); |
929
|
2
|
|
|
|
|
74
|
my $error = $zip->read($filename); |
930
|
2
|
50
|
|
|
|
3114
|
unless ($error == AZ_OK) { |
931
|
0
|
|
|
|
|
0
|
carp " unable to read USeq archive '$filename'! Error $error\n"; |
932
|
0
|
|
|
|
|
0
|
return; |
933
|
|
|
|
|
|
|
} |
934
|
2
|
|
|
|
|
6
|
$self->{'zip'} = $zip; |
935
|
2
|
|
|
|
|
50
|
(undef, undef, $self->{'name'}) = File::Spec->splitpath($filename); |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
# parse the contents |
938
|
2
|
50
|
|
|
|
10
|
return unless ($self->_parse_members); |
939
|
|
|
|
|
|
|
# we delay parsing metadata unless it is requested |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
# success |
942
|
2
|
|
|
|
|
6
|
return 1; |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
sub clone { |
946
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
947
|
1
|
50
|
|
|
|
6
|
return unless $self->{'zip'}; |
948
|
1
|
|
|
|
|
4
|
my $file = $self->zip->fileName; |
949
|
1
|
|
|
|
|
17
|
my $zip = Archive::Zip->new(); |
950
|
1
|
|
|
|
|
32
|
my $error = $zip->read($file); |
951
|
1
|
50
|
|
|
|
1560
|
unless ($error == AZ_OK) { |
952
|
0
|
|
|
|
|
0
|
carp " unable to read USeq archive '$file'! Error $error\n"; |
953
|
0
|
|
|
|
|
0
|
return; |
954
|
|
|
|
|
|
|
} |
955
|
1
|
|
|
|
|
29
|
$self->{'zip'} = $zip; |
956
|
1
|
|
|
|
|
5
|
return 1; |
957
|
|
|
|
|
|
|
} |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
sub zip { |
960
|
9
|
|
|
9
|
1
|
57
|
return shift->{'zip'}; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
sub name { |
964
|
4
|
|
|
4
|
0
|
40
|
return shift->{'name'}; |
965
|
|
|
|
|
|
|
} |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
#### General USeq information #### |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
sub seq_ids { |
972
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
973
|
1
|
|
|
|
|
2
|
return sort {$a cmp $b} keys %{ $self->{'seq_ids'} }; |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
7
|
|
974
|
|
|
|
|
|
|
} |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
sub length { |
977
|
8
|
|
|
8
|
1
|
72
|
my $self = shift; |
978
|
8
|
50
|
|
|
|
25
|
my $seq_id = shift or return; |
979
|
8
|
50
|
|
|
|
34
|
if (exists $self->{'seq_ids'}{$seq_id}) { |
980
|
8
|
|
|
|
|
38
|
return $self->{'seq_ids'}{$seq_id}; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
sub stranded { |
985
|
8
|
|
|
8
|
1
|
53
|
return shift->{'stranded'}; |
986
|
|
|
|
|
|
|
} |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
sub attributes { |
989
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
990
|
0
|
0
|
|
|
|
0
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
0
|
|
|
|
|
0
|
|
991
|
0
|
|
|
|
|
0
|
return (sort {$a cmp $b} keys %{ $self->{'metadata'} }); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
sub attribute { |
995
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
996
|
1
|
|
|
|
|
3
|
my $key = shift; |
997
|
1
|
50
|
|
|
|
3
|
return $self->attributes unless $key; |
998
|
1
|
50
|
|
|
|
2
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
1
|
|
|
|
|
7
|
|
999
|
1
|
50
|
|
|
|
4
|
if (exists $self->{'metadata'}{$key}) { |
1000
|
1
|
|
|
|
|
7
|
return $self->{'metadata'}{$key}; |
1001
|
|
|
|
|
|
|
} |
1002
|
0
|
|
|
|
|
0
|
return; |
1003
|
|
|
|
|
|
|
} |
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
sub type { |
1006
|
1
|
|
|
1
|
1
|
107
|
return shift->attribute('dataType'); |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
sub version { |
1010
|
0
|
|
|
0
|
1
|
0
|
return shift->attribute('useqArchiveVersion'); |
1011
|
|
|
|
|
|
|
} |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
sub genome { |
1014
|
0
|
|
|
0
|
1
|
0
|
return shift->attribute('versionedGenome'); |
1015
|
|
|
|
|
|
|
} |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
sub chr_stats { |
1018
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
1019
|
1
|
50
|
|
|
|
4
|
my $seq_id = shift or return; |
1020
|
1
|
|
|
|
|
3
|
my $delay_write = shift; # option to delay rewriting the metadata |
1021
|
1
|
50
|
|
|
|
1
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
1
|
|
|
|
|
6
|
|
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
# return the chromosome stats |
1024
|
1
|
50
|
|
|
|
12
|
if (exists $self->{metadata}{"chromStats_$seq_id"}) { |
1025
|
0
|
|
|
|
|
0
|
my @data = split(',', $self->{metadata}{"chromStats_$seq_id"}); |
1026
|
0
|
|
|
|
|
0
|
my %stat = ( |
1027
|
|
|
|
|
|
|
'validCount' => $data[0], |
1028
|
|
|
|
|
|
|
'sumData' => $data[1], |
1029
|
|
|
|
|
|
|
'sumSquares' => $data[2], |
1030
|
|
|
|
|
|
|
'minVal' => $data[3], |
1031
|
|
|
|
|
|
|
'maxVal' => $data[4], |
1032
|
|
|
|
|
|
|
); |
1033
|
0
|
|
|
|
|
0
|
return \%stat; |
1034
|
|
|
|
|
|
|
} |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
# chromosome stats must be generated |
1037
|
1
|
|
|
|
|
14
|
my @slices = $self->_translate_coordinates_to_slices( |
1038
|
|
|
|
|
|
|
$seq_id, 1, $self->length($seq_id), 0); |
1039
|
1
|
|
|
|
|
5
|
$self->_clear_buffer(\@slices); |
1040
|
1
|
|
|
|
|
9
|
my $stat = $self->_stat_summary(1, $self->length($seq_id), \@slices); |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
# then associate with the metadata |
1043
|
1
|
|
|
|
|
7
|
$self->{'metadata'}{"chromStats_$seq_id"} = join(',', map { $stat->{$_} } |
|
5
|
|
|
|
|
44
|
|
1044
|
|
|
|
|
|
|
qw(validCount sumData sumSquares minVal maxVal) ); |
1045
|
1
|
50
|
|
|
|
9
|
$self->_rewrite_metadata unless $delay_write; |
1046
|
|
|
|
|
|
|
|
1047
|
1
|
|
|
|
|
6
|
return $stat; |
1048
|
|
|
|
|
|
|
} |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
sub chr_mean { |
1051
|
1
|
|
|
1
|
1
|
142
|
my $self = shift; |
1052
|
1
|
50
|
|
|
|
5
|
my $seq_id = shift or return; |
1053
|
1
|
|
|
|
|
4
|
return Bio::DB::USeq::binMean( $self->chr_stats($seq_id) ); |
1054
|
|
|
|
|
|
|
} |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
sub chr_stdev { |
1057
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1058
|
0
|
0
|
|
|
|
0
|
my $seq_id = shift or return; |
1059
|
0
|
|
|
|
|
0
|
return Bio::DB::USeq::binStdev( $self->chr_stats($seq_id) ); |
1060
|
|
|
|
|
|
|
} |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
sub global_stats { |
1063
|
|
|
|
|
|
|
# this is an expensive proposition, because it must parse through every |
1064
|
|
|
|
|
|
|
# slice in the archive |
1065
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1066
|
0
|
0
|
|
|
|
0
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
0
|
|
|
|
|
0
|
|
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
# return the chromosome stats |
1069
|
0
|
0
|
|
|
|
0
|
if (exists $self->{metadata}{globalStats}) { |
1070
|
0
|
|
|
|
|
0
|
my @data = split(',', $self->{metadata}{globalStats}); |
1071
|
0
|
|
|
|
|
0
|
my %stat = ( |
1072
|
|
|
|
|
|
|
'validCount' => $data[0], |
1073
|
|
|
|
|
|
|
'sumData' => $data[1], |
1074
|
|
|
|
|
|
|
'sumSquares' => $data[2], |
1075
|
|
|
|
|
|
|
'minVal' => $data[3], |
1076
|
|
|
|
|
|
|
'maxVal' => $data[4], |
1077
|
|
|
|
|
|
|
); |
1078
|
0
|
|
|
|
|
0
|
return \%stat; |
1079
|
|
|
|
|
|
|
} |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
# calculate new genome-wide statistics from individual chromosome stats |
1082
|
0
|
|
|
|
|
0
|
my $count = 0; |
1083
|
0
|
|
|
|
|
0
|
my $sum; |
1084
|
|
|
|
|
|
|
my $sum_squares; |
1085
|
0
|
|
|
|
|
0
|
my $min; |
1086
|
0
|
|
|
|
|
0
|
my $max; |
1087
|
0
|
|
|
|
|
0
|
foreach my $seq_id ($self->seq_ids) { |
1088
|
0
|
|
|
|
|
0
|
my $stats = $self->chr_stats($seq_id, 1); # delay writing metadata |
1089
|
0
|
|
|
|
|
0
|
$count += $stats->{validCount}; |
1090
|
0
|
|
|
|
|
0
|
$sum += $stats->{sumData}; |
1091
|
0
|
|
|
|
|
0
|
$sum_squares += $stats->{sumSquares}; |
1092
|
0
|
0
|
0
|
|
|
0
|
$min = $stats->{minVal} if (!defined $min or $stats->{minVal} < $min); |
1093
|
0
|
0
|
0
|
|
|
0
|
$max = $stats->{maxVal} if (!defined $max or $stats->{maxVal} < $max); |
1094
|
|
|
|
|
|
|
} |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
# assemble the statistical summary hash |
1097
|
0
|
|
0
|
|
|
0
|
my %stat = ( |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
1098
|
|
|
|
|
|
|
'validCount' => $count, |
1099
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
1100
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
1101
|
|
|
|
|
|
|
'minVal' => $min || 0, |
1102
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
1103
|
|
|
|
|
|
|
); |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
# update metadata |
1106
|
0
|
|
|
|
|
0
|
$self->{'metadata'}{'globalStats'} = join(',', map { $stat{$_} } |
|
0
|
|
|
|
|
0
|
|
1107
|
|
|
|
|
|
|
qw(validCount sumData sumSquares minVal maxVal) ); |
1108
|
0
|
|
|
|
|
0
|
$self->_rewrite_metadata; |
1109
|
|
|
|
|
|
|
|
1110
|
0
|
|
|
|
|
0
|
return \%stat; |
1111
|
|
|
|
|
|
|
} |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
sub global_mean { |
1114
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1115
|
0
|
|
|
|
|
0
|
return Bio::DB::USeq::binMean( $self->global_stats ); |
1116
|
|
|
|
|
|
|
} |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
sub global_stdev { |
1119
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1120
|
0
|
|
|
|
|
0
|
return Bio::DB::USeq::binStdev( $self->global_stats ); |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
#### slice information #### |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
sub slices { |
1126
|
2
|
|
|
2
|
1
|
4
|
my $self = shift; |
1127
|
2
|
|
|
|
|
3
|
return sort {$a cmp $b} keys %{ $self->{'file2attribute'} }; |
|
0
|
|
|
|
|
0
|
|
|
2
|
|
|
|
|
13
|
|
1128
|
|
|
|
|
|
|
} |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
sub slice_seq_id { |
1131
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1132
|
0
|
0
|
|
|
|
0
|
my $slice = shift or return; |
1133
|
0
|
|
|
|
|
0
|
return $self->{'file2attribute'}{$slice}->[0]; |
1134
|
|
|
|
|
|
|
} |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
sub slice_start { |
1137
|
1087
|
|
|
1087
|
1
|
1711
|
my $self = shift; |
1138
|
1087
|
50
|
|
|
|
2189
|
my $slice = shift or return; |
1139
|
1087
|
|
|
|
|
2424
|
return $self->{'file2attribute'}{$slice}->[1]; |
1140
|
|
|
|
|
|
|
} |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
sub slice_end { |
1143
|
2020
|
|
|
2020
|
1
|
2655
|
my $self = shift; |
1144
|
2020
|
50
|
|
|
|
3759
|
my $slice = shift or return; |
1145
|
2020
|
|
|
|
|
5941
|
return $self->{'file2attribute'}{$slice}->[2]; |
1146
|
|
|
|
|
|
|
} |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
sub slice_strand { |
1149
|
4
|
|
|
4
|
1
|
9
|
my $self = shift; |
1150
|
4
|
50
|
|
|
|
16
|
my $slice = shift or return; |
1151
|
4
|
|
|
|
|
23
|
return $self->{'file2attribute'}{$slice}->[3]; |
1152
|
|
|
|
|
|
|
} |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
sub slice_type { |
1155
|
1020
|
|
|
1020
|
1
|
1425
|
my $self = shift; |
1156
|
1020
|
50
|
|
|
|
2040
|
my $slice = shift or return; |
1157
|
1020
|
|
|
|
|
4047
|
return $self->{'file2attribute'}{$slice}->[4]; |
1158
|
|
|
|
|
|
|
} |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
sub slice_obs_number { |
1161
|
6
|
|
|
6
|
1
|
12
|
my $self = shift; |
1162
|
6
|
50
|
|
|
|
15
|
my $slice = shift or return; |
1163
|
6
|
|
|
|
|
19
|
return $self->{'file2attribute'}{$slice}->[5]; |
1164
|
|
|
|
|
|
|
} |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
sub slice_feature { |
1167
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1168
|
0
|
0
|
|
|
|
0
|
my $slice = shift or return; |
1169
|
|
|
|
|
|
|
return Bio::DB::USeq::Segment->new( |
1170
|
|
|
|
|
|
|
-seq_id => $self->{'file2attribute'}{$slice}->[0], |
1171
|
|
|
|
|
|
|
-start => $self->{'file2attribute'}{$slice}->[1], |
1172
|
|
|
|
|
|
|
-stop => $self->{'file2attribute'}{$slice}->[2], |
1173
|
|
|
|
|
|
|
-strand => $self->{'file2attribute'}{$slice}->[3], |
1174
|
0
|
|
|
|
|
0
|
-type => $self->{'file2attribute'}{$slice}->[4], |
1175
|
|
|
|
|
|
|
-source => $self->name, |
1176
|
|
|
|
|
|
|
-name => $slice, |
1177
|
|
|
|
|
|
|
); |
1178
|
|
|
|
|
|
|
} |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
#### Feature and data access #### |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
sub segment { |
1187
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
1188
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1190
|
1
|
|
|
|
|
4
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1191
|
1
|
50
|
|
|
|
4
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1192
|
|
|
|
|
|
|
|
1193
|
1
|
|
|
|
|
12
|
return Bio::DB::USeq::Segment->new( |
1194
|
|
|
|
|
|
|
-seq_id => $seq_id, |
1195
|
|
|
|
|
|
|
-start => $start, |
1196
|
|
|
|
|
|
|
-end => $stop, |
1197
|
|
|
|
|
|
|
-strand => $strand, |
1198
|
|
|
|
|
|
|
-type => 'segment', |
1199
|
|
|
|
|
|
|
-source => $self->name, |
1200
|
|
|
|
|
|
|
-useq => $self, |
1201
|
|
|
|
|
|
|
); |
1202
|
|
|
|
|
|
|
} |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
sub features { |
1205
|
2
|
|
|
2
|
1
|
344
|
my $self = shift; |
1206
|
2
|
|
|
|
|
10
|
my %args = @_; |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
# check for type |
1209
|
2
|
|
|
|
|
5
|
my $type; |
1210
|
2
|
|
0
|
|
|
7
|
$args{-type} ||= $args{-types} || $args{-primary_tag} || 'region'; |
|
|
|
33
|
|
|
|
|
1211
|
2
|
50
|
33
|
|
|
11
|
if (ref $args{-type} and ref $args{-type} eq 'ARRAY') { |
1212
|
0
|
|
0
|
|
|
0
|
$type = $args{-type}->[0] || 'region'; |
1213
|
0
|
|
|
|
|
0
|
$args{-type} = $type; |
1214
|
|
|
|
|
|
|
} |
1215
|
|
|
|
|
|
|
else { |
1216
|
2
|
|
|
|
|
6
|
$type = $args{-type}; |
1217
|
|
|
|
|
|
|
} |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# return an appropriate feature |
1220
|
2
|
50
|
|
|
|
20
|
if ($type =~ /chromosome/) { |
|
|
50
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
# compatibility to return chromosome features |
1222
|
|
|
|
|
|
|
my @chromos = map { |
1223
|
0
|
|
|
|
|
0
|
Bio::DB::USeq::Feature->new( |
|
0
|
|
|
|
|
0
|
|
1224
|
|
|
|
|
|
|
-seq_id => $_, |
1225
|
|
|
|
|
|
|
-start => 1, |
1226
|
|
|
|
|
|
|
-end => $self->length($_), |
1227
|
|
|
|
|
|
|
-type => $type, |
1228
|
|
|
|
|
|
|
-source => $self->name, |
1229
|
|
|
|
|
|
|
-name => $_, |
1230
|
|
|
|
|
|
|
) |
1231
|
|
|
|
|
|
|
} $self->seq_ids; |
1232
|
0
|
0
|
|
|
|
0
|
return wantarray ? @chromos : \@chromos; |
1233
|
|
|
|
|
|
|
} |
1234
|
|
|
|
|
|
|
elsif ($type =~ /region|interval|observation|coverage|wiggle|summary/) { |
1235
|
|
|
|
|
|
|
# region or segment are individual observation features |
1236
|
|
|
|
|
|
|
# coverage or wiggle are for efficient score retrieval with |
1237
|
|
|
|
|
|
|
# backwards compatibility with Bio::Graphics |
1238
|
|
|
|
|
|
|
# summary are statistical summaries akin to Bio::DB::BigFile |
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
# set up an iterator |
1241
|
2
|
|
|
|
|
13
|
my $iterator = $self->get_seq_stream(%args); |
1242
|
2
|
50
|
|
|
|
9
|
return unless $iterator; |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
# if user requested an iterator |
1245
|
2
|
0
|
33
|
|
|
6
|
if (exists $args{-iterator} and $args{-iterator}) { |
1246
|
0
|
|
|
|
|
0
|
return $iterator; |
1247
|
|
|
|
|
|
|
} |
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
# collect the features |
1250
|
2
|
|
|
|
|
4
|
my @features; |
1251
|
2
|
|
|
|
|
7
|
while (my $f = $iterator->next_seq) { |
1252
|
2
|
|
|
|
|
10
|
push @features, $f; |
1253
|
|
|
|
|
|
|
} |
1254
|
2
|
50
|
|
|
|
22
|
return wantarray ? @features : \@features; |
1255
|
|
|
|
|
|
|
} |
1256
|
|
|
|
|
|
|
else { |
1257
|
0
|
|
|
|
|
0
|
confess "unknown type request '$type'!\n"; |
1258
|
|
|
|
|
|
|
} |
1259
|
|
|
|
|
|
|
} |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
sub get_features_by_type { |
1263
|
|
|
|
|
|
|
# does not work without location information |
1264
|
0
|
|
|
0
|
0
|
0
|
cluck "please use features() method instead"; |
1265
|
0
|
|
|
|
|
0
|
return; |
1266
|
|
|
|
|
|
|
} |
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
sub get_features_by_location { |
1270
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1273
|
0
|
|
|
|
|
0
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1274
|
|
|
|
|
|
|
|
1275
|
0
|
|
|
|
|
0
|
return $self->features($seq_id, $start, $stop, $strand); |
1276
|
|
|
|
|
|
|
} |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
sub get_feature_by_id { |
1280
|
|
|
|
|
|
|
# much as Bio::DB::BigWig fakes the id, so we will too here |
1281
|
|
|
|
|
|
|
# I don't know how necessary this really is |
1282
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
# id will be encoded as chr:start:end ? |
1285
|
0
|
|
|
|
|
0
|
my $id = shift; |
1286
|
0
|
|
|
|
|
0
|
my ($seq_id, $start, $end, $type) = split /:/, $id; |
1287
|
|
|
|
|
|
|
|
1288
|
0
|
|
0
|
|
|
0
|
my @list = $self->features( |
1289
|
|
|
|
|
|
|
-seq_id => $seq_id, |
1290
|
|
|
|
|
|
|
-start => $start, |
1291
|
|
|
|
|
|
|
-end => $end, |
1292
|
|
|
|
|
|
|
-type => $type || undef, |
1293
|
|
|
|
|
|
|
); |
1294
|
0
|
0
|
|
|
|
0
|
return unless @list; |
1295
|
0
|
0
|
|
|
|
0
|
return $list[0] if scalar @list == 1; |
1296
|
0
|
|
|
|
|
0
|
foreach my $f (@list) { |
1297
|
0
|
0
|
0
|
|
|
0
|
return $f if ($f->start == $start and $f->end == $end); |
1298
|
|
|
|
|
|
|
} |
1299
|
0
|
|
|
|
|
0
|
return; |
1300
|
|
|
|
|
|
|
} |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
|
1303
|
|
|
|
|
|
|
sub get_feature_by_name { |
1304
|
0
|
|
|
0
|
1
|
0
|
return shift->get_feature_by_id(@_); |
1305
|
|
|
|
|
|
|
} |
1306
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
sub get_seq_stream { |
1309
|
3
|
|
|
3
|
1
|
95
|
my $self = shift; |
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1312
|
3
|
|
|
|
|
12
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1313
|
3
|
50
|
|
|
|
12
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
# check for type |
1316
|
3
|
|
|
|
|
10
|
my %args = @_; |
1317
|
3
|
|
|
|
|
6
|
my $type; |
1318
|
3
|
|
50
|
|
|
18
|
$args{-type} ||= $args{-types} || $args{-primary_tag} || 'region'; |
|
|
|
66
|
|
|
|
|
1319
|
3
|
50
|
33
|
|
|
29
|
if (ref $args{-type} and ref $args{-type} eq 'ARRAY') { |
1320
|
0
|
|
0
|
|
|
0
|
$type = $args{-type}->[0] || 'region'; |
1321
|
|
|
|
|
|
|
} |
1322
|
|
|
|
|
|
|
else { |
1323
|
3
|
|
|
|
|
10
|
$type = $args{-type}; |
1324
|
|
|
|
|
|
|
} |
1325
|
|
|
|
|
|
|
|
1326
|
3
|
|
|
|
|
14
|
return Bio::DB::USeq::Iterator->new( |
1327
|
|
|
|
|
|
|
-seq_id => $seq_id, |
1328
|
|
|
|
|
|
|
-start => $start, |
1329
|
|
|
|
|
|
|
-end => $stop, |
1330
|
|
|
|
|
|
|
-strand => $strand, |
1331
|
|
|
|
|
|
|
-type => $type, |
1332
|
|
|
|
|
|
|
-source => $self->name, |
1333
|
|
|
|
|
|
|
-useq => $self, |
1334
|
|
|
|
|
|
|
); |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
sub scores { |
1339
|
1
|
|
|
1
|
1
|
59
|
my $self = shift; |
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1342
|
1
|
|
|
|
|
4
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1343
|
1
|
50
|
|
|
|
4
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
# determine which slices to retrieve |
1346
|
1
|
|
|
|
|
4
|
my @slices = $self->_translate_coordinates_to_slices( |
1347
|
|
|
|
|
|
|
$seq_id, $start, $stop, $strand); |
1348
|
1
|
50
|
|
|
|
3
|
return unless @slices; |
1349
|
1
|
|
|
|
|
5
|
$self->_clear_buffer(\@slices); |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
1352
|
1
|
|
|
|
|
2
|
my @scores; |
1353
|
1
|
|
|
|
|
3
|
foreach my $slice (@slices) { |
1354
|
|
|
|
|
|
|
|
1355
|
|
|
|
|
|
|
# load and unpack the data |
1356
|
1
|
|
|
|
|
4
|
$self->_load_slice($slice); |
1357
|
|
|
|
|
|
|
|
1358
|
|
|
|
|
|
|
# find the overlapping observations |
1359
|
1
|
|
|
|
|
66
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
# record the scores |
1362
|
1
|
|
|
|
|
7
|
foreach my $r (@$results) { |
1363
|
31
|
50
|
|
|
|
65
|
push @scores, $r->[2] if defined $r->[2]; |
1364
|
|
|
|
|
|
|
} |
1365
|
|
|
|
|
|
|
} |
1366
|
|
|
|
|
|
|
|
1367
|
1
|
50
|
|
|
|
25
|
return wantarray ? @scores : \@scores; |
1368
|
|
|
|
|
|
|
} |
1369
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
sub observations { |
1371
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1374
|
0
|
|
|
|
|
0
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1375
|
0
|
0
|
|
|
|
0
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
# determine which slices to retrieve |
1378
|
0
|
|
|
|
|
0
|
my @slices = $self->_translate_coordinates_to_slices( |
1379
|
|
|
|
|
|
|
$seq_id, $start, $stop, $strand); |
1380
|
0
|
0
|
|
|
|
0
|
return unless @slices; |
1381
|
0
|
|
|
|
|
0
|
$self->_clear_buffer(\@slices); |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
1384
|
0
|
|
|
|
|
0
|
my @observations; |
1385
|
0
|
|
|
|
|
0
|
foreach my $slice (@slices) { |
1386
|
|
|
|
|
|
|
# load and unpack the data |
1387
|
0
|
|
|
|
|
0
|
$self->_load_slice($slice); |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
# find the overlapping observations |
1390
|
0
|
|
|
|
|
0
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
1391
|
0
|
|
|
|
|
0
|
push @observations, @$results; |
1392
|
|
|
|
|
|
|
} |
1393
|
|
|
|
|
|
|
|
1394
|
0
|
0
|
|
|
|
0
|
return wantarray ? @observations : \@observations; |
1395
|
|
|
|
|
|
|
} |
1396
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
|
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
|
1400
|
|
|
|
|
|
|
#### Private methods #### |
1401
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
sub _parse_metadata { |
1403
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
1404
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
# the metadata file should always be present in a USeq file |
1406
|
1
|
|
|
|
|
5
|
my $readMe = $self->{'zip'}->contents('archiveReadMe.txt'); |
1407
|
1
|
50
|
|
|
|
1366
|
unless ($readMe) { |
1408
|
0
|
|
|
|
|
0
|
carp " USeq file is malformed! It does not contain an archiveReadMe.txt file\n"; |
1409
|
0
|
|
|
|
|
0
|
return; |
1410
|
|
|
|
|
|
|
} |
1411
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
# parse the archive file |
1413
|
|
|
|
|
|
|
# this is a simple text file, each line is key = value |
1414
|
1
|
|
|
|
|
4
|
$self->{'metadata'}{'comments'} = []; |
1415
|
1
|
|
|
|
|
6
|
foreach (split /\n/, $readMe) { |
1416
|
5
|
50
|
|
|
|
15
|
if (/^#/) { |
1417
|
0
|
0
|
|
|
|
0
|
push @{ $self->{'metadata'}{'comments'} }, $_ unless $_ =~ /validCount/; |
|
0
|
|
|
|
|
0
|
|
1418
|
0
|
|
|
|
|
0
|
next; |
1419
|
|
|
|
|
|
|
} |
1420
|
5
|
50
|
|
|
|
21
|
if (/^\s*([^=\s]+)\s*=\s*(.+)\s*$/) { |
1421
|
|
|
|
|
|
|
# separate key = value pairs tolerating whitespace |
1422
|
|
|
|
|
|
|
# key may contain anything excluding = and whitespace |
1423
|
5
|
|
|
|
|
27
|
$self->{'metadata'}{$1} = $2; |
1424
|
|
|
|
|
|
|
} |
1425
|
|
|
|
|
|
|
} |
1426
|
1
|
|
|
|
|
4
|
return 1; |
1427
|
|
|
|
|
|
|
} |
1428
|
|
|
|
|
|
|
|
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
sub _parse_members { |
1431
|
2
|
|
|
2
|
|
5
|
my $self = shift; |
1432
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
# there is a lot of information encoded in each zip member file name |
1434
|
|
|
|
|
|
|
# the chromosome, strand, coordinates of represented interval, and file type |
1435
|
|
|
|
|
|
|
# this parses and indexes this metadata into a usable format |
1436
|
|
|
|
|
|
|
|
1437
|
2
|
|
|
|
|
4
|
my @errors; |
1438
|
2
|
|
|
|
|
8
|
foreach my $member ($self->{'zip'}->memberNames) { |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
# archive readMe |
1441
|
8
|
100
|
|
|
|
95
|
next if ($member eq 'archiveReadMe.txt'); |
1442
|
|
|
|
|
|
|
|
1443
|
|
|
|
|
|
|
# data file |
1444
|
6
|
|
|
|
|
52
|
my ($chromo, $strand, $start, $stop, $number, $extension) = |
1445
|
|
|
|
|
|
|
($member =~ /^([\w\-\.]+)([\+\-\.])(\d+)\-(\d+)\-(\d+)\.(\w+)$/i); |
1446
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
# check extracted metadata |
1448
|
6
|
50
|
33
|
|
|
64
|
unless ($chromo and $strand and defined $start and defined $stop and |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
1449
|
|
|
|
|
|
|
$number and $extension) { |
1450
|
0
|
|
|
|
|
0
|
push @errors, " data slice $member"; |
1451
|
0
|
|
|
|
|
0
|
next; |
1452
|
|
|
|
|
|
|
} |
1453
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
# check stranded data |
1455
|
6
|
100
|
|
|
|
15
|
unless (defined $self->{'stranded'}) { |
1456
|
2
|
100
|
|
|
|
7
|
if ($strand eq '.') { |
1457
|
1
|
|
|
|
|
3
|
$self->{'stranded'} = 0; |
1458
|
|
|
|
|
|
|
} |
1459
|
|
|
|
|
|
|
else { |
1460
|
1
|
|
|
|
|
4
|
$self->{'stranded'} = 1; |
1461
|
|
|
|
|
|
|
} |
1462
|
|
|
|
|
|
|
} |
1463
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
# check seq_ids |
1465
|
|
|
|
|
|
|
# record the length for each seq_id, which may or may not be entirely |
1466
|
|
|
|
|
|
|
# accurate, since it is just the last interval position |
1467
|
6
|
100
|
|
|
|
14
|
if (exists $self->{'seq_ids'}{$chromo} ) { |
1468
|
3
|
50
|
|
|
|
10
|
if ($stop > $self->{'seq_ids'}{$chromo}) { |
1469
|
3
|
|
|
|
|
7
|
$self->{'seq_ids'}{$chromo} = $stop; |
1470
|
|
|
|
|
|
|
} |
1471
|
|
|
|
|
|
|
} |
1472
|
|
|
|
|
|
|
else { |
1473
|
3
|
|
|
|
|
9
|
$self->{'seq_ids'}{$chromo} = $stop; |
1474
|
|
|
|
|
|
|
} |
1475
|
|
|
|
|
|
|
|
1476
|
|
|
|
|
|
|
# convert to BioPerl convention |
1477
|
6
|
50
|
|
|
|
29
|
$strand = $strand eq '+' ? 1 : $strand eq '.' ? 0 : $strand eq '-' ? -1 : 0; |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1478
|
6
|
|
|
|
|
13
|
$start += 1; |
1479
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
# store the member details for each member slice |
1481
|
6
|
|
|
|
|
24
|
$self->{'file2attribute'}{$member} = |
1482
|
|
|
|
|
|
|
[$chromo, $start, $stop, $strand, $extension, $number]; |
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
# store the member into a chromosome-strand-specific interval tree |
1485
|
6
|
|
66
|
|
|
49
|
$self->{'coord2file'}{$chromo}{$strand} ||= Set::IntervalTree->new(); |
1486
|
6
|
|
|
|
|
27
|
$self->{'coord2file'}{$chromo}{$strand}->insert($member, $start, $stop); |
1487
|
|
|
|
|
|
|
} |
1488
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
# check parsing |
1490
|
2
|
50
|
|
|
|
16
|
if (@errors) { |
1491
|
0
|
|
|
|
|
0
|
carp "Errors parsing data slice filenames:\n" . join("\n", @errors) . "\n"; |
1492
|
|
|
|
|
|
|
} |
1493
|
2
|
50
|
|
|
|
5
|
unless (%{ $self->{'coord2file'} }) { |
|
2
|
|
|
|
|
9
|
|
1494
|
0
|
|
|
|
|
0
|
carp "no data slices present in USeq archive!\n"; |
1495
|
0
|
|
|
|
|
0
|
return; |
1496
|
|
|
|
|
|
|
} |
1497
|
|
|
|
|
|
|
|
1498
|
2
|
|
|
|
|
7
|
return 1; |
1499
|
|
|
|
|
|
|
} |
1500
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
sub _get_coordinates { |
1503
|
5
|
|
|
5
|
|
9
|
my $self = shift; |
1504
|
|
|
|
|
|
|
|
1505
|
5
|
|
|
|
|
10
|
my ($seq_id, $start, $stop, $strand); |
1506
|
5
|
50
|
|
|
|
26
|
if ($_[0] =~ /^\-/) { |
1507
|
5
|
|
|
|
|
23
|
my %args = @_; |
1508
|
5
|
|
0
|
|
|
19
|
$seq_id = $args{'-seq_id'} || $args{'-chromo'} || undef; |
1509
|
5
|
|
0
|
|
|
16
|
$start = $args{'-start'} || $args{'-pos'} || undef; |
1510
|
5
|
|
0
|
|
|
13
|
$stop = $args{'-end'} || $args{'-stop'} || undef; |
1511
|
5
|
|
50
|
|
|
29
|
$strand = $args{'-strand'} || '0'; # unstranded |
1512
|
|
|
|
|
|
|
} |
1513
|
|
|
|
|
|
|
else { |
1514
|
0
|
|
|
|
|
0
|
($seq_id, $start, $stop, $strand) = @_; |
1515
|
|
|
|
|
|
|
} |
1516
|
5
|
50
|
|
|
|
11
|
unless ($seq_id) { |
1517
|
0
|
|
|
|
|
0
|
cluck "no sequence ID provided!"; |
1518
|
0
|
|
|
|
|
0
|
return; |
1519
|
|
|
|
|
|
|
} |
1520
|
5
|
|
50
|
|
|
14
|
$start ||= 1; |
1521
|
5
|
|
33
|
|
|
27
|
$stop ||= $self->length($seq_id); |
1522
|
5
|
|
50
|
|
|
20
|
$strand ||= 0; |
1523
|
5
|
|
|
|
|
19
|
return ($seq_id, $start, $stop, $strand); |
1524
|
|
|
|
|
|
|
} |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
|
1527
|
|
|
|
|
|
|
sub _translate_coordinates_to_slices { |
1528
|
6
|
|
|
6
|
|
13
|
my $self = shift; |
1529
|
6
|
|
|
|
|
17
|
my ($seq_id, $start, $stop, $strand) = @_; |
1530
|
6
|
50
|
|
|
|
29
|
return unless (exists $self->{'coord2file'}{$seq_id}); |
1531
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
# check strand request |
1533
|
6
|
|
|
|
|
11
|
my $both = 0; |
1534
|
6
|
50
|
|
|
|
19
|
if ($strand == 0) { |
1535
|
|
|
|
|
|
|
# strand was not specified, |
1536
|
|
|
|
|
|
|
# but collect from both strands if we have stranded data |
1537
|
6
|
100
|
|
|
|
15
|
$both = 1 if $self->stranded; |
1538
|
|
|
|
|
|
|
} |
1539
|
|
|
|
|
|
|
else { |
1540
|
|
|
|
|
|
|
# strand was specified, |
1541
|
|
|
|
|
|
|
# but convert it to unstranded if the data is not stranded |
1542
|
0
|
0
|
|
|
|
0
|
$strand = 0 unless $self->stranded; |
1543
|
|
|
|
|
|
|
} |
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
# look for the overlapping slices |
1546
|
6
|
|
|
|
|
13
|
my @slices; |
1547
|
6
|
100
|
|
|
|
17
|
if ($both) { |
1548
|
|
|
|
|
|
|
# need to collect from both strands |
1549
|
|
|
|
|
|
|
# plus strand first, then minus strand |
1550
|
1
|
|
|
|
|
9
|
my $results = $self->{'coord2file'}{$seq_id}{1}->fetch($start, $stop); |
1551
|
1
|
|
|
|
|
5
|
my $results2 = $self->{'coord2file'}{$seq_id}{-1}->fetch($start, $stop); |
1552
|
1
|
|
|
|
|
5
|
push @slices, @$results, @$results2; |
1553
|
|
|
|
|
|
|
} |
1554
|
|
|
|
|
|
|
|
1555
|
|
|
|
|
|
|
# specific strand |
1556
|
|
|
|
|
|
|
else { |
1557
|
5
|
|
|
|
|
56
|
my $results = $self->{'coord2file'}{$seq_id}{$strand}->fetch($start, $stop); |
1558
|
5
|
|
|
|
|
16
|
push @slices, @$results; |
1559
|
|
|
|
|
|
|
} |
1560
|
|
|
|
|
|
|
|
1561
|
6
|
|
|
|
|
22
|
return @slices; |
1562
|
|
|
|
|
|
|
} |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
sub _clear_buffer { |
1566
|
6
|
|
|
6
|
|
11
|
my $self = shift; |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
# make a quick hash of wanted slices |
1569
|
6
|
|
|
|
|
12
|
my %wanted = map {$_ => 1} @{$_[0]}; |
|
9
|
|
|
|
|
31
|
|
|
6
|
|
|
|
|
25
|
|
1570
|
|
|
|
|
|
|
|
1571
|
|
|
|
|
|
|
# delete the existing buffers of slices we do not want |
1572
|
6
|
|
|
|
|
11
|
foreach (keys %{ $self->{buffer} }) { |
|
6
|
|
|
|
|
28
|
|
1573
|
5
|
100
|
|
|
|
8151
|
delete $self->{buffer}{$_} unless exists $wanted{$_}; |
1574
|
|
|
|
|
|
|
} |
1575
|
|
|
|
|
|
|
} |
1576
|
|
|
|
|
|
|
|
1577
|
|
|
|
|
|
|
sub _load_slice { |
1578
|
1016
|
|
|
1016
|
|
1396
|
my $self = shift; |
1579
|
1016
|
|
|
|
|
1507
|
my $slice = shift; |
1580
|
1016
|
100
|
|
|
|
2379
|
return if (exists $self->{'buffer'}{$slice}); |
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
# each slice is parsed into observations consisting of start, stop, and |
1583
|
|
|
|
|
|
|
# optionally score and text depending on type |
1584
|
|
|
|
|
|
|
# these are stored as anonymous arrays [start, stop, score, text] |
1585
|
|
|
|
|
|
|
# for quick retrieval each observation interval is stored in a Set::IntervalTree |
1586
|
|
|
|
|
|
|
# these operations maintain the original 0-based coordinate system |
1587
|
|
|
|
|
|
|
|
1588
|
7
|
|
|
|
|
22
|
my $type = $self->slice_type($slice); |
1589
|
7
|
50
|
|
|
|
59
|
if ($type eq 's') { $self->_load_s_slice($slice) } |
|
0
|
50
|
|
|
|
0
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1590
|
0
|
|
|
|
|
0
|
elsif ($type eq 'sf') { $self->_load_sf_slice($slice) } |
1591
|
0
|
|
|
|
|
0
|
elsif ($type eq 'st') { $self->_load_st_slice($slice) } |
1592
|
0
|
|
|
|
|
0
|
elsif ($type eq 'ss') { $self->_load_ss_slice($slice) } |
1593
|
6
|
|
|
|
|
39
|
elsif ($type eq 'ssf') { $self->_load_ssf_slice($slice) } |
1594
|
0
|
|
|
|
|
0
|
elsif ($type eq 'sst') { $self->_load_sst_slice($slice) } |
1595
|
1
|
|
|
|
|
5
|
elsif ($type eq 'ssft') { $self->_load_ssft_slice($slice) } |
1596
|
0
|
|
|
|
|
0
|
elsif ($type eq 'i') { $self->_load_i_slice($slice) } |
1597
|
0
|
|
|
|
|
0
|
elsif ($type eq 'if') { $self->_load_if_slice($slice) } |
1598
|
0
|
|
|
|
|
0
|
elsif ($type eq 'it') { $self->_load_it_slice($slice) } |
1599
|
0
|
|
|
|
|
0
|
elsif ($type eq 'ii') { $self->_load_ii_slice($slice) } |
1600
|
0
|
|
|
|
|
0
|
elsif ($type eq 'iif') { $self->_load_iif_slice($slice) } |
1601
|
0
|
|
|
|
|
0
|
elsif ($type eq 'iit') { $self->_load_iit_slice($slice) } |
1602
|
0
|
|
|
|
|
0
|
elsif ($type eq 'iift') { $self->_load_iift_slice($slice) } |
1603
|
0
|
|
|
|
|
0
|
elsif ($type eq 'is') { $self->_load_is_slice($slice) } |
1604
|
0
|
|
|
|
|
0
|
elsif ($type eq 'isf') { $self->_load_isf_slice($slice) } |
1605
|
0
|
|
|
|
|
0
|
elsif ($type eq 'ist') { $self->_load_ist_slice($slice) } |
1606
|
0
|
|
|
|
|
0
|
elsif ($type eq 'isft') { $self->_load_isft_slice($slice) } |
1607
|
|
|
|
|
|
|
else { |
1608
|
0
|
|
|
|
|
0
|
croak "unable to load slice '$slice'! Unsupported slice type $type\n"; |
1609
|
|
|
|
|
|
|
} |
1610
|
|
|
|
|
|
|
} |
1611
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
sub _load_s_slice { |
1614
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1615
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1616
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1617
|
|
|
|
|
|
|
|
1618
|
|
|
|
|
|
|
# unpack the data slice zip member |
1619
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1620
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>(s>)$number", $self->zip->contents($slice) ); |
1621
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1623
|
|
|
|
|
|
|
# first observation |
1624
|
0
|
|
|
|
|
0
|
my $position = shift @data; |
1625
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1626
|
|
|
|
|
|
|
|
1627
|
|
|
|
|
|
|
# remaining observations |
1628
|
0
|
|
|
|
|
0
|
while (@data) { |
1629
|
0
|
|
|
|
|
0
|
$position += (shift @data) + 32768; |
1630
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1631
|
|
|
|
|
|
|
} |
1632
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
# store Interval Tree buffer |
1634
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1635
|
|
|
|
|
|
|
} |
1636
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
sub _load_sf_slice { |
1638
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1639
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1640
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
# unpack the data slice zip member |
1643
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1644
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>f>(s>f>)$number", $self->zip->contents($slice) ); |
1645
|
|
|
|
|
|
|
|
1646
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1647
|
|
|
|
|
|
|
# first observation |
1648
|
0
|
|
|
|
|
0
|
my $position = shift @data; |
1649
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1650
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
# remaining observations |
1652
|
0
|
|
|
|
|
0
|
while (@data) { |
1653
|
0
|
|
|
|
|
0
|
$position += (shift @data) + 32768; |
1654
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1655
|
|
|
|
|
|
|
} |
1656
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
# store Interval Tree buffer |
1658
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1659
|
|
|
|
|
|
|
} |
1660
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
sub _load_st_slice { |
1662
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1663
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1664
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1665
|
|
|
|
|
|
|
|
1666
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, (score), text |
1667
|
|
|
|
|
|
|
|
1668
|
|
|
|
|
|
|
# load the slice contents |
1669
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1670
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
# first observation |
1672
|
0
|
|
|
|
|
0
|
my $i = 8; # go ahead and set index up to first observation text |
1673
|
0
|
|
|
|
|
0
|
my (undef, $position, $t) = unpack('si>s>', substr($contents, 0, $i)); |
1674
|
|
|
|
|
|
|
# initial null, position, text_length |
1675
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1676
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1677
|
0
|
|
|
|
|
0
|
$i += $t; |
1678
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
# remaining observations |
1680
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1681
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1682
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1683
|
0
|
|
|
|
|
0
|
my ($p, $t) = unpack('s>s>', substr($contents, $i, 4)); |
1684
|
0
|
|
|
|
|
0
|
$i += 4; |
1685
|
0
|
|
|
|
|
0
|
$position += $p + 32768; |
1686
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1687
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1688
|
0
|
|
|
|
|
0
|
$i += $t; |
1689
|
|
|
|
|
|
|
} |
1690
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
# store Interval Tree buffer |
1692
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1693
|
|
|
|
|
|
|
} |
1694
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
sub _load_ss_slice { |
1696
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1697
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1698
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1699
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
# unpack the data slice zip member |
1701
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1702
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>s>(s>s>)$number", $self->zip->contents($slice) ); |
1703
|
|
|
|
|
|
|
|
1704
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, no score |
1705
|
|
|
|
|
|
|
# first observation |
1706
|
0
|
|
|
|
|
0
|
my $position = @data; |
1707
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data) + 32768; |
1708
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1709
|
|
|
|
|
|
|
|
1710
|
|
|
|
|
|
|
# remaining observations |
1711
|
0
|
|
|
|
|
0
|
while (@data) { |
1712
|
0
|
|
|
|
|
0
|
$position += (shift @data) + 32768; |
1713
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data) + 32768; |
1714
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1715
|
|
|
|
|
|
|
} |
1716
|
|
|
|
|
|
|
|
1717
|
|
|
|
|
|
|
# store Interval Tree buffer |
1718
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1719
|
|
|
|
|
|
|
} |
1720
|
|
|
|
|
|
|
|
1721
|
|
|
|
|
|
|
sub _load_ssf_slice { |
1722
|
6
|
|
|
6
|
|
15
|
my $self = shift; |
1723
|
6
|
|
|
|
|
11
|
my $slice = shift; |
1724
|
6
|
|
|
|
|
57
|
my $tree = Set::IntervalTree->new(); |
1725
|
|
|
|
|
|
|
|
1726
|
|
|
|
|
|
|
# unpack the data slice zip member |
1727
|
6
|
|
|
|
|
21
|
my $number = $self->slice_obs_number($slice); |
1728
|
6
|
|
|
|
|
31
|
my ($null, @data) = unpack("si>s>f>(s>s>f>)$number", $self->zip->contents($slice) ); |
1729
|
|
|
|
|
|
|
|
1730
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1731
|
|
|
|
|
|
|
# first observation |
1732
|
6
|
|
|
|
|
35114
|
my $position = (shift @data); |
1733
|
6
|
|
|
|
|
25
|
my $position2 = $position + (shift @data) + 32768; |
1734
|
6
|
|
|
|
|
58
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1735
|
|
|
|
|
|
|
|
1736
|
|
|
|
|
|
|
# remaining observations |
1737
|
6
|
|
|
|
|
28
|
while (@data) { |
1738
|
58298
|
|
|
|
|
83341
|
$position += (shift @data) + 32768; |
1739
|
58298
|
|
|
|
|
77348
|
$position2 = $position + (shift @data) + 32768; |
1740
|
58298
|
|
|
|
|
175094
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1741
|
|
|
|
|
|
|
} |
1742
|
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
|
# store Interval Tree buffer |
1744
|
6
|
|
|
|
|
51
|
$self->{buffer}{$slice} = $tree; |
1745
|
|
|
|
|
|
|
} |
1746
|
|
|
|
|
|
|
|
1747
|
|
|
|
|
|
|
sub _load_sst_slice { |
1748
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1749
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1750
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1751
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1752
|
|
|
|
|
|
|
|
1753
|
|
|
|
|
|
|
# first observation |
1754
|
0
|
|
|
|
|
0
|
my $i = 10; # go ahead and set index up to first observation text |
1755
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i)); |
1756
|
|
|
|
|
|
|
# initial null, position, length, text_length |
1757
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l + 32768; |
1758
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1759
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
1760
|
0
|
|
|
|
|
0
|
$i += $t; |
1761
|
|
|
|
|
|
|
|
1762
|
|
|
|
|
|
|
# remaining observations |
1763
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1764
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1765
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1766
|
0
|
|
|
|
|
0
|
($p, $l, $t) = unpack('s>s>s>', substr($contents, $i, 6)); |
1767
|
0
|
|
|
|
|
0
|
$i += 6; |
1768
|
0
|
|
|
|
|
0
|
$position += $p + 32768; |
1769
|
0
|
|
|
|
|
0
|
$position2 = $position + $l + 32768; |
1770
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1771
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
1772
|
0
|
|
|
|
|
0
|
$i += $t; |
1773
|
|
|
|
|
|
|
} |
1774
|
|
|
|
|
|
|
|
1775
|
|
|
|
|
|
|
# store Interval Tree buffer |
1776
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1777
|
|
|
|
|
|
|
} |
1778
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
sub _load_ssft_slice { |
1780
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
1781
|
1
|
|
|
|
|
2
|
my $slice = shift; |
1782
|
1
|
|
|
|
|
5
|
my $tree = Set::IntervalTree->new(); |
1783
|
1
|
|
|
|
|
4
|
my $contents = $self->zip->contents($slice); |
1784
|
|
|
|
|
|
|
|
1785
|
|
|
|
|
|
|
# first observation |
1786
|
1
|
|
|
|
|
4361
|
my $i = 14; # go ahead and set index up to first observation text |
1787
|
1
|
|
|
|
|
7
|
my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i)); |
1788
|
|
|
|
|
|
|
# initial null, position, length, score, text_length |
1789
|
1
|
|
|
|
|
4
|
my $position2 = $position + $l + 32768; |
1790
|
1
|
|
|
|
|
7
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1791
|
1
|
|
|
|
|
7
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
1792
|
1
|
|
|
|
|
1
|
$i += $t; |
1793
|
|
|
|
|
|
|
|
1794
|
|
|
|
|
|
|
# remaining observations |
1795
|
1
|
|
|
|
|
2
|
my $p; # position offset |
1796
|
1
|
|
|
|
|
3
|
my $len = CORE::length($contents); |
1797
|
1
|
|
|
|
|
4
|
while ($i < $len) { |
1798
|
10000
|
|
|
|
|
24967
|
($p, $l, $s, $t) = unpack('s>s>f>s>', substr($contents, $i, 10)); |
1799
|
10000
|
|
|
|
|
15473
|
$i += 10; |
1800
|
10000
|
|
|
|
|
12970
|
$position += $p + 32768; |
1801
|
10000
|
|
|
|
|
12770
|
$position2 = $position + $l + 32768; |
1802
|
10000
|
|
|
|
|
22424
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1803
|
10000
|
|
|
|
|
36496
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
1804
|
10000
|
|
|
|
|
18421
|
$i += $t; |
1805
|
|
|
|
|
|
|
} |
1806
|
|
|
|
|
|
|
|
1807
|
|
|
|
|
|
|
# store Interval Tree buffer |
1808
|
1
|
|
|
|
|
10
|
$self->{buffer}{$slice} = $tree; |
1809
|
|
|
|
|
|
|
} |
1810
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
sub _load_i_slice { |
1812
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1813
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1814
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1815
|
|
|
|
|
|
|
|
1816
|
|
|
|
|
|
|
# unpack the data slice zip member |
1817
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1818
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>(i>)$number", $self->zip->contents($slice) ); |
1819
|
|
|
|
|
|
|
|
1820
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1821
|
|
|
|
|
|
|
# first observation |
1822
|
0
|
|
|
|
|
0
|
my $position = @data; |
1823
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1824
|
|
|
|
|
|
|
|
1825
|
|
|
|
|
|
|
# remaining observations |
1826
|
0
|
|
|
|
|
0
|
while (@data) { |
1827
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1828
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1829
|
|
|
|
|
|
|
} |
1830
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
# store Interval Tree buffer |
1832
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1833
|
|
|
|
|
|
|
} |
1834
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
sub _load_if_slice { |
1836
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1837
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1838
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1839
|
|
|
|
|
|
|
|
1840
|
|
|
|
|
|
|
# unpack the data slice zip member |
1841
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1842
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>f>(i>f>)$number", $self->zip->contents($slice) ); |
1843
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1845
|
|
|
|
|
|
|
# first observation |
1846
|
0
|
|
|
|
|
0
|
my $position = @data; |
1847
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1848
|
|
|
|
|
|
|
|
1849
|
|
|
|
|
|
|
# remaining observations |
1850
|
0
|
|
|
|
|
0
|
while (@data) { |
1851
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1852
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1853
|
|
|
|
|
|
|
} |
1854
|
|
|
|
|
|
|
|
1855
|
|
|
|
|
|
|
# store Interval Tree buffer |
1856
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1857
|
|
|
|
|
|
|
} |
1858
|
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
|
sub _load_it_slice { |
1860
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1861
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1862
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1863
|
|
|
|
|
|
|
|
1864
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, (score), text |
1865
|
|
|
|
|
|
|
|
1866
|
|
|
|
|
|
|
# load the slice contents |
1867
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1868
|
|
|
|
|
|
|
|
1869
|
|
|
|
|
|
|
# first observation |
1870
|
0
|
|
|
|
|
0
|
my $i = 10; # go ahead and set index up to first observation text |
1871
|
0
|
|
|
|
|
0
|
my (undef, $position, $t) = unpack('si>i>', substr($contents, 0, $i)); |
1872
|
|
|
|
|
|
|
# initial null, position, text_length |
1873
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1874
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1875
|
0
|
|
|
|
|
0
|
$i += $t; |
1876
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
# remaining observations |
1878
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1879
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1880
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1881
|
0
|
|
|
|
|
0
|
($p, $t) = unpack('i>s>', substr($contents, $i, 6)); |
1882
|
0
|
|
|
|
|
0
|
$i += 6; |
1883
|
0
|
|
|
|
|
0
|
$position += $p + 32768; |
1884
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1885
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1886
|
0
|
|
|
|
|
0
|
$i += $t; |
1887
|
|
|
|
|
|
|
} |
1888
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
# store Interval Tree buffer |
1890
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1891
|
|
|
|
|
|
|
} |
1892
|
|
|
|
|
|
|
|
1893
|
|
|
|
|
|
|
sub _load_ii_slice { |
1894
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1895
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1896
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1897
|
|
|
|
|
|
|
|
1898
|
|
|
|
|
|
|
# unpack the data slice zip member |
1899
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1900
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>i>(i>i>)$number", $self->zip->contents($slice) ); |
1901
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1903
|
|
|
|
|
|
|
# first observation |
1904
|
0
|
|
|
|
|
0
|
my $position = @data; |
1905
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data); |
1906
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1907
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
# remaining observations |
1909
|
0
|
|
|
|
|
0
|
while (@data) { |
1910
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1911
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data); |
1912
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1913
|
|
|
|
|
|
|
} |
1914
|
|
|
|
|
|
|
|
1915
|
|
|
|
|
|
|
# store Interval Tree buffer |
1916
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1917
|
|
|
|
|
|
|
} |
1918
|
|
|
|
|
|
|
|
1919
|
|
|
|
|
|
|
sub _load_iif_slice { |
1920
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1921
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1922
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1923
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
# unpack the data slice zip member |
1925
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1926
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>i>f>(i>i>f>)$number", $self->zip->contents($slice) ); |
1927
|
|
|
|
|
|
|
|
1928
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1929
|
|
|
|
|
|
|
# first observation |
1930
|
0
|
|
|
|
|
0
|
my $position = @data; |
1931
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data); |
1932
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1933
|
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
|
# remaining observations |
1935
|
0
|
|
|
|
|
0
|
while (@data) { |
1936
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1937
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data); |
1938
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1939
|
|
|
|
|
|
|
} |
1940
|
|
|
|
|
|
|
|
1941
|
|
|
|
|
|
|
# store Interval Tree buffer |
1942
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1943
|
|
|
|
|
|
|
} |
1944
|
|
|
|
|
|
|
|
1945
|
|
|
|
|
|
|
sub _load_iit_slice { |
1946
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1947
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1948
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1949
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1950
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
# first observation |
1952
|
0
|
|
|
|
|
0
|
my $i = 12; # go ahead and set index up to first observation text |
1953
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $t) = unpack('si>i>s>', substr($contents, 0, $i)); |
1954
|
|
|
|
|
|
|
# initial null, position, length, text_length |
1955
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l; |
1956
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1957
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
1958
|
0
|
|
|
|
|
0
|
$i += $t; |
1959
|
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
|
# remaining observations |
1961
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1962
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1963
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1964
|
0
|
|
|
|
|
0
|
($p, $l, $t) = unpack('i>i>s>', substr($contents, $i, 10)); |
1965
|
0
|
|
|
|
|
0
|
$i += 10; |
1966
|
0
|
|
|
|
|
0
|
$position += $p; |
1967
|
0
|
|
|
|
|
0
|
$position2 = $position + $l; |
1968
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1969
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
1970
|
0
|
|
|
|
|
0
|
$i += $t; |
1971
|
|
|
|
|
|
|
} |
1972
|
|
|
|
|
|
|
|
1973
|
|
|
|
|
|
|
# store Interval Tree buffer |
1974
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1975
|
|
|
|
|
|
|
} |
1976
|
|
|
|
|
|
|
|
1977
|
|
|
|
|
|
|
sub _load_iift_slice { |
1978
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1979
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1980
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1981
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1982
|
|
|
|
|
|
|
|
1983
|
|
|
|
|
|
|
# first observation |
1984
|
0
|
|
|
|
|
0
|
my $i = 16; # go ahead and set index up to first observation text |
1985
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $s, $t) = unpack('si>i>f>s>', substr($contents, 0, $i)); |
1986
|
|
|
|
|
|
|
# initial null, position, length, score, text_length |
1987
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l; |
1988
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1989
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
1990
|
0
|
|
|
|
|
0
|
$i += $t; |
1991
|
|
|
|
|
|
|
|
1992
|
|
|
|
|
|
|
# remaining observations |
1993
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1994
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1995
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1996
|
0
|
|
|
|
|
0
|
($p, $l, $s, $t) = unpack('i>i>f>s>', substr($contents, $i, 14)); |
1997
|
0
|
|
|
|
|
0
|
$i += 14; |
1998
|
0
|
|
|
|
|
0
|
$position += $p; |
1999
|
0
|
|
|
|
|
0
|
$position2 = $position + $l; |
2000
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2001
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2002
|
0
|
|
|
|
|
0
|
$i += $t; |
2003
|
|
|
|
|
|
|
} |
2004
|
|
|
|
|
|
|
|
2005
|
|
|
|
|
|
|
# store Interval Tree buffer |
2006
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2007
|
|
|
|
|
|
|
} |
2008
|
|
|
|
|
|
|
|
2009
|
|
|
|
|
|
|
sub _load_is_slice { |
2010
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2011
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2012
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2013
|
|
|
|
|
|
|
|
2014
|
|
|
|
|
|
|
# unpack the data slice zip member |
2015
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
2016
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>s>(i>s>)$number", $self->zip->contents($slice) ); |
2017
|
|
|
|
|
|
|
|
2018
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
2019
|
|
|
|
|
|
|
# first observation |
2020
|
0
|
|
|
|
|
0
|
my $position = @data; |
2021
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data) + 32768; |
2022
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
2023
|
|
|
|
|
|
|
|
2024
|
|
|
|
|
|
|
# remaining observations |
2025
|
0
|
|
|
|
|
0
|
while (@data) { |
2026
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
2027
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data) + 32768; |
2028
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
2029
|
|
|
|
|
|
|
} |
2030
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
# store Interval Tree buffer |
2032
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2033
|
|
|
|
|
|
|
} |
2034
|
|
|
|
|
|
|
|
2035
|
|
|
|
|
|
|
sub _load_isf_slice { |
2036
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2037
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2038
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2039
|
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
|
# unpack the data slice zip member |
2041
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
2042
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>s>f>(i>s>f>)$number", $self->zip->contents($slice) ); |
2043
|
|
|
|
|
|
|
|
2044
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
2045
|
|
|
|
|
|
|
# first observation |
2046
|
0
|
|
|
|
|
0
|
my $position = @data; |
2047
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data) + 32768; |
2048
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
2049
|
|
|
|
|
|
|
|
2050
|
|
|
|
|
|
|
# remaining observations |
2051
|
0
|
|
|
|
|
0
|
while (@data) { |
2052
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
2053
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data) + 32768; |
2054
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
2055
|
|
|
|
|
|
|
} |
2056
|
|
|
|
|
|
|
|
2057
|
|
|
|
|
|
|
# store Interval Tree buffer |
2058
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2059
|
|
|
|
|
|
|
} |
2060
|
|
|
|
|
|
|
|
2061
|
|
|
|
|
|
|
sub _load_ist_slice { |
2062
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2063
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2064
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2065
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
2066
|
|
|
|
|
|
|
|
2067
|
|
|
|
|
|
|
# first observation |
2068
|
0
|
|
|
|
|
0
|
my $i = 10; # go ahead and set index up to first observation text |
2069
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i)); |
2070
|
|
|
|
|
|
|
# initial null, position, length, text_length |
2071
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l + 32768; |
2072
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
2073
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
2074
|
0
|
|
|
|
|
0
|
$i += $t; |
2075
|
|
|
|
|
|
|
|
2076
|
|
|
|
|
|
|
# remaining observations |
2077
|
0
|
|
|
|
|
0
|
my $p; # position offset |
2078
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
2079
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
2080
|
0
|
|
|
|
|
0
|
($p, $l, $t) = unpack('i>s>s>', substr($contents, $i, 8)); |
2081
|
0
|
|
|
|
|
0
|
$i += 8; |
2082
|
0
|
|
|
|
|
0
|
$position += $p; |
2083
|
0
|
|
|
|
|
0
|
$position2 = $position + $l + 32768; |
2084
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2085
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
2086
|
0
|
|
|
|
|
0
|
$i += $t; |
2087
|
|
|
|
|
|
|
} |
2088
|
|
|
|
|
|
|
|
2089
|
|
|
|
|
|
|
# store Interval Tree buffer |
2090
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2091
|
|
|
|
|
|
|
} |
2092
|
|
|
|
|
|
|
|
2093
|
|
|
|
|
|
|
sub _load_isft_slice { |
2094
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2095
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2096
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2097
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
2098
|
|
|
|
|
|
|
|
2099
|
|
|
|
|
|
|
# first observation |
2100
|
0
|
|
|
|
|
0
|
my $i = 14; # go ahead and set index up to first observation text |
2101
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i)); |
2102
|
|
|
|
|
|
|
# initial null, position, length, score, text_length |
2103
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l + 32768; |
2104
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
2105
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2106
|
0
|
|
|
|
|
0
|
$i += $t; |
2107
|
|
|
|
|
|
|
|
2108
|
|
|
|
|
|
|
# remaining observations |
2109
|
0
|
|
|
|
|
0
|
my $p; # position offset |
2110
|
0
|
|
|
|
|
0
|
while ($i < CORE::length($contents)) { |
2111
|
0
|
|
|
|
|
0
|
($p, $l, $s, $t) = unpack('i>s>f>s>', substr($contents, $i, 12)); |
2112
|
0
|
|
|
|
|
0
|
$i += 12; |
2113
|
0
|
|
|
|
|
0
|
$position += $p; |
2114
|
0
|
|
|
|
|
0
|
$position2 = $position + $l + 32768; |
2115
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2116
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2117
|
0
|
|
|
|
|
0
|
$i += $t; |
2118
|
|
|
|
|
|
|
} |
2119
|
|
|
|
|
|
|
|
2120
|
|
|
|
|
|
|
# store Interval Tree buffer |
2121
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2122
|
|
|
|
|
|
|
} |
2123
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
sub _mean_score { |
2125
|
1000
|
|
|
1000
|
|
1383
|
my $self = shift; |
2126
|
1000
|
|
|
|
|
1864
|
my ($start, $stop, $slices) = @_; |
2127
|
1000
|
50
|
|
|
|
1771
|
return unless @$slices; |
2128
|
|
|
|
|
|
|
|
2129
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2130
|
1000
|
|
|
|
|
1259
|
my $sum; |
2131
|
1000
|
|
|
|
|
1481
|
my $count = 0; |
2132
|
1000
|
|
|
|
|
1482
|
foreach my $slice (@$slices) { |
2133
|
1001
|
50
|
|
|
|
1773
|
next unless $self->slice_type($slice) =~ /f/; |
2134
|
|
|
|
|
|
|
# no sense going through if no score present |
2135
|
|
|
|
|
|
|
|
2136
|
|
|
|
|
|
|
# load and unpack the data |
2137
|
1001
|
|
|
|
|
2672
|
$self->_load_slice($slice); |
2138
|
|
|
|
|
|
|
|
2139
|
|
|
|
|
|
|
# find the overlapping observations and record values |
2140
|
1001
|
|
|
|
|
111068
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
2141
|
1001
|
|
|
|
|
2748
|
foreach my $r (@$results) { |
2142
|
2276
|
|
100
|
|
|
5843
|
$sum += $r->[2] || 0; |
2143
|
2276
|
|
|
|
|
4221
|
$count++; |
2144
|
|
|
|
|
|
|
} |
2145
|
|
|
|
|
|
|
} |
2146
|
1000
|
50
|
|
|
|
5129
|
return $count ? $sum / $count : undef; |
2147
|
|
|
|
|
|
|
} |
2148
|
|
|
|
|
|
|
|
2149
|
|
|
|
|
|
|
sub _stat_summary { |
2150
|
11
|
|
|
11
|
|
20
|
my $self = shift; |
2151
|
11
|
|
|
|
|
30
|
my ($start, $stop, $slices) = @_; |
2152
|
11
|
50
|
|
|
|
27
|
return unless @$slices; |
2153
|
|
|
|
|
|
|
|
2154
|
|
|
|
|
|
|
# initialize the statistical scores |
2155
|
11
|
|
|
|
|
19
|
my $count = 0; |
2156
|
11
|
|
|
|
|
30
|
my $sum; |
2157
|
|
|
|
|
|
|
my $sum_squares; |
2158
|
11
|
|
|
|
|
0
|
my $min; |
2159
|
11
|
|
|
|
|
0
|
my $max; |
2160
|
|
|
|
|
|
|
|
2161
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2162
|
11
|
|
|
|
|
20
|
foreach my $slice (@$slices) { |
2163
|
12
|
50
|
|
|
|
45
|
next unless $self->slice_type($slice) =~ /f/; |
2164
|
|
|
|
|
|
|
# no sense going through if no score present |
2165
|
|
|
|
|
|
|
|
2166
|
|
|
|
|
|
|
# load and unpack the data |
2167
|
12
|
|
|
|
|
44
|
$self->_load_slice($slice); |
2168
|
|
|
|
|
|
|
|
2169
|
|
|
|
|
|
|
# find the overlapping observations |
2170
|
12
|
|
|
|
|
3675
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
2171
|
12
|
|
|
|
|
51
|
foreach my $r (@$results) { |
2172
|
|
|
|
|
|
|
# each observation is [start, stop, score] |
2173
|
21145
|
50
|
|
|
|
36885
|
if (defined $r->[2]) { |
2174
|
21145
|
|
|
|
|
25897
|
$count++; |
2175
|
21145
|
|
|
|
|
26920
|
$sum += $r->[2]; |
2176
|
21145
|
|
|
|
|
27491
|
$sum_squares += ($r->[2] * $r->[2]); |
2177
|
21145
|
100
|
100
|
|
|
56032
|
$min = $r->[2] if (!defined $min or $r->[2] < $min); |
2178
|
21145
|
100
|
100
|
|
|
57879
|
$max = $r->[2] if (!defined $max or $r->[2] > $max); |
2179
|
|
|
|
|
|
|
} |
2180
|
|
|
|
|
|
|
} |
2181
|
|
|
|
|
|
|
} |
2182
|
|
|
|
|
|
|
|
2183
|
|
|
|
|
|
|
# assemble the statistical summary hash |
2184
|
11
|
|
50
|
|
|
159
|
my %summary = ( |
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
2185
|
|
|
|
|
|
|
'validCount' => $count, |
2186
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
2187
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
2188
|
|
|
|
|
|
|
'minVal' => $min || 0, |
2189
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
2190
|
|
|
|
|
|
|
); |
2191
|
11
|
|
|
|
|
110
|
return \%summary; |
2192
|
|
|
|
|
|
|
} |
2193
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
sub _rewrite_metadata { |
2195
|
1
|
|
|
1
|
|
3
|
my $self = shift; |
2196
|
1
|
50
|
|
|
|
4
|
return unless (-w $self->zip->fileName); |
2197
|
1
|
|
|
|
|
58
|
my $md = $self->{'metadata'}; |
2198
|
|
|
|
|
|
|
|
2199
|
|
|
|
|
|
|
# generate new metadata as an array |
2200
|
1
|
|
|
|
|
3
|
my @new_md; |
2201
|
1
|
|
|
|
|
7
|
push @new_md, "useqArchiveVersion = " . $md->{useqArchiveVersion}; |
2202
|
1
|
50
|
|
|
|
4
|
push @new_md, @{ $md->{comments} } if exists $md->{comments}; |
|
1
|
|
|
|
|
6
|
|
2203
|
1
|
|
|
|
|
4
|
push @new_md, "dataType = " . $md->{dataType}; |
2204
|
1
|
|
|
|
|
3
|
push @new_md, "versionedGenome = " . $md->{versionedGenome}; |
2205
|
|
|
|
|
|
|
|
2206
|
|
|
|
|
|
|
# additional keys that may be present |
2207
|
1
|
|
|
|
|
8
|
foreach (keys %$md) { |
2208
|
7
|
100
|
|
|
|
40
|
next if /useqArchiveVersion|dataType|versionedGenome|comments|chromStats|globalStats/; |
2209
|
2
|
|
|
|
|
8
|
push @new_md, "$_ = $md->{$_}"; |
2210
|
|
|
|
|
|
|
} |
2211
|
|
|
|
|
|
|
|
2212
|
|
|
|
|
|
|
# global and chromosome stats |
2213
|
1
|
|
|
|
|
3
|
push @new_md, |
2214
|
|
|
|
|
|
|
"# Bio::DB::USeq statistics validCount,sumData,sumSquares,minVal,maxVal"; |
2215
|
1
|
50
|
|
|
|
5
|
push @new_md, "globalStats = $md->{globalStats}" if exists $md->{globalStats}; |
2216
|
1
|
|
|
|
|
8
|
foreach (sort {$a cmp $b} keys %$md) { |
|
13
|
|
|
|
|
22
|
|
2217
|
7
|
100
|
|
|
|
22
|
push @new_md, "$_ = $md->{$_}" if /chromStats/; |
2218
|
|
|
|
|
|
|
} |
2219
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
# write the new metadata to the zip Archive |
2221
|
1
|
|
|
|
|
11
|
$self->{'zip'}->contents('archiveReadMe.txt', join("\n", @new_md)); |
2222
|
1
|
|
|
|
|
713
|
$self->{'zip'}->overwrite; |
2223
|
1
|
|
|
|
|
8302
|
$self->clone; # not sure if this is necessary, but just in case we reopen the zip |
2224
|
|
|
|
|
|
|
} |
2225
|
|
|
|
|
|
|
|
2226
|
|
|
|
|
|
|
|
2227
|
|
|
|
|
|
|
######## Exported Functions ######## |
2228
|
|
|
|
|
|
|
# these are borrowed from Bio::DB::BigWig from Lincoln Stein |
2229
|
|
|
|
|
|
|
|
2230
|
|
|
|
|
|
|
sub binMean { |
2231
|
2
|
|
|
2
|
1
|
126
|
my $score = shift; |
2232
|
2
|
50
|
|
|
|
9
|
return 0 unless $score->{validCount}; |
2233
|
2
|
|
|
|
|
11
|
$score->{sumData}/$score->{validCount}; |
2234
|
|
|
|
|
|
|
} |
2235
|
|
|
|
|
|
|
|
2236
|
|
|
|
|
|
|
sub binVariance { |
2237
|
1
|
|
|
1
|
1
|
2
|
my $score = shift; |
2238
|
1
|
50
|
|
|
|
5
|
return 0 unless $score->{validCount}; |
2239
|
1
|
|
|
|
|
8
|
my $var = $score->{sumSquares} - $score->{sumData}**2/$score->{validCount}; |
2240
|
1
|
50
|
|
|
|
3
|
if ($score->{validCount} > 1) { |
2241
|
1
|
|
|
|
|
5
|
$var /= $score->{validCount}-1; |
2242
|
|
|
|
|
|
|
} |
2243
|
1
|
50
|
|
|
|
4
|
return 0 if $var < 0; |
2244
|
1
|
|
|
|
|
4
|
return $var; |
2245
|
|
|
|
|
|
|
} |
2246
|
|
|
|
|
|
|
|
2247
|
|
|
|
|
|
|
sub binStdev { |
2248
|
1
|
|
|
1
|
1
|
5
|
my $score = shift; |
2249
|
1
|
|
|
|
|
5
|
return sqrt(binVariance($score)); |
2250
|
|
|
|
|
|
|
} |
2251
|
|
|
|
|
|
|
|
2252
|
|
|
|
|
|
|
|
2253
|
|
|
|
|
|
|
|
2254
|
|
|
|
|
|
|
|
2255
|
|
|
|
|
|
|
######## Other Classes ############# |
2256
|
|
|
|
|
|
|
|
2257
|
|
|
|
|
|
|
package Bio::DB::USeq::Feature; |
2258
|
1
|
|
|
1
|
|
14
|
use base 'Bio::SeqFeature::Lite'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
702
|
|
2259
|
|
|
|
|
|
|
|
2260
|
|
|
|
|
|
|
sub new { |
2261
|
13
|
|
|
13
|
|
42
|
my $class = shift; |
2262
|
13
|
|
|
|
|
60
|
return $class->SUPER::new(@_); |
2263
|
|
|
|
|
|
|
} |
2264
|
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
|
sub type { |
2266
|
|
|
|
|
|
|
# Bio::SeqFeature::Lite mangles the type and returns |
2267
|
|
|
|
|
|
|
# primary_tag:source if both are set |
2268
|
|
|
|
|
|
|
# this may wreck havoc with parsers when the type already has a : |
2269
|
|
|
|
|
|
|
# as in wiggle:1000 |
2270
|
16
|
|
|
16
|
|
161
|
my $self = shift; |
2271
|
16
|
|
|
|
|
105
|
return $self->{type}; |
2272
|
|
|
|
|
|
|
} |
2273
|
|
|
|
|
|
|
|
2274
|
|
|
|
|
|
|
sub chr_stats { |
2275
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2276
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2277
|
0
|
|
|
|
|
0
|
return $self->{useq}->chr_stats( $self->seq_id ); |
2278
|
|
|
|
|
|
|
} |
2279
|
|
|
|
|
|
|
|
2280
|
|
|
|
|
|
|
sub chr_mean { |
2281
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2282
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2283
|
0
|
|
|
|
|
0
|
return $self->{useq}->chr_mean( $self->seq_id ); |
2284
|
|
|
|
|
|
|
} |
2285
|
|
|
|
|
|
|
|
2286
|
|
|
|
|
|
|
sub chr_stdev { |
2287
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2288
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2289
|
0
|
|
|
|
|
0
|
return $self->{useq}->chr_stdev( $self->seq_id ); |
2290
|
|
|
|
|
|
|
} |
2291
|
|
|
|
|
|
|
|
2292
|
|
|
|
|
|
|
sub global_stats { |
2293
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2294
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2295
|
0
|
|
|
|
|
0
|
return $self->{useq}->global_stats( $self->seq_id ); |
2296
|
|
|
|
|
|
|
} |
2297
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
sub global_mean { |
2299
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2300
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2301
|
0
|
|
|
|
|
0
|
return $self->{useq}->global_mean( $self->seq_id ); |
2302
|
|
|
|
|
|
|
} |
2303
|
|
|
|
|
|
|
|
2304
|
|
|
|
|
|
|
sub global_stdev { |
2305
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2306
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2307
|
0
|
|
|
|
|
0
|
return $self->{useq}->global_stdev( $self->seq_id ); |
2308
|
|
|
|
|
|
|
} |
2309
|
|
|
|
|
|
|
|
2310
|
|
|
|
|
|
|
|
2311
|
|
|
|
|
|
|
|
2312
|
|
|
|
|
|
|
|
2313
|
|
|
|
|
|
|
package Bio::DB::USeq::Segment; |
2314
|
1
|
|
|
1
|
|
74120
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
907
|
|
2315
|
|
|
|
|
|
|
|
2316
|
|
|
|
|
|
|
sub new { |
2317
|
1
|
|
|
1
|
|
2
|
my $class = shift; |
2318
|
1
|
|
|
|
|
6
|
my %args = @_; |
2319
|
1
|
|
|
|
|
10
|
my $segment = $class->SUPER::new(@_); |
2320
|
1
|
50
|
|
|
|
90
|
$segment->{'useq'} = $args{'-useq'} if exists $args{'-useq'}; |
2321
|
1
|
|
|
|
|
5
|
return $segment; |
2322
|
|
|
|
|
|
|
} |
2323
|
|
|
|
|
|
|
|
2324
|
|
|
|
|
|
|
sub scores { |
2325
|
1
|
|
|
1
|
|
141
|
my $self = shift; |
2326
|
1
|
|
|
|
|
7
|
return $self->{'useq'}->scores( |
2327
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2328
|
|
|
|
|
|
|
-start => $self->start, |
2329
|
|
|
|
|
|
|
-end => $self->end, |
2330
|
|
|
|
|
|
|
-strand => $self->strand, |
2331
|
|
|
|
|
|
|
); |
2332
|
|
|
|
|
|
|
} |
2333
|
|
|
|
|
|
|
|
2334
|
|
|
|
|
|
|
sub features { |
2335
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2336
|
0
|
|
|
|
|
0
|
my $type = shift; |
2337
|
0
|
|
0
|
|
|
0
|
$type ||= 'region'; |
2338
|
0
|
|
|
|
|
0
|
return $self->{'useq'}->features( |
2339
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2340
|
|
|
|
|
|
|
-start => $self->start, |
2341
|
|
|
|
|
|
|
-end => $self->end, |
2342
|
|
|
|
|
|
|
-strand => $self->strand, |
2343
|
|
|
|
|
|
|
-type => $type, |
2344
|
|
|
|
|
|
|
); |
2345
|
|
|
|
|
|
|
} |
2346
|
|
|
|
|
|
|
|
2347
|
|
|
|
|
|
|
sub get_seq_stream { |
2348
|
1
|
|
|
1
|
|
354
|
my $self = shift; |
2349
|
1
|
|
|
|
|
2
|
my $type = shift; |
2350
|
1
|
|
50
|
|
|
9
|
$type ||= 'region'; |
2351
|
|
|
|
|
|
|
return Bio::DB::USeq::Iterator->new( |
2352
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2353
|
|
|
|
|
|
|
-start => $self->start, |
2354
|
|
|
|
|
|
|
-end => $self->end, |
2355
|
|
|
|
|
|
|
-strand => $self->strand, |
2356
|
|
|
|
|
|
|
-source => $self->source, |
2357
|
|
|
|
|
|
|
-type => $type, |
2358
|
|
|
|
|
|
|
-useq => $self->{useq}, |
2359
|
1
|
|
|
|
|
8
|
); |
2360
|
|
|
|
|
|
|
} |
2361
|
|
|
|
|
|
|
|
2362
|
|
|
|
|
|
|
sub slices { |
2363
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2364
|
0
|
|
|
|
|
0
|
my @slices = $self->{'useq'}->_translate_coordinates_to_slices( |
2365
|
|
|
|
|
|
|
$self->seq_id, $self->start, $self->end, $self->strand |
2366
|
|
|
|
|
|
|
); |
2367
|
0
|
0
|
|
|
|
0
|
return wantarray ? @slices : \@slices; |
2368
|
|
|
|
|
|
|
} |
2369
|
|
|
|
|
|
|
|
2370
|
|
|
|
|
|
|
sub coverage { |
2371
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2372
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2373
|
0
|
|
0
|
|
|
0
|
$bins ||= $self->length; |
2374
|
0
|
|
|
|
|
0
|
return $self->features("coverage:$bins"); |
2375
|
|
|
|
|
|
|
} |
2376
|
|
|
|
|
|
|
|
2377
|
|
|
|
|
|
|
sub wiggle { |
2378
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2379
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2380
|
0
|
|
0
|
|
|
0
|
$bins ||= $self->length; |
2381
|
0
|
|
|
|
|
0
|
return $self->features("wiggle:$bins"); |
2382
|
|
|
|
|
|
|
} |
2383
|
|
|
|
|
|
|
|
2384
|
|
|
|
|
|
|
sub statistical_summary { |
2385
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2386
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2387
|
0
|
|
0
|
|
|
0
|
$bins ||= 1; |
2388
|
0
|
|
|
|
|
0
|
return $self->features("summary:$bins"); |
2389
|
|
|
|
|
|
|
} |
2390
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
sub observations { |
2392
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2393
|
0
|
|
|
|
|
0
|
return $self->{'useq'}->observations( |
2394
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2395
|
|
|
|
|
|
|
-start => $self->start, |
2396
|
|
|
|
|
|
|
-end => $self->end, |
2397
|
|
|
|
|
|
|
-strand => $self->strand, |
2398
|
|
|
|
|
|
|
); |
2399
|
|
|
|
|
|
|
} |
2400
|
|
|
|
|
|
|
|
2401
|
|
|
|
|
|
|
|
2402
|
|
|
|
|
|
|
package Bio::DB::USeq::Iterator; |
2403
|
1
|
|
|
1
|
|
15
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
1590
|
|
2404
|
|
|
|
|
|
|
|
2405
|
|
|
|
|
|
|
sub new { |
2406
|
4
|
|
|
4
|
|
74
|
my $class = shift; |
2407
|
|
|
|
|
|
|
|
2408
|
|
|
|
|
|
|
# create object |
2409
|
4
|
|
|
|
|
22
|
my %args = @_; |
2410
|
4
|
50
|
|
|
|
15
|
my $useq = $args{'-useq'} or |
2411
|
|
|
|
|
|
|
die "Bio::DB::USeq::Iterator cannot be created without a -useq argument"; |
2412
|
4
|
|
|
|
|
19
|
my $iterator = $class->SUPER::new(@_); |
2413
|
4
|
|
|
|
|
289
|
$iterator->{'useq'} = $useq; |
2414
|
|
|
|
|
|
|
|
2415
|
|
|
|
|
|
|
# determine which members to retrieve |
2416
|
|
|
|
|
|
|
my @slices = $useq->_translate_coordinates_to_slices( |
2417
|
|
|
|
|
|
|
$args{-seq_id}, |
2418
|
|
|
|
|
|
|
$args{-start}, |
2419
|
|
|
|
|
|
|
$args{-end}, |
2420
|
|
|
|
|
|
|
$args{-strand}, |
2421
|
4
|
|
|
|
|
20
|
); |
2422
|
4
|
|
|
|
|
18
|
$useq->_clear_buffer(\@slices); |
2423
|
|
|
|
|
|
|
|
2424
|
|
|
|
|
|
|
# sort the slices as necessary |
2425
|
4
|
100
|
|
|
|
20
|
if (scalar(@slices) > 1) { |
2426
|
6
|
|
|
|
|
17
|
@slices = map {$_->[1]} |
2427
|
3
|
|
|
|
|
11
|
sort {$a->[0] <=> $b->[0]} |
2428
|
3
|
|
|
|
|
21
|
map { [$useq->slice_start($_), $_] } @slices; |
|
6
|
|
|
|
|
21
|
|
2429
|
|
|
|
|
|
|
} |
2430
|
|
|
|
|
|
|
|
2431
|
|
|
|
|
|
|
# how we set up the iterator depends on the feature type requested |
2432
|
|
|
|
|
|
|
# we need to add specific information to iterator object |
2433
|
|
|
|
|
|
|
|
2434
|
4
|
100
|
|
|
|
18
|
if ($iterator->type =~ /region|interval|observation/) { |
2435
|
|
|
|
|
|
|
# useq observation features are simple |
2436
|
2
|
|
|
|
|
8
|
$iterator->{'wiggle'} = 0; |
2437
|
|
|
|
|
|
|
|
2438
|
|
|
|
|
|
|
# prepare specific iterator information |
2439
|
2
|
|
|
|
|
6
|
$iterator->{'slices'} = \@slices; |
2440
|
2
|
|
|
|
|
4
|
$iterator->{'current_results'} = undef; |
2441
|
2
|
|
|
|
|
6
|
$iterator->{'current_strand'} = undef; |
2442
|
|
|
|
|
|
|
|
2443
|
2
|
|
|
|
|
10
|
return $iterator; |
2444
|
|
|
|
|
|
|
} |
2445
|
|
|
|
|
|
|
|
2446
|
|
|
|
|
|
|
# otherwise we work with more complex wiggle or summary features |
2447
|
|
|
|
|
|
|
# set the type of wiggle feature: 1 = wiggle, 2 = summary |
2448
|
2
|
100
|
|
|
|
7
|
$iterator->{'wiggle'} = $iterator->type =~ /summary/ ? 2 : 1; |
2449
|
|
|
|
|
|
|
|
2450
|
|
|
|
|
|
|
# normally the iterator will only return one wigggle|summary feature, but |
2451
|
|
|
|
|
|
|
# will return two if stranded data, per behavior for Bio::Grapics... |
2452
|
2
|
50
|
33
|
|
|
15
|
if ($iterator->strand == 0 and $useq->stranded) { |
2453
|
|
|
|
|
|
|
# we have stranded data, so need to return two features |
2454
|
|
|
|
|
|
|
# separate the slices into each respective strands for each feature |
2455
|
0
|
|
|
|
|
0
|
my (@f, @r); |
2456
|
0
|
|
|
|
|
0
|
foreach (@slices) { |
2457
|
0
|
0
|
|
|
|
0
|
if ($useq->slice_strand($_) == 1) { |
2458
|
0
|
|
|
|
|
0
|
push @f, $_; |
2459
|
|
|
|
|
|
|
} |
2460
|
|
|
|
|
|
|
else { |
2461
|
0
|
|
|
|
|
0
|
push @r, $_; |
2462
|
|
|
|
|
|
|
} |
2463
|
|
|
|
|
|
|
} |
2464
|
0
|
|
|
|
|
0
|
$iterator->{slices} = [\@f, \@r]; |
2465
|
|
|
|
|
|
|
} |
2466
|
|
|
|
|
|
|
else { |
2467
|
|
|
|
|
|
|
# only need to return one feature |
2468
|
2
|
|
|
|
|
8
|
$iterator->{slices} = [ \@slices ]; |
2469
|
|
|
|
|
|
|
} |
2470
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
# check for type and bins |
2472
|
2
|
|
|
|
|
5
|
my ($bin, $step); |
2473
|
2
|
100
|
|
|
|
6
|
if ($iterator->type =~ /:(\d+)$/i) { |
2474
|
1
|
|
|
|
|
4
|
$bin = $1; |
2475
|
1
|
|
|
|
|
7
|
my $length = $iterator->length; |
2476
|
1
|
50
|
|
|
|
24
|
$bin = $length if $bin > $length; |
2477
|
1
|
|
|
|
|
4
|
$step = $length/$bin; |
2478
|
|
|
|
|
|
|
} |
2479
|
2
|
|
|
|
|
7
|
$iterator->{bin} = $bin; |
2480
|
2
|
|
|
|
|
5
|
$iterator->{step} = $step; |
2481
|
|
|
|
|
|
|
|
2482
|
2
|
|
|
|
|
10
|
return $iterator; |
2483
|
|
|
|
|
|
|
} |
2484
|
|
|
|
|
|
|
|
2485
|
|
|
|
|
|
|
sub next_seq { |
2486
|
10
|
|
|
10
|
|
409
|
my $self = shift; |
2487
|
|
|
|
|
|
|
|
2488
|
10
|
100
|
|
|
|
40
|
if ($self->{'wiggle'} == 1) { |
|
|
100
|
|
|
|
|
|
2489
|
2
|
|
|
|
|
8
|
return $self->_next_wiggle; |
2490
|
|
|
|
|
|
|
} |
2491
|
|
|
|
|
|
|
elsif ($self->{'wiggle'} == 2) { |
2492
|
2
|
|
|
|
|
7
|
return $self->_next_summary; |
2493
|
|
|
|
|
|
|
} |
2494
|
|
|
|
|
|
|
else { |
2495
|
6
|
|
|
|
|
14
|
return $self->_next_region; |
2496
|
|
|
|
|
|
|
} |
2497
|
|
|
|
|
|
|
} |
2498
|
|
|
|
|
|
|
|
2499
|
0
|
|
|
0
|
|
0
|
sub next_feature {return shift->next_seq} |
2500
|
|
|
|
|
|
|
|
2501
|
|
|
|
|
|
|
sub _next_region { |
2502
|
|
|
|
|
|
|
# this will keep returning observations as SeqFeature objects until we run out |
2503
|
|
|
|
|
|
|
# of observations in the requested interval |
2504
|
6
|
|
|
6
|
|
11
|
my $self = shift; |
2505
|
6
|
|
|
|
|
11
|
my $useq = $self->{'useq'}; |
2506
|
|
|
|
|
|
|
|
2507
|
|
|
|
|
|
|
# get current results |
2508
|
6
|
100
|
|
|
|
16
|
unless ($self->{'current_results'}) { |
2509
|
2
|
|
50
|
|
|
5
|
my $slice = shift @{ $self->{'slices'} } || undef; |
2510
|
2
|
50
|
|
|
|
8
|
return unless $slice; # no more slices, we're done |
2511
|
|
|
|
|
|
|
|
2512
|
|
|
|
|
|
|
# collect the observations |
2513
|
2
|
|
|
|
|
15
|
$useq->_load_slice($slice); |
2514
|
2
|
|
|
|
|
19
|
my $result = $useq->{buffer}{$slice}->fetch($self->start - 1, $self->end); |
2515
|
|
|
|
|
|
|
|
2516
|
|
|
|
|
|
|
# sort the observations and store the results |
2517
|
2
|
50
|
|
|
|
275
|
my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] <=> $b->[1]} @$result; |
|
303
|
|
|
|
|
522
|
|
2518
|
2
|
|
|
|
|
13
|
$self->{'current_results'} = \@sorted; |
2519
|
2
|
|
|
|
|
11
|
$self->{'current_strand'} = $useq->slice_strand($slice); |
2520
|
|
|
|
|
|
|
} |
2521
|
|
|
|
|
|
|
|
2522
|
|
|
|
|
|
|
# return next observation as a Feature object |
2523
|
6
|
|
|
|
|
10
|
my $obs = shift @{ $self->{'current_results'} }; |
|
6
|
|
|
|
|
10
|
|
2524
|
6
|
50
|
|
|
|
17
|
unless (defined $obs) { |
2525
|
|
|
|
|
|
|
# no more observations for current slice, try to move to next slice |
2526
|
0
|
|
|
|
|
0
|
undef $self->{'current_results'}; |
2527
|
0
|
|
|
|
|
0
|
return $self->_next_region; |
2528
|
|
|
|
|
|
|
} |
2529
|
|
|
|
|
|
|
return Bio::DB::USeq::Feature->new( |
2530
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2531
|
|
|
|
|
|
|
-start => $obs->[0] + 1, |
2532
|
|
|
|
|
|
|
-end => $obs->[1], |
2533
|
6
|
100
|
|
|
|
24
|
-strand => $self->{'current_strand'}, |
2534
|
|
|
|
|
|
|
-score => $obs->[2], |
2535
|
|
|
|
|
|
|
-source => $self->source, |
2536
|
|
|
|
|
|
|
-type => $self->type, |
2537
|
|
|
|
|
|
|
-name => defined $obs->[3] ? $obs->[3] : |
2538
|
|
|
|
|
|
|
join(':', $self->seq_id, $obs->[0], $obs->[1]), |
2539
|
|
|
|
|
|
|
) |
2540
|
|
|
|
|
|
|
} |
2541
|
|
|
|
|
|
|
|
2542
|
|
|
|
|
|
|
sub _next_wiggle { |
2543
|
|
|
|
|
|
|
# this will return one wiggle Feature, two if separate strands are requested |
2544
|
|
|
|
|
|
|
|
2545
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
2546
|
2
|
|
|
|
|
6
|
my $useq = $self->{'useq'}; |
2547
|
|
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
# determine which slices to retrieve |
2549
|
2
|
|
|
|
|
3
|
my $slices = shift @{ $self->{slices} }; |
|
2
|
|
|
|
|
6
|
|
2550
|
2
|
100
|
|
|
|
8
|
return unless $slices; |
2551
|
|
|
|
|
|
|
|
2552
|
|
|
|
|
|
|
# more information |
2553
|
1
|
|
|
|
|
3
|
my $start = $self->start; |
2554
|
1
|
|
|
|
|
9
|
my $stop = $self->end; |
2555
|
1
|
|
|
|
|
9
|
my $step = $self->{step}; |
2556
|
|
|
|
|
|
|
|
2557
|
|
|
|
|
|
|
# check whether we are working with binned wiggle or not |
2558
|
1
|
|
|
|
|
2
|
my @scores; |
2559
|
1
|
50
|
33
|
|
|
6
|
if ($self->{bin} and $step > 1) { |
2560
|
|
|
|
|
|
|
# we will be collecting the mean score value in bins |
2561
|
|
|
|
|
|
|
|
2562
|
|
|
|
|
|
|
# collect the scores for each bin |
2563
|
1
|
|
|
|
|
4
|
for (my $begin = $start; $begin < $stop; $begin += $step) { |
2564
|
|
|
|
|
|
|
|
2565
|
|
|
|
|
|
|
# round off coordinates to integers |
2566
|
|
|
|
|
|
|
# beginning point and step may not be integers |
2567
|
1000
|
|
|
|
|
2189
|
my $s = int($begin + 0.5); |
2568
|
1000
|
|
|
|
|
1642
|
my $e = int($s + $step - 0.5); # start + step - 1 + 0.5 |
2569
|
|
|
|
|
|
|
|
2570
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2571
|
1000
|
50
|
|
|
|
1915
|
if (scalar @$slices > 1) { |
2572
|
|
|
|
|
|
|
# more than one slice, identify which subset of slices to collect from |
2573
|
|
|
|
|
|
|
# may or may not be all of the current slices |
2574
|
1000
|
|
|
|
|
1412
|
my @sub_slices; |
2575
|
1000
|
|
|
|
|
1583
|
foreach my $slice (@$slices) { |
2576
|
2000
|
100
|
|
|
|
4135
|
next if $s > $useq->slice_end($slice); |
2577
|
1061
|
100
|
|
|
|
1965
|
next if $e < $useq->slice_start($slice); |
2578
|
1001
|
|
|
|
|
2254
|
push @sub_slices, $slice; |
2579
|
|
|
|
|
|
|
} |
2580
|
1000
|
|
|
|
|
2192
|
push @scores, $useq->_mean_score($s, $e, \@sub_slices); |
2581
|
|
|
|
|
|
|
} |
2582
|
|
|
|
|
|
|
else { |
2583
|
0
|
|
|
|
|
0
|
push @scores, $useq->_mean_score($s, $e, $slices); |
2584
|
|
|
|
|
|
|
} |
2585
|
|
|
|
|
|
|
} |
2586
|
|
|
|
|
|
|
} |
2587
|
|
|
|
|
|
|
else { |
2588
|
|
|
|
|
|
|
# otherwise we collect in one step and associate scores at bp resolution |
2589
|
|
|
|
|
|
|
# collect the scores from each of the requested slices and |
2590
|
|
|
|
|
|
|
# assemble them into a hash of values |
2591
|
|
|
|
|
|
|
|
2592
|
|
|
|
|
|
|
# correlate scores with position |
2593
|
0
|
|
|
|
|
0
|
my %pos2score; |
2594
|
0
|
|
|
|
|
0
|
foreach my $slice (@$slices) { |
2595
|
|
|
|
|
|
|
|
2596
|
|
|
|
|
|
|
# load and unpack the data |
2597
|
0
|
|
|
|
|
0
|
$useq->_load_slice($slice); |
2598
|
|
|
|
|
|
|
|
2599
|
|
|
|
|
|
|
# fetch the overlapping observations |
2600
|
|
|
|
|
|
|
# each observation is [start, stop, score] |
2601
|
0
|
|
|
|
|
0
|
my $result = $useq->{'buffer'}{$slice}->fetch($start - 1, $stop); |
2602
|
0
|
|
|
|
|
0
|
foreach my $r (@$result) { |
2603
|
0
|
|
|
|
|
0
|
foreach my $p ( $r->[0] + 1 .. $r->[1] ) { |
2604
|
0
|
|
|
|
|
0
|
$pos2score{$p} = $r->[2]; |
2605
|
|
|
|
|
|
|
} |
2606
|
|
|
|
|
|
|
} |
2607
|
|
|
|
|
|
|
} |
2608
|
|
|
|
|
|
|
|
2609
|
|
|
|
|
|
|
# convert positioned scores into an array |
2610
|
0
|
|
|
|
|
0
|
foreach (my $s = $start; $s <= $stop; $s++) { |
2611
|
0
|
0
|
|
|
|
0
|
push @scores, exists $pos2score{$s} ? $pos2score{$s} : undef; |
2612
|
|
|
|
|
|
|
# for Bio::Graphics it is better to store undef than 0 |
2613
|
|
|
|
|
|
|
# which can do wonky things with graphs |
2614
|
|
|
|
|
|
|
} |
2615
|
|
|
|
|
|
|
} |
2616
|
|
|
|
|
|
|
|
2617
|
|
|
|
|
|
|
# generate the wiggle object |
2618
|
1
|
|
50
|
|
|
8
|
my $strand = $useq->slice_strand( $slices->[0] ) || 0; |
2619
|
1
|
50
|
|
|
|
11
|
return Bio::DB::USeq::Wiggle->new( |
|
|
50
|
|
|
|
|
|
2620
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2621
|
|
|
|
|
|
|
-start => $start, |
2622
|
|
|
|
|
|
|
-end => $stop, |
2623
|
|
|
|
|
|
|
-strand => $strand, |
2624
|
|
|
|
|
|
|
-type => $self->type, |
2625
|
|
|
|
|
|
|
-source => $self->source, |
2626
|
|
|
|
|
|
|
-attributes => { 'coverage' => [ \@scores ] }, |
2627
|
|
|
|
|
|
|
-name => $strand == 1 ? 'Forward' : |
2628
|
|
|
|
|
|
|
$strand == -1 ? 'Reverse' : q(), |
2629
|
|
|
|
|
|
|
-useq => $useq, |
2630
|
|
|
|
|
|
|
); |
2631
|
|
|
|
|
|
|
} |
2632
|
|
|
|
|
|
|
|
2633
|
|
|
|
|
|
|
sub _next_summary { |
2634
|
|
|
|
|
|
|
# this will return one summary Feature, two if separate strands are requested |
2635
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
2636
|
|
|
|
|
|
|
|
2637
|
|
|
|
|
|
|
# determine which slices to retrieve |
2638
|
2
|
|
|
|
|
3
|
my $slices = shift @{ $self->{slices} }; |
|
2
|
|
|
|
|
6
|
|
2639
|
2
|
100
|
|
|
|
7
|
return unless $slices; |
2640
|
|
|
|
|
|
|
|
2641
|
|
|
|
|
|
|
# all of the real statistical work is done elsewhere |
2642
|
|
|
|
|
|
|
# just return the summary object |
2643
|
1
|
|
50
|
|
|
6
|
my $strand = $self->{useq}->slice_strand( $slices->[0] ) || 0; |
2644
|
|
|
|
|
|
|
return Bio::DB::USeq::Summary->new( |
2645
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2646
|
|
|
|
|
|
|
-start => $self->start, |
2647
|
|
|
|
|
|
|
-end => $self->end, |
2648
|
|
|
|
|
|
|
-strand => $strand, |
2649
|
|
|
|
|
|
|
-type => $self->type, |
2650
|
|
|
|
|
|
|
-source => $self->source, |
2651
|
|
|
|
|
|
|
-name => $strand == 1 ? 'Forward' : |
2652
|
|
|
|
|
|
|
$strand == -1 ? 'Reverse' : q(), |
2653
|
|
|
|
|
|
|
-useq => $self->{'useq'}, |
2654
|
|
|
|
|
|
|
-slices => $slices, |
2655
|
|
|
|
|
|
|
-bin => $self->{bin}, |
2656
|
|
|
|
|
|
|
-step => $self->{step}, |
2657
|
1
|
50
|
|
|
|
5
|
); |
|
|
50
|
|
|
|
|
|
2658
|
|
|
|
|
|
|
} |
2659
|
|
|
|
|
|
|
|
2660
|
|
|
|
|
|
|
|
2661
|
|
|
|
|
|
|
|
2662
|
|
|
|
|
|
|
package Bio::DB::USeq::Wiggle; |
2663
|
1
|
|
|
1
|
|
12
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
643
|
|
2664
|
|
|
|
|
|
|
# Wiggle scores are stored in the coverage attribute for backwards |
2665
|
|
|
|
|
|
|
# compatibility with Bio::Graphics. |
2666
|
|
|
|
|
|
|
|
2667
|
|
|
|
|
|
|
sub new { |
2668
|
1
|
|
|
1
|
|
32
|
my $class = shift; |
2669
|
1
|
|
|
|
|
15
|
my $wig = $class->SUPER::new(@_); |
2670
|
1
|
|
|
|
|
130
|
my %args = @_; |
2671
|
1
|
50
|
|
|
|
5
|
my $useq = $args{'-useq'} or |
2672
|
|
|
|
|
|
|
die "Bio::DB::USeq::Wiggle cannot be created without a -useq argument"; |
2673
|
1
|
|
|
|
|
4
|
$wig->{useq} = $useq; |
2674
|
1
|
|
|
|
|
9
|
return $wig; |
2675
|
|
|
|
|
|
|
} |
2676
|
|
|
|
|
|
|
|
2677
|
|
|
|
|
|
|
sub coverage { |
2678
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
2679
|
1
|
|
|
|
|
10
|
my ($coverage) = $self->get_tag_values('coverage'); |
2680
|
1
|
50
|
|
|
|
120
|
return wantarray ? @$coverage : $coverage; |
2681
|
|
|
|
|
|
|
} |
2682
|
|
|
|
|
|
|
|
2683
|
|
|
|
|
|
|
sub wiggle { |
2684
|
1
|
|
|
1
|
|
131
|
return shift->coverage; |
2685
|
|
|
|
|
|
|
} |
2686
|
|
|
|
|
|
|
|
2687
|
|
|
|
|
|
|
# Borrowed from Bio::SeqFeature::Coverage from Bio::DB::Sam |
2688
|
|
|
|
|
|
|
sub gff3_string { |
2689
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2690
|
0
|
|
|
|
|
0
|
my $gff3 = $self->SUPER::gff3_string(@_); |
2691
|
0
|
|
|
|
|
0
|
my $coverage = $self->escape(join(',',$self->coverage)); |
2692
|
0
|
|
|
|
|
0
|
$gff3 =~ s/coverage=[^;]+/coverage=$coverage/g; |
2693
|
0
|
|
|
|
|
0
|
return $gff3; |
2694
|
|
|
|
|
|
|
} |
2695
|
|
|
|
|
|
|
|
2696
|
|
|
|
|
|
|
sub statistical_summary { |
2697
|
|
|
|
|
|
|
# this is just for the wiggle scores, not original data |
2698
|
|
|
|
|
|
|
# This is a fake statistical_summary to fool Bio::Graphics into |
2699
|
|
|
|
|
|
|
# calculating chromosome or global statistics like a BigWig adaptor. |
2700
|
|
|
|
|
|
|
# A real statistical_summary call would provide the number of bins, |
2701
|
|
|
|
|
|
|
# so return null if bins is present. |
2702
|
|
|
|
|
|
|
# We could calculate a real statistical summary, but that would be an |
2703
|
|
|
|
|
|
|
# expensive circuitous route after we just collected wiggle scores. |
2704
|
|
|
|
|
|
|
# Better to request the correct feature type in the first place. |
2705
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2706
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2707
|
0
|
0
|
0
|
|
|
0
|
return if $bins && $bins > 1; |
2708
|
|
|
|
|
|
|
|
2709
|
|
|
|
|
|
|
# initialize the statistical scores |
2710
|
0
|
|
|
|
|
0
|
my $count = 0; |
2711
|
0
|
|
|
|
|
0
|
my $sum; |
2712
|
|
|
|
|
|
|
my $sum_squares; |
2713
|
0
|
|
|
|
|
0
|
my $min; |
2714
|
0
|
|
|
|
|
0
|
my $max; |
2715
|
|
|
|
|
|
|
|
2716
|
|
|
|
|
|
|
# generate a statistical summary of just the wiggle scores |
2717
|
0
|
|
|
|
|
0
|
foreach my $s ($self->coverage) { |
2718
|
0
|
|
|
|
|
0
|
$count++; |
2719
|
0
|
0
|
|
|
|
0
|
next unless defined $s; |
2720
|
0
|
|
|
|
|
0
|
$sum += $s; |
2721
|
0
|
|
|
|
|
0
|
$sum_squares += $s * $s; |
2722
|
0
|
0
|
0
|
|
|
0
|
$min = $s if (!defined $min or $s < $min); |
2723
|
0
|
0
|
0
|
|
|
0
|
$max = $s if (!defined $max or $s > $max); |
2724
|
|
|
|
|
|
|
} |
2725
|
|
|
|
|
|
|
|
2726
|
|
|
|
|
|
|
# return the statistical hash |
2727
|
0
|
|
0
|
|
|
0
|
my %stat = ( |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
2728
|
|
|
|
|
|
|
'validCount' => $count, |
2729
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
2730
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
2731
|
|
|
|
|
|
|
'minVal' => $min || 0, |
2732
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
2733
|
|
|
|
|
|
|
); |
2734
|
0
|
|
|
|
|
0
|
return \%stat; |
2735
|
|
|
|
|
|
|
} |
2736
|
|
|
|
|
|
|
|
2737
|
|
|
|
|
|
|
sub get_seq_stream { |
2738
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2739
|
|
|
|
|
|
|
return Bio::DB::USeq::Iterator->new( |
2740
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2741
|
|
|
|
|
|
|
-start => $self->start, |
2742
|
|
|
|
|
|
|
-end => $self->end, |
2743
|
|
|
|
|
|
|
-strand => $self->strand, |
2744
|
|
|
|
|
|
|
-type => 'region', |
2745
|
|
|
|
|
|
|
-source => $self->source, |
2746
|
|
|
|
|
|
|
-useq => $self->{useq}, |
2747
|
0
|
|
|
|
|
0
|
); |
2748
|
|
|
|
|
|
|
} |
2749
|
|
|
|
|
|
|
|
2750
|
|
|
|
|
|
|
|
2751
|
|
|
|
|
|
|
|
2752
|
|
|
|
|
|
|
|
2753
|
|
|
|
|
|
|
package Bio::DB::USeq::Summary; |
2754
|
1
|
|
|
1
|
|
8
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
786
|
|
2755
|
|
|
|
|
|
|
|
2756
|
|
|
|
|
|
|
sub new { |
2757
|
1
|
|
|
1
|
|
27
|
my $class = shift; |
2758
|
1
|
|
|
|
|
11
|
my $summary = $class->SUPER::new(@_); |
2759
|
1
|
|
|
|
|
87
|
my %args = @_; |
2760
|
1
|
50
|
|
|
|
5
|
my $useq = $args{'-useq'} or |
2761
|
|
|
|
|
|
|
die "Bio::DB::USeq::Summary cannot be created without a -useq argument"; |
2762
|
1
|
|
|
|
|
3
|
$summary->{useq} = $useq; |
2763
|
1
|
50
|
|
|
|
4
|
$summary->{slices} = $args{-slices} if exists $args{-slices}; |
2764
|
1
|
50
|
|
|
|
5
|
$summary->{bin} = $args{-bin} if exists $args{-bin}; |
2765
|
1
|
50
|
|
|
|
4
|
$summary->{step} = $args{-step} if exists $args{-step}; |
2766
|
1
|
|
|
|
|
7
|
return $summary; |
2767
|
|
|
|
|
|
|
} |
2768
|
|
|
|
|
|
|
|
2769
|
|
|
|
|
|
|
sub statistical_summary { |
2770
|
1
|
|
|
1
|
|
171
|
my $self = shift; |
2771
|
1
|
|
|
|
|
3
|
my $useq = $self->{useq}; |
2772
|
|
|
|
|
|
|
|
2773
|
|
|
|
|
|
|
# get the number of bins to calculate the statistical summaries |
2774
|
1
|
|
|
|
|
3
|
my $bin = shift; |
2775
|
1
|
50
|
33
|
|
|
5
|
$bin ||= $self->{bin} if exists $self->{bin}; |
2776
|
1
|
|
50
|
|
|
4
|
$bin ||= 1; |
2777
|
1
|
50
|
|
|
|
4
|
my $step = $self->{step} if exists $self->{step}; |
2778
|
1
|
|
33
|
|
|
13
|
$step ||= $self->length / $bin; |
2779
|
|
|
|
|
|
|
|
2780
|
|
|
|
|
|
|
# get the slices |
2781
|
1
|
50
|
|
|
|
30
|
my $slices = $self->{slices} if exists $self->{slices}; |
2782
|
1
|
50
|
|
|
|
4
|
unless ($slices) { |
2783
|
|
|
|
|
|
|
# this should already be established, but just in case |
2784
|
0
|
|
|
|
|
0
|
my @a = $useq->_translate_coordinates_to_slices($self->seq_id, |
2785
|
|
|
|
|
|
|
$self->start, $self->end, $self->strand); |
2786
|
0
|
|
|
|
|
0
|
$useq->_clear_buffer(\@a); |
2787
|
0
|
|
|
|
|
0
|
$slices = \@a; |
2788
|
|
|
|
|
|
|
} |
2789
|
|
|
|
|
|
|
|
2790
|
|
|
|
|
|
|
# collect the statistical summaries for each bin |
2791
|
1
|
|
|
|
|
378
|
my @summaries; |
2792
|
1
|
|
|
|
|
6
|
for (my $begin = $self->start; $begin < $self->end; $begin += $step) { |
2793
|
|
|
|
|
|
|
|
2794
|
|
|
|
|
|
|
# round off coordinates to integers |
2795
|
|
|
|
|
|
|
# beginning point and step may not be integers |
2796
|
10
|
|
|
|
|
131
|
my $s = int($begin + 0.5); |
2797
|
10
|
|
|
|
|
25
|
my $e = int($s + $step - 0.5); # start + step - 1 + 0.5 |
2798
|
|
|
|
|
|
|
|
2799
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2800
|
10
|
50
|
|
|
|
31
|
if (scalar @$slices > 1) { |
2801
|
|
|
|
|
|
|
# more than one slice, identify which subset of slices to collect from |
2802
|
|
|
|
|
|
|
# may or may not be all of the current slices |
2803
|
10
|
|
|
|
|
17
|
my @sub_slices; |
2804
|
10
|
|
|
|
|
22
|
foreach my $slice (@$slices) { |
2805
|
20
|
50
|
|
|
|
47
|
next if $s > $useq->slice_end($slice); |
2806
|
20
|
100
|
|
|
|
42
|
next if $e < $useq->slice_start($slice); |
2807
|
11
|
|
|
|
|
27
|
push @sub_slices, $slice; |
2808
|
|
|
|
|
|
|
} |
2809
|
10
|
|
|
|
|
50
|
push @summaries, $useq->_stat_summary($s, $e, \@sub_slices); |
2810
|
|
|
|
|
|
|
} |
2811
|
|
|
|
|
|
|
else { |
2812
|
0
|
|
|
|
|
0
|
push @summaries, $useq->_stat_summary($s, $e, $slices); |
2813
|
|
|
|
|
|
|
} |
2814
|
|
|
|
|
|
|
} |
2815
|
|
|
|
|
|
|
|
2816
|
|
|
|
|
|
|
# return the reference to the statistical summaries |
2817
|
1
|
|
|
|
|
19
|
return \@summaries; |
2818
|
|
|
|
|
|
|
} |
2819
|
|
|
|
|
|
|
|
2820
|
|
|
|
|
|
|
sub score { |
2821
|
0
|
|
|
0
|
|
|
my $self = shift; |
2822
|
0
|
|
|
|
|
|
my $a = $self->statistical_summary(1); |
2823
|
0
|
|
|
|
|
|
return $a->[0]; |
2824
|
|
|
|
|
|
|
} |
2825
|
|
|
|
|
|
|
|
2826
|
|
|
|
|
|
|
sub gff3_string { |
2827
|
|
|
|
|
|
|
# this is going to be a little convoluted, since what we |
2828
|
|
|
|
|
|
|
# really want here is coverage, which is easier to calculate with means, |
2829
|
|
|
|
|
|
|
# rather than doing statistical summaries and calculate means from those |
2830
|
0
|
|
|
0
|
|
|
my $self = shift; |
2831
|
|
|
|
|
|
|
my ($wig) = $self->{useq}->features( |
2832
|
|
|
|
|
|
|
# this should only return one feature because we have specific strand |
2833
|
0
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2834
|
|
|
|
|
|
|
-start => $self->start, |
2835
|
|
|
|
|
|
|
-end => $self->end, |
2836
|
|
|
|
|
|
|
-strand => $self->strand, |
2837
|
|
|
|
|
|
|
-type => 'coverage:1000', |
2838
|
|
|
|
|
|
|
); |
2839
|
0
|
|
|
|
|
|
return $wig->gff3_string; |
2840
|
|
|
|
|
|
|
} |
2841
|
|
|
|
|
|
|
|
2842
|
|
|
|
|
|
|
sub get_seq_stream { |
2843
|
0
|
|
|
0
|
|
|
my $self = shift; |
2844
|
|
|
|
|
|
|
return Bio::DB::USeq::Iterator->new( |
2845
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2846
|
|
|
|
|
|
|
-start => $self->start, |
2847
|
|
|
|
|
|
|
-end => $self->end, |
2848
|
|
|
|
|
|
|
-strand => $self->strand, |
2849
|
|
|
|
|
|
|
-type => 'region', |
2850
|
|
|
|
|
|
|
-source => $self->source, |
2851
|
|
|
|
|
|
|
-useq => $self->{useq}, |
2852
|
0
|
|
|
|
|
|
); |
2853
|
|
|
|
|
|
|
} |
2854
|
|
|
|
|
|
|
|
2855
|
|
|
|
|
|
|
=head1 AUTHOR |
2856
|
|
|
|
|
|
|
|
2857
|
|
|
|
|
|
|
Timothy J. Parnell, PhD |
2858
|
|
|
|
|
|
|
Dept of Oncological Sciences |
2859
|
|
|
|
|
|
|
Huntsman Cancer Institute |
2860
|
|
|
|
|
|
|
University of Utah |
2861
|
|
|
|
|
|
|
Salt Lake City, UT, 84112 |
2862
|
|
|
|
|
|
|
|
2863
|
|
|
|
|
|
|
This package is free software; you can redistribute it and/or modify |
2864
|
|
|
|
|
|
|
it under the terms of the Artistic License 2.0. |