line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::DB::USeq; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.26'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Bio::DB::USeq - Read USeq archive database files |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Bio::DB::USeq; |
12
|
|
|
|
|
|
|
my $useq = Bio::DB::USeq->new('file.useq') or |
13
|
|
|
|
|
|
|
die "unable to open file.useq!\n"; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# sequence IDs |
16
|
|
|
|
|
|
|
my @seq_ids = $useq->seq_ids; |
17
|
|
|
|
|
|
|
my $length = $useq->length($chr); # approximate, not exact |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
### Retrieving features |
20
|
|
|
|
|
|
|
# all features or observations across chromosome I |
21
|
|
|
|
|
|
|
my @features = $useq->features( |
22
|
|
|
|
|
|
|
-seq_id => 'chrI', |
23
|
|
|
|
|
|
|
-type => 'region', |
24
|
|
|
|
|
|
|
); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# same thing using a simple form |
27
|
|
|
|
|
|
|
# use an array of (chromosome, start, stop, strand) |
28
|
|
|
|
|
|
|
my @features = $useq->features('chrI'); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# same thing with a memory efficient iterator |
31
|
|
|
|
|
|
|
my $iterator = $useq->get_seq_stream( |
32
|
|
|
|
|
|
|
-seq_id => 'chrI', |
33
|
|
|
|
|
|
|
); |
34
|
|
|
|
|
|
|
while (my $f = $iterator->next_seq) { |
35
|
|
|
|
|
|
|
# each feature $f supports most SeqFeatureI methods |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
### Retrieving simple scores |
40
|
|
|
|
|
|
|
my @scores = $useq->scores( |
41
|
|
|
|
|
|
|
-seq_id => 'chrI', |
42
|
|
|
|
|
|
|
-start => 1000, |
43
|
|
|
|
|
|
|
-end => 2000 |
44
|
|
|
|
|
|
|
); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
### Same methods as above after defining an interval first |
48
|
|
|
|
|
|
|
my $segment = $useq->segment( |
49
|
|
|
|
|
|
|
-seq_id => 'chrI', |
50
|
|
|
|
|
|
|
-start => 1000, |
51
|
|
|
|
|
|
|
-end => 2000, |
52
|
|
|
|
|
|
|
); |
53
|
|
|
|
|
|
|
my @scores = $segment->scores; |
54
|
|
|
|
|
|
|
my @features = $segment->features; |
55
|
|
|
|
|
|
|
my $iterator = $segment->get_seq_stream; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
### Efficient retrieval of positioned scores in 100 bins |
59
|
|
|
|
|
|
|
# compatible with Bio::Graphics |
60
|
|
|
|
|
|
|
my ($wig) = $useq->features( |
61
|
|
|
|
|
|
|
# assuming unstranded data here, otherwise two wiggle objects |
62
|
|
|
|
|
|
|
# would be returned, one for each strand |
63
|
|
|
|
|
|
|
-seq_id => 'chrI', |
64
|
|
|
|
|
|
|
-start => 1000, |
65
|
|
|
|
|
|
|
-end => 2000, |
66
|
|
|
|
|
|
|
-type => 'wiggle:100', |
67
|
|
|
|
|
|
|
); |
68
|
|
|
|
|
|
|
my @bins = $wig->wiggle; |
69
|
|
|
|
|
|
|
my @bins = $wig->coverage; # same thing |
70
|
|
|
|
|
|
|
my ($bins) = $wig->get_tag_values('coverage'); # same thing |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
### Statistical summaries of intervals |
74
|
|
|
|
|
|
|
# compatible with Bio::Graphics |
75
|
|
|
|
|
|
|
my ($summary) = $useq->features( |
76
|
|
|
|
|
|
|
# assuming unstranded data here, otherwise two summaries |
77
|
|
|
|
|
|
|
# would be returned, one for each strand |
78
|
|
|
|
|
|
|
-seq_id => 'chrI', |
79
|
|
|
|
|
|
|
-start => 1000, |
80
|
|
|
|
|
|
|
-end => 2000, |
81
|
|
|
|
|
|
|
-type => 'summary', |
82
|
|
|
|
|
|
|
); |
83
|
|
|
|
|
|
|
my $stats = $summary->statistical_summary(10); |
84
|
|
|
|
|
|
|
foreach (@$stats) { |
85
|
|
|
|
|
|
|
my $max = $_->{maxVal}; |
86
|
|
|
|
|
|
|
my $mean = Bio::DB::USeq::binMean($_); |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
### Stranded data using an iterator |
91
|
|
|
|
|
|
|
# could be used with either wiggle or summary features |
92
|
|
|
|
|
|
|
my $stream = $useq->get_seq_stream( |
93
|
|
|
|
|
|
|
-seq_id => 'chrI', |
94
|
|
|
|
|
|
|
-start => 1000, |
95
|
|
|
|
|
|
|
-end => 2000, |
96
|
|
|
|
|
|
|
-type => 'wiggle:100', |
97
|
|
|
|
|
|
|
); |
98
|
|
|
|
|
|
|
my ($forward, $reverse); |
99
|
|
|
|
|
|
|
if ($useq->stranded) { |
100
|
|
|
|
|
|
|
$forward = $stream->next_seq; |
101
|
|
|
|
|
|
|
$reverse = $stream->next_seq; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
else { |
104
|
|
|
|
|
|
|
$forward = $stream->next_seq; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head1 DESCRIPTION |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Bio::DB::USeq is a B style adaptor for reading USeq files. USeq files |
111
|
|
|
|
|
|
|
are compressed, indexed data files supporting modern bioinformatic datasets, |
112
|
|
|
|
|
|
|
including genomic points, scores, and intervals. More information about the |
113
|
|
|
|
|
|
|
USeq software package can be found at L, |
114
|
|
|
|
|
|
|
including a description of the USeq data |
115
|
|
|
|
|
|
|
L. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
USeq files are typically half the size of corresponding bigBed and bigWig files, |
118
|
|
|
|
|
|
|
due to a compact internal format and lack of internal zoom data. This adaptor, |
119
|
|
|
|
|
|
|
however, can still return statistics across different zoom levels in the same |
120
|
|
|
|
|
|
|
manner as big files, albeit at a cost of calculating these in realtime. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 Generating useq files |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
USeq files may be generated using tools from the USeq package, available at |
125
|
|
|
|
|
|
|
L. They may be generated from |
126
|
|
|
|
|
|
|
native Bar files, text Wig files, text Bed files, and UCSC bigWig and bigBed file |
127
|
|
|
|
|
|
|
formats. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head2 Compatibility |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
The adaptor follows most conventions of other B-style Bio::DB |
132
|
|
|
|
|
|
|
adaptors. Observations or features in the useq file archive are |
133
|
|
|
|
|
|
|
returned as SeqFeatureI compatible objects. |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Coordinates consumed and returned by the adaptor are 1-based, consistent |
136
|
|
|
|
|
|
|
with B convention. This is not true of the useq file itself, which |
137
|
|
|
|
|
|
|
uses the interbase coordinate system. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
Unlike wig and bigWig files, useq file archives support stranded data, |
140
|
|
|
|
|
|
|
which can make data collection much simpler for complex experiments. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
See below for GBrowse compatibility. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head2 Limitations |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
This adaptor is read only. USeq files, in general, are not modified |
147
|
|
|
|
|
|
|
or written with data. The exceptions are chromosome or global statistics |
148
|
|
|
|
|
|
|
are written to the F file inside the zip archive to |
149
|
|
|
|
|
|
|
cache for future queries. |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
No support for genomic sequence is included. Users who need access to |
152
|
|
|
|
|
|
|
genomic sequence should seek an alternative B adaptor, such as |
153
|
|
|
|
|
|
|
L. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Useq files do not have a native concept of type, primary_tag, or source |
156
|
|
|
|
|
|
|
attributes, as expected with GFF-based database adaptors. The features |
157
|
|
|
|
|
|
|
method does support special types (see below). |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head2 Requirements |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
The adaptor is a Perl-only implementation. It only requires the |
162
|
|
|
|
|
|
|
L module for opening and reading the useq file archive. |
163
|
|
|
|
|
|
|
B is required for working with SeqFeatures objects generated |
164
|
|
|
|
|
|
|
from useq file observations. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head1 METHODS |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head2 Initializing the Bio::DB::USeq object |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
These are class methods for creating and working with the Bio::DB::USeq |
171
|
|
|
|
|
|
|
object itself. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=over 4 |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item new |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=item new($file) |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=item new(-useq => $file) |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
This will create a new Bio::DB::USeq object. Optionally pass the path |
182
|
|
|
|
|
|
|
to a useq file archive to open immediately. There is not much to do |
183
|
|
|
|
|
|
|
unless you open a file. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Named arguments may be used to specify the file. Either -useq or -file |
186
|
|
|
|
|
|
|
may be used. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=item open($file) |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Open a useq file archive. Useq files typically use the F<.useq> file |
191
|
|
|
|
|
|
|
extension. Returns true if successful. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
DO NOT open a subsequent useq file archive when one has already been |
194
|
|
|
|
|
|
|
opened. Please create a new object to open a new file. |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=item clone |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
Force the object to re-open the useq archive file under a new file |
199
|
|
|
|
|
|
|
handle to make it clone safe. Do this in the child process before |
200
|
|
|
|
|
|
|
collecting data. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=item zip |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Return the L object representing the useq file archive. |
205
|
|
|
|
|
|
|
Generally not recommended unless you know what you are doing. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=back |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head2 General information about the useq file |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
These are class methods for obtaining general information or |
212
|
|
|
|
|
|
|
metadata attributes regarding the contents of the file. |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=over 4 |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item stranded |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Return true or false (1 or 0) indicating whether the contents of the |
219
|
|
|
|
|
|
|
useq file archive are recorded in a stranded fashion. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=item attributes |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Return an array of available attribute keys that were recorded in the |
224
|
|
|
|
|
|
|
useq file F member. These are key = value pairs |
225
|
|
|
|
|
|
|
representing metadata for the useq file. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=item attribute($key) |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Return the metadata attribute value for the specified key. These |
230
|
|
|
|
|
|
|
are recorded in the useq file F member. Returns |
231
|
|
|
|
|
|
|
undefined if the key does not exist. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=item type |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
Return the useq file metadata C value. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=item genome |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
Return the useq file metadata C value. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=item version |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
Return the useq file metadata C value. |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=back |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=head2 Chromosome Information |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
These are methods to obtain information about the chromosomes or reference |
250
|
|
|
|
|
|
|
sequences represented in the file archive. |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
B that generating score statistics across one or more chromosomes may |
253
|
|
|
|
|
|
|
be computationally expensive. Therefore, chromosome statistics, once |
254
|
|
|
|
|
|
|
calculated, are cached in the useq file metadata for future reference. |
255
|
|
|
|
|
|
|
This necessitates writing to the useq zip archive. This is currently the |
256
|
|
|
|
|
|
|
only supported method for modifying the useq zip archive. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=over 4 |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=item seq_ids |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Return an array of the chromosome or sequence identifiers represented |
263
|
|
|
|
|
|
|
in the useq file archive. The names are sorted ASCIIbetically before |
264
|
|
|
|
|
|
|
they are returned. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item length($seq_id) |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
Return the length of the provided chromosome or sequence identifier. |
269
|
|
|
|
|
|
|
Note that this may not be the actual length in the reference genome, but |
270
|
|
|
|
|
|
|
rather the last recorded position of an observation for that chromosome. |
271
|
|
|
|
|
|
|
Hence, it should be used only as an approximation. |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=item chr_mean($seq_id) |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Return the mean score value across the entire chromosome. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=item chr_stdev($seq_id) |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Return the score standard deviation across the entire chromosome. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=item chr_stats($seq_id) |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Return a statistical summary across the entire chromosome. This |
284
|
|
|
|
|
|
|
is an anonymous hash with five keys: validCount, sumData, |
285
|
|
|
|
|
|
|
sumSquares, minVal, and maxVal. These are equivalent to the |
286
|
|
|
|
|
|
|
statistical summaries generated by the Bio::DB::BigWig adaptor. |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
=item global_mean |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
Return the mean score value across all chromosomes. |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=item global_stdev |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Return the mean score value across all chromosomes. |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=item global_stats |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Return a statistical summary across all chromosomes. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=back |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head2 Data accession |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
These are the primary methods for working with data contained in |
305
|
|
|
|
|
|
|
useq file archive. These should be familiar to most B users. |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=over 4 |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=item features |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
Returns an array or array reference of SeqFeatureI compatible |
312
|
|
|
|
|
|
|
objects overlapping a given genomic interval. |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Coordinates of the interrogated regions must be supplied. At a |
315
|
|
|
|
|
|
|
minimum, the seq_id must be supplied. A start of 1 and an end |
316
|
|
|
|
|
|
|
corresponding to the length of the seq_id is used if not directly |
317
|
|
|
|
|
|
|
provided. Coordinates may be specified in two different manners, |
318
|
|
|
|
|
|
|
either as a list of (seq_id, start, end, strand) or as one or |
319
|
|
|
|
|
|
|
more keyed values. |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
my @features = $useq->features($seq_id, $start, $end); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
@features = $useq->features( |
324
|
|
|
|
|
|
|
-seq_id => $seq_id, |
325
|
|
|
|
|
|
|
-start => $start, |
326
|
|
|
|
|
|
|
-end => $end, |
327
|
|
|
|
|
|
|
); |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
If the -iterator argument is supplied with a true value, then an |
330
|
|
|
|
|
|
|
iterator object is returned. See get_seq_stream() for details. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
Bio::DB::USeq supports four different feature types. Feature |
333
|
|
|
|
|
|
|
types may be specified using the -type argument. |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=over 4 |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
=item region |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
=item interval |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=item observation |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
The default feature type if the type argument is not specified. |
344
|
|
|
|
|
|
|
These are SeqFeatureI compatible objects representing observations |
345
|
|
|
|
|
|
|
in the useq file archive. These are compatible with the iterator. |
346
|
|
|
|
|
|
|
SeqFeature observations are returned in genomic order. |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=item chromosome |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Returns SeqFeatureI compatible objects representing the |
351
|
|
|
|
|
|
|
reference sequences (chromosomes) listed in the useq file |
352
|
|
|
|
|
|
|
archive. These are not compatibile with the iterator. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=item wiggle |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=item wiggle:$bins |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=item coverage |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
=item coverage:$bins |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
Returns an array of SeqFeatureI compatible objects for each |
363
|
|
|
|
|
|
|
strand requested. If the useq file contains stranded data, |
364
|
|
|
|
|
|
|
and no strand is requested, then two objects will be |
365
|
|
|
|
|
|
|
returned representing each strand. |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Each object contains an array representing scores across |
368
|
|
|
|
|
|
|
the requested coordinates. This object is designed to be |
369
|
|
|
|
|
|
|
backwards compatible with coverage features from the |
370
|
|
|
|
|
|
|
L adaptor for use with L and GBrowse. |
371
|
|
|
|
|
|
|
Note that only scores are returned, not a true depth coverage |
372
|
|
|
|
|
|
|
in the sense of the L coverage types. |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
By default, the wiggle or coverage array is provided at 1 |
375
|
|
|
|
|
|
|
bp resolution. To improve efficiency with large regions, |
376
|
|
|
|
|
|
|
the wiggle array may be limited by using a bin option, |
377
|
|
|
|
|
|
|
where the interrogated interval is divided into the number |
378
|
|
|
|
|
|
|
of bins requested. |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
To retrieve the scores, call the wiggle() or coverage() method. |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
For example, to request wiggle scores in 100 equal bins across |
383
|
|
|
|
|
|
|
the interval, see the following example. The wiggle and |
384
|
|
|
|
|
|
|
coverage types are synonymous. |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
my ($wiggle) = $useq->features( |
387
|
|
|
|
|
|
|
-seq_id => $chromosome, |
388
|
|
|
|
|
|
|
-start => $start, |
389
|
|
|
|
|
|
|
-end => $end, |
390
|
|
|
|
|
|
|
-type => 'wiggle:100', |
391
|
|
|
|
|
|
|
); |
392
|
|
|
|
|
|
|
my @scores = $wiggle->wiggle; |
393
|
|
|
|
|
|
|
@scores = $wiggle->coverage; |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
Wiggle objects may also be obtained with a get_seq_stream() |
396
|
|
|
|
|
|
|
iterator objects. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=item summary |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=item summary:$bins |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
Returns an array of SeqFeatureI compatibile Summary objects |
403
|
|
|
|
|
|
|
for each strand requested. If the useq file contains stranded |
404
|
|
|
|
|
|
|
data, and no strand is requested, then two objects will be |
405
|
|
|
|
|
|
|
returned representing each strand. |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
Each Summary object can then be used to call statistical |
408
|
|
|
|
|
|
|
summaries for one or more bins across the interval. Each |
409
|
|
|
|
|
|
|
statistical summary is an anonymous hash with five keys: |
410
|
|
|
|
|
|
|
validCount, sumData, sumSquares, minVal, and maxVal. From |
411
|
|
|
|
|
|
|
these values, a mean and standard deviation may also be |
412
|
|
|
|
|
|
|
calculated. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
my ($summary) = $useq->features( |
415
|
|
|
|
|
|
|
-seq_id => $chromosome, |
416
|
|
|
|
|
|
|
-start => $start, |
417
|
|
|
|
|
|
|
-end => $end, |
418
|
|
|
|
|
|
|
-type => 'summary', |
419
|
|
|
|
|
|
|
); |
420
|
|
|
|
|
|
|
my @stats = $summary->statistical_summary(100); |
421
|
|
|
|
|
|
|
foreach my $stat (@stats) { |
422
|
|
|
|
|
|
|
my $count = $stat->{validCount}; |
423
|
|
|
|
|
|
|
my $sum = $stat->{sumData}; |
424
|
|
|
|
|
|
|
my $mean = $sum / $count; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
Statistical summaries are equivalent to those generated by the |
428
|
|
|
|
|
|
|
L adaptor and may be used interchangeably. They |
429
|
|
|
|
|
|
|
are compatible with the L modules. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Summary objects may also be obtained with a get_seq_stream() |
432
|
|
|
|
|
|
|
iterator object. |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=back |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item get_seq_stream |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
This is a memory efficient data accessor. An iterator object is |
439
|
|
|
|
|
|
|
returned for an interval specified using coordinate values in the |
440
|
|
|
|
|
|
|
same manner as features(). Call the method next_seq() on the |
441
|
|
|
|
|
|
|
iterator object to retrieve the observation SeqFeature objects |
442
|
|
|
|
|
|
|
one at a time. The iterator is compatible with region, wiggle, |
443
|
|
|
|
|
|
|
and summary feature types. |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# establish the iterator object |
446
|
|
|
|
|
|
|
my $iterator = $useq->get_seq_stream( |
447
|
|
|
|
|
|
|
-seq_id => $seq_id, |
448
|
|
|
|
|
|
|
-start => $start, |
449
|
|
|
|
|
|
|
-end => $end, |
450
|
|
|
|
|
|
|
-type => 'region', |
451
|
|
|
|
|
|
|
); |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# retrieve the features one at a time |
454
|
|
|
|
|
|
|
while (my $f = $iterator->next_seq) { |
455
|
|
|
|
|
|
|
# each feature $f is either a |
456
|
|
|
|
|
|
|
# Bio::DB::USeq::Feature, |
457
|
|
|
|
|
|
|
# a Bio::DB::USeq::Wiggle, or |
458
|
|
|
|
|
|
|
# a Bio::DB::USeq::Summary object |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=item scores |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
This is a simplified data accessor that only returns the score |
464
|
|
|
|
|
|
|
values overlapping an interrogated interval, rather than |
465
|
|
|
|
|
|
|
assembling each observation into a SeqFeature object. The scores |
466
|
|
|
|
|
|
|
are not associated with genomic coordinates, and are not guaranteed |
467
|
|
|
|
|
|
|
to be returned in original genomic order. |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Provide the interval coordinates in the same manner as the |
470
|
|
|
|
|
|
|
features() method. Stranded data collection is supported. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
my @scores = $useq->scores( |
473
|
|
|
|
|
|
|
-seq_id => $seq_id, |
474
|
|
|
|
|
|
|
-start => $start, |
475
|
|
|
|
|
|
|
-end => $end, |
476
|
|
|
|
|
|
|
); |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=item observations |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
This is a low-level data accessor, similar to features() but returning |
481
|
|
|
|
|
|
|
an array of references to the original USeq observations. Each USeq |
482
|
|
|
|
|
|
|
observation is an anonymous array reference of 2-4 elements: start, stop, |
483
|
|
|
|
|
|
|
score, and text, depending on the stored data type. All coordinates are in |
484
|
|
|
|
|
|
|
0-based coordinates, unlike high level accessors. Note that while strand is |
485
|
|
|
|
|
|
|
supported, it is not reported for each observation. If observations exist on |
486
|
|
|
|
|
|
|
both strands, then each strand should be searched separately. Observations |
487
|
|
|
|
|
|
|
are not guaranteed to be returned in genomic order. |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
my @observations = $useq->observations( |
490
|
|
|
|
|
|
|
-seq_id => $seq_id, |
491
|
|
|
|
|
|
|
-start => $start, |
492
|
|
|
|
|
|
|
-end => $end, |
493
|
|
|
|
|
|
|
); |
494
|
|
|
|
|
|
|
foreach my $obs (@observations) { |
495
|
|
|
|
|
|
|
my $start = $obs->[0] + 1; # convert to 1-based coordinate |
496
|
|
|
|
|
|
|
my $stop = $obs->[1]; |
497
|
|
|
|
|
|
|
my $score = $obs->[2]; # may not be present |
498
|
|
|
|
|
|
|
my $text = $obs->[3]; # may not be present |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item segment |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
This returns a L object, which is a SeqFeatureI |
504
|
|
|
|
|
|
|
compatible segment object corresponding to the specified coordinates. |
505
|
|
|
|
|
|
|
From this object, one can call the features(), scores(), or |
506
|
|
|
|
|
|
|
get_seq_stream() methods directly. Keyed options or location |
507
|
|
|
|
|
|
|
information need not be provided. Stranded segments are |
508
|
|
|
|
|
|
|
supported. |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
my $segment = $useq->segment( |
511
|
|
|
|
|
|
|
-seq_id => $seq_id, |
512
|
|
|
|
|
|
|
-start => $start, |
513
|
|
|
|
|
|
|
-end => $end, |
514
|
|
|
|
|
|
|
); |
515
|
|
|
|
|
|
|
my @scores = $segment->scores; |
516
|
|
|
|
|
|
|
my @features = $segment->features('wiggle:100'); |
517
|
|
|
|
|
|
|
my $iterator = $segment->get_seq_stream('region'); |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=item get_features_by_location |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
Convenience method for returning features restricted by location. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item get_feature_by_id |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=item get_feature_by_name |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
Compatibility methods for returning a specific feature or |
528
|
|
|
|
|
|
|
observation in the USeq file. Text fields, if present, are not |
529
|
|
|
|
|
|
|
indexed in the USeq file, preventing efficient searching of names. |
530
|
|
|
|
|
|
|
As a workaround, an ID or name comprised of "$seq_id:$start:$end" |
531
|
|
|
|
|
|
|
may be used, although a direct search of coordinates would be |
532
|
|
|
|
|
|
|
more efficient. |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
my $feature = $useq->get_feature_by_id("$seq_id:$start:$end"); |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=back |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=head1 ADDITIONAL CLASSES |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
These are additional class object that may be returned by various |
542
|
|
|
|
|
|
|
methods above. |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Feature objects |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
These are SeqFeatureI compliant objects returned by the features() |
547
|
|
|
|
|
|
|
or next_seq() methods. They support the following methods. |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
seq_id |
550
|
|
|
|
|
|
|
start |
551
|
|
|
|
|
|
|
end |
552
|
|
|
|
|
|
|
strand |
553
|
|
|
|
|
|
|
score |
554
|
|
|
|
|
|
|
type |
555
|
|
|
|
|
|
|
source (returns the useq archive filename) |
556
|
|
|
|
|
|
|
name (chromosome:start..stop) |
557
|
|
|
|
|
|
|
Bio::RangeI methods |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
Additionally, chromosome and global statistics are also available |
560
|
|
|
|
|
|
|
from any feature, as well as from Segment, Wiggle, Iterator, and |
561
|
|
|
|
|
|
|
Summary objects. See the corresponding USeq methods for details. |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Segment objects |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
This is a SeqFeatureI compliant object representing a genomic segment |
566
|
|
|
|
|
|
|
or interval. These support the following methods. |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=over 4 |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=item features |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=item features($type) |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=item get_seq_stream |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=item get_seq_stream($type) |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=item scores |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=item observations |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
Direct methods for returning features or scores. Coordinate information |
583
|
|
|
|
|
|
|
need not be provided. See the corresponding Bio::DB::USeq methods for |
584
|
|
|
|
|
|
|
more information. |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
=item wiggle |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=item wiggle($bins) |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=item coverage |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=item coverage($bins) |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=item statistical_summary |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=item statistical_summary($bins) |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
Convenience methods for returning wiggle (coverage) or summary features |
599
|
|
|
|
|
|
|
over the segment. If desired, the number of bins may be specified. See |
600
|
|
|
|
|
|
|
the features() method for more information. |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=item slices |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
Returns an array of splice member names that overlap this segment. |
605
|
|
|
|
|
|
|
See L for more information. |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=back |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Iterator objects |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
This is an iterator object for retrieving useq observation SeqFeatureI |
612
|
|
|
|
|
|
|
objects in a memory efficient manner by returning one feature at a time. |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
=over 4 |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=item next_seq |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=item next_feature |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
Returns the next feature present in the interrogated interval. Features |
621
|
|
|
|
|
|
|
are generally returned in ascending coordinate order. Returns undefined |
622
|
|
|
|
|
|
|
when no features are remaining in the interval. Features may include |
623
|
|
|
|
|
|
|
either region or wiggle types, depending on how the iterator object was |
624
|
|
|
|
|
|
|
established. See features() and get_seq_stream() methods for more |
625
|
|
|
|
|
|
|
information. |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=back |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Wiggle objects |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
These are SeqFeatureI compliant objects for backwards compatibility with |
632
|
|
|
|
|
|
|
L and GBrowse. They support the wiggle() and coverage() |
633
|
|
|
|
|
|
|
methods, which returns an array of scores over the interrogated region. By |
634
|
|
|
|
|
|
|
default, the array is equal to the length of the region (1 bp resolution), |
635
|
|
|
|
|
|
|
or may be limited to a specified number of bins for efficiency. See the |
636
|
|
|
|
|
|
|
features() method for more information. |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=over 4 |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=item wiggle |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=item coverage |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
The scores are stored as an array in the coverage attribute. For |
645
|
|
|
|
|
|
|
convenience, the wiggle() and coverage() methods may be used to retrieve |
646
|
|
|
|
|
|
|
the array or array reference of scores. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=item statistical_summary |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
Generate a statistical summary hash for the collected wiggle scores |
651
|
|
|
|
|
|
|
(not the original data). This method is not entirely that useful; best |
652
|
|
|
|
|
|
|
to use the summary feature type in the first place. |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=item chromosome and global statistics |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
Chromosome and global statistics, including mean and standard deviation, |
657
|
|
|
|
|
|
|
are available from wiggle objects. See the corresponding USeq methods |
658
|
|
|
|
|
|
|
for details. |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=back |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=head2 Bio::DB::USeq::Summary objects |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
These are SeqFeatureI compliant Summary objects, similar to those |
665
|
|
|
|
|
|
|
generated by the L database adaptor. As such, they are |
666
|
|
|
|
|
|
|
compatible with L and GBrowse. |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
Summary objects can generate statistical summaries over a specified |
669
|
|
|
|
|
|
|
number of bins (default is 1 bin, or the entire requested region). |
670
|
|
|
|
|
|
|
Each statistical summary is an anonymous hash consisting of five |
671
|
|
|
|
|
|
|
keys: validCount, sumData, sumSquares, minVal, and maxVal. From |
672
|
|
|
|
|
|
|
these values, a mean and standard deviation may be calculated. |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
For convenience, three exported functions are available for calculating |
675
|
|
|
|
|
|
|
the mean and standard deviation from a statistical summary hash. |
676
|
|
|
|
|
|
|
See L for more information. |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
Use statistical summaries in the following manner. |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
my $stats = $summary->statistical_summary(10); |
681
|
|
|
|
|
|
|
my $stat = shift @$stats; |
682
|
|
|
|
|
|
|
my $min = $stat->{minVal}; |
683
|
|
|
|
|
|
|
my $max = $stat->{maxVal}; |
684
|
|
|
|
|
|
|
my $mean = $stat->{sumData} / $stat->{validCount}; |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
=over 4 |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
=item statistical_summary |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
=item statistical_summary($bins) |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
Generate a statistical summary hash for one or more bins across the |
693
|
|
|
|
|
|
|
interrogated region. Provide the number of bins desired. If a feature |
694
|
|
|
|
|
|
|
type of "summary:$bins" is requested through the features() or |
695
|
|
|
|
|
|
|
get_seq_stream() method, then $bins number of bins will be used. |
696
|
|
|
|
|
|
|
The default number of bins is 1. |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
=item score |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
Generate a single statistical summary over the entire region. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
=item chromosome and global statistics |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
Chromosome and global statistics, including mean and standard deviation, |
705
|
|
|
|
|
|
|
are available from summary objects. See the corresponding USeq methods |
706
|
|
|
|
|
|
|
for details. |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=back |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head1 EXPORTED FUNCTIONS |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
Three subroutine functions are available for export to assist |
713
|
|
|
|
|
|
|
in calculating the mean, variance, and standard deviation from |
714
|
|
|
|
|
|
|
statistical summaries. These functions are identical to those |
715
|
|
|
|
|
|
|
from the L adaptor and may be used interchangeably. |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
They are not exported by default; they must explicitly listed. |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
use Bio::DB::USeq qw(binMean binStdev); |
720
|
|
|
|
|
|
|
my $stats = $summary->statistical_summary(10); |
721
|
|
|
|
|
|
|
my $stat = shift @$stats; |
722
|
|
|
|
|
|
|
my $mean = binMean($stat); |
723
|
|
|
|
|
|
|
my $stdev = binStdev($stat); |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
=over 4 |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
=item binMean( $stat ) |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
Calculate the mean from a statistical summary anonymous hash. |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item binVariance( $stat ) |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
Calculate the variance from a statistical summary anonymous hash. |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
=item binStdev( $stat ) |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
Calculate the standard deviation from a statistical summary anonymous hash. |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
=back |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=head1 USEQ SLICES |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
Genomic observations are recorded in groups, called slices, of |
744
|
|
|
|
|
|
|
usually 10000 observations at a time. Each slice is a separate |
745
|
|
|
|
|
|
|
zip file member in the useq file archive. These methods are for |
746
|
|
|
|
|
|
|
accessing information about each slice. In general, accessing |
747
|
|
|
|
|
|
|
data through slices is a lower level operation. Users should |
748
|
|
|
|
|
|
|
preferentially use the main data accessors. |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
The following are Bio::DB::USeq methods available for working |
751
|
|
|
|
|
|
|
with slices. |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
=over 4 |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=item slices |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
Returns an array of all the slice member names present in the |
758
|
|
|
|
|
|
|
useq archive file. |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=item slice_feature($slice) |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
Return a L object representing the slice interval. |
763
|
|
|
|
|
|
|
The features(), get_seq_stream(), and scores() methods are supported. |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item slice_seq_id($slice) |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Return the chromosome or sequence identifier associated with a slice. |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=item slice_start($slice) |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
Return the start position of the slice interval. |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=item slice_end($slice) |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
Return the end position of the slice interval. |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=item slice_strand($slice) |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
Return the strand of the slice interval. |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
=item slice_type($slice) |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
Return the file type of the slice member. This corresponds to the |
784
|
|
|
|
|
|
|
file extension of the slice zip member and indicates how to parse |
785
|
|
|
|
|
|
|
the binary member. Each letter in the type corresponds to a data |
786
|
|
|
|
|
|
|
type, be it integer, short, floating-point, or text. See the |
787
|
|
|
|
|
|
|
useq file documentation for more details. |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
=item slice_obs_number($slice) |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
Return the number of observations recorded in the slice interval. |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
=back |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
=head1 GBROWSE COMPATIBILITY |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
The USeq adaptor has support for L and GBrowse. |
798
|
|
|
|
|
|
|
It will work with the segments glyph for intervals, |
799
|
|
|
|
|
|
|
the wiggle_xyplot glyph for displaying mean scores, and the |
800
|
|
|
|
|
|
|
wiggle_whiskers glyph for displaying detailed statistics. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Initialize the USeq database adaptor. |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
[data1:database] |
805
|
|
|
|
|
|
|
db_adaptor = Bio::DB::USeq |
806
|
|
|
|
|
|
|
db_args = -file /path/to/data1.useq |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
Displaying simple intervals with the segments glyph. |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
[data1_segments] |
811
|
|
|
|
|
|
|
database = data1 |
812
|
|
|
|
|
|
|
feature = region |
813
|
|
|
|
|
|
|
glyph = segments |
814
|
|
|
|
|
|
|
stranded = 1 |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
Displaying scores using the wiggle_xyplot glyph. |
817
|
|
|
|
|
|
|
You may set the bins to whatever number is appropriate (in |
818
|
|
|
|
|
|
|
this example, 1000), or leave blank (not recommended, |
819
|
|
|
|
|
|
|
defaults to 1 bp resolution). |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
[data1_xyplot] |
822
|
|
|
|
|
|
|
database = data1 |
823
|
|
|
|
|
|
|
feature = wiggle:1000 |
824
|
|
|
|
|
|
|
glyph = wiggle_xyplot |
825
|
|
|
|
|
|
|
graph_type = histogram |
826
|
|
|
|
|
|
|
autoscale = chromosome |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
Displaying scores using the wiggle_whiskers glyph. Note that |
829
|
|
|
|
|
|
|
generating statistical summaries are computationally more expensive |
830
|
|
|
|
|
|
|
than simple bins of mean values as with the wiggle feature type. |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
[data1_whiskers] |
833
|
|
|
|
|
|
|
database = data1 |
834
|
|
|
|
|
|
|
feature = summary |
835
|
|
|
|
|
|
|
glyph = wiggle_whiskers |
836
|
|
|
|
|
|
|
graph_type = histogram |
837
|
|
|
|
|
|
|
autoscale = chromosome |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=head1 PERFORMANCE |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
Because the Bio::DB::USeq is implemented as a Perl-only module, |
842
|
|
|
|
|
|
|
performance is subject to the limitations of Perl execution itself and |
843
|
|
|
|
|
|
|
the size of the data that needs to be parsed. In general when collecting |
844
|
|
|
|
|
|
|
score data, requesting scores is the fastest mode of operation, followed |
845
|
|
|
|
|
|
|
by wiggle feature types, and finally summary feature types. |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
In comparison to UCSC bigWig files, the USeq format is typically much |
848
|
|
|
|
|
|
|
faster when viewing intervals where the entire interval is represented by |
849
|
|
|
|
|
|
|
one or a few internal slices. This is especially true for repeated queries |
850
|
|
|
|
|
|
|
over the same or neighboring intervals, as the slice contents are retained |
851
|
|
|
|
|
|
|
in memory. As the number of internal slices that must be loaded into memory |
852
|
|
|
|
|
|
|
increases, for example querying intervals of hundreds of kilobases in size, |
853
|
|
|
|
|
|
|
performance will begin to lag as each internal slice must be parsed into |
854
|
|
|
|
|
|
|
memory. This is where the UCSC bigWig file format with internal zoom levels |
855
|
|
|
|
|
|
|
of summary statistics can outperform, at the cost of file complexity and size. |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
=cut |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
require 5.010000; |
861
|
1
|
|
|
1
|
|
7188
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
862
|
1
|
|
|
1
|
|
3
|
use Carp qw(carp cluck croak confess); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
63
|
|
863
|
1
|
|
|
1
|
|
517
|
use Archive::Zip qw( :ERROR_CODES ); |
|
1
|
|
|
|
|
70010
|
|
|
1
|
|
|
|
|
80
|
|
864
|
1
|
|
|
1
|
|
351
|
use Set::IntervalTree; |
|
1
|
|
|
|
|
4977
|
|
|
1
|
|
|
|
|
50
|
|
865
|
1
|
|
|
1
|
|
6
|
use File::Spec; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
20
|
|
866
|
|
|
|
|
|
|
|
867
|
1
|
|
|
1
|
|
4
|
use base 'Exporter'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
6439
|
|
868
|
|
|
|
|
|
|
our @EXPORT_OK = qw(binMean binVariance binStdev); |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
1; |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
#### USeq initialization #### |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
sub new { |
875
|
2
|
|
|
2
|
1
|
181
|
my $class = shift; |
876
|
2
|
|
|
|
|
21
|
my $self = { |
877
|
|
|
|
|
|
|
'name' => undef, |
878
|
|
|
|
|
|
|
'zip' => undef, |
879
|
|
|
|
|
|
|
'stranded' => undef, |
880
|
|
|
|
|
|
|
'seq_ids' => {}, |
881
|
|
|
|
|
|
|
'metadata' => {}, |
882
|
|
|
|
|
|
|
'coord2file' => {}, |
883
|
|
|
|
|
|
|
'file2attribute' => {}, |
884
|
|
|
|
|
|
|
'buffer' => {}, |
885
|
|
|
|
|
|
|
}; |
886
|
2
|
|
|
|
|
6
|
bless $self, $class; |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
# check for arguments |
889
|
2
|
|
|
|
|
3
|
my %args; |
890
|
2
|
50
|
|
|
|
8
|
if (@_) { |
891
|
2
|
100
|
|
|
|
11
|
if ($_[0] =~ /^-/) { |
892
|
1
|
|
|
|
|
5
|
%args = @_; |
893
|
|
|
|
|
|
|
} |
894
|
|
|
|
|
|
|
else { |
895
|
1
|
|
|
|
|
11
|
$args{-file} = shift; |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
} |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
# open file |
900
|
2
|
|
0
|
|
|
7
|
my $file = $args{-file} || $args{-useq} || undef; |
901
|
2
|
50
|
|
|
|
5
|
if ($file) { |
902
|
2
|
50
|
|
|
|
9
|
return unless ($self->open($file)); # open must return true |
903
|
|
|
|
|
|
|
} |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
# done |
906
|
2
|
|
|
|
|
6
|
return $self; |
907
|
|
|
|
|
|
|
} |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
sub open { |
910
|
2
|
|
|
2
|
1
|
4
|
my $self = shift; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
# check file |
913
|
2
|
|
|
|
|
3
|
my $filename = shift; |
914
|
2
|
50
|
|
|
|
5
|
unless ($filename) { |
915
|
0
|
|
|
|
|
0
|
cluck("no file name passed!\n"); |
916
|
0
|
|
|
|
|
0
|
return; |
917
|
|
|
|
|
|
|
} |
918
|
2
|
50
|
|
|
|
12
|
unless ($filename =~ /\.useq$/i) { |
919
|
0
|
|
|
|
|
0
|
carp "'$filename' is not a .useq archive file!\n"; |
920
|
0
|
|
|
|
|
0
|
return; |
921
|
|
|
|
|
|
|
} |
922
|
2
|
50
|
|
|
|
6
|
if ($self->slices) { |
923
|
0
|
|
|
|
|
0
|
cluck "Only load one useq file per object!\n"; |
924
|
0
|
|
|
|
|
0
|
return; |
925
|
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
# open the archive |
928
|
2
|
|
|
|
|
14
|
my $zip = Archive::Zip->new(); |
929
|
2
|
|
|
|
|
74
|
my $error = $zip->read($filename); |
930
|
2
|
50
|
|
|
|
2682
|
unless ($error == AZ_OK) { |
931
|
0
|
|
|
|
|
0
|
carp " unable to read USeq archive '$filename'! Error $error\n"; |
932
|
0
|
|
|
|
|
0
|
return; |
933
|
|
|
|
|
|
|
} |
934
|
2
|
|
|
|
|
5
|
$self->{'zip'} = $zip; |
935
|
2
|
|
|
|
|
44
|
(undef, undef, $self->{'name'}) = File::Spec->splitpath($filename); |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
# parse the contents |
938
|
2
|
50
|
|
|
|
8
|
return unless ($self->_parse_members); |
939
|
|
|
|
|
|
|
# we delay parsing metadata unless it is requested |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
# success |
942
|
2
|
|
|
|
|
4
|
return 1; |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
sub clone { |
946
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
947
|
1
|
50
|
|
|
|
4
|
return unless $self->{'zip'}; |
948
|
1
|
|
|
|
|
3
|
my $file = $self->zip->fileName; |
949
|
1
|
|
|
|
|
13
|
my $zip = Archive::Zip->new(); |
950
|
1
|
|
|
|
|
35
|
my $error = $zip->read($file); |
951
|
1
|
50
|
|
|
|
1273
|
unless ($error == AZ_OK) { |
952
|
0
|
|
|
|
|
0
|
carp " unable to read USeq archive '$file'! Error $error\n"; |
953
|
0
|
|
|
|
|
0
|
return; |
954
|
|
|
|
|
|
|
} |
955
|
1
|
|
|
|
|
22
|
$self->{'zip'} = $zip; |
956
|
1
|
|
|
|
|
4
|
return 1; |
957
|
|
|
|
|
|
|
} |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
sub zip { |
960
|
9
|
|
|
9
|
1
|
52
|
return shift->{'zip'}; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
sub name { |
964
|
4
|
|
|
4
|
0
|
36
|
return shift->{'name'}; |
965
|
|
|
|
|
|
|
} |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
#### General USeq information #### |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
sub seq_ids { |
972
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
973
|
1
|
|
|
|
|
1
|
return sort {$a cmp $b} keys %{ $self->{'seq_ids'} }; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
6
|
|
974
|
|
|
|
|
|
|
} |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
sub length { |
977
|
7
|
|
|
7
|
1
|
52
|
my $self = shift; |
978
|
7
|
50
|
|
|
|
17
|
my $seq_id = shift or return; |
979
|
7
|
50
|
|
|
|
23
|
if (exists $self->{'seq_ids'}{$seq_id}) { |
980
|
7
|
|
|
|
|
22
|
return $self->{'seq_ids'}{$seq_id}; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
sub stranded { |
985
|
8
|
|
|
8
|
1
|
42
|
return shift->{'stranded'}; |
986
|
|
|
|
|
|
|
} |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
sub attributes { |
989
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
990
|
0
|
0
|
|
|
|
0
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
0
|
|
|
|
|
0
|
|
991
|
0
|
|
|
|
|
0
|
return (sort {$a cmp $b} keys %{ $self->{'metadata'} }); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
sub attribute { |
995
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
996
|
1
|
|
|
|
|
2
|
my $key = shift; |
997
|
1
|
50
|
|
|
|
3
|
return $self->attributes unless $key; |
998
|
1
|
50
|
|
|
|
1
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
1
|
|
|
|
|
5
|
|
999
|
1
|
50
|
|
|
|
3
|
if (exists $self->{'metadata'}{$key}) { |
1000
|
1
|
|
|
|
|
4
|
return $self->{'metadata'}{$key}; |
1001
|
|
|
|
|
|
|
} |
1002
|
0
|
|
|
|
|
0
|
return; |
1003
|
|
|
|
|
|
|
} |
1004
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
sub type { |
1006
|
1
|
|
|
1
|
1
|
85
|
return shift->attribute('dataType'); |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
sub version { |
1010
|
0
|
|
|
0
|
1
|
0
|
return shift->attribute('useqArchiveVersion'); |
1011
|
|
|
|
|
|
|
} |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
sub genome { |
1014
|
0
|
|
|
0
|
1
|
0
|
return shift->attribute('versionedGenome'); |
1015
|
|
|
|
|
|
|
} |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
sub chr_stats { |
1018
|
1
|
|
|
1
|
1
|
1
|
my $self = shift; |
1019
|
1
|
50
|
|
|
|
3
|
my $seq_id = shift or return; |
1020
|
1
|
|
|
|
|
2
|
my $delay_write = shift; # option to delay rewriting the metadata |
1021
|
1
|
50
|
|
|
|
2
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
1
|
|
|
|
|
14
|
|
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
# return the chromosome stats |
1024
|
1
|
50
|
|
|
|
6
|
if (exists $self->{metadata}{"chromStats_$seq_id"}) { |
1025
|
0
|
|
|
|
|
0
|
my @data = split(',', $self->{metadata}{"chromStats_$seq_id"}); |
1026
|
0
|
|
|
|
|
0
|
my %stat = ( |
1027
|
|
|
|
|
|
|
'validCount' => $data[0], |
1028
|
|
|
|
|
|
|
'sumData' => $data[1], |
1029
|
|
|
|
|
|
|
'sumSquares' => $data[2], |
1030
|
|
|
|
|
|
|
'minVal' => $data[3], |
1031
|
|
|
|
|
|
|
'maxVal' => $data[4], |
1032
|
|
|
|
|
|
|
); |
1033
|
0
|
|
|
|
|
0
|
return \%stat; |
1034
|
|
|
|
|
|
|
} |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
# chromosome stats must be generated |
1037
|
1
|
|
|
|
|
6
|
my @slices = $self->_translate_coordinates_to_slices( |
1038
|
|
|
|
|
|
|
$seq_id, 1, $self->length($seq_id), 0); |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
# the normal _stat_summary() function requires loading entire chromosome worth of |
1041
|
|
|
|
|
|
|
# slices into memory and can become a huge memory hog, so we'll go one at time |
1042
|
|
|
|
|
|
|
# instead in a slightly more efficient manner |
1043
|
1
|
|
|
|
|
3
|
my $count = 0; |
1044
|
1
|
|
|
|
|
4
|
my $sum; |
1045
|
|
|
|
|
|
|
my $sum_squares; |
1046
|
1
|
|
|
|
|
0
|
my $min; |
1047
|
1
|
|
|
|
|
0
|
my $max; |
1048
|
1
|
|
|
|
|
3
|
foreach my $slice (@slices) { |
1049
|
1
|
50
|
|
|
|
3
|
next unless $self->slice_type($slice) =~ /f/; |
1050
|
|
|
|
|
|
|
# no sense going through if no score present |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
# load and unpack the data |
1053
|
1
|
|
|
|
|
4
|
$self->_clear_buffer([$slice]); |
1054
|
1
|
|
|
|
|
7
|
$self->_load_slice($slice); |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
# find the overlapping observations |
1057
|
1
|
|
|
|
|
5
|
my $results = $self->{'buffer'}{$slice}->fetch( |
1058
|
|
|
|
|
|
|
$self->slice_start($slice) - 1, $self->slice_end($slice)); |
1059
|
1
|
|
|
|
|
9
|
foreach my $r (@$results) { |
1060
|
|
|
|
|
|
|
# each observation is [start, stop, score] |
1061
|
9151
|
50
|
|
|
|
10648
|
if (defined $r->[2]) { |
1062
|
9151
|
|
|
|
|
7624
|
$count++; |
1063
|
9151
|
|
|
|
|
7944
|
$sum += $r->[2]; |
1064
|
9151
|
|
|
|
|
7972
|
$sum_squares += ($r->[2] * $r->[2]); |
1065
|
9151
|
100
|
100
|
|
|
16267
|
$min = $r->[2] if (!defined $min or $r->[2] < $min); |
1066
|
9151
|
100
|
100
|
|
|
16583
|
$max = $r->[2] if (!defined $max or $r->[2] > $max); |
1067
|
|
|
|
|
|
|
} |
1068
|
|
|
|
|
|
|
} |
1069
|
|
|
|
|
|
|
} |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
# assemble stats |
1072
|
1
|
|
50
|
|
|
22
|
my %stat = ( |
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
1073
|
|
|
|
|
|
|
'validCount' => $count, |
1074
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
1075
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
1076
|
|
|
|
|
|
|
'minVal' => $min || 0, |
1077
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
1078
|
|
|
|
|
|
|
); |
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
# then associate with the metadata |
1081
|
1
|
|
|
|
|
5
|
$self->{'metadata'}{"chromStats_$seq_id"} = join(',', map { $stat{$_} } |
|
5
|
|
|
|
|
45
|
|
1082
|
|
|
|
|
|
|
qw(validCount sumData sumSquares minVal maxVal) ); |
1083
|
1
|
50
|
|
|
|
11
|
$self->_rewrite_metadata unless $delay_write; |
1084
|
|
|
|
|
|
|
|
1085
|
1
|
|
|
|
|
4
|
return \%stat; |
1086
|
|
|
|
|
|
|
} |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
sub chr_mean { |
1089
|
1
|
|
|
1
|
1
|
127
|
my $self = shift; |
1090
|
1
|
50
|
|
|
|
5
|
my $seq_id = shift or return; |
1091
|
1
|
|
|
|
|
5
|
return Bio::DB::USeq::binMean( $self->chr_stats($seq_id) ); |
1092
|
|
|
|
|
|
|
} |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
sub chr_stdev { |
1095
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1096
|
0
|
0
|
|
|
|
0
|
my $seq_id = shift or return; |
1097
|
0
|
|
|
|
|
0
|
return Bio::DB::USeq::binStdev( $self->chr_stats($seq_id) ); |
1098
|
|
|
|
|
|
|
} |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
sub global_stats { |
1101
|
|
|
|
|
|
|
# this is an expensive proposition, because it must parse through every |
1102
|
|
|
|
|
|
|
# slice in the archive |
1103
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1104
|
0
|
0
|
|
|
|
0
|
$self->_parse_metadata unless %{ $self->{'metadata'} }; |
|
0
|
|
|
|
|
0
|
|
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
# return the chromosome stats |
1107
|
0
|
0
|
|
|
|
0
|
if (exists $self->{metadata}{globalStats}) { |
1108
|
0
|
|
|
|
|
0
|
my @data = split(',', $self->{metadata}{globalStats}); |
1109
|
0
|
|
|
|
|
0
|
my %stat = ( |
1110
|
|
|
|
|
|
|
'validCount' => $data[0], |
1111
|
|
|
|
|
|
|
'sumData' => $data[1], |
1112
|
|
|
|
|
|
|
'sumSquares' => $data[2], |
1113
|
|
|
|
|
|
|
'minVal' => $data[3], |
1114
|
|
|
|
|
|
|
'maxVal' => $data[4], |
1115
|
|
|
|
|
|
|
); |
1116
|
0
|
|
|
|
|
0
|
return \%stat; |
1117
|
|
|
|
|
|
|
} |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
# calculate new genome-wide statistics from individual chromosome stats |
1120
|
0
|
|
|
|
|
0
|
my $count = 0; |
1121
|
0
|
|
|
|
|
0
|
my $sum; |
1122
|
|
|
|
|
|
|
my $sum_squares; |
1123
|
0
|
|
|
|
|
0
|
my $min; |
1124
|
0
|
|
|
|
|
0
|
my $max; |
1125
|
0
|
|
|
|
|
0
|
foreach my $seq_id ($self->seq_ids) { |
1126
|
0
|
|
|
|
|
0
|
my $stats = $self->chr_stats($seq_id, 1); # delay writing metadata |
1127
|
0
|
|
|
|
|
0
|
$count += $stats->{validCount}; |
1128
|
0
|
|
|
|
|
0
|
$sum += $stats->{sumData}; |
1129
|
0
|
|
|
|
|
0
|
$sum_squares += $stats->{sumSquares}; |
1130
|
0
|
0
|
0
|
|
|
0
|
$min = $stats->{minVal} if (!defined $min or $stats->{minVal} < $min); |
1131
|
0
|
0
|
0
|
|
|
0
|
$max = $stats->{maxVal} if (!defined $max or $stats->{maxVal} > $max); |
1132
|
|
|
|
|
|
|
} |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
# assemble the statistical summary hash |
1135
|
0
|
|
0
|
|
|
0
|
my %stat = ( |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
1136
|
|
|
|
|
|
|
'validCount' => $count, |
1137
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
1138
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
1139
|
|
|
|
|
|
|
'minVal' => $min || 0, |
1140
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
1141
|
|
|
|
|
|
|
); |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
# update metadata |
1144
|
0
|
|
|
|
|
0
|
$self->{'metadata'}{'globalStats'} = join(',', map { $stat{$_} } |
|
0
|
|
|
|
|
0
|
|
1145
|
|
|
|
|
|
|
qw(validCount sumData sumSquares minVal maxVal) ); |
1146
|
0
|
|
|
|
|
0
|
$self->_rewrite_metadata; |
1147
|
|
|
|
|
|
|
|
1148
|
0
|
|
|
|
|
0
|
return \%stat; |
1149
|
|
|
|
|
|
|
} |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
sub global_mean { |
1152
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1153
|
0
|
|
|
|
|
0
|
return Bio::DB::USeq::binMean( $self->global_stats ); |
1154
|
|
|
|
|
|
|
} |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
sub global_stdev { |
1157
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1158
|
0
|
|
|
|
|
0
|
return Bio::DB::USeq::binStdev( $self->global_stats ); |
1159
|
|
|
|
|
|
|
} |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
#### slice information #### |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
sub slices { |
1164
|
2
|
|
|
2
|
1
|
4
|
my $self = shift; |
1165
|
2
|
|
|
|
|
3
|
return sort {$a cmp $b} keys %{ $self->{'file2attribute'} }; |
|
0
|
|
|
|
|
0
|
|
|
2
|
|
|
|
|
13
|
|
1166
|
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
sub slice_seq_id { |
1169
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1170
|
0
|
0
|
|
|
|
0
|
my $slice = shift or return; |
1171
|
0
|
|
|
|
|
0
|
return $self->{'file2attribute'}{$slice}->[0]; |
1172
|
|
|
|
|
|
|
} |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
sub slice_start { |
1175
|
1088
|
|
|
1088
|
1
|
1079
|
my $self = shift; |
1176
|
1088
|
50
|
|
|
|
1429
|
my $slice = shift or return; |
1177
|
1088
|
|
|
|
|
1752
|
return $self->{'file2attribute'}{$slice}->[1]; |
1178
|
|
|
|
|
|
|
} |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
sub slice_end { |
1181
|
2021
|
|
|
2021
|
1
|
2116
|
my $self = shift; |
1182
|
2021
|
50
|
|
|
|
2490
|
my $slice = shift or return; |
1183
|
2021
|
|
|
|
|
4608
|
return $self->{'file2attribute'}{$slice}->[2]; |
1184
|
|
|
|
|
|
|
} |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
sub slice_strand { |
1187
|
4
|
|
|
4
|
1
|
11
|
my $self = shift; |
1188
|
4
|
50
|
|
|
|
12
|
my $slice = shift or return; |
1189
|
4
|
|
|
|
|
20
|
return $self->{'file2attribute'}{$slice}->[3]; |
1190
|
|
|
|
|
|
|
} |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
sub slice_type { |
1193
|
1020
|
|
|
1020
|
1
|
1071
|
my $self = shift; |
1194
|
1020
|
50
|
|
|
|
1238
|
my $slice = shift or return; |
1195
|
1020
|
|
|
|
|
2731
|
return $self->{'file2attribute'}{$slice}->[4]; |
1196
|
|
|
|
|
|
|
} |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
sub slice_obs_number { |
1199
|
6
|
|
|
6
|
1
|
12
|
my $self = shift; |
1200
|
6
|
50
|
|
|
|
12
|
my $slice = shift or return; |
1201
|
6
|
|
|
|
|
13
|
return $self->{'file2attribute'}{$slice}->[5]; |
1202
|
|
|
|
|
|
|
} |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
sub slice_feature { |
1205
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1206
|
0
|
0
|
|
|
|
0
|
my $slice = shift or return; |
1207
|
|
|
|
|
|
|
return Bio::DB::USeq::Segment->new( |
1208
|
|
|
|
|
|
|
-seq_id => $self->{'file2attribute'}{$slice}->[0], |
1209
|
|
|
|
|
|
|
-start => $self->{'file2attribute'}{$slice}->[1], |
1210
|
|
|
|
|
|
|
-stop => $self->{'file2attribute'}{$slice}->[2], |
1211
|
|
|
|
|
|
|
-strand => $self->{'file2attribute'}{$slice}->[3], |
1212
|
0
|
|
|
|
|
0
|
-type => $self->{'file2attribute'}{$slice}->[4], |
1213
|
|
|
|
|
|
|
-source => $self->name, |
1214
|
|
|
|
|
|
|
-name => $slice, |
1215
|
|
|
|
|
|
|
); |
1216
|
|
|
|
|
|
|
} |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
#### Feature and data access #### |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
sub segment { |
1225
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1228
|
1
|
|
|
|
|
3
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1229
|
1
|
50
|
|
|
|
3
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1230
|
|
|
|
|
|
|
|
1231
|
1
|
|
|
|
|
4
|
return Bio::DB::USeq::Segment->new( |
1232
|
|
|
|
|
|
|
-seq_id => $seq_id, |
1233
|
|
|
|
|
|
|
-start => $start, |
1234
|
|
|
|
|
|
|
-end => $stop, |
1235
|
|
|
|
|
|
|
-strand => $strand, |
1236
|
|
|
|
|
|
|
-type => 'segment', |
1237
|
|
|
|
|
|
|
-source => $self->name, |
1238
|
|
|
|
|
|
|
-useq => $self, |
1239
|
|
|
|
|
|
|
); |
1240
|
|
|
|
|
|
|
} |
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
sub features { |
1243
|
2
|
|
|
2
|
1
|
268
|
my $self = shift; |
1244
|
2
|
|
|
|
|
9
|
my %args = @_; |
1245
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
# check for type |
1247
|
2
|
|
|
|
|
3
|
my $type; |
1248
|
2
|
|
0
|
|
|
6
|
$args{-type} ||= $args{-types} || $args{-primary_tag} || 'region'; |
|
|
|
33
|
|
|
|
|
1249
|
2
|
50
|
33
|
|
|
15
|
if (ref $args{-type} and ref $args{-type} eq 'ARRAY') { |
1250
|
0
|
|
0
|
|
|
0
|
$type = $args{-type}->[0] || 'region'; |
1251
|
0
|
|
|
|
|
0
|
$args{-type} = $type; |
1252
|
|
|
|
|
|
|
} |
1253
|
|
|
|
|
|
|
else { |
1254
|
2
|
|
|
|
|
4
|
$type = $args{-type}; |
1255
|
|
|
|
|
|
|
} |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
# return an appropriate feature |
1258
|
2
|
50
|
|
|
|
20
|
if ($type =~ /chromosome/) { |
|
|
50
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
# compatibility to return chromosome features |
1260
|
|
|
|
|
|
|
my @chromos = map { |
1261
|
0
|
|
|
|
|
0
|
Bio::DB::USeq::Feature->new( |
|
0
|
|
|
|
|
0
|
|
1262
|
|
|
|
|
|
|
-seq_id => $_, |
1263
|
|
|
|
|
|
|
-start => 1, |
1264
|
|
|
|
|
|
|
-end => $self->length($_), |
1265
|
|
|
|
|
|
|
-type => $type, |
1266
|
|
|
|
|
|
|
-source => $self->name, |
1267
|
|
|
|
|
|
|
-name => $_, |
1268
|
|
|
|
|
|
|
) |
1269
|
|
|
|
|
|
|
} $self->seq_ids; |
1270
|
0
|
0
|
|
|
|
0
|
return wantarray ? @chromos : \@chromos; |
1271
|
|
|
|
|
|
|
} |
1272
|
|
|
|
|
|
|
elsif ($type =~ /region|interval|observation|coverage|wiggle|summary/) { |
1273
|
|
|
|
|
|
|
# region or segment are individual observation features |
1274
|
|
|
|
|
|
|
# coverage or wiggle are for efficient score retrieval with |
1275
|
|
|
|
|
|
|
# backwards compatibility with Bio::Graphics |
1276
|
|
|
|
|
|
|
# summary are statistical summaries akin to Bio::DB::BigFile |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
# set up an iterator |
1279
|
2
|
|
|
|
|
12
|
my $iterator = $self->get_seq_stream(%args); |
1280
|
2
|
50
|
|
|
|
8
|
return unless $iterator; |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
# if user requested an iterator |
1283
|
2
|
0
|
33
|
|
|
5
|
if (exists $args{-iterator} and $args{-iterator}) { |
1284
|
0
|
|
|
|
|
0
|
return $iterator; |
1285
|
|
|
|
|
|
|
} |
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
# collect the features |
1288
|
2
|
|
|
|
|
2
|
my @features; |
1289
|
2
|
|
|
|
|
6
|
while (my $f = $iterator->next_seq) { |
1290
|
2
|
|
|
|
|
9
|
push @features, $f; |
1291
|
|
|
|
|
|
|
} |
1292
|
2
|
50
|
|
|
|
19
|
return wantarray ? @features : \@features; |
1293
|
|
|
|
|
|
|
} |
1294
|
|
|
|
|
|
|
else { |
1295
|
0
|
|
|
|
|
0
|
confess "unknown type request '$type'!\n"; |
1296
|
|
|
|
|
|
|
} |
1297
|
|
|
|
|
|
|
} |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
sub get_features_by_type { |
1301
|
|
|
|
|
|
|
# does not work without location information |
1302
|
0
|
|
|
0
|
0
|
0
|
cluck "please use features() method instead"; |
1303
|
0
|
|
|
|
|
0
|
return; |
1304
|
|
|
|
|
|
|
} |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
sub get_features_by_location { |
1308
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1311
|
0
|
|
|
|
|
0
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1312
|
|
|
|
|
|
|
|
1313
|
0
|
|
|
|
|
0
|
return $self->features($seq_id, $start, $stop, $strand); |
1314
|
|
|
|
|
|
|
} |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
sub get_feature_by_id { |
1318
|
|
|
|
|
|
|
# much as Bio::DB::BigWig fakes the id, so we will too here |
1319
|
|
|
|
|
|
|
# I don't know how necessary this really is |
1320
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
# id will be encoded as chr:start:end ? |
1323
|
0
|
|
|
|
|
0
|
my $id = shift; |
1324
|
0
|
|
|
|
|
0
|
my ($seq_id, $start, $end, $type) = split /:/, $id; |
1325
|
|
|
|
|
|
|
|
1326
|
0
|
|
0
|
|
|
0
|
my @list = $self->features( |
1327
|
|
|
|
|
|
|
-seq_id => $seq_id, |
1328
|
|
|
|
|
|
|
-start => $start, |
1329
|
|
|
|
|
|
|
-end => $end, |
1330
|
|
|
|
|
|
|
-type => $type || undef, |
1331
|
|
|
|
|
|
|
); |
1332
|
0
|
0
|
|
|
|
0
|
return unless @list; |
1333
|
0
|
0
|
|
|
|
0
|
return $list[0] if scalar @list == 1; |
1334
|
0
|
|
|
|
|
0
|
foreach my $f (@list) { |
1335
|
0
|
0
|
0
|
|
|
0
|
return $f if ($f->start == $start and $f->end == $end); |
1336
|
|
|
|
|
|
|
} |
1337
|
0
|
|
|
|
|
0
|
return; |
1338
|
|
|
|
|
|
|
} |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
sub get_feature_by_name { |
1342
|
0
|
|
|
0
|
1
|
0
|
return shift->get_feature_by_id(@_); |
1343
|
|
|
|
|
|
|
} |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
sub get_seq_stream { |
1347
|
3
|
|
|
3
|
1
|
70
|
my $self = shift; |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1350
|
3
|
|
|
|
|
11
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1351
|
3
|
50
|
|
|
|
11
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
# check for type |
1354
|
3
|
|
|
|
|
9
|
my %args = @_; |
1355
|
3
|
|
|
|
|
3
|
my $type; |
1356
|
3
|
|
50
|
|
|
22
|
$args{-type} ||= $args{-types} || $args{-primary_tag} || 'region'; |
|
|
|
66
|
|
|
|
|
1357
|
3
|
50
|
33
|
|
|
13
|
if (ref $args{-type} and ref $args{-type} eq 'ARRAY') { |
1358
|
0
|
|
0
|
|
|
0
|
$type = $args{-type}->[0] || 'region'; |
1359
|
|
|
|
|
|
|
} |
1360
|
|
|
|
|
|
|
else { |
1361
|
3
|
|
|
|
|
4
|
$type = $args{-type}; |
1362
|
|
|
|
|
|
|
} |
1363
|
|
|
|
|
|
|
|
1364
|
3
|
|
|
|
|
12
|
return Bio::DB::USeq::Iterator->new( |
1365
|
|
|
|
|
|
|
-seq_id => $seq_id, |
1366
|
|
|
|
|
|
|
-start => $start, |
1367
|
|
|
|
|
|
|
-end => $stop, |
1368
|
|
|
|
|
|
|
-strand => $strand, |
1369
|
|
|
|
|
|
|
-type => $type, |
1370
|
|
|
|
|
|
|
-source => $self->name, |
1371
|
|
|
|
|
|
|
-useq => $self, |
1372
|
|
|
|
|
|
|
); |
1373
|
|
|
|
|
|
|
} |
1374
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
sub scores { |
1377
|
1
|
|
|
1
|
1
|
37
|
my $self = shift; |
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1380
|
1
|
|
|
|
|
2
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1381
|
1
|
50
|
|
|
|
3
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1382
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
# determine which slices to retrieve |
1384
|
1
|
|
|
|
|
4
|
my @slices = $self->_translate_coordinates_to_slices( |
1385
|
|
|
|
|
|
|
$seq_id, $start, $stop, $strand); |
1386
|
1
|
50
|
|
|
|
3
|
return unless @slices; |
1387
|
1
|
|
|
|
|
3
|
$self->_clear_buffer(\@slices); |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
1390
|
1
|
|
|
|
|
2
|
my @scores; |
1391
|
1
|
|
|
|
|
2
|
foreach my $slice (@slices) { |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
# load and unpack the data |
1394
|
1
|
|
|
|
|
3
|
$self->_load_slice($slice); |
1395
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
# find the overlapping observations |
1397
|
1
|
|
|
|
|
33
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
# record the scores |
1400
|
1
|
|
|
|
|
5
|
foreach my $r (@$results) { |
1401
|
31
|
50
|
|
|
|
44
|
push @scores, $r->[2] if defined $r->[2]; |
1402
|
|
|
|
|
|
|
} |
1403
|
|
|
|
|
|
|
} |
1404
|
|
|
|
|
|
|
|
1405
|
1
|
50
|
|
|
|
18
|
return wantarray ? @scores : \@scores; |
1406
|
|
|
|
|
|
|
} |
1407
|
|
|
|
|
|
|
|
1408
|
|
|
|
|
|
|
sub observations { |
1409
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1410
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
# arguments can be chromo, start, stop, strand |
1412
|
0
|
|
|
|
|
0
|
my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_); |
1413
|
0
|
0
|
|
|
|
0
|
return unless $self->length($seq_id); # make sure chromosome is represented |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
# determine which slices to retrieve |
1416
|
0
|
|
|
|
|
0
|
my @slices = $self->_translate_coordinates_to_slices( |
1417
|
|
|
|
|
|
|
$seq_id, $start, $stop, $strand); |
1418
|
0
|
0
|
|
|
|
0
|
return unless @slices; |
1419
|
0
|
|
|
|
|
0
|
$self->_clear_buffer(\@slices); |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
1422
|
0
|
|
|
|
|
0
|
my @observations; |
1423
|
0
|
|
|
|
|
0
|
foreach my $slice (@slices) { |
1424
|
|
|
|
|
|
|
# load and unpack the data |
1425
|
0
|
|
|
|
|
0
|
$self->_load_slice($slice); |
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
# find the overlapping observations |
1428
|
0
|
|
|
|
|
0
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
1429
|
0
|
|
|
|
|
0
|
push @observations, @$results; |
1430
|
|
|
|
|
|
|
} |
1431
|
|
|
|
|
|
|
|
1432
|
0
|
0
|
|
|
|
0
|
return wantarray ? @observations : \@observations; |
1433
|
|
|
|
|
|
|
} |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
|
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
#### Private methods #### |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
sub _parse_metadata { |
1441
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
1442
|
|
|
|
|
|
|
|
1443
|
|
|
|
|
|
|
# the metadata file should always be present in a USeq file |
1444
|
1
|
|
|
|
|
4
|
my $readMe = $self->{'zip'}->contents('archiveReadMe.txt'); |
1445
|
1
|
50
|
|
|
|
1119
|
unless ($readMe) { |
1446
|
0
|
|
|
|
|
0
|
carp " USeq file is malformed! It does not contain an archiveReadMe.txt file\n"; |
1447
|
0
|
|
|
|
|
0
|
return; |
1448
|
|
|
|
|
|
|
} |
1449
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
# parse the archive file |
1451
|
|
|
|
|
|
|
# this is a simple text file, each line is key = value |
1452
|
1
|
|
|
|
|
3
|
$self->{'metadata'}{'comments'} = []; |
1453
|
1
|
|
|
|
|
5
|
foreach (split /\n/, $readMe) { |
1454
|
5
|
50
|
|
|
|
10
|
if (/^#/) { |
1455
|
0
|
0
|
|
|
|
0
|
push @{ $self->{'metadata'}{'comments'} }, $_ unless $_ =~ /validCount/; |
|
0
|
|
|
|
|
0
|
|
1456
|
0
|
|
|
|
|
0
|
next; |
1457
|
|
|
|
|
|
|
} |
1458
|
5
|
50
|
|
|
|
14
|
if (/^\s*([^=\s]+)\s*=\s*(.+)\s*$/) { |
1459
|
|
|
|
|
|
|
# separate key = value pairs tolerating whitespace |
1460
|
|
|
|
|
|
|
# key may contain anything excluding = and whitespace |
1461
|
5
|
|
|
|
|
12
|
$self->{'metadata'}{$1} = $2; |
1462
|
|
|
|
|
|
|
} |
1463
|
|
|
|
|
|
|
} |
1464
|
1
|
|
|
|
|
2
|
return 1; |
1465
|
|
|
|
|
|
|
} |
1466
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
sub _parse_members { |
1469
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
1470
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
# there is a lot of information encoded in each zip member file name |
1472
|
|
|
|
|
|
|
# the chromosome, strand, coordinates of represented interval, and file type |
1473
|
|
|
|
|
|
|
# this parses and indexes this metadata into a usable format |
1474
|
|
|
|
|
|
|
|
1475
|
2
|
|
|
|
|
5
|
my @errors; |
1476
|
2
|
|
|
|
|
8
|
foreach my $member ($self->{'zip'}->memberNames) { |
1477
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
# archive readMe |
1479
|
8
|
100
|
|
|
|
71
|
next if ($member eq 'archiveReadMe.txt'); |
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
# data file |
1482
|
6
|
|
|
|
|
39
|
my ($chromo, $strand, $start, $stop, $number, $extension) = |
1483
|
|
|
|
|
|
|
($member =~ /^([\w\-\.]+)([\+\-\.])(\d+)\-(\d+)\-(\d+)\.(\w+)$/i); |
1484
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
# check extracted metadata |
1486
|
6
|
50
|
33
|
|
|
45
|
unless ($chromo and $strand and defined $start and defined $stop and |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
1487
|
|
|
|
|
|
|
$number and $extension) { |
1488
|
0
|
|
|
|
|
0
|
push @errors, " data slice $member"; |
1489
|
0
|
|
|
|
|
0
|
next; |
1490
|
|
|
|
|
|
|
} |
1491
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
# check stranded data |
1493
|
6
|
100
|
|
|
|
11
|
unless (defined $self->{'stranded'}) { |
1494
|
2
|
100
|
|
|
|
6
|
if ($strand eq '.') { |
1495
|
1
|
|
|
|
|
2
|
$self->{'stranded'} = 0; |
1496
|
|
|
|
|
|
|
} |
1497
|
|
|
|
|
|
|
else { |
1498
|
1
|
|
|
|
|
3
|
$self->{'stranded'} = 1; |
1499
|
|
|
|
|
|
|
} |
1500
|
|
|
|
|
|
|
} |
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
# check seq_ids |
1503
|
|
|
|
|
|
|
# record the length for each seq_id, which may or may not be entirely |
1504
|
|
|
|
|
|
|
# accurate, since it is just the last interval position |
1505
|
6
|
100
|
|
|
|
11
|
if (exists $self->{'seq_ids'}{$chromo} ) { |
1506
|
3
|
50
|
|
|
|
9
|
if ($stop > $self->{'seq_ids'}{$chromo}) { |
1507
|
3
|
|
|
|
|
5
|
$self->{'seq_ids'}{$chromo} = $stop; |
1508
|
|
|
|
|
|
|
} |
1509
|
|
|
|
|
|
|
} |
1510
|
|
|
|
|
|
|
else { |
1511
|
3
|
|
|
|
|
7
|
$self->{'seq_ids'}{$chromo} = $stop; |
1512
|
|
|
|
|
|
|
} |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
# convert to BioPerl convention |
1515
|
6
|
50
|
|
|
|
16
|
$strand = $strand eq '+' ? 1 : $strand eq '.' ? 0 : $strand eq '-' ? -1 : 0; |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1516
|
6
|
|
|
|
|
8
|
$start += 1; |
1517
|
|
|
|
|
|
|
|
1518
|
|
|
|
|
|
|
# check coordinates |
1519
|
|
|
|
|
|
|
# hack to cover a bug? Can't have zero or negative coordinates |
1520
|
6
|
50
|
|
|
|
12
|
if ($start >= $stop) { |
1521
|
0
|
|
|
|
|
0
|
$stop += 1; |
1522
|
|
|
|
|
|
|
} |
1523
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
# store the member details for each member slice |
1525
|
6
|
|
|
|
|
19
|
$self->{'file2attribute'}{$member} = |
1526
|
|
|
|
|
|
|
[$chromo, $start, $stop, $strand, $extension, $number]; |
1527
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
# store the member into a chromosome-strand-specific interval tree |
1529
|
|
|
|
|
|
|
# interval trees are stored as 0-based, half-open intervals |
1530
|
6
|
|
66
|
|
|
39
|
$self->{'coord2file'}{$chromo}{$strand} ||= Set::IntervalTree->new(); |
1531
|
6
|
|
|
|
|
19
|
$self->{'coord2file'}{$chromo}{$strand}->insert($member, $start - 1, $stop); |
1532
|
|
|
|
|
|
|
} |
1533
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
# check parsing |
1535
|
2
|
50
|
|
|
|
5
|
if (@errors) { |
1536
|
0
|
|
|
|
|
0
|
carp "Errors parsing data slice filenames:\n" . join("\n", @errors) . "\n"; |
1537
|
|
|
|
|
|
|
} |
1538
|
2
|
50
|
|
|
|
4
|
unless (%{ $self->{'coord2file'} }) { |
|
2
|
|
|
|
|
4
|
|
1539
|
0
|
|
|
|
|
0
|
carp "no data slices present in USeq archive!\n"; |
1540
|
0
|
|
|
|
|
0
|
return; |
1541
|
|
|
|
|
|
|
} |
1542
|
|
|
|
|
|
|
|
1543
|
2
|
|
|
|
|
6
|
return 1; |
1544
|
|
|
|
|
|
|
} |
1545
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
sub _get_coordinates { |
1548
|
5
|
|
|
5
|
|
8
|
my $self = shift; |
1549
|
|
|
|
|
|
|
|
1550
|
5
|
|
|
|
|
11
|
my ($seq_id, $start, $stop, $strand); |
1551
|
5
|
50
|
|
|
|
19
|
if ($_[0] =~ /^\-/) { |
1552
|
5
|
|
|
|
|
14
|
my %args = @_; |
1553
|
5
|
|
0
|
|
|
16
|
$seq_id = $args{'-seq_id'} || $args{'-chromo'} || undef; |
1554
|
5
|
|
0
|
|
|
9
|
$start = $args{'-start'} || $args{'-pos'} || undef; |
1555
|
5
|
|
0
|
|
|
13
|
$stop = $args{'-end'} || $args{'-stop'} || undef; |
1556
|
5
|
|
50
|
|
|
21
|
$strand = $args{'-strand'} || '0'; # unstranded |
1557
|
|
|
|
|
|
|
} |
1558
|
|
|
|
|
|
|
else { |
1559
|
0
|
|
|
|
|
0
|
($seq_id, $start, $stop, $strand) = @_; |
1560
|
|
|
|
|
|
|
} |
1561
|
5
|
50
|
|
|
|
11
|
unless ($seq_id) { |
1562
|
0
|
|
|
|
|
0
|
cluck "no sequence ID provided!"; |
1563
|
0
|
|
|
|
|
0
|
return; |
1564
|
|
|
|
|
|
|
} |
1565
|
5
|
|
50
|
|
|
11
|
$start ||= 1; |
1566
|
5
|
|
33
|
|
|
8
|
$stop ||= $self->length($seq_id); |
1567
|
5
|
|
50
|
|
|
15
|
$strand ||= 0; |
1568
|
5
|
|
|
|
|
14
|
return ($seq_id, $start, $stop, $strand); |
1569
|
|
|
|
|
|
|
} |
1570
|
|
|
|
|
|
|
|
1571
|
|
|
|
|
|
|
|
1572
|
|
|
|
|
|
|
sub _translate_coordinates_to_slices { |
1573
|
6
|
|
|
6
|
|
17
|
my $self = shift; |
1574
|
6
|
|
|
|
|
15
|
my ($seq_id, $start, $stop, $strand) = @_; |
1575
|
6
|
50
|
|
|
|
17
|
return unless (exists $self->{'coord2file'}{$seq_id}); |
1576
|
|
|
|
|
|
|
|
1577
|
|
|
|
|
|
|
# check strand request |
1578
|
6
|
|
|
|
|
9
|
my $both = 0; |
1579
|
6
|
50
|
|
|
|
17
|
if ($strand == 0) { |
1580
|
|
|
|
|
|
|
# strand was not specified, |
1581
|
|
|
|
|
|
|
# but collect from both strands if we have stranded data |
1582
|
6
|
100
|
|
|
|
14
|
$both = 1 if $self->stranded; |
1583
|
|
|
|
|
|
|
} |
1584
|
|
|
|
|
|
|
else { |
1585
|
|
|
|
|
|
|
# strand was specified, |
1586
|
|
|
|
|
|
|
# but convert it to unstranded if the data is not stranded |
1587
|
0
|
0
|
|
|
|
0
|
$strand = 0 unless $self->stranded; |
1588
|
|
|
|
|
|
|
} |
1589
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
# look for the overlapping slices |
1591
|
6
|
|
|
|
|
8
|
my @slices; |
1592
|
6
|
100
|
|
|
|
11
|
if ($both) { |
1593
|
|
|
|
|
|
|
# need to collect from both strands |
1594
|
|
|
|
|
|
|
# plus strand first, then minus strand |
1595
|
1
|
|
|
|
|
7
|
my $results = $self->{'coord2file'}{$seq_id}{1}->fetch($start - 1, $stop); |
1596
|
1
|
|
|
|
|
4
|
my $results2 = $self->{'coord2file'}{$seq_id}{-1}->fetch($start - 1, $stop); |
1597
|
1
|
|
|
|
|
12
|
push @slices, @$results, @$results2; |
1598
|
|
|
|
|
|
|
} |
1599
|
|
|
|
|
|
|
|
1600
|
|
|
|
|
|
|
# specific strand |
1601
|
|
|
|
|
|
|
else { |
1602
|
5
|
|
|
|
|
42
|
my $results = $self->{'coord2file'}{$seq_id}{$strand}->fetch($start - 1, $stop); |
1603
|
5
|
|
|
|
|
12
|
push @slices, @$results; |
1604
|
|
|
|
|
|
|
} |
1605
|
|
|
|
|
|
|
|
1606
|
6
|
|
|
|
|
17
|
return @slices; |
1607
|
|
|
|
|
|
|
} |
1608
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
sub _clear_buffer { |
1611
|
6
|
|
|
6
|
|
9
|
my $self = shift; |
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
# make a quick hash of wanted slices |
1614
|
6
|
|
|
|
|
8
|
my %wanted = map {$_ => 1} @{$_[0]}; |
|
9
|
|
|
|
|
25
|
|
|
6
|
|
|
|
|
20
|
|
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
# delete the existing buffers of slices we do not want |
1617
|
6
|
|
|
|
|
10
|
foreach (keys %{ $self->{buffer} }) { |
|
6
|
|
|
|
|
18
|
|
1618
|
5
|
100
|
|
|
|
6653
|
delete $self->{buffer}{$_} unless exists $wanted{$_}; |
1619
|
|
|
|
|
|
|
} |
1620
|
|
|
|
|
|
|
} |
1621
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
sub _load_slice { |
1623
|
1016
|
|
|
1016
|
|
980
|
my $self = shift; |
1624
|
1016
|
|
|
|
|
977
|
my $slice = shift; |
1625
|
1016
|
100
|
|
|
|
1663
|
return if (exists $self->{'buffer'}{$slice}); |
1626
|
|
|
|
|
|
|
|
1627
|
|
|
|
|
|
|
# each slice is parsed into observations consisting of start, stop, and |
1628
|
|
|
|
|
|
|
# optionally score and text depending on type |
1629
|
|
|
|
|
|
|
# these are stored as anonymous arrays [start, stop, score, text] |
1630
|
|
|
|
|
|
|
# for quick retrieval each observation interval is stored in a Set::IntervalTree |
1631
|
|
|
|
|
|
|
# these operations maintain the original 0-based coordinate system |
1632
|
|
|
|
|
|
|
|
1633
|
7
|
|
|
|
|
56
|
my $type = $self->slice_type($slice); |
1634
|
7
|
50
|
|
|
|
58
|
if ($type eq 's') { $self->_load_s_slice($slice) } |
|
0
|
50
|
|
|
|
0
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1635
|
0
|
|
|
|
|
0
|
elsif ($type eq 'sf') { $self->_load_sf_slice($slice) } |
1636
|
0
|
|
|
|
|
0
|
elsif ($type eq 'st') { $self->_load_st_slice($slice) } |
1637
|
0
|
|
|
|
|
0
|
elsif ($type eq 'ss') { $self->_load_ss_slice($slice) } |
1638
|
6
|
|
|
|
|
20
|
elsif ($type eq 'ssf') { $self->_load_ssf_slice($slice) } |
1639
|
0
|
|
|
|
|
0
|
elsif ($type eq 'sst') { $self->_load_sst_slice($slice) } |
1640
|
1
|
|
|
|
|
5
|
elsif ($type eq 'ssft') { $self->_load_ssft_slice($slice) } |
1641
|
0
|
|
|
|
|
0
|
elsif ($type eq 'i') { $self->_load_i_slice($slice) } |
1642
|
0
|
|
|
|
|
0
|
elsif ($type eq 'if') { $self->_load_if_slice($slice) } |
1643
|
0
|
|
|
|
|
0
|
elsif ($type eq 'it') { $self->_load_it_slice($slice) } |
1644
|
0
|
|
|
|
|
0
|
elsif ($type eq 'ii') { $self->_load_ii_slice($slice) } |
1645
|
0
|
|
|
|
|
0
|
elsif ($type eq 'iif') { $self->_load_iif_slice($slice) } |
1646
|
0
|
|
|
|
|
0
|
elsif ($type eq 'iit') { $self->_load_iit_slice($slice) } |
1647
|
0
|
|
|
|
|
0
|
elsif ($type eq 'iift') { $self->_load_iift_slice($slice) } |
1648
|
0
|
|
|
|
|
0
|
elsif ($type eq 'is') { $self->_load_is_slice($slice) } |
1649
|
0
|
|
|
|
|
0
|
elsif ($type eq 'isf') { $self->_load_isf_slice($slice) } |
1650
|
0
|
|
|
|
|
0
|
elsif ($type eq 'ist') { $self->_load_ist_slice($slice) } |
1651
|
0
|
|
|
|
|
0
|
elsif ($type eq 'isft') { $self->_load_isft_slice($slice) } |
1652
|
|
|
|
|
|
|
else { |
1653
|
0
|
|
|
|
|
0
|
croak "unable to load slice '$slice'! Unsupported slice type $type\n"; |
1654
|
|
|
|
|
|
|
} |
1655
|
|
|
|
|
|
|
} |
1656
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
sub _load_s_slice { |
1659
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1660
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1661
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1662
|
|
|
|
|
|
|
|
1663
|
|
|
|
|
|
|
# unpack the data slice zip member |
1664
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1665
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>(s>)$number", $self->zip->contents($slice) ); |
1666
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1668
|
|
|
|
|
|
|
# first observation |
1669
|
0
|
|
|
|
|
0
|
my $position = shift @data; |
1670
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1671
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
# remaining observations |
1673
|
0
|
|
|
|
|
0
|
while (@data) { |
1674
|
0
|
|
|
|
|
0
|
$position += (shift @data) + 32768; |
1675
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1676
|
|
|
|
|
|
|
} |
1677
|
|
|
|
|
|
|
|
1678
|
|
|
|
|
|
|
# store Interval Tree buffer |
1679
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1680
|
|
|
|
|
|
|
} |
1681
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
sub _load_sf_slice { |
1683
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1684
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1685
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1686
|
|
|
|
|
|
|
|
1687
|
|
|
|
|
|
|
# unpack the data slice zip member |
1688
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1689
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>f>(s>f>)$number", $self->zip->contents($slice) ); |
1690
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1692
|
|
|
|
|
|
|
# first observation |
1693
|
0
|
|
|
|
|
0
|
my $position = shift @data; |
1694
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1695
|
|
|
|
|
|
|
|
1696
|
|
|
|
|
|
|
# remaining observations |
1697
|
0
|
|
|
|
|
0
|
while (@data) { |
1698
|
0
|
|
|
|
|
0
|
$position += (shift @data) + 32768; |
1699
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1700
|
|
|
|
|
|
|
} |
1701
|
|
|
|
|
|
|
|
1702
|
|
|
|
|
|
|
# store Interval Tree buffer |
1703
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1704
|
|
|
|
|
|
|
} |
1705
|
|
|
|
|
|
|
|
1706
|
|
|
|
|
|
|
sub _load_st_slice { |
1707
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1708
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1709
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1710
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, (score), text |
1712
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
# load the slice contents |
1714
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1715
|
|
|
|
|
|
|
|
1716
|
|
|
|
|
|
|
# first observation |
1717
|
0
|
|
|
|
|
0
|
my $i = 8; # go ahead and set index up to first observation text |
1718
|
0
|
|
|
|
|
0
|
my (undef, $position, $t) = unpack('si>s>', substr($contents, 0, $i)); |
1719
|
|
|
|
|
|
|
# initial null, position, text_length |
1720
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1721
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1722
|
0
|
|
|
|
|
0
|
$i += $t; |
1723
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
# remaining observations |
1725
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1726
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1727
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1728
|
0
|
|
|
|
|
0
|
my ($p, $t) = unpack('s>s>', substr($contents, $i, 4)); |
1729
|
0
|
|
|
|
|
0
|
$i += 4; |
1730
|
0
|
|
|
|
|
0
|
$position += $p + 32768; |
1731
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1732
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1733
|
0
|
|
|
|
|
0
|
$i += $t; |
1734
|
|
|
|
|
|
|
} |
1735
|
|
|
|
|
|
|
|
1736
|
|
|
|
|
|
|
# store Interval Tree buffer |
1737
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1738
|
|
|
|
|
|
|
} |
1739
|
|
|
|
|
|
|
|
1740
|
|
|
|
|
|
|
sub _load_ss_slice { |
1741
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1742
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1743
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1744
|
|
|
|
|
|
|
|
1745
|
|
|
|
|
|
|
# unpack the data slice zip member |
1746
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1747
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>s>(s>s>)$number", $self->zip->contents($slice) ); |
1748
|
|
|
|
|
|
|
|
1749
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, no score |
1750
|
|
|
|
|
|
|
# first observation |
1751
|
0
|
|
|
|
|
0
|
my $position = @data; |
1752
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data) + 32768; |
1753
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1754
|
|
|
|
|
|
|
|
1755
|
|
|
|
|
|
|
# remaining observations |
1756
|
0
|
|
|
|
|
0
|
while (@data) { |
1757
|
0
|
|
|
|
|
0
|
$position += (shift @data) + 32768; |
1758
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data) + 32768; |
1759
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1760
|
|
|
|
|
|
|
} |
1761
|
|
|
|
|
|
|
|
1762
|
|
|
|
|
|
|
# store Interval Tree buffer |
1763
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1764
|
|
|
|
|
|
|
} |
1765
|
|
|
|
|
|
|
|
1766
|
|
|
|
|
|
|
sub _load_ssf_slice { |
1767
|
6
|
|
|
6
|
|
8
|
my $self = shift; |
1768
|
6
|
|
|
|
|
9
|
my $slice = shift; |
1769
|
6
|
|
|
|
|
46
|
my $tree = Set::IntervalTree->new(); |
1770
|
|
|
|
|
|
|
|
1771
|
|
|
|
|
|
|
# unpack the data slice zip member |
1772
|
6
|
|
|
|
|
14
|
my $number = $self->slice_obs_number($slice); |
1773
|
6
|
|
|
|
|
26
|
my ($null, @data) = unpack("si>s>f>(s>s>f>)$number", $self->zip->contents($slice) ); |
1774
|
|
|
|
|
|
|
|
1775
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1776
|
|
|
|
|
|
|
# first observation |
1777
|
6
|
|
|
|
|
25966
|
my $position = (shift @data); |
1778
|
6
|
|
|
|
|
26
|
my $position2 = $position + (shift @data) + 32768; |
1779
|
6
|
|
|
|
|
41
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1780
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
# remaining observations |
1782
|
6
|
|
|
|
|
18
|
while (@data) { |
1783
|
58298
|
|
|
|
|
54937
|
$position += (shift @data) + 32768; |
1784
|
58298
|
|
|
|
|
51623
|
$position2 = $position + (shift @data) + 32768; |
1785
|
58298
|
|
|
|
|
120028
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1786
|
|
|
|
|
|
|
} |
1787
|
|
|
|
|
|
|
|
1788
|
|
|
|
|
|
|
# store Interval Tree buffer |
1789
|
6
|
|
|
|
|
41
|
$self->{buffer}{$slice} = $tree; |
1790
|
|
|
|
|
|
|
} |
1791
|
|
|
|
|
|
|
|
1792
|
|
|
|
|
|
|
sub _load_sst_slice { |
1793
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1794
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1795
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1796
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1797
|
|
|
|
|
|
|
|
1798
|
|
|
|
|
|
|
# first observation |
1799
|
0
|
|
|
|
|
0
|
my $i = 10; # go ahead and set index up to first observation text |
1800
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i)); |
1801
|
|
|
|
|
|
|
# initial null, position, length, text_length |
1802
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l + 32768; |
1803
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1804
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
1805
|
0
|
|
|
|
|
0
|
$i += $t; |
1806
|
|
|
|
|
|
|
|
1807
|
|
|
|
|
|
|
# remaining observations |
1808
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1809
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1810
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1811
|
0
|
|
|
|
|
0
|
($p, $l, $t) = unpack('s>s>s>', substr($contents, $i, 6)); |
1812
|
0
|
|
|
|
|
0
|
$i += 6; |
1813
|
0
|
|
|
|
|
0
|
$position += $p + 32768; |
1814
|
0
|
|
|
|
|
0
|
$position2 = $position + $l + 32768; |
1815
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1816
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
1817
|
0
|
|
|
|
|
0
|
$i += $t; |
1818
|
|
|
|
|
|
|
} |
1819
|
|
|
|
|
|
|
|
1820
|
|
|
|
|
|
|
# store Interval Tree buffer |
1821
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1822
|
|
|
|
|
|
|
} |
1823
|
|
|
|
|
|
|
|
1824
|
|
|
|
|
|
|
sub _load_ssft_slice { |
1825
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
1826
|
1
|
|
|
|
|
2
|
my $slice = shift; |
1827
|
1
|
|
|
|
|
6
|
my $tree = Set::IntervalTree->new(); |
1828
|
1
|
|
|
|
|
13
|
my $contents = $self->zip->contents($slice); |
1829
|
|
|
|
|
|
|
|
1830
|
|
|
|
|
|
|
# first observation |
1831
|
1
|
|
|
|
|
3388
|
my $i = 14; # go ahead and set index up to first observation text |
1832
|
1
|
|
|
|
|
6
|
my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i)); |
1833
|
|
|
|
|
|
|
# initial null, position, length, score, text_length |
1834
|
1
|
|
|
|
|
4
|
my $position2 = $position + $l + 32768; |
1835
|
1
|
|
|
|
|
5
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1836
|
1
|
|
|
|
|
5
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
1837
|
1
|
|
|
|
|
2
|
$i += $t; |
1838
|
|
|
|
|
|
|
|
1839
|
|
|
|
|
|
|
# remaining observations |
1840
|
1
|
|
|
|
|
1
|
my $p; # position offset |
1841
|
1
|
|
|
|
|
2
|
my $len = CORE::length($contents); |
1842
|
1
|
|
|
|
|
4
|
while ($i < $len) { |
1843
|
10000
|
|
|
|
|
17382
|
($p, $l, $s, $t) = unpack('s>s>f>s>', substr($contents, $i, 10)); |
1844
|
10000
|
|
|
|
|
10362
|
$i += 10; |
1845
|
10000
|
|
|
|
|
8556
|
$position += $p + 32768; |
1846
|
10000
|
|
|
|
|
8743
|
$position2 = $position + $l + 32768; |
1847
|
10000
|
|
|
|
|
16254
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1848
|
10000
|
|
|
|
|
27084
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
1849
|
10000
|
|
|
|
|
12977
|
$i += $t; |
1850
|
|
|
|
|
|
|
} |
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
# store Interval Tree buffer |
1853
|
1
|
|
|
|
|
14
|
$self->{buffer}{$slice} = $tree; |
1854
|
|
|
|
|
|
|
} |
1855
|
|
|
|
|
|
|
|
1856
|
|
|
|
|
|
|
sub _load_i_slice { |
1857
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1858
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1859
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1860
|
|
|
|
|
|
|
|
1861
|
|
|
|
|
|
|
# unpack the data slice zip member |
1862
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1863
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>(i>)$number", $self->zip->contents($slice) ); |
1864
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1866
|
|
|
|
|
|
|
# first observation |
1867
|
0
|
|
|
|
|
0
|
my $position = @data; |
1868
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1869
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
# remaining observations |
1871
|
0
|
|
|
|
|
0
|
while (@data) { |
1872
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1873
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef], $position, $position + 1); |
1874
|
|
|
|
|
|
|
} |
1875
|
|
|
|
|
|
|
|
1876
|
|
|
|
|
|
|
# store Interval Tree buffer |
1877
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1878
|
|
|
|
|
|
|
} |
1879
|
|
|
|
|
|
|
|
1880
|
|
|
|
|
|
|
sub _load_if_slice { |
1881
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1882
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1883
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1884
|
|
|
|
|
|
|
|
1885
|
|
|
|
|
|
|
# unpack the data slice zip member |
1886
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1887
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>f>(i>f>)$number", $self->zip->contents($slice) ); |
1888
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1890
|
|
|
|
|
|
|
# first observation |
1891
|
0
|
|
|
|
|
0
|
my $position = @data; |
1892
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1893
|
|
|
|
|
|
|
|
1894
|
|
|
|
|
|
|
# remaining observations |
1895
|
0
|
|
|
|
|
0
|
while (@data) { |
1896
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1897
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1); |
1898
|
|
|
|
|
|
|
} |
1899
|
|
|
|
|
|
|
|
1900
|
|
|
|
|
|
|
# store Interval Tree buffer |
1901
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1902
|
|
|
|
|
|
|
} |
1903
|
|
|
|
|
|
|
|
1904
|
|
|
|
|
|
|
sub _load_it_slice { |
1905
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1906
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1907
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1908
|
|
|
|
|
|
|
|
1909
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, (score), text |
1910
|
|
|
|
|
|
|
|
1911
|
|
|
|
|
|
|
# load the slice contents |
1912
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1913
|
|
|
|
|
|
|
|
1914
|
|
|
|
|
|
|
# first observation |
1915
|
0
|
|
|
|
|
0
|
my $i = 10; # go ahead and set index up to first observation text |
1916
|
0
|
|
|
|
|
0
|
my (undef, $position, $t) = unpack('si>i>', substr($contents, 0, $i)); |
1917
|
|
|
|
|
|
|
# initial null, position, text_length |
1918
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
1919
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1920
|
0
|
|
|
|
|
0
|
$i += $t; |
1921
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
# remaining observations |
1923
|
0
|
|
|
|
|
0
|
my $p; # position offset |
1924
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
1925
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
1926
|
0
|
|
|
|
|
0
|
($p, $t) = unpack('i>s>', substr($contents, $i, 6)); |
1927
|
0
|
|
|
|
|
0
|
$i += 6; |
1928
|
0
|
|
|
|
|
0
|
$position += $p + 32768; |
1929
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
1930
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1); |
1931
|
0
|
|
|
|
|
0
|
$i += $t; |
1932
|
|
|
|
|
|
|
} |
1933
|
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
|
# store Interval Tree buffer |
1935
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1936
|
|
|
|
|
|
|
} |
1937
|
|
|
|
|
|
|
|
1938
|
|
|
|
|
|
|
sub _load_ii_slice { |
1939
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1940
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1941
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1942
|
|
|
|
|
|
|
|
1943
|
|
|
|
|
|
|
# unpack the data slice zip member |
1944
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1945
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>i>(i>i>)$number", $self->zip->contents($slice) ); |
1946
|
|
|
|
|
|
|
|
1947
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1948
|
|
|
|
|
|
|
# first observation |
1949
|
0
|
|
|
|
|
0
|
my $position = @data; |
1950
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data); |
1951
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1952
|
|
|
|
|
|
|
|
1953
|
|
|
|
|
|
|
# remaining observations |
1954
|
0
|
|
|
|
|
0
|
while (@data) { |
1955
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1956
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data); |
1957
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
1958
|
|
|
|
|
|
|
} |
1959
|
|
|
|
|
|
|
|
1960
|
|
|
|
|
|
|
# store Interval Tree buffer |
1961
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1962
|
|
|
|
|
|
|
} |
1963
|
|
|
|
|
|
|
|
1964
|
|
|
|
|
|
|
sub _load_iif_slice { |
1965
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1966
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1967
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1968
|
|
|
|
|
|
|
|
1969
|
|
|
|
|
|
|
# unpack the data slice zip member |
1970
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
1971
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>i>f>(i>i>f>)$number", $self->zip->contents($slice) ); |
1972
|
|
|
|
|
|
|
|
1973
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
1974
|
|
|
|
|
|
|
# first observation |
1975
|
0
|
|
|
|
|
0
|
my $position = @data; |
1976
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data); |
1977
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1978
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
# remaining observations |
1980
|
0
|
|
|
|
|
0
|
while (@data) { |
1981
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
1982
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data); |
1983
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
1984
|
|
|
|
|
|
|
} |
1985
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
# store Interval Tree buffer |
1987
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
1988
|
|
|
|
|
|
|
} |
1989
|
|
|
|
|
|
|
|
1990
|
|
|
|
|
|
|
sub _load_iit_slice { |
1991
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1992
|
0
|
|
|
|
|
0
|
my $slice = shift; |
1993
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
1994
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
1995
|
|
|
|
|
|
|
|
1996
|
|
|
|
|
|
|
# first observation |
1997
|
0
|
|
|
|
|
0
|
my $i = 12; # go ahead and set index up to first observation text |
1998
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $t) = unpack('si>i>s>', substr($contents, 0, $i)); |
1999
|
|
|
|
|
|
|
# initial null, position, length, text_length |
2000
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l; |
2001
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
2002
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
2003
|
0
|
|
|
|
|
0
|
$i += $t; |
2004
|
|
|
|
|
|
|
|
2005
|
|
|
|
|
|
|
# remaining observations |
2006
|
0
|
|
|
|
|
0
|
my $p; # position offset |
2007
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
2008
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
2009
|
0
|
|
|
|
|
0
|
($p, $l, $t) = unpack('i>i>s>', substr($contents, $i, 10)); |
2010
|
0
|
|
|
|
|
0
|
$i += 10; |
2011
|
0
|
|
|
|
|
0
|
$position += $p; |
2012
|
0
|
|
|
|
|
0
|
$position2 = $position + $l; |
2013
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2014
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
2015
|
0
|
|
|
|
|
0
|
$i += $t; |
2016
|
|
|
|
|
|
|
} |
2017
|
|
|
|
|
|
|
|
2018
|
|
|
|
|
|
|
# store Interval Tree buffer |
2019
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2020
|
|
|
|
|
|
|
} |
2021
|
|
|
|
|
|
|
|
2022
|
|
|
|
|
|
|
sub _load_iift_slice { |
2023
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2024
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2025
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2026
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
2027
|
|
|
|
|
|
|
|
2028
|
|
|
|
|
|
|
# first observation |
2029
|
0
|
|
|
|
|
0
|
my $i = 16; # go ahead and set index up to first observation text |
2030
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $s, $t) = unpack('si>i>f>s>', substr($contents, 0, $i)); |
2031
|
|
|
|
|
|
|
# initial null, position, length, score, text_length |
2032
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l; |
2033
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
2034
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2035
|
0
|
|
|
|
|
0
|
$i += $t; |
2036
|
|
|
|
|
|
|
|
2037
|
|
|
|
|
|
|
# remaining observations |
2038
|
0
|
|
|
|
|
0
|
my $p; # position offset |
2039
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
2040
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
2041
|
0
|
|
|
|
|
0
|
($p, $l, $s, $t) = unpack('i>i>f>s>', substr($contents, $i, 14)); |
2042
|
0
|
|
|
|
|
0
|
$i += 14; |
2043
|
0
|
|
|
|
|
0
|
$position += $p; |
2044
|
0
|
|
|
|
|
0
|
$position2 = $position + $l; |
2045
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2046
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2047
|
0
|
|
|
|
|
0
|
$i += $t; |
2048
|
|
|
|
|
|
|
} |
2049
|
|
|
|
|
|
|
|
2050
|
|
|
|
|
|
|
# store Interval Tree buffer |
2051
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2052
|
|
|
|
|
|
|
} |
2053
|
|
|
|
|
|
|
|
2054
|
|
|
|
|
|
|
sub _load_is_slice { |
2055
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2056
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2057
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2058
|
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
|
# unpack the data slice zip member |
2060
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
2061
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>s>(i>s>)$number", $self->zip->contents($slice) ); |
2062
|
|
|
|
|
|
|
|
2063
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
2064
|
|
|
|
|
|
|
# first observation |
2065
|
0
|
|
|
|
|
0
|
my $position = @data; |
2066
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data) + 32768; |
2067
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
2068
|
|
|
|
|
|
|
|
2069
|
|
|
|
|
|
|
# remaining observations |
2070
|
0
|
|
|
|
|
0
|
while (@data) { |
2071
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
2072
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data) + 32768; |
2073
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef], $position, $position2); |
2074
|
|
|
|
|
|
|
} |
2075
|
|
|
|
|
|
|
|
2076
|
|
|
|
|
|
|
# store Interval Tree buffer |
2077
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2078
|
|
|
|
|
|
|
} |
2079
|
|
|
|
|
|
|
|
2080
|
|
|
|
|
|
|
sub _load_isf_slice { |
2081
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2082
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2083
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2084
|
|
|
|
|
|
|
|
2085
|
|
|
|
|
|
|
# unpack the data slice zip member |
2086
|
0
|
|
|
|
|
0
|
my $number = $self->slice_obs_number($slice); |
2087
|
0
|
|
|
|
|
0
|
my ($null, @data) = unpack("si>s>f>(i>s>f>)$number", $self->zip->contents($slice) ); |
2088
|
|
|
|
|
|
|
|
2089
|
|
|
|
|
|
|
# convert the unpacked data into start, stop, score |
2090
|
|
|
|
|
|
|
# first observation |
2091
|
0
|
|
|
|
|
0
|
my $position = @data; |
2092
|
0
|
|
|
|
|
0
|
my $position2 = $position + (shift @data) + 32768; |
2093
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
2094
|
|
|
|
|
|
|
|
2095
|
|
|
|
|
|
|
# remaining observations |
2096
|
0
|
|
|
|
|
0
|
while (@data) { |
2097
|
0
|
|
|
|
|
0
|
$position += (shift @data); |
2098
|
0
|
|
|
|
|
0
|
$position2 = $position + (shift @data) + 32768; |
2099
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, shift(@data)], $position, $position2); |
2100
|
|
|
|
|
|
|
} |
2101
|
|
|
|
|
|
|
|
2102
|
|
|
|
|
|
|
# store Interval Tree buffer |
2103
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2104
|
|
|
|
|
|
|
} |
2105
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
sub _load_ist_slice { |
2107
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2108
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2109
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2110
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
2111
|
|
|
|
|
|
|
|
2112
|
|
|
|
|
|
|
# first observation |
2113
|
0
|
|
|
|
|
0
|
my $i = 10; # go ahead and set index up to first observation text |
2114
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i)); |
2115
|
|
|
|
|
|
|
# initial null, position, length, text_length |
2116
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l + 32768; |
2117
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
2118
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
2119
|
0
|
|
|
|
|
0
|
$i += $t; |
2120
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
# remaining observations |
2122
|
0
|
|
|
|
|
0
|
my $p; # position offset |
2123
|
0
|
|
|
|
|
0
|
my $len = CORE::length($contents); |
2124
|
0
|
|
|
|
|
0
|
while ($i < $len) { |
2125
|
0
|
|
|
|
|
0
|
($p, $l, $t) = unpack('i>s>s>', substr($contents, $i, 8)); |
2126
|
0
|
|
|
|
|
0
|
$i += 8; |
2127
|
0
|
|
|
|
|
0
|
$position += $p; |
2128
|
0
|
|
|
|
|
0
|
$position2 = $position + $l + 32768; |
2129
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2130
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, undef, $text], $position, $position2); |
2131
|
0
|
|
|
|
|
0
|
$i += $t; |
2132
|
|
|
|
|
|
|
} |
2133
|
|
|
|
|
|
|
|
2134
|
|
|
|
|
|
|
# store Interval Tree buffer |
2135
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2136
|
|
|
|
|
|
|
} |
2137
|
|
|
|
|
|
|
|
2138
|
|
|
|
|
|
|
sub _load_isft_slice { |
2139
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2140
|
0
|
|
|
|
|
0
|
my $slice = shift; |
2141
|
0
|
|
|
|
|
0
|
my $tree = Set::IntervalTree->new(); |
2142
|
0
|
|
|
|
|
0
|
my $contents = $self->zip->contents($slice); |
2143
|
|
|
|
|
|
|
|
2144
|
|
|
|
|
|
|
# first observation |
2145
|
0
|
|
|
|
|
0
|
my $i = 14; # go ahead and set index up to first observation text |
2146
|
0
|
|
|
|
|
0
|
my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i)); |
2147
|
|
|
|
|
|
|
# initial null, position, length, score, text_length |
2148
|
0
|
|
|
|
|
0
|
my $position2 = $position + $l + 32768; |
2149
|
0
|
|
|
|
|
0
|
my $text = unpack("A$t", substr($contents, $i, $t)); |
2150
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2151
|
0
|
|
|
|
|
0
|
$i += $t; |
2152
|
|
|
|
|
|
|
|
2153
|
|
|
|
|
|
|
# remaining observations |
2154
|
0
|
|
|
|
|
0
|
my $p; # position offset |
2155
|
0
|
|
|
|
|
0
|
while ($i < CORE::length($contents)) { |
2156
|
0
|
|
|
|
|
0
|
($p, $l, $s, $t) = unpack('i>s>f>s>', substr($contents, $i, 12)); |
2157
|
0
|
|
|
|
|
0
|
$i += 12; |
2158
|
0
|
|
|
|
|
0
|
$position += $p; |
2159
|
0
|
|
|
|
|
0
|
$position2 = $position + $l + 32768; |
2160
|
0
|
|
|
|
|
0
|
$text = unpack("A$t", substr($contents, $i, $t)); |
2161
|
0
|
|
|
|
|
0
|
$tree->insert( [$position, $position2, $s, $text], $position, $position2); |
2162
|
0
|
|
|
|
|
0
|
$i += $t; |
2163
|
|
|
|
|
|
|
} |
2164
|
|
|
|
|
|
|
|
2165
|
|
|
|
|
|
|
# store Interval Tree buffer |
2166
|
0
|
|
|
|
|
0
|
$self->{buffer}{$slice} = $tree; |
2167
|
|
|
|
|
|
|
} |
2168
|
|
|
|
|
|
|
|
2169
|
|
|
|
|
|
|
sub _mean_score { |
2170
|
1000
|
|
|
1000
|
|
899
|
my $self = shift; |
2171
|
1000
|
|
|
|
|
1231
|
my ($start, $stop, $slices) = @_; |
2172
|
1000
|
50
|
|
|
|
1333
|
return unless @$slices; |
2173
|
|
|
|
|
|
|
|
2174
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2175
|
1000
|
|
|
|
|
876
|
my $sum; |
2176
|
1000
|
|
|
|
|
897
|
my $count = 0; |
2177
|
1000
|
|
|
|
|
1050
|
foreach my $slice (@$slices) { |
2178
|
1001
|
50
|
|
|
|
1296
|
next unless $self->slice_type($slice) =~ /f/; |
2179
|
|
|
|
|
|
|
# no sense going through if no score present |
2180
|
|
|
|
|
|
|
|
2181
|
|
|
|
|
|
|
# load and unpack the data |
2182
|
1001
|
|
|
|
|
2009
|
$self->_load_slice($slice); |
2183
|
|
|
|
|
|
|
|
2184
|
|
|
|
|
|
|
# find the overlapping observations and record values |
2185
|
1001
|
|
|
|
|
87617
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
2186
|
1001
|
|
|
|
|
2002
|
foreach my $r (@$results) { |
2187
|
2276
|
|
100
|
|
|
3729
|
$sum += $r->[2] || 0; |
2188
|
2276
|
|
|
|
|
2763
|
$count++; |
2189
|
|
|
|
|
|
|
} |
2190
|
|
|
|
|
|
|
} |
2191
|
1000
|
50
|
|
|
|
3614
|
return $count ? $sum / $count : undef; |
2192
|
|
|
|
|
|
|
} |
2193
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
sub _stat_summary { |
2195
|
10
|
|
|
10
|
|
10
|
my $self = shift; |
2196
|
10
|
|
|
|
|
13
|
my ($start, $stop, $slices) = @_; |
2197
|
10
|
50
|
|
|
|
18
|
return unless @$slices; |
2198
|
|
|
|
|
|
|
|
2199
|
|
|
|
|
|
|
# initialize the statistical scores |
2200
|
10
|
|
|
|
|
24
|
my $count = 0; |
2201
|
10
|
|
|
|
|
19
|
my $sum; |
2202
|
|
|
|
|
|
|
my $sum_squares; |
2203
|
10
|
|
|
|
|
0
|
my $min; |
2204
|
10
|
|
|
|
|
0
|
my $max; |
2205
|
|
|
|
|
|
|
|
2206
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2207
|
10
|
|
|
|
|
13
|
foreach my $slice (@$slices) { |
2208
|
11
|
50
|
|
|
|
19
|
next unless $self->slice_type($slice) =~ /f/; |
2209
|
|
|
|
|
|
|
# no sense going through if no score present |
2210
|
|
|
|
|
|
|
|
2211
|
|
|
|
|
|
|
# load and unpack the data |
2212
|
11
|
|
|
|
|
26
|
$self->_load_slice($slice); |
2213
|
|
|
|
|
|
|
|
2214
|
|
|
|
|
|
|
# find the overlapping observations |
2215
|
11
|
|
|
|
|
1558
|
my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop); |
2216
|
11
|
|
|
|
|
32
|
foreach my $r (@$results) { |
2217
|
|
|
|
|
|
|
# each observation is [start, stop, score] |
2218
|
11994
|
50
|
|
|
|
14234
|
if (defined $r->[2]) { |
2219
|
11994
|
|
|
|
|
10021
|
$count++; |
2220
|
11994
|
|
|
|
|
10760
|
$sum += $r->[2]; |
2221
|
11994
|
|
|
|
|
10801
|
$sum_squares += ($r->[2] * $r->[2]); |
2222
|
11994
|
100
|
100
|
|
|
21679
|
$min = $r->[2] if (!defined $min or $r->[2] < $min); |
2223
|
11994
|
100
|
100
|
|
|
21912
|
$max = $r->[2] if (!defined $max or $r->[2] > $max); |
2224
|
|
|
|
|
|
|
} |
2225
|
|
|
|
|
|
|
} |
2226
|
|
|
|
|
|
|
} |
2227
|
|
|
|
|
|
|
|
2228
|
|
|
|
|
|
|
# assemble the statistical summary hash |
2229
|
10
|
|
50
|
|
|
79
|
my %summary = ( |
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
2230
|
|
|
|
|
|
|
'validCount' => $count, |
2231
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
2232
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
2233
|
|
|
|
|
|
|
'minVal' => $min || 0, |
2234
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
2235
|
|
|
|
|
|
|
); |
2236
|
10
|
|
|
|
|
74
|
return \%summary; |
2237
|
|
|
|
|
|
|
} |
2238
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
sub _rewrite_metadata { |
2240
|
1
|
|
|
1
|
|
4
|
my $self = shift; |
2241
|
1
|
50
|
|
|
|
6
|
return unless (-w $self->zip->fileName); |
2242
|
1
|
|
|
|
|
54
|
my $md = $self->{'metadata'}; |
2243
|
|
|
|
|
|
|
|
2244
|
|
|
|
|
|
|
# generate new metadata as an array |
2245
|
1
|
|
|
|
|
2
|
my @new_md; |
2246
|
1
|
|
|
|
|
5
|
push @new_md, "useqArchiveVersion = " . $md->{useqArchiveVersion}; |
2247
|
1
|
50
|
|
|
|
4
|
push @new_md, @{ $md->{comments} } if exists $md->{comments}; |
|
1
|
|
|
|
|
3
|
|
2248
|
1
|
|
|
|
|
3
|
push @new_md, "dataType = " . $md->{dataType}; |
2249
|
1
|
|
|
|
|
3
|
push @new_md, "versionedGenome = " . $md->{versionedGenome}; |
2250
|
|
|
|
|
|
|
|
2251
|
|
|
|
|
|
|
# additional keys that may be present |
2252
|
1
|
|
|
|
|
7
|
foreach (keys %$md) { |
2253
|
7
|
100
|
|
|
|
35
|
next if /useqArchiveVersion|dataType|versionedGenome|comments|chromStats|globalStats/; |
2254
|
2
|
|
|
|
|
6
|
push @new_md, "$_ = $md->{$_}"; |
2255
|
|
|
|
|
|
|
} |
2256
|
|
|
|
|
|
|
|
2257
|
|
|
|
|
|
|
# global and chromosome stats |
2258
|
1
|
|
|
|
|
2
|
push @new_md, |
2259
|
|
|
|
|
|
|
"# Bio::DB::USeq statistics validCount,sumData,sumSquares,minVal,maxVal"; |
2260
|
1
|
50
|
|
|
|
3
|
push @new_md, "globalStats = $md->{globalStats}" if exists $md->{globalStats}; |
2261
|
1
|
|
|
|
|
10
|
foreach (sort {$a cmp $b} keys %$md) { |
|
12
|
|
|
|
|
15
|
|
2262
|
7
|
100
|
|
|
|
19
|
push @new_md, "$_ = $md->{$_}" if /chromStats/; |
2263
|
|
|
|
|
|
|
} |
2264
|
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
|
# write the new metadata to the zip Archive |
2266
|
1
|
|
|
|
|
10
|
$self->{'zip'}->contents('archiveReadMe.txt', join("\n", @new_md)); |
2267
|
1
|
|
|
|
|
668
|
$self->{'zip'}->overwrite; |
2268
|
1
|
|
|
|
|
6661
|
$self->clone; # not sure if this is necessary, but just in case we reopen the zip |
2269
|
|
|
|
|
|
|
} |
2270
|
|
|
|
|
|
|
|
2271
|
|
|
|
|
|
|
|
2272
|
|
|
|
|
|
|
######## Exported Functions ######## |
2273
|
|
|
|
|
|
|
# these are borrowed from Bio::DB::BigWig from Lincoln Stein |
2274
|
|
|
|
|
|
|
|
2275
|
|
|
|
|
|
|
sub binMean { |
2276
|
2
|
|
|
2
|
1
|
130
|
my $score = shift; |
2277
|
2
|
50
|
|
|
|
9
|
return 0 unless $score->{validCount}; |
2278
|
2
|
|
|
|
|
9
|
$score->{sumData}/$score->{validCount}; |
2279
|
|
|
|
|
|
|
} |
2280
|
|
|
|
|
|
|
|
2281
|
|
|
|
|
|
|
sub binVariance { |
2282
|
1
|
|
|
1
|
1
|
2
|
my $score = shift; |
2283
|
1
|
50
|
|
|
|
4
|
return 0 unless $score->{validCount}; |
2284
|
1
|
|
|
|
|
7
|
my $var = $score->{sumSquares} - $score->{sumData}**2/$score->{validCount}; |
2285
|
1
|
50
|
|
|
|
4
|
if ($score->{validCount} > 1) { |
2286
|
1
|
|
|
|
|
3
|
$var /= $score->{validCount}-1; |
2287
|
|
|
|
|
|
|
} |
2288
|
1
|
50
|
|
|
|
4
|
return 0 if $var < 0; |
2289
|
1
|
|
|
|
|
3
|
return $var; |
2290
|
|
|
|
|
|
|
} |
2291
|
|
|
|
|
|
|
|
2292
|
|
|
|
|
|
|
sub binStdev { |
2293
|
1
|
|
|
1
|
1
|
7
|
my $score = shift; |
2294
|
1
|
|
|
|
|
4
|
return sqrt(binVariance($score)); |
2295
|
|
|
|
|
|
|
} |
2296
|
|
|
|
|
|
|
|
2297
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
|
2299
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
######## Other Classes ############# |
2301
|
|
|
|
|
|
|
|
2302
|
|
|
|
|
|
|
package Bio::DB::USeq::Feature; |
2303
|
1
|
|
|
1
|
|
7
|
use base 'Bio::SeqFeature::Lite'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
481
|
|
2304
|
|
|
|
|
|
|
|
2305
|
|
|
|
|
|
|
sub new { |
2306
|
13
|
|
|
13
|
|
24
|
my $class = shift; |
2307
|
13
|
|
|
|
|
56
|
return $class->SUPER::new(@_); |
2308
|
|
|
|
|
|
|
} |
2309
|
|
|
|
|
|
|
|
2310
|
|
|
|
|
|
|
sub type { |
2311
|
|
|
|
|
|
|
# Bio::SeqFeature::Lite mangles the type and returns |
2312
|
|
|
|
|
|
|
# primary_tag:source if both are set |
2313
|
|
|
|
|
|
|
# this may wreck havoc with parsers when the type already has a : |
2314
|
|
|
|
|
|
|
# as in wiggle:1000 |
2315
|
16
|
|
|
16
|
|
130
|
my $self = shift; |
2316
|
16
|
|
|
|
|
94
|
return $self->{type}; |
2317
|
|
|
|
|
|
|
} |
2318
|
|
|
|
|
|
|
|
2319
|
|
|
|
|
|
|
sub chr_stats { |
2320
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2321
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2322
|
0
|
|
|
|
|
0
|
return $self->{useq}->chr_stats( $self->seq_id ); |
2323
|
|
|
|
|
|
|
} |
2324
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
sub chr_mean { |
2326
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2327
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2328
|
0
|
|
|
|
|
0
|
return $self->{useq}->chr_mean( $self->seq_id ); |
2329
|
|
|
|
|
|
|
} |
2330
|
|
|
|
|
|
|
|
2331
|
|
|
|
|
|
|
sub chr_stdev { |
2332
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2333
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2334
|
0
|
|
|
|
|
0
|
return $self->{useq}->chr_stdev( $self->seq_id ); |
2335
|
|
|
|
|
|
|
} |
2336
|
|
|
|
|
|
|
|
2337
|
|
|
|
|
|
|
sub global_stats { |
2338
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2339
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2340
|
0
|
|
|
|
|
0
|
return $self->{useq}->global_stats( $self->seq_id ); |
2341
|
|
|
|
|
|
|
} |
2342
|
|
|
|
|
|
|
|
2343
|
|
|
|
|
|
|
sub global_mean { |
2344
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2345
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2346
|
0
|
|
|
|
|
0
|
return $self->{useq}->global_mean( $self->seq_id ); |
2347
|
|
|
|
|
|
|
} |
2348
|
|
|
|
|
|
|
|
2349
|
|
|
|
|
|
|
sub global_stdev { |
2350
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2351
|
0
|
0
|
|
|
|
0
|
return unless exists $self->{useq}; |
2352
|
0
|
|
|
|
|
0
|
return $self->{useq}->global_stdev( $self->seq_id ); |
2353
|
|
|
|
|
|
|
} |
2354
|
|
|
|
|
|
|
|
2355
|
|
|
|
|
|
|
|
2356
|
|
|
|
|
|
|
|
2357
|
|
|
|
|
|
|
|
2358
|
|
|
|
|
|
|
package Bio::DB::USeq::Segment; |
2359
|
1
|
|
|
1
|
|
50483
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
670
|
|
2360
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
sub new { |
2362
|
1
|
|
|
1
|
|
2
|
my $class = shift; |
2363
|
1
|
|
|
|
|
4
|
my %args = @_; |
2364
|
1
|
|
|
|
|
7
|
my $segment = $class->SUPER::new(@_); |
2365
|
1
|
50
|
|
|
|
74
|
$segment->{'useq'} = $args{'-useq'} if exists $args{'-useq'}; |
2366
|
1
|
|
|
|
|
2
|
return $segment; |
2367
|
|
|
|
|
|
|
} |
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
sub scores { |
2370
|
1
|
|
|
1
|
|
108
|
my $self = shift; |
2371
|
1
|
|
|
|
|
6
|
return $self->{'useq'}->scores( |
2372
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2373
|
|
|
|
|
|
|
-start => $self->start, |
2374
|
|
|
|
|
|
|
-end => $self->end, |
2375
|
|
|
|
|
|
|
-strand => $self->strand, |
2376
|
|
|
|
|
|
|
); |
2377
|
|
|
|
|
|
|
} |
2378
|
|
|
|
|
|
|
|
2379
|
|
|
|
|
|
|
sub features { |
2380
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2381
|
0
|
|
|
|
|
0
|
my $type = shift; |
2382
|
0
|
|
0
|
|
|
0
|
$type ||= 'region'; |
2383
|
0
|
|
|
|
|
0
|
return $self->{'useq'}->features( |
2384
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2385
|
|
|
|
|
|
|
-start => $self->start, |
2386
|
|
|
|
|
|
|
-end => $self->end, |
2387
|
|
|
|
|
|
|
-strand => $self->strand, |
2388
|
|
|
|
|
|
|
-type => $type, |
2389
|
|
|
|
|
|
|
); |
2390
|
|
|
|
|
|
|
} |
2391
|
|
|
|
|
|
|
|
2392
|
|
|
|
|
|
|
sub get_seq_stream { |
2393
|
1
|
|
|
1
|
|
258
|
my $self = shift; |
2394
|
1
|
|
|
|
|
2
|
my $type = shift; |
2395
|
1
|
|
50
|
|
|
16
|
$type ||= 'region'; |
2396
|
|
|
|
|
|
|
return Bio::DB::USeq::Iterator->new( |
2397
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2398
|
|
|
|
|
|
|
-start => $self->start, |
2399
|
|
|
|
|
|
|
-end => $self->end, |
2400
|
|
|
|
|
|
|
-strand => $self->strand, |
2401
|
|
|
|
|
|
|
-source => $self->source, |
2402
|
|
|
|
|
|
|
-type => $type, |
2403
|
|
|
|
|
|
|
-useq => $self->{useq}, |
2404
|
1
|
|
|
|
|
5
|
); |
2405
|
|
|
|
|
|
|
} |
2406
|
|
|
|
|
|
|
|
2407
|
|
|
|
|
|
|
sub slices { |
2408
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2409
|
0
|
|
|
|
|
0
|
my @slices = $self->{'useq'}->_translate_coordinates_to_slices( |
2410
|
|
|
|
|
|
|
$self->seq_id, $self->start, $self->end, $self->strand |
2411
|
|
|
|
|
|
|
); |
2412
|
0
|
0
|
|
|
|
0
|
return wantarray ? @slices : \@slices; |
2413
|
|
|
|
|
|
|
} |
2414
|
|
|
|
|
|
|
|
2415
|
|
|
|
|
|
|
sub coverage { |
2416
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2417
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2418
|
0
|
|
0
|
|
|
0
|
$bins ||= $self->length; |
2419
|
0
|
|
|
|
|
0
|
return $self->features("coverage:$bins"); |
2420
|
|
|
|
|
|
|
} |
2421
|
|
|
|
|
|
|
|
2422
|
|
|
|
|
|
|
sub wiggle { |
2423
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2424
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2425
|
0
|
|
0
|
|
|
0
|
$bins ||= $self->length; |
2426
|
0
|
|
|
|
|
0
|
return $self->features("wiggle:$bins"); |
2427
|
|
|
|
|
|
|
} |
2428
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
sub statistical_summary { |
2430
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2431
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2432
|
0
|
|
0
|
|
|
0
|
$bins ||= 1; |
2433
|
0
|
|
|
|
|
0
|
return $self->features("summary:$bins"); |
2434
|
|
|
|
|
|
|
} |
2435
|
|
|
|
|
|
|
|
2436
|
|
|
|
|
|
|
sub observations { |
2437
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2438
|
0
|
|
|
|
|
0
|
return $self->{'useq'}->observations( |
2439
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2440
|
|
|
|
|
|
|
-start => $self->start, |
2441
|
|
|
|
|
|
|
-end => $self->end, |
2442
|
|
|
|
|
|
|
-strand => $self->strand, |
2443
|
|
|
|
|
|
|
); |
2444
|
|
|
|
|
|
|
} |
2445
|
|
|
|
|
|
|
|
2446
|
|
|
|
|
|
|
|
2447
|
|
|
|
|
|
|
package Bio::DB::USeq::Iterator; |
2448
|
1
|
|
|
1
|
|
18
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1252
|
|
2449
|
|
|
|
|
|
|
|
2450
|
|
|
|
|
|
|
sub new { |
2451
|
4
|
|
|
4
|
|
57
|
my $class = shift; |
2452
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
# create object |
2454
|
4
|
|
|
|
|
18
|
my %args = @_; |
2455
|
4
|
50
|
|
|
|
11
|
my $useq = $args{'-useq'} or |
2456
|
|
|
|
|
|
|
die "Bio::DB::USeq::Iterator cannot be created without a -useq argument"; |
2457
|
4
|
|
|
|
|
18
|
my $iterator = $class->SUPER::new(@_); |
2458
|
4
|
|
|
|
|
218
|
$iterator->{'useq'} = $useq; |
2459
|
|
|
|
|
|
|
|
2460
|
|
|
|
|
|
|
# determine which members to retrieve |
2461
|
|
|
|
|
|
|
my @slices = $useq->_translate_coordinates_to_slices( |
2462
|
|
|
|
|
|
|
$args{-seq_id}, |
2463
|
|
|
|
|
|
|
$args{-start}, |
2464
|
|
|
|
|
|
|
$args{-end}, |
2465
|
|
|
|
|
|
|
$args{-strand}, |
2466
|
4
|
|
|
|
|
15
|
); |
2467
|
4
|
|
|
|
|
12
|
$useq->_clear_buffer(\@slices); |
2468
|
|
|
|
|
|
|
|
2469
|
|
|
|
|
|
|
# sort the slices as necessary |
2470
|
4
|
100
|
|
|
|
16
|
if (scalar(@slices) > 1) { |
2471
|
6
|
|
|
|
|
12
|
@slices = map {$_->[1]} |
2472
|
3
|
|
|
|
|
11
|
sort {$a->[0] <=> $b->[0]} |
2473
|
3
|
|
|
|
|
17
|
map { [$useq->slice_start($_), $_] } @slices; |
|
6
|
|
|
|
|
15
|
|
2474
|
|
|
|
|
|
|
} |
2475
|
|
|
|
|
|
|
|
2476
|
|
|
|
|
|
|
# how we set up the iterator depends on the feature type requested |
2477
|
|
|
|
|
|
|
# we need to add specific information to iterator object |
2478
|
|
|
|
|
|
|
|
2479
|
4
|
100
|
|
|
|
15
|
if ($iterator->type =~ /region|interval|observation/) { |
2480
|
|
|
|
|
|
|
# useq observation features are simple |
2481
|
2
|
|
|
|
|
5
|
$iterator->{'wiggle'} = 0; |
2482
|
|
|
|
|
|
|
|
2483
|
|
|
|
|
|
|
# prepare specific iterator information |
2484
|
2
|
|
|
|
|
5
|
$iterator->{'slices'} = \@slices; |
2485
|
2
|
|
|
|
|
11
|
$iterator->{'current_results'} = undef; |
2486
|
2
|
|
|
|
|
6
|
$iterator->{'current_strand'} = undef; |
2487
|
|
|
|
|
|
|
|
2488
|
2
|
|
|
|
|
9
|
return $iterator; |
2489
|
|
|
|
|
|
|
} |
2490
|
|
|
|
|
|
|
|
2491
|
|
|
|
|
|
|
# otherwise we work with more complex wiggle or summary features |
2492
|
|
|
|
|
|
|
# set the type of wiggle feature: 1 = wiggle, 2 = summary |
2493
|
2
|
100
|
|
|
|
6
|
$iterator->{'wiggle'} = $iterator->type =~ /summary/ ? 2 : 1; |
2494
|
|
|
|
|
|
|
|
2495
|
|
|
|
|
|
|
# normally the iterator will only return one wigggle|summary feature, but |
2496
|
|
|
|
|
|
|
# will return two if stranded data, per behavior for Bio::Grapics... |
2497
|
2
|
50
|
33
|
|
|
11
|
if ($iterator->strand == 0 and $useq->stranded) { |
2498
|
|
|
|
|
|
|
# we have stranded data, so need to return two features |
2499
|
|
|
|
|
|
|
# separate the slices into each respective strands for each feature |
2500
|
0
|
|
|
|
|
0
|
my (@f, @r); |
2501
|
0
|
|
|
|
|
0
|
foreach (@slices) { |
2502
|
0
|
0
|
|
|
|
0
|
if ($useq->slice_strand($_) == 1) { |
2503
|
0
|
|
|
|
|
0
|
push @f, $_; |
2504
|
|
|
|
|
|
|
} |
2505
|
|
|
|
|
|
|
else { |
2506
|
0
|
|
|
|
|
0
|
push @r, $_; |
2507
|
|
|
|
|
|
|
} |
2508
|
|
|
|
|
|
|
} |
2509
|
0
|
|
|
|
|
0
|
$iterator->{slices} = [\@f, \@r]; |
2510
|
|
|
|
|
|
|
} |
2511
|
|
|
|
|
|
|
else { |
2512
|
|
|
|
|
|
|
# only need to return one feature |
2513
|
2
|
|
|
|
|
6
|
$iterator->{slices} = [ \@slices ]; |
2514
|
|
|
|
|
|
|
} |
2515
|
|
|
|
|
|
|
|
2516
|
|
|
|
|
|
|
# check for type and bins |
2517
|
2
|
|
|
|
|
4
|
my ($bin, $step); |
2518
|
2
|
100
|
|
|
|
6
|
if ($iterator->type =~ /:(\d+)$/i) { |
2519
|
1
|
|
|
|
|
3
|
$bin = $1; |
2520
|
1
|
|
|
|
|
6
|
my $length = $iterator->length; |
2521
|
1
|
50
|
|
|
|
18
|
$bin = $length if $bin > $length; |
2522
|
1
|
|
|
|
|
3
|
$step = $length/$bin; |
2523
|
|
|
|
|
|
|
} |
2524
|
2
|
|
|
|
|
5
|
$iterator->{bin} = $bin; |
2525
|
2
|
|
|
|
|
4
|
$iterator->{step} = $step; |
2526
|
|
|
|
|
|
|
|
2527
|
2
|
|
|
|
|
8
|
return $iterator; |
2528
|
|
|
|
|
|
|
} |
2529
|
|
|
|
|
|
|
|
2530
|
|
|
|
|
|
|
sub next_seq { |
2531
|
10
|
|
|
10
|
|
324
|
my $self = shift; |
2532
|
|
|
|
|
|
|
|
2533
|
10
|
100
|
|
|
|
32
|
if ($self->{'wiggle'} == 1) { |
|
|
100
|
|
|
|
|
|
2534
|
2
|
|
|
|
|
6
|
return $self->_next_wiggle; |
2535
|
|
|
|
|
|
|
} |
2536
|
|
|
|
|
|
|
elsif ($self->{'wiggle'} == 2) { |
2537
|
2
|
|
|
|
|
6
|
return $self->_next_summary; |
2538
|
|
|
|
|
|
|
} |
2539
|
|
|
|
|
|
|
else { |
2540
|
6
|
|
|
|
|
14
|
return $self->_next_region; |
2541
|
|
|
|
|
|
|
} |
2542
|
|
|
|
|
|
|
} |
2543
|
|
|
|
|
|
|
|
2544
|
0
|
|
|
0
|
|
0
|
sub next_feature {return shift->next_seq} |
2545
|
|
|
|
|
|
|
|
2546
|
|
|
|
|
|
|
sub _next_region { |
2547
|
|
|
|
|
|
|
# this will keep returning observations as SeqFeature objects until we run out |
2548
|
|
|
|
|
|
|
# of observations in the requested interval |
2549
|
6
|
|
|
6
|
|
8
|
my $self = shift; |
2550
|
6
|
|
|
|
|
9
|
my $useq = $self->{'useq'}; |
2551
|
|
|
|
|
|
|
|
2552
|
|
|
|
|
|
|
# get current results |
2553
|
6
|
100
|
|
|
|
11
|
unless ($self->{'current_results'}) { |
2554
|
2
|
|
50
|
|
|
3
|
my $slice = shift @{ $self->{'slices'} } || undef; |
2555
|
2
|
50
|
|
|
|
10
|
return unless $slice; # no more slices, we're done |
2556
|
|
|
|
|
|
|
|
2557
|
|
|
|
|
|
|
# collect the observations |
2558
|
2
|
|
|
|
|
7
|
$useq->_load_slice($slice); |
2559
|
2
|
|
|
|
|
20
|
my $result = $useq->{buffer}{$slice}->fetch($self->start - 1, $self->end); |
2560
|
|
|
|
|
|
|
|
2561
|
|
|
|
|
|
|
# sort the observations and store the results |
2562
|
2
|
50
|
|
|
|
367
|
my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] <=> $b->[1]} @$result; |
|
303
|
|
|
|
|
372
|
|
2563
|
2
|
|
|
|
|
8
|
$self->{'current_results'} = \@sorted; |
2564
|
2
|
|
|
|
|
19
|
$self->{'current_strand'} = $useq->slice_strand($slice); |
2565
|
|
|
|
|
|
|
} |
2566
|
|
|
|
|
|
|
|
2567
|
|
|
|
|
|
|
# return next observation as a Feature object |
2568
|
6
|
|
|
|
|
8
|
my $obs = shift @{ $self->{'current_results'} }; |
|
6
|
|
|
|
|
11
|
|
2569
|
6
|
50
|
|
|
|
12
|
unless (defined $obs) { |
2570
|
|
|
|
|
|
|
# no more observations for current slice, try to move to next slice |
2571
|
0
|
|
|
|
|
0
|
undef $self->{'current_results'}; |
2572
|
0
|
|
|
|
|
0
|
return $self->_next_region; |
2573
|
|
|
|
|
|
|
} |
2574
|
|
|
|
|
|
|
return Bio::DB::USeq::Feature->new( |
2575
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2576
|
|
|
|
|
|
|
-start => $obs->[0] + 1, |
2577
|
|
|
|
|
|
|
-end => $obs->[1], |
2578
|
6
|
100
|
|
|
|
17
|
-strand => $self->{'current_strand'}, |
2579
|
|
|
|
|
|
|
-score => $obs->[2], |
2580
|
|
|
|
|
|
|
-source => $self->source, |
2581
|
|
|
|
|
|
|
-type => $self->type, |
2582
|
|
|
|
|
|
|
-name => defined $obs->[3] ? $obs->[3] : |
2583
|
|
|
|
|
|
|
join(':', $self->seq_id, $obs->[0], $obs->[1]), |
2584
|
|
|
|
|
|
|
) |
2585
|
|
|
|
|
|
|
} |
2586
|
|
|
|
|
|
|
|
2587
|
|
|
|
|
|
|
sub _next_wiggle { |
2588
|
|
|
|
|
|
|
# this will return one wiggle Feature, two if separate strands are requested |
2589
|
|
|
|
|
|
|
|
2590
|
2
|
|
|
2
|
|
3
|
my $self = shift; |
2591
|
2
|
|
|
|
|
3
|
my $useq = $self->{'useq'}; |
2592
|
|
|
|
|
|
|
|
2593
|
|
|
|
|
|
|
# determine which slices to retrieve |
2594
|
2
|
|
|
|
|
4
|
my $slices = shift @{ $self->{slices} }; |
|
2
|
|
|
|
|
5
|
|
2595
|
2
|
100
|
|
|
|
7
|
return unless $slices; |
2596
|
|
|
|
|
|
|
|
2597
|
|
|
|
|
|
|
# more information |
2598
|
1
|
|
|
|
|
2
|
my $start = $self->start; |
2599
|
1
|
|
|
|
|
7
|
my $stop = $self->end; |
2600
|
1
|
|
|
|
|
7
|
my $step = $self->{step}; |
2601
|
|
|
|
|
|
|
|
2602
|
|
|
|
|
|
|
# check whether we are working with binned wiggle or not |
2603
|
1
|
|
|
|
|
1
|
my @scores; |
2604
|
1
|
50
|
33
|
|
|
5
|
if ($self->{bin} and $step > 1) { |
2605
|
|
|
|
|
|
|
# we will be collecting the mean score value in bins |
2606
|
|
|
|
|
|
|
|
2607
|
|
|
|
|
|
|
# collect the scores for each bin |
2608
|
1
|
|
|
|
|
3
|
for (my $begin = $start; $begin < $stop; $begin += $step) { |
2609
|
|
|
|
|
|
|
|
2610
|
|
|
|
|
|
|
# round off coordinates to integers |
2611
|
|
|
|
|
|
|
# beginning point and step may not be integers |
2612
|
1000
|
|
|
|
|
1358
|
my $s = int($begin + 0.5); |
2613
|
1000
|
|
|
|
|
1136
|
my $e = int($s + $step - 0.5); # start + step - 1 + 0.5 |
2614
|
|
|
|
|
|
|
|
2615
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2616
|
1000
|
50
|
|
|
|
1180
|
if (scalar @$slices > 1) { |
2617
|
|
|
|
|
|
|
# more than one slice, identify which subset of slices to collect from |
2618
|
|
|
|
|
|
|
# may or may not be all of the current slices |
2619
|
1000
|
|
|
|
|
930
|
my @sub_slices; |
2620
|
1000
|
|
|
|
|
1099
|
foreach my $slice (@$slices) { |
2621
|
2000
|
100
|
|
|
|
2956
|
next if $s > $useq->slice_end($slice); |
2622
|
1061
|
100
|
|
|
|
1355
|
next if $e < $useq->slice_start($slice); |
2623
|
1001
|
|
|
|
|
1445
|
push @sub_slices, $slice; |
2624
|
|
|
|
|
|
|
} |
2625
|
1000
|
|
|
|
|
1435
|
push @scores, $useq->_mean_score($s, $e, \@sub_slices); |
2626
|
|
|
|
|
|
|
} |
2627
|
|
|
|
|
|
|
else { |
2628
|
0
|
|
|
|
|
0
|
push @scores, $useq->_mean_score($s, $e, $slices); |
2629
|
|
|
|
|
|
|
} |
2630
|
|
|
|
|
|
|
} |
2631
|
|
|
|
|
|
|
} |
2632
|
|
|
|
|
|
|
else { |
2633
|
|
|
|
|
|
|
# otherwise we collect in one step and associate scores at bp resolution |
2634
|
|
|
|
|
|
|
# collect the scores from each of the requested slices and |
2635
|
|
|
|
|
|
|
# assemble them into a hash of values |
2636
|
|
|
|
|
|
|
|
2637
|
|
|
|
|
|
|
# correlate scores with position |
2638
|
0
|
|
|
|
|
0
|
my %pos2score; |
2639
|
0
|
|
|
|
|
0
|
foreach my $slice (@$slices) { |
2640
|
|
|
|
|
|
|
|
2641
|
|
|
|
|
|
|
# load and unpack the data |
2642
|
0
|
|
|
|
|
0
|
$useq->_load_slice($slice); |
2643
|
|
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
# fetch the overlapping observations |
2645
|
|
|
|
|
|
|
# each observation is [start, stop, score] |
2646
|
0
|
|
|
|
|
0
|
my $result = $useq->{'buffer'}{$slice}->fetch($start - 1, $stop); |
2647
|
0
|
|
|
|
|
0
|
foreach my $r (@$result) { |
2648
|
0
|
|
|
|
|
0
|
foreach my $p ( $r->[0] + 1 .. $r->[1] ) { |
2649
|
0
|
|
|
|
|
0
|
$pos2score{$p} = $r->[2]; |
2650
|
|
|
|
|
|
|
} |
2651
|
|
|
|
|
|
|
} |
2652
|
|
|
|
|
|
|
} |
2653
|
|
|
|
|
|
|
|
2654
|
|
|
|
|
|
|
# convert positioned scores into an array |
2655
|
0
|
|
|
|
|
0
|
foreach (my $s = $start; $s <= $stop; $s++) { |
2656
|
0
|
0
|
|
|
|
0
|
push @scores, exists $pos2score{$s} ? $pos2score{$s} : undef; |
2657
|
|
|
|
|
|
|
# for Bio::Graphics it is better to store undef than 0 |
2658
|
|
|
|
|
|
|
# which can do wonky things with graphs |
2659
|
|
|
|
|
|
|
} |
2660
|
|
|
|
|
|
|
} |
2661
|
|
|
|
|
|
|
|
2662
|
|
|
|
|
|
|
# generate the wiggle object |
2663
|
1
|
|
50
|
|
|
6
|
my $strand = $useq->slice_strand( $slices->[0] ) || 0; |
2664
|
1
|
50
|
|
|
|
11
|
return Bio::DB::USeq::Wiggle->new( |
|
|
50
|
|
|
|
|
|
2665
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2666
|
|
|
|
|
|
|
-start => $start, |
2667
|
|
|
|
|
|
|
-end => $stop, |
2668
|
|
|
|
|
|
|
-strand => $strand, |
2669
|
|
|
|
|
|
|
-type => $self->type, |
2670
|
|
|
|
|
|
|
-source => $self->source, |
2671
|
|
|
|
|
|
|
-attributes => { 'coverage' => [ \@scores ] }, |
2672
|
|
|
|
|
|
|
-name => $strand == 1 ? 'Forward' : |
2673
|
|
|
|
|
|
|
$strand == -1 ? 'Reverse' : q(), |
2674
|
|
|
|
|
|
|
-useq => $useq, |
2675
|
|
|
|
|
|
|
); |
2676
|
|
|
|
|
|
|
} |
2677
|
|
|
|
|
|
|
|
2678
|
|
|
|
|
|
|
sub _next_summary { |
2679
|
|
|
|
|
|
|
# this will return one summary Feature, two if separate strands are requested |
2680
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
2681
|
|
|
|
|
|
|
|
2682
|
|
|
|
|
|
|
# determine which slices to retrieve |
2683
|
2
|
|
|
|
|
2
|
my $slices = shift @{ $self->{slices} }; |
|
2
|
|
|
|
|
4
|
|
2684
|
2
|
100
|
|
|
|
6
|
return unless $slices; |
2685
|
|
|
|
|
|
|
|
2686
|
|
|
|
|
|
|
# all of the real statistical work is done elsewhere |
2687
|
|
|
|
|
|
|
# just return the summary object |
2688
|
1
|
|
50
|
|
|
5
|
my $strand = $self->{useq}->slice_strand( $slices->[0] ) || 0; |
2689
|
|
|
|
|
|
|
return Bio::DB::USeq::Summary->new( |
2690
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2691
|
|
|
|
|
|
|
-start => $self->start, |
2692
|
|
|
|
|
|
|
-end => $self->end, |
2693
|
|
|
|
|
|
|
-strand => $strand, |
2694
|
|
|
|
|
|
|
-type => $self->type, |
2695
|
|
|
|
|
|
|
-source => $self->source, |
2696
|
|
|
|
|
|
|
-name => $strand == 1 ? 'Forward' : |
2697
|
|
|
|
|
|
|
$strand == -1 ? 'Reverse' : q(), |
2698
|
|
|
|
|
|
|
-useq => $self->{'useq'}, |
2699
|
|
|
|
|
|
|
-slices => $slices, |
2700
|
|
|
|
|
|
|
-bin => $self->{bin}, |
2701
|
|
|
|
|
|
|
-step => $self->{step}, |
2702
|
1
|
50
|
|
|
|
6
|
); |
|
|
50
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
} |
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
|
2706
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
package Bio::DB::USeq::Wiggle; |
2708
|
1
|
|
|
1
|
|
9
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
491
|
|
2709
|
|
|
|
|
|
|
# Wiggle scores are stored in the coverage attribute for backwards |
2710
|
|
|
|
|
|
|
# compatibility with Bio::Graphics. |
2711
|
|
|
|
|
|
|
|
2712
|
|
|
|
|
|
|
sub new { |
2713
|
1
|
|
|
1
|
|
30
|
my $class = shift; |
2714
|
1
|
|
|
|
|
18
|
my $wig = $class->SUPER::new(@_); |
2715
|
1
|
|
|
|
|
126
|
my %args = @_; |
2716
|
1
|
50
|
|
|
|
5
|
my $useq = $args{'-useq'} or |
2717
|
|
|
|
|
|
|
die "Bio::DB::USeq::Wiggle cannot be created without a -useq argument"; |
2718
|
1
|
|
|
|
|
3
|
$wig->{useq} = $useq; |
2719
|
1
|
|
|
|
|
18
|
return $wig; |
2720
|
|
|
|
|
|
|
} |
2721
|
|
|
|
|
|
|
|
2722
|
|
|
|
|
|
|
sub coverage { |
2723
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
2724
|
1
|
|
|
|
|
10
|
my ($coverage) = $self->get_tag_values('coverage'); |
2725
|
1
|
50
|
|
|
|
85
|
return wantarray ? @$coverage : $coverage; |
2726
|
|
|
|
|
|
|
} |
2727
|
|
|
|
|
|
|
|
2728
|
|
|
|
|
|
|
sub wiggle { |
2729
|
1
|
|
|
1
|
|
116
|
return shift->coverage; |
2730
|
|
|
|
|
|
|
} |
2731
|
|
|
|
|
|
|
|
2732
|
|
|
|
|
|
|
# Borrowed from Bio::SeqFeature::Coverage from Bio::DB::Sam |
2733
|
|
|
|
|
|
|
sub gff3_string { |
2734
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2735
|
0
|
|
|
|
|
0
|
my $gff3 = $self->SUPER::gff3_string(@_); |
2736
|
0
|
|
|
|
|
0
|
my $coverage = $self->escape(join(',',$self->coverage)); |
2737
|
0
|
|
|
|
|
0
|
$gff3 =~ s/coverage=[^;]+/coverage=$coverage/g; |
2738
|
0
|
|
|
|
|
0
|
return $gff3; |
2739
|
|
|
|
|
|
|
} |
2740
|
|
|
|
|
|
|
|
2741
|
|
|
|
|
|
|
sub statistical_summary { |
2742
|
|
|
|
|
|
|
# this is just for the wiggle scores, not original data |
2743
|
|
|
|
|
|
|
# This is a fake statistical_summary to fool Bio::Graphics into |
2744
|
|
|
|
|
|
|
# calculating chromosome or global statistics like a BigWig adaptor. |
2745
|
|
|
|
|
|
|
# A real statistical_summary call would provide the number of bins, |
2746
|
|
|
|
|
|
|
# so return null if bins is present. |
2747
|
|
|
|
|
|
|
# We could calculate a real statistical summary, but that would be an |
2748
|
|
|
|
|
|
|
# expensive circuitous route after we just collected wiggle scores. |
2749
|
|
|
|
|
|
|
# Better to request the correct feature type in the first place. |
2750
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2751
|
0
|
|
|
|
|
0
|
my $bins = shift; |
2752
|
0
|
0
|
0
|
|
|
0
|
return if $bins && $bins > 1; |
2753
|
|
|
|
|
|
|
|
2754
|
|
|
|
|
|
|
# initialize the statistical scores |
2755
|
0
|
|
|
|
|
0
|
my $count = 0; |
2756
|
0
|
|
|
|
|
0
|
my $sum; |
2757
|
|
|
|
|
|
|
my $sum_squares; |
2758
|
0
|
|
|
|
|
0
|
my $min; |
2759
|
0
|
|
|
|
|
0
|
my $max; |
2760
|
|
|
|
|
|
|
|
2761
|
|
|
|
|
|
|
# generate a statistical summary of just the wiggle scores |
2762
|
0
|
|
|
|
|
0
|
foreach my $s ($self->coverage) { |
2763
|
0
|
|
|
|
|
0
|
$count++; |
2764
|
0
|
0
|
|
|
|
0
|
next unless defined $s; |
2765
|
0
|
|
|
|
|
0
|
$sum += $s; |
2766
|
0
|
|
|
|
|
0
|
$sum_squares += $s * $s; |
2767
|
0
|
0
|
0
|
|
|
0
|
$min = $s if (!defined $min or $s < $min); |
2768
|
0
|
0
|
0
|
|
|
0
|
$max = $s if (!defined $max or $s > $max); |
2769
|
|
|
|
|
|
|
} |
2770
|
|
|
|
|
|
|
|
2771
|
|
|
|
|
|
|
# return the statistical hash |
2772
|
0
|
|
0
|
|
|
0
|
my %stat = ( |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
2773
|
|
|
|
|
|
|
'validCount' => $count, |
2774
|
|
|
|
|
|
|
'sumData' => $sum || 0, |
2775
|
|
|
|
|
|
|
'sumSquares' => $sum_squares || 0, |
2776
|
|
|
|
|
|
|
'minVal' => $min || 0, |
2777
|
|
|
|
|
|
|
'maxVal' => $max || 0, |
2778
|
|
|
|
|
|
|
); |
2779
|
0
|
|
|
|
|
0
|
return \%stat; |
2780
|
|
|
|
|
|
|
} |
2781
|
|
|
|
|
|
|
|
2782
|
|
|
|
|
|
|
sub get_seq_stream { |
2783
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2784
|
|
|
|
|
|
|
return Bio::DB::USeq::Iterator->new( |
2785
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2786
|
|
|
|
|
|
|
-start => $self->start, |
2787
|
|
|
|
|
|
|
-end => $self->end, |
2788
|
|
|
|
|
|
|
-strand => $self->strand, |
2789
|
|
|
|
|
|
|
-type => 'region', |
2790
|
|
|
|
|
|
|
-source => $self->source, |
2791
|
|
|
|
|
|
|
-useq => $self->{useq}, |
2792
|
0
|
|
|
|
|
0
|
); |
2793
|
|
|
|
|
|
|
} |
2794
|
|
|
|
|
|
|
|
2795
|
|
|
|
|
|
|
|
2796
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
|
2798
|
|
|
|
|
|
|
package Bio::DB::USeq::Summary; |
2799
|
1
|
|
|
1
|
|
6
|
use base 'Bio::DB::USeq::Feature'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
618
|
|
2800
|
|
|
|
|
|
|
|
2801
|
|
|
|
|
|
|
sub new { |
2802
|
1
|
|
|
1
|
|
22
|
my $class = shift; |
2803
|
1
|
|
|
|
|
7
|
my $summary = $class->SUPER::new(@_); |
2804
|
1
|
|
|
|
|
71
|
my %args = @_; |
2805
|
1
|
50
|
|
|
|
4
|
my $useq = $args{'-useq'} or |
2806
|
|
|
|
|
|
|
die "Bio::DB::USeq::Summary cannot be created without a -useq argument"; |
2807
|
1
|
|
|
|
|
2
|
$summary->{useq} = $useq; |
2808
|
1
|
50
|
|
|
|
4
|
$summary->{slices} = $args{-slices} if exists $args{-slices}; |
2809
|
1
|
50
|
|
|
|
4
|
$summary->{bin} = $args{-bin} if exists $args{-bin}; |
2810
|
1
|
50
|
|
|
|
2
|
$summary->{step} = $args{-step} if exists $args{-step}; |
2811
|
1
|
|
|
|
|
6
|
return $summary; |
2812
|
|
|
|
|
|
|
} |
2813
|
|
|
|
|
|
|
|
2814
|
|
|
|
|
|
|
sub statistical_summary { |
2815
|
1
|
|
|
1
|
|
107
|
my $self = shift; |
2816
|
1
|
|
|
|
|
2
|
my $useq = $self->{useq}; |
2817
|
|
|
|
|
|
|
|
2818
|
|
|
|
|
|
|
# get the number of bins to calculate the statistical summaries |
2819
|
1
|
|
|
|
|
2
|
my $bin = shift; |
2820
|
1
|
50
|
33
|
|
|
5
|
$bin ||= $self->{bin} if exists $self->{bin}; |
2821
|
1
|
|
50
|
|
|
3
|
$bin ||= 1; |
2822
|
1
|
50
|
|
|
|
4
|
my $step = $self->{step} if exists $self->{step}; |
2823
|
1
|
|
33
|
|
|
11
|
$step ||= $self->length / $bin; |
2824
|
|
|
|
|
|
|
|
2825
|
|
|
|
|
|
|
# get the slices |
2826
|
1
|
50
|
|
|
|
24
|
my $slices = $self->{slices} if exists $self->{slices}; |
2827
|
1
|
50
|
|
|
|
3
|
unless ($slices) { |
2828
|
|
|
|
|
|
|
# this should already be established, but just in case |
2829
|
0
|
|
|
|
|
0
|
my @a = $useq->_translate_coordinates_to_slices($self->seq_id, |
2830
|
|
|
|
|
|
|
$self->start, $self->end, $self->strand); |
2831
|
0
|
|
|
|
|
0
|
$useq->_clear_buffer(\@a); |
2832
|
0
|
|
|
|
|
0
|
$slices = \@a; |
2833
|
|
|
|
|
|
|
} |
2834
|
|
|
|
|
|
|
|
2835
|
|
|
|
|
|
|
# collect the statistical summaries for each bin |
2836
|
1
|
|
|
|
|
2
|
my @summaries; |
2837
|
1
|
|
|
|
|
2
|
for (my $begin = $self->start; $begin < $self->end; $begin += $step) { |
2838
|
|
|
|
|
|
|
|
2839
|
|
|
|
|
|
|
# round off coordinates to integers |
2840
|
|
|
|
|
|
|
# beginning point and step may not be integers |
2841
|
10
|
|
|
|
|
93
|
my $s = int($begin + 0.5); |
2842
|
10
|
|
|
|
|
309
|
my $e = int($s + $step - 0.5); # start + step - 1 + 0.5 |
2843
|
|
|
|
|
|
|
|
2844
|
|
|
|
|
|
|
# collect the scores from each of the requested slices |
2845
|
10
|
50
|
|
|
|
20
|
if (scalar @$slices > 1) { |
2846
|
|
|
|
|
|
|
# more than one slice, identify which subset of slices to collect from |
2847
|
|
|
|
|
|
|
# may or may not be all of the current slices |
2848
|
10
|
|
|
|
|
11
|
my @sub_slices; |
2849
|
10
|
|
|
|
|
17
|
foreach my $slice (@$slices) { |
2850
|
20
|
50
|
|
|
|
29
|
next if $s > $useq->slice_end($slice); |
2851
|
20
|
100
|
|
|
|
30
|
next if $e < $useq->slice_start($slice); |
2852
|
11
|
|
|
|
|
19
|
push @sub_slices, $slice; |
2853
|
|
|
|
|
|
|
} |
2854
|
10
|
|
|
|
|
19
|
push @summaries, $useq->_stat_summary($s, $e, \@sub_slices); |
2855
|
|
|
|
|
|
|
} |
2856
|
|
|
|
|
|
|
else { |
2857
|
0
|
|
|
|
|
0
|
push @summaries, $useq->_stat_summary($s, $e, $slices); |
2858
|
|
|
|
|
|
|
} |
2859
|
|
|
|
|
|
|
} |
2860
|
|
|
|
|
|
|
|
2861
|
|
|
|
|
|
|
# return the reference to the statistical summaries |
2862
|
1
|
|
|
|
|
19
|
return \@summaries; |
2863
|
|
|
|
|
|
|
} |
2864
|
|
|
|
|
|
|
|
2865
|
|
|
|
|
|
|
sub score { |
2866
|
0
|
|
|
0
|
|
|
my $self = shift; |
2867
|
0
|
|
|
|
|
|
my $a = $self->statistical_summary(1); |
2868
|
0
|
|
|
|
|
|
return $a->[0]; |
2869
|
|
|
|
|
|
|
} |
2870
|
|
|
|
|
|
|
|
2871
|
|
|
|
|
|
|
sub gff3_string { |
2872
|
|
|
|
|
|
|
# this is going to be a little convoluted, since what we |
2873
|
|
|
|
|
|
|
# really want here is coverage, which is easier to calculate with means, |
2874
|
|
|
|
|
|
|
# rather than doing statistical summaries and calculate means from those |
2875
|
0
|
|
|
0
|
|
|
my $self = shift; |
2876
|
|
|
|
|
|
|
my ($wig) = $self->{useq}->features( |
2877
|
|
|
|
|
|
|
# this should only return one feature because we have specific strand |
2878
|
0
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2879
|
|
|
|
|
|
|
-start => $self->start, |
2880
|
|
|
|
|
|
|
-end => $self->end, |
2881
|
|
|
|
|
|
|
-strand => $self->strand, |
2882
|
|
|
|
|
|
|
-type => 'coverage:1000', |
2883
|
|
|
|
|
|
|
); |
2884
|
0
|
|
|
|
|
|
return $wig->gff3_string; |
2885
|
|
|
|
|
|
|
} |
2886
|
|
|
|
|
|
|
|
2887
|
|
|
|
|
|
|
sub get_seq_stream { |
2888
|
0
|
|
|
0
|
|
|
my $self = shift; |
2889
|
|
|
|
|
|
|
return Bio::DB::USeq::Iterator->new( |
2890
|
|
|
|
|
|
|
-seq_id => $self->seq_id, |
2891
|
|
|
|
|
|
|
-start => $self->start, |
2892
|
|
|
|
|
|
|
-end => $self->end, |
2893
|
|
|
|
|
|
|
-strand => $self->strand, |
2894
|
|
|
|
|
|
|
-type => 'region', |
2895
|
|
|
|
|
|
|
-source => $self->source, |
2896
|
|
|
|
|
|
|
-useq => $self->{useq}, |
2897
|
0
|
|
|
|
|
|
); |
2898
|
|
|
|
|
|
|
} |
2899
|
|
|
|
|
|
|
|
2900
|
|
|
|
|
|
|
=head1 AUTHOR |
2901
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
Timothy J. Parnell, PhD |
2903
|
|
|
|
|
|
|
Dept of Oncological Sciences |
2904
|
|
|
|
|
|
|
Huntsman Cancer Institute |
2905
|
|
|
|
|
|
|
University of Utah |
2906
|
|
|
|
|
|
|
Salt Lake City, UT, 84112 |
2907
|
|
|
|
|
|
|
|
2908
|
|
|
|
|
|
|
This package is free software; you can redistribute it and/or modify |
2909
|
|
|
|
|
|
|
it under the terms of the Artistic License 2.0. |