File Coverage

blib/lib/Bio/DB/USeq.pm

Criterion	Covered	Total	%
statement	479	949	50.4
branch	158	338	46.7
condition	49	156	31.4
subroutine	62	111	55.8
pod	37	39	94.8
total	785	1593	49.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::DB::USeq;
2
3							our $VERSION = '0.26';
4
5							=head1 NAME
6
7							Bio::DB::USeq - Read USeq archive database files
8
9							=head1 SYNOPSIS
10
11							use Bio::DB::USeq;
12							my $useq = Bio::DB::USeq->new('file.useq') or
13							die "unable to open file.useq!\n";
14
15							# sequence IDs
16							my @seq_ids = $useq->seq_ids;
17							my $length = $useq->length($chr); # approximate, not exact
18
19							### Retrieving features
20							# all features or observations across chromosome I
21							my @features = $useq->features(
22							-seq_id => 'chrI',
23							-type => 'region',
24							);
25
26							# same thing using a simple form
27							# use an array of (chromosome, start, stop, strand)
28							my @features = $useq->features('chrI');
29
30							# same thing with a memory efficient iterator
31							my $iterator = $useq->get_seq_stream(
32							-seq_id => 'chrI',
33							);
34							while (my $f = $iterator->next_seq) {
35							# each feature $f supports most SeqFeatureI methods
36							}
37
38
39							### Retrieving simple scores
40							my @scores = $useq->scores(
41							-seq_id => 'chrI',
42							-start => 1000,
43							-end => 2000
44							);
45
46
47							### Same methods as above after defining an interval first
48							my $segment = $useq->segment(
49							-seq_id => 'chrI',
50							-start => 1000,
51							-end => 2000,
52							);
53							my @scores = $segment->scores;
54							my @features = $segment->features;
55							my $iterator = $segment->get_seq_stream;
56
57
58							### Efficient retrieval of positioned scores in 100 bins
59							# compatible with Bio::Graphics
60							my ($wig) = $useq->features(
61							# assuming unstranded data here, otherwise two wiggle objects
62							# would be returned, one for each strand
63							-seq_id => 'chrI',
64							-start => 1000,
65							-end => 2000,
66							-type => 'wiggle:100',
67							);
68							my @bins = $wig->wiggle;
69							my @bins = $wig->coverage; # same thing
70							my ($bins) = $wig->get_tag_values('coverage'); # same thing
71
72
73							### Statistical summaries of intervals
74							# compatible with Bio::Graphics
75							my ($summary) = $useq->features(
76							# assuming unstranded data here, otherwise two summaries
77							# would be returned, one for each strand
78							-seq_id => 'chrI',
79							-start => 1000,
80							-end => 2000,
81							-type => 'summary',
82							);
83							my $stats = $summary->statistical_summary(10);
84							foreach (@$stats) {
85							my $max = $_->{maxVal};
86							my $mean = Bio::DB::USeq::binMean($_);
87							}
88
89
90							### Stranded data using an iterator
91							# could be used with either wiggle or summary features
92							my $stream = $useq->get_seq_stream(
93							-seq_id => 'chrI',
94							-start => 1000,
95							-end => 2000,
96							-type => 'wiggle:100',
97							);
98							my ($forward, $reverse);
99							if ($useq->stranded) {
100							$forward = $stream->next_seq;
101							$reverse = $stream->next_seq;
102							}
103							else {
104							$forward = $stream->next_seq;
105							}
106
107
108							=head1 DESCRIPTION
109
110							Bio::DB::USeq is a B style adaptor for reading USeq files. USeq files
111							are compressed, indexed data files supporting modern bioinformatic datasets,
112							including genomic points, scores, and intervals. More information about the
113							USeq software package can be found at L,
114							including a description of the USeq data
115							L.
116
117							USeq files are typically half the size of corresponding bigBed and bigWig files,
118							due to a compact internal format and lack of internal zoom data. This adaptor,
119							however, can still return statistics across different zoom levels in the same
120							manner as big files, albeit at a cost of calculating these in realtime.
121
122							=head2 Generating useq files
123
124							USeq files may be generated using tools from the USeq package, available at
125							L. They may be generated from
126							native Bar files, text Wig files, text Bed files, and UCSC bigWig and bigBed file
127							formats.
128
129							=head2 Compatibility
130
131							The adaptor follows most conventions of other B-style Bio::DB
132							adaptors. Observations or features in the useq file archive are
133							returned as SeqFeatureI compatible objects.
134
135							Coordinates consumed and returned by the adaptor are 1-based, consistent
136							with B convention. This is not true of the useq file itself, which
137							uses the interbase coordinate system.
138
139							Unlike wig and bigWig files, useq file archives support stranded data,
140							which can make data collection much simpler for complex experiments.
141
142							See below for GBrowse compatibility.
143
144							=head2 Limitations
145
146							This adaptor is read only. USeq files, in general, are not modified
147							or written with data. The exceptions are chromosome or global statistics
148							are written to the F file inside the zip archive to
149							cache for future queries.
150
151							No support for genomic sequence is included. Users who need access to
152							genomic sequence should seek an alternative B adaptor, such as
153							L.
154
155							Useq files do not have a native concept of type, primary_tag, or source
156							attributes, as expected with GFF-based database adaptors. The features
157							method does support special types (see below).
158
159							=head2 Requirements
160
161							The adaptor is a Perl-only implementation. It only requires the
162							L module for opening and reading the useq file archive.
163							B is required for working with SeqFeatures objects generated
164							from useq file observations.
165
166							=head1 METHODS
167
168							=head2 Initializing the Bio::DB::USeq object
169
170							These are class methods for creating and working with the Bio::DB::USeq
171							object itself.
172
173							=over 4
174
175							=item new
176
177							=item new($file)
178
179							=item new(-useq => $file)
180
181							This will create a new Bio::DB::USeq object. Optionally pass the path
182							to a useq file archive to open immediately. There is not much to do
183							unless you open a file.
184
185							Named arguments may be used to specify the file. Either -useq or -file
186							may be used.
187
188							=item open($file)
189
190							Open a useq file archive. Useq files typically use the F<.useq> file
191							extension. Returns true if successful.
192
193							DO NOT open a subsequent useq file archive when one has already been
194							opened. Please create a new object to open a new file.
195
196							=item clone
197
198							Force the object to re-open the useq archive file under a new file
199							handle to make it clone safe. Do this in the child process before
200							collecting data.
201
202							=item zip
203
204							Return the L object representing the useq file archive.
205							Generally not recommended unless you know what you are doing.
206
207							=back
208
209							=head2 General information about the useq file
210
211							These are class methods for obtaining general information or
212							metadata attributes regarding the contents of the file.
213
214							=over 4
215
216							=item stranded
217
218							Return true or false (1 or 0) indicating whether the contents of the
219							useq file archive are recorded in a stranded fashion.
220
221							=item attributes
222
223							Return an array of available attribute keys that were recorded in the
224							useq file F member. These are key = value pairs
225							representing metadata for the useq file.
226
227							=item attribute($key)
228
229							Return the metadata attribute value for the specified key. These
230							are recorded in the useq file F member. Returns
231							undefined if the key does not exist.
232
233							=item type
234
235							Return the useq file metadata C value.
236
237							=item genome
238
239							Return the useq file metadata C value.
240
241							=item version
242
243							Return the useq file metadata C value.
244
245							=back
246
247							=head2 Chromosome Information
248
249							These are methods to obtain information about the chromosomes or reference
250							sequences represented in the file archive.
251
252							B that generating score statistics across one or more chromosomes may
253							be computationally expensive. Therefore, chromosome statistics, once
254							calculated, are cached in the useq file metadata for future reference.
255							This necessitates writing to the useq zip archive. This is currently the
256							only supported method for modifying the useq zip archive.
257
258							=over 4
259
260							=item seq_ids
261
262							Return an array of the chromosome or sequence identifiers represented
263							in the useq file archive. The names are sorted ASCIIbetically before
264							they are returned.
265
266							=item length($seq_id)
267
268							Return the length of the provided chromosome or sequence identifier.
269							Note that this may not be the actual length in the reference genome, but
270							rather the last recorded position of an observation for that chromosome.
271							Hence, it should be used only as an approximation.
272
273							=item chr_mean($seq_id)
274
275							Return the mean score value across the entire chromosome.
276
277							=item chr_stdev($seq_id)
278
279							Return the score standard deviation across the entire chromosome.
280
281							=item chr_stats($seq_id)
282
283							Return a statistical summary across the entire chromosome. This
284							is an anonymous hash with five keys: validCount, sumData,
285							sumSquares, minVal, and maxVal. These are equivalent to the
286							statistical summaries generated by the Bio::DB::BigWig adaptor.
287
288							=item global_mean
289
290							Return the mean score value across all chromosomes.
291
292							=item global_stdev
293
294							Return the mean score value across all chromosomes.
295
296							=item global_stats
297
298							Return a statistical summary across all chromosomes.
299
300							=back
301
302							=head2 Data accession
303
304							These are the primary methods for working with data contained in
305							useq file archive. These should be familiar to most B users.
306
307							=over 4
308
309							=item features
310
311							Returns an array or array reference of SeqFeatureI compatible
312							objects overlapping a given genomic interval.
313
314							Coordinates of the interrogated regions must be supplied. At a
315							minimum, the seq_id must be supplied. A start of 1 and an end
316							corresponding to the length of the seq_id is used if not directly
317							provided. Coordinates may be specified in two different manners,
318							either as a list of (seq_id, start, end, strand) or as one or
319							more keyed values.
320
321							my @features = $useq->features($seq_id, $start, $end);
322
323							@features = $useq->features(
324							-seq_id => $seq_id,
325							-start => $start,
326							-end => $end,
327							);
328
329							If the -iterator argument is supplied with a true value, then an
330							iterator object is returned. See get_seq_stream() for details.
331
332							Bio::DB::USeq supports four different feature types. Feature
333							types may be specified using the -type argument.
334
335							=over 4
336
337							=item region
338
339							=item interval
340
341							=item observation
342
343							The default feature type if the type argument is not specified.
344							These are SeqFeatureI compatible objects representing observations
345							in the useq file archive. These are compatible with the iterator.
346							SeqFeature observations are returned in genomic order.
347
348							=item chromosome
349
350							Returns SeqFeatureI compatible objects representing the
351							reference sequences (chromosomes) listed in the useq file
352							archive. These are not compatibile with the iterator.
353
354							=item wiggle
355
356							=item wiggle:$bins
357
358							=item coverage
359
360							=item coverage:$bins
361
362							Returns an array of SeqFeatureI compatible objects for each
363							strand requested. If the useq file contains stranded data,
364							and no strand is requested, then two objects will be
365							returned representing each strand.
366
367							Each object contains an array representing scores across
368							the requested coordinates. This object is designed to be
369							backwards compatible with coverage features from the
370							L adaptor for use with L and GBrowse.
371							Note that only scores are returned, not a true depth coverage
372							in the sense of the L coverage types.
373
374							By default, the wiggle or coverage array is provided at 1
375							bp resolution. To improve efficiency with large regions,
376							the wiggle array may be limited by using a bin option,
377							where the interrogated interval is divided into the number
378							of bins requested.
379
380							To retrieve the scores, call the wiggle() or coverage() method.
381
382							For example, to request wiggle scores in 100 equal bins across
383							the interval, see the following example. The wiggle and
384							coverage types are synonymous.
385
386							my ($wiggle) = $useq->features(
387							-seq_id => $chromosome,
388							-start => $start,
389							-end => $end,
390							-type => 'wiggle:100',
391							);
392							my @scores = $wiggle->wiggle;
393							@scores = $wiggle->coverage;
394
395							Wiggle objects may also be obtained with a get_seq_stream()
396							iterator objects.
397
398							=item summary
399
400							=item summary:$bins
401
402							Returns an array of SeqFeatureI compatibile Summary objects
403							for each strand requested. If the useq file contains stranded
404							data, and no strand is requested, then two objects will be
405							returned representing each strand.
406
407							Each Summary object can then be used to call statistical
408							summaries for one or more bins across the interval. Each
409							statistical summary is an anonymous hash with five keys:
410							validCount, sumData, sumSquares, minVal, and maxVal. From
411							these values, a mean and standard deviation may also be
412							calculated.
413
414							my ($summary) = $useq->features(
415							-seq_id => $chromosome,
416							-start => $start,
417							-end => $end,
418							-type => 'summary',
419							);
420							my @stats = $summary->statistical_summary(100);
421							foreach my $stat (@stats) {
422							my $count = $stat->{validCount};
423							my $sum = $stat->{sumData};
424							my $mean = $sum / $count;
425							}
426
427							Statistical summaries are equivalent to those generated by the
428							L adaptor and may be used interchangeably. They
429							are compatible with the L modules.
430
431							Summary objects may also be obtained with a get_seq_stream()
432							iterator object.
433
434							=back
435
436							=item get_seq_stream
437
438							This is a memory efficient data accessor. An iterator object is
439							returned for an interval specified using coordinate values in the
440							same manner as features(). Call the method next_seq() on the
441							iterator object to retrieve the observation SeqFeature objects
442							one at a time. The iterator is compatible with region, wiggle,
443							and summary feature types.
444
445							# establish the iterator object
446							my $iterator = $useq->get_seq_stream(
447							-seq_id => $seq_id,
448							-start => $start,
449							-end => $end,
450							-type => 'region',
451							);
452
453							# retrieve the features one at a time
454							while (my $f = $iterator->next_seq) {
455							# each feature $f is either a
456							# Bio::DB::USeq::Feature,
457							# a Bio::DB::USeq::Wiggle, or
458							# a Bio::DB::USeq::Summary object
459							}
460
461							=item scores
462
463							This is a simplified data accessor that only returns the score
464							values overlapping an interrogated interval, rather than
465							assembling each observation into a SeqFeature object. The scores
466							are not associated with genomic coordinates, and are not guaranteed
467							to be returned in original genomic order.
468
469							Provide the interval coordinates in the same manner as the
470							features() method. Stranded data collection is supported.
471
472							my @scores = $useq->scores(
473							-seq_id => $seq_id,
474							-start => $start,
475							-end => $end,
476							);
477
478							=item observations
479
480							This is a low-level data accessor, similar to features() but returning
481							an array of references to the original USeq observations. Each USeq
482							observation is an anonymous array reference of 2-4 elements: start, stop,
483							score, and text, depending on the stored data type. All coordinates are in
484							0-based coordinates, unlike high level accessors. Note that while strand is
485							supported, it is not reported for each observation. If observations exist on
486							both strands, then each strand should be searched separately. Observations
487							are not guaranteed to be returned in genomic order.
488
489							my @observations = $useq->observations(
490							-seq_id => $seq_id,
491							-start => $start,
492							-end => $end,
493							);
494							foreach my $obs (@observations) {
495							my $start = $obs->[0] + 1; # convert to 1-based coordinate
496							my $stop = $obs->[1];
497							my $score = $obs->[2]; # may not be present
498							my $text = $obs->[3]; # may not be present
499							}
500
501							=item segment
502
503							This returns a L object, which is a SeqFeatureI
504							compatible segment object corresponding to the specified coordinates.
505							From this object, one can call the features(), scores(), or
506							get_seq_stream() methods directly. Keyed options or location
507							information need not be provided. Stranded segments are
508							supported.
509
510							my $segment = $useq->segment(
511							-seq_id => $seq_id,
512							-start => $start,
513							-end => $end,
514							);
515							my @scores = $segment->scores;
516							my @features = $segment->features('wiggle:100');
517							my $iterator = $segment->get_seq_stream('region');
518
519							=item get_features_by_location
520
521							Convenience method for returning features restricted by location.
522
523							=item get_feature_by_id
524
525							=item get_feature_by_name
526
527							Compatibility methods for returning a specific feature or
528							observation in the USeq file. Text fields, if present, are not
529							indexed in the USeq file, preventing efficient searching of names.
530							As a workaround, an ID or name comprised of "$seq_id:$start:$end"
531							may be used, although a direct search of coordinates would be
532							more efficient.
533
534							my $feature = $useq->get_feature_by_id("$seq_id:$start:$end");
535
536
537							=back
538
539							=head1 ADDITIONAL CLASSES
540
541							These are additional class object that may be returned by various
542							methods above.
543
544							=head2 Bio::DB::USeq::Feature objects
545
546							These are SeqFeatureI compliant objects returned by the features()
547							or next_seq() methods. They support the following methods.
548
549							seq_id
550							start
551							end
552							strand
553							score
554							type
555							source (returns the useq archive filename)
556							name (chromosome:start..stop)
557							Bio::RangeI methods
558
559							Additionally, chromosome and global statistics are also available
560							from any feature, as well as from Segment, Wiggle, Iterator, and
561							Summary objects. See the corresponding USeq methods for details.
562
563							=head2 Bio::DB::USeq::Segment objects
564
565							This is a SeqFeatureI compliant object representing a genomic segment
566							or interval. These support the following methods.
567
568							=over 4
569
570							=item features
571
572							=item features($type)
573
574							=item get_seq_stream
575
576							=item get_seq_stream($type)
577
578							=item scores
579
580							=item observations
581
582							Direct methods for returning features or scores. Coordinate information
583							need not be provided. See the corresponding Bio::DB::USeq methods for
584							more information.
585
586							=item wiggle
587
588							=item wiggle($bins)
589
590							=item coverage
591
592							=item coverage($bins)
593
594							=item statistical_summary
595
596							=item statistical_summary($bins)
597
598							Convenience methods for returning wiggle (coverage) or summary features
599							over the segment. If desired, the number of bins may be specified. See
600							the features() method for more information.
601
602							=item slices
603
604							Returns an array of splice member names that overlap this segment.
605							See L for more information.
606
607							=back
608
609							=head2 Bio::DB::USeq::Iterator objects
610
611							This is an iterator object for retrieving useq observation SeqFeatureI
612							objects in a memory efficient manner by returning one feature at a time.
613
614							=over 4
615
616							=item next_seq
617
618							=item next_feature
619
620							Returns the next feature present in the interrogated interval. Features
621							are generally returned in ascending coordinate order. Returns undefined
622							when no features are remaining in the interval. Features may include
623							either region or wiggle types, depending on how the iterator object was
624							established. See features() and get_seq_stream() methods for more
625							information.
626
627							=back
628
629							=head2 Bio::DB::USeq::Wiggle objects
630
631							These are SeqFeatureI compliant objects for backwards compatibility with
632							L and GBrowse. They support the wiggle() and coverage()
633							methods, which returns an array of scores over the interrogated region. By
634							default, the array is equal to the length of the region (1 bp resolution),
635							or may be limited to a specified number of bins for efficiency. See the
636							features() method for more information.
637
638							=over 4
639
640							=item wiggle
641
642							=item coverage
643
644							The scores are stored as an array in the coverage attribute. For
645							convenience, the wiggle() and coverage() methods may be used to retrieve
646							the array or array reference of scores.
647
648							=item statistical_summary
649
650							Generate a statistical summary hash for the collected wiggle scores
651							(not the original data). This method is not entirely that useful; best
652							to use the summary feature type in the first place.
653
654							=item chromosome and global statistics
655
656							Chromosome and global statistics, including mean and standard deviation,
657							are available from wiggle objects. See the corresponding USeq methods
658							for details.
659
660							=back
661
662							=head2 Bio::DB::USeq::Summary objects
663
664							These are SeqFeatureI compliant Summary objects, similar to those
665							generated by the L database adaptor. As such, they are
666							compatible with L and GBrowse.
667
668							Summary objects can generate statistical summaries over a specified
669							number of bins (default is 1 bin, or the entire requested region).
670							Each statistical summary is an anonymous hash consisting of five
671							keys: validCount, sumData, sumSquares, minVal, and maxVal. From
672							these values, a mean and standard deviation may be calculated.
673
674							For convenience, three exported functions are available for calculating
675							the mean and standard deviation from a statistical summary hash.
676							See L for more information.
677
678							Use statistical summaries in the following manner.
679
680							my $stats = $summary->statistical_summary(10);
681							my $stat = shift @$stats;
682							my $min = $stat->{minVal};
683							my $max = $stat->{maxVal};
684							my $mean = $stat->{sumData} / $stat->{validCount};
685
686							=over 4
687
688							=item statistical_summary
689
690							=item statistical_summary($bins)
691
692							Generate a statistical summary hash for one or more bins across the
693							interrogated region. Provide the number of bins desired. If a feature
694							type of "summary:$bins" is requested through the features() or
695							get_seq_stream() method, then $bins number of bins will be used.
696							The default number of bins is 1.
697
698							=item score
699
700							Generate a single statistical summary over the entire region.
701
702							=item chromosome and global statistics
703
704							Chromosome and global statistics, including mean and standard deviation,
705							are available from summary objects. See the corresponding USeq methods
706							for details.
707
708							=back
709
710							=head1 EXPORTED FUNCTIONS
711
712							Three subroutine functions are available for export to assist
713							in calculating the mean, variance, and standard deviation from
714							statistical summaries. These functions are identical to those
715							from the L adaptor and may be used interchangeably.
716
717							They are not exported by default; they must explicitly listed.
718
719							use Bio::DB::USeq qw(binMean binStdev);
720							my $stats = $summary->statistical_summary(10);
721							my $stat = shift @$stats;
722							my $mean = binMean($stat);
723							my $stdev = binStdev($stat);
724
725							=over 4
726
727							=item binMean( $stat )
728
729							Calculate the mean from a statistical summary anonymous hash.
730
731							=item binVariance( $stat )
732
733							Calculate the variance from a statistical summary anonymous hash.
734
735							=item binStdev( $stat )
736
737							Calculate the standard deviation from a statistical summary anonymous hash.
738
739							=back
740
741							=head1 USEQ SLICES
742
743							Genomic observations are recorded in groups, called slices, of
744							usually 10000 observations at a time. Each slice is a separate
745							zip file member in the useq file archive. These methods are for
746							accessing information about each slice. In general, accessing
747							data through slices is a lower level operation. Users should
748							preferentially use the main data accessors.
749
750							The following are Bio::DB::USeq methods available for working
751							with slices.
752
753							=over 4
754
755							=item slices
756
757							Returns an array of all the slice member names present in the
758							useq archive file.
759
760							=item slice_feature($slice)
761
762							Return a L object representing the slice interval.
763							The features(), get_seq_stream(), and scores() methods are supported.
764
765							=item slice_seq_id($slice)
766
767							Return the chromosome or sequence identifier associated with a slice.
768
769							=item slice_start($slice)
770
771							Return the start position of the slice interval.
772
773							=item slice_end($slice)
774
775							Return the end position of the slice interval.
776
777							=item slice_strand($slice)
778
779							Return the strand of the slice interval.
780
781							=item slice_type($slice)
782
783							Return the file type of the slice member. This corresponds to the
784							file extension of the slice zip member and indicates how to parse
785							the binary member. Each letter in the type corresponds to a data
786							type, be it integer, short, floating-point, or text. See the
787							useq file documentation for more details.
788
789							=item slice_obs_number($slice)
790
791							Return the number of observations recorded in the slice interval.
792
793							=back
794
795							=head1 GBROWSE COMPATIBILITY
796
797							The USeq adaptor has support for L and GBrowse.
798							It will work with the segments glyph for intervals,
799							the wiggle_xyplot glyph for displaying mean scores, and the
800							wiggle_whiskers glyph for displaying detailed statistics.
801
802							Initialize the USeq database adaptor.
803
804							[data1:database]
805							db_adaptor = Bio::DB::USeq
806							db_args = -file /path/to/data1.useq
807
808							Displaying simple intervals with the segments glyph.
809
810							[data1_segments]
811							database = data1
812							feature = region
813							glyph = segments
814							stranded = 1
815
816							Displaying scores using the wiggle_xyplot glyph.
817							You may set the bins to whatever number is appropriate (in
818							this example, 1000), or leave blank (not recommended,
819							defaults to 1 bp resolution).
820
821							[data1_xyplot]
822							database = data1
823							feature = wiggle:1000
824							glyph = wiggle_xyplot
825							graph_type = histogram
826							autoscale = chromosome
827
828							Displaying scores using the wiggle_whiskers glyph. Note that
829							generating statistical summaries are computationally more expensive
830							than simple bins of mean values as with the wiggle feature type.
831
832							[data1_whiskers]
833							database = data1
834							feature = summary
835							glyph = wiggle_whiskers
836							graph_type = histogram
837							autoscale = chromosome
838
839							=head1 PERFORMANCE
840
841							Because the Bio::DB::USeq is implemented as a Perl-only module,
842							performance is subject to the limitations of Perl execution itself and
843							the size of the data that needs to be parsed. In general when collecting
844							score data, requesting scores is the fastest mode of operation, followed
845							by wiggle feature types, and finally summary feature types.
846
847							In comparison to UCSC bigWig files, the USeq format is typically much
848							faster when viewing intervals where the entire interval is represented by
849							one or a few internal slices. This is especially true for repeated queries
850							over the same or neighboring intervals, as the slice contents are retained
851							in memory. As the number of internal slices that must be loaded into memory
852							increases, for example querying intervals of hundreds of kilobases in size,
853							performance will begin to lag as each internal slice must be parsed into
854							memory. This is where the UCSC bigWig file format with internal zoom levels
855							of summary statistics can outperform, at the cost of file complexity and size.
856
857							=cut
858
859
860							require 5.010000;
861	1			1		7188	use strict;
	1					2
	1					26
862	1			1		3	use Carp qw(carp cluck croak confess);
	1					2
	1					63
863	1			1		517	use Archive::Zip qw( :ERROR_CODES );
	1					70010
	1					80
864	1			1		351	use Set::IntervalTree;
	1					4977
	1					50
865	1			1		6	use File::Spec;
	1					2
	1					20
866
867	1			1		4	use base 'Exporter';
	1					1
	1					6439
868							our @EXPORT_OK = qw(binMean binVariance binStdev);
869
870							1;
871
872							#### USeq initialization ####
873
874							sub new {
875	2			2	1	181	my $class = shift;
876	2					21	my $self = {
877							'name' => undef,
878							'zip' => undef,
879							'stranded' => undef,
880							'seq_ids' => {},
881							'metadata' => {},
882							'coord2file' => {},
883							'file2attribute' => {},
884							'buffer' => {},
885							};
886	2					6	bless $self, $class;
887
888							# check for arguments
889	2					3	my %args;
890	2	50				8	if (@_) {
891	2	100				11	if ($_[0] =~ /^-/) {
892	1					5	%args = @_;
893							}
894							else {
895	1					11	$args{-file} = shift;
896							}
897							}
898
899							# open file
900	2		0			7	my $file = $args{-file} \|\| $args{-useq} \|\| undef;
901	2	50				5	if ($file) {
902	2	50				9	return unless ($self->open($file)); # open must return true
903							}
904
905							# done
906	2					6	return $self;
907							}
908
909							sub open {
910	2			2	1	4	my $self = shift;
911
912							# check file
913	2					3	my $filename = shift;
914	2	50				5	unless ($filename) {
915	0					0	cluck("no file name passed!\n");
916	0					0	return;
917							}
918	2	50				12	unless ($filename =~ /\.useq$/i) {
919	0					0	carp "'$filename' is not a .useq archive file!\n";
920	0					0	return;
921							}
922	2	50				6	if ($self->slices) {
923	0					0	cluck "Only load one useq file per object!\n";
924	0					0	return;
925							}
926
927							# open the archive
928	2					14	my $zip = Archive::Zip->new();
929	2					74	my $error = $zip->read($filename);
930	2	50				2682	unless ($error == AZ_OK) {
931	0					0	carp " unable to read USeq archive '$filename'! Error $error\n";
932	0					0	return;
933							}
934	2					5	$self->{'zip'} = $zip;
935	2					44	(undef, undef, $self->{'name'}) = File::Spec->splitpath($filename);
936
937							# parse the contents
938	2	50				8	return unless ($self->_parse_members);
939							# we delay parsing metadata unless it is requested
940
941							# success
942	2					4	return 1;
943							}
944
945							sub clone {
946	1			1	1	3	my $self = shift;
947	1	50				4	return unless $self->{'zip'};
948	1					3	my $file = $self->zip->fileName;
949	1					13	my $zip = Archive::Zip->new();
950	1					35	my $error = $zip->read($file);
951	1	50				1273	unless ($error == AZ_OK) {
952	0					0	carp " unable to read USeq archive '$file'! Error $error\n";
953	0					0	return;
954							}
955	1					22	$self->{'zip'} = $zip;
956	1					4	return 1;
957							}
958
959							sub zip {
960	9			9	1	52	return shift->{'zip'};
961							}
962
963							sub name {
964	4			4	0	36	return shift->{'name'};
965							}
966
967
968
969							#### General USeq information ####
970
971							sub seq_ids {
972	1			1	1	2	my $self = shift;
973	1					1	return sort {$a cmp $b} keys %{ $self->{'seq_ids'} };
	1					5
	1					6
974							}
975
976							sub length {
977	7			7	1	52	my $self = shift;
978	7	50				17	my $seq_id = shift or return;
979	7	50				23	if (exists $self->{'seq_ids'}{$seq_id}) {
980	7					22	return $self->{'seq_ids'}{$seq_id};
981							}
982							}
983
984							sub stranded {
985	8			8	1	42	return shift->{'stranded'};
986							}
987
988							sub attributes {
989	0			0	1	0	my $self = shift;
990	0	0				0	$self->_parse_metadata unless %{ $self->{'metadata'} };
	0					0
991	0					0	return (sort {$a cmp $b} keys %{ $self->{'metadata'} });
	0					0
	0					0
992							}
993
994							sub attribute {
995	1			1	1	2	my $self = shift;
996	1					2	my $key = shift;
997	1	50				3	return $self->attributes unless $key;
998	1	50				1	$self->_parse_metadata unless %{ $self->{'metadata'} };
	1					5
999	1	50				3	if (exists $self->{'metadata'}{$key}) {
1000	1					4	return $self->{'metadata'}{$key};
1001							}
1002	0					0	return;
1003							}
1004
1005							sub type {
1006	1			1	1	85	return shift->attribute('dataType');
1007							}
1008
1009							sub version {
1010	0			0	1	0	return shift->attribute('useqArchiveVersion');
1011							}
1012
1013							sub genome {
1014	0			0	1	0	return shift->attribute('versionedGenome');
1015							}
1016
1017							sub chr_stats {
1018	1			1	1	1	my $self = shift;
1019	1	50				3	my $seq_id = shift or return;
1020	1					2	my $delay_write = shift; # option to delay rewriting the metadata
1021	1	50				2	$self->_parse_metadata unless %{ $self->{'metadata'} };
	1					14
1022
1023							# return the chromosome stats
1024	1	50				6	if (exists $self->{metadata}{"chromStats_$seq_id"}) {
1025	0					0	my @data = split(',', $self->{metadata}{"chromStats_$seq_id"});
1026	0					0	my %stat = (
1027							'validCount' => $data[0],
1028							'sumData' => $data[1],
1029							'sumSquares' => $data[2],
1030							'minVal' => $data[3],
1031							'maxVal' => $data[4],
1032							);
1033	0					0	return \%stat;
1034							}
1035
1036							# chromosome stats must be generated
1037	1					6	my @slices = $self->_translate_coordinates_to_slices(
1038							$seq_id, 1, $self->length($seq_id), 0);
1039
1040							# the normal _stat_summary() function requires loading entire chromosome worth of
1041							# slices into memory and can become a huge memory hog, so we'll go one at time
1042							# instead in a slightly more efficient manner
1043	1					3	my $count = 0;
1044	1					4	my $sum;
1045							my $sum_squares;
1046	1					0	my $min;
1047	1					0	my $max;
1048	1					3	foreach my $slice (@slices) {
1049	1	50				3	next unless $self->slice_type($slice) =~ /f/;
1050							# no sense going through if no score present
1051
1052							# load and unpack the data
1053	1					4	$self->_clear_buffer([$slice]);
1054	1					7	$self->_load_slice($slice);
1055
1056							# find the overlapping observations
1057	1					5	my $results = $self->{'buffer'}{$slice}->fetch(
1058							$self->slice_start($slice) - 1, $self->slice_end($slice));
1059	1					9	foreach my $r (@$results) {
1060							# each observation is [start, stop, score]
1061	9151	50				10648	if (defined $r->[2]) {
1062	9151					7624	$count++;
1063	9151					7944	$sum += $r->[2];
1064	9151					7972	$sum_squares += ($r->[2] * $r->[2]);
1065	9151	100	100			16267	$min = $r->[2] if (!defined $min or $r->[2] < $min);
1066	9151	100	100			16583	$max = $r->[2] if (!defined $max or $r->[2] > $max);
1067							}
1068							}
1069							}
1070
1071							# assemble stats
1072	1		50			22	my %stat = (
			50
			50
			50
1073							'validCount' => $count,
1074							'sumData' => $sum \|\| 0,
1075							'sumSquares' => $sum_squares \|\| 0,
1076							'minVal' => $min \|\| 0,
1077							'maxVal' => $max \|\| 0,
1078							);
1079
1080							# then associate with the metadata
1081	1					5	$self->{'metadata'}{"chromStats_$seq_id"} = join(',', map { $stat{$_} }
	5					45
1082							qw(validCount sumData sumSquares minVal maxVal) );
1083	1	50				11	$self->_rewrite_metadata unless $delay_write;
1084
1085	1					4	return \%stat;
1086							}
1087
1088							sub chr_mean {
1089	1			1	1	127	my $self = shift;
1090	1	50				5	my $seq_id = shift or return;
1091	1					5	return Bio::DB::USeq::binMean( $self->chr_stats($seq_id) );
1092							}
1093
1094							sub chr_stdev {
1095	0			0	1	0	my $self = shift;
1096	0	0				0	my $seq_id = shift or return;
1097	0					0	return Bio::DB::USeq::binStdev( $self->chr_stats($seq_id) );
1098							}
1099
1100							sub global_stats {
1101							# this is an expensive proposition, because it must parse through every
1102							# slice in the archive
1103	0			0	1	0	my $self = shift;
1104	0	0				0	$self->_parse_metadata unless %{ $self->{'metadata'} };
	0					0
1105
1106							# return the chromosome stats
1107	0	0				0	if (exists $self->{metadata}{globalStats}) {
1108	0					0	my @data = split(',', $self->{metadata}{globalStats});
1109	0					0	my %stat = (
1110							'validCount' => $data[0],
1111							'sumData' => $data[1],
1112							'sumSquares' => $data[2],
1113							'minVal' => $data[3],
1114							'maxVal' => $data[4],
1115							);
1116	0					0	return \%stat;
1117							}
1118
1119							# calculate new genome-wide statistics from individual chromosome stats
1120	0					0	my $count = 0;
1121	0					0	my $sum;
1122							my $sum_squares;
1123	0					0	my $min;
1124	0					0	my $max;
1125	0					0	foreach my $seq_id ($self->seq_ids) {
1126	0					0	my $stats = $self->chr_stats($seq_id, 1); # delay writing metadata
1127	0					0	$count += $stats->{validCount};
1128	0					0	$sum += $stats->{sumData};
1129	0					0	$sum_squares += $stats->{sumSquares};
1130	0	0	0			0	$min = $stats->{minVal} if (!defined $min or $stats->{minVal} < $min);
1131	0	0	0			0	$max = $stats->{maxVal} if (!defined $max or $stats->{maxVal} > $max);
1132							}
1133
1134							# assemble the statistical summary hash
1135	0		0			0	my %stat = (
			0
			0
			0
1136							'validCount' => $count,
1137							'sumData' => $sum \|\| 0,
1138							'sumSquares' => $sum_squares \|\| 0,
1139							'minVal' => $min \|\| 0,
1140							'maxVal' => $max \|\| 0,
1141							);
1142
1143							# update metadata
1144	0					0	$self->{'metadata'}{'globalStats'} = join(',', map { $stat{$_} }
	0					0
1145							qw(validCount sumData sumSquares minVal maxVal) );
1146	0					0	$self->_rewrite_metadata;
1147
1148	0					0	return \%stat;
1149							}
1150
1151							sub global_mean {
1152	0			0	1	0	my $self = shift;
1153	0					0	return Bio::DB::USeq::binMean( $self->global_stats );
1154							}
1155
1156							sub global_stdev {
1157	0			0	1	0	my $self = shift;
1158	0					0	return Bio::DB::USeq::binStdev( $self->global_stats );
1159							}
1160
1161							#### slice information ####
1162
1163							sub slices {
1164	2			2	1	4	my $self = shift;
1165	2					3	return sort {$a cmp $b} keys %{ $self->{'file2attribute'} };
	0					0
	2					13
1166							}
1167
1168							sub slice_seq_id {
1169	0			0	1	0	my $self = shift;
1170	0	0				0	my $slice = shift or return;
1171	0					0	return $self->{'file2attribute'}{$slice}->[0];
1172							}
1173
1174							sub slice_start {
1175	1088			1088	1	1079	my $self = shift;
1176	1088	50				1429	my $slice = shift or return;
1177	1088					1752	return $self->{'file2attribute'}{$slice}->[1];
1178							}
1179
1180							sub slice_end {
1181	2021			2021	1	2116	my $self = shift;
1182	2021	50				2490	my $slice = shift or return;
1183	2021					4608	return $self->{'file2attribute'}{$slice}->[2];
1184							}
1185
1186							sub slice_strand {
1187	4			4	1	11	my $self = shift;
1188	4	50				12	my $slice = shift or return;
1189	4					20	return $self->{'file2attribute'}{$slice}->[3];
1190							}
1191
1192							sub slice_type {
1193	1020			1020	1	1071	my $self = shift;
1194	1020	50				1238	my $slice = shift or return;
1195	1020					2731	return $self->{'file2attribute'}{$slice}->[4];
1196							}
1197
1198							sub slice_obs_number {
1199	6			6	1	12	my $self = shift;
1200	6	50				12	my $slice = shift or return;
1201	6					13	return $self->{'file2attribute'}{$slice}->[5];
1202							}
1203
1204							sub slice_feature {
1205	0			0	1	0	my $self = shift;
1206	0	0				0	my $slice = shift or return;
1207							return Bio::DB::USeq::Segment->new(
1208							-seq_id => $self->{'file2attribute'}{$slice}->[0],
1209							-start => $self->{'file2attribute'}{$slice}->[1],
1210							-stop => $self->{'file2attribute'}{$slice}->[2],
1211							-strand => $self->{'file2attribute'}{$slice}->[3],
1212	0					0	-type => $self->{'file2attribute'}{$slice}->[4],
1213							-source => $self->name,
1214							-name => $slice,
1215							);
1216							}
1217
1218
1219
1220
1221
1222							#### Feature and data access ####
1223
1224							sub segment {
1225	1			1	1	2	my $self = shift;
1226
1227							# arguments can be chromo, start, stop, strand
1228	1					3	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1229	1	50				3	return unless $self->length($seq_id); # make sure chromosome is represented
1230
1231	1					4	return Bio::DB::USeq::Segment->new(
1232							-seq_id => $seq_id,
1233							-start => $start,
1234							-end => $stop,
1235							-strand => $strand,
1236							-type => 'segment',
1237							-source => $self->name,
1238							-useq => $self,
1239							);
1240							}
1241
1242							sub features {
1243	2			2	1	268	my $self = shift;
1244	2					9	my %args = @_;
1245
1246							# check for type
1247	2					3	my $type;
1248	2		0			6	$args{-type} \|\|= $args{-types} \|\| $args{-primary_tag} \|\| 'region';
			33
1249	2	50	33			15	if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1250	0		0			0	$type = $args{-type}->[0] \|\| 'region';
1251	0					0	$args{-type} = $type;
1252							}
1253							else {
1254	2					4	$type = $args{-type};
1255							}
1256
1257							# return an appropriate feature
1258	2	50				20	if ($type =~ /chromosome/) {
		50
1259							# compatibility to return chromosome features
1260							my @chromos = map {
1261	0					0	Bio::DB::USeq::Feature->new(
	0					0
1262							-seq_id => $_,
1263							-start => 1,
1264							-end => $self->length($_),
1265							-type => $type,
1266							-source => $self->name,
1267							-name => $_,
1268							)
1269							} $self->seq_ids;
1270	0	0				0	return wantarray ? @chromos : \@chromos;
1271							}
1272							elsif ($type =~ /region\|interval\|observation\|coverage\|wiggle\|summary/) {
1273							# region or segment are individual observation features
1274							# coverage or wiggle are for efficient score retrieval with
1275							# backwards compatibility with Bio::Graphics
1276							# summary are statistical summaries akin to Bio::DB::BigFile
1277
1278							# set up an iterator
1279	2					12	my $iterator = $self->get_seq_stream(%args);
1280	2	50				8	return unless $iterator;
1281
1282							# if user requested an iterator
1283	2	0	33			5	if (exists $args{-iterator} and $args{-iterator}) {
1284	0					0	return $iterator;
1285							}
1286
1287							# collect the features
1288	2					2	my @features;
1289	2					6	while (my $f = $iterator->next_seq) {
1290	2					9	push @features, $f;
1291							}
1292	2	50				19	return wantarray ? @features : \@features;
1293							}
1294							else {
1295	0					0	confess "unknown type request '$type'!\n";
1296							}
1297							}
1298
1299
1300							sub get_features_by_type {
1301							# does not work without location information
1302	0			0	0	0	cluck "please use features() method instead";
1303	0					0	return;
1304							}
1305
1306
1307							sub get_features_by_location {
1308	0			0	1	0	my $self = shift;
1309
1310							# arguments can be chromo, start, stop, strand
1311	0					0	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1312
1313	0					0	return $self->features($seq_id, $start, $stop, $strand);
1314							}
1315
1316
1317							sub get_feature_by_id {
1318							# much as Bio::DB::BigWig fakes the id, so we will too here
1319							# I don't know how necessary this really is
1320	0			0	1	0	my $self = shift;
1321
1322							# id will be encoded as chr:start:end ?
1323	0					0	my $id = shift;
1324	0					0	my ($seq_id, $start, $end, $type) = split /:/, $id;
1325
1326	0		0			0	my @list = $self->features(
1327							-seq_id => $seq_id,
1328							-start => $start,
1329							-end => $end,
1330							-type => $type \|\| undef,
1331							);
1332	0	0				0	return unless @list;
1333	0	0				0	return $list[0] if scalar @list == 1;
1334	0					0	foreach my $f (@list) {
1335	0	0	0			0	return $f if ($f->start == $start and $f->end == $end);
1336							}
1337	0					0	return;
1338							}
1339
1340
1341							sub get_feature_by_name {
1342	0			0	1	0	return shift->get_feature_by_id(@_);
1343							}
1344
1345
1346							sub get_seq_stream {
1347	3			3	1	70	my $self = shift;
1348
1349							# arguments can be chromo, start, stop, strand
1350	3					11	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1351	3	50				11	return unless $self->length($seq_id); # make sure chromosome is represented
1352
1353							# check for type
1354	3					9	my %args = @_;
1355	3					3	my $type;
1356	3		50			22	$args{-type} \|\|= $args{-types} \|\| $args{-primary_tag} \|\| 'region';
			66
1357	3	50	33			13	if (ref $args{-type} and ref $args{-type} eq 'ARRAY') {
1358	0		0			0	$type = $args{-type}->[0] \|\| 'region';
1359							}
1360							else {
1361	3					4	$type = $args{-type};
1362							}
1363
1364	3					12	return Bio::DB::USeq::Iterator->new(
1365							-seq_id => $seq_id,
1366							-start => $start,
1367							-end => $stop,
1368							-strand => $strand,
1369							-type => $type,
1370							-source => $self->name,
1371							-useq => $self,
1372							);
1373							}
1374
1375
1376							sub scores {
1377	1			1	1	37	my $self = shift;
1378
1379							# arguments can be chromo, start, stop, strand
1380	1					2	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1381	1	50				3	return unless $self->length($seq_id); # make sure chromosome is represented
1382
1383							# determine which slices to retrieve
1384	1					4	my @slices = $self->_translate_coordinates_to_slices(
1385							$seq_id, $start, $stop, $strand);
1386	1	50				3	return unless @slices;
1387	1					3	$self->_clear_buffer(\@slices);
1388
1389							# collect the scores from each of the requested slices
1390	1					2	my @scores;
1391	1					2	foreach my $slice (@slices) {
1392
1393							# load and unpack the data
1394	1					3	$self->_load_slice($slice);
1395
1396							# find the overlapping observations
1397	1					33	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1398
1399							# record the scores
1400	1					5	foreach my $r (@$results) {
1401	31	50				44	push @scores, $r->[2] if defined $r->[2];
1402							}
1403							}
1404
1405	1	50				18	return wantarray ? @scores : \@scores;
1406							}
1407
1408							sub observations {
1409	0			0	1	0	my $self = shift;
1410
1411							# arguments can be chromo, start, stop, strand
1412	0					0	my ($seq_id, $start, $stop, $strand) = $self->_get_coordinates(@_);
1413	0	0				0	return unless $self->length($seq_id); # make sure chromosome is represented
1414
1415							# determine which slices to retrieve
1416	0					0	my @slices = $self->_translate_coordinates_to_slices(
1417							$seq_id, $start, $stop, $strand);
1418	0	0				0	return unless @slices;
1419	0					0	$self->_clear_buffer(\@slices);
1420
1421							# collect the scores from each of the requested slices
1422	0					0	my @observations;
1423	0					0	foreach my $slice (@slices) {
1424							# load and unpack the data
1425	0					0	$self->_load_slice($slice);
1426
1427							# find the overlapping observations
1428	0					0	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
1429	0					0	push @observations, @$results;
1430							}
1431
1432	0	0				0	return wantarray ? @observations : \@observations;
1433							}
1434
1435
1436
1437
1438							#### Private methods ####
1439
1440							sub _parse_metadata {
1441	1			1		2	my $self = shift;
1442
1443							# the metadata file should always be present in a USeq file
1444	1					4	my $readMe = $self->{'zip'}->contents('archiveReadMe.txt');
1445	1	50				1119	unless ($readMe) {
1446	0					0	carp " USeq file is malformed! It does not contain an archiveReadMe.txt file\n";
1447	0					0	return;
1448							}
1449
1450							# parse the archive file
1451							# this is a simple text file, each line is key = value
1452	1					3	$self->{'metadata'}{'comments'} = [];
1453	1					5	foreach (split /\n/, $readMe) {
1454	5	50				10	if (/^#/) {
1455	0	0				0	push @{ $self->{'metadata'}{'comments'} }, $_ unless $_ =~ /validCount/;
	0					0
1456	0					0	next;
1457							}
1458	5	50				14	if (/^\s([^=\s]+)\s=\s(.+)\s$/) {
1459							# separate key = value pairs tolerating whitespace
1460							# key may contain anything excluding = and whitespace
1461	5					12	$self->{'metadata'}{$1} = $2;
1462							}
1463							}
1464	1					2	return 1;
1465							}
1466
1467
1468							sub _parse_members {
1469	2			2		4	my $self = shift;
1470
1471							# there is a lot of information encoded in each zip member file name
1472							# the chromosome, strand, coordinates of represented interval, and file type
1473							# this parses and indexes this metadata into a usable format
1474
1475	2					5	my @errors;
1476	2					8	foreach my $member ($self->{'zip'}->memberNames) {
1477
1478							# archive readMe
1479	8	100				71	next if ($member eq 'archiveReadMe.txt');
1480
1481							# data file
1482	6					39	my ($chromo, $strand, $start, $stop, $number, $extension) =
1483							($member =~ /^([\w\-\.]+)([\+\-\.])(\d+)\-(\d+)\-(\d+)\.(\w+)$/i);
1484
1485							# check extracted metadata
1486	6	50	33			45	unless ($chromo and $strand and defined $start and defined $stop and
			33
			33
			33
			33
1487							$number and $extension) {
1488	0					0	push @errors, " data slice $member";
1489	0					0	next;
1490							}
1491
1492							# check stranded data
1493	6	100				11	unless (defined $self->{'stranded'}) {
1494	2	100				6	if ($strand eq '.') {
1495	1					2	$self->{'stranded'} = 0;
1496							}
1497							else {
1498	1					3	$self->{'stranded'} = 1;
1499							}
1500							}
1501
1502							# check seq_ids
1503							# record the length for each seq_id, which may or may not be entirely
1504							# accurate, since it is just the last interval position
1505	6	100				11	if (exists $self->{'seq_ids'}{$chromo} ) {
1506	3	50				9	if ($stop > $self->{'seq_ids'}{$chromo}) {
1507	3					5	$self->{'seq_ids'}{$chromo} = $stop;
1508							}
1509							}
1510							else {
1511	3					7	$self->{'seq_ids'}{$chromo} = $stop;
1512							}
1513
1514							# convert to BioPerl convention
1515	6	50				16	$strand = $strand eq '+' ? 1 : $strand eq '.' ? 0 : $strand eq '-' ? -1 : 0;
		100
		100
1516	6					8	$start += 1;
1517
1518							# check coordinates
1519							# hack to cover a bug? Can't have zero or negative coordinates
1520	6	50				12	if ($start >= $stop) {
1521	0					0	$stop += 1;
1522							}
1523
1524							# store the member details for each member slice
1525	6					19	$self->{'file2attribute'}{$member} =
1526							[$chromo, $start, $stop, $strand, $extension, $number];
1527
1528							# store the member into a chromosome-strand-specific interval tree
1529							# interval trees are stored as 0-based, half-open intervals
1530	6		66			39	$self->{'coord2file'}{$chromo}{$strand} \|\|= Set::IntervalTree->new();
1531	6					19	$self->{'coord2file'}{$chromo}{$strand}->insert($member, $start - 1, $stop);
1532							}
1533
1534							# check parsing
1535	2	50				5	if (@errors) {
1536	0					0	carp "Errors parsing data slice filenames:\n" . join("\n", @errors) . "\n";
1537							}
1538	2	50				4	unless (%{ $self->{'coord2file'} }) {
	2					4
1539	0					0	carp "no data slices present in USeq archive!\n";
1540	0					0	return;
1541							}
1542
1543	2					6	return 1;
1544							}
1545
1546
1547							sub _get_coordinates {
1548	5			5		8	my $self = shift;
1549
1550	5					11	my ($seq_id, $start, $stop, $strand);
1551	5	50				19	if ($_[0] =~ /^\-/) {
1552	5					14	my %args = @_;
1553	5		0			16	$seq_id = $args{'-seq_id'} \|\| $args{'-chromo'} \|\| undef;
1554	5		0			9	$start = $args{'-start'} \|\| $args{'-pos'} \|\| undef;
1555	5		0			13	$stop = $args{'-end'} \|\| $args{'-stop'} \|\| undef;
1556	5		50			21	$strand = $args{'-strand'} \|\| '0'; # unstranded
1557							}
1558							else {
1559	0					0	($seq_id, $start, $stop, $strand) = @_;
1560							}
1561	5	50				11	unless ($seq_id) {
1562	0					0	cluck "no sequence ID provided!";
1563	0					0	return;
1564							}
1565	5		50			11	$start \|\|= 1;
1566	5		33			8	$stop \|\|= $self->length($seq_id);
1567	5		50			15	$strand \|\|= 0;
1568	5					14	return ($seq_id, $start, $stop, $strand);
1569							}
1570
1571
1572							sub _translate_coordinates_to_slices {
1573	6			6		17	my $self = shift;
1574	6					15	my ($seq_id, $start, $stop, $strand) = @_;
1575	6	50				17	return unless (exists $self->{'coord2file'}{$seq_id});
1576
1577							# check strand request
1578	6					9	my $both = 0;
1579	6	50				17	if ($strand == 0) {
1580							# strand was not specified,
1581							# but collect from both strands if we have stranded data
1582	6	100				14	$both = 1 if $self->stranded;
1583							}
1584							else {
1585							# strand was specified,
1586							# but convert it to unstranded if the data is not stranded
1587	0	0				0	$strand = 0 unless $self->stranded;
1588							}
1589
1590							# look for the overlapping slices
1591	6					8	my @slices;
1592	6	100				11	if ($both) {
1593							# need to collect from both strands
1594							# plus strand first, then minus strand
1595	1					7	my $results = $self->{'coord2file'}{$seq_id}{1}->fetch($start - 1, $stop);
1596	1					4	my $results2 = $self->{'coord2file'}{$seq_id}{-1}->fetch($start - 1, $stop);
1597	1					12	push @slices, @$results, @$results2;
1598							}
1599
1600							# specific strand
1601							else {
1602	5					42	my $results = $self->{'coord2file'}{$seq_id}{$strand}->fetch($start - 1, $stop);
1603	5					12	push @slices, @$results;
1604							}
1605
1606	6					17	return @slices;
1607							}
1608
1609
1610							sub _clear_buffer {
1611	6			6		9	my $self = shift;
1612
1613							# make a quick hash of wanted slices
1614	6					8	my %wanted = map {$_ => 1} @{$_[0]};
	9					25
	6					20
1615
1616							# delete the existing buffers of slices we do not want
1617	6					10	foreach (keys %{ $self->{buffer} }) {
	6					18
1618	5	100				6653	delete $self->{buffer}{$_} unless exists $wanted{$_};
1619							}
1620							}
1621
1622							sub _load_slice {
1623	1016			1016		980	my $self = shift;
1624	1016					977	my $slice = shift;
1625	1016	100				1663	return if (exists $self->{'buffer'}{$slice});
1626
1627							# each slice is parsed into observations consisting of start, stop, and
1628							# optionally score and text depending on type
1629							# these are stored as anonymous arrays [start, stop, score, text]
1630							# for quick retrieval each observation interval is stored in a Set::IntervalTree
1631							# these operations maintain the original 0-based coordinate system
1632
1633	7					56	my $type = $self->slice_type($slice);
1634	7	50				58	if ($type eq 's') { $self->_load_s_slice($slice) }
	0	50				0
		50
		50
		100
		50
		50
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1635	0					0	elsif ($type eq 'sf') { $self->_load_sf_slice($slice) }
1636	0					0	elsif ($type eq 'st') { $self->_load_st_slice($slice) }
1637	0					0	elsif ($type eq 'ss') { $self->_load_ss_slice($slice) }
1638	6					20	elsif ($type eq 'ssf') { $self->_load_ssf_slice($slice) }
1639	0					0	elsif ($type eq 'sst') { $self->_load_sst_slice($slice) }
1640	1					5	elsif ($type eq 'ssft') { $self->_load_ssft_slice($slice) }
1641	0					0	elsif ($type eq 'i') { $self->_load_i_slice($slice) }
1642	0					0	elsif ($type eq 'if') { $self->_load_if_slice($slice) }
1643	0					0	elsif ($type eq 'it') { $self->_load_it_slice($slice) }
1644	0					0	elsif ($type eq 'ii') { $self->_load_ii_slice($slice) }
1645	0					0	elsif ($type eq 'iif') { $self->_load_iif_slice($slice) }
1646	0					0	elsif ($type eq 'iit') { $self->_load_iit_slice($slice) }
1647	0					0	elsif ($type eq 'iift') { $self->_load_iift_slice($slice) }
1648	0					0	elsif ($type eq 'is') { $self->_load_is_slice($slice) }
1649	0					0	elsif ($type eq 'isf') { $self->_load_isf_slice($slice) }
1650	0					0	elsif ($type eq 'ist') { $self->_load_ist_slice($slice) }
1651	0					0	elsif ($type eq 'isft') { $self->_load_isft_slice($slice) }
1652							else {
1653	0					0	croak "unable to load slice '$slice'! Unsupported slice type $type\n";
1654							}
1655							}
1656
1657
1658							sub _load_s_slice {
1659	0			0		0	my $self = shift;
1660	0					0	my $slice = shift;
1661	0					0	my $tree = Set::IntervalTree->new();
1662
1663							# unpack the data slice zip member
1664	0					0	my $number = $self->slice_obs_number($slice);
1665	0					0	my ($null, @data) = unpack("si>(s>)$number", $self->zip->contents($slice) );
1666
1667							# convert the unpacked data into start, stop, score
1668							# first observation
1669	0					0	my $position = shift @data;
1670	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1671
1672							# remaining observations
1673	0					0	while (@data) {
1674	0					0	$position += (shift @data) + 32768;
1675	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1676							}
1677
1678							# store Interval Tree buffer
1679	0					0	$self->{buffer}{$slice} = $tree;
1680							}
1681
1682							sub _load_sf_slice {
1683	0			0		0	my $self = shift;
1684	0					0	my $slice = shift;
1685	0					0	my $tree = Set::IntervalTree->new();
1686
1687							# unpack the data slice zip member
1688	0					0	my $number = $self->slice_obs_number($slice);
1689	0					0	my ($null, @data) = unpack("si>f>(s>f>)$number", $self->zip->contents($slice) );
1690
1691							# convert the unpacked data into start, stop, score
1692							# first observation
1693	0					0	my $position = shift @data;
1694	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1695
1696							# remaining observations
1697	0					0	while (@data) {
1698	0					0	$position += (shift @data) + 32768;
1699	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1700							}
1701
1702							# store Interval Tree buffer
1703	0					0	$self->{buffer}{$slice} = $tree;
1704							}
1705
1706							sub _load_st_slice {
1707	0			0		0	my $self = shift;
1708	0					0	my $slice = shift;
1709	0					0	my $tree = Set::IntervalTree->new();
1710
1711							# convert the unpacked data into start, stop, (score), text
1712
1713							# load the slice contents
1714	0					0	my $contents = $self->zip->contents($slice);
1715
1716							# first observation
1717	0					0	my $i = 8; # go ahead and set index up to first observation text
1718	0					0	my (undef, $position, $t) = unpack('si>s>', substr($contents, 0, $i));
1719							# initial null, position, text_length
1720	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1721	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1722	0					0	$i += $t;
1723
1724							# remaining observations
1725	0					0	my $p; # position offset
1726	0					0	my $len = CORE::length($contents);
1727	0					0	while ($i < $len) {
1728	0					0	my ($p, $t) = unpack('s>s>', substr($contents, $i, 4));
1729	0					0	$i += 4;
1730	0					0	$position += $p + 32768;
1731	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1732	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1733	0					0	$i += $t;
1734							}
1735
1736							# store Interval Tree buffer
1737	0					0	$self->{buffer}{$slice} = $tree;
1738							}
1739
1740							sub _load_ss_slice {
1741	0			0		0	my $self = shift;
1742	0					0	my $slice = shift;
1743	0					0	my $tree = Set::IntervalTree->new();
1744
1745							# unpack the data slice zip member
1746	0					0	my $number = $self->slice_obs_number($slice);
1747	0					0	my ($null, @data) = unpack("si>s>(s>s>)$number", $self->zip->contents($slice) );
1748
1749							# convert the unpacked data into start, stop, no score
1750							# first observation
1751	0					0	my $position = @data;
1752	0					0	my $position2 = $position + (shift @data) + 32768;
1753	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1754
1755							# remaining observations
1756	0					0	while (@data) {
1757	0					0	$position += (shift @data) + 32768;
1758	0					0	$position2 = $position + (shift @data) + 32768;
1759	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1760							}
1761
1762							# store Interval Tree buffer
1763	0					0	$self->{buffer}{$slice} = $tree;
1764							}
1765
1766							sub _load_ssf_slice {
1767	6			6		8	my $self = shift;
1768	6					9	my $slice = shift;
1769	6					46	my $tree = Set::IntervalTree->new();
1770
1771							# unpack the data slice zip member
1772	6					14	my $number = $self->slice_obs_number($slice);
1773	6					26	my ($null, @data) = unpack("si>s>f>(s>s>f>)$number", $self->zip->contents($slice) );
1774
1775							# convert the unpacked data into start, stop, score
1776							# first observation
1777	6					25966	my $position = (shift @data);
1778	6					26	my $position2 = $position + (shift @data) + 32768;
1779	6					41	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1780
1781							# remaining observations
1782	6					18	while (@data) {
1783	58298					54937	$position += (shift @data) + 32768;
1784	58298					51623	$position2 = $position + (shift @data) + 32768;
1785	58298					120028	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1786							}
1787
1788							# store Interval Tree buffer
1789	6					41	$self->{buffer}{$slice} = $tree;
1790							}
1791
1792							sub _load_sst_slice {
1793	0			0		0	my $self = shift;
1794	0					0	my $slice = shift;
1795	0					0	my $tree = Set::IntervalTree->new();
1796	0					0	my $contents = $self->zip->contents($slice);
1797
1798							# first observation
1799	0					0	my $i = 10; # go ahead and set index up to first observation text
1800	0					0	my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
1801							# initial null, position, length, text_length
1802	0					0	my $position2 = $position + $l + 32768;
1803	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1804	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
1805	0					0	$i += $t;
1806
1807							# remaining observations
1808	0					0	my $p; # position offset
1809	0					0	my $len = CORE::length($contents);
1810	0					0	while ($i < $len) {
1811	0					0	($p, $l, $t) = unpack('s>s>s>', substr($contents, $i, 6));
1812	0					0	$i += 6;
1813	0					0	$position += $p + 32768;
1814	0					0	$position2 = $position + $l + 32768;
1815	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1816	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
1817	0					0	$i += $t;
1818							}
1819
1820							# store Interval Tree buffer
1821	0					0	$self->{buffer}{$slice} = $tree;
1822							}
1823
1824							sub _load_ssft_slice {
1825	1			1		1	my $self = shift;
1826	1					2	my $slice = shift;
1827	1					6	my $tree = Set::IntervalTree->new();
1828	1					13	my $contents = $self->zip->contents($slice);
1829
1830							# first observation
1831	1					3388	my $i = 14; # go ahead and set index up to first observation text
1832	1					6	my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
1833							# initial null, position, length, score, text_length
1834	1					4	my $position2 = $position + $l + 32768;
1835	1					5	my $text = unpack("A$t", substr($contents, $i, $t));
1836	1					5	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
1837	1					2	$i += $t;
1838
1839							# remaining observations
1840	1					1	my $p; # position offset
1841	1					2	my $len = CORE::length($contents);
1842	1					4	while ($i < $len) {
1843	10000					17382	($p, $l, $s, $t) = unpack('s>s>f>s>', substr($contents, $i, 10));
1844	10000					10362	$i += 10;
1845	10000					8556	$position += $p + 32768;
1846	10000					8743	$position2 = $position + $l + 32768;
1847	10000					16254	$text = unpack("A$t", substr($contents, $i, $t));
1848	10000					27084	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
1849	10000					12977	$i += $t;
1850							}
1851
1852							# store Interval Tree buffer
1853	1					14	$self->{buffer}{$slice} = $tree;
1854							}
1855
1856							sub _load_i_slice {
1857	0			0		0	my $self = shift;
1858	0					0	my $slice = shift;
1859	0					0	my $tree = Set::IntervalTree->new();
1860
1861							# unpack the data slice zip member
1862	0					0	my $number = $self->slice_obs_number($slice);
1863	0					0	my ($null, @data) = unpack("si>(i>)$number", $self->zip->contents($slice) );
1864
1865							# convert the unpacked data into start, stop, score
1866							# first observation
1867	0					0	my $position = @data;
1868	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1869
1870							# remaining observations
1871	0					0	while (@data) {
1872	0					0	$position += (shift @data);
1873	0					0	$tree->insert( [$position, $position + 1, undef], $position, $position + 1);
1874							}
1875
1876							# store Interval Tree buffer
1877	0					0	$self->{buffer}{$slice} = $tree;
1878							}
1879
1880							sub _load_if_slice {
1881	0			0		0	my $self = shift;
1882	0					0	my $slice = shift;
1883	0					0	my $tree = Set::IntervalTree->new();
1884
1885							# unpack the data slice zip member
1886	0					0	my $number = $self->slice_obs_number($slice);
1887	0					0	my ($null, @data) = unpack("si>f>(i>f>)$number", $self->zip->contents($slice) );
1888
1889							# convert the unpacked data into start, stop, score
1890							# first observation
1891	0					0	my $position = @data;
1892	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1893
1894							# remaining observations
1895	0					0	while (@data) {
1896	0					0	$position += (shift @data);
1897	0					0	$tree->insert( [$position, $position + 1, shift(@data)], $position, $position + 1);
1898							}
1899
1900							# store Interval Tree buffer
1901	0					0	$self->{buffer}{$slice} = $tree;
1902							}
1903
1904							sub _load_it_slice {
1905	0			0		0	my $self = shift;
1906	0					0	my $slice = shift;
1907	0					0	my $tree = Set::IntervalTree->new();
1908
1909							# convert the unpacked data into start, stop, (score), text
1910
1911							# load the slice contents
1912	0					0	my $contents = $self->zip->contents($slice);
1913
1914							# first observation
1915	0					0	my $i = 10; # go ahead and set index up to first observation text
1916	0					0	my (undef, $position, $t) = unpack('si>i>', substr($contents, 0, $i));
1917							# initial null, position, text_length
1918	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
1919	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1920	0					0	$i += $t;
1921
1922							# remaining observations
1923	0					0	my $p; # position offset
1924	0					0	my $len = CORE::length($contents);
1925	0					0	while ($i < $len) {
1926	0					0	($p, $t) = unpack('i>s>', substr($contents, $i, 6));
1927	0					0	$i += 6;
1928	0					0	$position += $p + 32768;
1929	0					0	$text = unpack("A$t", substr($contents, $i, $t));
1930	0					0	$tree->insert( [$position, $position + 1, undef, $text], $position, $position + 1);
1931	0					0	$i += $t;
1932							}
1933
1934							# store Interval Tree buffer
1935	0					0	$self->{buffer}{$slice} = $tree;
1936							}
1937
1938							sub _load_ii_slice {
1939	0			0		0	my $self = shift;
1940	0					0	my $slice = shift;
1941	0					0	my $tree = Set::IntervalTree->new();
1942
1943							# unpack the data slice zip member
1944	0					0	my $number = $self->slice_obs_number($slice);
1945	0					0	my ($null, @data) = unpack("si>i>(i>i>)$number", $self->zip->contents($slice) );
1946
1947							# convert the unpacked data into start, stop, score
1948							# first observation
1949	0					0	my $position = @data;
1950	0					0	my $position2 = $position + (shift @data);
1951	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1952
1953							# remaining observations
1954	0					0	while (@data) {
1955	0					0	$position += (shift @data);
1956	0					0	$position2 = $position + (shift @data);
1957	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
1958							}
1959
1960							# store Interval Tree buffer
1961	0					0	$self->{buffer}{$slice} = $tree;
1962							}
1963
1964							sub _load_iif_slice {
1965	0			0		0	my $self = shift;
1966	0					0	my $slice = shift;
1967	0					0	my $tree = Set::IntervalTree->new();
1968
1969							# unpack the data slice zip member
1970	0					0	my $number = $self->slice_obs_number($slice);
1971	0					0	my ($null, @data) = unpack("si>i>f>(i>i>f>)$number", $self->zip->contents($slice) );
1972
1973							# convert the unpacked data into start, stop, score
1974							# first observation
1975	0					0	my $position = @data;
1976	0					0	my $position2 = $position + (shift @data);
1977	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1978
1979							# remaining observations
1980	0					0	while (@data) {
1981	0					0	$position += (shift @data);
1982	0					0	$position2 = $position + (shift @data);
1983	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
1984							}
1985
1986							# store Interval Tree buffer
1987	0					0	$self->{buffer}{$slice} = $tree;
1988							}
1989
1990							sub _load_iit_slice {
1991	0			0		0	my $self = shift;
1992	0					0	my $slice = shift;
1993	0					0	my $tree = Set::IntervalTree->new();
1994	0					0	my $contents = $self->zip->contents($slice);
1995
1996							# first observation
1997	0					0	my $i = 12; # go ahead and set index up to first observation text
1998	0					0	my (undef, $position, $l, $t) = unpack('si>i>s>', substr($contents, 0, $i));
1999							# initial null, position, length, text_length
2000	0					0	my $position2 = $position + $l;
2001	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
2002	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
2003	0					0	$i += $t;
2004
2005							# remaining observations
2006	0					0	my $p; # position offset
2007	0					0	my $len = CORE::length($contents);
2008	0					0	while ($i < $len) {
2009	0					0	($p, $l, $t) = unpack('i>i>s>', substr($contents, $i, 10));
2010	0					0	$i += 10;
2011	0					0	$position += $p;
2012	0					0	$position2 = $position + $l;
2013	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2014	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
2015	0					0	$i += $t;
2016							}
2017
2018							# store Interval Tree buffer
2019	0					0	$self->{buffer}{$slice} = $tree;
2020							}
2021
2022							sub _load_iift_slice {
2023	0			0		0	my $self = shift;
2024	0					0	my $slice = shift;
2025	0					0	my $tree = Set::IntervalTree->new();
2026	0					0	my $contents = $self->zip->contents($slice);
2027
2028							# first observation
2029	0					0	my $i = 16; # go ahead and set index up to first observation text
2030	0					0	my (undef, $position, $l, $s, $t) = unpack('si>i>f>s>', substr($contents, 0, $i));
2031							# initial null, position, length, score, text_length
2032	0					0	my $position2 = $position + $l;
2033	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
2034	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2035	0					0	$i += $t;
2036
2037							# remaining observations
2038	0					0	my $p; # position offset
2039	0					0	my $len = CORE::length($contents);
2040	0					0	while ($i < $len) {
2041	0					0	($p, $l, $s, $t) = unpack('i>i>f>s>', substr($contents, $i, 14));
2042	0					0	$i += 14;
2043	0					0	$position += $p;
2044	0					0	$position2 = $position + $l;
2045	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2046	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2047	0					0	$i += $t;
2048							}
2049
2050							# store Interval Tree buffer
2051	0					0	$self->{buffer}{$slice} = $tree;
2052							}
2053
2054							sub _load_is_slice {
2055	0			0		0	my $self = shift;
2056	0					0	my $tree = Set::IntervalTree->new();
2057	0					0	my $slice = shift;
2058
2059							# unpack the data slice zip member
2060	0					0	my $number = $self->slice_obs_number($slice);
2061	0					0	my ($null, @data) = unpack("si>s>(i>s>)$number", $self->zip->contents($slice) );
2062
2063							# convert the unpacked data into start, stop, score
2064							# first observation
2065	0					0	my $position = @data;
2066	0					0	my $position2 = $position + (shift @data) + 32768;
2067	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
2068
2069							# remaining observations
2070	0					0	while (@data) {
2071	0					0	$position += (shift @data);
2072	0					0	$position2 = $position + (shift @data) + 32768;
2073	0					0	$tree->insert( [$position, $position2, undef], $position, $position2);
2074							}
2075
2076							# store Interval Tree buffer
2077	0					0	$self->{buffer}{$slice} = $tree;
2078							}
2079
2080							sub _load_isf_slice {
2081	0			0		0	my $self = shift;
2082	0					0	my $slice = shift;
2083	0					0	my $tree = Set::IntervalTree->new();
2084
2085							# unpack the data slice zip member
2086	0					0	my $number = $self->slice_obs_number($slice);
2087	0					0	my ($null, @data) = unpack("si>s>f>(i>s>f>)$number", $self->zip->contents($slice) );
2088
2089							# convert the unpacked data into start, stop, score
2090							# first observation
2091	0					0	my $position = @data;
2092	0					0	my $position2 = $position + (shift @data) + 32768;
2093	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2094
2095							# remaining observations
2096	0					0	while (@data) {
2097	0					0	$position += (shift @data);
2098	0					0	$position2 = $position + (shift @data) + 32768;
2099	0					0	$tree->insert( [$position, $position2, shift(@data)], $position, $position2);
2100							}
2101
2102							# store Interval Tree buffer
2103	0					0	$self->{buffer}{$slice} = $tree;
2104							}
2105
2106							sub _load_ist_slice {
2107	0			0		0	my $self = shift;
2108	0					0	my $slice = shift;
2109	0					0	my $tree = Set::IntervalTree->new();
2110	0					0	my $contents = $self->zip->contents($slice);
2111
2112							# first observation
2113	0					0	my $i = 10; # go ahead and set index up to first observation text
2114	0					0	my (undef, $position, $l, $t) = unpack('si>s>s>', substr($contents, 0, $i));
2115							# initial null, position, length, text_length
2116	0					0	my $position2 = $position + $l + 32768;
2117	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
2118	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
2119	0					0	$i += $t;
2120
2121							# remaining observations
2122	0					0	my $p; # position offset
2123	0					0	my $len = CORE::length($contents);
2124	0					0	while ($i < $len) {
2125	0					0	($p, $l, $t) = unpack('i>s>s>', substr($contents, $i, 8));
2126	0					0	$i += 8;
2127	0					0	$position += $p;
2128	0					0	$position2 = $position + $l + 32768;
2129	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2130	0					0	$tree->insert( [$position, $position2, undef, $text], $position, $position2);
2131	0					0	$i += $t;
2132							}
2133
2134							# store Interval Tree buffer
2135	0					0	$self->{buffer}{$slice} = $tree;
2136							}
2137
2138							sub _load_isft_slice {
2139	0			0		0	my $self = shift;
2140	0					0	my $slice = shift;
2141	0					0	my $tree = Set::IntervalTree->new();
2142	0					0	my $contents = $self->zip->contents($slice);
2143
2144							# first observation
2145	0					0	my $i = 14; # go ahead and set index up to first observation text
2146	0					0	my (undef, $position, $l, $s, $t) = unpack('si>s>f>s>', substr($contents, 0, $i));
2147							# initial null, position, length, score, text_length
2148	0					0	my $position2 = $position + $l + 32768;
2149	0					0	my $text = unpack("A$t", substr($contents, $i, $t));
2150	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2151	0					0	$i += $t;
2152
2153							# remaining observations
2154	0					0	my $p; # position offset
2155	0					0	while ($i < CORE::length($contents)) {
2156	0					0	($p, $l, $s, $t) = unpack('i>s>f>s>', substr($contents, $i, 12));
2157	0					0	$i += 12;
2158	0					0	$position += $p;
2159	0					0	$position2 = $position + $l + 32768;
2160	0					0	$text = unpack("A$t", substr($contents, $i, $t));
2161	0					0	$tree->insert( [$position, $position2, $s, $text], $position, $position2);
2162	0					0	$i += $t;
2163							}
2164
2165							# store Interval Tree buffer
2166	0					0	$self->{buffer}{$slice} = $tree;
2167							}
2168
2169							sub _mean_score {
2170	1000			1000		899	my $self = shift;
2171	1000					1231	my ($start, $stop, $slices) = @_;
2172	1000	50				1333	return unless @$slices;
2173
2174							# collect the scores from each of the requested slices
2175	1000					876	my $sum;
2176	1000					897	my $count = 0;
2177	1000					1050	foreach my $slice (@$slices) {
2178	1001	50				1296	next unless $self->slice_type($slice) =~ /f/;
2179							# no sense going through if no score present
2180
2181							# load and unpack the data
2182	1001					2009	$self->_load_slice($slice);
2183
2184							# find the overlapping observations and record values
2185	1001					87617	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2186	1001					2002	foreach my $r (@$results) {
2187	2276		100			3729	$sum += $r->[2] \|\| 0;
2188	2276					2763	$count++;
2189							}
2190							}
2191	1000	50				3614	return $count ? $sum / $count : undef;
2192							}
2193
2194							sub _stat_summary {
2195	10			10		10	my $self = shift;
2196	10					13	my ($start, $stop, $slices) = @_;
2197	10	50				18	return unless @$slices;
2198
2199							# initialize the statistical scores
2200	10					24	my $count = 0;
2201	10					19	my $sum;
2202							my $sum_squares;
2203	10					0	my $min;
2204	10					0	my $max;
2205
2206							# collect the scores from each of the requested slices
2207	10					13	foreach my $slice (@$slices) {
2208	11	50				19	next unless $self->slice_type($slice) =~ /f/;
2209							# no sense going through if no score present
2210
2211							# load and unpack the data
2212	11					26	$self->_load_slice($slice);
2213
2214							# find the overlapping observations
2215	11					1558	my $results = $self->{'buffer'}{$slice}->fetch($start - 1, $stop);
2216	11					32	foreach my $r (@$results) {
2217							# each observation is [start, stop, score]
2218	11994	50				14234	if (defined $r->[2]) {
2219	11994					10021	$count++;
2220	11994					10760	$sum += $r->[2];
2221	11994					10801	$sum_squares += ($r->[2] * $r->[2]);
2222	11994	100	100			21679	$min = $r->[2] if (!defined $min or $r->[2] < $min);
2223	11994	100	100			21912	$max = $r->[2] if (!defined $max or $r->[2] > $max);
2224							}
2225							}
2226							}
2227
2228							# assemble the statistical summary hash
2229	10		50			79	my %summary = (
			50
			50
			50
2230							'validCount' => $count,
2231							'sumData' => $sum \|\| 0,
2232							'sumSquares' => $sum_squares \|\| 0,
2233							'minVal' => $min \|\| 0,
2234							'maxVal' => $max \|\| 0,
2235							);
2236	10					74	return \%summary;
2237							}
2238
2239							sub _rewrite_metadata {
2240	1			1		4	my $self = shift;
2241	1	50				6	return unless (-w $self->zip->fileName);
2242	1					54	my $md = $self->{'metadata'};
2243
2244							# generate new metadata as an array
2245	1					2	my @new_md;
2246	1					5	push @new_md, "useqArchiveVersion = " . $md->{useqArchiveVersion};
2247	1	50				4	push @new_md, @{ $md->{comments} } if exists $md->{comments};
	1					3
2248	1					3	push @new_md, "dataType = " . $md->{dataType};
2249	1					3	push @new_md, "versionedGenome = " . $md->{versionedGenome};
2250
2251							# additional keys that may be present
2252	1					7	foreach (keys %$md) {
2253	7	100				35	next if /useqArchiveVersion\|dataType\|versionedGenome\|comments\|chromStats\|globalStats/;
2254	2					6	push @new_md, "$_ = $md->{$_}";
2255							}
2256
2257							# global and chromosome stats
2258	1					2	push @new_md,
2259							"# Bio::DB::USeq statistics validCount,sumData,sumSquares,minVal,maxVal";
2260	1	50				3	push @new_md, "globalStats = $md->{globalStats}" if exists $md->{globalStats};
2261	1					10	foreach (sort {$a cmp $b} keys %$md) {
	12					15
2262	7	100				19	push @new_md, "$_ = $md->{$_}" if /chromStats/;
2263							}
2264
2265							# write the new metadata to the zip Archive
2266	1					10	$self->{'zip'}->contents('archiveReadMe.txt', join("\n", @new_md));
2267	1					668	$self->{'zip'}->overwrite;
2268	1					6661	$self->clone; # not sure if this is necessary, but just in case we reopen the zip
2269							}
2270
2271
2272							######## Exported Functions ########
2273							# these are borrowed from Bio::DB::BigWig from Lincoln Stein
2274
2275							sub binMean {
2276	2			2	1	130	my $score = shift;
2277	2	50				9	return 0 unless $score->{validCount};
2278	2					9	$score->{sumData}/$score->{validCount};
2279							}
2280
2281							sub binVariance {
2282	1			1	1	2	my $score = shift;
2283	1	50				4	return 0 unless $score->{validCount};
2284	1					7	my $var = $score->{sumSquares} - $score->{sumData}**2/$score->{validCount};
2285	1	50				4	if ($score->{validCount} > 1) {
2286	1					3	$var /= $score->{validCount}-1;
2287							}
2288	1	50				4	return 0 if $var < 0;
2289	1					3	return $var;
2290							}
2291
2292							sub binStdev {
2293	1			1	1	7	my $score = shift;
2294	1					4	return sqrt(binVariance($score));
2295							}
2296
2297
2298
2299
2300							######## Other Classes #############
2301
2302							package Bio::DB::USeq::Feature;
2303	1			1		7	use base 'Bio::SeqFeature::Lite';
	1					2
	1					481
2304
2305							sub new {
2306	13			13		24	my $class = shift;
2307	13					56	return $class->SUPER::new(@_);
2308							}
2309
2310							sub type {
2311							# Bio::SeqFeature::Lite mangles the type and returns
2312							# primary_tag:source if both are set
2313							# this may wreck havoc with parsers when the type already has a :
2314							# as in wiggle:1000
2315	16			16		130	my $self = shift;
2316	16					94	return $self->{type};
2317							}
2318
2319							sub chr_stats {
2320	0			0		0	my $self = shift;
2321	0	0				0	return unless exists $self->{useq};
2322	0					0	return $self->{useq}->chr_stats( $self->seq_id );
2323							}
2324
2325							sub chr_mean {
2326	0			0		0	my $self = shift;
2327	0	0				0	return unless exists $self->{useq};
2328	0					0	return $self->{useq}->chr_mean( $self->seq_id );
2329							}
2330
2331							sub chr_stdev {
2332	0			0		0	my $self = shift;
2333	0	0				0	return unless exists $self->{useq};
2334	0					0	return $self->{useq}->chr_stdev( $self->seq_id );
2335							}
2336
2337							sub global_stats {
2338	0			0		0	my $self = shift;
2339	0	0				0	return unless exists $self->{useq};
2340	0					0	return $self->{useq}->global_stats( $self->seq_id );
2341							}
2342
2343							sub global_mean {
2344	0			0		0	my $self = shift;
2345	0	0				0	return unless exists $self->{useq};
2346	0					0	return $self->{useq}->global_mean( $self->seq_id );
2347							}
2348
2349							sub global_stdev {
2350	0			0		0	my $self = shift;
2351	0	0				0	return unless exists $self->{useq};
2352	0					0	return $self->{useq}->global_stdev( $self->seq_id );
2353							}
2354
2355
2356
2357
2358							package Bio::DB::USeq::Segment;
2359	1			1		50483	use base 'Bio::DB::USeq::Feature';
	1					3
	1					670
2360
2361							sub new {
2362	1			1		2	my $class = shift;
2363	1					4	my %args = @_;
2364	1					7	my $segment = $class->SUPER::new(@_);
2365	1	50				74	$segment->{'useq'} = $args{'-useq'} if exists $args{'-useq'};
2366	1					2	return $segment;
2367							}
2368
2369							sub scores {
2370	1			1		108	my $self = shift;
2371	1					6	return $self->{'useq'}->scores(
2372							-seq_id => $self->seq_id,
2373							-start => $self->start,
2374							-end => $self->end,
2375							-strand => $self->strand,
2376							);
2377							}
2378
2379							sub features {
2380	0			0		0	my $self = shift;
2381	0					0	my $type = shift;
2382	0		0			0	$type \|\|= 'region';
2383	0					0	return $self->{'useq'}->features(
2384							-seq_id => $self->seq_id,
2385							-start => $self->start,
2386							-end => $self->end,
2387							-strand => $self->strand,
2388							-type => $type,
2389							);
2390							}
2391
2392							sub get_seq_stream {
2393	1			1		258	my $self = shift;
2394	1					2	my $type = shift;
2395	1		50			16	$type \|\|= 'region';
2396							return Bio::DB::USeq::Iterator->new(
2397							-seq_id => $self->seq_id,
2398							-start => $self->start,
2399							-end => $self->end,
2400							-strand => $self->strand,
2401							-source => $self->source,
2402							-type => $type,
2403							-useq => $self->{useq},
2404	1					5	);
2405							}
2406
2407							sub slices {
2408	0			0		0	my $self = shift;
2409	0					0	my @slices = $self->{'useq'}->_translate_coordinates_to_slices(
2410							$self->seq_id, $self->start, $self->end, $self->strand
2411							);
2412	0	0				0	return wantarray ? @slices : \@slices;
2413							}
2414
2415							sub coverage {
2416	0			0		0	my $self = shift;
2417	0					0	my $bins = shift;
2418	0		0			0	$bins \|\|= $self->length;
2419	0					0	return $self->features("coverage:$bins");
2420							}
2421
2422							sub wiggle {
2423	0			0		0	my $self = shift;
2424	0					0	my $bins = shift;
2425	0		0			0	$bins \|\|= $self->length;
2426	0					0	return $self->features("wiggle:$bins");
2427							}
2428
2429							sub statistical_summary {
2430	0			0		0	my $self = shift;
2431	0					0	my $bins = shift;
2432	0		0			0	$bins \|\|= 1;
2433	0					0	return $self->features("summary:$bins");
2434							}
2435
2436							sub observations {
2437	0			0		0	my $self = shift;
2438	0					0	return $self->{'useq'}->observations(
2439							-seq_id => $self->seq_id,
2440							-start => $self->start,
2441							-end => $self->end,
2442							-strand => $self->strand,
2443							);
2444							}
2445
2446
2447							package Bio::DB::USeq::Iterator;
2448	1			1		18	use base 'Bio::DB::USeq::Feature';
	1					2
	1					1252
2449
2450							sub new {
2451	4			4		57	my $class = shift;
2452
2453							# create object
2454	4					18	my %args = @_;
2455	4	50				11	my $useq = $args{'-useq'} or
2456							die "Bio::DB::USeq::Iterator cannot be created without a -useq argument";
2457	4					18	my $iterator = $class->SUPER::new(@_);
2458	4					218	$iterator->{'useq'} = $useq;
2459
2460							# determine which members to retrieve
2461							my @slices = $useq->_translate_coordinates_to_slices(
2462							$args{-seq_id},
2463							$args{-start},
2464							$args{-end},
2465							$args{-strand},
2466	4					15	);
2467	4					12	$useq->_clear_buffer(\@slices);
2468
2469							# sort the slices as necessary
2470	4	100				16	if (scalar(@slices) > 1) {
2471	6					12	@slices = map {$_->[1]}
2472	3					11	sort {$a->[0] <=> $b->[0]}
2473	3					17	map { [$useq->slice_start($_), $_] } @slices;
	6					15
2474							}
2475
2476							# how we set up the iterator depends on the feature type requested
2477							# we need to add specific information to iterator object
2478
2479	4	100				15	if ($iterator->type =~ /region\|interval\|observation/) {
2480							# useq observation features are simple
2481	2					5	$iterator->{'wiggle'} = 0;
2482
2483							# prepare specific iterator information
2484	2					5	$iterator->{'slices'} = \@slices;
2485	2					11	$iterator->{'current_results'} = undef;
2486	2					6	$iterator->{'current_strand'} = undef;
2487
2488	2					9	return $iterator;
2489							}
2490
2491							# otherwise we work with more complex wiggle or summary features
2492							# set the type of wiggle feature: 1 = wiggle, 2 = summary
2493	2	100				6	$iterator->{'wiggle'} = $iterator->type =~ /summary/ ? 2 : 1;
2494
2495							# normally the iterator will only return one wigggle\|summary feature, but
2496							# will return two if stranded data, per behavior for Bio::Grapics...
2497	2	50	33			11	if ($iterator->strand == 0 and $useq->stranded) {
2498							# we have stranded data, so need to return two features
2499							# separate the slices into each respective strands for each feature
2500	0					0	my (@f, @r);
2501	0					0	foreach (@slices) {
2502	0	0				0	if ($useq->slice_strand($_) == 1) {
2503	0					0	push @f, $_;
2504							}
2505							else {
2506	0					0	push @r, $_;
2507							}
2508							}
2509	0					0	$iterator->{slices} = [\@f, \@r];
2510							}
2511							else {
2512							# only need to return one feature
2513	2					6	$iterator->{slices} = [ \@slices ];
2514							}
2515
2516							# check for type and bins
2517	2					4	my ($bin, $step);
2518	2	100				6	if ($iterator->type =~ /:(\d+)$/i) {
2519	1					3	$bin = $1;
2520	1					6	my $length = $iterator->length;
2521	1	50				18	$bin = $length if $bin > $length;
2522	1					3	$step = $length/$bin;
2523							}
2524	2					5	$iterator->{bin} = $bin;
2525	2					4	$iterator->{step} = $step;
2526
2527	2					8	return $iterator;
2528							}
2529
2530							sub next_seq {
2531	10			10		324	my $self = shift;
2532
2533	10	100				32	if ($self->{'wiggle'} == 1) {
		100
2534	2					6	return $self->_next_wiggle;
2535							}
2536							elsif ($self->{'wiggle'} == 2) {
2537	2					6	return $self->_next_summary;
2538							}
2539							else {
2540	6					14	return $self->_next_region;
2541							}
2542							}
2543
2544	0			0		0	sub next_feature {return shift->next_seq}
2545
2546							sub _next_region {
2547							# this will keep returning observations as SeqFeature objects until we run out
2548							# of observations in the requested interval
2549	6			6		8	my $self = shift;
2550	6					9	my $useq = $self->{'useq'};
2551
2552							# get current results
2553	6	100				11	unless ($self->{'current_results'}) {
2554	2		50			3	my $slice = shift @{ $self->{'slices'} } \|\| undef;
2555	2	50				10	return unless $slice; # no more slices, we're done
2556
2557							# collect the observations
2558	2					7	$useq->_load_slice($slice);
2559	2					20	my $result = $useq->{buffer}{$slice}->fetch($self->start - 1, $self->end);
2560
2561							# sort the observations and store the results
2562	2	50				367	my @sorted = sort {$a->[0] <=> $b->[0] or $a->[1] <=> $b->[1]} @$result;
	303					372
2563	2					8	$self->{'current_results'} = \@sorted;
2564	2					19	$self->{'current_strand'} = $useq->slice_strand($slice);
2565							}
2566
2567							# return next observation as a Feature object
2568	6					8	my $obs = shift @{ $self->{'current_results'} };
	6					11
2569	6	50				12	unless (defined $obs) {
2570							# no more observations for current slice, try to move to next slice
2571	0					0	undef $self->{'current_results'};
2572	0					0	return $self->_next_region;
2573							}
2574							return Bio::DB::USeq::Feature->new(
2575							-seq_id => $self->seq_id,
2576							-start => $obs->[0] + 1,
2577							-end => $obs->[1],
2578	6	100				17	-strand => $self->{'current_strand'},
2579							-score => $obs->[2],
2580							-source => $self->source,
2581							-type => $self->type,
2582							-name => defined $obs->[3] ? $obs->[3] :
2583							join(':', $self->seq_id, $obs->[0], $obs->[1]),
2584							)
2585							}
2586
2587							sub _next_wiggle {
2588							# this will return one wiggle Feature, two if separate strands are requested
2589
2590	2			2		3	my $self = shift;
2591	2					3	my $useq = $self->{'useq'};
2592
2593							# determine which slices to retrieve
2594	2					4	my $slices = shift @{ $self->{slices} };
	2					5
2595	2	100				7	return unless $slices;
2596
2597							# more information
2598	1					2	my $start = $self->start;
2599	1					7	my $stop = $self->end;
2600	1					7	my $step = $self->{step};
2601
2602							# check whether we are working with binned wiggle or not
2603	1					1	my @scores;
2604	1	50	33			5	if ($self->{bin} and $step > 1) {
2605							# we will be collecting the mean score value in bins
2606
2607							# collect the scores for each bin
2608	1					3	for (my $begin = $start; $begin < $stop; $begin += $step) {
2609
2610							# round off coordinates to integers
2611							# beginning point and step may not be integers
2612	1000					1358	my $s = int($begin + 0.5);
2613	1000					1136	my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2614
2615							# collect the scores from each of the requested slices
2616	1000	50				1180	if (scalar @$slices > 1) {
2617							# more than one slice, identify which subset of slices to collect from
2618							# may or may not be all of the current slices
2619	1000					930	my @sub_slices;
2620	1000					1099	foreach my $slice (@$slices) {
2621	2000	100				2956	next if $s > $useq->slice_end($slice);
2622	1061	100				1355	next if $e < $useq->slice_start($slice);
2623	1001					1445	push @sub_slices, $slice;
2624							}
2625	1000					1435	push @scores, $useq->_mean_score($s, $e, \@sub_slices);
2626							}
2627							else {
2628	0					0	push @scores, $useq->_mean_score($s, $e, $slices);
2629							}
2630							}
2631							}
2632							else {
2633							# otherwise we collect in one step and associate scores at bp resolution
2634							# collect the scores from each of the requested slices and
2635							# assemble them into a hash of values
2636
2637							# correlate scores with position
2638	0					0	my %pos2score;
2639	0					0	foreach my $slice (@$slices) {
2640
2641							# load and unpack the data
2642	0					0	$useq->_load_slice($slice);
2643
2644							# fetch the overlapping observations
2645							# each observation is [start, stop, score]
2646	0					0	my $result = $useq->{'buffer'}{$slice}->fetch($start - 1, $stop);
2647	0					0	foreach my $r (@$result) {
2648	0					0	foreach my $p ( $r->[0] + 1 .. $r->[1] ) {
2649	0					0	$pos2score{$p} = $r->[2];
2650							}
2651							}
2652							}
2653
2654							# convert positioned scores into an array
2655	0					0	foreach (my $s = $start; $s <= $stop; $s++) {
2656	0	0				0	push @scores, exists $pos2score{$s} ? $pos2score{$s} : undef;
2657							# for Bio::Graphics it is better to store undef than 0
2658							# which can do wonky things with graphs
2659							}
2660							}
2661
2662							# generate the wiggle object
2663	1		50			6	my $strand = $useq->slice_strand( $slices->[0] ) \|\| 0;
2664	1	50				11	return Bio::DB::USeq::Wiggle->new(
		50
2665							-seq_id => $self->seq_id,
2666							-start => $start,
2667							-end => $stop,
2668							-strand => $strand,
2669							-type => $self->type,
2670							-source => $self->source,
2671							-attributes => { 'coverage' => [ \@scores ] },
2672							-name => $strand == 1 ? 'Forward' :
2673							$strand == -1 ? 'Reverse' : q(),
2674							-useq => $useq,
2675							);
2676							}
2677
2678							sub _next_summary {
2679							# this will return one summary Feature, two if separate strands are requested
2680	2			2		4	my $self = shift;
2681
2682							# determine which slices to retrieve
2683	2					2	my $slices = shift @{ $self->{slices} };
	2					4
2684	2	100				6	return unless $slices;
2685
2686							# all of the real statistical work is done elsewhere
2687							# just return the summary object
2688	1		50			5	my $strand = $self->{useq}->slice_strand( $slices->[0] ) \|\| 0;
2689							return Bio::DB::USeq::Summary->new(
2690							-seq_id => $self->seq_id,
2691							-start => $self->start,
2692							-end => $self->end,
2693							-strand => $strand,
2694							-type => $self->type,
2695							-source => $self->source,
2696							-name => $strand == 1 ? 'Forward' :
2697							$strand == -1 ? 'Reverse' : q(),
2698							-useq => $self->{'useq'},
2699							-slices => $slices,
2700							-bin => $self->{bin},
2701							-step => $self->{step},
2702	1	50				6	);
		50
2703							}
2704
2705
2706
2707							package Bio::DB::USeq::Wiggle;
2708	1			1		9	use base 'Bio::DB::USeq::Feature';
	1					2
	1					491
2709							# Wiggle scores are stored in the coverage attribute for backwards
2710							# compatibility with Bio::Graphics.
2711
2712							sub new {
2713	1			1		30	my $class = shift;
2714	1					18	my $wig = $class->SUPER::new(@_);
2715	1					126	my %args = @_;
2716	1	50				5	my $useq = $args{'-useq'} or
2717							die "Bio::DB::USeq::Wiggle cannot be created without a -useq argument";
2718	1					3	$wig->{useq} = $useq;
2719	1					18	return $wig;
2720							}
2721
2722							sub coverage {
2723	1			1		2	my $self = shift;
2724	1					10	my ($coverage) = $self->get_tag_values('coverage');
2725	1	50				85	return wantarray ? @$coverage : $coverage;
2726							}
2727
2728							sub wiggle {
2729	1			1		116	return shift->coverage;
2730							}
2731
2732							# Borrowed from Bio::SeqFeature::Coverage from Bio::DB::Sam
2733							sub gff3_string {
2734	0			0		0	my $self = shift;
2735	0					0	my $gff3 = $self->SUPER::gff3_string(@_);
2736	0					0	my $coverage = $self->escape(join(',',$self->coverage));
2737	0					0	$gff3 =~ s/coverage=[^;]+/coverage=$coverage/g;
2738	0					0	return $gff3;
2739							}
2740
2741							sub statistical_summary {
2742							# this is just for the wiggle scores, not original data
2743							# This is a fake statistical_summary to fool Bio::Graphics into
2744							# calculating chromosome or global statistics like a BigWig adaptor.
2745							# A real statistical_summary call would provide the number of bins,
2746							# so return null if bins is present.
2747							# We could calculate a real statistical summary, but that would be an
2748							# expensive circuitous route after we just collected wiggle scores.
2749							# Better to request the correct feature type in the first place.
2750	0			0		0	my $self = shift;
2751	0					0	my $bins = shift;
2752	0	0	0			0	return if $bins && $bins > 1;
2753
2754							# initialize the statistical scores
2755	0					0	my $count = 0;
2756	0					0	my $sum;
2757							my $sum_squares;
2758	0					0	my $min;
2759	0					0	my $max;
2760
2761							# generate a statistical summary of just the wiggle scores
2762	0					0	foreach my $s ($self->coverage) {
2763	0					0	$count++;
2764	0	0				0	next unless defined $s;
2765	0					0	$sum += $s;
2766	0					0	$sum_squares += $s * $s;
2767	0	0	0			0	$min = $s if (!defined $min or $s < $min);
2768	0	0	0			0	$max = $s if (!defined $max or $s > $max);
2769							}
2770
2771							# return the statistical hash
2772	0		0			0	my %stat = (
			0
			0
			0
2773							'validCount' => $count,
2774							'sumData' => $sum \|\| 0,
2775							'sumSquares' => $sum_squares \|\| 0,
2776							'minVal' => $min \|\| 0,
2777							'maxVal' => $max \|\| 0,
2778							);
2779	0					0	return \%stat;
2780							}
2781
2782							sub get_seq_stream {
2783	0			0		0	my $self = shift;
2784							return Bio::DB::USeq::Iterator->new(
2785							-seq_id => $self->seq_id,
2786							-start => $self->start,
2787							-end => $self->end,
2788							-strand => $self->strand,
2789							-type => 'region',
2790							-source => $self->source,
2791							-useq => $self->{useq},
2792	0					0	);
2793							}
2794
2795
2796
2797
2798							package Bio::DB::USeq::Summary;
2799	1			1		6	use base 'Bio::DB::USeq::Feature';
	1					1
	1					618
2800
2801							sub new {
2802	1			1		22	my $class = shift;
2803	1					7	my $summary = $class->SUPER::new(@_);
2804	1					71	my %args = @_;
2805	1	50				4	my $useq = $args{'-useq'} or
2806							die "Bio::DB::USeq::Summary cannot be created without a -useq argument";
2807	1					2	$summary->{useq} = $useq;
2808	1	50				4	$summary->{slices} = $args{-slices} if exists $args{-slices};
2809	1	50				4	$summary->{bin} = $args{-bin} if exists $args{-bin};
2810	1	50				2	$summary->{step} = $args{-step} if exists $args{-step};
2811	1					6	return $summary;
2812							}
2813
2814							sub statistical_summary {
2815	1			1		107	my $self = shift;
2816	1					2	my $useq = $self->{useq};
2817
2818							# get the number of bins to calculate the statistical summaries
2819	1					2	my $bin = shift;
2820	1	50	33			5	$bin \|\|= $self->{bin} if exists $self->{bin};
2821	1		50			3	$bin \|\|= 1;
2822	1	50				4	my $step = $self->{step} if exists $self->{step};
2823	1		33			11	$step \|\|= $self->length / $bin;
2824
2825							# get the slices
2826	1	50				24	my $slices = $self->{slices} if exists $self->{slices};
2827	1	50				3	unless ($slices) {
2828							# this should already be established, but just in case
2829	0					0	my @a = $useq->_translate_coordinates_to_slices($self->seq_id,
2830							$self->start, $self->end, $self->strand);
2831	0					0	$useq->_clear_buffer(\@a);
2832	0					0	$slices = \@a;
2833							}
2834
2835							# collect the statistical summaries for each bin
2836	1					2	my @summaries;
2837	1					2	for (my $begin = $self->start; $begin < $self->end; $begin += $step) {
2838
2839							# round off coordinates to integers
2840							# beginning point and step may not be integers
2841	10					93	my $s = int($begin + 0.5);
2842	10					309	my $e = int($s + $step - 0.5); # start + step - 1 + 0.5
2843
2844							# collect the scores from each of the requested slices
2845	10	50				20	if (scalar @$slices > 1) {
2846							# more than one slice, identify which subset of slices to collect from
2847							# may or may not be all of the current slices
2848	10					11	my @sub_slices;
2849	10					17	foreach my $slice (@$slices) {
2850	20	50				29	next if $s > $useq->slice_end($slice);
2851	20	100				30	next if $e < $useq->slice_start($slice);
2852	11					19	push @sub_slices, $slice;
2853							}
2854	10					19	push @summaries, $useq->_stat_summary($s, $e, \@sub_slices);
2855							}
2856							else {
2857	0					0	push @summaries, $useq->_stat_summary($s, $e, $slices);
2858							}
2859							}
2860
2861							# return the reference to the statistical summaries
2862	1					19	return \@summaries;
2863							}
2864
2865							sub score {
2866	0			0			my $self = shift;
2867	0						my $a = $self->statistical_summary(1);
2868	0						return $a->[0];
2869							}
2870
2871							sub gff3_string {
2872							# this is going to be a little convoluted, since what we
2873							# really want here is coverage, which is easier to calculate with means,
2874							# rather than doing statistical summaries and calculate means from those
2875	0			0			my $self = shift;
2876							my ($wig) = $self->{useq}->features(
2877							# this should only return one feature because we have specific strand
2878	0						-seq_id => $self->seq_id,
2879							-start => $self->start,
2880							-end => $self->end,
2881							-strand => $self->strand,
2882							-type => 'coverage:1000',
2883							);
2884	0						return $wig->gff3_string;
2885							}
2886
2887							sub get_seq_stream {
2888	0			0			my $self = shift;
2889							return Bio::DB::USeq::Iterator->new(
2890							-seq_id => $self->seq_id,
2891							-start => $self->start,
2892							-end => $self->end,
2893							-strand => $self->strand,
2894							-type => 'region',
2895							-source => $self->source,
2896							-useq => $self->{useq},
2897	0						);
2898							}
2899
2900							=head1 AUTHOR
2901
2902							Timothy J. Parnell, PhD
2903							Dept of Oncological Sciences
2904							Huntsman Cancer Institute
2905							University of Utah
2906							Salt Lake City, UT, 84112
2907
2908							This package is free software; you can redistribute it and/or modify
2909							it under the terms of the Artistic License 2.0.