File Coverage

lib/Bio/Graphics/Wiggle.pm

Criterion	Covered	Total	%
statement	262	421	62.2
branch	58	162	35.8
condition	35	105	33.3
subroutine	48	65	73.8
pod	18	40	45.0
total	421	793	53.0

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Graphics::Wiggle;
2
3							=head1 NAME
4
5							Bio::Graphics::Wiggle -- Binary storage for dense genomic features
6
7							=head1 SYNOPSIS
8
9							# all positions are 1-based
10
11							my $wig = Bio::Graphics::Wiggle->new('./test.wig',
12							$writeable,
13							{ seqid => $seqid,
14							start => $start,
15							step => $step,
16							min => $min,
17							max => $max });
18
19							$wig->erase;
20
21							my $seqid = $wig->seqid('new_id');
22							my $max = $wig->max($new_max);
23							my $min = $wig->min($new_min);
24							my $step = $wig->step($new_step); # data stored at modulus step == 0; all else is blank
25
26							$wig->set_value($position => $value); # store $value at position
27							$wig->set_values($position => \@values); # store array of values at position
28							$wig->set_range($start=>$end,$value); # store the same $value from $start to $end
29
30							my $value = $wig->value($position); # fetch value from position
31							my $values = $wig->values($start,$end); # fetch range of data from $start to $end
32
33							$wig->window(100); # sample window size
34							$wig->smoothing('mean'); # when sampling, compute the mean value across sample window
35							my $values = $wig->values($start,$end,$samples); # fetch $samples data points from $start to $end
36
37
38							=head1 DESCRIPTION
39
40							IMPORTANT NOTE: This implementation is still not right. See
41							http://genomewiki.ucsc.edu/index.php/Wiggle for a more space-efficient
42							implementation.
43
44							This module stores "wiggle" style quantitative genome data for display
45							in a genome browser application. The data for each chromosome (or
46							contig, or other reference sequence) is stored in a single file in the
47							following format:
48
49							256 byte header
50							50 bytes seqid, zero-terminated C string
51							4 byte long integer, value of "step" (explained later)
52							4 byte perl native float, the "min" value
53							4 byte perl native float, the "max" value
54							4 byte long integer, value of "span"
55							4 byte perl native float, the mean
56							4 byte perl native float, the standard deviation
57							2 byte unsigned short, the version number (currently version 0)
58							4 byte long integer, sequence start position (in 0-based coordinates)
59							null padding to 256 bytes for future use
60
61							The remainder of the file consists of 8-bit unsigned scaled integer
62							values. This means that all quantitative data will be scaled to 8-bit
63							precision!
64
65							For a convenient method of creating Wiggle files from UCSC-type WIG
66							input and creating GFF3 output, please see
67							L.
68
69							=head1 METHODS
70
71							=head2 Constructor and Accessors
72
73							=over 4
74
75							=item $wig = Bio::Graphics::Wiggle->new($filename,$writeable,{options})
76
77							Open/create a wiggle-format data file:
78
79							$filename -- path to the file to open/create
80							$writeable -- boolean value indicating whether file is
81							writeable. Missing files will only be created
82							if $writeable set to a true value. If path is
83							empty (undef or empty string) and writeable is true,
84							new() will create a temporary file that will be
85							deleted when the object goes out of scope.
86							{options} -- hash ref of the following named options, only valid
87							when creating a new wig file with $writeable true.
88
89							option name description default
90							----------- ----- -------
91							seqid name/id of sequence empty name
92							min minimum value of data points 0
93							max maximum value of data points 255
94							step interval between data points 1
95							span width of data points value of "step"
96
97							The "step" can be used to create sparse files to save space. By
98							default, step is set to 1, in which case a data value will be stored
99							at each base of the sequence. By setting step to 10, then each value
100							is taken to correspond to 10 bp, and the file will be 10x smaller.
101							For example, consider this step 5 data set:
102
103							1 2 3 4 5 6 7 8 9 10 11 12 13 14
104							20 . . . . 60 . . . . 80 . . .
105
106							We have stored the values "20" "60" and "80" at positions 1, 6 and 11,
107							respectively. When retrieving this data, it will appear as if
108							positions 1 through 5 have a value of 20, positions 6-10 have a value
109							of 60, and positions 11-14 have a value of 80. In the data file, we
110							store, positions 1,6,and 11 in adjacent bytes.
111
112							Note that no locking is performed by this module. If you wish to allow
113							multi-user write access to the databases files, you will need to
114							flock() the files yourself.
115
116							=item $seqid = $wig->seqid(['new_id'])
117
118							=item $max = $wig->max([$new_max])
119
120							=item $min = $wig->min([$new_min])
121
122							=item $step = $wig->step([$new_step])
123
124							=item $span = $wig->span([$new_span])
125
126							=item $mean = $wig->mean([$new_mean]);
127
128							=item $stdev = $wig->stdev([$new_stdev]);
129
130							These accessors get or set the corresponding values. Setting is only
131							allowed if the file was opened for writing. Note that changing the
132							min, max and step after writing data to the file under another
133							parameter set will produce unexpected (and invalid) results, as the
134							existing data is not automatically updated to be consistent.
135
136							=item $trim = $wig->trim([$new_trim]);
137
138							The trim method sets the trimming method, which can be used to trim
139							out extreme values. Three methods are currently supported:
140
141							none No trimming
142							stdev Trim 1 standard deviation above and below mean
143							stdevN Trim N standard deviations above and below the mean
144
145							In "stdevN", any can be any positive integer.
146
147							=back
148
149							=head2 Setting Data
150
151							=over 4
152
153							=item $wig->set_value($position => $value)
154
155							This method sets the value at $position to $value. If a step>1 is in
156							force, then $position will be rounded down to the nearest multiple of
157							step.
158
159							=item $wig->set_range($start=>$end, $value)
160
161							This method sets the value of all bases between $start and $end to
162							$value, honoring step.
163
164							=item $sig->set_values($position => \@values)
165
166							This method writes an array of values into the datababase beginning at
167							$position (or the nearest lower multiple of step). If step>1, then
168							values will be written at step intervals.
169
170							=back
171
172							=head2 Retrieving Data
173
174							=over 4
175
176							=item $value = $wig->value($position)
177
178							Retrieve the single data item at position $position, or the nearest
179							lower multiple of $step if step>1.
180
181							=item $values = $wig->values($start=>$end)
182
183							Retrieve the values in the range $start to $end and return them as an
184							array ref. Note that you will always get an array of size
185							($end-$start+1) even if step>1; the data in between the step intervals
186							will be filled in.
187
188							=item $values = $wig->values($start=>$end,$samples)
189
190							Retrieve a sampling of the values between $start and $end. Nothing
191							very sophisticated is done here; the code simply returns the number of
192							values indicated in $samples, smoothed according to the smoothing
193							method selected (default to "mean"), then selected at even intervals
194							from the range $start to $end. The return value is an arrayref of
195							exactly $samples values.
196
197							=item $string = $wig->export_to_wif($start,$end)
198
199							=item $string = $wig->export_to_wif64($start,$end)
200
201							Export the region from start to end in the "wif" format. This data can
202							later be imported into another Bio::Graphics::Wiggle object. The first
203							version returns a binary string. The second version returns a base64
204							encoded version that is safe for ascii-oriented formata such as GFF3
205							and XML.
206
207							=item $wig->import_from_wif($string)
208
209							=item $wig->import_from_wif64($string)
210
211							Import a wif format data string into the Bio::Graphics::Wiggle
212							object. The first version expects a binary string. The second version
213							expects a base64 encoded version that is safe for ascii-oriented
214							formata such as GFF3 and XML.
215
216							=back
217
218
219							=cut
220
221							# read/write genome tiling data, to be compatible with Jim Kent's WIG format
222	1			1		5	use strict;
	1					2
	1					33
223	1			1		4	use warnings;
	1					2
	1					35
224	1			1		730	use IO::File;
	1					1142
	1					155
225	1			1		8	use Carp 'croak','carp','confess';
	1					2
	1					71
226
227	1			1		5	use constant HEADER_LEN => 256;
	1					2
	1					101
228							# seqid, step, min, max, span, mean, stdev, version, start
229	1			1		6	use constant HEADER => '(Z50LFFLFFSL)@'.HEADER_LEN;
	1					1
	1					51
230	1			1		5	use constant BODY => 'C';
	1					1
	1					105
231	1			1		6	use constant DEBUG => 0;
	1					1
	1					65
232	1			1		10	use constant DEFAULT_SMOOTHING => 'mean';
	1					2
	1					56
233	1			1		5	use constant VERSION => 0;
	1					2
	1					4927
234							our $VERSION = '1.0';
235
236							sub new {
237	2			2	1	1132	my $class = shift;
238	2					5	my ($path,$write,$options) = @_;
239	2		50			7	$path \|\|= ''; # to avoid uninit warning
240	2	50				73	my $mode = $write ? -e $path # if file already exists...
		100
241							? '+<' # ...open for read/write
242							: '+>' # ...else clobber and open a new one
243							: '<'; # read only
244	2					8	my $fh = $class->new_fh($path,$mode);
245	2	50	0			418	$fh or die (($path\|\|'temporary file').": $!");
246
247	2		100			11	$options \|\|= {};
248
249	2		33			26	my $self = bless {fh => $fh,
250							write => $write,
251							dirty => scalar keys %$options
252							}, ref $class \|\| $class;
253
254	2		100			5	my $stored_options = eval {$self->_readoptions} \|\| {};
255	2	50				56	$options->{start}-- if defined $options->{start}; # 1-based ==> 0-based coordinates
256	2					19	my %merged_options = (%$stored_options,%$options);
257							# warn "merged options = ",join ' ',%merged_options;
258	2		50			11	$merged_options{version}\|\|= 0;
259	2		50			5	$merged_options{seqid} \|\|= 'chrUnknown';
260	2		50			6	$merged_options{min} \|\|= 0;
261	2		50			6	$merged_options{max} \|\|= 255;
262	2		50			4	$merged_options{mean} \|\|= 128;
263	2		50			7	$merged_options{stdev} \|\|= 255;
264	2		100			8	$merged_options{trim} \|\|= 'none';
265	2		50			5	$merged_options{step} \|\|= 1;
266	2		50			8	$merged_options{start} \|\|= 0;
267	2		33			5	$merged_options{span} \|\|= $merged_options{step};
268	2					4	$self->{options} = \%merged_options;
269	2	100				8	$self->_do_trim unless $self->trim eq 'none';
270	2					10	return $self;
271							}
272
273							sub new_fh {
274	2			2	0	3	my $self = shift;
275	2					4	my ($path,$mode) = @_;
276	2	50				25	return $path ? IO::File->new($path,$mode)
277							: IO::File->new_tmpfile;
278							}
279
280							sub end {
281	4			4	0	5	my $self = shift;
282	4	100				8	unless (defined $self->{end}) {
283	1					4	my $size = (stat($self->fh))[7];
284	1					2	my $data_len = $size - HEADER_LEN();
285	1	50				3	return unless $data_len>0; # undef end
286	1					3	$self->{end} = ($self->start-1) + $data_len * $self->step;
287							}
288	4					11	return $self->{end};
289							}
290
291	2			2		1140	sub DESTROY { shift->write }
292
293							sub erase {
294	0			0	0	0	my $self = shift;
295	0					0	$self->fh->truncate(HEADER_LEN);
296							}
297
298	7631			7631	0	25213	sub fh { shift->{fh} }
299	3813			3813	0	5063	sub seek { shift->fh->seek(shift,0) }
300	0			0	0	0	sub tell { shift->fh->tell() }
301
302							sub _option {
303	7660			7660		6416	my $self = shift;
304	7660					6351	my $option = shift;
305	7660					8043	my $d = $self->{options}{$option};
306	7660	100				11095	if (@_) {
307	2					3	$self->{dirty}++;
308	2					3	$self->{options}{$option} = shift;
309	2	50	66			12	delete $self->{scale} if $option eq 'min' or $option eq 'max';
310							}
311	7660					9799	return $d;
312							}
313
314	0			0	0	0	sub version { shift->_option('version',@_) }
315	2			2	1	338	sub seqid { shift->_option('seqid',@_) }
316	4			4	1	8	sub min { shift->_option('min',@_) }
317	4			4	1	7	sub max { shift->_option('max',@_) }
318	3818			3818	1	5729	sub step { shift->_option('step',@_) }
319	9			9	1	16	sub span { shift->_option('span',@_) }
320	1			1	1	4	sub mean { shift->_option('mean',@_) }
321	1			1	1	4	sub stdev { shift->_option('stdev',@_) }
322	3			3	1	9	sub trim { shift->_option('trim',@_) }
323
324							sub start { # slightly different because we have to deal with 1 vs 0-based coordinates
325	3818			3818	0	3199	my $self = shift;
326	3818					4856	my $start = $self->_option('start');
327	3818					3586	$start++; # convert into 1-based coordinates
328	3818	50				5380	if (@_) {
329	0					0	my $newstart = shift;
330	0					0	$self->_option('start',$newstart-1); # store in zero-based coordinates
331							}
332	3818					4117	return $start;
333							}
334
335							sub smoothing {
336	0			0	0	0	my $self = shift;
337	0		0			0	my $d = $self->{smoothing} \|\| DEFAULT_SMOOTHING;
338	0	0				0	$self->{smoothing} = shift if @_;
339	0					0	$d;
340							}
341
342							sub write {
343	3			3	0	5	my $self = shift;
344	3	50	66			240	if ($self->{dirty} && $self->{write}) {
345	1					6	$self->_writeoptions($self->{options});
346	1					10	undef $self->{dirty};
347	1					2	$self->fh->flush;
348							}
349							}
350
351							sub _readoptions {
352	2			2		3	my $self = shift;
353	2					7	my $fh = $self->fh;
354	2					5	my $header;
355	2					25	$fh->seek(0,0);
356	2	100				23	return unless $fh->read($header,HEADER_LEN) == HEADER_LEN;
357	1					19	return $self->_parse_header($header);
358							}
359
360							sub _parse_header {
361	1			1		2	my $self = shift;
362	1					3	my $header = shift;
363	1					12	my ($seqid,$step,$min,$max,$span,
364							$mean,$stdev,$version,$start) = unpack(HEADER,$header);
365	1					11	return { seqid => $seqid,
366							step => $step,
367							span => $span,
368							min => $min,
369							max => $max,
370							mean => $mean,
371							stdev => $stdev,
372							version => $version,
373							start => $start,
374							};
375							}
376
377							sub _generate_header {
378	1			1		2	my $self = shift;
379	1					1	my $options = shift;
380	1					3	return pack(HEADER,@{$options}{qw(seqid step min max span mean stdev version start)});
	1					16
381							}
382
383							sub _writeoptions {
384	1			1		2	my $self = shift;
385	1					2	my $options = shift;
386	1					3	my $fh = $self->fh;
387	1					5	my $header = $self->_generate_header($options);
388	1					6	$fh->seek(0,0);
389	1	50				56	$fh->print($header) or die "write failed: $!";
390							}
391
392							sub _do_trim {
393	1			1		2	my $self = shift;
394
395							# don't trim if there is no score range
396	1	50				4	($self->max - $self->min) or return;
397
398	1					4	my $trim = lc $self->trim;
399	1					2	my ($method,$arg);
400	1	50				9	if ($trim =~ /([a-z]+)(\d+)/) {
401	1					6	$method = "_trim_${1}";
402	1					4	$arg = $2;
403							}
404							else {
405	0					0	$method = "_trim_${trim}";
406							}
407	1	50				6	unless ($self->can($method)) {
408	0					0	carp "invalid trim method $trim";
409	0					0	return;
410							}
411
412	1					4	$self->$method($arg);
413							}
414
415							# trim n standard deviations from the mean
416							sub _trim_stdev {
417	1			1		2	my $self = shift;
418	1		50			3	my $factor = shift \|\| 1;
419	1					3	my $mean = $self->mean;
420	1					3	my $stdev = $self->stdev * $factor;
421	1	50				2	my $min = $self->min > $mean - $stdev ? $self->min : $mean - $stdev;
422	1	50				4	my $max = $self->max < $mean + $stdev ? $self->max : $mean + $stdev;
423	1					1	warn "_trim_stdev (* $factor) : setting min to $min, max to $max (was ",$self->min,',',$self->max,')'
424							if DEBUG;
425	1					3	$self->min($min);
426	1					2	$self->max($max);
427							}
428
429							sub set_value {
430	3809			3809	1	3566	my $self = shift;
431	3809	50				6543	croak "usage: \$wig->set_value(\$position => \$value)"
432							unless @_ == 2;
433	3809					5789	$self->value(@_);
434							}
435
436							sub set_range {
437	0			0	1	0	my $self = shift;
438	0	0				0	croak "usage: \$wig->set_range(\$start_position => \$end_position, \$value)"
439							unless @_ == 3;
440	0					0	$self->value(@_);
441							}
442
443							sub value {
444	3809			3809	1	3288	my $self = shift;
445	3809					3266	my $position = shift;
446
447	3809					4909	my $offset = $self->_calculate_offset($position);
448	3809	50				6804	$offset >= HEADER_LEN or die "Tried to retrieve data from before start position";
449	3809	50				5691	$self->seek($offset) or die "Seek failed: $!";
450
451	3809	50				65624	if (@_ == 2) {
		50
452	0					0	my $end = shift;
453	0					0	my $new_value = shift;
454	0					0	my $step = $self->step;
455	0					0	my $scaled_value = $self->scale($new_value);
456	0	0				0	$self->fh->print(pack('C*',($scaled_value)x(($end-$position+1)/$step))) or die "Write failed: $!";
457	0	0	0			0	$self->{end} = $end if !exists $self->{end} \|\| $self->{end} < $end;
458							}
459
460							elsif (@_==1) {
461	3809					4299	my $new_value = shift;
462	3809					6428	my $scaled_value = $self->scale($new_value);
463	3809	50				5529	$self->fh->print(pack('C*',$scaled_value)) or die "Write failed: $!";
464	3809	50	66			34923	$self->{end} = $position if !exists $self->{end} \|\| $self->{end} < $position;
465	3809					7451	return $new_value;
466							}
467
468							else { # retrieving data
469	0					0	my $buffer;
470	0	0				0	$self->fh->read($buffer,1) or die "Read failed: $!";
471	0					0	my $scaled_value = unpack('C*',$buffer);
472
473							# missing data, so look back at most span values to get it
474	0	0	0			0	if ($scaled_value == 0 && (my $span = $self->span) > 1) {
475	0					0	$offset = $self->_calculate_offset($position-$span+1);
476	0	0				0	$offset >= HEADER_LEN or die "Tried to retrieve data from before start position";
477	0	0				0	$self->seek($offset) or die "Seek failed: $!";
478
479	0					0	$self->fh->read($buffer,$span/$self->step);
480	0					0	for (my $i=length($buffer)-2;$i>=0;$i--) {
481	0					0	my $val = substr($buffer,$i,1);
482	0	0				0	next if $val eq "\0";
483	0					0	$scaled_value = unpack('C*',$val);
484	0					0	last;
485							}
486
487							}
488	0					0	return $self->unscale($scaled_value);
489							}
490							}
491
492							sub _calculate_offset {
493	3813			3813		3591	my $self = shift;
494	3813					3019	my $position = shift;
495	3813					5983	my $step = $self->step;
496	3813					5451	my $start = $self->start;
497	3813					8629	return HEADER_LEN + int(($position-$start)/$step);
498							}
499
500							sub set_values {
501	0			0	1	0	my $self = shift;
502	0	0	0			0	croak "usage: \$wig->set_values(\$position => \@values)"
503							unless @_ == 2 and ref $_[1] eq 'ARRAY';
504	0					0	$self->values(@_);
505							}
506
507							# read or write a series of values
508							sub values {
509	4			4	1	6	my $self = shift;
510	4					3	my $start = shift;
511	4	50	33			15	if (ref $_[0] && ref $_[0] eq 'ARRAY') {
512	0					0	$self->_store_values($start,@_);
513							} else {
514	4					14	$self->_retrieve_values($start,@_);
515							}
516							}
517
518							sub export_to_wif64 {
519	0			0	1	0	my $self = shift;
520	0					0	my $data = $self->export_to_wif(@_);
521	0	0				0	eval "require MIME::Base64"
522							unless MIME::Base64->can('encode_base64');
523	0					0	return MIME::Base64::encode_base64($data);
524							}
525							sub import_from_wif64 {
526	0			0	1	0	my $self = shift;
527	0					0	my $data = shift;
528
529	0	0				0	eval "require MIME::Base64"
530							unless MIME::Base64->can('decode_base64');
531	0					0	return $self->import_from_wif(MIME::Base64::decode_base64($data));
532							}
533
534							# subregion in "wiggle interchange format" (wif)
535							sub export_to_wif {
536	0			0	1	0	my $self = shift;
537	0					0	my ($start,$end) = @_;
538
539							# get the 256 byte header
540	0					0	my $data = $self->_generate_header($self->{options});
541
542							# add the range to the data (8 bytes overhead)
543	0					0	$data .= pack("L",$start);
544	0					0	$data .= pack("L",$end);
545
546							# add the packed data for this range
547	0					0	$data .= $self->_retrieve_packed_range($start,$end-$start+1,$self->step);
548	0					0	return $data;
549							}
550
551							sub export_to_bedgraph {
552	1			1	0	613	my $self = shift;
553	1					3	my ($start,$end,$fh) = @_;
554	1					2	my $max_range = 100_000;
555
556	1		50			4	$start \|\|= 1;
557	1		33			3	$end \|\|= $self->end;
558
559	1					2	my $lines;
560	1					4	for (my $s=$start;$s<$end;$s+=$max_range) {
561	1					3	my $e = $s + $max_range - 1;
562	1	50				4	$e = $end if $e > $end;
563	1					4	my $b = $self->values($s,$e);
564	1					8	$lines .= $self->_bedgraph_lines($s,$b,$fh);
565							}
566
567	1					8	return $lines;
568							}
569
570							sub _bedgraph_lines {
571	1			1		3	my $self = shift;
572	1					3	my ($start,$values,$fh) = @_;
573	1					6	my $seqid = $self->seqid;
574	1					1	my $result;
575
576	1					2	my ($last_val,$last_start,$end);
577	1					2	$last_start = $start-1; # 0 based indexing
578	1					7	for (my $i=0;$i<@$values;$i++) {
579	5000					3603	my $v = $values->[$i];
580
581	5000	100				6165	if (!defined $v) {
582	2168	100				2587	if (defined $last_val) {
583	56					103	$result .= $self->_append_or_print_bedgraph($fh,$seqid,$last_start,$start+$i-1,$last_val);
584	56					61	undef $last_val;
585							}
586	2168					1418	$last_start = $start+$i;
587	2168					3045	next;
588							}
589
590	2832	50	66			7524	if (defined $last_val && $last_val != $v) {
591	0					0	$result .= $self->_append_or_print_bedgraph($fh,$seqid,$last_start,$start+$i-1,$last_val);
592	0					0	$last_start = $start+$i-1;
593							}
594
595	2832					1856	$last_val = $v;
596	2832					4046	$end = $start+$i-1;
597							}
598	1	50				12	$result .= $self->_append_or_print_bedgraph($fh,$seqid,$last_start,$end+1,$last_val) if $last_val;
599	1					85	return $result;
600							}
601
602							sub _append_or_print_bedgraph {
603	57			57		49	my $self = shift;
604	57					64	my ($fh,$seqid,$start,$end,$val) = @_;
605	57					309	my $data = join("\t",$seqid,$start,$end,sprintf("%.2f",$val))."\n";
606	57	50				79	if ($fh) {
607	0					0	print $fh $data;
608	0					0	return '';
609							} else {
610	57					103	return $data;
611							}
612							}
613
614							sub import_from_wif {
615	0			0	1	0	my $self = shift;
616	0					0	my $wifdata = shift;
617
618							# BUG: should check that header is compatible
619	0					0	my $header = substr($wifdata,0,HEADER_LEN);
620	0					0	my $start = unpack('L',substr($wifdata,HEADER_LEN, 4));
621	0					0	my $end = unpack('L',substr($wifdata,HEADER_LEN+4,4));
622
623	0					0	my $options = $self->_parse_header($header);
624	0		0			0	my $stored_options = eval {$self->_readoptions} \|\| {};
625	0					0	my %merged_options = (%$stored_options,%$options);
626	0					0	$self->{options} = \%merged_options;
627	0					0	$self->{dirty}++;
628
629							# write the data
630	0					0	$self->seek($self->_calculate_offset($start));
631	0	0				0	$self->fh->print(substr($wifdata,HEADER_LEN+8)) or die "write failed: $!";
632	0	0	0			0	$self->{end} = $end if !defined $self->{end} or $self->{end} < $end;
633							}
634
635							sub _retrieve_values {
636	4			4		3	my $self = shift;
637	4					7	my ($start,$end,$samples) = @_;
638
639	4					6	my $data_start = $self->start;
640	4					10	my $step = $self->step;
641	4					9	my $span = $self->span;
642
643	4	50				11	croak "Value of start position ($start) is less than data start of $data_start"
644							unless $start >= $data_start;
645	4	50				9	croak "Value of end position ($end) is greater than data end of ",$self->end+$span,
646							unless $end <= $self->end + $span;
647
648							# generate list of positions to sample from
649	4					7	my $length = $end-$start+1;
650	4		33			14	$samples \|\|= $length;
651
652							# warn "samples = $samples, length=$length, span=$span, step=$step";
653
654							# if the length is grossly greater than the samples, then we won't even
655							# bother fetching all the data, but just sample into the disk file
656	4	50	33			13	if ($length/$samples > 100 && $step == 1) {
657	0					0	my @result;
658							# my $window = 20*($span/$step);
659	0					0	my $interval = $length/$samples;
660							# my $window = 100*$interval/$span;
661	0					0	my $window = $interval/2;
662							# warn "window = $window, interval = $interval";
663	0					0	for (my $i=0;$i<$samples;$i++) {
664	0					0	my $packed_data = $self->_retrieve_packed_range(int($start+$i*$interval-$window),
665							int($window),
666							$step);
667	0					0	my @bases= grep {$_} unpack('C*',$packed_data);
	0					0
668	0	0				0	if (@bases) {
669	0					0	local $^W = 0;
670	0					0	my $arry = $self->unscale(\@bases);
671	0					0	my $n = @$arry;
672	0					0	my $total = 0;
673	0					0	$total += $_ foreach @$arry;
674	0					0	my $mean = $total/$n;
675	0					0	my $max;
676	0	0	0			0	for (@$arry) { $max = $_ if !defined $max \|\| $max < $_ }
	0					0
677							# warn $start+$i*$interval,': ',join(',',map {int($_)} @$arry),
678							# " mean = $mean, max = $max";
679							# push @result,$mean;
680	0					0	push @result,$max;
681							} else {
682	0					0	push @result,0;
683							}
684							}
685	0					0	return \@result;
686							}
687
688	4					8	my $packed_data = $self->_retrieve_packed_range($start,$length,$step);
689
690	4					5	my @bases;
691	4					44	$#bases = $length-1;
692
693	4	50				8	if ($step == $span) {
694							# in this case, we do not have any partially-empty
695							# steps, so can operate on the step-length data structure
696							# directly
697	0					0	@bases = unpack('C*',$packed_data);
698							}
699
700							else {
701							# In this case some regions may have partially missing data,
702							# so we create an array equal to the length of the requested region,
703							# fill it in, and then sample it
704	4					11	for (my $i=0; $i
705	5202					3698	my $index = $i * $step;
706	5202					4845	my $value = unpack('C',substr($packed_data,$i,1));
707	5202	100				10143	next unless $value; # ignore 0 values
708	61					628	@bases[$index..$index+$span-1] = ($value) x $span;
709							}
710	4					14	$#bases = $length-1;
711							}
712
713	4					17	my $r = $self->unscale(\@bases);
714	4					15	$r = $self->sample($r,$samples);
715	4	50	33			108	$r = $self->smooth($r,$self->window * $samples/@bases)
716							if defined $self->window && $self->window>1;
717	4					84	return $r;
718							}
719
720							sub _retrieve_packed_range {
721	4			4		5	my $self = shift;
722	4					5	my ($start,$length,$step) = @_;
723	4					7	my $span = $self->span;
724
725	4					7	my $offset = $self->_calculate_offset($start);
726
727	4					8	$self->seek($offset);
728	4					139	my $packed_data;
729	4					8	$self->fh->read($packed_data,$length/$step);
730
731							# pad data up to required amount
732	4	50				63	$packed_data .= "\0" x ($length/$step-length($packed_data))
733							if length $packed_data < $length/$step;
734	4					9	return $packed_data;
735							}
736
737
738							sub sample {
739	4			4	0	5	my $self = shift;
740	4					10	my ($values,$samples) = @_;
741	4					5	my $length = @$values;
742	4					7	my $window_size = $length/$samples;
743
744	4					3	my @samples;
745	4					43	$#samples = $samples-1;
746
747	4	50				11	if ($window_size < 2) { # no data smoothing needed
748	4					56	@samples = map { $values->[$_*$window_size] } (0..$samples-1);
	5202					4985
749							}
750							else {
751	0					0	my $smoothsub = $self->smoothsub;
752	0					0	for (my $i=0; $i<$samples; $i++) {
753	0					0	my $start = $i * $window_size;
754	0					0	my $end = $start + $window_size - 1;
755	0					0	my @window = @{$values}[$start..$end];
	0					0
756
757	0					0	my $value = $smoothsub->(\@window);
758	0					0	$samples[$i] = $value;
759							}
760							}
761
762	4					271	return \@samples;
763							}
764
765							sub smoothsub {
766	0			0	0	0	my $self = shift;
767
768	0					0	my $smoothing = $self->smoothing;
769	0	0				0	my $smoothsub = $smoothing eq 'mean' ? \&sample_mean
		0
		0
		0
770							:$smoothing eq 'max' ? \&sample_max
771							:$smoothing eq 'min' ? \&sample_min
772							:$smoothing eq 'none' ? \&sample_center
773							:croak("invalid smoothing type '$smoothing'");
774	0					0	return $smoothsub;
775							}
776
777							sub smooth {
778	0			0	0	0	my ($self,$data,$window) = @_;
779
780	0					0	my $smoothing = $self->smoothing;
781	0		0			0	$window \|\|= $self->window;
782
783	0	0	0			0	return $data if $smoothing eq 'none' \|\| $window < 2;
784
785	0					0	my @data = @$data;
786	0					0	my $smoother = $self->smoothsub;
787	0	0				0	$window++ unless $window % 2;
788	0					0	my $offset = int($window/2);
789
790	0					0	for (my $i=$offset; $i<@$data-$offset; $i++) {
791	0					0	my $start = $i - $offset;
792	0					0	my $end = $i + $offset;
793	0					0	my @subset = @data[$start..$end];
794	0					0	$data->[$i] = $smoother->(\@subset);
795							}
796
797	0					0	return $data;
798							}
799
800							sub window {
801	4			4	0	5	my $self = shift;
802	4					8	my $d = $self->{window};
803	4	50				12	$self->{window} = shift if @_;
804	4					15	$d;
805							}
806
807							sub sample_mean {
808	0			0	0	0	my $values = shift;
809	0					0	my ($total,$items);
810	0					0	for my $v (@$values) {
811	0	0				0	next unless defined $v;
812	0					0	$items++;
813	0					0	$total+=$v;
814							}
815	0	0				0	return $items ? $total/$items : undef;
816							}
817
818							sub sample_max {
819	0			0	0	0	my $values = shift;
820	0					0	my $max;
821	0					0	for my $v (@$values) {
822	0	0				0	next unless defined $v;
823	0	0	0			0	$max = $v if !defined $max or $max < $v;
824							}
825	0					0	return $max;
826							}
827
828							sub sample_min {
829	0			0	0	0	my $values = shift;
830	0					0	my $min;
831	0					0	for my $v (@$values) {
832	0	0				0	next unless defined $v;
833	0	0	0			0	$min = $v if !defined $min or $min > $v;
834							}
835	0					0	return $min;
836							}
837
838							sub sample_center {
839	0			0	0	0	my $values = shift;
840	0					0	return $values->[@$values/2];
841							}
842
843							sub _store_values {
844	0			0		0	my $self = shift;
845	0					0	my ($position,$data) = @_;
846
847							# where does data start
848	0					0	my $offset = $self->_calculate_offset($position);
849	0					0	my $fh = $self->fh;
850	0					0	my $step = $self->step;
851
852	0					0	my $scaled = $self->scale($data);
853
854	0					0	$self->seek($offset);
855	0					0	my $packed_data = pack('C*',@$scaled);
856	0	0				0	$fh->print($packed_data) or die "Write failed: $!";
857
858	0					0	my $new_end = $position+@$data-1;
859	0	0	0			0	$self->{end} = $new_end if !exists $self->{end} \|\| $self->{end} < $new_end;
860							}
861
862							# zero means "no data"
863							# everything else is scaled from 1-255
864							sub scale {
865	3809			3809	0	3427	my $self = shift;
866	3809					3328	my $values = shift;
867	3809					4838	my $scale = $self->_get_scale;
868	3809					4888	my $min = $self->{options}{min};
869	3809	50	33			7720	if (ref $values && ref $values eq 'ARRAY') {
870							my @return = map {
871	0					0	my $i = ($_ - $min)/$scale;
	0					0
872	0					0	my $v = 1 + int($i+0.5*($i<=>0)); # avoid call to round()
873	0	0				0	$v = 1 if $v < 1;
874	0	0				0	$v = 255 if $v > 255;
875	0					0	$v;
876							} @$values;
877	0					0	return \@return;
878							} else {
879	3809					9771	my $v = 1 + round (($values - $min)/$scale);
880	3809	50				6490	$v = 1 if $v < 1;
881	3809	50				5453	$v = 255 if $v > 255;
882	3809					5958	return $v;
883							}
884							}
885
886							sub unscale {
887	4			4	0	5	my $self = shift;
888	4					4	my $values = shift;
889	4					9	my $scale = $self->_get_scale;
890	4					8	my $min = $self->{options}{min};
891
892	4	50	33			28	if (ref $values && ref $values eq 'ARRAY') {
893	4	100				133	my @return = map {$_ ? (($_-1) * $scale + $min) : undef} @$values;
	5202					6152
894	4					110	return \@return;
895							} else {
896	0	0				0	return $values ? ($values-1) * $scale + $min : undef;
897							}
898							}
899
900							sub _get_scale {
901	3813			3813		3388	my $self = shift;
902	3813	100				6844	unless ($self->{scale}) {
903	2					4	my $min = $self->{options}{min};
904	2					4	my $max = $self->{options}{max};
905	2		50			7	my $range = $max - $min \|\| 0.001; # can't be zero!
906	2					5	$self->{scale} = $range/254;
907							}
908	3813					5149	return $self->{scale};
909							}
910
911							sub round {
912	3809			3809	0	7714	return int($_[0]+0.5*($_[0]<=>0));
913							}
914
915
916							1;
917
918							__END__