File Coverage

lib/Bio/Graphics/Wiggle.pm

Criterion	Covered	Total	%
statement	262	421	62.2
branch	59	162	36.4
condition	35	105	33.3
subroutine	48	65	73.8
pod	18	40	45.0
total	422	793	53.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Graphics::Wiggle;
2
3							=head1 NAME
4
5							Bio::Graphics::Wiggle -- Binary storage for dense genomic features
6
7							=head1 SYNOPSIS
8
9							# all positions are 1-based
10
11							my $wig = Bio::Graphics::Wiggle->new('./test.wig',
12							$writeable,
13							{ seqid => $seqid,
14							start => $start,
15							step => $step,
16							min => $min,
17							max => $max });
18
19							$wig->erase;
20
21							my $seqid = $wig->seqid('new_id');
22							my $max = $wig->max($new_max);
23							my $min = $wig->min($new_min);
24							my $step = $wig->step($new_step); # data stored at modulus step == 0; all else is blank
25
26							$wig->set_value($position => $value); # store $value at position
27							$wig->set_values($position => \@values); # store array of values at position
28							$wig->set_range($start=>$end,$value); # store the same $value from $start to $end
29
30							my $value = $wig->value($position); # fetch value from position
31							my $values = $wig->values($start,$end); # fetch range of data from $start to $end
32
33							$wig->window(100); # sample window size
34							$wig->smoothing('mean'); # when sampling, compute the mean value across sample window
35							my $values = $wig->values($start,$end,$samples); # fetch $samples data points from $start to $end
36
37
38							=head1 DESCRIPTION
39
40							IMPORTANT NOTE: This implementation is still not right. See
41							http://genomewiki.ucsc.edu/index.php/Wiggle for a more space-efficient
42							implementation.
43
44							This module stores "wiggle" style quantitative genome data for display
45							in a genome browser application. The data for each chromosome (or
46							contig, or other reference sequence) is stored in a single file in the
47							following format:
48
49							256 byte header
50							50 bytes seqid, zero-terminated C string
51							4 byte long integer, value of "step" (explained later)
52							4 byte perl native float, the "min" value
53							4 byte perl native float, the "max" value
54							4 byte long integer, value of "span"
55							4 byte perl native float, the mean
56							4 byte perl native float, the standard deviation
57							2 byte unsigned short, the version number (currently version 0)
58							4 byte long integer, sequence start position (in 0-based coordinates)
59							null padding to 256 bytes for future use
60
61							The remainder of the file consists of 8-bit unsigned scaled integer
62							values. This means that all quantitative data will be scaled to 8-bit
63							precision!
64
65							For a convenient method of creating Wiggle files from UCSC-type WIG
66							input and creating GFF3 output, please see
67							L<Bio::Graphics::Wiggle::Loader>.
68
69							=head1 METHODS
70
71							=head2 Constructor and Accessors
72
73							=over 4
74
75							=item $wig = Bio::Graphics::Wiggle->new($filename,$writeable,{options})
76
77							Open/create a wiggle-format data file:
78
79							$filename -- path to the file to open/create
80							$writeable -- boolean value indicating whether file is
81							writeable. Missing files will only be created
82							if $writeable set to a true value. If path is
83							empty (undef or empty string) and writeable is true,
84							new() will create a temporary file that will be
85							deleted when the object goes out of scope.
86							{options} -- hash ref of the following named options, only valid
87							when creating a new wig file with $writeable true.
88
89							option name description default
90							----------- ----- -------
91							seqid name/id of sequence chrUnknown
92							min minimum value of data points 0
93							max maximum value of data points 255
94							step interval between data points 1
95							span width of data points value of "step"
96
97							The "step" can be used to create sparse files to save space. By
98							default, step is set to 1, in which case a data value will be stored
99							at each base of the sequence. By setting step to 10, then each value
100							is taken to correspond to 10 bp, and the file will be 10x smaller.
101							For example, consider this step 5 data set:
102
103							1 2 3 4 5 6 7 8 9 10 11 12 13 14
104							20 . . . . 60 . . . . 80 . . .
105
106							We have stored the values "20" "60" and "80" at positions 1, 6 and 11,
107							respectively. When retrieving this data, it will appear as if
108							positions 1 through 5 have a value of 20, positions 6-10 have a value
109							of 60, and positions 11-14 have a value of 80. In the data file, we
110							store positions 1, 6, and 11 in adjacent bytes.
111
112							Note that no locking is performed by this module. If you wish to allow
113							multi-user write access to the database files, you will need to
114							flock() the files yourself.
115
116							=item $seqid = $wig->seqid(['new_id'])
117
118							=item $max = $wig->max([$new_max])
119
120							=item $min = $wig->min([$new_min])
121
122							=item $step = $wig->step([$new_step])
123
124							=item $span = $wig->span([$new_span])
125
126							=item $mean = $wig->mean([$new_mean]);
127
128							=item $stdev = $wig->stdev([$new_stdev]);
129
130							These accessors get or set the corresponding values. Setting is only
131							allowed if the file was opened for writing. Note that changing the
132							min, max and step after writing data to the file under another
133							parameter set will produce unexpected (and invalid) results, as the
134							existing data is not automatically updated to be consistent.
135
136							=item $trim = $wig->trim([$new_trim]);
137
138							The trim method sets the trimming method, which can be used to trim
139							out extreme values. Three methods are currently supported:
140
141							none No trimming
142							stdev Trim 1 standard deviation above and below mean
143							stdevN Trim N standard deviations above and below the mean
144
145							In "stdevN", N can be any positive integer.
146
147							=back
148
149							=head2 Setting Data
150
151							=over 4
152
153							=item $wig->set_value($position => $value)
154
155							This method sets the value at $position to $value. If a step>1 is in
156							force, then $position will be rounded down to the nearest multiple of
157							step.
158
159							=item $wig->set_range($start=>$end, $value)
160
161							This method sets the value of all bases between $start and $end to
162							$value, honoring step.
163
164							=item $wig->set_values($position => \@values)
165
166							This method writes an array of values into the database beginning at
167							$position (or the nearest lower multiple of step). If step>1, then
168							values will be written at step intervals.
169
170							=back
171
172							=head2 Retrieving Data
173
174							=over 4
175
176							=item $value = $wig->value($position)
177
178							Retrieve the single data item at position $position, or the nearest
179							lower multiple of $step if step>1.
180
181							=item $values = $wig->values($start=>$end)
182
183							Retrieve the values in the range $start to $end and return them as an
184							array ref. Note that you will always get an array of size
185							($end-$start+1) even if step>1; the data in between the step intervals
186							will be filled in.
187
188							=item $values = $wig->values($start=>$end,$samples)
189
190							Retrieve a sampling of the values between $start and $end. Nothing
191							very sophisticated is done here; the code simply returns the number of
192							values indicated in $samples, smoothed according to the smoothing
193							method selected (default to "mean"), then selected at even intervals
194							from the range $start to $end. The return value is an arrayref of
195							exactly $samples values.
196
197							=item $string = $wig->export_to_wif($start,$end)
198
199							=item $string = $wig->export_to_wif64($start,$end)
200
201							Export the region from start to end in the "wif" format. This data can
202							later be imported into another Bio::Graphics::Wiggle object. The first
203							version returns a binary string. The second version returns a base64
204							encoded version that is safe for ascii-oriented formats such as GFF3
205							and XML.
206
207							=item $wig->import_from_wif($string)
208
209							=item $wig->import_from_wif64($string)
210
211							Import a wif format data string into the Bio::Graphics::Wiggle
212							object. The first version expects a binary string. The second version
213							expects a base64 encoded version that is safe for ascii-oriented
214							formats such as GFF3 and XML.
215
216							=back
217
218
219							=cut
220
221							# read/write genome tiling data, to be compatible with Jim Kent's WIG format
222	1			1		5	use strict;
	1					1
	1					34
223	1			1		6	use warnings;
	1					2
	1					31
224	1			1		910	use IO::File;
	1					1156
	1					153
225	1			1		7	use Carp 'croak','carp','confess';
	1					1
	1					55
226
227	1			1		6	use constant HEADER_LEN => 256;
	1					2
	1					79
228							# seqid, step, min, max, span, mean, stdev, version, start
229	1			1		5	use constant HEADER => '(Z50LFFLFFSL)@'.HEADER_LEN;
	1					1
	1					39
230	1			1		5	use constant BODY => 'C';
	1					2
	1					69
231	1			1		10	use constant DEBUG => 0;
	1					2
	1					52
232	1			1		5	use constant DEFAULT_SMOOTHING => 'mean';
	1					2
	1					36
233	1			1		5	use constant VERSION => 0;
	1					2
	1					5306
234							our $VERSION = '1.0';
235
236							sub new {
237	2			2	1	6	my $class = shift;
238	2					6	my ($path,$write,$options) = @_;
239	2		50			7	$path \|\|= ''; # to avoid uninit warning
240	2	50				152	my $mode = $write ? -e $path # if file already exists...
		100
241							? '+<' # ...open for read/write
242							: '+>' # ...else clobber and open a new one
243							: '<'; # read only
244	2					13	my $fh = $class->new_fh($path,$mode);
245	2	50	0			370	$fh or die (($path\|\|'temporary file').": $!");
246
247	2		100			10	$options \|\|= {};
248
249	2		33			32	my $self = bless {fh => $fh,
250							write => $write,
251							dirty => scalar keys %$options
252							}, ref $class \|\| $class;
253
254	2		100			5	my $stored_options = eval {$self->_readoptions} \|\| {};
255	2	50				49	$options->{start}-- if defined $options->{start}; # 1-based ==> 0-based coordinates
256	2					19	my %merged_options = (%$stored_options,%$options);
257							# warn "merged options = ",join ' ',%merged_options;
258	2		50			14	$merged_options{version}\|\|= 0;
259	2		50			9	$merged_options{seqid} \|\|= 'chrUnknown';
260	2		50			7	$merged_options{min} \|\|= 0;
261	2		50			7	$merged_options{max} \|\|= 255;
262	2		50			6	$merged_options{mean} \|\|= 128;
263	2		50			52	$merged_options{stdev} \|\|= 255;
264	2		100			10	$merged_options{trim} \|\|= 'none';
265	2		50			10	$merged_options{step} \|\|= 1;
266	2		50			12	$merged_options{start} \|\|= 0;
267	2		33			7	$merged_options{span} \|\|= $merged_options{step};
268	2					8	$self->{options} = \%merged_options;
269	2	100				8	$self->_do_trim unless $self->trim eq 'none';
270	2					12	return $self;
271							}
272
273							sub new_fh {
274	2			2	0	4	my $self = shift;
275	2					5	my ($path,$mode) = @_;
276	2	50				23	return $path ? IO::File->new($path,$mode)
277							: IO::File->new_tmpfile;
278							}
279
280							sub end {
281	4			4	0	5	my $self = shift;
282	4	100				13	unless (defined $self->{end}) {
283	1					4	my $size = (stat($self->fh))[7];
284	1					4	my $data_len = $size - HEADER_LEN();
285	1	50				4	return unless $data_len>0; # undef end
286	1					4	$self->{end} = ($self->start-1) + $data_len * $self->step;
287							}
288	4					15	return $self->{end};
289							}
290
291	2			2		5565	sub DESTROY { shift->write }
292
293							sub erase {
294	0			0	0	0	my $self = shift;
295	0					0	$self->fh->truncate(HEADER_LEN);
296							}
297
298	7631			7631	0	31428	sub fh { shift->{fh} }
299	3813			3813	0	5916	sub seek { shift->fh->seek(shift,0) }
300	0			0	0	0	sub tell { shift->fh->tell() }
301
302							sub _option {
303	7660			7660		8763	my $self = shift;
304	7660					9325	my $option = shift;
305	7660					14091	my $d = $self->{options}{$option};
306	7660	100				14676	if (@_) {
307	2					5	$self->{dirty}++;
308	2					4	$self->{options}{$option} = shift;
309	2	50	66			11	delete $self->{scale} if $option eq 'min' or $option eq 'max';
310							}
311	7660					14806	return $d;
312							}
313
314	0			0	0	0	sub version { shift->_option('version',@_) }
315	2			2	1	633	sub seqid { shift->_option('seqid',@_) }
316	4			4	1	9	sub min { shift->_option('min',@_) }
317	4			4	1	10	sub max { shift->_option('max',@_) }
318	3818			3818	1	7471	sub step { shift->_option('step',@_) }
319	9			9	1	25	sub span { shift->_option('span',@_) }
320	1			1	1	4	sub mean { shift->_option('mean',@_) }
321	1			1	1	4	sub stdev { shift->_option('stdev',@_) }
322	3			3	1	11	sub trim { shift->_option('trim',@_) }
323
324							sub start { # slightly different because we have to deal with 1 vs 0-based coordinates
325	3818			3818	0	4775	my $self = shift;
326	3818					6633	my $start = $self->_option('start');
327	3818					4607	$start++; # convert into 1-based coordinates
328	3818	50				7061	if (@_) {
329	0					0	my $newstart = shift;
330	0					0	$self->_option('start',$newstart-1); # store in zero-based coordinates
331							}
332	3818					5792	return $start;
333							}
334
335							sub smoothing {
336	0			0	0	0	my $self = shift;
337	0		0			0	my $d = $self->{smoothing} \|\| DEFAULT_SMOOTHING;
338	0	0				0	$self->{smoothing} = shift if @_;
339	0					0	$d;
340							}
341
342							sub write {
343	3			3	0	6	my $self = shift;
344	3	100	66			261	if ($self->{dirty} && $self->{write}) {
345	1					6	$self->_writeoptions($self->{options});
346	1					12	undef $self->{dirty};
347	1					4	$self->fh->flush;
348							}
349							}
350
351							sub _readoptions {
352	2			2		5	my $self = shift;
353	2					7	my $fh = $self->fh;
354	2					4	my $header;
355	2					22	$fh->seek(0,0);
356	2	100				33	return unless $fh->read($header,HEADER_LEN) == HEADER_LEN;
357	1					46	return $self->_parse_header($header);
358							}
359
360							sub _parse_header {
361	1			1		2	my $self = shift;
362	1					3	my $header = shift;
363	1					13	my ($seqid,$step,$min,$max,$span,
364							$mean,$stdev,$version,$start) = unpack(HEADER,$header);
365	1					13	return { seqid => $seqid,
366							step => $step,
367							span => $span,
368							min => $min,
369							max => $max,
370							mean => $mean,
371							stdev => $stdev,
372							version => $version,
373							start => $start,
374							};
375							}
376
377							sub _generate_header {
378	1			1		3	my $self = shift;
379	1					2	my $options = shift;
380	1					3	return pack(HEADER,@{$options}{qw(seqid step min max span mean stdev version start)});
	1					14
381							}
382
383							sub _writeoptions {
384	1			1		2	my $self = shift;
385	1					3	my $options = shift;
386	1					3	my $fh = $self->fh;
387	1					5	my $header = $self->_generate_header($options);
388	1					7	$fh->seek(0,0);
389	1	50				30	$fh->print($header) or die "write failed: $!";
390							}
391
392							sub _do_trim {
393	1			1		3	my $self = shift;
394
395							# don't trim if there is no score range
396	1	50				4	($self->max - $self->min) or return;
397
398	1					6	my $trim = lc $self->trim;
399	1					1	my ($method,$arg);
400	1	50				7	if ($trim =~ /([a-z]+)(\d+)/) {
401	1					4	$method = "_trim_${1}";
402	1					4	$arg = $2;
403							}
404							else {
405	0					0	$method = "_trim_${trim}";
406							}
407	1	50				7	unless ($self->can($method)) {
408	0					0	carp "invalid trim method $trim";
409	0					0	return;
410							}
411
412	1					4	$self->$method($arg);
413							}
414
415							# trim n standard deviations from the mean
416							sub _trim_stdev {
417	1			1		2	my $self = shift;
418	1		50			5	my $factor = shift \|\| 1;
419	1					4	my $mean = $self->mean;
420	1					4	my $stdev = $self->stdev * $factor;
421	1	50				3	my $min = $self->min > $mean - $stdev ? $self->min : $mean - $stdev;
422	1	50				3	my $max = $self->max < $mean + $stdev ? $self->max : $mean + $stdev;
423	1					3	warn "_trim_stdev (* $factor) : setting min to $min, max to $max (was ",$self->min,',',$self->max,')'
424							if DEBUG;
425	1					4	$self->min($min);
426	1					4	$self->max($max);
427							}
428
429							sub set_value {
430	3809			3809	1	4472	my $self = shift;
431	3809	50				8451	croak "usage: \$wig->set_value(\$position => \$value)"
432							unless @_ == 2;
433	3809					8076	$self->value(@_);
434							}
435
436							sub set_range {
437	0			0	1	0	my $self = shift;
438	0	0				0	croak "usage: \$wig->set_range(\$start_position => \$end_position, \$value)"
439							unless @_ == 3;
440	0					0	$self->value(@_);
441							}
442
443							sub value {
444	3809			3809	1	4109	my $self = shift;
445	3809					4453	my $position = shift;
446
447	3809					6153	my $offset = $self->_calculate_offset($position);
448	3809	50				8341	$offset >= HEADER_LEN or die "Tried to retrieve data from before start position";
449	3809	50				7386	$self->seek($offset) or die "Seek failed: $!";
450
451	3809	50				84831	if (@_ == 2) {
		50
452	0					0	my $end = shift;
453	0					0	my $new_value = shift;
454	0					0	my $step = $self->step;
455	0					0	my $scaled_value = $self->scale($new_value);
456	0	0				0	$self->fh->print(pack('C*',($scaled_value)x(($end-$position+1)/$step))) or die "Write failed: $!";
457	0	0	0			0	$self->{end} = $end if !exists $self->{end} \|\| $self->{end} < $end;
458							}
459
460							elsif (@_==1) {
461	3809					6223	my $new_value = shift;
462	3809					7360	my $scaled_value = $self->scale($new_value);
463	3809	50				13694	$self->fh->print(pack('C*',$scaled_value)) or die "Write failed: $!";
464	3809	50	66			47208	$self->{end} = $position if !exists $self->{end} \|\| $self->{end} < $position;
465	3809					11010	return $new_value;
466							}
467
468							else { # retrieving data
469	0					0	my $buffer;
470	0	0				0	$self->fh->read($buffer,1) or die "Read failed: $!";
471	0					0	my $scaled_value = unpack('C*',$buffer);
472
473							# missing data, so look back at most span values to get it
474	0	0	0			0	if ($scaled_value == 0 && (my $span = $self->span) > 1) {
475	0					0	$offset = $self->_calculate_offset($position-$span+1);
476	0	0				0	$offset >= HEADER_LEN or die "Tried to retrieve data from before start position";
477	0	0				0	$self->seek($offset) or die "Seek failed: $!";
478
479	0					0	$self->fh->read($buffer,$span/$self->step);
480	0					0	for (my $i=length($buffer)-2;$i>=0;$i--) {
481	0					0	my $val = substr($buffer,$i,1);
482	0	0				0	next if $val eq "\0";
483	0					0	$scaled_value = unpack('C*',$val);
484	0					0	last;
485							}
486
487							}
488	0					0	return $self->unscale($scaled_value);
489							}
490							}
491
492							sub _calculate_offset {
493	3813			3813		4870	my $self = shift;
494	3813					4457	my $position = shift;
495	3813					7291	my $step = $self->step;
496	3813					7878	my $start = $self->start;
497	3813					10199	return HEADER_LEN + int(($position-$start)/$step);
498							}
499
500							sub set_values {
501	0			0	1	0	my $self = shift;
502	0	0	0			0	croak "usage: \$wig->set_values(\$position => \@values)"
503							unless @_ == 2 and ref $_[1] eq 'ARRAY';
504	0					0	$self->values(@_);
505							}
506
507							# read or write a series of values
508							sub values {
509	4			4	1	11	my $self = shift;
510	4					7	my $start = shift;
511	4	50	33			18	if (ref $_[0] && ref $_[0] eq 'ARRAY') {
512	0					0	$self->_store_values($start,@_);
513							} else {
514	4					18	$self->_retrieve_values($start,@_);
515							}
516							}
517
518							sub export_to_wif64 {
519	0			0	1	0	my $self = shift;
520	0					0	my $data = $self->export_to_wif(@_);
521	0	0				0	eval "require MIME::Base64"
522							unless MIME::Base64->can('encode_base64');
523	0					0	return MIME::Base64::encode_base64($data);
524							}
525							sub import_from_wif64 {
526	0			0	1	0	my $self = shift;
527	0					0	my $data = shift;
528
529	0	0				0	eval "require MIME::Base64"
530							unless MIME::Base64->can('decode_base64');
531	0					0	return $self->import_from_wif(MIME::Base64::decode_base64($data));
532							}
533
534							# subregion in "wiggle interchange format" (wif)
535							sub export_to_wif {
536	0			0	1	0	my $self = shift;
537	0					0	my ($start,$end) = @_;
538
539							# get the 256 byte header
540	0					0	my $data = $self->_generate_header($self->{options});
541
542							# add the range to the data (8 bytes overhead)
543	0					0	$data .= pack("L",$start);
544	0					0	$data .= pack("L",$end);
545
546							# add the packed data for this range
547	0					0	$data .= $self->_retrieve_packed_range($start,$end-$start+1,$self->step);
548	0					0	return $data;
549							}
550
551							sub export_to_bedgraph {
552	1			1	0	1162	my $self = shift;
553	1					2	my ($start,$end,$fh) = @_;
554	1					3	my $max_range = 100_000;
555
556	1		50			7	$start \|\|= 1;
557	1		33			4	$end \|\|= $self->end;
558
559	1					2	my $lines;
560	1					6	for (my $s=$start;$s<$end;$s+=$max_range) {
561	1					3	my $e = $s + $max_range - 1;
562	1	50				4	$e = $end if $e > $end;
563	1					5	my $b = $self->values($s,$e);
564	1					13	$lines .= $self->_bedgraph_lines($s,$b,$fh);
565							}
566
567	1					8	return $lines;
568							}
569
570							sub _bedgraph_lines {
571	1			1		4	my $self = shift;
572	1					3	my ($start,$values,$fh) = @_;
573	1					8	my $seqid = $self->seqid;
574	1					2	my $result;
575
576	1					3	my ($last_val,$last_start,$end);
577	1					4	$last_start = $start-1; # 0 based indexing
578	1					6	for (my $i=0;$i<@$values;$i++) {
579	5000					19311	my $v = $values->[$i];
580
581	5000	100				47273	if (!defined $v) {
582	2168	100				13169	if (defined $last_val) {
583	56					354	$result .= $self->_append_or_print_bedgraph($fh,$seqid,$last_start,$start+$i-1,$last_val);
584	56					136	undef $last_val;
585							}
586	2168					4394	$last_start = $start+$i;
587	2168					23099	next;
588							}
589
590	2832	50	66			44513	if (defined $last_val && $last_val != $v) {
591	0					0	$result .= $self->_append_or_print_bedgraph($fh,$seqid,$last_start,$start+$i-1,$last_val);
592	0					0	$last_start = $start+$i-1;
593							}
594
595	2832					7334	$last_val = $v;
596	2832					20723	$end = $start+$i-1;
597							}
598	1	50				12	$result .= $self->_append_or_print_bedgraph($fh,$seqid,$last_start,$end+1,$last_val) if $last_val;
599	1					114	return $result;
600							}
601
602							sub _append_or_print_bedgraph {
603	57			57		246	my $self = shift;
604	57					973	my ($fh,$seqid,$start,$end,$val) = @_;
605	57					5846	my $data = join("\t",$seqid,$start,$end,sprintf("%.2f",$val))."\n";
606	57	50				381	if ($fh) {
607	0					0	print $fh $data;
608	0					0	return '';
609							} else {
610	57					481	return $data;
611							}
612							}
613
614							sub import_from_wif {
615	0			0	1	0	my $self = shift;
616	0					0	my $wifdata = shift;
617
618							# BUG: should check that header is compatible
619	0					0	my $header = substr($wifdata,0,HEADER_LEN);
620	0					0	my $start = unpack('L',substr($wifdata,HEADER_LEN, 4));
621	0					0	my $end = unpack('L',substr($wifdata,HEADER_LEN+4,4));
622
623	0					0	my $options = $self->_parse_header($header);
624	0		0			0	my $stored_options = eval {$self->_readoptions} \|\| {};
625	0					0	my %merged_options = (%$stored_options,%$options);
626	0					0	$self->{options} = \%merged_options;
627	0					0	$self->{dirty}++;
628
629							# write the data
630	0					0	$self->seek($self->_calculate_offset($start));
631	0	0				0	$self->fh->print(substr($wifdata,HEADER_LEN+8)) or die "write failed: $!";
632	0	0	0			0	$self->{end} = $end if !defined $self->{end} or $self->{end} < $end;
633							}
634
635							sub _retrieve_values {
636	4			4		6	my $self = shift;
637	4					8	my ($start,$end,$samples) = @_;
638
639	4					11	my $data_start = $self->start;
640	4					42	my $step = $self->step;
641	4					10	my $span = $self->span;
642
643	4	50				14	croak "Value of start position ($start) is less than data start of $data_start"
644							unless $start >= $data_start;
645	4	50				15	croak "Value of end position ($end) is greater than data end of ",$self->end+$span,
646							unless $end <= $self->end + $span;
647
648							# generate list of positions to sample from
649	4					10	my $length = $end-$start+1;
650	4		33			22	$samples \|\|= $length;
651
652							# warn "samples = $samples, length=$length, span=$span, step=$step";
653
654							# if the length is grossly greater than the samples, then we won't even
655							# bother fetching all the data, but just sample into the disk file
656	4	50	33			36	if ($length/$samples > 100 && $step == 1) {
657	0					0	my @result;
658							# my $window = 20*($span/$step);
659	0					0	my $interval = $length/$samples;
660							# my $window = 100*$interval/$span;
661	0					0	my $window = $interval/2;
662							# warn "window = $window, interval = $interval";
663	0					0	for (my $i=0;$i<$samples;$i++) {
664	0					0	my $packed_data = $self->_retrieve_packed_range(int($start+$i*$interval-$window),
665							int($window),
666							$step);
667	0					0	my @bases= grep {$_} unpack('C*',$packed_data);
	0					0
668	0	0				0	if (@bases) {
669	0					0	local $^W = 0;
670	0					0	my $arry = $self->unscale(\@bases);
671	0					0	my $n = @$arry;
672	0					0	my $total = 0;
673	0					0	$total += $_ foreach @$arry;
674	0					0	my $mean = $total/$n;
675	0					0	my $max;
676	0	0	0			0	for (@$arry) { $max = $_ if !defined $max \|\| $max < $_ }
	0					0
677							# warn $start+$i*$interval,': ',join(',',map {int($_)} @$arry),
678							# " mean = $mean, max = $max";
679							# push @result,$mean;
680	0					0	push @result,$max;
681							} else {
682	0					0	push @result,0;
683							}
684							}
685	0					0	return \@result;
686							}
687
688	4					16	my $packed_data = $self->_retrieve_packed_range($start,$length,$step);
689
690	4					6	my @bases;
691	4					91	$#bases = $length-1;
692
693	4	50				50	if ($step == $span) {
694							# in this case, we do not have any partially-empty
695							# steps, so can operate on the step-length data structure
696							# directly
697	0					0	@bases = unpack('C*',$packed_data);
698							}
699
700							else {
701							# In this case some regions may have partially missing data,
702							# so we create an array equal to the length of the requested region,
703							# fill it in, and then sample it
704	4					16	for (my $i=0; $i<length $packed_data; $i++) {
705	5202					13606	my $index = $i * $step;
706	5202					10343	my $value = unpack('C',substr($packed_data,$i,1));
707	5202	100				23650	next unless $value; # ignore 0 values
708	61					1507	@bases[$index..$index+$span-1] = ($value) x $span;
709							}
710	4					17	$#bases = $length-1;
711							}
712
713	4					25	my $r = $self->unscale(\@bases);
714	4					36	$r = $self->sample($r,$samples);
715	4	50	33			2599	$r = $self->smooth($r,$self->window * $samples/@bases)
716							if defined $self->window && $self->window>1;
717	4					121	return $r;
718							}
719
720							sub _retrieve_packed_range {
721	4			4		5	my $self = shift;
722	4					95	my ($start,$length,$step) = @_;
723	4					9	my $span = $self->span;
724
725	4					11	my $offset = $self->_calculate_offset($start);
726
727	4					14	$self->seek($offset);
728	4					60	my $packed_data;
729	4					10	$self->fh->read($packed_data,$length/$step);
730
731							# pad data up to required amount
732	4	50				88	$packed_data .= "\0" x ($length/$step-length($packed_data))
733							if length $packed_data < $length/$step;
734	4					157	return $packed_data;
735							}
736
737
738							sub sample {
739	4			4	0	9	my $self = shift;
740	4					9	my ($values,$samples) = @_;
741	4					8	my $length = @$values;
742	4					10	my $window_size = $length/$samples;
743
744	4					6	my @samples;
745	4					389	$#samples = $samples-1;
746
747	4	50				16	if ($window_size < 2) { # no data smoothing needed
748	4					183	@samples = map { $values->[$_*$window_size] } (0..$samples-1);
	5202					16953
749							}
750							else {
751	0					0	my $smoothsub = $self->smoothsub;
752	0					0	for (my $i=0; $i<$samples; $i++) {
753	0					0	my $start = $i * $window_size;
754	0					0	my $end = $start + $window_size - 1;
755	0					0	my @window = @{$values}[$start..$end];
	0					0
756
757	0					0	my $value = $smoothsub->(\@window);
758	0					0	$samples[$i] = $value;
759							}
760							}
761
762	4					858	return \@samples;
763							}
764
765							sub smoothsub {
766	0			0	0	0	my $self = shift;
767
768	0					0	my $smoothing = $self->smoothing;
769	0	0				0	my $smoothsub = $smoothing eq 'mean' ? \&sample_mean
		0
		0
		0
770							:$smoothing eq 'max' ? \&sample_max
771							:$smoothing eq 'min' ? \&sample_min
772							:$smoothing eq 'none' ? \&sample_center
773							:croak("invalid smoothing type '$smoothing'");
774	0					0	return $smoothsub;
775							}
776
777							sub smooth {
778	0			0	0	0	my ($self,$data,$window) = @_;
779
780	0					0	my $smoothing = $self->smoothing;
781	0		0			0	$window \|\|= $self->window;
782
783	0	0	0			0	return $data if $smoothing eq 'none' \|\| $window < 2;
784
785	0					0	my @data = @$data;
786	0					0	my $smoother = $self->smoothsub;
787	0	0				0	$window++ unless $window % 2;
788	0					0	my $offset = int($window/2);
789
790	0					0	for (my $i=$offset; $i<@$data-$offset; $i++) {
791	0					0	my $start = $i - $offset;
792	0					0	my $end = $i + $offset;
793	0					0	my @subset = @data[$start..$end];
794	0					0	$data->[$i] = $smoother->(\@subset);
795							}
796
797	0					0	return $data;
798							}
799
800							sub window {
801	4			4	0	9	my $self = shift;
802	4					10	my $d = $self->{window};
803	4	50				347	$self->{window} = shift if @_;
804	4					20	$d;
805							}
806
807							sub sample_mean {
808	0			0	0	0	my $values = shift;
809	0					0	my ($total,$items);
810	0					0	for my $v (@$values) {
811	0	0				0	next unless defined $v;
812	0					0	$items++;
813	0					0	$total+=$v;
814							}
815	0	0				0	return $items ? $total/$items : undef;
816							}
817
818							sub sample_max {
819	0			0	0	0	my $values = shift;
820	0					0	my $max;
821	0					0	for my $v (@$values) {
822	0	0				0	next unless defined $v;
823	0	0	0			0	$max = $v if !defined $max or $max < $v;
824							}
825	0					0	return $max;
826							}
827
828							sub sample_min {
829	0			0	0	0	my $values = shift;
830	0					0	my $min;
831	0					0	for my $v (@$values) {
832	0	0				0	next unless defined $v;
833	0	0	0			0	$min = $v if !defined $min or $min > $v;
834							}
835	0					0	return $min;
836							}
837
838							sub sample_center {
839	0			0	0	0	my $values = shift;
840	0					0	return $values->[@$values/2];
841							}
842
843							sub _store_values {
844	0			0		0	my $self = shift;
845	0					0	my ($position,$data) = @_;
846
847							# where does data start
848	0					0	my $offset = $self->_calculate_offset($position);
849	0					0	my $fh = $self->fh;
850	0					0	my $step = $self->step;
851
852	0					0	my $scaled = $self->scale($data);
853
854	0					0	$self->seek($offset);
855	0					0	my $packed_data = pack('C*',@$scaled);
856	0	0				0	$fh->print($packed_data) or die "Write failed: $!";
857
858	0					0	my $new_end = $position+@$data-1;
859	0	0	0			0	$self->{end} = $new_end if !exists $self->{end} \|\| $self->{end} < $new_end;
860							}
861
862							# zero means "no data"
863							# everything else is scaled from 1-255
864							sub scale {
865	3809			3809	0	4104	my $self = shift;
866	3809					4184	my $values = shift;
867	3809					6514	my $scale = $self->_get_scale;
868	3809					7978	my $min = $self->{options}{min};
869	3809	50	33			10110	if (ref $values && ref $values eq 'ARRAY') {
870	0					0	my @return = map {
871	0					0	my $i = ($_ - $min)/$scale;
872	0					0	my $v = 1 + int($i+0.5*($i<=>0)); # avoid call to round()
873	0	0				0	$v = 1 if $v < 1;
874	0	0				0	$v = 255 if $v > 255;
875	0					0	$v;
876							} @$values;
877	0					0	return \@return;
878							} else {
879	3809					12077	my $v = 1 + round (($values - $min)/$scale);
880	3809	50				8130	$v = 1 if $v < 1;
881	3809	50				7193	$v = 255 if $v > 255;
882	3809					7442	return $v;
883							}
884							}
885
886							sub unscale {
887	4			4	0	7	my $self = shift;
888	4					6	my $values = shift;
889	4					12	my $scale = $self->_get_scale;
890	4					12	my $min = $self->{options}{min};
891
892	4	50	33			32	if (ref $values && ref $values eq 'ARRAY') {
893	4	100				332	my @return = map {$_ ? (($_-1) * $scale + $min) : undef} @$values;
	5202					14863
894	4					166	return \@return;
895							} else {
896	0	0				0	return $values ? ($values-1) * $scale + $min : undef;
897							}
898							}
899
900							sub _get_scale {
901	3813			3813		4521	my $self = shift;
902	3813	100				8797	unless ($self->{scale}) {
903	2					6	my $min = $self->{options}{min};
904	2					26	my $max = $self->{options}{max};
905	2		50			10	my $range = $max - $min \|\| 0.001; # can't be zero!
906	2					267	$self->{scale} = $range/254;
907							}
908	3813					6663	return $self->{scale};
909							}
910
911							sub round {
912	3809			3809	0	10586	return int($_[0]+0.5*($_[0]<=>0));
913							}
914
915
916							1;
917
918							__END__
919
920							=head1 SEE ALSO
921
922							L<Bio::Graphics::Wiggle::Loader>,
923							L<Bio::Graphics::Panel>,
924							L<Bio::Graphics::Glyph>,
925							L<Bio::Graphics::Feature>,
926							L<Bio::Graphics::FeatureFile>
927
928							=head1 AUTHOR
929
930							Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
931
932							Copyright (c) 2007 Cold Spring Harbor Laboratory
933
934							This package and its accompanying libraries is free software; you can
935							redistribute it and/or modify it under the terms of the GPL (either
936							version 1, or at your option, any later version) or the Artistic
937							License 2.0. Refer to LICENSE for the full license text. In addition,
938							please see DISCLAIMER.txt for disclaimers of warranty.
939
940							=cut