File Coverage

blib/lib/Math/NumSeq/OEIS/File.pm

Criterion	Covered	Total	%
statement	93	389	23.9
branch	21	184	11.4
condition	9	85	10.5
subroutine	21	40	52.5
pod	8	10	80.0
total	152	708	21.4

)? # empty or

line	stmt	bran	cond	sub	pod	time	code
1							# Copyright 2011, 2012, 2013, 2014 Kevin Ryde
2
3							# This file is part of Math-NumSeq.
4							#
5							# Math-NumSeq is free software; you can redistribute it and/or modify
6							# it under the terms of the GNU General Public License as published by the
7							# Free Software Foundation; either version 3, or (at your option) any later
8							# version.
9							#
10							# Math-NumSeq is distributed in the hope that it will be useful, but
11							# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12							# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13							# for more details.
14							#
15							# You should have received a copy of the GNU General Public License along
16							# with Math-NumSeq. If not, see .
17
18
19							# http://oeis.org/wiki/Clear-cut_examples_of_keywords
20							#
21							# ENHANCE-ME: share most of the a-file/b-file reading with Math::NumSeq::File
22
23							package Math::NumSeq::OEIS::File;
24	2			2		13905	use 5.004;
	2					8
	2					109
25	2			2		9	use strict;
	2					5
	2					57
26	2			2		9	use Carp;
	2					4
	2					211
27	2			2		1999	use POSIX ();
	2					15846
	2					56
28	2			2		18	use File::Spec;
	2					4
	2					50
29	2			2		4601	use Symbol 'gensym';
	2					5159
	2					402
30
31	2			2		16	use vars '$VERSION','@ISA';
	2					5
	2					149
32							$VERSION = 71;
33
34	2			2		3683	use Math::NumSeq;
	2					5
	2					98
35							@ISA = ('Math::NumSeq');
36							*_to_bigint = \&Math::NumSeq::_to_bigint;
37
38	2			2		12	use vars '$VERSION';
	2					4
	2					135
39							$VERSION = 71;
40
41	2			2		17	eval q{use Scalar::Util 'weaken'; 1}
	2					3
	2					224
42							\|\| eval q{sub weaken { $_[0] = undef }; 1 }
43							\|\| die "Oops, error making a weaken() fallback: $@";
44
45							# uncomment this to run the ### lines
46							# use Smart::Comments;
47
48
49							# use constant name => Math::NumSeq::__('OEIS File');
50	2			2		9269	use Math::NumSeq::OEIS;
	2					8
	2					139
51							*parameter_info_array = \&Math::NumSeq::OEIS::parameter_info_array;
52
53							use constant::defer _HAVE_ENCODE => sub {
54	0	0				0	eval { require Encode; 1 } \|\| 0;
	0					0
	0					0
55	2			2		11	};
	2					3
	2					25
56
57							sub description {
58	0			0	1	0	my ($class_or_self) = @_;
59	0	0	0			0	if (ref $class_or_self && defined $class_or_self->{'description'}) {
60							# instance
61	0					0	return $class_or_self->{'description'};
62							} else {
63							# class
64	0					0	return Math::NumSeq::__('OEIS sequence from file.');
65							}
66							}
67
68							sub values_min {
69	0			0	1	0	my ($self) = @_;
70							### OEIS-File values_min() ...
71	0					0	return _analyze($self)->{'values_min'};
72							}
73							sub values_max {
74	0			0	1	0	my ($self) = @_;
75							### OEIS-File values_max() ...
76	0					0	return _analyze($self)->{'values_max'};
77							}
78
79							my %analyze_characteristics = (increasing => 1,
80							increasing_from_i => 1,
81							non_decreasing => 1,
82							non_decreasing_from_i => 1,
83							smaller => 1,
84							);
85							sub characteristic {
86	0			0	1	0	my ($self, $key) = @_;
87	0	0				0	if ($analyze_characteristics{$key}) {
88	0					0	_analyze($self);
89							}
90	0					0	return shift->SUPER::characteristic(@_);
91							}
92
93							sub oeis_dir {
94	247			247	0	21799	require File::HomeDir;
95	247					14928	return File::Spec->catfile (File::HomeDir->my_home, 'OEIS');
96							}
97							sub anum_to_bfile {
98	0			0	0	0	my ($anum, $prefix) = @_;
99	0		0			0	$prefix \|\|= 'b';
100	0					0	$anum =~ s/^A/$prefix/;
101	0					0	return "$anum.txt";
102							}
103
104							#------------------------------------------------------------------------------
105							# Keep track of all instances which exist and on an ithread CLONE re-open
106							# any filehandles in the instances, so they have their own independent file
107							# positions in the new thread.
108
109							my %instances;
110							sub DESTROY {
111	65			65		169	my ($self) = @_;
112	65					592	delete $instances{$self+0};
113							}
114							sub CLONE {
115	0			0		0	my ($class) = @_;
116	0					0	foreach my $self (values %instances) {
117	0	0				0	next unless $self;
118	0	0				0	next unless $self->{'fh'};
119	0					0	my $pos = _tell($self);
120	0					0	my $fh = gensym;
121	0	0				0	if (open $fh, "< $self->{'filename'}") {
122	0					0	$self->{'fh'} = $fh;
123	0					0	_seek ($self, $pos);
124							} else {
125	0					0	delete $self->{'fh'};
126	0					0	delete $self->{'filename'};
127							}
128							}
129							}
130
131							#------------------------------------------------------------------------------
132
133							# The length in decimal digits of the biggest value which fits in a plain
134							# Perl integer. For example on a 32-bit system this is 9 since 9 digit
135							# numbers such as "999_999_999" are the biggest which fit a signed IV
136							# (+2^31).
137							#
138							# The IV size is probed rather than using ~0 since under "perl -Minteger"
139							# have ~0 as -1 rather than the biggest UV ... except "use integer" is not
140							# normally global.
141							#
142							# The NV size is applied to the limit too since not sure should trust values
143							# to stay in IV or UV. This means on a 64-bit integer with 53-bit NV
144							# "double" the limit is 53-bits.
145							#
146	2					9	use constant 1.02 _MAX_DIGIT_LENGTH => do {
147							### ~0 is: ~0
148
149	2					29	my $iv = 0;
150	2					9	for (1 .. 256) {
151	130					137	my $new = ($iv << 1) \| 1;
152	130	100	66			424	unless ($new > $iv && ($new & 1) == 1) {
153	2					7	last;
154							}
155	128					152	$iv = $new;
156							}
157							### $iv
158
159	2					10	require POSIX;
160	2					47	my $nv = POSIX::FLT_RADIX() ** (POSIX::DBL_MANT_DIG()-5);
161							### $nv
162
163	2					7	my $iv_len = length($iv) - 1;
164	2					16	my $nv_len = length($nv) - 1;
165	2	50				10529	($iv_len < $nv_len ? $iv_len : $nv_len) # smaller of the two lengths;
166	2			2		1676	};
	2					44
167							### _MAX_DIGIT_LENGTH: _MAX_DIGIT_LENGTH()
168
169
170							#------------------------------------------------------------------------------
171
172							# special case a000000.txt files to exclude
173							#
174							my %afile_exclude
175							= (
176							# a003849.txt has replication level words rather than the individual
177							# sequence values.
178							'a003849.txt' => 1,
179
180							# a027750.txt is unflattened divisors as lists.
181							# Its first line is a correct looking "1 1" so _afile_is_good() doesn't
182							# notice.
183							'a027750.txt' => 1,
184							);
185
186
187							# Fields:
188							# fh File handle ref, if reading B-file or A-file
189							#
190							# next_seek File pos to seek $fh for next() to read from.
191							# ith() sets this when it moves the file position.
192							#
193							# array Arrayref of values if using .internal or .html.
194							# array_pos Index 0,1,2,... of next value of $array to return by next().
195							#
196							# i Next $i for next() to return.
197							# When reading a file this is ignored, use the file i instead.
198
199							sub new {
200							### OEIS-File new() ...
201	65			65	1	33396	my $self = shift->SUPER::new(@_);
202
203	65					251	delete $self->{'next_seek'}; # no initial seek
204	65					266	$self->{'characteristic'}->{'integer'} = 1;
205
206	65					138	my $anum = $self->{'anum'};
207	65					516	(my $num = $anum) =~ s/^A//;
208	65					331	foreach my $basefile ("a$num.txt",
209							"b$num.txt") {
210	130	100				405	next if $afile_exclude{$basefile};
211
212	120	100	100			801	next if $self->{'_dont_use_afile'} && $basefile =~ /^a/;
213	76	100	66			462	next if $self->{'_dont_use_bfile'} && $basefile =~ /^b/;
214
215	37					123	my $filename = File::Spec->catfile (oeis_dir(), $basefile);
216							### $filename
217	37					3793	my $fh = gensym();
218	37	50				1815	if (! open $fh, "< $filename") {
219							### cannot open: $!
220	37					213	next;
221							}
222
223	0					0	$self->{'filename'} = $filename; # the B-file or A-file name
224	0					0	$self->{'fh'} = $fh;
225	0	0				0	if (! _afile_is_good($self)) {
226							### this afile not good ...
227	0					0	close delete $self->{'fh'};
228	0					0	delete $self->{'filename'};
229	0					0	next;
230							}
231	0					0	$self->{'fh_i'} = $self->i_start; # at first entry
232
233							### opened: $fh
234	0					0	last;
235							}
236
237	65		33			331	my $have_info = (_read_internal_txt($self, $anum)
238							\|\| _read_internal_html($self, $anum)
239							\|\| _read_html($self, $anum));
240
241	65	50	33			481	if (! $have_info && ! $self->{'fh'}) {
242	65					19668	croak 'OEIS file(s) not found for A-number "',$anum,'"';
243							}
244
245	0					0	weaken($instances{$self+0} = $self);
246	0					0	return $self;
247							}
248
249							sub _analyze {
250	0			0		0	my ($self) = @_;
251
252	0	0				0	if ($self->{'analyze_done'}) {
253	0					0	return $self;
254							}
255	0					0	$self->{'analyze_done'} = 1;
256
257							### _analyze() ...
258
259	0					0	my $i_start = $self->i_start;
260	0					0	my ($i, $value);
261	0					0	my ($prev_i, $prev_value);
262
263	0					0	my $values_min;
264	0					0	my $values_max;
265	0					0	my $increasing_from_i = $i_start;
266	0					0	my $non_decreasing_from_i = $i_start;
267	0					0	my $strictly_smaller_count = 0;
268	0					0	my $smaller_count = 0;
269	0					0	my $total_count = 0;
270
271							my $analyze = sub {
272							### $prev_value
273							### $value
274	0	0	0	0		0	if (! defined $values_min \|\| $value < $values_min) {
275	0					0	$values_min = $value;
276							}
277	0	0	0			0	if (! defined $values_max \|\| $value > $values_max) {
278	0					0	$values_max = $value;
279							}
280
281	0	0				0	if (defined $prev_value) {
282	0					0	my $cmp = ($value <=> $prev_value);
283	0	0				0	if ($cmp < 0) {
284							# value < $prev_value
285	0					0	$increasing_from_i = $i;
286	0					0	$non_decreasing_from_i = $i;
287							}
288	0	0				0	if ($cmp <= 0) {
289							# value <= $prev_value
290	0					0	$increasing_from_i = $i;
291							}
292							}
293
294	0					0	$total_count++;
295	0					0	$smaller_count += (abs($value) <= $i);
296	0					0	$strictly_smaller_count += ($value < $i);
297
298	0					0	$prev_i = $value;
299	0					0	$prev_value = $value;
300	0					0	};
301
302	0	0				0	if (my $fh = $self->{'fh'}) {
303	0					0	my $oldpos = _tell($self);
304	0					0	while (($i, $value) = _readline($self)) {
305	0					0	$analyze->($value);
306	0	0				0	last if $total_count > 200;
307							}
308	0					0	_seek ($self, $oldpos);
309							} else {
310	0					0	$i = $i_start;
311	0					0	foreach (@{$self->{'array'}}) {
	0					0
312	0					0	$i++;
313	0					0	$value = $_;
314	0					0	$analyze->();
315							}
316							}
317
318	0		0			0	my $range_is_small = (defined $values_max
319							&& $values_max - $values_min <= 16);
320							### $range_is_small
321
322							# "full" means whole sequence in sample values
323							# "sign" means negatives in sequence
324	0	0	0			0	if (! defined $self->{'values_min'}
			0
325							&& ($range_is_small
326							\|\| $self->{'characteristic'}->{'OEIS_full'}
327							\|\| ! $self->{'characteristic'}->{'OEIS_sign'})) {
328							### set values_min: $values_min
329	0					0	$self->{'values_min'} = $values_min;
330							}
331	0	0	0			0	if (! defined $self->{'values_max'}
			0
332							&& ($range_is_small
333							\|\| $self->{'characteristic'}->{'OEIS_full'})) {
334							### set values_max: $values_max
335	0					0	$self->{'values_max'} = $values_max;
336							}
337
338	0		0			0	$self->{'characteristic'}->{'smaller'}
339							= ($total_count == 0
340							\|\| ($smaller_count / $total_count >= .9
341							&& $strictly_smaller_count > 0));
342							### decide smaller: $self->{'characteristic'}->{'smaller'}
343
344							### $increasing_from_i
345	0	0	0			0	if (defined $prev_i && $increasing_from_i < $prev_i) {
346	0	0				0	if ($increasing_from_i - $i_start < 20) {
347	0					0	$self->{'characteristic'}->{'increasing_from_i'} = $increasing_from_i;
348							}
349	0	0				0	if ($increasing_from_i == $i_start) {
350	0					0	$self->{'characteristic'}->{'increasing'} = 1;
351							}
352							}
353
354							### $non_decreasing_from_i
355	0	0	0			0	if (defined $prev_i && $non_decreasing_from_i < $prev_i) {
356	0	0				0	if ($non_decreasing_from_i - $i_start < 20) {
357	0					0	$self->{'characteristic'}->{'non_decreasing_from_i'} = $non_decreasing_from_i;
358							}
359	0	0				0	if ($non_decreasing_from_i == $i_start) {
360	0					0	$self->{'characteristic'}->{'non_decreasing'} = 1;
361							}
362							}
363
364	0					0	return $self;
365							}
366
367							# # compare $x <=> $y but in strings in case they're bigger than IV or NV
368							# # my $cmp = _value_cmp ($value, $prev_value);
369							# sub _value_cmp {
370							# my ($x, $y) = @_;
371							# ### _value_cmp(): "$x $y"
372							# ### cmp: $x cmp $y
373							#
374							# my $x_neg = substr($x,0,1) eq '-';
375							# my $y_neg = substr($y,0,1) eq '-';
376							# ### $x_neg
377							# ### $y_neg
378							#
379							# return ($y_neg <=> $x_neg
380							# \|\| ($x_neg ? -1 : 1) * (length($x) <=> length($y)
381							# \|\| $x cmp $y));
382							# }
383
384							sub _seek {
385	0			0		0	my ($self, $pos) = @_;
386	0	0				0	seek ($self->{'fh'}, $pos, 0)
387							or croak "Cannot seek $self->{'filename'}: $!";
388							}
389							sub _tell {
390	0			0		0	my ($self) = @_;
391	0					0	my $pos = tell $self->{'fh'};
392	0	0				0	if ($pos < 0) {
393	0					0	croak "Cannot tell file position $self->{'filename'}: $!";
394							}
395	0					0	return $pos;
396							}
397
398							sub rewind {
399	65			65	1	178	my ($self) = @_;
400							### OEIS-File rewind() ...
401
402	65					381	$self->{'i'} = $self->i_start;
403	65					188	$self->{'array_pos'} = 0;
404	65					193	$self->{'next_seek'} = 0;
405							}
406
407							sub next {
408	0			0	1	0	my ($self) = @_;
409							### OEIS-File next(): "i=$self->{'i'}"
410
411	0					0	my $value;
412	0	0				0	if (my $fh = $self->{'fh'}) {
413							### from readline ...
414	0	0				0	if (defined (my $pos = delete $self->{'next_seek'})) {
415							### seek to: $pos
416	0					0	_seek($self, $pos);
417							}
418	0					0	return _readline($self);
419
420							} else {
421							### from array ...
422	0	0				0	my ($value) = _array_value($self, $self->{'array_pos'}++)
423							or return;
424	0					0	return ($self->{'i'}++, $value);
425							}
426							}
427
428							# Return $self->{'array'}->[$pos], or no values if $pos past end of array.
429							# Array values are promoted to BigInt if necessary.
430							sub _array_value {
431	0			0		0	my ($self, $pos) = @_;
432							### _array_value(): $pos
433
434	0					0	my $array = $self->{'array'};
435	0	0				0	if ($pos > $#$array) {
436							### past end of array ...
437	0					0	return;
438							}
439	0					0	my $value = $array->[$pos];
440
441							# large values as Math::BigInt
442							# initially $array has strings, make bigint objects when required
443	0	0	0			0	if (! ref $value && length($value) > _MAX_DIGIT_LENGTH) {
444	0					0	$value = $array->[$pos] = _to_bigint($value);
445							}
446							### $value
447	0					0	return $value;
448							}
449
450							# Read a line from an open B-file or A-file, return ($i,$value).
451							# At EOF return empty ().
452							#
453							sub _readline {
454	0			0		0	my ($self) = @_;
455	0					0	my $fh = $self->{'fh'};
456	0					0	while (defined (my $line = <$fh>)) {
457	0					0	chomp $line;
458	0					0	$line =~ tr/\r//d; # delete CR if CRLF line endings, eg. b009000.txt
459							### $line
460
461	0	0				0	if ($line =~ /^\s*(#\|$)/) {
462							### ignore blank or comment ...
463							# comment lines with "#" eg. b002182.txt
464	0					0	next;
465							}
466
467							# leading whitespace allowed as per b195467.txt
468	0	0				0	if (my ($i, $value) = ($line =~ /^\s*
469							([0-9]+) # i
470							[ \t]+
471							(-?[0-9]+) # value
472							[ \t]*
473							$/x)) {
474							### _readline: "$i $value"
475	0	0				0	if (length($value) > _MAX_DIGIT_LENGTH) {
476	0					0	$value = _to_bigint($value);
477							}
478	0					0	$self->{'fh_i'} = $i+1;
479	0					0	return ($i, $value);
480							}
481							}
482	0					0	undef $self->{'fh_i'};
483	0					0	return;
484							}
485
486							# Return true if the a000000.txt file in $self->{'fh'} looks good.
487							# Various a-files are source code or tables rather than sequence values.
488							#
489							sub _afile_is_good {
490	0			0		0	my ($self) = @_;
491	0					0	my $fh = $self->{'fh'};
492	0					0	my $good = 0;
493	0					0	my $prev_i;
494	0					0	while (defined (my $line = <$fh>)) {
495	0					0	chomp $line;
496	0					0	$line =~ tr/\r//d; # delete CR if CRLF line endings, eg. b009000.txt
497							### $line
498
499	0	0				0	if ($line =~ /^\s*(#\|$)/) {
500							### ignore blank or comment ...
501	0					0	next;
502							}
503
504							# Must have line like "0 123". Can have negative OFFSET and so index i,
505							# eg. A166242 (though that one doesn't have an A-file).
506	0	0				0	my ($i,$value) = ($line =~ /^(-?[0-9]+) # i
507							[ \t]+
508							(-?[0-9]+) # value
509							[ \t]*
510							$/x)
511							or last;
512
513	0	0	0			0	if (defined $prev_i && $i != $prev_i+1) {
514							### bad A-file, initial "i" values not consecutive ...
515	0					0	last;
516							}
517	0					0	$prev_i = $i;
518
519	0					0	$good++;
520	0	0				0	if ($good >= 3) {
521							### three good lines, A-file is good ...
522	0					0	_seek ($self, 0);
523	0					0	return 1;
524							}
525							}
526	0					0	return 0;
527							}
528
529							sub _read_internal_txt {
530	65			65		145	my ($self, $anum) = @_;
531							### _read_internal_txt(): $anum
532
533	65	100				280	return 0 if $self->{'_dont_use_internal'};
534
535	52					229	foreach my $basefile ("$anum.internal.txt") {
536	52	50				223	my ($fullname, $contents) = _slurp_oeis_file($self,$basefile)
537							or next;
538	0	0				0	if (_HAVE_ENCODE) {
539							# "Internal" text format is utf-8.
540	0					0	$contents = Encode::decode('utf-8', $contents, Encode::FB_PERLQQ());
541							}
542
543							### $contents
544
545							# eg. "%O A007318 0,5"
546	0					0	my $offset;
547	0	0				0	if ($contents =~ /^%O\s+\Q$anum\E\s+(\d+)/im) {
548	0					0	$offset = $1;
549							### %O line: $offset
550							} else {
551	0					0	$offset = 0;
552							}
553
554							# eg. "%N A007318 Pascal's triangle ..."
555	0	0				0	if ($contents =~ m{^%N\s+\Q$anum\E\s+(.*)}im) {
556	0					0	_set_description ($self, $1);
557							} else {
558							### description not matched ...
559							}
560
561							# eg. "%K A007318 nonn,tabl,nice,easy,core,look,hear,changed"
562	0		0			0	_set_characteristics ($self,
563							$contents =~ /^%K\s+\Q$anum\E\s+(.*)/im && $1);
564
565							# the eishelp1.html says
566							# %V,%W,%X lines for signed sequences
567							# %S,%T,%U lines for non-negative sequences
568							# though now %S is signed and unsigned both is it?
569							#
570	0	0				0	if (! $self->{'fh'}) {
571	0					0	my @samples;
572							# capital %STU etc, but any case
573	0					0	while ($contents =~ m{^%[VWX]\s+\Q$anum\E\s+(.*)}mg) {
574	0					0	push @samples, $1;
575							}
576	0	0				0	unless (@samples) {
577	0					0	while ($contents =~ m{^%[STU]\s+\Q$anum\E\s+(.*)}mg) {
578	0					0	push @samples, $1;
579							}
580	0	0				0	unless (@samples) {
581	0					0	croak "Oops list of values not found in ",$self->{'filename'};
582							}
583							}
584							# join multiple lines of samples
585	0					0	_split_sample_values ($self, join(', ',@samples));
586							}
587
588							# %O "OFFSET" is subscript of first number.
589							# Or for digit expansions it's the number of terms before the decimal
590							# point, per http://oeis.org/eishelp2.html#RO
591							#
592	0	0				0	unless ($self->{'characteristic'}->{'digits'}) {
593	0					0	$self->{'i'} = $self->{'i_start'} = $offset;
594							}
595							### i: $self->{'i'}
596							### i_start: $self->{'i_start'}
597
598	0					0	return 1; # success
599							}
600
601	52					611	return 0; # file not found
602							}
603
604							sub _read_internal_html {
605	65			65		137	my ($self, $anum) = @_;
606							### _read_internal_html(): $anum
607
608	65	100				545	return 0 if $self->{'_dont_use_internal'};
609
610	52					188	foreach my $basefile ("$anum.internal.html") {
611	52	50				114	my ($fullname, $contents) = _slurp_oeis_file($self,$basefile)
612							or next;
613							# "Internal" files are served as html with a charset indicator
614	0					0	$contents = _decode_html_charset($contents);
615							### $contents
616
617	0					0	my $offset;
618	0	0				0	if ($contents =~ /(^\|`)%O\s+(\d+)/im) {`
619	0					0	$offset = $2;
620							### %O line: $offset
621							} else {
622	0					0	$offset = 0;
623							}
624
625	0	0				0	if ($contents =~ m{(^\|`)%N (.*?)(\|$)}im) {`
626	0					0	_set_description ($self, $2);
627							} else {
628							### description not matched ...
629							}
630
631	0		0			0	_set_characteristics ($self,
632							$contents =~ /(^\|`)%K (.*?)(\|$)/im`
633							&& $2);
634
635							# the eishelp1.html says
636							# %V,%W,%X lines for signed sequences
637							# %S,%T,%U lines for non-negative sequences
638							# though now %S is signed and unsigned both is it?
639							#
640	0	0				0	if (! $self->{'fh'}) {
641	0					0	my @samples;
642							# capital %STU etc, but any case
643	0					0	while ($contents =~ m{(^\|<[tT][tT]>)%[VWX] (.*?)(\|$)}mg) {
644	0					0	push @samples, $2;
645							}
646	0	0				0	unless (@samples) {
647	0					0	while ($contents =~ m{(^\|<[tT][tT]>)%[STU] (.*?)(\|$)}mg) {
648	0					0	push @samples, $2;
649							}
650	0	0				0	unless (@samples) {
651	0					0	croak "Oops list of values not found in ",$self->{'filename'};
652							}
653							}
654							# join multiple lines of samples
655	0					0	_split_sample_values ($self, join(', ',@samples));
656							}
657
658							# %O "OFFSET" is subscript of first number.
659							# Or for digit expansions it's the number of terms before the decimal
660							# point, per http://oeis.org/eishelp2.html#RO
661							#
662	0	0				0	unless ($self->{'characteristic'}->{'digits'}) {
663	0					0	$self->{'i'} = $self->{'i_start'} = $offset;
664							}
665							### i: $self->{'i'}
666							### i_start: $self->{'i_start'}
667
668	0					0	return 1; # success
669							}
670
671	52					598	return 0; # file not found
672							}
673
674							# Fill $self with contents of ~/OEIS/A000000.html but various fragile greps
675							# of the html.
676							# Return 1 if .html or .htm file exists, 0 if not.
677							#
678							sub _read_html {
679	65			65		116	my ($self, $anum) = @_;
680							### _read_html(): $anum
681
682	65	100				235	return 0 if $self->{'_dont_use_html'};
683
684	52					189	foreach my $basefile ("$anum.html", "$anum.htm") {
685	104	50				225	my ($fullname, $contents) = _slurp_oeis_file($self,$basefile)
686							or next;
687	0					0	$contents = _decode_html_charset($contents);
688
689	0	0				0	if ($contents =~
690							m{$anum[ \t]\n.? # target anum
691							]>\s(?:
692							]*> #
693							\s*
694							(.*?) # text through to ...
695							( \| or
696							}isx) {
697	0					0	_set_description ($self, $1);
698							} else {
699							### description not matched ...
700							}
701
702	0		0			0	my $offset = ($contents =~ /OFFSET.*?<[tT][tT]>(\d+)/s
703							&& $1);
704							### $offset
705
706							# fragile grep out of the html ...
707	0					0	my $keywords;
708	0	0				0	if ($contents =~ m{KEYWORD.?<[tT][tT][^>]>(.*?)}s) {
709							### html keywords match: $1
710	0					0	$keywords = $1;
711							} else {
712							# die "Oops, KEYWORD not matched: $anum";
713							}
714	0					0	_set_characteristics ($self, $keywords);
715
716	0	0				0	if (! $self->{'fh'}) {
717							# fragile grep out of the html ...
718	0					0	$contents =~ s{>graph.*}{};
719	0					0	$contents =~ m{.*`([^<]+)`}i;
720	0					0	my $list = $1;
721	0					0	_split_sample_values ($self, $list);
722							}
723
724							# %O "OFFSET" is subscript of first number, but for digit expansions
725							# it's the position of the decimal point
726							# http://oeis.org/eishelp2.html#RO
727	0	0				0	if (! $self->{'characteristic'}->{'digits'}) {
728	0					0	$self->{'i'} = $self->{'i_start'} = $offset;
729							}
730							### i: $self->{'i'}
731							### i_start: $self->{'i_start'}
732
733	0					0	return 1;
734							}
735	52					222	return 0;
736							}
737
738							# Return the contents of ~/OEIS/$filename.
739							# $filename is like "A000000.html" to be taken relative to oeis_dir().
740							# If $filename cannot be read then return undef.
741							sub _slurp_oeis_file {
742	208			208		357	my ($self,$filename) = @_;
743	208					643	$filename = File::Spec->catfile (oeis_dir(), $filename);
744							### $filename
745
746	208	50				17882	if (! open FH, "< $filename") {
747							### cannot open file: $!
748	208					1857	return;
749							}
750	0						my $contents = do { local $/; }; # slurp
	0
	0
751	0	0					close FH
752							or return;
753	0		0				$self->{'filename'} \|\|= $filename;
754	0						return ($filename, $contents);
755							}
756
757							sub _set_description {
758	0			0			my ($self, $description) = @_;
759							### _set_description(): $description
760
761	0						$description =~ s/\s+$//; # trailing whitespace
762	0						$description =~ s/\s+/ /g; # collapse whitespace
763	0						$description =~ s/<[^>]*?>//sg; # tags
764	0						$description =~ s/</
765	0						$description =~ s/>/>/ig; # unentitize >
766	0						$description =~ s/&/&/ig; # unentitize &
767	0						$description =~ s/&#(\d+);/chr($1)/ge; # unentitize numeric ' and "
	0
768
769							# ENHANCE-ME: maybe __x() if made available, or an sprintf "... %s" would
770							# be enough ...
771	0						$description .= "\n";
772	0	0					if ($self->{'fh'}) {
773	0						$description .= sprintf(Math::NumSeq::__('Values from B-file %s'),
774							$self->{'filename'})
775							} else {
776	0						$description .= sprintf(Math::NumSeq::__('Values from %s'),
777							$self->{'filename'})
778							}
779	0						$self->{'description'} = $description;
780							}
781
782							sub _set_characteristics {
783	0			0			my ($self, $keywords) = @_;
784							### _set_characteristics()
785							### $keywords
786
787	0	0					if (! defined $keywords) {
788	0						return; # if perhaps match of .html failed
789							}
790
791	0						$keywords =~ s{<[^>]*>}{}g; # tags
792							### $keywords
793
794	0		0				foreach my $key (split /[, \t]+/, ($keywords\|\|'')) {
795							### $key
796	0						$self->{'characteristic'}->{"OEIS_$key"} = 1;
797							}
798
799							# if ($self->{'characteristic'}->{'OEIS_cofr'}) {
800							# $self->{'characteristic'}->{'continued_fraction'} = 1;
801							# }
802
803							# "cons" means decimal digits of a constant
804							# but don't reckon A000012 all-ones that way
805							# "base" means non-decimal, it seems, maybe
806	0	0	0				if ($self->{'characteristic'}->{'OEIS_cons'}
			0
807							&& ! $self->{'characteristic'}->{'OEIS_base'}
808							&& $self->{'anum'} ne 'A000012') {
809	0						$self->{'values_min'} = 0;
810	0						$self->{'values_max'} = 9;
811	0						$self->{'characteristic'}->{'digits'} = 10;
812							}
813
814	0	0					if (defined (my $description = $self->{'description'})) {
815	0	0					if ($description =~ /expansion of .* in base (\d+)/i) {
816	0						$self->{'values_min'} = 0;
817	0						$self->{'values_max'} = $1 - 1;
818	0						$self->{'characteristic'}->{'digits'} = $1;
819							}
820	0	0					if ($description =~ /^number of /i) {
821	0						$self->{'characteristic'}->{'count'} = 1;
822							}
823							}
824							}
825
826							sub _split_sample_values {
827	0			0			my ($self, $str) = @_;
828							### _split_sample_values(): $str
829	0	0	0				unless (defined $str && $str =~ m{^([0-9,-]\|\s)+$}) {
830	0	0					croak "Oops list of sample values not recognised in ",$self->{'filename'},"\n",
831							(defined $str ? $str : ());
832							}
833	0						$self->{'array'} = [ split /[, \t\r\n]+/, $str ];
834							}
835
836							sub _decode_html_charset {
837	0			0			my ($contents) = @_;
838
839							# eg.
840							# HTTP::Message has a blob of code for this, using the full HTTP::Parser,
841							# but a slack regexp should be enough for OEIS pages.
842							#
843	0	0	0				if (_HAVE_ENCODE
844							&& $contents =~ m{]+
845							http-equiv=[^>]+
846							content-type[^>]+
847							charset=([a-z0-9-_]+)}isx) {
848	0						return Encode::decode($1, $contents, Encode::FB_PERLQQ());
849							} else {
850	0						return $contents;
851							}
852							}
853
854							#------------------------------------------------------------------------------
855
856							# Similar bsearch to Search::Dict, but Search::Dict doesn't allow for
857							# comment lines at the start of the file or blank lines at the end.
858							#
859							#use Smart::Comments;
860
861							sub ith {
862	0			0	1		my ($self, $i) = @_;
863							### ith(): "$i cf fh_i=".($self->{'fh_i'} \|\| -999)
864
865	0	0					if (my $fh = $self->{'fh'}) {
866	0	0					if (! defined $self->{'next_seek'}) {
867	0						$self->{'next_seek'} = tell($fh);
868							}
869
870	0	0	0				if (defined $self->{'fh_i'} && $i <= $self->{'fh_i'} + 20) {
871							### fh_i is target ...
872	0	0					if (my ($line_i, $value) = _readline($self)) {
873	0	0					if ($line_i == $i) {
874	0						return $value;
875							}
876							}
877							}
878
879	0						my $lo = 0;
880	0						my $hi = -s $fh;
881	0						for (;;) {
882							### at: "lo=$lo hi=$hi consider mid=".int(($lo+$hi)/2)
883	0						my $mid = int(($lo+$hi)/2);
884	0						_seek ($self, $mid);
885
886	0	0					if (! defined(readline $fh)) {
887							### mid is EOF ...
888	0						last;
889							}
890							### skip partial line to: tell($fh)
891	0						$mid = tell($fh);
892	0	0					if ($mid >= $hi) {
893	0						last;
894							}
895
896	0	0					my ($line_i,$value) = _readline($self)
897							or last; # only blank lines between $mid and EOF, go linear
898
899							### $line_i
900							### $value
901	0	0					if ($line_i == $i) {
902							### found by binary search ...
903	0						return $value;
904							}
905	0	0					if ($line_i < $i) {
906							### line_i before the target, advance lo ...
907	0						$lo = tell($fh);
908							} else {
909							### line_i after target, reduce hi ...
910	0						$hi = $mid;
911							}
912							}
913
914	0						_seek ($self, $lo);
915	0						for (;;) {
916	0	0					my ($line_i,$value) = _readline($self)
917							or last;
918	0	0					if ($line_i == $i) {
919							### found by linear search ...
920	0						$self->{'fh_i'} = $line_i+1;
921	0						return $value;
922							}
923	0	0					if ($line_i > $i) {
924	0						return undef;
925							}
926							}
927	0						return undef;
928
929							} else {
930	0						$i -= $self->i_start;
931	0	0					unless ($i >= 0) {
932	0						return undef; # negative or NaN
933							}
934	0						return $self->{'array'}->[$i];
935							}
936							}
937
938							1;
939							__END__