File Coverage

blib/lib/Tie/CacheHash.pm
Criterion Covered Total %
statement 9 361 2.4
branch 0 292 0.0
condition 0 101 0.0
subroutine 3 23 13.0
pod 0 12 0.0
total 12 789 1.5


line stmt bran cond sub pod time code
1             # Tie::CacheHash -- Maintains sorted lists of top entries. -*- perl -*-
2             #
3             # Copyright 1999 by Jamie McCarthy
4             #
5             # This program is free software; you can redistribute it and/or modify it
6             # under the same terms as Perl itself.
7             #
8             # Version 0.50: First version, does what it's supposed to do,
9             # tested only in-house so far.
10              
11             ############################################################################
12             # Modules and declarations
13             ############################################################################
14              
15             package Tie::CacheHash;
16              
17             require 5.003;
18              
19 1     1   7466 use strict;
  1         2  
  1         39  
20 1         45 use vars qw(
21             $VERSION
22             $DEBUG
23 1     1   4 );
  1         2  
24              
25 1     1   5 use Carp;
  1         12  
  1         12016  
26              
27             # The version of this module is its CVS revision.
28             $VERSION = '0.50';
29             $DEBUG = 0;
30              
31             ############################################################################
32             # Tie methods
33             ############################################################################
34              
35             sub TIEHASH {
36 0     0     my($class, @args) = @_;
37 0   0       $class = ref($class) || $class;
38              
39 0           my $details = "see 'perldoc Tie::CacheHash' for details";
40 0 0         if (scalar(@args) != 1) {
41 0           Carp::croak("Tie::CacheHash requires exactly one argument,"
42             . " and it must be a reference to a hash - $details");
43             }
44              
45             # "ahr" = "argument hash ref"
46 0           my $ahr = $args[0];
47 0 0 0       if (!ref $ahr or ref $ahr ne 'HASH') {
48 0           Carp::croak("the argument passed to Tie::CacheHash must be a"
49             . " hash reference - $details");
50             }
51              
52 0 0         $ahr->{min} = '1%' if !defined($ahr->{min});
53 0 0         $ahr->{min_margin} = '1%' if !defined($ahr->{min_margin});
54 0 0         $ahr->{max_margin} = '5%' if !defined($ahr->{max_margin});
55 0 0         $ahr->{max} = '10%' if !defined($ahr->{max});
56 0 0         $ahr->{sort_func} = undef if !defined($ahr->{sort_func});
57 0 0         $ahr->{sort_rev} = 0 if !defined($ahr->{sort_rev});
58 0 0         $ahr->{scan_min} = 800 if !defined($ahr->{scan_min});
59 0 0         $ahr->{fudge} = 0.1 if !defined($ahr->{fudge});
60 0 0         $ahr->{sub_hash} = undef if !defined($ahr->{sub_hash});
61              
62 0           my $self = {
63              
64             h => { },
65             cache => [ ],
66              
67             # The user may set these variables directly, either in the
68             # arguments passed to tie() or BEFORE any data is stored in
69             # the hash. As soon as any data is stored in the hash,
70             # the user MUST NOT change these.
71              
72             min => $ahr->{min},
73             min_margin => $ahr->{min_margin},
74             max_margin => $ahr->{min_margin},
75             max => $ahr->{max},
76              
77             sort_func => $ahr->{sort_func},
78             sort_rev => $ahr->{sort_rev},
79              
80             # These variables are used to perform cache_remove efficiently.
81             # They may be edited by the user at any time.
82              
83             scan_min => $ahr->{scan_min},
84             fudge => $ahr->{fudge},
85              
86             # This is a convenience variable which the user should never
87             # write to but may read. It's faster than "scalar(keys %myhash)"
88             # (or at least it should be, haven't tested this).
89              
90             num_keys => 0,
91              
92             };
93 0           bless $self, $class;
94              
95 0 0         if ($ahr->{sub_hash}) {
96 0           $self->{h} = $ahr->{sub_hash};
97 0           $self->{num_keys} = undef;
98 0 0         if ($DEBUG) {
99 0           my $info = "num_keys=(not_yet_defined)";
100 0           my $href = ref $self->{h};
101 0 0         $info .= " ref=$href" if $href;
102 0           my $tied = tied %{$self->{h}};
  0            
103 0 0         if ($tied) {
104 0           my $tref = ref $tied;
105 0 0         $info .= " ref(tied())=$tref" if $tref;
106             }
107 0           print STDERR scalar(localtime) . " using reference to existing hash: $info\n";
108             }
109 0           $self->cache_rebuild();
110             }
111              
112 0           $self;
113             }
114              
115              
116             sub FETCH {
117 0 0   0     if ($DEBUG > 2) {
118 0 0         my $value = (defined($_[0]->{h}{$_[1]}) ? $_[0]->{h}{$_[1]} : '(undef)');
119 0           print STDERR scalar(localtime) . " FETCH key=$_[1] value=$value\n";
120             }
121             # FETCH is one function that must be very fast, so we
122             # streamline it by not even using any variables.
123 0           $_[0]->{h}{$_[1]};
124             }
125              
126             sub STORE {
127 0     0     my($self, $key, $value) = @_;
128              
129 0 0 0       return if defined($self->{h}{$key}) and $self->{h}{$key} eq $value;
130              
131 0 0         print STDERR scalar(localtime) . " STORE BEGIN key=$key value=$value oldvalue=" . ((defined($self->{h}{$key}) ? $self->{h}{$key} : '(undef)')) . " num_keys=$self->{num_keys}\n" if $DEBUG > 2;
    0          
132              
133 0 0         if (defined($self->{h}{$key})) {
134              
135             # There already exists a different value for this key. Remove
136             # it so we can store the new value.
137              
138 0           $self->DELETE($key);
139              
140 0 0         print STDERR scalar(localtime) . " STORE1 key=$key deletedvalue=" . ((defined($self->{h}{$key}) ? $self->{h}{$key} : '(undef)')) . "\n" if $DEBUG > 2;
    0          
141              
142             }
143              
144 0           $self->{h}{$key} = $value;
145 0           $self->{num_keys}++;
146              
147             # Should we insert this value into the cache? Only if it's less than
148             # the current cache_top, or if the size of the entire hash is less
149             # than the maximum cache size.
150              
151 0           my $cr = $self->{cache};
152 0           my $cache_top = $self->get_cache_top();
153 0           my $max = $self->cache_value('max');
154 0 0         print STDERR scalar(localtime) . " STORE2 key=$key value=$value cache_top=$cache_top cr=$#$cr max=$max cmp()=" . ($self->cmp($value, $cache_top)) . "\n" if $DEBUG > 2;
155 0 0 0       if ($self->{num_keys} <= $max
      0        
156             or !defined($cache_top)
157             or $self->cmp($value, $cache_top) <= 0) { # shouldn't matter whether this is "<0" or "<=0"
158 0           $self->cache_insert($key, $value);
159             }
160 0 0         print STDERR scalar(localtime) . " STORE3 key=$key cache_top=$cache_top cr=$#$cr\n" if $DEBUG > 2;
161 0 0 0       if ($DEBUG and int(rand(200)) == 0) {
162 0           $self->cache_sort_check("inserted $key $value");
163             }
164              
165 0 0         print STDERR scalar(localtime) . " STORE END key=$key value=$value cache_top=" . (defined($cache_top) ? $cache_top : '(undef)') . " cr=$#$cr max=$max num_keys=$self->{num_keys}\n" if $DEBUG > 2;
    0          
166             }
167              
168             sub DELETE {
169 0     0     my($self, $key) = @_;
170 0 0         return unless exists($self->{h}{$key});
171 0           my $value = $self->{h}{$key};
172              
173 0           my $cache_top = $self->get_cache_top();
174 0 0         print STDERR scalar(localtime) . " DELETE BEGIN key=$key value=$value cr=$#{$self->{cache}} cache_top=$cache_top\n" if $DEBUG > 2;
  0            
175 0 0 0       if (!defined($cache_top) or $self->cmp($value, $cache_top) <= 0) {
176 0 0         print STDERR scalar(localtime) . " DELETE key=$key before cache_remove cr=$#{$self->{cache}}\n" if $DEBUG > 2;
  0            
177 0           $self->cache_remove($key, $value);
178 0 0         print STDERR scalar(localtime) . " DELETE key=$key after cache_remove cr=$#{$self->{cache}}\n" if $DEBUG > 2;
  0            
179             }
180              
181 0           delete $self->{h}{$key};
182 0           $self->{num_keys}--;
183              
184 0 0 0       if ($DEBUG and int(rand(200)) == 0) {
185 0           $self->cache_sort_check("deleted $key $value $#{$self->{cache}}");
  0            
186             }
187 0 0         if ($self->cache_too_small()) {
188 0           $self->cache_rebuild();
189 0 0         $self->cache_sort_check("deleted $key $value $#{$self->{cache}} and rebuilt") if $DEBUG;
  0            
190             }
191              
192 0 0         if ($DEBUG > 2) {
193 0   0       my $cache_top = $self->get_cache_top() || '(none)';
194 0           print STDERR scalar(localtime) . " DELETE END"
195             . " key=$key value=$value"
196 0           . " cr=$#{$self->{cache}} cache_top=$cache_top\n";
197             }
198             }
199              
200             sub FIRSTKEY {
201 0     0     my($self) = @_;
202 0           scalar keys %{$self->{h}};
  0            
203 0           my @kv = $self->NEXTKEY;
204 0 0         return undef if !@kv;
205 0 0         return (wantarray ? @kv : $kv[0]) if @kv;
    0          
206             }
207              
208             sub NEXTKEY {
209 0     0     my($self) = @_;
210 0           my @kv = each %{$self->{h}};
  0            
211 0 0         return undef if !@kv;
212 0 0         return (wantarray ? @kv : $kv[0]) if @kv;
    0          
213             }
214              
215             sub EXISTS {
216 0     0     my($self, $key) = @_;
217 0 0 0       return 1 if $key and exists($self->{h}{$key});
218             }
219              
220             # CLEAR gets called when someone wants to erase all our data. This
221             # is a pretty powerful command.
222              
223             sub CLEAR {
224 0     0     my($self) = @_;
225 0           %{$self->{h}} = ( );
  0            
226 0           $self->{cache} = [ ];
227 0           $self->{num_keys} = 0;
228 0           $self->cache_rebuild(); # not necessary but doesn't hurt
229             }
230              
231             ############################################################################
232             # Internal-use-only methods.
233             ############################################################################
234              
235             sub cmp {
236 0     0 0   my $self = shift @_;
237 0           local($Tie::Cache::a, $Tie::Cache::b);
238 0 0         if ($self->{sort_rev}) {
239 0           ($Tie::Cache::b, $Tie::Cache::a) = @_;
240             } else {
241 0           ($Tie::Cache::a, $Tie::Cache::b) = @_;
242             }
243 0           my $retval = 0;
244 0           my $sf = $self->{sort_func};
245 0           my $ref_sf = ref $sf;
246              
247             # Always sort undef to the beginning. (Maybe there should be an
248             # option to switch this to the end?)
249 0 0         if (!defined($Tie::Cache::a)) {
    0          
250 0 0         if (!defined($Tie::Cache::b)) {
251 0           $retval = 0;
252             } else {
253 0           $retval = -1;
254             }
255             } elsif (!defined($Tie::Cache::b)) {
256 0           $retval = 1;
257             } else {
258              
259 0 0 0       if (!defined($sf) or ($ref_sf and $ref_sf ne 'CODE')) {
      0        
260 0           $retval = ($Tie::Cache::a cmp $Tie::Cache::b);
261             } else {
262 0 0         if ($ref_sf) {
    0          
263 0           $retval = &$sf;
264             } elsif (not ref $sf) {
265 0 0         SWITCH: {
266 0           $retval = ($Tie::Cache::a <=> $Tie::Cache::b), last SWITCH if $sf eq '<=>';
267             # Insert additional defined sort_func strings here.
268 0           $retval = ($Tie::Cache::a cmp $Tie::Cache::b);
269             }
270             } else {
271 0           $retval = ($Tie::Cache::a cmp $Tie::Cache::b);
272             }
273             }
274              
275             }
276              
277 0           $retval;
278             }
279              
280             sub get_cache_top {
281 0     0 0   my($self) = @_;
282 0           my $cr = $self->{cache};
283 0 0         return undef if $#$cr < 0;
284 0           $self->{h}{$cr->[$#$cr]};
285             }
286              
287             sub cache_pos {
288 0     0 0   my($self, $key, $value) = @_;
289              
290 0           my $cr = $self->{cache};
291 0           my($min, $max) = (0, $#$cr+1);
292 0           MINMAX: while ($min <= $max) {
293 0 0         last MINMAX if $min > $#$cr; # Here, ">" seems to work where ">=" does not.
294 0           my $mid = int(($min+$max)/2);
295 0           my $mid_key = $cr->[$mid];
296 0           my $mid_value = $self->{h}{$mid_key};
297 0           my $cmp = $self->cmp($value, $mid_value);
298 0 0         if (!$cmp) {
299 0           $cmp = ($key cmp $mid_key);
300             }
301              
302 0 0         if ($cmp < 0) {
    0          
303 0           $max = $mid-1; # Is this wrong, should it be just "$max=$mid"?
304             } elsif ($cmp > 0) {
305 0           $min = $mid+1;
306             } else {
307 0           $min = $mid, last MINMAX; # same key, same value
308             }
309             }
310              
311             # If there is a run of two or more keys in the cache with the same
312             # value, we may have binary-jumped right into the middle.
313             # Deciding arbitrarily whether to go up or down one key doesn't
314             # necessarily get us to exactly the right place. We have to scan
315             # backward or forward until either the value changes or our key
316             # "fits." (Actually, I'm not sure this section is necessary.
317             # Now that we're properly doing a bi-level, well-defined sort,
318             # there should be no need for such scanning.)
319              
320 0 0         if ($DEBUG > 2) {
321 0 0 0       my $keyleft = ($min ? ($cr->[$min-1] || '') : '');
322 0 0         my $valleft = ($keyleft ? $self->{h}{$keyleft} : '');
323 0   0       my $keymin = $cr->[$min] || '';
324 0 0         my $valmin = ($keymin ? $self->{h}{$keymin} : '');
325 0   0       my $keyright = $cr->[$min+1] || '';
326 0 0         my $valright = ($keyright ? $self->{h}{$keyright} : '');
327 0           print STDERR scalar(localtime) . " cache_pos BEGIN key=$key value=$value initial_cache_pos=$min: ($keyleft/$valleft) ($keymin/$valmin) ($keyright/$valright)\n";
328             }
329              
330 0 0 0       if ($min > 0 and $min < $#$cr
      0        
331             and $self->cmp( $value, $self->{h}{$cr->[$min]} ) == 0) {
332              
333 0   0       while ($min > 0
      0        
334             and $self->cmp( $value, $self->{h}{$cr->[$min-1]} ) == 0
335             and $cr->[$min-1] ge $key) {
336 0           print STDERR scalar(localtime) . " cache_pos key=$key have to decrement min: $min ($cr->[$min-1] >= $key) ($value) (was $cr->[$min]/$self->{h}{$cr->[$min]})\n";
337 0           --$min;
338             }
339 0   0       while ($min < $#$cr
      0        
340             and $self->cmp( $value, $self->{h}{$cr->[$min+1]} ) == 0
341             and $key ge $cr->[$min+1]) {
342 0           print STDERR scalar(localtime) . " cache_pos key=$key have to increment min: $min ($key >= $cr->[$min+1]) ($value) (was $cr->[$min]/$self->{h}{$cr->[$min]})\n";
343 0           ++$min;
344             }
345              
346             }
347              
348 0 0         if ($DEBUG) {
349              
350             # Let's do some sanity checking.
351              
352             # Here, min is the location where the value goes.
353 0 0 0       if ($min < 0 or $min > $#$cr + 1 or $min != int($min)) {
      0        
354 0           warn "logic err 0 invalid value for min '$min' '$#$cr'";
355             }
356 0 0         if ($#$cr == -1) {
    0          
357             # Empty cache array.
358 0 0         if ($min != 0) {
359 0           warn "logic err 1 only one place to go '$min' '$value'";
360             }
361             } elsif ($#$cr == 0) {
362             # One item in the cache array, we're either before it or after it.
363 0 0 0       if ( $min == 0 and $self->cmp($value, $self->{h}{$cr->[0]}) > 0) {
    0 0        
    0 0        
364 0           warn "logic err 2 should be before single item '$min' '$value' '$self->{h}{$cr->[0]}'";
365             } elsif ($min == 1 and $self->cmp($self->{h}{$cr->[0]}, $value) > 0) {
366 0           warn "logic err 3 should be after single item '$min' '$value' '$self->{h}{$cr->[0]}'";
367             } elsif ($min < 0 or $min > 1) {
368 0           warn "logic err 4 bogus min '$min' '$value' '$self->{h}{$cr->[0]}'";
369             }
370             } else {
371 0 0 0       if ( $min == 0 and $self->cmp($value, $self->{h}{$cr->[0]}) > 0) {
    0 0        
    0 0        
372 0           warn "logic err 5 wrongly at beginning '$value' '$self->{h}{$cr->[0]})'";
373             } elsif ($min == $#$cr+1 and $self->cmp($self->{h}{$cr->[$#$cr]}, $value) > 0) {
374 0           warn "logic err 6 wrongly at end '$value' '$self->{h}{$cr->[$#$cr]})'";
375             } elsif ($min > 0 and $min <= $#$cr) {
376 0 0         if ($self->cmp($self->{h}{$cr->[$min-1]}, $value) > 0) {
377 0           warn "logic err 7 preceding entry larger $min $#$cr $value '"
378 0           . join(' ', map { $self->{h}{$cr->[$_]} } ( $min-2 .. $min+2 ) )
379             . "'";
380             }
381 0 0         if ($self->cmp($value, $self->{h}{$cr->[$min]}) > 0) {
382 0           warn "logic err 8 succeeding entry smaller '$value' '$self->{h}{$cr->[$min]}'";
383             }
384             }
385             }
386              
387             }
388              
389 0 0         print STDERR scalar(localtime) . " cache_pos END key=$key value=$value cache_pos=$min\n" if $DEBUG > 2;
390              
391 0           $min;
392             }
393              
394             sub cache_insert {
395 0     0 0   my($self, $key, $value) = @_;
396 0           my $cr = $self->{cache};
397 0 0         print STDERR scalar(localtime) . " cache_insert key=$key value=$value \$\#\$cr=$#$cr\n" if $DEBUG > 2;
398 0           my $cache_pos = $self->cache_pos($key, $value);
399 0 0         print STDERR scalar(localtime) . " cache_insert key=$key cache_pos=$cache_pos\n" if $DEBUG > 2;
400 0           my @replacement_keys = ($key, @$cr[$cache_pos..$#$cr]);
401 0 0         print STDERR scalar(localtime) . " cache_insert key=$key \$\#replacement_keys=$#replacement_keys\n" if $DEBUG > 2;
402 0           splice(@$cr, # array
403             $cache_pos, # offset
404             $#$cr - $cache_pos + 1, # length
405             @replacement_keys
406             );
407 0 0         print STDERR scalar(localtime) . " cache_insert key=$key \$\#\$cr=$#$cr\n" if $DEBUG > 2;
408 0 0         if ($self->cache_too_large()) {
409 0 0         print STDERR scalar(localtime) . " cache_insert key=$key cache_too_large\n" if $DEBUG > 2;
410             # Pop the last item.
411 0           $#$cr--;
412             }
413 0 0         print STDERR scalar(localtime) . " cache_insert key=$key done\n" if $DEBUG > 2;
414             }
415              
416             sub cache_remove {
417 0     0 0   my($self, $key, $value) = @_;
418 0           my $cr = $self->{cache};
419 0           my $cache_pos = $self->cache_pos($key, $value);
420              
421             # We don't know whether the cache position returned is of the
422             # actual key we're looking for, or of where it would go if it
423             # weren't (incorrectly) missing from the cache.
424              
425 0 0 0       if ($cache_pos <= $#$cr and $cr->[$cache_pos] eq $key) {
426 0           splice(@$cr, # array
427             $cache_pos, # offset
428             1 # length
429             ); # replacement (none)
430             }
431             }
432              
433             sub cache_value {
434 0     0 0   my($self, $field) = @_;
435 0           my $value = $self->{$field};
436 0           my $round = 0.5;
437 0 0         $round = 0 if $field eq 'min';
438 0 0         if ($value =~ /^(\d+(?:\.\d*)?|\.\d+)\%$/) {
439 0           $value = int($self->{num_keys} * $1 + $round)
440             }
441 0           $value;
442             }
443              
444             sub cache_too_small {
445             # The cache is too small and MUST be rebuilt if it is both
446             # smaller than the min variable and smaller than the
447             # size of the entire hash.
448 0     0 0   my($self) = @_;
449 0           my $cr = $self->{cache};
450 0           my $min = $self->cache_value('min');
451 0 0         $#$cr+1 < $min and $#$cr+1 < $self->{num_keys};
452             }
453              
454             sub cache_may_rebuild {
455             # The cache is small enough and MAY be rebuilt if it is both
456             # smaller than the (min+min_margin) variable and smaller than the
457             # size of the entire hash.
458 0     0 0   my($self) = @_;
459 0           my $cr = $self->{cache};
460 0           my $min = $self->cache_value('min');
461 0           my $min_margin = $self->cache_value('min_margin');
462 0           my $small_enuf = $min + $min_margin;
463 0           my $max = $self->cache_value('max');
464 0 0         $small_enuf = $max - 1 if $small_enuf > $max - 1;
465 0 0         $#$cr <= $small_enuf and $#$cr+1 < $self->{num_keys};
466             }
467              
468             sub cache_may_accept {
469             # The cache is large enough and MAY be accepted if it is
470             # larger than the (max-max_margin) variable or equal to the
471             # size of the entire hash.
472 0     0 0   my($self) = @_;
473 0           my $cr = $self->{cache};
474 0           my $max = $self->cache_value('max');
475 0           my $max_margin = $self->cache_value('max_margin');
476 0           my $large_enuf = $max - $max_margin;
477 0           my $min = $self->cache_value('min');
478 0 0         $large_enuf = $min + 1 if $large_enuf < $min + 1;
479 0 0         $#$cr >= $large_enuf or $#$cr+1 == $self->{num_keys};
480             }
481              
482             sub cache_too_large {
483             # The cache is too large and MUST be shrunk if it is
484             # larger than the max variable.
485 0     0 0   my($self) = @_;
486 0           my $cr = $self->{cache};
487 0           my $max = $self->cache_value('max');
488 0           $#$cr >= $max;
489             }
490              
491             sub cache_rebuild {
492 0     0 0   my($self) = @_;
493 0           my($start_time, $elapsed_time);
494 0           my $cr = $self->{cache};
495 0 0         if ($DEBUG > 1) {
496 0           $start_time = time;
497             }
498              
499 0           my @scanned = ( );
500 0           my @unscanned = keys %{$self->{h}}; # This can take a while but there's no way around it.
  0            
501 0 0         if (!defined($self->{num_keys})) {
502 0           $self->{num_keys} = $#unscanned+1;
503             }
504              
505 0 0         if ($DEBUG > 1) {
506 0           my @sort_unscanned = sort @unscanned;
507 0           my @sort_unscanned_print;
508 0 0         if ($#sort_unscanned > 10) {
509 0           @sort_unscanned_print = (@sort_unscanned[0..3], '...', @sort_unscanned[-4..-1]);
510             } else {
511 0           @sort_unscanned_print = @sort_unscanned;
512             }
513 0           print STDERR scalar(localtime) . " cache_rebuild"
514             . " $#sort_unscanned: @sort_unscanned_print\n";
515             }
516              
517 0           my $do_it_the_stupid_way = 0;
518 0           my $max = $self->cache_value('max');
519              
520 0 0         if ($self->{num_keys} <= $max*(1+$self->{fudge})) {
521              
522 0           $do_it_the_stupid_way = 1;
523              
524             } else {
525              
526             # Do Monte Carlo sampling in order to sort as little as possible.
527              
528 0           my $key;
529 0 0 0       $self->{scan_min} = 10 if !$self->{scan_min} or $self->{scan_min} < 10; # sanity check
530 0           my $desired_fraction = ($max/$self->{num_keys}) * (1+$self->{fudge});
531 0           my $max_num_to_scan = int($self->{num_keys}/2);
532 0           my $num_to_scan = $self->{scan_min}; # If too large, will be reduced below.
533              
534             # Repeat the following attempts until Monte fails us and we
535             # must finally give up.
536              
537 0           my $n_failures = 0;
538 0           my $success = 0;
539 0   0       while (!$success and !$do_it_the_stupid_way) {
540              
541 0 0         print STDERR scalar(localtime) . " MONTE1"
542             . " num_to_scan=$num_to_scan frac=$desired_fraction"
543             . " \$\#scanned=$#scanned \$\#unscanned=$#unscanned"
544             . " num_keys=$self->{num_keys} \$\#\$cr=$#$cr\n" if $DEBUG > 1;
545              
546             # Gather a pseudorandom (thanks to the hashing algorithm)
547             # sampling of keys.
548 0 0         $num_to_scan = $max_num_to_scan - ($#scanned+1)
549             if $num_to_scan >= $max_num_to_scan - $#scanned;
550 0 0         $num_to_scan = $#unscanned+1
551             if $num_to_scan > $#unscanned+1;
552 0 0         if ($num_to_scan) {
553 0           my $start_scanned = $#scanned;
554 0 0         print STDERR scalar(localtime) . " MONTE1a"
555             . " start_scanned=$start_scanned"
556             . " num_to_scan=$num_to_scan\n" if $DEBUG > 1;
557             # To make it even more random, try taking a random section out of
558             # the source array. It shouldn't matter unless the hash
559             # algorithm is pathological, but who knows, someday it might
560             # do some good.
561 0           my $unscanned_start = 0;
562 0 0         if ($#unscanned >= $num_to_scan) {
563 0           $unscanned_start = int(rand($#unscanned-$num_to_scan+1)); # could be +2
564             }
565 0           splice(@scanned, $#scanned+1, 0,
566             splice(@unscanned, $unscanned_start, $num_to_scan)
567             );
568 0           $num_to_scan = 0;
569 0 0         print STDERR scalar(localtime) . " MONTE2"
570             . " unscanned_start=$unscanned_start num_to_scan=$num_to_scan"
571             . " \$\#scanned=$#scanned \$\#unscanned=$#unscanned\n" if $DEBUG > 1;
572             }
573              
574             # Make a guess at the "max value" we should accumulate into our
575             # soon-to-be-cache.
576             # BUG: This sorted array need not be rebuilt if %monte has not
577             # changed since the last time through this loop.
578 0 0         my @sorted_scanned = sort {
579 0           $self->cmp( $self->{h}{$a}, $self->{h}{$b} )
580             ||
581             $a cmp $b
582             } @scanned;
583              
584 0 0         if ($#sorted_scanned >= 0) {
585              
586 0           my $guess_max_key_index = int( $#sorted_scanned * $desired_fraction + 0.5 );
587 0           $guess_max_key_index += $n_failures;
588 0 0         $guess_max_key_index = 1 if $guess_max_key_index == 0; # never use the lowest scanned
589 0 0         $guess_max_key_index = $#sorted_scanned
590             if $guess_max_key_index > $#sorted_scanned;
591 0           my $guess_max_key = $sorted_scanned[$guess_max_key_index];
592 0           my $guess_max_value = $self->{h}{$guess_max_key};
593             # BUG: if this isn't our first time through this loop,
594             # make sure we're not using the exact same key value as
595             # we used last time - if so, scan $guess_max_key_index up
596             # thru the list until we get to the next largest value.
597              
598 0 0         print STDERR scalar(localtime) . " MONTE3"
599             . " guess_max_key_index=$guess_max_key_index"
600             . " guess_max_key=$guess_max_key"
601             . " guess_value=$guess_max_value\n" if $DEBUG > 1;
602              
603             # Accumulate any duples less than that max value into our cache.
604 0           @$cr = ( );
605 0           for $key (@scanned) {
606 0 0         push @$cr, $key if $self->cmp( $self->{h}{$key}, $guess_max_value ) <= 0;
607             }
608 0           for $key (@unscanned) {
609 0 0         push @$cr, $key if $self->cmp( $self->{h}{$key}, $guess_max_value ) <= 0;
610             }
611              
612 0 0         print STDERR scalar(localtime) . " MONTE4 \$\#\$cr=$#$cr\n" if $DEBUG > 1;
613              
614 0 0         if ($self->cache_may_accept()) {
615              
616             # Hey, it worked, we have enough data. Sort it, trim it, and
617             # we're done.
618 0 0         print STDERR scalar(localtime) . " MONTE5 sorting"
619             . " array size \$\#\$cr=$#$cr"
620             . " (max=$max):"
621             . " @$cr[0..4]\n" if $DEBUG > 1;
622 0 0         @$cr = sort {
623 0           $self->cmp( $self->{h}{$a}, $self->{h}{$b} )
624             ||
625             $a cmp $b
626             } @$cr;
627 0 0         $#$cr = $max-1 if $#$cr >= $max;
628 0           $success = 1;
629 0 0         print STDERR scalar(localtime) . " MONTE6 SUCCESS"
630             . " \$\#\$cr=$#$cr: @$cr[0..4]\n" if $DEBUG > 1;
631              
632             }
633              
634             }
635              
636 0 0         if (!$success) {
637              
638             # Well, that didn't work, either because there were no monte
639             # keys or because our guess didn't net us enough cache keys.
640             # Either way, we need to try again. Pull a few more keys into
641             # our pseudorandom sampling to try to make it more accurate
642             # (though don't go beyond double the original scan_min; the
643             # point of rapidly diminishing returns is below 1000 samples).
644             # Then we kick up the fraction by an appropriate amount,
645             # at least the fudge factor. Increasing the fraction is
646             # the thing that will really get us more data next time.
647             # Then repeat.
648              
649 0           my $fraction_multiplier = $max/($#$cr+2);
650 0 0         $fraction_multiplier = 2 if $fraction_multiplier > 2;
651 0 0         $fraction_multiplier = 1+$self->{fudge} if $fraction_multiplier < 1+$self->{fudge};
652 0           $desired_fraction *= $fraction_multiplier;
653 0 0         if ($desired_fraction > 0.8) {
654             # Screw it. Just scan the whole hash.
655 0           $do_it_the_stupid_way = 1;
656             }
657 0 0         if ($#scanned < $self->{scan_min}*1.9) {
658 0           $num_to_scan = int($self->{scan_min}/2);
659             }
660 0 0         print STDERR scalar(localtime) . " MONTE7 failure"
661             . " frac_mul=$fraction_multiplier"
662             . " frac=$desired_fraction"
663             . " num_to_scan=$num_to_scan"
664             . " stupid=$do_it_the_stupid_way\n" if $DEBUG > 1;
665              
666             }
667            
668             }
669              
670             }
671              
672 0 0         if ($do_it_the_stupid_way) {
673              
674             # This would be the stupid way of doing it. The point of this module
675             # is not to do it the stupid way when the number of keys in the hash
676             # gets large. Looks like we suck! Oh well!
677              
678 0 0         @$cr = sort {
679 0           $self->cmp( $self->{h}{$a}, $self->{h}{$b} )
680             ||
681             $a cmp $b
682             } @unscanned;
683 0 0         if ($#$cr >= $max) {
684 0           $#$cr = $max-1;
685             }
686              
687             }
688              
689 0 0         if ($DEBUG > 1) {
690 0           $elapsed_time = time - $start_time;
691             }
692 0 0         if ($DEBUG) {
693 0           $self->cache_sort_check();
694             }
695 0 0         if ($DEBUG > 1) {
696 0           my $total_elapsed_time = time - $start_time;
697 0           my $n_cache_keys = $#$cr + 1;
698 0           my $n_hash_keys = scalar(keys %{$self->{h}});
  0            
699 0           print STDERR scalar(localtime) . " rebuilt cache ($n_cache_keys/$n_hash_keys keys) in $elapsed_time seconds (counting the check, $total_elapsed_time seconds)\n";
700             }
701             }
702              
703             sub cache_sort_check {
704 0     0 0   my($self, $info) = @_;
705 0 0         return unless $DEBUG;
706 0 0         $info = '' if !$info;
707 0           my $error = 0;
708 0           my $cr = $self->{cache};
709 0 0         if ($#$cr > 0) {
710 0           my $lastval = $self->{h}{$cr->[0]};
711 0           my $i;
712 0           for $i (1..$#$cr) {
713 0           my $newval = $self->{h}{$cr->[$i]};
714 0 0 0       if (!defined($newval) or !defined($lastval) or $self->cmp($newval, $lastval) < 0) {
      0        
715 0 0         $newval = '(undef)' if !defined($newval);
716 0 0         $lastval = '(undef)' if !defined($lastval);
717 0           $error = "\nnewval ($newval) < lastval ($lastval) at pos $i/$#$cr: $info\n";
718 0           last;
719             }
720 0           $lastval = $newval;
721             }
722             }
723 0 0 0       if ($#$cr >= 0 and !$error) {
724 0           my %cache = map { $_ => 1 } @$cr;
  0            
725 0           my $key;
726 0           my $pivot_value = $self->{h}{$cr->[$#$cr]};
727 0           for $key (keys %{$self->{h}}) {
  0            
728 0 0         if ($cache{$key}) {
729             # It's in the cache so its value should be small.
730 0 0         if ($self->cmp( $self->{h}{$key}, $pivot_value ) > 0) {
731 0           $error = "\nkey=$key value=$self->{h}{$key} in cache but"
732             . " larger than $cr->[$#$cr]: $info\n";
733 0           last;
734             }
735             } else {
736             # It's not in the cache so its value should be large.
737 0 0         if ($self->cmp( $self->{h}{$key}, $pivot_value ) < 0) {
738 0           $error = "\nkey=$key value=$self->{h}{$key} not in cache but"
739             . " smaller than key=$cr->[$#$cr] value=$pivot_value: $info\n";
740 0           last;
741             }
742             }
743             }
744             }
745 0 0         if ($error) {
746 0           my $j;
747 0           for $j (0..$#$cr) {
748 0 0         my $key = defined($cr->[$j]) ? $cr->[$j] : '';
749 0 0         my $val = ''; $val = defined($self->{h}{$key}) ? $self->{h}{$key} : '' if $key;
  0 0          
750 0           print STDERR "cache \#$j\t$key\t$val\n";
751             }
752 0           croak $error;
753             }
754             }
755              
756             ############################################################################
757             # Wrap-up
758             ############################################################################
759              
760             # Make sure the module returns true.
761             1;
762              
763             __DATA__