File Coverage

blib/lib/Tree/Trie.pm
Criterion Covered Total %
statement 324 348 93.1
branch 161 188 85.6
condition 43 63 68.2
subroutine 23 23 100.0
pod 11 11 100.0
total 562 633 88.7


line stmt bran cond sub pod time code
1             # Tree::Trie, a module implementing a trie data structure.
2             # A formal description of tries can be found at:
3             # http://www.cs.queensu.ca/home/daver/235/Notes/Tries.pdf
4              
5             package Tree::Trie;
6              
7 4     4   117014 use strict;
  4         11  
  4         154  
8 4     4   25 use warnings;
  4         6  
  4         199  
9              
10             our $VERSION = "1.9";
11              
12             # A handful of helpful constants
13 4     4   23 use constant DEFAULT_END_MARKER => '';
  4         12  
  4         365  
14              
15 4     4   22 use constant BOOLEAN => 0;
  4         7  
  4         181  
16 4     4   21 use constant CHOOSE => 1;
  4         5  
  4         236  
17 4     4   20 use constant COUNT => 2;
  4         4  
  4         149  
18 4     4   31 use constant PREFIX => 3;
  4         14  
  4         167  
19 4     4   21 use constant EXACT => 4;
  4         7  
  4         16806  
20              
21             ## Public methods begin here
22              
23             # The constructor method. It's very simple.
24             sub new {
25 16     16 1 3202 my($proto) = shift;
26 16         31 my($options) = shift;
27 16   33     220 my($class) = ref($proto) || $proto;
28 16         36 my($self) = {};
29 16         78 bless($self, $class);
30 16         64 $self->{_MAINHASHREF} = {};
31             # These are default values
32 16         55 $self->{_END} = &DEFAULT_END_MARKER;
33 16         36 $self->{_DEEPSEARCH} = CHOOSE;
34 16         26 $self->{_FREEZE_END} = 0;
35 16 100 66     60 unless ( defined($options) && (ref($options) eq "HASH") ) {
36 15         25 $options = {};
37             }
38 16         75 $self->deepsearch($options->{'deepsearch'});
39 16 100       110 if (exists $options->{end_marker}) {
40 1         5 $self->end_marker($options->{end_marker});
41             }
42 16 100       42 if (exists $options->{freeze_end_marker}) {
43 1         4 $self->freeze_end_marker($options->{freeze_end_marker});
44             }
45 16         135 return($self);
46             }
47              
48             # Sets the value of the end marker, for those people who think they know
49             # better than Tree::Trie. Note it does not allow the setting of single
50             # character end markers.
51             sub end_marker {
52 6     6 1 10 my $self = shift;
53 6 100 66     34 if ($_[0] && length $_[0] > 1) {
54             # If they decide to set a new end marker, we have to be sure to
55             # go through and update all existing markers.
56 5         7 my $newend = shift;
57 5         11 my @refs = ($self->{_MAINHASHREF});
58 5         15 while (@refs) {
59 35         64 my $ref = shift @refs;
60 35         38 for my $key (keys %{$ref}) {
  35         93  
61 41 100       151 if ($key eq $self->{_END}) {
62 11         23 $ref->{$newend} = $ref->{$key};
63 11         38 delete $ref->{$key};
64             }
65             else {
66 30         92 push(@refs, $ref->{$key});
67             }
68             }
69             }
70 5         10 $self->{_END} = $newend;
71             }
72 6         13 return $self->{_END};
73             }
74              
75             # Sets the option to not attempt to update the end marker based on added
76             # letters.
77             # The above is the most awkward sentence I have ever written.
78             sub freeze_end_marker {
79 2     2 1 482 my $self = shift;
80 2 50       7 if (scalar @_) {
81 2 100       6 if (shift) {
82 1         2 $self->{_FREEZE_END} = 1;
83             }
84             else {
85 1         3 $self->{_FREEZE_END} = 0;
86             }
87             }
88 2         6 return $self->{_FREEZE_END};
89             }
90              
91             # Sets the value of the deepsearch parameter. Can be passed either words
92             # describing the parameter, or their numerical equivalents. Legal values
93             # are:
94             # boolean => 0
95             # choose => 1
96             # count => 2
97             # prefix => 3
98             # exact => 4
99             # See the POD for the 'lookup' method for details on this option.
100             sub deepsearch {
101 28     28 1 54 my($self) = shift;
102 28         51 my($option) = shift;
103 28 100       64 if(defined($option)) {
104 12 100 66     177 if ($option eq BOOLEAN || $option eq 'boolean') {
    100 66        
    100 66        
    100 66        
    50 33        
105 2         7 $self->{_DEEPSEARCH} = BOOLEAN;
106             }
107             elsif ($option eq CHOOSE || $option eq 'choose') {
108 4         9 $self->{_DEEPSEARCH} = CHOOSE;
109             }
110             elsif ($option eq COUNT || $option eq 'count') {
111 3         7 $self->{_DEEPSEARCH} = COUNT;
112             }
113             elsif ($option eq PREFIX || $option eq 'prefix') {
114 2         5 $self->{_DEEPSEARCH} = PREFIX;
115             }
116             elsif ($option eq EXACT || $option eq 'exact') {
117 1         2 $self->{_DEEPSEARCH} = EXACT;
118             }
119             }
120 28         75 return $self->{_DEEPSEARCH};
121             }
122              
123             # The add() method takes a list of words as arguments and attempts to add
124             # them to the trie. In list context, returns a list of words successfully
125             # added. In scalar context, returns a count of these words. As of this
126             # version, the only reason a word can fail to be added is if it is already
127             # in the trie. Or, I suppose, if there was a bug. :)
128             sub add {
129 13     13 1 145 my($self) = shift;
130 13         37 my(@words) = @_;
131              
132 13         306 my @retarray;
133 13         20 my $retnum = 0;
134              
135             # Process each word...
136 13         24 for my $word (@words) {
137             # And just call the internal thingy for it.
138 53 100       113 if ($self->_add_internal($word, undef)) {
139             # Updating return values as needed
140 50 50       83 if (wantarray) {
141 0         0 push(@retarray,$word);
142             }
143             else {
144 50         263 $retnum++;
145             }
146             }
147             }
148             # When done, return results.
149 13 50       71 return (wantarray ? @retarray : $retnum);
150             }
151              
152             # add_data() takes a hash of word => data pairs, adds the words to the trie and
153             # associates the data to those words.
154             sub add_data {
155 7     7 1 67 my($self) = shift;
156 7         11 my($retnum, @retarray);
157 7         9 my $word = "";
158             # Making sure that we've gotten data in pairs. Can't just turn @_
159             # into %data, because that would stringify arrayrefs
160 7   66     44 while(defined($word = shift) && @_) {
161             # This also just uses the internal add method.
162 13 100       35 if ($self->_add_internal($word, shift())) {
163 12 50       24 if (wantarray) {
164 0         0 push(@retarray, $word);
165             }
166             else {
167 12         48 $retnum++;
168             }
169             }
170             }
171 7 50       18 return @retarray if wantarray;
172 7         31 return $retnum;
173             }
174              
175             # add_all() takes one or more other tries and adds all of their entries
176             # to the trie. If both tries have data stored for the same key, the data
177             # from the trie on which this method was invoked will be overwritten. I can't
178             # think of anything useful to return from this method, so it has no return
179             # value. If you can think of anything that would make sense, please let me
180             # know.
181             # This idea and most of its implementation come from Aaron Stone.
182             # Thanks!
183             sub add_all {
184 2     2 1 11 my $self = shift;
185 2         5 for my $trie (@_) {
186 2   66     13 my $ignore_end = (
187             $self->{_FREEZE_END} ||
188             ($self->{_END} eq $trie->{_END})
189             );
190 2         16 my @nodepairs = ({
191             from => $trie->{_MAINHASHREF},
192             to => $self->{_MAINHASHREF},
193             });
194 2         7 while (scalar @nodepairs) {
195 34         49 my $np = pop @nodepairs;
196 34         34 for my $letter (keys %{$np->{from}}) {
  34         82  
197 39 100       128 unless ($ignore_end) {
198 9 100       23 if ($letter eq $self->{_END}) {
199 1         7 $self->end_marker($self->_gen_new_marker(
200             bad => [$letter],
201             ));
202             }
203             }
204 39 100       350 if ($letter eq $trie->{_END}) {
205 7         434 $np->{to}{$self->{_END}} = $np->{from}{$trie->{_END}};
206             }
207             else {
208 32 100       63 unless (exists $np->{to}{$letter}) {
209 24         79 $np->{to}{$letter} = {};
210             }
211 32         239 push @nodepairs, {
212             from => $np->{from}{$letter},
213             to => $np->{to}->{$letter},
214             };
215             }
216             }
217             }
218             }
219             }
220              
221             # delete_data() takes a list of words in the trie and deletes the associated
222             # data from the internal data store. In list context, returns a list of words
223             # whose associated data have been removed -- in scalar context, returns a count
224             # thereof.
225             sub delete_data {
226 1     1 1 5 my($self, @words) = @_;
227 1         2 my($retnum, @retarray) = 0;
228 1         3 my @letters;
229             # Process each word...
230 1         3 for my $word (@words) {
231 3 50       13 if (ref($word) eq 'ARRAY') {
232 0         0 @letters = (@{$word});
  0         0  
233             }
234             else {
235 3         12 @letters = split(//, $word);
236             }
237 3         6 my $ref = $self->{_MAINHASHREF};
238             # Walk down the tree...
239 3         6 for my $letter (@letters) {
240 8 100       20 if ($ref->{$letter}) {
241 7         16 $ref = $ref->{$letter};
242             }
243             else {
244             # This will cause the test right after this loop to fail and
245             # skip the the next word -- we want that because if we're here
246             # it means the word isn't in the trie.
247 1         2 $ref = {};
248 1         3 last;
249             }
250             }
251 3 100       14 next unless (exists $ref->{$self->{_END}});
252             # This is all we need to do to clear out the data
253 2         6 $ref->{$self->{_END}} = undef;
254 2 50       6 if (wantarray) {
255 0         0 push(@retarray, $word);
256             }
257             else {
258 2         5 $retnum++;
259             }
260             }
261 1 50       8 if (wantarray) {
262 0         0 return @retarray;
263             }
264             else {
265 1         6 return $retnum;
266             }
267             }
268              
269             # The lookup() method searches for words (or beginnings of words) in the trie.
270             # It takes a single word as an argument and, in list context, returns a list
271             # of all the words in the trie which begin with the given word. In scalar
272             # context, the return value depends on the value of the deepsearch parameter.
273             # An optional second argument is available: This should be a numerical
274             # argument, and specifies 2 things: first, that you want only word suffixes
275             # to be returned, and second, the maximum length of those suffices. All
276             # other configurations still apply. See the POD on this method for more
277             # details.
278             sub lookup {
279 37     37 1 2723 my($self) = shift;
280 37         52 my($word) = shift;
281             # This is the argument for doing suffix lookup.
282 37         43 my($suff_length) = shift;
283              
284             # Abstraction is kind of cool
285 37         93 return $self->_lookup_internal(
286             word => $word,
287             suff_len => $suff_length,
288             want_arr => wantarray(),
289             data => 0,
290             );
291             }
292              
293             # lookup_data() works basically the same as lookup, with the following
294             # exceptions -- in list context, returns a hash of ward => data pairings,
295             # and in scalar context, wherever it would return a word, it will instead
296             # return the datum associated with that word. Note that, depending on
297             # the deepsearch setting, lookup_data and lookup may return exactly the
298             # same scalar context.
299             sub lookup_data {
300 11     11 1 25 my($self, $word) = @_;
301              
302 11         600 return $self->_lookup_internal(
303             word => $word,
304             want_arr => wantarray(),
305             data => 1,
306             );
307             }
308              
309             # The remove() method takes a list of words and, surprisingly, removes them
310             # from the trie. It returns, in scalar context, the number of words removed.
311             # In list context, returns a list of the words removed. As of now, the only
312             # reason a word would fail to be removed is if it's not in the trie in the
313             # first place. Or, again, if there's a bug... :)
314             sub remove {
315 3     3 1 571 my($self) = shift;
316 3         9 my(@words) = @_;
317              
318 3         6 my($letter,$ref) = ("","","");
319 3         5 my(@letters,@ldn,@retarray);
320 3         6 my($retnum) = 0;
321             # The basic strategy here is as follows:
322             ##
323             # We walk down the trie one node at a time. If at any point, we see that a
324             # node can be deleted (that is, its only child is the one which continues the
325             # word we're deleting) then we mark it as the 'last deleteable'. If at any
326             # point we find a node which *cannot* be deleted (it has more children other
327             # than the one for the word we're working on), then we unmark our 'last
328             # deleteable' from before. Once done, delete from the last deleteable node
329             # down.
330              
331 3         7 for my $word (@words) {
332 5 100       14 if (ref($word) eq 'ARRAY') {
333 1         1 @letters = (@{$word});
  1         3  
334             }
335             else {
336 4         17 @letters = split('',$word);
337             }
338             # For each word, we need to put the leaf node entry at the end of the list
339             # of letters. We then reset the starting ref, and @ldn, which stands for
340             # 'last deleteable node'. It contains the ref of the hash and the key to
341             # be deleted. It does not seem possible to store a value passable to
342             # the 'delete' builtin in a scalar, so we're forced to do this.
343 5         11 push(@letters,$self->{_END});
344 5         9 $ref = $self->{_MAINHASHREF};
345 5         10 @ldn = ();
346            
347             # This is a special case, if the first letter of the word is the only
348             # key of the main hash. I might not really need it, but this works as
349             # it is.
350 5 100 66     7 if (((scalar keys(%{ $ref })) == 1) && (exists $ref->{$letters[0]})) {
  5         27  
351 1         2 @ldn = ($ref);
352             }
353             # And now we go down the trie, as described above.
354 5         17 while (defined($letter = shift(@letters))) {
355             # We break out if we're at the end, or if we're run out of trie before
356             # finding the end of the word -- that is, if the word isn't in the
357             # trie.
358 25 100       67 last if ($letter eq $self->{_END});
359 20 50       47 last unless exists($ref->{$letter});
360 20 100 66     21 if (
361 20         104 scalar keys(%{ $ref->{$letter} }) == 1 &&
362             exists $ref->{$letter}{$letters[0]}
363             ) {
364 18 100       37 unless (scalar @ldn) {
365 4         10 @ldn = ($ref,$letter);
366             }
367             }
368             else {
369 2         5 @ldn = ();
370             }
371 20         55 $ref = $ref->{$letter};
372             }
373             # If we broke out and there were still letters left in @letters, then the
374             # word must not be in the trie. Furthermore, if we got all the way to
375             # the end, but there's no leaf node, the word must not be in the trie.
376 5 50       16 next if (scalar @letters);
377 5 50       22 next unless (exists($ref->{$self->{_END}}));
378             # If @ldn is empty, then the only deleteable node is the leaf node, so
379             # we set this up.
380 5 100       12 if (scalar @ldn == 0) {
381 1         4 @ldn = ($ref,$self->{_END});
382             }
383             # If there's only one entry in @ldn, then it's the ref of the top of our
384             # Trie. If that's marked as deleteable, then we can just nuke the entire
385             # hash.
386 5 100       12 if (scalar @ldn == 1) {
387 1         2 %{ $ldn[0] } = ();
  1         6  
388             }
389             # Otherwise, we just delete the key we want to.
390             else {
391 4         12 delete($ldn[0]->{$ldn[1]});
392             }
393             # And then just return stuff.
394 5 50       11 if (wantarray) {
395 0         0 push (@retarray,$word);
396             }
397             else {
398 5         11 $retnum++;
399             }
400             }
401 3 50       10 if (wantarray) {
402 0         0 return @retarray;
403             }
404 3         18 return $retnum;
405             }
406              
407             ## These are PRIVATE METHODS. Don't call them directly unless you really
408             # know what you're doing, or you enjoy things working funny.
409              
410             # The _walktree() sub takes a word beginning and a hashref (hopefully to a trie)
411             # and walks down the trie, gathering all of the word endings and retuning them
412             # appended to the word beginning.
413             sub _walktree {
414 238     238   4397 my($self, %args) = @_;
415 238         506 my $word = $args{word};
416 238         279 my $ref = $args{ref};
417             # These 2 arguments are used to control how far down the tree this
418             # path will go.
419             # This first argument is passed in by external subs
420 238   100     892 my $suffix_length = $args{suf_len} || 0;
421             # And this one is used only by the recursive calls.
422 238   100     493 my $walked_suffix_length = $args{walked} || 0;
423              
424 238         415 my $wantref = ref($word) eq 'ARRAY';
425              
426 238         346 my($key) = "";
427 238         427 my(@retarray) = ();
428 238         248 my($ret) = 0;
429              
430             # For some reason, I used to think this was complicated and had a lot of
431             # stupid, useless code here. It's a lot simpler now. If the key we find
432             # is our magic reference, then we just give back the word. Otherwise, we
433             # walk down the new subtree we've discovered.
434 238         267 foreach $key (keys %{ $ref }) {
  238         806  
435 291 100       993 if ($key eq $self->{_END}) {
436 61 100       99 if (wantarray) {
437 36         50 push(@retarray,$word);
438 36 100       123 if ($args{data}) {
439 3         5 push(@retarray, $ref->{$key});
440             }
441             }
442             else {
443 25         26 $ret++;
444             }
445 61         137 next;
446             }
447 230 100       865 my $nextval = $wantref ? [(@{$word}), $key] : $word . $key;
  12         27  
448             # If we've reached the max depth we need to travel for the suffix (if
449             # specified), then stop and collect everything up.
450 230 100 100     559 if ($suffix_length > 0 && ($suffix_length - $walked_suffix_length == 1)) {
451 11 100       14 if (wantarray) {
452 1         4 push(@retarray, $nextval);
453             }
454             else {
455 10         25 $ret++;
456             }
457             }
458             else {
459             # Look, recursion!
460 219         1068 my %arguments = (
461             word => $nextval,
462             'ref' => $ref->{$key},
463             suf_len => $suffix_length,
464             walked => $walked_suffix_length + 1,
465             data => $args{data},
466             );
467 219 100       462 if (wantarray) {
468 142         1242 push(@retarray, $self->_walktree(%arguments));
469             }
470             else {
471 77         266 $ret += scalar $self->_walktree(%arguments);
472             }
473             }
474             }
475 238 100       445 if (wantarray) {
476 151         1147 return @retarray;
477             }
478             else {
479 87         466 return $ret;
480             }
481             }
482              
483             # This code used to use some fairly hoary recursive code which caused it to
484             # run fairly slowly, mainly due to the relatively slow way that perl handles
485             # OO method invocation. This was pointed out to me by Justin Hicks, and he
486             # helped me fix it up, to be quite a bit more reasonable now.
487             sub _lookup_internal {
488 48     48   60 my $self = shift;
489 48         182 my %args = @_;
490 48         634 my($ref) = $self->{_MAINHASHREF};
491              
492 48         76 my($letter, $nextletter) = ("", "");
493 48         78 my(@letters) = ();
494 48         54 my(@retarray) = ();
495 48         60 my($wantref) = 0;
496              
497 48         72 my $word = $args{word};
498              
499             # Here we split the word up into letters in the appropriate way.
500 48 100       99 if (ref($word) eq 'ARRAY') {
501 5         4 @letters = (@{$word});
  5         10  
502             # Keeping track of what kind of word it was.
503 5         6 $wantref = 1;
504             }
505             else {
506 43         145 @letters = split('',$word);
507             }
508              
509             # These three are to keep hold of possibly returned values.
510 48 100       110 my $lastword = $wantref ? [] : "";
511 48         54 my $lastwordref = undef;
512 48 100       89 my $pref = $wantref ? [] : "";
513              
514             # Like everything else, we step across each letter.
515 48         140 while(defined($letter = shift(@letters))) {
516             # This is to keep track of stuff for the "prefix" version of deepsearch.
517 130 100 66     465 if ($self->{_DEEPSEARCH} == PREFIX && !$args{want_arr}) {
518 72 100       169 if (exists $ref->{$self->{_END}}) {
519             # The "data" argument tells us if we want to return the word
520             # or the data associated with it.
521 10 100       27 if ($args{data}) {
    50          
522 5         9 $lastwordref = $ref;
523             }
524             elsif ($wantref) {
525 0         0 push(@{$lastword}, @{$pref});
  0         0  
  0         0  
526             }
527             else {
528 5         9 $lastword .= $pref;
529             }
530 10 50       22 $pref = $wantref ? [] : "";
531             }
532 72 100       138 unless ($args{data}) {
533 36 50       50 if ($wantref) {
534 0         0 push(@{$pref}, $letter);
  0         0  
535             }
536             else {
537 36         49 $pref .= $letter;
538             }
539             }
540             }
541             # If, at any point, we find that we've run out of tree before we've run out
542             # of word, then there is nothing in the trie that begins with the input
543             # word, so we return appropriately.
544 130 100       275 unless (exists $ref->{$letter}) {
545             # Array case.
546 9 50       44 if ($args{want_arr}) {
    100          
    100          
547 0         0 return ();
548             }
549             # "count" case.
550             elsif ($self->{_DEEPSEARCH} == COUNT) {
551 2         16 return 0;
552             }
553             # "prefix" case.
554             elsif ($self->{_DEEPSEARCH} == PREFIX) {
555 4 100 66     19 if ($args{data} && $lastwordref) {
556 2         16 return $lastwordref->{$self->{_END}};
557             }
558 2 50 50     24 if (($wantref && scalar @{$lastword}) || length $lastword) {
  0   33     0  
559 2         18 return $lastword;
560             }
561 0         0 return undef;
562             }
563             # All other deepsearch cases are the same.
564             else {
565 3         19 return undef;
566             }
567             }
568             # If the letter is there, we just walk one step down the trie.
569 121         313 $ref = $ref->{$letter};
570             }
571             # Once we've walked all the way down the tree to the end of the word we were
572             # given, there are a few things that can be done, depending on the context
573             # that the method was called in.
574 39 100       82 if ($args{want_arr}) {
575             # If they want an array, then we use the walktree subroutine to collect all
576             # of the words beneath our current location in the trie, and return them.
577 9 100       44 @retarray = $self->_walktree(
578             # When fetching suffixes, we don't want to give the word begnning.
579             word => $args{suff_len} ? "" : $word,
580             'ref' => $ref,
581             suf_len => $args{suff_len},
582             data => $args{data},
583             );
584 9         89 return @retarray;
585             }
586             else {
587 30 100       166 if ($self->{_DEEPSEARCH} == BOOLEAN) {
    100          
    100          
    100          
    50          
588             # Here, the user only wants to know if any words in the trie begin
589             # with their word, so that's what we give them.
590 3         19 return 1;
591             }
592             elsif ($self->{_DEEPSEARCH} == EXACT) {
593             # In this case, the user wants us to return something only if the
594             # exact word exists in the trie, and undef otherwise.
595             # This option only really makes sense with when looking up data,
596             # as otherwise it's essentially the same as BOOLEAN, above, but it
597             # doesn't hurt to allow it to work with normal lookup, either.
598             # I'd initially left this out because I didn't see a use for it, but
599             # thanks to Otmal Lendl for pointing out to me a situation in which
600             # it would be helpful to have.
601 4 100       12 if (exists $ref->{$self->{_END}}) {
602 2 100       5 if ($args{data}) {
603 1         9 return $ref->{$self->{_END}};
604             }
605 1         8 return $word;
606             }
607 2         54 return undef;
608             }
609             elsif ($self->{_DEEPSEARCH} == CHOOSE) {
610             # If they want this, then we continue to walk down the trie, collecting
611             # letters, until we find a leaf node, at which point we stop. Note that
612             # this works properly if the exact word is in the trie. Yay.
613             # Of course, making it work that way means that we tend to get shorter
614             # words in choose... is this a bad thing? I dunno.
615 9 50       24 my($stub) = $wantref ? [] : "";
616 9   100     12 while (scalar keys %{$ref} && !exists $ref->{$self->{_END}}) {
  21         130  
617 12         14 $nextletter = each(%{ $ref });
  12         22  
618             # I need to call this to clear the each() call. Wish I didn't...
619 12         15 keys(%{ $ref });
  12         13  
620 12 50       26 if ($wantref) {
621 0         0 push(@{$stub}, $nextletter);
  0         0  
622             }
623             else {
624 12         16 $stub .= $nextletter;
625             }
626 12         16 $ref = $ref->{$nextletter};
627             # If we're doing suffixes, bail out early once it's the right length.
628 12 100       44 if ($args{suff_len}) {
629 10 50       24 my $cmpr = $wantref ? scalar @{$stub} : length $stub;
  0         0  
630 10 50       35 last if $cmpr == $args{suff_len};
631             }
632             }
633 9 100       21 if ($args{data}) {
634 4         35 return $ref->{$self->{_END}};
635             }
636             # If they've specified a suffix length, then they don't want the
637             # beginning part of the word.
638 5 100       13 if ($args{suff_len}) {
639 3         19 return $stub;
640             }
641             # Otherwise, they do.
642             else {
643 2 50       15 return $wantref ? [@{$word}, @{$stub}] : $word . $stub;
  0         0  
  0         0  
644             }
645             }
646             elsif ($self->{_DEEPSEARCH} == COUNT) {
647             # Here, the user simply wants a count of words in the trie that begin
648             # with their word, so we get that by calling our walktree method in
649             # scalar context.
650 10 100       46 return scalar $self->_walktree(
651             # When fetching suffixes, we don't want to give the word begnning.
652             word => $args{suff_len} ? "" : $word,
653             'ref' => $ref,
654             suf_len => $args{suff_len},
655             );
656             }
657             elsif ($self->{_DEEPSEARCH} == PREFIX) {
658             # This is the "longest prefix found" case.
659 4 100       15 if (exists $ref->{$self->{_END}}) {
660 2 100       9 if ($args{data}) {
661 1         10 return $ref->{$self->{_END}};
662             }
663 1 50       4 if ($wantref) {
664 0         0 return [@{$lastword}, @{$pref}];
  0         0  
  0         0  
665             }
666             else {
667 1         9 return $lastword . $pref;
668             }
669             }
670 2 100       14 if ($args{data}) {
671 1         9 return $lastwordref->{$self->{_END}};
672             }
673 1         8 return $lastword;
674             }
675             }
676             }
677              
678             # This is the method which does all of the heavy lifting for add and
679             # add_data. Given a word and a datum, it walks down the trie until
680             # it finds a branch that hasn't been created yet. It then makes the rest
681             # of the branch, and slaps an end marker and the datum inside of it.
682             sub _add_internal {
683 66     66   206 my $self = shift;
684 66         72 my $word = shift;
685 66         218 my $datum = shift;
686 66         64 my @letters;
687             # We don't NEED to split a string into letters; Any array of tokens
688             # will do.
689 66 100       181 if (ref($word) eq 'ARRAY') {
690             # Note: this is a copy
691 8         12 @letters = (@{$word});
  8         24  
692             # Because in this case, a "letter" can be more than on character
693             # long, we have to make sure we don't collide with whatever we're
694             # using as an end marker.
695             # However, if the user is feeling all fanciful and told us not to
696             # bother, we won't.
697 8 50       40 unless ($self->{_FREEZE_END}) {
698 8         13 for my $letter (@letters) {
699 29 100       69 if ($letter eq $self->{_END}) {
700             # If we had a collision, then make a new end marker.
701 3         9 $self->end_marker($self->_gen_new_marker(
702             bad => \@letters,
703             ));
704 3         7 last;
705             }
706             }
707             }
708             }
709             else {
710 58         205 @letters = split('',$word);
711             }
712             # Start at the top of the Trie...
713 66         122 my $ref = $self->{_MAINHASHREF};
714             # This will walk down the trie as far as it can, until it either runs
715             # out of word or out of trie.
716 66   100     533 while (
717             (scalar @letters) &&
718             exists($ref->{$letters[0]})
719             ) {
720 73         320 $ref = $ref->{shift(@letters)};
721             }
722             # If it ran out of trie before it ran out of word then this will create
723             # the rest of the trie structure.
724 66         116 for my $letter (@letters) {
725 238         705 $ref = $ref->{$letter} = {};
726             }
727             # In either case, this will make the new end marker for the end of the
728             # word (assuming it wasn't already there) and set the return value
729             # appropriately.
730 66         170 my $ret = 1;
731 66 100       718 if (exists $ref->{$self->{_END}}) {
732 4         7 $ret = 0;
733             }
734             else {
735 62         132 $ref->{$self->{_END}} = undef;
736             }
737             # This will set the data if it was provided.
738 66 100       133 if (defined $datum) {
739 13         22 $ref->{$self->{_END}} = $datum;
740             }
741 66         224 return $ret;
742             }
743              
744             # This uses a heuristic (that is, a crappy method) to generate a new
745             # end marker for the trie. In addition to being sure that whatever is
746             # generated is not in use as a letter in the trie, it also makes a bold
747             # yet mostly vain attempt to try to make something that might not be
748             # used in the future.
749             # In general, I do not try to make this functionality good or fast or
750             # perfect -- if it's being called often, the module is being mis-used.
751             # If a user is using multi-character letters, then they ought to find
752             # a string that will be safe and set it themselves.
753             sub _gen_new_marker {
754 4     4   6 my $self = shift;
755 4         10 my %args = @_;
756             # This will keep track of all of the letters used in the trie already
757 4         8 my %used = ();
758             # This will keep track of what lengths they are
759 4         7 my %sizes = ();
760             # First we process the letters of the word which sparked this
761             # re-evaluation.
762 4         75 for my $letter (@{$args{bad}}) {
  4         11  
763 11         14 my $len = length($letter);
764 11 100       24 if ($len != 1) {
765 8         18 $used{$letter}++;
766 8         18 $sizes{$len}++;
767             }
768             }
769             # Then we walk the tree and get the info on all the other letters.
770 4         10 my @refs = ($self->{_MAINHASHREF});
771 4         11 while (@refs) {
772 5         130 my $ref = shift @refs;
773 5         6 for my $key (keys %{$ref}) {
  5         14  
774             # Note we don't even care about length 1 letters.
775 7 100 66     37 if (
776             (length($key) != 1) &&
777             ($key ne $self->{_END})
778             ) {
779 1         2 $used{$key}++;
780 1         3 $sizes{length($key)}++;
781 1         2 push(@refs, $ref->{$key});
782             }
783             }
784             }
785             # The idea here is that we want to make the end marker as small as possible,
786             # as it's stuck all over the place. However, we don't want to spend forever
787             # trying to find one that isn't in use.
788             # So, we find the smallest length such that there are fewer than 1/4 of
789             # the total number of possible letters in use of that length, and we make
790             # a key of that length.
791 4         7 my $newlen = 2;
792 4         12 for my $len (sort keys %sizes) {
793             # Yes, I know there are well more than 26 available compositors, but
794             # this will only mean I'm being too careful.
795 6 100       27 if ($sizes{$len} < ((26 ** $len) / 4)) {
796 3         6 $newlen = $len;
797 3         4 last;
798             }
799             else {
800             # This makes it so that if all existing lengths are too full ( !! )
801             # then we will just use a key that's one longer than the longest
802             # one already there.
803 3         6 $newlen = $len + 1;
804             }
805             }
806             # Now we just generate end markers until we find one that isn't in use.
807 4         6 my $newend;
808 4         6 do {
809 4         9 $newend = join '', map { chr(int(rand(128))) } (('') x $newlen);
  7         98  
810             } while (exists($used{$newend}));
811             # And return it.
812 4         21 return $newend;
813             }
814              
815             # Strewth!
816             1;
817              
818             __END__