File Coverage

blib/lib/Tree/Trie.pm

Criterion	Covered	Total	%
statement	324	348	93.1
branch	161	188	85.6
condition	43	63	68.2
subroutine	23	23	100.0
pod	11	11	100.0
total	562	633	88.7

line	stmt	bran	cond	sub	pod	time	code
1							# Tree::Trie, a module implementing a trie data structure.
2							# A formal description of tries can be found at:
3							# http://www.cs.queensu.ca/home/daver/235/Notes/Tries.pdf
4
5							package Tree::Trie;
6
7	4			4		117014	use strict;
	4					11
	4					154
8	4			4		25	use warnings;
	4					6
	4					199
9
10							our $VERSION = "1.9";
11
12							# A handful of helpful constants
13	4			4		23	use constant DEFAULT_END_MARKER => '';
	4					12
	4					365
14
15	4			4		22	use constant BOOLEAN => 0;
	4					7
	4					181
16	4			4		21	use constant CHOOSE => 1;
	4					5
	4					236
17	4			4		20	use constant COUNT => 2;
	4					4
	4					149
18	4			4		31	use constant PREFIX => 3;
	4					14
	4					167
19	4			4		21	use constant EXACT => 4;
	4					7
	4					16806
20
21							## Public methods begin here
22
23							# The constructor method. It's very simple.
24							sub new {
25	16			16	1	3202	my($proto) = shift;
26	16					31	my($options) = shift;
27	16		33			220	my($class) = ref($proto) \|\| $proto;
28	16					36	my($self) = {};
29	16					78	bless($self, $class);
30	16					64	$self->{_MAINHASHREF} = {};
31							# These are default values
32	16					55	$self->{_END} = &DEFAULT_END_MARKER;
33	16					36	$self->{_DEEPSEARCH} = CHOOSE;
34	16					26	$self->{_FREEZE_END} = 0;
35	16	100	66			60	unless ( defined($options) && (ref($options) eq "HASH") ) {
36	15					25	$options = {};
37							}
38	16					75	$self->deepsearch($options->{'deepsearch'});
39	16	100				110	if (exists $options->{end_marker}) {
40	1					5	$self->end_marker($options->{end_marker});
41							}
42	16	100				42	if (exists $options->{freeze_end_marker}) {
43	1					4	$self->freeze_end_marker($options->{freeze_end_marker});
44							}
45	16					135	return($self);
46							}
47
48							# Sets the value of the end marker, for those people who think they know
49							# better than Tree::Trie. Note it does not allow the setting of single
50							# character end markers.
51							sub end_marker {
52	6			6	1	10	my $self = shift;
53	6	100	66			34	if ($_[0] && length $_[0] > 1) {
54							# If they decide to set a new end marker, we have to be sure to
55							# go through and update all existing markers.
56	5					7	my $newend = shift;
57	5					11	my @refs = ($self->{_MAINHASHREF});
58	5					15	while (@refs) {
59	35					64	my $ref = shift @refs;
60	35					38	for my $key (keys %{$ref}) {
	35					93
61	41	100				151	if ($key eq $self->{_END}) {
62	11					23	$ref->{$newend} = $ref->{$key};
63	11					38	delete $ref->{$key};
64							}
65							else {
66	30					92	push(@refs, $ref->{$key});
67							}
68							}
69							}
70	5					10	$self->{_END} = $newend;
71							}
72	6					13	return $self->{_END};
73							}
74
75							# Sets the option to not attempt to update the end marker based on added
76							# letters.
77							# The above is the most awkward sentence I have ever written.
78							sub freeze_end_marker {
79	2			2	1	482	my $self = shift;
80	2	50				7	if (scalar @_) {
81	2	100				6	if (shift) {
82	1					2	$self->{_FREEZE_END} = 1;
83							}
84							else {
85	1					3	$self->{_FREEZE_END} = 0;
86							}
87							}
88	2					6	return $self->{_FREEZE_END};
89							}
90
91							# Sets the value of the deepsearch parameter. Can be passed either words
92							# describing the parameter, or their numerical equivalents. Legal values
93							# are:
94							# boolean => 0
95							# choose => 1
96							# count => 2
97							# prefix => 3
98							# exact => 4
99							# See the POD for the 'lookup' method for details on this option.
100							sub deepsearch {
101	28			28	1	54	my($self) = shift;
102	28					51	my($option) = shift;
103	28	100				64	if(defined($option)) {
104	12	100	66			177	if ($option eq BOOLEAN \|\| $option eq 'boolean') {
		100	66
		100	66
		100	66
		50	33
105	2					7	$self->{_DEEPSEARCH} = BOOLEAN;
106							}
107							elsif ($option eq CHOOSE \|\| $option eq 'choose') {
108	4					9	$self->{_DEEPSEARCH} = CHOOSE;
109							}
110							elsif ($option eq COUNT \|\| $option eq 'count') {
111	3					7	$self->{_DEEPSEARCH} = COUNT;
112							}
113							elsif ($option eq PREFIX \|\| $option eq 'prefix') {
114	2					5	$self->{_DEEPSEARCH} = PREFIX;
115							}
116							elsif ($option eq EXACT \|\| $option eq 'exact') {
117	1					2	$self->{_DEEPSEARCH} = EXACT;
118							}
119							}
120	28					75	return $self->{_DEEPSEARCH};
121							}
122
123							# The add() method takes a list of words as arguments and attempts to add
124							# them to the trie. In list context, returns a list of words successfully
125							# added. In scalar context, returns a count of these words. As of this
126							# version, the only reason a word can fail to be added is if it is already
127							# in the trie. Or, I suppose, if there was a bug. :)
128							sub add {
129	13			13	1	145	my($self) = shift;
130	13					37	my(@words) = @_;
131
132	13					306	my @retarray;
133	13					20	my $retnum = 0;
134
135							# Process each word...
136	13					24	for my $word (@words) {
137							# And just call the internal thingy for it.
138	53	100				113	if ($self->_add_internal($word, undef)) {
139							# Updating return values as needed
140	50	50				83	if (wantarray) {
141	0					0	push(@retarray,$word);
142							}
143							else {
144	50					263	$retnum++;
145							}
146							}
147							}
148							# When done, return results.
149	13	50				71	return (wantarray ? @retarray : $retnum);
150							}
151
152							# add_data() takes a hash of word => data pairs, adds the words to the trie and
153							# associates the data to those words.
154							sub add_data {
155	7			7	1	67	my($self) = shift;
156	7					11	my($retnum, @retarray);
157	7					9	my $word = "";
158							# Making sure that we've gotten data in pairs. Can't just turn @_
159							# into %data, because that would stringify arrayrefs
160	7		66			44	while(defined($word = shift) && @_) {
161							# This also just uses the internal add method.
162	13	100				35	if ($self->_add_internal($word, shift())) {
163	12	50				24	if (wantarray) {
164	0					0	push(@retarray, $word);
165							}
166							else {
167	12					48	$retnum++;
168							}
169							}
170							}
171	7	50				18	return @retarray if wantarray;
172	7					31	return $retnum;
173							}
174
175							# add_all() takes one or more other tries and adds all of their entries
176							# to the trie. If both tries have data stored for the same key, the data
177							# from the trie on which this method was invoked will be overwritten. I can't
178							# think of anything useful to return from this method, so it has no return
179							# value. If you can think of anything that would make sense, please let me
180							# know.
181							# This idea and most of its implementation come from Aaron Stone.
182							# Thanks!
183							sub add_all {
184	2			2	1	11	my $self = shift;
185	2					5	for my $trie (@_) {
186	2		66			13	my $ignore_end = (
187							$self->{_FREEZE_END} \|\|
188							($self->{_END} eq $trie->{_END})
189							);
190	2					16	my @nodepairs = ({
191							from => $trie->{_MAINHASHREF},
192							to => $self->{_MAINHASHREF},
193							});
194	2					7	while (scalar @nodepairs) {
195	34					49	my $np = pop @nodepairs;
196	34					34	for my $letter (keys %{$np->{from}}) {
	34					82
197	39	100				128	unless ($ignore_end) {
198	9	100				23	if ($letter eq $self->{_END}) {
199	1					7	$self->end_marker($self->_gen_new_marker(
200							bad => [$letter],
201							));
202							}
203							}
204	39	100				350	if ($letter eq $trie->{_END}) {
205	7					434	$np->{to}{$self->{_END}} = $np->{from}{$trie->{_END}};
206							}
207							else {
208	32	100				63	unless (exists $np->{to}{$letter}) {
209	24					79	$np->{to}{$letter} = {};
210							}
211	32					239	push @nodepairs, {
212							from => $np->{from}{$letter},
213							to => $np->{to}->{$letter},
214							};
215							}
216							}
217							}
218							}
219							}
220
221							# delete_data() takes a list of words in the trie and deletes the associated
222							# data from the internal data store. In list context, returns a list of words
223							# whose associated data have been removed -- in scalar context, returns a count
224							# thereof.
225							sub delete_data {
226	1			1	1	5	my($self, @words) = @_;
227	1					2	my($retnum, @retarray) = 0;
228	1					3	my @letters;
229							# Process each word...
230	1					3	for my $word (@words) {
231	3	50				13	if (ref($word) eq 'ARRAY') {
232	0					0	@letters = (@{$word});
	0					0
233							}
234							else {
235	3					12	@letters = split(//, $word);
236							}
237	3					6	my $ref = $self->{_MAINHASHREF};
238							# Walk down the tree...
239	3					6	for my $letter (@letters) {
240	8	100				20	if ($ref->{$letter}) {
241	7					16	$ref = $ref->{$letter};
242							}
243							else {
244							# This will cause the test right after this loop to fail and
245							# skip the the next word -- we want that because if we're here
246							# it means the word isn't in the trie.
247	1					2	$ref = {};
248	1					3	last;
249							}
250							}
251	3	100				14	next unless (exists $ref->{$self->{_END}});
252							# This is all we need to do to clear out the data
253	2					6	$ref->{$self->{_END}} = undef;
254	2	50				6	if (wantarray) {
255	0					0	push(@retarray, $word);
256							}
257							else {
258	2					5	$retnum++;
259							}
260							}
261	1	50				8	if (wantarray) {
262	0					0	return @retarray;
263							}
264							else {
265	1					6	return $retnum;
266							}
267							}
268
269							# The lookup() method searches for words (or beginnings of words) in the trie.
270							# It takes a single word as an argument and, in list context, returns a list
271							# of all the words in the trie which begin with the given word. In scalar
272							# context, the return value depends on the value of the deepsearch parameter.
273							# An optional second argument is available: This should be a numerical
274							# argument, and specifies 2 things: first, that you want only word suffixes
275							# to be returned, and second, the maximum length of those suffices. All
276							# other configurations still apply. See the POD on this method for more
277							# details.
278							sub lookup {
279	37			37	1	2723	my($self) = shift;
280	37					52	my($word) = shift;
281							# This is the argument for doing suffix lookup.
282	37					43	my($suff_length) = shift;
283
284							# Abstraction is kind of cool
285	37					93	return $self->_lookup_internal(
286							word => $word,
287							suff_len => $suff_length,
288							want_arr => wantarray(),
289							data => 0,
290							);
291							}
292
293							# lookup_data() works basically the same as lookup, with the following
294							# exceptions -- in list context, returns a hash of ward => data pairings,
295							# and in scalar context, wherever it would return a word, it will instead
296							# return the datum associated with that word. Note that, depending on
297							# the deepsearch setting, lookup_data and lookup may return exactly the
298							# same scalar context.
299							sub lookup_data {
300	11			11	1	25	my($self, $word) = @_;
301
302	11					600	return $self->_lookup_internal(
303							word => $word,
304							want_arr => wantarray(),
305							data => 1,
306							);
307							}
308
309							# The remove() method takes a list of words and, surprisingly, removes them
310							# from the trie. It returns, in scalar context, the number of words removed.
311							# In list context, returns a list of the words removed. As of now, the only
312							# reason a word would fail to be removed is if it's not in the trie in the
313							# first place. Or, again, if there's a bug... :)
314							sub remove {
315	3			3	1	571	my($self) = shift;
316	3					9	my(@words) = @_;
317
318	3					6	my($letter,$ref) = ("","","");
319	3					5	my(@letters,@ldn,@retarray);
320	3					6	my($retnum) = 0;
321							# The basic strategy here is as follows:
322							##
323							# We walk down the trie one node at a time. If at any point, we see that a
324							# node can be deleted (that is, its only child is the one which continues the
325							# word we're deleting) then we mark it as the 'last deleteable'. If at any
326							# point we find a node which cannot be deleted (it has more children other
327							# than the one for the word we're working on), then we unmark our 'last
328							# deleteable' from before. Once done, delete from the last deleteable node
329							# down.
330
331	3					7	for my $word (@words) {
332	5	100				14	if (ref($word) eq 'ARRAY') {
333	1					1	@letters = (@{$word});
	1					3
334							}
335							else {
336	4					17	@letters = split('',$word);
337							}
338							# For each word, we need to put the leaf node entry at the end of the list
339							# of letters. We then reset the starting ref, and @ldn, which stands for
340							# 'last deleteable node'. It contains the ref of the hash and the key to
341							# be deleted. It does not seem possible to store a value passable to
342							# the 'delete' builtin in a scalar, so we're forced to do this.
343	5					11	push(@letters,$self->{_END});
344	5					9	$ref = $self->{_MAINHASHREF};
345	5					10	@ldn = ();
346
347							# This is a special case, if the first letter of the word is the only
348							# key of the main hash. I might not really need it, but this works as
349							# it is.
350	5	100	66			7	if (((scalar keys(%{ $ref })) == 1) && (exists $ref->{$letters[0]})) {
	5					27
351	1					2	@ldn = ($ref);
352							}
353							# And now we go down the trie, as described above.
354	5					17	while (defined($letter = shift(@letters))) {
355							# We break out if we're at the end, or if we're run out of trie before
356							# finding the end of the word -- that is, if the word isn't in the
357							# trie.
358	25	100				67	last if ($letter eq $self->{_END});
359	20	50				47	last unless exists($ref->{$letter});
360	20	100	66			21	if (
361	20					104	scalar keys(%{ $ref->{$letter} }) == 1 &&
362							exists $ref->{$letter}{$letters[0]}
363							) {
364	18	100				37	unless (scalar @ldn) {
365	4					10	@ldn = ($ref,$letter);
366							}
367							}
368							else {
369	2					5	@ldn = ();
370							}
371	20					55	$ref = $ref->{$letter};
372							}
373							# If we broke out and there were still letters left in @letters, then the
374							# word must not be in the trie. Furthermore, if we got all the way to
375							# the end, but there's no leaf node, the word must not be in the trie.
376	5	50				16	next if (scalar @letters);
377	5	50				22	next unless (exists($ref->{$self->{_END}}));
378							# If @ldn is empty, then the only deleteable node is the leaf node, so
379							# we set this up.
380	5	100				12	if (scalar @ldn == 0) {
381	1					4	@ldn = ($ref,$self->{_END});
382							}
383							# If there's only one entry in @ldn, then it's the ref of the top of our
384							# Trie. If that's marked as deleteable, then we can just nuke the entire
385							# hash.
386	5	100				12	if (scalar @ldn == 1) {
387	1					2	%{ $ldn[0] } = ();
	1					6
388							}
389							# Otherwise, we just delete the key we want to.
390							else {
391	4					12	delete($ldn[0]->{$ldn[1]});
392							}
393							# And then just return stuff.
394	5	50				11	if (wantarray) {
395	0					0	push (@retarray,$word);
396							}
397							else {
398	5					11	$retnum++;
399							}
400							}
401	3	50				10	if (wantarray) {
402	0					0	return @retarray;
403							}
404	3					18	return $retnum;
405							}
406
407							## These are PRIVATE METHODS. Don't call them directly unless you really
408							# know what you're doing, or you enjoy things working funny.
409
410							# The _walktree() sub takes a word beginning and a hashref (hopefully to a trie)
411							# and walks down the trie, gathering all of the word endings and retuning them
412							# appended to the word beginning.
413							sub _walktree {
414	238			238		4397	my($self, %args) = @_;
415	238					506	my $word = $args{word};
416	238					279	my $ref = $args{ref};
417							# These 2 arguments are used to control how far down the tree this
418							# path will go.
419							# This first argument is passed in by external subs
420	238		100			892	my $suffix_length = $args{suf_len} \|\| 0;
421							# And this one is used only by the recursive calls.
422	238		100			493	my $walked_suffix_length = $args{walked} \|\| 0;
423
424	238					415	my $wantref = ref($word) eq 'ARRAY';
425
426	238					346	my($key) = "";
427	238					427	my(@retarray) = ();
428	238					248	my($ret) = 0;
429
430							# For some reason, I used to think this was complicated and had a lot of
431							# stupid, useless code here. It's a lot simpler now. If the key we find
432							# is our magic reference, then we just give back the word. Otherwise, we
433							# walk down the new subtree we've discovered.
434	238					267	foreach $key (keys %{ $ref }) {
	238					806
435	291	100				993	if ($key eq $self->{_END}) {
436	61	100				99	if (wantarray) {
437	36					50	push(@retarray,$word);
438	36	100				123	if ($args{data}) {
439	3					5	push(@retarray, $ref->{$key});
440							}
441							}
442							else {
443	25					26	$ret++;
444							}
445	61					137	next;
446							}
447	230	100				865	my $nextval = $wantref ? [(@{$word}), $key] : $word . $key;
	12					27
448							# If we've reached the max depth we need to travel for the suffix (if
449							# specified), then stop and collect everything up.
450	230	100	100			559	if ($suffix_length > 0 && ($suffix_length - $walked_suffix_length == 1)) {
451	11	100				14	if (wantarray) {
452	1					4	push(@retarray, $nextval);
453							}
454							else {
455	10					25	$ret++;
456							}
457							}
458							else {
459							# Look, recursion!
460	219					1068	my %arguments = (
461							word => $nextval,
462							'ref' => $ref->{$key},
463							suf_len => $suffix_length,
464							walked => $walked_suffix_length + 1,
465							data => $args{data},
466							);
467	219	100				462	if (wantarray) {
468	142					1242	push(@retarray, $self->_walktree(%arguments));
469							}
470							else {
471	77					266	$ret += scalar $self->_walktree(%arguments);
472							}
473							}
474							}
475	238	100				445	if (wantarray) {
476	151					1147	return @retarray;
477							}
478							else {
479	87					466	return $ret;
480							}
481							}
482
483							# This code used to use some fairly hoary recursive code which caused it to
484							# run fairly slowly, mainly due to the relatively slow way that perl handles
485							# OO method invocation. This was pointed out to me by Justin Hicks, and he
486							# helped me fix it up, to be quite a bit more reasonable now.
487							sub _lookup_internal {
488	48			48		60	my $self = shift;
489	48					182	my %args = @_;
490	48					634	my($ref) = $self->{_MAINHASHREF};
491
492	48					76	my($letter, $nextletter) = ("", "");
493	48					78	my(@letters) = ();
494	48					54	my(@retarray) = ();
495	48					60	my($wantref) = 0;
496
497	48					72	my $word = $args{word};
498
499							# Here we split the word up into letters in the appropriate way.
500	48	100				99	if (ref($word) eq 'ARRAY') {
501	5					4	@letters = (@{$word});
	5					10
502							# Keeping track of what kind of word it was.
503	5					6	$wantref = 1;
504							}
505							else {
506	43					145	@letters = split('',$word);
507							}
508
509							# These three are to keep hold of possibly returned values.
510	48	100				110	my $lastword = $wantref ? [] : "";
511	48					54	my $lastwordref = undef;
512	48	100				89	my $pref = $wantref ? [] : "";
513
514							# Like everything else, we step across each letter.
515	48					140	while(defined($letter = shift(@letters))) {
516							# This is to keep track of stuff for the "prefix" version of deepsearch.
517	130	100	66			465	if ($self->{_DEEPSEARCH} == PREFIX && !$args{want_arr}) {
518	72	100				169	if (exists $ref->{$self->{_END}}) {
519							# The "data" argument tells us if we want to return the word
520							# or the data associated with it.
521	10	100				27	if ($args{data}) {
		50
522	5					9	$lastwordref = $ref;
523							}
524							elsif ($wantref) {
525	0					0	push(@{$lastword}, @{$pref});
	0					0
	0					0
526							}
527							else {
528	5					9	$lastword .= $pref;
529							}
530	10	50				22	$pref = $wantref ? [] : "";
531							}
532	72	100				138	unless ($args{data}) {
533	36	50				50	if ($wantref) {
534	0					0	push(@{$pref}, $letter);
	0					0
535							}
536							else {
537	36					49	$pref .= $letter;
538							}
539							}
540							}
541							# If, at any point, we find that we've run out of tree before we've run out
542							# of word, then there is nothing in the trie that begins with the input
543							# word, so we return appropriately.
544	130	100				275	unless (exists $ref->{$letter}) {
545							# Array case.
546	9	50				44	if ($args{want_arr}) {
		100
		100
547	0					0	return ();
548							}
549							# "count" case.
550							elsif ($self->{_DEEPSEARCH} == COUNT) {
551	2					16	return 0;
552							}
553							# "prefix" case.
554							elsif ($self->{_DEEPSEARCH} == PREFIX) {
555	4	100	66			19	if ($args{data} && $lastwordref) {
556	2					16	return $lastwordref->{$self->{_END}};
557							}
558	2	50	50			24	if (($wantref && scalar @{$lastword}) \|\| length $lastword) {
	0		33			0
559	2					18	return $lastword;
560							}
561	0					0	return undef;
562							}
563							# All other deepsearch cases are the same.
564							else {
565	3					19	return undef;
566							}
567							}
568							# If the letter is there, we just walk one step down the trie.
569	121					313	$ref = $ref->{$letter};
570							}
571							# Once we've walked all the way down the tree to the end of the word we were
572							# given, there are a few things that can be done, depending on the context
573							# that the method was called in.
574	39	100				82	if ($args{want_arr}) {
575							# If they want an array, then we use the walktree subroutine to collect all
576							# of the words beneath our current location in the trie, and return them.
577	9	100				44	@retarray = $self->_walktree(
578							# When fetching suffixes, we don't want to give the word begnning.
579							word => $args{suff_len} ? "" : $word,
580							'ref' => $ref,
581							suf_len => $args{suff_len},
582							data => $args{data},
583							);
584	9					89	return @retarray;
585							}
586							else {
587	30	100				166	if ($self->{_DEEPSEARCH} == BOOLEAN) {
		100
		100
		100
		50
588							# Here, the user only wants to know if any words in the trie begin
589							# with their word, so that's what we give them.
590	3					19	return 1;
591							}
592							elsif ($self->{_DEEPSEARCH} == EXACT) {
593							# In this case, the user wants us to return something only if the
594							# exact word exists in the trie, and undef otherwise.
595							# This option only really makes sense with when looking up data,
596							# as otherwise it's essentially the same as BOOLEAN, above, but it
597							# doesn't hurt to allow it to work with normal lookup, either.
598							# I'd initially left this out because I didn't see a use for it, but
599							# thanks to Otmal Lendl for pointing out to me a situation in which
600							# it would be helpful to have.
601	4	100				12	if (exists $ref->{$self->{_END}}) {
602	2	100				5	if ($args{data}) {
603	1					9	return $ref->{$self->{_END}};
604							}
605	1					8	return $word;
606							}
607	2					54	return undef;
608							}
609							elsif ($self->{_DEEPSEARCH} == CHOOSE) {
610							# If they want this, then we continue to walk down the trie, collecting
611							# letters, until we find a leaf node, at which point we stop. Note that
612							# this works properly if the exact word is in the trie. Yay.
613							# Of course, making it work that way means that we tend to get shorter
614							# words in choose... is this a bad thing? I dunno.
615	9	50				24	my($stub) = $wantref ? [] : "";
616	9		100			12	while (scalar keys %{$ref} && !exists $ref->{$self->{_END}}) {
	21					130
617	12					14	$nextletter = each(%{ $ref });
	12					22
618							# I need to call this to clear the each() call. Wish I didn't...
619	12					15	keys(%{ $ref });
	12					13
620	12	50				26	if ($wantref) {
621	0					0	push(@{$stub}, $nextletter);
	0					0
622							}
623							else {
624	12					16	$stub .= $nextletter;
625							}
626	12					16	$ref = $ref->{$nextletter};
627							# If we're doing suffixes, bail out early once it's the right length.
628	12	100				44	if ($args{suff_len}) {
629	10	50				24	my $cmpr = $wantref ? scalar @{$stub} : length $stub;
	0					0
630	10	50				35	last if $cmpr == $args{suff_len};
631							}
632							}
633	9	100				21	if ($args{data}) {
634	4					35	return $ref->{$self->{_END}};
635							}
636							# If they've specified a suffix length, then they don't want the
637							# beginning part of the word.
638	5	100				13	if ($args{suff_len}) {
639	3					19	return $stub;
640							}
641							# Otherwise, they do.
642							else {
643	2	50				15	return $wantref ? [@{$word}, @{$stub}] : $word . $stub;
	0					0
	0					0
644							}
645							}
646							elsif ($self->{_DEEPSEARCH} == COUNT) {
647							# Here, the user simply wants a count of words in the trie that begin
648							# with their word, so we get that by calling our walktree method in
649							# scalar context.
650	10	100				46	return scalar $self->_walktree(
651							# When fetching suffixes, we don't want to give the word begnning.
652							word => $args{suff_len} ? "" : $word,
653							'ref' => $ref,
654							suf_len => $args{suff_len},
655							);
656							}
657							elsif ($self->{_DEEPSEARCH} == PREFIX) {
658							# This is the "longest prefix found" case.
659	4	100				15	if (exists $ref->{$self->{_END}}) {
660	2	100				9	if ($args{data}) {
661	1					10	return $ref->{$self->{_END}};
662							}
663	1	50				4	if ($wantref) {
664	0					0	return [@{$lastword}, @{$pref}];
	0					0
	0					0
665							}
666							else {
667	1					9	return $lastword . $pref;
668							}
669							}
670	2	100				14	if ($args{data}) {
671	1					9	return $lastwordref->{$self->{_END}};
672							}
673	1					8	return $lastword;
674							}
675							}
676							}
677
678							# This is the method which does all of the heavy lifting for add and
679							# add_data. Given a word and a datum, it walks down the trie until
680							# it finds a branch that hasn't been created yet. It then makes the rest
681							# of the branch, and slaps an end marker and the datum inside of it.
682							sub _add_internal {
683	66			66		206	my $self = shift;
684	66					72	my $word = shift;
685	66					218	my $datum = shift;
686	66					64	my @letters;
687							# We don't NEED to split a string into letters; Any array of tokens
688							# will do.
689	66	100				181	if (ref($word) eq 'ARRAY') {
690							# Note: this is a copy
691	8					12	@letters = (@{$word});
	8					24
692							# Because in this case, a "letter" can be more than on character
693							# long, we have to make sure we don't collide with whatever we're
694							# using as an end marker.
695							# However, if the user is feeling all fanciful and told us not to
696							# bother, we won't.
697	8	50				40	unless ($self->{_FREEZE_END}) {
698	8					13	for my $letter (@letters) {
699	29	100				69	if ($letter eq $self->{_END}) {
700							# If we had a collision, then make a new end marker.
701	3					9	$self->end_marker($self->_gen_new_marker(
702							bad => \@letters,
703							));
704	3					7	last;
705							}
706							}
707							}
708							}
709							else {
710	58					205	@letters = split('',$word);
711							}
712							# Start at the top of the Trie...
713	66					122	my $ref = $self->{_MAINHASHREF};
714							# This will walk down the trie as far as it can, until it either runs
715							# out of word or out of trie.
716	66		100			533	while (
717							(scalar @letters) &&
718							exists($ref->{$letters[0]})
719							) {
720	73					320	$ref = $ref->{shift(@letters)};
721							}
722							# If it ran out of trie before it ran out of word then this will create
723							# the rest of the trie structure.
724	66					116	for my $letter (@letters) {
725	238					705	$ref = $ref->{$letter} = {};
726							}
727							# In either case, this will make the new end marker for the end of the
728							# word (assuming it wasn't already there) and set the return value
729							# appropriately.
730	66					170	my $ret = 1;
731	66	100				718	if (exists $ref->{$self->{_END}}) {
732	4					7	$ret = 0;
733							}
734							else {
735	62					132	$ref->{$self->{_END}} = undef;
736							}
737							# This will set the data if it was provided.
738	66	100				133	if (defined $datum) {
739	13					22	$ref->{$self->{_END}} = $datum;
740							}
741	66					224	return $ret;
742							}
743
744							# This uses a heuristic (that is, a crappy method) to generate a new
745							# end marker for the trie. In addition to being sure that whatever is
746							# generated is not in use as a letter in the trie, it also makes a bold
747							# yet mostly vain attempt to try to make something that might not be
748							# used in the future.
749							# In general, I do not try to make this functionality good or fast or
750							# perfect -- if it's being called often, the module is being mis-used.
751							# If a user is using multi-character letters, then they ought to find
752							# a string that will be safe and set it themselves.
753							sub _gen_new_marker {
754	4			4		6	my $self = shift;
755	4					10	my %args = @_;
756							# This will keep track of all of the letters used in the trie already
757	4					8	my %used = ();
758							# This will keep track of what lengths they are
759	4					7	my %sizes = ();
760							# First we process the letters of the word which sparked this
761							# re-evaluation.
762	4					75	for my $letter (@{$args{bad}}) {
	4					11
763	11					14	my $len = length($letter);
764	11	100				24	if ($len != 1) {
765	8					18	$used{$letter}++;
766	8					18	$sizes{$len}++;
767							}
768							}
769							# Then we walk the tree and get the info on all the other letters.
770	4					10	my @refs = ($self->{_MAINHASHREF});
771	4					11	while (@refs) {
772	5					130	my $ref = shift @refs;
773	5					6	for my $key (keys %{$ref}) {
	5					14
774							# Note we don't even care about length 1 letters.
775	7	100	66			37	if (
776							(length($key) != 1) &&
777							($key ne $self->{_END})
778							) {
779	1					2	$used{$key}++;
780	1					3	$sizes{length($key)}++;
781	1					2	push(@refs, $ref->{$key});
782							}
783							}
784							}
785							# The idea here is that we want to make the end marker as small as possible,
786							# as it's stuck all over the place. However, we don't want to spend forever
787							# trying to find one that isn't in use.
788							# So, we find the smallest length such that there are fewer than 1/4 of
789							# the total number of possible letters in use of that length, and we make
790							# a key of that length.
791	4					7	my $newlen = 2;
792	4					12	for my $len (sort keys %sizes) {
793							# Yes, I know there are well more than 26 available compositors, but
794							# this will only mean I'm being too careful.
795	6	100				27	if ($sizes{$len} < ((26 ** $len) / 4)) {
796	3					6	$newlen = $len;
797	3					4	last;
798							}
799							else {
800							# This makes it so that if all existing lengths are too full ( !! )
801							# then we will just use a key that's one longer than the longest
802							# one already there.
803	3					6	$newlen = $len + 1;
804							}
805							}
806							# Now we just generate end markers until we find one that isn't in use.
807	4					6	my $newend;
808	4					6	do {
809	4					9	$newend = join '', map { chr(int(rand(128))) } (('') x $newlen);
	7					98
810							} while (exists($used{$newend}));
811							# And return it.
812	4					21	return $newend;
813							}
814
815							# Strewth!
816							1;
817
818							__END__