line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Tree::Trie, a module implementing a trie data structure. |
2
|
|
|
|
|
|
|
# A formal description of tries can be found at: |
3
|
|
|
|
|
|
|
# http://www.cs.queensu.ca/home/daver/235/Notes/Tries.pdf |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
package Tree::Trie; |
6
|
|
|
|
|
|
|
|
7
|
4
|
|
|
4
|
|
117014
|
use strict; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
154
|
|
8
|
4
|
|
|
4
|
|
25
|
use warnings; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
199
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = "1.9"; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# A handful of helpful constants |
13
|
4
|
|
|
4
|
|
23
|
use constant DEFAULT_END_MARKER => ''; |
|
4
|
|
|
|
|
12
|
|
|
4
|
|
|
|
|
365
|
|
14
|
|
|
|
|
|
|
|
15
|
4
|
|
|
4
|
|
22
|
use constant BOOLEAN => 0; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
181
|
|
16
|
4
|
|
|
4
|
|
21
|
use constant CHOOSE => 1; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
236
|
|
17
|
4
|
|
|
4
|
|
20
|
use constant COUNT => 2; |
|
4
|
|
|
|
|
4
|
|
|
4
|
|
|
|
|
149
|
|
18
|
4
|
|
|
4
|
|
31
|
use constant PREFIX => 3; |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
167
|
|
19
|
4
|
|
|
4
|
|
21
|
use constant EXACT => 4; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
16806
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
## Public methods begin here |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# The constructor method. It's very simple. |
24
|
|
|
|
|
|
|
sub new { |
25
|
16
|
|
|
16
|
1
|
3202
|
my($proto) = shift; |
26
|
16
|
|
|
|
|
31
|
my($options) = shift; |
27
|
16
|
|
33
|
|
|
220
|
my($class) = ref($proto) || $proto; |
28
|
16
|
|
|
|
|
36
|
my($self) = {}; |
29
|
16
|
|
|
|
|
78
|
bless($self, $class); |
30
|
16
|
|
|
|
|
64
|
$self->{_MAINHASHREF} = {}; |
31
|
|
|
|
|
|
|
# These are default values |
32
|
16
|
|
|
|
|
55
|
$self->{_END} = &DEFAULT_END_MARKER; |
33
|
16
|
|
|
|
|
36
|
$self->{_DEEPSEARCH} = CHOOSE; |
34
|
16
|
|
|
|
|
26
|
$self->{_FREEZE_END} = 0; |
35
|
16
|
100
|
66
|
|
|
60
|
unless ( defined($options) && (ref($options) eq "HASH") ) { |
36
|
15
|
|
|
|
|
25
|
$options = {}; |
37
|
|
|
|
|
|
|
} |
38
|
16
|
|
|
|
|
75
|
$self->deepsearch($options->{'deepsearch'}); |
39
|
16
|
100
|
|
|
|
110
|
if (exists $options->{end_marker}) { |
40
|
1
|
|
|
|
|
5
|
$self->end_marker($options->{end_marker}); |
41
|
|
|
|
|
|
|
} |
42
|
16
|
100
|
|
|
|
42
|
if (exists $options->{freeze_end_marker}) { |
43
|
1
|
|
|
|
|
4
|
$self->freeze_end_marker($options->{freeze_end_marker}); |
44
|
|
|
|
|
|
|
} |
45
|
16
|
|
|
|
|
135
|
return($self); |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Sets the value of the end marker, for those people who think they know |
49
|
|
|
|
|
|
|
# better than Tree::Trie. Note it does not allow the setting of single |
50
|
|
|
|
|
|
|
# character end markers. |
51
|
|
|
|
|
|
|
sub end_marker { |
52
|
6
|
|
|
6
|
1
|
10
|
my $self = shift; |
53
|
6
|
100
|
66
|
|
|
34
|
if ($_[0] && length $_[0] > 1) { |
54
|
|
|
|
|
|
|
# If they decide to set a new end marker, we have to be sure to |
55
|
|
|
|
|
|
|
# go through and update all existing markers. |
56
|
5
|
|
|
|
|
7
|
my $newend = shift; |
57
|
5
|
|
|
|
|
11
|
my @refs = ($self->{_MAINHASHREF}); |
58
|
5
|
|
|
|
|
15
|
while (@refs) { |
59
|
35
|
|
|
|
|
64
|
my $ref = shift @refs; |
60
|
35
|
|
|
|
|
38
|
for my $key (keys %{$ref}) { |
|
35
|
|
|
|
|
93
|
|
61
|
41
|
100
|
|
|
|
151
|
if ($key eq $self->{_END}) { |
62
|
11
|
|
|
|
|
23
|
$ref->{$newend} = $ref->{$key}; |
63
|
11
|
|
|
|
|
38
|
delete $ref->{$key}; |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
else { |
66
|
30
|
|
|
|
|
92
|
push(@refs, $ref->{$key}); |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
} |
70
|
5
|
|
|
|
|
10
|
$self->{_END} = $newend; |
71
|
|
|
|
|
|
|
} |
72
|
6
|
|
|
|
|
13
|
return $self->{_END}; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Sets the option to not attempt to update the end marker based on added |
76
|
|
|
|
|
|
|
# letters. |
77
|
|
|
|
|
|
|
# The above is the most awkward sentence I have ever written. |
78
|
|
|
|
|
|
|
sub freeze_end_marker { |
79
|
2
|
|
|
2
|
1
|
482
|
my $self = shift; |
80
|
2
|
50
|
|
|
|
7
|
if (scalar @_) { |
81
|
2
|
100
|
|
|
|
6
|
if (shift) { |
82
|
1
|
|
|
|
|
2
|
$self->{_FREEZE_END} = 1; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
else { |
85
|
1
|
|
|
|
|
3
|
$self->{_FREEZE_END} = 0; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
} |
88
|
2
|
|
|
|
|
6
|
return $self->{_FREEZE_END}; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# Sets the value of the deepsearch parameter. Can be passed either words |
92
|
|
|
|
|
|
|
# describing the parameter, or their numerical equivalents. Legal values |
93
|
|
|
|
|
|
|
# are: |
94
|
|
|
|
|
|
|
# boolean => 0 |
95
|
|
|
|
|
|
|
# choose => 1 |
96
|
|
|
|
|
|
|
# count => 2 |
97
|
|
|
|
|
|
|
# prefix => 3 |
98
|
|
|
|
|
|
|
# exact => 4 |
99
|
|
|
|
|
|
|
# See the POD for the 'lookup' method for details on this option. |
100
|
|
|
|
|
|
|
sub deepsearch { |
101
|
28
|
|
|
28
|
1
|
54
|
my($self) = shift; |
102
|
28
|
|
|
|
|
51
|
my($option) = shift; |
103
|
28
|
100
|
|
|
|
64
|
if(defined($option)) { |
104
|
12
|
100
|
66
|
|
|
177
|
if ($option eq BOOLEAN || $option eq 'boolean') { |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
50
|
33
|
|
|
|
|
105
|
2
|
|
|
|
|
7
|
$self->{_DEEPSEARCH} = BOOLEAN; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
elsif ($option eq CHOOSE || $option eq 'choose') { |
108
|
4
|
|
|
|
|
9
|
$self->{_DEEPSEARCH} = CHOOSE; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
elsif ($option eq COUNT || $option eq 'count') { |
111
|
3
|
|
|
|
|
7
|
$self->{_DEEPSEARCH} = COUNT; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
elsif ($option eq PREFIX || $option eq 'prefix') { |
114
|
2
|
|
|
|
|
5
|
$self->{_DEEPSEARCH} = PREFIX; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
elsif ($option eq EXACT || $option eq 'exact') { |
117
|
1
|
|
|
|
|
2
|
$self->{_DEEPSEARCH} = EXACT; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
} |
120
|
28
|
|
|
|
|
75
|
return $self->{_DEEPSEARCH}; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# The add() method takes a list of words as arguments and attempts to add |
124
|
|
|
|
|
|
|
# them to the trie. In list context, returns a list of words successfully |
125
|
|
|
|
|
|
|
# added. In scalar context, returns a count of these words. As of this |
126
|
|
|
|
|
|
|
# version, the only reason a word can fail to be added is if it is already |
127
|
|
|
|
|
|
|
# in the trie. Or, I suppose, if there was a bug. :) |
128
|
|
|
|
|
|
|
sub add { |
129
|
13
|
|
|
13
|
1
|
145
|
my($self) = shift; |
130
|
13
|
|
|
|
|
37
|
my(@words) = @_; |
131
|
|
|
|
|
|
|
|
132
|
13
|
|
|
|
|
306
|
my @retarray; |
133
|
13
|
|
|
|
|
20
|
my $retnum = 0; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Process each word... |
136
|
13
|
|
|
|
|
24
|
for my $word (@words) { |
137
|
|
|
|
|
|
|
# And just call the internal thingy for it. |
138
|
53
|
100
|
|
|
|
113
|
if ($self->_add_internal($word, undef)) { |
139
|
|
|
|
|
|
|
# Updating return values as needed |
140
|
50
|
50
|
|
|
|
83
|
if (wantarray) { |
141
|
0
|
|
|
|
|
0
|
push(@retarray,$word); |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
else { |
144
|
50
|
|
|
|
|
263
|
$retnum++; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
# When done, return results. |
149
|
13
|
50
|
|
|
|
71
|
return (wantarray ? @retarray : $retnum); |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# add_data() takes a hash of word => data pairs, adds the words to the trie and |
153
|
|
|
|
|
|
|
# associates the data to those words. |
154
|
|
|
|
|
|
|
sub add_data { |
155
|
7
|
|
|
7
|
1
|
67
|
my($self) = shift; |
156
|
7
|
|
|
|
|
11
|
my($retnum, @retarray); |
157
|
7
|
|
|
|
|
9
|
my $word = ""; |
158
|
|
|
|
|
|
|
# Making sure that we've gotten data in pairs. Can't just turn @_ |
159
|
|
|
|
|
|
|
# into %data, because that would stringify arrayrefs |
160
|
7
|
|
66
|
|
|
44
|
while(defined($word = shift) && @_) { |
161
|
|
|
|
|
|
|
# This also just uses the internal add method. |
162
|
13
|
100
|
|
|
|
35
|
if ($self->_add_internal($word, shift())) { |
163
|
12
|
50
|
|
|
|
24
|
if (wantarray) { |
164
|
0
|
|
|
|
|
0
|
push(@retarray, $word); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
else { |
167
|
12
|
|
|
|
|
48
|
$retnum++; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
} |
171
|
7
|
50
|
|
|
|
18
|
return @retarray if wantarray; |
172
|
7
|
|
|
|
|
31
|
return $retnum; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# add_all() takes one or more other tries and adds all of their entries |
176
|
|
|
|
|
|
|
# to the trie. If both tries have data stored for the same key, the data |
177
|
|
|
|
|
|
|
# from the trie on which this method was invoked will be overwritten. I can't |
178
|
|
|
|
|
|
|
# think of anything useful to return from this method, so it has no return |
179
|
|
|
|
|
|
|
# value. If you can think of anything that would make sense, please let me |
180
|
|
|
|
|
|
|
# know. |
181
|
|
|
|
|
|
|
# This idea and most of its implementation come from Aaron Stone. |
182
|
|
|
|
|
|
|
# Thanks! |
183
|
|
|
|
|
|
|
sub add_all { |
184
|
2
|
|
|
2
|
1
|
11
|
my $self = shift; |
185
|
2
|
|
|
|
|
5
|
for my $trie (@_) { |
186
|
2
|
|
66
|
|
|
13
|
my $ignore_end = ( |
187
|
|
|
|
|
|
|
$self->{_FREEZE_END} || |
188
|
|
|
|
|
|
|
($self->{_END} eq $trie->{_END}) |
189
|
|
|
|
|
|
|
); |
190
|
2
|
|
|
|
|
16
|
my @nodepairs = ({ |
191
|
|
|
|
|
|
|
from => $trie->{_MAINHASHREF}, |
192
|
|
|
|
|
|
|
to => $self->{_MAINHASHREF}, |
193
|
|
|
|
|
|
|
}); |
194
|
2
|
|
|
|
|
7
|
while (scalar @nodepairs) { |
195
|
34
|
|
|
|
|
49
|
my $np = pop @nodepairs; |
196
|
34
|
|
|
|
|
34
|
for my $letter (keys %{$np->{from}}) { |
|
34
|
|
|
|
|
82
|
|
197
|
39
|
100
|
|
|
|
128
|
unless ($ignore_end) { |
198
|
9
|
100
|
|
|
|
23
|
if ($letter eq $self->{_END}) { |
199
|
1
|
|
|
|
|
7
|
$self->end_marker($self->_gen_new_marker( |
200
|
|
|
|
|
|
|
bad => [$letter], |
201
|
|
|
|
|
|
|
)); |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
} |
204
|
39
|
100
|
|
|
|
350
|
if ($letter eq $trie->{_END}) { |
205
|
7
|
|
|
|
|
434
|
$np->{to}{$self->{_END}} = $np->{from}{$trie->{_END}}; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
else { |
208
|
32
|
100
|
|
|
|
63
|
unless (exists $np->{to}{$letter}) { |
209
|
24
|
|
|
|
|
79
|
$np->{to}{$letter} = {}; |
210
|
|
|
|
|
|
|
} |
211
|
32
|
|
|
|
|
239
|
push @nodepairs, { |
212
|
|
|
|
|
|
|
from => $np->{from}{$letter}, |
213
|
|
|
|
|
|
|
to => $np->{to}->{$letter}, |
214
|
|
|
|
|
|
|
}; |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# delete_data() takes a list of words in the trie and deletes the associated |
222
|
|
|
|
|
|
|
# data from the internal data store. In list context, returns a list of words |
223
|
|
|
|
|
|
|
# whose associated data have been removed -- in scalar context, returns a count |
224
|
|
|
|
|
|
|
# thereof. |
225
|
|
|
|
|
|
|
sub delete_data { |
226
|
1
|
|
|
1
|
1
|
5
|
my($self, @words) = @_; |
227
|
1
|
|
|
|
|
2
|
my($retnum, @retarray) = 0; |
228
|
1
|
|
|
|
|
3
|
my @letters; |
229
|
|
|
|
|
|
|
# Process each word... |
230
|
1
|
|
|
|
|
3
|
for my $word (@words) { |
231
|
3
|
50
|
|
|
|
13
|
if (ref($word) eq 'ARRAY') { |
232
|
0
|
|
|
|
|
0
|
@letters = (@{$word}); |
|
0
|
|
|
|
|
0
|
|
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
else { |
235
|
3
|
|
|
|
|
12
|
@letters = split(//, $word); |
236
|
|
|
|
|
|
|
} |
237
|
3
|
|
|
|
|
6
|
my $ref = $self->{_MAINHASHREF}; |
238
|
|
|
|
|
|
|
# Walk down the tree... |
239
|
3
|
|
|
|
|
6
|
for my $letter (@letters) { |
240
|
8
|
100
|
|
|
|
20
|
if ($ref->{$letter}) { |
241
|
7
|
|
|
|
|
16
|
$ref = $ref->{$letter}; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
else { |
244
|
|
|
|
|
|
|
# This will cause the test right after this loop to fail and |
245
|
|
|
|
|
|
|
# skip the the next word -- we want that because if we're here |
246
|
|
|
|
|
|
|
# it means the word isn't in the trie. |
247
|
1
|
|
|
|
|
2
|
$ref = {}; |
248
|
1
|
|
|
|
|
3
|
last; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
} |
251
|
3
|
100
|
|
|
|
14
|
next unless (exists $ref->{$self->{_END}}); |
252
|
|
|
|
|
|
|
# This is all we need to do to clear out the data |
253
|
2
|
|
|
|
|
6
|
$ref->{$self->{_END}} = undef; |
254
|
2
|
50
|
|
|
|
6
|
if (wantarray) { |
255
|
0
|
|
|
|
|
0
|
push(@retarray, $word); |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
else { |
258
|
2
|
|
|
|
|
5
|
$retnum++; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
} |
261
|
1
|
50
|
|
|
|
8
|
if (wantarray) { |
262
|
0
|
|
|
|
|
0
|
return @retarray; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
else { |
265
|
1
|
|
|
|
|
6
|
return $retnum; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# The lookup() method searches for words (or beginnings of words) in the trie. |
270
|
|
|
|
|
|
|
# It takes a single word as an argument and, in list context, returns a list |
271
|
|
|
|
|
|
|
# of all the words in the trie which begin with the given word. In scalar |
272
|
|
|
|
|
|
|
# context, the return value depends on the value of the deepsearch parameter. |
273
|
|
|
|
|
|
|
# An optional second argument is available: This should be a numerical |
274
|
|
|
|
|
|
|
# argument, and specifies 2 things: first, that you want only word suffixes |
275
|
|
|
|
|
|
|
# to be returned, and second, the maximum length of those suffices. All |
276
|
|
|
|
|
|
|
# other configurations still apply. See the POD on this method for more |
277
|
|
|
|
|
|
|
# details. |
278
|
|
|
|
|
|
|
sub lookup { |
279
|
37
|
|
|
37
|
1
|
2723
|
my($self) = shift; |
280
|
37
|
|
|
|
|
52
|
my($word) = shift; |
281
|
|
|
|
|
|
|
# This is the argument for doing suffix lookup. |
282
|
37
|
|
|
|
|
43
|
my($suff_length) = shift; |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# Abstraction is kind of cool |
285
|
37
|
|
|
|
|
93
|
return $self->_lookup_internal( |
286
|
|
|
|
|
|
|
word => $word, |
287
|
|
|
|
|
|
|
suff_len => $suff_length, |
288
|
|
|
|
|
|
|
want_arr => wantarray(), |
289
|
|
|
|
|
|
|
data => 0, |
290
|
|
|
|
|
|
|
); |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# lookup_data() works basically the same as lookup, with the following |
294
|
|
|
|
|
|
|
# exceptions -- in list context, returns a hash of ward => data pairings, |
295
|
|
|
|
|
|
|
# and in scalar context, wherever it would return a word, it will instead |
296
|
|
|
|
|
|
|
# return the datum associated with that word. Note that, depending on |
297
|
|
|
|
|
|
|
# the deepsearch setting, lookup_data and lookup may return exactly the |
298
|
|
|
|
|
|
|
# same scalar context. |
299
|
|
|
|
|
|
|
sub lookup_data { |
300
|
11
|
|
|
11
|
1
|
25
|
my($self, $word) = @_; |
301
|
|
|
|
|
|
|
|
302
|
11
|
|
|
|
|
600
|
return $self->_lookup_internal( |
303
|
|
|
|
|
|
|
word => $word, |
304
|
|
|
|
|
|
|
want_arr => wantarray(), |
305
|
|
|
|
|
|
|
data => 1, |
306
|
|
|
|
|
|
|
); |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# The remove() method takes a list of words and, surprisingly, removes them |
310
|
|
|
|
|
|
|
# from the trie. It returns, in scalar context, the number of words removed. |
311
|
|
|
|
|
|
|
# In list context, returns a list of the words removed. As of now, the only |
312
|
|
|
|
|
|
|
# reason a word would fail to be removed is if it's not in the trie in the |
313
|
|
|
|
|
|
|
# first place. Or, again, if there's a bug... :) |
314
|
|
|
|
|
|
|
sub remove { |
315
|
3
|
|
|
3
|
1
|
571
|
my($self) = shift; |
316
|
3
|
|
|
|
|
9
|
my(@words) = @_; |
317
|
|
|
|
|
|
|
|
318
|
3
|
|
|
|
|
6
|
my($letter,$ref) = ("","",""); |
319
|
3
|
|
|
|
|
5
|
my(@letters,@ldn,@retarray); |
320
|
3
|
|
|
|
|
6
|
my($retnum) = 0; |
321
|
|
|
|
|
|
|
# The basic strategy here is as follows: |
322
|
|
|
|
|
|
|
## |
323
|
|
|
|
|
|
|
# We walk down the trie one node at a time. If at any point, we see that a |
324
|
|
|
|
|
|
|
# node can be deleted (that is, its only child is the one which continues the |
325
|
|
|
|
|
|
|
# word we're deleting) then we mark it as the 'last deleteable'. If at any |
326
|
|
|
|
|
|
|
# point we find a node which *cannot* be deleted (it has more children other |
327
|
|
|
|
|
|
|
# than the one for the word we're working on), then we unmark our 'last |
328
|
|
|
|
|
|
|
# deleteable' from before. Once done, delete from the last deleteable node |
329
|
|
|
|
|
|
|
# down. |
330
|
|
|
|
|
|
|
|
331
|
3
|
|
|
|
|
7
|
for my $word (@words) { |
332
|
5
|
100
|
|
|
|
14
|
if (ref($word) eq 'ARRAY') { |
333
|
1
|
|
|
|
|
1
|
@letters = (@{$word}); |
|
1
|
|
|
|
|
3
|
|
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
else { |
336
|
4
|
|
|
|
|
17
|
@letters = split('',$word); |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
# For each word, we need to put the leaf node entry at the end of the list |
339
|
|
|
|
|
|
|
# of letters. We then reset the starting ref, and @ldn, which stands for |
340
|
|
|
|
|
|
|
# 'last deleteable node'. It contains the ref of the hash and the key to |
341
|
|
|
|
|
|
|
# be deleted. It does not seem possible to store a value passable to |
342
|
|
|
|
|
|
|
# the 'delete' builtin in a scalar, so we're forced to do this. |
343
|
5
|
|
|
|
|
11
|
push(@letters,$self->{_END}); |
344
|
5
|
|
|
|
|
9
|
$ref = $self->{_MAINHASHREF}; |
345
|
5
|
|
|
|
|
10
|
@ldn = (); |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
# This is a special case, if the first letter of the word is the only |
348
|
|
|
|
|
|
|
# key of the main hash. I might not really need it, but this works as |
349
|
|
|
|
|
|
|
# it is. |
350
|
5
|
100
|
66
|
|
|
7
|
if (((scalar keys(%{ $ref })) == 1) && (exists $ref->{$letters[0]})) { |
|
5
|
|
|
|
|
27
|
|
351
|
1
|
|
|
|
|
2
|
@ldn = ($ref); |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
# And now we go down the trie, as described above. |
354
|
5
|
|
|
|
|
17
|
while (defined($letter = shift(@letters))) { |
355
|
|
|
|
|
|
|
# We break out if we're at the end, or if we're run out of trie before |
356
|
|
|
|
|
|
|
# finding the end of the word -- that is, if the word isn't in the |
357
|
|
|
|
|
|
|
# trie. |
358
|
25
|
100
|
|
|
|
67
|
last if ($letter eq $self->{_END}); |
359
|
20
|
50
|
|
|
|
47
|
last unless exists($ref->{$letter}); |
360
|
20
|
100
|
66
|
|
|
21
|
if ( |
361
|
20
|
|
|
|
|
104
|
scalar keys(%{ $ref->{$letter} }) == 1 && |
362
|
|
|
|
|
|
|
exists $ref->{$letter}{$letters[0]} |
363
|
|
|
|
|
|
|
) { |
364
|
18
|
100
|
|
|
|
37
|
unless (scalar @ldn) { |
365
|
4
|
|
|
|
|
10
|
@ldn = ($ref,$letter); |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
else { |
369
|
2
|
|
|
|
|
5
|
@ldn = (); |
370
|
|
|
|
|
|
|
} |
371
|
20
|
|
|
|
|
55
|
$ref = $ref->{$letter}; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
# If we broke out and there were still letters left in @letters, then the |
374
|
|
|
|
|
|
|
# word must not be in the trie. Furthermore, if we got all the way to |
375
|
|
|
|
|
|
|
# the end, but there's no leaf node, the word must not be in the trie. |
376
|
5
|
50
|
|
|
|
16
|
next if (scalar @letters); |
377
|
5
|
50
|
|
|
|
22
|
next unless (exists($ref->{$self->{_END}})); |
378
|
|
|
|
|
|
|
# If @ldn is empty, then the only deleteable node is the leaf node, so |
379
|
|
|
|
|
|
|
# we set this up. |
380
|
5
|
100
|
|
|
|
12
|
if (scalar @ldn == 0) { |
381
|
1
|
|
|
|
|
4
|
@ldn = ($ref,$self->{_END}); |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
# If there's only one entry in @ldn, then it's the ref of the top of our |
384
|
|
|
|
|
|
|
# Trie. If that's marked as deleteable, then we can just nuke the entire |
385
|
|
|
|
|
|
|
# hash. |
386
|
5
|
100
|
|
|
|
12
|
if (scalar @ldn == 1) { |
387
|
1
|
|
|
|
|
2
|
%{ $ldn[0] } = (); |
|
1
|
|
|
|
|
6
|
|
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
# Otherwise, we just delete the key we want to. |
390
|
|
|
|
|
|
|
else { |
391
|
4
|
|
|
|
|
12
|
delete($ldn[0]->{$ldn[1]}); |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
# And then just return stuff. |
394
|
5
|
50
|
|
|
|
11
|
if (wantarray) { |
395
|
0
|
|
|
|
|
0
|
push (@retarray,$word); |
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
else { |
398
|
5
|
|
|
|
|
11
|
$retnum++; |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
} |
401
|
3
|
50
|
|
|
|
10
|
if (wantarray) { |
402
|
0
|
|
|
|
|
0
|
return @retarray; |
403
|
|
|
|
|
|
|
} |
404
|
3
|
|
|
|
|
18
|
return $retnum; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
## These are PRIVATE METHODS. Don't call them directly unless you really |
408
|
|
|
|
|
|
|
# know what you're doing, or you enjoy things working funny. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# The _walktree() sub takes a word beginning and a hashref (hopefully to a trie) |
411
|
|
|
|
|
|
|
# and walks down the trie, gathering all of the word endings and retuning them |
412
|
|
|
|
|
|
|
# appended to the word beginning. |
413
|
|
|
|
|
|
|
sub _walktree { |
414
|
238
|
|
|
238
|
|
4397
|
my($self, %args) = @_; |
415
|
238
|
|
|
|
|
506
|
my $word = $args{word}; |
416
|
238
|
|
|
|
|
279
|
my $ref = $args{ref}; |
417
|
|
|
|
|
|
|
# These 2 arguments are used to control how far down the tree this |
418
|
|
|
|
|
|
|
# path will go. |
419
|
|
|
|
|
|
|
# This first argument is passed in by external subs |
420
|
238
|
|
100
|
|
|
892
|
my $suffix_length = $args{suf_len} || 0; |
421
|
|
|
|
|
|
|
# And this one is used only by the recursive calls. |
422
|
238
|
|
100
|
|
|
493
|
my $walked_suffix_length = $args{walked} || 0; |
423
|
|
|
|
|
|
|
|
424
|
238
|
|
|
|
|
415
|
my $wantref = ref($word) eq 'ARRAY'; |
425
|
|
|
|
|
|
|
|
426
|
238
|
|
|
|
|
346
|
my($key) = ""; |
427
|
238
|
|
|
|
|
427
|
my(@retarray) = (); |
428
|
238
|
|
|
|
|
248
|
my($ret) = 0; |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# For some reason, I used to think this was complicated and had a lot of |
431
|
|
|
|
|
|
|
# stupid, useless code here. It's a lot simpler now. If the key we find |
432
|
|
|
|
|
|
|
# is our magic reference, then we just give back the word. Otherwise, we |
433
|
|
|
|
|
|
|
# walk down the new subtree we've discovered. |
434
|
238
|
|
|
|
|
267
|
foreach $key (keys %{ $ref }) { |
|
238
|
|
|
|
|
806
|
|
435
|
291
|
100
|
|
|
|
993
|
if ($key eq $self->{_END}) { |
436
|
61
|
100
|
|
|
|
99
|
if (wantarray) { |
437
|
36
|
|
|
|
|
50
|
push(@retarray,$word); |
438
|
36
|
100
|
|
|
|
123
|
if ($args{data}) { |
439
|
3
|
|
|
|
|
5
|
push(@retarray, $ref->{$key}); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
else { |
443
|
25
|
|
|
|
|
26
|
$ret++; |
444
|
|
|
|
|
|
|
} |
445
|
61
|
|
|
|
|
137
|
next; |
446
|
|
|
|
|
|
|
} |
447
|
230
|
100
|
|
|
|
865
|
my $nextval = $wantref ? [(@{$word}), $key] : $word . $key; |
|
12
|
|
|
|
|
27
|
|
448
|
|
|
|
|
|
|
# If we've reached the max depth we need to travel for the suffix (if |
449
|
|
|
|
|
|
|
# specified), then stop and collect everything up. |
450
|
230
|
100
|
100
|
|
|
559
|
if ($suffix_length > 0 && ($suffix_length - $walked_suffix_length == 1)) { |
451
|
11
|
100
|
|
|
|
14
|
if (wantarray) { |
452
|
1
|
|
|
|
|
4
|
push(@retarray, $nextval); |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
else { |
455
|
10
|
|
|
|
|
25
|
$ret++; |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
else { |
459
|
|
|
|
|
|
|
# Look, recursion! |
460
|
219
|
|
|
|
|
1068
|
my %arguments = ( |
461
|
|
|
|
|
|
|
word => $nextval, |
462
|
|
|
|
|
|
|
'ref' => $ref->{$key}, |
463
|
|
|
|
|
|
|
suf_len => $suffix_length, |
464
|
|
|
|
|
|
|
walked => $walked_suffix_length + 1, |
465
|
|
|
|
|
|
|
data => $args{data}, |
466
|
|
|
|
|
|
|
); |
467
|
219
|
100
|
|
|
|
462
|
if (wantarray) { |
468
|
142
|
|
|
|
|
1242
|
push(@retarray, $self->_walktree(%arguments)); |
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
else { |
471
|
77
|
|
|
|
|
266
|
$ret += scalar $self->_walktree(%arguments); |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
} |
475
|
238
|
100
|
|
|
|
445
|
if (wantarray) { |
476
|
151
|
|
|
|
|
1147
|
return @retarray; |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
else { |
479
|
87
|
|
|
|
|
466
|
return $ret; |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# This code used to use some fairly hoary recursive code which caused it to |
484
|
|
|
|
|
|
|
# run fairly slowly, mainly due to the relatively slow way that perl handles |
485
|
|
|
|
|
|
|
# OO method invocation. This was pointed out to me by Justin Hicks, and he |
486
|
|
|
|
|
|
|
# helped me fix it up, to be quite a bit more reasonable now. |
487
|
|
|
|
|
|
|
sub _lookup_internal { |
488
|
48
|
|
|
48
|
|
60
|
my $self = shift; |
489
|
48
|
|
|
|
|
182
|
my %args = @_; |
490
|
48
|
|
|
|
|
634
|
my($ref) = $self->{_MAINHASHREF}; |
491
|
|
|
|
|
|
|
|
492
|
48
|
|
|
|
|
76
|
my($letter, $nextletter) = ("", ""); |
493
|
48
|
|
|
|
|
78
|
my(@letters) = (); |
494
|
48
|
|
|
|
|
54
|
my(@retarray) = (); |
495
|
48
|
|
|
|
|
60
|
my($wantref) = 0; |
496
|
|
|
|
|
|
|
|
497
|
48
|
|
|
|
|
72
|
my $word = $args{word}; |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
# Here we split the word up into letters in the appropriate way. |
500
|
48
|
100
|
|
|
|
99
|
if (ref($word) eq 'ARRAY') { |
501
|
5
|
|
|
|
|
4
|
@letters = (@{$word}); |
|
5
|
|
|
|
|
10
|
|
502
|
|
|
|
|
|
|
# Keeping track of what kind of word it was. |
503
|
5
|
|
|
|
|
6
|
$wantref = 1; |
504
|
|
|
|
|
|
|
} |
505
|
|
|
|
|
|
|
else { |
506
|
43
|
|
|
|
|
145
|
@letters = split('',$word); |
507
|
|
|
|
|
|
|
} |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# These three are to keep hold of possibly returned values. |
510
|
48
|
100
|
|
|
|
110
|
my $lastword = $wantref ? [] : ""; |
511
|
48
|
|
|
|
|
54
|
my $lastwordref = undef; |
512
|
48
|
100
|
|
|
|
89
|
my $pref = $wantref ? [] : ""; |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
# Like everything else, we step across each letter. |
515
|
48
|
|
|
|
|
140
|
while(defined($letter = shift(@letters))) { |
516
|
|
|
|
|
|
|
# This is to keep track of stuff for the "prefix" version of deepsearch. |
517
|
130
|
100
|
66
|
|
|
465
|
if ($self->{_DEEPSEARCH} == PREFIX && !$args{want_arr}) { |
518
|
72
|
100
|
|
|
|
169
|
if (exists $ref->{$self->{_END}}) { |
519
|
|
|
|
|
|
|
# The "data" argument tells us if we want to return the word |
520
|
|
|
|
|
|
|
# or the data associated with it. |
521
|
10
|
100
|
|
|
|
27
|
if ($args{data}) { |
|
|
50
|
|
|
|
|
|
522
|
5
|
|
|
|
|
9
|
$lastwordref = $ref; |
523
|
|
|
|
|
|
|
} |
524
|
|
|
|
|
|
|
elsif ($wantref) { |
525
|
0
|
|
|
|
|
0
|
push(@{$lastword}, @{$pref}); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
else { |
528
|
5
|
|
|
|
|
9
|
$lastword .= $pref; |
529
|
|
|
|
|
|
|
} |
530
|
10
|
50
|
|
|
|
22
|
$pref = $wantref ? [] : ""; |
531
|
|
|
|
|
|
|
} |
532
|
72
|
100
|
|
|
|
138
|
unless ($args{data}) { |
533
|
36
|
50
|
|
|
|
50
|
if ($wantref) { |
534
|
0
|
|
|
|
|
0
|
push(@{$pref}, $letter); |
|
0
|
|
|
|
|
0
|
|
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
else { |
537
|
36
|
|
|
|
|
49
|
$pref .= $letter; |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
# If, at any point, we find that we've run out of tree before we've run out |
542
|
|
|
|
|
|
|
# of word, then there is nothing in the trie that begins with the input |
543
|
|
|
|
|
|
|
# word, so we return appropriately. |
544
|
130
|
100
|
|
|
|
275
|
unless (exists $ref->{$letter}) { |
545
|
|
|
|
|
|
|
# Array case. |
546
|
9
|
50
|
|
|
|
44
|
if ($args{want_arr}) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
547
|
0
|
|
|
|
|
0
|
return (); |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
# "count" case. |
550
|
|
|
|
|
|
|
elsif ($self->{_DEEPSEARCH} == COUNT) { |
551
|
2
|
|
|
|
|
16
|
return 0; |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
# "prefix" case. |
554
|
|
|
|
|
|
|
elsif ($self->{_DEEPSEARCH} == PREFIX) { |
555
|
4
|
100
|
66
|
|
|
19
|
if ($args{data} && $lastwordref) { |
556
|
2
|
|
|
|
|
16
|
return $lastwordref->{$self->{_END}}; |
557
|
|
|
|
|
|
|
} |
558
|
2
|
50
|
50
|
|
|
24
|
if (($wantref && scalar @{$lastword}) || length $lastword) { |
|
0
|
|
33
|
|
|
0
|
|
559
|
2
|
|
|
|
|
18
|
return $lastword; |
560
|
|
|
|
|
|
|
} |
561
|
0
|
|
|
|
|
0
|
return undef; |
562
|
|
|
|
|
|
|
} |
563
|
|
|
|
|
|
|
# All other deepsearch cases are the same. |
564
|
|
|
|
|
|
|
else { |
565
|
3
|
|
|
|
|
19
|
return undef; |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
# If the letter is there, we just walk one step down the trie. |
569
|
121
|
|
|
|
|
313
|
$ref = $ref->{$letter}; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
# Once we've walked all the way down the tree to the end of the word we were |
572
|
|
|
|
|
|
|
# given, there are a few things that can be done, depending on the context |
573
|
|
|
|
|
|
|
# that the method was called in. |
574
|
39
|
100
|
|
|
|
82
|
if ($args{want_arr}) { |
575
|
|
|
|
|
|
|
# If they want an array, then we use the walktree subroutine to collect all |
576
|
|
|
|
|
|
|
# of the words beneath our current location in the trie, and return them. |
577
|
9
|
100
|
|
|
|
44
|
@retarray = $self->_walktree( |
578
|
|
|
|
|
|
|
# When fetching suffixes, we don't want to give the word begnning. |
579
|
|
|
|
|
|
|
word => $args{suff_len} ? "" : $word, |
580
|
|
|
|
|
|
|
'ref' => $ref, |
581
|
|
|
|
|
|
|
suf_len => $args{suff_len}, |
582
|
|
|
|
|
|
|
data => $args{data}, |
583
|
|
|
|
|
|
|
); |
584
|
9
|
|
|
|
|
89
|
return @retarray; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
else { |
587
|
30
|
100
|
|
|
|
166
|
if ($self->{_DEEPSEARCH} == BOOLEAN) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
588
|
|
|
|
|
|
|
# Here, the user only wants to know if any words in the trie begin |
589
|
|
|
|
|
|
|
# with their word, so that's what we give them. |
590
|
3
|
|
|
|
|
19
|
return 1; |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
elsif ($self->{_DEEPSEARCH} == EXACT) { |
593
|
|
|
|
|
|
|
# In this case, the user wants us to return something only if the |
594
|
|
|
|
|
|
|
# exact word exists in the trie, and undef otherwise. |
595
|
|
|
|
|
|
|
# This option only really makes sense with when looking up data, |
596
|
|
|
|
|
|
|
# as otherwise it's essentially the same as BOOLEAN, above, but it |
597
|
|
|
|
|
|
|
# doesn't hurt to allow it to work with normal lookup, either. |
598
|
|
|
|
|
|
|
# I'd initially left this out because I didn't see a use for it, but |
599
|
|
|
|
|
|
|
# thanks to Otmal Lendl for pointing out to me a situation in which |
600
|
|
|
|
|
|
|
# it would be helpful to have. |
601
|
4
|
100
|
|
|
|
12
|
if (exists $ref->{$self->{_END}}) { |
602
|
2
|
100
|
|
|
|
5
|
if ($args{data}) { |
603
|
1
|
|
|
|
|
9
|
return $ref->{$self->{_END}}; |
604
|
|
|
|
|
|
|
} |
605
|
1
|
|
|
|
|
8
|
return $word; |
606
|
|
|
|
|
|
|
} |
607
|
2
|
|
|
|
|
54
|
return undef; |
608
|
|
|
|
|
|
|
} |
609
|
|
|
|
|
|
|
elsif ($self->{_DEEPSEARCH} == CHOOSE) { |
610
|
|
|
|
|
|
|
# If they want this, then we continue to walk down the trie, collecting |
611
|
|
|
|
|
|
|
# letters, until we find a leaf node, at which point we stop. Note that |
612
|
|
|
|
|
|
|
# this works properly if the exact word is in the trie. Yay. |
613
|
|
|
|
|
|
|
# Of course, making it work that way means that we tend to get shorter |
614
|
|
|
|
|
|
|
# words in choose... is this a bad thing? I dunno. |
615
|
9
|
50
|
|
|
|
24
|
my($stub) = $wantref ? [] : ""; |
616
|
9
|
|
100
|
|
|
12
|
while (scalar keys %{$ref} && !exists $ref->{$self->{_END}}) { |
|
21
|
|
|
|
|
130
|
|
617
|
12
|
|
|
|
|
14
|
$nextletter = each(%{ $ref }); |
|
12
|
|
|
|
|
22
|
|
618
|
|
|
|
|
|
|
# I need to call this to clear the each() call. Wish I didn't... |
619
|
12
|
|
|
|
|
15
|
keys(%{ $ref }); |
|
12
|
|
|
|
|
13
|
|
620
|
12
|
50
|
|
|
|
26
|
if ($wantref) { |
621
|
0
|
|
|
|
|
0
|
push(@{$stub}, $nextletter); |
|
0
|
|
|
|
|
0
|
|
622
|
|
|
|
|
|
|
} |
623
|
|
|
|
|
|
|
else { |
624
|
12
|
|
|
|
|
16
|
$stub .= $nextletter; |
625
|
|
|
|
|
|
|
} |
626
|
12
|
|
|
|
|
16
|
$ref = $ref->{$nextletter}; |
627
|
|
|
|
|
|
|
# If we're doing suffixes, bail out early once it's the right length. |
628
|
12
|
100
|
|
|
|
44
|
if ($args{suff_len}) { |
629
|
10
|
50
|
|
|
|
24
|
my $cmpr = $wantref ? scalar @{$stub} : length $stub; |
|
0
|
|
|
|
|
0
|
|
630
|
10
|
50
|
|
|
|
35
|
last if $cmpr == $args{suff_len}; |
631
|
|
|
|
|
|
|
} |
632
|
|
|
|
|
|
|
} |
633
|
9
|
100
|
|
|
|
21
|
if ($args{data}) { |
634
|
4
|
|
|
|
|
35
|
return $ref->{$self->{_END}}; |
635
|
|
|
|
|
|
|
} |
636
|
|
|
|
|
|
|
# If they've specified a suffix length, then they don't want the |
637
|
|
|
|
|
|
|
# beginning part of the word. |
638
|
5
|
100
|
|
|
|
13
|
if ($args{suff_len}) { |
639
|
3
|
|
|
|
|
19
|
return $stub; |
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
# Otherwise, they do. |
642
|
|
|
|
|
|
|
else { |
643
|
2
|
50
|
|
|
|
15
|
return $wantref ? [@{$word}, @{$stub}] : $word . $stub; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
} |
646
|
|
|
|
|
|
|
elsif ($self->{_DEEPSEARCH} == COUNT) { |
647
|
|
|
|
|
|
|
# Here, the user simply wants a count of words in the trie that begin |
648
|
|
|
|
|
|
|
# with their word, so we get that by calling our walktree method in |
649
|
|
|
|
|
|
|
# scalar context. |
650
|
10
|
100
|
|
|
|
46
|
return scalar $self->_walktree( |
651
|
|
|
|
|
|
|
# When fetching suffixes, we don't want to give the word begnning. |
652
|
|
|
|
|
|
|
word => $args{suff_len} ? "" : $word, |
653
|
|
|
|
|
|
|
'ref' => $ref, |
654
|
|
|
|
|
|
|
suf_len => $args{suff_len}, |
655
|
|
|
|
|
|
|
); |
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
elsif ($self->{_DEEPSEARCH} == PREFIX) { |
658
|
|
|
|
|
|
|
# This is the "longest prefix found" case. |
659
|
4
|
100
|
|
|
|
15
|
if (exists $ref->{$self->{_END}}) { |
660
|
2
|
100
|
|
|
|
9
|
if ($args{data}) { |
661
|
1
|
|
|
|
|
10
|
return $ref->{$self->{_END}}; |
662
|
|
|
|
|
|
|
} |
663
|
1
|
50
|
|
|
|
4
|
if ($wantref) { |
664
|
0
|
|
|
|
|
0
|
return [@{$lastword}, @{$pref}]; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
665
|
|
|
|
|
|
|
} |
666
|
|
|
|
|
|
|
else { |
667
|
1
|
|
|
|
|
9
|
return $lastword . $pref; |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
} |
670
|
2
|
100
|
|
|
|
14
|
if ($args{data}) { |
671
|
1
|
|
|
|
|
9
|
return $lastwordref->{$self->{_END}}; |
672
|
|
|
|
|
|
|
} |
673
|
1
|
|
|
|
|
8
|
return $lastword; |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
} |
676
|
|
|
|
|
|
|
} |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# This is the method which does all of the heavy lifting for add and |
679
|
|
|
|
|
|
|
# add_data. Given a word and a datum, it walks down the trie until |
680
|
|
|
|
|
|
|
# it finds a branch that hasn't been created yet. It then makes the rest |
681
|
|
|
|
|
|
|
# of the branch, and slaps an end marker and the datum inside of it. |
682
|
|
|
|
|
|
|
sub _add_internal { |
683
|
66
|
|
|
66
|
|
206
|
my $self = shift; |
684
|
66
|
|
|
|
|
72
|
my $word = shift; |
685
|
66
|
|
|
|
|
218
|
my $datum = shift; |
686
|
66
|
|
|
|
|
64
|
my @letters; |
687
|
|
|
|
|
|
|
# We don't NEED to split a string into letters; Any array of tokens |
688
|
|
|
|
|
|
|
# will do. |
689
|
66
|
100
|
|
|
|
181
|
if (ref($word) eq 'ARRAY') { |
690
|
|
|
|
|
|
|
# Note: this is a copy |
691
|
8
|
|
|
|
|
12
|
@letters = (@{$word}); |
|
8
|
|
|
|
|
24
|
|
692
|
|
|
|
|
|
|
# Because in this case, a "letter" can be more than on character |
693
|
|
|
|
|
|
|
# long, we have to make sure we don't collide with whatever we're |
694
|
|
|
|
|
|
|
# using as an end marker. |
695
|
|
|
|
|
|
|
# However, if the user is feeling all fanciful and told us not to |
696
|
|
|
|
|
|
|
# bother, we won't. |
697
|
8
|
50
|
|
|
|
40
|
unless ($self->{_FREEZE_END}) { |
698
|
8
|
|
|
|
|
13
|
for my $letter (@letters) { |
699
|
29
|
100
|
|
|
|
69
|
if ($letter eq $self->{_END}) { |
700
|
|
|
|
|
|
|
# If we had a collision, then make a new end marker. |
701
|
3
|
|
|
|
|
9
|
$self->end_marker($self->_gen_new_marker( |
702
|
|
|
|
|
|
|
bad => \@letters, |
703
|
|
|
|
|
|
|
)); |
704
|
3
|
|
|
|
|
7
|
last; |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
} |
707
|
|
|
|
|
|
|
} |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
else { |
710
|
58
|
|
|
|
|
205
|
@letters = split('',$word); |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
# Start at the top of the Trie... |
713
|
66
|
|
|
|
|
122
|
my $ref = $self->{_MAINHASHREF}; |
714
|
|
|
|
|
|
|
# This will walk down the trie as far as it can, until it either runs |
715
|
|
|
|
|
|
|
# out of word or out of trie. |
716
|
66
|
|
100
|
|
|
533
|
while ( |
717
|
|
|
|
|
|
|
(scalar @letters) && |
718
|
|
|
|
|
|
|
exists($ref->{$letters[0]}) |
719
|
|
|
|
|
|
|
) { |
720
|
73
|
|
|
|
|
320
|
$ref = $ref->{shift(@letters)}; |
721
|
|
|
|
|
|
|
} |
722
|
|
|
|
|
|
|
# If it ran out of trie before it ran out of word then this will create |
723
|
|
|
|
|
|
|
# the rest of the trie structure. |
724
|
66
|
|
|
|
|
116
|
for my $letter (@letters) { |
725
|
238
|
|
|
|
|
705
|
$ref = $ref->{$letter} = {}; |
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
# In either case, this will make the new end marker for the end of the |
728
|
|
|
|
|
|
|
# word (assuming it wasn't already there) and set the return value |
729
|
|
|
|
|
|
|
# appropriately. |
730
|
66
|
|
|
|
|
170
|
my $ret = 1; |
731
|
66
|
100
|
|
|
|
718
|
if (exists $ref->{$self->{_END}}) { |
732
|
4
|
|
|
|
|
7
|
$ret = 0; |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
else { |
735
|
62
|
|
|
|
|
132
|
$ref->{$self->{_END}} = undef; |
736
|
|
|
|
|
|
|
} |
737
|
|
|
|
|
|
|
# This will set the data if it was provided. |
738
|
66
|
100
|
|
|
|
133
|
if (defined $datum) { |
739
|
13
|
|
|
|
|
22
|
$ref->{$self->{_END}} = $datum; |
740
|
|
|
|
|
|
|
} |
741
|
66
|
|
|
|
|
224
|
return $ret; |
742
|
|
|
|
|
|
|
} |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
# This uses a heuristic (that is, a crappy method) to generate a new |
745
|
|
|
|
|
|
|
# end marker for the trie. In addition to being sure that whatever is |
746
|
|
|
|
|
|
|
# generated is not in use as a letter in the trie, it also makes a bold |
747
|
|
|
|
|
|
|
# yet mostly vain attempt to try to make something that might not be |
748
|
|
|
|
|
|
|
# used in the future. |
749
|
|
|
|
|
|
|
# In general, I do not try to make this functionality good or fast or |
750
|
|
|
|
|
|
|
# perfect -- if it's being called often, the module is being mis-used. |
751
|
|
|
|
|
|
|
# If a user is using multi-character letters, then they ought to find |
752
|
|
|
|
|
|
|
# a string that will be safe and set it themselves. |
753
|
|
|
|
|
|
|
sub _gen_new_marker { |
754
|
4
|
|
|
4
|
|
6
|
my $self = shift; |
755
|
4
|
|
|
|
|
10
|
my %args = @_; |
756
|
|
|
|
|
|
|
# This will keep track of all of the letters used in the trie already |
757
|
4
|
|
|
|
|
8
|
my %used = (); |
758
|
|
|
|
|
|
|
# This will keep track of what lengths they are |
759
|
4
|
|
|
|
|
7
|
my %sizes = (); |
760
|
|
|
|
|
|
|
# First we process the letters of the word which sparked this |
761
|
|
|
|
|
|
|
# re-evaluation. |
762
|
4
|
|
|
|
|
75
|
for my $letter (@{$args{bad}}) { |
|
4
|
|
|
|
|
11
|
|
763
|
11
|
|
|
|
|
14
|
my $len = length($letter); |
764
|
11
|
100
|
|
|
|
24
|
if ($len != 1) { |
765
|
8
|
|
|
|
|
18
|
$used{$letter}++; |
766
|
8
|
|
|
|
|
18
|
$sizes{$len}++; |
767
|
|
|
|
|
|
|
} |
768
|
|
|
|
|
|
|
} |
769
|
|
|
|
|
|
|
# Then we walk the tree and get the info on all the other letters. |
770
|
4
|
|
|
|
|
10
|
my @refs = ($self->{_MAINHASHREF}); |
771
|
4
|
|
|
|
|
11
|
while (@refs) { |
772
|
5
|
|
|
|
|
130
|
my $ref = shift @refs; |
773
|
5
|
|
|
|
|
6
|
for my $key (keys %{$ref}) { |
|
5
|
|
|
|
|
14
|
|
774
|
|
|
|
|
|
|
# Note we don't even care about length 1 letters. |
775
|
7
|
100
|
66
|
|
|
37
|
if ( |
776
|
|
|
|
|
|
|
(length($key) != 1) && |
777
|
|
|
|
|
|
|
($key ne $self->{_END}) |
778
|
|
|
|
|
|
|
) { |
779
|
1
|
|
|
|
|
2
|
$used{$key}++; |
780
|
1
|
|
|
|
|
3
|
$sizes{length($key)}++; |
781
|
1
|
|
|
|
|
2
|
push(@refs, $ref->{$key}); |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
} |
785
|
|
|
|
|
|
|
# The idea here is that we want to make the end marker as small as possible, |
786
|
|
|
|
|
|
|
# as it's stuck all over the place. However, we don't want to spend forever |
787
|
|
|
|
|
|
|
# trying to find one that isn't in use. |
788
|
|
|
|
|
|
|
# So, we find the smallest length such that there are fewer than 1/4 of |
789
|
|
|
|
|
|
|
# the total number of possible letters in use of that length, and we make |
790
|
|
|
|
|
|
|
# a key of that length. |
791
|
4
|
|
|
|
|
7
|
my $newlen = 2; |
792
|
4
|
|
|
|
|
12
|
for my $len (sort keys %sizes) { |
793
|
|
|
|
|
|
|
# Yes, I know there are well more than 26 available compositors, but |
794
|
|
|
|
|
|
|
# this will only mean I'm being too careful. |
795
|
6
|
100
|
|
|
|
27
|
if ($sizes{$len} < ((26 ** $len) / 4)) { |
796
|
3
|
|
|
|
|
6
|
$newlen = $len; |
797
|
3
|
|
|
|
|
4
|
last; |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
else { |
800
|
|
|
|
|
|
|
# This makes it so that if all existing lengths are too full ( !! ) |
801
|
|
|
|
|
|
|
# then we will just use a key that's one longer than the longest |
802
|
|
|
|
|
|
|
# one already there. |
803
|
3
|
|
|
|
|
6
|
$newlen = $len + 1; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
} |
806
|
|
|
|
|
|
|
# Now we just generate end markers until we find one that isn't in use. |
807
|
4
|
|
|
|
|
6
|
my $newend; |
808
|
4
|
|
|
|
|
6
|
do { |
809
|
4
|
|
|
|
|
9
|
$newend = join '', map { chr(int(rand(128))) } (('') x $newlen); |
|
7
|
|
|
|
|
98
|
|
810
|
|
|
|
|
|
|
} while (exists($used{$newend})); |
811
|
|
|
|
|
|
|
# And return it. |
812
|
4
|
|
|
|
|
21
|
return $newend; |
813
|
|
|
|
|
|
|
} |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
# Strewth! |
816
|
|
|
|
|
|
|
1; |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
__END__ |