line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Tree::BPTree; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# $Id: BPTree.pm,v 1.4 2003/09/15 19:50:39 sterling Exp $ |
4
|
|
|
|
|
|
|
|
5
|
12
|
|
|
12
|
|
472882
|
use 5.008; |
|
12
|
|
|
|
|
48
|
|
|
12
|
|
|
|
|
522
|
|
6
|
12
|
|
|
12
|
|
93
|
use strict; |
|
12
|
|
|
|
|
29
|
|
|
12
|
|
|
|
|
457
|
|
7
|
12
|
|
|
12
|
|
67
|
use warnings; |
|
12
|
|
|
|
|
30
|
|
|
12
|
|
|
|
|
366
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# all the math is for indexing |
10
|
12
|
|
|
12
|
|
14112
|
use integer; |
|
12
|
|
|
|
|
233
|
|
|
12
|
|
|
|
|
88
|
|
11
|
|
|
|
|
|
|
|
12
|
12
|
|
|
12
|
|
526
|
use Carp; |
|
12
|
|
|
|
|
21
|
|
|
12
|
|
|
|
|
1814
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '1.08'; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Tree::BPTree - Perl implementation of B+ trees |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Tree::BPTree; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# These arguments are actually the defaults |
25
|
|
|
|
|
|
|
my $tree = new Tree::BPTree( |
26
|
|
|
|
|
|
|
-n => 3, |
27
|
|
|
|
|
|
|
-unique => 0, |
28
|
|
|
|
|
|
|
-keycmp => sub { $_[0] cmp $_[1] }, |
29
|
|
|
|
|
|
|
-valuecmp => sub { $_[0] <=> $_[1] }, |
30
|
|
|
|
|
|
|
); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# index the entries in this string: |
33
|
|
|
|
|
|
|
my $string = "THERE'S MORE THAN ONE WAY TO DO IT"; # TMTOWTDI |
34
|
|
|
|
|
|
|
my $i = 0; |
35
|
|
|
|
|
|
|
$tree->insert($_, $i++) foreach (split //, $string); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# find the index of the first 'T' |
38
|
|
|
|
|
|
|
my $t = $tree->find('T'); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# find the indexes of every 'T' |
41
|
|
|
|
|
|
|
my @t = $tree->find('T'); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# We don't like the word 'WAY ', so let's remove it |
44
|
|
|
|
|
|
|
my $i = index $string, 'W'; |
45
|
|
|
|
|
|
|
$tree->delete($_, $i++) foreach (split //, substr($string, $i, 4)); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Reverse the sort order |
48
|
|
|
|
|
|
|
$tree->reverse; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# Iterate through each key/value pair just like built-in each operator |
51
|
|
|
|
|
|
|
while (my ($key, $value) = $tree->each) { |
52
|
|
|
|
|
|
|
print "$key => $value\n"; |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Reset the iterator when we quit from an "each-loop" early |
56
|
|
|
|
|
|
|
$tree->reset; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# You might also be interested in using multiple each loops at once, which is |
59
|
|
|
|
|
|
|
# possible through the cursor syntax. You can even delete individual pairs |
60
|
|
|
|
|
|
|
# from the list during iteration. |
61
|
|
|
|
|
|
|
my $cursor = $tree->new_cursor; |
62
|
|
|
|
|
|
|
while (my ($key, $value) = $cursor->each) { |
63
|
|
|
|
|
|
|
my $nested = $tree->new_cursor; |
64
|
|
|
|
|
|
|
while (my ($nkey, $nvalue) = $nested->each) { |
65
|
|
|
|
|
|
|
if ($key->shouldnt_be_in_this_tree_with($nkey)) { |
66
|
|
|
|
|
|
|
$nested->delete; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Iterate using an iterator subroutine |
72
|
|
|
|
|
|
|
$tree->iterate(sub { print "$_[0] => $_[1]\n" }); |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# Iterate using an iterator subroutine that returns the list of return values |
75
|
|
|
|
|
|
|
# returned by the iterator |
76
|
|
|
|
|
|
|
print join(', ', $tree->map(sub { "$_[0] => $_[1]" })),"\n"; |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# Grep-like operations |
79
|
|
|
|
|
|
|
my @pairs = $tree->grep (sub { $_[0] =~ /\S/ }); |
80
|
|
|
|
|
|
|
my @keys = $tree->grep_keys (sub { $_[0] =~ /\S/ }); |
81
|
|
|
|
|
|
|
my @values = $tree->grep_values (sub { $_[0] =~ /\S/ }); |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# Get all keys, values |
84
|
|
|
|
|
|
|
my @all_keys = $tree->keys; |
85
|
|
|
|
|
|
|
my @all_values = $tree->values; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Clear it out and start over |
88
|
|
|
|
|
|
|
$tree->clear; |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=head1 DESCRIPTION |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
B+ trees are balanced trees which provide an ordered map from keys to values. |
93
|
|
|
|
|
|
|
They are useful for indexing large bodies of data. They are similar to 2-3-4 |
94
|
|
|
|
|
|
|
Trees and Red-Black Trees. This implementation supports B+ trees using an |
95
|
|
|
|
|
|
|
arbitrary I value. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head2 STRUCTURE |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Each node in a B+ tree contains I pointers and I keys. The pointers |
100
|
|
|
|
|
|
|
in the node are placed between the ordered keys so that there is one pointer on |
101
|
|
|
|
|
|
|
either end and one pointer in between each value. Searching for a key involves |
102
|
|
|
|
|
|
|
checking to see which keys in the node the key falls between and then following |
103
|
|
|
|
|
|
|
the corresponding pointers down the tree. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
The pointers in the branches of thre tree always point to nodes deeper in the |
106
|
|
|
|
|
|
|
tree. The leaves use all pointers but the last to point to buckets containing |
107
|
|
|
|
|
|
|
values. The last pointer in each leaf forms a singly-linked list called the |
108
|
|
|
|
|
|
|
linked leaf list. Iterating through this list gives us an ordered traversal of |
109
|
|
|
|
|
|
|
all keys and/or values in the tree. |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Finally, all non-root branch nodes must contain at least I pointers. If it |
112
|
|
|
|
|
|
|
becomes necessary to add values to a node which already contains I pointers, |
113
|
|
|
|
|
|
|
then the node will be split in half first (possibly requiring the split of |
114
|
|
|
|
|
|
|
parents). If deletion of a node leaves a branch with fewer than I pointers, |
115
|
|
|
|
|
|
|
the node will either be coalesced (joined to) a neigboring node or it will take |
116
|
|
|
|
|
|
|
on a pointer from a neighbor node. Coalescing can also result in the further |
117
|
|
|
|
|
|
|
rebalancing of the tree in parents using more coalesce or redistribute |
118
|
|
|
|
|
|
|
operations. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Here's a diagram of a valid B+ tree when n = 3 that stores my last name, |
121
|
|
|
|
|
|
|
"HANENKAMP": |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
------- |
124
|
|
|
|
|
|
|
/ \ |
125
|
|
|
|
|
|
|
/ \ |
126
|
|
|
|
|
|
|
---- |
127
|
|
|
|
|
|
|
/ \ / \ |
128
|
|
|
|
|
|
|
/ \ / | |
129
|
|
|
|
|
|
|
> > > > |
130
|
|
|
|
|
|
|
/ \ | / \ / \ |
131
|
|
|
|
|
|
|
/ \ | | | | \ |
132
|
|
|
|
|
|
|
[1,6] [3] [0] [5][7] [2,4] [8] |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
Anyway, you don't need to know any of that to use this implementation. The |
135
|
|
|
|
|
|
|
abstraction layer set on top makes it look something like a typical hash. |
136
|
|
|
|
|
|
|
Insertion and deletion both require a specific key and value since multiple |
137
|
|
|
|
|
|
|
values can be mapped to each key--unless the "-unique" flag has been set. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
By default, the tree assumes that it is being used to map strings to indexes. I |
140
|
|
|
|
|
|
|
chose to set this default because this is the most common use I will put it to. |
141
|
|
|
|
|
|
|
That is, I have lists of strings that I want to index, so the keys will be the |
142
|
|
|
|
|
|
|
strings to index and the values will be indexes into the list. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
If you need to store something different, all you need to do is store a |
145
|
|
|
|
|
|
|
reference to the objects (keys or values) and set the "-keycmp" and "-valuecmp" |
146
|
|
|
|
|
|
|
options to appropriate values during initialization. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=head2 PERFORMANCE |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
At some point, I want to post the best, average, and worst-case operation speed |
151
|
|
|
|
|
|
|
for this implementation of B+ trees, but for now we'll just have to live without |
152
|
|
|
|
|
|
|
those stats. For raw benchmarks, you should see the L section as the |
153
|
|
|
|
|
|
|
actual performance of this module is pretty slow. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head2 IMPLEMENTATION |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
As a quick note on implementation, if you want to know how specific operations |
158
|
|
|
|
|
|
|
work, please browse the source. I have included extensive comments within the |
159
|
|
|
|
|
|
|
definitions of the methods themselves explaining most of the important steps. I |
160
|
|
|
|
|
|
|
did this for my own sanity because B+ trees can be quite complicated. |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
This code has been optimized a bit, but I haven't nearly made as many |
163
|
|
|
|
|
|
|
optimizations as are likely possible. I'm open to any suggestions. If you have |
164
|
|
|
|
|
|
|
some, send me email at the address given below. |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 METHOD REFERENCE |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=over |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=cut |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
package Tree::BPTree::Node; |
173
|
|
|
|
|
|
|
|
174
|
12
|
|
|
12
|
|
67
|
use integer; |
|
12
|
|
|
|
|
21
|
|
|
12
|
|
|
|
|
51
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub new { |
177
|
8329
|
|
|
8329
|
|
37146
|
my ($class, @data) = @_; |
178
|
8329
|
100
|
|
|
|
23544
|
@data = ( undef ) unless @data; |
179
|
8329
|
|
33
|
|
|
86496
|
return bless \@data, ref $class || $class; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# sub key { |
183
|
|
|
|
|
|
|
# my ($self, $k, $new) = @_; |
184
|
|
|
|
|
|
|
# $$self[$k * 2 + 1] = $new if defined $new; |
185
|
|
|
|
|
|
|
# return $$self[$k * 2 + 1]; |
186
|
|
|
|
|
|
|
# } |
187
|
|
|
|
|
|
|
# |
188
|
|
|
|
|
|
|
# sub value { |
189
|
|
|
|
|
|
|
# my ($self, $v, $new) = @_; |
190
|
|
|
|
|
|
|
# $$self[$v * 2] = $new if defined $new; |
191
|
|
|
|
|
|
|
# return $$self[$v * 2]; |
192
|
|
|
|
|
|
|
# } |
193
|
|
|
|
|
|
|
# |
194
|
|
|
|
|
|
|
# sub last_key { |
195
|
|
|
|
|
|
|
# my ($self, $new) = @_; |
196
|
|
|
|
|
|
|
# $$self[-2] = $new if defined $new; |
197
|
|
|
|
|
|
|
# return $$self[-2]; |
198
|
|
|
|
|
|
|
# } |
199
|
|
|
|
|
|
|
# |
200
|
|
|
|
|
|
|
# sub last_value { |
201
|
|
|
|
|
|
|
# my ($self, $new) = @_; |
202
|
|
|
|
|
|
|
# $$self[-1] = $new if defined $new; |
203
|
|
|
|
|
|
|
# return $$self[-1]; |
204
|
|
|
|
|
|
|
# } |
205
|
|
|
|
|
|
|
# |
206
|
|
|
|
|
|
|
sub first_leaf { |
207
|
21459
|
|
|
21459
|
|
34421
|
my ($self) = @_; |
208
|
21459
|
|
|
|
|
39434
|
my $current = $self; |
209
|
21459
|
|
|
|
|
170710
|
until ($current->isa('Tree::BPTree::Leaf')) { |
210
|
20124
|
|
|
|
|
137171
|
$current = $$current[0]; |
211
|
|
|
|
|
|
|
} |
212
|
21459
|
|
|
|
|
110563
|
return $current; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
sub last_leaf { |
216
|
536
|
|
|
536
|
|
849
|
my ($self) = @_; |
217
|
536
|
|
|
|
|
555
|
my $current = $self; |
218
|
536
|
|
|
|
|
1762
|
until ($current->isa('Tree::BPTree::Leaf')) { |
219
|
110
|
|
|
|
|
397
|
$current = $$current[-1]; |
220
|
|
|
|
|
|
|
} |
221
|
536
|
|
|
|
|
954
|
return $current; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
# |
224
|
|
|
|
|
|
|
# sub nkeys { |
225
|
|
|
|
|
|
|
# my ($self) = @_; |
226
|
|
|
|
|
|
|
# return (scalar(@$self) - 1) / 2; |
227
|
|
|
|
|
|
|
# } |
228
|
|
|
|
|
|
|
# |
229
|
|
|
|
|
|
|
# sub nvalues { |
230
|
|
|
|
|
|
|
# my ($self) = @_; |
231
|
|
|
|
|
|
|
# return (scalar(@$self) + 1) / 2; |
232
|
|
|
|
|
|
|
# } |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
# The find operation differs slightly between branch and leaf. See the comment |
235
|
|
|
|
|
|
|
# near Tree::BPTree::Leaf::find for details. |
236
|
|
|
|
|
|
|
sub find { |
237
|
147499
|
|
|
147499
|
|
291744
|
my ($self, $cmp, $key) = @_; |
238
|
147499
|
|
|
|
|
260714
|
my $nkeys = (@$self - 1) / 2; |
239
|
147499
|
|
|
|
|
411784
|
for (my $k = 0; $k < $nkeys; $k++) { |
240
|
263731
|
100
|
|
|
|
640939
|
if (&$cmp($key, $self->[($k) * 2 + 1]) < 0) { |
241
|
93749
|
|
|
|
|
279206
|
return $k; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} |
244
|
53750
|
|
|
|
|
161956
|
return (@$self + 1) / 2 - 1; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
sub insert { |
248
|
37980
|
|
|
37980
|
|
70190
|
my ($self, $v, $key, $value) = @_; |
249
|
37980
|
|
|
|
|
4657606
|
splice @$self, $v * 2, 0, $value, $key; |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
sub split { |
253
|
26988
|
|
|
26988
|
|
50552
|
my ($self, $n, $cmp, $key) = @_; |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# find the node we're going to insert to; split that node; if it splits |
256
|
|
|
|
|
|
|
# either incorporate the split in ourselves or split ourselves if we are |
257
|
|
|
|
|
|
|
# full |
258
|
26988
|
|
|
|
|
68110
|
my $v = $self->find($cmp, $key); |
259
|
26988
|
|
|
|
|
77335
|
my $result = $self->[($v) * 2]->split($n, $cmp, $key); |
260
|
26988
|
100
|
100
|
|
|
123107
|
if ((@$self + 1) / 2 == $n && defined $result) { |
|
|
100
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# We're full and they split, we must split too. The way the split must |
262
|
|
|
|
|
|
|
# be handled will depend upon whether this is a Left, Center, or Right |
263
|
|
|
|
|
|
|
# split. That is, is the sub-split node pointer on the left side, the |
264
|
|
|
|
|
|
|
# middle, or the right. But first, let's go ahead and split the node in |
265
|
|
|
|
|
|
|
# half. |
266
|
|
|
|
|
|
|
# |
267
|
|
|
|
|
|
|
# The way a node can be split depends on the oddness of n. If n is odd |
268
|
|
|
|
|
|
|
# (normal looking node split), then we split at index n-1 and give the |
269
|
|
|
|
|
|
|
# new node n elements. If n is even, we split at index n and give the |
270
|
|
|
|
|
|
|
# new node n-1 elements. The combinatorics of this solution are kind of |
271
|
|
|
|
|
|
|
# interesting. In any case, we create the new node complete while |
272
|
|
|
|
|
|
|
# leaving the current node with a missing end-pointer. |
273
|
660
|
|
|
|
|
2525
|
my $new_node = Tree::BPTree::Node->new( |
274
|
|
|
|
|
|
|
splice @$self, |
275
|
|
|
|
|
|
|
$n - ($n % 2), # n - 1 for odd or n - 0 for even |
276
|
|
|
|
|
|
|
$n - (($n + 1) % 2), # n - 0 for odd or n - 1 for even |
277
|
|
|
|
|
|
|
); |
278
|
|
|
|
|
|
|
|
279
|
660
|
|
|
|
|
2761
|
my $root_key; |
280
|
660
|
100
|
|
|
|
1667
|
if ($v < $n / 2) { |
|
|
100
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# This is a left split. We need to clip off the last key, insert the |
282
|
|
|
|
|
|
|
# child's new root key and set the pointers on either side to the |
283
|
|
|
|
|
|
|
# new root nodes. Finally, return a new root with clipped key |
284
|
|
|
|
|
|
|
# pointing to us and the new node. |
285
|
324
|
|
|
|
|
581
|
$root_key = pop @$self; |
286
|
324
|
|
|
|
|
1746
|
my $i = $self->find($cmp, $result->[1]); |
287
|
324
|
|
|
|
|
858
|
$self->insert($i, $result->[1], $result->[0]); |
288
|
324
|
|
|
|
|
684
|
$self->[($i+1) * 2] = $result->[2]; |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
} elsif ($v > $n / 2) { |
291
|
|
|
|
|
|
|
# This is a right split. Same as left in reverse, basically. We do |
292
|
|
|
|
|
|
|
# need to first shear of the first pointer to the new node and |
293
|
|
|
|
|
|
|
# append it back onto as the last pointer of the first node first. |
294
|
216
|
|
|
|
|
391
|
push @$self, shift @$new_node; |
295
|
216
|
|
|
|
|
350
|
$root_key = shift @$new_node; |
296
|
216
|
|
|
|
|
552
|
my $i = $new_node->find($cmp, $result->[1]); |
297
|
216
|
|
|
|
|
1343
|
$new_node->[($i) * 2] = $result->[2]; |
298
|
216
|
|
|
|
|
536
|
$new_node->insert($i, $result->[1], $result->[0]); |
299
|
|
|
|
|
|
|
} else { |
300
|
|
|
|
|
|
|
# This is a center split. Here, we append to ourself a new pointer |
301
|
|
|
|
|
|
|
# pointing to the new left node. We set the new node's first pointer |
302
|
|
|
|
|
|
|
# to the new right node. And we set the new root key to the child's |
303
|
|
|
|
|
|
|
# new root key. |
304
|
120
|
|
|
|
|
225
|
push @$self, $result->[0]; |
305
|
120
|
|
|
|
|
205
|
$new_node->[0] = $result->[2]; |
306
|
120
|
|
|
|
|
216
|
$root_key = $result->[1]; |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
|
309
|
660
|
|
|
|
|
1652
|
return Tree::BPTree::Node->new($self, $root_key, $new_node); |
310
|
|
|
|
|
|
|
} elsif (defined $result) { |
311
|
|
|
|
|
|
|
# We have room to accomodate their split, add the new nodes here. |
312
|
|
|
|
|
|
|
# Regular insert will do this in the wrong order. |
313
|
|
|
|
|
|
|
# $self->insert($v, $$result[-1]->first_leaf->[1], $$result[-1]); |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
# The new node will always be the last node, so we need to insert the |
316
|
|
|
|
|
|
|
# key/pointer in reverse order from normal such that the key happens at |
317
|
|
|
|
|
|
|
# $i and the value is at $i + 1 |
318
|
2496
|
|
|
|
|
6079
|
my $i = $self->find($cmp, $key); |
319
|
2496
|
|
|
|
|
8081
|
splice @$self, $i * 2 + 1, 0, $$result[-1]->first_leaf->[1], $$result[-1]; |
320
|
2496
|
|
|
|
|
8374
|
return undef; |
321
|
|
|
|
|
|
|
} else { |
322
|
|
|
|
|
|
|
# They didn't split, so we don't have to either |
323
|
23832
|
|
|
|
|
45930
|
return undef; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
sub delete { |
328
|
6234
|
|
|
6234
|
|
17642
|
my ($self, $n, $cmp, $key) = @_; |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
# Go to the bottom and drop the key from the leaf node |
331
|
6234
|
|
|
|
|
22650
|
my $v = $self->find($cmp, $key); |
332
|
6234
|
|
|
|
|
27025
|
my $result = $self->[($v) * 2]->delete($n, $cmp, $key); |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# On our way back up, make the tree consistent; i.e., no empty leaves and no |
335
|
|
|
|
|
|
|
# non-root nodes with less than n/2 values. If a key is deleted, but doesn't |
336
|
|
|
|
|
|
|
# cause a coalesce or redistribute, we may keep that key in a branch node as |
337
|
|
|
|
|
|
|
# a sort key, this shouldn't hurt us. |
338
|
6234
|
100
|
|
|
|
39185
|
if ($self->[($v) * 2]->isa('Tree::BPTree::Leaf')) { |
339
|
|
|
|
|
|
|
# Since this is a leaf, we only care if the leaf becomes empty. If it |
340
|
|
|
|
|
|
|
# does, we remove the pointer to it from the current node and pass |
341
|
|
|
|
|
|
|
# control upwards. |
342
|
5226
|
100
|
|
|
|
22275
|
if ($result == 1) { |
343
|
|
|
|
|
|
|
# The leaf is too small, so we need to delete it from our list. This |
344
|
|
|
|
|
|
|
# may result in rebalancing further up the tree. |
345
|
|
|
|
|
|
|
# |
346
|
|
|
|
|
|
|
# NOTE: This operation will leave orphaned nodes in the linked leaf |
347
|
|
|
|
|
|
|
# list. It is too hard to remove the orphans here. Instead, orphans |
348
|
|
|
|
|
|
|
# should be removed by the iterators. |
349
|
536
|
100
|
|
|
|
1925
|
if ($v == 0) { |
350
|
|
|
|
|
|
|
# This node is the first index, so we delete it and the next key |
351
|
316
|
|
|
|
|
875
|
splice @$self, 0, 2; |
352
|
|
|
|
|
|
|
} else { |
353
|
|
|
|
|
|
|
# This node is not first, so we delete it and the preceding key |
354
|
220
|
|
|
|
|
1226
|
splice @$self, $v * 2 - 1, 2; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
} # else no rebalancing will take place here on up |
357
|
|
|
|
|
|
|
} else { |
358
|
|
|
|
|
|
|
# As a branch, the child node must not have fewer than n/2 children. If |
359
|
|
|
|
|
|
|
# it does, we need to try to coalesce it with a neighbor or redistribute |
360
|
|
|
|
|
|
|
# the children from a neighbor to the small node. |
361
|
1008
|
100
|
|
|
|
2883
|
if ($result <= $n / 2) { |
362
|
|
|
|
|
|
|
# The branch is too small, we'll try to coalesce first |
363
|
192
|
100
|
100
|
|
|
888
|
if ($v > 0 && ((@{$self->[($v - 1) * 2]} + 1) / 2) + ((@{$self->[($v ) * 2]} + 1) / 2) <= $n) { |
|
67
|
100
|
100
|
|
|
175
|
|
|
67
|
|
|
|
|
436
|
|
|
140
|
|
|
|
|
393
|
|
364
|
|
|
|
|
|
|
# We can coalesce the small node with it's left neighbor |
365
|
28
|
|
|
|
|
107
|
$self->[($v-1) * 2]->coalesce($self->[($v) * 2]); |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# The removed node (the small node) is not first, so we delete |
368
|
|
|
|
|
|
|
# it and the preceding key |
369
|
28
|
|
|
|
|
66
|
splice @$self, $v * 2 - 1, 2; |
370
|
140
|
|
|
|
|
635
|
} elsif ($v < (((@$self + 1) / 2) - 1) && ((@{$self->[($v ) * 2]} + 1) / 2) + ((@{$self->[($v + 1) * 2]} + 1) / 2) <= $n) { |
371
|
|
|
|
|
|
|
# We can coalesce the small node with it's right neighbor |
372
|
82
|
|
|
|
|
328
|
$self->[($v) * 2]->coalesce($self->[($v+1) * 2]); |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# The removed node (the right neighbor) is not first, so we |
375
|
|
|
|
|
|
|
# delete it and the preceding key |
376
|
82
|
|
|
|
|
234
|
splice @$self, ($v + 1) * 2 - 1, 2; |
377
|
|
|
|
|
|
|
} else { |
378
|
|
|
|
|
|
|
# We must redistribute, we pull the node from the left neighbor, |
379
|
|
|
|
|
|
|
# if there is a left neighbor; otherwise, we'll pull the node |
380
|
|
|
|
|
|
|
# from the right. |
381
|
82
|
100
|
|
|
|
179
|
if ($v > 0) { |
382
|
34
|
|
|
|
|
147
|
$self->[($v-1) * 2]->redistribute($self->[($v) * 2]); |
383
|
|
|
|
|
|
|
} else { |
384
|
48
|
|
|
|
|
249
|
$self->[($v) * 2]->redistribute($self->[($v+1) * 2]); |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# Furthermore, we need to reset the key affected in this node to |
388
|
|
|
|
|
|
|
# make sure that we don't lose sort order in the branches. (That |
389
|
|
|
|
|
|
|
# is, we might have just moved a lower key right making this key |
390
|
|
|
|
|
|
|
# too high or a higher key left making this key too low. |
391
|
|
|
|
|
|
|
# |
392
|
|
|
|
|
|
|
# We always use the latter pointer which is normally $v+1 or $v |
393
|
|
|
|
|
|
|
# if it is already the last pointer. |
394
|
82
|
100
|
|
|
|
214
|
if ($v > 0) { |
395
|
34
|
|
|
|
|
88
|
$self->[($v - 1) * 2 + 1] = $self->[$v * 2]->first_leaf->[1]; |
396
|
|
|
|
|
|
|
} else { |
397
|
48
|
|
|
|
|
152
|
$self->[($v) * 2 + 1] = $self->[($v + 1) * 2]->first_leaf->[1]; |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# Return the number of values remaining |
404
|
6234
|
|
|
|
|
20727
|
return (@$self + 1) / 2; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
sub coalesce { |
408
|
110
|
|
|
110
|
|
181
|
my ($self, $that) = @_; |
409
|
110
|
|
|
|
|
354
|
push @$self, $$that[0]->first_leaf->[1], @$that; |
410
|
110
|
|
|
|
|
247
|
return $self; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
sub redistribute { |
414
|
82
|
|
|
82
|
|
126
|
my ($self, $that) = @_; |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
# Who's stealing nodes from whom? When deciding on the new index key to |
417
|
|
|
|
|
|
|
# insert, we choose to use the first key of that, in either case, as it will |
418
|
|
|
|
|
|
|
# always be higher than the last key of self. (The first key in that is |
419
|
|
|
|
|
|
|
# always the key associated with the value being redistributed.) |
420
|
82
|
100
|
|
|
|
282
|
if ((@$that + 1) / 2 < (@$self + 1) / 2) { |
421
|
|
|
|
|
|
|
# Redistribute values from left to right |
422
|
34
|
|
|
|
|
119
|
my @middle = splice @$self, -2, 2; |
423
|
34
|
|
|
|
|
126
|
unshift @$that, $middle[-1], $$that[0]->first_leaf->[1]; |
424
|
|
|
|
|
|
|
} else { |
425
|
|
|
|
|
|
|
# Redistribute values from right to left |
426
|
48
|
|
|
|
|
180
|
my @middle = splice @$that, 0, 2; |
427
|
48
|
|
|
|
|
194
|
push @$self, $middle[0]->first_leaf->[1], $middle[0]; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
sub reverse { |
432
|
230
|
|
|
230
|
|
316
|
my ($self) = @_; |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
# Reverses the children, reverses the internal list, and then connects the |
435
|
|
|
|
|
|
|
# linked-list pointer of the last_leaf of each subnode to the |
436
|
|
|
|
|
|
|
# first_leaf of the following subnode. Finally, we need to change the |
437
|
|
|
|
|
|
|
# index key. |
438
|
230
|
|
|
|
|
1131
|
@$self = reverse @$self; |
439
|
230
|
|
|
|
|
466
|
my $nvalues = (@$self + 1) / 2; |
440
|
230
|
|
|
|
|
665
|
for (my $v = 0; $v < $nvalues; ++$v) { |
441
|
766
|
|
|
|
|
1814
|
$self->[($v) * 2]->reverse; |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
|
444
|
230
|
|
|
|
|
377
|
my $nkeys = (@$self - 1) / 2; |
445
|
230
|
|
|
|
|
584
|
for (my $k = 0; $k < $nkeys; ++$k) { |
446
|
|
|
|
|
|
|
# Set the last pointer in the first node's last leaf to the first leaf |
447
|
536
|
|
|
|
|
1322
|
$self->[($k) * 2 ]->last_leaf->[-1] = $self->[($k + 1) * 2]->first_leaf; |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# Set the current key to the second node's first leaf's key |
450
|
536
|
|
|
|
|
1188
|
$self->[($k) * 2 + 1] = $self->[($k + 1) * 2]->first_leaf->[1]; |
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
package Tree::BPTree::Leaf; |
455
|
|
|
|
|
|
|
|
456
|
12
|
|
|
12
|
|
34725
|
use integer; |
|
12
|
|
|
|
|
35
|
|
|
12
|
|
|
|
|
69
|
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
our @ISA = qw(Tree::BPTree::Node); |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# Ordering in leaves is slightly different because we want to store the buckets |
461
|
|
|
|
|
|
|
# for the node in the same pointer as the node when keys are equal. In branches, |
462
|
|
|
|
|
|
|
# we want to find the value by the pointer *after* the node if the keys are |
463
|
|
|
|
|
|
|
# equal. |
464
|
|
|
|
|
|
|
sub find { |
465
|
132624
|
|
|
132624
|
|
247852
|
my ($self, $cmp, $key) = @_; |
466
|
132624
|
|
|
|
|
241721
|
my $nkeys = (@$self - 1) / 2; |
467
|
132624
|
|
|
|
|
377277
|
for (my $k = 0; $k < $nkeys; $k++) { |
468
|
1127612
|
100
|
|
|
|
2544324
|
if (&$cmp($key, $self->[($k) * 2 + 1]) <= 0) { |
469
|
120180
|
|
|
|
|
1378162
|
return $k; |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
} |
472
|
12444
|
|
|
|
|
43253
|
return (@$self + 1) / 2 - 1; |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
sub split { |
476
|
37440
|
|
|
37440
|
|
67408
|
my ($self, $n) = @_; |
477
|
|
|
|
|
|
|
|
478
|
37440
|
100
|
|
|
|
124230
|
if ((@$self + 1) / 2 == $n) { |
479
|
|
|
|
|
|
|
# We're big enough, we must split in anticipation of an insert. See the |
480
|
|
|
|
|
|
|
# comments in Tree::BPTree::split if you want to know more about why |
481
|
|
|
|
|
|
|
# choosing where and how many nodes to splice looks so weird. |
482
|
3216
|
|
|
|
|
22267
|
my $new_node = Tree::BPTree::Leaf->new( |
483
|
|
|
|
|
|
|
splice @$self, |
484
|
|
|
|
|
|
|
$n - ($n % 2), # n - 1 for odd or n - 0 for even |
485
|
|
|
|
|
|
|
$n - (($n + 1) % 2), # n - 0 for odd or n - 1 for even |
486
|
|
|
|
|
|
|
); |
487
|
3216
|
|
|
|
|
21712
|
push @$self, $new_node; |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
# return new root, which is used or tossed depending on the needs of the |
490
|
|
|
|
|
|
|
# caller |
491
|
3216
|
|
|
|
|
9701
|
return Tree::BPTree::Node->new($self, $$new_node[1], $new_node); |
492
|
|
|
|
|
|
|
} else { |
493
|
|
|
|
|
|
|
# We're not too big, so we can take at least one more value |
494
|
34224
|
|
|
|
|
84464
|
return undef; |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub delete { |
499
|
6240
|
|
|
6240
|
|
12629
|
my ($self, $n, $cmp, $key) = @_; |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
# Find the node and delete it (we assume this node exists if we've been |
502
|
|
|
|
|
|
|
# called!) |
503
|
6240
|
|
|
|
|
26467
|
my $i = $self->find($cmp, $key); |
504
|
6240
|
|
|
|
|
22649
|
splice @$self, $i * 2, 2; |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# Return the number of values remaining |
507
|
6240
|
|
|
|
|
22234
|
return (@$self + 1) / 2; |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
sub reverse { |
511
|
632
|
|
|
632
|
|
797
|
my ($self) = @_; |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
# For leaves, we must before the reverse, then copy the value pointers |
514
|
|
|
|
|
|
|
# backwards one position. We even reverse the buckets to create a completely |
515
|
|
|
|
|
|
|
# symmetric reversal. |
516
|
632
|
|
|
|
|
6067
|
@$self = reverse @$self; |
517
|
632
|
|
|
|
|
1595
|
my $nvalues = (@$self + 1) / 2 - 1; |
518
|
632
|
|
|
|
|
1457
|
for (my $v = 0; $v < $nvalues; ++$v) { |
519
|
6240
|
|
|
|
|
6064
|
$self->[($v) * 2] = [ reverse @{ $self->[($v+1)*2] } ]; |
|
6240
|
|
|
|
|
29646
|
|
520
|
|
|
|
|
|
|
} |
521
|
632
|
|
|
|
|
2032
|
$$self[-1] = undef; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
package Tree::BPTree; |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=item $tree = Tree::BPTree->new(%args) |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
The constructor builds a new tree using the given arguments. All arguments are |
529
|
|
|
|
|
|
|
optional and have defaults that should suit many applications. The arguments |
530
|
|
|
|
|
|
|
include: |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=over |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=item -n |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
This sets the maximum number of pointers permitted in each node. Setting this |
537
|
|
|
|
|
|
|
number very high will cause search operations to slow down as it will spend a |
538
|
|
|
|
|
|
|
lot of time searching arrays incrementally--something like a binary search could |
539
|
|
|
|
|
|
|
be used to speed these times a bit, but no such method is used at this time. |
540
|
|
|
|
|
|
|
Setting this number very low will cause insert and delete operations to slow |
541
|
|
|
|
|
|
|
down as they are required to split and coalesce more often. The default is the |
542
|
|
|
|
|
|
|
minimum value of 3. |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=item -unique |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
This determines whether keys are unique or not. If this is set, then an |
547
|
|
|
|
|
|
|
exception will be raised whenever an insert is attempted for a key that already |
548
|
|
|
|
|
|
|
exists in the tree. |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
=item -keycmp |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
This is a comparator function that takes two arguments and returns -1, 0, or 1 |
553
|
|
|
|
|
|
|
to indicate the result of the comparison. If the first argument is less than the |
554
|
|
|
|
|
|
|
second, then -1 is returned. If the first argument is greater than the second, |
555
|
|
|
|
|
|
|
then 1 is returned. If the arguments are equal, then 0 is returned. This |
556
|
|
|
|
|
|
|
comparator should be appropriate for comparing keys. By default, the built-in |
557
|
|
|
|
|
|
|
string comparator C is used. See L for details on C. |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=item -valuecmp |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
This is a comparator function that takes two arguments and returns -1, 0, or 1 |
562
|
|
|
|
|
|
|
to indicate the result of the comparison--just like the "-keycmp" argument. This |
563
|
|
|
|
|
|
|
comparator should be appropriate for comparing values. By default, the built-in |
564
|
|
|
|
|
|
|
numeric comparator C=E> is used. See L for details on |
565
|
|
|
|
|
|
|
C=E>. |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=back |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
The tree created by this constructor is always initially empty. |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=cut |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
sub new { |
574
|
577
|
|
|
577
|
1
|
1890060
|
my ($class, %args) = @_; |
575
|
|
|
|
|
|
|
|
576
|
577
|
100
|
|
|
|
3123
|
$args{-n} = 3 unless defined $args{-n}; |
577
|
577
|
50
|
|
1474347
|
|
4547
|
$args{-keycmp} = sub { $_[0] cmp $_[1] } unless defined $args{-keycmp}; |
|
1474347
|
|
|
|
|
6330297
|
|
578
|
577
|
50
|
|
11232
|
|
3962
|
$args{-valuecmp} = sub { $_[0] <=> $_[1] } unless defined $args{-valuecmp}; |
|
11232
|
|
|
|
|
50733
|
|
579
|
577
|
50
|
|
|
|
3144
|
$args{-unique} = 0 unless defined $args{-unique}; |
580
|
577
|
|
|
|
|
3036
|
$args{-root} = Tree::BPTree::Leaf->new; |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# This cursor is special as it doesn't have a link back to self. It will not |
583
|
|
|
|
|
|
|
# be released to the user to call methods on directly anyway. Having the |
584
|
|
|
|
|
|
|
# link back to self would cause a memory leak. |
585
|
577
|
|
|
|
|
2702
|
$args{-cursor} = bless {}, 'Tree::BPTree::Cursor'; |
586
|
|
|
|
|
|
|
|
587
|
577
|
50
|
|
|
|
2173
|
croak "Illegal value for n $args{-n}. It must be greater than or equal to 3." |
588
|
|
|
|
|
|
|
if $args{-n} < 3; |
589
|
|
|
|
|
|
|
|
590
|
577
|
|
33
|
|
|
4043
|
return bless \%args, ref $class || $class; |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
sub _find_leaf { |
594
|
126384
|
|
|
126384
|
|
253771
|
my ($self, $key) = @_; |
595
|
|
|
|
|
|
|
|
596
|
126384
|
|
|
|
|
264634
|
my $cmp = $$self{-keycmp}; |
597
|
126384
|
|
|
|
|
240593
|
my $current = $$self{-root}; |
598
|
126384
|
|
66
|
|
|
1222656
|
while (defined $current and not $current->isa('Tree::BPTree::Leaf')) { |
599
|
111241
|
|
|
|
|
325593
|
my $v = $current->find($cmp, $key); |
600
|
111241
|
|
|
|
|
1019244
|
$current = $current->[$v * 2]; |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
|
603
|
126384
|
|
|
|
|
361829
|
return $current; |
604
|
|
|
|
|
|
|
} |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=item $value = $tree->find($key) |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
=item @values = $tree->find($key) |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
This method attempts to find the value or values in the bucket matching C<$key>. |
611
|
|
|
|
|
|
|
If no such C<$key> has been stored in the tree, then C is returned. If |
612
|
|
|
|
|
|
|
the C<$key> is found, then either the first value stored in the bucket is |
613
|
|
|
|
|
|
|
returned (in scalar context) or all values stored are returned (in list |
614
|
|
|
|
|
|
|
context). Using scalar context is useful when the tree stores unique keys where |
615
|
|
|
|
|
|
|
there will never be more than one value per key. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
=cut |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
sub find { |
620
|
20928
|
|
|
20928
|
1
|
67489195
|
my ($self, $key) = @_; |
621
|
|
|
|
|
|
|
|
622
|
20928
|
|
|
|
|
53614
|
my $cmp = $$self{-keycmp}; |
623
|
20928
|
|
|
|
|
50320
|
my $leaf = $self->_find_leaf($key); |
624
|
20928
|
|
|
|
|
60268
|
my $v = $leaf->find($cmp, $key); |
625
|
20928
|
50
|
|
|
|
56747
|
if (&$cmp($leaf->[($v) * 2 + 1], $key) == 0) { |
626
|
20928
|
100
|
|
|
|
47131
|
return wantarray ? @{ $leaf->[($v) * 2] } : ${ $leaf->[($v) * 2] }[0]; |
|
10464
|
|
|
|
|
59614
|
|
|
10464
|
|
|
|
|
46982
|
|
627
|
|
|
|
|
|
|
} else { |
628
|
0
|
|
|
|
|
0
|
return undef; |
629
|
|
|
|
|
|
|
} |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
=item $tree->insert($key, $value) |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
This method inserts the key/value pair given into the tree. If the tree requires |
635
|
|
|
|
|
|
|
unique keys, an exception will be thrown if C<$key> is already stored. |
636
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
=cut |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
sub insert { |
640
|
62784
|
|
|
62784
|
1
|
298189
|
my ($self, $key, $value) = @_; |
641
|
62784
|
|
|
|
|
189353
|
my $n = $$self{-n}; |
642
|
62784
|
|
|
|
|
125441
|
my $cmp = $$self{-keycmp}; |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
# In the case of insert, we have three steps: |
645
|
|
|
|
|
|
|
# 1. See if the key already exists. If so, add the value to the bucket |
646
|
|
|
|
|
|
|
# there (or die if keys are unique). Otherwise, go to step 2. |
647
|
|
|
|
|
|
|
# 2. Tell the tree to split if it is full along the path to where the new |
648
|
|
|
|
|
|
|
# key will be placed. |
649
|
|
|
|
|
|
|
# 3. Find the leaf and insert the key/value pair there. |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# First, see if the value is already there |
652
|
62784
|
|
|
|
|
133242
|
my $leaf = $self->_find_leaf($key); |
653
|
62784
|
|
|
|
|
190283
|
my $k = $leaf->find($cmp, $key); |
654
|
62784
|
100
|
100
|
|
|
276880
|
if (defined $leaf->[($k) * 2 + 1] && &$cmp($leaf->[($k) * 2 + 1], $key) == 0) { |
655
|
25344
|
50
|
|
|
|
75979
|
croak "Unique key violation." if $$self{-unique}; |
656
|
25344
|
|
|
|
|
30057
|
push @{ $leaf->[($k) * 2] }, $value; |
|
25344
|
|
|
|
|
4476742
|
|
657
|
25344
|
|
|
|
|
74755
|
return; |
658
|
|
|
|
|
|
|
} |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
# Then, tell the tree to split straight down if it will need to |
661
|
37440
|
|
|
|
|
123622
|
my $new_root = $$self{-root}->split($n, $cmp, $key); |
662
|
37440
|
100
|
|
|
|
113036
|
$$self{-root} = $new_root if defined $new_root; |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
# Next, insert the new value (we need a new leaf in case a split occurred) |
665
|
37440
|
|
|
|
|
113776
|
$leaf = $self->_find_leaf($key); |
666
|
37440
|
|
|
|
|
95364
|
$leaf->insert($leaf->find($cmp, $key), $key, [ $value ]); |
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=item $tree->delete($key, $value) |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
This method removes the key/value pair given from the tree. If the pair cannot |
672
|
|
|
|
|
|
|
be found, then the tree is not changed. If C<$value> is stored multiple times at |
673
|
|
|
|
|
|
|
C<$key>, then all values matching C<$value> will be removed. |
674
|
|
|
|
|
|
|
=cut |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
sub delete { |
677
|
5232
|
|
|
5232
|
1
|
9814629
|
my ($self, $key, $value) = @_; |
678
|
5232
|
|
|
|
|
23119
|
my $cmp = $$self{-keycmp}; |
679
|
5232
|
|
|
|
|
15321
|
my $valcmp = $$self{-valuecmp}; |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# In the case of delete, we have two steps: |
682
|
|
|
|
|
|
|
# 1. Find the leaf containing the key. |
683
|
|
|
|
|
|
|
# a. If no matching key is found in the leaf where it should be, quit. |
684
|
|
|
|
|
|
|
# b. If the bucket for the key found contains multiple values, remove |
685
|
|
|
|
|
|
|
# one and quit. |
686
|
|
|
|
|
|
|
# c. Otherwise, continue to step 2. |
687
|
|
|
|
|
|
|
# 2. Starting at the top, tell the tree to delete the node. |
688
|
|
|
|
|
|
|
# a. The tree will then prune off any leaves that become empty. |
689
|
|
|
|
|
|
|
# b. The tree will prune of branches that aren't needed. This may |
690
|
|
|
|
|
|
|
# result in branches with less than n/2 nodes, so we will need to |
691
|
|
|
|
|
|
|
# rebalance the tree. |
692
|
|
|
|
|
|
|
# c. The tree will perform rebalancing on it's way back up from the |
693
|
|
|
|
|
|
|
# leaf. It will attempt to coalesce where needed and possible and |
694
|
|
|
|
|
|
|
# redistribute if needed and coalesce won't work. |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# First, find the leaf containing the key |
697
|
5232
|
|
|
|
|
20114
|
my $leaf = $self->_find_leaf($key); |
698
|
5232
|
|
|
|
|
28728
|
my $i = $leaf->find($cmp, $key); |
699
|
5232
|
50
|
33
|
|
|
64563
|
if (defined $leaf->[($i) * 2 + 1] && &$cmp($leaf->[($i) * 2 + 1], $key) == 0) { |
700
|
5232
|
100
|
|
|
|
7309
|
if (scalar(@{ $leaf->[($i) * 2] }) > 1) { |
|
5232
|
50
|
|
|
|
61201
|
|
|
3120
|
|
|
|
|
8483
|
|
701
|
2112
|
|
|
|
|
9674
|
my $bucket = $leaf->[($i) * 2]; |
702
|
2112
|
|
|
|
|
13000
|
@$bucket = grep { &$valcmp($value, $_) != 0 } @$bucket; |
|
8112
|
|
|
|
|
20026
|
|
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
# If the bucket has more elements, we quit here. Otherwise, we need |
705
|
|
|
|
|
|
|
# to remove the node. |
706
|
2112
|
50
|
|
|
|
12791
|
return if @$bucket > 0; |
707
|
3120
|
|
|
|
|
8326
|
} elsif (!grep { &$valcmp($value, $_) == 0 } @{ $leaf->[($i) * 2] }) { |
708
|
|
|
|
|
|
|
# no match for value, let's quit |
709
|
0
|
|
|
|
|
0
|
return; |
710
|
|
|
|
|
|
|
} |
711
|
|
|
|
|
|
|
} else { |
712
|
|
|
|
|
|
|
# no match for key, let's quit |
713
|
0
|
|
|
|
|
0
|
return; |
714
|
|
|
|
|
|
|
} |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
# Then, since we're still here, we know there is a key/value match that |
717
|
|
|
|
|
|
|
# we intend to remove. Since this removal will empty a bucket, we need to |
718
|
|
|
|
|
|
|
# bring out the big guns. Tell the tree to take care of it and it will take |
719
|
|
|
|
|
|
|
# care of coalescing and redistributing nodes. |
720
|
3120
|
|
|
|
|
21737
|
my $values = $$self{-root}->delete($$self{-n}, $cmp, $key); |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
# if the tree contains only a single value and is a branch, then the tree is |
723
|
|
|
|
|
|
|
# one level shallower than before the delete |
724
|
3120
|
100
|
100
|
|
|
47090
|
$$self{-root} = $$self{-root}->[0] |
725
|
|
|
|
|
|
|
if not $$self{-root}->isa('Tree::BPTree::Leaf') and $values == 1; |
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=item $tree->reverse |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
Reverse the sort order. This is done by reversing every key in the tree, |
731
|
|
|
|
|
|
|
adjusting the linked leaf list, and replacing the "-keycmp" method with a new |
732
|
|
|
|
|
|
|
one that simply negates the old one. If this method is called again, the same |
733
|
|
|
|
|
|
|
node reversal will happen, but the original "-keycmp" will be reinstated rather |
734
|
|
|
|
|
|
|
than doing a double negation. |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
=cut |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
sub reverse { |
739
|
96
|
|
|
96
|
1
|
1311
|
my ($self) = @_; |
740
|
96
|
|
|
|
|
452
|
$$self{-root}->reverse; |
741
|
96
|
50
|
|
|
|
444
|
if (defined $$self{-reverse_keycmp}) { |
742
|
0
|
|
|
|
|
0
|
$$self{-keycmp} = delete $$self{-reverse_keycmp}; |
743
|
|
|
|
|
|
|
} else { |
744
|
96
|
|
|
|
|
461
|
$$self{-reverse_keycmp} = $$self{-keycmp}; |
745
|
96
|
|
|
|
|
206
|
my $cmp = $$self{-keycmp}; |
746
|
96
|
|
|
147458
|
|
874
|
$$self{-keycmp} = sub { -( &$cmp(@_) ) }; |
|
147458
|
|
|
|
|
280781
|
|
747
|
|
|
|
|
|
|
} |
748
|
|
|
|
|
|
|
} |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=item $cursor = $tree->new_cursor |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
This method allows you to have multiple, simultaneous iterators through the |
753
|
|
|
|
|
|
|
same index. If you pass the C<$cursor> value returned from C to |
754
|
|
|
|
|
|
|
C, it will be used instead of the default internal cursor. That is, |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
my $c1 = $tree->new_cursor; |
757
|
|
|
|
|
|
|
my $c2 = $tree->new_cursor; |
758
|
|
|
|
|
|
|
while (my ($key, $values) = $tree->each($c1)) { |
759
|
|
|
|
|
|
|
# let's go through $c1 twice as fast |
760
|
|
|
|
|
|
|
my ($nextkey, $nextvalue) = $tree->each($c1); |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# next is an alias for each |
763
|
|
|
|
|
|
|
my ($otherkey, $othervalue) = $tree->next($c2); |
764
|
|
|
|
|
|
|
} |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
# and we can reset $c2 after we're done too |
767
|
|
|
|
|
|
|
$tree->reset($c2); |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
Cursors also have their own methods, so this same snippet could have been |
770
|
|
|
|
|
|
|
written like this instead: |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
my $c1 = $tree->new_cursor; |
773
|
|
|
|
|
|
|
my $c2 = $tree->new_cursor; |
774
|
|
|
|
|
|
|
while (my ($key, $value) = $c1->each) { |
775
|
|
|
|
|
|
|
# let's go through $c1 twice as fast |
776
|
|
|
|
|
|
|
my ($nextkey, $nextvalue) = $c1->each; |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# next is an alias for each |
779
|
|
|
|
|
|
|
my ($otherkey, $othervalue) = $c2->each; |
780
|
|
|
|
|
|
|
} |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
# and we can reset $c2 after we're done too |
783
|
|
|
|
|
|
|
$c2->reset; |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
There are additional features provided with cursors that are not provided when |
786
|
|
|
|
|
|
|
using the internal cursor. You may delete the last key/values pair returned by a |
787
|
|
|
|
|
|
|
call to C/C by calling C on the cursor. Or, you may specify |
788
|
|
|
|
|
|
|
a specific value in the bucket to be deleted. For example: |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
my $cursor = $tree->new_cursor; |
791
|
|
|
|
|
|
|
while (my ($key, $value) = $cursor->next) { |
792
|
|
|
|
|
|
|
# In this example, the keys are objects with a is_bad method. If "bad" is |
793
|
|
|
|
|
|
|
# set, we want to remove the corresponding values. |
794
|
|
|
|
|
|
|
if ($key->is_bad) { |
795
|
|
|
|
|
|
|
$cursor->delete; |
796
|
|
|
|
|
|
|
} |
797
|
|
|
|
|
|
|
} |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
This form of delete is completely safe and will not cause the iterator to slip |
800
|
|
|
|
|
|
|
off track as a similar operation might mess up array iteration if one isn't |
801
|
|
|
|
|
|
|
careful. |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
Another feature of cursors, is that you may retrieve the previously returned |
804
|
|
|
|
|
|
|
value by calling the C method. This will return the same result as the |
805
|
|
|
|
|
|
|
last call to C or C. That is, unless C has been called or |
806
|
|
|
|
|
|
|
C removed the previously returned key, then this will return an empty |
807
|
|
|
|
|
|
|
list. |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
For example: |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
# This assumes you use the typical string keys with numeric values |
812
|
|
|
|
|
|
|
$cursor = $tree->new_cursor; |
813
|
|
|
|
|
|
|
while (my ($key, $value) = $cursor->next) { |
814
|
|
|
|
|
|
|
my ($currkey, $currval) = $cursor->current; |
815
|
|
|
|
|
|
|
die unless $key eq $currkey and $value == $currval |
816
|
|
|
|
|
|
|
} |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
This example shouldn't die. |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
=cut |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
package Tree::BPTree::Cursor; |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
# These keep the real work in Tree::BPTree |
825
|
|
|
|
|
|
|
sub each { |
826
|
15744
|
|
|
15744
|
|
26702
|
my ($self) = @_; |
827
|
15744
|
|
|
|
|
66141
|
$$self{-tree}->each($self); |
828
|
|
|
|
|
|
|
} |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
sub next { |
831
|
5280
|
|
|
5280
|
|
64237867
|
my ($self) = @_; |
832
|
5280
|
|
|
|
|
39185
|
$$self{-tree}->each($self); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
sub current { |
836
|
5232
|
|
|
5232
|
|
26127
|
my ($self) = @_; |
837
|
5232
|
50
|
|
|
|
17221
|
return () unless defined $$self{-last}; |
838
|
|
|
|
|
|
|
return ( |
839
|
5232
|
|
|
|
|
57940
|
$$self{-last}{-node}->[($$self{-last}{-index}) + 1], |
840
|
|
|
|
|
|
|
$$self{-last}{-node}->[($$self{-last}{-index})][($$self{-last}{-value})], |
841
|
|
|
|
|
|
|
); |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
sub reset { |
846
|
5232
|
|
|
5232
|
|
16136840
|
my ($self) = @_; |
847
|
5232
|
|
|
|
|
22072
|
$$self{-tree}->reset($self); |
848
|
|
|
|
|
|
|
} |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
sub delete { |
851
|
5232
|
|
|
5232
|
|
32354
|
my ($self) = @_; |
852
|
|
|
|
|
|
|
|
853
|
5232
|
50
|
|
|
|
26994
|
Carp::croak "No node to delete. This has occurred because a delete was attempted before iteration started or delete was attempted twice on the same node." |
854
|
|
|
|
|
|
|
unless defined $$self{-last}; |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
# We must be careful as removing the node might throw off $$self{-index} if |
857
|
|
|
|
|
|
|
# $$self{-node} == $$self{-last}{-node}. In the case that we remove the node |
858
|
|
|
|
|
|
|
# altogether and $$self{-node} == $$self{-last}{-node}, we must decrement |
859
|
|
|
|
|
|
|
# $$self{-index} by 2 to keep it from skipping a node or falling off the end |
860
|
|
|
|
|
|
|
# of the node. |
861
|
5232
|
|
|
|
|
68064
|
my $cmp = $$self{-tree}{-keycmp}; |
862
|
5232
|
|
|
|
|
27948
|
my $valcmp = $$self{-tree}{-valuecmp}; |
863
|
|
|
|
|
|
|
|
864
|
5232
|
|
|
|
|
13339
|
my $leaf = $$self{-last}{-node}; |
865
|
5232
|
|
|
|
|
13196
|
my $i = $$self{-last}{-index}; |
866
|
5232
|
|
|
|
|
12276
|
my $value = $$self{-last}{-value}; |
867
|
5232
|
100
|
|
|
|
6428
|
if (@{ $leaf->[$i] } > 1) { |
|
5232
|
|
|
|
|
19873
|
|
868
|
|
|
|
|
|
|
# The bucket contains more than one value. Drop the current index, keep |
869
|
|
|
|
|
|
|
# us from calling delete again and quit. |
870
|
2112
|
|
|
|
|
3769
|
my $bucket = $leaf->[$i]; |
871
|
2112
|
|
|
|
|
5551
|
splice @$bucket, $value, 1; |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
# If this node and the last node are equivalent, we need to decrement |
874
|
|
|
|
|
|
|
# the current value to keep us from skipping nodes are falling of the |
875
|
|
|
|
|
|
|
# end of the bucket |
876
|
2112
|
50
|
33
|
|
|
18901
|
--$$self{-value} if defined $$self{-node} and $$self{-last}{-node} == $$self{-node}; |
877
|
|
|
|
|
|
|
|
878
|
2112
|
|
|
|
|
9230
|
delete $$self{-last}; |
879
|
2112
|
|
|
|
|
6979
|
return; |
880
|
|
|
|
|
|
|
} # Otherwise, this value is the last in the node and we drop it entirely |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
# We're still here, so the $value is the only remaining value |
883
|
3120
|
|
|
|
|
23717
|
my $values = $$self{-tree}{-root}->delete($$self{-tree}{-n}, $cmp, $leaf->[$i + 1]); |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
# if the tree contains only a single value and is a branch, then the tree is |
886
|
|
|
|
|
|
|
# one level shallower than before the delete |
887
|
3120
|
100
|
100
|
|
|
49250
|
$$self{-tree}{-root} = $$self{-tree}{-root}->[0] |
888
|
|
|
|
|
|
|
if not $$self{-tree}{-root}->isa('Tree::BPTree::Leaf') and $values == 1; |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
# If this node and the last node are equivalent, we need to decrement the |
891
|
|
|
|
|
|
|
# current index to keep the cursor going in the correct place. |
892
|
3120
|
100
|
100
|
|
|
30897
|
$$self{-index} -= 2 if defined $$self{-node} and $$self{-last}{-node} == $$self{-node}; |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
# We can't delete again since we've just annihilated the key |
895
|
3120
|
|
|
|
|
15452
|
delete $$self{-last}; |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
package Tree::BPTree; |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
sub new_cursor { |
901
|
144
|
|
|
144
|
1
|
3029
|
my ($self) = @_; |
902
|
144
|
|
|
|
|
870
|
return bless { -tree => $self }, 'Tree::BPTree::Cursor'; |
903
|
|
|
|
|
|
|
} |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=item ($key, $value) = $tree->each [ ($cursor) ] |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
This method provides a similar facility as that of the C operator. Each |
908
|
|
|
|
|
|
|
call will iterate through each key/value pair in sort order. After the last |
909
|
|
|
|
|
|
|
key/value pair has been returned, C will be returned once before starting |
910
|
|
|
|
|
|
|
again. This is useful for using within C loops: |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
while (my ($key, $value) = $tree->each) { |
913
|
|
|
|
|
|
|
# do stuff |
914
|
|
|
|
|
|
|
} |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
=cut |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
sub each { |
919
|
797185
|
|
|
797185
|
1
|
14165106
|
my ($self, $cursor) = @_; |
920
|
797185
|
100
|
|
|
|
2679932
|
$cursor = $$self{-cursor} unless defined $cursor; |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
# This method operates on a cursor in three states: |
923
|
|
|
|
|
|
|
# 1. Fresh. $$cursor{-index} is undefined to show that we are in a fresh |
924
|
|
|
|
|
|
|
# state and should return the very first index. |
925
|
|
|
|
|
|
|
# 2. Iterating. $$cursor{-index} and $$cursor{-node} are defined to show |
926
|
|
|
|
|
|
|
# that we are somewhere in the middle of the list. |
927
|
|
|
|
|
|
|
# 3. Dead. $$cursor{-node} is undefined to show that we have reached the |
928
|
|
|
|
|
|
|
# last node. At this point () should be returned and then |
929
|
|
|
|
|
|
|
# $$cursor{-index} deleted to return us to Fresh state. |
930
|
|
|
|
|
|
|
# |
931
|
|
|
|
|
|
|
# It is possible to move directly from Fresh to Dead in one call by checking |
932
|
|
|
|
|
|
|
# the size of $$cursor{-node}. If $$cursor{-node}->nvalues == 1, then the |
933
|
|
|
|
|
|
|
# very first node is empty, so we immediately return that we are Dead and |
934
|
|
|
|
|
|
|
# return to a Fresh state. |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
# If the cursor is empty, then they haven't ran each yet (or the last run |
937
|
|
|
|
|
|
|
# has concluded). Set a new iteration run up. |
938
|
797185
|
100
|
|
|
|
2422408
|
unless (defined $$cursor{-index}) { |
939
|
17617
|
|
|
|
|
75172
|
$$cursor{-node} = $$self{-root}->first_leaf; |
940
|
17617
|
|
|
|
|
50637
|
$$cursor{-index} = 0; |
941
|
17617
|
|
|
|
|
81169
|
$$cursor{-value} = 0; |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
|
944
|
797185
|
100
|
100
|
|
|
2786347
|
if (defined $$cursor{-node} and @{$$cursor{-node}} > 1) { |
|
784897
|
|
|
|
|
3448033
|
|
945
|
|
|
|
|
|
|
# The last run didn't detect the end of the list, so give them the next |
946
|
|
|
|
|
|
|
# value |
947
|
784800
|
|
|
|
|
4520682
|
my @next = ( |
948
|
|
|
|
|
|
|
$$cursor{-node}->[($$cursor{-index}) + 1], |
949
|
|
|
|
|
|
|
$$cursor{-node}->[($$cursor{-index})][($$cursor{-value})], |
950
|
|
|
|
|
|
|
); |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
# Remember this position, in case we want to delete it |
953
|
784800
|
|
|
|
|
2245492
|
$$cursor{-last}{-node} = $$cursor{-node}; |
954
|
784800
|
|
|
|
|
1965016
|
$$cursor{-last}{-index} = $$cursor{-index}; |
955
|
784800
|
|
|
|
|
1938440
|
$$cursor{-last}{-value} = $$cursor{-value}; |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
# Increment the value point first |
958
|
784800
|
100
|
|
|
|
1401404
|
if ($$cursor{-value} == $#{$$cursor{-node}[$$cursor{-index}]}) { |
|
784800
|
|
|
|
|
2846227
|
|
959
|
|
|
|
|
|
|
# In this case, we're at the end, so we need to increment in the |
960
|
|
|
|
|
|
|
# index and return this to the first value of the next bucket |
961
|
492576
|
|
|
|
|
919154
|
$$cursor{-value} = 0; |
962
|
|
|
|
|
|
|
|
963
|
492576
|
100
|
|
|
|
956184
|
if ($$cursor{-index} + 2 == $#{$$cursor{-node}}) { |
|
492576
|
|
|
|
|
9653863
|
|
964
|
|
|
|
|
|
|
# We've reached the end of a node, move to the next |
965
|
61545
|
|
|
|
|
189137
|
my $next_node = $$cursor{-node}->[$$cursor{-index} + 2]; |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
# Check for orphaned nodes and remove them |
968
|
61545
|
|
100
|
|
|
356736
|
while (defined $next_node and @$next_node == 1) { |
969
|
257
|
|
|
|
|
927
|
$next_node = $next_node->[0]; |
970
|
|
|
|
|
|
|
} |
971
|
61545
|
|
|
|
|
176170
|
$$cursor{-node}->[$$cursor{-index} + 2] = $next_node; |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
# Move to the next node |
974
|
61545
|
|
|
|
|
111339
|
$$cursor{-node} = $next_node; |
975
|
61545
|
|
|
|
|
148333
|
$$cursor{-index} = 0; |
976
|
|
|
|
|
|
|
} else { |
977
|
|
|
|
|
|
|
# We've still got more key/value pairs to read in this node |
978
|
431031
|
|
|
|
|
803663
|
$$cursor{-index} += 2; |
979
|
|
|
|
|
|
|
} |
980
|
|
|
|
|
|
|
|
981
|
492576
|
|
|
|
|
2650250
|
return @next; |
982
|
|
|
|
|
|
|
} else { |
983
|
|
|
|
|
|
|
# We've still got more values, so we need to get ready for the next |
984
|
292224
|
|
|
|
|
525109
|
++$$cursor{-value}; |
985
|
292224
|
|
|
|
|
9876549
|
return @next; |
986
|
|
|
|
|
|
|
} |
987
|
|
|
|
|
|
|
} else { |
988
|
|
|
|
|
|
|
# The last run reached the end of the list, so delete the -index element |
989
|
|
|
|
|
|
|
# so we can start anew and return undef once, just like the each |
990
|
|
|
|
|
|
|
# operator. |
991
|
12385
|
|
|
|
|
36970
|
delete $$cursor{-index}; |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# Also clear the last pointers so we can't call delete on the cursor |
994
|
|
|
|
|
|
|
# until we've called each at least once. |
995
|
12385
|
|
|
|
|
39831
|
delete $$cursor{-last}; |
996
|
|
|
|
|
|
|
|
997
|
12385
|
|
|
|
|
54177
|
return (); |
998
|
|
|
|
|
|
|
} |
999
|
|
|
|
|
|
|
} |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
=item $tree->reset [ ($cursor) ] |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
Reset the given cursor to a fresh state--that is, ready to return the first |
1004
|
|
|
|
|
|
|
value on the next call to C. If no C<$cursor> is given, then the default |
1005
|
|
|
|
|
|
|
internal cursor is reset. |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=cut |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
sub reset { |
1010
|
5232
|
|
|
5232
|
1
|
8571
|
my ($self, $cursor) = @_; |
1011
|
5232
|
50
|
|
|
|
19828
|
$cursor = $$self{-cursor} unless defined $cursor; |
1012
|
5232
|
|
|
|
|
47645
|
delete $$cursor{-index}; |
1013
|
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
=item $tree->iterate(\&iter) |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
For each key/value pair in the database, the function C<&iter> will be called |
1018
|
|
|
|
|
|
|
with the key as the first argument and value as the second. Iteration will occur |
1019
|
|
|
|
|
|
|
in sort order. |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
=cut |
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
sub iterate { |
1024
|
48
|
|
|
48
|
1
|
1109
|
my ($self, $iter) = @_; |
1025
|
|
|
|
|
|
|
|
1026
|
48
|
|
|
|
|
205
|
while (my ($k, $v) = $self->each) { |
1027
|
5232
|
|
|
|
|
13472
|
&$iter($k, $v); |
1028
|
|
|
|
|
|
|
} |
1029
|
|
|
|
|
|
|
} |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
=item @results = $tree->map(\&mapper) |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
Nearly identical to C, this method captures the return values of each |
1034
|
|
|
|
|
|
|
call and then returns all the results as a list. The C<&mapper> function takes |
1035
|
|
|
|
|
|
|
the same arguments as in C. |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
=cut |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
sub map { |
1040
|
48
|
|
|
48
|
1
|
999
|
my ($self, $mapper) = @_; |
1041
|
|
|
|
|
|
|
|
1042
|
48
|
|
|
|
|
104
|
my @result; |
1043
|
48
|
|
|
|
|
212
|
while (my ($k, $v) = $self->each) { |
1044
|
5232
|
|
|
|
|
11571
|
push @result, &$mapper($k, $v); |
1045
|
|
|
|
|
|
|
} |
1046
|
|
|
|
|
|
|
|
1047
|
48
|
|
|
|
|
2388
|
return @result; |
1048
|
|
|
|
|
|
|
} |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
=item @pairs = $tree->grep(\&pred) |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
=item @keys = $tree->grep_keys(\&pred) |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
=item @values = $tree->grep_values(\&pred) |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
Iterates through all key/value pairs in sort order. For each key/value pair, the |
1057
|
|
|
|
|
|
|
function C<&pred> will be called by passing the key as the first argument and |
1058
|
|
|
|
|
|
|
the value as the second. If C<&pred> returns a true value, then the matched |
1059
|
|
|
|
|
|
|
value will be added to the returned list. |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
C returns a list of pairs such that each element is a two-element array |
1062
|
|
|
|
|
|
|
reference where the first element is they key and the second is the value. |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
C returns a list of keys. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
C returns a list of values. |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=cut |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
sub grep { |
1071
|
480
|
|
|
480
|
1
|
5212386
|
my ($self, $pred) = @_; |
1072
|
|
|
|
|
|
|
|
1073
|
480
|
|
|
|
|
1537
|
my @result; |
1074
|
480
|
|
|
|
|
1685
|
while (my ($k, $v) = $self->each) { |
1075
|
52320
|
100
|
|
|
|
137516
|
push @result, [ $k, $v ] if &$pred($k, $v); |
1076
|
|
|
|
|
|
|
} |
1077
|
|
|
|
|
|
|
|
1078
|
480
|
|
|
|
|
9276
|
return @result; |
1079
|
|
|
|
|
|
|
} |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
sub grep_keys { |
1082
|
480
|
|
|
480
|
1
|
1311263
|
my ($self, $pred) = @_; |
1083
|
|
|
|
|
|
|
|
1084
|
480
|
|
|
|
|
769
|
my @result; |
1085
|
480
|
|
|
|
|
1554
|
while (my ($k, $v) = $self->each) { |
1086
|
52320
|
100
|
|
|
|
151283
|
push @result, $k if &$pred($k, $v); |
1087
|
|
|
|
|
|
|
} |
1088
|
|
|
|
|
|
|
|
1089
|
480
|
|
|
|
|
4677
|
return @result; |
1090
|
|
|
|
|
|
|
} |
1091
|
|
|
|
|
|
|
|
1092
|
|
|
|
|
|
|
sub grep_values { |
1093
|
480
|
|
|
480
|
1
|
806689
|
my ($self, $pred) = @_; |
1094
|
|
|
|
|
|
|
|
1095
|
480
|
|
|
|
|
858
|
my @result; |
1096
|
480
|
|
|
|
|
1533
|
while (my ($k, $v) = $self->each) { |
1097
|
52320
|
100
|
|
|
|
132491
|
push @result, $v if &$pred($k, $v); |
1098
|
|
|
|
|
|
|
} |
1099
|
|
|
|
|
|
|
|
1100
|
480
|
|
|
|
|
3933
|
return @result; |
1101
|
|
|
|
|
|
|
} |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=item @pairs = $tree->pairs |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
=item @keys = $tree->keys |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
=item @values = $tree->values |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
Returns all elements of the given type. |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
C returns all key/value pairs stored in the tree. Each pair is returned |
1112
|
|
|
|
|
|
|
as an array reference contain two elements. The first element is the key. The |
1113
|
|
|
|
|
|
|
second element is a bucket, which is an array-reference of stored values. |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
C returns all keys stored in the tree. |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
C returns all values stored in the tree. |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
=cut |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
sub pairs { |
1122
|
5280
|
|
|
5280
|
1
|
35434
|
my ($self) = @_; |
1123
|
|
|
|
|
|
|
|
1124
|
5280
|
|
|
|
|
15984
|
my @pairs; |
1125
|
5280
|
|
|
|
|
21207
|
while (my ($k, $v) = $self->each) { |
1126
|
287760
|
|
|
|
|
1201051
|
push @pairs, [ $k, $v ]; |
1127
|
|
|
|
|
|
|
} |
1128
|
|
|
|
|
|
|
|
1129
|
5280
|
|
|
|
|
2308474
|
return @pairs; |
1130
|
|
|
|
|
|
|
} |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
sub keys { |
1133
|
48
|
|
|
48
|
1
|
118
|
my ($self) = @_; |
1134
|
|
|
|
|
|
|
|
1135
|
48
|
|
|
|
|
86
|
my @keys; |
1136
|
48
|
|
|
|
|
229
|
while (my ($k, $v) = $self->each) { |
1137
|
5232
|
|
|
|
|
16601
|
push @keys, $k; |
1138
|
|
|
|
|
|
|
} |
1139
|
|
|
|
|
|
|
|
1140
|
48
|
|
|
|
|
2750
|
return @keys; |
1141
|
|
|
|
|
|
|
} |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
sub values { |
1144
|
5376
|
|
|
5376
|
1
|
24636
|
my ($self) = @_; |
1145
|
|
|
|
|
|
|
|
1146
|
5376
|
|
|
|
|
39777
|
my @values; |
1147
|
5376
|
|
|
|
|
17160
|
while (my ($k, $v) = $self->each) { |
1148
|
298224
|
|
|
|
|
1142670
|
push @values, $v; |
1149
|
|
|
|
|
|
|
} |
1150
|
|
|
|
|
|
|
|
1151
|
5376
|
|
|
|
|
169301
|
return @values; |
1152
|
|
|
|
|
|
|
} |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
=item $tree->clear |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
This method empties the tree of all values. This basically creates a new tree |
1157
|
|
|
|
|
|
|
and allows the old tree to be garbage collected at the interpreter's leisure. |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
=cut |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
sub clear { |
1162
|
0
|
|
|
0
|
1
|
|
my ($self) = @_; |
1163
|
0
|
|
|
|
|
|
$$self{-root} = Tree::BPTree::Leaf->new; |
1164
|
|
|
|
|
|
|
} |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
=back |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=head1 CREDITS |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
The basis for B+ trees implemented here can be found in I
|
1171
|
|
|
|
|
|
|
Concepts>, 4th ed. by Silbershatz et al. published by McGraw-Hill. I have |
1172
|
|
|
|
|
|
|
somewhat modified the structure specified there to make the code easier to read |
1173
|
|
|
|
|
|
|
and to adapt the code to Perl. |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
In addition, while preparing to write this module I also consulted an old book |
1176
|
|
|
|
|
|
|
of mine, I by Robert Sedgewick (Addison Wesley), for more |
1177
|
|
|
|
|
|
|
general information on trees. I also used some ideas on how and when to perform |
1178
|
|
|
|
|
|
|
split, coalesce, and redistribute as the Silbershatz pseudo-code is a little |
1179
|
|
|
|
|
|
|
obfuscated--or at least, the different operations are presented monolithically |
1180
|
|
|
|
|
|
|
so that it's difficult to digest. The sections in Sedgewick on 2-3-4 and |
1181
|
|
|
|
|
|
|
Red-Black trees were especially helpful. |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
=head1 BUGS |
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
This module is pretty slow. Better performance is possible, especially for small |
1186
|
|
|
|
|
|
|
bodies of data, if you use a hash to do most of these operations. See |
1187
|
|
|
|
|
|
|
F for a sample of the performance issues. There you can also find |
1188
|
|
|
|
|
|
|
code for performing essentially the same thing using different data structures. |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
On my machine, a small benchmark showed the following: |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
Insert into B+ Trees (this implementation) is: |
1193
|
|
|
|
|
|
|
61 times slower than hash insert and |
1194
|
|
|
|
|
|
|
3.9 times slower than ordered list insert. |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
Ordered iteration of B+ Trees is: |
1197
|
|
|
|
|
|
|
1.6 times slower than ordering a hash and then iterating the pairs and |
1198
|
|
|
|
|
|
|
14 times slower than iterating through an ordered list. |
1199
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
Finding a key in B+ Trees is: |
1201
|
|
|
|
|
|
|
34 times slower than hash fetch but |
1202
|
|
|
|
|
|
|
1.2 times faster than searching an ordered list (with grep, which probably |
1203
|
|
|
|
|
|
|
isn't the fastest solution, a manual binary search should be better). |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
I'm still putting together more benchmarks and looking into places where |
1206
|
|
|
|
|
|
|
improvement is possible. Iteration of this structure should scale better than |
1207
|
|
|
|
|
|
|
taking a hash and ordering the keys to iterate through. |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
I have made some recent headway by removing some simple functions and replacing |
1210
|
|
|
|
|
|
|
them with raw computation. If I did this the way I'd really like to, I need to |
1211
|
|
|
|
|
|
|
find or build a L module to perform something similar to a C |
1212
|
|
|
|
|
|
|
C<#define> or C++ C function. However, instead I just did a search and |
1213
|
|
|
|
|
|
|
replace with Vim. |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
I should probably port this to XS to make it really compete with built-in |
1216
|
|
|
|
|
|
|
hashes. |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
=head1 AUTHOR |
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
Andrew Sterling Hanenkamp, Ehanenkamp@users.sourceforge.netE |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
Copyright 2003 by Andrew Sterling Hanenkamp |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
1227
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
=cut |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
1 |