line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bloom::Filter; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
78749
|
use strict; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
115
|
|
4
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
87
|
|
5
|
3
|
|
|
3
|
|
15
|
use Carp; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
291
|
|
6
|
3
|
|
|
3
|
|
18971
|
use Digest::SHA qw(sha1); |
|
3
|
|
|
|
|
23569
|
|
|
3
|
|
|
|
|
5416
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '1.2'; |
9
|
|
|
|
|
|
|
$VERSION = eval $VERSION; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=encoding UTF-8 |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Bloom::Filter - Sample Perl Bloom filter implementation |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 DESCRIPTION |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
A Bloom filter is a probabilistic algorithm for doing existence tests |
20
|
|
|
|
|
|
|
in less memory than a full list of keys would require. The tradeoff to |
21
|
|
|
|
|
|
|
using Bloom filters is a certain configurable risk of false positives. |
22
|
|
|
|
|
|
|
This module implements a simple Bloom filter with configurable capacity |
23
|
|
|
|
|
|
|
and false positive rate. Bloom filters were first described in a 1970 |
24
|
|
|
|
|
|
|
paper by Burton Bloom, see L. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 SYNOPSIS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Bloom::Filter |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my $bf = Bloom::Filter->new( capacity => 10, error_rate => .001 ); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
$bf->add( @keys ); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
while ( <> ) { |
35
|
|
|
|
|
|
|
chomp; |
36
|
|
|
|
|
|
|
print "Found $_\n" if $bf->check( $_ ); |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=over |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=item new %PARAMS |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Create a brand new instance. Allowable params are C, C. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=cut |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub new |
50
|
|
|
|
|
|
|
{ |
51
|
5
|
|
|
5
|
1
|
43
|
my ( $class, %params ) = @_; |
52
|
|
|
|
|
|
|
|
53
|
5
|
|
|
|
|
43
|
my $self = |
54
|
|
|
|
|
|
|
{ |
55
|
|
|
|
|
|
|
# some defaults |
56
|
|
|
|
|
|
|
error_rate => 0.001, |
57
|
|
|
|
|
|
|
capacity => 100, |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
%params, |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# internal data |
62
|
|
|
|
|
|
|
key_count => 0, |
63
|
|
|
|
|
|
|
filter_length => 0, |
64
|
|
|
|
|
|
|
num_hash_funcs => 0, |
65
|
|
|
|
|
|
|
salts => [], |
66
|
|
|
|
|
|
|
}; |
67
|
5
|
|
|
|
|
12
|
bless $self, $class; |
68
|
5
|
|
|
|
|
22
|
$self->init(); |
69
|
5
|
|
|
|
|
15
|
return $self; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item init |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Calculates the best number of hash functions and optimum filter length, |
75
|
|
|
|
|
|
|
creates some random salts, and generates a blank bit vector. Called |
76
|
|
|
|
|
|
|
automatically by constructor. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=cut |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub init |
81
|
|
|
|
|
|
|
{ |
82
|
5
|
|
|
5
|
1
|
9
|
my ( $self ) = @_; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# some sanity checks |
85
|
5
|
50
|
|
|
|
31
|
croak "Capacity must be greater than zero" unless $self->{capacity}; |
86
|
5
|
50
|
|
|
|
14
|
croak "Error rate must be greater than zero" unless $self->{error_rate}; |
87
|
5
|
50
|
|
|
|
23
|
croak "Error rate cannot exceed 1" unless $self->{error_rate} < 1; |
88
|
|
|
|
|
|
|
|
89
|
5
|
|
|
|
|
21
|
my ( $length, $num_funcs ) = $self->_calculate_shortest_filter_length |
90
|
|
|
|
|
|
|
($self->{capacity}, $self->{error_rate} ); |
91
|
|
|
|
|
|
|
|
92
|
5
|
|
|
|
|
9
|
$self->{num_hash_funcs} = $num_funcs; |
93
|
5
|
|
|
|
|
8
|
$self->{filter_length} = $length; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# create some random salts; |
96
|
5
|
|
|
|
|
8
|
my %collisions; |
97
|
5
|
|
|
|
|
17
|
while ( scalar keys %collisions < $self->{num_hash_funcs} ) { |
98
|
57
|
|
|
|
|
1023
|
$collisions{rand()}++; |
99
|
|
|
|
|
|
|
} |
100
|
5
|
|
|
|
|
28
|
$self->{salts} = [ keys %collisions ]; |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# make an empty filter |
103
|
5
|
|
|
|
|
245
|
$self->{filter} = pack( "b*", '0' x $self->{filter_length} ); |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# make some blank vectors to use |
106
|
5
|
|
|
|
|
14
|
$self->{blankvec} = pack( "N", 0 ); |
107
|
|
|
|
|
|
|
|
108
|
5
|
|
|
|
|
15
|
return 1; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=back |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 ACCESSORS |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=over |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=item capacity |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Returns the total capacity of the Bloom filter |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=cut |
122
|
|
|
|
|
|
|
|
123
|
2
|
|
|
2
|
1
|
16
|
sub capacity { $_[0]->{capacity} }; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=item error_rate |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
Returns the configured maximum error rate |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=cut |
130
|
|
|
|
|
|
|
|
131
|
2
|
|
|
2
|
1
|
8
|
sub error_rate { $_[0]->{error_rate} }; |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item length |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Returns the length of the Bloom filter in bits |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
138
|
|
|
|
|
|
|
|
139
|
2
|
|
|
2
|
1
|
7
|
sub length { $_[0]->{filter_length} }; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=item key_count |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Returns the number of items currently stored in the filter |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
146
|
|
|
|
|
|
|
|
147
|
4
|
|
|
4
|
1
|
1200
|
sub key_count { $_[0]->{key_count} }; |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item on_bits |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Returns the number of 'on' bits in the filter |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub on_bits |
156
|
|
|
|
|
|
|
{ |
157
|
0
|
|
|
0
|
1
|
0
|
my ( $self ) = @_; |
158
|
0
|
0
|
|
|
|
0
|
return unless $self->{filter}; |
159
|
0
|
|
|
|
|
0
|
return unpack( "%32b*", $self->{filter}) |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=item salts |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Returns the list of salts used to create the hash functions |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=cut |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub salts |
169
|
|
|
|
|
|
|
{ |
170
|
2
|
|
|
2
|
1
|
13
|
my ( $self ) = @_; |
171
|
2
|
50
|
33
|
|
|
27
|
return unless exists $self->{salts} |
|
|
|
33
|
|
|
|
|
172
|
|
|
|
|
|
|
and ref $self->{salts} |
173
|
|
|
|
|
|
|
and ref $self->{salts} eq 'ARRAY'; |
174
|
|
|
|
|
|
|
|
175
|
2
|
|
|
|
|
4
|
return @{ $self->{salts} }; |
|
2
|
|
|
|
|
12
|
|
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=back |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head1 PUBLIC METHODS |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=over |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item add @KEYS |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Adds the list of keys to the filter. Will fail, return C and complain |
187
|
|
|
|
|
|
|
if the number of keys in the filter exceeds the configured capacity. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=cut |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
sub add |
192
|
|
|
|
|
|
|
{ |
193
|
103
|
|
|
103
|
1
|
1104
|
my ( $self, @keys ) = @_; |
194
|
|
|
|
|
|
|
|
195
|
103
|
50
|
|
|
|
334
|
return unless @keys; |
196
|
|
|
|
|
|
|
# Hash our list of keys into the empty filter |
197
|
103
|
50
|
|
|
|
144
|
my @salts = @{ $self->{salts} } |
|
103
|
|
|
|
|
414
|
|
198
|
|
|
|
|
|
|
or croak "No salts found, cannot make bitmask"; |
199
|
103
|
|
|
|
|
196
|
foreach my $key ( @keys ) { |
200
|
105
|
100
|
|
|
|
223
|
if ($self->{key_count} >= $self->{capacity}) { |
201
|
1
|
|
|
|
|
186
|
carp "Exceeded filter capacity"; |
202
|
1
|
|
|
|
|
192
|
return; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
# flip the appropriate bits on |
205
|
104
|
|
|
|
|
104
|
vec($self->{filter}, $_, 1) = 1 foreach @{$self->_get_cells($key)}; |
|
104
|
|
|
|
|
224
|
|
206
|
104
|
|
|
|
|
293
|
$self->{key_count}++; |
207
|
|
|
|
|
|
|
} |
208
|
102
|
|
|
|
|
333
|
return 1; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=item check @KEYS |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Checks the provided key list against the Bloom filter, |
214
|
|
|
|
|
|
|
and returns a list of equivalent length, with true or |
215
|
|
|
|
|
|
|
false values depending on whether there was a match. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=cut |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
sub check |
220
|
|
|
|
|
|
|
{ |
221
|
|
|
|
|
|
|
|
222
|
3
|
|
|
3
|
1
|
8
|
my ( $self, @keys ) = @_; |
223
|
|
|
|
|
|
|
|
224
|
3
|
50
|
|
|
|
8
|
return unless @keys; |
225
|
3
|
|
|
|
|
3
|
my @result; |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# A match occurs if every bit we check is on |
228
|
3
|
|
|
|
|
5
|
foreach my $key ( @keys ) { |
229
|
3
|
|
|
|
|
6
|
my $match = 1; |
230
|
3
|
|
|
|
|
3
|
foreach my $cell (@{$self->_get_cells($key)} ) { |
|
3
|
|
|
|
|
6
|
|
231
|
21
|
|
|
|
|
41
|
$match = vec( $self->{filter}, $cell, 1 ) ; |
232
|
21
|
100
|
|
|
|
83
|
last unless $match; |
233
|
|
|
|
|
|
|
} |
234
|
3
|
|
|
|
|
9
|
push @result, $match; |
235
|
|
|
|
|
|
|
} |
236
|
3
|
50
|
|
|
|
17
|
return ( wantarray() ? @result : $result[0] ); |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=back |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 INTERNAL METHODS |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=over |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=item _calculate_shortest_filter_length CAPACITY ERR_RATE |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Given a desired error rate and maximum capacity, returns the optimum |
248
|
|
|
|
|
|
|
combination of vector length (in bits) and number of hash functions |
249
|
|
|
|
|
|
|
to use in building the filter, where "optimum" means shortest vector length. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub _calculate_shortest_filter_length |
254
|
|
|
|
|
|
|
{ |
255
|
5
|
|
|
5
|
|
9
|
my ( $self, $num_keys, $error_rate ) = @_; |
256
|
5
|
|
|
|
|
9
|
my $lowest_m; |
257
|
5
|
|
|
|
|
8
|
my $best_k = 1; |
258
|
|
|
|
|
|
|
|
259
|
5
|
|
|
|
|
14
|
foreach my $k ( 1..100 ) { |
260
|
500
|
|
|
|
|
1494
|
my $m = (-1 * $k * $num_keys) / |
261
|
|
|
|
|
|
|
( log( 1 - ($error_rate ** (1/$k)))); |
262
|
|
|
|
|
|
|
|
263
|
500
|
100
|
100
|
|
|
1555
|
if ( !defined $lowest_m or ($m < $lowest_m) ) { |
264
|
57
|
|
|
|
|
46
|
$lowest_m = $m; |
265
|
57
|
|
|
|
|
68
|
$best_k = $k; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
} |
268
|
5
|
|
|
|
|
13
|
$lowest_m = int( $lowest_m ) + 1; |
269
|
5
|
|
|
|
|
14
|
return ( $lowest_m, $best_k ); |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=item _get_cells KEY |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
Given a key, hashes it using the list of salts and returns |
275
|
|
|
|
|
|
|
an array of cell indexes corresponding to the key. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=cut |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
sub _get_cells |
280
|
|
|
|
|
|
|
{ |
281
|
|
|
|
|
|
|
|
282
|
107
|
|
|
107
|
|
129
|
my ( $self, $key ) = @_; |
283
|
|
|
|
|
|
|
|
284
|
107
|
50
|
|
|
|
203
|
croak "Filter length is undefined" unless $self->{filter_length}; |
285
|
107
|
50
|
|
|
|
109
|
my @salts = @{ $self->{salts} } |
|
107
|
|
|
|
|
507
|
|
286
|
|
|
|
|
|
|
or croak "No salts found, cannot make bitmask"; |
287
|
|
|
|
|
|
|
|
288
|
107
|
|
|
|
|
115
|
my @cells; |
289
|
107
|
|
|
|
|
180
|
foreach my $salt ( @salts ){ |
290
|
|
|
|
|
|
|
|
291
|
1070
|
|
|
|
|
5745
|
my $hash = sha1( $key, $salt ); |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# blank 32 bit vector |
294
|
1070
|
|
|
|
|
1775
|
my $vec = $self->{blankvec}; |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# split the 160-bit hash into five 32-bit ints |
297
|
|
|
|
|
|
|
# and XOR the pieces together |
298
|
|
|
|
|
|
|
|
299
|
1070
|
|
|
|
|
2042
|
my @pieces = map {pack( "N", $_ )} unpack("N*", $hash ); |
|
5350
|
|
|
|
|
11939
|
|
300
|
1070
|
|
|
|
|
4866
|
$vec = $_ ^ $vec foreach @pieces; |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# Calculate bit offset by modding |
303
|
1070
|
|
|
|
|
1862
|
my $result = unpack( "N", $vec ); |
304
|
1070
|
|
|
|
|
1974
|
my $bit_offset = $result % $self->{filter_length}; |
305
|
1070
|
|
|
|
|
3185
|
push @cells, $bit_offset; |
306
|
|
|
|
|
|
|
} |
307
|
107
|
|
|
|
|
1531
|
return \@cells; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=back |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=head1 AUTHOR |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
Originally written by Maciej Ceglowski Emaciej@ceglowski.comE. |
315
|
|
|
|
|
|
|
Currently maintained by Grzegorz Rożniecki Exaerxess@gmail.comE. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
Dmitriy Ryaboy Edmitriy.ryaboy@ask.comE (big speedup in February 2007, thanks!) |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
(c) 2004 Maciej Ceglowski |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
This is free software, distributed under version 2 |
326
|
|
|
|
|
|
|
of the GNU Public License (GPL). |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=cut |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
1; |