File Coverage

blib/lib/Algorithm/RabinKarp.pm
Criterion Covered Total %
statement 47 49 95.9
branch 7 8 87.5
condition 8 13 61.5
subroutine 9 9 100.0
pod 3 4 75.0
total 74 83 89.1


line stmt bran cond sub pod time code
1             package Algorithm::RabinKarp;
2              
3 2     2   74829 use warnings;
  2         5  
  2         69  
4 2     2   13 use strict;
  2         5  
  2         78  
5              
6 2     2   1448 use Algorithm::RabinKarp::Util qw(stream_fh stream_string);
  2         6  
  2         183  
7              
8 2     2   2180 use UNIVERSAL;
  2         30  
  2         11  
9              
10 2     2   69 use constant BASE => 2;
  2         3  
  2         1319  
11              
12             our $VERSION = "0.41";
13              
14             =head1 NAME
15              
16             Algorithm::RabinKarp - Rabin-Karp streaming hash
17              
18             =head1 SYNOPSIS
19              
20             my $text = "A do run run run, a do run run";
21             my $kgram = Algorithm::RabinKarp->new($window, $text);
22              
23             or
24              
25             my $kgram2 = Algorithm::RabinKarp->new($window, $fh);
26              
27             or
28             my $kgram3 = Algorithm::RabinKarp->new($window, sub {
29             ...
30             return $num, $position;
31             });
32            
33             my ($hash, $start_position, $end_position) = $kgram->next;
34            
35             my @values = $kgram->values;
36            
37             my %occurances; # a dictionary of all kgrams.
38             while (my ($hash, @pos) = @{shift @values}) {
39             push @{$occurances{$hash}}, \@pos;
40             }
41            
42             my $needle = Algorithm::RabinKarp->new(6, "needle");
43             open my $fh, '<', "haystack.txt";
44             my $haystack = Algorithm::RabinKarp->new(6, $fh);
45             my $needle_hash = $needle->next;
46            
47             while (my ($hay_hash, @pos) = $haystack->next) {
48             warn "Possible match for 'needle' at @pos"
49             if $needle_hash eq $hay_hash;
50             }
51            
52            
53             =head1 DESCRIPTION
54              
55             This is an implementation of Rabin and Karp's streaming hash, as
56             described in "Winnowing: Local Algorithms for Document Fingerprinting" by
57             Schleimer, Wilkerson, and Aiken. Following the suggestion of Schleimer,
58             I am using their second equation:
59              
60             $H[ $c[2..$k + 1] ] = (( $H[ $c[1..$k] ] - $c[1] ** $k ) + $c[$k+1] ) * $k
61              
62             The results of this hash encodes information about the next k values in
63             the stream (hense k-gram.) This means for any given stream of length n
64             integer values (or characters), you will get back n - k + 1 hash
65             values.
66              
67             For best results, you will want to create a code generator that filters
68             your data to remove all unnecessary information. For example, in a large
69             english document, you should probably remove all white space, as well
70             as removing all capitalization.
71              
72             =head1 INTENT
73              
74             By preprocessing your document with the Rabin Karp hashing algorithm,
75             it makes it possible to create a "fingerprint" of your document (or documents),
76             and then perform multiple searches for fragments contained within your document
77             database.
78              
79             Schleimer, Wilkerson, and Aiken suggest preproccessing to remove
80             unnecessary information (like whitespace), as well as known redundent information
81             (like, say, copyright notices or other boilerplate that is 'acceptable'.)
82              
83             They also suggest a post processing pass to reduce data volume, using a technique
84             called winnowing (see the link at the end of this documentation.)
85              
86             =head1 METHODS
87              
88             =over
89              
90             =item new($k, [FileHandle|Scalar|Coderef] )
91              
92             Creates a new hash generator. If you provide a callback function, it must
93             return the next integer value in the stream. Additionally, you may
94             return the original position of the value in the stream (ie, you may have been
95             filtering characters out because they're redundant.)
96              
97             =cut
98              
99             sub new {
100 5     5 1 2276 my $class = shift;
101 5         9 my $k = shift;
102 5         17 my $stream = $class->make_stream(shift);
103 4         8 my $rm_k = BASE;
104            
105 4   33     49 bless {
106             k => $k % 32,
107             vals => [],
108             stream => $stream,
109             }, ref $class || $class;
110             }
111              
112             sub make_stream {
113 5     5 0 8 my $class = shift;
114 5         8 my $source = shift;
115            
116 5 100       19 return $source if ref $source eq 'CODE';
117            
118 4         7 my $stream;
119 4 100 66     33 if (defined $source && !ref $source) {
    50 33        
120 3         17 $stream = stream_string($source);
121             } elsif (UNIVERSAL::isa($source, "IO::Handle")
122             || UNIVERSAL::isa($source,"GLOB")) {
123 0         0 require IO::Handle;
124             # The simplest way of getting character position right now.
125 0         0 $stream = stream_fh($source);
126             } else {
127 1         12 die __PACKAGE__." requires its source stream be one of the ".
128             "following types: scalar, file handle, coderef, or IO::Handle";
129             }
130 3         8 return $stream;
131             }
132            
133              
134             =item next()
135              
136             Returns an array containing (kgram hash value, start position , end position, start, end)
137             for every call that can have a hash generated, or () when we have reached the end
138             of the stream.
139              
140             C pulls the first $k from the stream on the first call. Each successive
141             call to C has a complexity of O(1).
142              
143             =cut
144             sub next {
145 18     18 1 20 my $self = shift;
146              
147             # assume, for now, that each value is an integer, or can
148             # auto cast to char
149 18         21 my $values = $self->{vals}; #assume that @values always contains k values
150 18   100     38 my $prev = shift @$values || [0, undef];
151 18   100     38 my $hash = $self->{hash} || 0;
152 18         34 while (@$values < $self->{k}) {
153 22         51 my $nextval = [$self->{stream}->()];
154 22 100       46 return unless @$nextval;
155 21         25 push @$values, $nextval;
156 21         19 $hash <<= 1;
157 21         27 $hash -= $prev->[0] << $self->{k};
158 21         44 $hash += $nextval->[0];
159            
160             }
161              
162 17         21 $self->{hash} = $hash;
163            
164 17         49 return $hash, $values->[0][1], $values->[-1][1], @{ $values }[0, -1];
  17         67  
165             }
166              
167             =item values
168              
169             Returns an array containing all C hash values contained
170             within the data stream, and the positions associated with them (in the same
171             format as yielded by L.)
172              
173             After calling C the stream will be completely exhausted, causing
174             subsequent calls to C and C to return C.
175              
176             NOTE: You should use C if your source stream is infinite, as values
177             will greedily attempt to consume all values.
178              
179             =cut
180              
181             sub values {
182 1     1 1 5 my $self = shift;
183            
184 1         2 my @values;
185 1         5 while (my @next = $self->next()) {
186 17         131 push @values, \@next;
187             }
188 1         23 return @values;
189             }
190              
191             =back
192              
193             =cut
194              
195             =head1 BUGS
196              
197             The current multipliers and modulus lead to very poor hash
198             distributions. I'll investigate methods of improving this
199             in future versions.
200              
201             =head1 SEE ALSO
202              
203             "Winnowing: Local Algorithms for Document Fingerprinting"
204             L
205              
206             Wikipedia: Rabin-Karp string search algorithm
207             L
208              
209             =head1 AUTHOR
210              
211             Norman Nunley Ennunley@gmail.comE
212             Nicholas Clark (Who paired with me)
213              
214             =cut
215              
216             1;