File Coverage

blib/lib/Algorithm/RabinKarp.pm

Criterion	Covered	Total	%
statement	47	49	95.9
branch	7	8	87.5
condition	8	13	61.5
subroutine	9	9	100.0
pod	3	4	75.0
total	74	83	89.1

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::RabinKarp;
2
3	2			2		74829	use warnings;
	2					5
	2					69
4	2			2		13	use strict;
	2					5
	2					78
5
6	2			2		1448	use Algorithm::RabinKarp::Util qw(stream_fh stream_string);
	2					6
	2					183
7
8	2			2		2180	use UNIVERSAL;
	2					30
	2					11
9
10	2			2		69	use constant BASE => 2;
	2					3
	2					1319
11
12							our $VERSION = "0.41";
13
14							=head1 NAME
15
16							Algorithm::RabinKarp - Rabin-Karp streaming hash
17
18							=head1 SYNOPSIS
19
20							my $text = "A do run run run, a do run run";
21							my $kgram = Algorithm::RabinKarp->new($window, $text);
22
23							or
24
25							my $kgram2 = Algorithm::RabinKarp->new($window, $fh);
26
27							or
28							my $kgram3 = Algorithm::RabinKarp->new($window, sub {
29							...
30							return $num, $position;
31							});
32
33							my ($hash, $start_position, $end_position) = $kgram->next;
34
35							my @values = $kgram->values;
36
37							my %occurances; # a dictionary of all kgrams.
38							while (my ($hash, @pos) = @{shift @values}) {
39							push @{$occurances{$hash}}, \@pos;
40							}
41
42							my $needle = Algorithm::RabinKarp->new(6, "needle");
43							open my $fh, '<', "haystack.txt";
44							my $haystack = Algorithm::RabinKarp->new(6, $fh);
45							my $needle_hash = $needle->next;
46
47							while (my ($hay_hash, @pos) = $haystack->next) {
48							warn "Possible match for 'needle' at @pos"
49							if $needle_hash eq $hay_hash;
50							}
51
52
53							=head1 DESCRIPTION
54
55							This is an implementation of Rabin and Karp's streaming hash, as
56							described in "Winnowing: Local Algorithms for Document Fingerprinting" by
57							Schleimer, Wilkerson, and Aiken. Following the suggestion of Schleimer,
58							I am using their second equation:
59
60							$H[ $c[2..$k + 1] ] = (( $H[ $c[1..$k] ] - $c[1] ** $k ) + $c[$k+1] ) * $k
61
62							The results of this hash encodes information about the next k values in
63							the stream (hense k-gram.) This means for any given stream of length n
64							integer values (or characters), you will get back n - k + 1 hash
65							values.
66
67							For best results, you will want to create a code generator that filters
68							your data to remove all unnecessary information. For example, in a large
69							english document, you should probably remove all white space, as well
70							as removing all capitalization.
71
72							=head1 INTENT
73
74							By preprocessing your document with the Rabin Karp hashing algorithm,
75							it makes it possible to create a "fingerprint" of your document (or documents),
76							and then perform multiple searches for fragments contained within your document
77							database.
78
79							Schleimer, Wilkerson, and Aiken suggest preproccessing to remove
80							unnecessary information (like whitespace), as well as known redundent information
81							(like, say, copyright notices or other boilerplate that is 'acceptable'.)
82
83							They also suggest a post processing pass to reduce data volume, using a technique
84							called winnowing (see the link at the end of this documentation.)
85
86							=head1 METHODS
87
88							=over
89
90							=item new($k, [FileHandle\|Scalar\|Coderef] )
91
92							Creates a new hash generator. If you provide a callback function, it must
93							return the next integer value in the stream. Additionally, you may
94							return the original position of the value in the stream (ie, you may have been
95							filtering characters out because they're redundant.)
96
97							=cut
98
99							sub new {
100	5			5	1	2276	my $class = shift;
101	5					9	my $k = shift;
102	5					17	my $stream = $class->make_stream(shift);
103	4					8	my $rm_k = BASE;
104
105	4		33			49	bless {
106							k => $k % 32,
107							vals => [],
108							stream => $stream,
109							}, ref $class \|\| $class;
110							}
111
112							sub make_stream {
113	5			5	0	8	my $class = shift;
114	5					8	my $source = shift;
115
116	5	100				19	return $source if ref $source eq 'CODE';
117
118	4					7	my $stream;
119	4	100	66			33	if (defined $source && !ref $source) {
		50	33
120	3					17	$stream = stream_string($source);
121							} elsif (UNIVERSAL::isa($source, "IO::Handle")
122							\|\| UNIVERSAL::isa($source,"GLOB")) {
123	0					0	require IO::Handle;
124							# The simplest way of getting character position right now.
125	0					0	$stream = stream_fh($source);
126							} else {
127	1					12	die __PACKAGE__." requires its source stream be one of the ".
128							"following types: scalar, file handle, coderef, or IO::Handle";
129							}
130	3					8	return $stream;
131							}
132
133
134							=item next()
135
136							Returns an array containing (kgram hash value, start position , end position, start, end)
137							for every call that can have a hash generated, or () when we have reached the end
138							of the stream.
139
140							C pulls the first $k from the stream on the first call. Each successive
141							call to C has a complexity of O(1).
142
143							=cut
144							sub next {
145	18			18	1	20	my $self = shift;
146
147							# assume, for now, that each value is an integer, or can
148							# auto cast to char
149	18					21	my $values = $self->{vals}; #assume that @values always contains k values
150	18		100			38	my $prev = shift @$values \|\| [0, undef];
151	18		100			38	my $hash = $self->{hash} \|\| 0;
152	18					34	while (@$values < $self->{k}) {
153	22					51	my $nextval = [$self->{stream}->()];
154	22	100				46	return unless @$nextval;
155	21					25	push @$values, $nextval;
156	21					19	$hash <<= 1;
157	21					27	$hash -= $prev->[0] << $self->{k};
158	21					44	$hash += $nextval->[0];
159
160							}
161
162	17					21	$self->{hash} = $hash;
163
164	17					49	return $hash, $values->[0][1], $values->[-1][1], @{ $values }[0, -1];
	17					67
165							}
166
167							=item values
168
169							Returns an array containing all C hash values contained
170							within the data stream, and the positions associated with them (in the same
171							format as yielded by L.)
172
173							After calling C the stream will be completely exhausted, causing
174							subsequent calls to C and C to return C.
175
176							NOTE: You should use C if your source stream is infinite, as values
177							will greedily attempt to consume all values.
178
179							=cut
180
181							sub values {
182	1			1	1	5	my $self = shift;
183
184	1					2	my @values;
185	1					5	while (my @next = $self->next()) {
186	17					131	push @values, \@next;
187							}
188	1					23	return @values;
189							}
190
191							=back
192
193							=cut
194
195							=head1 BUGS
196
197							The current multipliers and modulus lead to very poor hash
198							distributions. I'll investigate methods of improving this
199							in future versions.
200
201							=head1 SEE ALSO
202
203							"Winnowing: Local Algorithms for Document Fingerprinting"
204							L
205
206							Wikipedia: Rabin-Karp string search algorithm
207							L
208
209							=head1 AUTHOR
210
211							Norman Nunley Ennunley@gmail.comE
212							Nicholas Clark (Who paired with me)
213
214							=cut
215
216							1;