File Coverage

blib/lib/Algorithm/NGram.pm

Criterion	Covered	Total	%
statement	107	107	100.0
branch	17	26	65.3
condition	12	19	63.1
subroutine	21	21	100.0
pod	13	13	100.0
total	170	186	91.4

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::NGram;
2	1			1		35568	use strict;
	1					3
	1					41
3	1			1		6	use warnings;
	1					2
	1					32
4
5	1			1		7	use Carp qw (croak);
	1					8
	1					86
6	1			1		994	use Class::Accessor::Fast;
	1					3633
	1					9
7	1			1		31	use List::Util qw (shuffle);
	1					2
	1					131
8	1			1		1151	use Storable;
	1					3879
	1					69
9
10	1			1		7	use base qw/Class::Accessor::Fast/;
	1					2
	1					105
11							__PACKAGE__->mk_accessors(qw/ngram_width token_table tokens dirty/);
12
13							use constant {
14	1					1402	START_TOK => ':STARTTOK:',
15							END_TOK => ':ENDTOK:',
16	1			1		5	};
	1					2
17
18							our $VERSION = '0.9';
19
20							=head1 NAME
21
22							Algorithm::NGram
23
24							=head1 SYNPOSIS
25
26							use Algorithm::NGram;
27							my $ng = Algorithm::NGram->new(ngram_width => 3); # use trigrams
28
29							# feed in text
30							$ng->add_text($text1); # analyze $text1
31							$ng->add_text($text2); # analyze $text2
32
33							# feed in arbitrary sequence of tokens
34							$ng->add_start_token;
35							$ng->add_tokens(qw/token1 token2 token3/);
36							$ng->add_end_token;
37
38							my $output = $ng->generate_text;
39
40							=head1 DESCRIPTION
41
42							This is a module for analyzing token sequences with n-grams. You can
43							use it to parse a block of text, or feed in your own tokens. It can
44							generate new sequences of tokens from what has been fed in.
45
46							=head1 EXPORT
47
48							None.
49
50							=head1 METHODS
51
52							=over 4
53
54							=item new
55
56							Create a new n-gram analyzer instance.
57
58							B
59
60							=over 4
61
62							=item ngram_width
63
64							This is the "window size" of how many tokens the analyzer will keep
65							track of. A ngram_width of two will make a bigram, a ngram_width of
66							three will make a trigram, etc...
67
68							=back
69
70							=cut
71
72							sub new {
73	2			2	1	15	my ($class, %opts) = @_;
74
75							# trigram by default
76	2		50			7	my $ngram_width = delete $opts{ngram_width} \|\| 3;
77
78	2		100			8	my $token_table = delete $opts{token_table} \|\| {};
79	2		100			8	my $tokens = delete $opts{tokens} \|\| [];
80
81	2					6	my $self = {
82							ngram_width => $ngram_width,
83							token_table => $token_table,
84							tokens => $tokens,
85							};
86
87	2					4	bless $self, $class;
88	2					8	$self->dirty(1);
89
90	2					20	return $self;
91							}
92
93							=item ngram_width
94
95							Returns token window size (e.g. the "n" in n-gram)
96
97							=cut
98
99							=item token_table
100
101							Returns n-gram table
102
103							=cut
104
105							=item add_text
106
107							Splits a block of text up by whitespace and processes each word as a
108							token. Automatically calls C at the beginning of
109							the text and C at the end.
110
111							=cut
112
113							# process a block of text, auto-tokenizing it
114							sub add_text {
115	2			2	1	12	my ($self, $text) = @_;
116
117	2					5	$self->add_start_token;
118
119							# tokenize text
120	2					17	foreach my $tok (split(/ /, $text)) {
121	16					63	$tok =~ s/ +//g; # remove spaces
122
123	16	50				22	next unless $tok;
124
125	16					26	$self->add_token($tok);
126							}
127
128	2					15	$self->add_end_token;
129							}
130
131							=item add_tokens
132
133							Adds an arbitrary list of tokens.
134
135							=cut
136
137							*add_token = \&add_tokens;
138							sub add_tokens {
139	20			20	1	25	my ($self, @tokens) = @_;
140	20					17	push @{$self->{tokens}}, @tokens;
	20					33
141	20					40	$self->dirty(1);
142							}
143
144							=item add_start_token
145
146							Adds the "start token." This is useful because you often will want to
147							mark the beginnings and ends of a token sequence so that when
148							generating your output the generator will know what tokens start a
149							sequence and when to end.
150
151							=cut
152
153							sub add_start_token {
154	2			2	1	3	my ($self) = @_;
155	2					5	$self->add_token(START_TOK);
156							}
157
158							=item add_end_token
159
160							Adds the "end token." See C.
161
162							=cut
163
164							sub add_end_token {
165	2			2	1	2	my ($self) = @_;
166	2					4	$self->add_token(END_TOK);
167							}
168
169							=item analyze
170
171							Generates an n-gram frequency table. Returns a hashref of
172							I<< N => tokens => count >>, where N is the number of tokens (will be from 2
173							to ngram_width). You will not normally need to call this unless you
174							want to get the n-gram frequency table.
175
176							=cut
177
178							sub analyze {
179	1			1	1	19	my $self = shift;
180
181	1					3	$self->{token_table} = {};
182
183	1					3	my @all_tokens = @{$self->tokens};
	1					4
184
185	1					14	for (my $i = 1; $i <= $self->ngram_width; $i++) {
186	3					23	for (my $tok_idx = 0; $tok_idx < @all_tokens; $tok_idx++) {
187	60					63	my $tok_idx_end = $tok_idx + $i - 1;
188	60	100				95	$tok_idx_end = @all_tokens if $tok_idx_end > @all_tokens;
189
190							# get tokens
191	60					101	my @tokens = @all_tokens[$tok_idx ... $tok_idx_end];
192
193							# get the token that follows this ngram
194	60					75	my $next_tok = $all_tokens[$tok_idx_end + 1];
195	60	100	66			197	next unless $next_tok && @tokens;
196
197							# don't care about what follows END_TOK
198	54	100				92	next if $tokens[0] eq END_TOK;
199
200	51					78	my $token_key = $self->token_key(@tokens);
201
202							# increment the count of $next_tok after this ngram
203	51					228	$self->{token_table}->{$i}->{$token_key}->{$next_tok}++;
204							}
205							}
206
207	1					6	$self->dirty(0);
208
209	1					6	return $self->{token_table};
210							}
211
212							=item generate_text
213
214							After feeding in text tokens, this will return a new block of text
215							based on whatever text was added.
216
217							=cut
218
219							sub generate_text {
220	1			1	1	10	my ($self, %opts) = @_;
221
222	1					6	my @toks = $self->generate(%opts);
223	1					11	return join(' ', @toks);
224							}
225
226							=item generate
227
228							Generates a new sequence of tokens based on whatever tokens have
229							previously been fed in.
230
231							=cut
232
233							sub generate {
234	1			1	1	2	my ($self, %opts) = @_;
235
236							# update n-gram if things have changed
237	1	50	33			4	$self->analyze
238							if $self->dirty && ! $opts{no_analyze};
239
240	1					1	my @ret_toks;
241	1					2	my $tok = START_TOK;
242
243	1					1	my @cur_toks = ();
244
245	1		66			2	do {
246	9	100				18	push @ret_toks, $tok if $tok ne START_TOK;
247
248	9					15	push @cur_toks, $tok;
249	9					19	shift @cur_toks while @cur_toks > $self->ngram_width;
250
251	9					68	$tok = $self->next_tok(@cur_toks);
252							} while $tok && $tok ne END_TOK;
253
254	1					14	return @ret_toks;
255							}
256
257							=item next_tok
258
259							Given a list of tokens, will pick a possible token to come next.
260
261							=cut
262
263							sub next_tok {
264	9			9	1	13	my ($self, @toks) = @_;
265
266	9	50				7	return undef unless %{$self->token_table};
	9					17
267
268	9					45	my $tok_next = $self->token_lookup(@toks);
269	9	50				16	croak "No next tokens defined for tokens " . $self->token_key(@toks)
270							unless defined $tok_next;
271
272	9					7	my @possible_toks;
273
274	9					24	while (my ($next_tok, $count) = each %$tok_next) {
275	11					44	push @possible_toks, $next_tok for 1 .. $count;
276							}
277
278	9					63	@possible_toks = shuffle @possible_toks;
279	9					11	my $tok = shift @possible_toks;
280
281	9					40	return $tok;
282							}
283
284							=item token_lookup
285
286							Returns a hashref of the counts of tokens that follow a sequence of tokens.
287
288							=cut
289
290							sub token_lookup {
291	9			9	1	14	my ($self, @toks) = @_;
292
293	9					7	my $tok_count = @toks;
294	9	50				18	croak "token_lookup passed more than ngram_width tokens"
295							unless $tok_count <= $self->ngram_width;
296
297	9					38	my $tok_key = $self->token_key(@toks);
298
299	9		50			29	return $self->{token_table}->{$tok_count}->{$tok_key} \|\| undef;
300							}
301
302							=item token_key
303
304							Serializes a sequence of tokens for use as a key into the n-gram
305							table. You will not normally need to call this.
306
307							=cut
308
309							sub token_key {
310	60			60	1	88	my ($self, @toks) = @_;
311	60					128	return join('-', @toks);
312							}
313
314							=item serialize
315
316							Returns the tokens and n-gram (if one has been generated) in a string
317
318							=cut
319
320							sub serialize {
321	1			1	1	2	my ($self) = @_;
322
323	1					3	my $save = {
324							ngram_width => $self->ngram_width,
325							tokens => $self->tokens,
326							ngram => $self->token_table,
327							};
328
329	1					13	return Storable::nfreeze($save);
330							}
331
332							=item deserialize($string)
333
334							Deserializes a string and returns an C instance
335
336							=cut
337
338							sub deserialize {
339	1			1	1	131	my ($class, $str) = @_;
340	1	50				3	croak "Empty string passed to deserialize" unless $str;
341
342	1	50				4	my $save = Storable::thaw($str)
343							or croak "Invalid serialized data passed to deserialize";
344
345	1		50			68	my $tokens = $save->{tokens} \|\| [];
346	1					2	my $ngram = $save->{ngram};
347	1	50				4	my $width = $save->{ngram_width} or croak "No n-gram width saved";
348
349	1					4	my $instance = $class->new(
350							ngram_width => $width,
351							token_table => $ngram,
352							tokens => $tokens,
353							);
354
355	1	50				4	$instance->dirty(0) if $ngram;
356
357	1					6	return $instance;
358							}
359
360							1;
361
362							__END__