File Coverage

blib/lib/Text/Ngram.pm

Criterion	Covered	Total	%
statement	50	50	100.0
branch	23	26	88.4
condition	5	7	71.4
subroutine	8	8	100.0
pod	2	2	100.0
total	88	93	94.6

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Ngram;
2
3	2			2		35019	use 5.008008;
	2					9
	2					91
4	2			2		12	use strict;
	2					4
	2					78
5	2			2		11	use warnings;
	2					9
	2					78
6
7	2			2		2098	use Unicode::CaseFold;
	2					2574
	2					1542
8
9							require Exporter;
10
11							our @ISA = qw(Exporter);
12							our %EXPORT_TAGS = ( 'all' => [ qw( ngram_counts add_to_counts) ] );
13							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
14							our @EXPORT = qw();
15
16							our $VERSION = '0.15';
17
18							=head1 NAME
19
20							Text::Ngram - Ngram analysis of text
21
22							=head1 SYNOPSIS
23
24							use Text::Ngram qw(ngram_counts add_to_counts);
25							my $text = "abcdefghijklmnop";
26							my $hash_r = ngram_counts($text, 3); # Window size = 3
27							# $hash_r => { abc => 1, bcd => 1, ... }
28
29							add_to_counts($more_text, 3, $hash_r);
30
31							=head1 DESCRIPTION
32
33							n-Gram analysis is a field in textual analysis which uses sliding window
34							character sequences in order to aid topic analysis, language
35							determination and so on. The n-gram spectrum of a document can be used
36							to compare and filter documents in multiple languages, prepare word
37							prediction networks, and perform spelling correction.
38
39							The neat thing about n-grams, though, is that they're really easy to
40							determine. For n=3, for instance, we compute the n-gram counts like so:
41
42							the cat sat on the mat
43							--- $counts{"the"}++;
44							--- $counts{"he "}++;
45							--- $counts{"e c"}++;
46							...
47
48							This module provides an efficient XS-based implementation of n-gram
49							spectrum analysis.
50
51							There are two functions which can be imported:
52
53							=cut
54
55							require XSLoader;
56							XSLoader::load('Text::Ngram', $VERSION);
57
58							sub _clean_buffer {
59	19			19		18	my %config = %{+shift};
	19					94
60	19					28	my $buffer = shift;
61	19	50		1		81	$buffer = fc $buffer if $config{lowercase};
	1					38
	1					1
	1					16
62	19					26760	$buffer =~ s/\s+/ /g;
63	19	100				49	unless ($config{punctuation}) {
64	15	100				27	if ($config{flankbreaks}) {
65	11					87	$buffer =~ s/[^[:alpha:] ]+/ \xff /g;
66							}
67							else {
68	4					22	$buffer =~ s/[^[:alpha:] ]+/\xff/g;
69							}
70							}
71	19					38	$buffer =~ y/ / /s;
72	19					258	return $buffer;
73							}
74
75							=head2 ngram_counts
76
77							This first function returns a hash reference with the n-gram histogram
78							of the text for the given window size. The default window size is 5.
79
80							$href = ngram_counts(\%config, $text, $window_size);
81
82							As of version 0.14, the %config may instead be passed in as named arguments:
83
84							$href = ngram_counts($text, $window_size, %config);
85
86							The only necessary parameter is $text.
87
88							The possible values for %config are:
89
90							=head3 flankbreaks
91
92							If set to 1 (default), breaks are flanked by spaces; if set to 0,
93							they're not. Breaks are punctuation and other non-alphabetic
94							characters, which, unless you use C<< punctuation => 1 >> in your
95							configuration, do not make it into the returned hash.
96
97							Here's an example, supposing you're using the default value
98							for punctuation (0):
99
100							my $text = "Hello, world";
101							my $hash = ngram_counts($text, 5);
102
103							That produces the following ngrams:
104
105							{
106							'hello' => 1,
107							'ello ' => 1,
108							' worl' => 1,
109							'world' => 1,
110							}
111
112							On the other hand, this:
113
114							my $text = "Hello, world";
115							my $hash = ngram_counts({flankbreaks => 0}, $text, 5);
116
117							Produces the following ngrams:
118
119							{
120							'hello' => 1,
121							' worl' => 1,
122							'world' => 1,
123							}
124
125							=head3 lowercase
126
127							If set to 0, casing is preserved. If set to 1, all letters are
128							lowercased before counting ngrams. Default is 1.
129
130							# Get all ngrams of size 4 preserving case
131							$href_p = ngram_counts( {lowercase => 0}, $text, 4 );
132
133							=head3 punctuation
134
135							If set to 0 (default), punctuation is removed before calculating the
136							ngrams. Set to 1 to preserve it.
137
138							# Get all ngrams of size 2 preserving punctuation
139							$href_p = ngram_counts( {punctuation => 1}, $text, 2 );
140
141							=head3 spaces
142
143							If set to 0 (default is 1), no ngrams containing spaces will be returned.
144
145							# Get all ngrams of size 3 that do not contain spaces
146							$href = ngram_counts( {spaces => 0}, $text, 3);
147
148							If you're going to request both types of ngrams, then the best way to
149							avoid calculating the same thing twice is probably this:
150
151							$href_with_spaces = ngram_counts($text[, $window]);
152							$href_no_spaces = { %$href_with_spaces };
153							for (keys %$href_no_spaces) { delete $href_no_spaces->{$_} if / / }
154
155							=cut
156
157							sub ngram_counts {
158	16			16	1	832	my %config = (
159							spaces => 1,
160							punctuation => 0,
161							lowercase => 1,
162							flankbreaks => 1
163							);
164	16	100				59	if (ref($_[0]) eq 'HASH') {
		100
165	7					14	%config = (%config, %{+shift});
	7					33
166							}
167							elsif (@_ > 2) {
168	2	100				17	%config = (%config, splice @_, (@_ & 1) ? 1 : 2);
169							}
170	16					29	my ($buffer, $width) = @_;
171	16		100			36	$width \|\|= 5;
172	16	50				33	return {} if $width < 1;
173	16					45	my $href = _process_buffer(_clean_buffer(\%config, $buffer), $width);
174	16	100				47	unless ($config{punctuation}) {
175	12	100				59	for (keys %$href) { delete $href->{$_} if /\xff/ }
	169					356
176							}
177	16	100				52	unless ($config{spaces}) {
178	1	100				5	for (keys %$href) { delete $href->{$_} if / / }
	14					29
179							}
180	16					131	return $href;
181							}
182
183							=head2 add_to_counts
184
185							This incrementally adds to the supplied hash; if C<$window> is zero or
186							undefined, then the window size is computed from the hash keys.
187
188							add_to_counts($more_text, $window, $href)
189
190							=cut
191
192							sub add_to_counts {
193	3			3	1	2525	my %config = (punctuation => 0, lowercase => 1);
194	3					6	my ($buffer, $width, $href) = @_;
195	3	100	66			19	if (!defined $width or !$width) {
196	1					3	my ($key, undef) = each %$href; # Just gimme a random key
197	1		50			5	$width = length $key \|\| 5;
198							}
199	3					10	_process_buffer_incrementally(_clean_buffer(\%config, $buffer), $width, $href);
200	3	50				24	for (keys %$href) { delete $href->{$_} if /\xff/ }
	42					78
201							}
202
203							1;
204							__END__