File Coverage

blib/lib/Text/Ngrams.pm

Criterion	Covered	Total	%
statement	293	332	88.2
branch	117	158	74.0
condition	35	48	72.9
subroutine	22	23	95.6
pod	8	8	100.0
total	475	569	83.4

line	stmt	bran	cond	sub	pod	time	code
1							# (c) 2003-2017 Vlado Keselj http://web.cs.dal.ca/~vlado
2							#
3							# Text::Ngrams - A Perl module for N-grams processing
4
5							package Text::Ngrams;
6
7	9			9		34186	use strict;
	9					11
	9					291
8							require Exporter;
9	9			9		33	use Carp;
	9					10
	9					589
10	9			9		41	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	9					8
	9					918
11							@ISA = qw(Exporter);
12							%EXPORT_TAGS = ( 'all' => [ qw(new encode_S decode_S) ] );
13							@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
14							@EXPORT = qw(new);
15							$VERSION = '2.006';
16
17	9			9		35	use vars qw($Version);
	9					8
	9					318
18							$Version = $VERSION;
19
20	9			9		28	use vars @EXPORT_OK;
	9					18
	9					143
21	9			9		32	use vars qw(); # non-exported package globals go here
	9					18
	9					2341
22
23							sub new {
24	12			12	1	11573	my $package = shift;
25	12		33			75	$package = ref($package) \|\| $package;
26	12					19	my $self = {};
27
28	12					42	my (%params) = @_;
29
30							$self->{windowsize} = exists($params{windowsize}) ?
31	12	100				51	$params{windowsize} : 3;
32	12	50				37	die "nonpositive window size: $self->{windowsize}" unless $self->{windowsize} > 0;
33	12					21	delete($params{windowsize});
34
35	12	100	66			126	if (! exists($params{type}) or $params{type} eq 'character') {
		50
		50
		100
		50
36	3					8	$self->{skiprex} = '';
37	3					12	$self->{tokenrex} = qr/([a-zA-Z]\|[^a-zA-Z]+)/;
38	3			30		10	$self->{processtoken} = sub { s/[^a-zA-Z]+/ /; $_ = uc $_ };
	30					35
	30					26
39	3					5	$self->{allow_iproc} = 1;
40							}
41							elsif ($params{type} eq 'utf8') {
42	0					0	$self->{skiprex} = '';
43	0					0	$self->{tokenrex} = qr/([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]
44							\|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]
45							\|[\xC2-\xDF][\x80-\xBF]
46							\|[\x00-\xFF])/x;
47	0					0	$self->{processtoken} = '';
48							}
49							#MJ ->
50							#this type is analogous to the "character" type but defined for utf8 characters
51							elsif ($params{type} eq 'utf8_character') {
52							# $self->{inputlayer}
53							# input layer to be put on the input stream by the function binmode
54							# before reading from a given stream and to be removed by
55							# *binmode HANDLE,":pop"* after the reading from the particular
56							# stream is done has to be a real layer (like ":encoding(utf8)"), not a
57							# pseudo layer (like ":utf8") so that the psuedo layer ":pop" is able to
58							# remove this input layer
59
60	0					0	$self->{inputlayer} = ':encoding(utf8)';
61							# this will automatically decode input text from utf8 into Perl internal
62							# reporesentation of Unicode strings and so the regular expressions for
63							# Unicode as well as the uc function can be performed on them
64
65	0					0	$self->{skiprex} = '';
66
67	9			9		4703	$self->{tokenrex} = qr/(\p{IsAlpha}\|\P{IsAlpha}+)/;
	9					71
	9					108
	0					0
68
69	0			0		0	$self->{processtoken} = sub { s/\P{IsAlpha}+/ /; $_ = uc $_ ;
	0					0
70	0					0	$_ = Encode::encode_utf8( $_ ); };
	0					0
71							# the last operation *$_=Encode::encode_utf8( $_ )* is necessary
72							# to go back to utf8 encoding from the internal Perl representation
73							# so that for the output the n-grams are in utf8 (encoded by encode_S though)
74
75	0					0	$self->{allow_iproc} = 0;
76							# allow_iproc has to be 0. Otherwise the last token in the read block will
77							# be preprocessed and encoded in utf8,
78							# and then attached at the beginning of the next block read from input,
79							# which will be in the internal Perl representation
80							}
81							#MJ <-
82							elsif ($params{type} eq 'byte') {
83	4					8	$self->{skiprex} = '';
84	4					7	$self->{tokenrex} = '';
85	4					7	$self->{processtoken} = '';
86							}
87							elsif ($params{type} eq 'word') {
88	5					26	$self->{skiprex} = qr/[^a-zA-Z0-9]+/;
89	5					12	$self->{skipinsert} = ' ';
90	5					16	$self->{tokenrex} = qr/([a-zA-Z]+\|(\d+(\.\d+)?\|\d*\.\d+)([eE][-+]?\d+)?)/;
91	47			47		114	$self->{processtoken} = sub { s/(\d+(\.\d+)?\|\d*\.\d+)([eE][-+]?\d+)?// }
92	5					23	}
93	0					0	else { die "unknown type: $params{type}" }
94	12					19	delete($params{type});
95
96	12					26	$self->{'table'} = [ ];
97	12					22	$self->{'total'} = [ ];
98	12					23	$self->{'total_distinct_count'} = 0;
99	12					23	$self->{'lastngram'} = [ ];
100	12					18	$self->{'next_token_id'} = 0;
101	12					21	$self->{'token_dict'} = { };
102	12					20	$self->{'token_S'} = [ ];
103	12					18	$self->{'token'} = [ ];
104
105	12					38	foreach my $i ( 1 .. $self->{windowsize} ) {
106	33					44	$self->{table}[$i] = { };
107	33					38	$self->{total}[$i] = 0;
108	33					34	$self->{firstngram}[$i] = '';
109	33					40	$self->{lastngram}[$i] = '';
110							}
111
112	12	100				38	if (exists($params{'limit'})) {
113	1	50				3	die "limit=$params{'limit'}" if $params{'limit'} < 1;
114	1					11	$self->{'limit'} = $params{'limit'};
115							}
116	12					14	delete($params{'limit'});
117
118	12	50				99	die "unknown parameters:".join(' ', %params) if %params;
119
120	12					44	bless($self, $package);
121	12					32	return $self;
122							}
123
124							sub feed_tokens {
125	24			24	1	42	my $self = shift;
126							# count all n-grams sizes starting from max to 1
127	24					30	foreach my $t1 (@_) {
128	204					125	my $t = $t1;
129	204	100				248	if (defined($self->{token_dict}->{$t})) {
130	106					91	$t = $self->{token_dict}->{$t};
131							} else {
132	98					82	my $id = $self->{next_token_id}++;
133	98					111	$self->{token_S}->[$id] = &encode_S($t);
134	98					94	$self->{token}->[$id] = $t;
135	98					139	$t = $self->{token_dict}->{$t} = $id;
136							}
137	204					306	for (my $n=$self->{windowsize}; $n > 0; $n--) {
138	571	100				539	if ($n > 1) {
139	367	100				530	next unless $self->{lastngram}[$n-1] ne '';
140	335					384	$self->{lastngram}[$n] = $self->{lastngram}[$n-1] .
141							' ' . $t;
142	204					173	} else { $self->{lastngram}[$n] = $t }
143
144	539	100				934	if ( ($self->{table}[$n]{$self->{lastngram}[$n]} += 1)==1)
145	330					246	{ $self->{'total_distinct_count'} += 1 }
146
147	539					381	$self->{'total'}[$n] += 1;
148	539	100				1017	if ($self->{'firstngram'}[$n] eq '')
149	31					74	{ $self->{'firstngram'}[$n] = $self->{lastngram}[$n] }
150							}
151							}
152	24	100	100			85	if (exists($self->{'limit'}) and
153							$self->{'total_distinct_count'} > 2 * $self->{'limit'})
154	1					2	{ $self->_reduce_to_limit }
155							}
156
157							sub process_text {
158	4			4	1	18	my $self = shift;
159	4					17	$self->_process_text(0, @_);
160	4	50				17	if (exists($self->{'limit'})) { $self->_reduce_to_limit }
	0					0
161							}
162
163							sub _process_text {
164	18			18		19	my $self = shift;
165	18					12	my $cont = shift; # the minimal number of chars left for
166							# continuation (the new-line problem, and the
167							# problem with too long lines)
168							# The remainder of unprocessed string is
169							# returned.
170	18	50				56	if ($cont < 0) { $cont = 0 }
	0					0
171
172	18	50	100			116	if ( # type is byte
			66
			66
173							$self->{skiprex} eq '' and
174							$self->{tokenrex} eq '' and
175							$self->{processtoken} eq '' and
176							$cont == 0
177							)
178	6					17	{ return $self->_process_text_byte(@_) }
179
180	12					13	my (@tokens);
181							my $text;
182	12					26	while (@_) {
183	12					20	$text .= shift @_;
184	12					26	while (length($text) > 0) {
185	86					61	my $textl = $text;
186	86					59	my $skip = '';
187	86	100	100			454	if ($self->{skiprex} ne '' && $textl =~ /^$self->{skiprex}/)
188	54					67	{ $skip = $&; $textl = $'; }
	54					60
189	86	100				123	if (defined($self->{skipinsert})) {
190	56					43	$skip = $self->{skipinsert};
191	56					69	$text = $skip.$textl;
192							}
193	86	100				112	if (length($textl) < $cont) { last }
	4					13
194	82	100				103	if (length($textl)==0) { $text=$textl; last; }
	5					10
	5					12
195
196	77					47	local $_;
197	77	50				102	if ($self->{tokenrex} ne '') {
198	77	50				462	if ($textl =~ /^$self->{tokenrex}/)
199	77					88	{ $_ = $&; $textl = $'; }
	77					94
200							}
201							else
202	0					0	{ $_ = substr($textl, 0, 1); $textl = substr($textl, 1) }
	0					0
203	77	50				115	last if $_ eq '';
204
205	77	100				99	if (length($textl) < $cont) {
206	1	50	33			9	if (defined($self->{allow_iproc}) && $self->{allow_iproc}
			33
207							&& ref($self->{processtoken}) eq 'CODE')
208	1					2	{ &{$self->{processtoken}} }
	1					4
209	1					2	$text = $skip.$_.$textl;
210	1					4	last;
211							}
212	76	50				122	if (ref($self->{processtoken}) eq 'CODE')
213	76					51	{ &{$self->{processtoken}} }
	76					98
214	76					99	push @tokens, $_;
215	76					141	$text = $textl;
216							}
217							}
218	12					28	$self->feed_tokens(@tokens);
219	12					26	return $text;
220							}
221
222							sub _process_text_byte {
223	6			6		6	my $self = shift;
224
225	6					16	for (my $i=0; $i<=$#_; ++$i) {
226	6					29	my @a = split('', $_[$i]);
227	6	100				18	next if $#a==-1;
228	4					10	$self->feed_tokens( @a );
229							}
230	6					13	return '';
231							}
232
233							sub process_files {
234	7			7	1	48	my $self = shift;
235
236							#MJ ->
237	7					7	my $input_layer='';
238	7	50				34	if (defined($self->{inputlayer})) {$input_layer=$self->{inputlayer};}
	0					0
239							#MJ <-
240
241	7					13	foreach my $f (@_) {
242	7					5	my $f1;
243	7					13	local *F;
244	7	50				17	if (not ref($f))
245	7	50				170	{ open(F, "$f") or die "cannot open $f:$!"; $f1 = *F }
	7					19
246	0					0	else { $f1 = $f }
247	7					11	binmode $f1; # avoid text mode
248
249							#MJ ->
250							#put the encoding layer on the input when requested
251	7	50				17	if ($input_layer ne '') {
252	0					0	binmode $f1, $input_layer;
253							}
254							#MJ <-
255
256	7					13	my ($text, $text_l, $cont) = ('', 0, 1);
257	7	100	100			37	if ( # type is byte
			66
258							$self->{skiprex} eq '' and
259							$self->{tokenrex} eq '' and
260							$self->{processtoken} eq ''
261							)
262	2					3	{ $cont = 0 }
263
264	7					7	while (1) {
265	14					12	$text_l = length($text);
266	14					102	read($f1, $text, 1024, length($text));
267	14	100				30	last if length($text) <= $text_l;
268	7					17	$text = $self->_process_text($cont, $text);
269							}
270	7					15	$text = $self->_process_text(0, $text);
271
272							#MJ ->
273							#remove the encoding layer from the input stream if it was added
274							#Caution: here is what the Perl documentation says about the pseudo layer ":pop"
275							#"Should be considered as experimental. (...) A more elegant (and safer) interface is needed."
276	7	50				14	if ($input_layer ne '') {
277	0					0	binmode $f1,":pop";
278							}
279							#MJ <-
280
281	7	50				63	close($f1) if not ref($f);
282	7	100				34	if (exists($self->{'limit'})) { $self->_reduce_to_limit }
	1					2
283							}
284							}
285
286							sub _reduce_to_limit {
287	2			2		2	my $self = shift;
288							return unless exists($self->{'limit'}) and
289	2	50	33			8	$self->{'limit'} > 0;
290
291	2					6	while ($self->{'total_distinct_count'} > $self->{'limit'}) {
292	1					1	my $nextprunefrequency = 0;
293	1					1	for (my $prunefrequency=1;; $prunefrequency = $nextprunefrequency) {
294	2					1	$nextprunefrequency = $self->{'total'}[1];
295
296	2					3	foreach my $n (1 .. $self->{'windowsize'}) {
297
298	4					3	my @keys = keys(%{$self->{table}[$n]});
	4					10
299	4					4	foreach my $ngram (@keys) {
300	22					16	my $f = $self->{table}[$n]{$ngram};
301	22	100				27	if ($f <= $prunefrequency) {
		100
302	11					7	delete $self->{'table'}[$n]{$ngram};
303	11					8	$self->{'total'}[$n] -= $prunefrequency;
304	11					9	$self->{'total_distinct_count'} -= 1;
305							}
306							elsif ($nextprunefrequency > $f)
307	4					4	{ $nextprunefrequency = $f }
308							}
309
310	4	100				8	return if $self->{'total_distinct_count'} <= $self->{'limit'};
311	3	50				5	die if $nextprunefrequency <= $prunefrequency;
312							} } } }
313
314							# Sorts keys according to the lexicographic order.
315							sub _keys_sorted {
316	47			47		46	my $self = shift;
317	47					40	my $n = shift;
318	47					33	my @k = keys(%{$self->{table}[$n]});
	47					167
319	47					73	my %k1 = ();
320	47					59	foreach my $k (@k) {
321							$k1{
322	479					489	join (' ', map { $self->{token}->[$_] } split(/ /, $k) )
	999					1488
323							} = $k;
324							}
325	47					60	@k = ();
326	47					185	foreach my $k (sort(keys(%k1))) {
327	479					445	push @k, $k1{$k};
328							}
329	47					191	return @k;
330							}
331
332							sub get_ngrams {
333	27			27	1	32	my $self = shift;
334	27					45	my (%params) = @_;
335	27	100				55	my $n = exists($params{'n'})? $params{'n'} : $self->{windowsize};
336	27	100				45	my $onlyfirst = exists($params{'onlyfirst'}) ? $params{'onlyfirst'} : '';
337	27	50				43	my $opt_normalize = exists($params{'normalize'}) ?$params{'normalize'} : '';
338
339	27					31	my $total = $self->{total}[$n]; my @keys = ();
	27					31
340	27	100	100			118	if (!exists($params{'orderby'}) or $params{'orderby'} eq 'ngram') {
		50
		50
341	20					47	@keys = $self->_keys_sorted($n);
342							} elsif ($params{'orderby'} eq 'none') {
343	0	0				0	die "onlyfirst requires order" if $onlyfirst;
344	0					0	@keys = keys(%{$self->{table}[$n]})
	0					0
345							}
346							elsif ($params{'orderby'} eq 'frequency') {
347	7					16	@keys = $self->_keys_sorted($n);
348	7					10	my %keysord = ();
349	7					22	for (my $i=0; $i<=$#keys; ++$i) { $keysord{$keys[$i]} = $i }
	55					104
350							@keys = sort { $self->{table}[$n]{$b} <=> $self->{table}[$n]{$a}
351	156	50				286	or $keysord{$a} <=> $keysord{$b} }
352	7					8	keys(%{$self->{table}[$n]});
	7					29
353							}
354	0					0	else { die }
355
356	27	100				62	@keys = splice(@keys,0,$onlyfirst) if $onlyfirst;
357
358	27					25	my @ret;
359	27					30	foreach my $ngram (@keys) {
360	262					226	my $count = $self->{table}[$n]{$ngram};
361	262	50				248	$count = ($opt_normalize ? ($count / $total ) : $count);
362	262					290	push @ret, $self->_encode_S($ngram), $count;
363							}
364
365	27					154	return @ret;
366							}
367
368							sub to_string {
369	11			11	1	50	my $self = shift;
370	11					31	my (%params) = @_;
371	11	50				35	my $n = exists($params{'n'})? $params{'n'} : $self->{windowsize};
372	11	100				31	my $onlyfirst = exists($params{'onlyfirst'}) ? $params{'onlyfirst'} : '';
373	11	50				27	my $opt_normalize = exists($params{'normalize'}) ?$params{'normalize'} : '';
374
375							#my $onlyfirst = exists($params{'onlyfirst'}) ?
376							#$params{'onlyfirst'} : '';
377							#delete $params{'onlyfirst'};
378
379	11	100				26	my $out = exists($params{'out'}) ? $params{'out'} : '';
380	11					15	delete $params{'out'};
381	11					12	my $outh = $out;
382	11	100	66			39	if ($out and (not ref($out))) {
383	1	50				2	local *FH; open(FH, ">$out") or die "cannot open $out:$!";
	1					68
384	1					4	$outh = *FH;
385							}
386
387							#my $opt_normalize = $params{'normalize'};
388							#delete $params{'normalize'};
389
390	11					15	my $spartan = $params{'spartan'};
391	11					26	delete $params{'spartan'};
392
393	11					16	my $ret='';
394	11	100				37	$ret = "BEGIN OUTPUT BY Text::Ngrams version $VERSION\n\n" unless $spartan;
395
396	11					33	foreach my $n (1 .. $self->{windowsize}) {
397	30	100	100			72	if ($spartan and $n < $self->{windowsize}) { next }
	4					2
398	26	100				44	if (! $spartan ) {
399	25					58	my $tmp = "$n-GRAMS (total count: $self->{total}[$n])";
400							$ret .= "$tmp\n" .
401							"FIRST N-GRAM: ". $self->_encode_S($self->{firstngram}[$n]).
402	25					66	"\n LAST N-GRAM: ".$self->_encode_S($self->{lastngram}[$n])."\n".
403							('-' x length($tmp)) . "\n";
404							}
405	26					40	my $total = $self->{total}[$n];
406
407	26					19	my @keys;
408	26	100	100			126	if (!exists($params{'orderby'}) or $params{'orderby'} eq 'ngram')
		50
		50
409	20					38	{ @keys = $self->_keys_sorted($n) }
410							elsif ($params{'orderby'} eq 'none') {
411	0	0				0	die "onlyfirst requires order" if $onlyfirst;
412	0					0	@keys = keys(%{$self->{table}[$n]})
	0					0
413							}
414							elsif ($params{'orderby'} eq 'frequency') {
415							@keys = sort { $self->{table}[$n]{$b} <=>
416	26					38	$self->{table}[$n]{$a} }
417	6					7	keys(%{$self->{table}[$n]});
	6					26
418							}
419	0					0	else { die }
420
421	26	100				56	@keys = splice(@keys,0,$onlyfirst) if $onlyfirst;
422
423	26					48	my %params1=%params; $params1{n}=$n;
	26					60
424	26					60	my @a = $self->get_ngrams(%params1);
425	26					71	for (my $i=0; $i<=$#a; $i+=2) {
426	229					164	my $ng = $a[$i]; my $f = $a[$i+1];
	229					162
427	229					374	$ret.="$ng\t$f\n";
428							}
429	26	100				45	if ($out) { print $outh $ret; $ret = '' }
	1					3
	1					3
430
431	26	100				94	$ret .= "\n" unless $spartan;
432							}
433
434	11	100				38	$ret .= "END OUTPUT BY Text::Ngrams\n" unless $spartan;
435
436	11	100				22	if ($out) {
437	1					2	print $outh $ret; $ret = '';
	1					1
438	1	50				7	close($outh) if not ref($out);
439							}
440
441	11					46	return $ret;
442							}
443
444							# http://web.cs.dal.ca/~vlado/srcperl/snip/decode_S
445							sub decode_S ( $ ) {
446	2			2	1	5	local $_ = shift;
447	2					3	my $out;
448
449	2					7	while (length($_) > 0) {
450	10	100				45	if (/^\\(\S)/) {
		50
		100
		50
		50
		50
451	4					5	$_ = $'; my $tmp = $1;
	4					7
452	4					3	$tmp =~ tr/0-5Aabtnvfroil6-9NSTcEseFGRUd/\x00-\x1F\x7F/;
453	4					10	$out .= $tmp;
454							}
455	0					0	elsif (/^\^_/) { $_ = $'; $out .= "\240" }
	0					0
456	2					3	elsif (/^\^(\S)/) { $_ = $'; $out .= pack('C',ord($1)+128); }
	2					23
457							elsif (/^\`(\S)/) {
458	0					0	$_ = $'; my $tmp = $1;
	0					0
459	0					0	$tmp =~ tr/0-5Aabtnvfroil6-9NSTcEseFGRUd/\x00-\x1F\x7F/;
460	0					0	$out .= pack('C', ord($tmp)+128);
461							}
462	0					0	elsif (/^_+/) { $_ = $'; my $tmp = $&; $tmp =~ tr/_/ /; $out .= $tmp; }
	0					0
	0					0
	0					0
463	4					6	elsif (/^[^\\^\`\s_]+/) { $_ = $'; $out .= $&; }
	4					11
464	0					0	else { die "decode_S unexpected:$_" }
465							}
466
467	2					8	return $out;
468							}
469
470							sub _encode_S {
471	312			312		230	my $self = shift;
472	312					235	my @r = ();
473	312					391	while (@_) {
474							push @r,
475	312					380	map { $self->{token_S}->[$_] } split(/ /, shift @_);
	648					1138
476							}
477	312					618	return join(' ', @r);
478							}
479
480							# http://www.cs.dal.ca/~vlado/srcperl/snip/encode_S
481							sub encode_S( $ ) {
482	100			100	1	104	local $_ = shift;
483
484	100					99	s/=/=0/g; # first hide a special character (=)
485	100					68	s/\\/=b/g; # encode backslashes
486
487	100					117	s/([\x80-\xFF])/=x$1/g; # replace >127 with 127
488	100					87	tr/\x80-\xFF/\x00-\x7F/;
489	100					74	s/=x=/=X/g; # hide again =
490
491	100					99	s/([\x00-\x1F\x5C\x5E-\x60\x7F])/=B$1/g;
492	100					67	tr/\x20\x00-\x1F\x7F/_0-5Aabtnvfroil6-9NSTcEseFGRUd/;
493
494	100					69	s/=x=B(\S)/`$1/g; # hex backslash
495	100					97	s/=x(\S)/^$1/g; # hex other
496	100					77	s/=B(\S)/\\$1/g; # backslashed
497	100					68	s/=b/\\\\/g; # original backslashes
498	100					66	s/=X/^=0/g;
499	100					57	s/=0/=/g; # put back =
500
501	100					158	return $_;
502							}
503
504							1;
505							__END__