File Coverage

blib/lib/Text/Ngrams.pm

Criterion	Covered	Total	%
statement	293	332	88.2
branch	117	158	74.0
condition	35	48	72.9
subroutine	22	23	95.6
pod	8	8	100.0
total	475	569	83.4

line	stmt	bran	cond	sub	pod	time	code
1							# (c) 2003-2022 Vlado Keselj http://web.cs.dal.ca/~vlado
2							#
3							# Text::Ngrams - A Perl module for N-grams processing
4
5							package Text::Ngrams;
6
7	9			9		119078	use strict;
	9					31
	9					279
8							require Exporter;
9	9			9		37	use Carp;
	9					13
	9					599
10	9			9		70	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	9					15
	9					1137
11							@ISA = qw(Exporter);
12							%EXPORT_TAGS = ( 'all' => [ qw(new encode_S decode_S) ] );
13							@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
14							@EXPORT = qw(new);
15							$VERSION = '2.007';
16
17	9			9		58	use vars qw($Version);
	9					15
	9					429
18							$Version = $VERSION;
19
20	9			9		48	use vars @EXPORT_OK;
	9					13
	9					232
21	9			9		39	use vars qw(); # non-exported package globals go here
	9					12
	9					2737
22
23							sub new {
24	12			12	1	11808	my $package = shift;
25	12		33			84	$package = ref($package) \|\| $package;
26	12					25	my $self = {};
27
28	12					43	my (%params) = @_;
29
30							$self->{windowsize} = exists($params{windowsize}) ?
31	12	100				54	$params{windowsize} : 3;
32							die "nonpositive window size: $self->{windowsize}"
33	12	50				42	unless $self->{windowsize} > 0;
34	12					28	delete($params{windowsize});
35
36	12	100	66			198	if (! exists($params{type}) or $params{type} eq 'character') {
		50
		50
		100
		50
37	3					9	$self->{skiprex} = '';
38	3					17	$self->{tokenrex} = qr/([a-zA-Z]\|[^a-zA-Z]+)/;
39	3			30		20	$self->{processtoken} = sub { s/[^a-zA-Z]+/ /; $_ = uc $_ };
	30					48
	30					41
40	3					9	$self->{allow_iproc} = 1;
41							}
42							elsif ($params{type} eq 'utf8') {
43	0					0	$self->{skiprex} = '';
44	0					0	$self->{tokenrex} = qr/([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]
45							\|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]
46							\|[\xC2-\xDF][\x80-\xBF]
47							\|[\x00-\xFF])/x;
48	0					0	$self->{processtoken} = '';
49							}
50							# MJ ->
51							# This type is analogous to the "character" type but defined for the utf8
52							# characters
53							elsif ($params{type} eq 'utf8_character') {
54							# $self->{inputlayer}
55							# input layer to be put on the input stream by the function binmode
56							# before reading from a given stream and to be removed by
57							# *binmode HANDLE,":pop"* after the reading from the particular
58							# stream is done has to be a real layer (like ":encoding(utf8)"), not a
59							# pseudo layer (like ":utf8") so that the psuedo layer ":pop" is able to
60							# remove this input layer
61
62	0					0	$self->{inputlayer} = ':encoding(utf8)';
63							# this will automatically decode input text from utf8 into Perl internal
64							# reporesentation of Unicode strings and so the regular expressions for
65							# Unicode as well as the uc function can be performed on them
66
67	0					0	$self->{skiprex} = '';
68
69	9			9		4757	$self->{tokenrex} = qr/(\p{IsAlpha}\|\P{IsAlpha}+)/;
	9					116
	9					116
	0					0
70
71	0			0		0	$self->{processtoken} = sub { s/\P{IsAlpha}+/ /; $_ = uc $_ ;
	0					0
72	0					0	$_ = Encode::encode_utf8( $_ ); };
	0					0
73							# the last operation *$_=Encode::encode_utf8( $_ )* is necessary
74							# to go back to utf8 encoding from the internal Perl representation
75							# so that for the output the n-grams are in utf8 (encoded by encode_S though)
76
77	0					0	$self->{allow_iproc} = 0;
78							# allow_iproc has to be 0. Otherwise the last token in the read block will
79							# be preprocessed and encoded in utf8,
80							# and then attached at the beginning of the next block read from input,
81							# which will be in the internal Perl representation
82							}
83							#MJ <-
84							elsif ($params{type} eq 'byte') {
85	4					12	$self->{skiprex} = '';
86	4					9	$self->{tokenrex} = '';
87	4					11	$self->{processtoken} = '';
88							}
89							elsif ($params{type} eq 'word') {
90	5					23	$self->{skiprex} = qr/[^a-zA-Z0-9]+/;
91	5					11	$self->{skipinsert} = ' ';
92							$self->{tokenrex} =
93	5					15	qr/([a-zA-Z]+\|(\d+(\.\d+)?\|\d*\.\d+)([eE][-+]?\d+)?)/;
94							$self->{processtoken} = sub
95	47			47		117	{ s/(\d+(\.\d+)?\|\d*\.\d+)([eE][-+]?\d+)?// }
96	5					22	}
97	0					0	else { die "unknown type: $params{type}" }
98	12					22	delete($params{type});
99
100	12					31	$self->{'table'} = [ ];
101	12					27	$self->{'total'} = [ ];
102	12					30	$self->{'total_distinct_count'} = 0;
103	12					50	$self->{'lastngram'} = [ ];
104	12					22	$self->{'next_token_id'} = 0;
105	12					29	$self->{'token_dict'} = { };
106	12					23	$self->{'token_S'} = [ ];
107	12					25	$self->{'token'} = [ ];
108
109	12					45	foreach my $i ( 1 .. $self->{windowsize} ) {
110	33					51	$self->{table}[$i] = { };
111	33					52	$self->{total}[$i] = 0;
112	33					56	$self->{firstngram}[$i] = '';
113	33					58	$self->{lastngram}[$i] = '';
114							}
115
116	12	100				43	if (exists($params{'limit'})) {
117	1	50				3	die "limit=$params{'limit'}" if $params{'limit'} < 1;
118	1					2	$self->{'limit'} = $params{'limit'};
119							}
120	12					23	delete($params{'limit'});
121
122	12	50				33	die "unknown parameters:".join(' ', %params) if %params;
123
124	12					24	bless($self, $package);
125	12					91	return $self;
126							}
127
128							sub feed_tokens {
129	24			24	1	53	my $self = shift;
130							# count all n-grams sizes starting from max to 1
131	24					45	foreach my $t1 (@_) {
132	204					232	my $t = $t1;
133	204	100				298	if (defined($self->{token_dict}->{$t})) {
134	106					118	$t = $self->{token_dict}->{$t};
135							} else {
136	98					126	my $id = $self->{next_token_id}++;
137	98					140	$self->{token_S}->[$id] = &encode_S($t);
138	98					145	$self->{token}->[$id] = $t;
139	98					185	$t = $self->{token_dict}->{$t} = $id;
140							}
141	204					369	for (my $n=$self->{windowsize}; $n > 0; $n--) {
142	571	100				742	if ($n > 1) {
143	367	100				593	next unless $self->{lastngram}[$n-1] ne '';
144	335					548	$self->{lastngram}[$n] = $self->{lastngram}[$n-1] .
145							' ' . $t;
146	204					239	} else { $self->{lastngram}[$n] = $t }
147
148	539	100				1077	if ( ($self->{table}[$n]{$self->{lastngram}[$n]} += 1)==1)
149	330					354	{ $self->{'total_distinct_count'} += 1 }
150
151	539					550	$self->{'total'}[$n] += 1;
152	539	100				991	if ($self->{'firstngram'}[$n] eq '')
153	31					81	{ $self->{'firstngram'}[$n] = $self->{lastngram}[$n] }
154							}
155							}
156	24	100	100			88	if (exists($self->{'limit'}) and
157							$self->{'total_distinct_count'} > 2 * $self->{'limit'})
158	1					3	{ $self->_reduce_to_limit }
159							}
160
161							sub process_text {
162	4			4	1	15	my $self = shift;
163	4					12	$self->_process_text(0, @_);
164	4	50				14	if (exists($self->{'limit'})) { $self->_reduce_to_limit }
	0					0
165							}
166
167							sub _process_text {
168	18			18		29	my $self = shift;
169	18					26	my $cont = shift; # the minimal number of chars left for
170							# continuation (the new-line problem, and the
171							# problem with too long lines)
172							# The remainder of unprocessed string is
173							# returned.
174	18	50				43	if ($cont < 0) { $cont = 0 }
	0					0
175
176	18	50	100			114	if ( # type is byte
			66
			66
177							$self->{skiprex} eq '' and
178							$self->{tokenrex} eq '' and
179							$self->{processtoken} eq '' and
180							$cont == 0
181							)
182	6					20	{ return $self->_process_text_byte(@_) }
183
184	12					22	my (@tokens);
185							my $text;
186	12					28	while (@_) {
187	12					26	$text .= shift @_;
188	12					28	while (length($text) > 0) {
189	86					97	my $textl = $text;
190	86					92	my $skip = '';
191	86	100	100			397	if ($self->{skiprex} ne '' && $textl =~ /^$self->{skiprex}/)
192	54					102	{ $skip = $&; $textl = $'; }
	54					68
193	86	100				161	if (defined($self->{skipinsert})) {
194	56					70	$skip = $self->{skipinsert};
195	56					77	$text = $skip.$textl;
196							}
197	86	100				127	if (length($textl) < $cont) { last }
	4					13
198	82	100				117	if (length($textl)==0) { $text=$textl; last; }
	5					7
	5					12
199
200	77					80	local $_;
201	77	50				118	if ($self->{tokenrex} ne '') {
202	77	50				455	if ($textl =~ /^$self->{tokenrex}/)
203	77					137	{ $_ = $&; $textl = $'; }
	77					124
204							}
205							else
206	0					0	{ $_ = substr($textl, 0, 1); $textl = substr($textl, 1) }
	0					0
207	77	50				127	last if $_ eq '';
208
209	77	100				107	if (length($textl) < $cont) {
210	1	50	33			18	if (defined($self->{allow_iproc}) && $self->{allow_iproc}
			33
211							&& ref($self->{processtoken}) eq 'CODE')
212	1					3	{ &{$self->{processtoken}} }
	1					4
213	1					4	$text = $skip.$_.$textl;
214	1					5	last;
215							}
216	76	50				132	if (ref($self->{processtoken}) eq 'CODE')
217	76					91	{ &{$self->{processtoken}} }
	76					101
218	76					120	push @tokens, $_;
219	76					149	$text = $textl;
220							}
221							}
222	12					54	$self->feed_tokens(@tokens);
223	12					32	return $text;
224							}
225
226							sub _process_text_byte {
227	6			6		9	my $self = shift;
228
229	6					20	for (my $i=0; $i<=$#_; ++$i) {
230	6					44	my @a = split('', $_[$i]);
231	6	100				17	next if $#a==-1;
232	4					18	$self->feed_tokens( @a );
233							}
234	6					16	return '';
235							}
236
237							sub process_files {
238	7			7	1	36	my $self = shift;
239
240							#MJ ->
241	7					13	my $input_layer='';
242	7	50				46	if (defined($self->{inputlayer})) {$input_layer=$self->{inputlayer};}
	0					0
243							#MJ <-
244
245	7					17	foreach my $f (@_) {
246	7					9	my $f1;
247	7					20	local *F;
248	7	50				18	if (not ref($f))
249	7	50				258	{ open(F, "$f") or die "cannot open $f:$!"; $f1 = *F }
	7					42
250	0					0	else { $f1 = $f }
251	7					27	binmode $f1; # avoid text mode
252
253							#MJ ->
254							#put the encoding layer on the input when requested
255	7	50				25	if ($input_layer ne '') {
256	0					0	binmode $f1, $input_layer;
257							}
258							#MJ <-
259
260	7					19	my ($text, $text_l, $cont) = ('', 0, 1);
261	7	100	100			47	if ( # type is byte
			66
262							$self->{skiprex} eq '' and
263							$self->{tokenrex} eq '' and
264							$self->{processtoken} eq ''
265							)
266	2					7	{ $cont = 0 }
267
268	7					11	while (1) {
269	14					21	$text_l = length($text);
270	14					245	read($f1, $text, 1024, length($text));
271	14	100				51	last if length($text) <= $text_l;
272	7					29	$text = $self->_process_text($cont, $text);
273							}
274	7					17	$text = $self->_process_text(0, $text);
275
276							#MJ ->
277							#remove the encoding layer from the input stream if it was added
278							#Caution: here is what the Perl documentation says about the pseudo layer ":pop"
279							#"Should be considered as experimental. (...) A more elegant (and safer) interface is needed."
280	7	50				19	if ($input_layer ne '') {
281	0					0	binmode $f1,":pop";
282							}
283							#MJ <-
284
285	7	50				120	close($f1) if not ref($f);
286	7	100				62	if (exists($self->{'limit'})) { $self->_reduce_to_limit }
	1					3
287							}
288							}
289
290							sub _reduce_to_limit {
291	2			2		2	my $self = shift;
292							return unless exists($self->{'limit'}) and
293	2	50	33			7	$self->{'limit'} > 0;
294
295	2					8	while ($self->{'total_distinct_count'} > $self->{'limit'}) {
296	1					1	my $nextprunefrequency = 0;
297	1					2	for (my $prunefrequency=1;; $prunefrequency = $nextprunefrequency) {
298	2					2	$nextprunefrequency = $self->{'total'}[1];
299
300	2					3	foreach my $n (1 .. $self->{'windowsize'}) {
301
302	4					3	my @keys = keys(%{$self->{table}[$n]});
	4					12
303	4					4	foreach my $ngram (@keys) {
304	22					23	my $f = $self->{table}[$n]{$ngram};
305	22	100				30	if ($f <= $prunefrequency) {
		100
306	11					15	delete $self->{'table'}[$n]{$ngram};
307	11					10	$self->{'total'}[$n] -= $prunefrequency;
308	11					27	$self->{'total_distinct_count'} -= 1;
309							}
310							elsif ($nextprunefrequency > $f)
311	3					4	{ $nextprunefrequency = $f }
312							}
313
314	4	100				6	return if $self->{'total_distinct_count'} <= $self->{'limit'};
315	3	50				8	die if $nextprunefrequency <= $prunefrequency;
316							} } } }
317
318							# Sorts keys according to the lexicographic order.
319							sub _keys_sorted {
320	47			47		57	my $self = shift;
321	47					55	my $n = shift;
322	47					55	my @k = keys(%{$self->{table}[$n]});
	47					174
323	47					67	my %k1 = ();
324	47					67	foreach my $k (@k) {
325							$k1{
326	479					732	join (' ', map { $self->{token}->[$_] } split(/ /, $k) )
	999					1905
327							} = $k;
328							}
329	47					81	@k = ();
330	47					230	foreach my $k (sort(keys(%k1))) {
331	479					623	push @k, $k1{$k};
332							}
333	47					196	return @k;
334							}
335
336							sub get_ngrams {
337	27			27	1	45	my $self = shift;
338	27					64	my (%params) = @_;
339	27	100				67	my $n = exists($params{'n'})? $params{'n'} : $self->{windowsize};
340	27	100				56	my $onlyfirst = exists($params{'onlyfirst'}) ? $params{'onlyfirst'} : '';
341	27	50				50	my $opt_normalize = exists($params{'normalize'}) ?$params{'normalize'} : '';
342
343	27					43	my $total = $self->{total}[$n]; my @keys = ();
	27					38
344	27	100	100			114	if (!exists($params{'orderby'}) or $params{'orderby'} eq 'ngram') {
		50
		50
345	20					41	@keys = $self->_keys_sorted($n);
346							} elsif ($params{'orderby'} eq 'none') {
347	0	0				0	die "onlyfirst requires order" if $onlyfirst;
348	0					0	@keys = keys(%{$self->{table}[$n]})
	0					0
349							}
350							elsif ($params{'orderby'} eq 'frequency') {
351	7					17	@keys = $self->_keys_sorted($n);
352	7					11	my %keysord = ();
353	7					19	for (my $i=0; $i<=$#keys; ++$i) { $keysord{$keys[$i]} = $i }
	55					91
354							@keys = sort { $self->{table}[$n]{$b} <=> $self->{table}[$n]{$a}
355	151	50				250	or $keysord{$a} <=> $keysord{$b} }
356	7					9	keys(%{$self->{table}[$n]});
	7					27
357							}
358	0					0	else { die }
359
360	27	100				60	@keys = splice(@keys,0,$onlyfirst) if $onlyfirst;
361
362	27					31	my @ret;
363	27					43	foreach my $ngram (@keys) {
364	262					338	my $count = $self->{table}[$n]{$ngram};
365	262	50				309	$count = ($opt_normalize ? ($count / $total ) : $count);
366	262					348	push @ret, $self->_encode_S($ngram), $count;
367							}
368
369	27					133	return @ret;
370							}
371
372							sub to_string {
373	11			11	1	67	my $self = shift;
374	11					43	my (%params) = @_;
375	11	50				41	my $n = exists($params{'n'})? $params{'n'} : $self->{windowsize};
376	11	100				33	my $onlyfirst = exists($params{'onlyfirst'}) ? $params{'onlyfirst'} : '';
377	11	50				30	my $opt_normalize = exists($params{'normalize'}) ?$params{'normalize'} : '';
378
379							#my $onlyfirst = exists($params{'onlyfirst'}) ?
380							#$params{'onlyfirst'} : '';
381							#delete $params{'onlyfirst'};
382
383	11	100				28	my $out = exists($params{'out'}) ? $params{'out'} : '';
384	11					19	delete $params{'out'};
385	11					16	my $outh = $out;
386	11	100	66			33	if ($out and (not ref($out))) {
387	1	50				2	local *FH; open(FH, ">$out") or die "cannot open $out:$!";
	1					61
388	1					6	$outh = *FH;
389							}
390
391							#my $opt_normalize = $params{'normalize'};
392							#delete $params{'normalize'};
393
394	11					24	my $spartan = $params{'spartan'};
395	11					16	delete $params{'spartan'};
396
397	11					94	my $ret='';
398	11	100				50	$ret = "BEGIN OUTPUT BY Text::Ngrams version $VERSION\n\n" unless $spartan;
399
400	11					37	foreach my $n (1 .. $self->{windowsize}) {
401	30	100	100			76	if ($spartan and $n < $self->{windowsize}) { next }
	4					5
402	26	100				43	if (! $spartan ) {
403	25					70	my $tmp = "$n-GRAMS (total count: $self->{total}[$n])";
404							$ret .= "$tmp\n" .
405							"FIRST N-GRAM: ". $self->_encode_S($self->{firstngram}[$n]).
406	25					80	"\n LAST N-GRAM: ".$self->_encode_S($self->{lastngram}[$n])."\n".
407							('-' x length($tmp)) . "\n";
408							}
409	26					50	my $total = $self->{total}[$n];
410
411	26					32	my @keys;
412	26	100	100			112	if (!exists($params{'orderby'}) or $params{'orderby'} eq 'ngram')
		50
		50
413	20					57	{ @keys = $self->_keys_sorted($n) }
414							elsif ($params{'orderby'} eq 'none') {
415	0	0				0	die "onlyfirst requires order" if $onlyfirst;
416	0					0	@keys = keys(%{$self->{table}[$n]})
	0					0
417							}
418							elsif ($params{'orderby'} eq 'frequency') {
419							@keys = sort { $self->{table}[$n]{$b} <=>
420	27					47	$self->{table}[$n]{$a} }
421	6					8	keys(%{$self->{table}[$n]});
	6					25
422							}
423	0					0	else { die }
424
425	26	100				59	@keys = splice(@keys,0,$onlyfirst) if $onlyfirst;
426
427	26					65	my %params1=%params; $params1{n}=$n;
	26					45
428	26					68	my @a = $self->get_ngrams(%params1);
429	26					71	for (my $i=0; $i<=$#a; $i+=2) {
430	229					253	my $ng = $a[$i]; my $f = $a[$i+1];
	229					244
431	229					441	$ret.="$ng\t$f\n";
432							}
433	26	100				53	if ($out) { print $outh $ret; $ret = '' }
	1					3
	1					2
434
435	26	100				88	$ret .= "\n" unless $spartan;
436							}
437
438	11	100				36	$ret .= "END OUTPUT BY Text::Ngrams\n" unless $spartan;
439
440	11	100				26	if ($out) {
441	1					36	print $outh $ret; $ret = '';
	1					4
442	1	50				13	close($outh) if not ref($out);
443							}
444
445	11					55	return $ret;
446							}
447
448							# http://web.cs.dal.ca/~vlado/srcperl/snip/decode_S
449							sub decode_S ( $ ) {
450	2			2	1	7	local $_ = shift;
451	2					4	my $out;
452
453	2					10	while (length($_) > 0) {
454	10	100				59	if (/^\\(\S)/) {
		50
		100
		50
		50
		50
455	4					28	$_ = $'; my $tmp = $1;
	4					15
456	4					7	$tmp =~ tr/0-5Aabtnvfroil6-9NSTcEseFGRUd/\x00-\x1F\x7F/;
457	4					8	$out .= $tmp;
458							}
459	0					0	elsif (/^\^_/) { $_ = $'; $out .= "\240" }
	0					0
460	2					5	elsif (/^\^(\S)/) { $_ = $'; $out .= pack('C',ord($1)+128); }
	2					14
461							elsif (/^\`(\S)/) {
462	0					0	$_ = $'; my $tmp = $1;
	0					0
463	0					0	$tmp =~ tr/0-5Aabtnvfroil6-9NSTcEseFGRUd/\x00-\x1F\x7F/;
464	0					0	$out .= pack('C', ord($tmp)+128);
465							}
466	0					0	elsif (/^_+/) { $_ = $'; my $tmp = $&; $tmp =~ tr/_/ /; $out .= $tmp; }
	0					0
	0					0
	0					0
467	4					10	elsif (/^[^\\^\`\s_]+/) { $_ = $'; $out .= $&; }
	4					12
468	0					0	else { die "decode_S unexpected:$_" }
469							}
470
471	2					9	return $out;
472							}
473
474							sub _encode_S {
475	312			312		328	my $self = shift;
476	312					323	my @r = ();
477	312					441	while (@_) {
478							push @r,
479	312					505	map { $self->{token_S}->[$_] } split(/ /, shift @_);
	648					1517
480							}
481	312					773	return join(' ', @r);
482							}
483
484							# http://www.cs.dal.ca/~vlado/srcperl/snip/encode_S
485							sub encode_S( $ ) {
486	100			100	1	144	local $_ = shift;
487
488	100					141	s/=/=0/g; # first hide a special character (=)
489	100					407	s/\\/=b/g; # encode backslashes
490
491	100					161	s/([\x80-\xFF])/=x$1/g; # replace >127 with 127
492	100					134	tr/\x80-\xFF/\x00-\x7F/;
493	100					118	s/=x=/=X/g; # hide again =
494
495	100					148	s/([\x00-\x1F\x5C\x5E-\x60\x7F])/=B$1/g;
496	100					107	tr/\x20\x00-\x1F\x7F/_0-5Aabtnvfroil6-9NSTcEseFGRUd/;
497
498	100					110	s/=x=B(\S)/`$1/g; # hex backslash
499	100					119	s/=x(\S)/^$1/g; # hex other
500	100					126	s/=B(\S)/\\$1/g; # backslashed
501	100					104	s/=b/\\\\/g; # original backslashes
502	100					137	s/=X/^=0/g;
503	100					97	s/=0/=/g; # put back =
504
505	100					206	return $_;
506							}
507
508							1;
509							__END__