File Coverage

blib/lib/Text/Ngrams.pm

Criterion	Covered	Total	%
statement	292	331	88.2
branch	117	158	74.0
condition	35	48	72.9
subroutine	22	23	95.6
pod	8	8	100.0
total	474	568	83.4

line	stmt	bran	cond	sub	pod	time	code
1							# (c) 2003-2017 Vlado Keselj http://web.cs.dal.ca/~vlado
2							#
3							# Text::Ngrams - A Perl module for N-grams processing
4
5							package Text::Ngrams;
6
7	9			9		29545	use strict;
	9					12
	9					236
8							require Exporter;
9	9			9		27	use Carp;
	9					9
	9					464
10	9			9		39	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	9					8
	9					938
11							@ISA = qw(Exporter);
12							%EXPORT_TAGS = ( 'all' => [ qw(new encode_S decode_S) ] );
13							@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
14							@EXPORT = qw(new);
15							$VERSION = '2.005';
16
17	9			9		33	use vars qw($Version);
	9					9
	9					291
18							$Version = $VERSION;
19
20	9			9		24	use vars @EXPORT_OK;
	9					11
	9					134
21	9			9		36	use vars qw(); # non-exported package globals go here
	9					8
	9					2151
22
23							sub new {
24	12			12	1	9403	my $package = shift;
25	12		33			69	$package = ref($package) \|\| $package;
26	12					18	my $self = {};
27
28	12					31	my (%params) = @_;
29
30							$self->{windowsize} = exists($params{windowsize}) ?
31	12	100				48	$params{windowsize} : 3;
32	12	50				31	die "nonpositive window size: $self->{windowsize}" unless $self->{windowsize} > 0;
33	12					19	delete($params{windowsize});
34
35	12	100	66			103	if (! exists($params{type}) or $params{type} eq 'character') {
		50
		50
		100
		50
36	3					8	$self->{skiprex} = '';
37	3					13	$self->{tokenrex} = qr/([a-zA-Z]\|[^a-zA-Z]+)/;
38	3			30		11	$self->{processtoken} = sub { s/[^a-zA-Z]+/ /; $_ = uc $_ };
	30					34
	30					32
39	3					6	$self->{allow_iproc} = 1;
40							}
41							elsif ($params{type} eq 'utf8') {
42	0					0	$self->{skiprex} = '';
43	0					0	$self->{tokenrex} = qr/([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]
44							\|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]
45							\|[\xC2-\xDF][\x80-\xBF]
46							\|[\x00-\xFF])/x;
47	0					0	$self->{processtoken} = '';
48							}
49							#MJ ->
50							#this type is analogous to the "character" type but defined for utf8 characters
51							elsif ($params{type} eq 'utf8_character') {
52							#$self->{inputlayer}
53							#input layer to be put on the input stream by the function binmode before reading from a given stream
54							#and to be removed by *binmode HANDLE,":pop"* after the reading from the particular stream is done
55							#has to be a real layer (like ":encoding(utf8)"), not a pseudo layer (like ":utf8")
56							#so that the psuedo layer ":pop" is able to remove this input layer
57
58	0					0	$self->{inputlayer}=':encoding(utf8)';
59							#this will automatically decode input text from utf8 into Perl internal reporesentation of Unicode strings
60							#and so the regular expressions for Unicode as well as the uc function can be performed on them
61
62
63	0					0	$self->{skiprex} = '';
64
65	9			9		4577	$self->{tokenrex} = qr/(\p{IsAlpha}\|\P{IsAlpha}+)/;
	9					71
	9					118
	0					0
66
67	0			0		0	$self->{processtoken} = sub { s/\P{IsAlpha}+/ /; $_ = uc $_ ; $_=Encode::encode_utf8( $_ );};
	0					0
	0					0
	0					0
68							#the last operation *$_=Encode::encode_utf8( $_ )* is necessary
69							#to go back to utf8 encoding from the internal Perl representation
70							#so that for the output the n-grams are in utf8 (encoded by encode_S though)
71
72	0					0	$self->{allow_iproc} = 0;
73							# allow_iproc has to be 0. Otherwise the last token in the read block will
74							# be preprocessed and encoded in utf8,
75							# and then attached at the beginning of the next block read from input,
76							# which will be in the internal Perl representation
77							}
78							#MJ <-
79							elsif ($params{type} eq 'byte') {
80	4					8	$self->{skiprex} = '';
81	4					5	$self->{tokenrex} = '';
82	4					7	$self->{processtoken} = '';
83							}
84							elsif ($params{type} eq 'word') {
85	5					21	$self->{skiprex} = qr/[^a-zA-Z0-9]+/;
86	5					8	$self->{skipinsert} = ' ';
87	5					13	$self->{tokenrex} = qr/([a-zA-Z]+\|(\d+(\.\d+)?\|\d*\.\d+)([eE][-+]?\d+)?)/;
88	47			47		105	$self->{processtoken} = sub { s/(\d+(\.\d+)?\|\d*\.\d+)([eE][-+]?\d+)?// }
89	5					18	}
90	0					0	else { die "unknown type: $params{type}" }
91	12					13	delete($params{type});
92
93	12					20	$self->{'table'} = [ ];
94	12					16	$self->{'total'} = [ ];
95	12					21	$self->{'total_distinct_count'} = 0;
96	12					18	$self->{'lastngram'} = [ ];
97	12					15	$self->{'next_token_id'} = 0;
98	12					19	$self->{'token_dict'} = { };
99	12					15	$self->{'token_S'} = [ ];
100	12					16	$self->{'token'} = [ ];
101
102	12					36	foreach my $i ( 1 .. $self->{windowsize} ) {
103	33					40	$self->{table}[$i] = { };
104	33					29	$self->{total}[$i] = 0;
105	33					40	$self->{firstngram}[$i] = '';
106	33					39	$self->{lastngram}[$i] = '';
107							}
108
109	12	100				29	if (exists($params{'limit'})) {
110	1	50				4	die "limit=$params{'limit'}" if $params{'limit'} < 1;
111	1					20	$self->{'limit'} = $params{'limit'};
112							}
113	12					18	delete($params{'limit'});
114
115	12	50				81	die "unknown parameters:".join(' ', %params) if %params;
116
117	12					37	bless($self, $package);
118	12					31	return $self;
119							}
120
121							sub feed_tokens {
122	24			24	1	38	my $self = shift;
123							# count all n-grams sizes starting from max to 1
124	24					31	foreach my $t1 (@_) {
125	204					140	my $t = $t1;
126	204	100				238	if (defined($self->{token_dict}->{$t})) {
127	106					78	$t = $self->{token_dict}->{$t};
128							} else {
129	98					84	my $id = $self->{next_token_id}++;
130	98					106	$self->{token_S}->[$id] = &encode_S($t);
131	98					84	$self->{token}->[$id] = $t;
132	98					123	$t = $self->{token_dict}->{$t} = $id;
133							}
134	204					288	for (my $n=$self->{windowsize}; $n > 0; $n--) {
135	571	100				506	if ($n > 1) {
136	367	100				514	next unless $self->{lastngram}[$n-1] ne '';
137	335					356	$self->{lastngram}[$n] = $self->{lastngram}[$n-1] .
138							' ' . $t;
139	204					153	} else { $self->{lastngram}[$n] = $t }
140
141	539	100				855	if ( ($self->{table}[$n]{$self->{lastngram}[$n]} += 1)==1)
142	330					218	{ $self->{'total_distinct_count'} += 1 }
143
144	539					388	$self->{'total'}[$n] += 1;
145	539	100				967	if ($self->{'firstngram'}[$n] eq '')
146	31					65	{ $self->{'firstngram'}[$n] = $self->{lastngram}[$n] }
147							}
148							}
149	24	100	100			78	if (exists($self->{'limit'}) and
150							$self->{'total_distinct_count'} > 2 * $self->{'limit'})
151	1					4	{ $self->_reduce_to_limit }
152							}
153
154							sub process_text {
155	4			4	1	15	my $self = shift;
156	4					11	$self->_process_text(0, @_);
157	4	50				17	if (exists($self->{'limit'})) { $self->_reduce_to_limit }
	0					0
158							}
159
160							sub _process_text {
161	18			18		21	my $self = shift;
162	18					16	my $cont = shift; # the minimal number of chars left for
163							# continuation (the new-line problem, and the
164							# problem with too long lines)
165							# The remainder of unprocessed string is
166							# returned.
167	18	50				30	if ($cont < 0) { $cont = 0 }
	0					0
168
169	18	50	100			102	if ( # type is byte
			66
			66
170							$self->{skiprex} eq '' and
171							$self->{tokenrex} eq '' and
172							$self->{processtoken} eq '' and
173							$cont == 0
174							)
175	6					15	{ return $self->_process_text_byte(@_) }
176
177	12					11	my (@tokens);
178							my $text;
179	12					21	while (@_) {
180	12					21	$text .= shift @_;
181	12					21	while (length($text) > 0) {
182	86					70	my $textl = $text;
183	86					54	my $skip = '';
184	86	100	100			401	if ($self->{skiprex} ne '' && $textl =~ /^$self->{skiprex}/)
185	54					55	{ $skip = $&; $textl = $'; }
	54					51
186	86	100				114	if (defined($self->{skipinsert})) {
187	56					38	$skip = $self->{skipinsert};
188	56					59	$text = $skip.$textl;
189							}
190	86	100				102	if (length($textl) < $cont) { last }
	4					12
191	82	100				93	if (length($textl)==0) { $text=$textl; last; }
	5					6
	5					11
192
193	77					53	local $_;
194	77	50				91	if ($self->{tokenrex} ne '') {
195	77	50				392	if ($textl =~ /^$self->{tokenrex}/)
196	77					82	{ $_ = $&; $textl = $'; }
	77					90
197							}
198							else
199	0					0	{ $_ = substr($textl, 0, 1); $textl = substr($textl, 1) }
	0					0
200	77	50				170	last if $_ eq '';
201
202	77	100				94	if (length($textl) < $cont) {
203	1	50	33			9	if (defined($self->{allow_iproc}) && $self->{allow_iproc}
			33
204							&& ref($self->{processtoken}) eq 'CODE')
205	1					1	{ &{$self->{processtoken}} }
	1					5
206	1					2	$text = $skip.$_.$textl;
207	1					3	last;
208							}
209	76	50				122	if (ref($self->{processtoken}) eq 'CODE')
210	76					40	{ &{$self->{processtoken}} }
	76					93
211	76					92	push @tokens, $_;
212	76					118	$text = $textl;
213							}
214							}
215	12					26	$self->feed_tokens(@tokens);
216	12					26	return $text;
217							}
218
219							sub _process_text_byte {
220	6			6		5	my $self = shift;
221
222	6					14	for (my $i=0; $i<=$#_; ++$i) {
223	6					25	my @a = split('', $_[$i]);
224	6	100				14	next if $#a==-1;
225	4					9	$self->feed_tokens( @a );
226							}
227	6					11	return '';
228							}
229
230							sub process_files {
231	7			7	1	49	my $self = shift;
232
233							#MJ ->
234	7					8	my $input_layer='';
235	7	50				33	if (defined($self->{inputlayer})) {$input_layer=$self->{inputlayer};}
	0					0
236							#MJ <-
237
238	7					12	foreach my $f (@_) {
239	7					5	my $f1;
240	7					11	local *F;
241	7	50				15	if (not ref($f))
242	7	50				190	{ open(F, "$f") or die "cannot open $f:$!"; $f1 = *F }
	7					18
243	0					0	else { $f1 = $f }
244
245							#MJ ->
246							#put the encoding layer on the input when requested
247	7	50				16	if ($input_layer ne '') {
248	0					0	binmode $f1, $input_layer;
249							}
250							#MJ <-
251
252	7					10	my ($text, $text_l, $cont) = ('', 0, 1);
253	7	100	100			38	if ( # type is byte
			66
254							$self->{skiprex} eq '' and
255							$self->{tokenrex} eq '' and
256							$self->{processtoken} eq ''
257							)
258	2					2	{ $cont = 0 }
259
260	7					7	while (1) {
261	14					13	$text_l = length($text);
262	14					97	read($f1, $text, 1024, length($text));
263	14	100				28	last if length($text) <= $text_l;
264	7					15	$text = $self->_process_text($cont, $text);
265							}
266	7					12	$text = $self->_process_text(0, $text);
267
268							#MJ ->
269							#remove the encoding layer from the input stream if it was added
270							#Caution: here is what the Perl documentation says about the pseudo layer ":pop"
271							#"Should be considered as experimental. (...) A more elegant (and safer) interface is needed."
272	7	50				12	if ($input_layer ne '') {
273	0					0	binmode $f1,":pop";
274							}
275							#MJ <-
276
277	7	50				56	close($f1) if not ref($f);
278	7	100				32	if (exists($self->{'limit'})) { $self->_reduce_to_limit }
	1					1
279							}
280							}
281
282							sub _reduce_to_limit {
283	2			2		2	my $self = shift;
284							return unless exists($self->{'limit'}) and
285	2	50	33			8	$self->{'limit'} > 0;
286
287	2					8	while ($self->{'total_distinct_count'} > $self->{'limit'}) {
288	1					1	my $nextprunefrequency = 0;
289	1					1	for (my $prunefrequency=1;; $prunefrequency = $nextprunefrequency) {
290	2					1	$nextprunefrequency = $self->{'total'}[1];
291
292	2					3	foreach my $n (1 .. $self->{'windowsize'}) {
293
294	4					4	my @keys = keys(%{$self->{table}[$n]});
	4					11
295	4					5	foreach my $ngram (@keys) {
296	22					16	my $f = $self->{table}[$n]{$ngram};
297	22	100				26	if ($f <= $prunefrequency) {
		100
298	11					9	delete $self->{'table'}[$n]{$ngram};
299	11					6	$self->{'total'}[$n] -= $prunefrequency;
300	11					10	$self->{'total_distinct_count'} -= 1;
301							}
302							elsif ($nextprunefrequency > $f)
303	4					5	{ $nextprunefrequency = $f }
304							}
305
306	4	100				8	return if $self->{'total_distinct_count'} <= $self->{'limit'};
307	3	50				5	die if $nextprunefrequency <= $prunefrequency;
308							} } } }
309
310							# Sorts keys according to the lexicographic order.
311							sub _keys_sorted {
312	47			47		40	my $self = shift;
313	47					32	my $n = shift;
314	47					32	my @k = keys(%{$self->{table}[$n]});
	47					156
315	47					60	my %k1 = ();
316	47					63	foreach my $k (@k) {
317							$k1{
318	479					448	join (' ', map { $self->{token}->[$_] } split(/ /, $k) )
	999					1318
319							} = $k;
320							}
321	47					53	@k = ();
322	47					167	foreach my $k (sort(keys(%k1))) {
323	479					349	push @k, $k1{$k};
324							}
325	47					177	return @k;
326							}
327
328							sub get_ngrams {
329	27			27	1	27	my $self = shift;
330	27					39	my (%params) = @_;
331	27	100				46	my $n = exists($params{'n'})? $params{'n'} : $self->{windowsize};
332	27	100				39	my $onlyfirst = exists($params{'onlyfirst'}) ? $params{'onlyfirst'} : '';
333	27	50				37	my $opt_normalize = exists($params{'normalize'}) ?$params{'normalize'} : '';
334
335	27					27	my $total = $self->{total}[$n]; my @keys = ();
	27					28
336	27	100	100			121	if (!exists($params{'orderby'}) or $params{'orderby'} eq 'ngram') {
		50
		50
337	20					40	@keys = $self->_keys_sorted($n);
338							} elsif ($params{'orderby'} eq 'none') {
339	0	0				0	die "onlyfirst requires order" if $onlyfirst;
340	0					0	@keys = keys(%{$self->{table}[$n]})
	0					0
341							}
342							elsif ($params{'orderby'} eq 'frequency') {
343	7					14	@keys = $self->_keys_sorted($n);
344	7					10	my %keysord = ();
345	7					14	for (my $i=0; $i<=$#keys; ++$i) { $keysord{$keys[$i]} = $i }
	55					78
346							@keys = sort { $self->{table}[$n]{$b} <=> $self->{table}[$n]{$a}
347	157	50				206	or $keysord{$a} <=> $keysord{$b} }
348	7					7	keys(%{$self->{table}[$n]});
	7					20
349							}
350	0					0	else { die }
351
352	27	100				53	@keys = splice(@keys,0,$onlyfirst) if $onlyfirst;
353
354	27					22	my @ret;
355	27					25	foreach my $ngram (@keys) {
356	262					191	my $count = $self->{table}[$n]{$ngram};
357	262	50				241	$count = ($opt_normalize ? ($count / $total ) : $count);
358	262					277	push @ret, $self->_encode_S($ngram), $count;
359							}
360
361	27					132	return @ret;
362							}
363
364							sub to_string {
365	11			11	1	56	my $self = shift;
366	11					30	my (%params) = @_;
367	11	50				27	my $n = exists($params{'n'})? $params{'n'} : $self->{windowsize};
368	11	100				25	my $onlyfirst = exists($params{'onlyfirst'}) ? $params{'onlyfirst'} : '';
369	11	50				23	my $opt_normalize = exists($params{'normalize'}) ?$params{'normalize'} : '';
370
371							#my $onlyfirst = exists($params{'onlyfirst'}) ?
372							#$params{'onlyfirst'} : '';
373							#delete $params{'onlyfirst'};
374
375	11	100				21	my $out = exists($params{'out'}) ? $params{'out'} : '';
376	11					13	delete $params{'out'};
377	11					12	my $outh = $out;
378	11	100	66			33	if ($out and (not ref($out))) {
379	1	50				3	local *FH; open(FH, ">$out") or die "cannot open $out:$!";
	1					50
380	1					5	$outh = *FH;
381							}
382
383							#my $opt_normalize = $params{'normalize'};
384							#delete $params{'normalize'};
385
386	11					13	my $spartan = $params{'spartan'};
387	11					12	delete $params{'spartan'};
388
389	11					25	my $ret='';
390	11	100				64	$ret = "BEGIN OUTPUT BY Text::Ngrams version $VERSION\n\n" unless $spartan;
391
392	11					25	foreach my $n (1 .. $self->{windowsize}) {
393	30	100	100			65	if ($spartan and $n < $self->{windowsize}) { next }
	4					5
394	26	100				37	if (! $spartan ) {
395	25					53	my $tmp = "$n-GRAMS (total count: $self->{total}[$n])";
396							$ret .= "$tmp\n" .
397							"FIRST N-GRAM: ". $self->_encode_S($self->{firstngram}[$n]).
398	25					54	"\n LAST N-GRAM: ".$self->_encode_S($self->{lastngram}[$n])."\n".
399							('-' x length($tmp)) . "\n";
400							}
401	26					33	my $total = $self->{total}[$n];
402
403	26					22	my @keys;
404	26	100	100			113	if (!exists($params{'orderby'}) or $params{'orderby'} eq 'ngram')
		50
		50
405	20					31	{ @keys = $self->_keys_sorted($n) }
406							elsif ($params{'orderby'} eq 'none') {
407	0	0				0	die "onlyfirst requires order" if $onlyfirst;
408	0					0	@keys = keys(%{$self->{table}[$n]})
	0					0
409							}
410							elsif ($params{'orderby'} eq 'frequency') {
411							@keys = sort { $self->{table}[$n]{$b} <=>
412	27					35	$self->{table}[$n]{$a} }
413	6					4	keys(%{$self->{table}[$n]});
	6					23
414							}
415	0					0	else { die }
416
417	26	100				50	@keys = splice(@keys,0,$onlyfirst) if $onlyfirst;
418
419	26					45	my %params1=%params; $params1{n}=$n;
	26					33
420	26					79	my @a = $self->get_ngrams(%params1);
421	26					65	for (my $i=0; $i<=$#a; $i+=2) {
422	229					146	my $ng = $a[$i]; my $f = $a[$i+1];
	229					158
423	229					347	$ret.="$ng\t$f\n";
424							}
425	26	100				41	if ($out) { print $outh $ret; $ret = '' }
	1					2
	1					4
426
427	26	100				80	$ret .= "\n" unless $spartan;
428							}
429
430	11	100				37	$ret .= "END OUTPUT BY Text::Ngrams\n" unless $spartan;
431
432	11	100				21	if ($out) {
433	1					2	print $outh $ret; $ret = '';
	1					1
434	1	50				7	close($outh) if not ref($out);
435							}
436
437	11					42	return $ret;
438							}
439
440							# http://web.cs.dal.ca/~vlado/srcperl/snip/decode_S
441							sub decode_S ( $ ) {
442	2			2	1	3	local $_ = shift;
443	2					4	my $out;
444
445	2					7	while (length($_) > 0) {
446	10	100				45	if (/^\\(\S)/) {
		50
		100
		50
		50
		50
447	4					5	$_ = $'; my $tmp = $1;
	4					7
448	4					4	$tmp =~ tr/0-5Aabtnvfroil6-9NSTcEseFGRUd/\x00-\x1F\x7F/;
449	4					9	$out .= $tmp;
450							}
451	0					0	elsif (/^\^_/) { $_ = $'; $out .= "\240" }
	0					0
452	2					4	elsif (/^\^(\S)/) { $_ = $'; $out .= pack('C',ord($1)+128); }
	2					21
453							elsif (/^\`(\S)/) {
454	0					0	$_ = $'; my $tmp = $1;
	0					0
455	0					0	$tmp =~ tr/0-5Aabtnvfroil6-9NSTcEseFGRUd/\x00-\x1F\x7F/;
456	0					0	$out .= pack('C', ord($tmp)+128);
457							}
458	0					0	elsif (/^_+/) { $_ = $'; my $tmp = $&; $tmp =~ tr/_/ /; $out .= $tmp; }
	0					0
	0					0
	0					0
459	4					18	elsif (/^[^\\^\`\s_]+/) { $_ = $'; $out .= $&; }
	4					12
460	0					0	else { die "decode_S unexpected:$_" }
461							}
462
463	2					7	return $out;
464							}
465
466							sub _encode_S {
467	312			312		186	my $self = shift;
468	312					218	my @r = ();
469	312					359	while (@_) {
470							push @r,
471	312					340	map { $self->{token_S}->[$_] } split(/ /, shift @_);
	648					997
472							}
473	312					549	return join(' ', @r);
474							}
475
476							# http://www.cs.dal.ca/~vlado/srcperl/snip/encode_S
477							sub encode_S( $ ) {
478	100			100	1	89	local $_ = shift;
479
480	100					81	s/=/=0/g; # first hide a special character (=)
481	100					70	s/\\/=b/g; # encode backslashes
482
483	100					114	s/([\x80-\xFF])/=x$1/g; # replace >127 with 127
484	100					69	tr/\x80-\xFF/\x00-\x7F/;
485	100					98	s/=x=/=X/g; # hide again =
486
487	100					97	s/([\x00-\x1F\x5C\x5E-\x60\x7F])/=B$1/g;
488	100					59	tr/\x20\x00-\x1F\x7F/_0-5Aabtnvfroil6-9NSTcEseFGRUd/;
489
490	100					68	s/=x=B(\S)/`$1/g; # hex backslash
491	100					62	s/=x(\S)/^$1/g; # hex other
492	100					78	s/=B(\S)/\\$1/g; # backslashed
493	100					63	s/=b/\\\\/g; # original backslashes
494	100					58	s/=X/^=0/g;
495	100					59	s/=0/=/g; # put back =
496
497	100					149	return $_;
498							}
499
500							1;
501							__END__