File Coverage

blib/lib/Lingua/FreeLing3/Utils.pm

Criterion	Covered	Total	%
statement	16	18	88.8
branch			n/a
condition			n/a
subroutine	6	6	100.0
pod			n/a
total	22	24	91.6

line	stmt	sub	time	code
1				package Lingua::FreeLing3::Utils;
2
3	1	1	24374	use 5.010;
	1		4
	1		40
4	1	1	7	use strict;
	1		2
	1		32
5	1	1	5	use warnings;
	1		13
	1		33
6	1	1	6	use Scalar::Util 'blessed';
	1		2
	1		130
7
8	1	1	985	no if $] >= 5.018, 'warnings', "experimental::smartmatch";
	1		10
	1		6
9
10				require Exporter;
11				our @ISA = qw(Exporter);
12
13	1	1	510	use FL3;
	0
	0
14				use Lingua::FreeLing3::Sentence;
15				use Lingua::FreeLing3::Word;
16				use Data::Dumper;
17
18				=head1 NAME
19
20				Lingua::FreeLing3::Utils - text processing utilities using FreeLing3 Perl inferface
21
22				=head1 VERSION
23
24				Version 0.09
25
26				=cut
27
28				our $VERSION = '0.09';
29
30				=head1 SYNOPSIS
31
32				Calculate n-grams for a given text.
33
34				use Lingua::FreeLing3::Utils qw/ngrams ngrams_pp/;
35
36				# calculate bigrams
37				my $ngrams = ngrams({ n => 2 }, $text);
38
39				# pretty print bigrams
40				ngrams_pp($ngrams);
41
42				Calculate word analysis (all possible for each word)
43
44				use Lingua::FreeLing3::Utils qw/word_analysis/;
45
46				# calculate analysis
47				my $analysis = word_analysis($word);
48
49				# in fact, you can get for a list of words
50				my @analysis = word_analysis(@words);
51
52				# or for a text, and we'll calculate the list for you
53				my @analysis = word_analysis($text);
54
55
56				=head1 EXPORT
57
58				The following functions can be exported:
59
60				=over 4
61
62				=item ngrams
63
64				=item ngrams_pp
65
66				=item word_analysis
67
68				=back
69
70				=cut
71
72				our @EXPORT_OK = qw(ngrams ngrams_pp word_analysis);
73
74				=head1 FUNCTIONS
75
76				=head2 word_analysis
77
78				Compute all possible analysis for a specific word, list of words, or
79				words from a text. You can pass an optional first argument (hash
80				reference) with extra configuration.
81
82				@analysis = word_analysis( { l=>'pt' }, @words );
83
84				=cut
85
86				sub word_analysis {
87				state $inited = {};
88
89				my %opts;
90				%opts = ( %{ shift @_ } ) if ref $_[0] eq "HASH";
91				my $l = $opts{l} \|\| 'en';
92
93				my @words;
94				if (scalar(@_) == 1) {
95				my $text = shift;
96				my $words = tokenizer($l)->tokenize($text);
97				@words = @$words;
98				} else {
99				@words = map {
100				if (blessed $_) {
101				if ($_->isa('Lingua::FreeLing3::Word')) {
102				$_
103				} else {
104				die "blessed argument to word_analysis is not a FL3 word."
105				}
106				} else {
107				word($_);
108				}
109				} @_;
110				}
111
112				if (!$inited->{$l}) {
113				morph($l,
114				ProbabilityAssignment => 'no',
115				QuantitiesDetection => 'no',
116				MultiwordsDetection => 'no',
117				NumbersDetection => 'no',
118				DatesDetection => 'no',
119				OrthographicCorrection => 'no',
120				NERecognition => 'no');
121				$inited->{$l}++;
122				}
123
124				my $analysis = morph($l)->analyze([Lingua::FreeLing3::Sentence->new(@words)]);
125
126				if (wantarray) {
127				return map { $_->analysis(FeatureStructure => 1) } $analysis->[0]->words
128				} else {
129				return $analysis->[0]->word(0)->analysis(FeatureStructure => 1);
130				}
131				}
132
133				=head2 ngrams
134
135				Compute n-grams for a given input. The argument to this function is
136				the text to process. You can optionally add a hash reference of
137				options.
138
139				ngrams({n => 2, l => 'en'}, $text);
140
141				The following options are available:
142
143				=over 4
144
145				=item C<-n>
146
147				Set n (default: bigrams, n = 2).
148
149				=item C<-l>
150
151				Select language (default: en).
152
153				=item C<-i 1\|0>
154
155				Case insensitive (default: off).
156
157				=item C<-t 1\|0>
158
159				Use C<~~> and C<~~> around sentences (default: on).
160
161				=item C<-a 1\|0>
162
163				Compute all i-grams with i from 1 to the specified n value (default:
164				off).
165
166				=back
167
168				=cut
169
170				sub ngrams {
171				my %opts;
172				%opts = ( %{ shift @_ } ) if ref $_[0] eq "HASH";
173
174				my ($text) = @_;
175
176				# handle options and defaults
177				my $n = $opts{n} // 2;
178				my $l = $opts{l} // 'en';
179				my $i = $opts{i} // 0;
180				my $t = $opts{t} // 1;
181				my $a = $opts{a} // 0;
182
183				# transform text into list of tokens
184				my $tokens;
185				if ($t) {
186				my $words = tokenizer($l)->tokenize($text);
187				my $sentences = splitter($l)->split($words, buffered => 0);
188				foreach (@$sentences) {
189				my @ts = map { $_->form } $_->words;
190				unshift @ts, '';
191				push @ts, '';
192				push @$tokens, @ts;
193				}
194				} else {
195				$tokens = tokenizer($l)->tokenize($text, to_text=>1 );
196				}
197
198				# compute ngrams
199				my $ngrams;
200				my $c = 0;
201
202				if ($a) {
203				my @window;
204				while ($c < @$tokens) {
205				push @window, $i ? lc $tokens->[$c] : $tokens->[$c];
206				for (1 .. $n) {
207				if (@window >= $_) {
208				my $tuple = __tuple(@window[scalar(@window)-$_ .. scalar(@window)-1]);
209				$ngrams->[$_-1]{$tuple}{count}++ if $tuple;
210				}
211				}
212				shift @window if @window > $n - 1;
213				$c++;
214				}
215				} else {
216				while ($c < @$tokens - $n + 1) {
217				my @s = @$tokens[$c .. $c+$n-1];
218				@s = map { lc $_ } @s if $i;
219				my $tuple = __tuple(@s);
220				$ngrams->[0]->{$tuple}->{count}++ if $tuple;
221				$c++;
222				}
223				}
224
225				# compute percentages
226				my $nn = $a ? 1 : $n;
227				for my $ngram (@$ngrams) {
228				my $total = @$tokens;
229				foreach (keys %$ngram) {
230				my ($numerator, $denominator);
231
232				$numerator = $ngram->{$_}->{count};
233				if ($nn > 1) {
234				my $count = 0;
235				my @search = __untuple($_);
236				pop @search;
237				my $c = 0;
238				while ($c < @$tokens - $nn + 1) {
239				my @s = @$tokens[$c .. $c+$nn-2];
240
241				$count++ if @s ~~ @search;
242				$c++;
243				}
244				$denominator = $count;
245				} else {
246				$denominator = $total;
247				}
248				if ($numerator and $denominator and $denominator != 0) {
249				$ngram->{$_}->{p} = $numerator / $denominator
250				}
251				}
252				++$nn;
253				}
254
255				return $a ? $ngrams : $ngrams->[0];
256				}
257
258				sub __tuple {
259				my $t = join ' ', @_;
260				return undef if $t =~ m{.};
261				return undef if $t =~ m{.};
262				return $t;
263				}
264
265				sub __untuple {
266				split /\s/, $_[0];
267				}
268
269				=head2 ngrams_pp
270
271				Pretty print n-grams data in plain text.
272
273				=cut
274
275				sub ngrams_pp {
276				my ($ngrams) = @_;
277
278				printf "%-25s %-10s %-10s\n", '# n-gram', 'count', 'p';
279				my $format = "%-25s %-10s %-.8f\n";
280				foreach (keys %$ngrams) {
281				printf $format, $_, $ngrams->{$_}->{count}, $ngrams->{$_}->{p};
282				}
283				}
284
285				=head1 AUTHOR
286
287				Nuno Carvalho, C<< >>
288
289				=head1 BUGS
290
291				Please report any bugs or feature requests to C, or through
292				the web interface at L. I will be notified, and then you'll
293				automatically be notified of progress on your bug as I make changes.
294
295
296
297
298				=head1 SUPPORT
299
300				You can find documentation for this module with the perldoc command.
301
302				perldoc Lingua::FreeLing3::Utils
303
304
305				You can also look for information at:
306
307				=over 4
308
309				=item * RT: CPAN's request tracker (report bugs here)
310
311				L
312
313				=item * AnnoCPAN: Annotated CPAN documentation
314
315				L
316
317				=item * CPAN Ratings
318
319				L
320
321				=item * Search CPAN
322
323				L
324
325				=back
326
327
328				=head1 ACKNOWLEDGEMENTS
329
330
331				=head1 LICENSE AND COPYRIGHT
332
333				Copyright 2012 Nuno Carvalho.
334
335				This program is free software; you can redistribute it and/or modify it
336				under the terms of either: the GNU General Public License as published
337				by the Free Software Foundation; or the Artistic License.
338
339				See http://dev.perl.org/licenses/ for more information.
340
341
342				=cut
343
344				1; # End of Lingua::FreeLing3::Utils