File Coverage

blib/lib/Lingua/FreeLing3/Utils.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package Lingua::FreeLing3::Utils;
2
3	1	1	23278	use 5.010;
	1		4
	1		46
4	1	1	7	use strict;
	1		2
	1		47
5	1	1	7	use warnings;
	1		6
	1		52
6	1	1	8	use Scalar::Util 'blessed';
	1		2
	1		161
7
8				require Exporter;
9				our @ISA = qw(Exporter);
10
11	1	1	323	use FL3;
	0
	0
12				use Lingua::FreeLing3::Sentence;
13				use Lingua::FreeLing3::Word;
14				use Data::Dumper;
15
16				=head1 NAME
17
18				Lingua::FreeLing3::Utils - text processing utilities using FreeLing3 Perl inferface
19
20				=head1 VERSION
21
22				Version 0.02_1
23
24				=cut
25
26				our $VERSION = '0.02_1';
27
28				=head1 SYNOPSIS
29
30				Calculate n-grams for a given text.
31
32				use Lingua::FreeLing3::Utils qw/ngrams ngrams_pp/;
33
34				# calculate bigrams
35				my $ngrams = ngrams({ n => 2 }, $text);
36
37				# pretty print bigrams
38				ngrams_pp($ngrams);
39
40				Calculate word analysis (all possible for each word)
41
42				use Lingua::FreeLing3::Utils qw/word_analysis/;
43
44				# calculate analysis
45				my $analysis = word_analysis($word);
46
47				# in fact, you can get for a list of words
48				my @analysis = word_analysis(@words);
49
50				# or for a text, and we'll calculate the list for you
51				my @analysis = word_analysis($text);
52
53
54				=head1 EXPORT
55
56				The following functions can be exported:
57
58				=over 4
59
60				=item ngrams
61
62				=item ngrams_pp
63
64				=item word_analysis
65
66				=back
67
68				=cut
69
70				our @EXPORT_OK = qw(ngrams ngrams_pp word_analysis);
71
72				=head1 FUNCTIONS
73
74				=head2 word_analysis
75
76				Compute all possible analysis for a specific word, list of words, or
77				words from a text. You can pass an optional first argument (hash
78				reference) with extra configuration.
79
80				@analysis = word_analysis( { l=>'pt' }, @words );
81
82				=cut
83
84				sub word_analysis {
85				state $inited = {};
86
87				my %opts;
88				%opts = ( %{ shift @_ } ) if ref $_[0] eq "HASH";
89				my $l = $opts{l} \|\| 'en';
90
91				my @words;
92				if (scalar(@_) == 1) {
93				my $text = shift;
94				my $words = tokenizer($l)->tokenize($text);
95				@words = @$words;
96				} else {
97				@words = map {
98				if (blessed $_) {
99				if ($_->isa('Lingua::FreeLing3::Word')) {
100				$_
101				} else {
102				die "blessed argument to word_analysis is not a FL3 word."
103				}
104				} else {
105				word($_);
106				}
107				} @_;
108				}
109
110				if (!$inited->{$l}) {
111				morph($l,
112				ProbabilityAssignment => 'no',
113				QuantitiesDetection => 'no',
114				MultiwordsDetection => 'no',
115				NumbersDetection => 'no',
116				DatesDetection => 'no',
117				OrthographicCorrection => 'no',
118				NERecognition => 'no');
119				$inited->{$l}++;
120				}
121
122				my $analysis = morph($l)->analyze([Lingua::FreeLing3::Sentence->new(@words)]);
123
124				if (wantarray) {
125				return map { $_->analysis(FeatureStructure => 1) } $analysis->[0]->words
126				} else {
127				return $analysis->[0]->word(0)->analysis(FeatureStructure => 1);
128				}
129				}
130
131				=head2 ngrams
132
133				Compute n-grams for a given input. The argument to this function is a
134				filname to process. You can optionally add a hash reference of
135				options.
136
137				ngrams({n => 2, l => 'en'}, $filename);
138
139				The following options are availaboe:
140
141				=over 4
142
143				=item C<-n>
144
145				Set n (default: bigrams n=2).
146
147				=item C<-l>
148
149				Select language (default: en).
150
151				=item C<-i 1\|0>
152
153				Case insensitive (default: off).
154
155				=item C<-t 1\|0>
156
157				Use C<~~> and C<~~> around sentences (default: on).
158
159				=back
160
161				=cut
162
163				sub ngrams {
164				my %opts;
165				%opts = ( %{ shift @_ } ) if ref $_[0] eq "HASH";
166
167				my ($text) = @_;
168
169				# handle options and defaults
170				my $n = $opts{n} \|\| 2;
171				my $l = $opts{l} \|\| 'en';
172				my $i = $opts{i} \|\| 0;
173				my $t = $opts{t} \|\| 0;
174
175				my $tokens;
176				if ($t) {
177				my $words = tokenizer($l)->tokenize($text);
178				my $sentences = splitter($l)->split($words, buffered => 0);
179				foreach (@$sentences) {
180				my @ts = map { $_->form } @$_;
181				unshift @ts, '';
182				push @ts, '';
183				push @$tokens, @ts;
184				}
185				} else {
186				$tokens = tokenizer($l)->tokenize($text, to_text=>1 );
187				}
188
189				my $ngrams;
190				my $c = 0;
191				while ($c < @$tokens - $n + 1) {
192				my @s = @$tokens[$c .. $c+$n-1];
193				@s = map {lc $_} @s if $i;
194				$ngrams->{__tuple(@s)}->{count}++;
195				$c++;
196				}
197
198				my $total = @$tokens;
199				foreach (keys %$ngrams) {
200				my ($numerator, $denominator);
201
202				$numerator = $ngrams->{$_}->{count};
203				if ($n > 1) {
204				my $count = 0;
205				my @search = __untuple($_);
206				pop @search;
207				my $c = 0;
208				while ($c < @$tokens - $n + 1) {
209				my @s = @$tokens[$c .. $c+$n-2];
210
211				$count++ if @s ~~ @search;
212				$c++;
213				}
214				$denominator = $count;
215				} else {
216				$denominator = $total;
217				}
218				if ($numerator and $denominator and $denominator != 0) {
219				$ngrams->{$_}->{p} = $numerator / $denominator
220				}
221				}
222
223				return $ngrams;
224				}
225
226				sub __tuple {
227				my (@l) = @_;
228				join(' ', @l);
229				}
230
231				sub __untuple {
232				my ($str) = @_;
233				split /\s/, $str;
234				}
235
236				=head2 ngrams_pp
237
238				Pretty print n-grams data in plain text.
239
240				=cut
241
242				sub ngrams_pp {
243				my ($ngrams) = @_;
244
245				printf "%-25s %-10s %-10s\n", '# n-gram', 'count', 'p';
246				my $format = "%-25s %-10s %-.8f\n";
247				foreach (keys %$ngrams) {
248				printf $format, $_, $ngrams->{$_}->{count}, $ngrams->{$_}->{p};
249				}
250				}
251
252				=head1 AUTHOR
253
254				Nuno Carvalho, C<< >>
255
256				=head1 BUGS
257
258				Please report any bugs or feature requests to C, or through
259				the web interface at L. I will be notified, and then you'll
260				automatically be notified of progress on your bug as I make changes.
261
262
263
264
265				=head1 SUPPORT
266
267				You can find documentation for this module with the perldoc command.
268
269				perldoc Lingua::FreeLing3::Utils
270
271
272				You can also look for information at:
273
274				=over 4
275
276				=item * RT: CPAN's request tracker (report bugs here)
277
278				L
279
280				=item * AnnoCPAN: Annotated CPAN documentation
281
282				L
283
284				=item * CPAN Ratings
285
286				L
287
288				=item * Search CPAN
289
290				L
291
292				=back
293
294
295				=head1 ACKNOWLEDGEMENTS
296
297
298				=head1 LICENSE AND COPYRIGHT
299
300				Copyright 2012 Nuno Carvalho.
301
302				This program is free software; you can redistribute it and/or modify it
303				under the terms of either: the GNU General Public License as published
304				by the Free Software Foundation; or the Artistic License.
305
306				See http://dev.perl.org/licenses/ for more information.
307
308
309				=cut
310
311				1; # End of Lingua::FreeLing3::Utils