File Coverage

blib/lib/Text/WordGrams.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				package Text::WordGrams;
2
3	6	6	238959	use warnings;
	6		17
	6		211
4	6	6	37	use strict;
	6		12
	6		220
5	6	6	7155	use DB_File;
	0
	0
6				use File::Temp;
7				use Fcntl;
8
9				use base 'Exporter';
10
11				use Lingua::PT::PLNbase;
12
13				=encoding UTF-8
14
15				=head1 NAME
16
17				Text::WordGrams - Calculates statistics on word ngrams.
18
19				=head1 VERSION
20
21				Version 0.07
22
23				=cut
24
25				our $VERSION = '0.07';
26				our @EXPORT = ("word_grams", "word_grams_from_files");
27
28				=head1 SYNOPSIS
29
30				use Text::WordGrams;
31
32				my $data = word_grams( $text );
33
34				my $data = word_grams_from_files( $file1, $file2 );
35
36				=head1 FUNCTIONS
37
38				=head2 word_grams
39
40				Returns a reference to an hash table with word ngrams counts for a
41				specified string. Options are passed as a hash reference as first
42				argument if needed.
43
44				Options include:
45
46				=over 4
47
48				=item ignore_case
49
50				Set this option to ignore text case;
51
52				=item size
53
54				Set this option to the n-gram size you want. Notice that the value
55				should be greater or equal to two. Also, keep in mind that the bigger
56				size you ask for, the larger the hash will become.
57
58				=item tokenize
59
60				This option is activated by default. Give a zero value if your
61				document is already tokenized. In this case your text will be slitted
62				by space characters.
63
64				=back
65
66				=cut
67
68				sub word_grams {
69				my $conf = {};
70				$conf = shift if (ref($_[0]) eq "HASH");
71				$conf->{size} = 2 unless $conf->{size} && $conf->{size} >= 1;
72
73				my $text = shift;
74				$text = lc($text) if $conf->{ignore_case};
75
76				my @atoms;
77				if (!exists($conf->{tokenize}) \|\| $conf->{tokenize} == 1) {
78				@atoms = atomiza($text);
79				}
80				else {
81				$text =~ s/\n/ /g;
82				@atoms = split /\s+/, $text;
83				}
84
85				my %data;
86				my $fh = File::Temp->new();
87				my $fname = $fh->filename;
88				$DB_HASH->{cachesize} = 10000;
89				tie %data, 'DB_File', $fname, 0666, O_CREAT, $DB_HASH;
90
91				my $previous;
92				my $next;
93				while ($previous = shift @atoms) {
94				$next = _get($conf->{size}-1, \@atoms);
95				if (length($next)) {
96				$data{"$previous $next"}++;
97				} else {
98				$data{$previous}++;
99				}
100				}
101				return \%data
102				}
103
104				sub _get {
105				my ($n, $atoms) = @_;
106				if ($n && $n <= $#$atoms + 1) {
107				return join(" ", @{$atoms}[0..$n-1])
108				} else {
109				return ""
110				}
111				}
112
113				=head2 word_grams_from_files
114
115				Supports the same options of C function, but receives a
116				list of file names instead of a string.
117
118				=cut
119
120				sub word_grams_from_files {
121				my $conf = {};
122				$conf = shift if (ref($_[0]) eq "HASH");
123				my $data;
124
125				for my $file (@_) {
126				next unless -f $file;
127
128				local $/ = "\n\n";
129
130				open F, $file or die "Can't open file: $file\n";
131
132				binmode F, ":utf8" if exists($conf->{utf8}) && $conf->{utf8};
133
134				while() {
135				my $o = word_grams($conf, $_);
136				for my $w (keys %$o) {
137				$data->{$w}+=$o->{$w}
138				}
139				}
140				close F;
141				}
142
143				return $data;
144				}
145
146				=head1 AUTHOR
147
148				Alberto Simões, C<< >>
149
150				=head1 BUGS
151
152				Current method is very, very slow. if you find any faster method,
153				please let me know. I think the bottle neck is in the tokenisation
154				part.
155
156				Please report any bugs or feature requests to
157				C, or through the web interface at
158				L. I
159				will be notified, and then you'll automatically be notified of
160				progress on your bug as I make changes.
161
162				=head1 COPYRIGHT & LICENSE
163
164				Copyright 2005-2009 Alberto Simões, all rights reserved.
165
166				This program is free software; you can redistribute it and/or modify
167				it under the same terms as Perl itself.
168
169				=cut
170
171				1; # End of Text::WordGrams