File Coverage

blib/lib/Text/TFIDF/Ngram.pm

Criterion	Covered	Total	%
statement	76	76	100.0
branch	23	24	95.8
condition	9	11	81.8
subroutine	13	13	100.0
pod	4	5	80.0
total	125	129	96.9

line	stmt	bran	cond	sub	pod	time	code
1							package Text::TFIDF::Ngram;
2							our $AUTHORITY = 'cpan:GENE';
3
4							# ABSTRACT: Compute the TF-IDF measure for ngram phrases
5
6							our $VERSION = '0.0507';
7
8	1			1		1063	use Moo;
	1					9413
	1					4
9	1			1		1635	use strictures 2;
	1					1295
	1					33
10	1			1		552	use namespace::clean;
	1					9389
	1					5
11
12	1			1		219	use Carp;
	1					3
	1					85
13	1			1		426	use Lingua::EN::Ngram;
	1					1096
	1					29
14	1			1		376	use Lingua::StopWords qw( getStopWords );
	1					280
	1					50
15	1			1		7	use List::Util qw( sum0 );
	1					2
	1					1040
16
17
18							has files => (
19							is => 'ro',
20							isa => sub { croak 'Invalid files list' unless ref $_[0] eq 'ARRAY' },
21							);
22
23
24							has size => (
25							is => 'ro',
26							isa => sub { croak 'Invalid integer' unless $_[0] && $_[0] =~ /^\d+$/ && $_[0] > 0 },
27							default => sub { 1 },
28							);
29
30
31							has stopwords => (
32							is => 'ro',
33							isa => sub { croak 'Invalid Boolean' unless defined $_[0] },
34							default => sub { 1 },
35							);
36
37
38							has punctuation => (
39							is => 'ro',
40							default => sub { qr/(?!')[[:punct:]]/ },
41							);
42
43
44							has lowercase => (
45							is => 'ro',
46							default => sub { 0 },
47							);
48
49
50							has counts => (
51							is => 'ro',
52							init_arg => undef,
53							);
54
55
56							has file_tfidf => (
57							is => 'ro',
58							init_arg => undef,
59							);
60
61
62							sub BUILD {
63	7			7	0	54	my ( $self, $args ) = @_;
64
65	7	100				15	return unless $args->{files};
66
67	6		50			14	$args->{size} \|\|= 1;
68
69	6					6	for my $file ( @{ $args->{files} } ) {
	6					13
70	10					23	$self->_process_ngrams( $file, $args->{size} );
71							}
72							}
73
74							sub _process_ngrams {
75	10			10		17	my ( $self, $file, $size ) = @_;
76
77	10					31	my $ngram = Lingua::EN::Ngram->new( file => $file );
78	10					839	my $phrase = $ngram->ngram($size);
79
80	10	100				1004	if ( $self->lowercase ) {
81	1					4	$phrase = { map { lc $_ => $phrase->{$_} } keys %$phrase };
	13					25
82							}
83
84	10					23	my $stop = getStopWords('en');
85
86	10					3143	my $counts;
87
88	10					34	for my $p ( keys %$phrase ) {
89							next if $self->stopwords
90	79	100	100			154	&& grep { $stop->{$_} } split /\s+/, $p; # Exclude stopwords
	8					24
91
92	74					92	my $pat = $self->punctuation;
93	74	100				199	$p =~ s/$pat//g if $pat; # Remove unwanted punctuation
94
95							# Skip if we don't have an ngram of the requested size anymore
96	74					166	my @p = grep { $_ } split /\s+/, $p;
	122					213
97	74	100				130	next unless @p == $size;
98
99							# Skip an ngram with a lone single quote (allowed by the default punctuation)
100	59	50				63	next if grep { $_ eq "'" } @p;
	107					172
101
102	59	100				116	$p = lc $p if $self->lowercase;
103
104	59					109	$counts->{$p} = $phrase->{$p};
105							}
106
107	10					174	$self->{counts}{$file} = $counts;
108							}
109
110
111							sub tf {
112	15			15	1	4279	my ( $self, $file, $phrase ) = @_;
113	15	100	66			71	return 0 unless exists $self->counts->{$file} && exists $self->counts->{$file}{$phrase};
114	14					22	return $self->counts->{$file}{$phrase} / sum0( values %{ $self->counts->{$file} } );
	14					97
115							}
116
117
118							sub idf {
119	20			20	1	2650	my ( $self, $phrase ) = @_;
120
121	20					29	my $count = 0;
122
123	20					22	for my $file ( keys %{ $self->counts } ) {
	20					50
124	40	100				84	$count++ if exists $self->counts->{$file}{$phrase};
125							}
126
127	20	100				35	unless ( $count ) {
128	2					200	carp "'$phrase' is not present in any document";
129	2					145	return undef;
130							}
131
132	18					21	return - log( $count / keys %{ $self->counts } ) / log(10) + 0;
	18					82
133							}
134
135
136							sub tfidf {
137	14			14	1	2476	my ( $self, $file, $phrase ) = @_;
138	14					25	my $idf = $self->idf($phrase);
139	14	100				29	return undef unless $idf;
140	8					13	return $self->tf( $file, $phrase ) * $idf;
141							}
142
143
144							sub tfidf_by_file {
145	1			1	1	300	my ($self) = @_;
146
147	1					3	my %seen;
148
149	1					2	for my $file ( keys %{ $self->counts } ) {
	1					4
150	2					3	for my $phrase ( keys %{ $self->counts->{$file} } ) {
	2					6
151	8					13	my $tfidf = $self->tfidf( $file, $phrase );
152
153	8	100	100			27	next if $seen{$phrase}++ \|\| !defined $tfidf;
154
155	4					9	$self->{file_tfidf}{$file}{$phrase} = $tfidf;
156							}
157							}
158
159	1					4	return $self->file_tfidf;
160							}
161
162							1;
163
164							__END__