File Coverage

blib/lib/Text/TFIDF/Ngram.pm

Criterion	Covered	Total	%
statement	76	76	100.0
branch	23	24	95.8
condition	9	11	81.8
subroutine	13	13	100.0
pod	4	5	80.0
total	125	129	96.9

line	stmt	bran	cond	sub	pod	time	code
1							package Text::TFIDF::Ngram;
2							our $AUTHORITY = 'cpan:GENE';
3
4							# ABSTRACT: Compute the TF-IDF measure for ngram phrases
5
6							our $VERSION = '0.0508';
7
8	1			1		1383	use Moo;
	1					11658
	1					5
9	1			1		2032	use strictures 2;
	1					1600
	1					58
10	1			1		700	use namespace::clean;
	1					11421
	1					6
11
12	1			1		258	use Carp;
	1					2
	1					69
13	1			1		512	use Lingua::EN::Ngram;
	1					1434
	1					36
14	1			1		435	use Lingua::StopWords qw( getStopWords );
	1					320
	1					95
15	1			1		8	use List::Util qw( sum0 );
	1					2
	1					1238
16
17
18							has files => (
19							is => 'ro',
20							isa => sub { croak 'Invalid files list' unless ref $_[0] eq 'ARRAY' },
21							);
22
23
24							has size => (
25							is => 'ro',
26							isa => sub { croak 'Invalid integer' unless $_[0] && $_[0] =~ /^\d+$/ && $_[0] > 0 },
27							default => sub { 1 },
28							);
29
30
31							has stopwords => (
32							is => 'ro',
33							isa => sub { croak 'Invalid Boolean' unless defined $_[0] },
34							default => sub { 1 },
35							);
36
37
38							has punctuation => (
39							is => 'ro',
40							default => sub { qr/(?!')[[:punct:]]/ },
41							);
42
43
44							has lowercase => (
45							is => 'ro',
46							default => sub { 0 },
47							);
48
49
50							has counts => (
51							is => 'ro',
52							init_arg => undef,
53							);
54
55
56							has file_tfidf => (
57							is => 'ro',
58							init_arg => undef,
59							);
60
61
62							sub BUILD {
63	8			8	0	67	my ( $self, $args ) = @_;
64
65	8	100				24	return unless $args->{files};
66
67	7		50			18	$args->{size} \|\|= 1;
68
69	7					9	for my $file ( @{ $args->{files} } ) {
	7					17
70	11					26	$self->_process_ngrams( $file, $args->{size} );
71							}
72							}
73
74							sub _process_ngrams {
75	11			11		23	my ( $self, $file, $size ) = @_;
76
77	11					36	my $ngram = Lingua::EN::Ngram->new( file => $file );
78	11					1200	my $phrase = $ngram->ngram($size);
79
80	11	100				1385	if ( $self->lowercase ) {
81	2					9	$phrase = { map { lc $_ => $phrase->{$_} } keys %$phrase };
	25					56
82							}
83
84	11					30	my $stop = getStopWords('en');
85
86	11					4090	my $counts;
87
88	11					44	for my $p ( keys %$phrase ) {
89							next if $self->stopwords
90	91	100	100			209	&& grep { $stop->{$_} } split /\s+/, $p; # Exclude stopwords
	8					31
91
92	86					124	my $pat = $self->punctuation;
93	86	100				282	$p =~ s/$pat//g if $pat; # Remove unwanted punctuation
94
95							# Skip if we don't have an ngram of the requested size anymore
96	86					249	my @p = grep { $_ } split /\s+/, $p;
	142					306
97	86	100				227	next unless @p == $size;
98
99							# Skip an ngram with a lone single quote (allowed by the default punctuation)
100	67	50				103	next if grep { $_ eq "'" } @p;
	123					250
101
102	67	100				142	$p = lc $p if $self->lowercase;
103
104	67					154	$counts->{$p} = $phrase->{$p};
105							}
106
107	11					241	$self->{counts}{$file} = $counts;
108							}
109
110
111							sub tf {
112	15			15	1	5416	my ( $self, $file, $phrase ) = @_;
113	15	100	66			80	return 0 unless exists $self->counts->{$file} && exists $self->counts->{$file}{$phrase};
114	14					28	return $self->counts->{$file}{$phrase} / sum0( values %{ $self->counts->{$file} } );
	14					469
115							}
116
117
118							sub idf {
119	20			20	1	3401	my ( $self, $phrase ) = @_;
120
121	20					27	my $count = 0;
122
123	20					28	for my $file ( keys %{ $self->counts } ) {
	20					62
124	40	100				101	$count++ if exists $self->counts->{$file}{$phrase};
125							}
126
127	20	100				42	unless ( $count ) {
128	2					244	carp "'$phrase' is not present in any document";
129	2					191	return undef;
130							}
131
132	18					28	return - log( $count / keys %{ $self->counts } ) / log(10) + 0;
	18					96
133							}
134
135
136							sub tfidf {
137	14			14	1	3162	my ( $self, $file, $phrase ) = @_;
138	14					24	my $idf = $self->idf($phrase);
139	14	100				33	return undef unless $idf;
140	8					17	return $self->tf( $file, $phrase ) * $idf;
141							}
142
143
144							sub tfidf_by_file {
145	1			1	1	383	my ($self) = @_;
146
147	1					2	my %seen;
148
149	1					2	for my $file ( keys %{ $self->counts } ) {
	1					4
150	2					4	for my $phrase ( keys %{ $self->counts->{$file} } ) {
	2					6
151	8					14	my $tfidf = $self->tfidf( $file, $phrase );
152
153	8	100	100			32	next if $seen{$phrase}++ \|\| !defined $tfidf;
154
155	4					9	$self->{file_tfidf}{$file}{$phrase} = $tfidf;
156							}
157							}
158
159	1					5	return $self->file_tfidf;
160							}
161
162							1;
163
164							__END__