File Coverage

blib/lib/AI/Classifier/Text/Analyzer.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package AI::Classifier::Text::Analyzer;
2				{
3				$AI::Classifier::Text::Analyzer::VERSION = '0.03';
4				}
5
6	1	1	23344	use strict;
	1		2
	1		34
7	1	1	5	use warnings;
	1		2
	1		24
8	1	1	25	use 5.010;
	1		3
	1		46
9	1	1	1724	use Moose;
	0
	0
10
11				use Text::WordCounter;
12
13				has word_counter => ( is => 'ro', default => sub{ Text::WordCounter->new() } );  # read-only; defaults to a new Text::WordCounter; analyze() calls its word_count method
14				has global_feature_weight => ( is => 'ro', isa => 'Num', default => 2 );  # read-only number (default 2); the value stored for NO_URLS, MANY_URLS and REPEATED_URLS
15
16				sub analyze_urls {
				# Collects URL-derived features from a document into the $features hash.
				#
				# Parameters:
				#   $self     - analyzer object; only global_feature_weight is read here.
				#   $text     - handed unchanged to URI::Find's find(); analyze() passes a
				#               reference to the document string.
				#   $features - hash reference used as an accumulator and updated in place.
				#
				# Features written: one counter per lower-cased host (leading "www."
				# removed) and per lower-cased path segment longer than 3 characters,
				# plus the flags NO_URLS, MANY_URLS and REPEATED_URLS, each set to
				# global_feature_weight.
				#
				# No meaningful return value; results are delivered through $features.
				#
				# NOTE(review): no `use URI::Find` appears among the use lines of this
				# listing; presumably it is loaded by another module - confirm.
17				my ( $self, $text, $features ) = @_;
18				my @urls;
19				my $p = URI::Find->new(
				# Callback run for each URI found; $t is accepted but never used.
				# NOTE(review): there is no explicit return, so the callback yields the
				# value of the eval below. URI::Find is documented to use the callback's
				# return value as replacement text - confirm this is intended.
20				sub {
21				my ($uri, $t) = @_;
22				push @urls, $uri;
				# Best effort: an exception raised inside the block (presumably for URIs
				# without a host or path - confirm) is swallowed; $@ is never inspected.
23				eval{
24				my $host = $uri->host;
25				$host =~ s/^www\.//;
26				$features->{ lc $host }++;
				# Only path segments longer than 3 characters become features.
27				for (split /\//, $uri->path) {
28				if (length $_ > 3 ) {
29				$features->{ lc $_}++;
30				}
31				}
32				}
33				}
34				);
35				$p->find($text);
36				my $weight = $self->global_feature_weight;
37				if (!@urls) {
38				$features->{NO_URLS} = $weight;
39				}
				# MANY_URLS: more than one URL per 120 units of length( $text ).
				# NOTE(review): analyze() passes a reference, so length( $text ) here is
				# the length of the stringified reference, not of the document - this
				# looks unintended; confirm and dereference in the source module.
40				if (scalar @urls > length( $text ) / 120 ) {
41				$features->{MANY_URLS} = $weight;
42				}
				# Bare block keeps the %urls tally private to the repetition check.
43				{
44				my %urls;
45				for my $url ( @urls ) {
				# Post-increment compares the previous count, so the flag is set on the
				# fifth occurrence of the same URL.
46				if( $urls{$url}++ > 3 ){
47				$features->{REPEATED_URLS} = $weight;
48				last;
49				}
50				}
51				}
52				}
53
54				sub filter {
				# Returns a copy of $text with every "<...>" sequence removed.
				# $self is not used. The substitution runs on the local copy made from
				# @_, so the caller's string is not modified.
55				my ( $self, $text ) = @_;
				# Deletes each "<", one or more characters other than ">", and the closing
				# ">"; an empty "<>" does not match and is kept.
56				$text =~ s/<[^>]+>//g;
57				return $text;
58				}
59
60				sub analyze {
				# Builds the feature hash for a document: URL features first, then the
				# word counts of the tag-stripped text.
				#
				# Parameters:
				#   $text     - the document string.
				#   $features - optional hash reference to accumulate into.
				#
				# Returns the $features hash reference.
61				my( $self, $text, $features ) = @_;
				# NOTE(review): `\|\|=` looks like an escaping artifact of the report
				# rendering; presumably the source reads `||=`, i.e. start from an empty
				# hash when no accumulator is given - confirm against the module.
62				$features \|\|= {};
				# The local $text is passed by reference to analyze_urls.
63				$self->analyze_urls( \$text, $features );
64				$text = $self->filter( $text );
				# word_count receives the filtered text and the same accumulator hash.
65				$self->word_counter->word_count( $text, $features );
66				return $features;
67				}
68
69				__PACKAGE__->meta->make_immutable;  # usual closing step of a Moose class: marks the metaclass immutable
70
71				1;
72
73				=pod
74
75				=head1 NAME
76
77				AI::Classifier::Text::Analyzer - computing feature vectors from documents
78
79				=head1 VERSION
80
81				version 0.03
82
83				=head1 SYNOPSIS
84
85				use AI::Classifier::Text::Analyzer;
86
87				my $analyzer = AI::Classifier::Text::Analyzer->new();
88
89				my $features = $analyzer->analyze( 'aaaa http://www.example.com/bbb?xx=yy&bb=cc;dd=ff' );
90
91				=head1 DESCRIPTION
92
93				Computes feature vectors of text using some heuristics and adds word counts
94				(using L<Text::WordCounter> by default).
95
96				The object is immutable - but some methods use a second parameter as an accumulator for the
97				features found in given text.
98
99				It uses some specific values and methods that work for our case - but are not guaranteed
100				to bring good results universally - see the source for details!
101
102				=head1 ATTRIBUTES
103
104				=over 4
105
106				=item C<word_counter>
107
108				Object with a word_count method that will calculate the frequency of words in a text document.
109				By default L<Text::WordCounter>.
110
111				=item C<global_feature_weight>
112
113				The weight assigned for computed features of the text document. By default 2.
114
115				=back
116
117				=head1 METHODS
118
119				=over 4
120
121				=item C<< new(word_counter => $foo, global_feature_weight => 3) >>
122
123				Creates a new AI::Classifier::Text::Analyzer object. Both arguments are optional.
124
125				=item C<analyze($document, $features)>
126
127				Computes the feature vector of the given document and adds it to the initial vector C<$features> (an empty hash if omitted). Returns the resulting hash reference.
128
129				=item C<analyze_urls($document, $features)>
130
131				Computes a vector of special URL-related features of a given text - currently the
132				C<NO_URLS>, C<MANY_URLS> and C<REPEATED_URLS> features are used.
133
134				=item C<filter($document)>
135
136				Removes html related parts from the text.
137
138				=back
139
140				=head1 SEE ALSO
141
142				AI::NaiveBayes(3), AI::Classifier::Text(3)
143
144				=head1 AUTHOR
145
146				Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com>
147
148				=head1 COPYRIGHT AND LICENSE
149
150				This software is copyright (c) 2012 by Opera Software ASA.
151
152				This is free software; you can redistribute it and/or modify it under
153				the same terms as the Perl 5 programming language system itself.
154
155				=cut
156
157				__END__
158
159				# ABSTRACT: computing feature vectors from documents
160