File Coverage

blib/lib/AI/Classifier/Text/Analyzer.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package AI::Classifier::Text::Analyzer;
2             {
3             $AI::Classifier::Text::Analyzer::VERSION = '0.03';
4             }
5              
6 1     1   23344 use strict;
  1         2  
  1         34  
7 1     1   5 use warnings;
  1         2  
  1         24  
8 1     1   25 use 5.010;
  1         3  
  1         46  
9 1     1   1724 use Moose;
  0            
  0            
10              
11             use Text::WordCounter;
12              
# Word-counting collaborator: any object offering word_count( $text, $features ).
# A fresh Text::WordCounter is built when the caller supplies none.
has word_counter => (
    is      => 'ro',
    default => sub { return Text::WordCounter->new },
);

# Value stored under the NO_URLS / MANY_URLS / REPEATED_URLS flag features.
has global_feature_weight => (
    is      => 'ro',
    isa     => 'Num',
    default => 2,
);
15              
# Collects URL-related features of a document into $features.
#
# Parameters:
#   $text     - the document, either a plain string or a reference to the
#               scalar holding it (analyze() passes a reference).
#   $features - hash reference used as an accumulator; created when omitted.
#
# For every URI reported by URI::Find, the lower-cased host (without a leading
# "www.") and every path segment longer than three characters are counted.
# NO_URLS, MANY_URLS and REPEATED_URLS are set to global_feature_weight when
# the document has no URI, more than one URI per 120 characters, or one URI
# occurring more than four times, respectively.
#
# Returns the $features hash reference.
sub analyze_urls {
    my ( $self, $text, $features ) = @_;
    $features ||= {};

    # No "use URI::Find" is visible at the top of this file, so load it here;
    # require is a no-op once the module is in %INC.
    require URI::Find;

    # find() is handed a scalar reference; wrap plain strings so both call
    # styles work.
    my $text_ref = ref $text ? $text : \$text;

    # Measure the document itself, not the stringified reference, and do it
    # before find() runs so that any rewriting done by find() cannot skew the
    # URL density.
    my $text_length = length( ${$text_ref} // q{} );

    my @urls;
    my $finder = URI::Find->new(
        sub {
            my ( $uri, $original_text ) = @_;
            push @urls, $uri;

            # Best effort: a failure in host()/path() only skips the host and
            # path features of this URI; the URI itself stays counted.
            # NOTE(review): URI::Find substitutes this callback's return value
            # for the matched URI in the text. There is no explicit return, so
            # the replacement is whatever this eval yields - confirm that is
            # the intended behaviour before changing the last statement.
            eval {
                my $host = $uri->host;
                $host =~ s/^www\.//;
                $features->{ lc $host }++;
                for my $segment ( split /\//, $uri->path ) {
                    if ( length($segment) > 3 ) {
                        $features->{ lc $segment }++;
                    }
                }
            }
        }
    );
    $finder->find($text_ref);

    my $weight = $self->global_feature_weight;
    if ( !@urls ) {
        $features->{NO_URLS} = $weight;
    }
    if ( scalar @urls > $text_length / 120 ) {
        $features->{MANY_URLS} = $weight;
    }

    # URI objects are used as hash keys, i.e. compared by their string form.
    my %times_seen;
    for my $url (@urls) {
        if ( $times_seen{$url}++ > 3 ) {
            $features->{REPEATED_URLS} = $weight;
            last;
        }
    }

    return $features;
}
53              
# Returns a copy of $text with every "<...>" run (an opening angle bracket,
# one or more non-">" characters, a closing angle bracket) removed.
sub filter {
    my ( $self, $text ) = @_;
    ( my $stripped = $text ) =~ s/<[^>]+>//g;
    return $stripped;
}
59              
# Builds the feature vector of a document.
#
# Parameters:
#   $text     - the document as a string.
#   $features - optional hash reference to accumulate into; a new hash is
#               used when it is missing or false.
#
# Returns the hash reference holding the accumulated features.
sub analyze {
    my ( $self, $text, $features ) = @_;
    $features = {} unless $features;

    # URL features are taken from the raw text, handed over by reference.
    $self->analyze_urls( \$text, $features );

    # Words are counted on the text with markup removed.
    my $plain_text = $self->filter($text);
    $self->word_counter->word_count( $plain_text, $features );

    return $features;
}
68              
69             __PACKAGE__->meta->make_immutable;
70              
71             1;
72              
73             =pod
74              
75             =head1 NAME
76              
77             AI::Classifier::Text::Analyzer - computing feature vectors from documents
78              
79             =head1 VERSION
80              
81             version 0.03
82              
83             =head1 SYNOPSIS
84              
85             use AI::Classifier::Text::Analyzer;
86              
87             my $analyzer = AI::Classifier::Text::Analyzer->new();
88            
89             my $features = $analyzer->analyze( 'aaaa http://www.example.com/bbb?xx=yy&bb=cc;dd=ff' );
90              
91             =head1 DESCRIPTION
92              
93             Computes feature vectors of text using some heuristics and adds word counts
94             (using L<Text::WordCounter> by default).
95              
96             The object is immutable - but some methods use a second parameter as an accumulator for the
97             features found in given text.
98              
99             It uses some specific values and methods that work for our case - but are not guaranteed
100             to bring good results universally - see the source for details!
101              
102             =head1 ATTRIBUTES
103              
104             =over 4
105              
106             =item C<word_counter>
107              
108             Object with a word_count method that will calculate the frequency of words in a text document.
109             By default L<Text::WordCounter>.
110              
111             =item C<global_feature_weight>
112              
113             The weight assigned for computed features of the text document. By default 2.
114              
115             =back
116              
117             =head1 METHODS
118              
119             =over 4
120              
121             =item C<< new(word_counter => $foo, global_feature_weight => 3) >>
122              
123             Creates a new AI::Classifier::Text::Analyzer object. Both arguments are optional.
124              
125             =item C<analyze($document, $features)>
126              
127             Computes the feature vector of the given document and adds it to the initial vector C<$features>.
128              
129             =item C<analyze_urls($document, $features)>
130              
131             Computes a vector of special URL-related features of the given text - currently the
132             C<NO_URLS>, C<MANY_URLS> and C<REPEATED_URLS> features are used.
133              
134             =item C<filter($document)>
135              
136             Removes html related parts from the text.
137              
138             =back
139              
140             =head1 SEE ALSO
141              
142             AI::NaiveBayes (3), AI::Classifier::Text(3)
143              
144             =head1 AUTHOR
145              
146             Zbigniew Lukasiak <zlukasiak@opera.com>, Tadeusz Sośnierz <tsosnierz@opera.com>
147              
148             =head1 COPYRIGHT AND LICENSE
149              
150             This software is copyright (c) 2012 by Opera Software ASA.
151              
152             This is free software; you can redistribute it and/or modify it under
153             the same terms as the Perl 5 programming language system itself.
154              
155             =cut
156              
157             __END__
158              
159             # ABSTRACT: computing feature vectors from documents
160