File Coverage

blib/lib/HTML/ExtractContent.pm
Criterion Covered Total %
statement 115 131 87.7
branch 31 50 62.0
condition n/a
subroutine 19 19 100.0
pod 4 4 100.0
total 169 204 82.8


line stmt bran cond sub pod time code
1             package HTML::ExtractContent;
2 2     2   59795 use strict;
  2         4  
  2         53  
3 2     2   10 use warnings;
  2         3  
  2         55  
4 2     2   1772 use utf8;
  2         25  
  2         10  
5              
6             # core
7 2     2   68 use List::Util qw(reduce);
  2         3  
  2         215  
8              
9             # cpan
10             use Class::Accessor::Lite (
11 2         16 rw => [qw(opt content)],
12 2     2   1509 );
  2         2263  
13              
14             # lib
15 2     2   1192 use HTML::ExtractContent::Util;
  2         8  
  2         21  
16              
17             our $VERSION = '0.12';
18              
19             sub new {
20 1     1 1 349 my ($class, $opt) = @_;
21 1         3 my $self = bless {}, $class;
22             $self->{opt} = {
23 1         26 threshold => 60, # threhold for score of clusters
24             min_length => 30, # minimum length of blocks
25             decay_factor => 0.75, # decay factor for block scores
26             no_body_factor => 0.72,
27             continuous_factor => 1.62, # continuous factor for block scores
28             punctuation_weight => 10, # score weight for punctuations
29 2     2   682 punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is,
  2         23  
  2         37  
30             waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is,
31             # characteristic keywords including footer
32             affiliate_expressions =>
33             qr/amazon[a-z0-9\.\/\-\?&]+-22/is,
34             block_separator => qr/<\/?(?:div|center|td)[^>]*>|]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is,
35             # nocontent => qr/<\/frameset>|]*url/is,
36             nocontent => qr/<\/frameset>/is,
37             min_nolink => 8,
38             nolist_ratio => 0.2,
39             debug => 0
40             };
41             $self->{pattern} = {
42 1         21 a => qr/]*>.*?<\/a\s*>/is,
43             href => qr/
44             list => qr/<(ul|dl|ol)(.+)<\/\1>/is,
45             li => qr/(?:]*>|]*>)/is,
46             title => qr/]*>(.*?)<\/title\s*>/is,
47             headline => qr/(\s*(.*?)\s*<\/h\d\s*>)/is,
48             head => qr/]*>.*?<\/head\s*>/is,
49             comment => qr{ (?:
50             |
51             # remove invisible elements
52             < ( [\w:.-]+ ) \s [^>]*? style \s* = [^>]*? \b
53             (?: display \s* : \s* none | visibility \s* : \s* hidden )
54             \b [^>]* > .*?
55             ) }xis,
56             special => qr//is,
57             useless => [
58             qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is,
59             qr/]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is,
60             ],
61             };
62 1         4 return bless $self, $class;
63             }
64              
65             sub as_text {
66 2     2 1 930 my $self = shift;
67 2         7 return to_text $self->content;
68             }
69              
70             sub as_html {
71 2     2 1 9 my $self = shift;
72 2         6 return $self->content;
73             }
74              
75             sub extract {
76 2     2 1 870 my $self = shift;;
77 2         11 $self->content(shift);
78 2 50       21 if ($self->content =~ $self->opt->{nocontent}) {
79             # frameset or redirect
80 0         0 $self->content('');
81 0         0 return $self;
82             }
83 2         53 $self->_extract_title;
84 2         9 $self->_eliminate_head;
85              
86 2         8 $self->_eliminate_useless_symbols;
87 2         6 $self->_eliminate_useless_tags;
88              
89 2         3 my ($factor, $continuous);
90 2         4 $factor = $continuous = 1.0;
91 2         3 my $body = '';
92 2         4 my $score = 0;
93 2         7 my $best = {
94             content => "",
95             score => 0,
96             };
97 2         9 my @list = split $self->opt->{block_separator}, $self->content;
98 2         310 my $flag = 0;
99 2         5 for my $block (@list) {
100 44         183 $block = strip $block;
101 44 100       113 next unless decode $block;
102 13 100       54 $continuous /= $self->opt->{continuous_factor} if length $body;
103              
104             # ignore link list block
105 13         59 my $nolink = $self->_eliminate_links($block);
106 13         18 my $nolinklen = length $nolink;
107 13 100       40 next if $nolinklen < $self->opt->{min_length};
108              
109             # score
110 9         60 my $c = $self->_score($nolink, $factor);
111 9         65 $factor *= $self->opt->{decay_factor};
112              
113             # anti-scoring factors
114 9         45 my $no_body_rate = $self->_no_body_rate($block);
115              
116 9         28 $c *= ($self->opt->{no_body_factor} ** $no_body_rate);
117 9         58 my $c1 = $c * $continuous;
118              
119             # cluster scoring
120 9 100       25 if ($c1 > $self->opt->{threshold}) {
    100          
121 2         11 $flag = 1;
122             print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n"
123 2 50       7 if $self->opt->{debug};
124 2         15 $body .= $block . "\n";
125 2         4 $score += $c1;
126 2         5 $continuous = $self->opt->{continuous_factor};
127             } elsif ($c > $self->opt->{threshold}) {
128 1         9 $flag = 1;
129 1 50       5 print "\n---- end of cluster: $score\n" if $self->opt->{debug};
130 1 50       11 if ($score > $best->{score}) {
131 1 50       5 print "!!!! best: score=$score\n" if $self->opt->{debug};
132 1         9 $best = {
133             content => $body,
134             score => $score,
135             };
136             }
137 1 50       5 print "\n" if $self->opt->{debug};
138 1         7 $body = $block . "\n";
139 1         2 $score = $c;
140 1         4 $continuous = $self->opt->{continuous_factor};
141             print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n"
142 1 50       11 if $self->opt->{debug};
143             } else {
144 6 100       76 $factor /= $self->opt->{decay_factor} if !$flag;
145             print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n",
146 6 50       28 "<< reject\n" if $self->opt->{debug};
147             }
148             }
149 2 50       9 print "\n---- end of cluster: $score\n" if $self->opt->{debug};
150 2 100       18 if ($best->{score} < $score) {
151 1 50       4 print "!!!! best: score=$score\n" if $self->opt->{debug};
152 1         9 $best = {
153             content =>$body,
154             score => $score,
155             };
156             }
157 2         9 $self->content($best->{content});
158              
159 2         23 return $self;
160             }
161              
162             sub _score {
163 9     9   18 my ($self, $nolink, $factor) = @_;
164             return ((length $nolink)
165             + (match_count $nolink, $self->opt->{punctuations})
166             * $self->opt->{punctuation_weight})
167 9         23 * $factor;
168             }
169              
170             sub _no_body_rate {
171 9     9   12 my ($self, $block) = @_;
172             return (match_count $block,$self->opt->{waste_expressions})
173 9         24 + (match_count $block,$self->opt->{affiliate_expressions})/2.0;
174             }
175              
176             sub _extract_title {
177 2     2   5 my $self = shift;
178 2         3 my $title;
179 2 50       6 if ($self->content =~ $self->{pattern}->{title}) {
180 2         49 $title = strip (strip_tags $1);
181 2 50       10 if (length $title) {
182 2         6 my $pat = $self->{pattern}->{headline};
183 2         39 $self->{content} =~ s/$pat/
184 4 100       19 (index $title, strip(strip_tags($2))) >= 0 ? "
$2<\/div>" : "$1"/igse;
185             }
186             }
187             }
188              
189             sub _eliminate_head {
190 2     2   3 my $self = shift;
191 2         5 my $pat = $self->{pattern}->{head};
192 2         24 $self->{content} =~ s/$pat//is;
193             }
194              
195             sub _eliminate_useless_symbols {
196 2     2   3 my $self = shift;
197 2         5 my $comment = $self->{pattern}->{comment};
198 2         3 my $special = $self->{pattern}->{special};
199 2         90 $self->{content} =~ s/$comment//igs;
200 2         29 $self->{content} =~ s/$special//igs;
201             }
202              
203             sub _eliminate_useless_tags {
204 2     2   4 my $self = shift;
205 2         9 my @useless = @{$self->{pattern}->{useless}};
  2         7  
206 2         5 for my $pat (@useless) {
207 4         145 $self->{content} =~ s/$pat//igs;
208             }
209             }
210              
211             sub _eliminate_links {
212 13     13   22 my ($self, $block) = @_;
213 13         43 my $count = match_count $block, $self->{pattern}->{a};
214 13         36 my $nolink = to_text (eliminate_forms (eliminate_links $block));
215 13 50       49 return '' if length $nolink < $self->opt->{min_nolink} * $count;
216 13 50       97 return '' if $self->_is_linklist($block);
217 13         33 return $nolink;
218             }
219              
220             sub _is_linklist {
221 13     13   24 my ($self, $block) = @_;
222 13         20 my $listpat = $self->{pattern}->{list};
223 13 50       82 if ($block =~ $listpat) {
224 0         0 my $list = $2;
225 0         0 my @fragments = split($listpat, $block, 2);
226 0         0 my $nolist = $list;
227 0         0 $nolist =~ s/$listpat//igs;
228 0         0 $nolist = to_text(join($nolist, @fragments));
229 0         0 my @listitems = split $self->{pattern}->{li}, $list;
230 0         0 shift @listitems;
231 0         0 my $rate = 0;
232 0         0 for my $li (@listitems) {
233 0 0       0 $rate++ if $li =~ $self->{pattern}->{href};
234             }
235 0 0       0 $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1;
236 0         0 $list = to_text $list;
237 0         0 my $limit = ($self->opt->{nolist_ratio}*$rate)
238             * ($rate * (length $list));
239 0         0 return length $nolist < $limit;
240             }
241 13         39 return 0;
242             }
243              
244             1;
245             __END__