blib/lib/HTML/ExtractContent.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 115 | 131 | 87.7 |
branch | 31 | 50 | 62.0 |
condition | n/a | ||
subroutine | 19 | 19 | 100.0 |
pod | 4 | 4 | 100.0 |
total | 169 | 204 | 82.8 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::ExtractContent; | ||||||
2 | 2 | 2 | 59836 | use strict; | |||
2 | 5 | ||||||
2 | 74 | ||||||
3 | 2 | 2 | 8 | use warnings; | |||
2 | 4 | ||||||
2 | 64 | ||||||
4 | 2 | 2 | 1691 | use utf8; | |||
2 | 22 | ||||||
2 | 9 | ||||||
5 | |||||||
6 | # core | ||||||
7 | 2 | 2 | 88 | use List::Util qw(reduce); | |||
2 | 3 | ||||||
2 | 271 | ||||||
8 | |||||||
9 | # cpan | ||||||
10 | use Class::Accessor::Lite ( | ||||||
11 | 2 | 16 | rw => [qw(opt content)], | ||||
12 | 2 | 2 | 973 | ); | |||
2 | 2642 | ||||||
13 | |||||||
14 | # lib | ||||||
15 | 2 | 2 | 903 | use HTML::ExtractContent::Util; | |||
2 | 12 | ||||||
2 | 25 | ||||||
16 | |||||||
17 | our $VERSION = '0.11'; | ||||||
18 | |||||||
19 | sub new { | ||||||
20 | 1 | 1 | 1 | 349 | my ($class, $opt) = @_; | ||
21 | 1 | 4 | my $self = bless {}, $class; | ||||
22 | 1 | 28 | $self->{opt} = { | ||||
23 | threshold => 60, # threhold for score of clusters | ||||||
24 | min_length => 30, # minimum length of blocks | ||||||
25 | decay_factor => 0.75, # decay factor for block scores | ||||||
26 | no_body_factor => 0.72, | ||||||
27 | continuous_factor => 1.62, # continuous factor for block scores | ||||||
28 | punctuation_weight => 10, # score weight for punctuations | ||||||
29 | 2 | 2 | 384 | punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is, | |||
2 | 2 | ||||||
2 | 28 | ||||||
30 | waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is, | ||||||
31 | # characteristic keywords including footer | ||||||
32 | affiliate_expressions => | ||||||
33 | qr/amazon[a-z0-9\.\/\-\?&]+-22/is, | ||||||
34 | block_separator => qr/<\/?(?:div|center|td)[^>]*>| ]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is, |
||||||
35 | # nocontent => qr/<\/frameset>|]*url/is, | ||||||
36 | nocontent => qr/<\/frameset>/is, | ||||||
37 | min_nolink => 8, | ||||||
38 | nolist_ratio => 0.2, | ||||||
39 | debug => 0 | ||||||
40 | }; | ||||||
41 | 1 | 27 | $self->{pattern} = { | ||||
42 | a => qr/]*>.*?<\/a\s*>/is, | ||||||
43 | href => qr/ | ||||||
44 | list => qr/<(ul|dl|ol)(.+)<\/\1>/is, | ||||||
45 | li => qr/(?: |
||||||
46 | title => qr/ |
||||||
47 | headline => qr/( |
||||||
48 | head => qr/]*>.*?<\/head\s*>/is, | ||||||
49 | comment => qr{ (?: | ||||||
50 | | | ||||||
51 | # remove invisible elements | ||||||
52 | < ( [\w:.-]+ ) \s [^>]*? style \s* = [^>]*? \b | ||||||
53 | (?: display \s* : \s* none | visibility \s* : \s* hidden ) | ||||||
54 | \b [^>]* > .*? \1 \s* > | ||||||
55 | ) }xis, | ||||||
56 | special => qr//is, | ||||||
57 | useless => [ | ||||||
58 | qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is, | ||||||
59 | qr/ ]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is, |
||||||
60 | ], | ||||||
61 | }; | ||||||
62 | 1 | 3 | return bless $self, $class; | ||||
63 | } | ||||||
64 | |||||||
65 | sub as_text { | ||||||
66 | 2 | 2 | 1 | 1049 | my $self = shift; | ||
67 | 2 | 7 | return to_text $self->content; | ||||
68 | } | ||||||
69 | |||||||
70 | sub as_html { | ||||||
71 | 2 | 2 | 1 | 8 | my $self = shift; | ||
72 | 2 | 5 | return $self->content; | ||||
73 | } | ||||||
74 | |||||||
75 | sub extract { | ||||||
76 | 2 | 2 | 1 | 709 | my $self = shift;; | ||
77 | 2 | 71 | $self->content(shift); | ||||
78 | 2 | 50 | 16 | if ($self->content =~ $self->opt->{nocontent}) { | |||
79 | # frameset or redirect | ||||||
80 | 0 | 0 | $self->content(''); | ||||
81 | 0 | 0 | return $self; | ||||
82 | } | ||||||
83 | 2 | 46 | $self->_extract_title; | ||||
84 | 2 | 6 | $self->_eliminate_head; | ||||
85 | |||||||
86 | 2 | 5 | $self->_eliminate_useless_symbols; | ||||
87 | 2 | 5 | $self->_eliminate_useless_tags; | ||||
88 | |||||||
89 | 2 | 4 | my ($factor, $continuous); | ||||
90 | 2 | 2 | $factor = $continuous = 1.0; | ||||
91 | 2 | 3 | my $body = ''; | ||||
92 | 2 | 2 | my $score = 0; | ||||
93 | 2 | 7 | my $best = { | ||||
94 | content => "", | ||||||
95 | score => 0, | ||||||
96 | }; | ||||||
97 | 2 | 6 | my @list = split $self->opt->{block_separator}, $self->content; | ||||
98 | 2 | 257 | my $flag = 0; | ||||
99 | 2 | 4 | for my $block (@list) { | ||||
100 | 44 | 124 | $block = strip $block; | ||||
101 | 44 | 100 | 70 | next unless decode $block; | |||
102 | 13 | 100 | 41 | $continuous /= $self->opt->{continuous_factor} if length $body; | |||
103 | |||||||
104 | # ignore link list block | ||||||
105 | 13 | 48 | my $nolink = $self->_eliminate_links($block); | ||||
106 | 13 | 22 | my $nolinklen = length $nolink; | ||||
107 | 13 | 100 | 24 | next if $nolinklen < $self->opt->{min_length}; | |||
108 | |||||||
109 | # score | ||||||
110 | 9 | 41 | my $c = $self->_score($nolink, $factor); | ||||
111 | 9 | 50 | $factor *= $self->opt->{decay_factor}; | ||||
112 | |||||||
113 | # anti-scoring factors | ||||||
114 | 9 | 34 | my $no_body_rate = $self->_no_body_rate($block); | ||||
115 | |||||||
116 | 9 | 18 | $c *= ($self->opt->{no_body_factor} ** $no_body_rate); | ||||
117 | 9 | 45 | my $c1 = $c * $continuous; | ||||
118 | |||||||
119 | # cluster scoring | ||||||
120 | 9 | 100 | 17 | if ($c1 > $self->opt->{threshold}) { | |||
100 | |||||||
121 | 2 | 7 | $flag = 1; | ||||
122 | 2 | 50 | 37 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | |||
123 | if $self->opt->{debug}; | ||||||
124 | 2 | 13 | $body .= $block . "\n"; | ||||
125 | 2 | 3 | $score += $c1; | ||||
126 | 2 | 5 | $continuous = $self->opt->{continuous_factor}; | ||||
127 | } elsif ($c > $self->opt->{threshold}) { | ||||||
128 | 1 | 8 | $flag = 1; | ||||
129 | 1 | 50 | 3 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
130 | 1 | 50 | 11 | if ($score > $best->{score}) { | |||
131 | 1 | 50 | 4 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
132 | 1 | 7 | $best = { | ||||
133 | content => $body, | ||||||
134 | score => $score, | ||||||
135 | }; | ||||||
136 | } | ||||||
137 | 1 | 50 | 5 | print "\n" if $self->opt->{debug}; | |||
138 | 1 | 7 | $body = $block . "\n"; | ||||
139 | 1 | 1 | $score = $c; | ||||
140 | 1 | 3 | $continuous = $self->opt->{continuous_factor}; | ||||
141 | 1 | 50 | 4 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | |||
142 | if $self->opt->{debug}; | ||||||
143 | } else { | ||||||
144 | 6 | 100 | 211 | $factor /= $self->opt->{decay_factor} if !$flag; | |||
145 | 6 | 50 | 18 | print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n", | |||
146 | "<< reject\n" if $self->opt->{debug}; | ||||||
147 | } | ||||||
148 | } | ||||||
149 | 2 | 50 | 6 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
150 | 2 | 100 | 16 | if ($best->{score} < $score) { | |||
151 | 1 | 50 | 2 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
152 | 1 | 7 | $best = { | ||||
153 | content =>$body, | ||||||
154 | score => $score, | ||||||
155 | }; | ||||||
156 | } | ||||||
157 | 2 | 7 | $self->content($best->{content}); | ||||
158 | |||||||
159 | 2 | 17 | return $self; | ||||
160 | } | ||||||
161 | |||||||
162 | sub _score { | ||||||
163 | 9 | 9 | 13 | my ($self, $nolink, $factor) = @_; | |||
164 | 9 | 20 | return ((length $nolink) | ||||
165 | + (match_count $nolink, $self->opt->{punctuations}) | ||||||
166 | * $self->opt->{punctuation_weight}) | ||||||
167 | * $factor; | ||||||
168 | } | ||||||
169 | |||||||
170 | sub _no_body_rate { | ||||||
171 | 9 | 9 | 10 | my ($self, $block) = @_; | |||
172 | 9 | 14 | return (match_count $block,$self->opt->{waste_expressions}) | ||||
173 | + (match_count $block,$self->opt->{affiliate_expressions})/2.0; | ||||||
174 | } | ||||||
175 | |||||||
176 | sub _extract_title { | ||||||
177 | 2 | 2 | 4 | my $self = shift; | |||
178 | 2 | 2 | my $title; | ||||
179 | 2 | 50 | 5 | if ($self->content =~ $self->{pattern}->{title}) { | |||
180 | 2 | 73 | $title = strip (strip_tags $1); | ||||
181 | 2 | 50 | 10 | if (length $title) { | |||
182 | 2 | 5 | my $pat = $self->{pattern}->{headline}; | ||||
183 | 2 | 30 | $self->{content} =~ s/$pat/ | ||||
184 | 4 | 100 | 9 | (index $title, strip(strip_tags($2))) >= 0 ? " $2<\/div>" : "$1"/igse; |
|||
185 | } | ||||||
186 | } | ||||||
187 | } | ||||||
188 | |||||||
189 | sub _eliminate_head { | ||||||
190 | 2 | 2 | 3 | my $self = shift; | |||
191 | 2 | 3 | my $pat = $self->{pattern}->{head}; | ||||
192 | 2 | 24 | $self->{content} =~ s/$pat//is; | ||||
193 | } | ||||||
194 | |||||||
195 | sub _eliminate_useless_symbols { | ||||||
196 | 2 | 2 | 3 | my $self = shift; | |||
197 | 2 | 3 | my $comment = $self->{pattern}->{comment}; | ||||
198 | 2 | 3 | my $special = $self->{pattern}->{special}; | ||||
199 | 2 | 69 | $self->{content} =~ s/$comment//igs; | ||||
200 | 2 | 18 | $self->{content} =~ s/$special//igs; | ||||
201 | } | ||||||
202 | |||||||
203 | sub _eliminate_useless_tags { | ||||||
204 | 2 | 2 | 3 | my $self = shift; | |||
205 | 2 | 2 | my @useless = @{$self->{pattern}->{useless}}; | ||||
2 | 5 | ||||||
206 | 2 | 5 | for my $pat (@useless) { | ||||
207 | 4 | 206 | $self->{content} =~ s/$pat//igs; | ||||
208 | } | ||||||
209 | } | ||||||
210 | |||||||
211 | sub _eliminate_links { | ||||||
212 | 13 | 13 | 16 | my ($self, $block) = @_; | |||
213 | 13 | 33 | my $count = match_count $block, $self->{pattern}->{a}; | ||||
214 | 13 | 32 | my $nolink = to_text (eliminate_forms (eliminate_links $block)); | ||||
215 | 13 | 50 | 46 | return '' if length $nolink < $self->opt->{min_nolink} * $count; | |||
216 | 13 | 50 | 75 | return '' if $self->_is_linklist($block); | |||
217 | 13 | 23 | return $nolink; | ||||
218 | } | ||||||
219 | |||||||
220 | sub _is_linklist { | ||||||
221 | 13 | 13 | 15 | my ($self, $block) = @_; | |||
222 | 13 | 18 | my $listpat = $self->{pattern}->{list}; | ||||
223 | 13 | 50 | 70 | if ($block =~ $listpat) { | |||
224 | 0 | 0 | my $list = $2; | ||||
225 | 0 | 0 | my @fragments = split($listpat, $block, 2); | ||||
226 | 0 | 0 | my $nolist = $list; | ||||
227 | 0 | 0 | $nolist =~ s/$listpat//igs; | ||||
228 | 0 | 0 | $nolist = to_text(join($nolist, @fragments)); | ||||
229 | 0 | 0 | my @listitems = split $self->{pattern}->{li}, $list; | ||||
230 | 0 | 0 | shift @listitems; | ||||
231 | 0 | 0 | my $rate = 0; | ||||
232 | 0 | 0 | for my $li (@listitems) { | ||||
233 | 0 | 0 | 0 | $rate++ if $li =~ $self->{pattern}->{href}; | |||
234 | } | ||||||
235 | 0 | 0 | 0 | $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1; | |||
236 | 0 | 0 | $list = to_text $list; | ||||
237 | 0 | 0 | my $limit = ($self->opt->{nolist_ratio}*$rate) | ||||
238 | * ($rate * (length $list)); | ||||||
239 | 0 | 0 | return length $nolist < $limit; | ||||
240 | } | ||||||
241 | 13 | 24 | return 0; | ||||
242 | } | ||||||
243 | |||||||
244 | 1; | ||||||
245 | __END__ |