blib/lib/HTML/ExtractContent.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 115 | 131 | 87.7 |
branch | 31 | 50 | 62.0 |
condition | n/a | ||
subroutine | 19 | 19 | 100.0 |
pod | 4 | 4 | 100.0 |
total | 169 | 204 | 82.8 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::ExtractContent; | ||||||
2 | 2 | 2 | 879 | use strict; | |||
2 | 4 | ||||||
2 | 79 | ||||||
3 | 2 | 2 | 10 | use warnings; | |||
2 | 4 | ||||||
2 | 68 | ||||||
4 | 2 | 2 | 1065 | use HTML::ExtractContent::Util; | |||
2 | 31 | ||||||
2 | 18 | ||||||
5 | 2 | 2 | 433 | use List::Util qw(reduce); | |||
2 | 5 | ||||||
2 | 302 | ||||||
6 | 2 | 2 | 11 | use utf8; | |||
2 | 4 | ||||||
2 | 16 | ||||||
7 | 2 | 2 | 61 | use base qw(Class::Accessor::Lvalue::Fast); | |||
2 | 4 | ||||||
2 | 2092 | ||||||
8 | our $VERSION = '0.10'; | ||||||
9 | __PACKAGE__->mk_accessors(qw(opt content)); | ||||||
10 | |||||||
11 | sub new { | ||||||
12 | 1 | 1 | 1 | 636 | my ($class, $opt) = @_; | ||
13 | 1 | 18 | my $self = $class->SUPER::new($opt); | ||||
14 | 1 | 56 | $self->opt = { | ||||
15 | threshold => 60, # threhold for score of clusters | ||||||
16 | min_length => 30, # minimum length of blocks | ||||||
17 | decay_factor => 0.75, # decay factor for block scores | ||||||
18 | no_body_factor => 0.72, | ||||||
19 | continuous_factor => 1.62, # continuous factor for block scores | ||||||
20 | punctuation_weight => 10, # score weight for punctuations | ||||||
21 | 2 | 2 | 23003 | punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is, | |||
2 | 6 | ||||||
2 | 51 | ||||||
22 | waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is, | ||||||
23 | # characteristic keywords including footer | ||||||
24 | affiliate_expressions => | ||||||
25 | qr/amazon[a-z0-9\.\/\-\?&]+-22/is, | ||||||
26 | block_separator => qr/<\/?(?:div|center|td)[^>]*>| ]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is, |
||||||
27 | # nocontent => qr/<\/frameset>|]*url/is, | ||||||
28 | nocontent => qr/<\/frameset>/is, | ||||||
29 | min_nolink => 8, | ||||||
30 | nolist_ratio => 0.2, | ||||||
31 | debug => 0 | ||||||
32 | }; | ||||||
33 | 1 | 46 | $self->{pattern} = { | ||||
34 | a => qr/]*>.*?<\/a\s*>/is, | ||||||
35 | href => qr/ | ||||||
36 | list => qr/<(ul|dl|ol)(.+)<\/\1>/is, | ||||||
37 | li => qr/(?: |
||||||
38 | title => qr/ |
||||||
39 | headline => qr/( |
||||||
40 | head => qr/]*>.*?<\/head\s*>/is, | ||||||
41 | comment => qr/(?:|<([^>\s]+)[^>]*\s+style=['"]?[^>'"]*(?:display:\s*none|visibility:\s*hidden)[^>'"]*['"]?[^>]*>.*?<\/\1\s*>)/is, | ||||||
42 | special => qr//is, | ||||||
43 | useless => [ | ||||||
44 | qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is, | ||||||
45 | qr/ ]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is, |
||||||
46 | ], | ||||||
47 | }; | ||||||
48 | 1 | 4 | return bless $self, $class; | ||||
49 | } | ||||||
50 | |||||||
51 | sub as_text { | ||||||
52 | 2 | 2 | 1 | 1883 | my $self = shift; | ||
53 | 2 | 10 | return to_text $self->content; | ||||
54 | } | ||||||
55 | |||||||
56 | sub as_html { | ||||||
57 | 2 | 2 | 1 | 10 | my $self = shift; | ||
58 | 2 | 6 | return $self->content; | ||||
59 | } | ||||||
60 | |||||||
61 | sub extract { | ||||||
62 | 2 | 2 | 1 | 15 | my $self = shift;; | ||
63 | 2 | 13 | $self->content = shift; | ||||
64 | 2 | 50 | 26 | if ($self->content =~ $self->opt->{nocontent}) { | |||
65 | # frameset or redirect | ||||||
66 | 0 | 0 | $self->content = ''; | ||||
67 | 0 | 0 | return $self; | ||||
68 | } | ||||||
69 | 2 | 46 | $self->_extract_title; | ||||
70 | 2 | 10 | $self->_eliminate_head; | ||||
71 | |||||||
72 | 2 | 42 | $self->_eliminate_useless_symbols; | ||||
73 | 2 | 48 | $self->_eliminate_useless_tags; | ||||
74 | |||||||
75 | 2 | 126 | my ($factor, $continuous); | ||||
76 | 2 | 4 | $factor = $continuous = 1.0; | ||||
77 | 2 | 3 | my $body = ''; | ||||
78 | 2 | 5 | my $score = 0; | ||||
79 | 2 | 8 | my $best = { | ||||
80 | content => "", | ||||||
81 | score => 0, | ||||||
82 | }; | ||||||
83 | 2 | 9 | my @list = split $self->opt->{block_separator}, $self->content; | ||||
84 | 2 | 329 | my $flag = 0; | ||||
85 | 2 | 53 | for my $block (@list) { | ||||
86 | 44 | 171 | $block = strip $block; | ||||
87 | 44 | 100 | 110 | next unless decode $block; | |||
88 | 13 | 100 | 57 | $continuous /= $self->opt->{continuous_factor} if length $body; | |||
89 | |||||||
90 | # ignore link list block | ||||||
91 | 13 | 62 | my $nolink = $self->_eliminate_links($block); | ||||
92 | 13 | 21 | my $nolinklen = length $nolink; | ||||
93 | 13 | 100 | 36 | next if $nolinklen < $self->opt->{min_length}; | |||
94 | |||||||
95 | # score | ||||||
96 | 9 | 65 | my $c = $self->_score($nolink, $factor); | ||||
97 | 9 | 66 | $factor *= $self->opt->{decay_factor}; | ||||
98 | |||||||
99 | # anti-scoring factors | ||||||
100 | 9 | 47 | my $no_body_rate = $self->_no_body_rate($block); | ||||
101 | |||||||
102 | 9 | 23 | $c *= ($self->opt->{no_body_factor} ** $no_body_rate); | ||||
103 | 9 | 62 | my $c1 = $c * $continuous; | ||||
104 | |||||||
105 | # cluster scoring | ||||||
106 | 9 | 100 | 24 | if ($c1 > $self->opt->{threshold}) { | |||
100 | |||||||
107 | 2 | 11 | $flag = 1; | ||||
108 | 2 | 50 | 7 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | |||
109 | if $self->opt->{debug}; | ||||||
110 | 2 | 16 | $body .= $block . "\n"; | ||||
111 | 2 | 3 | $score += $c1; | ||||
112 | 2 | 6 | $continuous = $self->opt->{continuous_factor}; | ||||
113 | } elsif ($c > $self->opt->{threshold}) { | ||||||
114 | 1 | 11 | $flag = 1; | ||||
115 | 1 | 50 | 5 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
116 | 1 | 50 | 11 | if ($score > $best->{score}) { | |||
117 | 1 | 50 | 3 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
118 | 1 | 10 | $best = { | ||||
119 | content => $body, | ||||||
120 | score => $score, | ||||||
121 | }; | ||||||
122 | } | ||||||
123 | 1 | 50 | 6 | print "\n" if $self->opt->{debug}; | |||
124 | 1 | 9 | $body = $block . "\n"; | ||||
125 | 1 | 3 | $score = $c; | ||||
126 | 1 | 5 | $continuous = $self->opt->{continuous_factor}; | ||||
127 | 1 | 50 | 8 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | |||
128 | if $self->opt->{debug}; | ||||||
129 | } else { | ||||||
130 | 6 | 100 | 73 | $factor /= $self->opt->{decay_factor} if !$flag; | |||
131 | 6 | 50 | 31 | print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n", | |||
132 | "<< reject\n" if $self->opt->{debug}; | ||||||
133 | } | ||||||
134 | } | ||||||
135 | 2 | 50 | 8 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
136 | 2 | 100 | 19 | if ($best->{score} < $score) { | |||
137 | 1 | 50 | 4 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
138 | 1 | 8 | $best = { | ||||
139 | content =>$body, | ||||||
140 | score => $score, | ||||||
141 | }; | ||||||
142 | } | ||||||
143 | 2 | 12 | $self->content = $best->{content}; | ||||
144 | |||||||
145 | 2 | 21 | return $self; | ||||
146 | } | ||||||
147 | |||||||
148 | sub _score { | ||||||
149 | 9 | 9 | 17 | my ($self, $nolink, $factor) = @_; | |||
150 | 9 | 24 | return ((length $nolink) | ||||
151 | + (match_count $nolink, $self->opt->{punctuations}) | ||||||
152 | * $self->opt->{punctuation_weight}) | ||||||
153 | * $factor; | ||||||
154 | } | ||||||
155 | |||||||
156 | sub _no_body_rate { | ||||||
157 | 9 | 9 | 17 | my ($self, $block) = @_; | |||
158 | 9 | 3768 | return (match_count $block,$self->opt->{waste_expressions}) | ||||
159 | + (match_count $block,$self->opt->{affiliate_expressions})/2.0; | ||||||
160 | } | ||||||
161 | |||||||
162 | sub _extract_title { | ||||||
163 | 2 | 2 | 5 | my $self = shift; | |||
164 | 2 | 2 | my $title; | ||||
165 | 2 | 50 | 7 | if ($self->content =~ $self->{pattern}->{title}) { | |||
166 | 2 | 41 | $title = strip (strip_tags $1); | ||||
167 | 2 | 50 | 8 | if (length $title) { | |||
168 | 2 | 5 | my $pat = $self->{pattern}->{headline}; | ||||
169 | 2 | 10 | $self->content =~ s/$pat/ | ||||
170 | 4 | 100 | 48 | (index $title, strip(strip_tags($2))) >= 0 ? " $2<\/div>" : "$1"/igse; |
|||
171 | } | ||||||
172 | } | ||||||
173 | } | ||||||
174 | |||||||
175 | sub _eliminate_head { | ||||||
176 | 2 | 2 | 4 | my $self = shift; | |||
177 | 2 | 6 | my $pat = $self->{pattern}->{head}; | ||||
178 | 2 | 7 | $self->content =~ s/$pat//is; | ||||
179 | } | ||||||
180 | |||||||
181 | sub _eliminate_useless_symbols { | ||||||
182 | 2 | 2 | 4 | my $self = shift; | |||
183 | 2 | 7 | my $comment = $self->{pattern}->{comment}; | ||||
184 | 2 | 4 | my $special = $self->{pattern}->{special}; | ||||
185 | 2 | 8 | $self->content =~ s/$comment//igs; | ||||
186 | 2 | 477 | $self->content =~ s/$special//igs; | ||||
187 | } | ||||||
188 | |||||||
189 | sub _eliminate_useless_tags { | ||||||
190 | 2 | 2 | 4 | my $self = shift; | |||
191 | 2 | 3 | my @useless = @{$self->{pattern}->{useless}}; | ||||
2 | 9 | ||||||
192 | 2 | 5 | for my $pat (@useless) { | ||||
193 | 4 | 46 | $self->content =~ s/$pat//igs; | ||||
194 | } | ||||||
195 | } | ||||||
196 | |||||||
197 | sub _eliminate_links { | ||||||
198 | 13 | 13 | 19 | my ($self, $block) = @_; | |||
199 | 13 | 43 | my $count = match_count $block, $self->{pattern}->{a}; | ||||
200 | 13 | 37 | my $nolink = to_text (eliminate_forms (eliminate_links $block)); | ||||
201 | 13 | 50 | 54 | return '' if length $nolink < $self->opt->{min_nolink} * $count; | |||
202 | 13 | 50 | 104 | return '' if $self->_is_linklist($block); | |||
203 | 13 | 31 | return $nolink; | ||||
204 | } | ||||||
205 | |||||||
206 | sub _is_linklist { | ||||||
207 | 13 | 13 | 18 | my ($self, $block) = @_; | |||
208 | 13 | 29 | my $listpat = $self->{pattern}->{list}; | ||||
209 | 13 | 50 | 83 | if ($block =~ $listpat) { | |||
210 | 0 | 0 | my $list = $2; | ||||
211 | 0 | 0 | my @fragments = split($listpat, $block, 2); | ||||
212 | 0 | 0 | my $nolist = $list; | ||||
213 | 0 | 0 | $nolist =~ s/$listpat//igs; | ||||
214 | 0 | 0 | $nolist = to_text(join($nolist, @fragments)); | ||||
215 | 0 | 0 | my @listitems = split $self->{pattern}->{li}, $list; | ||||
216 | 0 | 0 | shift @listitems; | ||||
217 | 0 | 0 | my $rate = 0; | ||||
218 | 0 | 0 | for my $li (@listitems) { | ||||
219 | 0 | 0 | 0 | $rate++ if $li =~ $self->{pattern}->{href}; | |||
220 | } | ||||||
221 | 0 | 0 | 0 | $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1; | |||
222 | 0 | 0 | $list = to_text $list; | ||||
223 | 0 | 0 | my $limit = ($self->opt->{nolist_ratio}*$rate) | ||||
224 | * ($rate * (length $list)); | ||||||
225 | 0 | 0 | return length $nolist < $limit; | ||||
226 | } | ||||||
227 | 13 | 42 | return 0; | ||||
228 | } | ||||||
229 | |||||||
230 | 1; | ||||||
231 | __END__ |