| blib/lib/HTML/ExtractContent.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 115 | 131 | 87.7 |
| branch | 31 | 50 | 62.0 |
| condition | n/a | ||
| subroutine | 19 | 19 | 100.0 |
| pod | 4 | 4 | 100.0 |
| total | 169 | 204 | 82.8 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package HTML::ExtractContent; | ||||||
| 2 | 2 | 2 | 59795 | use strict; | |||
| 2 | 4 | ||||||
| 2 | 53 | ||||||
| 3 | 2 | 2 | 10 | use warnings; | |||
| 2 | 3 | ||||||
| 2 | 55 | ||||||
| 4 | 2 | 2 | 1772 | use utf8; | |||
| 2 | 25 | ||||||
| 2 | 10 | ||||||
| 5 | |||||||
| 6 | # core | ||||||
| 7 | 2 | 2 | 68 | use List::Util qw(reduce); | |||
| 2 | 3 | ||||||
| 2 | 215 | ||||||
| 8 | |||||||
| 9 | # cpan | ||||||
| 10 | use Class::Accessor::Lite ( | ||||||
| 11 | 2 | 16 | rw => [qw(opt content)], | ||||
| 12 | 2 | 2 | 1509 | ); | |||
| 2 | 2263 | ||||||
| 13 | |||||||
| 14 | # lib | ||||||
| 15 | 2 | 2 | 1192 | use HTML::ExtractContent::Util; | |||
| 2 | 8 | ||||||
| 2 | 21 | ||||||
| 16 | |||||||
| 17 | our $VERSION = '0.12'; | ||||||
| 18 | |||||||
| 19 | sub new { | ||||||
| 20 | 1 | 1 | 1 | 349 | my ($class, $opt) = @_; | ||
| 21 | 1 | 3 | my $self = bless {}, $class; | ||||
| 22 | $self->{opt} = { | ||||||
| 23 | 1 | 26 | threshold => 60, # threhold for score of clusters | ||||
| 24 | min_length => 30, # minimum length of blocks | ||||||
| 25 | decay_factor => 0.75, # decay factor for block scores | ||||||
| 26 | no_body_factor => 0.72, | ||||||
| 27 | continuous_factor => 1.62, # continuous factor for block scores | ||||||
| 28 | punctuation_weight => 10, # score weight for punctuations | ||||||
| 29 | 2 | 2 | 682 | punctuations => qr/(?:[。、.,!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/is, | |||
| 2 | 23 | ||||||
| 2 | 37 | ||||||
| 30 | waste_expressions => qr/Copyright|All\s*Rights?\s*Reserved?/is, | ||||||
| 31 | # characteristic keywords including footer | ||||||
| 32 | affiliate_expressions => | ||||||
| 33 | qr/amazon[a-z0-9\.\/\-\?&]+-22/is, | ||||||
| 34 | block_separator => qr/<\/?(?:div|center|td)[^>]*>| ]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/is, |
||||||
| 35 | # nocontent => qr/<\/frameset>|]*url/is, | ||||||
| 36 | nocontent => qr/<\/frameset>/is, | ||||||
| 37 | min_nolink => 8, | ||||||
| 38 | nolist_ratio => 0.2, | ||||||
| 39 | debug => 0 | ||||||
| 40 | }; | ||||||
| 41 | $self->{pattern} = { | ||||||
| 42 | 1 | 21 | a => qr/]*>.*?<\/a\s*>/is, | ||||
| 43 | href => qr/ | ||||||
| 44 | list => qr/<(ul|dl|ol)(.+)<\/\1>/is, | ||||||
| 45 | li => qr/(?: |
||||||
| 46 | title => qr/ |
||||||
| 47 | headline => qr/( |
||||||
| 48 | head => qr/]*>.*?<\/head\s*>/is, | ||||||
| 49 | comment => qr{ (?: | ||||||
| 50 | | | ||||||
| 51 | # remove invisible elements | ||||||
| 52 | < ( [\w:.-]+ ) \s [^>]*? style \s* = [^>]*? \b | ||||||
| 53 | (?: display \s* : \s* none | visibility \s* : \s* hidden ) | ||||||
| 54 | \b [^>]* > .*? \1 \s* > | ||||||
| 55 | ) }xis, | ||||||
| 56 | special => qr//is, | ||||||
| 57 | useless => [ | ||||||
| 58 | qr/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/is, | ||||||
| 59 | qr/ ]*(?:id|class)\s*=\s*['"]?\S*(?:more|menu|side|navi)\S*["']?[^>]*>/is, |
||||||
| 60 | ], | ||||||
| 61 | }; | ||||||
| 62 | 1 | 4 | return bless $self, $class; | ||||
| 63 | } | ||||||
| 64 | |||||||
| 65 | sub as_text { | ||||||
| 66 | 2 | 2 | 1 | 930 | my $self = shift; | ||
| 67 | 2 | 7 | return to_text $self->content; | ||||
| 68 | } | ||||||
| 69 | |||||||
| 70 | sub as_html { | ||||||
| 71 | 2 | 2 | 1 | 9 | my $self = shift; | ||
| 72 | 2 | 6 | return $self->content; | ||||
| 73 | } | ||||||
| 74 | |||||||
| 75 | sub extract { | ||||||
| 76 | 2 | 2 | 1 | 870 | my $self = shift;; | ||
| 77 | 2 | 11 | $self->content(shift); | ||||
| 78 | 2 | 50 | 21 | if ($self->content =~ $self->opt->{nocontent}) { | |||
| 79 | # frameset or redirect | ||||||
| 80 | 0 | 0 | $self->content(''); | ||||
| 81 | 0 | 0 | return $self; | ||||
| 82 | } | ||||||
| 83 | 2 | 53 | $self->_extract_title; | ||||
| 84 | 2 | 9 | $self->_eliminate_head; | ||||
| 85 | |||||||
| 86 | 2 | 8 | $self->_eliminate_useless_symbols; | ||||
| 87 | 2 | 6 | $self->_eliminate_useless_tags; | ||||
| 88 | |||||||
| 89 | 2 | 3 | my ($factor, $continuous); | ||||
| 90 | 2 | 4 | $factor = $continuous = 1.0; | ||||
| 91 | 2 | 3 | my $body = ''; | ||||
| 92 | 2 | 4 | my $score = 0; | ||||
| 93 | 2 | 7 | my $best = { | ||||
| 94 | content => "", | ||||||
| 95 | score => 0, | ||||||
| 96 | }; | ||||||
| 97 | 2 | 9 | my @list = split $self->opt->{block_separator}, $self->content; | ||||
| 98 | 2 | 310 | my $flag = 0; | ||||
| 99 | 2 | 5 | for my $block (@list) { | ||||
| 100 | 44 | 183 | $block = strip $block; | ||||
| 101 | 44 | 100 | 113 | next unless decode $block; | |||
| 102 | 13 | 100 | 54 | $continuous /= $self->opt->{continuous_factor} if length $body; | |||
| 103 | |||||||
| 104 | # ignore link list block | ||||||
| 105 | 13 | 59 | my $nolink = $self->_eliminate_links($block); | ||||
| 106 | 13 | 18 | my $nolinklen = length $nolink; | ||||
| 107 | 13 | 100 | 40 | next if $nolinklen < $self->opt->{min_length}; | |||
| 108 | |||||||
| 109 | # score | ||||||
| 110 | 9 | 60 | my $c = $self->_score($nolink, $factor); | ||||
| 111 | 9 | 65 | $factor *= $self->opt->{decay_factor}; | ||||
| 112 | |||||||
| 113 | # anti-scoring factors | ||||||
| 114 | 9 | 45 | my $no_body_rate = $self->_no_body_rate($block); | ||||
| 115 | |||||||
| 116 | 9 | 28 | $c *= ($self->opt->{no_body_factor} ** $no_body_rate); | ||||
| 117 | 9 | 58 | my $c1 = $c * $continuous; | ||||
| 118 | |||||||
| 119 | # cluster scoring | ||||||
| 120 | 9 | 100 | 25 | if ($c1 > $self->opt->{threshold}) { | |||
| 100 | |||||||
| 121 | 2 | 11 | $flag = 1; | ||||
| 122 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | ||||||
| 123 | 2 | 50 | 7 | if $self->opt->{debug}; | |||
| 124 | 2 | 15 | $body .= $block . "\n"; | ||||
| 125 | 2 | 4 | $score += $c1; | ||||
| 126 | 2 | 5 | $continuous = $self->opt->{continuous_factor}; | ||||
| 127 | } elsif ($c > $self->opt->{threshold}) { | ||||||
| 128 | 1 | 9 | $flag = 1; | ||||
| 129 | 1 | 50 | 5 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
| 130 | 1 | 50 | 11 | if ($score > $best->{score}) { | |||
| 131 | 1 | 50 | 5 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
| 132 | 1 | 9 | $best = { | ||||
| 133 | content => $body, | ||||||
| 134 | score => $score, | ||||||
| 135 | }; | ||||||
| 136 | } | ||||||
| 137 | 1 | 50 | 5 | print "\n" if $self->opt->{debug}; | |||
| 138 | 1 | 7 | $body = $block . "\n"; | ||||
| 139 | 1 | 2 | $score = $c; | ||||
| 140 | 1 | 4 | $continuous = $self->opt->{continuous_factor}; | ||||
| 141 | print "\n---- continue $c*$continuous=$c1 $nolinklen\n\n$block\n" | ||||||
| 142 | 1 | 50 | 11 | if $self->opt->{debug}; | |||
| 143 | } else { | ||||||
| 144 | 6 | 100 | 76 | $factor /= $self->opt->{decay_factor} if !$flag; | |||
| 145 | print "\n>> reject $c*$continuous=$c1 $nolinklen\n$block\n", | ||||||
| 146 | 6 | 50 | 28 | "<< reject\n" if $self->opt->{debug}; | |||
| 147 | } | ||||||
| 148 | } | ||||||
| 149 | 2 | 50 | 9 | print "\n---- end of cluster: $score\n" if $self->opt->{debug}; | |||
| 150 | 2 | 100 | 18 | if ($best->{score} < $score) { | |||
| 151 | 1 | 50 | 4 | print "!!!! best: score=$score\n" if $self->opt->{debug}; | |||
| 152 | 1 | 9 | $best = { | ||||
| 153 | content =>$body, | ||||||
| 154 | score => $score, | ||||||
| 155 | }; | ||||||
| 156 | } | ||||||
| 157 | 2 | 9 | $self->content($best->{content}); | ||||
| 158 | |||||||
| 159 | 2 | 23 | return $self; | ||||
| 160 | } | ||||||
| 161 | |||||||
| 162 | sub _score { | ||||||
| 163 | 9 | 9 | 18 | my ($self, $nolink, $factor) = @_; | |||
| 164 | return ((length $nolink) | ||||||
| 165 | + (match_count $nolink, $self->opt->{punctuations}) | ||||||
| 166 | * $self->opt->{punctuation_weight}) | ||||||
| 167 | 9 | 23 | * $factor; | ||||
| 168 | } | ||||||
| 169 | |||||||
| 170 | sub _no_body_rate { | ||||||
| 171 | 9 | 9 | 12 | my ($self, $block) = @_; | |||
| 172 | return (match_count $block,$self->opt->{waste_expressions}) | ||||||
| 173 | 9 | 24 | + (match_count $block,$self->opt->{affiliate_expressions})/2.0; | ||||
| 174 | } | ||||||
| 175 | |||||||
| 176 | sub _extract_title { | ||||||
| 177 | 2 | 2 | 5 | my $self = shift; | |||
| 178 | 2 | 3 | my $title; | ||||
| 179 | 2 | 50 | 6 | if ($self->content =~ $self->{pattern}->{title}) { | |||
| 180 | 2 | 49 | $title = strip (strip_tags $1); | ||||
| 181 | 2 | 50 | 10 | if (length $title) { | |||
| 182 | 2 | 6 | my $pat = $self->{pattern}->{headline}; | ||||
| 183 | 2 | 39 | $self->{content} =~ s/$pat/ | ||||
| 184 | 4 | 100 | 19 | (index $title, strip(strip_tags($2))) >= 0 ? " $2<\/div>" : "$1"/igse; |
|||
| 185 | } | ||||||
| 186 | } | ||||||
| 187 | } | ||||||
| 188 | |||||||
| 189 | sub _eliminate_head { | ||||||
| 190 | 2 | 2 | 3 | my $self = shift; | |||
| 191 | 2 | 5 | my $pat = $self->{pattern}->{head}; | ||||
| 192 | 2 | 24 | $self->{content} =~ s/$pat//is; | ||||
| 193 | } | ||||||
| 194 | |||||||
| 195 | sub _eliminate_useless_symbols { | ||||||
| 196 | 2 | 2 | 3 | my $self = shift; | |||
| 197 | 2 | 5 | my $comment = $self->{pattern}->{comment}; | ||||
| 198 | 2 | 3 | my $special = $self->{pattern}->{special}; | ||||
| 199 | 2 | 90 | $self->{content} =~ s/$comment//igs; | ||||
| 200 | 2 | 29 | $self->{content} =~ s/$special//igs; | ||||
| 201 | } | ||||||
| 202 | |||||||
| 203 | sub _eliminate_useless_tags { | ||||||
| 204 | 2 | 2 | 4 | my $self = shift; | |||
| 205 | 2 | 9 | my @useless = @{$self->{pattern}->{useless}}; | ||||
| 2 | 7 | ||||||
| 206 | 2 | 5 | for my $pat (@useless) { | ||||
| 207 | 4 | 145 | $self->{content} =~ s/$pat//igs; | ||||
| 208 | } | ||||||
| 209 | } | ||||||
| 210 | |||||||
| 211 | sub _eliminate_links { | ||||||
| 212 | 13 | 13 | 22 | my ($self, $block) = @_; | |||
| 213 | 13 | 43 | my $count = match_count $block, $self->{pattern}->{a}; | ||||
| 214 | 13 | 36 | my $nolink = to_text (eliminate_forms (eliminate_links $block)); | ||||
| 215 | 13 | 50 | 49 | return '' if length $nolink < $self->opt->{min_nolink} * $count; | |||
| 216 | 13 | 50 | 97 | return '' if $self->_is_linklist($block); | |||
| 217 | 13 | 33 | return $nolink; | ||||
| 218 | } | ||||||
| 219 | |||||||
| 220 | sub _is_linklist { | ||||||
| 221 | 13 | 13 | 24 | my ($self, $block) = @_; | |||
| 222 | 13 | 20 | my $listpat = $self->{pattern}->{list}; | ||||
| 223 | 13 | 50 | 82 | if ($block =~ $listpat) { | |||
| 224 | 0 | 0 | my $list = $2; | ||||
| 225 | 0 | 0 | my @fragments = split($listpat, $block, 2); | ||||
| 226 | 0 | 0 | my $nolist = $list; | ||||
| 227 | 0 | 0 | $nolist =~ s/$listpat//igs; | ||||
| 228 | 0 | 0 | $nolist = to_text(join($nolist, @fragments)); | ||||
| 229 | 0 | 0 | my @listitems = split $self->{pattern}->{li}, $list; | ||||
| 230 | 0 | 0 | shift @listitems; | ||||
| 231 | 0 | 0 | my $rate = 0; | ||||
| 232 | 0 | 0 | for my $li (@listitems) { | ||||
| 233 | 0 | 0 | 0 | $rate++ if $li =~ $self->{pattern}->{href}; | |||
| 234 | } | ||||||
| 235 | 0 | 0 | 0 | $rate = 1.0 * $rate / ($#listitems+1) if $#listitems+1; | |||
| 236 | 0 | 0 | $list = to_text $list; | ||||
| 237 | 0 | 0 | my $limit = ($self->opt->{nolist_ratio}*$rate) | ||||
| 238 | * ($rate * (length $list)); | ||||||
| 239 | 0 | 0 | return length $nolist < $limit; | ||||
| 240 | } | ||||||
| 241 | 13 | 39 | return 0; | ||||
| 242 | } | ||||||
| 243 | |||||||
| 244 | 1; | ||||||
| 245 | __END__ |