line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::Role::ContentTextExtractor; |
2
|
1
|
|
|
1
|
|
6990
|
use utf8; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
10
|
|
3
|
1
|
|
|
1
|
|
39
|
use Moo::Role; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8
|
|
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
457
|
use Types::Standard qw(Str Maybe); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
13
|
|
6
|
1
|
|
|
1
|
|
1150
|
use List::Util qw(max); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
87
|
|
7
|
1
|
|
|
1
|
|
6
|
use HTML::ExtractContent; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
29
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
7
|
use Importer 'NewsExtractor::TextUtil' => qw( html2text ); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
10
|
|
10
|
1
|
|
|
1
|
|
39
|
use Importer 'NewsExtractor::Constants' => qw( %RE ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
6
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Site name of the page being processed. Declared lazy, so the value is
# produced on first access by _build_site_name; it is either a string or undef.
has site_name => ( is => "lazy", isa => Maybe[Str] );

# Plain-text body of the article. Declared lazy, so the value is produced on
# first access by _build_content_text; it is either a string or undef.
has content_text => ( is => "lazy", isa => Maybe[Str] );
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Builder for the lazy `site_name` attribute.
#
# Asks $self->dom for the element matching meta[property='og:site_name'] and
# returns that element's `content` attribute. Returns undef when the lookup
# yields nothing.
sub _build_site_name {
    my $self = shift;

    my $meta = $self->dom->at("meta[property='og:site_name']");

    # Guard clause: no matching element means there is no site name to report.
    return undef unless $meta;

    return $meta->attr('content');
}
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Builder for the lazy `content_text` attribute.
#
# Steps, in order:
#   1. Remove a fixed list of noisy elements from $self->dom (this mutates
#      the DOM object held by $self).
#   2. Run HTML::ExtractContent over the <article> element if the DOM has
#      one, otherwise over the whole serialized DOM.
#   3. Convert the extracted HTML to text with html2text() and split it into
#      paragraphs on blank lines.
#   4. Strip the site name and known newspaper names from the last paragraph.
#
# Returns the paragraphs joined by "\n\n", or undef when there are no
# paragraphs or when the longest paragraph is shorter than 30 characters.
sub _build_content_text {
    my ($self) = @_;

    # Cleanup some noisy elements that are known to interfere.
    $self->dom->find('script, style, p.appE1121, div.sexmask, div.cat-list, div#marquee, #setting_weather')->map('remove');

    my $extractor = HTML::ExtractContent->new;

    # Prefer the <article> element when present; stringify it so the
    # extractor receives markup rather than a DOM object.
    my $article = $self->dom->at('article');
    my $source  = $article ? "$article" : $self->dom->to_string;
    my $html    = $extractor->extract($source)->as_html;

    my $text = html2text($html);

    # A list assignment evaluated in scalar context yields the number of
    # elements assigned, so this bails out when split() produces nothing.
    my @paragraphs = split(/\n\n/, $text) or return undef;

    if (my $site_name = $self->site_name) {
        # The site name is page-supplied data, not a pattern. \Q...\E keeps
        # regex metacharacters in it literal, and also stops /x from
        # discarding any whitespace the name contains.
        $paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* \Q$site_name\E \s* \p{Punct}? \s* \z//x;
        $paragraphs[-1] =~ s/\Q$site_name\E//;
    }

    # Blank the last paragraph when it is only a known newspaper name,
    # optionally wrapped in a single punctuation mark on either side.
    $paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* $RE{newspaper_names} \s* \p{Punct}? \s* \z//x;

    # Not enough contents: even the longest paragraph is under 30 characters.
    if (max( map { length($_) } @paragraphs ) < 30) {
        return undef;
    }

    return join "\n\n", @paragraphs;
}
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
1; |