line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
use utf8; |
2
|
1
|
|
|
1
|
|
3903
|
use Moo::Role; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
3
|
1
|
|
|
1
|
|
28
|
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
5
|
|
4
|
|
|
|
|
|
|
use Types::Standard qw(Str Maybe); |
5
|
1
|
|
|
1
|
|
321
|
use List::Util qw(max); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
6
|
1
|
|
|
1
|
|
808
|
use HTML::ExtractContent; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
50
|
|
7
|
1
|
|
|
1
|
|
4
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
23
|
|
8
|
|
|
|
|
|
|
use Importer 'NewsExtractor::TextUtil' => qw( html2text ); |
9
|
1
|
|
|
1
|
|
4
|
use Importer 'NewsExtractor::Constants' => qw( %RE ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
10
|
1
|
|
|
1
|
|
28
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
3
|
|
11
|
|
|
|
|
|
|
has site_name => ( |
12
|
|
|
|
|
|
|
is => "lazy", |
13
|
|
|
|
|
|
|
isa => Maybe[Str], |
14
|
|
|
|
|
|
|
); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
has content_text => ( |
17
|
|
|
|
|
|
|
is => "lazy", |
18
|
|
|
|
|
|
|
isa => Maybe[Str], |
19
|
|
|
|
|
|
|
); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
my ($self) = @_; |
22
|
|
|
|
|
|
|
|
23
|
0
|
|
|
0
|
|
|
my $el = $self->dom->at("meta[property='og:site_name']"); |
24
|
|
|
|
|
|
|
if ($el) { |
25
|
0
|
|
|
|
|
|
return $el->attr('content'); |
26
|
0
|
0
|
|
|
|
|
} |
27
|
0
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
return undef; |
29
|
|
|
|
|
|
|
} |
30
|
0
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my ($self) = @_; |
32
|
|
|
|
|
|
|
my ($el, $html); |
33
|
|
|
|
|
|
|
|
34
|
0
|
|
|
0
|
|
|
# Cleanup some noisy elements that are known to interfere. |
35
|
0
|
|
|
|
|
|
$self->dom->find('script, style, p.appE1121, div.sexmask, div.cat-list, div#marquee, #setting_weather')->map('remove'); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
my $extractor = HTML::ExtractContent->new; |
38
|
0
|
|
|
|
|
|
if ($el = $self->dom->at('article')) { |
39
|
|
|
|
|
|
|
$html = $extractor->extract("$el")->as_html; |
40
|
0
|
|
|
|
|
|
} else { |
41
|
0
|
0
|
|
|
|
|
$html = $extractor->extract( $self->dom->to_string )->as_html; |
42
|
0
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
0
|
|
|
|
|
|
my $text = html2text( $html ); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
my @paragraphs = split(/\n\n/, $text) or return undef; |
47
|
0
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
if (my $site_name = $self->site_name) { |
49
|
0
|
0
|
|
|
|
|
$paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* ${site_name} \s* \p{Punct}? \s* \z//x; |
50
|
|
|
|
|
|
|
$paragraphs[-1] =~ s/${site_name}//x; |
51
|
0
|
0
|
|
|
|
|
} |
52
|
0
|
|
|
|
|
|
|
53
|
0
|
|
|
|
|
|
$paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* $RE{newspaper_names} \s* \p{Punct}? \s* \z//x; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
if (max( map { length($_) } @paragraphs ) < 30) { |
56
|
0
|
|
|
|
|
|
# err "[$$] Not enough contents"; |
57
|
|
|
|
|
|
|
return undef; |
58
|
0
|
0
|
|
|
|
|
} |
|
0
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
0
|
|
|
|
|
|
return join "\n\n", @paragraphs; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
0
|
|
|
|
|
|
1; |