File Coverage

blib/lib/NewsExtractor/Role/ContentTextExtractor.pm

Criterion	Covered	Total	%
statement	21	43	48.8
branch	0	10	0.0
condition			n/a
subroutine	7	9	77.7
pod			n/a
total	28	62	45.1

line	stmt	bran	sub	time	code
1					package NewsExtractor::Role::ContentTextExtractor;
2	1		1	5985	use utf8;
	1			4
	1			10
3	1		1	39	use Moo::Role;
	1			3
	1			8
4
5	1		1	452	use Types::Standard qw(Str Maybe);
	1			2
	1			16
6	1		1	1132	use List::Util qw(max);
	1			2
	1			92
7	1		1	7	use HTML::ExtractContent;
	1			2
	1			30
8
9	1		1	6	use Importer 'NewsExtractor::TextUtil' => qw( html2text );
	1			3
	1			10
10	1		1	35	use Importer 'NewsExtractor::Constants' => qw( %RE );
	1			3
	1			4
11
12					has site_name => (
13					is => "lazy",
14					isa => Maybe[Str],
15					);
16
17					has content_text => (
18					is => "lazy",
19					isa => Maybe[Str],
20					);
21
22					sub _build_site_name {
23	0		0		my ($self) = @_;
24
25	0				my $el = $self->dom->at("meta[property='og:site_name']");
26	0	0			if ($el) {
27	0				return $el->attr('content');
28					}
29
30	0				return undef;
31					}
32
33					sub _build_content_text {
34	0		0		my ($self) = @_;
35	0				my ($el, $html);
36
37					# Cleanup some noisy elements that are known to interfere.
38	0				$self->dom->find('script, style, p.appE1121, div.sexmask, div.cat-list, div#marquee, #setting_weather')->map('remove');
39
40	0				my $extractor = HTML::ExtractContent->new;
41	0	0			if ($el = $self->dom->at('article')) {
42	0				$html = $extractor->extract("$el")->as_html;
43					} else {
44	0				$html = $extractor->extract( $self->dom->to_string )->as_html;
45					}
46
47	0				my $text = html2text( $html );
48
49	0	0			my @paragraphs = split(/\n\n/, $text) or return undef;
50
51	0	0			if (my $site_name = $self->site_name) {
52	0				$paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* ${site_name} \s* \p{Punct}? \s* \z//x;
53	0				$paragraphs[-1] =~ s/${site_name}//x;
54					}
55
56	0				$paragraphs[-1] =~ s/\A \s* \p{Punct}? \s* $RE{newspaper_names} \s* \p{Punct}? \s* \z//x;
57
58	0	0			if (max( map { length($_) } @paragraphs ) < 30) {
	0
59					# err "[$$] Not enough contents";
60	0				return undef;
61					}
62
63	0				return join "\n\n", @paragraphs;
64					}
65
66					1;