File Coverage

blib/lib/NewsExtractor/CSSExtractor.pm
Criterion Covered Total %
statement 14 37 37.8
branch 0 8 0.0
condition n/a
subroutine 5 10 50.0
pod 0 4 0.0
total 19 59 32.2


line stmt bran cond sub pod time code
1             package NewsExtractor::CSSExtractor;
2 1     1   20 use v5.18;
  1         4  
3 1     1   6 use utf8;
  1         3  
  1         26  
4 1     1   32 use Moo;
  1         3  
  1         8  
5             extends 'NewsExtractor::TXExtractor';
6 1     1   472 use Importer 'NewsExtractor::TextUtil' => qw( normalize_whitespace remove_control_characters );
  1         3  
  1         10  
7              
8 1     1   40 use Types::Standard qw( InstanceOf );
  1         2  
  1         20  
9              
10             has css_selector => (
11             required => 1,
12             is => 'ro',
13             isa => InstanceOf['NewsExtractor::CSSRuleSet']
14             );
15              
16             sub _take {
17 0     0     my ($self, $sel) = @_;
18              
19 0           $self->dom->find("$sel style, $sel script")->map('remove');
20 0           my $txt = "". $self->dom->find( $sel )->map('all_text')->join("\n\n");
21 0 0         return undef if $txt eq '';
22              
23 0           $txt = normalize_whitespace(remove_control_characters($txt));
24 0           $txt =~ s/\s+$//;
25 0           $txt =~ s/^\s+//;
26 0           $txt =~ s/\n\n+/\n\n/g;
27 0           return $txt;
28             }
29              
30             sub headline {
31 0     0 0   my ($self) = @_;
32 0 0         my $ret = $self->_take($self->css_selector->headline) or return;
33 0           $ret =~ s/\n/ /g;
34 0           return normalize_whitespace($ret);
35             }
36              
37             sub dateline {
38 0     0 0   my ($self) = @_;
39 0 0         my $ret = $self->_take($self->css_selector->dateline) or return;
40 0           $ret =~ s/\n/ /g;
41 0           return normalize_whitespace($ret);
42             }
43              
44             sub journalist {
45 0     0 0   my ($self) = @_;
46 0 0         my $ret = $self->_take( $self->css_selector->journalist ) or return;
47 0           $ret =~ s/\n/ /g;
48 0           return normalize_whitespace($ret);
49             }
50              
51             sub content_text {
52 0     0 0   my ($self) = @_;
53 0           return $self->_take( $self->css_selector->content_text );
54             }
55              
56             1;