line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::JSONLDExtractor; |
2
|
1
|
|
|
1
|
|
9
|
use Moo; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8
|
|
3
|
|
|
|
|
|
|
extends 'NewsExtractor::TXExtractor'; |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
399
|
use Mojo::Transaction::HTTP; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
14
|
|
6
|
1
|
|
|
1
|
|
38
|
use Types::Standard qw( InstanceOf HashRef ArrayRef ); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
20
|
|
7
|
1
|
|
|
1
|
|
882
|
use Mojo::JSON qw(from_json); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
90
|
|
8
|
1
|
|
|
1
|
|
9
|
use Importer 'NewsExtractor::TextUtil' => qw(u remove_control_characters); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
10
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# tx: required, read-only attribute; the value must be an instance of
# Mojo::Transaction::HTTP.
has tx => (
    is       => 'ro',
    isa      => InstanceOf['Mojo::Transaction::HTTP'],
    required => 1,
);
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# schema_ld: hash reference holding the decoded JSON-LD data.
# Not required at construction time; when it is not supplied, it is computed
# on first access by the _build_schema_ld method.
has schema_ld => (
    is       => 'lazy',
    isa      => HashRef,
    builder  => '_build_schema_ld',
    required => 0,
);
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Builder for the lazy "schema_ld" attribute.
#
# Looks up the first <script type="application/ld+json"> element in
# $self->dom and decodes its text as JSON. The result is:
#   - the decoded value itself, when it is a hash reference;
#   - the first element that is a hash reference, when the decoded value
#     is an array reference;
#   - an empty hash reference otherwise (no such element, JSON that cannot
#     be decoded, a scalar payload, or an array with no hash element).
#
# A hash reference is always returned, so the HashRef constraint on the
# attribute holds whatever the page contains.
#
# NOTE(review): "dom" is not defined in this file; presumably it is
# inherited from NewsExtractor::TXExtractor -- confirm.
sub _build_schema_ld {
    my ($self) = @_;

    my $script = $self->dom->at('script[type="application/ld+json"]')
        or return {};

    # from_json dies when the text is not valid JSON. A page may embed
    # broken JSON-LD, so a decode failure is treated like "no metadata"
    # instead of aborting the attribute build.
    local $@;
    my $decoded = eval { from_json( $script->text ) };

    return $decoded if HashRef->check($decoded);

    if ( ArrayRef->check($decoded) ) {
        # Do not hand back element 0 blindly: it may be missing (empty
        # array) or a non-hash value, which the HashRef constraint rejects.
        for my $item (@$decoded) {
            return $item if HashRef->check($item);
        }
    }

    return {};
}
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Returns the author name found in the JSON-LD data, passed through u()
# and then remove_control_characters().
#
# The "author" value is accepted in three shapes:
#   - a hash reference: its "name" entry is used;
#   - an array reference: its first element is used (as a hash or a string);
#   - a plain string: used as the name itself.
# When no usable name is found, undef is handed to u(), exactly as happens
# for a hash-shaped author without a "name" entry.
sub journalist {
    my ($self) = @_;

    # Read "author" into a lexical first. Chaining ->{author}{name} would
    # autovivify an empty "author" hash inside the cached schema_ld data
    # when the key is missing, and would die on string or array values.
    my $author = $self->schema_ld->{author};

    # Several authors listed: report the first one.
    $author = $author->[0] if ArrayRef->check($author);

    my $name = HashRef->check($author) ? $author->{name} : $author;

    # Anything that is still a reference is not a usable name.
    $name = undef if ref $name;

    return remove_control_characters(u($name));
}
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Returns the "headline" entry of the JSON-LD data, passed through u() and
# then remove_control_characters().
sub headline {
    my $self = shift;
    my $ld   = $self->schema_ld;

    return remove_control_characters( u( $ld->{headline} ) );
}
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Returns the "datePublished" entry of the JSON-LD data, passed through u()
# and then remove_control_characters().
sub dateline {
    my $self = shift;
    my $ld   = $self->schema_ld;

    return remove_control_characters( u( $ld->{datePublished} ) );
}
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Returns the article text from the JSON-LD data, passed through u() and
# then remove_control_characters().
#
# The "articleBody" entry is preferred; when it is undefined the
# "description" entry is used; when that is undefined too, the empty string.
sub content_text {
    my $self = shift;
    my $ld   = $self->schema_ld;

    my $body = $ld->{articleBody};
    $body = $ld->{description} unless defined $body;
    $body = ''                 unless defined $body;

    return remove_control_characters( u($body) );
}
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
1; |