line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::TXExtractor; |
2
|
1
|
|
|
1
|
|
736
|
use Moo; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
8
|
|
3
|
1
|
|
|
1
|
|
401
|
use Types::Standard qw( InstanceOf ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
12
|
|
4
|
1
|
|
|
1
|
|
551
|
use Encode 'find_encoding'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
428
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# Optional HTTP transaction. When supplied, it must be a
# Mojo::Transaction::HTTP; the `dom` builder reads its result from here.
has 'tx' => (
    is       => 'ro',
    isa      => InstanceOf['Mojo::Transaction::HTTP'],
    required => 0,
);
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# Parsed document. May be passed to the constructor directly; otherwise it is
# built on first access by _build_dom.
has 'dom' => (
    is       => 'lazy',
    isa      => InstanceOf['Mojo::DOM'],
    builder  => '_build_dom',
    required => 0,
);
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Builder for the lazy `dom` attribute.
#
# Starts from the DOM that the transaction result provides, then looks for a
# declared charset in this order:
#   1. the Content-Type response header,
#   2. a <meta http-equiv="content-type" content="...; charset=..."> element,
#   3. a <meta charset="..."> element.
# If a charset is found and Encode knows it, the raw response body is decoded
# with that encoding and parsed again; otherwise the initial DOM is returned.
#
# Returns: a Mojo::DOM object.
# Dies:    when the object has no `tx` to build from (both `tx` and `dom` are
#          optional constructor arguments), or when `$tx->result` dies.
sub _build_dom {
    my ($self) = @_;

    my $tx = $self->tx
        or die "Cannot build 'dom': no 'tx' was provided";

    my $res = $tx->result;
    my $dom = $res->dom;

    # Parameter names are case-insensitive, and the value may be quoted or
    # followed by ';' -- capture only the bare charset token so that
    # find_encoding() is not handed something like '"utf-8"' or 'utf-8;'.
    my $charset_re = qr/ charset \s* = \s* ["']? ( [^\s;"']+ ) /xi;

    # The header may be absent; `// ''` avoids matching against undef.
    my ($charset) = ( $res->headers->content_type // '' ) =~ $charset_re;

    if ( !defined $charset ) {
        if ( my $meta = $dom->at('meta[http-equiv="content-type" i]') ) {
            ($charset) = ( $meta->attr('content') // '' ) =~ $charset_re;
        }
    }

    if ( !defined $charset ) {
        if ( my $meta = $dom->at('meta[charset]') ) {
            $charset = $meta->attr('charset');
            $charset =~ s/\A\s+|\s+\z//g if defined $charset;
        }
    }

    if ( defined $charset && length $charset ) {
        # Unknown charset names make find_encoding() return false; in that
        # case the DOM obtained above is kept as-is.
        if ( my $enc = find_encoding($charset) ) {
            $dom = Mojo::DOM->new( $enc->decode( $res->body ) );
        }
    }

    return $dom;
}
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
1; |