line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::SiteSpecificExtractor::yimedia_com_tw; |
2
|
1
|
|
|
1
|
|
8
|
use utf8; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
6
|
|
3
|
1
|
|
|
1
|
|
34
|
use Moo; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
4
|
|
|
|
|
|
|
extends 'NewsExtractor::GenericExtractor'; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
325
|
use Importer 'NewsExtractor::TextUtil' => 'normalize_whitespace', 'u'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
before 'content_text', sub { |
9
|
|
|
|
|
|
|
my ($self) = @_; |
10
|
|
|
|
|
|
|
$self->dom->find('figure.fbyt-block')->map('remove'); |
11
|
|
|
|
|
|
|
if (my $el = $self->dom->at('#penci-post-entry-inner > p:last-of-type')) { |
12
|
|
|
|
|
|
|
if ($el->content() =~ /\A看更多<br>/) { |
13
|
|
|
|
|
|
|
$el->remove(); |
14
|
|
|
|
|
|
|
} |
15
|
|
|
|
|
|
|
} |
16
|
|
|
|
|
|
|
}; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
sub journalist { |
19
|
0
|
|
|
0
|
0
|
|
my $self = $_[0]; |
20
|
0
|
|
|
|
|
|
my $ret; |
21
|
0
|
0
|
|
|
|
|
if (my $el = $self->dom->at('#penci-post-entry-inner > p:nth-child(1)')) { |
22
|
0
|
0
|
|
|
|
|
if ($el->content() =~ /文字撰稿:(?<name> \p{Letter}+ )<br>/x) { |
23
|
1
|
|
|
1
|
|
892
|
($ret) = $+{"name"}; |
|
1
|
|
|
|
|
469
|
|
|
1
|
|
|
|
|
82
|
|
|
0
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
} |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
} |
27
|
0
|
|
|
|
|
|
return $ret; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
1; |