line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::SiteSpecificExtractor::yimedia_com_tw; |
2
|
1
|
|
|
1
|
|
9
|
use utf8; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
3
|
1
|
|
|
1
|
|
35
|
use Moo; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8
|
|
4
|
|
|
|
|
|
|
extends 'NewsExtractor::GenericExtractor'; |
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
346
|
use Importer 'NewsExtractor::TextUtil' => 'normalize_whitespace', 'u'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
# Method modifier: prune non-article nodes from the DOM before the inherited
# content_text() runs. The return value of a `before` modifier is ignored.
before 'content_text' => sub {
    my ($self) = @_;

    # Remove every <figure class="fbyt-block"> element.
    # NOTE(review): presumably embedded social/video widgets -- confirm.
    $self->dom->find('figure.fbyt-block')->map('remove');

    # Only the last <p> directly under the post body is inspected.
    my $el = $self->dom->at('#penci-post-entry-inner > p:last-of-type')
        or return;

    # Drop that paragraph when its markup starts with "看更多<br>"
    # ("see more"), i.e. a trailing link list rather than article text.
    # The debug print of the paragraph's content to STDOUT that used to sit
    # here has been removed so that extraction no longer pollutes the
    # caller's output stream.
    $el->remove() if $el->content() =~ /\A看更多<br>/;

    return;
};
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Extract the writer's name from the article byline.
#
# Looks at the <p> that is the first child of #penci-post-entry-inner and,
# when its markup contains "文字撰稿:<letters><br>", returns the run of
# letters captured after that label. Returns undef when the paragraph is
# absent or carries no such label.
sub journalist {
    my ($self) = @_;

    my $name;
    my $first_para = $self->dom->at('#penci-post-entry-inner > p:nth-child(1)');

    if ($first_para
        && $first_para->content() =~ /文字撰稿:(?<name> \p{Letter}+ )<br>/x)
    {
        $name = $+{name};
    }

    # Deliberately `return $name` rather than a bare `return`, so the
    # no-match case still yields a single undef in list context.
    return $name;
}
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
1; |