line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::GenericExtractor; |
2
|
1
|
|
|
1
|
|
20
|
use v5.18; |
|
1
|
|
|
|
|
4
|
|
3
|
1
|
|
|
1
|
|
7
|
use utf8; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
26
|
use Moo; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
6
|
|
|
|
|
|
|
extends 'NewsExtractor::TXExtractor'; |
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
1124
|
use HTML::ExtractContent; |
|
1
|
|
|
|
|
26619
|
|
|
1
|
|
|
|
|
62
|
|
9
|
1
|
|
|
1
|
|
14
|
use Mojo::DOM; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
10
|
1
|
|
|
1
|
|
5
|
use NewsExtractor::Types qw(is_NewspaperName); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
17
|
|
11
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
1042
|
use Importer 'NewsExtractor::TextUtil' => qw( u normalize_whitespace parse_dateline_ymdhms ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
12
|
|
13
|
1
|
|
|
1
|
|
36
|
use Importer 'NewsExtractor::Constants' => qw( %RE ); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
5
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
with 'NewsExtractor::Role::ContentTextExtractor'; |
16
|
|
|
|
|
|
|
|
17
|
1
|
|
|
1
|
|
70
|
no Moo; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
sub headline { |
20
|
0
|
|
|
0
|
0
|
|
my ($self) = @_; |
21
|
|
|
|
|
|
|
|
22
|
0
|
|
|
|
|
|
my ($title, $el); |
23
|
0
|
|
|
|
|
|
my $dom = $self->dom; |
24
|
0
|
0
|
|
|
|
|
if ($el = $dom->at("#story #news_title, #news_are .newsin_title, .data_midlle_news_box01 dl td:first-child")) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25
|
0
|
|
|
|
|
|
$title = $el->text; |
26
|
|
|
|
|
|
|
} elsif ($el = $dom->at("meta[property='og:title']")) { |
27
|
0
|
|
|
|
|
|
$title = $el->attr("content"); |
28
|
|
|
|
|
|
|
} elsif ($el = $dom->at("meta[name='title']")) { |
29
|
0
|
|
|
|
|
|
$title = $el->attr('content'); |
30
|
|
|
|
|
|
|
} elsif ($el = $dom->at("title")) { |
31
|
0
|
|
|
|
|
|
$title = $el->text; |
32
|
|
|
|
|
|
|
} else { |
33
|
0
|
|
|
|
|
|
return; |
34
|
|
|
|
|
|
|
} |
35
|
0
|
|
|
|
|
|
$title .= ""; |
36
|
|
|
|
|
|
|
|
37
|
0
|
0
|
|
|
|
|
if (my $site_name = $self->site_name) { |
38
|
0
|
|
|
|
|
|
$title =~ s/\s* \p{Punct} \s* $site_name \s* \z//x; |
39
|
|
|
|
|
|
|
} |
40
|
0
|
0
|
|
|
|
|
if (defined($title)) { |
41
|
0
|
|
|
|
|
|
my $delim = qr<(?: \p{Punct} | \| | │ )>x; |
42
|
0
|
|
|
|
|
|
$title =~ s/ \s* $delim \s* $RE{newspaper_names} \s* \z//x; |
43
|
0
|
|
|
|
|
|
$title =~ s/\A $RE{newspaper_names} \s* $delim \s* //x; |
44
|
0
|
|
|
|
|
|
$title =~ s/\r\n/\n/g; |
45
|
0
|
|
|
|
|
|
$title =~ s/\A\s+//; |
46
|
0
|
|
|
|
|
|
$title =~ s/\s+\z//; |
47
|
|
|
|
|
|
|
} |
48
|
0
|
|
0
|
|
|
|
return $title && normalize_whitespace($title); |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub dateline { |
52
|
0
|
|
|
0
|
0
|
|
my ($self) = @_; |
53
|
0
|
|
|
|
|
|
my $dateline; |
54
|
|
|
|
|
|
|
my $guess; |
55
|
|
|
|
|
|
|
|
56
|
0
|
|
|
|
|
|
my $dom = $self->dom; |
57
|
0
|
0
|
|
|
|
|
if ($guess = $dom->at("meta[property='article:modified_time'], meta[property='article:published_time'], meta[itemprop=dateModified][content], meta[itemprop=datePublished][content]")) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
$dateline = $guess->attr('content'); |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
elsif ($guess = $dom->at("time[itemprop=datePublished][datetime], h1 time[datetime], .func_time time[pubdate], span.time > time.post-published")) { |
61
|
0
|
|
|
|
|
|
$dateline = $guess->attr('datetime'); |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
elsif ($guess = $dom->at(".reporter time, span.viewtime, header.article-desc time, .timeBox .updatetime span, .caption div.label-date, .contents_page span.date, .main-content span.date, .newsin_date, .news .date, .author .date, ul.info > li.date > span:nth-child(2), #newsHeadline span.datetime, article p.date, .post-meta > .icon-clock > span, .article_info_content span.info_time, .content time.page-date, .c_time, .newsContent p.time, div.title > div.time, div.article-meta div.article-date, address.authorInfor time, .entry-meta .date a, .author-links .posts-date, .top_title span.post_time, .node-inner > .submitted > span")) { |
64
|
0
|
|
|
|
|
|
$dateline = $guess->text; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
elsif ($guess = $dom->at("div#articles cite")) { |
67
|
0
|
|
|
|
|
|
$guess->at("a")->remove; |
68
|
0
|
|
|
|
|
|
$dateline = $guess->text; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
elsif ($guess = $dom->at("article.ndArticle_leftColumn div.ndArticle_creat, ul.info li.date, .cpInfo .cp, .nsa3 .tt27")) { |
71
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}[\-/][0-9]{2}[\-/][0-9]{2} [0-9]{2}:[0-9]{2})#; |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
elsif ($guess = $dom->at(".news-toolbar .news-toolbar__cell")) { |
74
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}/[0-9]{2}/[0-9]{2})#; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
elsif ($guess = $dom->at(".content .writer span:nth-child(2)")) { |
77
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2})#; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
elsif ($guess = $dom->at("div.content-wrapper-right > div > div > div:nth-child(4)")) { |
80
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})#; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
elsif ($guess = $dom->at("span#ctl00_ContentPlaceHolder1_News_Label, #ctl00_ContentPlaceHolder1_UpdatePanel2 font[color=darkred]")) { |
83
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}/[0-9]{1,2}/[0-9]{1,2})#; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
elsif ($guess = $dom->at(".news-info dd.date:nth-child(6)")) { |
86
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日[0-9]{2}:[0-9]{2})#; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
elsif ($guess = $dom->at("article.entry-content div:nth-child(2)")) { |
89
|
0
|
|
|
|
|
|
($dateline) = $guess->text =~ m#([0-9]{4}-[0-9]{1,2}-[0-9]{1,2})#; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
elsif ($guess = $dom->at("span.submitted-by")) { |
92
|
|
|
|
|
|
|
# www.thinkingtaiwan.com |
93
|
0
|
|
|
|
|
|
my ($month, $day, $year) = $guess->text =~ m/([0-9]+)/g; |
94
|
0
|
|
|
|
|
|
$dateline = u( |
95
|
|
|
|
|
|
|
sprintf( |
96
|
|
|
|
|
|
|
'%04d-%02d-%02dT%02d:%02d:%02d%s', |
97
|
|
|
|
|
|
|
$year, $month, $day, 23, 59, 59, '+08:00' |
98
|
|
|
|
|
|
|
) |
99
|
|
|
|
|
|
|
); |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
elsif ($guess = $dom->at('#story #news_author')) { |
102
|
0
|
|
|
|
|
|
($dateline) = $guess->all_text =~ m{\A 【記者.+ 】\s* (.+) \z}x; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
elsif ($guess = $dom->at('.data_midlle_news_box01 dl dd ul li:first-child')) { |
105
|
0
|
|
|
|
|
|
($dateline) = $guess->text; |
106
|
0
|
|
|
|
|
|
my ($year, $mmdd) = $dateline =~ /\A ([0-9]{3}) - (.+) \z /x; |
107
|
0
|
|
|
|
|
|
$year += 1911; |
108
|
0
|
|
|
|
|
|
$dateline = $year . '-' . $mmdd; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
elsif ($guess = $dom->at('#details_block .left .date, .article_header > .author > span:last-child')) { |
111
|
0
|
|
|
|
|
|
$dateline = normalize_whitespace $guess->text; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
elsif ($guess = $dom->at( |
114
|
|
|
|
|
|
|
join(',' |
115
|
|
|
|
|
|
|
, '.timebox > .publishtime' # howlife.cna.com.tw |
116
|
|
|
|
|
|
|
, 'div.newsInfo > span.time' # n.yam.com |
117
|
|
|
|
|
|
|
))) { |
118
|
0
|
|
|
|
|
|
$dateline = parse_dateline_ymdhms($guess->all_text, '+08:00'); |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
0
|
0
|
|
|
|
|
if ($dateline) { |
122
|
0
|
|
|
|
|
|
$dateline = normalize_whitespace($dateline); |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
|
$dateline =~ s<\A ([0-9]{4}) (\p{Punct}) ([0-9]{1,2}) \2 ([0-9]{1,2}) \z>< sprintf('%04d-%02d-%02d', $1, $3, $4) >ex; |
|
0
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
0
|
0
|
|
|
|
|
if ($dateline =~ /^([0-9]{4})[^0-9]/) { |
127
|
0
|
0
|
|
|
|
|
if ($1 > ((localtime)[5] + 1900)) { |
128
|
0
|
|
|
|
|
|
$dateline = undef; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
0
|
|
|
|
|
|
return $dateline; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub journalist { |
137
|
0
|
|
|
0
|
0
|
|
my ($self) = @_; |
138
|
|
|
|
|
|
|
|
139
|
0
|
|
|
|
|
|
my $dom = $self->dom; |
140
|
0
|
|
|
|
|
|
my ($ret, $guess); |
141
|
|
|
|
|
|
|
|
142
|
0
|
0
|
|
|
|
|
if ( $guess = $dom->at('meta[property="og:article:author"]') ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
143
|
0
|
|
|
|
|
|
$ret = $guess->attr('content'); |
144
|
|
|
|
|
|
|
} elsif ( $guess = $dom->at('meta[name="author"]') ) { |
145
|
0
|
|
|
|
|
|
$ret = $guess->attr('content'); |
146
|
|
|
|
|
|
|
} elsif ( $guess = $dom->at('.bt_xmic span[itemprop=author], div.tdb_single_author a.tdb-author-name, div.field-item a[href^=/author/], div.content_reporter a[itemprop=author], span[itemprop=author] a, div.author div.intro a div.name, div.article-author > h5 > a, div.article-meta > div.article-author > a, div.authorInfo li.authorName > a, .article .writer > p, .info_author, .news-info dd[itemprop=author], .content_reporter a, .top_title span.reporter_name, .post-heading time span, header .article-meta .article-author, .article_header > .author > span:first-child, .mid-news > .m-left-side > .maintype-wapper > .subtype-sort, .newsCon > .newsInfo > span:first-child, .newsdetail_content > .title > h4 > a[href^="/news/searchresult/news?search_text="], .m-from-author > .m-from-author__name, .post-author-name a[itemprop*=author], a.post-author-avatar .post-author-name > b, div#news_content div.author') ) { |
147
|
0
|
|
|
|
|
|
$ret = $guess->text; |
148
|
|
|
|
|
|
|
} elsif ($guess = $dom->at('div#yt_container_placeholder + p')) { |
149
|
0
|
|
|
|
|
|
($ret) = $guess->text =~ m{\A \s* (.+) \s+ 報導 \s+ / }x; |
150
|
|
|
|
|
|
|
} elsif ($guess = $dom->at('h4.font_color5')) { |
151
|
0
|
|
|
|
|
|
($ret) = $guess->all_text =~ m{\A \s* 編輯 \s* (.+) \s+ 報導 }x; |
152
|
|
|
|
|
|
|
} elsif ($guess = $dom->at('#story #news_author')) { |
153
|
0
|
|
|
|
|
|
($ret) = $guess->all_text =~ m{\A 【 (記者 .+) 】}x; |
154
|
|
|
|
|
|
|
} elsif ($guess = $dom->at('#details_block .left .name, .articleMain .article-author a.author-title, .article__credit a[href^="/author/"], span[itemprop=author] span[itemprop=name], .post-header-additional .post-meta-info a.nickname')) { |
155
|
0
|
|
|
|
|
|
$ret = $guess->text; |
156
|
|
|
|
|
|
|
} elsif ($guess = $dom->at('div.single-post-meta a[rel="author"]')) { |
157
|
0
|
|
|
|
|
|
($ret) = $guess->text =~ m<^工商時報 (.+)\z>x; |
158
|
|
|
|
|
|
|
} elsif ($guess = $dom->at('#PostContent .head-section-content p.meta')) { |
159
|
0
|
|
|
|
|
|
($ret) = $guess->text =~ m<(記者.+?報導)>x; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
0
|
0
|
0
|
|
|
|
$ret = undef if ($ret && is_NewspaperName($ret)); |
163
|
|
|
|
|
|
|
|
164
|
0
|
0
|
0
|
|
|
|
if ( !$ret && (my $content_text = $self->content_text)) { |
165
|
0
|
|
|
|
|
|
my @patterns = ( |
166
|
|
|
|
|
|
|
qr<\b (?:特[約派])? [记記]者 \s* ([\s\p{Letter}、]+?) \s* [/╱/] \s* (?: 特稿 | 專訪 | \p{Letter}+ (?:報導|报导)) \b>xs, |
167
|
|
|
|
|
|
|
qr<\A 【(記者.+?報導)】>x, |
168
|
|
|
|
|
|
|
qr<\A 中評社 .+? \d+ 月 \d+ 日電(記者(.+?))>x, |
169
|
|
|
|
|
|
|
qr<\A ( 記者[^/]+/.+?電 )>x, |
170
|
|
|
|
|
|
|
qr<\A 匯流新聞網記者 (\p{Letter}+) /(?:\p{Letter}+)報導 >x, |
171
|
|
|
|
|
|
|
qr<\A 匯流新聞網記者\s*/\s*(\p{Letter}+)綜合報導>x, |
172
|
|
|
|
|
|
|
qr<((中央社[记記]者 \S+ 日 專?[電电] | 大纪元记者\p{Letter}+报导 | 記者.+?報導/.+?))>x, |
173
|
|
|
|
|
|
|
qr< \( ( \p{Letter}+ / \p{Letter}+ 報導 ) \) >x, |
174
|
|
|
|
|
|
|
qr<\A 文:記者(\p{Letter}+) \n>x, |
175
|
|
|
|
|
|
|
qr<( (譯者:.+?/核稿:.+?) )[0-9]+(?:\n|\z)>x, |
176
|
|
|
|
|
|
|
qr< \(記者 (.+?) \) \z >x, |
177
|
|
|
|
|
|
|
qr<^(編譯[^/]+?/.+?報導)$>xsm, |
178
|
|
|
|
|
|
|
qr<(( (?:譯者|編輯):.+) ) (?:[0-9]{7})? \z >x, |
179
|
|
|
|
|
|
|
qr<(記者 (\p{Letter}+) ) \z>x, |
180
|
|
|
|
|
|
|
qr< (記者 (\p{Letter}+) 綜合報導)\s+ ( (責任編輯:\p{Letter}+) ) \z>x, |
181
|
|
|
|
|
|
|
qr< ( (責任編輯:\p{Letter}+) )\z>x, |
182
|
|
|
|
|
|
|
qr< \s (公民記者 .+ 採訪報導) \z>x, |
183
|
|
|
|
|
|
|
qr<\A 【大成報記者 (\p{Letter}+) / .+報導】 >x, |
184
|
|
|
|
|
|
|
qr<\A 記者 (\p{Letter}+) /報導 >x, |
185
|
|
|
|
|
|
|
qr<\A \[ (記者.+報導) \] >x, |
186
|
|
|
|
|
|
|
qr<\A ( (記者.+報導) ) >x, |
187
|
|
|
|
|
|
|
qr<\A 【(本報記者.+報導)】 >x, |
188
|
|
|
|
|
|
|
qr<\b ﹝記者(\p{Letter}+?)/.+?報導﹞ \b>x, |
189
|
|
|
|
|
|
|
qr<\A〔新網記者 ( \p{Letter}+ (?:報導|特稿))〕\b>x, |
190
|
|
|
|
|
|
|
qr<\A(芋傳媒記者(\p{Letter}+)報導)\b>x, |
191
|
|
|
|
|
|
|
qr<\b文\s*/\s*(\p{Letter}+)\s*(中央社編譯)\n>x, |
192
|
|
|
|
|
|
|
qr<\A 香港中通社[0-9]+月[0-9]+日電(記者\s(\p{Letter}+))>x, |
193
|
|
|
|
|
|
|
qr<\A \( (記者\p{Letter}+報導) \)>x, |
194
|
|
|
|
|
|
|
); |
195
|
|
|
|
|
|
|
|
196
|
0
|
|
|
|
|
|
for my $pat (@patterns) { |
197
|
0
|
|
|
|
|
|
($ret) = $content_text =~ m/$pat/; |
198
|
0
|
0
|
|
|
|
|
last if $ret; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
0
|
0
|
|
|
|
|
unless ($ret) { |
202
|
0
|
|
|
|
|
|
my ($guess) = $content_text =~ m{((\p{Letter}+))\z}xsm; |
203
|
0
|
0
|
0
|
0
|
|
|
if ($guess && $dom->descendant_nodes->first(sub { $_->type eq 'text' && $_->content =~ m<記者${guess}\b> })) { |
|
0
|
0
|
|
|
|
|
|
204
|
0
|
|
|
|
|
|
$ret = $guess |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
0
|
0
|
|
|
|
|
if ($ret) { |
210
|
0
|
|
|
|
|
|
$ret = normalize_whitespace($ret); |
211
|
0
|
0
|
|
|
|
|
$ret = "" if is_NewspaperName($ret); |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
0
|
|
|
|
|
|
return $ret; |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
1; |