| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package NewsExtractor::Extractor; |
|
2
|
1
|
|
|
1
|
|
9
|
use Moo; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
|
|
extends 'NewsExtractor::TXExtractor'; |
|
4
|
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
474
|
use Mojo::Transaction::HTTP; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
17
|
|
|
6
|
1
|
|
|
1
|
|
31
|
use Mojo::URL; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
10
|
|
|
7
|
1
|
|
|
1
|
|
32
|
use Types::Standard qw(InstanceOf); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
13
|
|
|
8
|
1
|
|
|
1
|
|
1306
|
use NewsExtractor::CSSRuleSet; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
86
|
|
|
9
|
1
|
|
|
1
|
|
680
|
use NewsExtractor::CSSExtractor; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
67
|
|
|
10
|
1
|
|
|
1
|
|
669
|
use NewsExtractor::JSONLDExtractor; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
47
|
|
|
11
|
1
|
|
|
1
|
|
700
|
use NewsExtractor::GenericExtractor; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
56
|
|
|
12
|
1
|
|
|
1
|
|
767
|
use NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw; |
|
|
1
|
|
|
|
|
6
|
|
|
|
1
|
|
|
|
|
48
|
|
|
13
|
1
|
|
|
1
|
|
637
|
use NewsExtractor::SiteSpecificExtractor::www_allnews_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
37
|
|
|
14
|
1
|
|
|
1
|
|
585
|
use NewsExtractor::SiteSpecificExtractor::www_peopo_org; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
37
|
|
|
15
|
1
|
|
|
1
|
|
502
|
use NewsExtractor::SiteSpecificExtractor::www_ntdtv_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
45
|
|
|
16
|
1
|
|
|
1
|
|
549
|
use NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
38
|
|
|
17
|
1
|
|
|
1
|
|
524
|
use NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
33
|
|
|
18
|
1
|
|
|
1
|
|
492
|
use NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
35
|
|
|
19
|
1
|
|
|
1
|
|
527
|
use NewsExtractor::SiteSpecificExtractor::www_rti_org_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
44
|
|
|
20
|
1
|
|
|
1
|
|
547
|
use NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
34
|
|
|
21
|
1
|
|
|
1
|
|
508
|
use NewsExtractor::SiteSpecificExtractor::www_setn_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
39
|
|
|
22
|
1
|
|
|
1
|
|
524
|
use NewsExtractor::SiteSpecificExtractor::news_tnn_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
34
|
|
|
23
|
1
|
|
|
1
|
|
471
|
use NewsExtractor::SiteSpecificExtractor::turnnewsapp_com; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
42
|
|
|
24
|
1
|
|
|
1
|
|
478
|
use NewsExtractor::SiteSpecificExtractor::news_cts_com_tw; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
45
|
|
|
25
|
1
|
|
|
1
|
|
529
|
use NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
52
|
|
|
26
|
1
|
|
|
1
|
|
557
|
use NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
36
|
|
|
27
|
1
|
|
|
1
|
|
508
|
use NewsExtractor::SiteSpecificExtractor::www_upmedia_mg; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
37
|
|
|
28
|
1
|
|
|
1
|
|
606
|
use NewsExtractor::SiteSpecificExtractor::ctee_com_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
116
|
|
|
29
|
1
|
|
|
1
|
|
481
|
use NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
35
|
|
|
30
|
1
|
|
|
1
|
|
510
|
use NewsExtractor::SiteSpecificExtractor::newnet_tw; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
44
|
|
|
31
|
1
|
|
|
1
|
|
487
|
use NewsExtractor::SiteSpecificExtractor::www_thestandnews_com; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
32
|
1
|
|
|
1
|
|
488
|
use NewsExtractor::SiteSpecificExtractor::www_epochtimes_com; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
33
|
1
|
|
|
1
|
|
464
|
use NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
34
|
|
|
34
|
1
|
|
|
1
|
|
497
|
use NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw; |
|
|
1
|
|
|
|
|
6
|
|
|
|
1
|
|
|
|
|
35
|
|
|
35
|
1
|
|
|
1
|
|
479
|
use NewsExtractor::SiteSpecificExtractor::www_idn_com_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
35
|
|
|
36
|
1
|
|
|
1
|
|
470
|
use NewsExtractor::SiteSpecificExtractor::www_fountmedia_io; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
37
|
1
|
|
|
1
|
|
462
|
use NewsExtractor::SiteSpecificExtractor::news_pts_org_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
44
|
|
|
38
|
1
|
|
|
1
|
|
690
|
use NewsExtractor::SiteSpecificExtractor::www_twreporter_org; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
39
|
|
|
39
|
1
|
|
|
1
|
|
502
|
use NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
37
|
|
|
40
|
1
|
|
|
1
|
|
581
|
use NewsExtractor::SiteSpecificExtractor::hk_crntt_com; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
36
|
|
|
41
|
1
|
|
|
1
|
|
492
|
use NewsExtractor::SiteSpecificExtractor::hk_on_cc; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
34
|
|
|
42
|
1
|
|
|
1
|
|
527
|
use NewsExtractor::SiteSpecificExtractor::www_hkcna_hk; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
43
|
1
|
|
|
1
|
|
477
|
use NewsExtractor::SiteSpecificExtractor::www_hkcnews_com; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
33
|
|
|
44
|
1
|
|
|
1
|
|
519
|
use NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
33
|
|
|
45
|
1
|
|
|
1
|
|
494
|
use NewsExtractor::SiteSpecificExtractor::news_cctv_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
35
|
|
|
46
|
1
|
|
|
1
|
|
487
|
use NewsExtractor::SiteSpecificExtractor::m_news_cctv_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
35
|
|
|
47
|
1
|
|
|
1
|
|
565
|
use NewsExtractor::SiteSpecificExtractor::focustaiwan_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
48
|
1
|
|
|
1
|
|
487
|
use NewsExtractor::SiteSpecificExtractor::newtalk_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
44
|
|
|
49
|
1
|
|
|
1
|
|
501
|
use NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
44
|
|
|
50
|
1
|
|
|
1
|
|
489
|
use NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
33
|
|
|
51
|
1
|
|
|
1
|
|
481
|
use NewsExtractor::SiteSpecificExtractor::www_mdnkids_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
52
|
1
|
|
|
1
|
|
491
|
use NewsExtractor::SiteSpecificExtractor::www_nownews_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
53
|
1
|
|
|
1
|
|
517
|
use NewsExtractor::SiteSpecificExtractor::www_penghutimes_com; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
35
|
|
|
54
|
1
|
|
|
1
|
|
491
|
use NewsExtractor::SiteSpecificExtractor::www_aljazeera_com; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
45
|
|
|
55
|
1
|
|
|
1
|
|
636
|
use NewsExtractor::SiteSpecificExtractor::www_bbc_com; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
34
|
|
|
56
|
1
|
|
|
1
|
|
488
|
use NewsExtractor::SiteSpecificExtractor::yimedia_com_tw; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
39
|
|
|
57
|
1
|
|
|
1
|
|
539
|
use NewsExtractor::SiteSpecificExtractor::UDN; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
34
|
|
|
58
|
1
|
|
|
1
|
|
512
|
use NewsExtractor::SiteSpecificExtractor::ETtoday; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
37
|
|
|
59
|
1
|
|
|
1
|
|
455
|
use NewsExtractor::SiteSpecificExtractor::ChinaTimes; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
307
|
|
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
has extractor => ( |
|
62
|
|
|
|
|
|
|
required => 0, |
|
63
|
|
|
|
|
|
|
is => 'lazy', |
|
64
|
|
|
|
|
|
|
isa => InstanceOf["NewsExtractor::CSSExtractor", |
|
65
|
|
|
|
|
|
|
"NewsExtractor::JSONLDExtractor", |
|
66
|
|
|
|
|
|
|
"NewsExtractor::SiteSpecificExtractor", |
|
67
|
|
|
|
|
|
|
"NewsExtractor::GenericExtractor"], |
|
68
|
|
|
|
|
|
|
builder => 1, |
|
69
|
|
|
|
|
|
|
handles => [qw( headline dateline journalist content_text )], |
|
70
|
|
|
|
|
|
|
); |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
use constant { |
|
73
|
1
|
|
|
|
|
355
|
SiteSpecificExtractorByHost => { |
|
74
|
|
|
|
|
|
|
'www.bbc.com' => 'NewsExtractor::SiteSpecificExtractor::www_bbc_com', |
|
75
|
|
|
|
|
|
|
'www.aljazeera.com' => 'NewsExtractor::SiteSpecificExtractor::www_aljazeera_com', |
|
76
|
|
|
|
|
|
|
'www.penghutimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_penghutimes_com', |
|
77
|
|
|
|
|
|
|
'www.ustv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw', |
|
78
|
|
|
|
|
|
|
'www.epochtimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_epochtimes_com', |
|
79
|
|
|
|
|
|
|
'www.hkcnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_hkcnews_com', |
|
80
|
|
|
|
|
|
|
'www.thestandnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_thestandnews_com', |
|
81
|
|
|
|
|
|
|
'www.allnews.tw' => 'NewsExtractor::SiteSpecificExtractor::www_allnews_tw', |
|
82
|
|
|
|
|
|
|
'www.rvn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw', |
|
83
|
|
|
|
|
|
|
'www.chinatimes.com' => 'NewsExtractor::SiteSpecificExtractor::ChinaTimes', |
|
84
|
|
|
|
|
|
|
'video.udn.com' => 'NewsExtractor::JSONLDExtractor', |
|
85
|
|
|
|
|
|
|
'www.ctwant.com' => 'NewsExtractor::JSONLDExtractor', |
|
86
|
|
|
|
|
|
|
'www.peopo.org' => 'NewsExtractor::SiteSpecificExtractor::www_peopo_org', |
|
87
|
|
|
|
|
|
|
'www.ntdtv.com' => 'NewsExtractor::SiteSpecificExtractor::www_ntdtv_com', |
|
88
|
|
|
|
|
|
|
'www.ksnews.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw', |
|
89
|
|
|
|
|
|
|
'news.tvbs.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw', |
|
90
|
|
|
|
|
|
|
'udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
|
91
|
|
|
|
|
|
|
'stars.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
|
92
|
|
|
|
|
|
|
'money.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
|
93
|
|
|
|
|
|
|
'house.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
|
94
|
|
|
|
|
|
|
'opinion.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
|
95
|
|
|
|
|
|
|
'www.taipeitimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com', |
|
96
|
|
|
|
|
|
|
'www.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
|
97
|
|
|
|
|
|
|
'star.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
|
98
|
|
|
|
|
|
|
'house.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
|
99
|
|
|
|
|
|
|
'health.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
|
100
|
|
|
|
|
|
|
'www.rti.org.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rti_org_tw', |
|
101
|
|
|
|
|
|
|
'www.bcc.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw', |
|
102
|
|
|
|
|
|
|
'www.setn.com' => 'NewsExtractor::SiteSpecificExtractor::www_setn_com', |
|
103
|
|
|
|
|
|
|
'news.tnn.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tnn_tw', |
|
104
|
|
|
|
|
|
|
'turnnewsapp.com' => 'NewsExtractor::SiteSpecificExtractor::turnnewsapp_com', |
|
105
|
|
|
|
|
|
|
'news.cts.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_cts_com_tw', |
|
106
|
|
|
|
|
|
|
'talk.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw', |
|
107
|
|
|
|
|
|
|
'estate.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw', |
|
108
|
|
|
|
|
|
|
'www.upmedia.mg' => 'NewsExtractor::SiteSpecificExtractor::www_upmedia_mg', |
|
109
|
|
|
|
|
|
|
'ctee.com.tw' => 'NewsExtractor::SiteSpecificExtractor::ctee_com_tw', |
|
110
|
|
|
|
|
|
|
'news.ebc.net.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw', |
|
111
|
|
|
|
|
|
|
'newnet.tw' => 'NewsExtractor::SiteSpecificExtractor::newnet_tw', |
|
112
|
|
|
|
|
|
|
'www.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw', |
|
113
|
|
|
|
|
|
|
'news.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw', |
|
114
|
|
|
|
|
|
|
'www.idn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_idn_com_tw', |
|
115
|
|
|
|
|
|
|
'www.fountmedia.io' => 'NewsExtractor::SiteSpecificExtractor::www_fountmedia_io', |
|
116
|
|
|
|
|
|
|
'news.pts.org.tw' => 'NewsExtractor::SiteSpecificExtractor::news_pts_org_tw', |
|
117
|
|
|
|
|
|
|
'www.twreporter.org' => 'NewsExtractor::SiteSpecificExtractor::www_twreporter_org', |
|
118
|
|
|
|
|
|
|
'new.ctv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw', |
|
119
|
|
|
|
|
|
|
'hk.crntt.com' => 'NewsExtractor::SiteSpecificExtractor::hk_crntt_com', |
|
120
|
|
|
|
|
|
|
'hk.on.cc' => 'NewsExtractor::SiteSpecificExtractor::hk_on_cc', |
|
121
|
|
|
|
|
|
|
'www.hkcna.hk' => 'NewsExtractor::SiteSpecificExtractor::www_hkcna_hk', |
|
122
|
|
|
|
|
|
|
'www.xinhuanet.com' => 'NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com', |
|
123
|
|
|
|
|
|
|
'news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::news_cctv_com', |
|
124
|
|
|
|
|
|
|
'm.news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::m_news_cctv_com', |
|
125
|
|
|
|
|
|
|
'focustaiwan.tw' => 'NewsExtractor::SiteSpecificExtractor::focustaiwan_tw', |
|
126
|
|
|
|
|
|
|
'newtalk.tw' => 'NewsExtractor::SiteSpecificExtractor::newtalk_tw', |
|
127
|
|
|
|
|
|
|
'www.digitimes.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw', |
|
128
|
|
|
|
|
|
|
'www.mdnkids.com' => 'NewsExtractor::SiteSpecificExtractor::www_mdnkids_com', |
|
129
|
|
|
|
|
|
|
'www.nownews.com' => 'NewsExtractor::SiteSpecificExtractor::www_nownews_com', |
|
130
|
|
|
|
|
|
|
'yimedia.com.tw' => 'NewsExtractor::SiteSpecificExtractor::yimedia_com_tw', |
|
131
|
|
|
|
|
|
|
}, |
|
132
|
|
|
|
|
|
|
CSSRuleSetByHost => { |
|
133
|
|
|
|
|
|
|
'www.eventsinfocus.org' => { |
|
134
|
|
|
|
|
|
|
headline => 'h1.title > span', |
|
135
|
|
|
|
|
|
|
dateline => 'div.content time.datetime', |
|
136
|
|
|
|
|
|
|
journalist => 'div.content div.node__content div.clearfix.text-formatted > p:nth-child(1)', |
|
137
|
|
|
|
|
|
|
content_text => 'div.content article div.clearfix.text-formatted', |
|
138
|
|
|
|
|
|
|
}, |
|
139
|
|
|
|
|
|
|
'www.5ch.com.tw' => { |
|
140
|
|
|
|
|
|
|
headline => 'h3.m-ti', |
|
141
|
|
|
|
|
|
|
dateline => 'div.more-about div.date', |
|
142
|
|
|
|
|
|
|
journalist => 'div.more-about div.reporter', |
|
143
|
|
|
|
|
|
|
content_text => 'div.text-edit', |
|
144
|
|
|
|
|
|
|
}, |
|
145
|
|
|
|
|
|
|
'www.cw.com.tw' => { |
|
146
|
|
|
|
|
|
|
headline => 'div.article__head h1', |
|
147
|
|
|
|
|
|
|
dateline => 'div.article__detail > time', |
|
148
|
|
|
|
|
|
|
journalist => 'div.author--item > a', |
|
149
|
|
|
|
|
|
|
content_text => 'div.article__content', |
|
150
|
|
|
|
|
|
|
}, |
|
151
|
|
|
|
|
|
|
'www.taiwannews.com.tw' => { |
|
152
|
|
|
|
|
|
|
headline => 'h1.article-title', |
|
153
|
|
|
|
|
|
|
dateline => 'div.article-date', |
|
154
|
|
|
|
|
|
|
journalist => 'div.article-author', |
|
155
|
|
|
|
|
|
|
content_text => 'article.article', |
|
156
|
|
|
|
|
|
|
}, |
|
157
|
|
|
|
|
|
|
'www.enewstw.com' => { |
|
158
|
|
|
|
|
|
|
headline => 'td.blog_title > strong', |
|
159
|
|
|
|
|
|
|
dateline => 'td.blog_title tr:nth-child(2) > td.blog', |
|
160
|
|
|
|
|
|
|
journalist => 'td.blog_title tr:nth-child(1) > td.blog', |
|
161
|
|
|
|
|
|
|
content_text => 'td.new_t p', |
|
162
|
|
|
|
|
|
|
}, |
|
163
|
|
|
|
|
|
|
'www.storm.mg' => { |
|
164
|
|
|
|
|
|
|
headline => 'h1#article_title', |
|
165
|
|
|
|
|
|
|
dateline => 'span#info_time', |
|
166
|
|
|
|
|
|
|
journalist => '#article_info_wrapper #author_block a.link_author > span.info_author', |
|
167
|
|
|
|
|
|
|
content_text => 'div#article_inner_wrapper > article:nth-child(1)', |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
} |
|
170
|
1
|
|
|
1
|
|
9
|
}; |
|
|
1
|
|
|
|
|
4
|
|
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub _build_extractor { |
|
173
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
|
174
|
0
|
|
|
|
|
|
my $url = $self->tx->req->url; |
|
175
|
0
|
|
|
|
|
|
my $host = $url->host; |
|
176
|
0
|
|
|
|
|
|
my $extractor; |
|
177
|
0
|
0
|
|
|
|
|
if (my $sel = CSSRuleSetByHost->{$host}) { |
|
|
|
0
|
|
|
|
|
|
|
178
|
0
|
|
|
|
|
|
$extractor = NewsExtractor::CSSExtractor->new( |
|
179
|
|
|
|
|
|
|
css_selector => NewsExtractor::CSSRuleSet->new(%$sel), |
|
180
|
|
|
|
|
|
|
tx => $self->tx |
|
181
|
|
|
|
|
|
|
); |
|
182
|
|
|
|
|
|
|
} elsif (my $extractor_class = SiteSpecificExtractorByHost->{$host}) { |
|
183
|
0
|
|
|
|
|
|
$extractor = $extractor_class->new( tx => $self->tx ); |
|
184
|
|
|
|
|
|
|
} else { |
|
185
|
0
|
|
|
|
|
|
$extractor = NewsExtractor::GenericExtractor->new( tx => $self->tx ); |
|
186
|
|
|
|
|
|
|
} |
|
187
|
0
|
|
|
|
|
|
return $extractor; |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
1; |