line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package NewsExtractor::Extractor; |
2
|
1
|
|
|
1
|
|
9
|
use Moo; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
9
|
|
3
|
|
|
|
|
|
|
extends 'NewsExtractor::TXExtractor'; |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
400
|
use Mojo::Transaction::HTTP; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
15
|
|
6
|
1
|
|
|
1
|
|
30
|
use Mojo::URL; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
8
|
|
7
|
1
|
|
|
1
|
|
30
|
use Types::Standard qw(InstanceOf); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
13
|
|
8
|
1
|
|
|
1
|
|
1363
|
use NewsExtractor::CSSRuleSet; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
62
|
|
9
|
1
|
|
|
1
|
|
603
|
use NewsExtractor::CSSExtractor; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
51
|
|
10
|
1
|
|
|
1
|
|
728
|
use NewsExtractor::JSONLDExtractor; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
44
|
|
11
|
1
|
|
|
1
|
|
686
|
use NewsExtractor::GenericExtractor; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
54
|
|
12
|
1
|
|
|
1
|
|
696
|
use NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
48
|
|
13
|
1
|
|
|
1
|
|
665
|
use NewsExtractor::SiteSpecificExtractor::www_allnews_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
36
|
|
14
|
1
|
|
|
1
|
|
478
|
use NewsExtractor::SiteSpecificExtractor::www_peopo_org; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
15
|
1
|
|
|
1
|
|
494
|
use NewsExtractor::SiteSpecificExtractor::www_ntdtv_com; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
42
|
|
16
|
1
|
|
|
1
|
|
595
|
use NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
36
|
|
17
|
1
|
|
|
1
|
|
498
|
use NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
18
|
1
|
|
|
1
|
|
446
|
use NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
19
|
1
|
|
|
1
|
|
500
|
use NewsExtractor::SiteSpecificExtractor::www_rti_org_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
33
|
|
20
|
1
|
|
|
1
|
|
473
|
use NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
34
|
|
21
|
1
|
|
|
1
|
|
506
|
use NewsExtractor::SiteSpecificExtractor::www_setn_com; |
|
1
|
|
|
|
|
14
|
|
|
1
|
|
|
|
|
42
|
|
22
|
1
|
|
|
1
|
|
540
|
use NewsExtractor::SiteSpecificExtractor::news_tnn_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
36
|
|
23
|
1
|
|
|
1
|
|
453
|
use NewsExtractor::SiteSpecificExtractor::turnnewsapp_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
24
|
1
|
|
|
1
|
|
444
|
use NewsExtractor::SiteSpecificExtractor::news_cts_com_tw; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
51
|
|
25
|
1
|
|
|
1
|
|
591
|
use NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
47
|
|
26
|
1
|
|
|
1
|
|
564
|
use NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
37
|
|
27
|
1
|
|
|
1
|
|
501
|
use NewsExtractor::SiteSpecificExtractor::www_upmedia_mg; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
35
|
|
28
|
1
|
|
|
1
|
|
437
|
use NewsExtractor::SiteSpecificExtractor::ctee_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
32
|
|
29
|
1
|
|
|
1
|
|
525
|
use NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
30
|
1
|
|
|
1
|
|
426
|
use NewsExtractor::SiteSpecificExtractor::newnet_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
31
|
1
|
|
|
1
|
|
407
|
use NewsExtractor::SiteSpecificExtractor::www_thestandnews_com; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
31
|
|
32
|
1
|
|
|
1
|
|
453
|
use NewsExtractor::SiteSpecificExtractor::www_epochtimes_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
33
|
1
|
|
|
1
|
|
444
|
use NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
34
|
1
|
|
|
1
|
|
513
|
use NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
34
|
|
35
|
1
|
|
|
1
|
|
433
|
use NewsExtractor::SiteSpecificExtractor::www_idn_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
32
|
|
36
|
1
|
|
|
1
|
|
421
|
use NewsExtractor::SiteSpecificExtractor::www_fountmedia_io; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
37
|
1
|
|
|
1
|
|
444
|
use NewsExtractor::SiteSpecificExtractor::news_pts_org_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
38
|
1
|
|
|
1
|
|
442
|
use NewsExtractor::SiteSpecificExtractor::www_twreporter_org; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
33
|
|
39
|
1
|
|
|
1
|
|
452
|
use NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
40
|
1
|
|
|
1
|
|
421
|
use NewsExtractor::SiteSpecificExtractor::hk_crntt_com; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
32
|
|
41
|
1
|
|
|
1
|
|
462
|
use NewsExtractor::SiteSpecificExtractor::hk_on_cc; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
31
|
|
42
|
1
|
|
|
1
|
|
447
|
use NewsExtractor::SiteSpecificExtractor::www_hkcna_hk; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
32
|
|
43
|
1
|
|
|
1
|
|
426
|
use NewsExtractor::SiteSpecificExtractor::www_hkcnews_com; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
28
|
|
44
|
1
|
|
|
1
|
|
474
|
use NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
32
|
|
45
|
1
|
|
|
1
|
|
426
|
use NewsExtractor::SiteSpecificExtractor::news_cctv_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
46
|
1
|
|
|
1
|
|
471
|
use NewsExtractor::SiteSpecificExtractor::m_news_cctv_com; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
31
|
|
47
|
1
|
|
|
1
|
|
433
|
use NewsExtractor::SiteSpecificExtractor::focustaiwan_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
36
|
|
48
|
1
|
|
|
1
|
|
432
|
use NewsExtractor::SiteSpecificExtractor::newtalk_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
32
|
|
49
|
1
|
|
|
1
|
|
411
|
use NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
30
|
|
50
|
1
|
|
|
1
|
|
414
|
use NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
32
|
|
51
|
1
|
|
|
1
|
|
423
|
use NewsExtractor::SiteSpecificExtractor::www_mdnkids_com; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
52
|
1
|
|
|
1
|
|
426
|
use NewsExtractor::SiteSpecificExtractor::www_nownews_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
53
|
1
|
|
|
1
|
|
420
|
use NewsExtractor::SiteSpecificExtractor::www_penghutimes_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
54
|
1
|
|
|
1
|
|
438
|
use NewsExtractor::SiteSpecificExtractor::www_aljazeera_com; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
29
|
|
55
|
1
|
|
|
1
|
|
410
|
use NewsExtractor::SiteSpecificExtractor::www_bbc_com; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
56
|
1
|
|
|
1
|
|
462
|
use NewsExtractor::SiteSpecificExtractor::yimedia_com_tw; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
36
|
|
57
|
1
|
|
|
1
|
|
415
|
use NewsExtractor::SiteSpecificExtractor::UDN; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
32
|
|
58
|
1
|
|
|
1
|
|
440
|
use NewsExtractor::SiteSpecificExtractor::ETtoday; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
72
|
|
59
|
1
|
|
|
1
|
|
399
|
use NewsExtractor::SiteSpecificExtractor::ChinaTimes; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
276
|
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
has extractor => ( |
62
|
|
|
|
|
|
|
required => 0, |
63
|
|
|
|
|
|
|
is => 'lazy', |
64
|
|
|
|
|
|
|
isa => InstanceOf["NewsExtractor::CSSExtractor", |
65
|
|
|
|
|
|
|
"NewsExtractor::JSONLDExtractor", |
66
|
|
|
|
|
|
|
"NewsExtractor::SiteSpecificExtractor", |
67
|
|
|
|
|
|
|
"NewsExtractor::GenericExtractor"], |
68
|
|
|
|
|
|
|
builder => 1, |
69
|
|
|
|
|
|
|
handles => [qw( headline dateline journalist content_text )], |
70
|
|
|
|
|
|
|
); |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
use constant { |
73
|
1
|
|
|
|
|
342
|
SiteSpecificExtractorByHost => { |
74
|
|
|
|
|
|
|
'www.bbc.com' => 'NewsExtractor::SiteSpecificExtractor::www_bbc_com', |
75
|
|
|
|
|
|
|
'www.aljazeera.com' => 'NewsExtractor::SiteSpecificExtractor::www_aljazeera_com', |
76
|
|
|
|
|
|
|
'www.penghutimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_penghutimes_com', |
77
|
|
|
|
|
|
|
'www.ustv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw', |
78
|
|
|
|
|
|
|
'www.epochtimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_epochtimes_com', |
79
|
|
|
|
|
|
|
'www.hkcnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_hkcnews_com', |
80
|
|
|
|
|
|
|
'www.thestandnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_thestandnews_com', |
81
|
|
|
|
|
|
|
'www.allnews.tw' => 'NewsExtractor::SiteSpecificExtractor::www_allnews_tw', |
82
|
|
|
|
|
|
|
'www.rvn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw', |
83
|
|
|
|
|
|
|
'www.chinatimes.com' => 'NewsExtractor::SiteSpecificExtractor::ChinaTimes', |
84
|
|
|
|
|
|
|
'video.udn.com' => 'NewsExtractor::JSONLDExtractor', |
85
|
|
|
|
|
|
|
'www.ctwant.com' => 'NewsExtractor::JSONLDExtractor', |
86
|
|
|
|
|
|
|
'www.peopo.org' => 'NewsExtractor::SiteSpecificExtractor::www_peopo_org', |
87
|
|
|
|
|
|
|
'www.ntdtv.com' => 'NewsExtractor::SiteSpecificExtractor::www_ntdtv_com', |
88
|
|
|
|
|
|
|
'www.ksnews.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw', |
89
|
|
|
|
|
|
|
'news.tvbs.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw', |
90
|
|
|
|
|
|
|
'udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
91
|
|
|
|
|
|
|
'stars.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
92
|
|
|
|
|
|
|
'money.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
93
|
|
|
|
|
|
|
'house.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
94
|
|
|
|
|
|
|
'opinion.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN', |
95
|
|
|
|
|
|
|
'www.taipeitimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com', |
96
|
|
|
|
|
|
|
'www.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
97
|
|
|
|
|
|
|
'star.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
98
|
|
|
|
|
|
|
'house.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
99
|
|
|
|
|
|
|
'health.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday', |
100
|
|
|
|
|
|
|
'www.rti.org.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rti_org_tw', |
101
|
|
|
|
|
|
|
'www.bcc.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw', |
102
|
|
|
|
|
|
|
'www.setn.com' => 'NewsExtractor::SiteSpecificExtractor::www_setn_com', |
103
|
|
|
|
|
|
|
'news.tnn.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tnn_tw', |
104
|
|
|
|
|
|
|
'turnnewsapp.com' => 'NewsExtractor::SiteSpecificExtractor::turnnewsapp_com', |
105
|
|
|
|
|
|
|
'news.cts.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_cts_com_tw', |
106
|
|
|
|
|
|
|
'talk.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw', |
107
|
|
|
|
|
|
|
'estate.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw', |
108
|
|
|
|
|
|
|
'www.upmedia.mg' => 'NewsExtractor::SiteSpecificExtractor::www_upmedia_mg', |
109
|
|
|
|
|
|
|
'ctee.com.tw' => 'NewsExtractor::SiteSpecificExtractor::ctee_com_tw', |
110
|
|
|
|
|
|
|
'news.ebc.net.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw', |
111
|
|
|
|
|
|
|
'newnet.tw' => 'NewsExtractor::SiteSpecificExtractor::newnet_tw', |
112
|
|
|
|
|
|
|
'www.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw', |
113
|
|
|
|
|
|
|
'news.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw', |
114
|
|
|
|
|
|
|
'www.idn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_idn_com_tw', |
115
|
|
|
|
|
|
|
'www.fountmedia.io' => 'NewsExtractor::SiteSpecificExtractor::www_fountmedia_io', |
116
|
|
|
|
|
|
|
'news.pts.org.tw' => 'NewsExtractor::SiteSpecificExtractor::news_pts_org_tw', |
117
|
|
|
|
|
|
|
'www.twreporter.org' => 'NewsExtractor::SiteSpecificExtractor::www_twreporter_org', |
118
|
|
|
|
|
|
|
'new.ctv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw', |
119
|
|
|
|
|
|
|
'hk.crntt.com' => 'NewsExtractor::SiteSpecificExtractor::hk_crntt_com', |
120
|
|
|
|
|
|
|
'hk.on.cc' => 'NewsExtractor::SiteSpecificExtractor::hk_on_cc', |
121
|
|
|
|
|
|
|
'www.hkcna.hk' => 'NewsExtractor::SiteSpecificExtractor::www_hkcna_hk', |
122
|
|
|
|
|
|
|
'www.xinhuanet.com' => 'NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com', |
123
|
|
|
|
|
|
|
'news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::news_cctv_com', |
124
|
|
|
|
|
|
|
'm.news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::m_news_cctv_com', |
125
|
|
|
|
|
|
|
'focustaiwan.tw' => 'NewsExtractor::SiteSpecificExtractor::focustaiwan_tw', |
126
|
|
|
|
|
|
|
'newtalk.tw' => 'NewsExtractor::SiteSpecificExtractor::newtalk_tw', |
127
|
|
|
|
|
|
|
'www.digitimes.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw', |
128
|
|
|
|
|
|
|
'www.mdnkids.com' => 'NewsExtractor::SiteSpecificExtractor::www_mdnkids_com', |
129
|
|
|
|
|
|
|
'www.nownews.com' => 'NewsExtractor::SiteSpecificExtractor::www_nownews_com', |
130
|
|
|
|
|
|
|
'yimedia.com.tw' => 'NewsExtractor::SiteSpecificExtractor::yimedia_com_tw', |
131
|
|
|
|
|
|
|
}, |
132
|
|
|
|
|
|
|
CSSRuleSetByHost => { |
133
|
|
|
|
|
|
|
'www.eventsinfocus.org' => { |
134
|
|
|
|
|
|
|
headline => 'h1.title > span', |
135
|
|
|
|
|
|
|
dateline => 'div.content time.datetime', |
136
|
|
|
|
|
|
|
journalist => 'div.content div.node__content div.clearfix.text-formatted > p:nth-child(1)', |
137
|
|
|
|
|
|
|
content_text => 'div.content article div.clearfix.text-formatted', |
138
|
|
|
|
|
|
|
}, |
139
|
|
|
|
|
|
|
'www.5ch.com.tw' => { |
140
|
|
|
|
|
|
|
headline => 'h3.m-ti', |
141
|
|
|
|
|
|
|
dateline => 'div.more-about div.date', |
142
|
|
|
|
|
|
|
journalist => 'div.more-about div.reporter', |
143
|
|
|
|
|
|
|
content_text => 'div.text-edit', |
144
|
|
|
|
|
|
|
}, |
145
|
|
|
|
|
|
|
'www.cw.com.tw' => { |
146
|
|
|
|
|
|
|
headline => 'div.article__head h1', |
147
|
|
|
|
|
|
|
dateline => 'div.article__detail > time', |
148
|
|
|
|
|
|
|
journalist => 'div.author--item > a', |
149
|
|
|
|
|
|
|
content_text => 'div.article__content', |
150
|
|
|
|
|
|
|
}, |
151
|
|
|
|
|
|
|
'www.taiwannews.com.tw' => { |
152
|
|
|
|
|
|
|
headline => 'h1.article-title', |
153
|
|
|
|
|
|
|
dateline => 'div.article-date', |
154
|
|
|
|
|
|
|
journalist => 'div.article-author', |
155
|
|
|
|
|
|
|
content_text => 'article.article', |
156
|
|
|
|
|
|
|
}, |
157
|
|
|
|
|
|
|
'www.enewstw.com' => { |
158
|
|
|
|
|
|
|
headline => 'td.blog_title > strong', |
159
|
|
|
|
|
|
|
dateline => 'td.blog_title tr:nth-child(2) > td.blog', |
160
|
|
|
|
|
|
|
journalist => 'td.blog_title tr:nth-child(1) > td.blog', |
161
|
|
|
|
|
|
|
content_text => 'td.new_t p', |
162
|
|
|
|
|
|
|
}, |
163
|
|
|
|
|
|
|
'www.storm.mg' => { |
164
|
|
|
|
|
|
|
headline => 'h1#article_title', |
165
|
|
|
|
|
|
|
dateline => 'span#info_time', |
166
|
|
|
|
|
|
|
journalist => '#article_info_wrapper #author_block a.link_author > span.info_author', |
167
|
|
|
|
|
|
|
content_text => 'div#article_inner_wrapper > article:nth-child(1)', |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} |
170
|
1
|
|
|
1
|
|
9
|
}; |
|
1
|
|
|
|
|
2
|
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub _build_extractor { |
173
|
0
|
|
|
0
|
|
|
my ($self) = @_; |
174
|
0
|
|
|
|
|
|
my $url = $self->tx->req->url; |
175
|
0
|
|
|
|
|
|
my $host = $url->host; |
176
|
0
|
|
|
|
|
|
my $extractor; |
177
|
0
|
0
|
|
|
|
|
if (my $sel = CSSRuleSetByHost->{$host}) { |
|
|
0
|
|
|
|
|
|
178
|
0
|
|
|
|
|
|
$extractor = NewsExtractor::CSSExtractor->new( |
179
|
|
|
|
|
|
|
css_selector => NewsExtractor::CSSRuleSet->new(%$sel), |
180
|
|
|
|
|
|
|
tx => $self->tx |
181
|
|
|
|
|
|
|
); |
182
|
|
|
|
|
|
|
} elsif (my $extractor_class = SiteSpecificExtractorByHost->{$host}) { |
183
|
0
|
|
|
|
|
|
$extractor = $extractor_class->new( tx => $self->tx ); |
184
|
|
|
|
|
|
|
} else { |
185
|
0
|
|
|
|
|
|
$extractor = NewsExtractor::GenericExtractor->new( tx => $self->tx ); |
186
|
|
|
|
|
|
|
} |
187
|
0
|
|
|
|
|
|
return $extractor; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
1; |