File Coverage

blib/lib/NewsExtractor/Extractor.pm
Criterion Covered Total %
statement 171 180 95.0
branch 0 4 0.0
condition n/a
subroutine 57 58 98.2
pod n/a
total 228 242 94.2


line stmt bran cond sub pod time code
1             package NewsExtractor::Extractor;
2 1     1   9 use Moo;
  1         3  
  1         9  
3             extends 'NewsExtractor::TXExtractor';
4              
5 1     1   400 use Mojo::Transaction::HTTP;
  1         3  
  1         15  
6 1     1   30 use Mojo::URL;
  1         2  
  1         8  
7 1     1   30 use Types::Standard qw(InstanceOf);
  1         2  
  1         13  
8 1     1   1363 use NewsExtractor::CSSRuleSet;
  1         4  
  1         62  
9 1     1   603 use NewsExtractor::CSSExtractor;
  1         5  
  1         51  
10 1     1   728 use NewsExtractor::JSONLDExtractor;
  1         4  
  1         44  
11 1     1   686 use NewsExtractor::GenericExtractor;
  1         4  
  1         54  
12 1     1   696 use NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw;
  1         5  
  1         48  
13 1     1   665 use NewsExtractor::SiteSpecificExtractor::www_allnews_tw;
  1         4  
  1         36  
14 1     1   478 use NewsExtractor::SiteSpecificExtractor::www_peopo_org;
  1         3  
  1         35  
15 1     1   494 use NewsExtractor::SiteSpecificExtractor::www_ntdtv_com;
  1         4  
  1         42  
16 1     1   595 use NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw;
  1         4  
  1         36  
17 1     1   498 use NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw;
  1         3  
  1         33  
18 1     1   446 use NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com;
  1         3  
  1         33  
19 1     1   500 use NewsExtractor::SiteSpecificExtractor::www_rti_org_tw;
  1         4  
  1         33  
20 1     1   473 use NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw;
  1         4  
  1         34  
21 1     1   506 use NewsExtractor::SiteSpecificExtractor::www_setn_com;
  1         14  
  1         42  
22 1     1   540 use NewsExtractor::SiteSpecificExtractor::news_tnn_tw;
  1         3  
  1         36  
23 1     1   453 use NewsExtractor::SiteSpecificExtractor::turnnewsapp_com;
  1         3  
  1         34  
24 1     1   444 use NewsExtractor::SiteSpecificExtractor::news_cts_com_tw;
  1         5  
  1         51  
25 1     1   591 use NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw;
  1         4  
  1         47  
26 1     1   564 use NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw;
  1         4  
  1         37  
27 1     1   501 use NewsExtractor::SiteSpecificExtractor::www_upmedia_mg;
  1         4  
  1         35  
28 1     1   437 use NewsExtractor::SiteSpecificExtractor::ctee_com_tw;
  1         4  
  1         32  
29 1     1   525 use NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw;
  1         3  
  1         33  
30 1     1   426 use NewsExtractor::SiteSpecificExtractor::newnet_tw;
  1         3  
  1         34  
31 1     1   407 use NewsExtractor::SiteSpecificExtractor::www_thestandnews_com;
  1         4  
  1         31  
32 1     1   453 use NewsExtractor::SiteSpecificExtractor::www_epochtimes_com;
  1         3  
  1         34  
33 1     1   444 use NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw;
  1         3  
  1         34  
34 1     1   513 use NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw;
  1         4  
  1         34  
35 1     1   433 use NewsExtractor::SiteSpecificExtractor::www_idn_com_tw;
  1         4  
  1         32  
36 1     1   421 use NewsExtractor::SiteSpecificExtractor::www_fountmedia_io;
  1         3  
  1         33  
37 1     1   444 use NewsExtractor::SiteSpecificExtractor::news_pts_org_tw;
  1         3  
  1         33  
38 1     1   442 use NewsExtractor::SiteSpecificExtractor::www_twreporter_org;
  1         4  
  1         33  
39 1     1   452 use NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw;
  1         3  
  1         31  
40 1     1   421 use NewsExtractor::SiteSpecificExtractor::hk_crntt_com;
  1         4  
  1         32  
41 1     1   462 use NewsExtractor::SiteSpecificExtractor::hk_on_cc;
  1         5  
  1         31  
42 1     1   447 use NewsExtractor::SiteSpecificExtractor::www_hkcna_hk;
  1         3  
  1         32  
43 1     1   426 use NewsExtractor::SiteSpecificExtractor::www_hkcnews_com;
  1         4  
  1         28  
44 1     1   474 use NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com;
  1         5  
  1         32  
45 1     1   426 use NewsExtractor::SiteSpecificExtractor::news_cctv_com;
  1         3  
  1         31  
46 1     1   471 use NewsExtractor::SiteSpecificExtractor::m_news_cctv_com;
  1         4  
  1         31  
47 1     1   433 use NewsExtractor::SiteSpecificExtractor::focustaiwan_tw;
  1         3  
  1         36  
48 1     1   432 use NewsExtractor::SiteSpecificExtractor::newtalk_tw;
  1         3  
  1         32  
49 1     1   411 use NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw;
  1         3  
  1         30  
50 1     1   414 use NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw;
  1         4  
  1         32  
51 1     1   423 use NewsExtractor::SiteSpecificExtractor::www_mdnkids_com;
  1         2  
  1         33  
52 1     1   426 use NewsExtractor::SiteSpecificExtractor::www_nownews_com;
  1         3  
  1         35  
53 1     1   420 use NewsExtractor::SiteSpecificExtractor::www_penghutimes_com;
  1         3  
  1         31  
54 1     1   438 use NewsExtractor::SiteSpecificExtractor::www_aljazeera_com;
  1         5  
  1         29  
55 1     1   410 use NewsExtractor::SiteSpecificExtractor::www_bbc_com;
  1         3  
  1         35  
56 1     1   462 use NewsExtractor::SiteSpecificExtractor::yimedia_com_tw;
  1         4  
  1         36  
57 1     1   415 use NewsExtractor::SiteSpecificExtractor::UDN;
  1         3  
  1         32  
58 1     1   440 use NewsExtractor::SiteSpecificExtractor::ETtoday;
  1         3  
  1         72  
59 1     1   399 use NewsExtractor::SiteSpecificExtractor::ChinaTimes;
  1         3  
  1         276  
60              
61             has extractor => (
62             required => 0,
63             is => 'lazy',
64             isa => InstanceOf["NewsExtractor::CSSExtractor",
65             "NewsExtractor::JSONLDExtractor",
66             "NewsExtractor::SiteSpecificExtractor",
67             "NewsExtractor::GenericExtractor"],
68             builder => 1,
69             handles => [qw( headline dateline journalist content_text )],
70             );
71              
72             use constant {
73 1         342 SiteSpecificExtractorByHost => {
74             'www.bbc.com' => 'NewsExtractor::SiteSpecificExtractor::www_bbc_com',
75             'www.aljazeera.com' => 'NewsExtractor::SiteSpecificExtractor::www_aljazeera_com',
76             'www.penghutimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_penghutimes_com',
77             'www.ustv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ustv_com_tw',
78             'www.epochtimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_epochtimes_com',
79             'www.hkcnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_hkcnews_com',
80             'www.thestandnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_thestandnews_com',
81             'www.allnews.tw' => 'NewsExtractor::SiteSpecificExtractor::www_allnews_tw',
82             'www.rvn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw',
83             'www.chinatimes.com' => 'NewsExtractor::SiteSpecificExtractor::ChinaTimes',
84             'video.udn.com' => 'NewsExtractor::JSONLDExtractor',
85             'www.ctwant.com' => 'NewsExtractor::JSONLDExtractor',
86             'www.peopo.org' => 'NewsExtractor::SiteSpecificExtractor::www_peopo_org',
87             'www.ntdtv.com' => 'NewsExtractor::SiteSpecificExtractor::www_ntdtv_com',
88             'www.ksnews.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw',
89             'news.tvbs.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw',
90             'udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
91             'stars.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
92             'money.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
93             'house.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
94             'opinion.udn.com' => 'NewsExtractor::SiteSpecificExtractor::UDN',
95             'www.taipeitimes.com' => 'NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com',
96             'www.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
97             'star.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
98             'house.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
99             'health.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
100             'www.rti.org.tw' => 'NewsExtractor::SiteSpecificExtractor::www_rti_org_tw',
101             'www.bcc.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw',
102             'www.setn.com' => 'NewsExtractor::SiteSpecificExtractor::www_setn_com',
103             'news.tnn.tw' => 'NewsExtractor::SiteSpecificExtractor::news_tnn_tw',
104             'turnnewsapp.com' => 'NewsExtractor::SiteSpecificExtractor::turnnewsapp_com',
105             'news.cts.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_cts_com_tw',
106             'talk.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::talk_ltn_com_tw',
107             'estate.ltn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw',
108             'www.upmedia.mg' => 'NewsExtractor::SiteSpecificExtractor::www_upmedia_mg',
109             'ctee.com.tw' => 'NewsExtractor::SiteSpecificExtractor::ctee_com_tw',
110             'news.ebc.net.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw',
111             'newnet.tw' => 'NewsExtractor::SiteSpecificExtractor::newnet_tw',
112             'www.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw',
113             'news.ttv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::news_ttv_com_tw',
114             'www.idn.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_idn_com_tw',
115             'www.fountmedia.io' => 'NewsExtractor::SiteSpecificExtractor::www_fountmedia_io',
116             'news.pts.org.tw' => 'NewsExtractor::SiteSpecificExtractor::news_pts_org_tw',
117             'www.twreporter.org' => 'NewsExtractor::SiteSpecificExtractor::www_twreporter_org',
118             'new.ctv.com.tw' => 'NewsExtractor::SiteSpecificExtractor::new_ctv_com_tw',
119             'hk.crntt.com' => 'NewsExtractor::SiteSpecificExtractor::hk_crntt_com',
120             'hk.on.cc' => 'NewsExtractor::SiteSpecificExtractor::hk_on_cc',
121             'www.hkcna.hk' => 'NewsExtractor::SiteSpecificExtractor::www_hkcna_hk',
122             'www.xinhuanet.com' => 'NewsExtractor::SiteSpecificExtractor::www_xinhuanet_com',
123             'news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::news_cctv_com',
124             'm.news.cctv.com' => 'NewsExtractor::SiteSpecificExtractor::m_news_cctv_com',
125             'focustaiwan.tw' => 'NewsExtractor::SiteSpecificExtractor::focustaiwan_tw',
126             'newtalk.tw' => 'NewsExtractor::SiteSpecificExtractor::newtalk_tw',
127             'www.digitimes.com.tw' => 'NewsExtractor::SiteSpecificExtractor::www_digitimes_com_tw',
128             'www.mdnkids.com' => 'NewsExtractor::SiteSpecificExtractor::www_mdnkids_com',
129             'www.nownews.com' => 'NewsExtractor::SiteSpecificExtractor::www_nownews_com',
130             'yimedia.com.tw' => 'NewsExtractor::SiteSpecificExtractor::yimedia_com_tw',
131             },
132             CSSRuleSetByHost => {
133             'www.eventsinfocus.org' => {
134             headline => 'h1.title > span',
135             dateline => 'div.content time.datetime',
136             journalist => 'div.content div.node__content div.clearfix.text-formatted > p:nth-child(1)',
137             content_text => 'div.content article div.clearfix.text-formatted',
138             },
139             'www.5ch.com.tw' => {
140             headline => 'h3.m-ti',
141             dateline => 'div.more-about div.date',
142             journalist => 'div.more-about div.reporter',
143             content_text => 'div.text-edit',
144             },
145             'www.cw.com.tw' => {
146             headline => 'div.article__head h1',
147             dateline => 'div.article__detail > time',
148             journalist => 'div.author--item > a',
149             content_text => 'div.article__content',
150             },
151             'www.taiwannews.com.tw' => {
152             headline => 'h1.article-title',
153             dateline => 'div.article-date',
154             journalist => 'div.article-author',
155             content_text => 'article.article',
156             },
157             'www.enewstw.com' => {
158             headline => 'td.blog_title > strong',
159             dateline => 'td.blog_title tr:nth-child(2) > td.blog',
160             journalist => 'td.blog_title tr:nth-child(1) > td.blog',
161             content_text => 'td.new_t p',
162             },
163             'www.storm.mg' => {
164             headline => 'h1#article_title',
165             dateline => 'span#info_time',
166             journalist => '#article_info_wrapper #author_block a.link_author > span.info_author',
167             content_text => 'div#article_inner_wrapper > article:nth-child(1)',
168             }
169             }
170 1     1   9 };
  1         2  
171              
172             sub _build_extractor {
173 0     0     my ($self) = @_;
174 0           my $url = $self->tx->req->url;
175 0           my $host = $url->host;
176 0           my $extractor;
177 0 0         if (my $sel = CSSRuleSetByHost->{$host}) {
    0          
178 0           $extractor = NewsExtractor::CSSExtractor->new(
179             css_selector => NewsExtractor::CSSRuleSet->new(%$sel),
180             tx => $self->tx
181             );
182             } elsif (my $extractor_class = SiteSpecificExtractorByHost->{$host}) {
183 0           $extractor = $extractor_class->new( tx => $self->tx );
184             } else {
185 0           $extractor = NewsExtractor::GenericExtractor->new( tx => $self->tx );
186             }
187 0           return $extractor;
188             }
189              
190             1;