blib/lib/Novel/Robot/Parser.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 162 | 386 | 41.9 |
branch | 34 | 138 | 24.6 |
condition | 18 | 121 | 14.8 |
subroutine | 28 | 59 | 47.4 |
pod | 3 | 34 | 8.8 |
total | 245 | 738 | 33.2 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # ABSTRACT: get novel / bbs content from website | ||||||
2 | package Novel::Robot::Parser; | ||||||
3 | |||||||
4 | 3 | 3 | 587 | use strict; | |||
3 | 6 | ||||||
3 | 98 | ||||||
5 | 3 | 3 | 15 | use warnings; | |||
3 | 6 | ||||||
3 | 112 | ||||||
6 | 3 | 3 | 697 | use utf8; | |||
3 | 18 | ||||||
3 | 15 | ||||||
7 | |||||||
8 | 3 | 3 | 1443 | use Novel::Robot::Browser; | |||
3 | 33 | ||||||
3 | 126 | ||||||
9 | 3 | 3 | 20 | use URI; | |||
3 | 9 | ||||||
3 | 60 | ||||||
10 | 3 | 3 | 17 | use Encode; | |||
3 | 6 | ||||||
3 | 278 | ||||||
11 | 3 | 3 | 1601 | use Web::Scraper; | |||
3 | 230690 | ||||||
3 | 24 | ||||||
12 | 3 | 3 | 240 | use HTML::TreeBuilder; | |||
3 | 6 | ||||||
3 | 33 | ||||||
13 | 3 | 3 | 83 | use Data::Dumper; | |||
3 | 9 | ||||||
3 | 4941 | ||||||
14 | |||||||
15 | ### {{{ data | ||||||
16 | |||||||
17 | #our $VERSION = 0.32; | ||||||
18 | |||||||
19 | our %SITE_DOM_NAME = ( | ||||||
20 | 'bbs.jjwxc.net' => 'hjj', | ||||||
21 | 'www.jjwxc.net' => 'jjwxc', | ||||||
22 | 'tieba.baidu.com' => 'tieba', | ||||||
23 | |||||||
24 | 'www.ddshu.net' => 'ddshu', | ||||||
25 | 'www.kanunu8.com' => 'kanunu8', | ||||||
26 | ); | ||||||
27 | |||||||
28 | our %NULL_INDEX = ( | ||||||
29 | url => '', | ||||||
30 | book => '', | ||||||
31 | writer => '', | ||||||
32 | writer_url => '', | ||||||
33 | item_list => [], | ||||||
34 | |||||||
35 | intro => '', | ||||||
36 | series => '', | ||||||
37 | progress => '', | ||||||
38 | word_num => '', | ||||||
39 | ); | ||||||
40 | |||||||
41 | our %NULL_CHAPTER = ( | ||||||
42 | content => '', | ||||||
43 | id => 0, | ||||||
44 | pid => 0, | ||||||
45 | time => '', | ||||||
46 | title => '', | ||||||
47 | url => '', | ||||||
48 | writer => '', | ||||||
49 | writer_say => '', | ||||||
50 | abstract => '', | ||||||
51 | word_num => '', | ||||||
52 | type => '', | ||||||
53 | ); | ||||||
54 | |||||||
55 | ### }}} | ||||||
56 | |||||||
57 | ### init {{{ | ||||||
58 | sub new { | ||||||
59 | 2 | 2 | 1 | 1396 | my ( $self, %opt ) = @_; | ||
60 | |||||||
61 | 2 | 18 | $opt{site} = $self->detect_site( $opt{site} ); | ||||
62 | |||||||
63 | 2 | 8 | my $module = "Novel::Robot::Parser::$opt{site}"; | ||||
64 | 2 | 128 | eval "require $module;"; | ||||
65 | |||||||
66 | 2 | 30 | my $browser = Novel::Robot::Browser->new( %opt ); | ||||
67 | 2 | 21 | bless { browser => $browser, %opt }, $module; | ||||
68 | } | ||||||
69 | |||||||
70 | 0 | 0 | sub domain { } | ||||
71 | |||||||
72 | sub detect_domain { | ||||||
73 | 0 | 0 | 0 | 0 | my ( $self, $url ) = @_; | ||
74 | 0 | 0 | 0 | return ( $url, $url ) unless ( $url =~ /^https?:/ ); | |||
75 | |||||||
76 | 0 | 0 | my ( $dom ) = $url =~ m#^.*?\/\/(.+?)(?:/|$)#; | ||||
77 | |||||||
78 | 0 | 0 | my $base_dom = $dom; | ||||
79 | 0 | 0 | $base_dom =~ s/^[^.]+\.//; | ||||
80 | 0 | 0 | 0 | $base_dom = $base_dom =~ /\./ ? $base_dom : $dom; | |||
81 | 0 | 0 | return ( $dom, $base_dom ); | ||||
82 | } | ||||||
83 | |||||||
84 | sub detect_site { | ||||||
85 | 2 | 2 | 0 | 6 | my ( $self, $url ) = @_; | ||
86 | |||||||
87 | 2 | 50 | 33 | 19 | if ( $url and $url =~ /^https?:/ ) { | ||
88 | 0 | 0 | my ( $dom ) = $url =~ m#^.*?\/\/(.+?)/#; | ||||
89 | 0 | 0 | 0 | return $SITE_DOM_NAME{$dom} if ( exists $SITE_DOM_NAME{$dom} ); | |||
90 | 0 | 0 | return 'default'; | ||||
91 | } | ||||||
92 | |||||||
93 | 2 | 50 | 9 | return $url // 'default'; | |||
94 | } | ||||||
95 | ### }}} | ||||||
96 | |||||||
97 | ### {{{ common | ||||||
98 | 2 | 2 | 0 | 8 | sub site_type { 'novel' } | ||
99 | 0 | 0 | 0 | 0 | sub charset { 'cp936' } | ||
100 | 0 | 0 | sub base_url { } | ||||
101 | |||||||
102 | sub generate_novel_url { | ||||||
103 | 0 | 0 | 0 | 0 | my ( $self, $index_url, @args ) = @_; | ||
104 | 0 | 0 | return ( $index_url, @args ); | ||||
105 | } | ||||||
106 | |||||||
107 | ### }}} | ||||||
108 | |||||||
109 | ### {{{ novel | ||||||
110 | |||||||
111 | sub get_novel_info { | ||||||
112 | 1 | 1 | 1 | 12 | my ( $self, $url ) = @_; | ||
113 | 1 | 4 | my ( $i_url, $post_data ) = $self->generate_novel_url( $url ); | ||||
114 | 1 | 8 | my $c = $self->{browser}->request_url( $i_url, $post_data ); | ||||
115 | |||||||
116 | 1 | 25 | my $r = $self->extract_elements( | ||||
117 | \$c, | ||||||
118 | path => $self->scrape_novel(), | ||||||
119 | sub => $self->can( "parse_novel" ), | ||||||
120 | ); | ||||||
121 | 1 | 11 | $r->{item_list} = $self->parse_item_list( \$c, $r ); | ||||
122 | 1 | 6 | ( $r->{item_list}, $r->{item_num} ) = $self->update_item_list( $r->{item_list}, $url ); | ||||
123 | 1 | 6 | return $r; | ||||
124 | } | ||||||
125 | |||||||
126 | sub get_novel_ref { | ||||||
127 | 2 | 2 | 1 | 904 | my ( $self, $index_url, %o ) = @_; | ||
128 | |||||||
129 | 2 | 50 | 8 | return $self->get_tiezi_ref( $index_url, %o ) if ( $self->site_type() eq 'tiezi' ); | |||
130 | |||||||
131 | 2 | 5 | my ( $r, $item_list, $max_item_num ); | ||||
132 | 2 | 50 | 8 | if ( $index_url !~ /^https?:/ ) { | |||
133 | 2 | 10 | $r = $self->parse_novel( $index_url, %o ); | ||||
134 | } else { | ||||||
135 | 0 | 0 | my ( $i_url, $post_data ) = $self->generate_novel_url( $index_url ); | ||||
136 | |||||||
137 | ( $r, $item_list ) = $self->{browser}->request_url_whole( | ||||||
138 | $i_url, | ||||||
139 | post_data => $post_data, | ||||||
140 | info_sub => sub { | ||||||
141 | 0 | 0 | 0 | $self->extract_elements( | |||
142 | @_, | ||||||
143 | path => $self->scrape_novel(), | ||||||
144 | sub => $self->can( "parse_novel" ), | ||||||
145 | ); | ||||||
146 | }, | ||||||
147 | 0 | 0 | 0 | item_list_sub => sub { $self->can( "parse_item_list" )->( $self, @_ ) }, | |||
148 | item_sub => sub { | ||||||
149 | 0 | 0 | 0 | $self->extract_elements( | |||
150 | @_, | ||||||
151 | path => $self->scrape_novel_item(), | ||||||
152 | sub => $self->can( "parse_novel_item" ), | ||||||
153 | ); | ||||||
154 | }, | ||||||
155 | 0 | 0 | %o, | ||||
156 | ); | ||||||
157 | |||||||
158 | 0 | 0 | $r->{url} = $index_url; | ||||
159 | 0 | 0 | 0 | $r->{item_list} = $item_list || []; | |||
160 | |||||||
161 | #$r->{item_num} = $max_item_num || undef; | ||||||
162 | } ## end else [ if ( $index_url !~ /^https?:/)] | ||||||
163 | |||||||
164 | 2 | 10 | ( $r->{item_list}, $r->{item_num} ) = $self->update_item_list( $r->{item_list}, $index_url ); | ||||
165 | 2 | 12 | $self->filter_item_list( $r, %o ); | ||||
166 | 2 | 13 | $r->{writer_url} = $self->format_abs_url( $r->{writer_url}, $index_url ); | ||||
167 | |||||||
168 | 2 | 6 | for my $k ( qw/writer book/ ) { | ||||
169 | 4 | 50 | 12 | $r->{$k} = $o{$k} if ( exists $o{$k} ); | |||
170 | } | ||||||
171 | 2 | 66 | 35 | $r->{$_} ||= $NULL_INDEX{$_} for keys( %NULL_INDEX ); | |||
172 | 2 | 15 | $r->{$_} = $self->tidy_string( $r->{$_} ) for qw/writer book/; | ||||
173 | |||||||
174 | 2 | 8 | return $r; | ||||
175 | } ## end sub get_novel_ref | ||||||
176 | |||||||
177 | sub scrape_novel { | ||||||
178 | 1 | 1 | 0 | 5 | my ( $self ) = @_; | ||
179 | 1 | 5 | my $r = {}; | ||||
180 | 1 | 50 | 7 | push @{$r->{book}}, { path => $self->{book_path} } if ( exists $self->{book_path} ); | |||
0 | 0 | ||||||
181 | 1 | 50 | 5 | push @{$r->{book}}, { regex => $self->{book_regex} } if ( exists $self->{book_regex} ); | |||
0 | 0 | ||||||
182 | 1 | 50 | 5 | push @{$r->{writer}}, { path => $self->{writer_path} } if ( exists $self->{writer_path} ); | |||
0 | 0 | ||||||
183 | 1 | 50 | 5 | push @{$r->{writer}}, { regex => $self->{writer_regex} } if ( exists $self->{writer_regex} ); | |||
0 | 0 | ||||||
184 | 1 | 44 | return $r; | ||||
185 | } | ||||||
186 | |||||||
187 | sub parse_novel { | ||||||
188 | 0 | 0 | 0 | 0 | my ( $self, $h, $r ) = @_; | ||
189 | |||||||
190 | 0 | 0 | 0 | $r->{book} ||= $self->scrape_element_try( | |||
191 | $h, | ||||||
192 | [ { path => '//meta[@name="og:novel:book_name"]', extract => '@content' }, | ||||||
193 | { path => '//meta[@property="og:novel:book_name"]', extract => '@content' }, | ||||||
194 | { path => '//meta[@property="og:title"]', extract => '@content' }, | ||||||
195 | { path => '//div[@id="title"]', }, | ||||||
196 | { path => '//div[@class="title"]', }, | ||||||
197 | { regex => qr# |
||||||
198 | { regex => qr# |
||||||
199 | { regex => qr# |
||||||
200 | { path => '//h1', }, | ||||||
201 | { path => '//h2', }, | ||||||
202 | ], | ||||||
203 | sub => $self->can( "tidy_writer_book" ), | ||||||
204 | ); | ||||||
205 | |||||||
206 | 0 | 0 | 0 | $r->{writer} ||= $self->scrape_element_try( | |||
207 | $h, | ||||||
208 | [ { path => '//meta[@name="author"]', extract => '@content' }, | ||||||
209 | |||||||
210 | { path => '//meta[@name="og:novel:author"]', extract => '@content' }, | ||||||
211 | { path => '//meta[@property="og:novel:author"]', extract => '@content' }, | ||||||
212 | { path => '//*[@class="author"]', }, | ||||||
213 | { path => '//*[@class="writer"]', }, | ||||||
214 | 3 | 3 | 68 | { regex => qr#作者:([^<]+)#si, }, | |||
3 | 7 | ||||||
3 | 53 | ||||||
215 | { regex => qr#作者:([^<]+)#si, }, | ||||||
216 | { regex => qr#<(?:em|i|h3|h2|span)>作者:([^<]+)(?:em|i|h3|h2|span)>#si, }, | ||||||
217 | { regex => qr#作者:(?:)?]*>([^<]+)#si, }, | ||||||
218 | { regex => qr# 作(?: |\s)*者:([^<]+) #si, }, |
||||||
219 | { regex => qr#作者:([^<]+?) 发布时间:#s, }, | ||||||
220 | { regex => qr#content="([^"]+?)最新著作#s, }, | ||||||
221 | { regex => qr# |
||||||
222 | { regex => qr# |
||||||
223 | { regex => qr#content="[^"]+?,([^",]+?)作品#s, }, | ||||||
224 | ], | ||||||
225 | sub => $self->can( "tidy_writer_book" ), | ||||||
226 | ); | ||||||
227 | |||||||
228 | 0 | 0 | $r->{$_} = $self->tidy_writer_book( $r->{$_} ) for qw/writer book title/; | ||||
229 | |||||||
230 | 0 | 0 | return $r; | ||||
231 | } ## end sub parse_novel | ||||||
232 | |||||||
233 | sub scrape_item_list { | ||||||
234 | 0 | 0 | 0 | 0 | my ( $self ) = @_; | ||
235 | 0 | 0 | my $r = {}; | ||||
236 | 0 | 0 | 0 | $r->{path} = $self->{item_list_path} if ( exists $self->{item_list_path} ); | |||
237 | 0 | 0 | return $r; | ||||
238 | } | ||||||
239 | |||||||
240 | sub parse_item_list { | ||||||
241 | 1 | 1 | 0 | 5 | my ( $self, $h, $r ) = @_; | ||
242 | |||||||
243 | 1 | 50 | 10 | return $r->{item_list} if ( exists $r->{item_list} ); | |||
244 | |||||||
245 | 0 | 0 | my $path_r = $self->scrape_item_list(); | ||||
246 | |||||||
247 | 0 | 0 | 0 | return $self->guess_item_list( $h ) unless ( exists $path_r->{path} ); | |||
248 | |||||||
249 | my $parse_novel = scraper { | ||||||
250 | process $path_r->{path}, | ||||||
251 | 0 | 0 | 0 | 'item_list[]' => { | |||
252 | 'title' => 'TEXT', | ||||||
253 | 'url' => '@href' | ||||||
254 | }; | ||||||
255 | 0 | 0 | }; | ||||
256 | 0 | 0 | my $ref = $parse_novel->scrape( $h ); | ||||
257 | |||||||
258 | 0 | 0 | 0 | my @chap = grep { exists $_->{url} and $_->{url} } @{ $ref->{item_list} }; | |||
0 | 0 | ||||||
0 | 0 | ||||||
259 | |||||||
260 | 0 | 0 | 0 | if ( $path_r->{sort} ) { | |||
261 | 0 | 0 | @chap = sort { $a->{url} cmp $b->{url} } @chap; | ||||
0 | 0 | ||||||
262 | } | ||||||
263 | |||||||
264 | 0 | 0 | return \@chap; | ||||
265 | } ## end sub parse_item_list | ||||||
266 | |||||||
267 | sub guess_item_list { | ||||||
268 | 0 | 0 | 0 | 0 | my ( $self, $h, %opt ) = @_; | ||
269 | |||||||
270 | 0 | 0 | my $new_h = $$h; | ||||
271 | 0 | 0 | $new_h=~s# |
||||
272 | |||||||
273 | 0 | 0 | my $tree = HTML::TreeBuilder->new(); | ||||
274 | 0 | 0 | $tree->parse( $new_h ); | ||||
275 | |||||||
276 | 0 | 0 | my @links = $tree->look_down( '_tag', 'a' ); | ||||
277 | 0 | 0 | @links = grep { $_->attr( 'href' ) } @links; | ||||
0 | 0 | ||||||
278 | 0 | 0 | for my $x ( @links ) { | ||||
279 | 0 | 0 | my $up_url = $x->attr( 'href' ); | ||||
280 | 0 | 0 | $up_url =~ s#/[^/]+/?$#/#; | ||||
281 | 0 | 0 | 0 | $up_url = '.' if ( $up_url !~ m#/# ); | |||
282 | |||||||
283 | 0 | 0 | $x = { parent => $up_url, depth => $x->depth(), url => $x->attr( 'href' ), title => $x->as_text() }; | ||||
284 | } | ||||||
285 | |||||||
286 | 0 | 0 | my @out_links; | ||||
287 | 0 | 0 | my @temp_arr = ( $links[0] ); | ||||
288 | 0 | 0 | my $parent = $links[0]{parent}; | ||||
289 | 0 | 0 | my $depth = $links[0]{depth}; | ||||
290 | 0 | 0 | for ( my $i = 1 ; $i <= $#links ; $i++ ) { | ||||
291 | 0 | 0 | 0 | 0 | if ( $depth == $links[$i]{depth} and $parent eq $links[$i]{parent} ) { | ||
292 | 0 | 0 | push @temp_arr, $links[$i]; | ||||
293 | } else { | ||||||
294 | 0 | 0 | push @out_links, [@temp_arr]; | ||||
295 | 0 | 0 | @temp_arr = ( $links[$i] ); | ||||
296 | 0 | 0 | $depth = $links[$i]{depth}; | ||||
297 | 0 | 0 | $parent = $links[$i]{parent}; | ||||
298 | } | ||||||
299 | } | ||||||
300 | |||||||
301 | 0 | 0 | 0 | push @out_links, \@temp_arr if ( @temp_arr ); | |||
302 | |||||||
303 | 0 | 0 | @out_links = sort { scalar( @$b ) <=> scalar( @$a ) } @out_links; | ||||
0 | 0 | ||||||
304 | |||||||
305 | 0 | 0 | my $res_arr; | ||||
306 | 0 | 0 | my $title_regex = | ||||
307 | qr/引子|楔子|内容简介|正文|序言|文案|第\s*[0123456789零○〇一二三四五六七八九十百千\d]+\s*(章|节)|(^[0-9]+)/; | ||||||
308 | 0 | 0 | my $chap_num_regex = qr/(^|\/)\d+(\.html)?$/; | ||||
309 | 0 | 0 | for my $arr ( @out_links ) { | ||||
310 | 0 | 0 | my $x = $arr->[0]; | ||||
311 | 0 | 0 | my $y = $arr->[1]; | ||||
312 | 0 | 0 | my $z = $arr->[-1]; | ||||
313 | |||||||
314 | 0 | 0 | 0 | 0 | $res_arr = $arr if ( $opt{chapter_url_regex} and $x->{url} =~ /$opt{chapter_url_regex}/ ); | ||
315 | 0 | 0 | 0 | 0 | $res_arr = $arr if ( $opt{chapter_title_regex} and $x->{title} =~ /$opt{chapter_title_regex}/ ); | ||
316 | $res_arr = $arr | ||||||
317 | 0 | 0 | 0 | 0 | if ( ($x and $x->{title} =~ /$title_regex/) or ( $y and $y->{title} =~ /$title_regex/ ) or ( $z and $z->{title} =~ /$title_regex/ ) ); | ||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
318 | 0 | 0 | 0 | 0 | $res_arr = $arr if ( ( $x->{url} =~ /$chap_num_regex/ or $z->{url} =~ /$chap_num_regex/ ) and scalar( @$arr ) > 50 ); | ||
0 | |||||||
319 | |||||||
320 | #$res_arr= $arr if( ($x->{url}=~/\/?\d+$/ or $z->{url}=~/\/?\d+$/) and scalar(@$arr)>50); | ||||||
321 | 0 | 0 | 0 | last if ( $res_arr ); | |||
322 | } | ||||||
323 | |||||||
324 | #remove not chapter url | ||||||
325 | 0 | 0 | while ( 1 ) { | ||||
326 | 0 | 0 | my $x = $res_arr->[0]; | ||||
327 | 0 | 0 | my $y = $res_arr->[ int( $#$res_arr / 2 ) ]; | ||||
328 | 0 | 0 | 0 | 0 | if ( defined $y->{title} and $y->{title} =~ /$title_regex/ and defined $y->{url} and $y->{url} =~ /\.html$/ and $x->{url} !~ /\.html$/ ) { | ||
0 | 0 | ||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
0 | |||||||
329 | 0 | 0 | shift( @$res_arr ); | ||||
330 | } elsif ( defined $y->{title} and $y->{title} =~ /$title_regex/ and defined $y->{url} and $y->{url} =~ /$chap_num_regex/ and $x->{url} !~ /$chap_num_regex/ ) { | ||||||
331 | 0 | 0 | shift( @$res_arr ); | ||||
332 | } else { | ||||||
333 | 0 | 0 | last; | ||||
334 | } | ||||||
335 | } | ||||||
336 | |||||||
337 | #sort chapter url | ||||||
338 | 0 | 0 | 0 | 0 | if ( $res_arr and defined $res_arr->[0]{url} and $res_arr->[0]{url} =~ /$chap_num_regex/ ) { | ||
0 | |||||||
339 | 0 | 0 | 0 | my $trim_sub = sub { my $s = $_[0]; $s =~ s/^.+\///; $s =~ s/\.html$//; return $s }; | |||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
0 | 0 | ||||||
340 | 0 | 0 | my @sort_arr; | ||||
341 | 0 | 0 | 0 | if($opt{sort_chapter_url}){ | |||
342 | 0 | 0 | @sort_arr = sort { $trim_sub->( $a->{url} ) <=> $trim_sub->( $b->{url} ) } grep { $_->{url} =~ /$chap_num_regex/ } @$res_arr; | ||||
0 | 0 | ||||||
0 | 0 | ||||||
343 | }else{ | ||||||
344 | 0 | 0 | @sort_arr = @$res_arr; | ||||
345 | } | ||||||
346 | 0 | 0 | my @s = map { $trim_sub->( $_->{url} ) } @sort_arr; | ||||
0 | 0 | ||||||
347 | 0 | 0 | my $random_sort = 0; | ||||
348 | 0 | 0 | for my $i ( 0 .. $#s - 1 ) { | ||||
349 | 0 | 0 | 0 | $random_sort = 1 if ( $s[$i] > $s[ $i + 1 ] ); | |||
350 | 0 | 0 | 0 | last if ( $random_sort ); | |||
351 | } | ||||||
352 | 0 | 0 | 0 | return \@sort_arr if ( $random_sort == 0 ); | |||
353 | } | ||||||
354 | |||||||
355 | 0 | 0 | 0 | return $res_arr || []; | |||
356 | } ## end sub guess_item_list | ||||||
357 | |||||||
358 | sub scrape_novel_item { | ||||||
359 | 1 | 1 | 0 | 32 | my ( $self ) = @_; | ||
360 | 1 | 4 | my $r = {}; | ||||
361 | 1 | 50 | 8 | push @{$r->{content}}, { path => $self->{content_path}, extract => 'HTML' } if( exists $self->{content_path} ); | |||
0 | 0 | ||||||
362 | 1 | 50 | 6 | push @{$r->{content}}, { regex => $self->{content_regex} } if( exists $self->{content_regex} ); | |||
0 | 0 | ||||||
363 | 1 | 4 | push @{$r->{content}}, ( | ||||
1 | 8 | ||||||
364 | { path => '//div[@class="novel_content"]' }, | ||||||
365 | { path => '//div[@id="content"]' }, | ||||||
366 | ); | ||||||
367 | 1 | 13 | return $r; | ||||
368 | } | ||||||
369 | |||||||
370 | sub parse_novel_item { | ||||||
371 | 0 | 0 | 0 | 0 | my ( $self, $h, $r ) = @_; | ||
372 | |||||||
373 | 0 | 0 | 0 | $r = $self->guess_novel_item( $h ) unless ( $r->{content} ); | |||
374 | 0 | 0 | 0 | $r->{$_} ||= $NULL_CHAPTER{$_} for keys( %NULL_CHAPTER ); | |||
375 | 0 | 0 | $r->{content} = $self->tidy_content( $r->{content} ); | ||||
376 | |||||||
377 | 0 | 0 | my $next_url = $self->scrape_element_try($h, [ | ||||
378 | { path => '//a[@id="next_url"]', extract =>'@href' }, | ||||||
379 | { path => '//a[contains(text(),"下一页")]', extract => '@href' }, | ||||||
380 | ]); | ||||||
381 | 0 | 0 | 0 | if($next_url){ | |||
382 | 0 | 0 | $r->{next_url} = $next_url; | ||||
383 | } | ||||||
384 | |||||||
385 | 0 | 0 | return $r; | ||||
386 | } | ||||||
387 | |||||||
388 | sub guess_novel_item { | ||||||
389 | 0 | 0 | 0 | 0 | my ( $self, $h, %opt ) = @_; | ||
390 | |||||||
391 | 0 | 0 | $$h =~ s###sg; | ||||
392 | 0 | 0 | $$h =~ s###sg; | ||||
393 | |||||||
394 | 0 | 0 | my $tree = HTML::TreeBuilder->new(); | ||||
395 | 0 | 0 | $tree->parse( $$h ); | ||||
396 | |||||||
397 | 0 | 0 | my @links = $tree->look_down( 'text', undef ); | ||||
398 | 0 | 0 | for my $x ( @links ) { | ||||
399 | 0 | 0 | $x = { content => $x->as_HTML( '<>&' ) }; | ||||
400 | 0 | 0 | $self->calc_content_wordnum( $x ); | ||||
401 | } | ||||||
402 | 0 | 0 | my @out_links = sort { $b->{word_num} <=> $a->{word_num} } @links; | ||||
0 | 0 | ||||||
403 | |||||||
404 | 0 | 0 | my $no_next_r; | ||||
405 | 0 | 0 | for my $r ( @out_links ) { | ||||
406 | 0 | 0 | 0 | next if ( $r->{content} =~ m#(style|head|body|html)>#s ); | |||
407 | 0 | 0 | 0 | next if ( $r->{content} =~ m#^\s* | |||
408 | 0 | 0 | 0 | next if ( $r->{content} =~ /(上|下)一(章|页|篇)/s ); | |||
409 | 0 | 0 | 0 | next if ( $r->{content} =~ m##s ); | |||
410 | 0 | 0 | 0 | next if ( $r->{content} =~ m#All rights reserved#s ); | |||
411 | 0 | 0 | 0 | next if ( $r->{content} =~ m#(.+?){5,}#s ); | |||
412 | |||||||
413 | 0 | 0 | $no_next_r = $r; | ||||
414 | 0 | 0 | last; | ||||
415 | } | ||||||
416 | |||||||
417 | #my @grep_next_r = grep { $_->{content} =~ /(上|下)一(章|页|篇)\w{0,20}$/s and $_->{word_num} > 50 } @out_links; | ||||||
418 | 0 | 0 | my @grep_next_r = grep { $_->{content} =~ /(上|下)一(章|页|篇)/s | ||||
419 | 0 | 0 | 0 | and $_->{word_num} > 50 | |||
420 | } @out_links; | ||||||
421 | |||||||
422 | 0 | 0 | my $cc = $no_next_r->{content}; | ||||
423 | 0 | 0 | my $cc_n = $cc =~ s/(\n| ]*>| |
||||
424 | 0 | 0 | 0 | 0 | return $no_next_r if ( ( $cc_n > 5 and $no_next_r->{word_num} > 50) or !@grep_next_r ); | ||
0 | |||||||
425 | |||||||
426 | 0 | 0 | 0 | return $grep_next_r[-1] || {}; | |||
427 | } ## end sub guess_novel_item | ||||||
428 | |||||||
429 | ### }}} | ||||||
430 | |||||||
431 | ### {{{ tiezi | ||||||
432 | |||||||
433 | sub get_tiezi_ref { | ||||||
434 | 0 | 0 | 0 | 0 | my ( $self, $url, %o ) = @_; | ||
435 | |||||||
436 | 0 | 0 | my $class = 'novel'; | ||||
437 | my ( $topic, $item_list ) = $self->{browser}->request_url_whole( | ||||||
438 | $url, | ||||||
439 | |||||||
440 | #post_data => $o{post_data}, | ||||||
441 | info_sub => sub { | ||||||
442 | 0 | 0 | 0 | $self->extract_elements( | |||
443 | @_, | ||||||
444 | path => $self->can( "scrape_$class" )->( $self ), | ||||||
445 | sub => $self->can( "parse_$class" ), | ||||||
446 | ); | ||||||
447 | }, | ||||||
448 | 0 | 0 | 0 | item_list_sub => sub { $self->can( "parse_${class}_item" )->( $self, @_ ) }, | |||
449 | 0 | 0 | 0 | page_list_sub => sub { $self->can( "parse_${class}_list" )->( $self, @_ ) }, | |||
450 | |||||||
451 | #min_page_num => $o{"min_page_num"}, | ||||||
452 | #max_page_num => $o{"max_page_num"}, | ||||||
453 | stop_sub => sub { | ||||||
454 | 0 | 0 | 0 | my ( $info, $data_list, $i ) = @_; | |||
455 | 0 | 0 | $self->{browser}->is_list_overflow( $data_list, $o{"max_item_num"} ); | ||||
456 | }, | ||||||
457 | 0 | 0 | %o, | ||||
458 | ); | ||||||
459 | |||||||
460 | 0 | 0 | $item_list = $self->update_item_list( $item_list, $url ); | ||||
461 | |||||||
462 | 0 | 0 | 0 | unshift @$item_list, $topic if ( $topic->{content} ); | |||
463 | my %r = ( | ||||||
464 | %$topic, | ||||||
465 | writer => $o{writer} || $topic->{writer}, | ||||||
466 | book => $o{book} || $topic->{book} || $topic->{title}, | ||||||
467 | 0 | 0 | 0 | url => $url, | |||
0 | |||||||
468 | item_list => $item_list, | ||||||
469 | ); | ||||||
470 | 0 | 0 | $self->filter_item_list( \%r, %o ); | ||||
471 | |||||||
472 | 0 | 0 | return \%r; | ||||
473 | } ## end sub get_tiezi_ref | ||||||
474 | |||||||
475 | ### }}} | ||||||
476 | |||||||
477 | ### {{{ iterate_ref | ||||||
478 | sub get_iterate_ref { | ||||||
479 | 0 | 0 | 0 | 0 | my ( $self, $url, %o ) = @_; | ||
480 | my ( $info, $item_list ) = $self->{browser}->request_url_whole( | ||||||
481 | $url, | ||||||
482 | post_data => $o{post_data}, | ||||||
483 | info_sub => sub { | ||||||
484 | $self->extract_elements( | ||||||
485 | @_, | ||||||
486 | path => {}, | ||||||
487 | 0 | 0 | sub => sub { my ( $self, $html_ref, $r ) = @_; return $r; }, | ||||
0 | 0 | ||||||
488 | 0 | 0 | 0 | ); | |||
489 | }, | ||||||
490 | 0 | 0 | 0 | item_sub => sub { my ( $self, $html_ref ) = @_; return {}; }, | |||
0 | 0 | ||||||
491 | 0 | 0 | 0 | item_list_sub => sub { my ( $self, $html_ref ) = @_; return []; }, | |||
0 | 0 | ||||||
492 | |||||||
493 | #min_page_num => $o{"min_page_num"}, | ||||||
494 | #max_page_num => $o{"max_page_num"}, | ||||||
495 | stop_sub => sub { | ||||||
496 | 0 | 0 | 0 | my ( $info, $data_list, $i ) = @_; | |||
497 | 0 | 0 | $self->{browser}->is_list_overflow( $data_list, $o{"max_item_num"} ); | ||||
498 | }, | ||||||
499 | 0 | 0 | %o, | ||||
500 | ); | ||||||
501 | |||||||
502 | 0 | 0 | $info->{item_list} = $self->update_item_list( $item_list, $url ); | ||||
503 | |||||||
504 | 0 | 0 | return $info; | ||||
505 | } ## end sub get_iterate_ref | ||||||
506 | |||||||
507 | ### }}} | ||||||
508 | |||||||
509 | ### {{{ base | ||||||
510 | |||||||
511 | sub update_item_list { | ||||||
512 | 5 | 5 | 0 | 14 | my ( $self, $arr, $base_url ) = @_; | ||
513 | |||||||
514 | 5 | 9 | my %rem; | ||||
515 | 5 | 11 | for my $chap (@$arr){ | ||||
516 | 22 | 50 | 0 | 55 | $chap = { url => $chap || '' } if ( ref( $chap ) ne 'HASH' ); | ||
517 | 22 | 100 | 48 | if ( $chap->{url} ) { | |||
518 | 10 | 24 | $chap->{url} = $self->format_abs_url( $chap->{url}, $base_url ); | ||||
519 | 10 | 9283 | $rem{ $chap->{url} }++; | ||||
520 | } | ||||||
521 | } | ||||||
522 | |||||||
523 | 5 | 10 | my $i = 0; | ||||
524 | 5 | 12 | my @res; | ||||
525 | 5 | 10 | for my $chap ( @$arr ) { | ||||
526 | 22 | 50 | 66 | 63 | if($chap->{url} and $rem{ $chap->{url} }>1){ | ||
527 | 0 | 0 | $rem{$chap->{url}}--; | ||||
528 | }else{ | ||||||
529 | 22 | 33 | ++$i; | ||||
530 | 22 | 66 | 67 | $chap->{pid} //= $i; #page id | |||
531 | 22 | 66 | 51 | $chap->{id} //= $i; #item id | |||
532 | 22 | 100 | 54 | $chap->{content} //= ''; | |||
533 | 22 | 50 | 55 | push @res, $chap unless($chap->{content}=~m#正在手打中#s); | |||
534 | } | ||||||
535 | } | ||||||
536 | |||||||
537 | 5 | 33 | 29 | while(@res and $res[-1]{content}=~m#正在手打中#s ){ | |||
538 | 0 | 0 | pop @res; | ||||
539 | } | ||||||
540 | |||||||
541 | #$i = $arr->[-1]{id} if ( $#$arr >= 0 and exists $arr->[-1]{id} and $arr->[-1]{id} > $i ); | ||||||
542 | 5 | 100 | 31 | return wantarray ? ( \@res, $i ) : \@res; | |||
543 | } ## end sub update_item_list | ||||||
544 | |||||||
545 | sub format_abs_url { | ||||||
546 | 12 | 12 | 0 | 28 | my ( $self, $url, $base_url ) = @_; | ||
547 | 12 | 50 | 24 | return $url unless ( $base_url ); | |||
548 | 12 | 100 | 43 | return $url unless ( $base_url =~ /^https?:/ ); | |||
549 | 10 | 36 | my $abs_url = URI->new_abs( $url, $base_url )->as_string; | ||||
550 | } | ||||||
551 | |||||||
552 | sub extract_elements { | ||||||
553 | 2 | 2 | 0 | 12 | my ( $self, $h, %o ) = @_; | ||
554 | 2 | 50 | 11 | $o{path} ||= {}; | |||
555 | |||||||
556 | 2 | 6 | my $r = {}; | ||||
557 | 2 | 6 | while ( my ( $xk, $xr ) = each %{ $o{path} } ) { | ||||
3 | 24 | ||||||
558 | 1 | 11 | $r->{$xk} = $self->scrape_element_try( $h, $xr ); | ||||
559 | } | ||||||
560 | 2 | 50 | 18 | $r = $o{sub}->( $self, $h, $r ) if ( $o{sub} ); | |||
561 | 2 | 13 | return $r; | ||||
562 | } | ||||||
563 | |||||||
564 | sub scrape_element_try { | ||||||
565 | 1 | 1 | 0 | 4 | my ( $self, $h, $r_list, %o ) = @_; | ||
566 | 1 | 3 | my $c; | ||||
567 | 1 | 5 | for my $path_or_regex ( @$r_list ) { | ||||
568 | 2 | 13 | $c = $self->scrape_element( $h, $path_or_regex ); | ||||
569 | 2 | 50 | 13 | next unless ( $c ); | |||
570 | 0 | 0 | 0 | $c = $o{sub}->( $self, $c ) if ( exists $o{sub} ); | |||
571 | 0 | 0 | 0 | next unless ( $c ); | |||
572 | 0 | 0 | return $c; | ||||
573 | } | ||||||
574 | 1 | 8 | return; | ||||
575 | } | ||||||
576 | |||||||
577 | sub scrape_element { | ||||||
578 | 2 | 2 | 0 | 8 | my ( $self, $h, $o ) = @_; | ||
579 | 2 | 50 | 10 | return $self->extract_regex_element( $h, $o->{regex} ) if ( $o->{regex} ); | |||
580 | 2 | 50 | 11 | return $o->{sub}->( $h ) unless ( $o->{path} ); | |||
581 | |||||||
582 | 2 | 50 | 15 | $o->{extract} ||= 'TEXT'; | |||
583 | |||||||
584 | my $parse = $o->{is_list} | ||||||
585 | 0 | 0 | 0 | ? scraper { process $o->{path}, 'data[]' => $o->{extract}; } | |||
586 | 2 | 50 | 2 | 22 | : scraper { process_first $o->{path}, 'data' => $o->{extract}; }; | ||
2 | 123653 | ||||||
587 | 2 | 31 | my $r = $parse->scrape( $h ); | ||||
588 | 2 | 50 | 115359 | return unless ( defined $r->{data} ); | |||
589 | |||||||
590 | 0 | 0 | 0 | return $r->{data} unless ( $o->{sub} ); | |||
591 | 0 | 0 | return $o->{sub}->( $r->{data} ); | ||||
592 | } | ||||||
593 | |||||||
594 | sub extract_regex_element { | ||||||
595 | 0 | 0 | 0 | 0 | my ( $self, $h, $reg ) = @_; | ||
596 | 0 | 0 | my ( $d ) = $$h =~ m#$reg#s; | ||||
597 | 0 | 0 | return $d; | ||||
598 | } | ||||||
599 | |||||||
600 | sub filter_item_list { | ||||||
601 | 2 | 2 | 0 | 5 | my ( $self, $r, %o ) = @_; | ||
602 | |||||||
603 | 2 | 5 | my $flist = $r->{item_list}; | ||||
604 | |||||||
605 | #$r->{item_num} //= $flist->[-1]{id} // scalar( @$flist ); | ||||||
606 | |||||||
607 | 2 | 16 | $flist->[$_]{content} = $self->tidy_content( $flist->[$_]{content} ) for ( 0 .. $#$flist ); | ||||
608 | |||||||
609 | 2 | 7 | $flist = [ grep { $self->{browser}->is_item_in_range( $_->{id}, $o{min_item_num}, $o{max_item_num} ) } @$flist ]; | ||||
6 | 27 | ||||||
610 | |||||||
611 | 2 | 11 | $self->calc_content_wordnum( $_ ) for @$flist; | ||||
612 | |||||||
613 | 0 | 0 | $flist = [ grep { $_->{word_num} >= $o{min_content_word_num} } @$flist ] | ||||
614 | 2 | 50 | 6 | if ( $o{min_content_word_num} ); | |||
615 | |||||||
616 | 0 | 0 | $flist = [ grep { $_->{writer} eq $r->{writer} } @$flist ] | ||||
617 | 2 | 50 | 5 | if ( $o{only_poster} ); | |||
618 | |||||||
619 | 0 | 0 | $flist = [ grep { $_->{content} =~ /$o{grep_content}/s } @$flist ] | ||||
620 | 2 | 50 | 5 | if ( $o{grep_content} ); | |||
621 | |||||||
622 | 0 | 0 | $flist = [ grep { $_->{content} !~ /$o{filter_content}/s } @$flist ] | ||||
623 | 2 | 50 | 6 | if ( $o{filter_content} ); | |||
624 | |||||||
625 | 2 | 50 | 5 | $flist = [ grep { defined $_->{content} and $_->{content} =~ /\S/s } @$flist ]; | |||
6 | 42 | ||||||
626 | |||||||
627 | 2 | 50 | 12 | $r->{item_list} = $flist || []; | |||
628 | |||||||
629 | 2 | 5 | return $self; | ||||
630 | } ## end sub filter_item_list | ||||||
631 | |||||||
632 | sub calc_content_wordnum { | ||||||
633 | 6 | 6 | 0 | 12 | my ( $self, $f ) = @_; | ||
634 | 6 | 50 | 15 | return if ( $f->{word_num} ); | |||
635 | 6 | 50 | 12 | my $wd = $f->{content} || ''; | |||
636 | 6 | 46 | $wd =~ s/<[^>]+>//gs; | ||||
637 | 6 | 25 | $wd =~ s/\s+//sg; | ||||
638 | 6 | 54 | $f->{word_num} = $wd =~ s/\S//gs; | ||||
639 | 6 | 19 | return $f; | ||||
640 | } | ||||||
641 | |||||||
642 | sub tidy_writer_book { | ||||||
643 | 0 | 0 | 0 | 0 | my ( $self, $c ) = @_; | ||
644 | 0 | 0 | 0 | return unless ( defined $c ); | |||
645 | 0 | 0 | for ( $c ) { | ||||
646 | 0 | 0 | s/作\s*者://; | ||||
647 | 0 | 0 | s/^\s*作者-\s*//; | ||||
648 | 0 | 0 | s/小说全集//; | ||||
649 | 0 | 0 | s/作品全集//; | ||||
650 | 0 | 0 | s/专栏//; | ||||
651 | 0 | 0 | s/^.*版权属于作者([^,]+)$/$1/; | ||||
652 | 0 | 0 | s/\s*最新章节\s*$//; | ||||
653 | 0 | 0 | s/全文阅读//; | ||||
654 | 0 | 0 | s/在线阅读//; | ||||
655 | 0 | 0 | s/全集下载//; | ||||
656 | 0 | 0 | s/章节目录//; | ||||
657 | 0 | 0 | s/^\s*《(.*)》\s*$/$1/; | ||||
658 | 0 | 0 | s/^\s+|\s+$//g; | ||||
659 | 0 | 0 | s/\s+//g; | ||||
660 | } | ||||||
661 | 0 | 0 | return $c; | ||||
662 | } ## end sub tidy_writer_book | ||||||
663 | |||||||
664 | sub tidy_content { | ||||||
665 | 6 | 6 | 0 | 13 | my ( $self, $c ) = @_; | ||
666 | 6 | 12 | for ( $c ) { | ||||
667 | 6 | 50 | 12 | last unless ( $c ); | |||
668 | 6 | 14 | s# ##sg; | ||||
669 | 6 | 10 | s# #\n#sg; | ||||
670 | 6 | 18 | s#\s{5,}#\n#sg; | ||||
671 | 6 | 12 | s###sg; | ||||
672 | 6 | 54 | s#\s*\<[^>]+?\>#\n#sg; | ||||
673 | 6 | 32 | s{\n+}{\n}sg; | ||||
674 | 6 | 54 | s{\s*(\S.*?)\s*\n}{\n $1 }sg; |
||||
675 | 6 | 18 | s#\s+上一章\s+.+?下一章.+$##s; | ||||
676 | 6 | 11 | s#[^\n]+加入书签[^\n]+##s; | ||||
677 | } | ||||||
678 | 6 | 23 | return $c; | ||||
679 | } | ||||||
680 | |||||||
681 | sub tidy_string { | ||||||
682 | 4 | 4 | 0 | 10 | my ( $self, $c ) = @_; | ||
683 | 4 | 50 | 9 | $c ||= ''; | |||
684 | 4 | 8 | for ( $c ) { | ||||
685 | 4 | 14 | s/^\s+|\s+$//gs; | ||||
686 | 4 | 8 | s/[\*\/\\\[\(\)]+//g; | ||||
687 | 4 | 16 | s/[[:punct:]]//sg; | ||||
688 | 4 | 9 | s/[\]\s+\/\\]/-/g; | ||||
689 | } | ||||||
690 | |||||||
691 | 4 | 14 | return $c; | ||||
692 | } | ||||||
693 | |||||||
694 | sub get_inner_html { | ||||||
695 | 0 | 0 | 0 | my ( $self, $h ) = @_; | |||
696 | |||||||
697 | 0 | 0 | return '' unless ( $h ); | ||||
698 | |||||||
699 | 0 | my $head_i = index( $h, '>' ); | |||||
700 | 0 | substr( $h, 0, $head_i + 1 ) = ''; | |||||
701 | |||||||
702 | 0 | my $tail_i = rindex( $h, '<' ); | |||||
703 | 0 | substr( $h, $tail_i ) = ''; | |||||
704 | |||||||
705 | 0 | return $h; | |||||
706 | } | ||||||
707 | |||||||
708 | sub unescape_js { | ||||||
709 | 0 | 0 | 0 | my ( $self, $s ) = @_; | |||
710 | 0 | $s =~ s/%u([0-9a-f]{4})/chr(hex($1))/eigs; | |||||
0 | |||||||
711 | 0 | $s =~ s/%([0-9a-f]{2})/chr(hex($1))/eigs; | |||||
0 | |||||||
712 | 0 | return $s; | |||||
713 | } | ||||||
714 | |||||||
715 | sub encode_cjk_for_url { | ||||||
716 | 0 | 0 | 0 | my ( $self, $key ) = @_; | |||
717 | 0 | my $b = uc( unpack( "H*", encode( $self->charset(), $key ) ) ); | |||||
718 | 0 | $b =~ s/(..)/%$1/g; | |||||
719 | 0 | return $b; | |||||
720 | } | ||||||
721 | |||||||
722 | ### }}} | ||||||
723 | |||||||
724 | 1; | ||||||
725 |