blib/lib/Novel/Robot/Parser.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 235 | 383 | 61.3 |
branch | 57 | 136 | 41.9 |
condition | 43 | 118 | 36.4 |
subroutine | 34 | 59 | 57.6 |
pod | 3 | 34 | 8.8 |
total | 372 | 730 | 50.9 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # ABSTRACT: get novel / bbs content from website | ||||||
2 | package Novel::Robot::Parser; | ||||||
3 | |||||||
4 | 3 | 3 | 1333 | use strict; | |||
3 | 11 | ||||||
3 | 72 | ||||||
5 | 3 | 3 | 12 | use warnings; | |||
3 | 4 | ||||||
3 | 64 | ||||||
6 | 3 | 3 | 446 | use utf8; | |||
3 | 14 | ||||||
3 | 10 | ||||||
7 | |||||||
8 | 3 | 3 | 1113 | use Novel::Robot::Browser; | |||
3 | 27 | ||||||
3 | 97 | ||||||
9 | 3 | 3 | 16 | use URI; | |||
3 | 5 | ||||||
3 | 45 | ||||||
10 | 3 | 3 | 14 | use Encode; | |||
3 | 4 | ||||||
3 | 186 | ||||||
11 | 3 | 3 | 1263 | use Web::Scraper; | |||
3 | 194758 | ||||||
3 | 29 | ||||||
12 | 3 | 3 | 168 | use HTML::TreeBuilder; | |||
3 | 6 | ||||||
3 | 28 | ||||||
13 | 3 | 3 | 69 | use Data::Dumper; | |||
3 | 5 | ||||||
3 | 4303 | ||||||
14 | |||||||
15 | ### {{{ data | ||||||
16 | |||||||
17 | #our $VERSION = 0.32; | ||||||
18 | |||||||
19 | our %SITE_DOM_NAME = ( | ||||||
20 | 'bbs.jjwxc.net' => 'hjj', | ||||||
21 | 'www.jjwxc.net' => 'jjwxc', | ||||||
22 | 'tieba.baidu.com' => 'tieba', | ||||||
23 | |||||||
24 | 'www.bearead.com' => 'bearead', | ||||||
25 | 'wwwj.bearead.com' => 'bearead', | ||||||
26 | 'www.ddshu.net' => 'ddshu', | ||||||
27 | 'www.kanunu8.com' => 'kanunu8', | ||||||
28 | ); | ||||||
29 | |||||||
30 | our %NULL_INDEX = ( | ||||||
31 | url => '', | ||||||
32 | book => '', | ||||||
33 | writer => '', | ||||||
34 | writer_url => '', | ||||||
35 | item_list => [], | ||||||
36 | |||||||
37 | intro => '', | ||||||
38 | series => '', | ||||||
39 | progress => '', | ||||||
40 | word_num => '', | ||||||
41 | ); | ||||||
42 | |||||||
43 | our %NULL_CHAPTER = ( | ||||||
44 | content => '', | ||||||
45 | id => 0, | ||||||
46 | pid => 0, | ||||||
47 | time => '', | ||||||
48 | title => '', | ||||||
49 | url => '', | ||||||
50 | writer => '', | ||||||
51 | writer_say => '', | ||||||
52 | abstract => '', | ||||||
53 | word_num => '', | ||||||
54 | type => '', | ||||||
55 | ); | ||||||
56 | |||||||
57 | ### }}} | ||||||
58 | |||||||
59 | ### init {{{ | ||||||
60 | sub new { | ||||||
61 | 4 | 4 | 1 | 1543 | my ( $self, %opt ) = @_; | ||
62 | |||||||
63 | 4 | 20 | $opt{site} = $self->detect_site( $opt{site} ); | ||||
64 | |||||||
65 | 4 | 15 | my $module = "Novel::Robot::Parser::$opt{site}"; | ||||
66 | 4 | 262 | eval "require $module;"; | ||||
67 | |||||||
68 | 4 | 56 | my $browser = Novel::Robot::Browser->new( %opt ); | ||||
69 | 4 | 53 | bless { browser => $browser, %opt }, $module; | ||||
70 | } | ||||||
71 | |||||||
72 | 0 | 0 | sub domain { } | ||||
73 | |||||||
74 | sub detect_domain { | ||||||
75 | 0 | 0 | 0 | 0 | my ( $self, $url ) = @_; | ||
76 | 0 | 0 | 0 | return ( $url, $url ) unless ( $url =~ /^https?:/ ); | |||
77 | |||||||
78 | 0 | 0 | my ( $dom ) = $url =~ m#^.*?\/\/(.+?)(?:/|$)#; | ||||
79 | |||||||
80 | 0 | 0 | my $base_dom = $dom; | ||||
81 | 0 | 0 | $base_dom =~ s/^[^.]+\.//; | ||||
82 | 0 | 0 | 0 | $base_dom = $base_dom =~ /\./ ? $base_dom : $dom; | |||
83 | 0 | 0 | return ( $dom, $base_dom ); | ||||
84 | } | ||||||
85 | |||||||
86 | sub detect_site { | ||||||
87 | 4 | 4 | 0 | 13 | my ( $self, $url ) = @_; | ||
88 | |||||||
89 | 4 | 50 | 33 | 28 | if ( $url and $url =~ /^https?:/ ) { | ||
90 | 0 | 0 | my ( $dom ) = $url =~ m#^.*?\/\/(.+?)/#; | ||||
91 | 0 | 0 | 0 | return $SITE_DOM_NAME{$dom} if ( exists $SITE_DOM_NAME{$dom} ); | |||
92 | 0 | 0 | return 'default'; | ||||
93 | } | ||||||
94 | |||||||
95 | 4 | 50 | 18 | return $url // 'default'; | |||
96 | } | ||||||
97 | ### }}} | ||||||
98 | |||||||
99 | ### {{{ common | ||||||
100 | 3 | 3 | 0 | 11 | sub site_type { 'novel' } | ||
101 | 0 | 0 | 0 | 0 | sub charset { 'cp936' } | ||
102 | 0 | 0 | sub base_url { } | ||||
103 | |||||||
104 | sub generate_novel_url { | ||||||
105 | 0 | 0 | 0 | 0 | my ( $self, $index_url, @args ) = @_; | ||
106 | 0 | 0 | return ( $index_url, @args ); | ||||
107 | } | ||||||
108 | |||||||
109 | ### }}} | ||||||
110 | |||||||
111 | ### {{{ novel | ||||||
112 | |||||||
113 | sub get_novel_info { | ||||||
114 | 1 | 1 | 1 | 15 | my ( $self, $url ) = @_; | ||
115 | 1 | 5 | my ( $i_url, $post_data ) = $self->generate_novel_url( $url ); | ||||
116 | 1 | 10 | my $c = $self->{browser}->request_url( $i_url, $post_data ); | ||||
117 | |||||||
118 | 1 | 23 | my $r = $self->extract_elements( | ||||
119 | \$c, | ||||||
120 | path => $self->scrape_novel(), | ||||||
121 | sub => $self->can( "parse_novel" ), | ||||||
122 | ); | ||||||
123 | 1 | 12 | $r->{item_list} = $self->parse_item_list( \$c, $r ); | ||||
124 | 1 | 8 | ( $r->{item_list}, $r->{item_num} ) = $self->update_item_list( $r->{item_list}, $url ); | ||||
125 | 1 | 5 | return $r; | ||||
126 | } | ||||||
127 | |||||||
128 | sub get_novel_ref { | ||||||
129 | 3 | 3 | 1 | 820 | my ( $self, $index_url, %o ) = @_; | ||
130 | |||||||
131 | 3 | 50 | 14 | return $self->get_tiezi_ref( $index_url, %o ) if ( $self->site_type() eq 'tiezi' ); | |||
132 | |||||||
133 | 3 | 6 | my ( $r, $item_list, $max_item_num ); | ||||
134 | 3 | 100 | 16 | if ( $index_url !~ /^https?:/ ) { | |||
135 | 2 | 8 | $r = $self->parse_novel( $index_url, %o ); | ||||
136 | } else { | ||||||
137 | 1 | 4 | my ( $i_url, $post_data ) = $self->generate_novel_url( $index_url ); | ||||
138 | |||||||
139 | ( $r, $item_list ) = $self->{browser}->request_url_whole( | ||||||
140 | $i_url, | ||||||
141 | post_data => $post_data, | ||||||
142 | info_sub => sub { | ||||||
143 | 1 | 1 | 19 | $self->extract_elements( | |||
144 | @_, | ||||||
145 | path => $self->scrape_novel(), | ||||||
146 | sub => $self->can( "parse_novel" ), | ||||||
147 | ); | ||||||
148 | }, | ||||||
149 | 1 | 1 | 11 | item_list_sub => sub { $self->can( "parse_item_list" )->( $self, @_ ) }, | |||
150 | item_sub => sub { | ||||||
151 | 1 | 1 | 23 | $self->extract_elements( | |||
152 | @_, | ||||||
153 | path => $self->scrape_novel_item(), | ||||||
154 | sub => $self->can( "parse_novel_item" ), | ||||||
155 | ); | ||||||
156 | }, | ||||||
157 | 1 | 11 | %o, | ||||
158 | ); | ||||||
159 | |||||||
160 | 1 | 13 | $r->{url} = $index_url; | ||||
161 | 1 | 50 | 9 | $r->{item_list} = $item_list || []; | |||
162 | |||||||
163 | #$r->{item_num} = $max_item_num || undef; | ||||||
164 | } ## end else [ if ( $index_url !~ /^https?:/)] | ||||||
165 | |||||||
166 | 3 | 18 | ( $r->{item_list}, $r->{item_num} ) = $self->update_item_list( $r->{item_list}, $index_url ); | ||||
167 | 3 | 23 | $self->filter_item_list( $r, %o ); | ||||
168 | 3 | 15 | $r->{writer_url} = $self->format_abs_url( $r->{writer_url}, $index_url ); | ||||
169 | |||||||
170 | 3 | 433 | for my $k ( qw/writer book/ ) { | ||||
171 | 6 | 50 | 15 | $r->{$k} = $o{$k} if ( exists $o{$k} ); | |||
172 | } | ||||||
173 | 3 | 66 | 61 | $r->{$_} ||= $NULL_INDEX{$_} for keys( %NULL_INDEX ); | |||
174 | 3 | 18 | $r->{$_} = $self->tidy_string( $r->{$_} ) for qw/writer book/; | ||||
175 | |||||||
176 | 3 | 14 | return $r; | ||||
177 | } ## end sub get_novel_ref | ||||||
178 | |||||||
179 | sub scrape_novel { | ||||||
180 | 2 | 2 | 0 | 8 | my ( $self ) = @_; | ||
181 | 2 | 5 | my $r = {}; | ||||
182 | 2 | 50 | 9 | push @{$r->{book}}, { path => $self->{book_path} } if ( exists $self->{book_path} ); | |||
0 | 0 | ||||||
183 | 2 | 50 | 13 | push @{$r->{book}}, { regex => $self->{book_regex} } if ( exists $self->{book_regex} ); | |||
0 | 0 | ||||||
184 | 2 | 50 | 8 | push @{$r->{writer}}, { path => $self->{writer_path} } if ( exists $self->{writer_path} ); | |||
0 | 0 | ||||||
185 | 2 | 50 | 11 | push @{$r->{writer}}, { regex => $self->{writer_regex} } if ( exists $self->{writer_regex} ); | |||
0 | 0 | ||||||
186 | 2 | 51 | return $r; | ||||
187 | } | ||||||
188 | |||||||
189 | sub parse_novel { | ||||||
190 | 0 | 0 | 0 | 0 | my ( $self, $h, $r ) = @_; | ||
191 | |||||||
192 | 0 | 0 | 0 | $r->{book} ||= $self->scrape_element_try( | |||
193 | $h, | ||||||
194 | [ { path => '//meta[@name="og:novel:book_name"]', extract => '@content' }, | ||||||
195 | { path => '//meta[@property="og:novel:book_name"]', extract => '@content' }, | ||||||
196 | { path => '//meta[@property="og:title"]', extract => '@content' }, | ||||||
197 | { path => '//div[@id="title"]', }, | ||||||
198 | { path => '//div[@class="title"]', }, | ||||||
199 | { regex => qr# |
||||||
200 | { regex => qr# |
||||||
201 | { regex => qr# |
||||||
202 | { path => '//h1', }, | ||||||
203 | { path => '//h2', }, | ||||||
204 | ], | ||||||
205 | sub => $self->can( "tidy_writer_book" ), | ||||||
206 | ); | ||||||
207 | |||||||
208 | 0 | 0 | 0 | $r->{writer} ||= $self->scrape_element_try( | |||
209 | $h, | ||||||
210 | [ { path => '//meta[@name="author"]', extract => '@content' }, | ||||||
211 | |||||||
212 | { path => '//meta[@name="og:novel:author"]', extract => '@content' }, | ||||||
213 | { path => '//meta[@property="og:novel:author"]', extract => '@content' }, | ||||||
214 | { path => '//*[@class="author"]', }, | ||||||
215 | { path => '//*[@class="writer"]', }, | ||||||
216 | 3 | 3 | 36 | { regex => qr#作者:([^<]+)#si, }, | |||
3 | 6 | ||||||
3 | 53 | ||||||
217 | { regex => qr#作者:([^<]+)#si, }, | ||||||
218 | { regex => qr#<(?:em|i|h3|h2|span)>作者:([^<]+)(?:em|i|h3|h2|span)>#si, }, | ||||||
219 | { regex => qr#作者:(?:)?]*>([^<]+)#si, }, | ||||||
220 | { regex => qr# 作(?: |\s)*者:([^<]+) #si, }, |
||||||
221 | { regex => qr#作者:([^<]+?) 发布时间:#s, }, | ||||||
222 | { regex => qr#content="([^"]+?)最新著作#s, }, | ||||||
223 | { regex => qr# |
||||||
224 | { regex => qr#content="[^"]+?,([^",]+?)作品#s, }, | ||||||
225 | ], | ||||||
226 | sub => $self->can( "tidy_writer_book" ), | ||||||
227 | ); | ||||||
228 | |||||||
229 | 0 | 0 | $r->{$_} = $self->tidy_writer_book( $r->{$_} ) for qw/writer book title/; | ||||
230 | |||||||
231 | 0 | 0 | return $r; | ||||
232 | } ## end sub parse_novel | ||||||
233 | |||||||
234 | sub scrape_item_list { | ||||||
235 | 1 | 1 | 0 | 3 | my ( $self ) = @_; | ||
236 | 1 | 2 | my $r = {}; | ||||
237 | 1 | 50 | 6 | $r->{path} = $self->{item_list_path} if ( exists $self->{item_list_path} ); | |||
238 | 1 | 2 | return $r; | ||||
239 | } | ||||||
240 | |||||||
241 | sub parse_item_list { | ||||||
242 | 2 | 2 | 0 | 7 | my ( $self, $h, $r ) = @_; | ||
243 | |||||||
244 | 2 | 100 | 13 | return $r->{item_list} if ( exists $r->{item_list} ); | |||
245 | |||||||
246 | 1 | 7 | my $path_r = $self->scrape_item_list(); | ||||
247 | |||||||
248 | 1 | 50 | 10 | return $self->guess_item_list( $h ) unless ( exists $path_r->{path} ); | |||
249 | |||||||
250 | my $parse_novel = scraper { | ||||||
251 | process $path_r->{path}, | ||||||
252 | 0 | 0 | 0 | 'item_list[]' => { | |||
253 | 'title' => 'TEXT', | ||||||
254 | 'url' => '@href' | ||||||
255 | }; | ||||||
256 | 0 | 0 | }; | ||||
257 | 0 | 0 | my $ref = $parse_novel->scrape( $h ); | ||||
258 | |||||||
259 | 0 | 0 | 0 | my @chap = grep { exists $_->{url} and $_->{url} } @{ $ref->{item_list} }; | |||
0 | 0 | ||||||
0 | 0 | ||||||
260 | |||||||
261 | 0 | 0 | 0 | if ( $path_r->{sort} ) { | |||
262 | 0 | 0 | @chap = sort { $a->{url} cmp $b->{url} } @chap; | ||||
0 | 0 | ||||||
263 | } | ||||||
264 | |||||||
265 | 0 | 0 | return \@chap; | ||||
266 | } ## end sub parse_item_list | ||||||
267 | |||||||
268 | sub guess_item_list { | ||||||
269 | 1 | 1 | 0 | 4 | my ( $self, $h, %opt ) = @_; | ||
270 | |||||||
271 | 1 | 3 | my $new_h = $$h; | ||||
272 | 1 | 65 | $new_h=~s# |
||||
273 | |||||||
274 | 1 | 11 | my $tree = HTML::TreeBuilder->new(); | ||||
275 | 1 | 330 | $tree->parse( $new_h ); | ||||
276 | |||||||
277 | 1 | 27770 | my @links = $tree->look_down( '_tag', 'a' ); | ||||
278 | 1 | 873 | @links = grep { $_->attr( 'href' ) } @links; | ||||
24 | 189 | ||||||
279 | 1 | 12 | for my $x ( @links ) { | ||||
280 | 24 | 842 | my $up_url = $x->attr( 'href' ); | ||||
281 | 24 | 257 | $up_url =~ s#/[^/]+/?$#/#; | ||||
282 | 24 | 100 | 55 | $up_url = '.' if ( $up_url !~ m#/# ); | |||
283 | |||||||
284 | 24 | 46 | $x = { parent => $up_url, depth => $x->depth(), url => $x->attr( 'href' ), title => $x->as_text() }; | ||||
285 | } | ||||||
286 | |||||||
287 | 1 | 69 | my @out_links; | ||||
288 | 1 | 3 | my @temp_arr = ( $links[0] ); | ||||
289 | 1 | 2 | my $parent = $links[0]{parent}; | ||||
290 | 1 | 3 | my $depth = $links[0]{depth}; | ||||
291 | 1 | 5 | for ( my $i = 1 ; $i <= $#links ; $i++ ) { | ||||
292 | 23 | 100 | 100 | 55 | if ( $depth == $links[$i]{depth} and $parent eq $links[$i]{parent} ) { | ||
293 | 8 | 15 | push @temp_arr, $links[$i]; | ||||
294 | } else { | ||||||
295 | 15 | 24 | push @out_links, [@temp_arr]; | ||||
296 | 15 | 19 | @temp_arr = ( $links[$i] ); | ||||
297 | 15 | 18 | $depth = $links[$i]{depth}; | ||||
298 | 15 | 33 | $parent = $links[$i]{parent}; | ||||
299 | } | ||||||
300 | } | ||||||
301 | |||||||
302 | 1 | 50 | 5 | push @out_links, \@temp_arr if ( @temp_arr ); | |||
303 | |||||||
304 | 1 | 6 | @out_links = sort { scalar( @$b ) <=> scalar( @$a ) } @out_links; | ||||
36 | 40 | ||||||
305 | |||||||
306 | 1 | 2 | my $res_arr; | ||||
307 | 1 | 5 | my $title_regex = | ||||
308 | qr/引子|楔子|内容简介|正文|序言|文案|第\s*[0123456789零○〇一二三四五六七八九十百千\d]+\s*(章|节)|(^[0-9]+)/; | ||||||
309 | 1 | 4 | my $chap_num_regex = qr/(^|\/)\d+(\.html)?$/; | ||||
310 | 1 | 4 | for my $arr ( @out_links ) { | ||||
311 | 3 | 4 | my $x = $arr->[0]; | ||||
312 | 3 | 4 | my $y = $arr->[1]; | ||||
313 | 3 | 5 | my $z = $arr->[-1]; | ||||
314 | |||||||
315 | 3 | 50 | 33 | 17 | $res_arr = $arr if ( $opt{chapter_url_regex} and $x->{url} =~ /$opt{chapter_url_regex}/ ); | ||
316 | 3 | 50 | 33 | 10 | $res_arr = $arr if ( $opt{chapter_title_regex} and $x->{title} =~ /$opt{chapter_title_regex}/ ); | ||
317 | $res_arr = $arr | ||||||
318 | 3 | 50 | 33 | 44 | if ( $x->{title} =~ /$title_regex/ or ( $y and $y->{title} =~ /$title_regex/ ) or ( $z and $z->{title} =~ /$title_regex/ ) ); | ||
66 | |||||||
33 | |||||||
33 | |||||||
319 | 3 | 50 | 66 | 34 | $res_arr = $arr if ( ( $x->{url} =~ /$chap_num_regex/ or $z->{url} =~ /$chap_num_regex/ ) and scalar( @$arr ) > 50 ); | ||
66 | |||||||
320 | |||||||
321 | #$res_arr= $arr if( ($x->{url}=~/\/?\d+$/ or $z->{url}=~/\/?\d+$/) and scalar(@$arr)>50); | ||||||
322 | 3 | 100 | 9 | last if ( $res_arr ); | |||
323 | } | ||||||
324 | |||||||
325 | #remove not chapter url | ||||||
326 | 1 | 2 | while ( 1 ) { | ||||
327 | 1 | 2 | my $x = $res_arr->[0]; | ||||
328 | 1 | 18 | my $y = $res_arr->[ int( $#$res_arr / 2 ) ]; | ||||
329 | 1 | 50 | 33 | 56 | if ( defined $y->{title} and $y->{title} =~ /$title_regex/ and defined $y->{url} and $y->{url} =~ /\.html$/ and $x->{url} !~ /\.html$/ ) { | ||
50 | 33 | ||||||
33 | |||||||
33 | |||||||
33 | |||||||
33 | |||||||
33 | |||||||
33 | |||||||
330 | 0 | 0 | shift( @$res_arr ); | ||||
331 | } elsif ( defined $y->{title} and $y->{title} =~ /$title_regex/ and defined $y->{url} and $y->{url} =~ /$chap_num_regex/ and $x->{url} !~ /$chap_num_regex/ ) { | ||||||
332 | 0 | 0 | shift( @$res_arr ); | ||||
333 | } else { | ||||||
334 | 1 | 3 | last; | ||||
335 | } | ||||||
336 | } | ||||||
337 | |||||||
338 | #sort chapter url | ||||||
339 | 1 | 50 | 33 | 21 | if ( $res_arr and defined $res_arr->[0]{url} and $res_arr->[0]{url} =~ /$chap_num_regex/ ) { | ||
33 | |||||||
340 | 1 | 3 | 6 | my $trim_sub = sub { my $s = $_[0]; $s =~ s/^.+\///; $s =~ s/\.html$//; return $s }; | |||
3 | 5 | ||||||
3 | 14 | ||||||
3 | 5 | ||||||
3 | 6 | ||||||
341 | 1 | 3 | my @sort_arr; | ||||
342 | 1 | 50 | 13 | if($opt{sort_chapter_url}){ | |||
343 | 0 | 0 | @sort_arr = sort { $trim_sub->( $a->{url} ) <=> $trim_sub->( $b->{url} ) } grep { $_->{url} =~ /$chap_num_regex/ } @$res_arr; | ||||
0 | 0 | ||||||
0 | 0 | ||||||
344 | }else{ | ||||||
345 | 1 | 10 | @sort_arr = @$res_arr; | ||||
346 | } | ||||||
347 | 1 | 5 | my @s = map { $trim_sub->( $_->{url} ) } @sort_arr; | ||||
3 | 9 | ||||||
348 | 1 | 3 | my $random_sort = 0; | ||||
349 | 1 | 5 | for my $i ( 0 .. $#s - 1 ) { | ||||
350 | 2 | 50 | 8 | $random_sort = 1 if ( $s[$i] > $s[ $i + 1 ] ); | |||
351 | 2 | 50 | 5 | last if ( $random_sort ); | |||
352 | } | ||||||
353 | 1 | 50 | 108 | return \@sort_arr if ( $random_sort == 0 ); | |||
354 | } | ||||||
355 | |||||||
356 | 0 | 0 | 0 | return $res_arr || []; | |||
357 | } ## end sub guess_item_list | ||||||
358 | |||||||
359 | sub scrape_novel_item { | ||||||
360 | 2 | 2 | 0 | 31 | my ( $self ) = @_; | ||
361 | 2 | 9 | my $r = {}; | ||||
362 | 2 | 50 | 16 | push @{$r->{content}}, { path => $self->{content_path}, extract => 'HTML' } if( exists $self->{content_path} ); | |||
0 | 0 | ||||||
363 | 2 | 50 | 11 | push @{$r->{content}}, { regex => $self->{content_regex} } if( exists $self->{content_regex} ); | |||
0 | 0 | ||||||
364 | 2 | 4 | push @{$r->{content}}, ( | ||||
2 | 17 | ||||||
365 | { path => '//div[@class="novel_content"]' }, | ||||||
366 | { path => '//div[@id="content"]' }, | ||||||
367 | ); | ||||||
368 | 2 | 28 | return $r; | ||||
369 | } | ||||||
370 | |||||||
371 | sub parse_novel_item { | ||||||
372 | 0 | 0 | 0 | 0 | my ( $self, $h, $r ) = @_; | ||
373 | |||||||
374 | 0 | 0 | 0 | $r = $self->guess_novel_item( $h ) unless ( $r->{content} ); | |||
375 | 0 | 0 | 0 | $r->{$_} ||= $NULL_CHAPTER{$_} for keys( %NULL_CHAPTER ); | |||
376 | 0 | 0 | $r->{content} = $self->tidy_content( $r->{content} ); | ||||
377 | 0 | 0 | return $r; | ||||
378 | } | ||||||
379 | |||||||
380 | sub guess_novel_item { | ||||||
381 | 0 | 0 | 0 | 0 | my ( $self, $h, %opt ) = @_; | ||
382 | |||||||
383 | 0 | 0 | $$h =~ s###sg; | ||||
384 | 0 | 0 | $$h =~ s###sg; | ||||
385 | |||||||
386 | 0 | 0 | my $tree = HTML::TreeBuilder->new(); | ||||
387 | 0 | 0 | $tree->parse( $$h ); | ||||
388 | |||||||
389 | 0 | 0 | my @links = $tree->look_down( 'text', undef ); | ||||
390 | 0 | 0 | for my $x ( @links ) { | ||||
391 | 0 | 0 | $x = { content => $x->as_HTML( '<>&' ) }; | ||||
392 | 0 | 0 | $self->calc_content_wordnum( $x ); | ||||
393 | } | ||||||
394 | 0 | 0 | my @out_links = sort { $b->{word_num} <=> $a->{word_num} } @links; | ||||
0 | 0 | ||||||
395 | |||||||
396 | 0 | 0 | my $no_next_r; | ||||
397 | 0 | 0 | for my $r ( @out_links ) { | ||||
398 | 0 | 0 | 0 | next if ( $r->{content} =~ m#(style|head|body|html)>#s ); | |||
399 | 0 | 0 | 0 | next if ( $r->{content} =~ m#^\s* | |||
400 | 0 | 0 | 0 | next if ( $r->{content} =~ /(上|下)一(章|页|篇)/s ); | |||
401 | 0 | 0 | 0 | next if ( $r->{content} =~ m##s ); | |||
402 | 0 | 0 | 0 | next if ( $r->{content} =~ m#All rights reserved#s ); | |||
403 | 0 | 0 | 0 | next if ( $r->{content} =~ m#(.+?){5,}#s ); | |||
404 | |||||||
405 | 0 | 0 | $no_next_r = $r; | ||||
406 | 0 | 0 | last; | ||||
407 | } | ||||||
408 | |||||||
409 | #my @grep_next_r = grep { $_->{content} =~ /(上|下)一(章|页|篇)\w{0,20}$/s and $_->{word_num} > 50 } @out_links; | ||||||
410 | 0 | 0 | my @grep_next_r = grep { $_->{content} =~ /(上|下)一(章|页|篇)/s | ||||
411 | 0 | 0 | 0 | and $_->{word_num} > 50 | |||
412 | } @out_links; | ||||||
413 | |||||||
414 | 0 | 0 | my $cc = $no_next_r->{content}; | ||||
415 | 0 | 0 | my $cc_n = $cc =~ s/(\n| ]*>| |
||||
416 | 0 | 0 | 0 | 0 | return $no_next_r if ( ( $cc_n > 5 and $no_next_r->{word_num} > 50) or !@grep_next_r ); | ||
0 | |||||||
417 | |||||||
418 | 0 | 0 | 0 | return $grep_next_r[-1] || {}; | |||
419 | } ## end sub guess_novel_item | ||||||
420 | |||||||
421 | ### }}} | ||||||
422 | |||||||
423 | ### {{{ tiezi | ||||||
424 | |||||||
425 | sub get_tiezi_ref { | ||||||
426 | 0 | 0 | 0 | 0 | my ( $self, $url, %o ) = @_; | ||
427 | |||||||
428 | 0 | 0 | my $class = 'novel'; | ||||
429 | my ( $topic, $item_list ) = $self->{browser}->request_url_whole( | ||||||
430 | $url, | ||||||
431 | |||||||
432 | #post_data => $o{post_data}, | ||||||
433 | info_sub => sub { | ||||||
434 | 0 | 0 | 0 | $self->extract_elements( | |||
435 | @_, | ||||||
436 | path => $self->can( "scrape_$class" )->( $self ), | ||||||
437 | sub => $self->can( "parse_$class" ), | ||||||
438 | ); | ||||||
439 | }, | ||||||
440 | 0 | 0 | 0 | item_list_sub => sub { $self->can( "parse_${class}_item" )->( $self, @_ ) }, | |||
441 | 0 | 0 | 0 | page_list_sub => sub { $self->can( "parse_${class}_list" )->( $self, @_ ) }, | |||
442 | |||||||
443 | #min_page_num => $o{"min_page_num"}, | ||||||
444 | #max_page_num => $o{"max_page_num"}, | ||||||
445 | stop_sub => sub { | ||||||
446 | 0 | 0 | 0 | my ( $info, $data_list, $i ) = @_; | |||
447 | 0 | 0 | $self->{browser}->is_list_overflow( $data_list, $o{"max_item_num"} ); | ||||
448 | }, | ||||||
449 | 0 | 0 | %o, | ||||
450 | ); | ||||||
451 | |||||||
452 | 0 | 0 | $item_list = $self->update_item_list( $item_list, $url ); | ||||
453 | |||||||
454 | 0 | 0 | 0 | unshift @$item_list, $topic if ( $topic->{content} ); | |||
455 | my %r = ( | ||||||
456 | %$topic, | ||||||
457 | writer => $o{writer} || $topic->{writer}, | ||||||
458 | book => $o{book} || $topic->{book} || $topic->{title}, | ||||||
459 | 0 | 0 | 0 | url => $url, | |||
0 | |||||||
460 | item_list => $item_list, | ||||||
461 | ); | ||||||
462 | 0 | 0 | $self->filter_item_list( \%r, %o ); | ||||
463 | |||||||
464 | 0 | 0 | return \%r; | ||||
465 | } ## end sub get_tiezi_ref | ||||||
466 | |||||||
467 | ### }}} | ||||||
468 | |||||||
469 | ### {{{ iterate_ref | ||||||
470 | sub get_iterate_ref { | ||||||
471 | 0 | 0 | 0 | 0 | my ( $self, $url, %o ) = @_; | ||
472 | my ( $info, $item_list ) = $self->{browser}->request_url_whole( | ||||||
473 | $url, | ||||||
474 | post_data => $o{post_data}, | ||||||
475 | info_sub => sub { | ||||||
476 | $self->extract_elements( | ||||||
477 | @_, | ||||||
478 | path => {}, | ||||||
479 | 0 | 0 | sub => sub { my ( $self, $html_ref, $r ) = @_; return $r; }, | ||||
0 | 0 | ||||||
480 | 0 | 0 | 0 | ); | |||
481 | }, | ||||||
482 | 0 | 0 | 0 | item_sub => sub { my ( $self, $html_ref ) = @_; return {}; }, | |||
0 | 0 | ||||||
483 | 0 | 0 | 0 | item_list_sub => sub { my ( $self, $html_ref ) = @_; return []; }, | |||
0 | 0 | ||||||
484 | |||||||
485 | #min_page_num => $o{"min_page_num"}, | ||||||
486 | #max_page_num => $o{"max_page_num"}, | ||||||
487 | stop_sub => sub { | ||||||
488 | 0 | 0 | 0 | my ( $info, $data_list, $i ) = @_; | |||
489 | 0 | 0 | $self->{browser}->is_list_overflow( $data_list, $o{"max_item_num"} ); | ||||
490 | }, | ||||||
491 | 0 | 0 | %o, | ||||
492 | ); | ||||||
493 | |||||||
494 | 0 | 0 | $info->{item_list} = $self->update_item_list( $item_list, $url ); | ||||
495 | |||||||
496 | 0 | 0 | return $info; | ||||
497 | } ## end sub get_iterate_ref | ||||||
498 | |||||||
499 | ### }}} | ||||||
500 | |||||||
501 | ### {{{ base | ||||||
502 | |||||||
503 | sub update_item_list { | ||||||
504 | 6 | 6 | 0 | 15 | my ( $self, $arr, $base_url ) = @_; | ||
505 | |||||||
506 | 6 | 9 | my %rem; | ||||
507 | 6 | 12 | for my $chap (@$arr){ | ||||
508 | 25 | 50 | 0 | 55 | $chap = { url => $chap || '' } if ( ref( $chap ) ne 'HASH' ); | ||
509 | 25 | 100 | 51 | if ( $chap->{url} ) { | |||
510 | 13 | 58 | $chap->{url} = $self->format_abs_url( $chap->{url}, $base_url ); | ||||
511 | 13 | 8503 | $rem{ $chap->{url} }++; | ||||
512 | } | ||||||
513 | } | ||||||
514 | |||||||
515 | 6 | 11 | my $i = 0; | ||||
516 | 6 | 11 | my @res; | ||||
517 | 6 | 11 | for my $chap ( @$arr ) { | ||||
518 | 25 | 50 | 66 | 72 | if($chap->{url} and $rem{ $chap->{url} }>1){ | ||
519 | 0 | 0 | $rem{$chap->{url}}--; | ||||
520 | }else{ | ||||||
521 | 25 | 29 | ++$i; | ||||
522 | 25 | 66 | 67 | $chap->{pid} //= $i; #page id | |||
523 | 25 | 66 | 50 | $chap->{id} //= $i; #item id | |||
524 | 25 | 100 | 58 | $chap->{content} //= ''; | |||
525 | 25 | 50 | 121 | push @res, $chap unless($chap->{content}=~m#正在手打中#s); | |||
526 | } | ||||||
527 | } | ||||||
528 | |||||||
529 | 6 | 33 | 34 | while(@res and $res[-1]{content}=~m#正在手打中#s ){ | |||
530 | 0 | 0 | pop @res; | ||||
531 | } | ||||||
532 | |||||||
533 | #$i = $arr->[-1]{id} if ( $#$arr >= 0 and exists $arr->[-1]{id} and $arr->[-1]{id} > $i ); | ||||||
534 | 6 | 100 | 31 | return wantarray ? ( \@res, $i ) : \@res; | |||
535 | } ## end sub update_item_list | ||||||
536 | |||||||
537 | sub format_abs_url { | ||||||
538 | 16 | 16 | 0 | 33 | my ( $self, $url, $base_url ) = @_; | ||
539 | 16 | 50 | 29 | return $url unless ( $base_url ); | |||
540 | 16 | 100 | 53 | return $url unless ( $base_url =~ /^https?:/ ); | |||
541 | 14 | 48 | my $abs_url = URI->new_abs( $url, $base_url )->as_string; | ||||
542 | } | ||||||
543 | |||||||
544 | sub extract_elements { | ||||||
545 | 4 | 4 | 0 | 27 | my ( $self, $h, %o ) = @_; | ||
546 | 4 | 50 | 18 | $o{path} ||= {}; | |||
547 | |||||||
548 | 4 | 13 | my $r = {}; | ||||
549 | 4 | 8 | while ( my ( $xk, $xr ) = each %{ $o{path} } ) { | ||||
6 | 40 | ||||||
550 | 2 | 17 | $r->{$xk} = $self->scrape_element_try( $h, $xr ); | ||||
551 | } | ||||||
552 | 4 | 50 | 31 | $r = $o{sub}->( $self, $h, $r ) if ( $o{sub} ); | |||
553 | 4 | 21 | return $r; | ||||
554 | } | ||||||
555 | |||||||
556 | sub scrape_element_try { | ||||||
557 | 2 | 2 | 0 | 8 | my ( $self, $h, $r_list, %o ) = @_; | ||
558 | 2 | 6 | my $c; | ||||
559 | 2 | 10 | for my $path_or_regex ( @$r_list ) { | ||||
560 | 4 | 20 | $c = $self->scrape_element( $h, $path_or_regex ); | ||||
561 | 4 | 50 | 16 | next unless ( $c ); | |||
562 | 0 | 0 | 0 | $c = $o{sub}->( $self, $c ) if ( exists $o{sub} ); | |||
563 | 0 | 0 | 0 | next unless ( $c ); | |||
564 | 0 | 0 | return $c; | ||||
565 | } | ||||||
566 | 2 | 12 | return; | ||||
567 | } | ||||||
568 | |||||||
569 | sub scrape_element { | ||||||
570 | 4 | 4 | 0 | 10 | my ( $self, $h, $o ) = @_; | ||
571 | 4 | 50 | 15 | return $self->extract_regex_element( $h, $o->{regex} ) if ( $o->{regex} ); | |||
572 | 4 | 50 | 14 | return $o->{sub}->( $h ) unless ( $o->{path} ); | |||
573 | |||||||
574 | 4 | 50 | 25 | $o->{extract} ||= 'TEXT'; | |||
575 | |||||||
576 | my $parse = $o->{is_list} | ||||||
577 | 0 | 0 | 0 | ? scraper { process $o->{path}, 'data[]' => $o->{extract}; } | |||
578 | 4 | 50 | 4 | 43 | : scraper { process_first $o->{path}, 'data' => $o->{extract}; }; | ||
4 | 185617 | ||||||
579 | 4 | 55 | my $r = $parse->scrape( $h ); | ||||
580 | 4 | 50 | 168260 | return unless ( defined $r->{data} ); | |||
581 | |||||||
582 | 0 | 0 | 0 | return $r->{data} unless ( $o->{sub} ); | |||
583 | 0 | 0 | return $o->{sub}->( $r->{data} ); | ||||
584 | } | ||||||
585 | |||||||
586 | sub extract_regex_element { | ||||||
587 | 0 | 0 | 0 | 0 | my ( $self, $h, $reg ) = @_; | ||
588 | 0 | 0 | my ( $d ) = $$h =~ m#$reg#s; | ||||
589 | 0 | 0 | return $d; | ||||
590 | } | ||||||
591 | |||||||
592 | sub filter_item_list { | ||||||
593 | 3 | 3 | 0 | 8 | my ( $self, $r, %o ) = @_; | ||
594 | |||||||
595 | 3 | 8 | my $flist = $r->{item_list}; | ||||
596 | |||||||
597 | #$r->{item_num} //= $flist->[-1]{id} // scalar( @$flist ); | ||||||
598 | |||||||
599 | 3 | 25 | $flist->[$_]{content} = $self->tidy_content( $flist->[$_]{content} ) for ( 0 .. $#$flist ); | ||||
600 | |||||||
601 | 3 | 9 | $flist = [ grep { $self->{browser}->is_item_in_range( $_->{id}, $o{min_item_num}, $o{max_item_num} ) } @$flist ]; | ||||
9 | 35 | ||||||
602 | |||||||
603 | 3 | 18 | $self->calc_content_wordnum( $_ ) for @$flist; | ||||
604 | |||||||
605 | 0 | 0 | $flist = [ grep { $_->{word_num} >= $o{min_content_word_num} } @$flist ] | ||||
606 | 3 | 50 | 8 | if ( $o{min_content_word_num} ); | |||
607 | |||||||
608 | 0 | 0 | $flist = [ grep { $_->{writer} eq $r->{writer} } @$flist ] | ||||
609 | 3 | 50 | 9 | if ( $o{only_poster} ); | |||
610 | |||||||
611 | 0 | 0 | $flist = [ grep { $_->{content} =~ /$o{grep_content}/s } @$flist ] | ||||
612 | 3 | 50 | 8 | if ( $o{grep_content} ); | |||
613 | |||||||
614 | 0 | 0 | $flist = [ grep { $_->{content} !~ /$o{filter_content}/s } @$flist ] | ||||
615 | 3 | 50 | 9 | if ( $o{filter_content} ); | |||
616 | |||||||
617 | 3 | 50 | 5 | $flist = [ grep { defined $_->{content} and $_->{content} =~ /\S/s } @$flist ]; | |||
7 | 37 | ||||||
618 | |||||||
619 | 3 | 50 | 11 | $r->{item_list} = $flist || []; | |||
620 | |||||||
621 | 3 | 6 | return $self; | ||||
622 | } ## end sub filter_item_list | ||||||
623 | |||||||
624 | sub calc_content_wordnum { | ||||||
625 | 7 | 7 | 0 | 13 | my ( $self, $f ) = @_; | ||
626 | 7 | 50 | 18 | return if ( $f->{word_num} ); | |||
627 | 7 | 50 | 16 | my $wd = $f->{content} || ''; | |||
628 | 7 | 125 | $wd =~ s/<[^>]+>//gs; | ||||
629 | 7 | 89 | $wd =~ s/\s+//sg; | ||||
630 | 7 | 643 | $f->{word_num} = $wd =~ s/\S//gs; | ||||
631 | 7 | 16 | return $f; | ||||
632 | } | ||||||
633 | |||||||
634 | sub tidy_writer_book { | ||||||
635 | 0 | 0 | 0 | 0 | my ( $self, $c ) = @_; | ||
636 | 0 | 0 | 0 | return unless ( defined $c ); | |||
637 | 0 | 0 | for ( $c ) { | ||||
638 | 0 | 0 | s/作\s*者://; | ||||
639 | 0 | 0 | s/^\s*作者-\s*//; | ||||
640 | 0 | 0 | s/小说全集//; | ||||
641 | 0 | 0 | s/作品全集//; | ||||
642 | 0 | 0 | s/专栏//; | ||||
643 | 0 | 0 | s/^.*版权属于作者([^,]+)$/$1/; | ||||
644 | 0 | 0 | s/\s*最新章节\s*$//; | ||||
645 | 0 | 0 | s/全文阅读//; | ||||
646 | 0 | 0 | s/在线阅读//; | ||||
647 | 0 | 0 | s/全集下载//; | ||||
648 | 0 | 0 | s/章节目录//; | ||||
649 | 0 | 0 | s/^\s*《(.*)》\s*$/$1/; | ||||
650 | 0 | 0 | s/^\s+|\s+$//g; | ||||
651 | 0 | 0 | s/\s+//g; | ||||
652 | } | ||||||
653 | 0 | 0 | return $c; | ||||
654 | } ## end sub tidy_writer_book | ||||||
655 | |||||||
656 | sub tidy_content { | ||||||
657 | 9 | 9 | 0 | 18 | my ( $self, $c ) = @_; | ||
658 | 9 | 15 | for ( $c ) { | ||||
659 | 9 | 100 | 17 | last unless ( $c ); | |||
660 | 7 | 30 | s# ##sg; | ||||
661 | 7 | 67 | s# #\n#sg; | ||||
662 | 7 | 80 | s#\s{5,}#\n#sg; | ||||
663 | 7 | 33 | s###sg; | ||||
664 | 7 | 295 | s#\s*\<[^>]+?\>#\n#sg; | ||||
665 | 7 | 80 | s{\n+}{\n}sg; | ||||
666 | 7 | 389 | s{\s*(\S.*?)\s*\n}{\n $1 }sg; |
||||
667 | 7 | 39 | s#\s+上一章\s+.+?下一章.+$##s; | ||||
668 | 7 | 28 | s#[^\n]+加入书签[^\n]+##s; | ||||
669 | } | ||||||
670 | 9 | 44 | return $c; | ||||
671 | } | ||||||
672 | |||||||
673 | sub tidy_string { | ||||||
674 | 6 | 6 | 0 | 14 | my ( $self, $c ) = @_; | ||
675 | 6 | 50 | 12 | $c ||= ''; | |||
676 | 6 | 11 | for ( $c ) { | ||||
677 | 6 | 18 | s/^\s+|\s+$//gs; | ||||
678 | 6 | 11 | s/[\*\/\\\[\(\)]+//g; | ||||
679 | 6 | 12 | s/[[:punct:]]//sg; | ||||
680 | 6 | 11 | s/[\]\s+\/\\]/-/g; | ||||
681 | } | ||||||
682 | |||||||
683 | 6 | 16 | return $c; | ||||
684 | } | ||||||
685 | |||||||
686 | sub get_inner_html { | ||||||
687 | 0 | 0 | 0 | my ( $self, $h ) = @_; | |||
688 | |||||||
689 | 0 | 0 | return '' unless ( $h ); | ||||
690 | |||||||
691 | 0 | my $head_i = index( $h, '>' ); | |||||
692 | 0 | substr( $h, 0, $head_i + 1 ) = ''; | |||||
693 | |||||||
694 | 0 | my $tail_i = rindex( $h, '<' ); | |||||
695 | 0 | substr( $h, $tail_i ) = ''; | |||||
696 | |||||||
697 | 0 | return $h; | |||||
698 | } | ||||||
699 | |||||||
700 | sub unescape_js { | ||||||
701 | 0 | 0 | 0 | my ( $self, $s ) = @_; | |||
702 | 0 | $s =~ s/%u([0-9a-f]{4})/chr(hex($1))/eigs; | |||||
0 | |||||||
703 | 0 | $s =~ s/%([0-9a-f]{2})/chr(hex($1))/eigs; | |||||
0 | |||||||
704 | 0 | return $s; | |||||
705 | } | ||||||
706 | |||||||
707 | sub encode_cjk_for_url { | ||||||
708 | 0 | 0 | 0 | my ( $self, $key ) = @_; | |||
709 | 0 | my $b = uc( unpack( "H*", encode( $self->charset(), $key ) ) ); | |||||
710 | 0 | $b =~ s/(..)/%$1/g; | |||||
711 | 0 | return $b; | |||||
712 | } | ||||||
713 | |||||||
714 | ### }}} | ||||||
715 | |||||||
716 | 1; | ||||||
717 |