line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ABSTRACT: get/post url, return unicode content, auto detect CJK charset
|
2
|
|
|
|
|
|
|
package Novel::Robot::Browser;
|
3
|
|
|
|
|
|
|
|
4
|
3
|
|
|
3
|
|
19
|
use strict;
|
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
118
|
|
5
|
3
|
|
|
3
|
|
24
|
use warnings;
|
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
81
|
|
6
|
3
|
|
|
3
|
|
15
|
use utf8;
|
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
16
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
#our $VERSION = 0.22;
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
#use Novel::Robot::Browser::CookieJar;
|
11
|
3
|
|
|
3
|
|
1593
|
use HTTP::CookieJar;
|
|
3
|
|
|
|
|
100905
|
|
|
3
|
|
|
|
|
176
|
|
12
|
3
|
|
|
3
|
|
2281
|
use Data::Dumper;
|
|
3
|
|
|
|
|
21074
|
|
|
3
|
|
|
|
|
250
|
|
13
|
|
|
|
|
|
|
|
14
|
3
|
|
|
3
|
|
1762
|
use File::Slurp qw/slurp/;
|
|
3
|
|
|
|
|
104643
|
|
|
3
|
|
|
|
|
241
|
|
15
|
3
|
|
|
3
|
|
5685
|
use Encode::Detect::CJK qw/detect/;
|
|
3
|
|
|
|
|
194537
|
|
|
3
|
|
|
|
|
322
|
|
16
|
3
|
|
|
3
|
|
42
|
use Encode;
|
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
232
|
|
17
|
3
|
|
|
3
|
|
2745
|
use HTTP::Tiny;
|
|
3
|
|
|
|
|
106388
|
|
|
3
|
|
|
|
|
147
|
|
18
|
3
|
|
|
3
|
|
1821
|
use Parallel::ForkManager;
|
|
3
|
|
|
|
|
157267
|
|
|
3
|
|
|
|
|
188
|
|
19
|
3
|
|
|
3
|
|
2364
|
use Term::ProgressBar;
|
|
3
|
|
|
|
|
208664
|
|
|
3
|
|
|
|
|
138
|
|
20
|
3
|
|
|
3
|
|
1889
|
use IO::Uncompress::Gunzip qw(gunzip);
|
|
3
|
|
|
|
|
116593
|
|
|
3
|
|
|
|
|
266
|
|
21
|
3
|
|
|
3
|
|
1452
|
use URI::Escape;
|
|
3
|
|
|
|
|
4955
|
|
|
3
|
|
|
|
|
214
|
|
22
|
3
|
|
|
3
|
|
1716
|
use URI;
|
|
3
|
|
|
|
|
8665
|
|
|
3
|
|
|
|
|
6559
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our $DEFAULT_URL_CONTENT = '';
|
25
|
|
|
|
|
|
|
our %DEFAULT_HEADER = (
|
26
|
|
|
|
|
|
|
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
27
|
|
|
|
|
|
|
'Accept-Charset' => 'gb2312,utf-8;q=0.7,*;q=0.7',
|
28
|
|
|
|
|
|
|
'Accept-Encoding' => "gzip",
|
29
|
|
|
|
|
|
|
'Accept-Language' => 'zh,zh-cn;q=0.8,en-us;q=0.5,en;q=0.3',
|
30
|
|
|
|
|
|
|
'Connection' => 'keep-alive',
|
31
|
|
|
|
|
|
|
'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
|
32
|
|
|
|
|
|
|
'DNT' => 1,
|
33
|
|
|
|
|
|
|
);
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub new {
|
36
|
2
|
|
|
2
|
0
|
32
|
my ( $self, %opt ) = @_;
|
37
|
2
|
|
50
|
|
|
21
|
$opt{retry} ||= 5;
|
38
|
2
|
|
100
|
|
|
13
|
$opt{max_process_num} ||= 5;
|
39
|
2
|
|
33
|
|
|
17
|
$opt{browser} ||= _init_browser( $opt{browser_headers} );
|
40
|
2
|
|
50
|
|
|
17
|
$opt{use_chrome} ||= 0;
|
41
|
2
|
|
|
|
|
15
|
bless {%opt}, __PACKAGE__;
|
42
|
|
|
|
|
|
|
}
|
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub _init_browser {
|
45
|
2
|
|
|
2
|
|
11
|
my ( $headers ) = @_;
|
46
|
|
|
|
|
|
|
|
47
|
2
|
|
50
|
|
|
13
|
$headers ||= {};
|
48
|
2
|
|
|
|
|
27
|
my %h = ( %DEFAULT_HEADER, %$headers );
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
#my $cookie_jar = Novel::Robot::Browser::CookieJar->new();
|
51
|
2
|
|
|
|
|
20
|
my $cookie_jar = HTTP::CookieJar->new;
|
52
|
|
|
|
|
|
|
|
53
|
2
|
|
|
|
|
38
|
my $http = HTTP::Tiny->new(
|
54
|
|
|
|
|
|
|
default_headers => \%h,
|
55
|
|
|
|
|
|
|
cookie_jar => $cookie_jar,
|
56
|
|
|
|
|
|
|
);
|
57
|
|
|
|
|
|
|
|
58
|
2
|
|
|
|
|
344
|
return $http;
|
59
|
|
|
|
|
|
|
}
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub request_url_whole {
|
62
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $url, %o ) = @_;
|
63
|
|
|
|
|
|
|
|
64
|
0
|
|
|
|
|
0
|
my $html = $self->request_url( $url, $o{post_data} );
|
65
|
|
|
|
|
|
|
|
66
|
0
|
|
0
|
|
|
0
|
my $info = $o{info_sub}->( \$html ) || {};
|
67
|
0
|
|
0
|
|
|
0
|
my $data_list = $o{item_list} || $o{item_list_sub}->( \$html ) || [];
|
68
|
|
|
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
my $i = 1;
|
70
|
0
|
0
|
0
|
|
|
0
|
unless ( $o{stop_sub} and $o{stop_sub}->( $info, $data_list, $i, %o ) or defined $o{item_list}) {
|
|
|
|
0
|
|
|
|
|
71
|
0
|
0
|
0
|
|
|
0
|
$data_list = [] if ( $o{min_page_num} and $o{min_page_num} > 1 );
|
72
|
0
|
0
|
|
|
|
0
|
my $page_list = exists $o{page_list_sub} ? $o{page_list_sub}->( \$html ) : undef;
|
73
|
0
|
|
|
|
|
0
|
while ( 1 ) {
|
74
|
0
|
|
|
|
|
0
|
$i++;
|
75
|
|
|
|
|
|
|
my $u =
|
76
|
|
|
|
|
|
|
$page_list ? $page_list->[ $i - 2 ] :
|
77
|
0
|
0
|
|
|
|
0
|
( exists $o{next_page_sub} ? $o{next_page_sub}->( $url, $i, \$html ) : undef );
|
|
|
0
|
|
|
|
|
|
78
|
0
|
0
|
|
|
|
0
|
last unless ( $u );
|
79
|
0
|
0
|
0
|
|
|
0
|
next if ( $o{min_page_num} and $i < $o{min_page_num} );
|
80
|
0
|
0
|
0
|
|
|
0
|
last if ( $o{max_page_num} and $i > $o{max_page_num} );
|
81
|
|
|
|
|
|
|
|
82
|
0
|
0
|
|
|
|
0
|
my ( $u_url, $u_post_data ) = ref( $u ) eq 'HASH' ? @{$u}{qw/url post_data/} : ( $u, undef );
|
|
0
|
|
|
|
|
0
|
|
83
|
0
|
|
|
|
|
0
|
my $c = $self->request_url( $u_url, $u_post_data );
|
84
|
0
|
|
|
|
|
0
|
my $fs = $o{item_list_sub}->( \$c );
|
85
|
0
|
0
|
|
|
|
0
|
last unless ( $fs );
|
86
|
|
|
|
|
|
|
|
87
|
0
|
|
|
|
|
0
|
push @$data_list, @$fs;
|
88
|
0
|
0
|
0
|
|
|
0
|
last if ( $o{stop_sub} and $o{stop_sub}->( $info, $data_list, $i, %o ) );
|
89
|
|
|
|
|
|
|
}
|
90
|
|
|
|
|
|
|
} ## end unless ( $o{stop_sub} and ...)
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
#lofter倒序
|
93
|
0
|
0
|
|
|
|
0
|
if ( $o{reverse_item_list} ){
|
94
|
0
|
|
|
|
|
0
|
$data_list = [ reverse @$data_list ];
|
95
|
0
|
|
|
|
|
0
|
my $max_id = $data_list->[0]{id};
|
96
|
0
|
0
|
|
|
|
0
|
if($max_id){
|
97
|
0
|
|
|
|
|
0
|
$_->{id} = $max_id - $_->{id} +1 for(@$data_list);
|
98
|
|
|
|
|
|
|
}
|
99
|
|
|
|
|
|
|
}
|
100
|
0
|
0
|
0
|
|
|
0
|
$info->{item_num} = ( $#$data_list >= 0 and exists $data_list->[-1]{id} ) ? $data_list->[-1]{id} : ( scalar( @$data_list ) || $i );
|
|
|
|
0
|
|
|
|
|
101
|
|
|
|
|
|
|
|
102
|
0
|
0
|
|
|
|
0
|
if ( $o{item_sub} ) {
|
103
|
0
|
|
|
|
|
0
|
my $item_id = 0;
|
104
|
0
|
0
|
|
|
|
0
|
print "\n\n" if ( $o{term_progress_bar} );
|
105
|
0
|
|
|
|
|
0
|
my $progress;
|
106
|
0
|
0
|
|
|
|
0
|
$progress = Term::ProgressBar->new( { count => scalar(@$data_list) } ) if ( $o{term_progress_bar} );
|
107
|
|
|
|
|
|
|
|
108
|
0
|
|
|
|
|
0
|
for my $i ( 0 .. $#$data_list ) {
|
109
|
0
|
|
|
|
|
0
|
my $r = $data_list->[$i];
|
110
|
0
|
|
0
|
|
|
0
|
$r->{id} //= ++$item_id;
|
111
|
0
|
|
|
|
|
0
|
$r->{url} = URI->new_abs( $r->{url}, $url )->as_string;
|
112
|
0
|
0
|
|
|
|
0
|
next unless ( $self->is_item_in_range( $r->{id}, $o{min_item_num}, $o{max_item_num} ) );
|
113
|
|
|
|
|
|
|
|
114
|
0
|
0
|
|
|
|
0
|
if($r->{url}){
|
115
|
0
|
|
|
|
|
0
|
my $c = $self->request_url( $r->{url}, $r->{post_data} );
|
116
|
0
|
|
|
|
|
0
|
my $temp_r = $o{item_sub}->( \$c );
|
117
|
0
|
|
0
|
|
|
0
|
$r->{$_} //= $temp_r->{$_} for keys(%$temp_r);
|
118
|
|
|
|
|
|
|
}else{
|
119
|
0
|
|
|
|
|
0
|
$r = $o{item_sub}->( $r );
|
120
|
|
|
|
|
|
|
}
|
121
|
|
|
|
|
|
|
|
122
|
0
|
|
|
|
|
0
|
my $next_url = URI->new_abs( $data_list->[$i+1]->{url}, $url )->as_string;
|
123
|
0
|
|
|
|
|
0
|
while($r->{next_url}){
|
124
|
0
|
|
|
|
|
0
|
$r->{next_url} = URI->new_abs( $r->{next_url}, $url )->as_string;
|
125
|
0
|
0
|
|
|
|
0
|
if($r->{next_url} ne $next_url){
|
126
|
0
|
|
|
|
|
0
|
my $c = $self->request_url( $r->{next_url}, $r->{post_data} );
|
127
|
0
|
|
|
|
|
0
|
my $temp_r = $o{item_sub}->( \$c );
|
128
|
0
|
|
|
|
|
0
|
$r->{content} .= "\n\n".$temp_r->{content};
|
129
|
0
|
0
|
|
|
|
0
|
last unless(exists $temp_r->{next_url});
|
130
|
0
|
|
|
|
|
0
|
$r->{next_url} = $temp_r->{next_url};
|
131
|
|
|
|
|
|
|
}else{
|
132
|
0
|
|
|
|
|
0
|
last;
|
133
|
|
|
|
|
|
|
}
|
134
|
|
|
|
|
|
|
}
|
135
|
|
|
|
|
|
|
|
136
|
0
|
0
|
|
|
|
0
|
$progress->update( $item_id ) if ( $o{term_progress_bar} );
|
137
|
|
|
|
|
|
|
}
|
138
|
|
|
|
|
|
|
|
139
|
0
|
0
|
|
|
|
0
|
$progress->update( scalar(@$data_list) ) if ( $o{term_progress_bar} );
|
140
|
|
|
|
|
|
|
}
|
141
|
0
|
0
|
|
|
|
0
|
print "\n\n" if ( $o{term_progress_bar} );
|
142
|
0
|
|
|
|
|
0
|
return ( $info, $data_list );
|
143
|
|
|
|
|
|
|
} ## end sub request_url_whole
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub is_item_in_range {
|
146
|
6
|
|
|
6
|
0
|
20
|
my ( $self, $id, $min, $max ) = @_;
|
147
|
6
|
50
|
|
|
|
10
|
return 1 unless ( $id );
|
148
|
6
|
50
|
33
|
|
|
15
|
return 0 if ( $min and $id < $min );
|
149
|
6
|
50
|
33
|
|
|
13
|
return 0 if ( $max and $id > $max );
|
150
|
6
|
|
|
|
|
18
|
return 1;
|
151
|
|
|
|
|
|
|
}
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub is_list_overflow {
|
154
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $r, $max ) = @_;
|
155
|
|
|
|
|
|
|
|
156
|
0
|
0
|
|
|
|
0
|
return unless ( $max );
|
157
|
|
|
|
|
|
|
|
158
|
0
|
|
|
|
|
0
|
my $item_num = scalar( @$r );
|
159
|
0
|
|
0
|
|
|
0
|
my $id = $r->[-1]{id} // $item_num;
|
160
|
|
|
|
|
|
|
|
161
|
0
|
0
|
|
|
|
0
|
return if ( $id < $max );
|
162
|
|
|
|
|
|
|
|
163
|
0
|
|
|
|
|
0
|
$#{$r} = $max - 1;
|
|
0
|
|
|
|
|
0
|
|
164
|
0
|
|
|
|
|
0
|
return 1;
|
165
|
|
|
|
|
|
|
}
|
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub request_url {
|
168
|
2
|
|
|
2
|
1
|
1463
|
my ( $self, $url, $form ) = @_;
|
169
|
2
|
50
|
|
|
|
8
|
return $DEFAULT_URL_CONTENT unless ( $url );
|
170
|
|
|
|
|
|
|
|
171
|
2
|
|
|
|
|
4
|
my $c;
|
172
|
2
|
|
|
|
|
12
|
for my $i ( 1 .. $self->{retry} ) {
|
173
|
2
|
|
|
|
|
4
|
eval { $c = $self->request_url_simple( $url, $form ); };
|
|
2
|
|
|
|
|
8
|
|
174
|
2
|
50
|
|
|
|
12
|
last if ( $c );
|
175
|
0
|
|
|
|
|
0
|
sleep 2;
|
176
|
|
|
|
|
|
|
}
|
177
|
|
|
|
|
|
|
|
178
|
2
|
|
33
|
|
|
17
|
return $c || $DEFAULT_URL_CONTENT;
|
179
|
|
|
|
|
|
|
}
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub format_post_content {
|
182
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $form ) = @_;
|
183
|
|
|
|
|
|
|
|
184
|
0
|
0
|
|
|
|
0
|
return $form unless ( ref( $form ) eq 'HASH' );
|
185
|
|
|
|
|
|
|
|
186
|
0
|
|
|
|
|
0
|
my @params;
|
187
|
0
|
|
|
|
|
0
|
while ( my ( $k, $v ) = each %$form ) {
|
188
|
0
|
|
|
|
|
0
|
push @params, uri_escape( $k ) . "=" . uri_escape( $v );
|
189
|
|
|
|
|
|
|
}
|
190
|
|
|
|
|
|
|
|
191
|
0
|
|
|
|
|
0
|
my $post_str = join( "&", @params );
|
192
|
0
|
|
|
|
|
0
|
return $post_str;
|
193
|
|
|
|
|
|
|
}
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub request_url_simple {
|
196
|
2
|
|
|
2
|
0
|
6
|
my ( $self, $url, $form ) = @_;
|
197
|
|
|
|
|
|
|
|
198
|
2
|
|
|
|
|
3
|
my $res;
|
199
|
2
|
50
|
|
|
|
9
|
if ( $form ) {
|
|
|
50
|
|
|
|
|
|
200
|
|
|
|
|
|
|
$res = $self->{browser}->request(
|
201
|
0
|
|
|
|
|
0
|
'POST', $url,
|
202
|
|
|
|
|
|
|
{ content => $self->format_post_content( $form ),
|
203
|
|
|
|
|
|
|
headers => {
|
204
|
|
|
|
|
|
|
'content-type' => 'application/x-www-form-urlencoded',
|
205
|
|
|
|
|
|
|
},
|
206
|
|
|
|
|
|
|
} );
|
207
|
|
|
|
|
|
|
} elsif ( $self->{use_chrome} ) {
|
208
|
0
|
|
|
|
|
0
|
$res->{content} = `chrome --no-sandbox --user-data-dir --headless --disable-gpu --dump-dom "$url" 2>/dev/null`;
|
209
|
0
|
|
|
|
|
0
|
$res->{success} = 1;
|
210
|
|
|
|
|
|
|
} else {
|
211
|
2
|
|
|
|
|
57
|
$res = $self->{browser}->get( $url );
|
212
|
|
|
|
|
|
|
}
|
213
|
2
|
50
|
|
|
|
996141
|
return $DEFAULT_URL_CONTENT unless ( $res->{success} );
|
214
|
|
|
|
|
|
|
|
215
|
2
|
|
|
|
|
4
|
my $html;
|
216
|
2
|
|
|
|
|
5
|
my $content = $res->{content};
|
217
|
2
|
50
|
33
|
|
|
19
|
if ( $res->{headers}{'content-encoding'}
|
218
|
|
|
|
|
|
|
and $res->{headers}{'content-encoding'} eq 'gzip' ) {
|
219
|
2
|
|
|
|
|
23
|
gunzip \$content => \$html, MultiStream => 1, Append => 1;
|
220
|
|
|
|
|
|
|
}
|
221
|
|
|
|
|
|
|
|
222
|
2
|
|
33
|
|
|
7073
|
my $charset = detect( $html || $content );
|
223
|
2
|
|
33
|
|
|
1071050
|
my $r = decode( $charset, $html || $content, Encode::FB_XMLCREF );
|
224
|
|
|
|
|
|
|
|
225
|
2
|
|
33
|
|
|
7828
|
return $r || $DEFAULT_URL_CONTENT;
|
226
|
|
|
|
|
|
|
} ## end sub request_url_simple
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub read_moz_cookie {
|
229
|
0
|
|
|
0
|
1
|
|
my ( $self, $cookie, $dom ) = @_;
|
230
|
|
|
|
|
|
|
|
231
|
0
|
|
|
|
|
|
my @segment;
|
232
|
0
|
0
|
0
|
|
|
|
if ( -f $cookie and $cookie =~ /\.sqlite$/ ) { # firefox sqlite3
|
|
|
0
|
0
|
|
|
|
|
233
|
0
|
|
|
|
|
|
my $sqlite3_cookie =
|
234
|
|
|
|
|
|
|
`sqlite3 "$cookie" "select host,isSecure,path,isHttpOnly,expiry,name,value from moz_cookies where baseDomain='$dom'"`;
|
235
|
0
|
|
|
|
|
|
@segment = map { [ split /\|/ ] } split /\n/, $sqlite3_cookie;
|
|
0
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
} elsif ( -f $cookie and $cookie =~ /\.txt$/ ) { # Netscape HTTP Cookie File
|
237
|
0
|
|
|
|
|
|
my @ck = slurp( $cookie );
|
238
|
0
|
0
|
|
|
|
|
@segment = grep { $_->[0] and $_->[0] =~ /(^|\.)\Q$dom\E$/ } map { [ split /\s+/ ] } @ck;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
} else { # cookie string : name1=value1; name2=value2
|
240
|
0
|
|
|
|
|
|
my @ck = split /;\s*/, $cookie;
|
241
|
0
|
|
|
|
|
|
@segment = map { my @c = split /=/; [ $dom, undef, '/', undef, 0, $c[0], $c[1] ] } @ck;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
}
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
0
|
0
|
|
|
|
|
@segment = grep { defined $_->[6] and $_->[6] =~ /\S/ } @segment;
|
|
0
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
|
247
|
0
|
|
|
|
|
|
my @jar = map { "$_->[5]=$_->[6]; Domain=$_->[0]; Path=$_->[2]; Expiry=$_->[4]" } @segment;
|
|
0
|
|
|
|
|
|
|
248
|
0
|
|
|
|
|
|
$self->{browser}{cookie_jar}->load_cookies( @jar );
|
249
|
|
|
|
|
|
|
|
250
|
0
|
|
|
|
|
|
$cookie = join( "; ", map { "$_->[5]=$_->[6]" } @segment );
|
|
0
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
#$self->{browser}{cookie_jar}{cookie} = $cookie;
|
253
|
|
|
|
|
|
|
|
254
|
0
|
|
|
|
|
|
return $cookie;
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
} ## end sub read_moz_cookie
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
1;
|