line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# ABSTRACT: get/post url, return unicode content, auto detect CJK charset
|
2
|
|
|
|
|
|
|
package Novel::Robot::Browser;
|
3
|
|
|
|
|
|
|
|
4
|
3
|
|
|
3
|
|
17
|
use strict;
|
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
70
|
|
5
|
3
|
|
|
3
|
|
11
|
use warnings;
|
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
61
|
|
6
|
3
|
|
|
3
|
|
12
|
use utf8;
|
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
10
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
#our $VERSION = 0.22;
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
#use Novel::Robot::Browser::CookieJar;
|
11
|
3
|
|
|
3
|
|
1201
|
use HTTP::CookieJar;
|
|
3
|
|
|
|
|
79710
|
|
|
3
|
|
|
|
|
114
|
|
12
|
3
|
|
|
3
|
|
1528
|
use Data::Dumper;
|
|
3
|
|
|
|
|
15629
|
|
|
3
|
|
|
|
|
166
|
|
13
|
|
|
|
|
|
|
|
14
|
3
|
|
|
3
|
|
1341
|
use File::Slurp qw/slurp/;
|
|
3
|
|
|
|
|
79124
|
|
|
3
|
|
|
|
|
179
|
|
15
|
3
|
|
|
3
|
|
4337
|
use Encode::Detect::CJK qw/detect/;
|
|
3
|
|
|
|
|
147853
|
|
|
3
|
|
|
|
|
239
|
|
16
|
3
|
|
|
3
|
|
35
|
use Encode;
|
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
226
|
|
17
|
3
|
|
|
3
|
|
2140
|
use HTTP::Tiny;
|
|
3
|
|
|
|
|
81773
|
|
|
3
|
|
|
|
|
107
|
|
18
|
3
|
|
|
3
|
|
1446
|
use Parallel::ForkManager;
|
|
3
|
|
|
|
|
126769
|
|
|
3
|
|
|
|
|
95
|
|
19
|
3
|
|
|
3
|
|
1479
|
use Term::ProgressBar;
|
|
3
|
|
|
|
|
165664
|
|
|
3
|
|
|
|
|
108
|
|
20
|
3
|
|
|
3
|
|
1580
|
use IO::Uncompress::Gunzip qw(gunzip);
|
|
3
|
|
|
|
|
92013
|
|
|
3
|
|
|
|
|
171
|
|
21
|
3
|
|
|
3
|
|
1147
|
use URI::Escape;
|
|
3
|
|
|
|
|
3448
|
|
|
3
|
|
|
|
|
147
|
|
22
|
3
|
|
|
3
|
|
1338
|
use URI;
|
|
3
|
|
|
|
|
6887
|
|
|
3
|
|
|
|
|
4650
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our $DEFAULT_URL_CONTENT = '';
|
25
|
|
|
|
|
|
|
our %DEFAULT_HEADER = (
|
26
|
|
|
|
|
|
|
'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
27
|
|
|
|
|
|
|
'Accept-Charset' => 'gb2312,utf-8;q=0.7,*;q=0.7',
|
28
|
|
|
|
|
|
|
'Accept-Encoding' => "gzip",
|
29
|
|
|
|
|
|
|
'Accept-Language' => 'zh,zh-cn;q=0.8,en-us;q=0.5,en;q=0.3',
|
30
|
|
|
|
|
|
|
'Connection' => 'keep-alive',
|
31
|
|
|
|
|
|
|
'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
|
32
|
|
|
|
|
|
|
'DNT' => 1,
|
33
|
|
|
|
|
|
|
);
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub new {
|
36
|
4
|
|
|
4
|
0
|
24
|
my ( $self, %opt ) = @_;
|
37
|
4
|
|
50
|
|
|
33
|
$opt{retry} ||= 5;
|
38
|
4
|
|
100
|
|
|
23
|
$opt{max_process_num} ||= 5;
|
39
|
4
|
|
33
|
|
|
33
|
$opt{browser} ||= _init_browser( $opt{browser_headers} );
|
40
|
4
|
|
50
|
|
|
25
|
$opt{use_chrome} ||= 0;
|
41
|
4
|
|
|
|
|
26
|
bless {%opt}, __PACKAGE__;
|
42
|
|
|
|
|
|
|
}
|
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub _init_browser {
|
45
|
4
|
|
|
4
|
|
14
|
my ( $headers ) = @_;
|
46
|
|
|
|
|
|
|
|
47
|
4
|
|
50
|
|
|
24
|
$headers ||= {};
|
48
|
4
|
|
|
|
|
38
|
my %h = ( %DEFAULT_HEADER, %$headers );
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
#my $cookie_jar = Novel::Robot::Browser::CookieJar->new();
|
51
|
4
|
|
|
|
|
39
|
my $cookie_jar = HTTP::CookieJar->new;
|
52
|
|
|
|
|
|
|
|
53
|
4
|
|
|
|
|
67
|
my $http = HTTP::Tiny->new(
|
54
|
|
|
|
|
|
|
default_headers => \%h,
|
55
|
|
|
|
|
|
|
cookie_jar => $cookie_jar,
|
56
|
|
|
|
|
|
|
);
|
57
|
|
|
|
|
|
|
|
58
|
4
|
|
|
|
|
575
|
return $http;
|
59
|
|
|
|
|
|
|
}
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub request_url_whole {
|
62
|
1
|
|
|
1
|
1
|
5
|
my ( $self, $url, %o ) = @_;
|
63
|
|
|
|
|
|
|
|
64
|
1
|
|
|
|
|
4
|
my $html = $self->request_url( $url, $o{post_data} );
|
65
|
|
|
|
|
|
|
|
66
|
1
|
|
50
|
|
|
9
|
my $info = $o{info_sub}->( \$html ) || {};
|
67
|
1
|
50
|
|
|
|
5
|
print "\r$info->{writer}, $info->{book}, $url\n" if ( $o{verbose} );
|
68
|
|
|
|
|
|
|
#print Dumper('info_sub', $info);
|
69
|
1
|
|
50
|
|
|
7
|
my $data_list = $o{item_list} || $o{item_list_sub}->( \$html ) || [];
|
70
|
|
|
|
|
|
|
#print Dumper('data_list_sub', $data_list);
|
71
|
|
|
|
|
|
|
|
72
|
1
|
|
|
|
|
3
|
my $i = 1;
|
73
|
1
|
50
|
33
|
|
|
9
|
unless ( $o{stop_sub} and $o{stop_sub}->( $info, $data_list, $i, %o ) or defined $o{item_list}) {
|
|
|
|
33
|
|
|
|
|
74
|
1
|
50
|
33
|
|
|
5
|
$data_list = [] if ( $o{min_page_num} and $o{min_page_num} > 1 );
|
75
|
1
|
50
|
|
|
|
4
|
my $page_list = exists $o{page_list_sub} ? $o{page_list_sub}->( \$html ) : undef;
|
76
|
|
|
|
|
|
|
#print Dumper('page_list_sub', $page_list);
|
77
|
1
|
|
|
|
|
3
|
while ( 1 ) {
|
78
|
1
|
|
|
|
|
2
|
$i++;
|
79
|
|
|
|
|
|
|
my $u =
|
80
|
|
|
|
|
|
|
$page_list ? $page_list->[ $i - 2 ] :
|
81
|
1
|
50
|
|
|
|
5
|
( exists $o{next_page_sub} ? $o{next_page_sub}->( $url, $i, \$html ) : undef );
|
|
|
50
|
|
|
|
|
|
82
|
1
|
50
|
|
|
|
3
|
last unless ( $u );
|
83
|
0
|
0
|
0
|
|
|
0
|
next if ( $o{min_page_num} and $i < $o{min_page_num} );
|
84
|
0
|
0
|
0
|
|
|
0
|
last if ( $o{max_page_num} and $i > $o{max_page_num} );
|
85
|
|
|
|
|
|
|
|
86
|
0
|
0
|
|
|
|
0
|
my ( $u_url, $u_post_data ) = ref( $u ) eq 'HASH' ? @{$u}{qw/url post_data/} : ( $u, undef );
|
|
0
|
|
|
|
|
0
|
|
87
|
0
|
|
|
|
|
0
|
my $c = $self->request_url( $u_url, $u_post_data );
|
88
|
|
|
|
|
|
|
#print "item_list: $u_url\n" if ( $o{verbose} );
|
89
|
0
|
|
|
|
|
0
|
my $fs = $o{item_list_sub}->( \$c );
|
90
|
0
|
0
|
|
|
|
0
|
last unless ( $fs );
|
91
|
|
|
|
|
|
|
|
92
|
0
|
|
|
|
|
0
|
push @$data_list, @$fs;
|
93
|
0
|
0
|
0
|
|
|
0
|
last if ( $o{stop_sub} and $o{stop_sub}->( $info, $data_list, $i, %o ) );
|
94
|
|
|
|
|
|
|
}
|
95
|
|
|
|
|
|
|
} ## end unless ( $o{stop_sub} and ...)
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
#lofter倒序
|
98
|
1
|
50
|
|
|
|
4
|
if ( $o{reverse_item_list} ){
|
99
|
0
|
|
|
|
|
0
|
$data_list = [ reverse @$data_list ];
|
100
|
0
|
|
|
|
|
0
|
my $max_id = $data_list->[0]{id};
|
101
|
0
|
0
|
|
|
|
0
|
if($max_id){
|
102
|
0
|
|
|
|
|
0
|
$_->{id} = $max_id - $_->{id} +1 for(@$data_list);
|
103
|
|
|
|
|
|
|
}
|
104
|
|
|
|
|
|
|
}
|
105
|
1
|
50
|
33
|
|
|
9
|
$info->{item_num} = ( $#$data_list >= 0 and exists $data_list->[-1]{id} ) ? $data_list->[-1]{id} : ( scalar( @$data_list ) || $i );
|
|
|
|
33
|
|
|
|
|
106
|
|
|
|
|
|
|
|
107
|
1
|
50
|
|
|
|
5
|
if ( $o{item_sub} ) {
|
108
|
1
|
|
|
|
|
2
|
my $item_id = 0;
|
109
|
1
|
50
|
|
|
|
3
|
print "\n\n" if ( $o{term_progress_bar} );
|
110
|
1
|
|
|
|
|
1
|
my $progress;
|
111
|
1
|
50
|
|
|
|
3
|
$progress = Term::ProgressBar->new( { count => scalar(@$data_list) } ) if ( $o{term_progress_bar} );
|
112
|
1
|
|
|
|
|
3
|
for my $r ( @$data_list ) {
|
113
|
3
|
|
33
|
|
|
13
|
$r->{id} //= ++$item_id;
|
114
|
3
|
|
|
|
|
15
|
$r->{url} = URI->new_abs( $r->{url}, $url )->as_string;
|
115
|
3
|
100
|
|
|
|
8061
|
next unless ( $self->is_item_in_range( $r->{id}, $o{min_item_num}, $o{max_item_num} ) );
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#print "item_url: $r->{url}\n" if ( $o{verbose} );
|
118
|
1
|
50
|
|
|
|
3
|
if($r->{url}){
|
119
|
1
|
|
|
|
|
5
|
my $c = $self->request_url( $r->{url}, $r->{post_data} );
|
120
|
|
|
|
|
|
|
#print "item content: $c\n";
|
121
|
1
|
|
|
|
|
14
|
my $temp_r = $o{item_sub}->( \$c );
|
122
|
1
|
|
|
|
|
7
|
$r->{content} = $temp_r->{content};
|
123
|
|
|
|
|
|
|
#$r = { %$r, %$temp_r };
|
124
|
|
|
|
|
|
|
#print Dumper($r);
|
125
|
|
|
|
|
|
|
}else{
|
126
|
0
|
|
|
|
|
0
|
$r = $o{item_sub}->( $r );
|
127
|
|
|
|
|
|
|
}
|
128
|
1
|
50
|
|
|
|
6
|
$progress->update( $item_id ) if ( $o{term_progress_bar} );
|
129
|
|
|
|
|
|
|
}
|
130
|
1
|
50
|
|
|
|
5
|
$progress->update( scalar(@$data_list) ) if ( $o{term_progress_bar} );
|
131
|
|
|
|
|
|
|
}
|
132
|
1
|
50
|
|
|
|
5
|
print "\n\n" if ( $o{term_progress_bar} );
|
133
|
1
|
|
|
|
|
6
|
return ( $info, $data_list );
|
134
|
|
|
|
|
|
|
} ## end sub request_url_whole
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub is_item_in_range {
|
137
|
12
|
|
|
12
|
0
|
30
|
my ( $self, $id, $min, $max ) = @_;
|
138
|
12
|
50
|
|
|
|
23
|
return 1 unless ( $id );
|
139
|
12
|
100
|
100
|
|
|
41
|
return 0 if ( $min and $id < $min );
|
140
|
8
|
50
|
66
|
|
|
21
|
return 0 if ( $max and $id > $max );
|
141
|
8
|
|
|
|
|
23
|
return 1;
|
142
|
|
|
|
|
|
|
}
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub is_list_overflow {
|
145
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $r, $max ) = @_;
|
146
|
|
|
|
|
|
|
|
147
|
0
|
0
|
|
|
|
0
|
return unless ( $max );
|
148
|
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
0
|
my $item_num = scalar( @$r );
|
150
|
0
|
|
0
|
|
|
0
|
my $id = $r->[-1]{id} // $item_num;
|
151
|
|
|
|
|
|
|
|
152
|
0
|
0
|
|
|
|
0
|
return if ( $id < $max );
|
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
|
|
0
|
$#{$r} = $max - 1;
|
|
0
|
|
|
|
|
0
|
|
155
|
0
|
|
|
|
|
0
|
return 1;
|
156
|
|
|
|
|
|
|
}
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub request_url {
|
159
|
4
|
|
|
4
|
1
|
1250
|
my ( $self, $url, $form ) = @_;
|
160
|
4
|
50
|
|
|
|
15
|
return $DEFAULT_URL_CONTENT unless ( $url );
|
161
|
|
|
|
|
|
|
|
162
|
4
|
|
|
|
|
15
|
my $c;
|
163
|
4
|
|
|
|
|
21
|
for my $i ( 1 .. $self->{retry} ) {
|
164
|
4
|
|
|
|
|
8
|
eval { $c = $self->request_url_simple( $url, $form ); };
|
|
4
|
|
|
|
|
13
|
|
165
|
4
|
50
|
|
|
|
22
|
last if ( $c );
|
166
|
0
|
|
|
|
|
0
|
sleep 2;
|
167
|
|
|
|
|
|
|
}
|
168
|
|
|
|
|
|
|
|
169
|
4
|
|
33
|
|
|
28
|
return $c || $DEFAULT_URL_CONTENT;
|
170
|
|
|
|
|
|
|
}
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub format_post_content {
|
173
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $form ) = @_;
|
174
|
|
|
|
|
|
|
|
175
|
0
|
0
|
|
|
|
0
|
return $form unless ( ref( $form ) eq 'HASH' );
|
176
|
|
|
|
|
|
|
|
177
|
0
|
|
|
|
|
0
|
my @params;
|
178
|
0
|
|
|
|
|
0
|
while ( my ( $k, $v ) = each %$form ) {
|
179
|
0
|
|
|
|
|
0
|
push @params, uri_escape( $k ) . "=" . uri_escape( $v );
|
180
|
|
|
|
|
|
|
}
|
181
|
|
|
|
|
|
|
|
182
|
0
|
|
|
|
|
0
|
my $post_str = join( "&", @params );
|
183
|
0
|
|
|
|
|
0
|
return $post_str;
|
184
|
|
|
|
|
|
|
}
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
sub request_url_simple {
|
187
|
4
|
|
|
4
|
0
|
11
|
my ( $self, $url, $form ) = @_;
|
188
|
|
|
|
|
|
|
|
189
|
4
|
|
|
|
|
6
|
my $res;
|
190
|
4
|
50
|
|
|
|
17
|
if ( $form ) {
|
|
|
50
|
|
|
|
|
|
191
|
|
|
|
|
|
|
$res = $self->{browser}->request(
|
192
|
0
|
|
|
|
|
0
|
'POST', $url,
|
193
|
|
|
|
|
|
|
{ content => $self->format_post_content( $form ),
|
194
|
|
|
|
|
|
|
headers => {
|
195
|
|
|
|
|
|
|
'content-type' => 'application/x-www-form-urlencoded',
|
196
|
|
|
|
|
|
|
},
|
197
|
|
|
|
|
|
|
} );
|
198
|
|
|
|
|
|
|
} elsif ( $self->{use_chrome} ) {
|
199
|
0
|
|
|
|
|
0
|
$res->{content} = `chrome --no-sandbox --user-data-dir --headless --disable-gpu --dump-dom "$url" 2>/dev/null`;
|
200
|
0
|
|
|
|
|
0
|
$res->{success} = 1;
|
201
|
|
|
|
|
|
|
} else {
|
202
|
4
|
|
|
|
|
105
|
$res = $self->{browser}->get( $url );
|
203
|
|
|
|
|
|
|
}
|
204
|
4
|
50
|
|
|
|
2083102
|
return $DEFAULT_URL_CONTENT unless ( $res->{success} );
|
205
|
|
|
|
|
|
|
|
206
|
4
|
|
|
|
|
12
|
my $html;
|
207
|
4
|
|
|
|
|
25
|
my $content = $res->{content};
|
208
|
4
|
50
|
33
|
|
|
42
|
if ( $res->{headers}{'content-encoding'}
|
209
|
|
|
|
|
|
|
and $res->{headers}{'content-encoding'} eq 'gzip' ) {
|
210
|
4
|
|
|
|
|
41
|
gunzip \$content => \$html, MultiStream => 1, Append => 1;
|
211
|
|
|
|
|
|
|
}
|
212
|
|
|
|
|
|
|
|
213
|
4
|
|
33
|
|
|
17512
|
my $charset = detect( $html || $content );
|
214
|
4
|
|
33
|
|
|
1526225
|
my $r = decode( $charset, $html || $content, Encode::FB_XMLCREF );
|
215
|
|
|
|
|
|
|
|
216
|
4
|
|
33
|
|
|
12682
|
return $r || $DEFAULT_URL_CONTENT;
|
217
|
|
|
|
|
|
|
} ## end sub request_url_simple
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
sub read_moz_cookie {
|
220
|
0
|
|
|
0
|
1
|
|
my ( $self, $cookie, $dom ) = @_;
|
221
|
|
|
|
|
|
|
|
222
|
0
|
|
|
|
|
|
my @segment;
|
223
|
0
|
0
|
0
|
|
|
|
if ( -f $cookie and $cookie =~ /\.sqlite$/ ) { # firefox sqlite3
|
|
|
0
|
0
|
|
|
|
|
224
|
0
|
|
|
|
|
|
my $sqlite3_cookie =
|
225
|
|
|
|
|
|
|
`sqlite3 "$cookie" "select host,isSecure,path,isHttpOnly,expiry,name,value from moz_cookies where baseDomain='$dom'"`;
|
226
|
0
|
|
|
|
|
|
@segment = map { [ split /\|/ ] } split /\n/, $sqlite3_cookie;
|
|
0
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
} elsif ( -f $cookie and $cookie =~ /\.txt$/ ) { # Netscape HTTP Cookie File
|
228
|
0
|
|
|
|
|
|
my @ck = slurp( $cookie );
|
229
|
0
|
0
|
|
|
|
|
@segment = grep { $_->[0] and $_->[0] =~ /(^|\.)\Q$dom\E$/ } map { [ split /\s+/ ] } @ck;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
} else { # cookie string : name1=value1; name2=value2
|
231
|
0
|
|
|
|
|
|
my @ck = split /;\s*/, $cookie;
|
232
|
0
|
|
|
|
|
|
@segment = map { my @c = split /=/; [ $dom, undef, '/', undef, 0, $c[0], $c[1] ] } @ck;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
}
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
#print Dumper(\@segment);
|
236
|
|
|
|
|
|
|
|
237
|
0
|
0
|
|
|
|
|
@segment = grep { defined $_->[6] and $_->[6] =~ /\S/ } @segment;
|
|
0
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
239
|
0
|
|
|
|
|
|
my @jar = map { "$_->[5]=$_->[6]; Domain=$_->[0]; Path=$_->[2]; Expiry=$_->[4]" } @segment;
|
|
0
|
|
|
|
|
|
|
240
|
0
|
|
|
|
|
|
$self->{browser}{cookie_jar}->load_cookies( @jar );
|
241
|
|
|
|
|
|
|
|
242
|
0
|
|
|
|
|
|
$cookie = join( "; ", map { "$_->[5]=$_->[6]" } @segment );
|
|
0
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
#$self->{browser}{cookie_jar}{cookie} = $cookie;
|
245
|
|
|
|
|
|
|
|
246
|
0
|
|
|
|
|
|
return $cookie;
|
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
} ## end sub read_moz_cookie
|
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
1;
|