| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Crawl4AI::Client; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: UA-agnostic REST client for the Crawl4AI Docker API |
|
3
|
4
|
|
|
4
|
|
120645
|
use Moo; |
|
|
4
|
|
|
|
|
5384
|
|
|
|
4
|
|
|
|
|
19
|
|
|
4
|
4
|
|
|
4
|
|
2104
|
use Carp qw( croak ); |
|
|
4
|
|
|
|
|
9
|
|
|
|
4
|
|
|
|
|
221
|
|
|
5
|
4
|
|
|
4
|
|
1936
|
use HTTP::Request (); |
|
|
4
|
|
|
|
|
31762
|
|
|
|
4
|
|
|
|
|
83
|
|
|
6
|
4
|
|
|
4
|
|
896
|
use JSON::MaybeXS (); |
|
|
4
|
|
|
|
|
23326
|
|
|
|
4
|
|
|
|
|
84
|
|
|
7
|
4
|
|
|
4
|
|
20
|
use URI (); |
|
|
4
|
|
|
|
|
5
|
|
|
|
4
|
|
|
|
|
78
|
|
|
8
|
4
|
|
|
4
|
|
17
|
use URI::Escape (); |
|
|
4
|
|
|
|
|
5
|
|
|
|
4
|
|
|
|
|
49
|
|
|
9
|
4
|
|
|
4
|
|
1632
|
use MIME::Base64 (); |
|
|
4
|
|
|
|
|
2054
|
|
|
|
4
|
|
|
|
|
131
|
|
|
10
|
4
|
|
|
4
|
|
1857
|
use Safe::Isa; |
|
|
4
|
|
|
|
|
2268
|
|
|
|
4
|
|
|
|
|
587
|
|
|
11
|
4
|
|
|
4
|
|
1605
|
use WWW::Crawl4AI::Markdown qw( resolve_markdown_chain ); |
|
|
4
|
|
|
|
|
13
|
|
|
|
4
|
|
|
|
|
220
|
|
|
12
|
4
|
|
|
4
|
|
1477
|
use WWW::Crawl4AI::Request (); |
|
|
4
|
|
|
|
|
13
|
|
|
|
4
|
|
|
|
|
119
|
|
|
13
|
4
|
|
|
4
|
|
1932
|
use WWW::Crawl4AI::Error (); |
|
|
4
|
|
|
|
|
26
|
|
|
|
4
|
|
|
|
|
15179
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our $VERSION = '0.001'; |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
has base_url => ( |
|
19
|
|
|
|
|
|
|
is => 'ro', |
|
20
|
|
|
|
|
|
|
default => sub { $ENV{CRAWL4AI_URL} || $ENV{CRAWL4AI_BASE_URL} || 'http://localhost:11235' }, |
|
21
|
|
|
|
|
|
|
); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
has api_token => ( |
|
25
|
|
|
|
|
|
|
is => 'ro', |
|
26
|
|
|
|
|
|
|
default => sub { $ENV{CRAWL4AI_API_TOKEN} }, |
|
27
|
|
|
|
|
|
|
); |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
has _json => ( |
|
31
|
|
|
|
|
|
|
is => 'lazy', |
|
32
|
|
|
|
|
|
|
default => sub { JSON::MaybeXS->new( utf8 => 1, canonical => 1, convert_blessed => 1 ) }, |
|
33
|
|
|
|
|
|
|
); |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
has user_agent_string => ( |
|
36
|
|
|
|
|
|
|
is => 'ro', |
|
37
|
|
|
|
|
|
|
default => sub { "WWW-Crawl4AI/$VERSION" }, |
|
38
|
|
|
|
|
|
|
); |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
has timeout => ( is => 'ro', default => sub { 120 } ); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
has ua => ( is => 'lazy' ); |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
has max_attempts => ( is => 'ro', default => sub { 3 } ); |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
has retry_backoff => ( is => 'ro', default => sub { [ 1, 2, 4 ] } ); |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
has retry_statuses => ( is => 'ro', default => sub { [ 429, 502, 503, 504 ] } ); |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
has on_retry => ( is => 'ro' ); |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
has sleep_sub => ( |
|
60
|
|
|
|
|
|
|
is => 'ro', |
|
61
|
|
|
|
|
|
|
default => sub { |
|
62
|
|
|
|
|
|
|
require Time::HiRes; |
|
63
|
|
|
|
|
|
|
return sub { Time::HiRes::sleep( $_[0] ) }; |
|
64
|
|
|
|
|
|
|
}, |
|
65
|
|
|
|
|
|
|
); |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
has _retry_status_set => ( is => 'lazy' ); |
|
69
|
|
|
|
|
|
|
sub _build__retry_status_set { |
|
70
|
0
|
|
|
0
|
|
0
|
return { map { $_ => 1 } @{ $_[0]->retry_statuses } }; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub _build_ua { |
|
74
|
0
|
|
|
0
|
|
0
|
my ( $self ) = @_; |
|
75
|
0
|
|
|
|
|
0
|
require LWP::UserAgent; |
|
76
|
0
|
|
|
|
|
0
|
return LWP::UserAgent->new( |
|
77
|
|
|
|
|
|
|
agent => $self->user_agent_string, |
|
78
|
|
|
|
|
|
|
timeout => $self->timeout, |
|
79
|
|
|
|
|
|
|
); |
|
80
|
|
|
|
|
|
|
} |
|
81
|
|
|
|
|
|
|
|
|
82
|
0
|
|
|
0
|
1
|
0
|
sub is_request { $_[1]->$_isa('HTTP::Request') } |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub _uri { |
|
86
|
15
|
|
|
15
|
|
23
|
my ( $self, $path ) = @_; |
|
87
|
15
|
|
|
|
|
98
|
( my $base = $self->base_url ) =~ s{/+$}{}; |
|
88
|
15
|
|
|
|
|
75
|
return URI->new( $base . $path ); |
|
89
|
|
|
|
|
|
|
} |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
sub _request { |
|
92
|
15
|
|
|
15
|
|
35
|
my ( $self, $method, $path, $body ) = @_; |
|
93
|
15
|
|
|
|
|
37
|
my $req = HTTP::Request->new( $method => $self->_uri($path) ); |
|
94
|
15
|
|
|
|
|
16632
|
$req->header( Accept => 'application/json' ); |
|
95
|
15
|
100
|
|
|
|
794
|
$req->header( Authorization => 'Bearer ' . $self->api_token ) if defined $self->api_token; |
|
96
|
15
|
100
|
|
|
|
204
|
if ( defined $body ) { |
|
97
|
12
|
|
|
|
|
27
|
$req->header( 'Content-Type' => 'application/json' ); |
|
98
|
12
|
|
|
|
|
660
|
$req->content( $self->_json->encode($body) ); |
|
99
|
|
|
|
|
|
|
} |
|
100
|
15
|
|
|
|
|
429
|
return $req; |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
104
|
|
|
|
|
|
|
# Request builders (no I/O) |
|
105
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub crawl_request { |
|
108
|
4
|
|
|
4
|
1
|
2350
|
my ( $self, $request ) = @_; |
|
109
|
4
|
|
|
|
|
13
|
return $self->_request( POST => '/crawl', $self->_payload( $request, 'to_crawl_payload' ) ); |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub md_request { |
|
114
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $request ) = @_; |
|
115
|
0
|
|
|
|
|
0
|
return $self->_request( POST => '/md', $self->_payload( $request, 'to_md_payload' ) ); |
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub job_submit_request { |
|
119
|
1
|
|
|
1
|
1
|
2
|
my ( $self, $request ) = @_; |
|
120
|
1
|
|
|
|
|
5
|
return $self->_request( POST => '/crawl/job', $self->_payload( $request, 'to_crawl_payload' ) ); |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub job_status_request { |
|
124
|
1
|
|
|
1
|
1
|
656
|
my ( $self, $task_id ) = @_; |
|
125
|
1
|
50
|
33
|
|
|
7
|
croak "job_status_request needs a task_id" unless defined $task_id && length $task_id; |
|
126
|
1
|
|
|
|
|
4
|
return $self->_request( GET => '/crawl/job/' . $task_id ); |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
|
|
129
|
1
|
|
|
1
|
1
|
1274
|
sub health_request { $_[0]->_request( GET => '/health' ) } |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub screenshot_request { |
|
132
|
2
|
|
|
2
|
1
|
4332
|
my ( $self, $url, %opts ) = @_; |
|
133
|
2
|
50
|
33
|
|
|
13
|
croak "screenshot_request needs a url" unless defined $url && length $url; |
|
134
|
2
|
|
|
|
|
6
|
my %body = ( url => $url ); |
|
135
|
2
|
100
|
|
|
|
7
|
$body{screenshot_wait_for} = $opts{wait_for} if defined $opts{wait_for}; |
|
136
|
2
|
100
|
|
|
|
5
|
$body{output_path} = $opts{output_path} if defined $opts{output_path}; |
|
137
|
|
|
|
|
|
|
$body{wait_for_images} = |
|
138
|
|
|
|
|
|
|
$opts{wait_for_images} ? WWW::Crawl4AI::Request::JSON_true() : WWW::Crawl4AI::Request::JSON_false() |
|
139
|
2
|
50
|
|
|
|
9
|
if exists $opts{wait_for_images}; |
|
|
|
100
|
|
|
|
|
|
|
140
|
2
|
|
|
|
|
13
|
return $self->_request( POST => '/screenshot', \%body ); |
|
141
|
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub pdf_request { |
|
145
|
1
|
|
|
1
|
1
|
2546
|
my ( $self, $url, %opts ) = @_; |
|
146
|
1
|
50
|
33
|
|
|
6
|
croak "pdf_request needs a url" unless defined $url && length $url; |
|
147
|
1
|
|
|
|
|
3
|
my %body = ( url => $url ); |
|
148
|
1
|
50
|
|
|
|
4
|
$body{output_path} = $opts{output_path} if defined $opts{output_path}; |
|
149
|
1
|
|
|
|
|
3
|
return $self->_request( POST => '/pdf', \%body ); |
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub html_request { |
|
153
|
1
|
|
|
1
|
1
|
2602
|
my ( $self, $url ) = @_; |
|
154
|
1
|
50
|
33
|
|
|
7
|
croak "html_request needs a url" unless defined $url && length $url; |
|
155
|
1
|
|
|
|
|
4
|
return $self->_request( POST => '/html', { url => $url } ); |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub execute_js_request { |
|
159
|
4
|
|
|
4
|
1
|
8150
|
my ( $self, $url, $scripts ) = @_; |
|
160
|
4
|
50
|
33
|
|
|
20
|
croak "execute_js_request needs a url" unless defined $url && length $url; |
|
161
|
4
|
100
|
|
|
|
12
|
$scripts = [$scripts] unless ref $scripts eq 'ARRAY'; |
|
162
|
4
|
100
|
100
|
|
|
244
|
croak "execute_js_request needs at least one script" unless @$scripts && defined $scripts->[0]; |
|
163
|
2
|
|
|
|
|
7
|
return $self->_request( POST => '/execute_js', { url => $url, scripts => $scripts } ); |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub llm_request { |
|
167
|
1
|
|
|
1
|
1
|
2306
|
my ( $self, $url, $query, %opts ) = @_; |
|
168
|
1
|
50
|
33
|
|
|
6
|
croak "llm_request needs a url" unless defined $url && length $url; |
|
169
|
1
|
50
|
33
|
|
|
21
|
croak "llm_request needs a query" unless defined $query && length $query; |
|
170
|
|
|
|
|
|
|
# The page URL is a path segment ({url:path}); escape it so its own scheme and |
|
171
|
|
|
|
|
|
|
# query string don't merge into ours. The question and tuning go in the query. |
|
172
|
1
|
|
|
|
|
6
|
my $path = URI->new( '/llm/' . URI::Escape::uri_escape_utf8($url) ); |
|
173
|
|
|
|
|
|
|
$path->query_form( |
|
174
|
|
|
|
|
|
|
q => $query, |
|
175
|
|
|
|
|
|
|
( defined $opts{provider} ? ( provider => $opts{provider} ) : () ), |
|
176
|
|
|
|
|
|
|
( defined $opts{temperature} ? ( temperature => $opts{temperature} ) : () ), |
|
177
|
1
|
50
|
|
|
|
281
|
( defined $opts{base_url} ? ( base_url => $opts{base_url} ) : () ), |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
); |
|
179
|
1
|
|
|
|
|
187
|
return $self->_request( GET => $path->as_string ); |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub token_request { |
|
183
|
1
|
|
|
1
|
1
|
2806
|
my ( $self, $email, %opts ) = @_; |
|
184
|
1
|
50
|
33
|
|
|
6
|
croak "token_request needs an email" unless defined $email && length $email; |
|
185
|
1
|
|
|
|
|
3
|
my %body = ( email => $email ); |
|
186
|
1
|
50
|
|
|
|
11
|
$body{api_token} = $opts{api_token} if defined $opts{api_token}; |
|
187
|
1
|
|
|
|
|
3
|
return $self->_request( POST => '/token', \%body ); |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
sub _payload { |
|
191
|
5
|
|
|
5
|
|
9
|
my ( $self, $request, $method ) = @_; |
|
192
|
5
|
100
|
|
|
|
16
|
return $request->$method if $request->$_isa('WWW::Crawl4AI::Request'); |
|
193
|
1
|
50
|
|
|
|
12
|
return $request if ref $request eq 'HASH'; |
|
194
|
0
|
|
|
|
|
0
|
croak "expected a WWW::Crawl4AI::Request or hashref payload"; |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
198
|
|
|
|
|
|
|
# Response parsers (no I/O) |
|
199
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
sub _decode { |
|
202
|
9
|
|
|
9
|
|
16
|
my ( $self, $res, $backend ) = @_; |
|
203
|
9
|
|
|
|
|
23
|
my $code = $res->code; |
|
204
|
9
|
|
33
|
|
|
83
|
my $body = $res->decoded_content // $res->content // ''; |
|
|
|
|
50
|
|
|
|
|
|
205
|
9
|
50
|
|
|
|
1121
|
my $data = eval { length $body ? $self->_json->decode($body) : undef }; |
|
|
9
|
|
|
|
|
232
|
|
|
206
|
9
|
100
|
|
|
|
112
|
if ( !$res->is_success ) { |
|
207
|
1
|
|
50
|
|
|
9
|
die WWW::Crawl4AI::Error->new( |
|
208
|
|
|
|
|
|
|
type => 'api', |
|
209
|
|
|
|
|
|
|
message => "Crawl4AI HTTP $code: " . ( $res->message // 'error' ), |
|
210
|
|
|
|
|
|
|
response => $res, |
|
211
|
|
|
|
|
|
|
data => $data, |
|
212
|
|
|
|
|
|
|
status_code => $code, |
|
213
|
|
|
|
|
|
|
backend => $backend, |
|
214
|
|
|
|
|
|
|
); |
|
215
|
|
|
|
|
|
|
} |
|
216
|
8
|
|
|
|
|
71
|
return $data; |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
sub parse_crawl_response { |
|
220
|
1
|
|
|
1
|
1
|
2594
|
my ( $self, $res, $backend ) = @_; |
|
221
|
1
|
|
|
|
|
3
|
my $data = $self->_decode( $res, $backend ); |
|
222
|
1
|
|
|
|
|
2
|
return [ map { $self->_normalize_page($_) } @{ $self->_result_list($data) } ]; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
4
|
|
|
223
|
|
|
|
|
|
|
} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub parse_md_response { |
|
227
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $res, $backend ) = @_; |
|
228
|
0
|
|
|
|
|
0
|
my $data = $self->_decode( $res, $backend ); |
|
229
|
0
|
0
|
|
|
|
0
|
return $data if !ref $data; |
|
230
|
|
|
|
|
|
|
# /md commonly returns { markdown => ... } or { result => ... } |
|
231
|
0
|
|
0
|
|
|
0
|
return $data->{markdown} // $data->{result} // $data->{md} // $data; |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
232
|
|
|
|
|
|
|
} |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
sub parse_job_submit_response { |
|
235
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $res, $backend ) = @_; |
|
236
|
0
|
|
|
|
|
0
|
my $data = $self->_decode( $res, $backend ); |
|
237
|
0
|
|
0
|
|
|
0
|
my $id = $data->{task_id} // $data->{job_id} // $data->{id}; |
|
|
|
|
0
|
|
|
|
|
|
238
|
0
|
0
|
|
|
|
0
|
croak "Crawl4AI job submit returned no task_id" unless defined $id; |
|
239
|
0
|
|
|
|
|
0
|
return { task_id => $id, raw => $data }; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
sub parse_job_status_response { |
|
243
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $res, $backend ) = @_; |
|
244
|
0
|
|
|
|
|
0
|
my $data = $self->_decode( $res, $backend ); |
|
245
|
0
|
|
0
|
|
|
0
|
my $status = uc( $data->{status} // $data->{state} // 'UNKNOWN' ); |
|
|
|
|
0
|
|
|
|
|
|
246
|
0
|
|
|
|
|
0
|
my $pages; |
|
247
|
0
|
0
|
|
|
|
0
|
if ( $status eq 'COMPLETED' ) { |
|
248
|
0
|
|
0
|
|
|
0
|
my $results = $data->{results} // $data->{result} // $data->{data}; |
|
|
|
|
0
|
|
|
|
|
|
249
|
0
|
0
|
|
|
|
0
|
$pages = [ map { $self->_normalize_page($_) } @{ $self->_result_list($results) } ] |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
250
|
|
|
|
|
|
|
if defined $results; |
|
251
|
|
|
|
|
|
|
} |
|
252
|
0
|
0
|
|
|
|
0
|
if ( $status eq 'FAILED' ) { |
|
253
|
|
|
|
|
|
|
die WWW::Crawl4AI::Error->new( |
|
254
|
|
|
|
|
|
|
type => 'job', |
|
255
|
0
|
|
0
|
|
|
0
|
message => "Crawl4AI job failed: " . ( $data->{error} // $data->{detail} // 'unknown' ), |
|
|
|
|
0
|
|
|
|
|
|
256
|
|
|
|
|
|
|
data => $data, |
|
257
|
|
|
|
|
|
|
backend => $backend, |
|
258
|
|
|
|
|
|
|
); |
|
259
|
|
|
|
|
|
|
} |
|
260
|
0
|
|
|
|
|
0
|
return { status => $status, pages => $pages, raw => $data }; |
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
sub parse_health_response { |
|
264
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $res ) = @_; |
|
265
|
0
|
0
|
|
|
|
0
|
return 0 unless $res->is_success; |
|
266
|
0
|
|
0
|
|
|
0
|
my $data = eval { $self->_json->decode( $res->decoded_content // '' ) }; |
|
|
0
|
|
|
|
|
0
|
|
|
267
|
0
|
0
|
|
|
|
0
|
return $data if ref $data; |
|
268
|
0
|
|
|
|
|
0
|
return 1; |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub parse_screenshot_response { |
|
272
|
2
|
|
|
2
|
1
|
7377
|
my ( $self, $res, $backend ) = @_; |
|
273
|
2
|
|
|
|
|
7
|
return $self->_decode_b64_artifact( $self->_decode( $res, $backend ), 'screenshot', $backend ); |
|
274
|
|
|
|
|
|
|
} |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub parse_pdf_response { |
|
278
|
2
|
|
|
2
|
1
|
2537
|
my ( $self, $res, $backend ) = @_; |
|
279
|
2
|
|
|
|
|
8
|
return $self->_decode_b64_artifact( $self->_decode( $res, $backend ), 'pdf', $backend ); |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# /screenshot and /pdf return { success => bool, => base64 }. Decode to raw |
|
283
|
|
|
|
|
|
|
# bytes; raise a content error if the server reported nothing usable. |
|
284
|
|
|
|
|
|
|
sub _decode_b64_artifact { |
|
285
|
3
|
|
|
3
|
|
7
|
my ( $self, $data, $key, $backend ) = @_; |
|
286
|
3
|
50
|
|
|
|
9
|
my $b64 = ref $data eq 'HASH' ? $data->{$key} : undef; |
|
287
|
3
|
100
|
66
|
|
|
12
|
unless ( defined $b64 && length $b64 ) { |
|
288
|
1
|
|
|
|
|
10
|
die WWW::Crawl4AI::Error->new( |
|
289
|
|
|
|
|
|
|
type => 'content', |
|
290
|
|
|
|
|
|
|
message => "Crawl4AI returned no $key", |
|
291
|
|
|
|
|
|
|
data => $data, |
|
292
|
|
|
|
|
|
|
backend => $backend, |
|
293
|
|
|
|
|
|
|
); |
|
294
|
|
|
|
|
|
|
} |
|
295
|
2
|
|
|
|
|
9
|
return MIME::Base64::decode_base64($b64); |
|
296
|
|
|
|
|
|
|
} |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
sub parse_html_response { |
|
299
|
1
|
|
|
1
|
1
|
740
|
my ( $self, $res, $backend ) = @_; |
|
300
|
1
|
|
|
|
|
3
|
my $data = $self->_decode( $res, $backend ); |
|
301
|
1
|
50
|
|
|
|
6
|
return ref $data eq 'HASH' ? $data->{html} : $data; |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub parse_execute_js_response { |
|
306
|
1
|
|
|
1
|
1
|
630
|
my ( $self, $res, $backend ) = @_; |
|
307
|
1
|
|
|
|
|
3
|
my $data = $self->_decode( $res, $backend ); |
|
308
|
|
|
|
|
|
|
# /execute_js returns a single crawl result (same shape as one /crawl page) |
|
309
|
|
|
|
|
|
|
# with the script output added under js_execution_result. |
|
310
|
1
|
50
|
|
|
|
6
|
my $page = $self->_normalize_page( ref $data eq 'HASH' ? $data : { raw => $data } ); |
|
311
|
1
|
50
|
|
|
|
4
|
$page->{js_result} = ref $data eq 'HASH' ? $data->{js_execution_result} : undef; |
|
312
|
1
|
|
|
|
|
3
|
return $page; |
|
313
|
|
|
|
|
|
|
} |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
sub parse_llm_response { |
|
317
|
1
|
|
|
1
|
1
|
3883
|
my ( $self, $res, $backend ) = @_; |
|
318
|
1
|
|
|
|
|
3
|
my $data = $self->_decode( $res, $backend ); |
|
319
|
1
|
50
|
33
|
|
|
7
|
return ref $data eq 'HASH' ? ( $data->{answer} // $data->{result} // $data ) : $data; |
|
|
|
|
0
|
|
|
|
|
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
sub parse_token_response { |
|
324
|
1
|
|
|
1
|
1
|
1328
|
my ( $self, $res, $backend ) = @_; |
|
325
|
1
|
|
|
|
|
2
|
return $self->_decode( $res, $backend ); |
|
326
|
|
|
|
|
|
|
} |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Crawl4AI has returned the page list in several shapes across versions; |
|
330
|
|
|
|
|
|
|
# accept all of them rather than pin to one. |
|
331
|
|
|
|
|
|
|
sub _result_list { |
|
332
|
1
|
|
|
1
|
|
2
|
my ( $self, $data ) = @_; |
|
333
|
1
|
50
|
|
|
|
2
|
return [] unless defined $data; |
|
334
|
1
|
50
|
|
|
|
3
|
return $data if ref $data eq 'ARRAY'; |
|
335
|
1
|
50
|
|
|
|
4
|
if ( ref $data eq 'HASH' ) { |
|
336
|
1
|
|
|
|
|
3
|
for my $key (qw( results data result )) { |
|
337
|
1
|
50
|
|
|
|
3
|
my $v = $data->{$key} or next; |
|
338
|
1
|
50
|
|
|
|
4
|
return $v if ref $v eq 'ARRAY'; |
|
339
|
0
|
0
|
|
|
|
0
|
return [$v] if ref $v eq 'HASH'; |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
# Looks like a single page itself. |
|
342
|
0
|
0
|
0
|
|
|
0
|
return [$data] if exists $data->{markdown} || exists $data->{html} || exists $data->{success}; |
|
|
|
|
0
|
|
|
|
|
|
343
|
|
|
|
|
|
|
} |
|
344
|
0
|
|
|
|
|
0
|
return []; |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
sub _normalize_page { |
|
348
|
2
|
|
|
2
|
|
5
|
my ( $self, $page ) = @_; |
|
349
|
2
|
50
|
|
|
|
5
|
return { raw => $page } unless ref $page eq 'HASH'; |
|
350
|
2
|
|
50
|
|
|
20
|
my $meta = $page->{metadata} || {}; |
|
351
|
|
|
|
|
|
|
return { |
|
352
|
|
|
|
|
|
|
success => $page->{success}, |
|
353
|
|
|
|
|
|
|
url => $meta->{sourceURL} // $page->{url} // $meta->{url}, |
|
354
|
|
|
|
|
|
|
final_url => $page->{redirected_url} // $page->{url} // $meta->{url}, |
|
355
|
|
|
|
|
|
|
status_code => $page->{status_code} // $page->{status} // $meta->{statusCode}, |
|
356
|
|
|
|
|
|
|
markdown => $self->_extract_markdown($page), |
|
357
|
|
|
|
|
|
|
html => $page->{cleaned_html} // $page->{html}, |
|
358
|
|
|
|
|
|
|
raw_html => $page->{html}, |
|
359
|
|
|
|
|
|
|
title => $meta->{title}, |
|
360
|
|
|
|
|
|
|
links => $self->_extract_links($page), |
|
361
|
|
|
|
|
|
|
metadata => $meta, |
|
362
|
|
|
|
|
|
|
error => $page->{error_message} // $page->{error}, |
|
363
|
2
|
|
33
|
|
|
26
|
response_headers => $self->_lc_headers( $page->{response_headers} ), |
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
364
|
|
|
|
|
|
|
raw => $page, |
|
365
|
|
|
|
|
|
|
}; |
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# Lowercase all header keys for deterministic, case-insensitive matching by callers. |
|
369
|
|
|
|
|
|
|
sub _lc_headers { |
|
370
|
2
|
|
|
2
|
|
6
|
my ( $self, $h ) = @_; |
|
371
|
2
|
50
|
|
|
|
38
|
return {} unless ref $h eq 'HASH'; |
|
372
|
0
|
|
|
|
|
0
|
return { map { lc($_) => $h->{$_} } keys %$h }; |
|
|
0
|
|
|
|
|
0
|
|
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# Crawl4AI returns links as { internal => [...], external => [...] }, each entry |
|
376
|
|
|
|
|
|
|
# a hash with href/text/title (older servers may send bare strings). Normalize |
|
377
|
|
|
|
|
|
|
# to a stable { internal => [{href,text,title}], external => [...] } shape so the |
|
378
|
|
|
|
|
|
|
# Result can expose them without callers reaching into raw. |
|
379
|
|
|
|
|
|
|
sub _extract_links { |
|
380
|
2
|
|
|
2
|
|
5
|
my ( $self, $page ) = @_; |
|
381
|
2
|
|
|
|
|
4
|
my $links = $page->{links}; |
|
382
|
2
|
50
|
|
|
|
19
|
return { internal => [], external => [] } unless ref $links eq 'HASH'; |
|
383
|
|
|
|
|
|
|
return { |
|
384
|
|
|
|
|
|
|
internal => $self->_normalize_link_list( $links->{internal} ), |
|
385
|
0
|
|
|
|
|
0
|
external => $self->_normalize_link_list( $links->{external} ), |
|
386
|
|
|
|
|
|
|
}; |
|
387
|
|
|
|
|
|
|
} |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
sub _normalize_link_list { |
|
390
|
0
|
|
|
0
|
|
0
|
my ( $self, $list ) = @_; |
|
391
|
0
|
0
|
|
|
|
0
|
return [] unless ref $list eq 'ARRAY'; |
|
392
|
0
|
|
|
|
|
0
|
return [ map { $self->_normalize_link($_) } @$list ]; |
|
|
0
|
|
|
|
|
0
|
|
|
393
|
|
|
|
|
|
|
} |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
sub _normalize_link { |
|
396
|
0
|
|
|
0
|
|
0
|
my ( $self, $link ) = @_; |
|
397
|
0
|
0
|
|
|
|
0
|
return { href => $link } unless ref $link eq 'HASH'; |
|
398
|
0
|
|
|
|
|
0
|
my $text = $link->{text}; |
|
399
|
0
|
0
|
0
|
|
|
0
|
$text = undef unless defined $text && length $text; |
|
400
|
|
|
|
|
|
|
return { |
|
401
|
|
|
|
|
|
|
href => $link->{href}, |
|
402
|
|
|
|
|
|
|
text => $text, |
|
403
|
|
|
|
|
|
|
title => $link->{title}, |
|
404
|
0
|
|
|
|
|
0
|
}; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# markdown is a plain string on old servers and a structured object on new ones. |
|
408
|
|
|
|
|
|
|
# fit_markdown is preferred but is frequently an empty string (no content |
|
409
|
|
|
|
|
|
|
# filter matched), so skip empty candidates instead of stopping at the first |
|
410
|
|
|
|
|
|
|
# defined one. |
|
411
|
|
|
|
|
|
|
sub _extract_markdown { |
|
412
|
2
|
|
|
2
|
|
4
|
my ( $self, $page ) = @_; |
|
413
|
2
|
|
|
|
|
10
|
return resolve_markdown_chain( $page->{markdown} ); |
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
417
|
|
|
|
|
|
|
# I/O + retry |
|
418
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
sub do_request { |
|
421
|
0
|
|
|
0
|
1
|
|
my ( $self, $req, $backend ) = @_; |
|
422
|
0
|
0
|
|
|
|
|
croak "do_request needs an HTTP::Request" unless $self->is_request($req); |
|
423
|
0
|
|
|
|
|
|
my $max = $self->max_attempts; |
|
424
|
0
|
|
|
|
|
|
my $res; |
|
425
|
0
|
|
|
|
|
|
for my $attempt ( 1 .. $max ) { |
|
426
|
0
|
|
|
|
|
|
$res = $self->ua->request($req); |
|
427
|
0
|
0
|
|
|
|
|
last if $res->is_success; |
|
428
|
0
|
|
|
|
|
|
my $code = $res->code; |
|
429
|
0
|
|
0
|
|
|
|
my $transport = $code == 599 || ( ( $res->header('Client-Warning') // '' ) eq 'Internal response' ); |
|
430
|
0
|
|
0
|
|
|
|
my $retryable = $transport || $self->_retry_status_set->{$code}; |
|
431
|
0
|
0
|
0
|
|
|
|
if ( $retryable && $attempt < $max ) { |
|
432
|
0
|
|
0
|
|
|
|
my $delay = $self->retry_backoff->[ $attempt - 1 ] // $self->retry_backoff->[-1] // 1; |
|
|
|
|
0
|
|
|
|
|
|
433
|
0
|
0
|
|
|
|
|
if ( my $ra = $res->header('Retry-After') ) { |
|
434
|
0
|
0
|
|
|
|
|
$delay = $ra if $ra =~ /^\d+$/; |
|
435
|
|
|
|
|
|
|
} |
|
436
|
0
|
0
|
|
|
|
|
$self->on_retry->( $attempt, $delay, $res ) if $self->on_retry; |
|
437
|
0
|
|
|
|
|
|
$self->sleep_sub->($delay); |
|
438
|
0
|
|
|
|
|
|
next; |
|
439
|
|
|
|
|
|
|
} |
|
440
|
0
|
|
|
|
|
|
last; |
|
441
|
|
|
|
|
|
|
} |
|
442
|
0
|
0
|
0
|
|
|
|
if ( !$res->is_success && ( $res->code == 599 ) ) { |
|
443
|
0
|
|
0
|
|
|
|
die WWW::Crawl4AI::Error->new( |
|
|
|
|
0
|
|
|
|
|
|
444
|
|
|
|
|
|
|
type => 'transport', |
|
445
|
|
|
|
|
|
|
message => "Crawl4AI transport error: " . ( $res->content // $res->message // 'unreachable' ), |
|
446
|
|
|
|
|
|
|
response => $res, |
|
447
|
|
|
|
|
|
|
status_code => 0, |
|
448
|
|
|
|
|
|
|
backend => $backend, |
|
449
|
|
|
|
|
|
|
); |
|
450
|
|
|
|
|
|
|
} |
|
451
|
0
|
|
|
|
|
|
return $res; |
|
452
|
|
|
|
|
|
|
} |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
456
|
|
|
|
|
|
|
# Convenience (build + fire + parse) |
|
457
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub crawl { |
|
460
|
0
|
|
|
0
|
1
|
|
my ( $self, $request, $backend ) = @_; |
|
461
|
0
|
|
|
|
|
|
return $self->parse_crawl_response( $self->do_request( $self->crawl_request($request), $backend ), $backend ); |
|
462
|
|
|
|
|
|
|
} |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
sub md { |
|
466
|
0
|
|
|
0
|
1
|
|
my ( $self, $url, %opts ) = @_; |
|
467
|
0
|
0
|
|
|
|
|
my $request = $url->$_isa('WWW::Crawl4AI::Request') |
|
468
|
|
|
|
|
|
|
? $url |
|
469
|
|
|
|
|
|
|
: WWW::Crawl4AI::Request->new( urls => $url, %opts ); |
|
470
|
0
|
|
|
|
|
|
return $self->parse_md_response( $self->do_request( $self->md_request($request) ) ); |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
sub job_submit { |
|
474
|
0
|
|
|
0
|
1
|
|
my ( $self, $request ) = @_; |
|
475
|
0
|
|
|
|
|
|
return $self->parse_job_submit_response( $self->do_request( $self->job_submit_request($request) ) ); |
|
476
|
|
|
|
|
|
|
} |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
sub job_status { |
|
479
|
0
|
|
|
0
|
1
|
|
my ( $self, $task_id ) = @_; |
|
480
|
0
|
|
|
|
|
|
return $self->parse_job_status_response( $self->do_request( $self->job_status_request($task_id) ) ); |
|
481
|
|
|
|
|
|
|
} |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
sub health { |
|
484
|
0
|
|
|
0
|
1
|
|
my ( $self ) = @_; |
|
485
|
0
|
|
|
|
|
|
my $res = eval { $self->do_request( $self->health_request ) }; |
|
|
0
|
|
|
|
|
|
|
|
486
|
0
|
0
|
0
|
|
|
|
return 0 if $@ || !$res; |
|
487
|
0
|
0
|
|
|
|
|
return $self->parse_health_response($res) ? 1 : 0; |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub screenshot { |
|
491
|
0
|
|
|
0
|
1
|
|
my ( $self, $url, %opts ) = @_; |
|
492
|
0
|
|
|
|
|
|
return $self->parse_screenshot_response( $self->do_request( $self->screenshot_request( $url, %opts ) ) ); |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
sub pdf { |
|
497
|
0
|
|
|
0
|
1
|
|
my ( $self, $url, %opts ) = @_; |
|
498
|
0
|
|
|
|
|
|
return $self->parse_pdf_response( $self->do_request( $self->pdf_request( $url, %opts ) ) ); |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
sub html { |
|
502
|
0
|
|
|
0
|
1
|
|
my ( $self, $url ) = @_; |
|
503
|
0
|
|
|
|
|
|
return $self->parse_html_response( $self->do_request( $self->html_request($url) ) ); |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
sub execute_js { |
|
507
|
0
|
|
|
0
|
1
|
|
my ( $self, $url, $scripts ) = @_; |
|
508
|
0
|
|
|
|
|
|
return $self->parse_execute_js_response( $self->do_request( $self->execute_js_request( $url, $scripts ) ) ); |
|
509
|
|
|
|
|
|
|
} |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
sub llm { |
|
512
|
0
|
|
|
0
|
1
|
|
my ( $self, $url, $query, %opts ) = @_; |
|
513
|
0
|
|
|
|
|
|
return $self->parse_llm_response( $self->do_request( $self->llm_request( $url, $query, %opts ) ) ); |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub token { |
|
517
|
0
|
|
|
0
|
1
|
|
my ( $self, $email, %opts ) = @_; |
|
518
|
0
|
|
|
|
|
|
return $self->parse_token_response( $self->do_request( $self->token_request( $email, %opts ) ) ); |
|
519
|
|
|
|
|
|
|
} |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
1; |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
__END__ |