File Coverage

blib/lib/WWW/Crawl4AI/Client.pm
Criterion Covered Total %
statement 139 238 58.4
branch 49 122 40.1
condition 26 136 19.1
subroutine 38 60 63.3
pod 35 35 100.0
total 287 591 48.5


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Client;
2             # ABSTRACT: UA-agnostic REST client for the Crawl4AI Docker API
3 4     4   120645 use Moo;
  4         5384  
  4         19  
4 4     4   2104 use Carp qw( croak );
  4         9  
  4         221  
5 4     4   1936 use HTTP::Request ();
  4         31762  
  4         83  
6 4     4   896 use JSON::MaybeXS ();
  4         23326  
  4         84  
7 4     4   20 use URI ();
  4         5  
  4         78  
8 4     4   17 use URI::Escape ();
  4         5  
  4         49  
9 4     4   1632 use MIME::Base64 ();
  4         2054  
  4         131  
10 4     4   1857 use Safe::Isa;
  4         2268  
  4         587  
11 4     4   1605 use WWW::Crawl4AI::Markdown qw( resolve_markdown_chain );
  4         13  
  4         220  
12 4     4   1477 use WWW::Crawl4AI::Request ();
  4         13  
  4         119  
13 4     4   1932 use WWW::Crawl4AI::Error ();
  4         26  
  4         15179  
14              
15             our $VERSION = '0.001';
16              
17              
18             has base_url => (
19             is => 'ro',
20             default => sub { $ENV{CRAWL4AI_URL} || $ENV{CRAWL4AI_BASE_URL} || 'http://localhost:11235' },
21             );
22              
23              
24             has api_token => (
25             is => 'ro',
26             default => sub { $ENV{CRAWL4AI_API_TOKEN} },
27             );
28              
29              
30             has _json => (
31             is => 'lazy',
32             default => sub { JSON::MaybeXS->new( utf8 => 1, canonical => 1, convert_blessed => 1 ) },
33             );
34              
35             has user_agent_string => (
36             is => 'ro',
37             default => sub { "WWW-Crawl4AI/$VERSION" },
38             );
39              
40              
41             has timeout => ( is => 'ro', default => sub { 120 } );
42              
43              
44             has ua => ( is => 'lazy' );
45              
46              
47             has max_attempts => ( is => 'ro', default => sub { 3 } );
48              
49              
50             has retry_backoff => ( is => 'ro', default => sub { [ 1, 2, 4 ] } );
51              
52              
53             has retry_statuses => ( is => 'ro', default => sub { [ 429, 502, 503, 504 ] } );
54              
55              
56             has on_retry => ( is => 'ro' );
57              
58              
59             has sleep_sub => (
60             is => 'ro',
61             default => sub {
62             require Time::HiRes;
63             return sub { Time::HiRes::sleep( $_[0] ) };
64             },
65             );
66              
67              
68             has _retry_status_set => ( is => 'lazy' );
69             sub _build__retry_status_set {
70 0     0   0 return { map { $_ => 1 } @{ $_[0]->retry_statuses } };
  0         0  
  0         0  
71             }
72              
73             sub _build_ua {
74 0     0   0 my ( $self ) = @_;
75 0         0 require LWP::UserAgent;
76 0         0 return LWP::UserAgent->new(
77             agent => $self->user_agent_string,
78             timeout => $self->timeout,
79             );
80             }
81              
82 0     0 1 0 sub is_request { $_[1]->$_isa('HTTP::Request') }
83              
84              
85             sub _uri {
86 15     15   23 my ( $self, $path ) = @_;
87 15         98 ( my $base = $self->base_url ) =~ s{/+$}{};
88 15         75 return URI->new( $base . $path );
89             }
90              
91             sub _request {
92 15     15   35 my ( $self, $method, $path, $body ) = @_;
93 15         37 my $req = HTTP::Request->new( $method => $self->_uri($path) );
94 15         16632 $req->header( Accept => 'application/json' );
95 15 100       794 $req->header( Authorization => 'Bearer ' . $self->api_token ) if defined $self->api_token;
96 15 100       204 if ( defined $body ) {
97 12         27 $req->header( 'Content-Type' => 'application/json' );
98 12         660 $req->content( $self->_json->encode($body) );
99             }
100 15         429 return $req;
101             }
102              
103             #----------------------------------------------------------------------
104             # Request builders (no I/O)
105             #----------------------------------------------------------------------
106              
107             sub crawl_request {
108 4     4 1 2350 my ( $self, $request ) = @_;
109 4         13 return $self->_request( POST => '/crawl', $self->_payload( $request, 'to_crawl_payload' ) );
110             }
111              
112              
113             sub md_request {
114 0     0 1 0 my ( $self, $request ) = @_;
115 0         0 return $self->_request( POST => '/md', $self->_payload( $request, 'to_md_payload' ) );
116             }
117              
118             sub job_submit_request {
119 1     1 1 2 my ( $self, $request ) = @_;
120 1         5 return $self->_request( POST => '/crawl/job', $self->_payload( $request, 'to_crawl_payload' ) );
121             }
122              
123             sub job_status_request {
124 1     1 1 656 my ( $self, $task_id ) = @_;
125 1 50 33     7 croak "job_status_request needs a task_id" unless defined $task_id && length $task_id;
126 1         4 return $self->_request( GET => '/crawl/job/' . $task_id );
127             }
128              
129 1     1 1 1274 sub health_request { $_[0]->_request( GET => '/health' ) }
130              
131             sub screenshot_request {
132 2     2 1 4332 my ( $self, $url, %opts ) = @_;
133 2 50 33     13 croak "screenshot_request needs a url" unless defined $url && length $url;
134 2         6 my %body = ( url => $url );
135 2 100       7 $body{screenshot_wait_for} = $opts{wait_for} if defined $opts{wait_for};
136 2 100       5 $body{output_path} = $opts{output_path} if defined $opts{output_path};
137             $body{wait_for_images} =
138             $opts{wait_for_images} ? WWW::Crawl4AI::Request::JSON_true() : WWW::Crawl4AI::Request::JSON_false()
139 2 50       9 if exists $opts{wait_for_images};
    100          
140 2         13 return $self->_request( POST => '/screenshot', \%body );
141             }
142              
143              
144             sub pdf_request {
145 1     1 1 2546 my ( $self, $url, %opts ) = @_;
146 1 50 33     6 croak "pdf_request needs a url" unless defined $url && length $url;
147 1         3 my %body = ( url => $url );
148 1 50       4 $body{output_path} = $opts{output_path} if defined $opts{output_path};
149 1         3 return $self->_request( POST => '/pdf', \%body );
150             }
151              
152             sub html_request {
153 1     1 1 2602 my ( $self, $url ) = @_;
154 1 50 33     7 croak "html_request needs a url" unless defined $url && length $url;
155 1         4 return $self->_request( POST => '/html', { url => $url } );
156             }
157              
158             sub execute_js_request {
159 4     4 1 8150 my ( $self, $url, $scripts ) = @_;
160 4 50 33     20 croak "execute_js_request needs a url" unless defined $url && length $url;
161 4 100       12 $scripts = [$scripts] unless ref $scripts eq 'ARRAY';
162 4 100 100     244 croak "execute_js_request needs at least one script" unless @$scripts && defined $scripts->[0];
163 2         7 return $self->_request( POST => '/execute_js', { url => $url, scripts => $scripts } );
164             }
165              
166             sub llm_request {
167 1     1 1 2306 my ( $self, $url, $query, %opts ) = @_;
168 1 50 33     6 croak "llm_request needs a url" unless defined $url && length $url;
169 1 50 33     21 croak "llm_request needs a query" unless defined $query && length $query;
170             # The page URL is a path segment ({url:path}); escape it so its own scheme and
171             # query string don't merge into ours. The question and tuning go in the query.
172 1         6 my $path = URI->new( '/llm/' . URI::Escape::uri_escape_utf8($url) );
173             $path->query_form(
174             q => $query,
175             ( defined $opts{provider} ? ( provider => $opts{provider} ) : () ),
176             ( defined $opts{temperature} ? ( temperature => $opts{temperature} ) : () ),
177 1 50       281 ( defined $opts{base_url} ? ( base_url => $opts{base_url} ) : () ),
    50          
    50          
178             );
179 1         187 return $self->_request( GET => $path->as_string );
180             }
181              
182             sub token_request {
183 1     1 1 2806 my ( $self, $email, %opts ) = @_;
184 1 50 33     6 croak "token_request needs an email" unless defined $email && length $email;
185 1         3 my %body = ( email => $email );
186 1 50       11 $body{api_token} = $opts{api_token} if defined $opts{api_token};
187 1         3 return $self->_request( POST => '/token', \%body );
188             }
189              
190             sub _payload {
191 5     5   9 my ( $self, $request, $method ) = @_;
192 5 100       16 return $request->$method if $request->$_isa('WWW::Crawl4AI::Request');
193 1 50       12 return $request if ref $request eq 'HASH';
194 0         0 croak "expected a WWW::Crawl4AI::Request or hashref payload";
195             }
196              
197             #----------------------------------------------------------------------
198             # Response parsers (no I/O)
199             #----------------------------------------------------------------------
200              
201             sub _decode {
202 9     9   16 my ( $self, $res, $backend ) = @_;
203 9         23 my $code = $res->code;
204 9   33     83 my $body = $res->decoded_content // $res->content // '';
      50        
205 9 50       1121 my $data = eval { length $body ? $self->_json->decode($body) : undef };
  9         232  
206 9 100       112 if ( !$res->is_success ) {
207 1   50     9 die WWW::Crawl4AI::Error->new(
208             type => 'api',
209             message => "Crawl4AI HTTP $code: " . ( $res->message // 'error' ),
210             response => $res,
211             data => $data,
212             status_code => $code,
213             backend => $backend,
214             );
215             }
216 8         71 return $data;
217             }
218              
219             sub parse_crawl_response {
220 1     1 1 2594 my ( $self, $res, $backend ) = @_;
221 1         3 my $data = $self->_decode( $res, $backend );
222 1         2 return [ map { $self->_normalize_page($_) } @{ $self->_result_list($data) } ];
  1         3  
  1         4  
223             }
224              
225              
226             sub parse_md_response {
227 0     0 1 0 my ( $self, $res, $backend ) = @_;
228 0         0 my $data = $self->_decode( $res, $backend );
229 0 0       0 return $data if !ref $data;
230             # /md commonly returns { markdown => ... } or { result => ... }
231 0   0     0 return $data->{markdown} // $data->{result} // $data->{md} // $data;
      0        
      0        
232             }
233              
234             sub parse_job_submit_response {
235 0     0 1 0 my ( $self, $res, $backend ) = @_;
236 0         0 my $data = $self->_decode( $res, $backend );
237 0   0     0 my $id = $data->{task_id} // $data->{job_id} // $data->{id};
      0        
238 0 0       0 croak "Crawl4AI job submit returned no task_id" unless defined $id;
239 0         0 return { task_id => $id, raw => $data };
240             }
241              
242             sub parse_job_status_response {
243 0     0 1 0 my ( $self, $res, $backend ) = @_;
244 0         0 my $data = $self->_decode( $res, $backend );
245 0   0     0 my $status = uc( $data->{status} // $data->{state} // 'UNKNOWN' );
      0        
246 0         0 my $pages;
247 0 0       0 if ( $status eq 'COMPLETED' ) {
248 0   0     0 my $results = $data->{results} // $data->{result} // $data->{data};
      0        
249 0 0       0 $pages = [ map { $self->_normalize_page($_) } @{ $self->_result_list($results) } ]
  0         0  
  0         0  
250             if defined $results;
251             }
252 0 0       0 if ( $status eq 'FAILED' ) {
253             die WWW::Crawl4AI::Error->new(
254             type => 'job',
255 0   0     0 message => "Crawl4AI job failed: " . ( $data->{error} // $data->{detail} // 'unknown' ),
      0        
256             data => $data,
257             backend => $backend,
258             );
259             }
260 0         0 return { status => $status, pages => $pages, raw => $data };
261             }
262              
263             sub parse_health_response {
264 0     0 1 0 my ( $self, $res ) = @_;
265 0 0       0 return 0 unless $res->is_success;
266 0   0     0 my $data = eval { $self->_json->decode( $res->decoded_content // '' ) };
  0         0  
267 0 0       0 return $data if ref $data;
268 0         0 return 1;
269             }
270              
271             sub parse_screenshot_response {
272 2     2 1 7377 my ( $self, $res, $backend ) = @_;
273 2         7 return $self->_decode_b64_artifact( $self->_decode( $res, $backend ), 'screenshot', $backend );
274             }
275              
276              
277             sub parse_pdf_response {
278 2     2 1 2537 my ( $self, $res, $backend ) = @_;
279 2         8 return $self->_decode_b64_artifact( $self->_decode( $res, $backend ), 'pdf', $backend );
280             }
281              
282             # /screenshot and /pdf return { success => bool, => base64 }. Decode to raw
283             # bytes; raise a content error if the server reported nothing usable.
284             sub _decode_b64_artifact {
285 3     3   7 my ( $self, $data, $key, $backend ) = @_;
286 3 50       9 my $b64 = ref $data eq 'HASH' ? $data->{$key} : undef;
287 3 100 66     12 unless ( defined $b64 && length $b64 ) {
288 1         10 die WWW::Crawl4AI::Error->new(
289             type => 'content',
290             message => "Crawl4AI returned no $key",
291             data => $data,
292             backend => $backend,
293             );
294             }
295 2         9 return MIME::Base64::decode_base64($b64);
296             }
297              
298             sub parse_html_response {
299 1     1 1 740 my ( $self, $res, $backend ) = @_;
300 1         3 my $data = $self->_decode( $res, $backend );
301 1 50       6 return ref $data eq 'HASH' ? $data->{html} : $data;
302             }
303              
304              
305             sub parse_execute_js_response {
306 1     1 1 630 my ( $self, $res, $backend ) = @_;
307 1         3 my $data = $self->_decode( $res, $backend );
308             # /execute_js returns a single crawl result (same shape as one /crawl page)
309             # with the script output added under js_execution_result.
310 1 50       6 my $page = $self->_normalize_page( ref $data eq 'HASH' ? $data : { raw => $data } );
311 1 50       4 $page->{js_result} = ref $data eq 'HASH' ? $data->{js_execution_result} : undef;
312 1         3 return $page;
313             }
314              
315              
316             sub parse_llm_response {
317 1     1 1 3883 my ( $self, $res, $backend ) = @_;
318 1         3 my $data = $self->_decode( $res, $backend );
319 1 50 33     7 return ref $data eq 'HASH' ? ( $data->{answer} // $data->{result} // $data ) : $data;
      0        
320             }
321              
322              
323             sub parse_token_response {
324 1     1 1 1328 my ( $self, $res, $backend ) = @_;
325 1         2 return $self->_decode( $res, $backend );
326             }
327              
328              
329             # Crawl4AI has returned the page list in several shapes across versions;
330             # accept all of them rather than pin to one.
331             sub _result_list {
332 1     1   2 my ( $self, $data ) = @_;
333 1 50       2 return [] unless defined $data;
334 1 50       3 return $data if ref $data eq 'ARRAY';
335 1 50       4 if ( ref $data eq 'HASH' ) {
336 1         3 for my $key (qw( results data result )) {
337 1 50       3 my $v = $data->{$key} or next;
338 1 50       4 return $v if ref $v eq 'ARRAY';
339 0 0       0 return [$v] if ref $v eq 'HASH';
340             }
341             # Looks like a single page itself.
342 0 0 0     0 return [$data] if exists $data->{markdown} || exists $data->{html} || exists $data->{success};
      0        
343             }
344 0         0 return [];
345             }
346              
347             sub _normalize_page {
348 2     2   5 my ( $self, $page ) = @_;
349 2 50       5 return { raw => $page } unless ref $page eq 'HASH';
350 2   50     20 my $meta = $page->{metadata} || {};
351             return {
352             success => $page->{success},
353             url => $meta->{sourceURL} // $page->{url} // $meta->{url},
354             final_url => $page->{redirected_url} // $page->{url} // $meta->{url},
355             status_code => $page->{status_code} // $page->{status} // $meta->{statusCode},
356             markdown => $self->_extract_markdown($page),
357             html => $page->{cleaned_html} // $page->{html},
358             raw_html => $page->{html},
359             title => $meta->{title},
360             links => $self->_extract_links($page),
361             metadata => $meta,
362             error => $page->{error_message} // $page->{error},
363 2   33     26 response_headers => $self->_lc_headers( $page->{response_headers} ),
      33        
      33        
      33        
      33        
      0        
      66        
      33        
364             raw => $page,
365             };
366             }
367              
368             # Lowercase all header keys for deterministic, case-insensitive matching by callers.
369             sub _lc_headers {
370 2     2   6 my ( $self, $h ) = @_;
371 2 50       38 return {} unless ref $h eq 'HASH';
372 0         0 return { map { lc($_) => $h->{$_} } keys %$h };
  0         0  
373             }
374              
375             # Crawl4AI returns links as { internal => [...], external => [...] }, each entry
376             # a hash with href/text/title (older servers may send bare strings). Normalize
377             # to a stable { internal => [{href,text,title}], external => [...] } shape so the
378             # Result can expose them without callers reaching into raw.
379             sub _extract_links {
380 2     2   5 my ( $self, $page ) = @_;
381 2         4 my $links = $page->{links};
382 2 50       19 return { internal => [], external => [] } unless ref $links eq 'HASH';
383             return {
384             internal => $self->_normalize_link_list( $links->{internal} ),
385 0         0 external => $self->_normalize_link_list( $links->{external} ),
386             };
387             }
388              
389             sub _normalize_link_list {
390 0     0   0 my ( $self, $list ) = @_;
391 0 0       0 return [] unless ref $list eq 'ARRAY';
392 0         0 return [ map { $self->_normalize_link($_) } @$list ];
  0         0  
393             }
394              
395             sub _normalize_link {
396 0     0   0 my ( $self, $link ) = @_;
397 0 0       0 return { href => $link } unless ref $link eq 'HASH';
398 0         0 my $text = $link->{text};
399 0 0 0     0 $text = undef unless defined $text && length $text;
400             return {
401             href => $link->{href},
402             text => $text,
403             title => $link->{title},
404 0         0 };
405             }
406              
407             # markdown is a plain string on old servers and a structured object on new ones.
408             # fit_markdown is preferred but is frequently an empty string (no content
409             # filter matched), so skip empty candidates instead of stopping at the first
410             # defined one.
411             sub _extract_markdown {
412 2     2   4 my ( $self, $page ) = @_;
413 2         10 return resolve_markdown_chain( $page->{markdown} );
414             }
415              
416             #----------------------------------------------------------------------
417             # I/O + retry
418             #----------------------------------------------------------------------
419              
420             sub do_request {
421 0     0 1   my ( $self, $req, $backend ) = @_;
422 0 0         croak "do_request needs an HTTP::Request" unless $self->is_request($req);
423 0           my $max = $self->max_attempts;
424 0           my $res;
425 0           for my $attempt ( 1 .. $max ) {
426 0           $res = $self->ua->request($req);
427 0 0         last if $res->is_success;
428 0           my $code = $res->code;
429 0   0       my $transport = $code == 599 || ( ( $res->header('Client-Warning') // '' ) eq 'Internal response' );
430 0   0       my $retryable = $transport || $self->_retry_status_set->{$code};
431 0 0 0       if ( $retryable && $attempt < $max ) {
432 0   0       my $delay = $self->retry_backoff->[ $attempt - 1 ] // $self->retry_backoff->[-1] // 1;
      0        
433 0 0         if ( my $ra = $res->header('Retry-After') ) {
434 0 0         $delay = $ra if $ra =~ /^\d+$/;
435             }
436 0 0         $self->on_retry->( $attempt, $delay, $res ) if $self->on_retry;
437 0           $self->sleep_sub->($delay);
438 0           next;
439             }
440 0           last;
441             }
442 0 0 0       if ( !$res->is_success && ( $res->code == 599 ) ) {
443 0   0       die WWW::Crawl4AI::Error->new(
      0        
444             type => 'transport',
445             message => "Crawl4AI transport error: " . ( $res->content // $res->message // 'unreachable' ),
446             response => $res,
447             status_code => 0,
448             backend => $backend,
449             );
450             }
451 0           return $res;
452             }
453              
454              
455             #----------------------------------------------------------------------
456             # Convenience (build + fire + parse)
457             #----------------------------------------------------------------------
458              
459             sub crawl {
460 0     0 1   my ( $self, $request, $backend ) = @_;
461 0           return $self->parse_crawl_response( $self->do_request( $self->crawl_request($request), $backend ), $backend );
462             }
463              
464              
465             sub md {
466 0     0 1   my ( $self, $url, %opts ) = @_;
467 0 0         my $request = $url->$_isa('WWW::Crawl4AI::Request')
468             ? $url
469             : WWW::Crawl4AI::Request->new( urls => $url, %opts );
470 0           return $self->parse_md_response( $self->do_request( $self->md_request($request) ) );
471             }
472              
473             sub job_submit {
474 0     0 1   my ( $self, $request ) = @_;
475 0           return $self->parse_job_submit_response( $self->do_request( $self->job_submit_request($request) ) );
476             }
477              
478             sub job_status {
479 0     0 1   my ( $self, $task_id ) = @_;
480 0           return $self->parse_job_status_response( $self->do_request( $self->job_status_request($task_id) ) );
481             }
482              
483             sub health {
484 0     0 1   my ( $self ) = @_;
485 0           my $res = eval { $self->do_request( $self->health_request ) };
  0            
486 0 0 0       return 0 if $@ || !$res;
487 0 0         return $self->parse_health_response($res) ? 1 : 0;
488             }
489              
490             sub screenshot {
491 0     0 1   my ( $self, $url, %opts ) = @_;
492 0           return $self->parse_screenshot_response( $self->do_request( $self->screenshot_request( $url, %opts ) ) );
493             }
494              
495              
496             sub pdf {
497 0     0 1   my ( $self, $url, %opts ) = @_;
498 0           return $self->parse_pdf_response( $self->do_request( $self->pdf_request( $url, %opts ) ) );
499             }
500              
501             sub html {
502 0     0 1   my ( $self, $url ) = @_;
503 0           return $self->parse_html_response( $self->do_request( $self->html_request($url) ) );
504             }
505              
506             sub execute_js {
507 0     0 1   my ( $self, $url, $scripts ) = @_;
508 0           return $self->parse_execute_js_response( $self->do_request( $self->execute_js_request( $url, $scripts ) ) );
509             }
510              
511             sub llm {
512 0     0 1   my ( $self, $url, $query, %opts ) = @_;
513 0           return $self->parse_llm_response( $self->do_request( $self->llm_request( $url, $query, %opts ) ) );
514             }
515              
516             sub token {
517 0     0 1   my ( $self, $email, %opts ) = @_;
518 0           return $self->parse_token_response( $self->do_request( $self->token_request( $email, %opts ) ) );
519             }
520              
521              
522             1;
523              
524             __END__