File Coverage

blib/lib/Net/Async/Firecrawl.pm
Criterion Covered Total %
statement 180 201 89.5
branch 41 68 60.2
condition 18 37 48.6
subroutine 42 46 91.3
pod 11 11 100.0
total 292 363 80.4


line stmt bran cond sub pod time code
1             package Net::Async::Firecrawl;
2             # ABSTRACT: IO::Async Firecrawl v2 client with flow helpers
3 5     5   718233 use strict;
  5         7  
  5         159  
4 5     5   33 use warnings;
  5         10  
  5         251  
5 5     5   25 use parent 'IO::Async::Notifier';
  5         6  
  5         40  
6              
7 5     5   33085 use Carp qw( croak );
  5         13  
  5         218  
8 5     5   2687 use WWW::Firecrawl ();
  5         240626  
  5         130  
9 5     5   32 use WWW::Firecrawl::Error ();
  5         6  
  5         62  
10 5     5   2859 use Net::Async::HTTP ();
  5         340992  
  5         166  
11 5     5   36 use Future ();
  5         12  
  5         115  
12 5     5   19 use Future::Utils qw( repeat );
  5         8  
  5         3456  
13              
14             our $VERSION = '0.001';
15              
16             sub _init {
17 14     14   962325 my ( $self, $args ) = @_;
18 14         65 $self->SUPER::_init($args);
19             $self->{firecrawl} ||= WWW::Firecrawl->new(
20             ( exists $args->{base_url} ? ( base_url => delete $args->{base_url} ) : () ),
21             ( exists $args->{api_key} ? ( api_key => delete $args->{api_key} ) : () ),
22 14 50 33     438 ( exists $args->{api_version} ? ( api_version => delete $args->{api_version} ) : () ),
    50          
    50          
23             );
24 14 100       10857 $self->{poll_interval} = exists $args->{poll_interval} ? delete $args->{poll_interval} : 3;
25 14         25 $self->{http} = delete $args->{http};
26 14 50       43 $self->{firecrawl} = delete $args->{firecrawl} if exists $args->{firecrawl};
27 14         47 $self->{delay_sub} = delete $args->{delay_sub};
28 14         34 return;
29             }
30              
31             sub configure_unknown {
32 0     0 1 0 my ( $self, %args ) = @_;
33 0         0 for my $k (qw( base_url api_key api_version poll_interval firecrawl http delay_sub )) {
34 0         0 delete $args{$k};
35             }
36 0 0       0 return unless %args;
37 0         0 croak "Unknown configuration keys: ".join(',', sort keys %args);
38             }
39              
40 42     42 1 74 sub firecrawl { $_[0]->{firecrawl} }
41 5 50   5 1 20 sub poll_interval { @_ > 1 ? ($_[0]->{poll_interval} = $_[1]) : $_[0]->{poll_interval} }
42              
43             sub http {
44 7     7 1 8 my ( $self ) = @_;
45 7 50       30 return $self->{http} if $self->{http};
46 0         0 my $http = Net::Async::HTTP->new(
47             user_agent => $self->firecrawl->user_agent_string,
48             max_connections_per_host => 4,
49             );
50 0         0 $self->add_child($http);
51 0         0 return $self->{http} = $http;
52             }
53              
54             sub _on_added_to_loop {
55 0     0   0 my ( $self, $loop ) = @_;
56 0 0       0 $self->SUPER::_on_added_to_loop($loop) if $self->can('SUPER::_on_added_to_loop');
57             # Lazy-build http so it's parented properly
58 0         0 $self->http;
59             }
60              
61             #----------------------------------------------------------------------
62             # Generic request dispatch
63             #----------------------------------------------------------------------
64              
65             sub do_request {
66 3     3 1 6 my ( $self, $request ) = @_;
67 3 50       7 croak "do_request requires HTTP::Request" unless $self->firecrawl->is_request($request);
68 3         50 return $self->_do_request_with_retry($request, 1);
69             }
70              
71             sub _delay_future {
72 6     6   10 my ( $self, $seconds ) = @_;
73 6 100       20 return $self->{delay_sub}->($seconds) if $self->{delay_sub};
74 2         7 return $self->loop->delay_future( after => $seconds );
75             }
76              
77             sub _do_request_with_retry {
78 7     7   12 my ( $self, $request, $attempt ) = @_;
79 7         13 my $fc = $self->firecrawl;
80 7         34 my $max = $fc->max_attempts;
81             return $self->http->do_request( request => $request )->then(sub {
82 7     7   502 my ( $response ) = @_;
83 7         17 my ( $err, $retryable ) = $fc->_classify_response( $response, $attempt );
84 7 100       2135 return Future->done($response) unless $err;
85 5 100 66     39 if ( $retryable && $attempt < $max ) {
86 4         10 my $delay = $fc->_retry_delay( $response, $attempt );
87 4 50       231 if ( my $cb = $fc->on_retry ) {
88 0         0 $cb->( $attempt, $delay, $err );
89             }
90             return $self->_delay_future($delay)->then(sub {
91 4         312 $self->_do_request_with_retry( $request, $attempt + 1 );
92 4         10 });
93             }
94 1         9 return Future->fail($err, 'firecrawl', $attempt);
95 7         14 });
96             }
97              
98             # Build a Future-returning wrapper named $name around a WWW::Firecrawl
99             # request builder + response parser pair.
100             sub _install_wrapper {
101 160     160   210 my ( $class, $name, $opts ) = @_;
102 160   100     363 $opts ||= {};
103 160   66     293 my $builder = $opts->{builder} || "${name}_request";
104 160   66     277 my $parser = $opts->{parser} || "parse_${name}_response";
105              
106 5     5   32 no strict 'refs';
  5         8  
  5         8653  
107 160         629 *{"${class}::${name}"} = sub {
108 27     27   519 my ( $self, @args ) = @_;
109 27         64 my $fc = $self->firecrawl;
110 27         103 my $req = $fc->$builder(@args);
111             return $self->do_request($req)->then(sub {
112 26     26   4344 my $response = $_[0];
113 26         32 my $data = eval { $fc->$parser($response) };
  26         87  
114 26 100       6698 if ( my $e = $@ ) {
115 2 50 33     51 my $err = ref $e && $e->isa('WWW::Firecrawl::Error')
116             ? $e
117             : WWW::Firecrawl::Error->new( type => 'api', message => "$e", response => $response );
118 2         15 return Future->fail($err, 'firecrawl');
119             }
120 24         93 return Future->done($data);
121 27         34491 });
122 160         449 };
123             }
124              
125             # Declarative endpoint list: one Future-returning method per endpoint.
126             my @ENDPOINTS = qw(
127             scrape
128             crawl
129             crawl_status
130             crawl_cancel
131             crawl_errors
132             crawl_active
133             crawl_params_preview
134             map
135             search
136             batch_scrape
137             batch_scrape_status
138             batch_scrape_cancel
139             batch_scrape_errors
140             extract
141             extract_status
142             agent
143             agent_status
144             agent_cancel
145             browser_create
146             browser_list
147             browser_delete
148             browser_execute
149             scrape_execute
150             scrape_browser_stop
151             credit_usage
152             credit_usage_historical
153             token_usage
154             token_usage_historical
155             queue_status
156             activity
157             );
158              
159             __PACKAGE__->_install_wrapper($_) for @ENDPOINTS;
160              
161             # Pagination-follow helpers — same parser as their base endpoint.
162             __PACKAGE__->_install_wrapper('crawl_status_next', {
163             builder => 'crawl_status_next_request',
164             parser => 'parse_crawl_status_response',
165             });
166             __PACKAGE__->_install_wrapper('batch_scrape_status_next', {
167             builder => 'batch_scrape_status_next_request',
168             parser => 'parse_batch_scrape_status_response',
169             });
170              
171             #----------------------------------------------------------------------
172             # Flow helpers
173             #----------------------------------------------------------------------
174              
175             sub _poll_until_done {
176 5     5   15 my ( $self, %args ) = @_;
177 5         9 my $status_cb = $args{status};
178 5   33     21 my $interval = $args{interval} || $self->poll_interval;
179 5 50       15 my $loop = $self->loop or croak "not added to a loop yet";
180              
181             return repeat {
182             $status_cb->()->then(sub {
183 7         525 my ( $status ) = @_;
184 7   50     20 my $st = $status->{status} // '';
185 7 100 100     28 if ( $st eq 'failed' || $st eq 'cancelled' ) {
186 2         3 my $msg = "Firecrawl job $st";
187 2 50       5 $msg .= ': ' . $status->{error} if defined $status->{error};
188 2         35 return Future->fail(
189             WWW::Firecrawl::Error->new(
190             type => 'job',
191             message => $msg,
192             data => $status,
193             ),
194             'firecrawl',
195             );
196             }
197 5 100       11 return Future->done($status) if $st eq 'completed';
198             return $self->_delay_future($interval)
199 2         10 ->then(sub { Future->done($status) });
  2         4226  
200 7     7   200 });
201             } until => sub {
202 7     7   393 my $f = $_[0];
203 7 100       18 return 1 if $f->is_failed;
204 5         27 my $s = $f->get;
205 5   50     65 return ($s->{status} // '') eq 'completed';
206 5         48 };
207             }
208              
209             sub _collect_pages {
210 2     2   4 my ( $self, $first_status, $next_method ) = @_;
211 2 50       4 my @pages = @{ $first_status->{data} || [] };
  2         7  
212 2         3 my $next = $first_status->{next};
213 2         2 my $last_status = $first_status;
214 2 100       27 return Future->done({ %$first_status, data => \@pages }) unless $next;
215 1         2 my $current = $next;
216             my $loop_f = repeat {
217 1     1   25 my $url = $current;
218             $self->$next_method($url)->on_done(sub {
219 1         58 my ( $s ) = @_;
220 1         2 $last_status = $s;
221 1 50       1 push @pages, @{ $s->{data} || [] };
  1         4  
222 1         3 $current = $s->{next};
223 1         6 });
224 1     1   7 } while => sub { defined $current };
  1         39  
225             return $loop_f->then(sub {
226             Future->done({
227             %$first_status,
228             data => \@pages,
229             status => $last_status->{status},
230 1     1   47 });
231 1         91 });
232             }
233              
234 1     1   5 sub _collect_crawl_pages { $_[0]->_collect_pages($_[1], 'crawl_status_next') }
235 1     1   4 sub _collect_batch_pages { $_[0]->_collect_pages($_[1], 'batch_scrape_status_next') }
236              
237             # Apply is_failure classification to a collected crawl/batch result,
238             # producing { data, failed, raw_data, stats } from raw `data`.
239             sub _split_pages {
240 2     2   4 my ( $self, $result ) = @_;
241 2         4 my $fc = $self->firecrawl;
242 2 50       4 my @raw = @{ $result->{data} || [] };
  2         6  
243 2         4 my @ok;
244             my @failed;
245 2         3 for my $page (@raw) {
246 5 50       14 if ( $fc->is_failure->($page) ) {
247 0   0     0 my $meta = $page->{metadata} || {};
248             push @failed, {
249             url => $meta->{sourceURL} // $meta->{url},
250             statusCode => $meta->{statusCode},
251 0   0     0 error => $fc->scrape_error($page),
252             page => $page,
253             };
254             }
255             else {
256 5         53 push @ok, $page;
257             }
258             }
259             return {
260 2         18 %$result,
261             data => \@ok,
262             failed => \@failed,
263             raw_data => \@raw,
264             stats => {
265             ok => scalar @ok,
266             failed => scalar @failed,
267             total => scalar @raw,
268             },
269             };
270             }
271              
272              
273             sub crawl_and_collect {
274 2     2 1 124 my ( $self, %args ) = @_;
275 2         4 my $interval = delete $args{poll_interval};
276             $self->crawl(%args)->then(sub {
277 2     2   133 my ( $job ) = @_;
278 2 50       8 my $id = $job->{id} or return Future->fail(
279             WWW::Firecrawl::Error->new( type => 'api', message => "crawl returned no id" ),
280             'firecrawl',
281             );
282             $self->_poll_until_done(
283 3         11 status => sub { $self->crawl_status($id) },
284             interval => $interval,
285             )
286 1         107 ->then(sub { $self->_collect_crawl_pages($_[0]) })
287 2         13 ->then(sub { Future->done( $self->_split_pages($_[0]) ) });
  1         118  
288 2         9 });
289             }
290              
291             sub batch_scrape_and_wait {
292 1     1 1 82 my ( $self, %args ) = @_;
293 1         3 my $interval = delete $args{poll_interval};
294             $self->batch_scrape(%args)->then(sub {
295 1     1   64 my ( $job ) = @_;
296 1 50       4 my $id = $job->{id} or return Future->fail(
297             WWW::Firecrawl::Error->new( type => 'api', message => "batch_scrape returned no id" ),
298             'firecrawl',
299             );
300             $self->_poll_until_done(
301 1         6 status => sub { $self->batch_scrape_status($id) },
302             interval => $interval,
303             )
304 1         107 ->then(sub { $self->_collect_batch_pages($_[0]) })
305 1         5 ->then(sub { Future->done( $self->_split_pages($_[0]) ) });
  1         50  
306 1         5 });
307             }
308              
309             # Start extract → poll until done → return the final payload.
310             sub extract_and_wait {
311 2     2 1 126 my ( $self, %args ) = @_;
312 2         5 my $interval = delete $args{poll_interval};
313             $self->extract(%args)->then(sub {
314 2     2   127 my ( $job ) = @_;
315 2 50       42 my $id = $job->{id} or return Future->fail("extract returned no id");
316             $self->_poll_until_done(
317 3         11 status => sub { $self->extract_status($id) },
318 2         11 interval => $interval,
319             );
320 2         9 });
321             }
322              
323             # Start agent → poll until done.
324             sub agent_and_wait {
325 0     0 1 0 my ( $self, %args ) = @_;
326 0         0 my $interval = delete $args{poll_interval};
327             $self->agent(%args)->then(sub {
328 0     0   0 my ( $job ) = @_;
329 0 0       0 my $id = $job->{id} or return Future->fail("agent returned no id");
330             $self->_poll_until_done(
331 0         0 status => sub { $self->agent_status($id) },
332 0         0 interval => $interval,
333             );
334 0         0 });
335             }
336              
337             sub scrape_many {
338 3     3 1 200 my ( $self, $urls, %common ) = @_;
339 3 50       48 croak "scrape_many: first arg must be arrayref of URLs" unless ref $urls eq 'ARRAY';
340 3         14 my $fc = $self->firecrawl;
341             my @futures = map {
342 3         7 my $url = $_;
  8         202  
343             $self->scrape( url => $url, %common )->then(
344             sub {
345 7     7   663 my $data = $_[0];
346 7 100       25 if ( $fc->is_failure->($data) ) {
347 1   50     12 my $err = WWW::Firecrawl::Error->new(
348             type => 'page',
349             message => 'Firecrawl scrape failed: ' . ($fc->scrape_error($data) // 'unknown'),
350             data => $data,
351             status_code => $fc->scrape_status($data),
352             url => $url,
353             );
354 1         1413 return Future->done({ url => $url, failed => { url => $url, error => $err } });
355             }
356 6         100 return Future->done({ url => $url, ok => { url => $url, data => $data } });
357             },
358             sub {
359 1     1   98 my ( $err ) = @_;
360 1 50 33     10 my $e = ref $err && $err->isa('WWW::Firecrawl::Error')
361             ? $err
362             : WWW::Firecrawl::Error->new( type => 'api', message => "$err", url => $url );
363 1         4 return Future->done({ url => $url, failed => { url => $url, error => $e } });
364             },
365 8         39 );
366             } @$urls;
367             return Future->wait_all(@futures)->then(sub {
368 3     3   401 my @resolved = map { $_->get } @_;
  8         95  
369 3         31 my @ok = map { $_->{ok} } grep { exists $_->{ok} } @resolved;
  6         9  
  8         14  
370 3         6 my @failed = map { $_->{failed} } grep { exists $_->{failed} } @resolved;
  2         3  
  8         13  
371 3         22 Future->done({
372             ok => \@ok,
373             failed => \@failed,
374             stats => { ok => scalar @ok, failed => scalar @failed, total => scalar @$urls },
375             });
376 3         115 });
377             }
378              
379             sub retry_failed_pages {
380 1     1 1 111 my ( $self, $result, %scrape_opts ) = @_;
381 1 50       2 my @urls = map { $_->{url} } @{ $result->{failed} || [] };
  2         5  
  1         4  
382 1         6 return $self->scrape_many( \@urls, %scrape_opts );
383             }
384              
385             1;
386              
387             __END__