| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Net::Async::Firecrawl; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: IO::Async Firecrawl v2 client with flow helpers |
|
3
|
5
|
|
|
5
|
|
718233
|
use strict; |
|
|
5
|
|
|
|
|
7
|
|
|
|
5
|
|
|
|
|
159
|
|
|
4
|
5
|
|
|
5
|
|
33
|
use warnings; |
|
|
5
|
|
|
|
|
10
|
|
|
|
5
|
|
|
|
|
251
|
|
|
5
|
5
|
|
|
5
|
|
25
|
use parent 'IO::Async::Notifier'; |
|
|
5
|
|
|
|
|
6
|
|
|
|
5
|
|
|
|
|
40
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
33085
|
use Carp qw( croak ); |
|
|
5
|
|
|
|
|
13
|
|
|
|
5
|
|
|
|
|
218
|
|
|
8
|
5
|
|
|
5
|
|
2687
|
use WWW::Firecrawl (); |
|
|
5
|
|
|
|
|
240626
|
|
|
|
5
|
|
|
|
|
130
|
|
|
9
|
5
|
|
|
5
|
|
32
|
use WWW::Firecrawl::Error (); |
|
|
5
|
|
|
|
|
6
|
|
|
|
5
|
|
|
|
|
62
|
|
|
10
|
5
|
|
|
5
|
|
2859
|
use Net::Async::HTTP (); |
|
|
5
|
|
|
|
|
340992
|
|
|
|
5
|
|
|
|
|
166
|
|
|
11
|
5
|
|
|
5
|
|
36
|
use Future (); |
|
|
5
|
|
|
|
|
12
|
|
|
|
5
|
|
|
|
|
115
|
|
|
12
|
5
|
|
|
5
|
|
19
|
use Future::Utils qw( repeat ); |
|
|
5
|
|
|
|
|
8
|
|
|
|
5
|
|
|
|
|
3456
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $VERSION = '0.001'; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# Construction hook: removes this class's own parameters from $args and
# stores them on the object.
#
# Recognised keys (all optional):
#   firecrawl     - client object to use; stored as given when the key exists
#   base_url, api_key, api_version
#                 - passed to WWW::Firecrawl->new when no client was supplied
#   poll_interval - delay between status polls; defaults to 3
#   http          - HTTP client returned by the http accessor when set
#   delay_sub     - code ref that _delay_future calls instead of the loop timer
sub _init {
    my ( $self, $args ) = @_;
    $self->SUPER::_init($args);

    # Strip the client-construction keys from $args whether or not they are
    # used below, so they never linger in the caller's parameter hash.
    my %client_args;
    for my $key (qw( base_url api_key api_version )) {
        $client_args{$key} = delete $args->{$key} if exists $args->{$key};
    }

    $self->{poll_interval} = exists $args->{poll_interval} ? delete $args->{poll_interval} : 3;
    $self->{http}          = delete $args->{http};

    # Only build a default client when the caller did not hand one in;
    # constructing one just to overwrite it is wasted work and lets the
    # default constructor's failures surface for a client nobody uses.
    if ( exists $args->{firecrawl} ) {
        $self->{firecrawl} = delete $args->{firecrawl};
    }
    else {
        $self->{firecrawl} ||= WWW::Firecrawl->new(%client_args);
    }

    $self->{delay_sub} = delete $args->{delay_sub};
    return;
}
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# Handles configuration keys that were not consumed elsewhere.
#
# Keys that _init knows about are discarded; any other key makes this croak
# with the offending names sorted and joined by commas.
# NOTE(review): the known keys are dropped here without being stored, so
# supplying them through this path has no effect -- confirm that is intended.
sub configure_unknown {
    my ( $self, %args ) = @_;

    delete @args{qw( base_url api_key api_version poll_interval firecrawl http delay_sub )};

    croak "Unknown configuration keys: ".join(',', sort keys %args) if %args;
    return;
}
|
39
|
|
|
|
|
|
|
|
|
40
|
42
|
|
|
42
|
1
|
74
|
# Read-only accessor for the client object stored under `firecrawl`.
sub firecrawl {
    my ( $self ) = @_;
    return $self->{firecrawl};
}
|
41
|
5
|
50
|
|
5
|
1
|
20
|
# Combined getter/setter for the polling delay.
# With an argument, stores it and returns the new value; without one,
# returns the current value.
sub poll_interval {
    my $self = shift;
    return $self->{poll_interval} unless @_;
    return $self->{poll_interval} = $_[0];
}
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Returns the HTTP client, creating it on first use.
#
# When no client is stored yet, a Net::Async::HTTP is built with the
# firecrawl client's user-agent string and at most 4 connections per host,
# added as a child of this object, and remembered for later calls.
sub http {
    my ( $self ) = @_;

    unless ( $self->{http} ) {
        my $client = Net::Async::HTTP->new(
            user_agent               => $self->firecrawl->user_agent_string,
            max_connections_per_host => 4,
        );
        $self->add_child($client);
        $self->{http} = $client;
    }

    return $self->{http};
}
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Loop-attachment hook: chains to a parent implementation when one exists,
# then makes sure the HTTP client has been built.
# NOTE(review): the coverage counters recorded alongside this listing show
# this sub was never executed; verify that the parent class really invokes a
# hook with this name.
sub _on_added_to_loop {
    my ( $self, $loop ) = @_;

    if ( $self->can('SUPER::_on_added_to_loop') ) {
        $self->SUPER::_on_added_to_loop($loop);
    }

    # Calling the accessor builds the client on demand, which also adds it
    # as a child of this object.
    return $self->http;
}
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
62
|
|
|
|
|
|
|
# Generic request dispatch |
|
63
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Sends a prepared request, starting the retry sequence at attempt 1.
#
# Croaks when the firecrawl client's is_request check rejects $request.
# Returns whatever _do_request_with_retry returns.
sub do_request {
    my ( $self, $request ) = @_;

    $self->firecrawl->is_request($request)
        or croak "do_request requires HTTP::Request";

    return $self->_do_request_with_retry($request, 1);
}
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Produces the delay used between retries and polls.
#
# A configured delay_sub is called with $seconds and its result returned;
# otherwise the loop's delay_future( after => $seconds ) is used.
sub _delay_future {
    my ( $self, $seconds ) = @_;

    my $custom = $self->{delay_sub};
    return $custom
        ? $custom->($seconds)
        : $self->loop->delay_future( after => $seconds );
}
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Performs one HTTP attempt and, when the client classifies the response as
# a retryable error, schedules the next attempt after a delay.
#
# $attempt is the 1-based number of the attempt being made.
#
# Resolves with the response when the client reports no error. Otherwise
# fails with ( $err, 'firecrawl', $attempt ) once the error is not retryable
# or $attempt has reached the client's max_attempts. Before each retry the
# client's on_retry callback, if any, is called with ( $attempt, $delay, $err ).
sub _do_request_with_retry {
    my ( $self, $request, $attempt ) = @_;
    my $fc  = $self->firecrawl;
    my $max = $fc->max_attempts;

    my $handle_response = sub {
        my ( $response ) = @_;
        my ( $err, $retryable ) = $fc->_classify_response( $response, $attempt );
        return Future->done($response) unless $err;

        # Give up: either the error is final or the attempts are used up.
        unless ( $retryable && $attempt < $max ) {
            return Future->fail($err, 'firecrawl', $attempt);
        }

        my $delay    = $fc->_retry_delay( $response, $attempt );
        my $notifier = $fc->on_retry;
        $notifier->( $attempt, $delay, $err ) if $notifier;

        return $self->_delay_future($delay)->then(sub {
            $self->_do_request_with_retry( $request, $attempt + 1 );
        });
    };

    return $self->http->do_request( request => $request )->then($handle_response);
}
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# Install a Future-returning method named $name into $class.
#
# The generated method builds a request by calling the firecrawl client's
# builder method with the caller's arguments, sends it through do_request,
# and resolves with the value the client's parser method returns for the
# response.
#
# $opts (optional hash ref):
#   builder - builder method name; defaults to "${name}_request"
#   parser  - parser method name;  defaults to "parse_${name}_response"
#
# When the parser throws, the Future fails under the 'firecrawl' category
# with a WWW::Firecrawl::Error: the thrown object itself if it already is
# one, otherwise a new 'api' error carrying the stringified exception and the
# response. Exceptions thrown by the builder are not caught here.
sub _install_wrapper {
    my ( $class, $name, $opts ) = @_;

    # Loaded here so the blessed() check below does not rely on some other
    # module having pulled Scalar::Util in first.
    use Scalar::Util ();

    $opts ||= {};
    my $builder = $opts->{builder} || "${name}_request";
    my $parser  = $opts->{parser}  || "parse_${name}_response";

    my $method = sub {
        my ( $self, @args ) = @_;
        my $fc  = $self->firecrawl;
        my $req = $fc->$builder(@args);
        return $self->do_request($req)->then(sub {
            my $response = $_[0];

            # Decide success from eval's own return value; $@ alone can be
            # false or clobbered even though the parser died.
            my $data;
            my $parsed = eval { $data = $fc->$parser($response); 1 };
            return Future->done($data) if $parsed;

            my $e = $@ || 'unknown error';

            # blessed() guards the ->isa call: an unblessed reference thrown
            # as an exception would otherwise die on the method call itself.
            my $err = Scalar::Util::blessed($e) && $e->isa('WWW::Firecrawl::Error')
                ? $e
                : WWW::Firecrawl::Error->new( type => 'api', message => "$e", response => $response );
            return Future->fail($err, 'firecrawl');
        });
    };

    # Symbolic glob access is needed only for this one assignment.
    no strict 'refs';
    return *{"${class}::${name}"} = $method;
}
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Endpoint table: every name here becomes a Future-returning method built by
# _install_wrapper with its default builder and parser names.
my @ENDPOINTS = qw(
    scrape
    crawl
    crawl_status
    crawl_cancel
    crawl_errors
    crawl_active
    crawl_params_preview
    map
    search
    batch_scrape
    batch_scrape_status
    batch_scrape_cancel
    batch_scrape_errors
    extract
    extract_status
    agent
    agent_status
    agent_cancel
    browser_create
    browser_list
    browser_delete
    browser_execute
    scrape_execute
    scrape_browser_stop
    credit_usage
    credit_usage_historical
    token_usage
    token_usage_historical
    queue_status
    activity
);

for my $endpoint (@ENDPOINTS) {
    __PACKAGE__->_install_wrapper($endpoint);
}

# Pagination followers: "<base>_next" has its own request builder but
# reuses the response parser of the base status endpoint.
for my $base (qw( crawl_status batch_scrape_status )) {
    __PACKAGE__->_install_wrapper( "${base}_next", {
        builder => "${base}_next_request",
        parser  => "parse_${base}_response",
    } );
}
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
172
|
|
|
|
|
|
|
# Flow helpers |
|
173
|
|
|
|
|
|
|
#---------------------------------------------------------------------- |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# Polls a job until it reaches a terminal state.
#
# Named arguments:
#   status   - code ref returning a Future of the job's status hash
#   interval - delay between polls; a false value selects poll_interval
#
# Returns a Future that resolves with the status hash once its `status`
# field is 'completed', or fails under the 'firecrawl' category with a
# WWW::Firecrawl::Error of type 'job' when the field is 'failed' or
# 'cancelled'. Croaks if this object has no loop.
sub _poll_until_done {
    my ( $self, %args ) = @_;
    my $status_cb = $args{status};
    my $interval  = $args{interval} || $self->poll_interval;
    $self->loop or croak "not added to a loop yet";

    my %is_dead = map { $_ => 1 } qw( failed cancelled );

    # Turns one status reply into the trial result for the repeat loop.
    my $handle_status = sub {
        my ( $status ) = @_;
        my $st = $status->{status} // '';

        return Future->done($status) if $st eq 'completed';

        unless ( $is_dead{$st} ) {
            # Still running: wait, then hand the same status back so the
            # loop condition sees it is not finished and polls again.
            return $self->_delay_future($interval)
                ->then(sub { Future->done($status) });
        }

        my $msg = "Firecrawl job $st";
        if ( defined $status->{error} ) {
            $msg .= ': ' . $status->{error};
        }
        my $job_error = WWW::Firecrawl::Error->new(
            type    => 'job',
            message => $msg,
            data    => $status,
        );
        return Future->fail( $job_error, 'firecrawl' );
    };

    # The loop ends on any failure or on a completed status.
    my $is_finished = sub {
        my ( $trial ) = @_;
        return 1 if $trial->is_failed;
        my $state = $trial->get->{status} // '';
        return $state eq 'completed';
    };

    return repeat {
        $status_cb->()->then($handle_status);
    } until => $is_finished;
}
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# Follows `next` links from a status response and gathers every page.
#
# $first_status - status hash whose `data` holds the first batch of pages
#                 and whose `next`, when true, is the URL of the next batch
# $next_method  - name of the method on $self that fetches a `next` URL and
#                 returns a Future of the following status hash
#
# Returns a Future of a hash that copies $first_status with `data` replaced
# by all pages collected; when further batches were fetched, `status` is
# taken from the last one. The Future fails if any follow-up fetch fails.
sub _collect_pages {
    my ( $self, $first_status, $next_method ) = @_;
    my @pages       = @{ $first_status->{data} || [] };
    my $next        = $first_status->{next};
    my $last_status = $first_status;
    return Future->done({ %$first_status, data => \@pages }) unless $next;

    my $current = $next;
    my $loop_f = repeat {
        my $url = $current;
        $self->$next_method($url)->on_done(sub {
            my ( $s ) = @_;
            $last_status = $s;
            push @pages, @{ $s->{data} || [] };
            $current = $s->{next};
        });
    } while => sub {
        my ( $trial ) = @_;
        # A failed fetch never runs the on_done handler, so $current still
        # names the URL that just failed. Stop here so the failure
        # propagates instead of the same URL being requested forever.
        return 0 if $trial->is_failed;
        return defined $current;
    };

    return $loop_f->then(sub {
        Future->done({
            %$first_status,
            data   => \@pages,
            status => $last_status->{status},
        });
    });
}
|
233
|
|
|
|
|
|
|
|
|
234
|
1
|
|
|
1
|
|
5
|
# Gathers all pages of a crawl, following links with crawl_status_next.
sub _collect_crawl_pages {
    my ( $self, $status ) = @_;
    return $self->_collect_pages( $status, 'crawl_status_next' );
}
|
235
|
1
|
|
|
1
|
|
4
|
# Gathers all pages of a batch scrape, following links with
# batch_scrape_status_next.
sub _collect_batch_pages {
    my ( $self, $status ) = @_;
    return $self->_collect_pages( $status, 'batch_scrape_status_next' );
}
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# Sorts the pages of a collected crawl/batch result into successes and
# failures using the firecrawl client's is_failure predicate.
#
# Returns a new hash that copies $result and sets:
#   data     - pages the predicate accepted
#   failed   - one { url, statusCode, error, page } record per rejected page;
#              url is the page metadata's sourceURL, falling back to url
#   raw_data - every page, in the original order
#   stats    - { ok, failed, total } counts
sub _split_pages {
    my ( $self, $result ) = @_;
    my $fc  = $self->firecrawl;
    my @raw = @{ $result->{data} || [] };
    my @ok;
    my @failed;
    for my $page (@raw) {
        if ( $fc->is_failure->($page) ) {
            my $meta = $page->{metadata} || {};
            push @failed, {
                url        => $meta->{sourceURL} // $meta->{url},
                statusCode => $meta->{statusCode},
                # scalar() keeps the record aligned even if scrape_error
                # returns an empty list when it has nothing to report.
                error      => scalar $fc->scrape_error($page),
                page       => $page,
            };
        }
        else {
            push @ok, $page;
        }
    }
    return {
        %$result,
        data     => \@ok,
        failed   => \@failed,
        raw_data => \@raw,
        stats    => {
            ok     => scalar @ok,
            failed => scalar @failed,
            total  => scalar @raw,
        },
    };
}
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
# Starts a crawl, polls it to completion, gathers every result page and
# splits the pages into successes and failures.
#
# %args are passed to crawl, except poll_interval, which is removed and used
# as the polling delay.
#
# Returns a Future of the hash produced by _split_pages. Fails under the
# 'firecrawl' category with a WWW::Firecrawl::Error of type 'api' when the
# crawl response carries no id.
sub crawl_and_collect {
    my ( $self, %args ) = @_;
    my $interval = delete $args{poll_interval};

    my $run_job = sub {
        my ( $job ) = @_;

        my $id = $job->{id};
        unless ($id) {
            return Future->fail(
                WWW::Firecrawl::Error->new( type => 'api', message => "crawl returned no id" ),
                'firecrawl',
            );
        }

        my $finished = $self->_poll_until_done(
            status   => sub { $self->crawl_status($id) },
            interval => $interval,
        );
        my $collected = $finished->then(sub { $self->_collect_crawl_pages($_[0]) });
        return $collected->then(sub { Future->done( $self->_split_pages($_[0]) ) });
    };

    return $self->crawl(%args)->then($run_job);
}
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# Starts a batch scrape, polls it to completion, gathers every result page
# and splits the pages into successes and failures.
#
# %args are passed to batch_scrape, except poll_interval, which is removed
# and used as the polling delay.
#
# Returns a Future of the hash produced by _split_pages. Fails under the
# 'firecrawl' category with a WWW::Firecrawl::Error of type 'api' when the
# batch_scrape response carries no id.
sub batch_scrape_and_wait {
    my ( $self, %args ) = @_;
    my $interval = delete $args{poll_interval};

    my $started = $self->batch_scrape(%args);
    return $started->then(sub {
        my ( $job ) = @_;

        my $id = $job->{id};
        if ( !$id ) {
            my $no_id = WWW::Firecrawl::Error->new( type => 'api', message => "batch_scrape returned no id" );
            return Future->fail( $no_id, 'firecrawl' );
        }

        my $poll_status = sub { $self->batch_scrape_status($id) };
        return $self->_poll_until_done( status => $poll_status, interval => $interval )
            ->then(sub { $self->_collect_batch_pages($_[0]) })
            ->then(sub { Future->done( $self->_split_pages($_[0]) ) });
    });
}
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# Starts an extract job and polls it until it finishes.
#
# %args are passed to extract, except poll_interval, which is removed and
# used as the polling delay.
#
# Returns a Future resolving with the final status hash from
# _poll_until_done. When the extract response carries no id, the Future
# fails under the 'firecrawl' category with a WWW::Firecrawl::Error of type
# 'api', the same shape the crawl and batch helpers use for this case.
sub extract_and_wait {
    my ( $self, %args ) = @_;
    my $interval = delete $args{poll_interval};
    return $self->extract(%args)->then(sub {
        my ( $job ) = @_;
        my $id = $job->{id} or return Future->fail(
            WWW::Firecrawl::Error->new( type => 'api', message => "extract returned no id" ),
            'firecrawl',
        );
        return $self->_poll_until_done(
            status   => sub { $self->extract_status($id) },
            interval => $interval,
        );
    });
}
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# Starts an agent job and polls it until it finishes.
#
# %args are passed to agent, except poll_interval, which is removed and used
# as the polling delay.
#
# Returns a Future resolving with the final status hash from
# _poll_until_done. When the agent response carries no id, the Future fails
# under the 'firecrawl' category with a WWW::Firecrawl::Error of type 'api',
# the same shape the crawl and batch helpers use for this case.
sub agent_and_wait {
    my ( $self, %args ) = @_;
    my $interval = delete $args{poll_interval};
    return $self->agent(%args)->then(sub {
        my ( $job ) = @_;
        my $id = $job->{id} or return Future->fail(
            WWW::Firecrawl::Error->new( type => 'api', message => "agent returned no id" ),
            'firecrawl',
        );
        return $self->_poll_until_done(
            status   => sub { $self->agent_status($id) },
            interval => $interval,
        );
    });
}
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# Scrapes several URLs concurrently and reports per-URL outcomes.
#
# $urls   - array ref of URLs; anything else makes this croak
# %common - options passed to every scrape call alongside `url`
#
# Returns a Future that resolves (it does not fail because an individual
# URL failed) with:
#   ok     - [ { url, data }, ... ] for pages is_failure accepted
#   failed - [ { url, error }, ... ] where error is a WWW::Firecrawl::Error
#   stats  - { ok, failed, total } counts, total being the number of URLs
sub scrape_many {
    my ( $self, $urls, %common ) = @_;
    croak "scrape_many: first arg must be arrayref of URLs" unless ref $urls eq 'ARRAY';

    # Loaded here so the blessed() check below does not rely on some other
    # module having pulled Scalar::Util in first.
    use Scalar::Util ();

    my $fc = $self->firecrawl;
    my @futures = map {
        my $url = $_;
        $self->scrape( url => $url, %common )->then(
            # The request succeeded; the page itself may still be a failure.
            sub {
                my $data = $_[0];
                if ( $fc->is_failure->($data) ) {
                    my $err = WWW::Firecrawl::Error->new(
                        type        => 'page',
                        message     => 'Firecrawl scrape failed: ' . ($fc->scrape_error($data) // 'unknown'),
                        data        => $data,
                        # scalar() keeps the argument list aligned even if
                        # scrape_status returns an empty list.
                        status_code => scalar $fc->scrape_status($data),
                        url         => $url,
                    );
                    return Future->done({ url => $url, failed => { url => $url, error => $err } });
                }
                return Future->done({ url => $url, ok => { url => $url, data => $data } });
            },
            # The request failed; record it instead of failing the batch.
            sub {
                my ( $err ) = @_;
                # blessed() guards the ->isa call: an unblessed reference
                # used as a failure would otherwise die on the method call.
                my $e = Scalar::Util::blessed($err) && $err->isa('WWW::Firecrawl::Error')
                    ? $err
                    : WWW::Firecrawl::Error->new( type => 'api', message => "$err", url => $url );
                return Future->done({ url => $url, failed => { url => $url, error => $e } });
            },
        );
    } @$urls;

    return Future->wait_all(@futures)->then(sub {
        my @resolved = map { $_->get } @_;
        my @ok       = map { $_->{ok} }     grep { exists $_->{ok} }     @resolved;
        my @failed   = map { $_->{failed} } grep { exists $_->{failed} } @resolved;
        Future->done({
            ok     => \@ok,
            failed => \@failed,
            stats  => { ok => scalar @ok, failed => scalar @failed, total => scalar @$urls },
        });
    });
}
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# Re-scrapes the pages listed under `failed` in an earlier result.
#
# $result      - hash with an optional `failed` array ref of { url => ... }
#                records
# %scrape_opts - options passed through to scrape_many
#
# Records without a defined url are skipped, since there is nothing to
# request for them. Returns whatever scrape_many returns.
sub retry_failed_pages {
    my ( $self, $result, %scrape_opts ) = @_;

    # A failure record can lack a URL when the page metadata had none;
    # passing undef on as a scrape target would only produce a bogus request.
    my @urls = grep { defined } map { $_->{url} } @{ $result->{failed} || [] };

    return $self->scrape_many( \@urls, %scrape_opts );
}
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
1; |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
__END__ |