File Coverage

blib/lib/WWW/Crawl4AI.pm
Criterion Covered Total %
statement 103 132 78.0
branch 29 48 60.4
condition 10 23 43.4
subroutine 24 34 70.5
pod 14 14 100.0
total 180 251 71.7


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI;
2             # ABSTRACT: Perl client and fallback orchestrator for Crawl4AI
3 2     2   197872 use Moo;
  2         15013  
  2         11  
4 2     2   2922 use Carp qw( croak );
  2         4  
  2         112  
5 2     2   11 use Time::HiRes ();
  2         4  
  2         47  
6 2     2   1391 use URI ();
  2         19315  
  2         55  
7 2     2   1127 use WWW::Crawl4AI::Client ();
  2         6  
  2         77  
8 2     2   1000 use WWW::Crawl4AI::Result ();
  2         7  
  2         69  
9 2     2   936 use WWW::Crawl4AI::Attempt ();
  2         7  
  2         53  
10 2     2   10 use WWW::Crawl4AI::Error ();
  2         3  
  2         22  
11 2     2   845 use WWW::Crawl4AI::Detect ();
  2         10  
  2         59  
12 2     2   812 use WWW::Crawl4AI::StrategyChain ();
  2         8  
  2         58  
13 2     2   854 use WWW::Crawl4AI::DeepCrawlIterator ();
  2         7  
  2         3147  
14              
15             our $VERSION = '0.002';
16              
17              
18             has strategy_chain => (
19             is => 'lazy',
20             );
21              
22              
23             sub _build_strategy_chain {
24 16     16   77 my ( $self ) = @_;
25 16         176 return WWW::Crawl4AI::StrategyChain->new;
26             }
27              
28             # Deprecated: kept for backward compat. Returns the applicable strategy list
29             # the same way the old _build_strategies did.
30             sub _strategies_for {
31 16     16   19 my ( $self ) = @_;
32 16         31 my $fb = $self->fallback;
33 16         17 my @all = @{ $self->strategy_chain->strategies };
  16         193  
34              
35 16 100       228 if ( ref $fb eq 'ARRAY' ) {
36 1         2 my %by_name = map { $_->name => $_ } @all;
  6         13  
37 1 50       2 return [ grep { $_ && $_->applicable($self) } map { $by_name{$_} } @$fb ];
  3         14  
  3         5  
38             }
39 15 100 33     64 if ( !$fb || $fb eq 'none' || $fb eq 'plain' ) {
      66        
40 9         12 return [ grep { $_->name eq 'crawl4ai_plain' } @all ];
  54         103  
41             }
42 6         83 return $self->strategy_chain->applicable($self);
43             }
44              
45             has strategies => ( is => 'lazy' );
46              
47              
48 16     16   109 sub _build_strategies { $_[0]->_strategies_for }
49              
50             has base_url => (
51             is => 'ro',
52             default => sub { $ENV{CRAWL4AI_URL} || $ENV{CRAWL4AI_BASE_URL} || 'http://localhost:11235' },
53             );
54              
55              
56             has api_token => (
57             is => 'ro',
58             default => sub { $ENV{CRAWL4AI_API_TOKEN} },
59             );
60              
61              
62             has cloakbrowser_url => (
63             is => 'ro',
64             default => sub { $ENV{CLOAKBROWSER_CDP_URL} },
65             );
66              
67              
68             has proxy_url => (
69             is => 'ro',
70             default => sub { $ENV{CRAWL4AI_PROXY_URL} },
71             );
72              
73              
74             has callback => ( is => 'ro' );
75              
76              
77             # 'auto' (full applicable chain), 'plain'/'none' (Plain only), or an arrayref
78             # of backend names to run in that explicit order.
79             has fallback => ( is => 'ro', default => sub { 'auto' } );
80              
81              
82             has timeout => ( is => 'ro', default => sub { 120 } );
83              
84              
85             has min_markdown => ( is => 'ro' );
86              
87              
88             has client => ( is => 'lazy' );
89              
90              
91             sub _build_client {
92 0     0   0 my ( $self ) = @_;
93 0         0 return WWW::Crawl4AI::Client->new(
94             base_url => $self->base_url,
95             api_token => $self->api_token,
96             timeout => $self->timeout,
97             );
98             }
99              
100             sub _normalize_args {
101 32     32   62 my ( $self, @args ) = @_;
102 32 50       56 return () unless @args;
103             # a single hashref: { url => ..., %opts }
104 32 50 66     93 if ( @args == 1 && ref $args[0] eq 'HASH' ) {
105 0         0 my %a = %{ $args[0] };
  0         0  
106 0         0 my $url = delete $a{url};
107 0         0 return ( $url, %a );
108             }
109             # a leading positional URL, optionally followed by named options
110 32 50 33     103 if ( @args % 2 == 1 && !ref $args[0] ) {
111 32         59 my ( $url, %a ) = @args;
112 32         94 return ( $url, %a );
113             }
114             # all named: ( url => ..., %opts )
115 0         0 my %a = @args;
116 0         0 my $url = delete $a{url};
117 0         0 return ( $url, %a );
118             }
119              
120             # Build the detect option hash (just min_markdown for now) from a per-call
121             # %opts plus the instance default. Shared with Net::Async::Crawl4AI.
122             sub _detect_opts {
123 25     25   35 my ( $self, %opts ) = @_;
124 25 100       62 my $min = defined $opts{min_markdown} ? $opts{min_markdown} : $self->min_markdown;
125 25 100       53 return defined $min ? { min_markdown => $min } : {};
126             }
127              
128             # Turn one strategy run — its page, error, and timing — into an Attempt,
129             # classifying the page via Detect. Shared with Net::Async::Crawl4AI so the sync
130             # and async chains build identical attempt history.
131             #
132             # Classification is performed by overridable methods. To swap in a different
133             # classifier (e.g. Crawl4AI's own quality score), subclass and override
134             # classify_signals and classify_why_failed.
135             sub _attempt_for {
136 38     38   83 my ( $self, $strategy, $page, $err, $elapsed, $detect ) = @_;
137 38   50     60 $detect ||= {};
138 38 100       60 return WWW::Crawl4AI::Attempt->new(
139             backend => $strategy->name,
140             cost_class => $strategy->cost_class,
141             ok => 0,
142             error => $err,
143             why_failed => 'error',
144             elapsed => $elapsed,
145             ) if $err;
146 36 50       53 return WWW::Crawl4AI::Attempt->new(
147             backend => $strategy->name,
148             cost_class => $strategy->cost_class,
149             ok => 0,
150             why_failed => 'empty',
151             elapsed => $elapsed,
152             ) unless defined $page;
153              
154 36         66 my $signals = $self->classify_signals( $page, %$detect );
155 36         83 my $good = WWW::Crawl4AI::Detect::is_good( $page, %$detect );
156 36 100       97 return WWW::Crawl4AI::Attempt->new(
157             backend => $strategy->name,
158             cost_class => $strategy->cost_class,
159             ok => $good,
160             page => $page,
161             signals => $signals,
162             why_failed => ( $good ? undef : $self->classify_why_failed( $page, %$detect ) ),
163             elapsed => $elapsed,
164             );
165             }
166              
167              
168             sub classify_signals {
169 36     36 1 50 my ( $self, $page, %opts ) = @_;
170 36         88 return WWW::Crawl4AI::Detect::signals( $page, %opts );
171             }
172              
173              
174             sub classify_why_failed {
175 13     13 1 18 my ( $self, $page, %opts ) = @_;
176 13         23 return WWW::Crawl4AI::Detect::why_failed( $page, %opts );
177             }
178              
179             sub crawl {
180 25     25 1 40 my ( $self, @args ) = @_;
181 25         47 my ( $url, %opts ) = $self->_normalize_args(@args);
182 25 50 33     69 croak "crawl needs a url" unless defined $url && length $url;
183 25         47 my $detect = $self->_detect_opts(%opts);
184              
185 25         28 my @attempts;
186 25         27 for my $strategy ( @{ $self->strategies } ) {
  25         544  
187 38         152 my $t0 = Time::HiRes::time();
188 38         65 my $page = eval { $strategy->crawl( $self, $url, %opts ) };
  38         95  
189 38         65 my $err = $@;
190 38         360 my $elapsed = sprintf( '%.3f', Time::HiRes::time() - $t0 ) + 0;
191              
192 38         75 my $attempt = $self->_attempt_for( $strategy, $page, $err, $elapsed, $detect );
193 38         2321 push @attempts, $attempt;
194 38 100       167 return WWW::Crawl4AI::Result->from_attempt( $attempt, attempts => \@attempts ) if $attempt->ok;
195             }
196              
197 2         9 return $self->_failed_result( $url, \@attempts );
198             }
199              
200              
201 8     8 1 423 sub markdown { my ( $self, @args ) = @_; return $self->crawl(@args) }
  8         22  
202              
203             # Drop the fragment so map apps (#5/lat/lon/...) and trailing-anchor links don't
204             # look like distinct pages during dedup.
205             sub _canon_url {
206 0     0   0 my ( $self, $url ) = @_;
207 0 0       0 my $u = eval { URI->new($url) } or return $url;
  0         0  
208 0         0 $u->fragment(undef);
209 0         0 return $u->as_string;
210             }
211              
212             sub deep_crawl {
213 7     7 1 24 my ( $self, @args ) = @_;
214 7         15 my ( $start, %opts ) = $self->_normalize_args(@args);
215 7 50 33     24 croak "deep_crawl needs a url" unless defined $start && length $start;
216              
217             my $iter = WWW::Crawl4AI::DeepCrawlIterator->new(
218             crawler => $self,
219             start_url => $start,
220             max_pages => ( exists $opts{max_pages} ? delete $opts{max_pages} : 25 ),
221             max_depth => ( exists $opts{max_depth} ? delete $opts{max_depth} : 2 ),
222             same_host => ( exists $opts{same_host} ? delete $opts{same_host} : 1 ),
223             url_filter => ( delete $opts{url_filter} ),
224 7 100       107 on_page => ( delete $opts{on_page} ),
    50          
    100          
225             );
226              
227 7         80 my @results;
228 7         17 while ( my $page = $iter->next ) {
229 17         45 push @results, $page->[0];
230             }
231 7         46 return \@results;
232             }
233              
234              
235             sub _failed_result {
236 2     2   4 my ( $self, $url, $attempts ) = @_;
237 2         4 my $last = $attempts->[-1];
238 2 50       4 return WWW::Crawl4AI::Result->new(
239             ok => 0,
240             url => $url,
241             attempts => $attempts,
242             error => WWW::Crawl4AI::Error->new(
243             type => 'content',
244             message => 'all crawl strategies failed',
245             url => $url,
246             ),
247             why_failed => 'no_strategies',
248             ) unless $last;
249              
250             return WWW::Crawl4AI::Result->new(
251             ok => 0,
252             url => $url,
253             final_url => ( $last->page ? $last->page->{final_url} : undef ),
254 2 50 33     99 status => ( $last->page ? $last->page->{status_code} : undef ),
    50          
255             backend => $last->backend,
256             cost_class => $last->cost_class,
257             signals => $last->signals,
258             why_failed => $last->why_failed,
259             attempts => $attempts,
260             error => (
261             $last->error || WWW::Crawl4AI::Error->new(
262             type => 'content',
263             message => 'all crawl strategies failed (last: ' . ( $last->why_failed // 'unknown' ) . ')',
264             url => $url,
265             backend => $last->backend,
266             )
267             ),
268             );
269             }
270              
271 0     0 1 0 sub health { $_[0]->client->health }
272              
273              
274 0     0 1 0 sub screenshot { my $self = shift; $self->client->screenshot(@_) }
  0         0  
275 0     0 1 0 sub pdf { my $self = shift; $self->client->pdf(@_) }
  0         0  
276 0     0 1 0 sub html { my $self = shift; $self->client->html(@_) }
  0         0  
277 0     0 1 0 sub execute_js { my $self = shift; $self->client->execute_js(@_) }
  0         0  
278 0     0 1 0 sub llm { my $self = shift; $self->client->llm(@_) }
  0         0  
279 0     0 1 0 sub token { my $self = shift; $self->client->token(@_) }
  0         0  
280              
281              
282 4     4 1 16 sub available_backends { [ map { $_->name } @{ $_[0]->strategies } ] }
  12         19  
  4         46  
283              
284              
285             sub detect {
286 0     0 1   my ( $self ) = @_;
287             return {
288 0 0         crawl4ai => $self->health,
    0          
    0          
289             crawl4ai_url => $self->base_url,
290             cloakbrowser => ( $self->cloakbrowser_url ? WWW::Crawl4AI::Detect::probe_cloakbrowser( $self->cloakbrowser_url ) : 0 ),
291             cloakbrowser_url => $self->cloakbrowser_url,
292             proxy => ( $self->proxy_url ? 1 : 0 ),
293             proxy_url => $self->proxy_url,
294             callback => ( $self->callback ? 1 : 0 ),
295             backends => $self->available_backends,
296             };
297             }
298              
299              
300              
301             1;
302              
303             __END__