File Coverage

blib/lib/WWW/Crawl4AI/Strategy/CloakBrowser.pm
Criterion Covered Total %
statement 24 27 88.8
branch 4 4 100.0
condition 1 2 50.0
subroutine 7 9 77.7
pod 3 4 75.0
total 39 46 84.7


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Strategy::CloakBrowser;
2             # ABSTRACT: Crawl4AI strategy attaching to CloakBrowser over CDP
3 3     3   79466 use Moo;
  3         6295  
  3         15  
4 3     3   2264 use URI ();
  3         8013  
  3         44  
5 3     3   387 use WWW::Crawl4AI::Request ();
  3         6  
  3         1049  
6             with 'WWW::Crawl4AI::Strategy';
7              
8             our $VERSION = '0.001';
9              
10              
11 11     11 1 14 sub name { 'crawl4ai_cloakbrowser' }
12              
13              
14 0     0 1 0 sub cost_class { 'stealth' }
15              
16              
17             sub applicable {
18 7     7 1 11 my ( $self, $crawler ) = @_;
19 7 100       21 return $crawler->cloakbrowser_url ? 1 : 0;
20             }
21              
22              
23             sub build_request {
24 0     0 0 0 my ( $self, $crawler, $url ) = @_;
25 0         0 return $self->_request(
26             $url,
27             browser => {
28             browser_mode => 'custom',
29             cdp_url => $self->_cdp_url( $crawler, $url ),
30             cache_cdp_connection => WWW::Crawl4AI::Request::JSON_true(),
31             create_isolated_context => WWW::Crawl4AI::Request::JSON_true(),
32             },
33             crawler => { wait_until => 'networkidle' },
34             );
35             }
36              
37             # Per-domain stable fingerprint: if the configured CDP URL has no query string,
38             # append ?fingerprint= so each domain gets a consistent CloakBrowser
39             # identity. CloakBrowser requires the seed to be a NON-NEGATIVE INTEGER — a
40             # non-numeric value (e.g. a raw host string) is rejected with HTTP 400. So the
41             # host is folded into a deterministic 32-bit hash: same domain → same seed,
42             # different domains → different seeds. A URL that already carries query params
43             # is used verbatim.
44             sub _cdp_url {
45 6     6   246377 my ( $self, $crawler, $url ) = @_;
46 6         11 my $cdp = $crawler->cloakbrowser_url;
47 6 100       33 return $cdp if $cdp =~ /\?/;
48 5   50     8 my $host = eval { URI->new($url)->host } || 'default';
49 5         8068 ( my $base = $cdp ) =~ s{/+$}{};
50 5         11 return "$base?fingerprint=" . $self->_fingerprint_seed($host);
51             }
52              
53             # Deterministic 32-bit FNV-1a hash of the host. Zero-dependency, stable across
54             # runs and processes, yields a non-negative integer CloakBrowser accepts.
55             sub _fingerprint_seed {
56 8     8   2428 my ( $self, $host ) = @_;
57 8         8 my $hash = 2166136261; # FNV-1a 32-bit offset basis
58 8         20 for my $byte ( unpack 'C*', $host ) {
59 112         108 $hash ^= $byte;
60 112         140 $hash = ( $hash * 16777619 ) & 0xFFFFFFFF; # FNV prime, wrap to 32 bits
61             }
62 8         59 return $hash;
63             }
64              
65             1;
66              
67             __END__