| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Crawl4AI::Strategy::CloakBrowser; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Crawl4AI strategy attaching to CloakBrowser over CDP |
|
3
|
3
|
|
|
3
|
|
79466
|
use Moo; |
|
|
3
|
|
|
|
|
6295
|
|
|
|
3
|
|
|
|
|
15
|
|
|
4
|
3
|
|
|
3
|
|
2264
|
use URI (); |
|
|
3
|
|
|
|
|
8013
|
|
|
|
3
|
|
|
|
|
44
|
|
|
5
|
3
|
|
|
3
|
|
387
|
use WWW::Crawl4AI::Request (); |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
1049
|
|
|
6
|
|
|
|
|
|
|
with 'WWW::Crawl4AI::Strategy'; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.001'; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
11
|
|
|
11
|
1
|
14
|
# Returns the fixed name string under which this strategy class identifies
# itself.
sub name {
    return 'crawl4ai_cloakbrowser';
}
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
0
|
|
|
0
|
1
|
0
|
# Returns the constant cost-class label for this strategy. How the label is
# interpreted is up to whatever consumes the Strategy role (not visible here).
sub cost_class {
    return 'stealth';
}
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Report whether this strategy can be used with the given crawler.
#
# $crawler must respond to cloakbrowser_url. Returns 1 when that accessor
# yields a true value and 0 otherwise, so callers always get a plain numeric
# flag rather than the URL itself.
sub applicable {
    my ( $self, $crawler ) = @_;

    my $endpoint = $crawler->cloakbrowser_url;
    return 1 if $endpoint;
    return 0;
}
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Assemble the request for $url that makes Crawl4AI attach to CloakBrowser
# over CDP.
#
# The browser section selects the 'custom' browser mode, supplies the CDP
# endpoint computed by _cdp_url, and turns on cache_cdp_connection and
# create_isolated_context. The crawler section sets wait_until to
# 'networkidle'. Everything is handed to $self->_request, whose return value
# is passed straight back to the caller.
sub build_request {
    my ( $self, $crawler, $url ) = @_;

    # Built as a list so the helper calls are evaluated in list context and
    # in key order, exactly as they would be inside a hash constructor.
    my %browser = (
        browser_mode            => 'custom',
        cdp_url                 => $self->_cdp_url( $crawler, $url ),
        cache_cdp_connection    => WWW::Crawl4AI::Request::JSON_true(),
        create_isolated_context => WWW::Crawl4AI::Request::JSON_true(),
    );
    my %crawler_opts = ( wait_until => 'networkidle' );

    return $self->_request(
        $url,
        browser => \%browser,
        crawler => \%crawler_opts,
    );
}
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Per-domain stable fingerprint: if the configured CDP URL has no query string,
# append ?fingerprint=<seed> so each domain gets a consistent CloakBrowser
# identity. CloakBrowser requires the seed to be a NON-NEGATIVE INTEGER — a
# non-numeric value (e.g. a raw host string) is rejected with HTTP 400. So the
# host is folded into a deterministic 32-bit hash: same domain → same seed,
# different domains → (hash collisions aside) different seeds. A URL that
# already carries query params is used verbatim.
#
# Arguments: $crawler (must provide cloakbrowser_url) and the target $url.
# Returns the CDP URL string to put into the request.
sub _cdp_url {
    my ( $self, $crawler, $url ) = @_;

    my $cdp = $crawler->cloakbrowser_url;
    return $cdp if $cdp =~ /\?/;

    # An unparsable or host-less URL makes the eval yield a false value; the
    # literal 'default' is used as the host in that case. local $@ keeps this
    # probe from clobbering an error the caller may still want to inspect.
    my $host = do {
        local $@;
        eval { URI->new($url)->host };
    };

    # Host names are case-insensitive, but URI->host returns them as written.
    # Lower-case so "Example.COM" and "example.com" share one seed.
    $host = lc( $host || 'default' );

    # Drop trailing slashes so the query string attaches to the bare endpoint.
    ( my $base = $cdp ) =~ s{/+$}{};
    return "$base?fingerprint=" . $self->_fingerprint_seed($host);
}
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Deterministic 32-bit FNV-1a hash of the host. Zero-dependency, stable across
# runs and processes, yields a non-negative integer CloakBrowser accepts.
#
# $host is hashed byte-wise. An undefined host hashes like the empty string,
# and a host containing characters above 0xFF is hashed over its UTF-8 bytes
# (unpack 'C*' would otherwise keep only the low 8 bits of each such
# character). Returns an integer in the range 0 .. 0xFFFFFFFF.
sub _fingerprint_seed {
    my ( $self, $host ) = @_;

    my $octets = defined $host ? $host : '';
    utf8::encode($octets) if $octets =~ /[^\x00-\xFF]/;

    # The hash is carried as two 16-bit halves. Multiplying the full 32-bit
    # value by the FNV prime needs up to 57 bits, which only fits on a perl
    # built with 64-bit integers; elsewhere the product turns into a float
    # and a 32-bit mask can no longer wrap it correctly. With 16-bit limbs no
    # intermediate value exceeds 2**26, so the result is the same everywhere.
    #
    # FNV prime 16777619 == 2**24 + 403, therefore
    #   hash * prime == hash * 403 + ( hash << 24 )   (mod 2**32)
    # and the shifted term only moves the low byte of $lo into the top byte
    # of $hi.
    my ( $hi, $lo ) = ( 0x811C, 0x9DC5 );    # FNV-1a offset basis 2166136261
    for my $byte ( unpack 'C*', $octets ) {
        $lo ^= $byte;

        my $low_product = $lo * 403;
        my $carry       = $low_product >> 16;
        my $shifted     = ( $lo & 0xFF ) << 8;

        $hi = ( $hi * 403 + $carry + $shifted ) & 0xFFFF;
        $lo = $low_product & 0xFFFF;
    }

    return ( $hi << 16 ) | $lo;
}
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
1; |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
__END__ |