File Coverage

blib/lib/WWW/Crawl4AI/Detect.pm
Criterion Covered Total %
statement 39 48 81.2
branch 31 38 81.5
condition 46 62 74.1
subroutine 7 8 87.5
pod 5 5 100.0
total 128 161 79.5


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Detect;
2             # ABSTRACT: service detection and content-quality classification for Crawl4AI
3 3     3   77714 use strict;
  3         5  
  3         81  
4 3     3   9 use warnings;
  3         8  
  3         3398  
5              
6             our $VERSION = '0.001';
7              
8              
9             # Default: a result needs at least this many markdown characters to count.
10             our $MIN_MARKDOWN = 500;
11              
12             # HTTP status codes that mean "the target pushed back", not "transport broke".
13             my %SOFT_FAIL = map { $_ => 1 } ( 401, 403, 429 );
14              
15             my $RE_JS = qr/enable\s+javascript|please\s+enable\s+js|requires?\s+javascript/i;
16             my $RE_BLOCK = qr/access\s+denied|checking\s+your\s+browser|are\s+you\s+(?:a\s+)?human|verify\s+you\s+are\s+human|unusual\s+traffic/i;
17             my $RE_CAPTCHA = qr/(?:\b(?:re)?captcha\b|hcaptcha|g-recaptcha|cf-turnstile)/i;
18             # Prompt language that signals a captcha *wall* (a gate the visitor must clear)
19             # rather than an incidental mention (a cookie-banner / privacy-policy note about
20             # reCAPTCHA). A wall tells the visitor to act on the captcha, or that the page is
21             # being held until they prove they are human.
22             my $RE_CAPTCHA_PROMPT = qr/
23             (?:complete|solve|finish|pass|verify)\b[^.]{0,40}$RE_CAPTCHA # "complete the captcha"
24             | $RE_CAPTCHA[^.]{0,40}(?:to\s+continue|to\s+proceed|to\s+access) # "captcha to continue"
25             | i['\x{2019}]?m\s+not\s+a\s+robot # "I'm not a robot"
26             | (?:verify|prove|confirm)\s+(?:that\s+)?you\s+are\s+(?:a\s+)?human # "verify you are human"
27             | checking\s+your\s+browser
28             | security\s+check
29             /ix;
30             my $RE_WALL = qr/cf-chl|cf_chl|__cf_|datadome|perimeterx|px-captcha|akamai|incapsula|imperva/i;
31             my $RE_TITLE = qr/^\s*just\s+a\s+moment|^\s*attention\s+required|^\s*access\s+denied/i;
32              
33             # A WAF / bot-management gate (Cloudflare, DataDome, PerimeterX, Akamai) often
34             # does not embed a widget into the requested page -- it REDIRECTS to a dedicated
35             # challenge URL. reCAPTCHA / hCaptcha redirects land on the provider's own
36             # verification endpoint. We key purely on the final (post-redirect) URL's
37             # host+path matching a known challenge endpoint: a real content page's
38             # final_url never contains /cdn-cgi/challenge etc., so URL equality with the
39             # requested URL is irrelevant (and checking it would false-positive on cosmetic
40             # http->https / www<->apex / trailing-slash redirects).
41             my $RE_CHALLENGE_CAPTCHA = qr{
42             (?:www\.)?google\.com/recaptcha # reCAPTCHA verification endpoint
43             | /recaptcha/api # reCAPTCHA api2/anchor frame
44             | \bhcaptcha\.com\b # hCaptcha challenge host
45             }ix;
46             my $RE_CHALLENGE_WALL = qr{
47             /cdn-cgi/challenge # Cloudflare managed challenge
48             | __cf_chl # Cloudflare challenge query/path token
49             | /challenge-platform/ # Cloudflare challenge-platform asset
50             | datadome # DataDome (host or path)
51             | geo\.captcha-delivery\.com # DataDome captcha delivery host
52             | /px/captcha # PerimeterX captcha path
53             | perimeterx # PerimeterX (host or path)
54             }ix;
55              
56             #----------------------------------------------------------------------
57             # Content classification
58             #----------------------------------------------------------------------
59              
60             sub signals {
61 121     121 1 8983 my ( $page, %opt ) = @_;
62 121 100       207 my $min = defined $opt{min_markdown} ? $opt{min_markdown} : $MIN_MARKDOWN;
63 121   50     172 $page ||= {};
64 121   50     196 my $md = $page->{markdown} // '';
65 121   100     370 my $html = ( $page->{raw_html} // $page->{html} // '' );
      100        
66 121   100     217 my $title = $page->{title} // '';
67 121   50     180 my $code = $page->{status_code} // 0;
68             # The post-redirect URL, falling back to the requested URL when the normalized
69             # page omits it. Either may be absent (signals() is also called on bare test
70             # hashes) -- when so, the challenge-URL checks below simply find no match and
71             # no signal is raised. Never warns/dies on missing keys.
72 121   66     285 my $final = $page->{final_url} // $page->{url} // '';
      50        
73              
74             # 'blocked' is about content fingerprints (bot walls in the body), not HTTP
75             # status — status lives on its own axis (http_error) so why_failed can report
76             # a bare 403 as http_403 while a Cloudflare body still reads bot_wall_detected.
77             # ADDITIONALLY: a WAF/bot-management gate often redirects to a challenge URL
78             # (e.g. /cdn-cgi/challenge, geo.captcha-delivery.com). When the final_url
79             # matches a known challenge endpoint we OR that in -- we never clear a signal
80             # already raised by a body fingerprint.
81 121   100     9863 my $blocked =
82             ( $md =~ $RE_BLOCK )
83             || ( $html =~ $RE_WALL )
84             || ( $title =~ $RE_TITLE )
85             || ( $final =~ $RE_CHALLENGE_WALL );
86              
87             # A captcha marker alone does not wall a page -- context decides. Three rules:
88             # * thin page + any marker (markdown OR html) -> wall. A near-empty page that
89             # mentions a captcha is a JS-rendered gate (the real content never loaded).
90             # * rich page + markdown marker + wall-PROMPT language -> wall. A verbose
91             # captcha page ("complete the hCaptcha to continue", "I'm not a robot")
92             # carries prompt language in its rendered text.
93             # * rich page + markdown marker but NO prompt language -> NOT a wall. This is
94             # an incidental mention -- a cookie banner / privacy policy noting that the
95             # site uses reCAPTCHA. The real content is present; do not punish it.
96             # * rich page + html-only marker -> NOT a wall. An embedded widget (comment-
97             # form reCAPTCHA, Turnstile login box) leaves markers only in the markup
98             # (class names, script src), never in the visible content.
99             # * redirect to a CAPTCHA provider's own verification endpoint
100             # (google.com/recaptcha, hcaptcha.com) -> wall. The final_url left the
101             # origin entirely and landed on the captcha provider. OR-ed in; an
102             # already-true captcha signal is never cleared.
103 121         194 my $thin = length($md) < $min;
104 121   100     17967 my $captcha =
105             ( $thin && ( $md =~ $RE_CAPTCHA || $html =~ $RE_CAPTCHA ) )
106             || ( $md =~ $RE_CAPTCHA && $md =~ $RE_CAPTCHA_PROMPT )
107             || ( $final =~ $RE_CHALLENGE_CAPTCHA );
108              
109             return {
110             js_required => ( $md =~ $RE_JS ) ? 1 : 0,
111             blocked => $blocked ? 1 : 0,
112             captcha => $captcha ? 1 : 0,
113             thin_html => $thin ? 1 : 0,
114 121 100 100     3106 http_error => ( $code >= 500 || $SOFT_FAIL{$code} ) ? 1 : 0,
    100          
    100          
    100          
    100          
115             };
116             }
117              
118              
119             sub is_good {
120 51     51 1 239520 my ( $page, %opt ) = @_;
121 51 50       103 return 0 unless ref $page eq 'HASH';
122 51 100 100     125 return 0 if defined $page->{success} && !$page->{success};
123 50   50     76 my $code = $page->{status_code} // 0;
124 50 100 66     183 return 0 if $code && ( $code >= 500 || $SOFT_FAIL{$code} );
      66        
125 46         68 my $sig = signals( $page, %opt );
126 46 100 66     233 return 0 if $sig->{js_required} || $sig->{blocked} || $sig->{captcha} || $sig->{thin_html};
      100        
      100        
127 31         96 return 1;
128             }
129              
130              
131             # Most specific reason first.
132             sub why_failed {
133 25     25 1 338 my ( $page, %opt ) = @_;
134 25 50       64 return 'empty' unless ref $page eq 'HASH';
135 25         44 my $sig = signals( $page, %opt );
136 25 100       54 return 'captcha' if $sig->{captcha};
137 22 100       93 return 'bot_wall_detected' if $sig->{blocked};
138 16 100       30 return 'js_required' if $sig->{js_required};
139 15   50     22 my $code = $page->{status_code} // 0;
140 15 100 100     89 return "http_$code" if $code && ( $code >= 500 || $SOFT_FAIL{$code} );
      66        
141 13 100       174 return 'thin_content' if $sig->{thin_html};
142 2         9 return undef;
143             }
144              
145              
146             #----------------------------------------------------------------------
147             # Service detection
148             #----------------------------------------------------------------------
149              
150             sub _probe_ua {
151 0     0   0 my ( $ua, $timeout ) = @_;
152 0 0       0 return $ua if $ua;
153 0         0 require LWP::UserAgent;
154 0   0     0 return LWP::UserAgent->new( agent => "WWW-Crawl4AI/$VERSION", timeout => ( $timeout // 5 ) );
155             }
156              
157             sub probe_cloakbrowser {
158 1     1 1 739 my ( $cdp_url, %opt ) = @_;
159 1 50 33     7 return 0 unless defined $cdp_url && length $cdp_url;
160 0         0 ( my $base = $cdp_url ) =~ s{/+$}{};
161 0         0 $base =~ s{\?.*$}{}; # strip CloakBrowser query params (fingerprint=...)
162 0         0 my $ua = _probe_ua( $opt{ua}, $opt{timeout} );
163 0         0 my $res = $ua->get( $base . '/json/version' );
164 0 0       0 return $res->is_success ? 1 : 0;
165             }
166              
167              
168             sub detect_proxy_env {
169 1   50 1 1 5 return $ENV{CRAWL4AI_PROXY_URL} || undef;
170             }
171              
172              
173             1;
174              
175             __END__