File Coverage

blib/lib/WWW/Crawl4AI/Detect.pm
Criterion Covered Total %
statement 38 47 80.8
branch 31 38 81.5
condition 38 57 66.6
subroutine 7 8 87.5
pod 5 5 100.0
total 119 155 76.7


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Detect;
2             # ABSTRACT: service detection and content-quality classification for Crawl4AI
3 3     3   115376 use strict;
  3         6  
  3         110  
4 3     3   14 use warnings;
  3         16  
  3         3609  
5              
6             our $VERSION = '0.001';
7              
8              
9             # Default: a result needs at least this many markdown characters to count.
10             our $MIN_MARKDOWN = 500;
11              
12             # HTTP status codes that mean "the target pushed back", not "transport broke".
13             my %SOFT_FAIL = map { $_ => 1 } ( 401, 403, 429 );
14              
15             my $RE_JS = qr/enable\s+javascript|please\s+enable\s+js|requires?\s+javascript/i;
16             my $RE_BLOCK = qr/access\s+denied|checking\s+your\s+browser|are\s+you\s+(?:a\s+)?human|verify\s+you\s+are\s+human|unusual\s+traffic/i;
17             my $RE_CAPTCHA = qr/\b(?:re)?captcha\b|hcaptcha|g-recaptcha|cf-turnstile/i;
18             my $RE_WALL = qr/cf-chl|cf_chl|__cf_|datadome|perimeterx|px-captcha|akamai|incapsula|imperva/i;
19             my $RE_TITLE = qr/^\s*just\s+a\s+moment|^\s*attention\s+required|^\s*access\s+denied/i;
20              
21             #----------------------------------------------------------------------
22             # Content classification
23             #----------------------------------------------------------------------
24              
25             sub signals {
26 102     102 1 3733 my ( $page, %opt ) = @_;
27 102 100       165 my $min = defined $opt{min_markdown} ? $opt{min_markdown} : $MIN_MARKDOWN;
28 102   50     144 $page ||= {};
29 102   50     164 my $md = $page->{markdown} // '';
30 102   100     307 my $html = ( $page->{raw_html} // $page->{html} // '' );
      100        
31 102   100     174 my $title = $page->{title} // '';
32 102   50     167 my $code = $page->{status_code} // 0;
33              
34             # 'blocked' is about content fingerprints (bot walls in the body), not HTTP
35             # status — status lives on its own axis (http_error) so why_failed can report
36             # a bare 403 as http_403 while a Cloudflare body still reads bot_wall_detected.
37 102   66     6260 my $blocked =
38             ( $md =~ $RE_BLOCK )
39             || ( $html =~ $RE_WALL )
40             || ( $title =~ $RE_TITLE );
41              
42             # A captcha marker only walls the page when its prompt shows up in the
43             # *rendered* text (markdown). An embedded widget -- a comment-form reCAPTCHA,
44             # a Cloudflare Turnstile login box -- leaves markers only in the HTML/script
45             # markup (class names, script src), not the visible content, so an HTML-only
46             # match on an otherwise content-rich page is NOT a wall. Treat an HTML-only
47             # match as blocking only when the page is also thin (a JS-rendered gate).
48 102         155 my $thin = length($md) < $min;
49 102   100     9044 my $captcha = ( $md =~ $RE_CAPTCHA ) || ( $html =~ $RE_CAPTCHA && $thin );
50              
51             return {
52             js_required => ( $md =~ $RE_JS ) ? 1 : 0,
53             blocked => $blocked ? 1 : 0,
54             captcha => $captcha ? 1 : 0,
55             thin_html => $thin ? 1 : 0,
56 102 100 100     2222 http_error => ( $code >= 500 || $SOFT_FAIL{$code} ) ? 1 : 0,
    100          
    100          
    100          
    100          
57             };
58             }
59              
60              
61             sub is_good {
62 43     43 1 278787 my ( $page, %opt ) = @_;
63 43 50       85 return 0 unless ref $page eq 'HASH';
64 43 100 100     97 return 0 if defined $page->{success} && !$page->{success};
65 42   50     61 my $code = $page->{status_code} // 0;
66 42 100 66     143 return 0 if $code && ( $code >= 500 || $SOFT_FAIL{$code} );
      66        
67 38         63 my $sig = signals( $page, %opt );
68 38 100 33     197 return 0 if $sig->{js_required} || $sig->{blocked} || $sig->{captcha} || $sig->{thin_html};
      33        
      66        
69 26         68 return 1;
70             }
71              
72              
73             # Most specific reason first.
74             sub why_failed {
75 22     22 1 317 my ( $page, %opt ) = @_;
76 22 50       49 return 'empty' unless ref $page eq 'HASH';
77 22         32 my $sig = signals( $page, %opt );
78 22 100       46 return 'captcha' if $sig->{captcha};
79 20 100       99 return 'bot_wall_detected' if $sig->{blocked};
80 16 100       30 return 'js_required' if $sig->{js_required};
81 15   50     24 my $code = $page->{status_code} // 0;
82 15 100 100     58 return "http_$code" if $code && ( $code >= 500 || $SOFT_FAIL{$code} );
      66        
83 13 100       158 return 'thin_content' if $sig->{thin_html};
84 2         12 return undef;
85             }
86              
87              
88             #----------------------------------------------------------------------
89             # Service detection
90             #----------------------------------------------------------------------
91              
92             sub _probe_ua {
93 0     0   0 my ( $ua, $timeout ) = @_;
94 0 0       0 return $ua if $ua;
95 0         0 require LWP::UserAgent;
96 0   0     0 return LWP::UserAgent->new( agent => "WWW-Crawl4AI/$VERSION", timeout => ( $timeout // 5 ) );
97             }
98              
99             sub probe_cloakbrowser {
100 1     1 1 710 my ( $cdp_url, %opt ) = @_;
101 1 50 33     6 return 0 unless defined $cdp_url && length $cdp_url;
102 0         0 ( my $base = $cdp_url ) =~ s{/+$}{};
103 0         0 $base =~ s{\?.*$}{}; # strip CloakBrowser query params (fingerprint=...)
104 0         0 my $ua = _probe_ua( $opt{ua}, $opt{timeout} );
105 0         0 my $res = $ua->get( $base . '/json/version' );
106 0 0       0 return $res->is_success ? 1 : 0;
107             }
108              
109              
110             sub detect_proxy_env {
111 1   50 1 1 5 return $ENV{CRAWL4AI_PROXY_URL} || undef;
112             }
113              
114              
115             1;
116              
117             __END__