File Coverage

blib/lib/WWW/Crawl4AI/Detect.pm
Criterion Covered Total %
statement 39 48 81.2
branch 31 38 81.5
condition 49 65 75.3
subroutine 7 8 87.5
pod 5 5 100.0
total 131 164 79.8


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Detect;
2             # ABSTRACT: service detection and content-quality classification for Crawl4AI
3 3     3   78085 use strict;
  3         6  
  3         83  
4 3     3   9 use warnings;
  3         7  
  3         3376  
5              
6             our $VERSION = '0.001';
7              
8              
9             # Default: a result needs at least this many markdown characters to count.
10             our $MIN_MARKDOWN = 500;
11              
12             # HTTP status codes that mean "the target pushed back", not "transport broke".
13             my %SOFT_FAIL = map { $_ => 1 } ( 401, 403, 429 );
14              
15             my $RE_JS = qr/enable\s+javascript|please\s+enable\s+js|requires?\s+javascript/i;
16             my $RE_BLOCK = qr/access\s+denied|checking\s+your\s+browser|are\s+you\s+(?:a\s+)?human|verify\s+you\s+are\s+human|unusual\s+traffic/i;
17             my $RE_CAPTCHA = qr/(?:\b(?:re)?captcha\b|hcaptcha|g-recaptcha|cf-turnstile)/i;
18             my $RE_WALL = qr/cf-chl|cf_chl|__cf_|datadome|perimeterx|px-captcha|akamai|incapsula|imperva/i;
19             my $RE_TITLE = qr/^\s*just\s+a\s+moment|^\s*attention\s+required|^\s*access\s+denied/i;
20              
21             # A WAF / bot-management gate (Cloudflare, DataDome, PerimeterX, Akamai) often
22             # does not embed a widget into the requested page -- it REDIRECTS to a dedicated
23             # challenge URL. reCAPTCHA / hCaptcha redirects land on the provider's own
24             # verification endpoint. We key purely on the final (post-redirect) URL's
25             # host+path matching a known challenge endpoint: a real content page's
26             # final_url never contains /cdn-cgi/challenge etc., so URL equality with the
27             # requested URL is irrelevant (and checking it would false-positive on cosmetic
28             # http->https / www<->apex / trailing-slash redirects).
29             my $RE_CHALLENGE_CAPTCHA = qr{
30             (?:www\.)?google\.com/recaptcha # reCAPTCHA verification endpoint
31             | /recaptcha/api # reCAPTCHA api2/anchor frame
32             | \bhcaptcha\.com\b # hCaptcha challenge host
33             }ix;
34             my $RE_CHALLENGE_WALL = qr{
35             /cdn-cgi/challenge # Cloudflare managed challenge
36             | __cf_chl # Cloudflare challenge query/path token
37             | /challenge-platform/ # Cloudflare challenge-platform asset
38             | datadome # DataDome (host or path)
39             | geo\.captcha-delivery\.com # DataDome captcha delivery host
40             | /px/captcha # PerimeterX captcha path
41             | perimeterx # PerimeterX (host or path)
42             }ix;
43              
44             #----------------------------------------------------------------------
45             # Content classification
46             #----------------------------------------------------------------------
47              
48             sub signals {
49 129     129 1 11447 my ( $page, %opt ) = @_;
50 129 100       210 my $min = defined $opt{min_markdown} ? $opt{min_markdown} : $MIN_MARKDOWN;
51 129   50     229 $page ||= {};
52 129   50     231 my $md = $page->{markdown} // '';
53 129   100     431 my $html = ( $page->{raw_html} // $page->{html} // '' );
      100        
54 129   100     255 my $title = $page->{title} // '';
55 129   50     209 my $code = $page->{status_code} // 0;
56             # The post-redirect URL, falling back to the requested URL when the normalized
57             # page omits it. Either may be absent (signals() is also called on bare test
58             # hashes) -- when so, the challenge-URL checks below simply find no match and
59             # no signal is raised. Never warns/dies on missing keys.
60 129   66     290 my $final = $page->{final_url} // $page->{url} // '';
      50        
61              
62             # Content volume is the master signal. A bot-wall / JS-shell / captcha gate
63             # REPLACES the page content -- it is, by definition, thin. So every signal
64             # derived from VISIBLE rendered text (the markdown) is only trustworthy on a
65             # thin page: on a content-rich page those same words are incidental mentions
66             # (a footer "enable JavaScript", an article quoting "unusual traffic", a
67             # privacy note about reCAPTCHA) and must NOT discard a successful scrape --
68             # body words can never prove a scrape was impossible once we hold the content.
69             # STRUCTURAL fingerprints are exempt: WAF tokens in the HTML markup
70             # (__cf_chl, datadome), a "Just a moment" , or a redirect whose </td> </tr> <tr> <td class="h" > <a name="71">71</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # final_url is a known challenge endpoint -- a real content page never carries </td> </tr> <tr> <td class="h" > <a name="72">72</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # those, regardless of size. </td> </tr> <tr> <td class="h" > <a name="73">73</a> </td> <td class="c3" > 129 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 158 </td> <td class="s"> my $thin = length($md) < $min; </td> </tr> <tr> <td class="h" > <a name="74">74</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="75">75</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # 'blocked' is a bot-wall fingerprint, not HTTP status (status lives on the </td> </tr> <tr> <td class="h" > <a name="76">76</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # http_error axis, so a bare 403 reads http_403 while a Cloudflare body reads </td> </tr> <tr> <td class="h" > <a name="77">77</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # bot_wall_detected). The visible-text match ($RE_BLOCK) only counts on a thin </td> </tr> <tr> <td class="h" > <a name="78">78</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # page; the structural arms (WAF tokens in HTML, "Just a moment" title, </td> </tr> <tr> <td class="h" > <a name="79">79</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # redirect to a challenge URL) stand alone. </td> </tr> <tr> <td class="h" > <a name="80">80</a> </td> <td class="c3" > 129 </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#80-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 1427 </td> <td class="s"> my $blocked = </td> </tr> <tr> <td class="h" > <a name="81">81</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> ( $thin && $md =~ $RE_BLOCK ) </td> </tr> <tr> <td class="h" > <a name="82">82</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> || ( $html =~ $RE_WALL ) </td> </tr> <tr> <td class="h" > <a name="83">83</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> || ( $title =~ $RE_TITLE ) </td> </tr> <tr> <td class="h" > <a name="84">84</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> || ( $final =~ $RE_CHALLENGE_WALL ); </td> </tr> <tr> <td class="h" > <a name="85">85</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="86">86</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # 'captcha' is a captcha *wall*, not an incidental widget or mention: </td> </tr> <tr> <td class="h" > <a name="87">87</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # * thin page + any marker (markdown OR html) -> wall. A near-empty page that </td> </tr> <tr> <td class="h" > <a name="88">88</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # carries a captcha marker is a JS-rendered gate (real content never loaded). </td> </tr> <tr> <td class="h" > <a name="89">89</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # * redirect to a CAPTCHA provider's own verification endpoint </td> </tr> <tr> <td class="h" > <a name="90">90</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # (google.com/recaptcha, hcaptcha.com) -> wall. The final_url left the </td> </tr> <tr> <td class="h" > <a name="91">91</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # origin and landed on the captcha provider; size-independent. </td> </tr> <tr> <td class="h" > <a name="92">92</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # * rich page + marker (markdown OR html-only) -> NOT a wall. A cookie-banner </td> </tr> <tr> <td class="h" > <a name="93">93</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # reCAPTCHA note, an embedded comment-form widget, a Turnstile login box -- </td> </tr> <tr> <td class="h" > <a name="94">94</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # the real content is present, so the marker is incidental. </td> </tr> <tr> <td class="h" > <a name="95">95</a> </td> <td class="c3" > 129 </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#95-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 881 </td> <td class="s"> my $captcha = </td> </tr> <tr> <td class="h" > <a name="96">96</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> ( $thin && ( $md =~ $RE_CAPTCHA || $html =~ $RE_CAPTCHA ) ) </td> </tr> <tr> <td class="h" > <a name="97">97</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> || ( $final =~ $RE_CHALLENGE_CAPTCHA ); </td> </tr> <tr> <td class="h" > <a name="98">98</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="99">99</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> return { </td> </tr> <tr> <td class="h" > <a name="100">100</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # A thin JS shell whose only text is "enable JavaScript" -- the real content </td> </tr> <tr> <td class="h" > <a name="101">101</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # never rendered. A rich page that merely mentions JavaScript is already </td> </tr> <tr> <td class="h" > <a name="102">102</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # rendered, so the match is incidental. </td> </tr> <tr> <td class="h" > <a name="103">103</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> js_required => ( $thin && $md =~ $RE_JS ) ? 1 : 0, </td> </tr> <tr> <td class="h" > <a name="104">104</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> blocked => $blocked ? 1 : 0, </td> </tr> <tr> <td class="h" > <a name="105">105</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> captcha => $captcha ? 1 : 0, </td> </tr> <tr> <td class="h" > <a name="106">106</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> thin_html => $thin ? 1 : 0, </td> </tr> <tr> <td class="h" > <a name="107">107</a> </td> <td class="c3" > 129 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#107-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#107-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 1005 </td> <td class="s"> http_error => ( $code >= 500 || $SOFT_FAIL{$code} ) ? 1 : 0, </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#-2"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#-3"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#-4"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#-5"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="108">108</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> }; </td> </tr> <tr> <td class="h" > <a name="109">109</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="110">110</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="111">111</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="112">112</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub is_good { </td> </tr> <tr> <td class="h" > <a name="113">113</a> </td> <td class="c3" > 53 </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#113-1"> 53 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#113-1"> 1 </a> </td> <td > 268766 </td> <td class="s"> my ( $page, %opt ) = @_; </td> </tr> <tr> <td class="h" > <a name="114">114</a> </td> <td class="c3" > 53 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#114-1"> 50 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 108 </td> <td class="s"> return 0 unless ref $page eq 'HASH'; </td> </tr> <tr> <td class="h" > <a name="115">115</a> </td> <td class="c3" > 53 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#115-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#115-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 134 </td> <td class="s"> return 0 if defined $page->{success} && !$page->{success}; </td> </tr> <tr> <td class="h" > <a name="116">116</a> </td> <td class="c3" > 52 </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#116-1"> 50 </a> </td> <td >   </td> <td >   </td> <td > 102 </td> <td class="s"> my $code = $page->{status_code} // 0; </td> </tr> <tr> <td class="h" > <a name="117">117</a> </td> <td class="c3" > 52 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#117-1"> 100 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#117-1"> 66 </a> </td> <td >   </td> <td >   </td> <td > 210 </td> <td class="s"> return 0 if $code && ( $code >= 500 || $SOFT_FAIL{$code} ); </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 66 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="118">118</a> </td> <td class="c3" > 51 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 76 </td> <td class="s"> my $sig = signals( $page, %opt ); </td> </tr> <tr> <td class="h" > <a name="119">119</a> </td> <td class="c3" > 51 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#119-1"> 100 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#119-1"> 66 </a> </td> <td >   </td> <td >   </td> <td > 251 </td> <td class="s"> return 0 if $sig->{js_required} || $sig->{blocked} || $sig->{captcha} || $sig->{thin_html}; </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-3"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="120">120</a> </td> <td class="c3" > 33 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 89 </td> <td class="s"> return 1; </td> </tr> <tr> <td class="h" > <a name="121">121</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="122">122</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="123">123</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="124">124</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # Most specific reason first. </td> </tr> <tr> <td class="h" > <a name="125">125</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub why_failed { </td> </tr> <tr> <td class="h" > <a name="126">126</a> </td> <td class="c3" > 26 </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#126-1"> 26 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#126-1"> 1 </a> </td> <td > 386 </td> <td class="s"> my ( $page, %opt ) = @_; </td> </tr> <tr> <td class="h" > <a name="127">127</a> </td> <td class="c3" > 26 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#127-1"> 50 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 78 </td> <td class="s"> return 'empty' unless ref $page eq 'HASH'; </td> </tr> <tr> <td class="h" > <a name="128">128</a> </td> <td class="c3" > 26 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 44 </td> <td class="s"> my $sig = signals( $page, %opt ); </td> </tr> <tr> <td class="h" > <a name="129">129</a> </td> <td class="c3" > 26 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#129-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 73 </td> <td class="s"> return 'captcha' if $sig->{captcha}; </td> </tr> <tr> <td class="h" > <a name="130">130</a> </td> <td class="c3" > 23 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#130-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 93 </td> <td class="s"> return 'bot_wall_detected' if $sig->{blocked}; </td> </tr> <tr> <td class="h" > <a name="131">131</a> </td> <td class="c3" > 17 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#131-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 35 </td> <td class="s"> return 'js_required' if $sig->{js_required}; </td> </tr> <tr> <td class="h" > <a name="132">132</a> </td> <td class="c3" > 16 </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#132-1"> 50 </a> </td> <td >   </td> <td >   </td> <td > 27 </td> <td class="s"> my $code = $page->{status_code} // 0; </td> </tr> <tr> <td class="h" > <a name="133">133</a> </td> <td class="c3" > 16 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#133-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#133-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 73 </td> <td class="s"> return "http_$code" if $code && ( $code >= 500 || $SOFT_FAIL{$code} ); </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 66 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="134">134</a> </td> <td class="c3" > 14 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#134-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 166 </td> <td class="s"> return 'thin_content' if $sig->{thin_html}; </td> </tr> <tr> <td class="h" > <a name="135">135</a> </td> <td class="c3" > 3 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 19 </td> <td class="s"> return undef; </td> </tr> <tr> <td class="h" > <a name="136">136</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="137">137</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="138">138</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="139">139</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> #---------------------------------------------------------------------- </td> </tr> <tr> <td class="h" > <a name="140">140</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # Service detection </td> </tr> <tr> <td class="h" > <a name="141">141</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> #---------------------------------------------------------------------- </td> </tr> <tr> <td class="h" > <a name="142">142</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="143">143</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub _probe_ua { </td> </tr> <tr> <td class="h" > <a name="144">144</a> </td> <td class="c0" > <a href="#145"> 0 </a> </td> <td >   </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#144-1"> 0 </a> </td> <td >   </td> <td > 0 </td> <td class="s"> my ( $ua, $timeout ) = @_; </td> </tr> <tr> <td class="h" > <a name="145">145</a> </td> <td class="c0" > <a href="#146"> 0 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#145-1"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> return $ua if $ua; </td> </tr> <tr> <td class="h" > <a name="146">146</a> </td> <td class="c0" > <a href="#147"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> require LWP::UserAgent; </td> </tr> <tr> <td class="h" > <a name="147">147</a> </td> <td class="c0" > <a href="#153"> 0 </a> </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#147-1"> 0 </a> </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> return LWP::UserAgent->new( agent => "WWW-Crawl4AI/$VERSION", timeout => ( $timeout // 5 ) ); </td> </tr> <tr> <td class="h" > <a name="148">148</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="149">149</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="150">150</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub probe_cloakbrowser { </td> </tr> <tr> <td class="h" > <a name="151">151</a> </td> <td class="c3" > 1 </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#151-1"> 1 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#151-1"> 1 </a> </td> <td > 989 </td> <td class="s"> my ( $cdp_url, %opt ) = @_; </td> </tr> <tr> <td class="h" > <a name="152">152</a> </td> <td class="c3" > 1 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#152-1"> 50 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#152-1"> 33 </a> </td> <td >   </td> <td >   </td> <td > 8 </td> <td class="s"> return 0 unless defined $cdp_url && length $cdp_url; </td> </tr> <tr> <td class="h" > <a name="153">153</a> </td> <td class="c0" > <a href="#154"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> ( my $base = $cdp_url ) =~ s{/+$}{}; </td> </tr> <tr> <td class="h" > <a name="154">154</a> </td> <td class="c0" > <a href="#155"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> $base =~ s{\?.*$}{}; # strip CloakBrowser query params (fingerprint=...) </td> </tr> <tr> <td class="h" > <a name="155">155</a> </td> <td class="c0" > <a href="#156"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> my $ua = _probe_ua( $opt{ua}, $opt{timeout} ); </td> </tr> <tr> <td class="h" > <a name="156">156</a> </td> <td class="c0" > <a href="#157"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> my $res = $ua->get( $base . '/json/version' ); </td> </tr> <tr> <td class="h" > <a name="157">157</a> </td> <td class="c0" > 0 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#157-1"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> return $res->is_success ? 1 : 0; </td> </tr> <tr> <td class="h" > <a name="158">158</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="159">159</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="160">160</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="161">161</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub detect_proxy_env { </td> </tr> <tr> <td class="h" > <a name="162">162</a> </td> <td class="c3" > 1 </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#162-1"> 50 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#162-1"> 1 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#162-1"> 1 </a> </td> <td > 8 </td> <td class="s"> return $ENV{CRAWL4AI_PROXY_URL} || undef; </td> </tr> <tr> <td class="h" > <a name="163">163</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="164">164</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="165">165</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="166">166</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> 1; </td> </tr> <tr> <td class="h" > <a name="167">167</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="168">168</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> __END__ </td> </tr> </table> </body> </html>