File Coverage

blib/lib/WWW/Crawl4AI/Detect.pm
Criterion Covered Total %
statement 36 45 80.0
branch 27 34 79.4
condition 31 46 67.3
subroutine 7 8 87.5
pod 5 5 100.0
total 106 138 76.8


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Detect;
2             # ABSTRACT: service detection and content-quality classification for Crawl4AI
3 3     3   77142 use strict;
  3         6  
  3         110  
4 3     3   12 use warnings;
  3         9  
  3         2447  
5              
6             our $VERSION = '0.001';
7              
8              
9             # Default: a result needs at least this many markdown characters to count.
10             our $MIN_MARKDOWN = 500;
11              
12             # HTTP status codes that mean "the target pushed back", not "transport broke".
13             my %SOFT_FAIL = map { $_ => 1 } ( 401, 403, 429 );
14              
15             # A WAF / bot-management gate (Cloudflare, DataDome, PerimeterX, Akamai) often
16             # does not embed a widget into the requested page -- it REDIRECTS to a dedicated
17             # challenge URL. reCAPTCHA / hCaptcha redirects land on the provider's own
18             # verification endpoint. We key purely on the final (post-redirect) URL's
19             # host+path matching a known challenge endpoint: a real content page's
20             # final_url never contains /cdn-cgi/challenge etc., so URL equality with the
21             # requested URL is irrelevant (and checking it would false-positive on cosmetic
22             # http->https / www<->apex / trailing-slash redirects).
23             my $RE_CHALLENGE_CAPTCHA = qr{
24             (?:www\.)?google\.com/recaptcha # reCAPTCHA verification endpoint
25             | /recaptcha/api # reCAPTCHA api2/anchor frame
26             | \bhcaptcha\.com\b # hCaptcha challenge host
27             }ix;
28             my $RE_CHALLENGE_WALL = qr{
29             /cdn-cgi/challenge # Cloudflare managed challenge
30             | __cf_chl # Cloudflare challenge query/path token
31             | /challenge-platform/ # Cloudflare challenge-platform asset
32             | datadome # DataDome (host or path)
33             | geo\.captcha-delivery\.com # DataDome captcha delivery host
34             | /px/captcha # PerimeterX captcha path
35             | perimeterx # PerimeterX (host or path)
36             }ix;
37              
38             #----------------------------------------------------------------------
39             # Content classification
40             #----------------------------------------------------------------------
41              
42             sub signals {
43 132     132 1 9616 my ( $page, %opt ) = @_;
44 132 100       206 my $min = defined $opt{min_markdown} ? $opt{min_markdown} : $MIN_MARKDOWN;
45 132   50     199 $page ||= {};
46 132   50     209 my $md = $page->{markdown} // '';
47 132   50     176 my $code = $page->{status_code} // 0;
48             # The post-redirect URL, falling back to the requested URL when the normalized
49             # page omits it. Either may be absent (signals() is also called on bare test
50             # hashes) -- when so, the challenge-URL checks below simply find no match and
51             # no signal is raised. Never warns/dies on missing keys.
52 132   66     359 my $final = $page->{final_url} // $page->{url} // '';
      50        
53              
54             # Content volume is the master signal. A bot-wall / JS-shell / captcha gate
55             # REPLACES the page content -- it is, by definition, thin. So a content-rich
56             # page (>= $min markdown chars) that came back 200 IS the scrape: nothing in
57             # its body text or may discard it. The body/title phrase heuristics </td> </tr> <tr> <td class="h" > <a name="58">58</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # ($RE_BLOCK, $RE_JS, $RE_CAPTCHA body arms, $RE_WALL HTML-token, $RE_TITLE) </td> </tr> <tr> <td class="h" > <a name="59">59</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # were removed in 0.005: on a thin page they were redundant (thin_html already </td> </tr> <tr> <td class="h" > <a name="60">60</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # fails the page), and on a full page they were pure false-positives -- a </td> </tr> <tr> <td class="h" > <a name="61">61</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # 386 KB article carrying Cloudflare's passive __cf_ beacon, or a legit </td> </tr> <tr> <td class="h" > <a name="62">62</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # "Access Denied" <title>, was wrongly thrown away. The ONLY size-independent </td> </tr> <tr> <td class="h" > <a name="63">63</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # block signals kept are the fingerprints a real content page can never carry: </td> </tr> <tr> <td class="h" > <a name="64">64</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # an HTTP push-back status, or a redirect whose final_url is a known WAF / </td> </tr> <tr> <td class="h" > <a name="65">65</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # captcha challenge endpoint (the page physically left the origin). </td> </tr> <tr> <td class="h" > <a name="66">66</a> </td> <td class="c3" > 132 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 174 </td> <td class="s"> my $thin = length($md) < $min; </td> </tr> <tr> <td class="h" > <a name="67">67</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="68">68</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # 'blocked' / 'captcha' fire only when the post-redirect final_url is a known </td> </tr> <tr> <td class="h" > <a name="69">69</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # WAF / captcha challenge endpoint. Not HTTP status -- that lives on the </td> </tr> <tr> <td class="h" > <a name="70">70</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # http_error axis. A site that soft-blocks us by serving one identical </td> </tr> <tr> <td class="h" > <a name="71">71</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # interstitial for every URL (200, no redirect) is caught one level up, by the </td> </tr> <tr> <td class="h" > <a name="72">72</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # caller comparing markdown across the fetched pages -- not here, per-page. </td> </tr> <tr> <td class="h" > <a name="73">73</a> </td> <td class="c3" > 132 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#73-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 681 </td> <td class="s"> my $blocked = ( $final =~ $RE_CHALLENGE_WALL ) ? 1 : 0; </td> </tr> <tr> <td class="h" > <a name="74">74</a> </td> <td class="c3" > 132 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#74-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 668 </td> <td class="s"> my $captcha = ( $final =~ $RE_CHALLENGE_CAPTCHA ) ? 1 : 0; </td> </tr> <tr> <td class="h" > <a name="75">75</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="76">76</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> return { </td> </tr> <tr> <td class="h" > <a name="77">77</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> blocked => $blocked, </td> </tr> <tr> <td class="h" > <a name="78">78</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> captcha => $captcha, </td> </tr> <tr> <td class="h" > <a name="79">79</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> thin_html => $thin ? 1 : 0, </td> </tr> <tr> <td class="h" > <a name="80">80</a> </td> <td class="c3" > 132 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#80-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#80-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 651 </td> <td class="s"> http_error => ( $code >= 500 || $SOFT_FAIL{$code} ) ? 1 : 0, </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#-2"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="81">81</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> }; </td> </tr> <tr> <td class="h" > <a name="82">82</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="83">83</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="84">84</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="85">85</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub is_good { </td> </tr> <tr> <td class="h" > <a name="86">86</a> </td> <td class="c3" > 57 </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#86-1"> 57 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#86-1"> 1 </a> </td> <td > 241232 </td> <td class="s"> my ( $page, %opt ) = @_; </td> </tr> <tr> <td class="h" > <a name="87">87</a> </td> <td class="c3" > 57 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#87-1"> 50 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 119 </td> <td class="s"> return 0 unless ref $page eq 'HASH'; </td> </tr> <tr> <td class="h" > <a name="88">88</a> </td> <td class="c3" > 57 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#88-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#88-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 141 </td> <td class="s"> return 0 if defined $page->{success} && !$page->{success}; </td> </tr> <tr> <td class="h" > <a name="89">89</a> </td> <td class="c3" > 56 </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#89-1"> 50 </a> </td> <td >   </td> <td >   </td> <td > 78 </td> <td class="s"> my $code = $page->{status_code} // 0; </td> </tr> <tr> <td class="h" > <a name="90">90</a> </td> <td class="c3" > 56 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#90-1"> 100 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#90-1"> 66 </a> </td> <td >   </td> <td >   </td> <td > 200 </td> <td class="s"> return 0 if $code && ( $code >= 500 || $SOFT_FAIL{$code} ); </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 66 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="91">91</a> </td> <td class="c3" > 55 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 78 </td> <td class="s"> my $sig = signals( $page, %opt ); </td> </tr> <tr> <td class="h" > <a name="92">92</a> </td> <td class="c3" > 55 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#92-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#92-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 211 </td> <td class="s"> return 0 if $sig->{blocked} || $sig->{captcha} || $sig->{thin_html}; </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="93">93</a> </td> <td class="c3" > 34 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 109 </td> <td class="s"> return 1; </td> </tr> <tr> <td class="h" > <a name="94">94</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="95">95</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="96">96</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="97">97</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # Most specific reason first. </td> </tr> <tr> <td class="h" > <a name="98">98</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub why_failed { </td> </tr> <tr> <td class="h" > <a name="99">99</a> </td> <td class="c3" > 26 </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#99-1"> 26 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#99-1"> 1 </a> </td> <td > 38 </td> <td class="s"> my ( $page, %opt ) = @_; </td> </tr> <tr> <td class="h" > <a name="100">100</a> </td> <td class="c3" > 26 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#100-1"> 50 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 63 </td> <td class="s"> return 'empty' unless ref $page eq 'HASH'; </td> </tr> <tr> <td class="h" > <a name="101">101</a> </td> <td class="c3" > 26 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 43 </td> <td class="s"> my $sig = signals( $page, %opt ); </td> </tr> <tr> <td class="h" > <a name="102">102</a> </td> <td class="c3" > 26 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#102-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 50 </td> <td class="s"> return 'captcha' if $sig->{captcha}; </td> </tr> <tr> <td class="h" > <a name="103">103</a> </td> <td class="c3" > 25 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#103-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 108 </td> <td class="s"> return 'bot_wall_detected' if $sig->{blocked}; </td> </tr> <tr> <td class="h" > <a name="104">104</a> </td> <td class="c3" > 20 </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#104-1"> 50 </a> </td> <td >   </td> <td >   </td> <td > 46 </td> <td class="s"> my $code = $page->{status_code} // 0; </td> </tr> <tr> <td class="h" > <a name="105">105</a> </td> <td class="c3" > 20 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#105-1"> 100 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#105-1"> 100 </a> </td> <td >   </td> <td >   </td> <td > 93 </td> <td class="s"> return "http_$code" if $code && ( $code >= 500 || $SOFT_FAIL{$code} ); </td> </tr> <tr> <td class="h" > <a > </a> </td> <td >   </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#-2"> 66 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="106">106</a> </td> <td class="c3" > 18 </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#106-1"> 100 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 178 </td> <td class="s"> return 'thin_content' if $sig->{thin_html}; </td> </tr> <tr> <td class="h" > <a name="107">107</a> </td> <td class="c3" > 4 </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 17 </td> <td class="s"> return undef; </td> </tr> <tr> <td class="h" > <a name="108">108</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="109">109</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="110">110</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="111">111</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> #---------------------------------------------------------------------- </td> </tr> <tr> <td class="h" > <a name="112">112</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> # Service detection </td> </tr> <tr> <td class="h" > <a name="113">113</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> #---------------------------------------------------------------------- </td> </tr> <tr> <td class="h" > <a name="114">114</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="115">115</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub _probe_ua { </td> </tr> <tr> <td class="h" > <a name="116">116</a> </td> <td class="c0" > <a href="#117"> 0 </a> </td> <td >   </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#116-1"> 0 </a> </td> <td >   </td> <td > 0 </td> <td class="s"> my ( $ua, $timeout ) = @_; </td> </tr> <tr> <td class="h" > <a name="117">117</a> </td> <td class="c0" > <a href="#118"> 0 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#117-1"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> return $ua if $ua; </td> </tr> <tr> <td class="h" > <a name="118">118</a> </td> <td class="c0" > <a href="#119"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> require LWP::UserAgent; </td> </tr> <tr> <td class="h" > <a name="119">119</a> </td> <td class="c0" > <a href="#125"> 0 </a> </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#119-1"> 0 </a> </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> return LWP::UserAgent->new( agent => "WWW-Crawl4AI/$VERSION", timeout => ( $timeout // 5 ) ); </td> </tr> <tr> <td class="h" > <a name="120">120</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="121">121</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="122">122</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub probe_cloakbrowser { </td> </tr> <tr> <td class="h" > <a name="123">123</a> </td> <td class="c3" > 1 </td> <td >   </td> <td >   </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#123-1"> 1 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#123-1"> 1 </a> </td> <td > 719 </td> <td class="s"> my ( $cdp_url, %opt ) = @_; </td> </tr> <tr> <td class="h" > <a name="124">124</a> </td> <td class="c3" > 1 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#124-1"> 50 </a> </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#124-1"> 33 </a> </td> <td >   </td> <td >   </td> <td > 16 </td> <td class="s"> return 0 unless defined $cdp_url && length $cdp_url; </td> </tr> <tr> <td class="h" > <a name="125">125</a> </td> <td class="c0" > <a href="#126"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> ( my $base = $cdp_url ) =~ s{/+$}{}; </td> </tr> <tr> <td class="h" > <a name="126">126</a> </td> <td class="c0" > <a href="#127"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> $base =~ s{\?.*$}{}; # strip CloakBrowser query params (fingerprint=...) </td> </tr> <tr> <td class="h" > <a name="127">127</a> </td> <td class="c0" > <a href="#128"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> my $ua = _probe_ua( $opt{ua}, $opt{timeout} ); </td> </tr> <tr> <td class="h" > <a name="128">128</a> </td> <td class="c0" > <a href="#129"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> my $res = $ua->get( $base . '/json/version' ); </td> </tr> <tr> <td class="h" > <a name="129">129</a> </td> <td class="c0" > 0 </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--branch.html#129-1"> 0 </a> </td> <td >   </td> <td >   </td> <td >   </td> <td > 0 </td> <td class="s"> return $res->is_success ? 1 : 0; </td> </tr> <tr> <td class="h" > <a name="130">130</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="131">131</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="132">132</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="133">133</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub detect_proxy_env { </td> </tr> <tr> <td class="h" > <a name="134">134</a> </td> <td class="c3" > 1 </td> <td >   </td> <td class="c0" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--condition.html#134-1"> 50 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#134-1"> 1 </a> </td> <td class="c3" > <a href="blib-lib-WWW-Crawl4AI-Detect-pm--subroutine.html#134-1"> 1 </a> </td> <td > 4 </td> <td class="s"> return $ENV{CRAWL4AI_PROXY_URL} || undef; </td> </tr> <tr> <td class="h" > <a name="135">135</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="136">136</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="137">137</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="138">138</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> 1; </td> </tr> <tr> <td class="h" > <a name="139">139</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="140">140</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> __END__ </td> </tr> </table> </body> </html>