File Coverage

blib/lib/WWW/Crawl4AI/Result.pm
Criterion Covered Total %
statement 44 45 97.7
branch 16 24 66.6
condition 7 13 53.8
subroutine 12 12 100.0
pod 8 8 100.0
total 87 102 85.2


line stmt bran cond sub pod time code
1             package WWW::Crawl4AI::Result;
2             # ABSTRACT: normalized result of a WWW::Crawl4AI strategy chain
3 4     4   108290 use Moo;
  4         5971  
  4         23  
4 4     4   2593 use JSON::MaybeXS ();
  4         10898  
  4         103  
5 4     4   984 use URI ();
  4         11198  
  4         3938  
6              
7             our $VERSION = '0.001';
8              
9              
10             has ok => ( is => 'ro', default => sub { 0 } );
11              
12              
13             has url => ( is => 'ro' );
14              
15              
16             has final_url => ( is => 'ro' );
17              
18              
19             has status => ( is => 'ro' );
20              
21              
22             has markdown => ( is => 'ro' );
23              
24              
25             has html => ( is => 'ro' );
26              
27              
28             has title => ( is => 'ro' );
29              
30              
31             has backend => ( is => 'ro' );
32              
33              
34             has cost_class => ( is => 'ro' );
35              
36              
37             has signals => ( is => 'ro', default => sub { {} } );
38              
39              
40             has why_failed => ( is => 'ro' );
41              
42              
43             has error => ( is => 'ro' );
44              
45              
46             has attempts => ( is => 'ro', default => sub { [] } );
47              
48              
49             has links => ( is => 'ro', default => sub { { internal => [], external => [] } } );
50              
51              
52             has response_headers => ( is => 'ro', default => sub { {} } );
53              
54              
55             has _json => (
56             is => 'lazy',
57             default => sub { JSON::MaybeXS->new( utf8 => 0, canonical => 1, convert_blessed => 1 ) },
58             );
59              
60             sub from_attempt {
61 29     29 1 300199 my ( $class, $attempt, %extra ) = @_;
62 29   50     70 my $page = $attempt->page || {};
63             return $class->new(
64             ok => $attempt->ok,
65             url => $page->{url},
66             final_url => $page->{final_url} // $page->{url},
67             status => $page->{status_code},
68             markdown => $page->{markdown},
69             html => $page->{html},
70             title => $page->{title},
71             backend => $attempt->backend,
72             cost_class => $attempt->cost_class,
73             signals => $attempt->signals,
74             why_failed => $attempt->why_failed,
75             links => $page->{links} // { internal => [], external => [] },
76 29   66     267 response_headers => _lc_headers( $page->{response_headers} ),
      100        
77             %extra,
78             );
79             }
80              
81              
82 3     3 1 10912 sub attempt_count { scalar @{ $_[0]->attempts } }
  3         31  
83              
84              
85 14 50   14 1 69 sub internal_links { $_[0]->links->{internal} || [] }
86              
87              
88 14 50   14 1 47 sub external_links { $_[0]->links->{external} || [] }
89              
90              
91             sub urls {
92 13     13 1 19 my ( $self ) = @_;
93 13   33     34 my $base = $self->final_url // $self->url;
94 13         20 my ( %seen, @urls );
95 13         15 for my $link ( @{ $self->internal_links }, @{ $self->external_links } ) {
  13         23  
  13         24  
96 22 50       45 my $href = ref $link eq 'HASH' ? $link->{href} : $link;
97 22 50 33     56 next unless defined $href && length $href;
98 22 100       57 next if $href =~ m{\A(?:javascript|mailto|tel|data):}i;
99 20 100       29 next if $href =~ /\A#/;
100 19 50       63 my $abs = $base ? URI->new_abs( $href, $base )->as_string : $href;
101 19 50       9536 next unless $abs =~ m{\Ahttps?://}i;
102 19 100       52 next if $seen{$abs}++;
103 18         41 push @urls, $abs;
104             }
105 13         53 return \@urls;
106             }
107              
108              
109             sub to_hash {
110 2     2 1 1493 my ( $self ) = @_;
111             return {
112             ok => $self->ok ? \1 : \0,
113             url => $self->url,
114             final_url => $self->final_url,
115             status => $self->status,
116             backend => $self->backend,
117             cost_class => $self->cost_class,
118             title => $self->title,
119             markdown => $self->markdown,
120             signals => $self->signals,
121             why_failed => $self->why_failed,
122             links => $self->links,
123             urls => $self->urls,
124             response_headers => $self->response_headers,
125 0         0 ( defined $self->error ? ( error => "@{[ $self->error ]}" ) : () ),
126 2 50       47 attempts => [ map { $_->to_hash } @{ $self->attempts } ],
  2 50       4  
  2         15  
127             };
128             }
129              
130              
131 1     1 1 47 sub TO_JSON { $_[0]->to_hash }
132              
133             sub attempts_json {
134 1     1 1 32 my ( $self ) = @_;
135 1         18 return $self->_json->encode( [ map { $_->to_hash } @{ $self->attempts } ] );
  2         5  
  1         28  
136             }
137              
138              
139             # Lowercase all header keys for deterministic, case-insensitive matching by callers.
140             sub _lc_headers {
141 29     29   103 my ($h) = @_;
142 29 100       469 return {} unless ref $h eq 'HASH';
143 1         3 return { map { lc($_) => $h->{$_} } keys %$h };
  3         15  
144             }
145              
146             1;
147              
148             __END__