line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#! perl |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
933
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
5
|
1
|
|
|
1
|
|
5
|
use utf8; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
5
|
|
6
|
1
|
|
|
1
|
|
20
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
88
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package Comics::Fetcher::Cascade; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = "1.00"; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 NAME |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Comics::Fetcher::Cascade -- Cascading url grabber |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 SYNOPSIS |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
package Comics::Plugin::Sigmund; |
19
|
|
|
|
|
|
|
use parent qw(Comics::Fetcher::Cascade); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $name = "Sigmund"; |
22
|
|
|
|
|
|
|
our $url = "http://www.sigmund.nl/"; |
23
|
|
|
|
|
|
|
our @patterns = ( [ qr{ ... (?...) ... }, |
24
|
|
|
|
|
|
|
qr{ ... (?...) ... }, |
25
|
|
|
|
|
|
|
... |
26
|
|
|
|
|
|
|
qr{ ... (?...) ... } ], |
27
|
|
|
|
|
|
|
); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# Return the package name. |
30
|
|
|
|
|
|
|
__PACKAGE__; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
The C Fetcher can use one or more patterns to determine the |
35
|
|
|
|
|
|
|
URL of the desired image. If multiple patterns are supplied, each |
36
|
|
|
|
|
|
|
pattern is applied to the fetched page and must define the url for the |
37
|
|
|
|
|
|
|
next page as a named capture. The process is repeated, and the final |
38
|
|
|
|
|
|
|
pattern has to provide the final url and image name. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
The Fetcher requires the common package variables: |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=over 8 |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=item $name |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
The full name of this comic, e.g. "Fokke en Sukke". |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item $url |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
The url of this comic's starting (i.e. home) page. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=back |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Fetcher specific arguments: |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
This Fetcher requires either C<$path> (direct URL fetch), C<$pattern> |
57
|
|
|
|
|
|
|
(single fetch), or C<@patterns> (cascading fetch). |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=over 8 |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=item $path |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
The URL of the desired image. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
If I is not an absolute URL, it will be interpreted relative to |
66
|
|
|
|
|
|
|
the I. |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=item $pattern |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
A pattern to locate the image URL from the starting page. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item @patterns |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
An array with patterns to locate the image URL. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
When a pattern matches, it must define the named capture C, which |
77
|
|
|
|
|
|
|
points to the page to be loaded and used for the next pattern. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=back |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
Any of the patterns may additionally define: |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=over 8 |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=item title |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
The image title. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item alt |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
The alternative text. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=back |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=cut |
96
|
|
|
|
|
|
|
|
97
|
1
|
|
|
1
|
|
6
|
use parent qw(Comics::Fetcher::Base); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
5
|
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub fetch { |
100
|
0
|
|
|
0
|
0
|
|
my ( $self, $reuse ) = @_; |
101
|
0
|
|
|
|
|
|
my $state = $self->{state}; |
102
|
0
|
|
0
|
|
|
|
my $pats = $self->{patterns} || [ $self->{pattern} ]; |
103
|
0
|
|
|
|
|
|
my $name = $self->{name}; |
104
|
0
|
|
|
|
|
|
my $url = $self->{url}; |
105
|
0
|
|
|
|
|
|
my $tag = $self->{tag}; |
106
|
0
|
|
|
|
|
|
delete $state->{fail}; |
107
|
|
|
|
|
|
|
|
108
|
0
|
|
|
|
|
|
my ( $image, $title, $alt ) = @_; |
109
|
|
|
|
|
|
|
|
110
|
0
|
|
|
|
|
|
my $referer = "comics.html"; |
111
|
0
|
0
|
|
|
|
|
if ( $self->{path} ) { |
112
|
0
|
|
|
|
|
|
$url = $self->urlabs( $url, $self->{path} ); |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
else { |
115
|
0
|
|
|
|
|
|
my $pix = 0; |
116
|
0
|
|
|
|
|
|
my $data; |
117
|
0
|
|
|
|
|
|
foreach my $pat ( @$pats ) { |
118
|
0
|
|
|
|
|
|
$pix++; |
119
|
0
|
|
|
|
|
|
$data = ""; |
120
|
0
|
|
|
|
|
|
$state->{trying} = $url; |
121
|
|
|
|
|
|
|
|
122
|
0
|
0
|
|
|
|
|
if ( $reuse ) { |
123
|
0
|
|
|
|
|
|
::debug("Reusing page $pix $url"); |
124
|
0
|
|
|
|
|
|
$data = $self->load_html(".$tag.$pix.html"); |
125
|
0
|
0
|
|
|
|
|
$reuse = 0 unless $data; |
126
|
|
|
|
|
|
|
} |
127
|
0
|
0
|
|
|
|
|
unless ( $data ) { |
128
|
0
|
|
|
|
|
|
::debug("Fetching page $pix $url"); |
129
|
0
|
|
|
|
|
|
$::ua->default_header( Referer => $referer ); |
130
|
0
|
|
|
|
|
|
my $res = $::ua->get($url); |
131
|
0
|
0
|
|
|
|
|
unless ( $res->is_success ) { |
132
|
0
|
0
|
|
|
|
|
$self->{fail} = "Not found", return if $self->{optional}; |
133
|
0
|
|
|
|
|
|
die($res->status_line); |
134
|
|
|
|
|
|
|
} |
135
|
0
|
|
|
|
|
|
$data = $res->content; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
0
|
0
|
|
|
|
|
unless ( $data =~ $pat ) { |
139
|
0
|
0
|
|
|
|
|
$self->{fail} = "No match", return if $self->{optional}; |
140
|
|
|
|
|
|
|
# Save a copy of the failed data. |
141
|
0
|
0
|
0
|
|
|
|
$self->save_html( ".$tag.$pix.html", $data ) |
142
|
|
|
|
|
|
|
if ::debugging() && !$reuse; |
143
|
0
|
|
|
|
|
|
die("FAIL: pattern $pix not found"); |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
1
|
|
|
1
|
|
901
|
$url = $self->urlabs( $url, $+{url} ); |
|
1
|
|
|
|
|
397
|
|
|
1
|
|
|
|
|
425
|
|
|
0
|
|
|
|
|
|
|
147
|
0
|
0
|
|
|
|
|
unless ( $url ) { |
148
|
0
|
|
|
|
|
|
die("FAIL: pattern $pix not found"); |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# Other match data expected: |
152
|
0
|
0
|
|
|
|
|
$title = $+{title} if $+{title}; |
153
|
0
|
0
|
|
|
|
|
$alt = $+{alt} if $+{alt}; |
154
|
|
|
|
|
|
|
|
155
|
0
|
|
|
|
|
|
$referer = $url; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
0
|
0
|
|
|
|
|
unless ( $title ) { |
159
|
0
|
0
|
|
|
|
|
$title = $1 if $data =~ /(.*?)<\/title>/; |
160
|
0
|
|
0
|
|
|
|
$title ||= $name; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
0
|
|
0
|
|
|
|
$alt ||= $tag; |
165
|
0
|
|
0
|
|
|
|
$title ||= $name; |
166
|
|
|
|
|
|
|
|
167
|
0
|
|
0
|
|
|
|
my $etag = $state->{etag} || "None"; |
168
|
0
|
|
|
|
|
|
$state->{trying} = $url; |
169
|
0
|
|
|
|
|
|
::debug("Fetching image $url (ETag: $etag)"); |
170
|
0
|
|
|
|
|
|
$::ua->default_header( Referer => $referer ); |
171
|
0
|
|
|
|
|
|
$::ua->default_header( "If-None-Match" => $etag ); |
172
|
0
|
|
|
|
|
|
my $res = $::ua->get($url); |
173
|
0
|
0
|
|
|
|
|
unless ( $res->is_success ) { |
174
|
0
|
|
|
|
|
|
$state->{fail} = $res->status_line; |
175
|
0
|
0
|
|
|
|
|
if ( $state->{fail} =~ /304 Not Modified/ ) { |
176
|
0
|
|
|
|
|
|
::debug("Not fetching: Up to date $url"); |
177
|
0
|
|
|
|
|
|
$::stats->{uptodate}++; |
178
|
0
|
|
|
|
|
|
delete( $state->{trying} ); |
179
|
0
|
|
|
|
|
|
delete( $state->{fail} ); |
180
|
0
|
|
|
|
|
|
return $state; |
181
|
|
|
|
|
|
|
} |
182
|
0
|
|
|
|
|
|
die("FAIL (image): ", $state->{fail}); |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
0
|
|
|
|
|
|
my $data = $res->content; |
186
|
0
|
|
|
|
|
|
my $info; |
187
|
0
|
0
|
0
|
|
|
|
if ( !$data or !($info = Image::Info::image_info(\$data)) ) { |
188
|
0
|
|
|
|
|
|
die("FAIL: image no data"); |
189
|
|
|
|
|
|
|
} |
190
|
0
|
|
|
|
|
|
$state->{etag} = $res->header('etag'); |
191
|
0
|
|
|
|
|
|
my $md5 = Digest::MD5::md5_base64($data); |
192
|
0
|
0
|
0
|
|
|
|
if ( $state->{md5} and $state->{md5} eq $md5 ) { |
193
|
0
|
|
|
|
|
|
::debug("Fetching: Up to date $url"); |
194
|
0
|
|
|
|
|
|
$::stats->{uptodate}++; |
195
|
0
|
|
|
|
|
|
delete( $state->{trying} ); |
196
|
0
|
|
|
|
|
|
return $state; |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
0
|
0
|
0
|
|
|
|
if ( $state->{c_img} |
200
|
|
|
|
|
|
|
and my $oldimg = $self->spoolfile( $state->{c_img} ) ) { |
201
|
0
|
0
|
|
|
|
|
unlink($oldimg) |
202
|
|
|
|
|
|
|
&& ::debug("Removed: $oldimg"); |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
0
|
0
|
0
|
|
|
|
unless ( $tag && $info->{file_ext} ) { |
206
|
1
|
|
|
1
|
|
8
|
use Data::Dumper; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
350
|
|
207
|
0
|
|
|
|
|
|
warn($tag, ": ", Dumper($info)); |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
my $img = sprintf( "%s-%06x.%s", $tag, |
210
|
0
|
|
|
|
|
|
int(rand(0x1000000)), $info->{file_ext} ); |
211
|
0
|
|
|
|
|
|
$state->{c_width} = $info->{width}; |
212
|
0
|
|
|
|
|
|
$state->{c_height} = $info->{height}; |
213
|
|
|
|
|
|
|
|
214
|
0
|
|
|
|
|
|
$self->save_image( $img, \$data ); |
215
|
|
|
|
|
|
|
|
216
|
0
|
|
|
|
|
|
$state->{update} = time; |
217
|
0
|
|
|
|
|
|
$state->{md5} = $md5; |
218
|
0
|
|
|
|
|
|
delete( $state->{trying} ); |
219
|
|
|
|
|
|
|
|
220
|
0
|
|
|
|
|
|
$state->{c_alt} = $alt; |
221
|
0
|
|
|
|
|
|
$state->{c_title} = $title; |
222
|
0
|
|
|
|
|
|
$state->{c_img} = $img; |
223
|
|
|
|
|
|
|
|
224
|
0
|
|
|
|
|
|
my $html = "$tag.html"; |
225
|
0
|
|
|
|
|
|
$self->save_html($html); |
226
|
|
|
|
|
|
|
|
227
|
0
|
|
|
|
|
|
$state->{url} = $url; |
228
|
|
|
|
|
|
|
|
229
|
0
|
|
|
|
|
|
return 1; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
1; |