line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package RDF::Scutter; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
55754
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
49
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
5
|
1
|
|
|
1
|
|
7
|
use Carp; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
133
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.1'; |
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
6
|
use base ('LWP::RobotUA'); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1213
|
|
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
135840
|
use RDF::Redland; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new { |
14
|
|
|
|
|
|
|
my ($that, %params) = @_; |
15
|
|
|
|
|
|
|
my $class = ref($that) || $that; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $scutterplan = $params{scutterplan}; |
18
|
|
|
|
|
|
|
croak("No place to start, please give an arrayref with URLs as a 'scutterplan' parameter") unless (ref($scutterplan) eq 'ARRAY'); |
19
|
|
|
|
|
|
|
delete $params{scutterplan}; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Some parameters, that should be deleted before passing them to SUPER |
22
|
|
|
|
|
|
|
my $skip = $params{skipregexp}; |
23
|
|
|
|
|
|
|
delete $params{skipregexp}; |
24
|
|
|
|
|
|
|
my $okwait = $params{okwait} || 1; |
25
|
|
|
|
|
|
|
delete $params{okwait}; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
unless ($params{agent}) { # agent is required by SUPER, set it to who I am |
28
|
|
|
|
|
|
|
$params{agent} = $class . '/' . $VERSION; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
croak "Setting an e-mail address using the 'from' parameter is required" unless ($params{from}); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $self = $class->SUPER::new(%params); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
foreach my $url (@{$scutterplan}) { |
36
|
|
|
|
|
|
|
$self->{QUEUE}->{$url} = ''; # Internally, QUEUE holds a hash where the keys are URLs to be visited and values are the URL they were referenced from. |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$self->{VISITED} = {}; |
40
|
|
|
|
|
|
|
$self->{SKIP} = $skip; |
41
|
|
|
|
|
|
|
$self->{OKWAIT} = $okwait; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
bless($self, $class); |
44
|
|
|
|
|
|
|
return $self; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub scutter { |
48
|
|
|
|
|
|
|
my ($self, $storage, $maxcount) = @_; |
49
|
|
|
|
|
|
|
LWP::Debug::trace('scutter'); |
50
|
|
|
|
|
|
|
my $model = new RDF::Redland::Model($storage, ""); # $model will contain all we find. |
51
|
|
|
|
|
|
|
croak "Failed to create RDF::Redland::Model for storage\n" unless $model; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
my $count = 0; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# ----------------------------------------------------------------- |
56
|
|
|
|
|
|
|
# Main loop starts here. |
57
|
|
|
|
|
|
|
# Iterate over the QUEUE (which is changing as we go) |
58
|
|
|
|
|
|
|
while (my ($url, $referer) = each(%{$self->{QUEUE}})) { |
59
|
|
|
|
|
|
|
local $SIG{TERM} = sub { $model->sync; }; |
60
|
|
|
|
|
|
|
next if ($self->{VISITED}->{$url}); # Then, we've been there in this run |
61
|
|
|
|
|
|
|
# LWP::Debug::debug('Retrieving ' . $url); |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
$count++; |
64
|
|
|
|
|
|
|
my $uri = new RDF::Redland::URI($url); # Set up some basic nodes. |
65
|
|
|
|
|
|
|
my $context=new RDF::Redland::BlankNode('context'.$count); |
66
|
|
|
|
|
|
|
my $fetch=new RDF::Redland::BlankNode('fetch'.$count); # It is actually unique to this run, but will have to change later |
67
|
|
|
|
|
|
|
my $rdftype = new RDF::Redland::URI('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Now, statements about the contexts |
71
|
|
|
|
|
|
|
$model->add_statement($context, |
72
|
|
|
|
|
|
|
$rdftype, |
73
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#Context'), $context); |
74
|
|
|
|
|
|
|
$model->add_statement($context, |
75
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#source'), |
76
|
|
|
|
|
|
|
$uri, $context); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
if ($referer) { |
79
|
|
|
|
|
|
|
$model->add_statement($context, |
80
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#origin'), |
81
|
|
|
|
|
|
|
new RDF::Redland::URINode($referer), $context); |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
if ($self->{SKIP} and ($url =~ m/$self->{SKIP}/)) { # Support skipping per a regexp |
85
|
|
|
|
|
|
|
LWP::Debug::debug('Skipping ' . $url); |
86
|
|
|
|
|
|
|
LWP::Debug::debug('Disallowed as per regular expression: ' . $self->{SKIP}); |
87
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
88
|
|
|
|
|
|
|
fetch => $fetch, |
89
|
|
|
|
|
|
|
count => $count, |
90
|
|
|
|
|
|
|
context => $context, |
91
|
|
|
|
|
|
|
rel => 'skip', |
92
|
|
|
|
|
|
|
message => 'Disallowed as per regular expression: ' . $self->{SKIP}); |
93
|
|
|
|
|
|
|
delete $self->{QUEUE}->{$url}; |
94
|
|
|
|
|
|
|
next; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
unless ($self->rules->allowed($url)) { |
98
|
|
|
|
|
|
|
# This is not actually likely to run, it seems, as LWP::RobotUA |
99
|
|
|
|
|
|
|
# may not have decided yet at this point, and will throw a 403 |
100
|
|
|
|
|
|
|
# Forbidden instead. |
101
|
|
|
|
|
|
|
LWP::Debug::debug('Skipping ' . $url); |
102
|
|
|
|
|
|
|
LWP::Debug::debug('Disallowed as per robots.txt'); |
103
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
104
|
|
|
|
|
|
|
fetch => $fetch, |
105
|
|
|
|
|
|
|
count => $count, |
106
|
|
|
|
|
|
|
context => $context, |
107
|
|
|
|
|
|
|
rel => 'skip', |
108
|
|
|
|
|
|
|
message => 'Disallowed as per robots.txt'); |
109
|
|
|
|
|
|
|
delete $self->{QUEUE}->{$url}; |
110
|
|
|
|
|
|
|
next; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# TODO: Doesn't seem to work |
114
|
|
|
|
|
|
|
if ($self->host_wait($url) > $self->{OKWAIT}) { # We can't request, and won't bother to wait. |
115
|
|
|
|
|
|
|
LWP::Debug::debug("Do $url later."); |
116
|
|
|
|
|
|
|
delete $self->{QUEUE}->{$url}; # Delete where we are |
117
|
|
|
|
|
|
|
$self->{QUEUE}->{$url} = $referer; # And reinsert |
118
|
|
|
|
|
|
|
next; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
print STDERR "No: $count, Retrieving $url\n"; |
122
|
|
|
|
|
|
|
my $response = $self->get($url, 'Referer' => $referer); |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
my $fetchtime = $response->header('Date'); # Get a time somehow. |
126
|
|
|
|
|
|
|
unless ($fetchtime) { |
127
|
|
|
|
|
|
|
$fetchtime = localtime; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# More statements about the fetch we just did. |
131
|
|
|
|
|
|
|
$model->add_statement($context, |
132
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#fetch'), |
133
|
|
|
|
|
|
|
$fetch, $context); |
134
|
|
|
|
|
|
|
$model->add_statement($fetch, |
135
|
|
|
|
|
|
|
$rdftype, |
136
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#Fetch'), $context); |
137
|
|
|
|
|
|
|
$model->add_statement($fetch, |
138
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/dc/elements/1.1/date'), |
139
|
|
|
|
|
|
|
new RDF::Redland::LiteralNode($fetchtime), $context); |
140
|
|
|
|
|
|
|
$model->add_statement($fetch, |
141
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#status'), |
142
|
|
|
|
|
|
|
new RDF::Redland::LiteralNode($response->code), $context); |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
$self->{VISITED}->{$url} = 1; # Been there, done that, |
145
|
|
|
|
|
|
|
delete $self->{QUEUE}->{$url}; # one teeshirt is sufficient |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
if ($response->is_success) { |
148
|
|
|
|
|
|
|
# W00T, we really got the document! |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
my $parser=new RDF::Redland::Parser; |
151
|
|
|
|
|
|
|
unless ($parser) { |
152
|
|
|
|
|
|
|
LWP::Debug::debug('Skipping ' . $url); |
153
|
|
|
|
|
|
|
LWP::Debug::debug('Could not create parser for MIME type '.$response->header('Content-Type')); |
154
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
155
|
|
|
|
|
|
|
fetch => $fetch, |
156
|
|
|
|
|
|
|
count => $count, |
157
|
|
|
|
|
|
|
context => $context, |
158
|
|
|
|
|
|
|
message => 'Could not create Redland parser for MIME type '.$response->header('Content-Type')); |
159
|
|
|
|
|
|
|
next; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
my $thisdoc; |
163
|
|
|
|
|
|
|
eval { # We try to parse it |
164
|
|
|
|
|
|
|
$thisdoc = $parser->parse_string_as_stream($response->decoded_content, $uri); |
165
|
|
|
|
|
|
|
}; |
166
|
|
|
|
|
|
|
if ($@){ |
167
|
|
|
|
|
|
|
LWP::Debug::debug('Skipping ' . $url); |
168
|
|
|
|
|
|
|
LWP::Debug::debug('Parser error: ' . $@); |
169
|
|
|
|
|
|
|
LWP::Debug::conns($response->decoded_content); |
170
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
171
|
|
|
|
|
|
|
fetch => $fetch, |
172
|
|
|
|
|
|
|
count => $count, |
173
|
|
|
|
|
|
|
context => $context, |
174
|
|
|
|
|
|
|
message => 'Redland parser reported ' . $@); |
175
|
|
|
|
|
|
|
next; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
unless ($thisdoc) { |
179
|
|
|
|
|
|
|
LWP::Debug::debug('Skipping ' . $url); |
180
|
|
|
|
|
|
|
LWP::Debug::debug('Parser returned no content.'); |
181
|
|
|
|
|
|
|
LWP::Debug::conns($response->decoded_content); |
182
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
183
|
|
|
|
|
|
|
fetch => $fetch, |
184
|
|
|
|
|
|
|
count => $count, |
185
|
|
|
|
|
|
|
context => $context, |
186
|
|
|
|
|
|
|
message => 'Redland parser returned no content.'); |
187
|
|
|
|
|
|
|
next; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Now build a temporary model for this resource |
191
|
|
|
|
|
|
|
my $tmpstorage=new RDF::Redland::Storage("memory", "tmpstore", "new='yes',contexts='yes'"); |
192
|
|
|
|
|
|
|
my $thismodel = new RDF::Redland::Model($tmpstorage, ""); |
193
|
|
|
|
|
|
|
while($thisdoc && !$thisdoc->end) { # Add the statements to both models |
194
|
|
|
|
|
|
|
my $statement=$thisdoc->current; |
195
|
|
|
|
|
|
|
$model->add_statement($statement,$context); |
196
|
|
|
|
|
|
|
$thismodel->add_statement($statement,$context); |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
$thisdoc->next; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# More about the fetch |
202
|
|
|
|
|
|
|
$model->add_statement($fetch, |
203
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#raw_triple_count'), |
204
|
|
|
|
|
|
|
new RDF::Redland::LiteralNode($thismodel->size), $context); |
205
|
|
|
|
|
|
|
if ($response->header('ETag')) { |
206
|
|
|
|
|
|
|
$model->add_statement($fetch, |
207
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#etag'), |
208
|
|
|
|
|
|
|
new RDF::Redland::LiteralNode($response->header('ETag')), $context); |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
if ($response->header('Last-Modified')) { |
211
|
|
|
|
|
|
|
$model->add_statement($fetch, |
212
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#last_modified'), |
213
|
|
|
|
|
|
|
new RDF::Redland::LiteralNode($response->header('Last-Modified')), $context); |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# The query will get out the seeAlso links from the resource, |
217
|
|
|
|
|
|
|
# which is what we'll follow |
218
|
|
|
|
|
|
|
my $query=new RDF::Redland::Query('SELECT DISTINCT ?doc WHERE { [ ?doc ] }', undef, undef, "sparql"); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
my $results; |
221
|
|
|
|
|
|
|
eval { |
222
|
|
|
|
|
|
|
$results = $query->execute($thismodel); |
223
|
|
|
|
|
|
|
}; |
224
|
|
|
|
|
|
|
if ($@){ |
225
|
|
|
|
|
|
|
LWP::Debug::debug('Failed to query links, Redland reported: ' . $@); |
226
|
|
|
|
|
|
|
LWP::Debug::conns($response->decoded_content); |
227
|
|
|
|
|
|
|
next; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# OK, here we go through all the results and get the URLs we want. |
231
|
|
|
|
|
|
|
while(!$results->finished) { |
232
|
|
|
|
|
|
|
for (my $i=0; $i < $results->bindings_count(); $i++) { |
233
|
|
|
|
|
|
|
my $value=$results->binding_value($i); |
234
|
|
|
|
|
|
|
$self->_check_and_add($url, $value->uri->as_string); |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
$results->next_result; |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
# $model->sync; # Finally, make sure this is saved to the storage. Needed? |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# If we have a maxcount, then check if we should jump out of the |
242
|
|
|
|
|
|
|
# loop |
243
|
|
|
|
|
|
|
last if (defined($maxcount) and ($count >= $maxcount)); |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
} elsif (($response->is_redirect) && ($response->header('Location'))) { |
246
|
|
|
|
|
|
|
# Hmm, dull, just a redirect, lets add it to the queue if we |
247
|
|
|
|
|
|
|
# haven't been there |
248
|
|
|
|
|
|
|
$self->_check_and_add($url, $response->header('Location')); |
249
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
250
|
|
|
|
|
|
|
fetch => $fetch, |
251
|
|
|
|
|
|
|
count => $count, |
252
|
|
|
|
|
|
|
context => $context, |
253
|
|
|
|
|
|
|
rel => 'skip', |
254
|
|
|
|
|
|
|
message => 'HTTP Redirect'); |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
} else { # Error situation, retrieval not OK |
258
|
|
|
|
|
|
|
$model = $self->_error_statements(model => $model, |
259
|
|
|
|
|
|
|
fetch => $fetch, |
260
|
|
|
|
|
|
|
count => $count, |
261
|
|
|
|
|
|
|
context => $context, |
262
|
|
|
|
|
|
|
message => 'HTTP Error. Message: '.$response->message); |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
return $model; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# This is a sub just for internal use, and it creates a few statements |
271
|
|
|
|
|
|
|
# in case of an error. It is just a shorthand really. |
272
|
|
|
|
|
|
|
# There are lots of usage examples in the code... :-) |
273
|
|
|
|
|
|
|
sub _error_statements { |
274
|
|
|
|
|
|
|
my ($self, %msg) = @_; |
275
|
|
|
|
|
|
|
my $reason=new RDF::Redland::BlankNode('reason'.$msg{count}); |
276
|
|
|
|
|
|
|
my $rel = $msg{rel} || 'error'; # Error relationship if nothing else is given. |
277
|
|
|
|
|
|
|
my $model = $msg{model}; |
278
|
|
|
|
|
|
|
$model->add_statement($msg{fetch}, |
279
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#'.$rel), |
280
|
|
|
|
|
|
|
$reason, $msg{context}); |
281
|
|
|
|
|
|
|
$model->add_statement($reason, |
282
|
|
|
|
|
|
|
new RDF::Redland::URI('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), |
283
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/net/scutter#Reason'), $msg{context}); |
284
|
|
|
|
|
|
|
$model->add_statement($reason, |
285
|
|
|
|
|
|
|
new RDF::Redland::URINode('http://purl.org/dc/elements/1.1/description'), |
286
|
|
|
|
|
|
|
new RDF::Redland::LiteralNode($msg{message}), $msg{context}); |
287
|
|
|
|
|
|
|
return $model; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# Internal sub, to check if we have been on an URL before, and if not, |
291
|
|
|
|
|
|
|
# add it to the QUEUE. First argument is where we are now, second is |
292
|
|
|
|
|
|
|
# the URL we're checking. |
293
|
|
|
|
|
|
|
sub _check_and_add { |
294
|
|
|
|
|
|
|
my ($self, $thisurl, $foundurl) = @_; |
295
|
|
|
|
|
|
|
unless ($self->{VISITED}->{$foundurl}) { |
296
|
|
|
|
|
|
|
$self->{QUEUE}->{$foundurl} = $thisurl; |
297
|
|
|
|
|
|
|
print STDERR "Adding URL: " . $foundurl ."\n"; |
298
|
|
|
|
|
|
|
return 1; |
299
|
|
|
|
|
|
|
} else { |
300
|
|
|
|
|
|
|
delete $self->{QUEUE}->{$foundurl}; |
301
|
|
|
|
|
|
|
LWP::Debug::debug('Has been visited, so skipping ' . $foundurl); |
302
|
|
|
|
|
|
|
return 0; |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
1; |
307
|
|
|
|
|
|
|
__END__ |