File Coverage

blib/lib/RDF/Scutter.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package RDF::Scutter;
2              
3 1     1   55754 use strict;
  1         3  
  1         49  
4 1     1   6 use warnings;
  1         2  
  1         36  
5 1     1   7 use Carp;
  1         6  
  1         133  
6              
7             our $VERSION = '0.1';
8              
9 1     1   6 use base ('LWP::RobotUA');
  1         2  
  1         1213  
10              
11 1     1   135840 use RDF::Redland;
  0            
  0            
12              
13             sub new {
14             my ($that, %params) = @_;
15             my $class = ref($that) || $that;
16              
17             my $scutterplan = $params{scutterplan};
18             croak("No place to start, please give an arrayref with URLs as a 'scutterplan' parameter") unless (ref($scutterplan) eq 'ARRAY');
19             delete $params{scutterplan};
20              
21             # Some parameters, that should be deleted before passing them to SUPER
22             my $skip = $params{skipregexp};
23             delete $params{skipregexp};
24             my $okwait = $params{okwait} || 1;
25             delete $params{okwait};
26              
27             unless ($params{agent}) { # agent is required by SUPER, set it to who I am
28             $params{agent} = $class . '/' . $VERSION;
29             }
30              
31             croak "Setting an e-mail address using the 'from' parameter is required" unless ($params{from});
32              
33             my $self = $class->SUPER::new(%params);
34              
35             foreach my $url (@{$scutterplan}) {
36             $self->{QUEUE}->{$url} = ''; # Internally, QUEUE holds a hash where the keys are URLs to be visited and values are the URL they were referenced from.
37             }
38              
39             $self->{VISITED} = {};
40             $self->{SKIP} = $skip;
41             $self->{OKWAIT} = $okwait;
42              
43             bless($self, $class);
44             return $self;
45             }
46              
47             sub scutter {
48             my ($self, $storage, $maxcount) = @_;
49             LWP::Debug::trace('scutter');
50             my $model = new RDF::Redland::Model($storage, ""); # $model will contain all we find.
51             croak "Failed to create RDF::Redland::Model for storage\n" unless $model;
52              
53             my $count = 0;
54              
55             # -----------------------------------------------------------------
56             # Main loop starts here.
57             # Iterate over the QUEUE (which is changing as we go)
58             while (my ($url, $referer) = each(%{$self->{QUEUE}})) {
59             local $SIG{TERM} = sub { $model->sync; };
60             next if ($self->{VISITED}->{$url}); # Then, we've been there in this run
61             # LWP::Debug::debug('Retrieving ' . $url);
62              
63             $count++;
64             my $uri = new RDF::Redland::URI($url); # Set up some basic nodes.
65             my $context=new RDF::Redland::BlankNode('context'.$count);
66             my $fetch=new RDF::Redland::BlankNode('fetch'.$count); # It is actually unique to this run, but will have to change later
67             my $rdftype = new RDF::Redland::URI('http://www.w3.org/1999/02/22-rdf-syntax-ns#type');
68              
69            
70             # Now, statements about the contexts
71             $model->add_statement($context,
72             $rdftype,
73             new RDF::Redland::URINode('http://purl.org/net/scutter#Context'), $context);
74             $model->add_statement($context,
75             new RDF::Redland::URINode('http://purl.org/net/scutter#source'),
76             $uri, $context);
77              
78             if ($referer) {
79             $model->add_statement($context,
80             new RDF::Redland::URINode('http://purl.org/net/scutter#origin'),
81             new RDF::Redland::URINode($referer), $context);
82             }
83              
84             if ($self->{SKIP} and ($url =~ m/$self->{SKIP}/)) { # Support skipping per a regexp
85             LWP::Debug::debug('Skipping ' . $url);
86             LWP::Debug::debug('Disallowed as per regular expression: ' . $self->{SKIP});
87             $model = $self->_error_statements(model => $model,
88             fetch => $fetch,
89             count => $count,
90             context => $context,
91             rel => 'skip',
92             message => 'Disallowed as per regular expression: ' . $self->{SKIP});
93             delete $self->{QUEUE}->{$url};
94             next;
95             }
96              
97             unless ($self->rules->allowed($url)) {
98             # This is not actually likely to run, it seems, as LWP::RobotUA
99             # may not have decided yet at this point, and will throw a 403
100             # Forbidden instead.
101             LWP::Debug::debug('Skipping ' . $url);
102             LWP::Debug::debug('Disallowed as per robots.txt');
103             $model = $self->_error_statements(model => $model,
104             fetch => $fetch,
105             count => $count,
106             context => $context,
107             rel => 'skip',
108             message => 'Disallowed as per robots.txt');
109             delete $self->{QUEUE}->{$url};
110             next;
111             }
112              
113             # TODO: Doesn't seem to work
114             if ($self->host_wait($url) > $self->{OKWAIT}) { # We can't request, and won't bother to wait.
115             LWP::Debug::debug("Do $url later.");
116             delete $self->{QUEUE}->{$url}; # Delete where we are
117             $self->{QUEUE}->{$url} = $referer; # And reinsert
118             next;
119             }
120              
121             print STDERR "No: $count, Retrieving $url\n";
122             my $response = $self->get($url, 'Referer' => $referer);
123              
124              
125             my $fetchtime = $response->header('Date'); # Get a time somehow.
126             unless ($fetchtime) {
127             $fetchtime = localtime;
128             }
129              
130             # More statements about the fetch we just did.
131             $model->add_statement($context,
132             new RDF::Redland::URINode('http://purl.org/net/scutter#fetch'),
133             $fetch, $context);
134             $model->add_statement($fetch,
135             $rdftype,
136             new RDF::Redland::URINode('http://purl.org/net/scutter#Fetch'), $context);
137             $model->add_statement($fetch,
138             new RDF::Redland::URINode('http://purl.org/dc/elements/1.1/date'),
139             new RDF::Redland::LiteralNode($fetchtime), $context);
140             $model->add_statement($fetch,
141             new RDF::Redland::URINode('http://purl.org/net/scutter#status'),
142             new RDF::Redland::LiteralNode($response->code), $context);
143              
144             $self->{VISITED}->{$url} = 1; # Been there, done that,
145             delete $self->{QUEUE}->{$url}; # one teeshirt is sufficient
146              
147             if ($response->is_success) {
148             # W00T, we really got the document!
149              
150             my $parser=new RDF::Redland::Parser;
151             unless ($parser) {
152             LWP::Debug::debug('Skipping ' . $url);
153             LWP::Debug::debug('Could not create parser for MIME type '.$response->header('Content-Type'));
154             $model = $self->_error_statements(model => $model,
155             fetch => $fetch,
156             count => $count,
157             context => $context,
158             message => 'Could not create Redland parser for MIME type '.$response->header('Content-Type'));
159             next;
160             }
161              
162             my $thisdoc;
163             eval { # We try to parse it
164             $thisdoc = $parser->parse_string_as_stream($response->decoded_content, $uri);
165             };
166             if ($@){
167             LWP::Debug::debug('Skipping ' . $url);
168             LWP::Debug::debug('Parser error: ' . $@);
169             LWP::Debug::conns($response->decoded_content);
170             $model = $self->_error_statements(model => $model,
171             fetch => $fetch,
172             count => $count,
173             context => $context,
174             message => 'Redland parser reported ' . $@);
175             next;
176             }
177              
178             unless ($thisdoc) {
179             LWP::Debug::debug('Skipping ' . $url);
180             LWP::Debug::debug('Parser returned no content.');
181             LWP::Debug::conns($response->decoded_content);
182             $model = $self->_error_statements(model => $model,
183             fetch => $fetch,
184             count => $count,
185             context => $context,
186             message => 'Redland parser returned no content.');
187             next;
188             }
189              
190             # Now build a temporary model for this resource
191             my $tmpstorage=new RDF::Redland::Storage("memory", "tmpstore", "new='yes',contexts='yes'");
192             my $thismodel = new RDF::Redland::Model($tmpstorage, "");
193             while($thisdoc && !$thisdoc->end) { # Add the statements to both models
194             my $statement=$thisdoc->current;
195             $model->add_statement($statement,$context);
196             $thismodel->add_statement($statement,$context);
197              
198             $thisdoc->next;
199             }
200              
201             # More about the fetch
202             $model->add_statement($fetch,
203             new RDF::Redland::URINode('http://purl.org/net/scutter#raw_triple_count'),
204             new RDF::Redland::LiteralNode($thismodel->size), $context);
205             if ($response->header('ETag')) {
206             $model->add_statement($fetch,
207             new RDF::Redland::URINode('http://purl.org/net/scutter#etag'),
208             new RDF::Redland::LiteralNode($response->header('ETag')), $context);
209             }
210             if ($response->header('Last-Modified')) {
211             $model->add_statement($fetch,
212             new RDF::Redland::URINode('http://purl.org/net/scutter#last_modified'),
213             new RDF::Redland::LiteralNode($response->header('Last-Modified')), $context);
214             }
215              
216             # The query will get out the seeAlso links from the resource,
217             # which is what we'll follow
218             my $query=new RDF::Redland::Query('SELECT DISTINCT ?doc WHERE { [ ?doc ] }', undef, undef, "sparql");
219              
220             my $results;
221             eval {
222             $results = $query->execute($thismodel);
223             };
224             if ($@){
225             LWP::Debug::debug('Failed to query links, Redland reported: ' . $@);
226             LWP::Debug::conns($response->decoded_content);
227             next;
228             }
229              
230             # OK, here we go through all the results and get the URLs we want.
231             while(!$results->finished) {
232             for (my $i=0; $i < $results->bindings_count(); $i++) {
233             my $value=$results->binding_value($i);
234             $self->_check_and_add($url, $value->uri->as_string);
235             }
236             $results->next_result;
237             }
238             # $model->sync; # Finally, make sure this is saved to the storage. Needed?
239              
240              
241             # If we have a maxcount, then check if we should jump out of the
242             # loop
243             last if (defined($maxcount) and ($count >= $maxcount));
244              
245             } elsif (($response->is_redirect) && ($response->header('Location'))) {
246             # Hmm, dull, just a redirect, lets add it to the queue if we
247             # haven't been there
248             $self->_check_and_add($url, $response->header('Location'));
249             $model = $self->_error_statements(model => $model,
250             fetch => $fetch,
251             count => $count,
252             context => $context,
253             rel => 'skip',
254             message => 'HTTP Redirect');
255              
256              
257             } else { # Error situation, retrieval not OK
258             $model = $self->_error_statements(model => $model,
259             fetch => $fetch,
260             count => $count,
261             context => $context,
262             message => 'HTTP Error. Message: '.$response->message);
263             }
264              
265             }
266             return $model;
267             }
268              
269              
270             # This is a sub just for internal use, and it creates a few statements
271             # in case of an error. It is just a shorthand really.
272             # There are lots of usage examples in the code... :-)
273             sub _error_statements {
274             my ($self, %msg) = @_;
275             my $reason=new RDF::Redland::BlankNode('reason'.$msg{count});
276             my $rel = $msg{rel} || 'error'; # Error relationship if nothing else is given.
277             my $model = $msg{model};
278             $model->add_statement($msg{fetch},
279             new RDF::Redland::URINode('http://purl.org/net/scutter#'.$rel),
280             $reason, $msg{context});
281             $model->add_statement($reason,
282             new RDF::Redland::URI('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
283             new RDF::Redland::URINode('http://purl.org/net/scutter#Reason'), $msg{context});
284             $model->add_statement($reason,
285             new RDF::Redland::URINode('http://purl.org/dc/elements/1.1/description'),
286             new RDF::Redland::LiteralNode($msg{message}), $msg{context});
287             return $model;
288             }
289              
290             # Internal sub, to check if we have been on an URL before, and if not,
291             # add it to the QUEUE. First argument is where we are now, second is
292             # the URL we're checking.
293             sub _check_and_add {
294             my ($self, $thisurl, $foundurl) = @_;
295             unless ($self->{VISITED}->{$foundurl}) {
296             $self->{QUEUE}->{$foundurl} = $thisurl;
297             print STDERR "Adding URL: " . $foundurl ."\n";
298             return 1;
299             } else {
300             delete $self->{QUEUE}->{$foundurl};
301             LWP::Debug::debug('Has been visited, so skipping ' . $foundurl);
302             return 0;
303             }
304             }
305              
306             1;
307             __END__