File Coverage

blib/lib/HTTP/LRDD.pm
Criterion Covered Total %
statement 7 9 77.7
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 10 12 83.3


line stmt bran cond sub pod time code
1             package HTTP::LRDD;
2              
3 1     1   19002 use strict;
  1         1  
  1         27  
4 1     1   16 use 5.010;
  1         3  
  1         29  
5              
6 1     1   366 use HTML::HTML5::Parser 0.107;
  0            
  0            
7             use HTML::HTML5::Sanity 0.102;
8             use HTTP::Link::Parser 0.102 qw(:all);
9             use HTTP::Status 0 qw(:constants);
10             use Object::AUTHORITY 0;
11             use RDF::TrineX::Functions 0 -shortcuts;
12             use RDF::RDFa::Parser 1.096;
13             use RDF::Query 2.900;
14             use Scalar::Util 0 qw(blessed);
15             use URI 0;
16             use URI::Escape 0;
17             use XML::Atom::OWL 0.100;
18             use XRD::Parser 0.101;
19              
20             my (@Predicates, @_Predicates, @MediaTypes);
21              
22             BEGIN
23             {
24             $HTTP::LRDD::AUTHORITY = 'cpan:TOBYINK';
25             $HTTP::LRDD::VERSION = '0.106';
26            
27             @Predicates = (
28             'describedby',
29             'lrdd',
30             'http://www.w3.org/2007/05/powder-s#describedby',
31             'http://www.w3.org/1999/xhtml/vocab#meta',
32             'http://www.w3.org/2000/01/rdf-schema#seeAlso',
33             );
34             @_Predicates = @Predicates;
35             @MediaTypes = (
36             'application/xrd+xml',
37             'application/rdf+xml',
38             'text/turtle',
39             'application/atom+xml;q=0.9',
40             'application/xhtml+xml;q=0.9',
41             'text/html;q=0.9',
42             '*/*;q=0.1',
43             );
44             }
45              
46             sub __rdf_query
47             {
48             my ($sparql, $model) = @_;
49             my $result = RDF::Query->new($sparql)->execute($model);
50            
51             if ($result->is_boolean)
52             { return $result->get_boolean }
53             elsif ($result->is_bindings)
54             { return $result }
55            
56             $result->is_graph or die;
57            
58             my $return = RDF::Trine::Model->new;
59             $return->add_hashref( $result->as_hashref );
60             return $return;
61             }
62              
63             sub import
64             {
65             my $class = shift;
66             @Predicates = @_ if @_;
67             }
68              
69             sub new
70             {
71             my $class = shift;
72             my $self = bless { }, $class;
73            
74             $self->{predicates} = @_ ? \@_ : \@Predicates;
75            
76             return $self;
77             }
78              
79             sub new_strict
80             {
81             my $class = shift;
82             return $class->new(qw(describedby lrdd));
83             }
84              
85             sub new_default
86             {
87             my $class = shift;
88             return $class->new(@_Predicates);
89             }
90              
91             sub discover
92             {
93             my $self = shift;
94             my $uri = shift;
95             my $list = wantarray;
96            
97             $self = $self->new
98             unless blessed($self) && $self->isa(__PACKAGE__);
99            
100             my (@results, $rdfa, $rdfx, $response);
101            
102             # STEP 1: check the HTTP headers for a descriptor link
103             if ($uri =~ /^https?:/i)
104             {
105             $response = $self->_ua->head($uri);
106             my $model = rdf_parse();
107            
108             # Parse HTTP 'Link' headers.
109             parse_links_into_model($response => $model);
110            
111             if ($response->code eq HTTP_SEE_OTHER) # 303 Redirect
112             {
113             my $seeother = URI->new_abs(
114             $response->header('Location'),
115             URI->new($uri));
116            
117             $model->add_hashref({
118             $uri => {
119             'http://www.w3.org/2000/01/rdf-schema#seeAlso' => [
120             { 'value' => "$seeother" , 'type' => 'uri' },
121             ],
122             },
123             });
124             }
125            
126             my $iterator = __rdf_query($self->_make_sparql($uri, $list), $model);
127             while (my $row = $iterator->next)
128             {
129             push @results, $row->{'descriptor'}->uri
130             if defined $row->{'descriptor'}
131             && $row->{'descriptor'}->is_resource;
132             }
133            
134             # Bypass further processing if we've got a result and we only wanted one!
135             return $results[0] if @results && !$list;
136             }
137            
138             # STEP 2: check the HTTP body (RDF) for a descriptor link
139             if ($uri =~ /^https?:/i)
140             {
141             my $model = rdf_parse();
142            
143             # Parse as RDFa, if the response is RDFa.
144             ($response, $rdfa) = $self->_cond_parse_rdfa($response, $model, $uri);
145            
146             # If the response was not RDFa, try parsing as RDF.
147             ($response, $rdfx) = $self->_cond_parse_rdf($response, $model, $uri)
148             unless defined $rdfa;
149            
150             my $iterator = __rdf_query($self->_make_sparql($uri, $list), $model);
151             while (my $row = $iterator->next)
152             {
153             push @results, $row->{'descriptor'}->uri
154             if defined $row->{'descriptor'}
155             && $row->{'descriptor'}->is_resource;
156             }
157            
158             # Bypass further processing if we've got a result and we only wanted one!
159             return $results[0] if @results && !$list;
160             }
161            
162             # STEP 2a: AtomOWL doesn't use <id> as a subject URI.
163             if (defined $rdfa && $rdfa->{'atom_parser'} && blessed($self->{'cache'}->{$uri}))
164             {
165             my $iterator = __rdf_query($self->_make_sparql_atomowl($uri, $list), $self->{'cache'}->{$uri});
166             while (my $row = $iterator->next)
167             {
168             push @results, $row->{'descriptor'}->uri
169             if defined $row->{'descriptor'}
170             && $row->{'descriptor'}->is_resource;
171             }
172            
173             # Bypass further processing if we've got a result and we only wanted one!
174             return $results[0] if @results && !$list;
175             }
176            
177             # STEP 3: try host-meta.
178             my $hostmeta_location = XRD::Parser::hostmeta_location($uri);
179             unless (blessed($self->{'cache'}->{$hostmeta_location}))
180             {
181             eval
182             {
183             my $hm = XRD::Parser->hostmeta($uri);
184             $hm->consume;
185             $self->{'cache'}->{$hostmeta_location} = $hm->graph;
186             };
187             }
188             if (blessed( $self->{'cache'}->{$hostmeta_location} ))
189             {
190             my $hm_graph = $self->{'cache'}->{$hostmeta_location};
191            
192             # First try original query.
193             my $iterator = __rdf_query($self->_make_sparql($uri, $list), $hm_graph);
194             while (my $row = $iterator->next)
195             {
196             push @results, $row->{'descriptor'}->uri
197             if defined $row->{'descriptor'}
198             && $row->{'descriptor'}->is_resource;
199             }
200            
201             # Then try using host-meta URI templates.
202             $iterator = __rdf_query($self->_make_sparql_template($uri, $list), $hm_graph);
203             while (my $row = $iterator->next)
204             {
205             if (defined $row->{'descriptor'}
206             && $row->{'descriptor'}->is_literal
207             && $row->{'descriptor'}->literal_datatype eq (XRD::Parser->URI_XRD.'URITemplate'))
208             {
209             my $u = $row->{'descriptor'}->literal_value;
210             $u =~ s/\{uri\}/uri_escape($uri)/ie;
211             push @results, $u;
212             }
213             }
214             }
215            
216             # STEP 4: the resource may be self-describing
217             if ($rdfa || $rdfx)
218             {
219             my $data = $self->parse($uri);
220            
221             # only add $uri to @results
222             # if we're completely desparate,
223             # or it seems to provide something useful.
224             push @results, $uri
225             if !@results
226             || $data->count_statements(RDF::Trine::Node::Resource->new($uri), undef, undef);
227             }
228            
229             if (@results)
230             {
231             return $list ? @results : $results[0];
232             }
233            
234             return;
235             }
236              
237             sub parse
238             {
239             my $self = shift;
240             my $uri = shift or return undef;
241            
242             $self = $self->new
243             unless blessed($self) && $self->isa(__PACKAGE__);
244            
245             unless (blessed($self->{'cache'}{$uri})
246             and $self->{'cache'}{$uri}->isa('RDF::Trine::Model'))
247             {
248             my $response = $self->_ua->get($uri);
249             my $model = rdf_parse();
250            
251             # Parse as RDFa, if the response is RDFa.
252             ($response, my $rdfa) = $self->_cond_parse_rdfa($response, $model, $uri);
253            
254             # If the response was not RDFa, try parsing as RDF.
255             ($response, my $rdfx) = $self->_cond_parse_rdf($response, $model, $uri)
256             unless defined $rdfa;
257            
258             # If the response was not RDFa or another type of RDF, try parsing as XRD.
259             ($response, my $xrd) = $self->_cond_parse_xrd($response, $model, $uri)
260             unless defined $rdfa || defined $rdfx;
261             }
262            
263             return $self->{'cache'}{$uri};
264             }
265              
266             sub process
267             {
268             my $self = shift;
269             my $uri = shift;
270            
271             $self = $self->new
272             unless blessed($self) && $self->isa(__PACKAGE__);
273            
274             my $descriptor = $self->discover($uri);
275             return $self->parse($descriptor) // rdf_parse();
276             }
277              
278             sub process_all
279             {
280             my $self = shift;
281             my $uri = shift;
282            
283             $self = $self->new
284             unless blessed($self) && $self->isa(__PACKAGE__);
285            
286             my @descriptors = $self->discover($uri);
287             my $model = $self->parse($uri) // rdf_parse();
288            
289             foreach my $descriptor (@descriptors)
290             {
291             my $description = $self->parse($descriptor);
292             rdf_parse($description, model=>$model); # merge
293             }
294            
295             return $model;
296             }
297              
298             sub _make_sparql
299             {
300             my ($self, $uri, $list) = @_;
301            
302             my @p;
303             foreach my $p (@{ $self->{'predicates'} })
304             {
305             push @p, sprintf('{ <%s> <%s> ?descriptor . }',
306             $uri, HTTP::Link::Parser::relationship_uri($p));
307             }
308             return $list ?
309             'SELECT DISTINCT ?descriptor WHERE { '.(join ' UNION ', @p).' }' :
310             'SELECT DISTINCT ?descriptor WHERE { OPTIONAL '.(join ' OPTIONAL ', @p).' }';
311             }
312              
313             sub _make_sparql_atomowl
314             {
315             my ($self, $uri, $list) = @_;
316            
317             my @p;
318             foreach my $p (@{ $self->{'predicates'} })
319             {
320             push @p, sprintf('{ ?feed <%s> ?descriptor . }',
321             HTTP::Link::Parser::relationship_uri($p));
322             }
323            
324             # this can be ambiguous in the face of atom:source.
325             return $list ?
326             'SELECT DISTINCT ?descriptor WHERE { ?feed a <http://bblfish.net/work/atom-owl/2006-06-06/#Feed> . { '.(join ' UNION ', @p).'} }' :
327             'SELECT DISTINCT ?descriptor WHERE { ?feed a <http://bblfish.net/work/atom-owl/2006-06-06/#Feed> . OPTIONAL '.(join ' OPTIONAL ', @p).' }';
328             }
329              
330             sub _make_sparql_template
331             {
332             my ($self, $uri, $list) = @_;
333             my $hosturi = XRD::Parser::host_uri( $uri );
334             my @p;
335             foreach my $p (@{ $self->{'predicates'} })
336             {
337             push @p, sprintf('{ <%s> <%s> ?descriptor . }',
338             $hosturi, XRD::Parser::template_uri(HTTP::Link::Parser::relationship_uri($p)));
339             }
340             return $list ?
341             'SELECT DISTINCT ?descriptor WHERE { '.(join ' UNION ', @p).' }' :
342             'SELECT DISTINCT ?descriptor WHERE { OPTIONAL '.(join ' OPTIONAL ', @p).' }';
343             }
344              
345             sub _cond_parse_rdfa
346             {
347             my ($self, $response, $model, $uri) = @_;
348            
349             my $rdfa_options;
350             my $rdfa_input;
351            
352             if ($response->content_type =~ m'^(application/atom\+xml|image/svg\+xml|application/xhtml\+xml|text/html)'i)
353             {
354             if (uc $response->request->method ne 'GET')
355             {
356             $self->_ua->max_redirect(3);
357             $response = $self->_ua->get($uri);
358             $self->_ua->max_redirect(0);
359             }
360             }
361             else
362             {
363             return ($response, undef);
364             }
365            
366             $response->is_success or return ($response, undef);
367            
368             my $hostlang = RDF::RDFa::Parser::Config->host_from_media_type($response->content_type);
369             $rdfa_options = RDF::RDFa::Parser::Config->new($hostlang, RDF::RDFa::Parser::Config->RDFA_GUESS,
370             atom_parser => ($response->content_type =~ m'^application/atom\+xml'i ? 1 : 0),
371             );
372            
373             if (defined $rdfa_options)
374             {
375             # Make sure any predicate keywords are recognised in @rel/@rev.
376             # This can override some normal RDFa keywords in some cases.
377             foreach my $attr (qw(rel rev))
378             {
379             foreach my $p (@{ $self->{'predicates'} })
380             {
381             $rdfa_options->{'keywords'}->{$attr}->{$p}
382             = HTTP::Link::Parser::relationship_uri($p)
383             unless $p =~ /:/;
384             $rdfa_options->{'keywords'}->{'insensitive'}->{$attr}->{$p}
385             = HTTP::Link::Parser::relationship_uri($p)
386             unless $p =~ /:/;
387             }
388             }
389            
390             $rdfa_input = $response->decoded_content
391             unless defined $rdfa_input;
392            
393             my $parser = RDF::RDFa::Parser->new($rdfa_input, $response->base, $rdfa_options, $model->_store);
394             $parser->consume;
395             $self->{'cache'}->{$uri} = $model;
396             return ($response, $rdfa_options);
397             }
398            
399             return ($response, undef);
400             }
401              
402             sub _cond_parse_rdf
403             {
404             my ($self, $response, $model, $uri) = @_;
405             my $type;
406            
407             if ($response->content_type =~ m'^(application/rdf\+xml|(application|text)/(x-)?(rdf\+)?(turtle|n3|json))'i)
408             {
409             if (uc $response->request->method ne 'GET')
410             {
411             $self->_ua->max_redirect(3);
412             $response = $self->_ua->get($uri);
413             $self->_ua->max_redirect(0);
414             }
415            
416             $type = 'Turtle';
417             $type = 'RDFXML' if $response->content_type =~ /rdf.xml/;
418             $type = 'RDFJSON' if $response->content_type =~ /json/;
419             }
420             else
421             {
422             return ($response, undef);
423             }
424            
425             $response->is_success or return ($response, undef);
426            
427             rdf_parse($response->decoded_content, type=>$type, model=>$model, base=>$response->base);
428             $self->{'cache'}->{$uri} = $model;
429             return ($response, 1);
430             }
431              
432             sub _cond_parse_xrd
433             {
434             my ($self, $response, $model, $uri) = @_;
435             my $type;
436            
437             if ($response->content_type =~ m'^(text/plain|application/octet-stream|application/xrd\+xml|(application|text)/xml)'i)
438             {
439             if (uc $response->request->method ne 'GET')
440             {
441             $self->_ua->max_redirect(3);
442             $response = $self->_ua->get($uri);
443             $self->_ua->max_redirect(0);
444             }
445             }
446             else
447             {
448             return ($response, undef);
449             }
450            
451             $response->is_success or return ($response, undef);
452            
453             my $xrd = XRD::Parser->new($response->decoded_content, $response->base, {loose_mime=>1}, $model->_store);
454             $xrd->consume;
455             $self->{'cache'}->{$uri} = $model;
456             return ($response, $xrd);
457             }
458              
459             sub _ua
460             {
461             my $self = shift;
462            
463             unless (defined $self->{ua})
464             {
465             $self->{ua} = LWP::UserAgent->new;
466             $self->{ua}->agent(sprintf('%s/%s (%s) ', __PACKAGE__, __PACKAGE__->VERSION, __PACKAGE__->AUTHORITY));
467             $self->{ua}->default_header(Accept => (join ', ', @MediaTypes));
468             $self->{ua}->max_redirect(0);
469             }
470            
471             return $self->{ua};
472             }
473              
474             1;
475              
476             =head1 NAME
477              
478             HTTP::LRDD - link-based resource descriptor discovery
479              
480             =head1 SYNOPSIS
481              
482             use HTTP::LRDD;
483            
484             my $lrdd = HTTP::LRDD->new;
485             my @descriptors = $lrdd->discover($resource);
486             foreach my $descriptor (@descriptors)
487             {
488             my $description = $lrdd->parse($descriptor);
489             # $description is an RDF::Trine::Model
490             }
491            
492             =head1 DESCRIPTION
493              
494             Note: the LRDD specification has ceased to be, with some parts being merged into
495             the host-meta Internet Draft. This CPAN module will go in its own direction,
496             bundling up best-practice techniques for discovering links and descriptors for a
497             given URI.
498              
499             =head2 Import Routine
500              
501             =over 4
502              
503             =item C<< use HTTP::LRDD (@predicates) >>
504              
505             When importing HTTP::LRDD, you can optionally provide a list of
506             predicate URIs (i.e. the URIs which rel values expand to). This
507             may also include IANA-registered link types, which are short tokens
508             rather than full URIs. If you do not provide a list of predicate
509             URIs, then a sensible default set is used.
510              
511             Because this configuration is global in nature, it is not recommended.
512             It is better to supply a list of predicates to the constructor
513             instead, or rely on the defaults. This feature should be regarded
514             as deprecated.
515              
516             =back
517              
518             =head2 Constructors
519              
520             =over 4
521              
522             =item C<< HTTP::LRDD->new(@predicates) >>
523              
524             Create a new LRDD discovery object using the given predicate URIs.
525             If @predicates is omitted, then the predicates passed to the import
526             routine are used instead.
527              
528             =item C<< HTTP::LRDD->new_strict >>
529              
530             Create a new LRDD discovery object using the 'describedby' and
531             'lrdd' IANA-registered predicates.
532              
533             =item C<< HTTP::LRDD->new_default >>
534              
535             Create a new LRDD discovery object using the default set of
536             predicates ('describedby', 'lrdd', 'wdrs:describedby', 'xhv:meta'
537             and 'rdfs:seeAlso').
538              
539             =back
540              
541             =head2 Public Methods
542              
543             =over 4
544              
545             =item C<< $lrdd->discover($resource_uri) >>
546              
547             Discovers a descriptor for the given resource; or if called in a list
548             context, a list of descriptors.
549              
550             A descriptor is a resource that provides a description for something.
551             So, if the given resource URI was the web address for an image, then
552             the descriptor might be the web address for a metadata file about the
553             image. If the given URI was an e-mail address, then the descriptor
554             might be a profile document for the person to whom the address belongs.
555              
556             The following sources are checked (in order) to find links to
557             descriptors.
558              
559             =over 4
560              
561             =item * HTTP response headers ("Link" header; "303 See Other" status)
562              
563             =item * HTTP response message (RDF or RDFa)
564              
565             =item * https://HOSTNAME/.well-known/host-meta
566              
567             =item * http://HOSTNAME/.well-known/host-meta
568              
569             =back
570              
571             If none of the above is able to yield a link to a descriptor, then
572             the resource URI itself may be returned if it is in a self-describing
573             format (e.g. RDF).
574              
575             There is no guaranteed file format for the descriptor, but it is
576             usually RDF, POWDER XML or XRD.
577              
578             This method can also be called without an object (as a class method)
579             in which case, a temporary object is created automatically using
580             C<< new >>.
581              
582             =item C<< $lrdd->parse($descriptor_uri) >>
583              
584             Parses a descriptor in XRD or RDF (RDF/XML, RDFa, Turtle, etc).
585              
586             Returns an RDF::Trine::Model or undef if unable to process.
587              
588             This method can also be called without an object (as a class method)
589             in which case, a temporary object is created automatically using
590             C<< new >>.
591              
592             =item C<< $lrdd->process($resource_uri) >>
593              
594             Performs the equivalent of C<discover> and C<parse> in one easy step.
595              
596             Calls C<discover> in a non-list context, so only the first descriptor
597             is used.
598              
599             =item C<< $lrdd->process_all($resource_uri) >>
600              
601             Performs the equivalent of C<discover> and C<parse> in one easy step.
602              
603             Calls C<discover> in a list context, so multiple descriptors are
604             combined into the resulting graph.
605              
606             =back
607              
608             =head1 EXAMPLES
609              
610             Discover the hub address (PubSubHubub) for a feed:
611              
612             my $lrdd = HTTP::LRDD->new('hub');
613             my $hub = $lrdd->discover('http://example.net/feed.atom');
614              
615             Discover an author link (rel="author") from an HTML page:
616              
617             my $lrdd = HTTP::LRDD->new('author');
618             my $author = $lrdd->discover('http://example.com/page.html');
619              
620             (For RDF people, you should note that rel="author" is not semantically
621             equivalent to the "foaf:maker" property but closer to the
622             "foaf:maker/foaf:homepage" SPARQL 1.1 property path - i.e. the rel="author"
623             link destination is not a URI for the author themselves, but a page about
624             the author.)
625              
626             If that author resource is in a machine-readable format (e.g. RDF), then
627             parse the data:
628              
629             my $author_data = $lrdd->parse($author);
630            
631             Or, you can combine C<discover> and C<parse>:
632              
633             my $lrdd = HTTP::LRDD->new('author');
634             my $author_data = $lrdd->process('http://example.com/page.html');
635              
636             Get metadata for an image:
637              
638             my $lrdd = HTTP::LRDD->new;
639             my $data = $lrdd->process_all('http://example.org/flower.jpeg');
640              
641             As we're not passing any arguments to the constructor, we can use a shortcut:
642              
643             my $data = HTTP::LRDD->process_all('http://example.org/flower.jpeg');
644            
645             Find the title of the image:
646              
647             use RDF::QueryX::Lazy;
648            
649             my $image = q<http://example.org/flower.jpeg>;
650             my $results = RDF::QueryX::Lazy
651             -> new("SELECT * WHERE { <$image> dc:title ?t }")
652             -> execute( HTTP::LRDD->process_all($image) );
653              
654             =head1 BUGS
655              
656             Please report any bugs to L<http://rt.cpan.org/>.
657              
658             B<Note>: many problems can stem from servers that send incorrect
659             C<Content-Type> headers. If you send an XRD file as "text/html",
660             then this module will not guess what you're doing - it will assume
661             the file is really HTML, and inspect it for RDFa. For host-meta
662             files, this module is slightly more relaxed, as there's a strong
663             assumption that they are XRD... but YOU SHOULD NOT RELY ON THIS.
664             If you're running a server, use the correct media type.
665              
666             =head1 SEE ALSO
667              
668             L<HTTP::Link::Parser>, L<XRD::Parser>, L<XML::Atom::OWL>
669             L<WWW::Finger>.
670              
671             L<http://www.perlrdf.org/>.
672              
673             =head1 AUTHOR
674              
675             Toby Inkster E<lt>tobyink@cpan.orgE<gt>.
676              
677             =head1 COPYRIGHT AND LICENCE
678              
679             Copyright 2010-2012 Toby Inkster
680              
681             This library is free software; you can redistribute it and/or modify it
682             under the same terms as Perl itself.
683              
684             =head1 DISCLAIMER OF WARRANTIES
685              
686             THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
687             WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
688             MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
689              
690             =cut