File Coverage

blib/lib/RDF/Trine/Pattern.pm
Criterion Covered Total %
statement 184 193 95.3
branch 32 40 80.0
condition 5 15 33.3
subroutine 28 28 100.0
pod 14 14 100.0
total 263 290 90.6


line stmt bran cond sub pod time code
1             # RDF::Trine::Pattern
2             # -----------------------------------------------------------------------------
3              
4             =head1 NAME
5              
6             RDF::Trine::Pattern - Class for basic graph patterns
7              
8             =head1 VERSION
9              
10             This document describes RDF::Trine::Pattern version 1.017
11              
12             =cut
13              
14             package RDF::Trine::Pattern;
15              
16 68     68   415 use strict;
  68         156  
  68         1577  
17 68     68   312 use warnings;
  68         142  
  68         1486  
18 68     68   310 no warnings 'redefine';
  68         153  
  68         1671  
19              
20 68     68   346 use Data::Dumper;
  68         161  
  68         2622  
21 68     68   371 use Log::Log4perl;
  68         150  
  68         423  
22 68     68   3530 use Scalar::Util qw(blessed refaddr);
  68         160  
  68         2984  
23 68     68   396 use List::Util qw(any);
  68         190  
  68         3625  
24 68     68   397 use Carp qw(carp croak confess);
  68         163  
  68         3033  
25 68     68   371 use RDF::Trine::Iterator qw(smap);
  68         158  
  68         2465  
26 68     68   363 use RDF::Trine qw(iri);
  68         155  
  68         3291  
27              
28             ######################################################################
29              
30             our ($VERSION);
31             BEGIN {
32 68     68   95527 $VERSION = '1.017';
33             }
34              
35             ######################################################################
36              
37             =head1 METHODS
38              
39             =over 4
40              
41             =item C<< new ( @triples ) >>
42              
43             Returns a new BasicGraphPattern structure.
44              
45             =cut
46              
47             sub new {
48 990     990 1 7978 my $class = shift;
49 990         2423 my @triples = @_;
50 990         2101 foreach my $t (@triples) {
51 1038 100 100     8806 unless (blessed($t) and $t->isa('RDF::Trine::Statement')) {
52 2         23 throw RDF::Trine::Error -text => "Patterns belonging to a BGP must be triples";
53             }
54             }
55 988         3643 return bless( [ @triples ], $class );
56             }
57              
58             =item C<< construct_args >>
59              
60             Returns a list of arguments that, passed to this class' constructor,
61             will produce a clone of this algebra pattern.
62              
63             =cut
64              
65             sub construct_args {
66 1     1 1 488 my $self = shift;
67 1         5 return ($self->triples);
68             }
69              
70             =item C<< triples >>
71              
72             Returns a list of triples belonging to this BGP.
73              
74             =cut
75              
76             sub triples {
77 2950     2950 1 5008 my $self = shift;
78 2950         9155 return @$self;
79             }
80              
81             =item C<< type >>
82              
83             =cut
84              
85             sub type {
86 25     25 1 84 return 'BGP';
87             }
88              
89             =item C<< sse >>
90              
91             Returns the SSE string for this algebra expression.
92              
93             =cut
94              
95             sub sse {
96 31     31 1 76 my $self = shift;
97 31         70 my $context = shift;
98            
99             return sprintf(
100             '(bgp %s)',
101 31         93 join(' ', map { $_->sse( $context ) } $self->triples)
  54         203  
102             );
103             }
104              
105             =item C<< referenced_variables >>
106              
107             Returns a list of the variable names used in this algebra expression.
108              
109             =cut
110              
111             sub referenced_variables {
112 65     65 1 131 my $self = shift;
113 65         181 return RDF::Trine::_uniq(map { $_->referenced_variables } $self->triples);
  118         372  
114             }
115              
116             =item C<< definite_variables >>
117              
118             Returns a list of the variable names that will be bound after evaluating this algebra expression.
119              
120             =cut
121              
122             sub definite_variables {
123 1     1 1 3 my $self = shift;
124 1         3 return RDF::Trine::_uniq(map { $_->definite_variables } $self->triples);
  1         5  
125             }
126              
127             =item C<< clone >>
128              
129             =cut
130              
131             sub clone {
132 1     1 1 4 my $self = shift;
133 1         3 my $class = ref($self);
134 1         4 return $class->new( map { $_->clone } $self->triples );
  1         6  
135             }
136              
137             =item C<< bind_variables ( \%bound ) >>
138              
139             Returns a new pattern with variables named in %bound replaced by their corresponding bound values.
140              
141             =cut
142              
143             sub bind_variables {
144 1     1 1 4 my $self = shift;
145 1         3 my $class = ref($self);
146 1         2 my $bound = shift;
147 1         3 return $class->new( map { $_->bind_variables( $bound ) } $self->triples );
  1         6  
148             }
149              
150             =item C<< subsumes ( $statement ) >>
151              
152             Returns true if the pattern will subsume the $statement when matched against a
153             triple store.
154              
155             =cut
156              
157             sub subsumes {
158 5     5 1 482 my $self = shift;
159 5         9 my $st = shift;
160            
161 5         25 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
162 5         622 my @triples = $self->triples;
163 5         9 foreach my $t (@triples) {
164 5 100       19 if ($t->subsumes( $st )) {
165 4         12 $l->debug($self->sse . " \x{2292} " . $st->sse);
166 4         51 return 1;
167             }
168             }
169 1         6 return 0;
170             }
171              
172             =item C<< merge_patterns ( @patterns ) >>
173              
174             Given an array of patterns, this will merge them into one.
175              
176             =cut
177              
178             sub merge_patterns {
179 12     12 1 39 my ($class, @patterns) = @_;
180 12         26 my @all_triples;
181 12         30 foreach my $pattern (@patterns) {
182 12 50 33     110 unless (blessed($pattern) and $pattern->isa('RDF::Trine::Pattern')) {
183 0         0 throw RDF::Trine::Error -text => "Patterns to be merged must be patterns themselves";
184             }
185 12         56 push(@all_triples, $pattern->triples);
186             }
187 12         38 return $class->new(@all_triples);
188             }
189              
190             =item C<< sort_for_join_variables >>
191              
192             Returns a new pattern object with the subpatterns of the referrant
193             sorted based on heuristics that ensure firstly that patterns can be
194             joined on the same variable and secondly on the usual selectivity
195             (i.e. how quickly the engine can drill down to the answer) of triple
196             patterns. Calls C<< subgroup >>, C<< sort_triples >> and C<<
197             merge_patterns >> in that order.
198              
199             =cut
200              
201             sub sort_for_join_variables {
202 914     914 1 1705 my $self = shift;
203 914 100       2573 return $self if (scalar $self->triples == 1);
204              
205 12         36 my $class = ref($self);
206 12         104 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
207 12         2741 $l->debug('Reordering ' . scalar $self->triples . ' triples for heuristical optimizations');
208              
209 12         163 my @sorted_triple_patterns = $self->subgroup;
210              
211 12         30 my @patterns;
212 12         31 foreach my $pattern (@sorted_triple_patterns) {
213 12         44 my $sorted = $pattern->sort_triples;
214 12         33 push(@patterns, $sorted);
215             }
216 12         47 return $class->merge_patterns(@patterns);
217             }
218              
219              
220             =item C<< subgroup >>
221              
222             Splits the pattern object up in an array of pattern objects where the
223             same triple patterns occur. It will group on common variables, so that
224             triple patterns can be joined together is in a group together. It will
225             also group triples that have no connection to other triples in a
226             group. It will then order the groups, first by number triples with
227             common variables, then by number of literals, then by the total number
228             of terms that are not variables.
229              
230              
231             =cut
232              
233             sub subgroup {
234 12     12 1 34 my $self = shift;
235 12         43 my @triples = $self->triples;
236 12         55 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
237 12         320 my %structure_counts;
238             my %triples_by_tid;
239             # First, we loop the dataset to compile some numbers for the
240             # variables in each triple pattern. This is to break the pattern
241             # into subpatterns that can be joined on the same variable
242 12         34 foreach my $t (@triples) {
243 24         72 my $tid = refaddr($t);
244 24         115 $triples_by_tid{$tid} = $t;
245 24         50 my $not_variable = 0;
246 24         86 foreach my $n ($t->nodes) {
247 80 100       256 if ($n->isa('RDF::Trine::Node::Variable')) {
248 31         101 my $name = $n->name;
249 31         92 $structure_counts{ $name }{ 'name' } = $name; # TODO: Worth doing in an array?
250 31         55 push(@{$structure_counts{$name}{'claimed_patterns'}}, $tid);
  31         99  
251 31         70 $structure_counts{ $name }{ 'common_variable_count' }++;
252 31 100       103 $structure_counts{ $name }{ 'not_variable_count' } = 0 unless ($structure_counts{ $name }{ 'not_variable_count' });
253 31 50       96 $structure_counts{ $name }{ 'literal_count' } = 0 unless ($structure_counts{ $name }{ 'literal_count' });
254 31         109 foreach my $char (split(//, $n->as_string)) { # TODO: Use a more standard format
255 93         191 $structure_counts{ $name }{ 'string_sum' } += ord($char);
256             }
257 31         102 foreach my $o ($t->nodes) {
258 99 100 33     416 unless ($o->isa('RDF::Trine::Node::Variable')) {
259 48         105 $structure_counts{ $name }{ 'not_variable_count' }++;
260             }
261             elsif ($o->isa('RDF::Trine::Node::Literal')) {
262             $structure_counts{ $name }{ 'literal_count' }++;
263             }
264             }
265             } else {
266 49         92 $not_variable++;
267             }
268             }
269 24 100       81 if ($not_variable == 3) { # Then, there are no variables in the pattern
270 6         12 my $name = '_no_definite';
271 6         15 $structure_counts{ $name }{ 'not_variable_count' } = $not_variable;
272 6         12 $structure_counts{ $name }{ 'common_variable_count' } = 0;
273 6         11 $structure_counts{ $name }{ 'literal_count' } = 0; # Doesn't mean anything now
274 6         12 $structure_counts{ $name }{ 'string_sum' } = 0; # Doesn't mean anything now
275 6         13 push(@{$structure_counts{$name}{'claimed_patterns'}}, $tid);
  6         18  
276             }
277              
278             }
279              
280             # Group triple subpatterns with just one triple pattern
281 12         28 my $just_ones;
282 12         63 while (my ($name, $data) = each(%structure_counts)) {
283 27 100       112 if($data->{'common_variable_count'} <= 1) {
284 19         42 $just_ones->{'common_variable_count'} = 1;
285 19         39 $just_ones->{'string_sum'} = 1;
286 19         48 $just_ones->{'literal_count'} += $data->{'literal_count'};
287 19         38 $just_ones->{'not_variable_count'} += $data->{'not_variable_count'};
288 19         29 my @claimed = @{$data->{'claimed_patterns'}};
  19         49  
289 19 100   7   71 unless (any { $_ == $claimed[0] } @{$just_ones->{'claimed_patterns'}}) {
  7         21  
  19         94  
290 14         36 push(@{$just_ones->{'claimed_patterns'}}, $claimed[0]);
  14         45  
291             }
292 19         125 delete $structure_counts{$name};
293             }
294             }
295              
296 12         88 $l->trace('Results of structural analysis: ' . Dumper(\%structure_counts));
297 12         1397 $l->trace('Block of single-triple patterns: ' . Dumper($just_ones));
298              
299             # Now, sort the patterns in the order specified by first the number
300             # of occurances of common variables, then the number of literals
301             # and then the number of terms that are not variables
302 12         847 my @sorted_patterns = sort { $b->{'common_variable_count'} <=> $a->{'common_variable_count'}
303             or $b->{'literal_count'} <=> $a->{'literal_count'}
304             or $b->{'not_variable_count'} <=> $a->{'not_variable_count'}
305 0 0 0     0 or $b->{'string_sum'} <=> $a->{'string_sum'}
      0        
306             } values(%structure_counts);
307              
308 12         28 push (@sorted_patterns, $just_ones);
309              
310 12         30 my @sorted_triple_patterns;
311              
312             # Now, loop through the sorted patterns, let the one with most
313             # weight first select the triples it wants to join. Within those
314             # subpatterns, apply the sort order of triple pattern heuristic
315 12         35 foreach my $item (@sorted_patterns) {
316 12         23 my @triple_patterns;
317 12         34 my $triples_left = scalar keys(%triples_by_tid);
318 12 50       44 if ($triples_left > 2) {
319 0         0 foreach my $tid (@{$item->{'claimed_patterns'}}) {
  0         0  
320 0 0       0 if (defined($triples_by_tid{$tid})) {
321 0         0 push(@triple_patterns, $triples_by_tid{$tid});
322 0         0 delete $triples_by_tid{$tid};
323             }
324             }
325 0         0 $l->debug("There are $triples_left triples left");
326 0         0 push(@sorted_triple_patterns, RDF::Trine::Pattern->new(@triple_patterns)); # TODO: Better way to call ourselves?
327             } else {
328 12         64 $l->debug("There is a rest of $triples_left triples");
329 12         120 push(@sorted_triple_patterns, RDF::Trine::Pattern->new(values(%triples_by_tid)));
330 12         31 last;
331             }
332             }
333              
334 12         75 return @sorted_triple_patterns;
335             }
336              
337             =item C<< sort_triples >>
338              
339             Will sort the triple patterns based on heuristics that looks at how
340             many variables the patterns have, and where they occur, see REFERENCES
341             for details. Returns a new sorted pattern object.
342              
343             =cut
344              
345             sub sort_triples {
346 12     12 1 23 my $self = shift;
347 12         44 return $self->_hsp_heuristic_1_4_triple_pattern_order;
348             }
349              
350             sub _hsp_heuristic_1_4_triple_pattern_order { # Heuristic 1 and 4 of HSP
351 12     12   27 my $self = shift;
352 12         29 my $class = ref($self);
353 12         34 my @triples = @$self;
354 12 50       46 return $self if (scalar @triples == 1);
355 12         35 my %triples_by_tid;
356 12         26 foreach my $t (@triples) {
357 24         82 my $tid = refaddr($t);
358 24         82 $triples_by_tid{$tid}{'tid'} = $tid; # TODO: Worth doing this in an array?
359 24         57 $triples_by_tid{$tid}{'triple'} = $t;
360 24         66 $triples_by_tid{$tid}{'sum'} = _hsp_heuristic_triple_sum($t);
361             }
362 12         70 my @sorted_tids = sort { $a->{'sum'} <=> $b->{'sum'} } values(%triples_by_tid);
  12         59  
363 12         29 my @sorted_triples;
364 12         33 foreach my $entry (@sorted_tids) {
365 24         61 push(@sorted_triples, $triples_by_tid{$entry->{'tid'}}->{'triple'});
366             }
367 12         41 return $class->new(@sorted_triples);
368             }
369              
370             # The below function finds a number to aid sorting
371             # It takes into account Heuristic 1 and 4 of the HSP paper, see REFERENCES
372             # as well as that it was noted in the text that rdf:type is usually less selective.
373              
374             # By assigning the integers to nodes, depending on whether they are in
375             # triple (subject, predicate, object), variables, rdf:type and
376             # literals, and sum them, they may be sorted. See code for the actual
377             # values used.
378              
379             # Denoting s for bound subject, p for bound predicate, a for rdf:type
380             # as predicate, o for bound object and l for literal object and ? for
381             # variable, we get the following order, most of which are identical to
382             # the HSP:
383              
384             # spl: 6
385             # spo: 8
386             # sao: 10
387             # s?l: 14
388             # s?o: 16
389             # ?pl: 25
390             # ?po: 27
391             # sp?: 30
392             # sa?: 32
393             # ??l: 33
394             # ??o: 35
395             # s??: 38
396             # ?p?: 49
397             # ?a?: 51
398             # ???: 57
399              
400             # Note that this number is not intended as an estimate of selectivity,
401             # merely a sorting key, but further research may possibly create such
402             # numbers.
403              
404             sub _hsp_heuristic_triple_sum {
405 24     24   41 my $t = shift;
406 24         39 my $sum = 0;
407 24 100       85 if ($t->subject->is_variable) {
408 15         33 $sum = 20;
409             } else {
410 9         17 $sum = 1;
411             }
412 24 100       93 if ($t->predicate->is_variable) {
413 2         4 $sum += 10;
414             } else {
415 22 100       86 if ($t->predicate->equal(iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))) {
416 7         23 $sum += 4;
417             } else {
418 15         36 $sum += 2;
419             }
420             }
421 24 100       98 if ($t->object->is_variable) {
    100          
422 14         35 $sum += 27;
423             } elsif ($t->object->is_literal) {
424 3         9 $sum += 3;
425             } else {
426 7         18 $sum += 5;
427             }
428 24         112 my $l = Log::Log4perl->get_logger("rdf.trine.pattern");
429             # Now a trick to get an deterministic sort order, hard to test without.
430 24         697 $sum *= 10000000;
431 24         110 foreach my $c (split(//,$t->as_string)) {
432 1643         2215 $sum += ord($c);
433             }
434 24         151 $l->debug($t->as_string . " triple has sorting sum " . $sum);
435 24         249 return $sum;
436             }
437              
438              
439            
440              
441             1;
442              
443             __END__
444              
445             =back
446              
447             =head1 BUGS
448              
449             Please report any bugs or feature requests to through the GitHub web interface
450             at L<https://github.com/kasei/perlrdf/issues>.
451              
452             =head1 REFERENCES
453              
454             The heuristics to order triple patterns in this module is strongly
455             influenced by L<The ICS-FORTH Heuristics-based SPARQL Planner
456             (HSP)|http://www.ics.forth.gr/isl/index_main.php?l=e&c=645>.
457              
458             =head1 AUTHOR
459              
460             Gregory Todd Williams C<< <gwilliams@cpan.org> >>
461              
462             Kjetil Kjernsmo C<< <kjetilk@cpan.org> >>
463              
464             =head1 COPYRIGHT
465              
466             Copyright (c) 2006-2012 Gregory Todd Williams. This
467             program is free software; you can redistribute it and/or modify it under
468             the same terms as Perl itself.
469              
470             =cut