line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
2
|
|
|
2
|
|
62584
|
use strict; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
129
|
|
2
|
|
|
|
|
|
|
package RDF::TrineX::Merge::Bnodes; |
3
|
|
|
|
|
|
|
#ABSTRACT: Merge blank nodes that obviously refer to the same resource |
4
|
|
|
|
|
|
|
our $VERSION = '0.1.1'; #VERSION |
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
1484
|
use parent 'Exporter'; |
|
2
|
|
|
|
|
602
|
|
|
2
|
|
|
|
|
13
|
|
7
|
|
|
|
|
|
|
our @EXPORT = qw(merge_bnodes); |
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
7382
|
use Digest; |
|
2
|
|
|
|
|
1101
|
|
|
2
|
|
|
|
|
53
|
|
10
|
2
|
|
|
2
|
|
3155
|
use RDF::Trine::Model; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# merge_bnodes($model_or_iterator, %options)
#
# Copies the statements of an RDF::Trine::Model or of a statement iterator
# into a model and drops blank nodes that are exact duplicates of another
# blank node. Two blank nodes count as duplicates when neither of them occurs
# in a statement with two blank nodes and their sets of statements are equal
# once the blank node itself is replaced by a placeholder.
#
# Options (key-value pairs):
#   digest - a Digest object or the name of a Digest algorithm (default 'MD5')
#   model  - model to add the resulting statements to
#            (default: a new RDF::Trine::Model)
#
# Returns the model that received the statements.
sub merge_bnodes {
    my ($iterator, %options) = @_;

    # configuration
    $iterator = $iterator->as_stream if $iterator->isa('RDF::Trine::Model');

    my $digest = $options{digest} || 'MD5';
    $digest = Digest->new($digest) unless ref $digest;

    my $model = $options{model} || RDF::Trine::Model->new;

    # Buffer statements with exactly one blank node, keyed by the identifier
    # of that blank node. A buffer entry that exists but is undef marks a
    # blank node that must not be merged any more.
    my %buffer;
    while (my $triple = $iterator->next) {
        my $subj = $triple->subject;
        my $obj  = $triple->object;

        my $subj_is_blank = $subj->isa('RDF::Trine::Node::Blank');
        my $obj_is_blank  = $obj->isa('RDF::Trine::Node::Blank');

        if ( $subj_is_blank and $obj_is_blank ) {
            # both blank => flush what was buffered for either node and
            # exclude both nodes from merging
            foreach my $bnode_id ( map { $_->blank_identifier } $subj, $obj ) {
                foreach my $buffered ( @{ $buffer{$bnode_id} || [] } ) {
                    $model->add_statement($buffered);
                }
                $buffer{$bnode_id} = undef;
            }
        }
        elsif ( $subj_is_blank or $obj_is_blank ) {
            my $id = ( $subj_is_blank ? $subj : $obj )->blank_identifier;

            # buffer unless the blank node has been excluded before
            if ( defined $id and ($buffer{$id} or !exists $buffer{$id}) ) {
                push @{ $buffer{$id} }, $triple;
                next;
            }
        }

        $model->add_statement($triple);
    }

    # group the blank node identifiers by the digest of their statements
    my %ids_of_digest;
    foreach my $id ( keys %buffer ) {
        my $triples = $buffer{$id};
        next if !defined $triples;

        # Canonical form of the statements connected to blank node $id. The
        # strings are collected as hash keys because a graph is a set of
        # statements: an iterator that yields the same statement twice must
        # result in the same digest as one that yields it once.
        my %canonical;
        foreach my $buffered (@$triples) {
            my ($subj, $obj) = map {
                $_->isa('RDF::Trine::Node::Blank') ? '~' : $_->as_ntriples
            } $buffered->subject, $buffered->object;
            my $line = join ' ', $subj, $buffered->predicate->as_ntriples, $obj;
            $canonical{$line} = 1;
        }

        $digest->reset;
        $digest->add($_) for sort keys %canonical;

        push @{ $ids_of_digest{ $digest->b64digest } }, $id;
    }

    # keep only one of each group of bnodes that obviously refer to the
    # same resource
    foreach my $ids ( values %ids_of_digest ) {
        # sort only required for stable bnode ids (FIXME?)
        my ($kept, @duplicates) = sort @$ids;
        $buffer{$_} = undef for @duplicates;
    }

    # add remaining triples with bnodes
    foreach my $triples ( grep { defined $_ } values %buffer ) {
        $model->add_statement($_) for @$triples;
    }

    return $model;
}
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
__END__ |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=pod |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=encoding UTF-8 |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 NAME |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
RDF::TrineX::Merge::Bnodes - Merge blank nodes that obviously refer to the same resource |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head1 VERSION |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
version 0.1.1 |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head1 SYNOPSIS |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
use RDF::TrineX::Merge::Bnodes; |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
$model = merge_bnodes($model_or_iterator, %options); |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
To give an example, applying C<merge_bnodes> on this graph: |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
@prefix foaf: <http://xmlns.com/foaf/0.1/> . |
131
|
|
|
|
|
|
|
@base <http://example.org/> . |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . |
134
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . # obviously the same |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
will remove the second Bob. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 DESCRIPTION |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
This module exports the function B<merge_bnodes> to merge blank nodes that |
141
|
|
|
|
|
|
|
obviously refer to the same resource in an RDF graph. The function gets passed |
142
|
|
|
|
|
|
|
a L<RDF::Trine::Model> or L<RDF::Trine::Iterator>. The model or iterator |
143
|
|
|
|
|
|
|
should only contain RDF-compatible statements (e.g. no blank node predicates). |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The function can be applied to get rid of obviously duplicated statements. |
146
|
|
|
|
|
|
|
Obviously duplicated statements are defined as follows: |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=over |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The statements include either a blank node subject or a blank node object. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The statements only differ by their blank node identifier. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=item |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
The blank nodes are not part of any other statement that includes two blank |
161
|
|
|
|
|
|
|
nodes. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=back |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
In other words, the algorithm first finds all star subgraphs with the internal |
166
|
|
|
|
|
|
|
node as the only blank node in the subgraph. Each subgraph is assigned a digest |
167
|
|
|
|
|
|
|
value calculated from all triples and nodes except the blank nodes. Then |
168
|
|
|
|
|
|
|
duplicated subgraphs with same digest are removed. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head1 LIMITATIONS |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
Statements that involve multiple blank nodes or blank nodes that are connected |
173
|
|
|
|
|
|
|
to another blank node are never removed. |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Don't expect the algorithm to understand what is actually meant by the |
176
|
|
|
|
|
|
|
existence of blank nodes in your data. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head1 CONFIGURATION |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
Options can be passed as key-value pairs: |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=over |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item digest |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
A L<Digest> or the name of a Digest module, e.g. "C<MD4>". The default digest |
187
|
|
|
|
|
|
|
is L<Digest::MD5>. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=back |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Options not implemented yet: |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=over |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=item |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Option to skolemize blank nodes (IRIs with C<.well-known/genid/>). |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=item |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Option to also remove entailed statements with blank nodes: |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . |
204
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ] . # could also be removed |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=item |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=back |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head1 AUTHOR |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
Jakob Voß |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
This software is copyright (c) 2014 by Jakob Voß. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
219
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=cut |