| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
2
|
|
|
2
|
|
62584
|
use strict; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
129
|
|
|
2
|
|
|
|
|
|
|
package RDF::TrineX::Merge::Bnodes; |
|
3
|
|
|
|
|
|
|
#ABSTRACT: Merge blank nodes that obviously refer to the same resource |
|
4
|
|
|
|
|
|
|
our $VERSION = '0.1.1'; #VERSION |
|
5
|
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
1484
|
use parent 'Exporter'; |
|
|
2
|
|
|
|
|
602
|
|
|
|
2
|
|
|
|
|
13
|
|
|
7
|
|
|
|
|
|
|
our @EXPORT = qw(merge_bnodes); |
|
8
|
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
7382
|
use Digest; |
|
|
2
|
|
|
|
|
1101
|
|
|
|
2
|
|
|
|
|
53
|
|
|
10
|
2
|
|
|
2
|
|
3155
|
use RDF::Trine::Model; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# merge_bnodes($model_or_iterator, %options)
#
# Copies the statements of a model or statement iterator into a model and
# drops duplicated "star" subgraphs: groups of statements that share one
# blank node, contain no other blank node, and differ from another group
# only by the identifier of that blank node.
#
# Arguments:
#   $model_or_iterator - an object with a `next` method returning statements;
#                        an RDF::Trine::Model is converted with `as_stream`.
#   %options:
#     digest - a digest object, or an algorithm name passed to Digest->new
#              (default: 'MD5').
#     model  - object that receives the result via `add_statement`
#              (default: a new RDF::Trine::Model).
#
# Returns the model the statements were added to.
sub merge_bnodes {
    my ($iterator, %options) = @_;

    # configuration
    $iterator = $iterator->as_stream if $iterator->isa('RDF::Trine::Model');

    my $digest = $options{digest} || 'MD5';
    $digest = Digest->new($digest) unless ref $digest;

    my $model = $options{model} || RDF::Trine::Model->new;

    # Buffer statements that contain exactly one blank node, keyed by the
    # identifier of that blank node. An entry that exists but is undef is a
    # tombstone: the blank node is connected to another blank node (or, later
    # on, is a dropped duplicate) and must not be buffered any more.
    my %buffer;
    while (my $triple = $iterator->next) {
        my $subj = $triple->subject;
        my $obj  = $triple->object;

        my $subj_is_blank = $subj->isa('RDF::Trine::Node::Blank');
        my $obj_is_blank  = $obj->isa('RDF::Trine::Node::Blank');

        if ( $subj_is_blank and $obj_is_blank ) {
            # both blank => neither node can be merged: flush what was
            # buffered for them and mark both as connected
            foreach my $bnode_id (map { $_->blank_identifier } $subj, $obj) {
                foreach my $buffered (@{ $buffer{$bnode_id} || [] }) {
                    $model->add_statement($buffered);
                }
                $buffer{$bnode_id} = undef;
            }
        } elsif ( $subj_is_blank or $obj_is_blank ) {
            my $id = ($subj_is_blank ? $subj : $obj)->blank_identifier;

            # buffer unless the blank node has been tombstoned
            if ( $buffer{$id} or !exists $buffer{$id} ) {
                push @{ $buffer{$id} }, $triple;
                next;
            }
        }

        # no blank node, two blank nodes, or a connected blank node:
        # the statement is passed through unchanged
        $model->add_statement( $triple );
    }

    # Group the remaining blank nodes by a digest of their star subgraph.
    my %ids_of_digest;
    foreach my $id (keys %buffer) {
        my $triples = $buffer{$id};
        next if !defined $triples;

        # Canonical form of each statement: the blank node is replaced by
        # '~', all other nodes are serialized with as_ntriples. A hash is
        # used so that the digest covers the *set* of statements; a statement
        # repeated by the iterator must not change the digest.
        my %canonical;
        foreach my $triple (@$triples) {
            my ($subj, $obj) = map {
                $_->isa('RDF::Trine::Node::Blank') ? '~' : $_->as_ntriples
            } $triple->subject, $triple->object;
            my $line = join ' ', $subj, $triple->predicate->as_ntriples, $obj;
            $canonical{$line} = 1;
        }

        $digest->reset;
        $digest->add($_) for sort keys %canonical;

        push @{ $ids_of_digest{ $digest->b64digest } }, $id;
    }

    # keep only one of each group of blank nodes that obviously refer to the
    # same resource
    foreach my $ids (values %ids_of_digest) {
        # sorting only serves to keep a stable blank node identifier
        my (undef, @duplicates) = sort @$ids;
        $buffer{$_} = undef for @duplicates;
    }

    # add remaining triples with bnodes
    foreach my $triples (grep { defined $_ } values %buffer) {
        $model->add_statement($_) for @$triples;
    }

    return $model;
}
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
__END__ |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=pod |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=encoding UTF-8 |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 NAME |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
RDF::TrineX::Merge::Bnodes - Merge blank nodes that obviously refer to the same resource |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head1 VERSION |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
version 0.1.1 |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
use RDF::TrineX::Merge::Bnodes; |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
$model = merge_bnodes($model_or_iterator, %options); |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
To give an example, applying C<merge_bnodes> on this graph: |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
@prefix foaf: <http://xmlns.com/foaf/0.1/> . |
|
131
|
|
|
|
|
|
|
@base <http://example.org/> . |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . |
|
134
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . # obviously the same |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
will remove the second Bob. |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
This module exports the function B<merge_bnodes> to merge blank nodes that |
|
141
|
|
|
|
|
|
|
obviously refer to the same resource in an RDF graph. The function gets passed |
|
142
|
|
|
|
|
|
|
a L<RDF::Trine::Model> or L<RDF::Trine::Iterator>. The model or iterator |
|
143
|
|
|
|
|
|
|
should only contain RDF-compatible statements (e.g. no blank node predicates). |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The function can be applied to get rid of obviously duplicated statements. |
|
146
|
|
|
|
|
|
|
Obviously duplicated statements are defined as follows: |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=over |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The statements include either a blank node subject or a blank node object. |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The statements only differ by their blank node identifier. |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=item |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
The blank nodes are not part of any other statement that includes two blank |
|
161
|
|
|
|
|
|
|
nodes. |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=back |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
In other words, the algorithm first finds all star subgraphs with the internal |
|
166
|
|
|
|
|
|
|
node as the only blank node in the subgraph. Each subgraph is assigned a digest |
|
167
|
|
|
|
|
|
|
value calculated from all triples and nodes except the blank nodes. Then |
|
168
|
|
|
|
|
|
|
duplicated subgraphs with same digest are removed. |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head1 LIMITATIONS |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
Statements that involve multiple blank nodes or blank nodes that are connected |
|
173
|
|
|
|
|
|
|
to another blank node are never removed. |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Don't expect the algorithm to understand what is actually meant by the |
|
176
|
|
|
|
|
|
|
existence of blank nodes in your data. |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head1 CONFIGURATION |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
Options can be passed as key-value pairs: |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=over |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=item digest |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
A L<Digest> or the name of a Digest module, e.g. "C<MD4>". The default digest |
|
187
|
|
|
|
|
|
|
is L<Digest::MD5>. |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=back |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Options not implemented yet: |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=over |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=item |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Option to skolemize blank nodes (IRIs with C<.well-known/genid/>). |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=item |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Option to also remove entailed statements with blank nodes: |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ; foaf:name "Bob" ] . |
|
204
|
|
|
|
|
|
|
<Alice> foaf:knows [ a foaf:Person ] . # could also be removed |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=item |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=back |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=head1 AUTHOR |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
Jakob Voß |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
This software is copyright (c) 2014 by Jakob Voß. |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
|
219
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=cut |