line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
OWL::Simple::Parser |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use OWL::Simple::Parser; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# load Experimental Factor Ontology (http://www.ebi.ac.uk/efo/efo.owl) |
12
|
|
|
|
|
|
|
my $parser = OWL::Simple::Parser->new( owlfile => 'efo.owl', |
13
|
|
|
|
|
|
|
synonym_tag => 'efo:alternative_term', |
14
|
|
|
|
|
|
|
definition_tag => 'efo:definition' ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# parse file |
17
|
|
|
|
|
|
|
$parser->parse(); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# iterate through all the classes |
20
|
|
|
|
|
|
|
for my $id (keys %{ $parser->class }){ |
21
|
|
|
|
|
|
|
my $OWLClass = $parser->class->{$id}; |
22
|
|
|
|
|
|
|
print $id . ' ' . $OWLClass->label . "\n"; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# list synonyms |
25
|
|
|
|
|
|
|
for my $syn (@{ $OWLClass->synonyms }){ |
26
|
|
|
|
|
|
|
print "\tsynonym - $syn\n"; |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# list definitions |
30
|
|
|
|
|
|
|
for my $def (@{ $OWLClass->definitions }){ |
31
|
|
|
|
|
|
|
print "\tdef - $def\n"; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# list parents |
35
|
|
|
|
|
|
|
for my $parent (@{ $OWLClass->subClassOf }){ |
36
|
|
|
|
|
|
|
print "\tsubClassOf - $parent\n"; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 DESCRIPTION |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
A simple OWL parser that loads accessions, labels and synonyms and exposes them |
43
|
|
|
|
|
|
|
as a collection of OWL::Simple::Class objects. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This module wraps XML::Parser, which is a sequential event-driven XML parser that |
46
|
|
|
|
|
|
|
can potentially handle very large XML documents. The whole XML structure |
47
|
|
|
|
|
|
|
is never loaded into memory completely, only the bits of interest. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
In the constructor specify the owlfile to be loaded and two optional tags - |
50
|
|
|
|
|
|
|
synonym_tag and definition_tag - that define custom annotations in the ontology for |
51
|
|
|
|
|
|
|
synonyms and definitions respectively. Note both tags have to be fully |
52
|
|
|
|
|
|
|
specified exactly as in the OWL XML to be loaded, e.g. FULL_SYN for NCI Thesaurus |
53
|
|
|
|
|
|
|
or efo:alternative_term for EFO. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head2 METHODS |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=over |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item class_count() |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Number of classes loaded by the parser. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=item synonyms_count() |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Number of synonyms loaded by the parser. |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item version() |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Version of the ontology extracted from the owl:versionInfo. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=item class |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Hash collection of all the OWL::Simple::Class objects |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=back |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head1 AUTHOR |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Tomasz Adamusiak <tomasz@cpan.org> |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Copyright (c) 2010-2011 European Bioinformatics Institute. All Rights Reserved. |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it |
86
|
|
|
|
|
|
|
under LGPLv3. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
This software is provided "as is" without warranty of any kind. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=cut |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
package OWL::Simple::Parser; |
93
|
|
|
|
|
|
|
|
94
|
1
|
|
|
1
|
|
26412
|
use Moose 0.89; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
use OWL::Simple::Class; |
96
|
|
|
|
|
|
|
use XML::Parser 2.34; |
97
|
|
|
|
|
|
|
use Data::Dumper; |
98
|
|
|
|
|
|
|
use Log::Log4perl qw(:easy); |
99
|
|
|
|
|
|
|
Log::Log4perl->easy_init( { level => $INFO, layout => '%-5p - %m%n' } ); |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
our $VERSION = 1.01; |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# --- Public attributes -----------------------------------------------------

# Path of the OWL file handed to XML::Parser; must be given to the constructor.
has owlfile => (
    is       => 'rw',
    isa      => 'Str',
    required => 1,
);

# Parsed classes, keyed by class id; starts out as an empty hash.
has class => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { {} },
);

# Running totals maintained by incr_classes() / incr_synonyms().
has class_count => (
    is      => 'rw',
    isa     => 'Int',
    default => 0,
);
has synonyms_count => (
    is      => 'rw',
    isa     => 'Int',
    default => 0,
);

# Text collected from owl:versionInfo; empty until parsed.
has version => (
    is      => 'rw',
    isa     => 'Str',
    default => '',
);

# Element names of the custom synonym and definition annotations.
has synonym_tag => (
    is      => 'rw',
    isa     => 'Str',
    default => 'efo:alternative_term',
);
has definition_tag => (
    is      => 'rw',
    isa     => 'Str',
    default => 'efo:definition',
);

# --- Parsing state ----------------------------------------------------------
# NOTE(review): these are file-scoped lexicals, not attributes, so every
# object created from this package reads and writes the same four variables.

my $parser;                                # XML::Parser, assigned in BUILD
my $path        = '';                      # '/'-joined names of the open elements
my $class       = OWL::Simple::Class->new();    # class currently being filled in
my %restriction;                           # 'type' and 'class' of the open owl:Restriction
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# Moose BUILD hook, invoked as part of object construction. Creates the
# XML::Parser and registers closures that forward its Start, End and Char
# events to this object's startElement(), endElement() and characterData()
# methods.
#
# NOTE(review): the XML::Parser is stored in the file-scoped $parser rather
# than on the object, so constructing a second object replaces the parser
# (and therefore the handlers) used by parse() on the first one - confirm
# that only one instance is in use at a time.

sub BUILD {
    my $self = shift;

    # Plain class-method call; the indirect-object form "new XML::Parser"
    # is ambiguous to the perl parser and discouraged.
    $parser = XML::Parser->new;

    $parser->setHandlers(
        Start => sub { $self->startElement(@_) },
        End   => sub { $self->endElement(@_) },
        Char  => sub { $self->characterData(@_) },
    );
}
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Increment the internal counters of parsed classes and synonyms respectively.

# Adds one to class_count. Returns whatever the class_count accessor returns
# when it is given the new value.
#
# Declared without the empty "()" prototype: this is a method that receives
# $self, and prototypes are ignored for method calls anyway.
sub incr_classes {
    my $self = shift;
    return $self->class_count( $self->class_count + 1 );
}
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Adds one to synonyms_count. Returns whatever the synonyms_count accessor
# returns when it is given the new value.
#
# Declared without the empty "()" prototype: this is a method that receives
# $self, and prototypes are ignored for method calls anyway.
sub incr_synonyms {
    my $self = shift;
    return $self->synonyms_count( $self->synonyms_count + 1 );
}
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Main entry point. Parses the file named by the owlfile attribute with
# XML::Parser; the registered handlers fill the class hash and the counters
# while the document is read. Logs a summary at INFO level and returns 1.
#
# Any exception raised by XML::Parser's parsefile() (for example on an
# unreadable or malformed file) is not caught here.

sub parse {
    my $self = shift;

    # The element path and the pending restriction live in file-scoped
    # variables. Reset them so that an earlier parse that died half-way
    # through a document cannot leave a stale path behind, which would stop
    # every path comparison in the handlers from matching.
    $path        = '';
    %restriction = ();

    $parser->parsefile( $self->owlfile );

    INFO(   "LOADED "
          . $self->class_count
          . ' CLASSES AND '
          . $self->synonyms_count
          . ' SYNONYMS from '
          . $self->owlfile );

    return 1;
}
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Start-tag handler registered with XML::Parser.
#
# Appends the element name to the file-scoped $path, which characterData()
# and endElement() compare against to decide what to do with an element.
#
# Depending on the resulting path or the element name it also:
#   * owl:Class directly under rdf:RDF - increments the class counter, logs
#     progress every 1000 classes and starts a new OWL::Simple::Class in
#     $class, taking its id from rdf:about and/or rdf:ID (rdf:ID is applied
#     last, so it wins when both are present, and a warning is logged);
#   * rdfs:subClassOf with rdf:resource, or an owl:Class nested inside
#     rdfs:subClassOf with rdf:about - records the parent on the current
#     class;
#   * owl:Restriction, owl:someValuesFrom and owl:...Property - collect the
#     relation type and target classes in %restriction for endElement().
#
# Arguments: the parser instance passed in by XML::Parser, the element name
# and the element's attributes as name/value pairs.

sub startElement {
    my ( $self, $parseinst, $element, %attr ) = @_;
    DEBUG("->startElement $self, $parseinst, $element");
    $path = $path . '/' . $element;    # add element to path

    if ( $path eq '/rdf:RDF/owl:Class' ) {
        $self->incr_classes();
        INFO( "Loaded " . $self->class_count . " classes from " . $self->owlfile )
          if $self->class_count % 1000 == 0;

        $class = OWL::Simple::Class->new();
        $class->id( $attr{'rdf:about'} ) if defined $attr{'rdf:about'};
        $class->id( $attr{'rdf:ID'} )    if defined $attr{'rdf:ID'};

        # Attribute names are case-sensitive: the key must be 'rdf:ID', the
        # same one the id is read from above. The previous 'rdf:id' key never
        # exists, so this warning could never be emitted.
        WARN( 'DUPLICATE RDF:ID & RDF:ABOUT IN ' . $attr{'rdf:about'} )
          if defined $attr{'rdf:ID'} && defined $attr{'rdf:about'};
    }

    # Two ways to match parents, either as rdf:resource attribute
    # on rdfs:subClassOf or rdf:about on nested rdfs:subClassOf/owl:Class
    elsif ( $path eq '/rdf:RDF/owl:Class/rdfs:subClassOf' ) {
        push @{ $class->subClassOf }, $attr{'rdf:resource'}
          if defined $attr{'rdf:resource'};
    }
    elsif ( $path eq '/rdf:RDF/owl:Class/rdfs:subClassOf/owl:Class' ) {
        push @{ $class->subClassOf }, $attr{'rdf:about'}
          if defined $attr{'rdf:about'};
    }

    # Here we try to match relations, e.g. part_of, derives_from, etc.
    elsif ( $element eq 'owl:Restriction' ) {

        # Start collecting a new restriction from scratch.
        $restriction{type}  = undef;
        $restriction{class} = [];
    }
    elsif ( $element eq 'owl:someValuesFrom' ) {
        push @{ $restriction{class} }, $attr{'rdf:resource'}
          if defined $attr{'rdf:resource'};
        push @{ $restriction{class} }, $attr{'rdf:about'}
          if defined $attr{'rdf:about'};
    }

    # Regex as properties can be transitive, etc.
    elsif ( $element =~ /owl:\w+Property$/ ) {

        # rdf:resource is applied last, so it wins if both are present.
        $restriction{type} = $attr{'rdf:about'} if defined $attr{'rdf:about'};
        $restriction{type} = $attr{'rdf:resource'}
          if defined $attr{'rdf:resource'};
    }
}
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# Character-data handler registered with XML::Parser.
#
# The text of a single element can arrive in several calls before its end
# tag, so every piece is appended to what has been collected so far:
#   * rdfs:label text accumulates in $class->label;
#   * definition, definition citation and synonym text accumulates in
#     $class->annotation; endElement() later moves it to the matching list
#     and clears it;
#   * owl:versionInfo text accumulates in $self->version.
# Text belonging to any other element is ignored.

sub characterData {
    my ( $self, $parseinst, $data ) = @_;
    DEBUG("->characterData $self, $parseinst, $data");

    # Appends the current chunk to the temporary annotation of the class
    # being built, treating a not-yet-set annotation as the empty string.
    my $extend_annotation = sub {
        my $collected = $class->annotation();
        $collected = '' unless defined $collected;
        return $class->annotation( $collected . $data );
    };

    # Get rdfs:label
    if ( $path eq '/rdf:RDF/owl:Class/rdfs:label' ) {
        my $label_so_far = $class->label();
        $label_so_far = '' unless defined $label_so_far;
        $class->label( $label_so_far . $data );
    }

    # Get definition_citation or definition
    elsif (
        $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*(definition|definition_citation)\w*!
        || $path eq '/rdf:RDF/owl:Class/' . $self->definition_tag
      )
    {
        $extend_annotation->();
    }

    # Get synonyms, either matching to anything with synonym or
    # alternative_term inside or custom tag from parameters
    elsif (
        $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*(synonym|alternative_term)\w*!
        || $path eq '/rdf:RDF/owl:Class/' . $self->synonym_tag
      )
    {
        $extend_annotation->();
        WARN( "Unparsable synonym detected for " . $class->id )
          unless defined $data;

        # detecting closing tag inside, NCIt fix
        # FIXME this is probably no longer necessary
        # once the synonym is concatenated, but have not checked
        #if ( $data =~ m!</! ) {
        #    ($data) = $data =~ m!>(.*?)</!; # match to first entry
        #}
    }

    # Extract version information
    elsif ( $path eq '/rdf:RDF/owl:Ontology/owl:versionInfo' ) {
        my $version_so_far = $self->version();
        $self->version( $version_so_far . $data );
    }
}
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# End-tag handler registered with XML::Parser.
#
# Acts on the element that is being closed, using the file-scoped $path
# built up by startElement():
#   * a top-level owl:Class (other than owl:Thing) is stored in the class
#     hash under its id, with a warning if that id is already present;
#   * an owl:Restriction whose type ends in /part_of has its collected
#     target classes appended to the current class's part_of list;
#   * definition citation, definition and synonym elements have the text
#     accumulated by characterData() pushed onto xrefs, definitions or
#     synonyms respectively, unless that text is empty.
# Afterwards the temporary annotation is cleared and the closed element is
# stripped from the end of $path.
#
# Arguments: the parser instance passed in by XML::Parser and the name of
# the element being closed.

sub endElement {
    my ( $self, $parseinst, $element ) = @_;
    DEBUG("->endElement $self, $parseinst, $element");

    # Text collected by characterData() for this element. Read once and
    # checked for definedness so that an unset annotation does not trigger
    # "uninitialized value" warnings in the comparisons below.
    my $annotation     = $class->annotation;
    my $has_annotation = defined $annotation && $annotation ne '';

    # Reached end of class, add the class to hash
    if (   $path eq '/rdf:RDF/owl:Class'
        && $class->id ne "http://www.w3.org/2002/07/owl#Thing" )
    {
        WARN( 'Class ' . $class->id . ' possibly duplicated' )
          if defined $self->class->{ $class->id };
        $self->class->{ $class->id } = $class;
    }

    # Reached end of the relationship tag, add to appropriate array
    # Currently supports only part_of, and even that poorly.
    # FIXME circular references
    elsif ( $element eq 'owl:Restriction' ) {
        if ( !defined $restriction{type} ) {

            # Nothing to match against; previously the regex below was still
            # applied to the undefined type.
            WARN( "UNDEFINED RESTRICTION " . $class->id );
        }
        elsif ( $restriction{type} =~ m!/part_of$! ) {
            push @{ $class->part_of }, @{ $restriction{class} || [] };
        }
    }

    # character data can be called multiple times
    # for a single element, so it's concatenated there
    # and saved here
    elsif ( $path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*definition_citation$! ) {
        push @{ $class->xrefs }, $annotation if $has_annotation;
    }
    elsif ($path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*definition$!
        || $path eq '/rdf:RDF/owl:Class/' . $self->definition_tag )
    {
        push @{ $class->definitions }, $annotation if $has_annotation;
    }
    elsif ($path =~ m!^/rdf:RDF/owl:Class/\w*:?\w*(synonym|alternative_term)\w*!
        || $path eq '/rdf:RDF/owl:Class/' . $self->synonym_tag )
    {
        # NOTE(review): the counter is incremented even when the synonym
        # text is empty and nothing is pushed - confirm this is intended.
        $self->incr_synonyms();
        push @{ $class->synonyms }, $annotation if $has_annotation;
    }

    # Diagnostic dump for a class whose annotation was never set. Sent to
    # the debug log instead of being printed to STDOUT, which would pollute
    # the output of any program using this module.
    DEBUG( Dumper($class) ) unless defined $annotation;

    # clear temp annotation
    $class->annotation('');

    # remove end element from path; \Q...\E keeps characters that are legal
    # in element names (such as '.') from being treated as regex syntax
    $path =~ s!/\Q$element\E$!!;
}
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
1; |