line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::Thesaurus::IO::LivelinkCollectionServer; |
2
|
4
|
|
|
4
|
|
2046
|
use Moose; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
20
|
|
3
|
|
|
|
|
|
|
with 'Lingua::Thesaurus::IO'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
has '_term_rev_idx' => (is => 'bare', |
6
|
|
|
|
|
|
|
documentation => "internal reverse index while parsing terms"); |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
has '_rel_types' => (is => 'ro', |
9
|
|
|
|
|
|
|
documentation => "default reltypes for Livelink Collection Server", |
10
|
|
|
|
|
|
|
default => sub { { |
11
|
|
|
|
|
|
|
# rel description inverse is_external |
12
|
|
|
|
|
|
|
# === =========== ======= =========== |
13
|
|
|
|
|
|
|
AB => ['Abbreviation' => AF => undef], |
14
|
|
|
|
|
|
|
AF => ['Abbreviation For' => AB => undef], |
15
|
|
|
|
|
|
|
EQ => ['See Also' => UF => undef], |
16
|
|
|
|
|
|
|
UF => ['Used For' => EQ => undef], |
17
|
|
|
|
|
|
|
EQA => ['See AND' => UFA => undef], |
18
|
|
|
|
|
|
|
UFA => ['Used For AND' => EQA => undef], |
19
|
|
|
|
|
|
|
BT => ['Broad Term' => NT => undef], |
20
|
|
|
|
|
|
|
NT => ['Narrow Term' => BT => undef], |
21
|
|
|
|
|
|
|
RT => ['Related Term' => RT => undef], |
22
|
|
|
|
|
|
|
SN => ['Scope Note' => undef , 1 ], |
23
|
|
|
|
|
|
|
HN => ['History Note' => undef , 1 ], |
24
|
|
|
|
|
|
|
}}); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub load { |
28
|
3
|
|
|
3
|
1
|
7
|
my $self = shift; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# files are given either as ->load($f1, $f2, ..), or |
31
|
|
|
|
|
|
|
# as ->load({$ori1 => $f1, $ori2 => $f2, ...}); |
32
|
3
|
|
|
|
|
4
|
my @files; # list of pairs [$filename, $origin] |
33
|
3
|
100
|
66
|
|
|
28
|
if (@_ == 1 && ref $_[0] eq 'HASH') { |
34
|
1
|
|
|
|
|
2
|
my $files_hash = shift; |
35
|
1
|
|
|
|
|
3
|
@files = map {[$files_hash->{$_}, $_]} keys %$files_hash; |
|
2
|
|
|
|
|
6
|
|
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
else { |
38
|
2
|
|
|
|
|
5
|
@files = map {[$_, undef]} @_; |
|
2
|
|
|
|
|
6
|
|
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# initialize storage structure |
42
|
3
|
|
|
|
|
112
|
my $storage = $self->storage; |
43
|
3
|
|
|
|
|
29
|
$storage->initialize; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# store relation types |
46
|
3
|
|
|
|
|
8
|
while (my ($rel_id, $rel_data) = each %{$self->_rel_types}) { |
|
33
|
|
|
|
|
1451
|
|
47
|
30
|
|
|
|
|
31
|
my ($descr, $is_external) = @{$rel_data}[0, 2]; |
|
30
|
|
|
|
|
73
|
|
48
|
30
|
|
|
|
|
138
|
$storage->store_rel_type($rel_id, $descr, $is_external); |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# load each file |
52
|
3
|
|
|
4
|
|
41
|
$storage->do_transaction(sub {$self->_load_file(@$_)}) foreach @files; |
|
4
|
|
|
|
|
52
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# cleanup internal reverse index |
55
|
3
|
|
|
|
|
90
|
$self->{_term_rev_idx} = {}; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub _load_file { |
60
|
4
|
|
|
4
|
|
10
|
my ($self, $file, $origin) = @_; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# lecture du fichier (force :crlf IO mode so that Win32 dumpfiles also work) |
63
|
4
|
50
|
|
|
|
296
|
open my $fh, "<:crlf", $file or die "open $file: $!"; |
64
|
|
|
|
|
|
|
|
65
|
4
|
|
|
|
|
10
|
my %term; |
66
|
|
|
|
|
|
|
my $term_count; |
67
|
0
|
|
|
|
|
0
|
my $thesaurus_name; |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
LINE: |
70
|
4
|
|
|
|
|
133
|
while (<$fh>) { |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# skip initial lines until thesaurus name declaration |
73
|
3231
|
100
|
100
|
|
|
5833
|
$thesaurus_name //= do {s/^BEGIN_REL THES_NAME=(.*)//; $1} |
|
44
|
|
|
|
|
49
|
|
|
44
|
|
|
|
|
345
|
|
74
|
|
|
|
|
|
|
or next LINE; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# stop at ending line |
77
|
3191
|
100
|
|
|
|
4780
|
last LINE if /^END_REL/; |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# unfold continuation lines |
80
|
|
|
|
|
|
|
CONTINUATION_LINE: |
81
|
3187
|
|
|
|
|
1999
|
while (1) { |
82
|
3609
|
100
|
|
|
|
5933
|
s/\+\n$/<$fh>/e or last CONTINUATION_LINE; |
|
422
|
|
|
|
|
1704
|
|
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# suppress comments |
86
|
3187
|
|
|
|
|
2754
|
s/###.*//; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# skip empty lines |
89
|
3187
|
100
|
|
|
|
5686
|
next LINE if /^\s*$/; |
90
|
|
|
|
|
|
|
|
91
|
3174
|
50
|
|
|
|
16784
|
my ($rel_id, $term_string) = ($_ =~ /^([A-Z]+)\d*\s*=\s*(.+?)\s*$/) |
92
|
|
|
|
|
|
|
or die "incorrect thesaurus syntax at $file line $.: $_\n"; |
93
|
|
|
|
|
|
|
|
94
|
3174
|
100
|
|
|
|
4481
|
if ($rel_id eq 'LT') { |
95
|
|
|
|
|
|
|
# insert last term |
96
|
1233
|
100
|
|
|
|
2932
|
$self->_insert_term(\%term) if keys %term; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# build a new term |
99
|
1233
|
|
|
|
|
11266
|
%term = (LT => $term_string, origin => $origin); |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
else { |
102
|
|
|
|
|
|
|
# store relation info into current term |
103
|
1941
|
|
|
|
|
1258
|
push @{$term{$rel_id}}, $term_string; |
|
1941
|
|
|
|
|
10891
|
|
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
# insert last term |
107
|
4
|
50
|
|
|
|
19
|
$self->_insert_term(\%term) if keys %term; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub _insert_term { |
112
|
1233
|
|
|
1233
|
|
1068
|
my ($self, $term_hash) = @_; |
113
|
1233
|
|
|
|
|
34429
|
my $storage = $self->storage; |
114
|
|
|
|
|
|
|
|
115
|
1233
|
|
|
|
|
1343
|
my $origin = delete $term_hash->{origin}; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# store the lead term |
118
|
1233
|
|
|
|
|
1170
|
my $term_string = delete $term_hash->{LT}; |
119
|
1233
|
|
100
|
|
|
8054
|
my $term_id = $self->{_term_rev_idx}{$origin || ''}{$term_string} |
|
|
|
66
|
|
|
|
|
120
|
|
|
|
|
|
|
//= $storage->store_term($term_string, $origin); |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# store each collection of relations |
123
|
1233
|
|
|
|
|
32104
|
my $rel_types = $self->_rel_types; |
124
|
1233
|
|
|
|
|
3210
|
while (my ($rel_id, $related) = each %$term_hash) { |
125
|
606
|
50
|
|
|
|
1081
|
my $rel_type = $rel_types->{$rel_id} |
126
|
|
|
|
|
|
|
or die "unknown relation type: $rel_id\n"; |
127
|
606
|
|
|
|
|
494
|
my ($inverse_id, $is_external) = @{$rel_type}[1, 2]; |
|
606
|
|
|
|
|
900
|
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# for internal relations, replace strings by ids of related terms |
130
|
606
|
100
|
|
|
|
898
|
unless ($is_external) { |
131
|
379
|
|
|
|
|
440
|
foreach my $rel (@$related) { |
132
|
1705
|
|
100
|
|
|
7939
|
$rel = $self->{_term_rev_idx}{$origin || ''}{$rel} |
|
|
|
66
|
|
|
|
|
133
|
|
|
|
|
|
|
//= $storage->store_term($rel, $origin); |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# store it |
138
|
606
|
|
|
|
|
1268
|
$storage->store_relation($term_id, $rel_id, $related, |
139
|
|
|
|
|
|
|
$is_external, $inverse_id); |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
1; |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
__END__ |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=encoding ISO8859-1 |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head1 NAME |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Lingua::Thesaurus::IO::LivelinkCollectionServer - IO class for Livelink Collection Server thesaurus files |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=head1 DESCRIPTION |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
This class implements the L<Lingua::Thesaurus::IO> role for |
157
|
|
|
|
|
|
|
files issued from the I<Livelink Collection Server> database |
158
|
|
|
|
|
|
|
(formerly known as I<Basis Plus>). Parsing is quite rudimentary |
159
|
|
|
|
|
|
|
and does not claim to comply with the full BasisPlus specification. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=head2 File syntax |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Files start with a header of shape: |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
BEGIN_LAYOUT |
166
|
|
|
|
|
|
|
FORMAT=FREE |
167
|
|
|
|
|
|
|
DATA_TERM_SEPARATOR='&&&' |
168
|
|
|
|
|
|
|
END_OF_DATA_STATEMENT='@@@' |
169
|
|
|
|
|
|
|
DATA_QUAL_SEPARATOR='###' |
170
|
|
|
|
|
|
|
END_LAYOUT |
171
|
|
|
|
|
|
|
ACTION_CODE=A |
172
|
|
|
|
|
|
|
<<< |
173
|
|
|
|
|
|
|
<<< Thesaurus dump in FREE format. |
174
|
|
|
|
|
|
|
<<< |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
which is completely ignored. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Actual data starts with |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
BEGIN_REL THES_NAME=<thesaurus_name> |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
and ends with |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
END_REL. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
All data lines are of shape |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
<relation_name> = <target> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
If the last character on the line is '+', then this indicates that |
191
|
|
|
|
|
|
|
the next line is a continuation line, which should be concatenated |
192
|
|
|
|
|
|
|
to the current line. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
The relation name 'LT' (for I<Lead Term>) introduces a new term. |
195
|
|
|
|
|
|
|
The following lines are relations for this term, until the next LT. |
196
|
|
|
|
|
|
|
Relations may be : |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# rel description reverse is_external |
199
|
|
|
|
|
|
|
# === =========== ======= =========== |
200
|
|
|
|
|
|
|
[AB => 'Abbreviation' => AF => undef], |
201
|
|
|
|
|
|
|
[AF => 'Abbreviation For' => AB => undef], |
202
|
|
|
|
|
|
|
[EQ => 'See Also' => UF => undef], |
203
|
|
|
|
|
|
|
[UF => 'Used For' => EQ => undef], |
204
|
|
|
|
|
|
|
[EQA => 'See AND' => UFA => undef], |
205
|
|
|
|
|
|
|
[UFA => 'Used For AND' => EQA => undef], |
206
|
|
|
|
|
|
|
[BT => 'Broad Term' => NT => undef], |
207
|
|
|
|
|
|
|
[NT => 'Narrow Term' => BT => undef], |
208
|
|
|
|
|
|
|
[RT => 'Related Term' => RT => undef], |
209
|
|
|
|
|
|
|
[SN => 'Scope Note' => undef , 1 ], |
210
|
|
|
|
|
|
|
[HN => 'History Note' => undef , 1 ], |
211
|
|
|
|
|
|
|
); |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head1 METHODS |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 load |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Loading a thesaurus file in LivelinkCollectionServer format. |