line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::YaTeA; |
2
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
220259
|
use strict; |
|
5
|
|
|
|
|
29
|
|
|
5
|
|
|
|
|
171
|
|
4
|
5
|
|
|
5
|
|
30
|
use warnings; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
145
|
|
5
|
5
|
|
|
5
|
|
3081
|
use utf8; |
|
5
|
|
|
|
|
79
|
|
|
5
|
|
|
|
|
30
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=encoding utf8 |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 NAME |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
Lingua::YaTeA - Perl extension for extracting terms from a corpus and providing a syntactic analysis in a head-modifier format. |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 SYNOPSIS |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use Lingua::YaTeA; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my %config = Lingua::YaTeA::load_config($rcfile); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
$yatea = Lingua::YaTeA->new($config{"OPTIONS"}, \%config); |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
$corpus = Lingua::YaTeA::Corpus->new($corpus_path,$yatea->getOptionSet,$yatea->getMessageSet); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
$yatea->termExtraction($corpus); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 DESCRIPTION |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This module is the main module of the software named YaTeA. It aims at |
29
|
|
|
|
|
|
|
extracting noun phrases that look like terms from a corpus. It |
30
|
|
|
|
|
|
|
provides their syntactic analysis in a head-modifier representation. |
31
|
|
|
|
|
|
|
As an input, the term extractor requires a corpus which has been |
32
|
|
|
|
|
|
|
segmented into words and sentences, lemmatized and tagged with |
33
|
|
|
|
|
|
|
part-of-speech (POS) information. The input file is encoded in |
34
|
|
|
|
|
|
|
UTF-8. The implementation of this term extractor allows to process |
35
|
|
|
|
|
|
|
large corpora. Data provided with YaTeA allow to extract terms from |
36
|
|
|
|
|
|
|
English and French texts. But new linguistic features can be |
37
|
|
|
|
|
|
|
integrated to extract terms from another language. Moreover, |
38
|
|
|
|
|
|
|
linguistic features can be modified or created for a sub-language or |
39
|
|
|
|
|
|
|
tagset. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
For the use of YaTeA, see the documentation with the script C<yatea>. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
The main strategy of analysis of the term candidates is based on the |
44
|
|
|
|
|
|
|
exploitation of simple parsing patterns and endogenous |
45
|
|
|
|
|
|
|
disambiguation. Exogenous disambiguation is also made possible for the |
46
|
|
|
|
|
|
|
identification and the analysis of term candidates by the use of |
47
|
|
|
|
|
|
|
external resources, I<i.e.> lists of testified terms. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head2 ANALYSIS: ENDOGENOUS AND EXOGENOUS DISAMBIGUATION |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Endogenous disambiguation consists in the exploitation of intermediate |
52
|
|
|
|
|
|
|
chunking and parsing results for the parsing of a given Maximal Noun |
53
|
|
|
|
|
|
|
Phrase (MNP). This feature allows the parse of complex noun phrases |
54
|
|
|
|
|
|
|
using a limited number of simple parsing patterns (80 patterns |
55
|
|
|
|
|
|
|
containing a maximum of 3 content words in the experiments described |
56
|
|
|
|
|
|
|
below). All the MNPs corresponding to parsing patterns are parsed |
57
|
|
|
|
|
|
|
first. In a second step, remaining unparsed MNPs are processed using |
58
|
|
|
|
|
|
|
the results of the first step as I<islands of reliability>. An |
59
|
|
|
|
|
|
|
I<island of reliability> is a subsequence (contiguous or not) of a MNP |
60
|
|
|
|
|
|
|
that corresponds to a shorter term candidate that was parsed during |
61
|
|
|
|
|
|
|
the first step of the parsing process. This subsequence along with its |
62
|
|
|
|
|
|
|
internal analysis is used as an anchor in the parsing of the |
63
|
|
|
|
|
|
|
MNP. Islands are used to simplify the POS sequence of the MNP for |
64
|
|
|
|
|
|
|
which no parsing pattern was found. The subsequence covered by the |
65
|
|
|
|
|
|
|
island is reduced to its syntactic head. In addition, islands increase |
66
|
|
|
|
|
|
|
the degree of reliability of the parse. When no resource is provided |
67
|
|
|
|
|
|
|
and as there is no parsing pattern defined for the complete POS |
68
|
|
|
|
|
|
|
sequence "NN NN NN of NN" corresponding to the term candidate |
69
|
|
|
|
|
|
|
"Northern blot analysis of cwlH", the progressive method is |
70
|
|
|
|
|
|
|
applied. In such a case, the TC is bracketed from the right to the |
71
|
|
|
|
|
|
|
left, which results in a poor quality analysis. When considering the |
72
|
|
|
|
|
|
|
island of reliability "northern blot analysis", the correct bracketing |
73
|
|
|
|
|
|
|
is found. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=head1 METHODS |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head2 load_config() |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
load_config($rcfile); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
The method loads the configuration of the NLP Platform by reading the |
83
|
|
|
|
|
|
|
configuration file given in argument. It returns the hashtable |
84
|
|
|
|
|
|
|
containing the configuration. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head2 new() |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
new($command_line_options_h,$system_config_h); |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The method creates a new term extractor and sets options from the |
91
|
|
|
|
|
|
|
command line (C<$command_line_options_h>) and options defined in the |
92
|
|
|
|
|
|
|
hashtable (C<$system_config_h>) given by reference. The method returns |
93
|
|
|
|
|
|
|
the created object. |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=head2 termExtraction() |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
termExtraction($corpus); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
This method applies an extraction process on the corpus C<$corpus> |
100
|
|
|
|
|
|
|
given as parameter, and stores results in the directories specified in |
101
|
|
|
|
|
|
|
the configuration files. |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 setOptions() |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
setOptions($command_line_options_h); |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
This method creates an option set. It sets the options defined in the |
109
|
|
|
|
|
|
|
hashtable C<$command_line_options_h> (given by reference) and checks |
110
|
|
|
|
|
|
|
if the C<language> parameter is defined in the configuration. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 setConfigFiles() |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
setConfigFiles($this,$system_config_h); |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head2 setLocaleFiles() |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
setLocaleFiles($this,$system_config_h); |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head2 addOptionsFromFile() |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
addOptionsFromFile($this); |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head2 setMessageSet() |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
setMessageSet($this,$system_config_h); |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 setTagSet() |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
setTagSet($this); |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head2 setParsingPatterns() |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
setParsingPatterns($this); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 setChunkingDataSet() |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
setChunkingDataSet($this); |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=head2 setForbiddenStructureSet() |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
setForbiddenStructureSet($this); |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head2 loadTestifiedTerms() |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
loadTestifiedTerms($this,$process_counter_r,$corpus,$sentence_boundary,$document_boundary,$match_type,$message_set,$display_language); |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 setTestifiedTermSet() |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
setTestifiedTermSet($this,$filtering_lexicon_h,$sentence_boundary,$match_type); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head2 getTestifiedTermSet() |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
getTestifiedTermSet($this); |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 getFSSet() |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
getFSSet($this); |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head2 getConfigFileSet |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
getConfigFileSet($this); |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head2 getLocaleFileSet() |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
getLocaleFileSet($this); |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head2 getResultFileSet() |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
getResultFileSet($this); |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head2 getOptionSet() |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
getOptionSet($this); |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
This method returns the field C<OPTION_SET>. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head2 getTagSet() |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
getTagSet($this); |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head2 getChunkingDataSet() |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
getChunkingDataSet($this); |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head2 getParsingPatternSet() |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
getParsingPatternSet($this); |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=head2 getMessageSet() |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
getMessageSet($this); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=head2 getTestifiedSet() |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
getTestifiedSet($this); |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=head2 addMessageSetFile() |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
addMessageSetFile($this); |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 displayExtractionResults() |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
displayExtractionResults($this,$phrase_set,$corpus,$message_set,$display_language,$default_output); |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 CONFIGURATION |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
The configuration file of YaTeA is divided into two sections: |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=over |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=item * Section C<DefaultConfig> |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=over |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=item * |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
C<CONFIG_DIR> : directory containing the configuration files according to the language |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=item * |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
C<LOCALE_DIR> : directory containing the environment files according to the language |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=item * |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
C<RESULT_DIR> : directory where the results are stored (probably not useful) |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=back |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=item * Section C<OPTIONS> |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=over |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=item * |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
C I : Definition of the language of the |
272
|
|
|
|
|
|
|
corpus. Values are either C (French - TreeTagger output - TagSet |
273
|
|
|
|
|
|
|
), |
274
|
|
|
|
|
|
|
C (French - output of Flemm analyser or C (English - |
275
|
|
|
|
|
|
|
TreeTagger or GeniaTagger output - PennTreeBank Tagset) |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=item * |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
C I : Specification of a name for the current version |
281
|
|
|
|
|
|
|
of the analysis. Results are gathered in a specific directory of this |
282
|
|
|
|
|
|
|
name and result files also carry this suffix |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=item * |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
C : set the path to the directory that will contain the |
287
|
|
|
|
|
|
|
results for the current corpus (default: working directory) |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=item * |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
C<termino> I<file> : Name of a file containing a list of testified |
292
|
|
|
|
|
|
|
terms. The testified terms have to be provided in the TreeTagger output |
293
|
|
|
|
|
|
|
format. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=item * |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
C : all occurrences of monolexical phrases |
298
|
|
|
|
|
|
|
are considered as term candidates. The value is 0 or 1. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=item * |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
C : occurrences of monolexical term |
303
|
|
|
|
|
|
|
candidates that appear in complex term candidates are also displayed. The value is 0 or 1. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=item * |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
C [loose or strict] : |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=over |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=item * |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
C : testified terms match either inflected or lemmatized forms of each word |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=item * |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
C : testified terms match the combination of inflected form and POS tag of each word |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=item * |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
unspecified option: testified terms match inflected forms of words |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=back |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=item * |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
C<xmlout> : display of the parsed term candidates in XML format. The |
328
|
|
|
|
|
|
|
value is 0 or 1. |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
=item * |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
C<termList> : display of a list of terms and sub-terms along with |
333
|
|
|
|
|
|
|
their frequency. To display only term candidates containing more than |
334
|
|
|
|
|
|
|
one word (multi-word term candidates), specify the value C. |
335
|
|
|
|
|
|
|
All term candidates will be displayed , monolexical and multi-word |
336
|
|
|
|
|
|
|
term candidates with the value C, or if any value is specified. |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
=item * |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
C<printChunking> : display of the corpus marked with phrases in an |
341
|
|
|
|
|
|
|
HTML file along with the indication that they are term candidates or |
342
|
|
|
|
|
|
|
not. The value is 0 or 1. |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
=item * |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
C<TC-for-BioLG> : annotation of the corpus with term candidates in an |
347
|
|
|
|
|
|
|
XML format compatible with the BioLG software. The value is 0 or 1. |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
=item * |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
C<TT-for-BioLG> : annotation of the corpus with testified terms in an |
352
|
|
|
|
|
|
|
XML format compatible with the BioLG software. The value is 0 or 1. |
353
|
|
|
|
|
|
|
(http://www.it.utu.fi/biolg/, biological tuned version of the Link |
354
|
|
|
|
|
|
|
Grammar Parser) |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=item * |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
C : creation of a BioLG compatible XML version |
359
|
|
|
|
|
|
|
of the corpus with PoS tags marked for each word. The value is 0 or 1. |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=item * |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
C<debug> : displays information on parsed phrases (i.e. term |
364
|
|
|
|
|
|
|
candidates) in a text format. The value is 0 or 1. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=item * |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
C<annotate-only> : only annotate testified terms (no acquisition). The |
370
|
|
|
|
|
|
|
value is 0 or 1. |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=item * |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
C<TTG-style-term-candidates> : term candidates are displayed in |
375
|
|
|
|
|
|
|
TreeTagger output format. Term separator is the sentence boundary tag |
376
|
|
|
|
|
|
|
C. To extract only term candidates containing more than one |
377
|
|
|
|
|
|
|
word (multi-word term candidates), specify the option C. |
378
|
|
|
|
|
|
|
All term candidates will be displayed , monolexical and multi-word |
379
|
|
|
|
|
|
|
term candidates with the value C, or if any value is specified. |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=back |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=back |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=over |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=item * |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Charlotte Roze has defined the configuration files to process a corpus |
392
|
|
|
|
|
|
|
tagged with Flemm |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=item * |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
Wiktoria Golik, Robert Bossy and Claire Nédellec (MIG/INRA) have |
397
|
|
|
|
|
|
|
corrected bugs and improved the mapping of testified terms. |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=back |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=head1 SEE ALSO |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Sophie Aubin and Thierry Hamon. Improving Term Extraction with |
405
|
|
|
|
|
|
|
Terminological Resources. In Advances in Natural Language Processing |
406
|
|
|
|
|
|
|
(5th International Conference on NLP, FinTAL 2006). pages |
407
|
|
|
|
|
|
|
380-387. Tapio Salakoski, Filip Ginter, Sampo Pyysalo, Tapio Pahikkala |
408
|
|
|
|
|
|
|
(Eds). August 2006. LNAI 4139. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head1 AUTHORS |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
Thierry Hamon and Sophie Aubin |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head1 LICENSE |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Copyright (C) 2005 by Thierry Hamon and Sophie Aubin |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
419
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.8.6 or, |
420
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=cut |
424
|
|
|
|
|
|
|
|
425
|
5
|
|
|
5
|
|
2911
|
use Data::Dumper; |
|
5
|
|
|
|
|
27618
|
|
|
5
|
|
|
|
|
365
|
|
426
|
5
|
|
|
5
|
|
2074
|
use Lingua::YaTeA::ParsingPatternRecordSet; |
|
5
|
|
|
|
|
17
|
|
|
5
|
|
|
|
|
62
|
|
427
|
5
|
|
|
5
|
|
2426
|
use Lingua::YaTeA::OptionSet; |
|
5
|
|
|
|
|
24
|
|
|
5
|
|
|
|
|
59
|
|
428
|
5
|
|
|
5
|
|
2131
|
use Lingua::YaTeA::Option; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
59
|
|
429
|
5
|
|
|
5
|
|
2084
|
use Lingua::YaTeA::FileSet; |
|
5
|
|
|
|
|
15
|
|
|
5
|
|
|
|
|
52
|
|
430
|
5
|
|
|
5
|
|
2162
|
use Lingua::YaTeA::MessageSet; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
56
|
|
431
|
5
|
|
|
5
|
|
2099
|
use Lingua::YaTeA::TagSet; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
56
|
|
432
|
5
|
|
|
5
|
|
1786
|
use Lingua::YaTeA::ChunkingDataSet; |
|
5
|
|
|
|
|
15
|
|
|
5
|
|
|
|
|
49
|
|
433
|
5
|
|
|
5
|
|
2171
|
use Lingua::YaTeA::ForbiddenStructureSet; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
55
|
|
434
|
5
|
|
|
5
|
|
3013
|
use Lingua::YaTeA::PhraseSet; |
|
5
|
|
|
|
|
23
|
|
|
5
|
|
|
|
|
69
|
|
435
|
5
|
|
|
5
|
|
2478
|
use Lingua::YaTeA::TestifiedTermSet; |
|
5
|
|
|
|
|
17
|
|
|
5
|
|
|
|
|
68
|
|
436
|
|
|
|
|
|
|
|
437
|
5
|
|
|
5
|
|
4035
|
use Config::General; |
|
5
|
|
|
|
|
129233
|
|
|
5
|
|
|
|
|
17556
|
|
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
our $VERSION='0.624'; |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
our $process_counter = 1; |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
# Load the YaTeA configuration file.
#
# Called as a plain function (Lingua::YaTeA::load_config($rcfile)), not as
# a method.
#
# Arguments: $rcfile - path of the configuration file; when it is undefined
#            or empty, "/usr/etc/yatea/yatea.rc" is used instead.
# Returns:   the configuration as a hash (key/value list), exactly as
#            returned by Config::General's getall().
sub load_config
{
    my ($rcfile) = @_;

    # Read the configuration file; fall back to the system-wide one.
    if ((! defined $rcfile) || ($rcfile eq "")) {
        $rcfile = "/usr/etc/yatea/yatea.rc";
    }

    # Direct class-method call instead of the indirect object form
    # ("new Config::General(...)"), whose parsing is ambiguous.
    # Variable and environment interpolation are both switched on.
    my $conf = Config::General->new('-ConfigFile'      => $rcfile,
                                    '-InterPolateVars' => 1,
                                    '-InterPolateEnv'  => 1,
                                   );

    my %config = $conf->getall;
    # `mkdir -p $config{'ALVISTMP'}`; # to put in a specific method
    return(%config);
}
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# Constructor: create a term extractor and load its resources.
#
# Arguments: $class                  - class to bless the object into
#            $command_line_options_h - reference to the hash of options
#            $system_config_h        - reference to the configuration hash
# Returns:   the new object.
sub new {
    my ($class, $command_line_options_h, $system_config_h) = @_;

    my $this = bless {}, $class;

    # Every component slot exists from the start with an undefined value;
    # the calls below fill them in (TESTIFIED_SET is left undefined here).
    for my $slot (qw(
        OPTION_SET CONFIG_FILE_SET LOCALE_FILE_SET MESSAGE_SET TAG_SET
        PARSING_PATTERN_SET CHUNKING_DATA_SET FS_SET TESTIFIED_SET
    )) {
        $this->{$slot} = undef;
    }

    # Options and file sets first: the later steps read them.
    $this->setOptions($command_line_options_h);
    $this->setConfigFiles($system_config_h);
    $this->addOptionsFromFile;
    $this->setLocaleFiles($system_config_h);
    $this->setMessageSet;

    my $option_set = $this->getOptionSet;
    $option_set->handleOptionDependencies($this->getMessageSet);
    $this->getOptionSet->setDefaultOutputPath;

    # Linguistic resources.
    $this->setTagSet;
    $this->setParsingPatterns;
    $this->setChunkingDataSet;
    $this->setForbiddenStructureSet;

    return $this;
}
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
# Run the complete term extraction process on $corpus.
#
# Steps, in order: load the testified terms, read the corpus, chunk it into
# phrases, print the phrases to the 'unparsed' output file and, unless the
# 'annotate-only' option has a non-zero value, parse the phrases
# progressively and build the term candidates when a requested output needs
# them.  The results are then written by displayExtractionResults().
# Progress messages are printed on STDERR and increment the package
# variable $process_counter.
#
# Arguments: $corpus - the corpus object to process.
# Returns:   0.
# Dies:      when the 'debug' or 'unparsed' output file cannot be opened.
sub termExtraction
{
    my ($this,$corpus) = @_;

    # FileHandle is used below but is not among the modules this file
    # visibly loads; requiring it is a no-op when it is already loaded.
    require FileHandle;

    my $option_set = $this->getOptionSet;
    my $message_set = $this->getMessageSet;
    my $display_language = $option_set->getDisplayLanguage;
    my $sentence_boundary = $option_set->getSentenceBoundary;
    my $document_boundary = $option_set->getDocumentBoundary;

    # Path and mode are given separately so that characters of the path are
    # never interpreted as part of the open mode.
    my $debug_path = $corpus->getOutputFileSet->getFile('debug')->getPath;
    my $debug_fh = FileHandle->new($debug_path, ">")
        or die "Cannot open debug file '$debug_path' for writing: $!";
    binmode($debug_fh, ":utf8");

    $this->loadTestifiedTerms(\$process_counter,$corpus,$sentence_boundary,$document_boundary,$option_set->MatchTypeValue,$message_set,$display_language);

    print STDERR $process_counter++ . ") " . ($message_set->getMessage('LOAD_CORPUS')->getContent($display_language)) . "\n";

    # warn "Language: " . $this->getOptionSet->getLanguage . "\n";

    $corpus->read($sentence_boundary,$document_boundary,$this->getFSSet,$this->getTestifiedTermSet,$option_set->MatchTypeValue,$message_set,$display_language,$option_set->getLanguage,$debug_fh);

    my $phrase_set = Lingua::YaTeA::PhraseSet->new;

    print STDERR $process_counter++ . ") " . ($message_set->getMessage('CHUNKING')->getContent($display_language)) . "\n";
    $corpus->chunk($phrase_set,$sentence_boundary,$document_boundary,$this->getChunkingDataSet,$this->getFSSet,$this->getTagSet,$this->getParsingPatternSet,$this->getTestifiedTermSet,$option_set,$debug_fh);

    my $unparsed_path = $corpus->getOutputFileSet->getFile('unparsed')->getPath;
    my $fh = FileHandle->new($unparsed_path, ">")
        or die "Cannot open file '$unparsed_path' for writing: $!";
    binmode($fh, ":utf8");
    $phrase_set->printPhrases($fh);

    # print STDERR Dumper($phrase_set);

    $phrase_set->printChunkingStatistics($message_set,$display_language);

    my $annotate_only = $option_set->getOption('annotate-only');
    if ((! defined $annotate_only) || ($annotate_only->getValue() == 0))
    {
        $phrase_set->sortUnparsed;

        print STDERR $process_counter++ . ") " . ($message_set->getMessage('PARSING')->getContent($display_language)) . "\n";

        $phrase_set->parseProgressively($this->getTagSet,$option_set->getParsingDirection,$this->getParsingPatternSet,$this->getChunkingDataSet,$corpus->getLexicon,$corpus->getSentenceSet,$message_set,$display_language,$debug_fh);

        $phrase_set->printParsingStatistics($message_set,$display_language);

        # True when the named option exists and its VALUE is 1.  The option
        # object itself must not be compared to 1: that compares a
        # reference, never the option value.
        my $flag_is_on = sub {
            my ($name) = @_;
            my $option = $option_set->getOption($name);
            return (defined $option) && ($option->getValue() == 1);
        };

        # True when the named option exists at all.  These options were
        # previously tested with (option object ne ""), which holds for any
        # defined object, so presence alone is what has always triggered
        # them; that behaviour is kept.
        my $is_given = sub {
            my ($name) = @_;
            return defined $option_set->getOption($name);
        };

        # Term candidates are only built when some output needs them.
        my $needs_term_candidates =
            $flag_is_on->('xmlout')
            || $is_given->('termList')
            || $flag_is_on->('printChunking')
            || $flag_is_on->('TC-for-BioLG')
            || $is_given->('TTG-style-term-candidates')
            || ($option_set->getDefaultOutput == 1);

        if ($needs_term_candidates)
        {
            $phrase_set->addTermCandidates($option_set);
            $corpus->makeDDW($phrase_set->getTermCandidates,$debug_fh);
        }
    }

    print STDERR $process_counter++ . ") " . ($message_set->getMessage('RESULTS')->getContent($display_language)) . "\n";

    $this->displayExtractionResults($phrase_set,$corpus,$message_set,$display_language,$option_set->getDefaultOutput,$debug_fh);
    return(0);
}
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# Create the option set (field OPTION_SET), add the options of the hash
# referenced by $command_line_options_h, then check the compulsory
# "language" option.
sub setOptions {
    my ($this, $command_line_options_h) = @_;

    $this->{OPTION_SET} = Lingua::YaTeA::OptionSet->new;

    my $option_set = $this->getOptionSet;
    # "EN" is passed as a fixed third argument (presumably the display
    # language - TODO confirm against OptionSet::addOptionSet).
    $option_set->addOptionSet($command_line_options_h, $this->getMessageSet, "EN");
    $option_set->checkCompulsory("language");
}
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# Build the set of configuration files (field CONFIG_FILE_SET).
#
# The repository is "<CONFIG_DIR>/<language>", where CONFIG_DIR comes from
# the 'DefaultConfig' section of the hash referenced by $system_config_h
# and the language from the option set.
sub setConfigFiles {
    my ($this, $system_config_h) = @_;

    my $language   = $this->getOptionSet->getLanguage;
    # print STDERR Dumper(%$system_config_h);
    my $config_dir = $system_config_h->{'DefaultConfig'}->{CONFIG_DIR};
    my $repository = $config_dir . "/" . $language;

    $this->{CONFIG_FILE_SET} = Lingua::YaTeA::FileSet->new($repository);

    my $file_set = $this->getConfigFileSet;
    $file_set->checkRepositoryExists;

    # Names of the files registered from the repository.
    my @file_names = qw(
        Options
        ForbiddenStructures
        ChunkingFrontiers
        ChunkingExceptions
        CleaningFrontiers
        CleaningExceptions
        ParsingPatterns
        TagSet
        LGPmapping
    );
    $file_set->addFiles($file_set->getRepository, \@file_names);
}
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# Build the set of locale files (field LOCALE_FILE_SET) from the LOCALE_DIR
# entry of the 'DefaultConfig' section, then register the message file
# through addMessageSetFile().
sub setLocaleFiles {
    my ($this, $system_config_h) = @_;

    my $locale_dir = $system_config_h->{'DefaultConfig'}->{LOCALE_DIR};

    # The repository path always ends with a slash.
    $this->{LOCALE_FILE_SET} = Lingua::YaTeA::FileSet->new($locale_dir . "/");
    $this->getLocaleFileSet->checkRepositoryExists;
    $this->addMessageSetFile;
}
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
# Complete the option set with the options read from the "Options" file of
# the configuration file set.
sub addOptionsFromFile {
    my $this = shift;

    $this->getOptionSet->readFromFile(
        $this->getConfigFileSet->getFile("Options")
    );
}
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
# Create the message set (field MESSAGE_SET) from the "Messages" file of the
# locale file set and the display language of the option set.
#
# $system_config_h is accepted but not used.
sub setMessageSet {
    my ($this, $system_config_h) = @_;

    $this->{MESSAGE_SET} = Lingua::YaTeA::MessageSet->new(
        $this->getLocaleFileSet->getFile("Messages"),
        $this->getOptionSet->getDisplayLanguage
    );
}
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
# Create the tag set (field TAG_SET) from the "TagSet" configuration file.
sub setTagSet {
    my $this = shift;

    my $tagset_path = $this->getConfigFileSet->getFile("TagSet")->getPath;
    $this->{TAG_SET} = Lingua::YaTeA::TagSet->new($tagset_path);
    # print STDERR "Tagset loaded\n"
}
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
# Create the parsing pattern set (field PARSING_PATTERN_SET) from the
# "ParsingPatterns" configuration file, the tag set, the message set and
# the display language.
sub setParsingPatterns {
    my $this = shift;

    my $patterns_path = $this->getConfigFileSet->getFile("ParsingPatterns")->getPath;
    $this->{PARSING_PATTERN_SET} = Lingua::YaTeA::ParsingPatternRecordSet->new(
        $patterns_path,
        $this->getTagSet,
        $this->getMessageSet,
        $this->getOptionSet->getDisplayLanguage
    );
    # print STDERR "Parsing Patterns loaded\n";
}
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
# Create the chunking data set (field CHUNKING_DATA_SET) from the
# configuration file set.
sub setChunkingDataSet {
    my $this = shift;

    $this->{CHUNKING_DATA_SET} = Lingua::YaTeA::ChunkingDataSet->new(
        $this->getConfigFileSet
    );
    # print STDERR "Chunking Data loaded\n";
}
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
# Create the forbidden structure set (field FS_SET) from the
# "ForbiddenStructures" configuration file.
sub setForbiddenStructureSet {
    my $this = shift;

    my $fs_path = $this->getConfigFileSet->getFile("ForbiddenStructures")->getPath;
    $this->{FS_SET} = Lingua::YaTeA::ForbiddenStructureSet->new($fs_path);
    # print STDERR "Forbidden Structures loaded\n"
}
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
# Load the testified terms, or install an empty set when none are given.
#
# Arguments: $process_counter_r  - reference to the step counter printed in
#                                  the progress message (incremented here)
#            $corpus             - corpus used to pre-load the lexicon
#            $sentence_boundary, $document_boundary, $match_type
#                                - forwarded to the corpus and term set
#            $message_set, $display_language
#                                - used for the messages printed on STDERR
sub loadTestifiedTerms {
    my ($this, $process_counter_r, $corpus, $sentence_boundary,
        $document_boundary, $match_type, $message_set, $display_language) = @_;

    # Without the 'termino' option an empty set of testified terms is
    # created.  (A disabled comment in the original notes that TTforLGp can
    # build a BioLG-compatible XML corpus even without testified terms.)
    unless ($this->getOptionSet->optionExists('termino')) {
        return $this->{TESTIFIED_SET} = Lingua::YaTeA::TestifiedTermSet->new;
    }

    my $step = $$process_counter_r++;
    print STDERR "\n" . $step . ") " . $message_set->getMessage('LOADING_TESTIFIED')->getContent($display_language) . "\n";

    # The lexicon pre-loaded from the corpus is handed to the term set.
    my $filtering_lexicon_h = $corpus->preLoadLexicon($sentence_boundary, $document_boundary, $match_type);
    $this->setTestifiedTermSet($filtering_lexicon_h, $sentence_boundary, $match_type);

    my $loaded_message = $message_set->getMessage('TESTIFIED_LOADED')->getContent($display_language);
    print STDERR "\t" . $Lingua::YaTeA::TestifiedTerm::id . $loaded_message . "\n";

    $this->getTestifiedTermSet->changeKeyToID;
}
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
# Replaces the TESTIFIED_SET slot with a new, empty
# Lingua::YaTeA::TestifiedTermSet, then calls addSubset on it with the value
# of the 'termino' option.
#
# Arguments:
#   $filtering_lexicon_h, $sentence_boundary, $match_type
#       - forwarded unchanged to addSubset, followed by the tag set.
#
# Returns whatever addSubset returns.
sub setTestifiedTermSet
{
    my ($this, $filtering_lexicon_h, $sentence_boundary, $match_type) = @_;

    $this->{TESTIFIED_SET} = Lingua::YaTeA::TestifiedTermSet->new;

    # A single value is read from the option (modified by Thierry Hamon
    # 05/02/2007); the former loop over a list of paths is disabled.
    my $file_path = $this->getOptionSet->getOption('termino')->getValue;

    return $this->getTestifiedSet->addSubset(
        $file_path,
        $filtering_lexicon_h,
        $sentence_boundary,
        $match_type,
        $this->getTagSet
    );
}
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# Accessor: returns the value stored in the TESTIFIED_SET slot of the object
# (undef when the slot has not been filled).
sub getTestifiedTermSet
{
    my $this = shift;

    return $this->{TESTIFIED_SET};
}
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
# Accessor: returns the value stored in the FS_SET slot of the object
# (undef when the slot has not been filled).
sub getFSSet
{
    my $this = shift;

    return $this->{FS_SET};
}
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
# Accessor: returns the value stored in the CONFIG_FILE_SET slot of the
# object (undef when the slot has not been filled).
sub getConfigFileSet
{
    my $this = shift;

    return $this->{CONFIG_FILE_SET};
}
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# Accessor: returns the value stored in the LOCALE_FILE_SET slot of the
# object (undef when the slot has not been filled).
sub getLocaleFileSet
{
    my $this = shift;

    return $this->{LOCALE_FILE_SET};
}
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
# Accessor: returns the value stored in the RESULT_FILE_SET slot of the
# object (undef when the slot has not been filled).
sub getResultFileSet
{
    my $this = shift;

    return $this->{RESULT_FILE_SET};
}
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
# Accessor: returns the value stored in the OPTION_SET slot of the object
# (undef when the slot has not been filled).
sub getOptionSet
{
    my $this = shift;

    return $this->{OPTION_SET};
}
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# Accessor: returns the value stored in the TAG_SET slot of the object
# (undef when the slot has not been filled).
sub getTagSet
{
    my $this = shift;

    return $this->{TAG_SET};
}
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
# Accessor: returns the value stored in the CHUNKING_DATA_SET slot of the
# object (undef when the slot has not been filled).
sub getChunkingDataSet
{
    my $this = shift;

    return $this->{CHUNKING_DATA_SET};
}
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# Accessor: returns the value stored in the PARSING_PATTERN_SET slot of the
# object (undef when the slot has not been filled).
sub getParsingPatternSet
{
    my $this = shift;

    return $this->{PARSING_PATTERN_SET};
}
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
# Accessor: returns the value stored in the MESSAGE_SET slot of the object
# (undef when the slot has not been filled).
sub getMessageSet
{
    my $this = shift;

    return $this->{MESSAGE_SET};
}
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# Accessor: returns the value stored in the TESTIFIED_SET slot of the object
# (undef when the slot has not been filled).
sub getTestifiedSet
{
    my $this = shift;

    return $this->{TESTIFIED_SET};
}
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
# Registers the 'Messages' file in the locale file set.
#
# The directory handed to addFile is the locale repository with a language
# name appended. That language is the option set's getLanguage value by
# default; if the language of message display is different from that of the
# processed text (option 'MESSAGE_DISPLAY' exists and getDisplayLanguage
# differs from getLanguage), the Messages file is searched in the sister
# repository named after the display language.
#
# Returns whatever addFile returns.
sub addMessageSetFile
{
    my $this = shift;

    my $locale_files = $this->getLocaleFileSet;
    my $options      = $this->getOptionSet;

    my $repository       = $locale_files->getRepository;
    my $display_language = $options->getLanguage;    # default

    my $display_differs =
           $options->optionExists('MESSAGE_DISPLAY')
        && ($options->getDisplayLanguage ne $options->getLanguage);

    if ($display_differs)
    {
        $display_language = $options->getDisplayLanguage;
    }

    $repository .= $display_language;

    return $locale_files->addFile($repository, 'Messages');
}
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
# Produces the outputs of a term-extraction run. For every output option that
# is switched on, the target file is announced on STDERR and the printing is
# delegated to $phrase_set or $corpus.
#
# Arguments:
#   $phrase_set       - object receiving the printPhrases, printUnparsable,
#                       printTermCandidatesXML, printTermList,
#                       printTermCandidatesTTG and printBootstrapList calls
#   $corpus           - object providing getOutputFileSet, getName and the
#                       corpus-level print* methods
#   $message_set      - accepted but not read here (messages are taken from
#                       $this->getMessageSet)
#   $display_language - accepted but not read here (the language is taken
#                       from the option set's getDisplayLanguage)
#   $default_output   - when equal to 1, the 'candidates' output is produced
#                       even if 'xmlout' is not switched on
#   $debug_fh         - handed to printPhrases when 'debug' is switched on
#
# Always returns 0.
sub displayExtractionResults
{
    my ($this, $phrase_set, $corpus, $message_set, $display_language, $default_output, $debug_fh) = @_;

    my $options  = $this->getOptionSet;
    my $messages = $this->getMessageSet;

    # True when the named option is defined and its value equals 1.
    my $is_switched_on = sub {
        my ($option_name) = @_;
        return (defined $options->getOption($option_name))
            && ($options->getOption($option_name)->getValue() == 1);
    };

    # Prints on STDERR the message $message_key followed by the quoted path
    # of the output file registered under $file_key.
    my $announce = sub {
        my ($message_key, $file_key) = @_;
        print STDERR "\t-" . ($messages->getMessage($message_key)->getContent($options->getDisplayLanguage)) . "\'" . $corpus->getOutputFileSet->getFile($file_key)->getPath . "'\n";
    };

    if ($is_switched_on->('debug'))
    {
        $announce->('DISPLAY_RAW', 'debug');
        $phrase_set->printPhrases($debug_fh);
        $phrase_set->printUnparsable($corpus->getOutputFileSet->getFile('unparsable'));
        # Disabled: $phrase_set->printUnparsed($corpus->getOutputFileSet->getFile('unparsed'));
    }

    if ($is_switched_on->('xmlout') || ($default_output == 1))
    {
        $announce->('DISPLAY_TC_XML', 'candidates');
        $phrase_set->printTermCandidatesXML(
            $corpus->getOutputFileSet->getFile("candidates"),
            $this->getTagSet
        );
    }

    if ($is_switched_on->('printChunking'))
    {
        $announce->('DISPLAY_CORPUS_PHRASES', 'candidatesAndUnparsedInCorpus');
        $corpus->printCandidatesAndUnparsedInCorpus(
            $phrase_set->getTermCandidates,
            $phrase_set->getUnparsable,
            $corpus->getOutputFileSet->getFile('candidatesAndUnparsedInCorpus'),
            $options->getSentenceBoundary,
            $options->getDocumentBoundary,
            $options->getOption('COLOR_BLIND')
        );
    }

    if ($is_switched_on->('TC-for-BioLG'))
    {
        $announce->('TC_FOR_LGP', 'TCforBioLG');
        $corpus->printCorpusForLGPwithTCs(
            $phrase_set->getTermCandidates,
            $corpus->getOutputFileSet->getFile('TCforBioLG'),
            $options->getSentenceBoundary,
            $options->getDocumentBoundary,
            $this->getConfigFileSet->getFile("LGPmapping"),
            $options->getChainedLinks,
            $this->getTagSet
        );
    }

    if ($is_switched_on->('TT-for-BioLG'))
    {
        $announce->('TT_FOR_LGP', 'TTforBioLG');
        $corpus->printCorpusForLGPwithTTs(
            $this->getTestifiedTermSet->getTestifiedTerms,
            $corpus->getOutputFileSet->getFile('TTforBioLG'),
            $options->getSentenceBoundary,
            $options->getDocumentBoundary,
            $this->getConfigFileSet->getFile("LGPmapping"),
            $options->getParsingDirection,
            $options->getChainedLinks,
            $this->getTagSet
        );
    }

    if ($is_switched_on->('XML-corpus-for-BioLG'))
    {
        $announce->('XML_FOR_BIOLG', 'corpusForBioLG');
        $corpus->printCorpusForBioLG(
            $corpus->getOutputFileSet->getFile('corpusForBioLG'),
            $options->getSentenceBoundary,
            $options->getDocumentBoundary,
            $options->getChainedLinks,
            $this->getTagSet
        );
    }

    # The three options below only need to be defined; their value is not
    # compared with 1.
    if (defined $options->getOption('termList'))
    {
        $announce->('DISPLAY_TERM_LIST', 'termList');
        $phrase_set->printTermList(
            $corpus->getOutputFileSet->getFile('termList'),
            $options->getTermListStyle
        );
    }

    if (defined $options->getOption('TTG-style-term-candidates'))
    {
        $announce->('TTG_TERM_CANDIDATES', 'termCandidates');
        $phrase_set->printTermCandidatesTTG(
            $corpus->getOutputFileSet->getFile("termCandidates"),
            $options->getTTGStyle
        );
    }

    if (defined $options->getOption('bootstrap'))
    {
        $announce->('DISPLAY_BOOTSTRAP', 'parsedTerms');
        $phrase_set->printBootstrapList(
            $corpus->getOutputFileSet->getFile('parsedTerms'),
            $corpus->getName
        );
    }

    if ($is_switched_on->('XML-corpus-raw'))
    {
        $announce->('DISPLAY_CORPUS_RAW', 'corpusRaw');
        $corpus->printXMLRawCorpus(
            $corpus->getOutputFileSet->getFile('corpusRaw'),
            $options->getSentenceBoundary,
            $options->getDocumentBoundary
        );
    }

    return 0;
}
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
# To specify several files, repeat the -termino switch |
861
|
|
|
|
|
|
|
# for each |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
1; |