File Coverage

blib/lib/Alvis/TermTagger.pm
Criterion Covered Total %
statement 253 388 65.2
branch 59 110 53.6
condition 85 258 32.9
subroutine 15 19 78.9
pod 14 14 100.0
total 426 789 53.9


line stmt bran cond sub pod time code
1             package Alvis::TermTagger;
2              
3             our $VERSION = '0.82';
4              
5             #######################################################################
6             #
7             # Last Update: 16/09/2015 (mm/dd/yyyy date format)
8             #
9             # Copyright (C) 2006 Thierry Hamon
10             #
11             # Written by thierry.hamon@limsi.fr
12             #
13             # Author : Thierry Hamon
14             # Email : thierry.hamon@limsi.fr
15             # URL : https://perso.limsi.fr/hamon/
16             #
17             ########################################################################
18              
19              
20 5     5   91457 use strict;
  5         13  
  5         264  
21 5     5   30 use warnings;
  5         18  
  5         206  
22              
23 5     5   3833 use utf8;
  5         65  
  5         32  
24              
25             # TODO : write functions for term tagginga, term selection with and
26             # without offset in the corpus
27              
28             sub termtagging {
29              
30 4     4 1 33 my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;
31              
32 4         7 my @term_list;
33             my %term_listIdx;
34 0         0 my @regex_term_list;
35 0         0 my @regex_lemmawordterm_list;
36 0         0 my %corpus;
37 0         0 my %lc_corpus;
38 0         0 my %lemmatised_corpus;
39 0         0 my %lc_lemmatised_corpus;
40 0         0 my %corpus_index;
41 0         0 my %lemmatised_corpus_index;
42 0         0 my %idtrm_select;
43 0         0 my %idlemtrm_select;
44              
45 4 100       15 if (!defined $caseSensitive) {
46 1         2 $caseSensitive = -1;
47             }
48              
49 4         17 &load_TermList($term_list_filename,\@term_list, \%term_listIdx);
50 4         19 &get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);
51              
52 4         22 &load_Corpus($corpus_filename, \%corpus, \%lc_corpus);
53 4 100       19 if (defined $lemmatised_corpus_filename) {
54 3         10 &load_Corpus($lemmatised_corpus_filename, \%lemmatised_corpus, \%lc_lemmatised_corpus);
55             }
56 4         20 &corpus_Indexing(\%lc_corpus, \%corpus, \%corpus_index, $caseSensitive);
57 4 100       26 if (defined $lemmatised_corpus_filename) {
58 3         11 &corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus, \%lemmatised_corpus_index, $caseSensitive);
59             }
60 4         24 &term_Selection(\%corpus_index, \@term_list, \%idtrm_select, $caseSensitive);
61 4 100       20 if (defined $lemmatised_corpus_filename) {
62 3         12 &term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select, $caseSensitive);
63             }
64 4         23 &term_tagging_offset(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename, $caseSensitive);
65 4 100       23 if (defined $lemmatised_corpus_filename) {
66 3         14 &term_tagging_offset(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename, $caseSensitive);
67             }
68 4         270 return(0);
69             }
70              
71             sub termtagging_brat {
72              
73 1     1 1 12 my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;
74              
75 1         1 my @term_list;
76             my %term_listIdx;
77 0         0 my @regex_term_list;
78 0         0 my @regex_lemmawordterm_list;
79 0         0 my %corpus;
80 0         0 my %lc_corpus;
81 0         0 my %lemmatised_corpus;
82 0         0 my %lc_lemmatised_corpus;
83 0         0 my %corpus_index;
84 0         0 my %lemmatised_corpus_index;
85 0         0 my %idtrm_select;
86 0         0 my %idlemtrm_select;
87              
88 1 50       4 if (!defined $caseSensitive) {
89 1         2 $caseSensitive = -1;
90             }
91              
92 1         5 &load_TermList($term_list_filename,\@term_list, \%term_listIdx);
93 1         8 &get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);
94              
95 1         6 &load_Corpus($corpus_filename, \%corpus, \%lc_corpus);
96 1 50       6 if (defined $lemmatised_corpus_filename) {
97 0         0 &load_Corpus($lemmatised_corpus_filename, \%lemmatised_corpus, \%lc_lemmatised_corpus);
98             }
99 1         4 &corpus_Indexing(\%lc_corpus, \%corpus, \%corpus_index, $caseSensitive);
100 1 50       5 if (defined $lemmatised_corpus_filename) {
101 0         0 &corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus, \%lemmatised_corpus_index, $caseSensitive);
102             }
103 1         6 &term_Selection(\%corpus_index, \@term_list, \%idtrm_select, $caseSensitive);
104 1 50       5 if (defined $lemmatised_corpus_filename) {
105 0         0 &term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select, $caseSensitive);
106             }
107 1         5 &term_tagging_offset_brat(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename, $caseSensitive);
108 1 50       6 if (defined $lemmatised_corpus_filename) {
109 0         0 &term_tagging_offset_brat(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename, $caseSensitive);
110             }
111 1         28 return(0);
112             }
113              
114              
115             sub load_TermList {
116 5     5 1 11 my ($termlist_name, $ref_termlist, $ref_termlistIdx) = @_;
117              
118 5         6 my $line;
119             my $line1;
120 0         0 my $term; # not use yet
121 0         0 my $suppl_info; # not use yet
122 0         0 my @tab;
123              
124 5         1096 warn "Loading the terminological resource\n";
125              
126 5 50       173 open DESC_TERMLIST, $termlist_name or die "$0: $termlist_name: No such file\n";
127              
128 5         27 binmode(DESC_TERMLIST, ":utf8");
129              
130 5         107 while($line1 = ) {
131 35         48 chomp $line1;
132 35         64 utf8::decode($line1);
133 35         47 $line=$line1;
134              
135             # Blank and comment lines are throw away
136 35 50 33     223 if (($line !~ /^\s*\#/o)&&($line !~ /^\s*\/\//o)&&($line !~ /^\s*$/o)) {
      33        
137             # Term is split from the other information
138 35         123 my @tab = split / ?[\|:] ?/, $line;
139 35 50       84 if ($tab[0] !~ /^\s*$/) {
140             # TODO better
141 35         104 $tab[0] =~ s/ +/ /go;
142 35         41 $tab[0] =~ s/ $//go;
143 35         29 $tab[0] =~ s/^ //go;
144             # $tab[0] =~ s/\\:/:/go;
145             # warn "term: " . $tab[0] . "\n";;
146 35 50       63 if (!exists $ref_termlistIdx->{$tab[0]}) {
147 35         45 push @$ref_termlist, \@tab;
148 35         152 $ref_termlistIdx->{$tab[0]} = scalar(@$ref_termlist) -1;
149             } else {
150 0         0 $ref_termlist->[$ref_termlistIdx->{$tab[0]}]->[2] .= ";" . $tab[2];
151             }
152             }
153             }
154             }
155 5         35 close DESC_TERMLIST;
156 5         798 print STDERR "\n\tTerm list size : " . scalar(@$ref_termlist) . "\n\n";
157             }
158              
159             sub get_Regex_TermList {
160              
161 5     5 1 13 my ($ref_termlist, $ref_regex_termlist, $ref_regex_lemmaWordtermlist) = @_;
162 5         6 my $term_counter;
163              
164 5         451 warn "Generating the regular expression from the terms\n";
165              
166 5         38 for($term_counter = 0;$term_counter < scalar @$ref_termlist;$term_counter++) {
167 35         49 $ref_regex_termlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
168 35 50       57 if (defined $ref_regex_lemmaWordtermlist) {
169 35 50       52 if (defined $ref_termlist->[$term_counter]->[3]) {
170 0         0 $ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[3];
171             # warn "==> " . $ref_termlist->[$term_counter]->[3] . "\n";
172             } else {
173 35         39 $ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
174             }
175             }
176             # warn $ref_regex_lemmaWordtermlist->[$term_counter] . "\n";
177 35         46 $ref_regex_termlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
178 35         68 $ref_regex_termlist->[$term_counter] =~ s/ /[\- \n]/og;
179 35         44 $ref_regex_termlist->[$term_counter] =~ s/A/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}A]/og;
180 35         35 $ref_regex_termlist->[$term_counter] =~ s/AE/(\x{00C6}|AE)/og;
181 35         31 $ref_regex_termlist->[$term_counter] =~ s/C/[\x{00C7}|C]/og;
182 35         35 $ref_regex_termlist->[$term_counter] =~ s/E/[\x{00C8}-\x{00CB}E]/og;
183 35         39 $ref_regex_termlist->[$term_counter] =~ s/I/[\x{00CC}-\x{00CF}I]/og;
184 35         39 $ref_regex_termlist->[$term_counter] =~ s/D/[\x{00D0}D]/og;
185 35         36 $ref_regex_termlist->[$term_counter] =~ s/N/[\x{00D1}N]/og;
186 35         32 $ref_regex_termlist->[$term_counter] =~ s/O/[\x{00D2}-\x{00D8}O]/og;
187 35         60 $ref_regex_termlist->[$term_counter] =~ s/U/[\x{00D9}-\x{00DC}U]/og;
188 35         43 $ref_regex_termlist->[$term_counter] =~ s/Y/[\x{00DD}Y]/og;
189              
190 35 50       50 if (defined $ref_regex_lemmaWordtermlist) {
191 35         39 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
192 35         56 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/ /[\- \n]/og;
193 35         32 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/A/[\x{00C0}-\x{00C5}A]/og;
194 35         37 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/AE/(\x{00C6}|AE)/og;
195 35         34 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/C/[\x{00C7}C]/og;
196 35         29 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/E/[\x{00C8}-\x{00CB}E]/og;
197 35         36 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/I/[\x{00CC}-\x{00CF}I]/og;
198 35         38 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/D/[\x{00D0}D]/og;
199 35         33 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/N/[\x{00D1}N]/og;
200 35         31 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/O/[\x{00D2}-\x{00D8}O]/og;
201 35         28 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/U/[\x{00D9}-\x{00DC}U]/og;
202 35         73 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/Y/[\x{00DD}Y]/og;
203             }
204             }
205 5         211 print STDERR "\n\tTerm/regex list size : " . scalar(@$ref_regex_termlist);
206 5 50       20 if (defined $ref_regex_lemmaWordtermlist) {
207 5         143 print STDERR" / " . scalar(@$ref_regex_lemmaWordtermlist);
208             }
209 5         414 print STDERR "\n\n";
210             }
211              
212             sub load_Corpus {
213              
214 8     8 1 16 my ($corpus_filename, $ref_tabh_Corpus, $ref_tabh_Corpus_lc) = @_;
215 8         12 my $line;
216 8         20 my $sent_id = 1;
217 8         11 my $offset = 0;
218 8         8 my $lineLen = 0;
219              
220 8         788 warn "Loading the corpus\n";
221              
222 8 50       251 open CORPUS, $corpus_filename or die "File $corpus_filename not found\n";
223            
224 8         40 binmode(CORPUS, ":utf8");
225            
226 8         121 while($line=){
227 21         50 $lineLen = length($line);
228 21         30 chomp $line;
229 21         73 $ref_tabh_Corpus->{$sent_id}->{'line'} = $line;
230 21         27 $ref_tabh_Corpus->{$sent_id}->{'offset'} = $offset;
231 21         118 $ref_tabh_Corpus_lc->{$sent_id}->{'line'} = lc $line;
232 21         27 $ref_tabh_Corpus_lc->{$sent_id}->{'offset'} = $offset;
233             # warn "=> " . $ref_tabh_Corpus_lc->{$sent_id} . "\n";
234 21         23 $sent_id++;
235 21         82 $offset += $lineLen;
236             }
237 8         45 close CORPUS;
238 8         1083 print STDERR "\n\tCorpus size : " . scalar(keys %$ref_tabh_Corpus) . "\n\n";
239             }
240              
241              
242             sub corpus_Indexing {
243 8     8 1 15 my ($ref_corpus_lc, $ref_corpus, $ref_corpus_index, $caseSensitive) = @_;
244              
245 8         10 my $word;
246             my @tab_words;
247 0         0 my @tab_words_lc;
248 0         0 my $sent_id;
249 0         0 my $i;
250              
251 8         806 warn "Indexing the corpus\n";
252              
253 8         47 foreach $sent_id (keys %$ref_corpus_lc) { # \-\.,\n;\/
254 21         387 @tab_words = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus->{$sent_id}->{'line'};
255 21         343 @tab_words_lc = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus_lc->{$sent_id}->{'line'};
256 21         68 for($i=0;$i < scalar(@tab_words_lc);$i++) {
257             # foreach $word_lc (@tab_words_lc) {
258 581 100 100     2411 if ((defined $caseSensitive) && (($caseSensitive == 0) || (length($tab_words_lc[$i]) <= $caseSensitive))) {
      33        
259 226         277 $word = $tab_words[$i];
260             } else {
261 355         408 $word = $tab_words_lc[$i];
262             }
263 581 100       928 if ($word ne "") {
264 520         669 $word =~ s/[\x{00C0}-\x{00C5}\x{00E0}-\x{00E5}]/A/og;
265 520         468 $word =~ s/\x{00C6}/AE/og;
266 520         421 $word =~ s/[\x{00C7}]/C/og;
267 520         506 $word =~ s/[\x{00C8}-\x{00CB}]/E/og;
268 520         490 $word =~ s/[\x{00CC}-\x{00CF}]/I/og;
269 520         442 $word =~ s/[\x{00D0}]/D/og;
270 520         430 $word =~ s/[\x{00D1}]/N/og;
271 520         518 $word =~ s/[\x{00D2}-\x{00D8}]/O/og;
272 520         481 $word =~ s/[\x{00D9}-\x{00DC}]/U/og;
273 520         412 $word =~ s/[\x{00DD}]/Y/og;
274              
275 520 100       1025 if (!exists $ref_corpus_index->{$word}) {
276 388         278 my @tabtmp;
277 388         843 $ref_corpus_index->{$word} = \@tabtmp;
278             }
279 520         411 push @{$ref_corpus_index->{$word}}, $sent_id;
  520         1586  
280             }
281             }
282             }
283             # print STDERR join(" : ", keys(%$ref_corpus_index)) . "\n";
284              
285 8         1505 print STDERR "\n\tSize of the first selected term list: " . scalar(keys %$ref_corpus_index) . "\n\n";
286             }
287              
288             sub print_corpus_index {
289 0     0 1 0 my ($ref_corpus_index) = @_;
290              
291 0         0 my $word;
292              
293 0         0 foreach $word (sort keys %$ref_corpus_index) {
294 0         0 print STDERR "$word\t";
295 0         0 print STDERR join(", ", @{$ref_corpus_index->{$word}});
  0         0  
296 0         0 print STDERR "\n";
297             }
298             }
299              
300             sub _term_Selection2 {
301 0     0   0 my ($ref_corpus_index, $ref_termlist, $ref_tabh_idtrm_select) = @_;
302 0         0 my $counter;
303             my $term;
304 0         0 my @tab_termlex;
305 0         0 my $i;
306 0         0 my $word;
307 0         0 my $sent_id;
308 0         0 my $word_found = 0;
309              
310 0         0 warn "Selecting the terms potentialy appearing in the corpus\n";
311              
312 0         0 my %tabh_numtrm_select;
313            
314 0         0 for($counter = 0;$counter < scalar @$ref_termlist;$counter++) {
315 0         0 $term = lc $ref_termlist->[$counter]->[0];
316             # XXX - ABREVIATION - XXX
317 0         0 @tab_termlex = split /[ \-]+/, $term;
318 0         0 $word_found = 0;
319 0         0 $i=0;
320 0   0     0 do {
321 0         0 $word = $tab_termlex[$i];
322 0 0 0     0 if (($word ne "") && ((length($word) > 2) || (scalar(@tab_termlex)==1)) &&
      0        
      0        
323             ((exists $ref_corpus_index->{$word}))) { # || (exists $ref_corpus_index->{$word . "s"})
324 0         0 $word_found = 1;
325 0 0       0 if (!exists $ref_tabh_idtrm_select->{$counter}) {
326 0         0 my %tabhtmp2;
327 0         0 $ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
328             }
329 0         0 foreach $sent_id (@{$ref_corpus_index->{$word}}) {
  0         0  
330 0         0 ${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
  0         0  
331             }
332             }
333 0         0 $i++;
334             } while((!$word_found) && ($i < scalar @tab_termlex));
335             }
336              
337 0         0 warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
338             }
339              
340             sub term_Selection {
341 8     8 1 17 my ($ref_corpus_index, $ref_termlist, $ref_tabh_idtrm_select, $caseSensitive, $termField) = @_;
342 8         8 my $counter;
343             my $term;
344 0         0 my @tab_termlex;
345 0         0 my $termCap;
346 0         0 my @tab_termlexCap;
347 0         0 my $i;
348 0         0 my $word;
349 0         0 my $sent_id;
350 8         12 my $word_found = 0;
351              
352 8         8 my @recordedWords;
353              
354 8 50       23 if (!defined $termField) {
355 8         12 $termField = 0;
356             }
357              
358 8         837 warn "Selecting the terms potentialy appearing in the corpus ($termField)\n";
359              
360 8         22 my %tabh_numtrm_select;
361            
362             # warn "caseSensitive: $caseSensitive\n";
363 8         37 for($counter = 0;$counter < scalar @$ref_termlist;$counter++) {
364 56 50       99 if (defined $ref_termlist->[$counter]->[$termField]) {
365             # warn "==> " . $ref_termlist->[$counter]->[0] . " / " . $ref_termlist->[$counter]->[$termField] . "\n";
366 56 100 66     303 if ((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField]) <= $caseSensitive))) {
      33        
367 14         15 $term = $ref_termlist->[$counter]->[$termField];
368 14         16 $termCap = $ref_termlist->[$counter]->[$termField];
369             # warn "passe\n";
370             } else {
371 42         68 $term = lc $ref_termlist->[$counter]->[$termField];
372 42         63 $termCap = $ref_termlist->[$counter]->[$termField];
373             }
374             } else {
375 0         0 $term = lc $ref_termlist->[$counter]->[0];
376 0         0 $termCap = $ref_termlist->[$counter]->[0];
377             }
378             # warn "+++> $term ($termCap)\n";
379             # XXX - ABREVIATION - XXX
380             # @tab_termlex = split /[ \-:]+/, $term;
381 56         201 @tab_termlex = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]+/, $term;
382 56         125 @tab_termlexCap = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]+/, $termCap;
383             # @tab_termlex = split /[ \-:]+/, $term;
384             # @tab_termlexCap = split /[ \-:]+/, $termCap;
385 56         62 $word_found = 0;
386 56         48 $i=0;
387 56         71 @recordedWords = ();
388 56         65 $word = $tab_termlex[$i];
389             # warn join(':', @tab_termlex) . " -- " . join(':', @tab_termlexCap) . "\n";
390             # warn scalar(@tab_termlex) . " -- " . scalar(@tab_termlexCap) . " ($i)\n";
391 56   66     418 while(($i < scalar(@tab_termlex)) && ($i < scalar(@tab_termlexCap)) &&
      66        
      66        
392             ((($word eq "") || (exists $ref_corpus_index->{$word})) ||
393             ((($caseSensitive == 0) || (length($termCap) > $caseSensitive)) &&
394             (exists $ref_corpus_index->{$tab_termlexCap[$i]})))
395             ) {
396             # ((($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField]) > $caseSensitive)) &&
397              
398 102 50       174 if ($word ne "") {
399             # warn "---> $term\n";
400 102         128 push @recordedWords, $word;
401             # } else {
402             # warn "--------------------------> $term\n";
403             }
404 102         81 $i++;
405 102         412 $word = $tab_termlex[$i];
406             # warn "i: $i\n";
407             }
408 56 100       101 if ($i == scalar(@tab_termlex)) {
409 55         72 foreach $word (@recordedWords) {
410             # print STDERR "$word : ";
411 102 100       191 if (!exists $ref_tabh_idtrm_select->{$counter}) {
412 55         47 my %tabhtmp2;
413 55         104 $ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
414             }
415 102         90 foreach $sent_id (@{$ref_corpus_index->{$word}}) {
  102         192  
416 190         163 ${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
  190         429  
417             }
418             }
419             }
420             # }
421             }
422             # print STDERR "\n";
423              
424             # print STDERR join(" : ", keys(%$ref_tabh_idtrm_select)) . "\n";
425              
426 8         931 warn "Size of the selected list: " . scalar (keys %$ref_tabh_idtrm_select) . "\n";
427             # foreach $counter (keys %$ref_tabh_idtrm_select) {
428             # warn $ref_termlist->[$counter]->[0] . "\n";
429             # }
430              
431 8         1233 warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
432             }
433              
434             sub term_tagging_offset {
435 7     7 1 18 my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name, $caseSensitive, $termField) = @_;
436 7         17 my $counter;
437             my $term_regex;
438 0         0 my $sent_id;
439 0         0 my $line;
440 0         0 my $termField2;
441              
442 7 50       21 if (!defined $termField) {
443 7         9 $termField = 0;
444             }
445             # XXX - ABREVIATION - XXX => regex
446              
447 7         709 warn "Term tagging\n";
448              
449 7 50       340 open TAGGEDCORPUS, ">>$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
450              
451 7         45 binmode(TAGGEDCORPUS, ":utf8");
452              
453 7         32 foreach $counter (keys %$ref_tabh_idtrm_select) {
454 48         189 $term_regex = $ref_regex_termlist->[$counter];
455 48         45 $termField2 = 0;
456 48 50       128 if (defined $ref_termlist->[$counter]->[$termField]) {
457 48         57 $termField2 = $termField;
458             }
459 48         41 foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
  48         206  
460 76         126 $line = $ref_tabh_corpus->{$sent_id}->{'line'};
461 76         2442 print STDERR ".";
462            
463 76 100 66     3956 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      100        
      66        
      66        
      66        
464             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/)) ||
465             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
466             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i))) {
467 51         157 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
468             }
469 76 100 66     2819 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      100        
      66        
      66        
      66        
470             ($line =~ /^($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i)) ||
471             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
472             ($line =~ /^($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i))) {
473 4         16 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
474             }
475 76 50 66     2756 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      66        
      66        
      33        
      33        
476             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]($term_regex)$/)) ||
477             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
478             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]($term_regex)$/i))) {
479 0         0 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
480             }
481 76 50 66     2216 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      66        
      66        
      33        
      33        
482             ($line =~ /^($term_regex)$/i)) ||
483             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
484             ($line =~ /^($term_regex)$/i))) {
485 0         0 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
486             }
487             }
488 48         4834 print STDERR "\n";
489             }
490              
491 7         270 close TAGGEDCORPUS;
492              
493             #########################################################################################################
494 7         1211 warn "\nEnd of term tagging\n";
495             }
496              
497             sub printMatchingTerm() {
498 55     55 1 101 my ($descriptor, $ref_matching_term, $sent_id) = @_;
499              
500 55         197 print $descriptor "$sent_id\t";
501 55         199 print $descriptor join("\t", @$ref_matching_term);
502 55         153 print $descriptor "\n";
503              
504             }
505              
506              
507             sub term_tagging_offset_tab {
508 0     0 1 0 my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $ref_tab_results, $caseSensitive, $termField) = @_;
509 0         0 my $counter;
510             my $term_regex;
511 0         0 my $sent_id;
512 0         0 my $line;
513 0         0 my $i;
514 0         0 my $size_termselect = scalar(keys %$ref_tabh_idtrm_select);
515 0         0 my $termField2;
516              
517 0         0 $i = 0;
518              
519 0 0       0 if (!defined $termField) {
520 0         0 $termField = 0;
521             }
522              
523             # XXX - ABREVIATION - XXX => regex
524             # warn "====> $caseSensitive\n";
525            
526 0         0 foreach $counter (keys %$ref_tabh_idtrm_select) {
527             # printf STDERR "Term tagging... %0.1f%%\r", ($i/$size_termselect)*100 ;
528 0         0 $term_regex = $ref_regex_termlist->[$counter];
529             # warn "counter: $counter ($term_regex)\n";
530              
531 0         0 $termField2 = 0;
532 0 0       0 if (defined $ref_termlist->[$counter]->[$termField]) {
533 0         0 $termField2 = $termField;
534             }
535              
536 0         0 foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
  0         0  
537 0         0 $line = $ref_tabh_corpus->{$sent_id}->{'line'};
538              
539             # warn "$line\n$term_regex\n";
540              
541 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
542             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
543             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
544             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
545 5     5   22215 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
  5         2448  
  5         6858  
  0         0  
546             }
547 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
548             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
549             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
550             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
551 0         0 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
552             }
553 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
554             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?$term_regex)$/s)) ||
555             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
556             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?$term_regex)$/is))) {
557 0         0 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
558             }
559 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
560             ($line =~ /^(?$term_regex)$/s)) ||
561             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
562             ($line =~ /^(?$term_regex)$/is))) {
563 0         0 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
564             }
565             }
566 0         0 $i++;
567             }
568 0         0 print STDERR "\n";
569              
570             #########################################################################################################
571 0         0 warn "\nEnd of term tagging\n";
572             }
573              
574             sub term_tagging_offset_brat {
575 1     1 1 2 my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name, $caseSensitive, $termField) = @_;
576 1         2 my $counter;
577             my $term_regex;
578 0         0 my $sent_id;
579 0         0 my $line;
580 0         0 my $i;
581 1         2 my $size_termselect = scalar(keys %$ref_tabh_idtrm_select);
582 1         1 my $termField2;
583 1         2 my $termId = 1;
584 1         1 my $offset;
585             my $currOffset;
586              
587 1         2 $i = 0;
588              
589 1         87 warn "Term tagging ($offset_tagged_corpus_name)\n";
590              
591 1 50       93 open TAGGEDCORPUS, ">$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
592              
593 1         6 binmode(TAGGEDCORPUS, ":utf8");
594              
595              
596 1 50       8 if (!defined $termField) {
597 1         2 $termField = 0;
598             }
599              
600             # XXX - ABREVIATION - XXX => regex
601             # warn "====> $caseSensitive\n";
602            
603 1         5 foreach $counter (keys %$ref_tabh_idtrm_select) {
604             # printf STDERR "Term tagging... %0.1f%%\r", ($i/$size_termselect)*100 ;
605 7         13 $term_regex = $ref_regex_termlist->[$counter];
606             # warn "counter: $counter ($term_regex)\n";
607              
608 7         8 $termField2 = 0;
609 7 50       14 if (defined $ref_termlist->[$counter]->[$termField]) {
610 7         8 $termField2 = $termField;
611             }
612              
613 7         4 foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
  7         20  
614 12         16 $line = $ref_tabh_corpus->{$sent_id}->{'line'};
615 12         9 $offset = $ref_tabh_corpus->{$sent_id}->{'offset'};
616              
617             # warn "$line\n$term_regex\n";
618             # warn "$line\n$offset\n";
619              
620 12 100 33     480 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      66        
      33        
621             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+])(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
622             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
623             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+])(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
624 7         67 $currOffset = $offset+length($`)+length($+{before});
625 7         41 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset + length($+{term}),$ref_termlist->[$counter]->[2]);
626             }
627 12 100 33     334 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      66        
      33        
628             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
629             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
630             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
631 1         3 $currOffset = $offset+length($`);
632 1         7 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset + length($+{term}),$ref_termlist->[$counter]->[2]);
633             }
634 12 50 33     342 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      33        
      33        
635             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+])(?$term_regex)$/s)) ||
636             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
637             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+])(?$term_regex)$/is))) {
638 0         0 $currOffset = $offset+length($`)+length($+{before});
639 0         0 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset+length($+{term}),$ref_termlist->[$counter]->[2]);
640             }
641 12 50 33     255 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      33        
      33        
642             ($line =~ /^(?$term_regex)$/s)) ||
643             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
644             ($line =~ /^(?$term_regex)$/is))) {
645 0         0 $currOffset = $offset+length($`);
646 0         0 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset+length($+{term}),$ref_termlist->[$counter]->[2]);
647             }
648             }
649 7         11 $i++;
650             }
651 1         45 print STDERR "\n";
652              
653 1         35 close TAGGEDCORPUS;
654             #########################################################################################################
655 1         147 warn "\nEnd of term tagging\n";
656             }
657              
658             sub print_brat_output() {
659 8     8 1 23 my ($descriptor, $termId, $matching_term, $start_offset, $end_offset, $semtag) = @_;
660              
661 8 100 66     35 if ((!defined $semtag) || ($semtag =~ /^\s*$/)) {
662 4         4 $semtag = "term";
663             }
664              
665 8         40 print $descriptor "T$$termId\t$semtag $start_offset $end_offset\t$matching_term\n";
666 8         16 $$termId++;
667             }
668              
669              
670             sub printMatchingTerm_tab() {
671 0     0 1   my ($ref_matching_term, $term, $sent_id, $ref_tab_results) = @_;
672              
673 0           my $tmp_line = "";
674 0           my $tmp_key;
675              
676             # warn "\nOK: $term\n";
677             # warn "ref_matching_term: " . join ("\t", @$ref_matching_term) . "\n";
678              
679 0 0         if (ref($ref_tab_results) eq "ARRAY") {
680 0           $tmp_line .= "$sent_id\t";
681 0           $tmp_line .= join ("\t", @$ref_matching_term);
682 0           push @$ref_tab_results, $tmp_line;
683             # warn "tmp_line: $tmp_line\n";
684             } else {
685 0 0         if (ref($ref_tab_results) eq "HASH") {
686 0           my @tab_tmp;
687 0           $term =~ s/\\([\-\+\(\)\{\}])/$1/og;
688 0           $tmp_key .= $sent_id . "_";
689 0           $tmp_key .= $term;
690              
691 0           push @tab_tmp, $sent_id;
692 0           push @tab_tmp, $term;
693 0           push @tab_tmp, @$ref_matching_term;
694              
695             # warn "term_key: $tmp_key\n";
696             # if (!exists $ref_tab_results->{$tmp_key}) {
697 0 0         if (!exists($ref_tab_results->{$tmp_key})) {
698             # warn "!exists\n";
699 0           $ref_tab_results->{$tmp_key} = \@tab_tmp;
700             } else {
701             # warn "exists\n";
702             # push @{$ref_tab_results->{$tmp_key}}, @tab_tmp;
703 0 0         if (defined $tab_tmp[4]) {
704 0           $ref_tab_results->{$tmp_key}->[4] .= ";" . $tab_tmp[4];
705             } else {
706 0           $ref_tab_results->{$tmp_key}->[4] .= ";";
707             }
708             }
709             # warn "tab_tmp: " . join ("\t", @{$ref_tab_results->{$tmp_key}}) . "\n";
710              
711             # } else {
712             # foreach $refmatch (@{$ref_tab_results->{$tmp_key}}) {
713            
714             # }
715             # }
716             }
717             }
718             }
719              
720              
721             1;
722              
723             __END__