File Coverage

blib/lib/Alvis/TermTagger.pm
Criterion Covered Total %
statement 221 356 62.0
branch 57 108 52.7
condition 83 255 32.5
subroutine 15 19 78.9
pod 14 14 100.0
total 390 752 51.8


line stmt bran cond sub pod time code
1             package Alvis::TermTagger;
2              
3             our $VERSION = '0.81';
4              
5             #######################################################################
6             #
7             # Last Update: 16/09/2015 (mm/dd/yyyy date format)
8             #
9             # Copyright (C) 2006 Thierry Hamon
10             #
11             # Written by thierry.hamon@limsi.fr
12             #
13             # Author : Thierry Hamon
14             # Email : thierry.hamon@limsi.fr
15             # URL : https://perso.limsi.fr/hamon/
16             #
17             ########################################################################
18              
19              
20 4     4   133554 use strict;
  4         10  
  4         116  
21 4     4   20 use warnings;
  4         9  
  4         142  
22              
23 4     4   27808 use utf8;
  4         63  
  4         31  
24              
25             # TODO : write functions for term tagginga, term selection with and
26             # without offset in the corpus
27              
28             sub termtagging {
29              
30 4     4 1 34 my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;
31              
32 4         8 my @term_list;
33             my %term_listIdx;
34 0         0 my @regex_term_list;
35 0         0 my @regex_lemmawordterm_list;
36 0         0 my %corpus;
37 0         0 my %lc_corpus;
38 0         0 my %lemmatised_corpus;
39 0         0 my %lc_lemmatised_corpus;
40 0         0 my %corpus_index;
41 0         0 my %lemmatised_corpus_index;
42 0         0 my %idtrm_select;
43 0         0 my %idlemtrm_select;
44              
45 4 100       14 if (!defined $caseSensitive) {
46 1         2 $caseSensitive = -1;
47             }
48              
49 4         14 &load_TermList($term_list_filename,\@term_list, \%term_listIdx);
50 4         42 &get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);
51              
52 4         19 &load_Corpus($corpus_filename, \%corpus, \%lc_corpus);
53 4 100       15 if (defined $lemmatised_corpus_filename) {
54 3         7 &load_Corpus($lemmatised_corpus_filename, \%lemmatised_corpus, \%lc_lemmatised_corpus);
55             }
56 4         16 &corpus_Indexing(\%lc_corpus, \%corpus, \%corpus_index, $caseSensitive);
57 4 100       14 if (defined $lemmatised_corpus_filename) {
58 3         9 &corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus, \%lemmatised_corpus_index, $caseSensitive);
59             }
60 4         17 &term_Selection(\%corpus_index, \@term_list, \%idtrm_select, $caseSensitive);
61 4 100       14 if (defined $lemmatised_corpus_filename) {
62 3         9 &term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select, $caseSensitive);
63             }
64 4         18 &term_tagging_offset(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename, $caseSensitive);
65 4 100       25 if (defined $lemmatised_corpus_filename) {
66 3         9 &term_tagging_offset(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename, $caseSensitive);
67             }
68 4         172 return(0);
69             }
70              
71             sub termtagging_brat {
72              
73 1     1 1 14 my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename, $caseSensitive) = @_;
74              
75 1         3 my @term_list;
76             my %term_listIdx;
77 0         0 my @regex_term_list;
78 0         0 my @regex_lemmawordterm_list;
79 0         0 my %corpus;
80 0         0 my %lc_corpus;
81 0         0 my %lemmatised_corpus;
82 0         0 my %lc_lemmatised_corpus;
83 0         0 my %corpus_index;
84 0         0 my %lemmatised_corpus_index;
85 0         0 my %idtrm_select;
86 0         0 my %idlemtrm_select;
87              
88 1 50       4 if (!defined $caseSensitive) {
89 1         2 $caseSensitive = -1;
90             }
91              
92 1         5 &load_TermList($term_list_filename,\@term_list, \%term_listIdx);
93 1         6 &get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);
94              
95 1         6 &load_Corpus($corpus_filename, \%corpus, \%lc_corpus);
96 1 50       4 if (defined $lemmatised_corpus_filename) {
97 0         0 &load_Corpus($lemmatised_corpus_filename, \%lemmatised_corpus, \%lc_lemmatised_corpus);
98             }
99 1         5 &corpus_Indexing(\%lc_corpus, \%corpus, \%corpus_index, $caseSensitive);
100 1 50       5 if (defined $lemmatised_corpus_filename) {
101 0         0 &corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus, \%lemmatised_corpus_index, $caseSensitive);
102             }
103 1         5 &term_Selection(\%corpus_index, \@term_list, \%idtrm_select, $caseSensitive);
104 1 50       5 if (defined $lemmatised_corpus_filename) {
105 0         0 &term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select, $caseSensitive);
106             }
107 1         5 &term_tagging_offset_brat(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename, $caseSensitive);
108 1 50       6 if (defined $lemmatised_corpus_filename) {
109 0         0 &term_tagging_offset_brat(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename, $caseSensitive);
110             }
111 1         39 return(0);
112             }
113              
114              
115             sub load_TermList {
116 5     5 1 10 my ($termlist_name, $ref_termlist, $ref_termlistIdx) = @_;
117              
118 5         8 my $line;
119             my $line1;
120 0         0 my $term; # not use yet
121 0         0 my $suppl_info; # not use yet
122 0         0 my @tab;
123              
124 5         1162 warn "Loading the terminological resource\n";
125              
126 5 50       198 open DESC_TERMLIST, $termlist_name or die "$0: $termlist_name: No such file\n";
127              
128 5         27 binmode(DESC_TERMLIST, ":utf8");
129              
130 5         124 while($line1 = ) {
131 35         60 chomp $line1;
132 35         65 utf8::decode($line1);
133 35         66 $line=$line1;
134              
135             # Blank and comment lines are throw away
136 35 50 33     207 if (($line !~ /^\s*\#/o)&&($line !~ /^\s*\/\//o)&&($line !~ /^\s*$/o)) {
      33        
137             # Term is split from the other information
138 35         121 my @tab = split / ?[\|:] ?/, $line;
139 35 50       94 if ($tab[0] !~ /^\s*$/) {
140             # TODO better
141 35         92 $tab[0] =~ s/ +/ /go;
142 35         52 $tab[0] =~ s/ $//go;
143 35         43 $tab[0] =~ s/^ //go;
144             # $tab[0] =~ s/\\:/:/go;
145             # warn "term: " . $tab[0] . "\n";;
146 35 50       73 if (!exists $ref_termlistIdx->{$tab[0]}) {
147 35         56 push @$ref_termlist, \@tab;
148 35         220 $ref_termlistIdx->{$tab[0]} = scalar(@$ref_termlist) -1;
149             } else {
150 0         0 $ref_termlist->[$ref_termlistIdx->{$tab[0]}]->[2] .= ";" . $tab[2];
151             }
152             }
153             }
154             }
155 5         34 close DESC_TERMLIST;
156 5         825 print STDERR "\n\tTerm list size : " . scalar(@$ref_termlist) . "\n\n";
157             }
158              
159             sub get_Regex_TermList {
160              
161 5     5 1 9 my ($ref_termlist, $ref_regex_termlist, $ref_regex_lemmaWordtermlist) = @_;
162 5         6 my $term_counter;
163              
164 5         440 warn "Generating the regular expression from the terms\n";
165              
166 5         32 for($term_counter = 0;$term_counter < scalar @$ref_termlist;$term_counter++) {
167 35         70 $ref_regex_termlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
168 35 50       72 if (defined $ref_regex_lemmaWordtermlist) {
169 35 50       54 if (defined $ref_termlist->[$term_counter]->[3]) {
170 0         0 $ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[3];
171             # warn "==> " . $ref_termlist->[$term_counter]->[3] . "\n";
172             } else {
173 35         55 $ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
174             }
175             }
176             # warn $ref_regex_lemmaWordtermlist->[$term_counter] . "\n";
177 35         59 $ref_regex_termlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
178 35         71 $ref_regex_termlist->[$term_counter] =~ s/ /[\- \n]/og;
179 35 50       66 if (defined $ref_regex_lemmaWordtermlist) {
180 35         50 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}\\])/\\$1/og;
181 35         112 $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/ /[\- \n]/og;
182             }
183             }
184 5         178 print STDERR "\n\tTerm/regex list size : " . scalar(@$ref_regex_termlist);
185 5 50       19 if (defined $ref_regex_lemmaWordtermlist) {
186 5         142 print STDERR" / " . scalar(@$ref_regex_lemmaWordtermlist);
187             }
188 5         391 print STDERR "\n\n";
189             }
190              
191             sub load_Corpus {
192              
193 8     8 1 43 my ($corpus_filename, $ref_tabh_Corpus, $ref_tabh_Corpus_lc) = @_;
194 8         22 my $line;
195 8         12 my $sent_id = 1;
196 8         9 my $offset = 0;
197 8         11 my $lineLen = 0;
198              
199 8         668 warn "Loading the corpus\n";
200              
201 8 50       214 open CORPUS, $corpus_filename or die "File $corpus_filename not found\n";
202            
203 8         27 binmode(CORPUS, ":utf8");
204            
205 8         103 while($line=){
206 21         44 $lineLen = length($line);
207 21         29 chomp $line;
208 21         81 $ref_tabh_Corpus->{$sent_id}->{'line'} = $line;
209 21         33 $ref_tabh_Corpus->{$sent_id}->{'offset'} = $offset;
210 21         127 $ref_tabh_Corpus_lc->{$sent_id}->{'line'} = lc $line;
211 21         34 $ref_tabh_Corpus_lc->{$sent_id}->{'offset'} = $offset;
212             # warn "=> " . $ref_tabh_Corpus_lc->{$sent_id} . "\n";
213 21         25 $sent_id++;
214 21         101 $offset += $lineLen;
215             }
216 8         55 close CORPUS;
217 8         1108 print STDERR "\n\tCorpus size : " . scalar(keys %$ref_tabh_Corpus) . "\n\n";
218             }
219              
220              
221             sub corpus_Indexing {
222 8     8 1 16 my ($ref_corpus_lc, $ref_corpus, $ref_corpus_index, $caseSensitive) = @_;
223              
224 8         9 my $word;
225             my @tab_words;
226 0         0 my @tab_words_lc;
227 0         0 my $sent_id;
228 0         0 my $i;
229              
230 8         674 warn "Indexing the corpus\n";
231              
232 8         69 foreach $sent_id (keys %$ref_corpus_lc) { # \-\.,\n;\/
233 21         362 @tab_words = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus->{$sent_id}->{'line'};
234 21         344 @tab_words_lc = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus_lc->{$sent_id}->{'line'};
235 21         55 for($i=0;$i < scalar(@tab_words_lc);$i++) {
236             # foreach $word_lc (@tab_words_lc) {
237 581 100 100     2572 if ((defined $caseSensitive) && (($caseSensitive == 0) || (length($tab_words_lc[$i]) <= $caseSensitive))) {
      33        
238 226         303 $word = $tab_words[$i];
239             } else {
240 355         540 $word = $tab_words_lc[$i];
241             }
242 581 100       1063 if ($word ne "") {
243 520 100       1151 if (!exists $ref_corpus_index->{$word}) {
244 388         387 my @tabtmp;
245 388         908 $ref_corpus_index->{$word} = \@tabtmp;
246             }
247 520         556 push @{$ref_corpus_index->{$word}}, $sent_id;
  520         1638  
248             }
249             }
250             }
251             # print STDERR join(" : ", keys(%$ref_corpus_index)) . "\n";
252              
253 8         1212 print STDERR "\n\tSize of the first selected term list: " . scalar(keys %$ref_corpus_index) . "\n\n";
254             }
255              
256             sub print_corpus_index {
257 0     0 1 0 my ($ref_corpus_index) = @_;
258              
259 0         0 my $word;
260              
261 0         0 foreach $word (sort keys %$ref_corpus_index) {
262 0         0 print STDERR "$word\t";
263 0         0 print STDERR join(", ", @{$ref_corpus_index->{$word}});
  0         0  
264 0         0 print STDERR "\n";
265             }
266             }
267              
268             sub _term_Selection2 {
269 0     0   0 my ($ref_corpus_index, $ref_termlist, $ref_tabh_idtrm_select) = @_;
270 0         0 my $counter;
271             my $term;
272 0         0 my @tab_termlex;
273 0         0 my $i;
274 0         0 my $word;
275 0         0 my $sent_id;
276 0         0 my $word_found = 0;
277              
278 0         0 warn "Selecting the terms potentialy appearing in the corpus\n";
279              
280 0         0 my %tabh_numtrm_select;
281            
282 0         0 for($counter = 0;$counter < scalar @$ref_termlist;$counter++) {
283 0         0 $term = lc $ref_termlist->[$counter]->[0];
284             # XXX - ABREVIATION - XXX
285 0         0 @tab_termlex = split /[ \-]+/, $term;
286 0         0 $word_found = 0;
287 0         0 $i=0;
288 0   0     0 do {
289 0         0 $word = $tab_termlex[$i];
290 0 0 0     0 if (($word ne "") && ((length($word) > 2) || (scalar(@tab_termlex)==1)) &&
      0        
      0        
291             ((exists $ref_corpus_index->{$word}))) { # || (exists $ref_corpus_index->{$word . "s"})
292 0         0 $word_found = 1;
293 0 0       0 if (!exists $ref_tabh_idtrm_select->{$counter}) {
294 0         0 my %tabhtmp2;
295 0         0 $ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
296             }
297 0         0 foreach $sent_id (@{$ref_corpus_index->{$word}}) {
  0         0  
298 0         0 ${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
  0         0  
299             }
300             }
301 0         0 $i++;
302             } while((!$word_found) && ($i < scalar @tab_termlex));
303             }
304              
305 0         0 warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
306             }
307              
308             sub term_Selection {
309 8     8 1 13 my ($ref_corpus_index, $ref_termlist, $ref_tabh_idtrm_select, $caseSensitive, $termField) = @_;
310 8         12 my $counter;
311             my $term;
312 0         0 my @tab_termlex;
313 0         0 my $termCap;
314 0         0 my @tab_termlexCap;
315 0         0 my $i;
316 0         0 my $word;
317 0         0 my $sent_id;
318 8         12 my $word_found = 0;
319              
320 8         9 my @recordedWords;
321              
322 8 50       21 if (!defined $termField) {
323 8         10 $termField = 0;
324             }
325              
326 8         692 warn "Selecting the terms potentialy appearing in the corpus ($termField)\n";
327              
328 8         20 my %tabh_numtrm_select;
329            
330             # warn "caseSensitive: $caseSensitive\n";
331 8         28 for($counter = 0;$counter < scalar @$ref_termlist;$counter++) {
332 56 50       110 if (defined $ref_termlist->[$counter]->[$termField]) {
333             # warn "==> " . $ref_termlist->[$counter]->[0] . " / " . $ref_termlist->[$counter]->[$termField] . "\n";
334 56 100 66     256 if ((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField]) <= $caseSensitive))) {
      33        
335 14         21 $term = $ref_termlist->[$counter]->[$termField];
336 14         21 $termCap = $ref_termlist->[$counter]->[$termField];
337             # warn "passe\n";
338             } else {
339 42         66 $term = lc $ref_termlist->[$counter]->[$termField];
340 42         66 $termCap = $ref_termlist->[$counter]->[$termField];
341             }
342             } else {
343 0         0 $term = lc $ref_termlist->[$counter]->[0];
344 0         0 $termCap = $ref_termlist->[$counter]->[0];
345             }
346             # warn "+++> $term ($termCap)\n";
347             # XXX - ABREVIATION - XXX
348             # @tab_termlex = split /[ \-:]+/, $term;
349 56         166 @tab_termlex = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]+/, $term;
350 56         143 @tab_termlexCap = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]+/, $termCap;
351             # @tab_termlex = split /[ \-:]+/, $term;
352             # @tab_termlexCap = split /[ \-:]+/, $termCap;
353 56         63 $word_found = 0;
354 56         53 $i=0;
355 56         82 @recordedWords = ();
356 56         72 $word = $tab_termlex[$i];
357             # warn join(':', @tab_termlex) . " -- " . join(':', @tab_termlexCap) . "\n";
358             # warn scalar(@tab_termlex) . " -- " . scalar(@tab_termlexCap) . " ($i)\n";
359 56   66     437 while(($i < scalar(@tab_termlex)) && ($i < scalar(@tab_termlexCap)) &&
      66        
      66        
360             ((($word eq "") || (exists $ref_corpus_index->{$word})) ||
361             ((($caseSensitive == 0) || (length($termCap) > $caseSensitive)) &&
362             (exists $ref_corpus_index->{$tab_termlexCap[$i]})))
363             ) {
364             # ((($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField]) > $caseSensitive)) &&
365              
366 102 50       206 if ($word ne "") {
367             # warn "---> $term\n";
368 102         135 push @recordedWords, $word;
369             # } else {
370             # warn "--------------------------> $term\n";
371             }
372 102         105 $i++;
373 102         433 $word = $tab_termlex[$i];
374             # warn "i: $i\n";
375             }
376 56 100       110 if ($i == scalar(@tab_termlex)) {
377 55         72 foreach $word (@recordedWords) {
378             # print STDERR "$word : ";
379 102 100       217 if (!exists $ref_tabh_idtrm_select->{$counter}) {
380 55         61 my %tabhtmp2;
381 55         94 $ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
382             }
383 102         106 foreach $sent_id (@{$ref_corpus_index->{$word}}) {
  102         217  
384 190         170 ${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
  190         490  
385             }
386             }
387             }
388             # }
389             }
390             # print STDERR "\n";
391              
392             # print STDERR join(" : ", keys(%$ref_tabh_idtrm_select)) . "\n";
393              
394 8         715 warn "Size of the selected list: " . scalar (keys %$ref_tabh_idtrm_select) . "\n";
395             # foreach $counter (keys %$ref_tabh_idtrm_select) {
396             # warn $ref_termlist->[$counter]->[0] . "\n";
397             # }
398              
399 8         1075 warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
400             }
401              
402             sub term_tagging_offset {
403 7     7 1 14 my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name, $caseSensitive, $termField) = @_;
404 7         8 my $counter;
405             my $term_regex;
406 0         0 my $sent_id;
407 0         0 my $line;
408 0         0 my $termField2;
409              
410 7 50       31 if (!defined $termField) {
411 7         10 $termField = 0;
412             }
413             # XXX - ABREVIATION - XXX => regex
414              
415 7         595 warn "Term tagging\n";
416              
417 7 50       334 open TAGGEDCORPUS, ">>$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
418              
419 7         28 binmode(TAGGEDCORPUS, ":utf8");
420              
421 7         24 foreach $counter (keys %$ref_tabh_idtrm_select) {
422 48         136 $term_regex = $ref_regex_termlist->[$counter];
423 48         60 $termField2 = 0;
424 48 50       111 if (defined $ref_termlist->[$counter]->[$termField]) {
425 48         56 $termField2 = $termField;
426             }
427 48         45 foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
  48         165  
428 76         129 $line = $ref_tabh_corpus->{$sent_id}->{'line'};
429 76         1931 print STDERR ".";
430            
431 76 100 66     3504 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      100        
      66        
      66        
      66        
432             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/)) ||
433             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
434             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i))) {
435 51         125 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
436             }
437 76 100 66     2612 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      100        
      66        
      66        
      66        
438             ($line =~ /^($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i)) ||
439             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
440             ($line =~ /^($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i))) {
441 4         14 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
442             }
443 76 50 66     2483 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      66        
      66        
      33        
      33        
444             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]($term_regex)$/)) ||
445             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
446             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]($term_regex)$/i))) {
447 0         0 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
448             }
449 76 50 66     1899 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      66        
      66        
      33        
      33        
450             ($line =~ /^($term_regex)$/i)) ||
451             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
452             ($line =~ /^($term_regex)$/i))) {
453 0         0 printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
454             }
455             }
456 48         3967 print STDERR "\n";
457             }
458              
459 7         228 close TAGGEDCORPUS;
460              
461             #########################################################################################################
462 7         959 warn "\nEnd of term tagging\n";
463             }
464              
465             sub printMatchingTerm() {
466 55     55 1 81 my ($descriptor, $ref_matching_term, $sent_id) = @_;
467              
468 55         168 print $descriptor "$sent_id\t";
469 55         121 print $descriptor join("\t", @$ref_matching_term);
470 55         113 print $descriptor "\n";
471              
472             }
473              
474              
475             sub term_tagging_offset_tab {
476 0     0 1 0 my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $ref_tab_results, $caseSensitive, $termField) = @_;
477 0         0 my $counter;
478             my $term_regex;
479 0         0 my $sent_id;
480 0         0 my $line;
481 0         0 my $i;
482 0         0 my $size_termselect = scalar(keys %$ref_tabh_idtrm_select);
483 0         0 my $termField2;
484              
485 0         0 $i = 0;
486              
487 0 0       0 if (!defined $termField) {
488 0         0 $termField = 0;
489             }
490              
491             # XXX - ABREVIATION - XXX => regex
492             # warn "====> $caseSensitive\n";
493            
494 0         0 foreach $counter (keys %$ref_tabh_idtrm_select) {
495             # printf STDERR "Term tagging... %0.1f%%\r", ($i/$size_termselect)*100 ;
496 0         0 $term_regex = $ref_regex_termlist->[$counter];
497             # warn "counter: $counter ($term_regex)\n";
498              
499 0         0 $termField2 = 0;
500 0 0       0 if (defined $ref_termlist->[$counter]->[$termField]) {
501 0         0 $termField2 = $termField;
502             }
503              
504 0         0 foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
  0         0  
505 0         0 $line = $ref_tabh_corpus->{$sent_id}->{'line'};
506              
507             # warn "$line\n$term_regex\n";
508              
509 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
510             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
511             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
512             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
513 4     4   24593 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
  4         18766  
  4         6798  
  0         0  
514             }
515 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
516             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
517             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
518             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
519 0         0 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
520             }
521 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
522             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?$term_regex)$/s)) ||
523             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
524             ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?$term_regex)$/is))) {
525 0         0 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
526             }
527 0 0 0     0 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      0        
      0        
      0        
      0        
      0        
528             ($line =~ /^(?$term_regex)$/s)) ||
529             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
530             ($line =~ /^(?$term_regex)$/is))) {
531 0         0 printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
532             }
533             }
534 0         0 $i++;
535             }
536 0         0 print STDERR "\n";
537              
538             #########################################################################################################
539 0         0 warn "\nEnd of term tagging\n";
540             }
541              
542             sub term_tagging_offset_brat {
543 1     1 1 2 my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name, $caseSensitive, $termField) = @_;
544 1         2 my $counter;
545             my $term_regex;
546 0         0 my $sent_id;
547 0         0 my $line;
548 0         0 my $i;
549 1         2 my $size_termselect = scalar(keys %$ref_tabh_idtrm_select);
550 1         2 my $termField2;
551 1         8 my $termId = 1;
552 1         1 my $offset;
553             my $currOffset;
554              
555 1         2 $i = 0;
556              
557 1         86 warn "Term tagging ($offset_tagged_corpus_name)\n";
558              
559 1 50       138 open TAGGEDCORPUS, ">$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";
560              
561 1         5 binmode(TAGGEDCORPUS, ":utf8");
562              
563              
564 1 50       4 if (!defined $termField) {
565 1         2 $termField = 0;
566             }
567              
568             # XXX - ABREVIATION - XXX => regex
569             # warn "====> $caseSensitive\n";
570            
571 1         3 foreach $counter (keys %$ref_tabh_idtrm_select) {
572             # printf STDERR "Term tagging... %0.1f%%\r", ($i/$size_termselect)*100 ;
573 7         16 $term_regex = $ref_regex_termlist->[$counter];
574             # warn "counter: $counter ($term_regex)\n";
575              
576 7         8 $termField2 = 0;
577 7 50       16 if (defined $ref_termlist->[$counter]->[$termField]) {
578 7         9 $termField2 = $termField;
579             }
580              
581 7         8 foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
  7         24  
582 12         21 $line = $ref_tabh_corpus->{$sent_id}->{'line'};
583 12         15 $offset = $ref_tabh_corpus->{$sent_id}->{'offset'};
584              
585             # warn "$line\n$term_regex\n";
586             # warn "$line\n$offset\n";
587              
588 12 100 33     494 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      66        
      33        
589             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+])(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
590             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
591             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+])(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
592 7         49 $currOffset = $offset+length($`)+length($+{before});
593 7         46 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset + length($+{term}));
594             }
595 12 100 33     344 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      66        
      33        
596             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/s)) ||
597             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
598             ($line =~ /^(?$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is))) {
599 1         2 $currOffset = $offset+length($`);
600 1         9 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset + length($+{term}));
601             }
602 12 50 33     328 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      33        
      33        
603             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+])(?$term_regex)$/s)) ||
604             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
605             ($line =~ /(?[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+])(?$term_regex)$/is))) {
606 0         0 $currOffset = $offset+length($`)+length($+{before});
607 0         0 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset+length($+{term}));
608             }
609 12 50 33     284 if ((((defined $caseSensitive) && (($caseSensitive == 0) || (length($ref_termlist->[$counter]->[$termField2]) <= $caseSensitive))) &&
      33        
      33        
      33        
      33        
      33        
610             ($line =~ /^(?$term_regex)$/s)) ||
611             (((!defined $caseSensitive) || ($caseSensitive < 0) || (length($ref_termlist->[$counter]->[$termField2]) > $caseSensitive)) &&
612             ($line =~ /^(?$term_regex)$/is))) {
613 0         0 $currOffset = $offset+length($`);
614 0         0 print_brat_output(\*TAGGEDCORPUS, \$termId, $+{term}, $currOffset, $currOffset+length($+{term}));
615             }
616             }
617 7         11 $i++;
618             }
619 1         42 print STDERR "\n";
620              
621 1         127 close TAGGEDCORPUS;
622             #########################################################################################################
623 1         176 warn "\nEnd of term tagging\n";
624             }
625              
626             sub print_brat_output() {
627 8     8 1 28 my ($descriptor, $termId, $matching_term, $start_offset, $end_offset) = @_;
628              
629 8         47 print $descriptor "T$$termId\tterm $start_offset $end_offset\t$matching_term\n";
630 8         17 $$termId++;
631             }
632              
633              
634             sub printMatchingTerm_tab() {
635 0     0 1   my ($ref_matching_term, $term, $sent_id, $ref_tab_results) = @_;
636              
637 0           my $tmp_line = "";
638 0           my $tmp_key;
639              
640             # warn "\nOK: $term\n";
641             # warn "ref_matching_term: " . join ("\t", @$ref_matching_term) . "\n";
642              
643 0 0         if (ref($ref_tab_results) eq "ARRAY") {
644 0           $tmp_line .= "$sent_id\t";
645 0           $tmp_line .= join ("\t", @$ref_matching_term);
646 0           push @$ref_tab_results, $tmp_line;
647             # warn "tmp_line: $tmp_line\n";
648             } else {
649 0 0         if (ref($ref_tab_results) eq "HASH") {
650 0           my @tab_tmp;
651 0           $term =~ s/\\([\-\+\(\)\{\}])/$1/og;
652 0           $tmp_key .= $sent_id . "_";
653 0           $tmp_key .= $term;
654              
655 0           push @tab_tmp, $sent_id;
656 0           push @tab_tmp, @$ref_matching_term;
657 0           push @tab_tmp, $term;
658              
659             # warn "term_key: $tmp_key\n";
660             # if (!exists $ref_tab_results->{$tmp_key}) {
661 0 0         if (!exists($ref_tab_results->{$tmp_key})) {
662             # warn "!exists\n";
663 0           $ref_tab_results->{$tmp_key} = \@tab_tmp;
664             } else {
665             # warn "exists\n";
666             # push @{$ref_tab_results->{$tmp_key}}, @tab_tmp;
667 0 0         if (defined $tab_tmp[3]) {
668 0           $ref_tab_results->{$tmp_key}->[3] .= ";" . $tab_tmp[3];
669             } else {
670 0           $ref_tab_results->{$tmp_key}->[3] .= ";";
671             }
672             }
673             # warn "tab_tmp: " . join ("\t", @{$ref_tab_results->{$tmp_key}}) . "\n";
674              
675             # } else {
676             # foreach $refmatch (@{$ref_tab_results->{$tmp_key}}) {
677            
678             # }
679             # }
680             }
681             }
682             }
683              
684             1;
685              
686             __END__