File Coverage

blib/lib/Lingua/BioYaTeA/PostProcessing.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Lingua::BioYaTeA::PostProcessing;
2              
3 1     1   41413 use strict;
  1         2  
  1         38  
4 1     1   4 use warnings;
  1         3  
  1         37  
5 1     1   1166 use utf8;
  1         12  
  1         6  
6              
7             =encoding utf8
8              
9             =head1 NAME
10              
11             Lingua::BioYaTeA::PostProcessing - Perl extension for postprocessing BioYaTeA term extraction.
12              
13             =head1 SYNOPSIS
14              
15             use Lingua::BioYaTeA::PostProcessing;
16              
17             my $postProc = Lingua::BioYaTeA::PostProcessing->new(
18             {
19             'input-file' => "sampleEN-output.xml",
20             'output-file' => "sampleEN-bioyatea-out-pp.xml",
21             'configuration' => "post-processing-filtering.conf",
22             }
23             );
24             $postProc->logfile(dirname($postProc->output_file) . '/term-filtering.log');
25             $postProc->load_configuration;
26             $postProc->defineTwigParser;
27             $postProc->filtering;
28             $postProc->printResume;
29              
30              
31             =head1 DESCRIPTION
32              
33             The module implements an extension for the post-processing of the
34             BioYaTeA (C output. Currently, the XML BioYaTeA
35             output is filtered according to rules in order to remove non relevant
36             extracted terms.
37              
38             The input and output files are in the XML YaTeA format.
39              
40             The configuration file provides patterns related to various types:
41             inflected forms (C
) or lemmatized forms (C) of terms or
42             term components and action to perform. Currently only the C
43             action (to remove terms) is implemented.
44              
45              
46             =head1 METHODS
47              
48             =head2 new()
49              
50             new(\%options);
51              
52             The method creates a post-processing component of BioYaTeA and sets
53             the option attribute with the hashtable C<@options>, and returns the
54             created object.
55              
56             The hashtable C<@options> contains several fields: the input file name
57             C, the output file name C, the configuration
58             file name C and the temporary directory name
59             C.
60              
61             Other attributes are: the C parser C, the
62             counter of term candidates C, the counter of rejected
63             terms C, the list of regular expressions used to
64             identify terms to reject C, the indication whether the
65             application of each regular expression is case insensitive
66             C, the log file handler C, the output file
67             handler C, and the log file name C.
68              
69             C is a hashtable where keys are C and values are an
70             array of regular expressions.
71              
72             C is a hashtable where keys are regular expressions.
73              
74             =head2 tc_counter()
75              
76             tc_counter($tc_counter);
77              
78             This method sets the attribute C with the value
79             C<$tc_counter> and returns it. When no argument is given, the value of
80             the attribute C is return.
81              
82             =head2 logfh()
83              
84             logfh($logfh);
85              
86             This method sets the attribute C with the handler
87             C<$logfh> and returns it. When no argument is given, the value of
88             the attribute C is return.
89              
90             =head2 outfh()
91              
92             outfh();
93              
94             This method sets the attribute C with the handler
95             C<$outfh> and returns it. When no argument is given, the value of
96             the attribute C is return.
97              
98             =head2 count_rejected()
99              
100             count_rejected($count_rejected);
101              
102             This method sets the attribute C with the value
103             C<$count_rejected> and returns it. When no argument is given, the
104             value of the attribute C is return.
105              
106             =head2 case_insensitive()
107              
108             case_insensitive(\%case_insensitive);
109              
110             This method sets the attribute C with the hashtable
111             C<%case_insensitive> and returns it. When no argument is given, the
112             hashtable reference of the attribute C is return.
113              
114             =head2 case_insensitive_elt()
115              
116             case_insensitive_elt($case_insensitive_name, case_insensitive_value);
117              
118             This method sets the indication whether the regular expression
119             C<$case_insensitive_name> is case insensitive or not (value
120             C<$case_insensitive_value>) in the hashtable referred by the attribute
121             C and returns it. When one argument is set, the
122             value associated to the regular expression C<$case_insensitive_name>
123             is return. When no argument is given, an undefined value is return.
124              
125             =head2 exists_case_insensitive_elt()
126              
127             exists_case_insensitive_elt($case_insensitive_name);
128              
129             The method indicates if the application of the regular expression
130             C<$case_insensitive_name> is case insensitive or not.
131              
132             =head2 options()
133              
134             options(\%options);
135              
136             This method sets the attribute C with the hashtable
137             C<%options> and returns it. When no argument is given, the
138             hashtable reference of the attribute C is return.
139              
140             =head2 configuration()
141              
142             configuration($configuration);
143              
144             This method sets the attribute C with the value
145             C<$configuration> and returns it. When no argument is given, the
146             value of the attribute C is return.
147              
148             =head2 input_file()
149              
150             input_file($input_file);
151              
152             This method sets the field C of the attribute C
153             with the value C<$input_file> (input file name) and returns it. When
154             no argument is given, the value of the field C of the
155             attribute C is return.
156              
157             =head2 logfile()
158              
159             logfile($logfile);
160              
161             This method sets the field C of the attribute C
162             with the value C<$log_file> (log file name) and returns it. When no
163             argument is given, the value of the field C of the attribute
164             C is return.
165              
166             =head2 tmp_dir()
167              
168             tmp_dir($tmp_dir);
169              
170             This method sets the field C of the attribute C
171             with the value C<$output_file> (output file name) and returns it. When
172             no argument is given, the value of the field C of the
173             attribute C is return.
174              
175             =head2 output_file()
176              
177             output_file($output_file);
178              
179             This method sets the field C of the attribute C
180             with the value C<$output_file> (output file name) and returns it. When
181             no argument is given, the value of the field C of the
182             attribute C is return.
183              
184             =head2 reg_exps()
185              
186             reg_exps(\%reg_exps);
187              
188             This method sets the attribute C with the hashtable
189             C<%reg_exps> and returns it. When no argument is given, the hashtable
190             reference of the attribute C is return.
191              
192             =head2 reg_exp_elt()
193              
194             reg_exp_elt($reg_exp_name, $reg_exp_value);
195              
196             This method adds the regular expression C<$reg_exp_value> to the
197             array related to the type of patterns C<$reg_exp_name> and returns
198             it. When one argument is set, the array referred by C<$reg_exp_name>
199             is return. When no argument is given, a reference to an empty array is
200             return.
201              
202             =head2 twig_parser()
203              
204             twig_parser($twig_parser);
205              
206             This method sets the attribute C with the C
207             parser C<$twig_parser> and returns it. When no argument is given, the
208             value of the attribute C is return.
209              
210             =head2 defineTwigParser()
211              
212             defineTwigParser();
213              
214             The method defines the C parser and associates to the
215             object.
216              
217             =head2 processTerms()
218              
219             processTerms($twig_parser,$data);
220              
221             The function processes terms which match regular expressions by
222             applying associated actions (as defined in the configuration file, for
223             instance). The terms are in XML tree C<$data>.
224              
225             Note: this is a function which uses in the C parser (called
226             as function pointer).
227              
228             =head2 load_configuration()
229              
230             load_configuration();
231              
232             The method process and loads the configuration file (set in the
233             attribute C of the current object). The attributes
234             C and C are set by this method.
235              
236             =head2 filtering()
237              
238             filtering();
239              
240             The method performs the full filtering of the terms:
241              
242             =over
243              
244             =item setting of the temporary file if not defined
245              
246             =item opening the XML output file
247              
248             =item setting the C parser
249              
250             =item processing of the XML input file in order to apply action
251             associated to the regular expressions
252              
253             =back
254              
255             =head2 printResume()
256              
257             printResume();
258              
259             The method prints the number of rejected terms and the number of
260             remaining candidate terms.
261              
262             =head2 rmlog()
263              
264             rmlog();
265              
266             The method deletes the log file.
267              
268             =head1 CONFIGURATION FILE FORMAT
269              
270             The configuration file defines the action to perform when an
271             associated regular expression matches a term form. For instance:
272              
273             C
274              
275             Each line defines an association between an action (only C for
276             the moment) and a regular expression to apply to a form of a term
277             (C for the inflected form, C for the lemmatised form).
278              
279             The action and regular expression parts are separated by the character
280             C<=>. The two elements of the regular expression are separated by two
281             collons (C<::>).
282              
283             Comments are introduced by a C<#> character at the begin of the line.
284              
285             =head1 SEE ALSO
286              
287             Documentation of Lingua::YaTeA
288              
289             =head1 AUTHORS
290              
291             Wiktoria Golik , Zorana Ratkovic , Robert Bossy , Claire Nédellec , Thierry Hamon
292              
293             =head1 LICENSE
294              
295             Copyright (C) 2012 Wiktoria Golik, Zorana Ratkovic, Robert Bossy, Claire Nédellec and Thierry Hamon
296              
297             This library is free software; you can redistribute it and/or modify
298             it under the same terms as Perl itself, either Perl version 5.8.6 or,
299             at your option, any later version of Perl 5 you may have available.
300              
301              
302             =cut
303              
304 1     1   804 use Lingua::BioYaTeA::TwigXML;
  0            
  0            
305             # use XML::Twig;
306              
307             our $VERSION='0.1';
308              
309             sub new {
310             my ($class, $options) = @_;
311              
312             my $this = {
313             'options' => {},
314             'twig_parser' => undef,
315             'tc_counter' => 0,
316             'count_rejected' => 0,
317             'reg_exps' => {},
318             'case_insensitive' => {},
319             'logfh' => undef,
320             'outfh' => undef,
321             'logfile' => undef,
322             };
323              
324             bless $this, $class;
325              
326             $this->options($options);
327              
328             return($this);
329             }
330              
331             sub tc_counter {
332             my ($self, $tc_counter) = @_;
333              
334             if (defined $tc_counter) {
335             $self->{'tc_counter'} = $tc_counter;
336             }
337             return($self->{'tc_counter'});
338             }
339              
340             sub logfh {
341             my ($self, $logfh) = @_;
342              
343             if (defined $logfh) {
344             $self->{'logfh'} = $logfh;
345             }
346             return($self->{'logfh'});
347             }
348              
349             sub outfh {
350             my ($self, $outfh) = @_;
351              
352             if (defined $outfh) {
353             $self->{'outfh'} = $outfh;
354             }
355             return($self->{'outfh'});
356             }
357              
358             sub count_rejected {
359             my ($self, $count_rejected) = @_;
360              
361             if (defined $count_rejected) {
362             $self->{'count_rejected'} = $count_rejected;
363             }
364             return($self->{'count_rejected'});
365             }
366              
367             sub case_insensitive {
368             my ($self, $case_insensitive) = @_;
369              
370             if (defined $case_insensitive) {
371             %{$self->{'case_insensitive'}} = %$case_insensitive;
372             }
373             return($self->{'case_insensitive'});
374             }
375              
376             sub case_insensitive_elt {
377             my $self = shift;
378              
379             my $case_insensitive_name;
380             my $case_insensitive_value;
381              
382             if (scalar(@_) >= 2) {
383             ($self, $case_insensitive_name, $case_insensitive_value) = @_;
384             $self->{'case_insensitive'}->{$case_insensitive_name} = $case_insensitive_value;
385             } else {
386             if (scalar(@_) == 1) {
387             $case_insensitive_name = $_[0];
388             return($self->{'case_insensitive'}->{$case_insensitive_name});
389             }
390             }
391             return(undef);
392             }
393              
394              
395             sub exists_case_insensitive_elt {
396             my ($self, $case_insensitive_name) = @_;
397              
398             return(exists($self->{'case_insensitive'}->{$case_insensitive_name}));
399             }
400              
401             sub options {
402             my ($self, $options) = @_;
403              
404             if (defined $options) {
405             %{$self->{'options'}} = %$options;
406             }
407             return($self->{'options'});
408             }
409              
410             sub configuration {
411             my ($self, $configuration) = @_;
412              
413             if (defined $configuration) {
414             $self->options->{'configuration'} = $configuration;
415             }
416             return($self->options->{'configuration'});
417             }
418              
419             sub input_file {
420             my ($self, $input_file) = @_;
421              
422             if (defined $input_file) {
423             $self->options->{'input-file'} = $input_file;
424             }
425             return($self->options->{'input-file'});
426             }
427              
428             sub logfile {
429             my ($self, $logfile) = @_;
430              
431             if (defined $logfile) {
432             $self->options->{'logfile'} = $logfile;
433             }
434             return($self->options->{'logfile'});
435             }
436              
437             sub tmp_dir {
438             my ($self, $tmp_dir) = @_;
439              
440             if (defined $tmp_dir) {
441             $self->options->{'tmp-dir'} = $tmp_dir;
442             }
443             return($self->options->{'tmp-dir'});
444             }
445              
446             sub output_file {
447             my ($self, $output_file) = @_;
448              
449             if (defined $output_file) {
450             $self->options->{'output-file'} = $output_file;
451             }
452             return($self->options->{'output-file'});
453             }
454              
455             sub reg_exps {
456             my ($self, $reg_exps) = @_;
457              
458             if (defined $reg_exps) {
459             %{$self->{'reg_exps'}} = %$reg_exps;
460             }
461             return($self->{'reg_exps'});
462             }
463              
464             sub reg_exp_elt {
465             my $self = shift;
466             my $name;
467             my $value;
468              
469             if (scalar(@_) >= 2) {
470             ($name, $value) = @_;
471             # if (!defined $self->reg_exps->{$name}) {
472             # $self->reg_exps->{$name} = [];
473             # }
474              
475             push @{$self->reg_exps->{$name}}, $value;
476             return($self->reg_exps->{$name});
477             # warn $self->reg_exps->{$name} . "\n";
478             } else {
479             if (scalar(@_) == 1) {
480             $name = $_[0];
481             return($self->reg_exps->{$name});
482             }
483             }
484             return([]);
485             }
486              
487              
488             sub twig_parser {
489             my ($self, $twig_parser) = @_;
490              
491             if (defined $twig_parser) {
492             $self->{'twig_parser'} = $twig_parser;
493             }
494             return($self->{'twig_parser'});
495             }
496              
497             sub _printOptions {
498             my ($self, $fh) = @_;
499             my $option;
500             my %options = %{$self->options};
501              
502             if (!defined $fh) {
503             $fh = \*stdout;
504             }
505              
506             print $fh "\nOptions: \n";
507             foreach $option (keys %options) {
508             print $fh "\t$option: " . $options{$option} . "\n";
509             }
510             print $fh "\n";
511             }
512              
513             sub defineTwigParser {
514             my ($self) = @_;
515              
516             my $start_handlers = {
517             'TERM_CANDIDATE' => \&processTerms,
518             };
519             my $twig_parser = Lingua::BioYaTeA::TwigXML->new(TwigHandlers => $start_handlers,
520             keep_spaces_in => [''],
521             pretty_print => 'indented',
522             load_DTD=>0,
523             keep_encoding=>1
524             );
525             $twig_parser->objectSelf($self);
526             $self->twig_parser($twig_parser);
527             }
528              
529             sub processTerms {
530             my ($twig_parser,$data) = @_;
531              
532             my $field;
533             my $regs_a;
534             my $reg;
535             my $term_data;
536             my $dismissed = 0;
537              
538             # my $twig_parser = $self->twig_parser;
539              
540             # warn "\n$twig_parser\n";
541              
542             my $self = $twig_parser->objectSelf;
543              
544             my $logfh = $self->logfh;
545             my $outfh = $self->outfh;
546              
547             # warn "\n$self\n";
548             # warn "\n". $self->tc_counter . "\n";
549             $self->tc_counter($self->tc_counter + 1);
550             #print "Data ", $data->child(0)->text(), " ", scalar(keys %{$self->reg_exps}), "\t";
551              
552             mainloop:
553             while (($field, $regs_a) = each (%{$self->reg_exps})) {
554             $term_data = $data->first_child_text($field);
555             #print "Filter '$term_data'\t";
556              
557             foreach $reg (@$regs_a) {
558             if($self->exists_case_insensitive_elt($reg)) {
559             if($term_data =~ /$reg/i) {
560             print $logfh $term_data;
561             print $logfh "\t(i) " . $field;
562             print $logfh "\t" . $reg . "\n";
563             $self->count_rejected($self->count_rejected + 1);
564             $data->set_att("DISMISSED"=>"TRUE");
565             $dismissed = 1;
566             #return;
567             last mainloop;
568             }
569             } else {
570             # print "\n\t/$reg/";
571              
572             # warn "\n$reg:\n";
573             if($term_data =~ /$reg/) {
574             print $logfh $term_data;
575             print $logfh "\t" . $field;
576             print $logfh "\t" . $reg . "\n";
577             $self->count_rejected($self->count_rejected + 1);
578             $data->set_att("DISMISSED"=>"TRUE");
579             $dismissed = 1;
580             #return;
581             last mainloop;
582             # return;
583             }
584             }
585             }
586             }
587             if ($dismissed) {
588             $data->set_att("DISMISSED"=>"TRUE");
589             #print " DISMISSED\n";
590             } else {
591             $data->set_att("DISMISSED"=>"FALSE");
592             #print " not dismissed\n";
593             }
594            
595             $twig_parser->flush($outfh);
596              
597             }
598              
599              
600             sub load_configuration {
601             my ($self) = @_;
602              
603             my $line;
604             my $num_line=0;
605             my $name;
606             my $value;
607              
608             open (CONFIG, "<" . $self->configuration) || die "cannot open configutation file : " . $self->configuration . "\n";
609             while ($line = ) {
610             $num_line++;
611             if(($line !~ /^\s*#/) && ($line !~ /^\s*$/)) {
612             chomp $line;
613             $line =~ s/\s+$//;
614             # warn "$line\n";
615             if($line =~ /^\s*CLEAN=([^:]+)::(.+)$/) {
616             $name = $1;
617             $value = $2;
618             if($value =~ /^\/(.+)\/(i)$/) {
619             $self->case_insensitive_elt($1,$2);
620             $self->reg_exp_elt($name, $1);
621             # $self->reg_exp_elt($name, $2);
622            
623             } else {
624             $value =~ /^\/(.+)\/$/;
625             $self->reg_exp_elt($name, $1);
626             }
627             } else {
628             print STDERR "Invalid line in configuration file, line: " . $num_line . "\n";
629             }
630             }
631             }
632             # print Dumper(\%reg_exp);
633             print STDERR "Configuration file loaded\n";
634             # exit;
635             }
636              
637             sub filtering {
638             my ($self) = @_;
639              
640             my ($second,$minute,$hour,$day,$month,$year,$weekday,$yearday,$isdailysavingtime) = localtime(time);
641             $year += 1900;
642              
643             if (!defined $self->logfile) {
644             $self->logfile($self->tmp_dir . '/term-filtering-tmp-' . "date-$year-$month-${day}_${hour}_$minute" . '.log');
645             }
646             warn "openning " . $self->logfile . "\n";
647             open (LOG, ">>" . $self->logfile) || die "Cannot open " . $self->logfile . "\n";
648             $self->logfh(\*LOG);
649              
650             open(OUT, ">>".$self->output_file) || die "Cannot open " . $self->output_file . "\n";
651             $self->outfh(\*OUT);
652              
653             warn "start parsing of " . $self->input_file . "\n";
654              
655             $self->twig_parser->{'SELF'} = $self;
656             $self->twig_parser->parsefile($self->input_file);
657              
658             $self->twig_parser->flush(\*OUT);
659             close LOG;
660             close OUT;
661             return(1);
662             }
663              
664             sub printResume {
665             my ($self) = @_;
666              
667             print STDERR $self->tc_counter . " terms at the beginning\n";
668             print STDERR ($self->tc_counter - $self->count_rejected) . " term candidate after filtering\n";
669             }
670              
671             sub rmlog {
672             my ($self) = @_;
673              
674             unlink($self->logfile);
675             }
676              
677             1;