| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # | 
| 2 |  |  |  |  |  |  | # TestifiedTermParser.yp | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | #  used to generate Lingua::YaTeA::TestifiedTermParser.pm | 
| 5 |  |  |  |  |  |  | # | 
| 6 |  |  |  |  |  |  | #  Use: yapp -m 'Lingua::YaTeA::TestifiedTermParser' -o lib/Lingua/YaTeA/TestifiedTermParser.pm lib/Lingua/YaTeA/TestifiedTermParser.yp | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # Parse::Yapp input grammar for parsing the yatea testified terms | 
| 9 |  |  |  |  |  |  | # | 
| 10 |  |  |  |  |  |  | # | 
| 11 |  |  |  |  |  |  | # | 
| 12 |  |  |  |  |  |  | %{ | 
| 13 | 5 |  |  | 5 |  | 1215 | use Lingua::YaTeA; | 
|  | 5 |  |  |  |  | 7 |  | 
|  | 5 |  |  |  |  | 26 |  | 
| 14 | 5 |  |  | 5 |  | 118 | use Data::Dumper; | 
|  | 5 |  |  |  |  | 7 |  | 
|  | 5 |  |  |  |  | 181 |  | 
| 15 | 5 |  |  | 5 |  | 20 | use warnings; | 
|  | 5 |  |  |  |  | 7 |  | 
|  | 5 |  |  |  |  | 92 |  | 
| 16 | 5 |  |  | 5 |  | 20 | use UNIVERSAL; | 
|  | 5 |  |  |  |  | 6 |  | 
|  | 5 |  |  |  |  | 19 |  | 
| 17 | 5 |  |  | 5 |  | 139 | use Scalar::Util qw(blessed); | 
|  | 5 |  |  |  |  | 11 |  | 
|  | 5 |  |  |  |  | 8523 |  | 
| 18 |  |  |  |  |  |  | my @words; | 
| 19 |  |  |  |  |  |  | my $word; | 
| 20 |  |  |  |  |  |  | my $item; | 
| 21 |  |  |  |  |  |  | my @infos; | 
| 22 |  |  |  |  |  |  | my @IF; | 
| 23 |  |  |  |  |  |  | my @POS; | 
| 24 |  |  |  |  |  |  | my @LF; | 
| 25 |  |  |  |  |  |  | my $src; | 
| 26 |  |  |  |  |  |  | my @lex_items; | 
| 27 |  |  |  |  |  |  | my $testified; | 
| 28 |  |  |  |  |  |  | my $i; | 
| 29 |  |  |  |  |  |  | my $tree; | 
| 30 |  |  |  |  |  |  | my $node_set; | 
| 31 |  |  |  |  |  |  | my $node; | 
| 32 |  |  |  |  |  |  | my $edge; | 
| 33 |  |  |  |  |  |  | my $index = 0; | 
| 34 |  |  |  |  |  |  | my @uncomplete; | 
| 35 |  |  |  |  |  |  | my $level = 0; | 
| 36 |  |  |  |  |  |  | my $num_line =1; | 
| 37 |  |  |  |  |  |  | %} | 
| 38 |  |  |  |  |  |  | %% | 
|  |  |  |  |  |  |  | # First rule of the grammar: a possibly empty, left-recursive sequence of | 
|  |  |  |  |  |  |  | # "line" items. The action attached to "input line" contains only a | 
|  |  |  |  |  |  |  | # disabled debug print. | 
| 39 |  |  |  |  |  |  | input:  #empty | 
| 40 | 0 |  |  | 0 | 0 |  | |   input line  {  #print STDERR "\n INPUT  \n"; | 
|  |  |  |  | 0 |  |  |  | 
| 41 | 0 | 0 |  |  |  |  | } | 
| 42 |  |  |  |  |  |  | ; | 
| 43 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # One line of the testified-term file: either a lone '\n' token or a | 
|  |  |  |  |  |  |  | # complete "testified" entry. Both alternatives increment $num_line. | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | # For a testified entry the action: | 
|  |  |  |  |  |  |  | #   1. asks the TTS object (YYData->{TTS}) to fill $testified_infos and | 
|  |  |  |  |  |  |  | #      @lex_items from the @IF/@POS/@LF/$src values set by the "infos" rule; | 
|  |  |  |  |  |  |  | #   2. builds a MultiWordTestifiedTerm only when getTestifiedInfos returned 1 | 
|  |  |  |  |  |  |  | #      and more than one lexical item was produced; | 
|  |  |  |  |  |  |  | #   3. registers the term with the TTS object and attaches a tree built from | 
|  |  |  |  |  |  |  | #      the node set collected while parsing this line; | 
|  |  |  |  |  |  |  | #   4. resets the per-line parsing state. | 
| 44 |  |  |  |  |  |  | line:     '\n' { # print "VIDE: " . $_[1] | 
| 45 | 0 |  |  | 0 |  |  | $num_line++; | 
| 46 |  |  |  |  |  |  | } | 
| 47 |  |  |  |  |  |  | | testified { | 
| 48 | 0 |  |  | 0 |  |  | $num_line++; | 
| 49 | 0 |  |  |  |  |  | @lex_items = (); | 
| 50 | 0 |  |  |  |  |  | @words= (); | 
|  |  |  |  |  |  |  | # Deliberately local: hides the file-level $testified for this action. | 
| 51 | 0 |  |  |  |  |  | my $testified; | 
| 52 |  |  |  |  |  |  | # print STDERR "=>$_[1]\n"; | 
| 53 |  |  |  |  |  |  | my $testified_infos; | 
| 54 | 0 | 0 |  |  |  |  | if($_[0]->YYData->{TTS}->getTestifiedInfos(\$testified_infos,\@IF,\@POS,\@LF,$src,\@lex_items,$_[0]->YYData->{MATCH},$_[0]->YYData->{FILTERING_LEXICON},$_[0]->YYData->{TAGSET}) == 1) { | 
| 55 | 0 | 0 |  |  |  |  | if(scalar @lex_items > 1) { | 
| 56 | 0 |  |  |  |  |  | $testified = Lingua::YaTeA::MultiWordTestifiedTerm->new($testified_infos->{"NUM_CONTENT_WORDS"},\@lex_items,$_[0]->YYData->{TAGSET},$src,$_[0]->YYData->{MATCH}); | 
| 57 |  |  |  |  |  |  | } | 
| 58 |  |  |  |  |  |  | } | 
| 59 | 0 | 0 | 0 |  |  |  | if ((blessed($testified)) && ($testified->isa('Lingua::YaTeA::TestifiedTerm'))) { | 
| 60 |  |  |  |  |  |  | #print STDERR "ajout tt: " . $testified->getIF . "\n"; | 
| 61 | 0 |  |  |  |  |  | $_[0]->YYData->{TTS}->addTestified($testified); | 
| 62 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # $testified is known to be blessed here, so isa() can be called directly. | 
| 63 | 0 | 0 | 0 |  |  |  | if ($testified->isa('Lingua::YaTeA::MultiWordTestifiedTerm')) { | 
| 64 | 0 |  |  |  |  |  | $tree =  Lingua::YaTeA::Tree->new; | 
| 65 | 0 |  |  |  |  |  | $tree->setNodeSet($node_set); | 
| 66 |  |  |  |  |  |  | # $tree->print($testified_infos->{"WORDS"}); | 
| 67 | 0 |  |  |  |  |  | $tree->setIndexSet($testified->getIndexSet); | 
| 68 | 0 |  |  |  |  |  | $tree->setHead; | 
| 69 | 0 |  |  |  |  |  | $testified->addTree($tree); | 
| 70 | 0 |  |  |  |  |  | $testified->setParsingMethod("USER"); | 
| 71 |  |  |  |  |  |  | } | 
| 72 |  |  |  |  |  |  | } | 
| 73 |  |  |  |  |  |  | # print "fin creation :" . $testified->getIF . "\n"; | 
|  |  |  |  |  |  |  | # Reset the per-line state. The OPEN rule pushes the root node of every | 
|  |  |  |  |  |  |  | # line on @uncomplete and nothing pops it, so the stack must be emptied | 
|  |  |  |  |  |  |  | # here; otherwise it keeps one stale root node per parsed line. | 
|  |  |  |  |  |  |  | @uncomplete = (); | 
| 74 | 0 |  |  |  |  |  | $level = 0; | 
| 75 | 0 |  |  |  |  |  | $index = 0; | 
| 76 |  |  |  |  |  |  | } | 
| 77 |  |  |  |  |  |  | ; | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # A testified entry: OPEN parsing END infos. | 
|  |  |  |  |  |  |  | # The first action block runs before OPEN is parsed and installs a fresh | 
|  |  |  |  |  |  |  | # NodeSet in $node_set; the OPEN rule adds nodes to it and the "line" rule | 
|  |  |  |  |  |  |  | # hands it to the tree. The final action holds only a disabled debug print. | 
|  |  |  |  |  |  |  | # The error alternative calls YYErrok after the literal token. | 
|  |  |  |  |  |  |  | # NOTE(review): the lexer in this file does not appear to emit the | 
|  |  |  |  |  |  |  | # '\nTESTIFIED: ' token, so confirm this recovery rule is reachable. | 
| 80 |  |  |  |  |  |  | testified: { | 
| 81 | 0 |  |  | 0 |  |  | $node_set = Lingua::YaTeA::NodeSet->new; | 
| 82 |  |  |  |  |  |  | } | 
| 83 |  |  |  | 0 |  |  | OPEN parsing END infos { #print "trouve testified2 $_[1]\n"; | 
| 84 |  |  |  |  |  |  | } | 
| 85 | 0 |  |  | 0 |  |  | | error '\nTESTIFIED: ' { $_[0]->YYErrok } | 
| 86 |  |  |  |  |  |  | ; | 
| 87 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Decodes the value of the INFOS token into the file-level variables read | 
|  |  |  |  |  |  |  | # by the "line" rule. The value is split on tabs into @infos; fields 0, 1 | 
|  |  |  |  |  |  |  | # and 2 are split on single spaces into @IF, @POS and @LF, and field 3 is | 
|  |  |  |  |  |  |  | # stored unchanged in $src. | 
|  |  |  |  |  |  |  | # NOTE(review): presumably inflected forms, part-of-speech tags, lemmas and | 
|  |  |  |  |  |  |  | # source name; confirm against the testified term file format. Missing | 
|  |  |  |  |  |  |  | # fields are not checked here. | 
| 88 |  |  |  |  |  |  | infos: INFOS { | 
| 89 |  |  |  |  |  |  | # print "infos $_[1]\n"; | 
| 90 | 0 |  |  | 0 |  |  | @infos = split /\t/, $_[1]; | 
| 91 | 0 |  |  |  |  |  | @IF = split / /, $infos[0]; | 
| 92 | 0 |  |  |  |  |  | @POS = split / /, $infos[1]; | 
| 93 | 0 |  |  |  |  |  | @LF = split / /, $infos[2]; | 
| 94 | 0 |  |  |  |  |  | $src = $infos[3]; | 
| 95 |  |  |  |  |  |  | } | 
| 96 |  |  |  |  |  |  | ; | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Content of one bracketed group: two CANDIDATE items, optionally separated | 
|  |  |  |  |  |  |  | # by PREP DET, PREP, PREP PREP or DET. No alternative builds anything | 
|  |  |  |  |  |  |  | # itself; the work is done in the CANDIDATE, PREP and DET actions. | 
|  |  |  |  |  |  |  | # The two consecutive bars before the error alternative leave an empty | 
|  |  |  |  |  |  |  | # alternative, so "parsing" may also match nothing. | 
|  |  |  |  |  |  |  | # NOTE(review): confirm the empty alternative is intentional. | 
| 100 |  |  |  |  |  |  | parsing: CANDIDATE PREP DET CANDIDATE | 
| 101 |  |  |  |  |  |  | | CANDIDATE PREP CANDIDATE | 
| 102 |  |  |  |  |  |  | | CANDIDATE PREP PREP CANDIDATE | 
| 103 |  |  |  |  |  |  | | CANDIDATE DET CANDIDATE | 
| 104 |  |  |  | 0 |  |  | | CANDIDATE CANDIDATE { | 
| 105 |  |  |  |  |  |  | # print STDERR "PARSING $_[1]\n" | 
| 106 |  |  |  |  |  |  | } | 
| 107 |  |  |  |  |  |  | | | 
| 108 | 0 |  |  | 0 |  |  | | error '\nPARSING: ' { $_[0]->YYErrok } | 
| 109 |  |  |  |  |  |  | ; | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # A word followed by a P_STATUS marker. Stores a TermLeaf built from the | 
|  |  |  |  |  |  |  | # current $index under the "PREP" key of the current $node, then advances | 
|  |  |  |  |  |  |  | # $index. A later PREP in the same node overwrites the earlier one. | 
| 112 |  |  |  |  |  |  | PREP: WORD P_STATUS { | 
| 113 |  |  |  |  |  |  | # print STDERR "PREP $_[1] $_[2]\n"; | 
| 114 | 0 |  |  | 0 |  |  | $node->{"PREP"} = Lingua::YaTeA::TermLeaf->new($index); | 
| 115 | 0 |  |  |  |  |  | $index++; | 
| 116 |  |  |  |  |  |  | }; | 
| 117 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # A word followed by a D_STATUS marker. Stores a TermLeaf built from the | 
|  |  |  |  |  |  |  | # current $index under the "DET" key of the current $node, then advances | 
|  |  |  |  |  |  |  | # $index. | 
| 118 |  |  |  |  |  |  | DET: WORD D_STATUS { | 
| 119 |  |  |  |  |  |  | # print STDERR "DET $_[1] $_[2]\n"; | 
| 120 | 0 |  |  | 0 |  |  | $node->{"DET"} = Lingua::YaTeA::TermLeaf->new($index); | 
| 121 | 0 |  |  |  |  |  | $index++; | 
| 122 |  |  |  |  |  |  | }; | 
| 123 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # A candidate is either a single word with a C_STATUS marker or a nested | 
|  |  |  |  |  |  |  | # bracketed group (OPEN parsing CLOSE). | 
|  |  |  |  |  |  |  | # For a single word, a TermLeaf built from the current $index is added as | 
|  |  |  |  |  |  |  | # an edge of the current $node, with the status value ($_[2], M or H as | 
|  |  |  |  |  |  |  | # captured by the lexer), and $index advances. | 
|  |  |  |  |  |  |  | # For a nested group the node handling is done by the OPEN and CLOSE rules; | 
|  |  |  |  |  |  |  | # the action here holds only a disabled debug print. | 
| 124 |  |  |  |  |  |  | CANDIDATE: WORD C_STATUS{ | 
| 125 |  |  |  |  |  |  | # print STDERR "CANDIDATE1 $_[1] $_[2]\n"; | 
| 126 | 0 |  |  | 0 |  |  | $edge = Lingua::YaTeA::TermLeaf->new($index); | 
| 127 | 0 |  |  |  |  |  | $node->addEdge($edge,$_[2]); | 
| 128 |  |  |  |  |  |  | # print "ajout du edge :" ; | 
| 129 |  |  |  |  |  |  | # print Dumper($edge) . "\n"; | 
| 130 | 0 |  |  |  |  |  | $index++; | 
| 131 |  |  |  |  |  |  | } | 
| 132 |  |  |  | 0 |  |  | | OPEN parsing CLOSE { | 
| 133 |  |  |  |  |  |  | #print STDERR "CANDIDATE2 $_[1]\n"; | 
| 134 |  |  |  |  |  |  | } | 
| 135 | 0 |  |  | 0 |  |  | | error '\nCANDIDATE: ' { $_[0]->YYErrok } | 
| 136 |  |  |  |  |  |  | ; | 
| 137 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Opening bracket of a group. Creates the node for the group (a RootNode | 
|  |  |  |  |  |  |  | # at nesting level 0, an InternalNode at any deeper level), registers it | 
|  |  |  |  |  |  |  | # in the current node set, pushes it on the @uncomplete stack and goes one | 
|  |  |  |  |  |  |  | # nesting level deeper. $level++ stays last so the action value is unchanged. | 
| 138 |  |  |  |  |  |  | OPEN: OPEN_TAG { | 
| 139 |  |  |  |  |  |  | # print STDERR "OPEN $_[1]\n"; | 
|  |  |  |  |  |  |  | $node = ($level == 0) | 
|  |  |  |  |  |  |  | ? Lingua::YaTeA::RootNode->new($level) | 
|  |  |  |  |  |  |  | : Lingua::YaTeA::InternalNode->new($level); | 
| 148 | 0 |  |  |  |  |  | $node_set->addNode($node); | 
| 149 | 0 |  |  |  |  |  | push @uncomplete, $node; | 
| 150 | 0 |  |  |  |  |  | $level++; | 
| 151 |  |  |  |  |  |  | } | 
| 152 | 0 |  |  | 0 |  |  | |        error '\nOPEN: ' { $_[0]->YYErrok } | 
| 153 |  |  |  |  |  |  | ; | 
| 154 |  |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Closing bracket of the outermost group: a bare END_TAG with no status | 
|  |  |  |  |  |  |  | # marker. The action holds only a disabled debug print; unlike CLOSE it | 
|  |  |  |  |  |  |  | # does not pop @uncomplete or change $level. | 
| 156 |  |  |  | 0 |  |  | END: END_TAG { | 
| 157 |  |  |  |  |  |  | # print STDERR "END $_[1]\n"; | 
| 158 |  |  |  |  |  |  | } | 
| 159 | 0 |  |  | 0 |  |  | |        error '\nEND: ' { $_[0]->YYErrok } | 
| 160 |  |  |  |  |  |  | ; | 
| 161 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Closing bracket of a nested group: END_TAG followed by a C_STATUS marker. | 
|  |  |  |  |  |  |  | # Statement order matters: | 
|  |  |  |  |  |  |  | #   1. pop removes the node being closed from @uncomplete (it is still | 
|  |  |  |  |  |  |  | #      held in $node); | 
|  |  |  |  |  |  |  | #   2. that node is linked to its father, given the remaining stack and | 
|  |  |  |  |  |  |  | #      the status value $_[2]; | 
|  |  |  |  |  |  |  | #   3. $node becomes the new top of the stack, i.e. the enclosing group; | 
|  |  |  |  |  |  |  | #   4. the nesting level goes back up by one. | 
|  |  |  |  |  |  |  | # NOTE(review): presumably linkToFather uses the last element of the stack | 
|  |  |  |  |  |  |  | # as the father; confirm in the node class. | 
| 162 |  |  |  |  |  |  | CLOSE: END_TAG C_STATUS { | 
| 163 |  |  |  |  |  |  | # print STDERR "CLOSE_TAG $_[1] $_[2]\n"; | 
| 164 | 0 |  |  | 0 |  |  | pop @uncomplete; | 
| 165 | 0 |  |  |  |  |  | $node->linkToFather(\@uncomplete,$_[2]); | 
| 166 | 0 |  |  |  |  |  | $node = $uncomplete[$#uncomplete]; | 
| 167 | 0 |  |  |  |  |  | $level--; | 
| 168 |  |  |  |  |  |  | } | 
| 169 | 0 |  |  | 0 |  |  | |        error '\nCLOSE: ' { $_[0]->YYErrok } | 
| 170 | 0 |  |  |  |  |  | ; | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  |  | 
| 173 | 0 |  |  |  |  |  | %% | 
| 174 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Error callback for the parser. | 
|  |  |  |  |  |  |  | # If YYData holds an ERRMSG entry, that message is printed, the entry is | 
|  |  |  |  |  |  |  | # removed and the sub returns nothing. Otherwise a diagnostic is printed | 
|  |  |  |  |  |  |  | # on the currently selected output handle: the YYExpect value, the current | 
|  |  |  |  |  |  |  | # token and its value, a dump of the lexer reference and "Syntax error.". | 
|  |  |  |  |  |  |  | # Each label and value keeps its own print call, and the accessor results | 
|  |  |  |  |  |  |  | # are still concatenated (scalar context), so the output is unchanged. | 
| 175 |  |  |  |  |  |  | sub _Error { | 
|  |  |  |  |  |  |  | my $parser = $_[0]; | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | if (exists $parser->YYData->{ERRMSG}) { | 
|  |  |  |  |  |  |  | print $parser->YYData->{ERRMSG}; | 
|  |  |  |  |  |  |  | delete $parser->YYData->{ERRMSG}; | 
|  |  |  |  |  |  |  | return; | 
|  |  |  |  |  |  |  | } | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | print  "EXPECT: "; | 
|  |  |  |  |  |  |  | print $parser->YYExpect . "\n"; | 
|  |  |  |  |  |  |  | print  "CURTOK: "; | 
|  |  |  |  |  |  |  | print "-" . $parser->YYCurtok . "-\n"; | 
|  |  |  |  |  |  |  | print  "CURVAL: "; | 
|  |  |  |  |  |  |  | print $parser->YYCurval . "\n"; | 
|  |  |  |  |  |  |  | print  "Lexer: "; | 
|  |  |  |  |  |  |  | print Dumper($parser->YYLexer) . "\n"; | 
|  |  |  |  |  |  |  | print "Syntax error.\n"; | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Lexer callback for the parser. | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | # Reads the file handle stored in YYData->{FH} one line at a time, keeps | 
|  |  |  |  |  |  |  | # the unconsumed rest of the line in YYData->{INPUT}, and returns the next | 
|  |  |  |  |  |  |  | # (token, value) pair: | 
|  |  |  |  |  |  |  | #   OPEN_TAG  an opening parenthesis | 
|  |  |  |  |  |  |  | #   END_TAG   a closing parenthesis | 
|  |  |  |  |  |  |  | #   WORD      text matched by the caller-supplied YYData->{WORD} pattern | 
|  |  |  |  |  |  |  | #   C_STATUS  the letter of a <=M> or <=H> marker | 
|  |  |  |  |  |  |  | #   D_STATUS  the letter of a <=D> marker | 
|  |  |  |  |  |  |  | #   P_STATUS  the letter of a <=P> marker | 
|  |  |  |  |  |  |  | #   INFOS     everything after a leading tab up to the end of the line | 
|  |  |  |  |  |  |  | # Any other character is returned as a literal token (token and value are | 
|  |  |  |  |  |  |  | # both that character), which is how the grammar receives '\n'. | 
|  |  |  |  |  |  |  | # Returns ('', undef) at end of file. | 
|  |  |  |  |  |  |  | # | 
|  |  |  |  |  |  |  | # A closing parenthesis is always reported as END_TAG; the grammar builds | 
|  |  |  |  |  |  |  | # its CLOSE rule from END_TAG followed by C_STATUS. | 
|  |  |  |  |  |  |  | # NOTE(review): the WORD pattern is assumed to contain one capture group, | 
|  |  |  |  |  |  |  | # since its value is taken from $1; confirm with the caller. | 
| 193 |  |  |  |  |  |  | sub _Lexer { | 
| 194 | 0 |  |  | 0 |  |  | my($parser)=shift; | 
|  |  |  |  |  |  |  | my $data = $parser->YYData; | 
|  |  |  |  |  |  |  | my $fh = $data->{FH}; | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | my $open = '(\()'; | 
|  |  |  |  |  |  |  | my $word = $data->{WORD}; | 
|  |  |  |  |  |  |  | my $end = '(\))'; | 
|  |  |  |  |  |  |  | my $d_status = '<=(D)>'; | 
|  |  |  |  |  |  |  | my $p_status = '<=(P)>'; | 
|  |  |  |  |  |  |  | my $c_status = '<=([MH])>'; | 
|  |  |  |  |  |  |  | my $infos = '\t(.+)'; | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | while (1) { | 
|  |  |  |  |  |  |  | # Refill the buffer when it is exhausted. Test defined/length rather | 
|  |  |  |  |  |  |  | # than truth so that a remaining "0" is not mistaken for empty input. | 
|  |  |  |  |  |  |  | unless (defined $data->{INPUT} and length $data->{INPUT}) { | 
|  |  |  |  |  |  |  | $data->{INPUT} = <$fh>; | 
|  |  |  |  |  |  |  | defined $data->{INPUT} or return('',undef); | 
|  |  |  |  |  |  |  | } | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | # Drop a comment at the start of the buffer; the newline is kept. | 
|  |  |  |  |  |  |  | $data->{INPUT}=~s/^[ \t]*#.*//; | 
|  |  |  |  |  |  |  |  | 
|  |  |  |  |  |  |  | for ($data->{INPUT}) { | 
|  |  |  |  |  |  |  | s/^$open\s*// and return ('OPEN_TAG', $1); | 
|  |  |  |  |  |  |  | s/^$end// and return('END_TAG', $1); | 
|  |  |  |  |  |  |  | s/^$word\s*// and return ('WORD', $1); | 
|  |  |  |  |  |  |  | s/^$c_status\s*// and return ('C_STATUS', $1); | 
|  |  |  |  |  |  |  | s/^$d_status\s*// and return ('D_STATUS', $1); | 
|  |  |  |  |  |  |  | s/^$p_status\s*// and return ('P_STATUS', $1); | 
|  |  |  |  |  |  |  | s/^$infos\s*// and return('INFOS', $1); | 
|  |  |  |  |  |  |  | # Catch-all: hand back one character as a literal token. The | 
|  |  |  |  |  |  |  | # capture group is required, otherwise $1 is undefined here. | 
|  |  |  |  |  |  |  | s/^(.)//s  and return($1,$1); | 
|  |  |  |  |  |  |  | } | 
|  |  |  |  |  |  |  | # Nothing matched, so the buffer is empty: read the next line. | 
|  |  |  |  |  |  |  | } | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | =head1 NAME | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | Lingua::YaTeA::TestifiedTermParser - Perl extension for the parser of testified term file (based on Parse::Yapp) | 
| 232 |  |  |  |  |  |  |  | 
| 233 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | use Lingua::YaTeA::TestifiedTermParser; | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | my $fh = FileHandle->new("<$file_path"); | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | my $parser = Lingua::YaTeA::TestifiedTermParser->new(); | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | $parser->YYData->{TTS} = $this; | 
| 242 |  |  |  |  |  |  | $parser->YYData->{WORD} = $word_characters_regexp; | 
| 243 |  |  |  |  |  |  | $parser->YYData->{TAGSET} = $tag_set; | 
| 244 |  |  |  |  |  |  | $parser->YYData->{MATCH} = $match_type; | 
| 245 |  |  |  |  |  |  | $parser->YYData->{FH} = $fh; | 
| 246 |  |  |  |  |  |  | $parser->YYData->{FILTERING_LEXICON} = $filtering_lexicon_h; | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | $parser->YYParse(yylex => \&Lingua::YaTeA::TestifiedTermParser::_Lexer, yyerror => \&Lingua::YaTeA::TestifiedTermParser::_Error); | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | The module implements a parser for analysing testified term file. | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | The parser takes into account several pieces of information: the word | 
| 256 |  |  |  |  |  |  | character list (field C<WORD>), i.e. all the possible characters in a word, | 
| 257 |  |  |  |  |  |  | the Part-of-Speech tagset (field C<TAGSET>), the type of matching (field | 
| 258 |  |  |  |  |  |  | C<MATCH>), the file handle to read (field C<FH>), and the lexicon of | 
| 259 |  |  |  |  |  |  | the corpus (field C<FILTERING_LEXICON>). | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  | =head1 METHODS | 
| 262 |  |  |  |  |  |  |  | 
| 263 |  |  |  |  |  |  | =head2 _Error() | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | _Error($error_object); | 
| 266 |  |  |  |  |  |  |  | 
| 267 |  |  |  |  |  |  | The method is used to manage the parsing error and prints a message | 
| 268 |  |  |  |  |  |  | explaining the error. | 
| 269 |  |  |  |  |  |  |  | 
| 270 |  |  |  |  |  |  | =head2 _Lexer() | 
| 271 |  |  |  |  |  |  |  | 
| 272 |  |  |  |  |  |  | _Lexer($parser_info); | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | The method reads the data referenced by the structure | 
| 275 |  |  |  |  |  |  | C<$parser_info> (field C<FH>) and returns the next token to the parser. | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | Sophie Aubin and Thierry Hamon. Improving Term Extraction with | 
| 280 |  |  |  |  |  |  | Terminological Resources. In Advances in Natural Language Processing | 
| 281 |  |  |  |  |  |  | (5th International Conference on NLP, FinTAL 2006). pages | 
| 282 |  |  |  |  |  |  | 380-387. Tapio Salakoski, Filip Ginter, Sampo Pyysalo, Tapio Pahikkala | 
| 283 |  |  |  |  |  |  | (Eds). August 2006. LNAI 4139. | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  | =head1 AUTHOR | 
| 287 |  |  |  |  |  |  |  | 
| 288 |  |  |  |  |  |  | Thierry Hamon  and Sophie Aubin | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | Copyright (C) 2005 by Thierry Hamon and Sophie Aubin | 
| 293 |  |  |  |  |  |  |  | 
| 294 |  |  |  |  |  |  | This library is free software; you can redistribute it and/or modify | 
| 295 |  |  |  |  |  |  | it under the same terms as Perl itself, either Perl version 5.8.6 or, | 
| 296 |  |  |  |  |  |  | at your option, any later version of Perl 5 you may have available. | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | =cut |