| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Lingua::YaTeA::TestifiedTerm; | 
| 2 | 5 |  |  | 5 |  | 36 | use strict; | 
|  | 5 |  |  |  |  | 12 |  | 
|  | 5 |  |  |  |  | 138 |  | 
| 3 | 5 |  |  | 5 |  | 29 | use warnings; | 
|  | 5 |  |  |  |  | 13 |  | 
|  | 5 |  |  |  |  | 136 |  | 
| 4 | 5 |  |  | 5 |  | 27 | use UNIVERSAL; | 
|  | 5 |  |  |  |  | 11 |  | 
|  | 5 |  |  |  |  | 25 |  | 
| 5 | 5 |  |  | 5 |  | 133 | use NEXT; | 
|  | 5 |  |  |  |  | 11 |  | 
|  | 5 |  |  |  |  | 43 |  | 
| 6 | 5 |  |  | 5 |  | 155 | use Scalar::Util qw(blessed); | 
|  | 5 |  |  |  |  | 16 |  | 
|  | 5 |  |  |  |  | 8211 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | our $id = 0; | 
| 9 |  |  |  |  |  |  | our $VERSION=$Lingua::YaTeA::VERSION; | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | sub new | 
| 12 |  |  |  |  |  |  | { | 
| 13 | 1 |  |  | 1 | 1 | 393 | my ($class_or_object,$num_content_words,$words_a,$tag_set,$source,$match_type) = @_; | 
| 14 |  |  |  |  |  |  |  | 
| 15 | 1 |  |  |  |  | 3 | my $this = shift; | 
| 16 |  |  |  |  |  |  |  | 
| 17 | 1 | 50 |  |  |  | 5 | $this = bless {}, $this unless ref $this; | 
| 18 | 1 |  |  |  |  | 3 | $this->{ID} = $id; | 
| 19 | 1 |  |  |  |  | 3 | $this->{IF} = (); | 
| 20 | 1 |  |  |  |  | 2 | $this->{POS} = (); | 
| 21 | 1 |  |  |  |  | 3 | $this->{LF} = (); | 
| 22 | 1 |  |  |  |  | 4 | $this->{SOURCE} = []; | 
| 23 | 1 |  |  |  |  | 3 | $this->{WORDS} = []; | 
| 24 | 1 |  |  |  |  | 75 | $this->{REG_EXP} = (); | 
| 25 | 1 |  |  |  |  | 6 | $this->{FOUND} = 0; | 
| 26 | 1 |  |  |  |  | 3 | $this->{OCCURRENCES} = []; | 
| 27 | 1 |  |  |  |  | 13 | $this->{INDEX_SET} = Lingua::YaTeA::IndexSet->new; | 
| 28 | 1 |  |  |  |  | 10 | $this->buildLinguisticInfos($words_a,$tag_set); | 
| 29 | 1 |  |  |  |  | 2 | push @{$this->getSource}, split /,/,$source; | 
|  | 1 |  |  |  |  | 13 |  | 
| 30 | 1 |  |  |  |  | 15 | $this->buildRegularExpression($match_type); | 
| 31 | 1 |  |  |  |  | 2 | $this->setIndexSet(scalar @{$this->getWords}); | 
|  | 1 |  |  |  |  | 4 |  | 
| 32 | 1 |  |  |  |  | 23 | $this->NEXT::new(@_); | 
| 33 | 1 |  |  |  |  | 6 | return $this; | 
| 34 |  |  |  |  |  |  | } | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | sub isInLexicon | 
| 37 |  |  |  |  |  |  | { | 
| 38 | 10 |  |  | 10 | 1 | 28 | my ($this,$filtering_lexicon_h,$match_type) = @_; | 
| 39 | 10 |  |  |  |  | 17 | my $lex_item; | 
| 40 |  |  |  |  |  |  |  | 
| 41 | 10 |  |  |  |  | 17 | foreach $lex_item (@{$this->getWords}) | 
|  | 10 |  |  |  |  | 28 |  | 
| 42 |  |  |  |  |  |  | { | 
| 43 | 13 | 50 |  |  |  | 30 | if($match_type eq "loose") # look at IF or LF | 
| 44 |  |  |  |  |  |  | { | 
| 45 | 13 | 50 | 66 |  |  | 49 | if( | 
| 46 |  |  |  |  |  |  | (!exists $filtering_lexicon_h->{lc($lex_item->getIF)}) | 
| 47 |  |  |  |  |  |  | && | 
| 48 |  |  |  |  |  |  | (!exists $filtering_lexicon_h->{lc($lex_item->getLF)}) | 
| 49 |  |  |  |  |  |  | ) | 
| 50 |  |  |  |  |  |  | { | 
| 51 |  |  |  |  |  |  | # current word does not appear in the corpus : testified term won't be loaded | 
| 52 | 7 |  |  |  |  | 38 | return 0; | 
| 53 |  |  |  |  |  |  | } | 
| 54 |  |  |  |  |  |  | } | 
| 55 |  |  |  |  |  |  | else | 
| 56 |  |  |  |  |  |  | { | 
| 57 | 0 | 0 |  |  |  | 0 | if($match_type eq "strict") # look at IF and POS | 
| 58 |  |  |  |  |  |  | { | 
| 59 | 0 | 0 |  |  |  | 0 | if (!exists $filtering_lexicon_h->{lc($lex_item->getIF)."~".$lex_item->getPOS}) | 
| 60 |  |  |  |  |  |  | { | 
| 61 |  |  |  |  |  |  | # current word does not appear in the corpus : testified term won't be loaded | 
| 62 | 0 |  |  |  |  | 0 | return 0; | 
| 63 |  |  |  |  |  |  | } | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | } | 
| 66 |  |  |  |  |  |  | else | 
| 67 |  |  |  |  |  |  | { | 
| 68 |  |  |  |  |  |  | # default match: look at IF | 
| 69 | 0 | 0 |  |  |  | 0 | if(!exists $filtering_lexicon_h->{lc($lex_item->getIF)}) | 
| 70 |  |  |  |  |  |  | { | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | # current word does not appear in the corpus : testified term won't be loaded | 
| 73 | 0 |  |  |  |  | 0 | return 0; | 
| 74 |  |  |  |  |  |  | } | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | } | 
| 77 |  |  |  |  |  |  | } | 
| 78 |  |  |  |  |  |  | } | 
| 79 | 3 |  |  |  |  | 13 | return 1; | 
| 80 |  |  |  |  |  |  | } | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | sub buildLinguisticInfos | 
| 84 |  |  |  |  |  |  | { | 
| 85 | 2 |  |  | 2 | 1 | 5 | my ($this,$lex_items_a,$tag_set) = @_; | 
| 86 |  |  |  |  |  |  |  | 
| 87 | 2 |  |  |  |  | 9 | my $lex; | 
| 88 |  |  |  |  |  |  | my $IF; | 
| 89 | 2 |  |  |  |  | 0 | my $POS; | 
| 90 | 2 |  |  |  |  | 0 | my $LF; | 
| 91 | 2 |  |  |  |  | 8 | my %prep = ("of"=>"of", "to"=>"to"); | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  |  | 
| 94 | 2 |  |  |  |  | 6 | foreach $lex (@$lex_items_a) | 
| 95 |  |  |  |  |  |  | { | 
| 96 | 4 | 50 | 33 |  |  | 32 | if ((blessed($lex)) && ($lex->isa("Lingua::YaTeA::LexiconItem"))) | 
| 97 |  |  |  |  |  |  | { | 
| 98 | 4 |  |  |  |  | 20 | $IF .= $lex->getIF . " " ; | 
| 99 |  |  |  |  |  |  | #if (exists $prep{$lex->getLF}) | 
| 100 | 4 | 50 |  |  |  | 12 | if ($tag_set->existTag('PREPOSITIONS',$lex->getIF)) | 
| 101 |  |  |  |  |  |  | { | 
| 102 | 0 |  |  |  |  | 0 | $POS .= $lex->getLF . " "; | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  | else | 
| 105 |  |  |  |  |  |  | { | 
| 106 | 4 |  |  |  |  | 10 | $POS .= $lex->getPOS . " "; | 
| 107 |  |  |  |  |  |  | } | 
| 108 | 4 |  |  |  |  | 12 | $LF .= $lex->getLF . " " ; | 
| 109 | 4 |  |  |  |  | 8 | push @{$this->getWords}, $lex; | 
|  | 4 |  |  |  |  | 11 |  | 
| 110 |  |  |  |  |  |  | } | 
| 111 |  |  |  |  |  |  | else | 
| 112 |  |  |  |  |  |  | { | 
| 113 | 0 |  |  |  |  | 0 | die "problem: " . $lex . "\n"; | 
| 114 |  |  |  |  |  |  | } | 
| 115 |  |  |  |  |  |  | } | 
| 116 | 2 |  |  |  |  | 14 | $IF =~ s/\s+$//; | 
| 117 | 2 |  |  |  |  | 9 | $POS =~ s/\s+$//; | 
| 118 | 2 |  |  |  |  | 8 | $LF =~ s/\s+$//; | 
| 119 | 2 |  |  |  |  | 10 | $this->setIF($IF); | 
| 120 | 2 |  |  |  |  | 8 | $this->setPOS($POS); | 
| 121 | 2 |  |  |  |  | 8 | $this->setLF($LF); | 
| 122 |  |  |  |  |  |  | } | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | sub getWords | 
| 125 |  |  |  |  |  |  | { | 
| 126 | 39 |  |  | 39 | 1 | 88 | my ($this) = @_; | 
| 127 | 39 |  |  |  |  | 131 | return $this->{WORDS}; | 
| 128 |  |  |  |  |  |  | } | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | sub setIF | 
| 131 |  |  |  |  |  |  | { | 
| 132 | 2 |  |  | 2 | 1 | 6 | my ($this,$new) = @_; | 
| 133 | 2 |  |  |  |  | 5 | $this->{IF} = $new; | 
| 134 |  |  |  |  |  |  | } | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | sub setPOS | 
| 137 |  |  |  |  |  |  | { | 
| 138 | 2 |  |  | 2 | 1 | 6 | my ($this,$new) = @_; | 
| 139 | 2 |  |  |  |  | 3 | $this->{POS} = $new; | 
| 140 |  |  |  |  |  |  | } | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | sub setLF | 
| 143 |  |  |  |  |  |  | { | 
| 144 | 2 |  |  | 2 | 1 | 11 | my ($this,$new) = @_; | 
| 145 | 2 |  |  |  |  | 10 | $this->{LF} = $new; | 
| 146 |  |  |  |  |  |  | } | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | sub getIF | 
| 149 |  |  |  |  |  |  | { | 
| 150 | 9 |  |  | 9 | 1 | 34 | my ($this) = @_; | 
| 151 | 9 |  |  |  |  | 31 | return $this->{IF}; | 
| 152 |  |  |  |  |  |  | } | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | sub getPOS | 
| 155 |  |  |  |  |  |  | { | 
| 156 | 1 |  |  | 1 | 1 | 3 | my ($this) = @_; | 
| 157 | 1 |  |  |  |  | 5 | return $this->{POS}; | 
| 158 |  |  |  |  |  |  | } | 
| 159 |  |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | sub getLF | 
| 161 |  |  |  |  |  |  | { | 
| 162 | 0 |  |  | 0 | 1 | 0 | my ($this) = @_; | 
| 163 | 0 |  |  |  |  | 0 | return $this->{LF}; | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | sub getID | 
| 167 |  |  |  |  |  |  | { | 
| 168 | 15 |  |  | 15 | 1 | 32 | my ($this) = @_; | 
| 169 | 15 |  |  |  |  | 79 | return $this->{ID}; | 
| 170 |  |  |  |  |  |  | } | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | sub buildKey | 
| 174 |  |  |  |  |  |  | { | 
| 175 | 1 |  |  | 1 | 1 | 3 | my ($this) = @_; | 
| 176 | 1 |  |  |  |  | 6 | my $key = $this->{"IF"} . "~" . $this->{"POS"} . "~" . $this->{"LF"}; | 
| 177 | 1 |  |  |  |  | 4 | return $key; | 
| 178 |  |  |  |  |  |  | } | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | sub getSource | 
| 181 |  |  |  |  |  |  | { | 
| 182 | 6 |  |  | 6 | 1 | 18 | my ($this) = @_; | 
| 183 | 6 |  |  |  |  | 28 | return $this->{SOURCE}; | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | sub buildRegularExpression | 
| 187 |  |  |  |  |  |  | { | 
| 188 | 1 |  |  | 1 | 1 | 4 | my ($this,$match_type) = @_; | 
| 189 | 1 |  |  |  |  | 3 | my $frontier = "\(\\n\\<\\/\?FRONTIER ID=\[0\-9\]\+ TT=\[0\-9\]\+\\>\)\*"; | 
| 190 | 1 |  |  |  |  | 4 | my $reg_exp = $frontier . "\?"; | 
| 191 | 1 |  |  |  |  | 2 | my $lex; | 
| 192 |  |  |  |  |  |  |  | 
| 193 | 1 | 50 |  |  |  | 4 | if($match_type eq "loose") # IF or LF | 
| 194 |  |  |  |  |  |  | { | 
| 195 | 1 |  |  |  |  | 3 | foreach $lex (@{$this->getWords}) | 
|  | 1 |  |  |  |  | 3 |  | 
| 196 |  |  |  |  |  |  | { | 
| 197 | 2 |  |  |  |  | 7 | $reg_exp .= "\(\(\\n".quotemeta($lex->getIF) . "\\t\[\^\\t\]\+\\t\[\^\\t\]\+\)\|\(\\n\[\^\\t\]\+\\t\[\^\\t\]\+\\t". quotemeta($lex->getLF) . "\)\)" . $frontier; | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  | else | 
| 202 |  |  |  |  |  |  | { | 
| 203 | 0 | 0 |  |  |  | 0 | if($match_type eq "strict") # IF and POS | 
| 204 |  |  |  |  |  |  | { | 
| 205 | 0 |  |  |  |  | 0 | foreach $lex (@{$this->getWords}) | 
|  | 0 |  |  |  |  | 0 |  | 
| 206 |  |  |  |  |  |  | { | 
| 207 | 0 |  |  |  |  | 0 | $reg_exp .= "\\n".quotemeta($lex->getIF) . "\\t".quotemeta($lex->getPOS) ."\\t\[\^\\t\]\+" . $frontier; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  | } | 
| 210 |  |  |  |  |  |  | else | 
| 211 |  |  |  |  |  |  | { | 
| 212 | 0 |  |  |  |  | 0 | foreach $lex (@{$this->getWords}) # IF | 
|  | 0 |  |  |  |  | 0 |  | 
| 213 |  |  |  |  |  |  | { | 
| 214 | 0 |  |  |  |  | 0 | $reg_exp .= "\\n".quotemeta($lex->getIF) . "\\t\[\^\\t\]\+\\t\[\^\\t\]\+" . $frontier; | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  | } | 
| 217 |  |  |  |  |  |  | } | 
| 218 | 1 |  |  |  |  | 3 | $reg_exp .= "\\n"; | 
| 219 | 1 |  |  |  |  | 4 | $this->{REG_EXP} = $reg_exp; | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | sub getRegExp | 
| 224 |  |  |  |  |  |  | { | 
| 225 | 3 |  |  | 3 | 1 | 8 | my ($this) = @_; | 
| 226 | 3 |  |  |  |  | 8 | return $this->{REG_EXP}; | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | sub getWord | 
| 230 |  |  |  |  |  |  | { | 
| 231 | 22 |  |  | 22 | 1 | 38 | my ($this,$index) = @_; | 
| 232 | 22 |  |  |  |  | 58 | return $this->getWords->[$index]; | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | } | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | sub addOccurrence | 
| 237 |  |  |  |  |  |  | { | 
| 238 | 3 |  |  | 3 | 1 | 11 | my ($this,$phrase_occurrence,$phrase,$key,$fh) = @_; | 
| 239 | 3 |  |  |  |  | 11 | my $start_offset; | 
| 240 |  |  |  |  |  |  | my $end_offset; | 
| 241 | 3 |  |  |  |  | 0 | my $testified_occurrence; | 
| 242 | 3 |  |  |  |  | 13 | my @index = split(/-/,$key); | 
| 243 | 3 |  |  |  |  | 18 | ($start_offset,$end_offset) = $this->getPositionInPhrase($phrase,\@index,$fh); | 
| 244 | 3 |  |  |  |  | 13 | $testified_occurrence = Lingua::YaTeA::Occurrence->new; | 
| 245 | 3 |  |  |  |  | 13 | $testified_occurrence->setInfoForTestifiedTerm($phrase_occurrence->getSentence,$phrase_occurrence->getStartChar + $start_offset, $phrase_occurrence->getEndChar - $end_offset); | 
| 246 | 3 |  |  |  |  | 8 | push @{$this->{OCCURRENCES}}, $testified_occurrence; | 
|  | 3 |  |  |  |  | 14 |  | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | sub getPositionInPhrase | 
| 250 |  |  |  |  |  |  | { | 
| 251 | 3 |  |  | 3 | 1 | 20 | my ($this,$phrase,$index_a,$fh) = @_; | 
| 252 | 3 |  |  |  |  | 12 | my @before; | 
| 253 |  |  |  |  |  |  | my @after; | 
| 254 | 3 |  |  |  |  | 0 | my $index; | 
| 255 | 3 |  |  |  |  | 6 | my $start_offset = 0; | 
| 256 | 3 |  |  |  |  | 6 | my $end_offset = 0; | 
| 257 |  |  |  |  |  |  | #print $fh $index_a->[0] . "\n"; | 
| 258 |  |  |  |  |  |  | #print $fh $index_a->[$#$index_a] . "\n"; | 
| 259 | 3 |  |  |  |  | 14 | for ($index = 0; $index < $index_a->[0]; $index++) | 
| 260 |  |  |  |  |  |  | { | 
| 261 | 2 |  |  |  |  | 8 | push @before, $index; | 
| 262 |  |  |  |  |  |  | } | 
| 263 | 3 |  |  |  |  | 26 | for ($index = $index_a->[$#$index_a] +1; $index < $phrase->getIndexSet->getSize; $index++) | 
| 264 |  |  |  |  |  |  | { | 
| 265 | 7 |  |  |  |  | 18 | push @after, $index; | 
| 266 |  |  |  |  |  |  | } | 
| 267 |  |  |  |  |  |  |  | 
| 268 | 3 |  |  |  |  | 9 | foreach $index (@before) | 
| 269 |  |  |  |  |  |  | { | 
| 270 | 2 |  |  |  |  | 6 | $start_offset += $phrase->getWord($index)->getLength +1; | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  |  | 
| 273 | 3 |  |  |  |  | 14 | foreach $index (@after) | 
| 274 |  |  |  |  |  |  | { | 
| 275 | 7 |  |  |  |  | 19 | $end_offset += $phrase->getWord($index)->getLength +1; | 
| 276 |  |  |  |  |  |  | } | 
| 277 |  |  |  |  |  |  |  | 
| 278 | 3 |  |  |  |  | 9 | return ($start_offset,$end_offset); | 
| 279 |  |  |  |  |  |  | } | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | sub setIndexSet | 
| 282 |  |  |  |  |  |  | { | 
| 283 | 1 |  |  | 1 | 1 | 3 | my ($this,$size) = @_; | 
| 284 | 1 |  |  |  |  | 2 | my $i = 0; | 
| 285 | 1 |  |  |  |  | 5 | while ($i < $size) | 
| 286 |  |  |  |  |  |  | { | 
| 287 | 2 |  |  |  |  | 9 | $this->getIndexSet->addIndex($i); | 
| 288 | 2 |  |  |  |  | 7 | $i++; | 
| 289 |  |  |  |  |  |  | } | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | } | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | sub getIndexSet | 
| 294 |  |  |  |  |  |  | { | 
| 295 | 4 |  |  | 4 | 1 | 10 | my ($this) = @_; | 
| 296 | 4 |  |  |  |  | 26 | return $this->{INDEX_SET}; | 
| 297 |  |  |  |  |  |  | } | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  | sub getOccurrences | 
| 300 |  |  |  |  |  |  | { | 
| 301 | 0 |  |  | 0 | 1 |  | my ($this) = @_; | 
| 302 | 0 |  |  |  |  |  | return $this->{OCCURRENCES}; | 
| 303 |  |  |  |  |  |  | } | 
| 304 |  |  |  |  |  |  |  | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | 1; | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  |  | 
| 309 |  |  |  |  |  |  | __END__ |