| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Lingua::Stem::UniNE::CS; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 2 |  |  | 2 |  | 47105 | use v5.8.1; | 
|  | 2 |  |  |  |  | 8 |  | 
|  | 2 |  |  |  |  | 108 |  | 
| 4 | 2 |  |  | 2 |  | 12 | use utf8; | 
|  | 2 |  |  |  |  | 9 |  | 
|  | 2 |  |  |  |  | 18 |  | 
| 5 | 2 |  |  | 2 |  | 44 | use strict; | 
|  | 2 |  |  |  |  | 5 |  | 
|  | 2 |  |  |  |  | 75 |  | 
| 6 | 2 |  |  | 2 |  | 10 | use warnings; | 
|  | 2 |  |  |  |  | 28 |  | 
|  | 2 |  |  |  |  | 77 |  | 
| 7 | 2 |  |  | 2 |  | 1789 | use parent 'Exporter'; | 
|  | 2 |  |  |  |  | 680 |  | 
|  | 2 |  |  |  |  | 11 |  | 
| 8 | 2 |  |  | 2 |  | 1833 | use Unicode::CaseFold qw( fc ); | 
|  | 2 |  |  |  |  | 2417 |  | 
|  | 2 |  |  |  |  | 213 |  | 
| 9 | 2 |  |  | 2 |  | 2398 | use Unicode::Normalize qw( NFC ); | 
|  | 2 |  |  |  |  | 5537 |  | 
|  | 2 |  |  |  |  | 5161 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | our $VERSION   = '0.08'; | 
| 12 |  |  |  |  |  |  | our @EXPORT_OK = qw( stem stem_cs stem_aggressive stem_cs_aggressive ); | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | *stem_cs            = \&stem; | 
| 15 |  |  |  |  |  |  | *stem_cs_aggressive = \&stem_aggressive; | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | sub stem { | 
| 18 | 78 |  |  | 78 | 0 | 243 | my ($word) = @_; | 
| 19 |  |  |  |  |  |  |  | 
| 20 | 2 |  |  | 2 |  | 154 | $word = NFC fc $word; | 
|  | 2 |  |  |  |  | 4 |  | 
|  | 2 |  |  |  |  | 37 |  | 
|  | 78 |  |  |  |  | 236 |  | 
| 21 | 78 |  |  |  |  | 76614 | $word = remove_case($word); | 
| 22 | 78 |  |  |  |  | 176 | $word = remove_possessive($word); | 
| 23 |  |  |  |  |  |  |  | 
| 24 | 78 |  |  |  |  | 342 | return $word; | 
| 25 |  |  |  |  |  |  | } | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | sub stem_aggressive { | 
| 28 | 5 |  |  | 5 | 0 | 42 | my ($word) = @_; | 
| 29 |  |  |  |  |  |  |  | 
| 30 | 5 |  |  |  |  | 21 | $word = NFC fc $word; | 
| 31 | 5 |  |  |  |  | 79 | $word = remove_case($word); | 
| 32 | 5 |  |  |  |  | 15 | $word = remove_possessive($word); | 
| 33 | 5 |  |  |  |  | 17 | $word = remove_comparative($word); | 
| 34 | 5 |  |  |  |  | 15 | $word = remove_diminutive($word); | 
| 35 | 5 |  |  |  |  | 14 | $word = remove_augmentative($word); | 
| 36 | 5 |  |  |  |  | 14 | $word = remove_derivational($word); | 
| 37 |  |  |  |  |  |  |  | 
| 38 | 5 |  |  |  |  | 26 | return $word; | 
| 39 |  |  |  |  |  |  | } | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | # remove grammatical case endings from nouns and adjectives | 
| 42 |  |  |  |  |  |  | sub remove_case { | 
| 43 | 83 |  |  | 83 | 0 | 127 | my ($word) = @_; | 
| 44 | 83 |  |  |  |  | 166 | my $length = length $word; | 
| 45 |  |  |  |  |  |  |  | 
| 46 | 83 | 100 |  |  |  | 199 | if ($length > 7) { | 
| 47 | 8 | 100 |  |  |  | 37 | return $word | 
| 48 |  |  |  |  |  |  | if $word =~ s{ atech $}{}x; | 
| 49 |  |  |  |  |  |  | } | 
| 50 |  |  |  |  |  |  |  | 
| 51 | 82 | 100 |  |  |  | 161 | if ($length > 6) { | 
| 52 | 22 | 100 |  |  |  | 120 | return $word | 
| 53 |  |  |  |  |  |  | if $word =~ s{ atům $}{}x; | 
| 54 |  |  |  |  |  |  |  | 
| 55 | 21 | 100 |  |  |  | 64 | return palatalize($word) | 
| 56 |  |  |  |  |  |  | if $word =~ s{ (?<= ě ) tem $}{}x;  # -ětem → -ě | 
| 57 |  |  |  |  |  |  | } | 
| 58 |  |  |  |  |  |  |  | 
| 59 | 80 | 100 |  |  |  | 161 | if ($length > 5) { | 
| 60 | 43 | 100 |  |  |  | 1174 | return $word | 
| 61 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 62 |  |  |  |  |  |  | ými     # -ými | 
| 63 |  |  |  |  |  |  | | am[ai]  # -ama -ami | 
| 64 |  |  |  |  |  |  | | at[ay]  # -ata -aty | 
| 65 |  |  |  |  |  |  | | ov[éi]  # -ové -ovi | 
| 66 |  |  |  |  |  |  | | [áý]ch  # -ách -ých | 
| 67 |  |  |  |  |  |  | ) $}{}x; | 
| 68 |  |  |  |  |  |  |  | 
| 69 | 34 | 100 |  |  |  | 223 | return palatalize($word) | 
| 70 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 71 |  |  |  |  |  |  | (?<= [eě]  ) t[ei]  # -ete -eti -ěte -ěti → -e -ě | 
| 72 |  |  |  |  |  |  | | (?<= [éi]  ) mu     # -ému -imu           → -é -i | 
| 73 |  |  |  |  |  |  | | (?<= [eií] ) ch     # -ech -ich -ích      → -e -i -í | 
| 74 |  |  |  |  |  |  | | (?<= [eěí] ) mi     # -emi -ěmi -ími      → -e -ě -í | 
| 75 |  |  |  |  |  |  | | (?<= [éií] ) ho     # -ého -iho -ího      → -é -i -í | 
| 76 |  |  |  |  |  |  | ) $}{}x; | 
| 77 |  |  |  |  |  |  | } | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 56 | 100 |  |  |  | 116 | if ($length > 4) { | 
| 80 | 39 | 100 |  |  |  | 197 | return $word | 
| 81 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 82 |  |  |  |  |  |  | at      # -at | 
| 83 |  |  |  |  |  |  | | mi      # -mi | 
| 84 |  |  |  |  |  |  | | us      # -us | 
| 85 |  |  |  |  |  |  | | o[su]   # -os -ou | 
| 86 |  |  |  |  |  |  | | [áůý]m  # -ám -ům -ým | 
| 87 |  |  |  |  |  |  | ) $}{}x; | 
| 88 |  |  |  |  |  |  |  | 
| 89 | 30 | 100 |  |  |  | 173 | return palatalize($word) | 
| 90 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 91 |  |  |  |  |  |  | es          # -es | 
| 92 |  |  |  |  |  |  | | [éí]m       # -ém -ím | 
| 93 |  |  |  |  |  |  | | (?<= e ) m  # -em → -e | 
| 94 |  |  |  |  |  |  | ) $}{}x; | 
| 95 |  |  |  |  |  |  | } | 
| 96 |  |  |  |  |  |  |  | 
| 97 | 40 | 100 |  |  |  | 89 | if ($length > 3) { | 
| 98 | 38 | 100 |  |  |  | 257 | return $word | 
| 99 |  |  |  |  |  |  | if $word =~ s{ [aáéouůyý] $}{}x; | 
| 100 |  |  |  |  |  |  |  | 
| 101 | 27 | 100 |  |  |  | 131 | return palatalize($word) | 
| 102 |  |  |  |  |  |  | if $word =~ m{ [eěií] $}x; | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  |  | 
| 105 | 4 |  |  |  |  | 14 | return $word; | 
| 106 |  |  |  |  |  |  | } | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | # remove possesive endings from names | 
| 109 |  |  |  |  |  |  | sub remove_possessive { | 
| 110 | 83 |  |  | 83 | 0 | 112 | my ($word) = @_; | 
| 111 |  |  |  |  |  |  |  | 
| 112 | 83 | 100 |  |  |  | 286 | return $word | 
| 113 |  |  |  |  |  |  | if length $word < 6; | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 9 | 100 |  |  |  | 43 | return $word | 
| 116 |  |  |  |  |  |  | if $word =~ s{ [oů]v $}{}x;  # -ov -ův | 
| 117 |  |  |  |  |  |  |  | 
| 118 | 7 | 100 |  |  |  | 35 | return palatalize($word) | 
| 119 |  |  |  |  |  |  | if $word =~ s{ (?<= i ) n $}{}x;  # -in → -i | 
| 120 |  |  |  |  |  |  |  | 
| 121 | 6 |  |  |  |  | 15 | return $word; | 
| 122 |  |  |  |  |  |  | } | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | sub remove_comparative { | 
| 125 | 5 |  |  | 5 | 0 | 9 | my ($word) = @_; | 
| 126 |  |  |  |  |  |  |  | 
| 127 | 5 | 100 |  |  |  | 22 | return $word | 
| 128 |  |  |  |  |  |  | if length $word < 6; | 
| 129 |  |  |  |  |  |  |  | 
| 130 | 1 | 50 |  |  |  | 5 | return palatalize($word) | 
| 131 |  |  |  |  |  |  | if $word =~ s{ (?<= [eě] ) jš $}{}x;  # -ejš -ějš → -e -ě | 
| 132 |  |  |  |  |  |  |  | 
| 133 | 1 |  |  |  |  | 3 | return $word; | 
| 134 |  |  |  |  |  |  | } | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | sub remove_diminutive { | 
| 137 | 5 |  |  | 5 | 0 | 6 | my ($word) = @_; | 
| 138 | 5 |  |  |  |  | 9 | my $length = length $word; | 
| 139 |  |  |  |  |  |  |  | 
| 140 | 5 | 50 |  |  |  | 13 | if ($length > 7) { | 
| 141 | 0 | 0 |  |  |  | 0 | return $word | 
| 142 |  |  |  |  |  |  | if $word =~ s{ oušek $}{}x; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  |  | 
| 145 | 5 | 50 |  |  |  | 14 | if ($length > 6) { | 
| 146 |  |  |  |  |  |  | # -aček -áček -anek -ánek -oček -onek -uček -unek | 
| 147 | 0 | 0 |  |  |  | 0 | return $word | 
| 148 |  |  |  |  |  |  | if $word =~ s{ [aáou][čn]ek $}{}x; | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | # -eček -éček -enek -ének -iček -íček -inek -ínek → -e -é -i -í | 
| 151 | 0 | 0 |  |  |  | 0 | return palatalize($word) | 
| 152 |  |  |  |  |  |  | if $word =~ s{ (?<= [eéií] ) [čn]ek $}{}x; | 
| 153 |  |  |  |  |  |  | } | 
| 154 |  |  |  |  |  |  |  | 
| 155 | 5 | 100 |  |  |  | 14 | if ($length > 5) { | 
| 156 |  |  |  |  |  |  | # -ačk -áčk -ank -ánk -átk -očk -onk -učk -unk -ušk | 
| 157 | 1 | 50 |  |  |  | 4 | return $word | 
| 158 |  |  |  |  |  |  | if $word =~ s{ (?: [aáou][čn] | át | uš ) k $}{}x; | 
| 159 |  |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | # -ečk -éčk -enk -énk -ičk -íčk -ink -ínk | 
| 161 | 1 | 50 |  |  |  | 5 | return palatalize($word) | 
| 162 |  |  |  |  |  |  | if $word =~ s{ [eéií][čn]k $}{}x; | 
| 163 |  |  |  |  |  |  | } | 
| 164 |  |  |  |  |  |  |  | 
| 165 | 5 | 100 |  |  |  | 11 | if ($length > 4) { | 
| 166 |  |  |  |  |  |  | # -ak -ák -ok -uk → -a -á -o -u | 
| 167 | 1 | 50 |  |  |  | 4 | return $word | 
| 168 |  |  |  |  |  |  | if $word =~ s{ (?<= [aáou] ) k $}{}x; | 
| 169 |  |  |  |  |  |  |  | 
| 170 |  |  |  |  |  |  | # -ek -ék -ik -ík → -e -é -i -í | 
| 171 | 1 | 50 |  |  |  | 4 | return palatalize($word) | 
| 172 |  |  |  |  |  |  | if $word =~ s{ (?<= [eéií] ) k $}{}x; | 
| 173 |  |  |  |  |  |  | } | 
| 174 |  |  |  |  |  |  |  | 
| 175 | 5 | 100 |  |  |  | 12 | if ($length > 3) { | 
| 176 | 2 | 50 |  |  |  | 8 | return $word | 
| 177 |  |  |  |  |  |  | if $word =~ s{ k $}{}x; | 
| 178 |  |  |  |  |  |  | } | 
| 179 |  |  |  |  |  |  |  | 
| 180 | 5 |  |  |  |  | 11 | return $word; | 
| 181 |  |  |  |  |  |  | } | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | sub remove_augmentative { | 
| 184 | 5 |  |  | 5 | 0 | 8 | my ($word) = @_; | 
| 185 | 5 |  |  |  |  | 10 | my $length = length $word; | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 5 | 50 |  |  |  | 12 | if ($length > 6) { | 
| 188 | 0 | 0 |  |  |  | 0 | return $word | 
| 189 |  |  |  |  |  |  | if $word =~ s{ ajzn $}{}x; | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  |  | 
| 192 | 5 | 100 |  |  |  | 13 | if ($length > 5) { | 
| 193 | 1 | 50 |  |  |  | 8 | return palatalize($word) | 
| 194 |  |  |  |  |  |  | if $word =~ s{ (?<= i ) (?: sk | zn ) $}{}x;  # -isk -izn → -i | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  |  | 
| 197 | 5 | 100 |  |  |  | 12 | if ($length > 4) { | 
| 198 | 1 | 50 |  |  |  | 5 | return $word | 
| 199 |  |  |  |  |  |  | if $word =~ s{ ák $}{}x; | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 5 |  |  |  |  | 12 | return $word; | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | sub remove_derivational { | 
| 206 | 5 |  |  | 5 | 0 | 8 | my ($word) = @_; | 
| 207 | 5 |  |  |  |  | 8 | my $length = length $word; | 
| 208 |  |  |  |  |  |  |  | 
| 209 | 5 | 50 |  |  |  | 12 | if ($length > 8) { | 
| 210 | 0 | 0 |  |  |  | 0 | return $word | 
| 211 |  |  |  |  |  |  | if $word =~ s{ obinec $}{}x; | 
| 212 |  |  |  |  |  |  | } | 
| 213 |  |  |  |  |  |  |  | 
| 214 | 5 | 50 |  |  |  | 13 | if ($length > 7) { | 
| 215 |  |  |  |  |  |  | # -ovisk -ovišt -ovník -ovstv | 
| 216 | 0 | 0 |  |  |  | 0 | return $word | 
| 217 |  |  |  |  |  |  | if $word =~ s{ ov (?: isk | išt | ník | stv ) $}{}x; | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | # -ionář → -i | 
| 220 | 0 | 0 |  |  |  | 0 | return palatalize($word) | 
| 221 |  |  |  |  |  |  | if $word =~ s{ (?<= i ) onář $}{}x; | 
| 222 |  |  |  |  |  |  | } | 
| 223 |  |  |  |  |  |  |  | 
| 224 | 5 | 50 |  |  |  | 10 | if ($length > 6) { | 
| 225 | 0 | 0 |  |  |  | 0 | return $word | 
| 226 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 227 |  |  |  |  |  |  | ásek | loun | nost | štin | teln | | 
| 228 |  |  |  |  |  |  | ov (?: ec | ík | in | tv )  # -ovec -ovík -ovin -ovtv | 
| 229 |  |  |  |  |  |  | ) $}{}x; | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | # -enic -inec -itel → -e -i | 
| 232 | 0 | 0 |  |  |  | 0 | return palatalize($word) | 
| 233 |  |  |  |  |  |  | if $word =~ s{ (?: (?<= e ) nic | (?<= i ) (?: nec | tel ) ) $}{}x; | 
| 234 |  |  |  |  |  |  | } | 
| 235 |  |  |  |  |  |  |  | 
| 236 | 5 | 100 |  |  |  | 15 | if ($length > 5) { | 
| 237 | 1 | 50 |  |  |  | 5 | return $word | 
| 238 |  |  |  |  |  |  | if $word =~ s{ árn $}{}x; | 
| 239 |  |  |  |  |  |  |  | 
| 240 | 1 | 50 |  |  |  | 6 | return palatalize($word) | 
| 241 |  |  |  |  |  |  | if $word =~ s{ (?<= ě ) nk $}{}x;  # -ěnk → -ě | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | # -ián -isk -ist -išt -itb → -i -í | 
| 244 | 1 | 50 |  |  |  | 6 | return palatalize($word) | 
| 245 |  |  |  |  |  |  | if $word =~ s{ (?<= i ) (?: án | sk | st | št | tb ) $}{}x; | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | # -írn → -í | 
| 248 | 1 | 50 |  |  |  | 7 | return palatalize($word) | 
| 249 |  |  |  |  |  |  | if $word =~ s{ (?<= í ) rn $}{}x; | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | # -och -ost -oun -ouš -out -ovn | 
| 252 | 1 | 50 |  |  |  | 4 | return $word | 
| 253 |  |  |  |  |  |  | if $word =~ s{ o (?: ch | st | un | uš | ut | vn ) $}{}x; | 
| 254 |  |  |  |  |  |  |  | 
| 255 | 1 | 50 |  |  |  | 8 | return $word | 
| 256 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 257 |  |  |  |  |  |  | čan | ctv | kář | kyn | néř | ník | stv | ušk | 
| 258 |  |  |  |  |  |  | ) $}{}x; | 
| 259 |  |  |  |  |  |  | } | 
| 260 |  |  |  |  |  |  |  | 
| 261 | 5 | 100 |  |  |  | 17 | if ($length > 4) { | 
| 262 |  |  |  |  |  |  | # -ač -áč -an -án -ář -as | 
| 263 | 1 | 50 |  |  |  | 6 | return $word | 
| 264 |  |  |  |  |  |  | if $word =~ s{ (?: a[čns] | á[čnř] ) $}{}x; | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  | # -ec -en -ěn -éř -ic -in -it -iv -ín -íř → -e -ě -é -i -í | 
| 267 | 1 | 50 |  |  |  | 8 | return palatalize($word) | 
| 268 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 269 |  |  |  |  |  |  | (?<= e ) [cn] | 
| 270 |  |  |  |  |  |  | | (?<= ě ) n | 
| 271 |  |  |  |  |  |  | | (?<= é ) ř | 
| 272 |  |  |  |  |  |  | | (?<= i ) [cntv] | 
| 273 |  |  |  |  |  |  | | (?<= í ) [nř] | 
| 274 |  |  |  |  |  |  | ) $}{}x; | 
| 275 |  |  |  |  |  |  |  | 
| 276 |  |  |  |  |  |  | # -čk -čn -dl -nk -ob -oň -ot -ov -tk -tv -ul -vk -yn | 
| 277 | 1 | 50 |  |  |  | 8 | return $word | 
| 278 |  |  |  |  |  |  | if $word =~ s{ (?: | 
| 279 |  |  |  |  |  |  | č[kn] | o[bňtv] | t[kv] | [du]l | [nv]k | yn | 
| 280 |  |  |  |  |  |  | ) $}{}x; | 
| 281 |  |  |  |  |  |  | } | 
| 282 |  |  |  |  |  |  |  | 
| 283 | 5 | 100 |  |  |  | 12 | if ($length > 3) { | 
| 284 | 2 | 100 |  |  |  | 15 | return $word | 
| 285 |  |  |  |  |  |  | if $word =~ s{ [cčklnt] $}{}x; | 
| 286 |  |  |  |  |  |  | } | 
| 287 |  |  |  |  |  |  |  | 
| 288 | 4 |  |  |  |  | 9 | return $word; | 
| 289 |  |  |  |  |  |  | } | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | sub palatalize { | 
| 292 | 49 |  |  | 49 | 0 | 90 | my ($word) = @_; | 
| 293 |  |  |  |  |  |  |  | 
| 294 | 49 | 100 | 100 |  |  | 1440 | return $word | 
|  |  |  | 100 |  |  |  |  | 
|  |  |  | 100 |  |  |  |  | 
| 295 |  |  |  |  |  |  | if $word =~ s{ čt[ěií]  $}{ck}x  # -čtě -čti -čtí  → -ck | 
| 296 |  |  |  |  |  |  | || $word =~ s{ št[ěií]  $}{sk}x  # -ště -šti -ští  → -sk | 
| 297 |  |  |  |  |  |  | || $word =~ s{ [cč][ei] $}{k}x   # -ce -ci -če -či → -k | 
| 298 |  |  |  |  |  |  | || $word =~ s{ [zž][ei] $}{h}x;  # -ze -zi -že -ži → -h | 
| 299 |  |  |  |  |  |  |  | 
| 300 | 30 |  |  |  |  | 50 | chop $word; | 
| 301 |  |  |  |  |  |  |  | 
| 302 | 30 |  |  |  |  | 96 | return $word; | 
| 303 |  |  |  |  |  |  | } | 
| 304 |  |  |  |  |  |  |  | 
| 305 |  |  |  |  |  |  | 1; | 
| 306 |  |  |  |  |  |  |  | 
| 307 |  |  |  |  |  |  | __END__ |