| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# For Emacs: -*- mode:cperl; eval: (folding-mode 1); coding:utf-8 -*- |
|
2
|
|
|
|
|
|
|
package Lingua::HUN::Word2Num; |
|
3
|
|
|
|
|
|
|
# ABSTRACT: Word to number conversion in Hungarian |
|
4
|
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
136688
|
use 5.16.0; |
|
|
1
|
|
|
|
|
4
|
|
|
6
|
1
|
|
|
1
|
|
6
|
use utf8; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
15
|
|
|
7
|
1
|
|
|
1
|
|
37
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
77
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# {{{ use block |
|
10
|
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
1418
|
use Export::Attrs; |
|
|
1
|
|
|
|
|
12745
|
|
|
|
1
|
|
|
|
|
7
|
|
|
12
|
1
|
|
|
1
|
|
1411
|
use Parse::RecDescent; |
|
|
1
|
|
|
|
|
52866
|
|
|
|
1
|
|
|
|
|
8
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# }}} |
|
15
|
|
|
|
|
|
|
# {{{ var block |
|
16
|
|
|
|
|
|
|
our $VERSION = '0.2603300'; |
|
17
|
|
|
|
|
|
|
my $parser = hun_numerals(); |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# }}} |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# {{{ w2n convert text to number |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Convert a Hungarian number word to its numeric value.
#
# Takes the number as text.  Hyphens (used in million compounds) are
# dropped before the text is handed to the grammar's "numeral" rule.
# Returns whatever that rule yields; returns nothing when called without
# an argument or with undef.
sub w2n :Export {
    my ($text) = @_;
    return if !defined $text;

    # Parse a hyphen-free copy of the text.
    (my $compact = $text) =~ tr/-//d;

    return $parser->numeral($compact);
}
|
|
1
|
|
|
|
|
8
|
|
|
|
1
|
|
|
|
|
7
|
|
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# }}} |
|
32
|
|
|
|
|
|
|
# {{{ hun_numerals create parser for hungarian numerals |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Build the Parse::RecDescent parser for Hungarian cardinal number words.
#
# Rule layering, smallest to largest:
#   number  0-19 (the teens are single words)
#   tens    20, "huszon" + unit, and the round tens 30-90
#   deca    anything below 100
#   hecto   hundreds, with or without a multiplier and a remainder
#   hOd     hecto or deca (anything below 1000)
#   kilo    thousands, with or without a multiplier and a remainder
#   kOhOd   kilo or hOd
#   mega    millions; a bare "millió" counts as one million, mirroring
#           how bare "száz" and "ezer" are accepted
#
# The start rule "numeral" tries mega first, then kOhOd.  Its last
# alternative is an empty action -- presumably there so that unparsable
# text makes the rule fail (undef) instead of raising a parse error;
# confirm against the Parse::RecDescent documentation.
#
# Returns the new Parse::RecDescent object.
sub hun_numerals {
    return Parse::RecDescent->new(q{

      numeral:  mega
             |  kOhOd
             |  { }

      number:   'tíz'              { 10 }
            |   'tizenegy'         { 11 }
            |   /tizen(kettő|két)/ { 12 }
            |   'tizenhárom'       { 13 }
            |   'tizennégy'        { 14 }
            |   'tizenöt'          { 15 }
            |   'tizenhat'         { 16 }
            |   'tizenhét'         { 17 }
            |   'tizennyolc'       { 18 }
            |   'tizenkilenc'      { 19 }
            |   'nulla'            { 0 }
            |   'egy'              { 1 }
            |   /kett(ő|ö)/        { 2 }
            |   'két'              { 2 }
            |   'három'            { 3 }
            |   'négy'             { 4 }
            |   /öt/               { 5 }
            |   'hat'              { 6 }
            |   'hét'              { 7 }
            |   'nyolc'            { 8 }
            |   'kilenc'           { 9 }

      tens:     'húsz'             { 20 }
          |     /huszon/ number    { 20 + $item[2] }
          |     'harminc'          { 30 }
          |     'negyven'          { 40 }
          |     'ötven'            { 50 }
          |     'hatvan'           { 60 }
          |     'hetven'           { 70 }
          |     'nyolcvan'         { 80 }
          |     'kilencven'        { 90 }

      deca:     tens number        { $item[1] + $item[2] }
          |     tens
          |     number

      hecto:    number 'száz' deca { $item[1] * 100 + $item[3] }
           |    number 'száz'      { $item[1] * 100 }
           |    'száz' deca        { 100 + $item[2] }
           |    'száz'             { 100 }

      hOd:      hecto
         |      deca

      kilo:     hOd 'ezer' hOd     { $item[1] * 1000 + $item[3] }
          |     hOd 'ezer'         { $item[1] * 1000 }
          |     'ezer' hOd         { 1000 + $item[2] }
          |     'ezer'             { 1000 }

      kOhOd:    kilo
           |    hOd

      mega:     hOd /milli(ó|o)/ kOhOd { $item[1] * 1_000_000 + $item[3] }
          |     hOd /milli(ó|o)/       { $item[1] * 1_000_000 }
          |     /milli(ó|o)/ kOhOd     { 1_000_000 + $item[2] }
          |     /milli(ó|o)/           { 1_000_000 }
    });
}
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# }}} |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# {{{ ordinal2cardinal convert ordinal text to cardinal text |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Convert Hungarian ordinal text to the corresponding cardinal text,
# e.g. "harmadik" -> "három", "száztizenegyedik" -> "száztizenegy".
#
# Takes the ordinal as text.  Returns the cardinal spelling, or nothing
# when the argument is missing/undef or no ordinal ending is recognised.
#
# Resolution order:
#   1. exact match in the table of complete ordinals;
#   2. longest tail of the input that is a complete ordinal;
#   3. longest tail that is a compound-only ordinal ending;
#   4. plain removal of the -adik/-odik/-edik/-ödik (or bare -dik) suffix.
sub ordinal2cardinal :Export {
    my ($input) = @_;
    return if !defined $input;

    # Complete ordinals.  "első" and "második" are fully suppletive; several
    # others change their stem (harmadik -> három, negyedik -> négy,
    # hetedik -> hét, tizedik -> tíz, huszadik -> húsz).  The teens and the
    # twenties are listed whole because the number grammar treats the teens
    # as single words, so piecewise suffix stripping would give wrong forms.
    my %cardinal_for = (
        # 1-10
        'első'      => 'egy',    'második'    => 'kettő',
        'harmadik'  => 'három',  'negyedik'   => 'négy',
        'ötödik'    => 'öt',     'hatodik'    => 'hat',
        'hetedik'   => 'hét',    'nyolcadik'  => 'nyolc',
        'kilencedik' => 'kilenc', 'tizedik'   => 'tíz',

        # 11-19
        'tizenegyedik'    => 'tizenegy',
        'tizenkettedik'   => 'tizenkettő',
        'tizenharmadik'   => 'tizenhárom',
        'tizennegyedik'   => 'tizennégy',
        'tizenötödik'     => 'tizenöt',
        'tizenhatodik'    => 'tizenhat',
        'tizenhetedik'    => 'tizenhét',
        'tizennyolcadik'  => 'tizennyolc',
        'tizenkilencedik' => 'tizenkilenc',

        # 20-29
        'huszadik'         => 'húsz',
        'huszonegyedik'    => 'huszonegy',
        'huszonkettedik'   => 'huszonkettő',
        'huszonharmadik'   => 'huszonhárom',
        'huszonnegyedik'   => 'huszonnégy',
        'huszonötödik'     => 'huszonöt',
        'huszonhatodik'    => 'huszonhat',
        'huszonhetedik'    => 'huszonhét',
        'huszonnyolcadik'  => 'huszonnyolc',
        'huszonkilencedik' => 'huszonkilenc',

        # round tens 30-90, each with its "+1" form
        'harmincadik'   => 'harminc',   'harmincegyedik'   => 'harmincegy',
        'negyvenedik'   => 'negyven',   'negyvenegyedik'   => 'negyvenegy',
        'ötvenedik'     => 'ötven',     'ötvenegyedik'     => 'ötvenegy',
        'hatvanadik'    => 'hatvan',    'hatvanegyedik'    => 'hatvanegy',
        'hetvenedik'    => 'hetven',    'hetvenegyedik'    => 'hetvenegy',
        'nyolcvanadik'  => 'nyolcvan',  'nyolcvanegyedik'  => 'nyolcvanegy',
        'kilencvenedik' => 'kilencven', 'kilencvenegyedik' => 'kilencvenegy',

        # powers
        'századik'    => 'száz',
        'ezredik'     => 'ezer',
        'milliomodik' => 'millió',
    );

    return $cardinal_for{$input} if exists $cardinal_for{$input};

    # Endings tried only behind a non-empty stem, after the table above
    # found nothing.
    my %compound_cardinal_for = (
        'negyedik'   => 'négy',
        'egyedik'    => 'egy',
        'kettedik'   => 'kettő',
        'harmadik'   => 'három',
        'ötödik'     => 'öt',
        'hatodik'    => 'hat',
        'hetedik'    => 'hét',
        'nyolcadik'  => 'nyolc',
        'kilencedik' => 'kilenc',
        'ezredik'    => 'ezer',
        'századik'   => 'száz',
    );

    # Split the input into stem + tail, shortest stem first, so the longest
    # (most specific) tail known to the given table wins.  Yields the stem
    # joined to the tail's cardinal, or undef when no tail is known.
    my $last_cut  = length($input) - 1;
    my $swap_tail = sub {
        my ($table) = @_;
        for my $cut (1 .. $last_cut) {
            my $tail = substr $input, $cut;
            next if !exists $table->{$tail};
            return substr($input, 0, $cut) . $table->{$tail};
        }
        return;
    };

    for my $table (\%cardinal_for, \%compound_cardinal_for) {
        my $converted = $swap_tail->($table);
        return $converted if defined $converted;
    }

    # Last resort: drop the ordinal suffix itself, preferring the form with
    # a linking vowel (-adik/-odik/-edik/-ödik) over bare -dik.
    return $input if $input =~ s{[aoeö]?dik\z}{}xms;

    return;    # not an ordinal
}
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
5
|
|
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# }}} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
1; |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
__END__ |