File Coverage

blib/lib/Lingua/KOR/Word2Num.pm

Criterion	Covered	Total	%
statement	25	40	62.5
branch	1	14	7.1
condition	4	9	44.4
subroutine	9	10	90.0
pod	3	3	100.0
total	42	76	55.2

line	stmt	bran	cond	sub	pod	time	code
1							# For Emacs: -- mode:cperl; eval: (folding-mode 1); coding:utf-8; --
2
3							package Lingua::KOR::Word2Num;
4							# ABSTRACT: Word to number conversion in Korean
5
6	1			1		122519	use 5.16.0;
	1					5
7	1			1		7	use utf8;
	1					2
	1					16
8	1			1		37	use warnings;
	1					2
	1					108
9
10							# {{{ use block
11
12	1			1		738	use Export::Attrs;
	1					13375
	1					9
13	1			1		1407	use Parse::RecDescent;
	1					54456
	1					10
14
15							# }}}
16							# {{{ variable declarations
17							our $VERSION = '0.2603300';
18							my $parser = kor_numerals();
19
20							# }}}
21
22							# {{{ w2n convert text to number
23
24							sub w2n :Export {
25	2		100	2	1	203058	my $input = shift // return;
26
27							# zero
28	1	50	33			9	return 0 if ($input =~ m{\b(yeong)\b}xmsi \|\| $input =~ m{\A영\z}xms);
29
30	1					2	$input .= " "; # Grant space at the end
31
32	1		50			12	return $parser->numeral($input) \|\| undef;
33	1			1		247	}
	1					8
	1					12
34
35							# }}}
36							# {{{ kor_numerals create parser for numerals
37
38							sub kor_numerals {
39	1			1	1	7	return Parse::RecDescent->new(q{
40							numeral:
41							numeral: eok { return $item[1]; } # root parse. go from maximum to minimum value
42							\| man { return $item[1]; }
43							\| millenium2 { return $item[1]; }
44							\| millenium1 { return $item[1]; }
45							\| century { return $item[1]; }
46							\| decade { return $item[1]; }
47							\| { return undef; }
48
49							number: 'yeong ' { $return = 0; } # try to find a word from 0 to 9
50							\| 'il ' { $return = 1; }
51							\| 'sam ' { $return = 3; }
52							\| 'sa ' { $return = 4; }
53							\| 'o ' { $return = 5; }
54							\| 'yuk ' { $return = 6; }
55							\| 'chil ' { $return = 7; }
56							\| 'pal ' { $return = 8; }
57							\| 'gu ' { $return = 9; }
58							\| 'i ' { $return = 2; }
59							\| /영/ { $return = 0; }
60							\| /일/ { $return = 1; }
61							\| /이/ { $return = 2; }
62							\| /삼/ { $return = 3; }
63							\| /사/ { $return = 4; }
64							\| /오/ { $return = 5; }
65							\| /육/ { $return = 6; }
66							\| /칠/ { $return = 7; }
67							\| /팔/ { $return = 8; }
68							\| /구/ { $return = 9; }
69
70							tens: 'ilsip ' { $return = 10; } # try to find a word that represents
71							\| 'isip ' { $return = 20; } # values 10,20,..,90
72							\| 'samsip ' { $return = 30; }
73							\| 'sasip ' { $return = 40; }
74							\| 'osip ' { $return = 50; }
75							\| 'yuksip ' { $return = 60; }
76							\| 'chilsip ' { $return = 70; }
77							\| 'palsip ' { $return = 80; }
78							\| 'gusip ' { $return = 90; }
79							\| 'sip ' { $return = 10; }
80							\| /일십/ { $return = 10; }
81							\| /이십/ { $return = 20; }
82							\| /삼십/ { $return = 30; }
83							\| /사십/ { $return = 40; }
84							\| /오십/ { $return = 50; }
85							\| /육십/ { $return = 60; }
86							\| /칠십/ { $return = 70; }
87							\| /팔십/ { $return = 80; }
88							\| /구십/ { $return = 90; }
89							\| /십/ { $return = 10; }
90
91							hundreds: 'ilbaek ' { $return = 100; } # try to find a word that represents
92							\| 'ibaek ' { $return = 200; } # values 100,200,..,900
93							\| 'sambaek ' { $return = 300; }
94							\| 'sabaek ' { $return = 400; }
95							\| 'obaek ' { $return = 500; }
96							\| 'yukbaek ' { $return = 600; }
97							\| 'chilbaek ' { $return = 700; }
98							\| 'palbaek ' { $return = 800; }
99							\| 'gubaek ' { $return = 900; }
100							\| 'baek ' { $return = 100; }
101							\| /일백/ { $return = 100; }
102							\| /이백/ { $return = 200; }
103							\| /삼백/ { $return = 300; }
104							\| /사백/ { $return = 400; }
105							\| /오백/ { $return = 500; }
106							\| /육백/ { $return = 600; }
107							\| /칠백/ { $return = 700; }
108							\| /팔백/ { $return = 800; }
109							\| /구백/ { $return = 900; }
110							\| /백/ { $return = 100; }
111
112							thousands: 'ilcheon ' { $return = 1000; } # try to find a word that represents
113							\| 'icheon ' { $return = 2000; } # values 1000,2000,..,9000
114							\| 'samcheon ' { $return = 3000; }
115							\| 'sacheon ' { $return = 4000; }
116							\| 'ocheon ' { $return = 5000; }
117							\| 'yukcheon ' { $return = 6000; }
118							\| 'chilcheon ' { $return = 7000; }
119							\| 'palcheon ' { $return = 8000; }
120							\| 'gucheon ' { $return = 9000; }
121							\| 'cheon ' { $return = 1000; }
122							\| /일천/ { $return = 1000; }
123							\| /이천/ { $return = 2000; }
124							\| /삼천/ { $return = 3000; }
125							\| /사천/ { $return = 4000; }
126							\| /오천/ { $return = 5000; }
127							\| /육천/ { $return = 6000; }
128							\| /칠천/ { $return = 7000; }
129							\| /팔천/ { $return = 8000; }
130							\| /구천/ { $return = 9000; }
131							\| /천/ { $return = 1000; }
132
133							tenthousands: 'ilman ' { $return = 10000; } # try to find a word that represents
134							\| 'iman ' { $return = 20000; } # values 10000,20000,..,90000
135							\| 'samman ' { $return = 30000; }
136							\| 'saman ' { $return = 40000; }
137							\| 'oman ' { $return = 50000; }
138							\| 'yukman ' { $return = 60000; }
139							\| 'chilman ' { $return = 70000; }
140							\| 'palman ' { $return = 80000; }
141							\| 'guman ' { $return = 90000; }
142							\| 'man ' { $return = 10000; }
143							\| /일만/ { $return = 10000; }
144							\| /이만/ { $return = 20000; }
145							\| /삼만/ { $return = 30000; }
146							\| /사만/ { $return = 40000; }
147							\| /오만/ { $return = 50000; }
148							\| /육만/ { $return = 60000; }
149							\| /칠만/ { $return = 70000; }
150							\| /팔만/ { $return = 80000; }
151							\| /구만/ { $return = 90000; }
152							\| /만/ { $return = 10000; }
153
154							decade: tens(?) number(?) number(?) # try to find words that represents values
155							{ $return = 0; # from 0 to 99
156							for (@item) {
157							$return += $$_[0] if (ref $_ && defined $$_[0]);
158							}
159							}
160
161							century: hundreds(?) decade(?) # try to find words that represents values
162							{ $return = 0; # from 100 to 999
163							for (@item) {
164							$return += $$_[0] if (ref $_ && defined $$_[0]);
165							}
166							}
167
168							millenium1: thousands(1) century(?) # try to find words that represents values
169							{ $return = 0; # from 1.000 to 9.999
170							for (@item) {
171							if (ref $_ && defined $$_[0]) {
172							$return += $$_[0] if (ref $_ && defined $$_[0]);
173							}
174							}
175							}
176
177							millenium2: tenthousands(1) thousands(?) century(?) decade(?) # try to find words that represents values
178							{ $return = 0; # from 10.000 to 99.999
179							for (@item) {
180							if (ref $_ && defined $$_[0]) {
181							$return += $$_[0] if (ref $_ && defined $$_[0]);
182							}
183							}
184							}
185
186							man: millenium1(?) century(?) decade(?) # N만K = N * 10_000 + K where N is 1..9999
187							manmark # handles values from 10_000 to 99_999_999
188							millenium1(?) century(?) decade(?)
189							{ $return = 0;
190							for (@item) {
191							if (ref $_ && defined $$_[0]) {
192							$return += $$_[0];
193							} elsif (defined $_ && $_ eq "Man") {
194							$return = ($return>0) ? $return * 10000 : 10000;
195							}
196							}
197							}
198
199							manmark: 'man ' { $return = "Man"; }
200							\| /만/ { $return = "Man"; }
201
202							eok: millenium1(?) century(?) decade(?) # try to find words that represents values
203							eokmark # from 100.000.000 to 999.999.999.999
204							man(?) millenium1(?) century(?) decade(?)
205							{ $return = 0;
206							for (@item) {
207							if (ref $_ && defined $$_[0]) {
208							$return += $$_[0];
209							} elsif (defined $_ && $_ eq "Eok") {
210							$return = ($return>0) ? $return * 100000000 : 100000000;
211							}
212							}
213							}
214
215							eokmark: 'eok ' { $return = "Eok"; }
216							\| /억/ { $return = "Eok"; }
217							});
218							}
219
220							# }}}
221							# {{{ ordinal2cardinal convert ordinal text to cardinal text
222
223							sub ordinal2cardinal :Export {
224	0		0	0	1		my $input = shift // return;
225
226							# Korean ordinals append " 번째" to the number form.
227							# Special: "첫 번째" (1st) → Sino-Korean "일"
228	0	0					return '일' if $input eq '첫 번째';
229
230							# Must end with " 번째" to be an ordinal ([ ] needed because /x ignores bare spaces)
231	0	0					$input =~ s{[ ]번째\z}{}xms or return;
232
233							# Native Korean ones → Sino-Korean cardinal
234	0						state $native_ones = {
235							'한' => '일', '두' => '이', '세' => '삼',
236							'네' => '사', '다섯' => '오', '여섯' => '육',
237							'일곱' => '칠', '여덟' => '팔', '아홉' => '구',
238							};
239
240							# Native Korean tens → Sino-Korean cardinal
241	0						state $native_tens = {
242							'열' => '십', '스물' => '이십', '서른' => '삼십',
243							'마흔' => '사십', '쉰' => '오십', '예순' => '육십',
244							'일흔' => '칠십', '여든' => '팔십', '아흔' => '구십',
245							};
246
247							# Pure native ones (1-9)
248	0	0					return $native_ones->{$input} if exists $native_ones->{$input};
249
250							# Pure native tens (10,20,...,90)
251	0	0					return $native_tens->{$input} if exists $native_tens->{$input};
252
253							# Native compound: tens + ones (e.g. "스물세" → "이십삼")
254							# Try each tens prefix (longest first to avoid ambiguity)
255	0						for my $ten (sort { length $b <=> length $a } keys %{$native_tens}) {
	0
	0
256	0	0					if ($input =~ m{\A\Q$ten\E(.+)\z}xms) {
257	0						my $ones_part = $1;
258	0	0					if (exists $native_ones->{$ones_part}) {
259	0						return $native_tens->{$ten} . $native_ones->{$ones_part};
260							}
261							}
262							}
263
264							# For 100+, Sino-Korean cardinal is used directly — return as-is
265	0						return $input;
266	1			1		1042	}
	1					3
	1					6
267
268							# }}}
269
270							1;
271
272							__END__