line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Lingua::EN::Words2Nums - convert English text to numbers |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=cut |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
package Lingua::EN::Words2Nums; |
10
|
1
|
|
|
1
|
|
172616
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
50
|
|
11
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
1956
|
|
12
|
|
|
|
|
|
|
require Exporter; |
13
|
|
|
|
|
|
|
our @ISA=qw(Exporter); |
14
|
|
|
|
|
|
|
our @EXPORT=qw(&words2nums); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 SYNOPSIS |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
use Lingua::EN::Words2Nums; |
19
|
|
|
|
|
|
|
$num=words2nums("two thousand and one"); |
20
|
|
|
|
|
|
|
$num=words2nums("twenty-second"); |
21
|
|
|
|
|
|
|
$num=words2nums("15 billion, 6 million, and ninteen"); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
This module converts English text into numbers. It supports both ordinal and |
26
|
|
|
|
|
|
|
cardinal numbers, negative numbers, and very large numbers. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
The main subroutine, which is exported by default, is words2nums(). This |
29
|
|
|
|
|
|
|
subroutine, when fed a string, will attempt to convert it into a number. |
30
|
|
|
|
|
|
|
If it succeeds, the number will be returned. If it fails, it returns undef. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 VARIABLES |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
There are a number of variables that can be used to tweak the behavior of this |
35
|
|
|
|
|
|
|
module. For example, debugging can be be enabled by setting |
36
|
|
|
|
|
|
|
$Lingua::EN::Words2Nums::debug=1 |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=over 4 |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=cut |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Public global variables. |
43
|
|
|
|
|
|
|
our $debug = 0; |
44
|
|
|
|
|
|
|
our $billion = 10 ** 9; |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=item $Lingua::EN::Words2Nums::debug |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Default: 0. If set to a true value, outputs on standard error some useful |
49
|
|
|
|
|
|
|
messages if parsing fails for some reason. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=item $Lingua::EN::Words2Nums::billion |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Default: 10 ** 9. This is the number that will be returned for "one billion". |
54
|
|
|
|
|
|
|
It defaults to the American version; the English will want to set it to |
55
|
|
|
|
|
|
|
10 ** 12. Setting this number automatically changes all the larger numbers |
56
|
|
|
|
|
|
|
(trillion, quadrillion, etc) to match. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=back |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head1 NOTES |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
It does not understand decimals or fractions, yet. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Scores are supported, eg: "four score and ten". So are dozens. So is a baker's |
65
|
|
|
|
|
|
|
dozen. And a gross. |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Various mispellings of numbers are understood. |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
While it handles googol correctly, googolplex is too large to fit in perl's |
70
|
|
|
|
|
|
|
standard scalar type, and "inf" will be returned. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=cut |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
our %nametosub = ( |
75
|
|
|
|
|
|
|
naught => [ \&num, 0 ], # Cardinal numbers, leaving out the a |
76
|
|
|
|
|
|
|
nought => [ \&num, 0 ], |
77
|
|
|
|
|
|
|
zero => [ \&num, 0 ], # ones that just add "th". |
78
|
|
|
|
|
|
|
one => [ \&num, 1 ], first => [ \&num, 1 ], |
79
|
|
|
|
|
|
|
two => [ \&num, 2 ], second => [ \&num, 2 ], |
80
|
|
|
|
|
|
|
three => [ \&num, 3 ], third => [ \&num, 3 ], |
81
|
|
|
|
|
|
|
four => [ \&num, 4 ], fourth => [ \&num, 4 ], |
82
|
|
|
|
|
|
|
five => [ \&num, 5 ], fifth => [ \&num, 5 ], |
83
|
|
|
|
|
|
|
six => [ \&num, 6 ], |
84
|
|
|
|
|
|
|
seven => [ \&num, 7 ], seven => [ \&num, 7 ], |
85
|
|
|
|
|
|
|
eight => [ \&num, 8 ], eighth => [ \&num, 8 ], |
86
|
|
|
|
|
|
|
nine => [ \&num, 9 ], ninth => [ \&num, 9 ], |
87
|
|
|
|
|
|
|
ten => [ \&num, 10 ], |
88
|
|
|
|
|
|
|
eleven => [ \&num, 11 ], |
89
|
|
|
|
|
|
|
twelve => [ \&num, 12 ], twelfth => [ \&num, 12 ], |
90
|
|
|
|
|
|
|
thirteen => [ \&num, 13 ], |
91
|
|
|
|
|
|
|
fifteen => [ \&num, 15 ], |
92
|
|
|
|
|
|
|
eighteen => [ \&num, 18 ], |
93
|
|
|
|
|
|
|
ninteen => [ \&num, 19 ], # common(?) mispelling |
94
|
|
|
|
|
|
|
teen => [ \&suffix, 10 ], # takes care of the regular teens |
95
|
|
|
|
|
|
|
twenty => [ \&num, 20 ], twentieth => [ \&num, 20 ], |
96
|
|
|
|
|
|
|
thirty => [ \&num, 30 ], thirtieth => [ \&num, 30 ], |
97
|
|
|
|
|
|
|
forty => [ \&num, 40 ], fortieth => [ \&num, 40 ], |
98
|
|
|
|
|
|
|
fourty => [ \&num, 40 ], fourtieth => [ \&num, 40 ], # at least I mispell it like this |
99
|
|
|
|
|
|
|
fifty => [ \&num, 50 ], fiftieth => [ \&num, 50 ], |
100
|
|
|
|
|
|
|
sixty => [ \&num, 60 ], sixtieth => [ \&num, 60 ], |
101
|
|
|
|
|
|
|
seventy => [ \&num, 70 ], seventieth => [ \&num, 70 ], |
102
|
|
|
|
|
|
|
eighty => [ \&num, 80 ], eightieth => [ \&num, 80 ], |
103
|
|
|
|
|
|
|
ninety => [ \&num, 90 ], ninetieth => [ \&num, 90 ], |
104
|
|
|
|
|
|
|
ninty => [ \&num, 90 ], # common mispelling |
105
|
|
|
|
|
|
|
hundred => [ \&prefix, 100 ], |
106
|
|
|
|
|
|
|
thousand => [ \&prefix, 1000 ], |
107
|
|
|
|
|
|
|
million => [ \&prefix, 10 ** 6 ], |
108
|
|
|
|
|
|
|
milion => [ \&prefix, 10 ** 6 ], # common(?) mispelling |
109
|
|
|
|
|
|
|
milliard => [ \&prefix, 10 ** 9 ], |
110
|
|
|
|
|
|
|
billion => [ \&powprefix, 2 ], # These vary depending on country. |
111
|
|
|
|
|
|
|
billiard => [ \&prefix, 10 ** 15 ], |
112
|
|
|
|
|
|
|
trillion => [ \&powprefix, 3 ], |
113
|
|
|
|
|
|
|
trilliard => [ \&prefix, 10 ** 21 ], |
114
|
|
|
|
|
|
|
quadrillion => [ \&powprefix, 4 ], |
115
|
|
|
|
|
|
|
quadrilliard => [ \&prefix, 10 ** 27 ], |
116
|
|
|
|
|
|
|
quintillion => [ \&powprefix, 5 ], |
117
|
|
|
|
|
|
|
quintilliard => [ \&prefix, 10 ** 33 ], |
118
|
|
|
|
|
|
|
sextillion => [ \&powprefix, 6 ], |
119
|
|
|
|
|
|
|
sextilliard => [ \&prefix, 10 ** 39 ], |
120
|
|
|
|
|
|
|
septillion => [ \&powprefix, 7 ], |
121
|
|
|
|
|
|
|
septilliard => [ \&prefix, 10 ** 45 ], |
122
|
|
|
|
|
|
|
octillion => [ \&powprefix, 8 ], |
123
|
|
|
|
|
|
|
octilliard => [ \&prefix, 10 ** 51 ], |
124
|
|
|
|
|
|
|
nonillion => [ \&powprefix, 9 ], |
125
|
|
|
|
|
|
|
nonilliard => [ \&prefix, 10 ** 57 ], |
126
|
|
|
|
|
|
|
decillion => [ \&powprefix, 10 ], |
127
|
|
|
|
|
|
|
decilliard => [ \&prefix, 10 ** 63 ], |
128
|
|
|
|
|
|
|
undecillion => [ \&powprefix, 11 ], |
129
|
|
|
|
|
|
|
undecilliard => [ \&prefix, 10 ** 69 ], |
130
|
|
|
|
|
|
|
duodecillion => [ \&powprefix, 12 ], |
131
|
|
|
|
|
|
|
duodecilliard => [ \&prefix, 10 ** 75 ], |
132
|
|
|
|
|
|
|
tredecillion => [ \&powprefix, 13 ], |
133
|
|
|
|
|
|
|
tredecilliard => [ \&prefix, 10 ** 81 ], |
134
|
|
|
|
|
|
|
quattuordecillion => [ \&powprefix, 14 ], |
135
|
|
|
|
|
|
|
quattuordecilliard => [ \&prefix, 10 ** 87 ], |
136
|
|
|
|
|
|
|
quindecillion => [ \&powprefix, 15 ], |
137
|
|
|
|
|
|
|
quindecilliard => [ \&prefix, 10 ** 93 ], |
138
|
|
|
|
|
|
|
sexdecillion => [ \&powprefix, 16 ], |
139
|
|
|
|
|
|
|
septendecillion => [ \&powprefix, 17 ], |
140
|
|
|
|
|
|
|
octodecillion => [ \&powprefix, 18 ], |
141
|
|
|
|
|
|
|
novemdecillion => [ \&powprefix, 19 ], |
142
|
|
|
|
|
|
|
vigintillion => [ \&powprefix, 20 ], |
143
|
|
|
|
|
|
|
unvigintillion => [ \&powprefix, 21 ], |
144
|
|
|
|
|
|
|
duovigintillion => [ \&powprefix, 22 ], |
145
|
|
|
|
|
|
|
duvigintillion => [ \&powprefix, 22 ], # some use this spelling |
146
|
|
|
|
|
|
|
trevigintillion => [ \&powprefix, 23 ], |
147
|
|
|
|
|
|
|
quattuorvigintillion => [ \&powprefix, 24 ], |
148
|
|
|
|
|
|
|
quinvigintillion => [ \&powprefix, 25 ], |
149
|
|
|
|
|
|
|
sexvigintillion => [ \&powprefix, 26 ], |
150
|
|
|
|
|
|
|
septenvigintillion => [ \&powprefix, 27 ], |
151
|
|
|
|
|
|
|
octovigintillion => [ \&powprefix, 28 ], |
152
|
|
|
|
|
|
|
novemvigintillion => [ \&powprefix, 29 ], |
153
|
|
|
|
|
|
|
trigintillion => [ \&powprefix, 30 ], |
154
|
|
|
|
|
|
|
# This process can be continued indefinitely, but one has to stop |
155
|
|
|
|
|
|
|
# somewhere. -- A Dictionary of Units of Measurement |
156
|
|
|
|
|
|
|
centillion => [ \&powprefix, 100 ], |
157
|
|
|
|
|
|
|
googol => [ \&googol ], |
158
|
|
|
|
|
|
|
googolplex => [ \&googolplex ], |
159
|
|
|
|
|
|
|
negative => [ \&invert ], |
160
|
|
|
|
|
|
|
minus => [ \&invert ], |
161
|
|
|
|
|
|
|
score => [ \&prefix, 20 ], |
162
|
|
|
|
|
|
|
gross => [ \&prefix, 12 * 12 ], |
163
|
|
|
|
|
|
|
dozen => [ \&prefix, 12 ], |
164
|
|
|
|
|
|
|
bakersdozen => [ \&prefix, 13 ], |
165
|
|
|
|
|
|
|
bakerdozen => [ \&prefix, 13 ], |
166
|
|
|
|
|
|
|
eleventyone => [ \&num, 111 ], # This nprogram written on the day |
167
|
|
|
|
|
|
|
eleventyfirst =>[ \&num, 111 ], # FOTR released. |
168
|
|
|
|
|
|
|
s => [ sub {} ], # ignore 's', at the end of a word, |
169
|
|
|
|
|
|
|
# easy pluralization of dozens, etc. |
170
|
|
|
|
|
|
|
es => [ sub {} ], # same for 'es'; for googolplexes, etc. |
171
|
|
|
|
|
|
|
th => [ sub {} ], # ignore 'th', for cardinal nums |
172
|
|
|
|
|
|
|
); |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Note the ordering, so that eg, ninety has a chance to match before nine. |
175
|
|
|
|
|
|
|
my $numregexp = join("|", reverse sort keys %nametosub); |
176
|
|
|
|
|
|
|
$numregexp=qr/($numregexp)/; |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
my ($total, $mult, $oldpre, $newmult, $suffix, $val); |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub num ($) { |
181
|
100
|
|
|
100
|
0
|
172
|
$val = shift; |
182
|
100
|
100
|
|
|
|
181
|
if ($suffix) { |
183
|
1
|
|
|
|
|
2
|
$val += $suffix; |
184
|
1
|
|
|
|
|
3
|
$suffix = 0; |
185
|
|
|
|
|
|
|
} |
186
|
100
|
|
|
|
|
126
|
$total += $val * $mult; |
187
|
100
|
|
|
|
|
2896
|
$newmult = 0; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
sub prefix ($) { |
191
|
58
|
|
|
58
|
0
|
76
|
my $pre = shift; |
192
|
58
|
100
|
|
|
|
113
|
if ($pre > $oldpre) { # end of a prefix chain |
193
|
50
|
100
|
|
|
|
86
|
$total += $mult if $newmult; # special case for lone "thousand", etc. |
194
|
50
|
|
|
|
|
58
|
$mult = 1; |
195
|
|
|
|
|
|
|
} |
196
|
58
|
|
|
|
|
71
|
$mult *= $pre; |
197
|
58
|
|
|
|
|
56
|
$oldpre = $pre; |
198
|
58
|
|
|
|
|
600
|
$newmult = 1; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
sub powprefix { |
202
|
3
|
|
|
3
|
0
|
9
|
my $power = shift; |
203
|
3
|
50
|
|
|
|
9
|
if ($billion == 10 ** 9) { # EN |
|
|
0
|
|
|
|
|
|
204
|
3
|
|
|
|
|
13
|
prefix(10 ** (($power + 1) * 3)); |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
elsif ($billion == 10 ** 12) { # GB |
207
|
0
|
|
|
|
|
0
|
prefix(10 ** ($power * 6)); |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
else { |
210
|
0
|
|
|
|
|
0
|
failure("\$billion is set to odd value: $billion"); |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
sub suffix ($) { |
216
|
1
|
|
|
1
|
0
|
13
|
$suffix = shift; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
sub invert () { |
220
|
0
|
|
|
0
|
0
|
0
|
$total *= -1; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
sub googol () { |
224
|
0
|
|
|
0
|
0
|
0
|
prefix(10 ** 100); |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub googolplex () { |
228
|
0
|
|
|
0
|
0
|
0
|
prefix(10 ** (10 ** 100)); |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
sub failure ($) { |
232
|
6
|
50
|
|
6
|
0
|
15
|
print STDERR shift()."\n" if $debug; |
233
|
6
|
|
|
|
|
24
|
return; # undef on failure |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub words2nums ($) { |
237
|
66
|
|
|
66
|
0
|
7461
|
local $_=lc(shift); |
238
|
66
|
|
|
|
|
104
|
chomp $_; |
239
|
|
|
|
|
|
|
|
240
|
66
|
|
|
|
|
112
|
s/,//; # ignore comma, even if it's in a plain number |
241
|
66
|
100
|
|
|
|
268
|
return $_ if /^[-+]?[.0-9\s]+$/; # short circuit for plain number |
242
|
|
|
|
|
|
|
|
243
|
60
|
50
|
|
|
|
311
|
if (/^[-+0-9.]+$/) { |
244
|
0
|
0
|
|
|
|
0
|
return failure("+ or - not at beginning") if length $_; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
60
|
|
|
|
|
193
|
s/\b(and|a|of)\b//g; # ignore some common words |
248
|
60
|
|
|
|
|
243
|
s/[^A-Za-z0-9.]//g; # ignore spaces and punctuation, except period. |
249
|
60
|
100
|
|
|
|
135
|
return failure("not a number") unless length $_; |
250
|
|
|
|
|
|
|
|
251
|
57
|
|
|
|
|
235
|
$total=$oldpre=$suffix=$newmult=0; |
252
|
57
|
|
|
|
|
56
|
$mult=1; |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# Work backwards up the string. |
255
|
57
|
|
|
|
|
102
|
while (length $_) { |
256
|
60
|
|
|
|
|
2116
|
$nametosub{$1}[0]->($nametosub{$1}[1]) while s/$numregexp$//; |
257
|
60
|
100
|
|
|
|
173
|
if (length $_) { |
258
|
14
|
100
|
|
|
|
68
|
if (s/(\d+)(?:st|nd|rd|th)?$//) { |
259
|
11
|
|
|
|
|
19
|
num($1); |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
else { |
262
|
3
|
|
|
|
|
6
|
last; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
} |
266
|
57
|
100
|
|
|
|
99
|
return failure("error at $_") if length $_; |
267
|
54
|
100
|
|
|
|
101
|
$total += $mult if $newmult; # special case for lone "thousand", etc. |
268
|
54
|
|
|
|
|
296
|
return $total; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=head1 AUTHOR |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Copyright 2001-2003 Joey Hess |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or |
276
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=cut |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
1 |