line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Lingua::EN::Numericalize - Replaces English descriptions of numbers with numerals |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=cut |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
package Lingua::EN::Numericalize;

# Exporter provides import(); str2nbr is the only symbol exported by default.
require Exporter;
our @ISA    = ('Exporter');
our @EXPORT = ('&str2nbr');

# Version string: whatever follows "Revision: " in the RCS keyword.
our $VERSION = substr(q$Revision: 1.52 $, 10);

# The output record separator mirrors the input record separator for the
# rest of the enclosing scope only (it is set with local).
local $\ = $/;

our $debug = 0;    # true: seq2int() prints the sequence it is reducing
our $UK    = 0;    # true: UK meaning of "billion" and the larger -illions
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
use Lingua::EN::Numericalize; |
23
|
|
|
|
|
|
|
print str2nbr("one thousand maniacs"); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$_ = "six hundred three-score and six"; |
26
|
|
|
|
|
|
|
str2nbr(); |
27
|
|
|
|
|
|
|
print; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$Lingua::EN::Numericalize::UK = 1; |
30
|
|
|
|
|
|
|
print str2nbr("one billion"); # 1,000,000,000,000 |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
This module replaces English descriptions of numbers in a given string with their numeric counterparts. It supports both ordinal and cardinal numbers, negative numbers, and very large numbers. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
The module exports a single function into the caller's namespace as follows: |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=over |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item B<str2nbr> |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
This function receives an optional string (using $_ if none is passed) and converts all English text that describes a number into its numeric equivalent. When called in a void context, the function sets $_ to the new value. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=back |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
The module's behaviour is affected by the following variables: |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=over |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub str2nbr {
    # Replaces English descriptions of numbers in a string with numerals.
    #
    # Arguments: an optional string; $_ is used when none is passed.
    # Returns:   the lower-cased string with number words replaced.
    #            In void context the result is also stored in $_; in
    #            scalar or list context the caller's $_ is left untouched.
    my $s = @_ ? shift : $_;
    $s = lc(defined $s ? $s : "");

    # String-level clean-up. Under /ee the %strrep value is fetched and then
    # evaluated as Perl, which is how a value such as "$1$2" picks up the
    # captures of the current match.
    for my $pattern (keys %strrep) {
        $s =~ s/$pattern/$strrep{$pattern}/eeg;
    }

    # Tokenise on word boundaries. $_ is deliberately the loop variable:
    # word2num() and compound() fall back to $_ when handed a false value.
    my @ret;
    for (split /\b/, $s) {
        if (/^\d+$/ || /[^a-zA-Z0-9]/) {
            push @ret, $_;    # numerals, punctuation and whitespace pass through
        }
        else {
            push @ret, word2num($_);
        }
    }

    # Generate number sequences: a number starts a group (an array ref) and
    # absorbs the next number, together with the conjunctions in between.
    # Conjunctions are only dropped once a number is known to follow them,
    # so "one of the best" keeps its "of".
    my $i = 0;
    while ($i < $#ret) {
        $ret[$i] = [ $ret[$i] ] if isnbr($ret[$i]);
        if (ref $ret[$i]) {
            my $j = $i + 1;
            $j++ while $j <= $#ret
                && !isnbr($ret[$j])
                && isconj($ret[$j]);
            if ($j <= $#ret && isnbr($ret[$j])) {
                push @{ $ret[$i] }, $ret[$j];
                splice(@ret, $i + 1, $j - $i);
                next;    # stay on the group: it may absorb more
            }
        }
        $i++;
    }

    # Calculate sequences. A group holding a single number is emitted
    # verbatim, so a lone numeral keeps its leading zeros and is not pushed
    # through floating-point arithmetic.
    for my $group (@ret) {
        next unless ref $group;
        $group = @$group > 1 ? seq2int(@$group) : $group->[0];
    }

    my $result = join "", @ret;
    $_ = $result unless defined wantarray;    # void context only
    return $result;
}
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item B<$Lingua::EN::Numericalize::UK> |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
This variable may be set to indicate that the UK meaning of C<billion> should be used. By default, this module uses the American meaning of this word :( Please note that all the related larger numbers, e.g. trillion, quadrillion, etc., assume the chosen behaviour as well. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=item B<$Lingua::EN::Numericalize::debug> |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
If set to true, the module outputs on standard error messages useful for debugging. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=back |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=cut |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub isnbr {
    # Returns 1 when every argument is a decimal numeral: optional sign,
    # digits with an optional fraction, optional exponent. Returns nothing
    # (empty list / undef) as soon as one argument fails the test. With no
    # arguments the result is 1.
    foreach my $candidate (@_) {
        return
            unless $candidate =~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/;
    }
    return 1;
}
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub isconj {
    # Returns 1 when the token is one of the separators listed in @conj,
    # and nothing (empty list / undef) otherwise.
    #
    # Argument: the token to test; $_ is used only when no argument at all
    # is passed. An undefined or empty token is never a conjunction, so the
    # answer no longer depends on whatever the caller's $_ happens to hold.
    my $w = @_ ? shift : $_;
    return unless defined $w && length $w;

    # A token made of word characters must match an entry as a whole, so
    # that "sandwich" or "office" are not mistaken for "and" / "of".
    # Punctuation and whitespace tokens (", ", " - ") keep the substring
    # match, since they may combine several separators.
    my $is_word = $w =~ /^\w+$/;
    for my $pattern (@conj) {
        if ($is_word) {
            return 1 if $w =~ /^(?:$pattern)$/;
        }
        else {
            return 1 if $w =~ /$pattern/;
        }
    }
    return;
}
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# splits "fourtytwo", "onehundred", etc. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub compound {
    # Splits a run-together number word ("fourtytwo", "onehundred") into
    # its parts by repeatedly stripping known number names off the end.
    #
    # Argument: the word; $_ is used when the argument is false.
    # Returns:  the parts in reading order. Stripped names are returned as
    #           their %word2num values; whatever could not be stripped comes
    #           first, as text. Returns nothing when there is no word.
    my $w = shift || $_ || return;

    # expand numeric abbreviations: "5k" -> "5000"
    for my $abbr (keys %abb) {
        $w =~ s/(\d)\Q$abbr\E$/$1$abb{$abbr}/;
    }

    # Longest names first, so "eleventyone" wins over "one" and "undec" over
    # "dec" regardless of hash order; ties are broken alphabetically to keep
    # the result reproducible.
    my @names = sort { length($b) <=> length($a) || $a cmp $b } keys %word2num;

    # Strip one name at a time and rescan the whole list after each hit.
    my @found;
    STRIP: while ($w) {
        for my $name (@names) {
            next unless $w =~ s/\Q$name\E$//;
            push @found, $word2num{$name};
            next STRIP;
        }
        last STRIP;    # a full pass without a match: nothing more to strip
    }

    push @found, $w if $w;    # unrecognised remainder
    return reverse @found;    # parts were collected back to front
}
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub word2num {
    # Converts one word token into the number(s) it spells.
    #
    # Argument: the token; $_ is used when the argument is false.
    # Returns:  the list of numbers the token stands for ("twentyone"
    #           yields 20 and 1), or the token unchanged when it is not,
    #           as a whole, a number. Returns nothing when there is no token.
    require Scalar::Util;

    my $orig = shift || $_ || return;
    my $w    = $orig;

    # The %tokrep rules interact, so they are applied in a fixed order
    # rather than in hash order: "tie" -> "ty" first ("forties"), then the
    # plural ending, then the ordinal ending ("sixths"). Rules not ranked
    # here run last, alphabetically.
    my %rank  = ('tie' => 1, '(s?e)?s$' => 2, 'th$' => 3);
    my @rules = sort {
        ($rank{$a} || 4) <=> ($rank{$b} || 4) || $a cmp $b
    } keys %tokrep;
    for my $rule (@rules) {
        $w =~ s/$rule/$tokrep{$rule}/g;
    }

    # Nothing usable left after stripping endings: not a number word.
    return $orig unless $w;

    my @ret;
    for my $part (compound($w)) {
        # "<stem><suffix>" with a known, non-zero stem, e.g. "thir" + "teen"
        for my $name (keys %suffix) {
            my ($stem) = $part =~ /(.*)$name$/;
            $stem = "" unless defined $stem;
            next unless $word2num{$stem};
            $part = $suffix{$name}->($word2num{$stem});
            last;
        }
        push @ret, $part;
    }

    # Any part that did not become a number means the token was ordinary
    # text ("with", "maniacs", "someone"); hand it back untouched instead of
    # returning the half-processed pieces.
    return $orig
        if !@ret || grep { !Scalar::Util::looks_like_number($_) } @ret;

    return @ret;
}
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub seq2int {
    # Reduces a sequence of number terms, in reading order, to one value:
    # (6, 100, 3, 20, 6) -> 666.
    #
    # The largest term acts as a multiplier for everything to its left and
    # the remainder to its right is added. When the largest term comes
    # first, or no term is positive, the terms are simply summed. An empty
    # sequence yields 0.
    my @seq = @_;

    # debugging output goes to standard error, as the documentation states
    print STDERR "seq2int(): ", join("-", @seq), "\n" if $debug;

    # position of the first occurrence of the largest positive term
    my ($pivot, $max) = (0, 0);
    for my $idx (0 .. $#seq) {
        ($pivot, $max) = ($idx, $seq[$idx]) if $seq[$idx] > $max;
    }

    if ($pivot == 0) {
        my $sum = 0;
        $sum += $_ for @seq;
        return $sum;
    }

    return $seq[$pivot] * seq2int(@seq[0 .. $pivot - 1])
         + seq2int(@seq[$pivot + 1 .. $#seq]);
}
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Tokens that may separate the parts of one spelled-out number. Each entry
# is used as a regular expression.
our @conj = (
    'and',
    'of',
    '\s+',
    '-',
    ',',
);

# Numeric abbreviations ("5k"): suffix letter => zeros appended to the digits.
# The 'b' entry is computed once, when this statement runs; changing $UK
# afterwards does not alter it.
our %abb = (
    'k' => '0' x 3,
    'm' => '0' x 6,
    'b' => '0' x ($UK ? 12 : 9),
);

# Rewrites applied to the whole input string: pattern => replacement. The
# values are Perl source text (quoted strings), because str2nbr() evaluates
# them with s///ee.
our %strrep = (
    'milion'                  => q{"million"},    # common misspelling
    '(\d)\s*,\s*(\d)'         => q{"$1$2"},       # commas inside numbers are removed
    q{baker('?s)?(\s+)?dozen} => q{"baker"},      # colloquialism
    '(\d)(st|nd|rd|th)'       => q{"$1"},         # ordinal ending on a numeral
);

# Rewrites applied to a single word token: pattern => replacement.
our %tokrep = (
    'th$'      => "",      # ordinal ending (leaves the cardinal)
    '(s?e)?s$' => "",      # plural ending
    'tie'      => "ty",    # four[tie]th
);
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Number-word suffixes: suffix => handler. Each handler receives the numeric
# value of the stem ("four" in "fourteen", "tr" in "trillion") and returns
# the value of the whole word.
our %suffix = (
    teen    => sub { 10 + $_[0] },
    ty      => sub { 10 * $_[0] },

    # long-scale "-illiard": milliard = 10^9, billiard = 10^15, ...
    illiard => sub { 10 ** (9 + 6 * ($_[0] - 1)) },

    # "-illion": million is 10^6 in both systems. Above that the default
    # (American) meaning is 10^(3k+3), so billion = 10^9; with $UK set the
    # meaning is 10^(6k), so billion = 10^12 and trillion = 10^18.
    illion  => sub {
        my $k = shift;
        return 10 ** ($UK ? 6 * $k : 3 * $k + 3);
    },
);
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# Latin stems, as found in front of "-illion" / "-illiard".
our %latin = (
    'un'          => 1,
    'duo'         => 2,
    'tre'         => 3,
    'tr'          => 3,
    'quattuor'    => 4,
    'quadr'       => 4,
    'quin'        => 5,
    'quint'       => 5,
    'sex'         => 6,
    'sext'        => 6,
    'septen'      => 7,
    'sept'        => 7,
    'octo'        => 8,
    'oct'         => 8,
    'novem'       => 9,
    'non'         => 9,
    'dec'         => 10,
    'undec'       => 11,
    'duodec'      => 12,
    'tredec'      => 13,
    'quattuordec' => 14,
    'quindec'     => 15,
    'hex'         => 16,
    'vigint'      => 20,
    'vig'         => 20,
    'trig'        => 30,
    'cent'        => 100,
);

# English number names, together with the stems that remain once a suffix
# such as "-teen", "-ty" or "-illion" has been taken off.
our %word2num = (
    'naught'        => 0,
    'first'         => 1,
    'second'        => 2,
    'third'         => 3,
    'zero'          => 0,
    'one'           => 1,
    'two'           => 2,
    'three'         => 3,
    'thir'          => 3,
    'four'          => 4,
    'for'           => 4,
    'five'          => 5,
    'fif'           => 5,
    'six'           => 6,
    'seven'         => 7,
    'eight'         => 8,
    'eigh'          => 8,
    'nine'          => 9,
    'nin'           => 9,
    'ten'           => 10,
    'eleven'        => 11,
    'twelve'        => 12,
    'twelf'         => 12,
    'twen'          => 2,
    'hundred'       => 100,
    'thousand'      => 1000,

    'm'             => 1,    # million/milliard
    'b'             => 2,    # billion

    'googol'        => 10 ** 100,
    'googolplex'    => 10 ** (10 ** 100),
    'score'         => 20,
    'gros'          => 12 * 12,    # gross
    'dozen'         => 12,
    'baker'         => 13,
    'eleventyone'   => 111,
    'eleventyfirst' => 111,
);

# Fold the Latin stems into the main table; a Latin entry replaces an
# English entry of the same name.
@word2num{ keys %latin } = values %latin;
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
1; |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
__END__ |