File Coverage

blib/lib/Lingua/EN/Words2Nums.pm
Criterion Covered Total %
statement 44 50 88.0
branch 21 28 75.0
condition n/a
subroutine 8 11 72.7
pod 0 9 0.0
total 73 98 74.4


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2              
3             =head1 NAME
4              
5             Lingua::EN::Words2Nums - convert English text to numbers
6              
7             =cut
8              
9             package Lingua::EN::Words2Nums;
10 1     1   172616 use warnings;
  1         3  
  1         50  
11 1     1   7 use strict;
  1         1  
  1         1956  
12             require Exporter;
13             our @ISA=qw(Exporter);
14             our @EXPORT=qw(&words2nums);
15              
16             =head1 SYNOPSIS
17              
18             use Lingua::EN::Words2Nums;
19             $num=words2nums("two thousand and one");
20             $num=words2nums("twenty-second");
21             $num=words2nums("15 billion, 6 million, and ninteen");
22              
23             =head1 DESCRIPTION
24              
25             This module converts English text into numbers. It supports both ordinal and
26             cardinal numbers, negative numbers, and very large numbers.
27              
28             The main subroutine, which is exported by default, is words2nums(). This
29             subroutine, when fed a string, will attempt to convert it into a number.
30             If it succeeds, the number will be returned. If it fails, it returns undef.
31              
32             =head1 VARIABLES
33              
34             There are a number of variables that can be used to tweak the behavior of this
35             module. For example, debugging can be be enabled by setting
36             $Lingua::EN::Words2Nums::debug=1
37              
38             =over 4
39              
40             =cut
41              
42             # Public global variables.
43             our $debug = 0;
44             our $billion = 10 ** 9;
45              
46             =item $Lingua::EN::Words2Nums::debug
47              
48             Default: 0. If set to a true value, outputs on standard error some useful
49             messages if parsing fails for some reason.
50              
51             =item $Lingua::EN::Words2Nums::billion
52              
53             Default: 10 ** 9. This is the number that will be returned for "one billion".
54             It defaults to the American version; the English will want to set it to
55             10 ** 12. Setting this number automatically changes all the larger numbers
56             (trillion, quadrillion, etc) to match.
57              
58             =back
59              
60             =head1 NOTES
61              
62             It does not understand decimals or fractions, yet.
63              
64             Scores are supported, eg: "four score and ten". So are dozens. So is a baker's
65             dozen. And a gross.
66              
67             Various mispellings of numbers are understood.
68              
69             While it handles googol correctly, googolplex is too large to fit in perl's
70             standard scalar type, and "inf" will be returned.
71              
72             =cut
73            
74             our %nametosub = (
75             naught => [ \&num, 0 ], # Cardinal numbers, leaving out the a
76             nought => [ \&num, 0 ],
77             zero => [ \&num, 0 ], # ones that just add "th".
78             one => [ \&num, 1 ], first => [ \&num, 1 ],
79             two => [ \&num, 2 ], second => [ \&num, 2 ],
80             three => [ \&num, 3 ], third => [ \&num, 3 ],
81             four => [ \&num, 4 ], fourth => [ \&num, 4 ],
82             five => [ \&num, 5 ], fifth => [ \&num, 5 ],
83             six => [ \&num, 6 ],
84             seven => [ \&num, 7 ], seven => [ \&num, 7 ],
85             eight => [ \&num, 8 ], eighth => [ \&num, 8 ],
86             nine => [ \&num, 9 ], ninth => [ \&num, 9 ],
87             ten => [ \&num, 10 ],
88             eleven => [ \&num, 11 ],
89             twelve => [ \&num, 12 ], twelfth => [ \&num, 12 ],
90             thirteen => [ \&num, 13 ],
91             fifteen => [ \&num, 15 ],
92             eighteen => [ \&num, 18 ],
93             ninteen => [ \&num, 19 ], # common(?) mispelling
94             teen => [ \&suffix, 10 ], # takes care of the regular teens
95             twenty => [ \&num, 20 ], twentieth => [ \&num, 20 ],
96             thirty => [ \&num, 30 ], thirtieth => [ \&num, 30 ],
97             forty => [ \&num, 40 ], fortieth => [ \&num, 40 ],
98             fourty => [ \&num, 40 ], fourtieth => [ \&num, 40 ], # at least I mispell it like this
99             fifty => [ \&num, 50 ], fiftieth => [ \&num, 50 ],
100             sixty => [ \&num, 60 ], sixtieth => [ \&num, 60 ],
101             seventy => [ \&num, 70 ], seventieth => [ \&num, 70 ],
102             eighty => [ \&num, 80 ], eightieth => [ \&num, 80 ],
103             ninety => [ \&num, 90 ], ninetieth => [ \&num, 90 ],
104             ninty => [ \&num, 90 ], # common mispelling
105             hundred => [ \&prefix, 100 ],
106             thousand => [ \&prefix, 1000 ],
107             million => [ \&prefix, 10 ** 6 ],
108             milion => [ \&prefix, 10 ** 6 ], # common(?) mispelling
109             milliard => [ \&prefix, 10 ** 9 ],
110             billion => [ \&powprefix, 2 ], # These vary depending on country.
111             billiard => [ \&prefix, 10 ** 15 ],
112             trillion => [ \&powprefix, 3 ],
113             trilliard => [ \&prefix, 10 ** 21 ],
114             quadrillion => [ \&powprefix, 4 ],
115             quadrilliard => [ \&prefix, 10 ** 27 ],
116             quintillion => [ \&powprefix, 5 ],
117             quintilliard => [ \&prefix, 10 ** 33 ],
118             sextillion => [ \&powprefix, 6 ],
119             sextilliard => [ \&prefix, 10 ** 39 ],
120             septillion => [ \&powprefix, 7 ],
121             septilliard => [ \&prefix, 10 ** 45 ],
122             octillion => [ \&powprefix, 8 ],
123             octilliard => [ \&prefix, 10 ** 51 ],
124             nonillion => [ \&powprefix, 9 ],
125             nonilliard => [ \&prefix, 10 ** 57 ],
126             decillion => [ \&powprefix, 10 ],
127             decilliard => [ \&prefix, 10 ** 63 ],
128             undecillion => [ \&powprefix, 11 ],
129             undecilliard => [ \&prefix, 10 ** 69 ],
130             duodecillion => [ \&powprefix, 12 ],
131             duodecilliard => [ \&prefix, 10 ** 75 ],
132             tredecillion => [ \&powprefix, 13 ],
133             tredecilliard => [ \&prefix, 10 ** 81 ],
134             quattuordecillion => [ \&powprefix, 14 ],
135             quattuordecilliard => [ \&prefix, 10 ** 87 ],
136             quindecillion => [ \&powprefix, 15 ],
137             quindecilliard => [ \&prefix, 10 ** 93 ],
138             sexdecillion => [ \&powprefix, 16 ],
139             septendecillion => [ \&powprefix, 17 ],
140             octodecillion => [ \&powprefix, 18 ],
141             novemdecillion => [ \&powprefix, 19 ],
142             vigintillion => [ \&powprefix, 20 ],
143             unvigintillion => [ \&powprefix, 21 ],
144             duovigintillion => [ \&powprefix, 22 ],
145             duvigintillion => [ \&powprefix, 22 ], # some use this spelling
146             trevigintillion => [ \&powprefix, 23 ],
147             quattuorvigintillion => [ \&powprefix, 24 ],
148             quinvigintillion => [ \&powprefix, 25 ],
149             sexvigintillion => [ \&powprefix, 26 ],
150             septenvigintillion => [ \&powprefix, 27 ],
151             octovigintillion => [ \&powprefix, 28 ],
152             novemvigintillion => [ \&powprefix, 29 ],
153             trigintillion => [ \&powprefix, 30 ],
154             # This process can be continued indefinitely, but one has to stop
155             # somewhere. -- A Dictionary of Units of Measurement
156             centillion => [ \&powprefix, 100 ],
157             googol => [ \&googol ],
158             googolplex => [ \&googolplex ],
159             negative => [ \&invert ],
160             minus => [ \&invert ],
161             score => [ \&prefix, 20 ],
162             gross => [ \&prefix, 12 * 12 ],
163             dozen => [ \&prefix, 12 ],
164             bakersdozen => [ \&prefix, 13 ],
165             bakerdozen => [ \&prefix, 13 ],
166             eleventyone => [ \&num, 111 ], # This nprogram written on the day
167             eleventyfirst =>[ \&num, 111 ], # FOTR released.
168             s => [ sub {} ], # ignore 's', at the end of a word,
169             # easy pluralization of dozens, etc.
170             es => [ sub {} ], # same for 'es'; for googolplexes, etc.
171             th => [ sub {} ], # ignore 'th', for cardinal nums
172             );
173              
174             # Note the ordering, so that eg, ninety has a chance to match before nine.
175             my $numregexp = join("|", reverse sort keys %nametosub);
176             $numregexp=qr/($numregexp)/;
177              
178             my ($total, $mult, $oldpre, $newmult, $suffix, $val);
179              
180             sub num ($) {
181 100     100 0 172 $val = shift;
182 100 100       181 if ($suffix) {
183 1         2 $val += $suffix;
184 1         3 $suffix = 0;
185             }
186 100         126 $total += $val * $mult;
187 100         2896 $newmult = 0;
188             }
189              
190             sub prefix ($) {
191 58     58 0 76 my $pre = shift;
192 58 100       113 if ($pre > $oldpre) { # end of a prefix chain
193 50 100       86 $total += $mult if $newmult; # special case for lone "thousand", etc.
194 50         58 $mult = 1;
195             }
196 58         71 $mult *= $pre;
197 58         56 $oldpre = $pre;
198 58         600 $newmult = 1;
199             }
200              
201             sub powprefix {
202 3     3 0 9 my $power = shift;
203 3 50       9 if ($billion == 10 ** 9) { # EN
    0          
204 3         13 prefix(10 ** (($power + 1) * 3));
205             }
206             elsif ($billion == 10 ** 12) { # GB
207 0         0 prefix(10 ** ($power * 6));
208             }
209             else {
210 0         0 failure("\$billion is set to odd value: $billion");
211             }
212             }
213              
214              
215             sub suffix ($) {
216 1     1 0 13 $suffix = shift;
217             }
218              
219             sub invert () {
220 0     0 0 0 $total *= -1;
221             }
222              
223             sub googol () {
224 0     0 0 0 prefix(10 ** 100);
225             }
226              
227             sub googolplex () {
228 0     0 0 0 prefix(10 ** (10 ** 100));
229             }
230              
231             sub failure ($) {
232 6 50   6 0 15 print STDERR shift()."\n" if $debug;
233 6         24 return; # undef on failure
234             }
235              
236             sub words2nums ($) {
237 66     66 0 7461 local $_=lc(shift);
238 66         104 chomp $_;
239              
240 66         112 s/,//; # ignore comma, even if it's in a plain number
241 66 100       268 return $_ if /^[-+]?[.0-9\s]+$/; # short circuit for plain number
242              
243 60 50       311 if (/^[-+0-9.]+$/) {
244 0 0       0 return failure("+ or - not at beginning") if length $_;
245             }
246            
247 60         193 s/\b(and|a|of)\b//g; # ignore some common words
248 60         243 s/[^A-Za-z0-9.]//g; # ignore spaces and punctuation, except period.
249 60 100       135 return failure("not a number") unless length $_;
250              
251 57         235 $total=$oldpre=$suffix=$newmult=0;
252 57         56 $mult=1;
253            
254             # Work backwards up the string.
255 57         102 while (length $_) {
256 60         2116 $nametosub{$1}[0]->($nametosub{$1}[1]) while s/$numregexp$//;
257 60 100       173 if (length $_) {
258 14 100       68 if (s/(\d+)(?:st|nd|rd|th)?$//) {
259 11         19 num($1);
260             }
261             else {
262 3         6 last;
263             }
264             }
265             }
266 57 100       99 return failure("error at $_") if length $_;
267 54 100       101 $total += $mult if $newmult; # special case for lone "thousand", etc.
268 54         296 return $total;
269             }
270              
271             =head1 AUTHOR
272              
273             Copyright 2001-2003 Joey Hess
274              
275             This module is free software; you can redistribute it and/or
276             modify it under the same terms as Perl itself.
277              
278             =cut
279              
280             1