File Coverage

blib/lib/Regexp/IntInequality.pm
Criterion Covered Total %
statement 86 86 100.0
branch 107 110 100.0
condition 42 42 100.0
subroutine 6 6 100.0
pod 1 1 100.0
total 242 245 100.0


line stmt bran cond sub pod time code
1             #!perl
2             package Regexp::IntInequality;
3 1     1   154179 use warnings;
  1         2  
  1         67  
4 1     1   6 use strict;
  1         2  
  1         47  
5 1     1   8 use Exporter 'import';
  1         3  
  1         35  
6 1     1   20 use Carp;
  1         3  
  1         1877  
7              
8             our $VERSION = '0.90';
9             # For AUTHOR, COPYRIGHT, AND LICENSE see the bottom of this file
10              
11             our @EXPORT_OK = qw/ re_int_ineq /;
12              
13             =head1 Name
14              
15             Regexp::IntInequality - generate regular expressions to match integers
16             greater than / less than / etc. a value
17              
18             =head1 Synopsis
19              
20             use Regexp::IntInequality 're_int_ineq';
21            
22             # regex to match non-negative integers > 42 (ignores minus signs!):
23             my $gt = re_int_ineq('>', 42);
24             my $str = "Do you know why 23, 74, and 47 are special? And what about 42?";
25             while ( $str =~ /($gt)/g ) {
26             print "Match: $1\n"; # prints "Match: 74" and "Match: 47"
27             }
28            
29             # regex to match any integer <= 42:
30             # (the "map" is a trick to get a qr// regex in one line)
31             my ($le) = map {qr/^$_$/} re_int_ineq('<=', 42, 1);
32             for my $i (-123, 42, 47) { # the first two match, third doesn't
33             print $i=~$le ? "$i matches\n" : "$i doesn't match\n";
34             }
35              
36             =head1 Description
37              
38             This module provides a single function, C, which generates
39             regular expressions that match integers that fulfill a specified inequality
40             (greater than, less than, and so on). By default, only non-negative integers
41             are matched (minus signs are ignored), and optionally all integers can be
42             matched, including negative. Integers with leading zeros are never matched.
43              
44             B Normally, this is not a task for regular expressions, instead it is
45             often preferable to use regular expressions or other methods to extract the
46             numbers from a string and then use normal numeric comparison operators.
47             However, there are cases where this module can be useful, for example when
48             embedding these regular expressions as part of a larger expression or grammar,
49             or when dealing with an API that only accepts regular expressions.
50              
51             The generated regular expressions are valid in Perl, Python, and JavaScript
52             ES2018 or later, and probably in other languages that support
53             L with the same syntax.
54             L.
55              
56             =head2 C, I<$n>, I<$allint>, I<$anchor>>
57              
58             Generates a regex that matches integers according to the following parameters.
59             It is returned as a string rather than a precompiled regex so it can more
60             easily be embedded in larger expressions.
61              
62             Note the regular expressions will grow significantly the more digits are in
63             the integer. I suggest not to generate regular expressions from unbounded
64             user input.
65              
66             =head3 C<$op>
67              
68             The operator the regex should implement, one of C<< ">" >>, C<< ">=" >>,
69             C<< "<" >>, C<< "<=" >>, C<"!=">, or C<"=="> (the latter is provided simply
70             for completeness, despite the name of this module.)
71              
72             =head3 C<$n>
73              
74             The integer against which the regex should compare. It may not have leading
75             zeroes and may only be negative when L|/$allint> is a true value.
76              
77             =head3 C<$allint>
78              
79             If C<$allint> is B, then the generated regex will
80             only cover positive integers and zero, and C<$n> may not be negative.
81             B that in this case, any minus signs before integers are not
82             included in the regex. This means that when using the regex, for example, to
83             extract integers greater than 10 from the string C<"3 5 15 -7 -12">, it will
84             match C<"15"> B C<"12">!
85              
86             If C<$allint> is B, then the generated regex will cover all
87             integers, including negative, and C<$n> may also be any integer. Note
88             that all generated regexes that match zero will also match C<"-0"> and vice
89             versa.
90              
91             =head3 C<$anchor> and Anchoring
92              
93             If this argument is omitted, meaning if the function is called with two or
94             three arguments, this option is on by default. However, if this argument is
95             given explicitly, meaning the function is called with four arguments, then any
96             true value turns on the option, while any false value, I C,
97             turns the option off. This is important to note because many other functions
98             in Perl will make not make a distinction between omitting an argument and
99             passing C for that argument.
100              
101             When this option is on (see explanation above), the regex will have zero-width
102             assertions (a.k.a. anchors) surrounding the expression in order to prevent
103             matches inside of integers. For example, when extracting integers less than 20
104             from the string C<"1199 32 5">, the generated regex will by default I
105             extract the C<"11"> or C<"19"> from C<"1199">, and will only match C<"5">. On
106             the other hand, any non-digit characters (including minus signs) are
107             considered delimiters: extracting all integers less than 5 from the string
108             C<"2x3-3-24y25"> with L|/$allint> turned on will result in C<"2">,
109             C<"3">, C<"-3">, and C<"-24">.
110              
111             This behavior is useful if you are extracting numbers from a longer string.
112             If you want to validate that a string contains I an integer, then you
113             will need to add additional anchors. For example, assuming you've stored the
114             output of C in C<$re>, then you could say C<$str =~ /\A$re\z/> to
115             validate that C<$str> contains only that integer.
116             However, this task is more commonly done by first checking that C<$str> is a
117             valid integer in general, such as via the expressions provided by
118             L, and then using normal numeric comparisons to
119             check that it is in the range you expect.
120              
121             If on the other hand you want to turn off the default anchors described above,
122             perhaps because you want to implement your own, then you can pass a false
123             value for the C<$anchor> option. Repeating the above example, extracting
124             integers less than 20 from the string C<"1199 32 5"> with this option on and
125             no additional anchors results in C<"11">, C<"9">, C<"9">, C<"3">, C<"2">,
126             and C<"5"> - so use this feature with caution and testing!
127              
128             =begin comment
129              
130             1. < 2 -inf <--------------0- 1
131             2. <= 2 -inf <--------------0---- 2
132             3. => 2 0 2 ------> +inf
133             4. > 2 0 3 ---> +inf
134             --+--+--+--+--+--0--+--+--+--+--+--
135             -5 -4 -3 -2 -1 0 1 2 3 4 5
136             5. < -2 -inf <--- -3 0
137             6. <= -2 -inf <------ -2 0
138             7. => -2 -2 ----0-------------> +inf
139             8. > -2 -1 -0-------------> +inf
140              
141             1. positive ints up to the value, plus all negative ints
142              
143             2. gets converted to (1)
144              
145             3. gets converted to (4)
146              
147             4. positive ints starting at the value
148              
149             5. gets reflected, handled like (4), and reflected back
150              
151             6. gets converted to (5)
152              
153             7. gets converted to (8)
154              
155             8. gets reflected, handled like (1), and reflected back
156              
157             =end comment
158              
159             =cut
160              
161             # Regex character ranges for single digits
162             # ($_RNG_GT[0] is "all digits > 0" and so on; @_RNG_LT1 doesn't include 0)
163             my @_RNG_GT = ( map({"[$_-9]"} 1..7), '[89]', '9', '(?!)' );
164             my @_RNG_LT0 = ( '(?!)', '0', '[01]', map({"[0-$_]"} 2..8) );
165             my @_RNG_LT1 = ( '(?!)', '(?!)', '1', '[12]', map({"[1-$_]"} 3..8) );
166              
167             # A few constants
168             my @_ALLINT_ZN = ('-0','0','-[1-9][0-9]*'); # all ints, only zero & negative
169             my @_ALLINT_ZP = ('-0','0','[1-9][0-9]*'); # all ints, only zero & positive
170             my @_ALLINT_NN = ('0','[1-9][0-9]*'); # all non-negative ints
171              
172             my $_PREFIX_NN = '(?
173             my $_PREFIX_AI = '(?
174             my $_SUFFIX = '(?![0-9])';
175              
176             sub re_int_ineq { ## no critic (ProhibitExcessComplexity)
177             # operator, integer, "all integers" (negative), anchors
178 1294     1294 1 65639922 my ($op, $n, $ai, $anchor) = @_;
179              
180             # Handle arguments
181 1294 100       4462 $anchor=1 if @_<4;
182 1294 100 100     8614 croak "invalid arguments to re_int_ineq"
      100        
183             if !defined $op || !defined $n || @_>4;
184 1291 100       3338 if ($ai) { $n =~ /\A-?(?:0|[1-9][0-9]*)\z/ or croak "invalid int" }
  670 100       6422  
185 621 100       5159 else { $n =~ /\A(?:0|[1-9][0-9]*)\z/ or croak "invalid non-negative int" }
186              
187             # Handle easy operators first
188 1279 100       4397 if ($op eq '==') {
    100          
189 176 100 100     669 return $n==0 && $ai ? '-?0' : $n unless $anchor;
    100          
190 170 100 100     957 return "(?:${_PREFIX_AI}0|-0)$_SUFFIX" if $n==0 && $ai;
191 162 100       1227 return $n.$_SUFFIX if $n=~/\A-/;
192 124 100       1050 return +( $ai ? $_PREFIX_AI : $_PREFIX_NN ).$n.$_SUFFIX
193             }
194             elsif ($op eq '!=') {
195 176 100       693 if (!$anchor) {
196 6 100       63 return +( $ai ? '-?' : '' ). '[1-9][0-9]*' if $n==0;
    100          
197 3 100       11 return "(?!$n)".( $ai ? '-?' : '' ).'(?:0|[1-9][0-9]*)'
198             }
199 170 100       1084 return "(?!".( $n == 0 ? '-?0' : $n ).$_SUFFIX.")"
    100          
200             ."(?:$_PREFIX_AI(?:0|[1-9][0-9]*)|-0|-[1-9][0-9]*)$_SUFFIX"
201             if $ai;
202 86         477 return "(?!$n$_SUFFIX)$_PREFIX_NN(?:0|[1-9][0-9]*)$_SUFFIX"
203             }
204              
205             my $mkre = sub {
206 919     919   2079 my %se = map {$_=>1} @_;
  3119         7051  
207 919 50       2494 confess "assertion failed: no re" unless %se; # uncoverable branch true
208              
209             # A bit of optimization
210             delete $se{'0'}
211 919 100 100     3389 if $se{'[1-9]?[0-9]'} || grep {/\A\[0-?\d\]\z/} keys %se;
  2730         6630  
212             delete $se{'-0'}
213 919 100 100     6799 if $se{'-[1-9]?[0-9]'} || grep {/\A-\[0-?\d\]\z/} keys %se;
  2904         6161  
214              
215             # Separate positive and negative terms
216 919         1621 my (@pos, @neg);
217 919 100       2200 for (keys %se) { if (/\A-/) { push @neg,$_ } else { push @pos,$_ } }
  3026         6008  
  855         1828  
  2171         3891  
218             #@pos = sort { length($a)<=>length($b) or $a cmp $b } @pos;
219             #@neg = sort { length($a)<=>length($b) or $a cmp $b } @neg;
220             # simple sorting seems to work well enough:
221 919         3031 @pos = sort @pos;
222 919         1863 @neg = sort @neg;
223              
224 919         1377 my @all;
225             # Handle positive values - need prefix
226 919 100       2588 if (!$anchor) { push @all, @pos }
  8 100       41  
227 792 100       4440 elsif (@pos) { push @all, ( $ai ? $_PREFIX_AI : $_PREFIX_NN )
    100          
228             .( @pos>1 ? '(?:'.join('|',@pos).')' : $pos[0] ) }
229              
230             # Handle negative values
231             # The @neg>5 case is just for a small length reduction:
232             # 4: "-a|-b|-c|-d"=11 "-(?:a|b|c|d)"=12 +1
233             # 5: "-a|-b|-c|-d|-e"=14 "-(?:a|b|c|d|e)"=14 0
234             # 6: "-a|-b|-c|-d|-e|-f"=17 "-(?:a|b|c|d|e|f)"=16 -1
235 919 100       3623 if (@neg<6) { push @all, @neg }
  907         1719  
236 12         36 else { push @all, '-(?:'.join('|', map {substr $_,1} @neg ).')' }
  74         146  
237              
238             # Done
239 919 50       1805 confess "assertion failed: no re" unless @all; # uncoverable branch true
240 919 100       13128 return +( @all>1 ? '(?:'.join('|',@all).')' : $all[0] )
    100          
241             .( $anchor ? $_SUFFIX : '' );
242 927         6722 };
243              
244             # Inspect operator and adjust $n accordingly
245 927         1658 my $gt_not_lt; # Note: may be modified by $reflect below!
246 927 100 100     4742 if ($op eq '>' || $op eq '>=') {
    100 100        
247 467 100       1263 $n-- if $op eq '>='; # turn >= into >
248 467         692 $gt_not_lt = 1;
249             }
250             elsif ($op eq '<' || $op eq '<=') {
251 458 100       1439 $n++ if $op eq '<='; # turn <= into <
252 458         838 $gt_not_lt = 0;
253             }
254 2         326 else { croak "unknown operator" }
255              
256             # Handle some special cases the code below doesn't handle
257 925 100 100     2787 return '(?!)' if $n==0 && !$gt_not_lt && !$ai; # "<0"
      100        
258 919 100 100     2843 return $mkre->( $ai ? @_ALLINT_ZP : @_ALLINT_NN )
    100          
259             if $n==-1 && $gt_not_lt; # ">-1"/">=0"
260 899 100       2053 return $mkre->( $gt_not_lt ? '[1-9][0-9]*' : '-[1-9][0-9]*' )
    100          
261             if $n==0; # ">0"/">=1" and "<0"/"<=-1"
262              
263             # Prepare some variables
264 863   100     2290 my $reflect = $ai && $n<0; # reflect the number line over zero
265 863 100       3157 $gt_not_lt = !$gt_not_lt if $reflect; # invert operator
266 863 100       2292 my $an = $reflect ? -$n : $n; # invert input - abs($n)
267 863 100       2173 my $minus = $reflect ? '-' : ''; # invert output
268 863 50       3424 confess "assertion failed: an=$an" unless $an=~/\A[1-9][0-9]*\z/; # uncoverable branch true
269              
270 863         1311 my %subex;
271              
272             # Add the other half of the number line
273 863 100 100     2411 if ($ai && !$gt_not_lt)
274 229 100       2048 { $subex{$_}++ for $reflect ? @_ALLINT_NN : @_ALLINT_ZN }
275              
276             # Add expressions for all ints with a different number of digits
277 863 100       2498 if ($gt_not_lt) { # ">": all ints with more digits should match
278 425 100       1296 if (length($an)==1) # len 1 => 2+ digits
279 109         391 { $subex{$minus.'[1-9][0-9]+'}++ }
280             else # len 2+ => 3+ digits
281 316         1408 { $subex{$minus.'[1-9][0-9]{'.length($an).',}'}++ }
282             }
283             else { # "<": all ints with less digits should match
284 438 100       1628 if (length($an)>3) { # len 4+ => one to len-1 digits
    100          
    100          
285 95         334 $subex{$minus.'0'}++;
286 95         382 $subex{$minus.'[1-9][0-9]{0,'.(length($an)-2).'}'}++;
287             }
288             elsif (length($an)>2) # len 3 => one or two digits
289 125         589 { $subex{$minus.'[1-9]?[0-9]'}++ }
290             elsif (length($an)>1) # len 2 => one digit
291 106         454 { $subex{$minus.'[0-9]'}++ }
292             }
293              
294             # Add expressions for ints with the same number of digits
295 863         2529 my @dig = split //, $an;
296 863         2589 for my $i (0..$#dig) {
297 2129         3430 my $rest = $#dig-$i;
298 2129 100 100     6105 my $rng = $gt_not_lt ? \@_RNG_GT
    100          
299             : !$#dig||$i ? \@_RNG_LT0 : \@_RNG_LT1;
300 2129 100       11301 $subex{ $minus . substr($an,0,$i) . $rng->[$dig[$i]]
    100          
    100          
301             . ( !$rest ? '' : $rest==1 ? '[0-9]' : "[0-9]{$rest}" ) }++
302             # filter these out right away, since they won't match:
303             unless $rng->[$dig[$i]] eq '(?!)';
304             }
305              
306 863         3197 return $mkre->( keys %subex );
307             }
308              
309             1;
310             __END__