| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
############################################### |
|
4
|
|
|
|
|
|
|
# Author: Olivier Delouya - 2026 |
|
5
|
|
|
|
|
|
|
# File: Tstregex.pm (Hybrid Modulino) |
|
6
|
|
|
|
|
|
|
# Content: Regex longest match diagnostic |
|
7
|
|
|
|
|
|
|
# indent: Whitesmith (perltidy -bl -bli) |
|
8
|
|
|
|
|
|
|
############################################### |
|
9
|
|
|
|
|
|
|
|
|
10
|
15
|
|
|
15
|
|
1776319
|
use strict; |
|
|
15
|
|
|
|
|
29
|
|
|
|
15
|
|
|
|
|
600
|
|
|
11
|
15
|
|
|
15
|
|
71
|
use warnings; |
|
|
15
|
|
|
|
|
71
|
|
|
|
15
|
|
|
|
|
1335
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Tstregex - A Hybrid Regex Diagnostic Tool (single file Library module and command tool) |
|
16
|
|
|
|
|
|
|
shows the longest Regular Expression match / highlight the rejected part |
|
17
|
|
|
|
|
|
|
Example: |
|
18
|
|
|
|
|
|
|
$ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a' |
|
19
|
|
|
|
|
|
|
abc123 |
|
20
|
|
|
|
|
|
|
abcB<12a> (B<^[a-z]*>\d{3}$) |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# Above, the normal parts are the longuest matching substring when bold parts highlights the rejected substring (idem with regexp lexical groups between parenthesis) |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
C<$ tstregex 'regex' string1 string2 ... stringN |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 OPTIONS (CLI) |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head2 -h --help |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
show that help.. |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head2 -v --verbose |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
shows key info on (un)matching.. |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head2 -d --diag |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Triggers the Enriched Diagnostic View. It displays: |
|
41
|
|
|
|
|
|
|
- The string with the failing part highlighted. |
|
42
|
|
|
|
|
|
|
- The exact token in the regex that caused the break. |
|
43
|
|
|
|
|
|
|
- A visual pointer (C<^--- HERE>) aligned with the regex syntax. |
|
44
|
|
|
|
|
|
|
- Execution time (useful for spotting ReDoS/Exponential backtracking). |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head2 -a --assert |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Misc: performs a huge test suite various a large collection of regexp tests with Tstregex.. |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=head1 Perl Module SYNOPSIS |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
use Tstregex; |
|
55
|
|
|
|
|
|
|
my $ctx = tstregex_init_desc('/^\d{3}/'); |
|
56
|
|
|
|
|
|
|
tstregex($ctx, '12a'); |
|
57
|
|
|
|
|
|
|
if (!tstregex_is_full_match($res)) |
|
58
|
|
|
|
|
|
|
{ |
|
59
|
|
|
|
|
|
|
my $token = tstregex_get_fail_token($res); |
|
60
|
|
|
|
|
|
|
my $pos = tstregex_get_match_len($res); |
|
61
|
|
|
|
|
|
|
print "Failure on token '$token' at column $pos\n"; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 API |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 tstregex_init_desc($raw_re) |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Pre-parses the regex, handles delimiters (m!!, //, etc.), extracts modifiers (i, s, m, x), and prepares the nibbling steps. Returns a context hash. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head2 tstregex($ctx, $string) |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
Executes the diagnostic. Updates the context. |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head2 tstregex_is_full_match |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Returns match status of input string (BOOL 0 OR 1) |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head2 tstregex_get_match_portion |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Returns the matching portion in case of full match |
|
81
|
|
|
|
|
|
|
(might be smaller than input string, depending on anchors..) |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head2 tstregex_get_match_len |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
Returns the matching substring length |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
=head2 tstregex_get_fail_token |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Returns the failing token in the regexp |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 tstregex_get_re_clean |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Returns the matching regexp subpart |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=head2 tstregex_get_re_raw |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Returns the internal representation of the regexp |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head2 tstregex_get_prefix_offset |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Returns the offset of the original regexp in the raw regexp |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
C is designed to solve the "Black Box" problem of Regular Expressions. |
|
106
|
|
|
|
|
|
|
When a complex regex fails, Perl usually just says "No Match". This tool |
|
107
|
|
|
|
|
|
|
identifies exactly B and B it failed by finding the longest possible |
|
108
|
|
|
|
|
|
|
partial match. |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 EXAMPLE |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
$ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a' |
|
113
|
|
|
|
|
|
|
abc123 |
|
114
|
|
|
|
|
|
|
abcB<12a> (B<^[a-z]*>\d{3}$) |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
I |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head2 The "Nibbling" Engine |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
The diagnostic logic uses a "Nibbling" (grignotage) strategy: |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=over 4 |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=item 1. Decomposition |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
The engine breaks down your regex into a hierarchy of valid sub-patterns (lexical groups, atoms, and quantifiers) from longest to shortest. |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=item 2. Longest Match Search |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
It iteratively tests these sub-patterns against the input string. It's not just checking if the start matches, but what is the I sequence of instructions the engine could follow before hitting a wall. |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=item 3. Failure Point Identification |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
Once the longest matching sub-pattern is found, the tool identifies the very next token in your regex syntax. This is your "Point of Failure". |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=back |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 AUTHOR |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Olivier Delouya - 2026 |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head1 LICENSE |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Artistic Version 2 |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=cut |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
package main; |
|
150
|
|
|
|
|
|
|
{ |
|
151
|
15
|
|
|
15
|
|
104
|
use strict; |
|
|
15
|
|
|
|
|
43
|
|
|
|
15
|
|
|
|
|
352
|
|
|
152
|
15
|
|
|
15
|
|
85
|
use warnings; |
|
|
15
|
|
|
|
|
43
|
|
|
|
15
|
|
|
|
|
738
|
|
|
153
|
15
|
|
|
15
|
|
93
|
use Carp qw(confess); |
|
|
15
|
|
|
|
|
24
|
|
|
|
15
|
|
|
|
|
1159
|
|
|
154
|
|
|
|
|
|
|
$SIG{__WARN__} = 'confess'; |
|
155
|
|
|
|
|
|
|
$SIG{__DIE__ } = 'confess'; |
|
156
|
|
|
|
|
|
|
|
|
157
|
15
|
|
|
15
|
|
9822
|
use Term::ANSIColor qw(:constants); |
|
|
15
|
|
|
|
|
146642
|
|
|
|
15
|
|
|
|
|
18009
|
|
|
158
|
15
|
|
|
15
|
|
149
|
use Time::HiRes qw(gettimeofday tv_interval); |
|
|
15
|
|
|
|
|
27
|
|
|
|
15
|
|
|
|
|
143
|
|
|
159
|
15
|
|
|
15
|
|
8228
|
use utf8; |
|
|
15
|
|
|
|
|
3958
|
|
|
|
15
|
|
|
|
|
106
|
|
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# --- Constants & ASCII Codes --- |
|
162
|
|
|
|
|
|
|
use constant |
|
163
|
|
|
|
|
|
|
{ |
|
164
|
15
|
|
|
|
|
4468
|
C_m => 109, C_g => 103, C_i => 105, C_s => 115, |
|
165
|
|
|
|
|
|
|
C_x => 120, C_SLASH => 47, C_SPACE => 32, C_ZERO => 48, |
|
166
|
|
|
|
|
|
|
C_NINE => 57, C_UP_A => 65, C_UP_Z => 90, C_LOW_A => 97, |
|
167
|
|
|
|
|
|
|
C_LOW_Z => 122, C_UNDSC => 95, |
|
168
|
|
|
|
|
|
|
OP_PAR => 40, CL_PAR => 41, OP_BRK => 91, CL_BRK => 93, |
|
169
|
|
|
|
|
|
|
OP_BRC => 123, CL_BRC => 125, OP_ANG => 60, CL_ANG => 62, |
|
170
|
|
|
|
|
|
|
ESC => "\e", |
|
171
|
|
|
|
|
|
|
CUU => "\e[A", # Cursor Up |
|
172
|
|
|
|
|
|
|
UI_OFFSET=> 11, # Alignment offset for " Syntax: " |
|
173
|
15
|
|
|
15
|
|
1414
|
}; |
|
|
15
|
|
|
|
|
25
|
|
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# --- ENCAPSULATED DEBUG ALIAS --- |
|
176
|
|
|
|
|
|
|
BEGIN |
|
177
|
|
|
|
|
|
|
{ |
|
178
|
15
|
50
|
|
15
|
|
29334
|
if ($INC{'perl5db.pl'}) |
|
179
|
|
|
|
|
|
|
{ |
|
180
|
0
|
|
|
|
|
0
|
require Data::Dumper; |
|
181
|
0
|
|
|
|
|
0
|
require Term::ANSIColor; |
|
182
|
15
|
|
|
15
|
|
100
|
no strict 'refs'; |
|
|
15
|
|
|
|
|
40
|
|
|
|
15
|
|
|
|
|
629
|
|
|
183
|
15
|
|
|
15
|
|
67
|
no warnings 'once'; |
|
|
15
|
|
|
|
|
22
|
|
|
|
15
|
|
|
|
|
4788
|
|
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
my $debug_sub = sub |
|
186
|
|
|
|
|
|
|
{ |
|
187
|
0
|
|
|
|
|
0
|
my @args = @_; |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Automatically detect flattened hashes: |
|
190
|
|
|
|
|
|
|
# If even number of arguments and the first one isn't a reference |
|
191
|
0
|
0
|
0
|
|
|
0
|
if (scalar @args > 1 && scalar @args % 2 == 0 && !ref($args[0])) |
|
|
|
|
0
|
|
|
|
|
|
192
|
|
|
|
|
|
|
{ |
|
193
|
|
|
|
|
|
|
# Wrap the flattened list into a temporary hashref |
|
194
|
0
|
|
|
|
|
0
|
my %tmp_hash = @args; |
|
195
|
0
|
|
|
|
|
0
|
@args = (\%tmp_hash); |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
|
|
198
|
0
|
|
|
|
|
0
|
print "\n", Term::ANSIColor::BOLD(), Term::ANSIColor::BLUE(), |
|
199
|
|
|
|
|
|
|
'DEBUG (tstregex): ', Term::ANSIColor::RESET(), |
|
200
|
|
|
|
|
|
|
Data::Dumper::Dumper(@args); |
|
201
|
0
|
|
|
|
|
0
|
}; |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Force injection into all relevant namespaces |
|
204
|
0
|
|
|
|
|
0
|
foreach my $pkg ('main', 'Tstregex', 'DB') |
|
205
|
|
|
|
|
|
|
{ |
|
206
|
0
|
|
|
|
|
0
|
*{"${pkg}::d"} = $debug_sub; |
|
|
0
|
|
|
|
|
0
|
|
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
|
|
209
|
0
|
0
|
|
|
|
0
|
my $cuu = defined &main::CUU ? main::CUU() : "\e[A"; |
|
210
|
0
|
|
|
|
|
0
|
print $cuu, Term::ANSIColor::BOLD(), Term::ANSIColor::CYAN(), |
|
211
|
|
|
|
|
|
|
'INFO: ', Term::ANSIColor::RESET(), |
|
212
|
|
|
|
|
|
|
"Alias 'd' ready (Auto-hash detection enabled)\n\n"; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
exit(main(scalar(@ARGV), \@ARGV)) if(!caller); |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub main |
|
219
|
|
|
|
|
|
|
{ |
|
220
|
0
|
|
|
0
|
|
0
|
my ($argc, $argv) = @_; |
|
221
|
0
|
0
|
0
|
|
|
0
|
if(!$argc || ($argc && $$argv[0] =~ /^-h|--help$/)) { help(); exit(0); } |
|
|
0
|
|
0
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
222
|
|
|
|
|
|
|
|
|
223
|
0
|
|
|
|
|
0
|
binmode STDOUT, ':utf8'; |
|
224
|
0
|
|
|
|
|
0
|
my ($mode_diag, $verbose, $assert) = (0)x2; |
|
225
|
0
|
|
|
|
|
0
|
for(my $i=0; $i<$argc; $i++) |
|
226
|
|
|
|
|
|
|
{ |
|
227
|
0
|
0
|
0
|
|
|
0
|
do { $mode_diag = 1; undef $$argv[$i]; next } if (!$mode_diag && $$argv[$i] =~ /^-d|--diag$/); |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
228
|
0
|
0
|
0
|
|
|
0
|
do { $verbose = 1; undef $$argv[$i]; next } if (!$verbose && $$argv[$i] =~ /^-v|--verbose$/); |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
229
|
0
|
0
|
0
|
|
|
0
|
do { $assert = 1; undef $$argv[$i]; next } if (!$assert && $$argv[$i] =~ /^-a|--assert$/); |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
|
|
232
|
0
|
0
|
|
|
|
0
|
if ($assert) |
|
233
|
|
|
|
|
|
|
{ |
|
234
|
0
|
|
|
|
|
0
|
print BOLD, BLUE, "--- Internal Test Suite (DATA Section) ---\n", RESET; |
|
235
|
0
|
|
|
|
|
0
|
_run_internal_tests($mode_diag, $verbose); |
|
236
|
0
|
|
|
|
|
0
|
exit(0); |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
|
|
239
|
0
|
|
|
|
|
0
|
my @new_argv; |
|
240
|
0
|
|
|
|
|
0
|
foreach(@$argv) |
|
241
|
|
|
|
|
|
|
{ |
|
242
|
0
|
0
|
|
|
|
0
|
push @new_argv, $_ if(defined($_)); |
|
243
|
|
|
|
|
|
|
} |
|
244
|
0
|
|
|
|
|
0
|
$argv = \@new_argv; |
|
245
|
0
|
|
|
|
|
0
|
$argc = scalar @$argv; |
|
246
|
|
|
|
|
|
|
|
|
247
|
0
|
|
|
|
|
0
|
my $re_raw = shift @{$argv}; |
|
|
0
|
|
|
|
|
0
|
|
|
248
|
0
|
|
|
|
|
0
|
my $ctx = Tstregex::tstregex_init_desc($re_raw); |
|
249
|
0
|
|
|
|
|
0
|
my $global_result = 0; # success! BE POSITIVE !! |
|
250
|
0
|
|
|
|
|
0
|
foreach my $pattern (@{$argv}) |
|
|
0
|
|
|
|
|
0
|
|
|
251
|
|
|
|
|
|
|
{ |
|
252
|
0
|
0
|
|
|
|
0
|
my $t0 = [gettimeofday] if $mode_diag; |
|
253
|
0
|
|
|
|
|
0
|
my $result = Tstregex::tstregex($ctx, $pattern); |
|
254
|
0
|
0
|
|
|
|
0
|
$global_result = 1 if($result); |
|
255
|
0
|
0
|
|
|
|
0
|
$mode_diag ? _display_enriched($pattern, $ctx, tv_interval($t0)) |
|
256
|
|
|
|
|
|
|
: _display_standard($pattern, $ctx); |
|
257
|
0
|
0
|
|
|
|
0
|
if($verbose) |
|
258
|
|
|
|
|
|
|
{ |
|
259
|
0
|
0
|
|
|
|
0
|
print $result? 'Match':'UNmatch', '! Match length: ', Tstregex::tstregex_get_match_len($ctx), '; '; |
|
260
|
0
|
0
|
|
|
|
0
|
print $result? ('Match portion: ', Term::ANSIColor::UNDERLINE(), Tstregex::tstregex_get_match_portion($ctx)) |
|
261
|
|
|
|
|
|
|
: ('Fail token: ', Tstregex::tstregex_get_fail_token($ctx)); |
|
262
|
0
|
|
|
|
|
0
|
print Term::ANSIColor::RESET(), "\n"; |
|
263
|
0
|
0
|
0
|
|
|
0
|
print $ctx->{warning} if($verbose && $ctx->{warning} ne ''); |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
} |
|
266
|
0
|
|
|
|
|
0
|
return $global_result; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub _run_internal_tests |
|
270
|
|
|
|
|
|
|
{ |
|
271
|
0
|
|
|
0
|
|
0
|
my ($mode_diag, $verbose) = @_; |
|
272
|
0
|
|
|
|
|
0
|
my $fh = \*main::DATA; |
|
273
|
0
|
|
|
|
|
0
|
seek($fh, 0, 0); |
|
274
|
|
|
|
|
|
|
|
|
275
|
0
|
|
|
|
|
0
|
my $found_data_token = 0; |
|
276
|
|
|
|
|
|
|
|
|
277
|
0
|
|
|
|
|
0
|
while (<$fh>) |
|
278
|
|
|
|
|
|
|
{ |
|
279
|
0
|
|
|
|
|
0
|
chomp; |
|
280
|
|
|
|
|
|
|
# PHASE 1: Skip everything until we hit the __DATA__ or __END__ marker |
|
281
|
|
|
|
|
|
|
# This prevents the script from parsing its own source code |
|
282
|
0
|
0
|
|
|
|
0
|
if (!$found_data_token) |
|
283
|
|
|
|
|
|
|
{ |
|
284
|
0
|
0
|
|
|
|
0
|
$found_data_token = 1 if /^__(DATA|END)__/; |
|
285
|
0
|
|
|
|
|
0
|
next; |
|
286
|
|
|
|
|
|
|
} |
|
287
|
|
|
|
|
|
|
else |
|
288
|
|
|
|
|
|
|
{ |
|
289
|
0
|
0
|
|
|
|
0
|
last if /^__(DATA|END)__/; |
|
290
|
|
|
|
|
|
|
} |
|
291
|
0
|
0
|
0
|
|
|
0
|
next if /^\s*$/ || /^#/; |
|
292
|
0
|
|
|
|
|
0
|
my ($re, @rest) = split(/\s+|:::\s*/, $_); |
|
293
|
0
|
0
|
|
|
|
0
|
next unless $re; |
|
294
|
0
|
0
|
|
|
|
0
|
my @strings = grep { $_ ne '0' && $_ ne '1' } @rest; |
|
|
0
|
|
|
|
|
0
|
|
|
295
|
|
|
|
|
|
|
|
|
296
|
0
|
|
|
|
|
0
|
print BOLD, YELLOW, 'Testing Regex: ', RESET, "$re\n"; |
|
297
|
0
|
|
|
|
|
0
|
my $ctx = Tstregex::tstregex_init_desc($re); |
|
298
|
0
|
0
|
0
|
|
|
0
|
print 'Warning ', $ctx->{warning}, "\n" if($verbose && $ctx->{warning} ne ''); |
|
299
|
0
|
|
|
|
|
0
|
foreach my $s (@strings) |
|
300
|
|
|
|
|
|
|
{ |
|
301
|
0
|
|
|
|
|
0
|
my $t0; |
|
302
|
0
|
0
|
|
|
|
0
|
$t0 = [gettimeofday] if $mode_diag; |
|
303
|
0
|
|
|
|
|
0
|
Tstregex::tstregex($ctx, $s); |
|
304
|
0
|
0
|
|
|
|
0
|
$mode_diag ? _display_enriched($s, $ctx, tv_interval($t0)) |
|
305
|
|
|
|
|
|
|
: _display_standard($s, $ctx); |
|
306
|
|
|
|
|
|
|
} |
|
307
|
0
|
|
|
|
|
0
|
print '-' x 40, "\n"; |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
} |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
sub _display_standard |
|
312
|
|
|
|
|
|
|
{ |
|
313
|
0
|
|
|
0
|
|
0
|
my ($pattern, $ctx) = @_; |
|
314
|
0
|
0
|
|
|
|
0
|
if (Tstregex::tstregex_is_full_match($ctx)) |
|
315
|
|
|
|
|
|
|
{ |
|
316
|
0
|
|
|
|
|
0
|
print "$pattern"; |
|
317
|
0
|
|
|
|
|
0
|
print "\n"; |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
else |
|
320
|
|
|
|
|
|
|
{ |
|
321
|
0
|
|
|
|
|
0
|
my $match_len = Tstregex::tstregex_get_match_len ($ctx); |
|
322
|
0
|
|
|
|
|
0
|
my $fail_token = Tstregex::tstregex_get_fail_token($ctx); |
|
323
|
0
|
|
|
|
|
0
|
my $re_clean = Tstregex::tstregex_get_re_clean ($ctx); |
|
324
|
0
|
|
|
|
|
0
|
print substr($pattern, 0, $match_len), BOLD, substr($pattern, $match_len), RESET; |
|
325
|
0
|
|
|
|
|
0
|
my $off = length($re_clean) - length($fail_token); |
|
326
|
0
|
|
|
|
|
0
|
print ' (', substr($re_clean, 0, $off), BOLD, $fail_token, RESET, ")\n"; |
|
327
|
|
|
|
|
|
|
} |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
sub _display_enriched |
|
331
|
|
|
|
|
|
|
{ |
|
332
|
0
|
|
|
0
|
|
0
|
my ($pattern, $ctx, $elapsed) = @_; |
|
333
|
0
|
|
|
|
|
0
|
print BOLD, MAGENTA, '--- Diagnostic View ---', RESET, "\n"; |
|
334
|
|
|
|
|
|
|
|
|
335
|
0
|
0
|
|
|
|
0
|
if (Tstregex::tstregex_is_full_match($ctx)) |
|
336
|
|
|
|
|
|
|
{ |
|
337
|
0
|
|
|
|
|
0
|
print GREEN, ' Result: ', RESET, "$pattern (FULL MATCH)\n"; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
else |
|
340
|
|
|
|
|
|
|
{ |
|
341
|
0
|
|
|
|
|
0
|
my $match_len = Tstregex::tstregex_get_match_len ($ctx); |
|
342
|
0
|
|
|
|
|
0
|
my $fail_token = Tstregex::tstregex_get_fail_token ($ctx); |
|
343
|
0
|
|
|
|
|
0
|
my $re_clean = Tstregex::tstregex_get_re_clean ($ctx); |
|
344
|
0
|
|
|
|
|
0
|
my $prefix_off = Tstregex::tstregex_get_prefix_offset($ctx); |
|
345
|
0
|
|
|
|
|
0
|
my $re_raw = Tstregex::tstregex_get_re_raw ($ctx); |
|
346
|
|
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
0
|
print YELLOW, ' Result: ', RESET, substr($pattern, 0, $match_len), |
|
348
|
|
|
|
|
|
|
BOLD, WHITE, substr($pattern, $match_len), RESET; |
|
349
|
0
|
|
|
|
|
0
|
print ' (at ', CYAN, $fail_token, RESET, ")\n"; |
|
350
|
|
|
|
|
|
|
|
|
351
|
0
|
|
|
|
|
0
|
my $err_pos_in_clean = length($re_clean) - length($fail_token); |
|
352
|
0
|
|
|
|
|
0
|
my $final_pointer_pos = $prefix_off + $err_pos_in_clean; |
|
353
|
|
|
|
|
|
|
|
|
354
|
0
|
|
|
|
|
0
|
print ' Syntax: ', WHITE, $re_raw, RESET, "\n"; |
|
355
|
0
|
|
|
|
|
0
|
print ' ', ' ' x $final_pointer_pos, BOLD, RED, '^--- HERE', RESET, "\n"; |
|
356
|
|
|
|
|
|
|
} |
|
357
|
0
|
|
|
|
|
0
|
printf " Time: %.4fs\n\n", $elapsed; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub help |
|
361
|
|
|
|
|
|
|
{ |
|
362
|
0
|
|
|
0
|
|
0
|
print BOLD, WHITE, "Tstregex.pm - Longest match Regular Expression Diagnostic Tool (2026 - PerlOD)\n", RESET; |
|
363
|
0
|
|
|
|
|
0
|
print "Usage:\n"; |
|
364
|
0
|
|
|
|
|
0
|
print " perl Tstregex.pm [options] 'regex' 'string1' ['string2' ...]\n\n"; |
|
365
|
0
|
|
|
|
|
0
|
print "Examples:\n"; |
|
366
|
0
|
|
|
|
|
0
|
print " perl Tstregex.pm '([0-3][0-9])/[0-1][0-9]/\\d{4}' '21/72/1985'\n"; |
|
367
|
0
|
|
|
|
|
0
|
print ' 21/', BOLD, '72/1985', RESET, ' ([0-3][0-9]/', BOLD, '[0-1][0-9]/\d{4}', RESET, ")\n\n"; |
|
368
|
0
|
|
|
|
|
0
|
print BOLD, 'DELIMITERS ', RESET, "are optional\n"; |
|
369
|
0
|
|
|
|
|
0
|
print " Supported: /.../, m!...!, m{...}. Modifiers (/i, /x, /s...) and captures are supported.\n\n"; |
|
370
|
0
|
|
|
|
|
0
|
print "Options:\n"; |
|
371
|
0
|
|
|
|
|
0
|
print "-h --help Shows that help\n"; |
|
372
|
0
|
|
|
|
|
0
|
print "-v --verbose Shows keys info on match/unmatch\n"; |
|
373
|
0
|
|
|
|
|
0
|
print "-d --diag Enriched diagnostic with timing and syntax pointers\n"; |
|
374
|
0
|
|
|
|
|
0
|
print "-a --assert Misc: shows a large test of regexp against tstregex..\n"; |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
} |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
1; |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
package Tstregex; |
|
382
|
|
|
|
|
|
|
{ |
|
383
|
|
|
|
|
|
|
our $VERSION = '1.03'; |
|
384
|
15
|
|
|
15
|
|
131
|
use Exporter qw(import); |
|
|
15
|
|
|
|
|
25
|
|
|
|
15
|
|
|
|
|
4157
|
|
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
our @EXPORT = qw( |
|
387
|
|
|
|
|
|
|
tstregex |
|
388
|
|
|
|
|
|
|
tstregex_init_desc |
|
389
|
|
|
|
|
|
|
tstregex_get_match_len |
|
390
|
|
|
|
|
|
|
tstregex_get_fail_token |
|
391
|
|
|
|
|
|
|
tstregex_is_full_match |
|
392
|
|
|
|
|
|
|
tstregex_get_re_clean |
|
393
|
|
|
|
|
|
|
tstregex_get_prefix_offset |
|
394
|
|
|
|
|
|
|
tstregex_get_re_raw |
|
395
|
|
|
|
|
|
|
tstregex_get_match_portion |
|
396
|
|
|
|
|
|
|
tstregex_get_info |
|
397
|
|
|
|
|
|
|
); |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# --- PUBLIC API (The Getters) --- |
|
400
|
|
|
|
|
|
|
|
|
401
|
115
|
|
|
115
|
1
|
466
|
sub tstregex_get_match_len { return $_[0]->{match_len}; } |
|
402
|
115
|
|
|
115
|
1
|
450
|
sub tstregex_get_fail_token { return $_[0]->{fail_token}; } |
|
403
|
0
|
|
|
0
|
1
|
0
|
sub tstregex_is_full_match { return $_[0]->{full_match}; } |
|
404
|
0
|
|
|
0
|
1
|
0
|
sub tstregex_get_re_clean { return $_[0]->{re_clean}; } |
|
405
|
0
|
|
|
0
|
1
|
0
|
sub tstregex_get_prefix_offset { return $_[0]->{prefix_offset}; } |
|
406
|
0
|
|
|
0
|
1
|
0
|
sub tstregex_get_re_raw { return $_[0]->{re_raw}; } |
|
407
|
115
|
|
50
|
115
|
0
|
509
|
sub tstregex_get_captures { return $_[0]->{captures} // [];} |
|
408
|
0
|
|
|
0
|
1
|
0
|
sub tstregex_get_match_portion { return $_[0]->{match_portion}; } |
|
409
|
0
|
|
|
0
|
0
|
0
|
sub tstregex_get_info { return $_[0]->{warning}; } |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# Main diagnostic function |
|
412
|
15
|
|
|
15
|
|
106
|
use constant; |
|
|
15
|
|
|
|
|
26
|
|
|
|
15
|
|
|
|
|
1448
|
|
|
413
|
|
|
|
|
|
|
use constant |
|
414
|
|
|
|
|
|
|
{ |
|
415
|
15
|
|
|
|
|
39051
|
C_EMPTY => '', |
|
416
|
|
|
|
|
|
|
RE_EMPTY => qr/\0/, |
|
417
|
|
|
|
|
|
|
ASCII_LPAREN => ord('('), # 40 |
|
418
|
|
|
|
|
|
|
ASCII_RPAREN => ord(')'), # 41 |
|
419
|
|
|
|
|
|
|
ASCII_LBRACE => ord('{'), # 123 |
|
420
|
|
|
|
|
|
|
ASCII_RBRACE => ord('}'), # 125 |
|
421
|
|
|
|
|
|
|
ASCII_LBRACK => ord('['), # 91 |
|
422
|
|
|
|
|
|
|
ASCII_RBRACK => ord(']'), # 93 |
|
423
|
|
|
|
|
|
|
ASCII_LT => ord('<'), # 60 |
|
424
|
|
|
|
|
|
|
ASCII_GT => ord('>'), # 62 |
|
425
|
15
|
|
|
15
|
|
99
|
}; |
|
|
15
|
|
|
|
|
47
|
|
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
sub tstregex |
|
428
|
|
|
|
|
|
|
{ |
|
429
|
115
|
|
|
115
|
1
|
518
|
my ($ctx, $pattern) = @_; |
|
430
|
|
|
|
|
|
|
# FIX: re init start-state fields in case of multiple test patterns |
|
431
|
115
|
|
|
|
|
286
|
$ctx -> {'fail_token'} = Tstregex::C_EMPTY; |
|
432
|
115
|
|
|
|
|
172
|
$ctx -> {'match_portion'} = undef; |
|
433
|
115
|
|
|
|
|
153
|
$ctx -> {'right_unmatch'} = undef; |
|
434
|
115
|
|
|
|
|
1134
|
$ctx -> {'match_len'} = 0; |
|
435
|
115
|
|
|
|
|
149
|
$ctx -> {'full_match'} = 0; |
|
436
|
115
|
|
|
|
|
153
|
$ctx -> {'left_unmatch'} = undef; |
|
437
|
115
|
|
|
|
|
173
|
my $re_raw = $ctx->{re_raw}; |
|
438
|
115
|
|
|
|
|
1198
|
my $org_pat = $pattern; |
|
439
|
115
|
|
|
|
|
165
|
my $internal_offset = 0; |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# Handle prefix offset if pattern is wrapped like the RE |
|
442
|
115
|
100
|
66
|
|
|
296
|
if ($ctx->{prefix_offset} > 0 && length($pattern) >= $ctx->{prefix_offset} + 1) |
|
443
|
|
|
|
|
|
|
{ |
|
444
|
1
|
|
|
|
|
5
|
my $re_delim_char = _stringat($re_raw, $ctx->{prefix_offset} - 1); |
|
445
|
1
|
|
|
|
|
3
|
my $pat_first_char = _stringat($pattern, 0); |
|
446
|
1
|
50
|
|
|
|
4
|
if ($pat_first_char == $re_delim_char) |
|
447
|
|
|
|
|
|
|
{ |
|
448
|
1
|
|
|
|
|
4
|
$pattern = substr($org_pat, $ctx->{prefix_offset}); |
|
449
|
1
|
|
|
|
|
3
|
chop $pattern; |
|
450
|
1
|
|
|
|
|
3
|
$internal_offset = $ctx->{prefix_offset}; |
|
451
|
|
|
|
|
|
|
} |
|
452
|
|
|
|
|
|
|
} |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# Fast track: check if it matches globally first |
|
455
|
115
|
100
|
|
|
|
755
|
if ($pattern =~ $ctx->{re_compiled}) |
|
456
|
|
|
|
|
|
|
{ |
|
457
|
68
|
|
|
|
|
125
|
$ctx->{full_match} = 1; |
|
458
|
68
|
|
|
|
|
126
|
$ctx->{match_len} = length($org_pat); |
|
459
|
68
|
|
|
|
|
190
|
$ctx->{match_portion} = $&; # the exact sub portion that matched |
|
460
|
68
|
|
|
|
|
145
|
$ctx->{left_unmatch} = $`; # the left part of matching sub part |
|
461
|
68
|
|
|
|
|
149
|
$ctx->{right_unmatch} = $'; # the right part of matching sub part |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# --- Capture Groups Extraction --- |
|
464
|
|
|
|
|
|
|
# We populate the captures array only on a successful global match |
|
465
|
68
|
|
|
|
|
86
|
my @caps; |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
# The special variable $#- contains the number of capture groups |
|
468
|
|
|
|
|
|
|
# We start at 1 because $0 is the whole match |
|
469
|
68
|
|
|
|
|
268
|
for my $i (1 .. $#-) |
|
470
|
|
|
|
|
|
|
{ |
|
471
|
22
|
100
|
|
|
|
65
|
if (defined $-[$i]) |
|
472
|
|
|
|
|
|
|
{ |
|
473
|
|
|
|
|
|
|
# Extracting the substring using offsets from @- and @+ |
|
474
|
21
|
|
|
|
|
116
|
push @caps, substr($pattern, $-[$i], $+[$i] - $-[$i]); |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
else |
|
477
|
|
|
|
|
|
|
{ |
|
478
|
|
|
|
|
|
|
# Optional group that participated but didn't catch text |
|
479
|
1
|
|
|
|
|
4
|
push @caps, undef; |
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
} |
|
482
|
68
|
|
|
|
|
151
|
$ctx->{captures} = \@caps; |
|
483
|
|
|
|
|
|
|
|
|
484
|
68
|
|
|
|
|
211
|
return 1; |
|
485
|
|
|
|
|
|
|
} |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
# Nibbling phase: find the longest matching lexical group |
|
488
|
47
|
|
|
|
|
68
|
my $match_reg = Tstregex::C_EMPTY; |
|
489
|
47
|
|
|
|
|
63
|
foreach my $step (@{$ctx->{steps}}) |
|
|
47
|
|
|
|
|
155
|
|
|
490
|
|
|
|
|
|
|
{ |
|
491
|
114
|
100
|
|
|
|
1380
|
if ($pattern =~ qr/$step/) |
|
492
|
|
|
|
|
|
|
{ |
|
493
|
35
|
|
|
|
|
104
|
$match_reg = $step; |
|
494
|
35
|
|
|
|
|
60
|
last; |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
} |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# ** SENSITIVE ** |
|
499
|
|
|
|
|
|
|
# Fine-tune the match length character by character |
|
500
|
|
|
|
|
|
|
# Append a \z to avoid Perl to skip final \n |
|
501
|
|
|
|
|
|
|
# if Nibbling failed ($match_reg empty), get the full regex for fine-tuning. |
|
502
|
47
|
100
|
|
|
|
149
|
my $target_re = ($match_reg ne Tstregex::C_EMPTY) ? $match_reg : $ctx->{re_clean}; |
|
503
|
|
|
|
|
|
|
# critical! add starting anchor (\A) to force coherency check from the first char.. |
|
504
|
47
|
|
|
|
|
171
|
my ($match_work, $warn) = _safe_qr("\\A$target_re\\z"); # (qr/\A$target_re\z/, undef); #; |
|
505
|
47
|
|
|
|
|
148
|
for (my $i = length($pattern); $i >= 0; $i--) |
|
506
|
|
|
|
|
|
|
{ |
|
507
|
|
|
|
|
|
|
# check if current prefix is valid according to the target |
|
508
|
117
|
100
|
|
|
|
363
|
last if ($pattern =~ $match_work); |
|
509
|
83
|
|
|
|
|
193
|
chop $pattern; |
|
510
|
|
|
|
|
|
|
} |
|
511
|
47
|
|
|
|
|
93
|
$ctx->{match_len} = length($pattern) + $internal_offset; |
|
512
|
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
# Identify the failing token for display |
|
514
|
47
|
50
|
|
|
|
55
|
my $tail_re = (scalar @{$ctx->{steps}}) ? $ctx->{steps}->[0] : $ctx->{re_clean}; |
|
|
47
|
|
|
|
|
164
|
|
|
515
|
47
|
|
|
|
|
111
|
my $remaining_re = substr($tail_re, length($match_reg)); |
|
516
|
|
|
|
|
|
|
|
|
517
|
47
|
50
|
|
|
|
95
|
if ($remaining_re ne Tstregex::C_EMPTY) |
|
518
|
|
|
|
|
|
|
{ |
|
519
|
|
|
|
|
|
|
# get the first token for analyse |
|
520
|
47
|
|
|
|
|
110
|
my $next_tokens = _get_lex_groups($remaining_re); |
|
521
|
47
|
|
50
|
|
|
122
|
my $first_token = $next_tokens->[0] // Tstregex::C_EMPTY; |
|
522
|
47
|
|
|
|
|
75
|
$ctx->{fail_token} = $remaining_re; |
|
523
|
47
|
100
|
|
|
|
170
|
$ctx->{fail_token} = $first_token if ($first_token =~ /^(\\b|\^|\$)$/); # Anchor case (0 width): want detail (just \b, ^ or $) |
|
524
|
|
|
|
|
|
|
} |
|
525
|
|
|
|
|
|
|
else |
|
526
|
|
|
|
|
|
|
{ |
|
527
|
0
|
|
|
|
|
0
|
$ctx->{fail_token} = Tstregex::C_EMPTY; |
|
528
|
|
|
|
|
|
|
} |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# Ensure captures is empty/undef on failure |
|
531
|
47
|
|
|
|
|
89
|
$ctx->{captures} = []; |
|
532
|
47
|
50
|
|
|
|
180
|
return $ctx->{match_undef}? undef:0; |
|
533
|
|
|
|
|
|
|
} |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
# Context initialization and RE peeling |
|
536
|
|
|
|
|
|
|
sub tstregex_init_desc |
|
537
|
|
|
|
|
|
|
{ |
|
538
|
115
|
|
|
115
|
1
|
2826974
|
my ($re_raw) = @_; |
|
539
|
|
|
|
|
|
|
# The Shield: Catching the 5.28 deprecation warnings and fatal errors |
|
540
|
|
|
|
|
|
|
# We use a localized __WARN__ handler to catch the "Unescaped left brace" |
|
541
|
|
|
|
|
|
|
# even if it's not a fatal error yet in 5.28 |
|
542
|
|
|
|
|
|
|
|
|
543
|
115
|
|
|
|
|
326
|
my ($re_compiled, $re_clean, $prefix_off, $last_warning) = _unwrap_regex($re_raw); |
|
544
|
115
|
50
|
|
|
|
299
|
my $match_undef = $re_compiled eq RE_EMPTY ? 1:0; |
|
545
|
|
|
|
|
|
|
|
|
546
|
115
|
|
|
|
|
1015
|
my $steps = _parse_lex_groups($re_clean); |
|
547
|
|
|
|
|
|
|
# { |
|
548
|
|
|
|
|
|
|
# no warnings 'experimental::re_strict'; |
|
549
|
|
|
|
|
|
|
# use re 'strict'; |
|
550
|
|
|
|
|
|
|
# $steps = _parse_lex_groups($re_clean); |
|
551
|
|
|
|
|
|
|
# }; |
|
552
|
|
|
|
|
|
|
return |
|
553
|
|
|
|
|
|
|
{ |
|
554
|
115
|
|
|
|
|
1190
|
re_raw => $re_raw, re_compiled => $re_compiled, re_clean => $re_clean, |
|
555
|
|
|
|
|
|
|
steps => $steps, prefix_offset => $prefix_off, match_len => 0, |
|
556
|
|
|
|
|
|
|
fail_token => Tstregex::C_EMPTY, full_match => 0,match_portion => undef, |
|
557
|
|
|
|
|
|
|
match_undef => $match_undef, left_unmatch => undef, right_unmatch => undef, |
|
558
|
|
|
|
|
|
|
warning => $last_warning, |
|
559
|
|
|
|
|
|
|
}; |
|
560
|
|
|
|
|
|
|
} |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# Helper: get char code at position |
|
563
|
2
|
|
|
2
|
|
6
|
sub _stringat($$) { return vec($_[0], $_[1], 8); } |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
sub _unwrap_regex |
|
566
|
|
|
|
|
|
|
{ |
|
567
|
115
|
|
|
115
|
|
212
|
my ($raw) = @_; |
|
568
|
115
|
50
|
33
|
|
|
622
|
return (qr//, Tstregex::C_EMPTY, 0) if !defined $raw || $raw eq Tstregex::C_EMPTY; |
|
569
|
|
|
|
|
|
|
|
|
570
|
115
|
|
|
|
|
172
|
my $raw_org = $raw; |
|
571
|
115
|
|
|
|
|
185
|
my $options = Tstregex::C_EMPTY; |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# 1. Extract trailing options (ismxg) |
|
574
|
115
|
|
|
|
|
506
|
while ($raw =~ s/([ismxg])$//) |
|
575
|
|
|
|
|
|
|
{ |
|
576
|
0
|
|
|
|
|
0
|
$options = $1 . $options; |
|
577
|
|
|
|
|
|
|
} |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
# 2. Delegate peeling to _strop |
|
580
|
115
|
|
|
|
|
250
|
my $clean = _strop($raw); |
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# 3. Automatic offset calculation (locate the "juice" within the original string) |
|
583
|
115
|
|
|
|
|
265
|
my $off = index($raw_org, $clean); |
|
584
|
115
|
50
|
|
|
|
230
|
$off = 0 if $off < 0; |
|
585
|
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
# 4. Secure Forge (Remove 'g' as it is irrelevant for qr//) |
|
587
|
115
|
|
|
|
|
186
|
$options =~ tr/g//d; |
|
588
|
115
|
50
|
|
|
|
270
|
my $re_str = $options ? "(?$options)$clean" : $clean; |
|
589
|
|
|
|
|
|
|
|
|
590
|
115
|
|
|
|
|
224
|
my ($re_ret, $warn) = _safe_qr($re_str); |
|
591
|
|
|
|
|
|
|
|
|
592
|
115
|
|
|
|
|
391
|
return ($re_ret, $clean, $off, $warn); |
|
593
|
|
|
|
|
|
|
} |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
sub _safe_qr |
|
596
|
|
|
|
|
|
|
{ |
|
597
|
162
|
|
|
162
|
|
272
|
my ($re_str) = @_; |
|
598
|
162
|
|
|
|
|
203
|
my ($re, $err); |
|
599
|
|
|
|
|
|
|
{ |
|
600
|
162
|
|
|
|
|
183
|
local $@; |
|
|
162
|
|
|
|
|
199
|
|
|
601
|
162
|
|
|
1
|
|
1392
|
local $SIG{__DIE__} = local $SIG{__WARN__} = sub { }; |
|
602
|
|
|
|
|
|
|
# dont catch the warning there, let the eval fail instead and get the message back in $@ |
|
603
|
162
|
|
|
|
|
278
|
$re = eval { qr/$re_str/ }; |
|
|
162
|
|
|
|
|
3031
|
|
|
604
|
162
|
|
50
|
|
|
1112
|
$err = $@ // ''; |
|
605
|
|
|
|
|
|
|
} |
|
606
|
162
|
|
50
|
|
|
592
|
return ($re // RE_EMPTY, $err); |
|
607
|
|
|
|
|
|
|
} |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# _strop: strip operators |
|
610
|
|
|
|
|
|
|
# Peels Perl operators (m!!, m{}, //) by checking extremities. |
|
611
|
|
|
|
|
|
|
# It ensures only the core regex juice is returned. |
|
612
|
|
|
|
|
|
|
sub _strop |
|
613
|
|
|
|
|
|
|
{ |
|
614
|
115
|
|
|
115
|
|
182
|
my ($raw) = @_; |
|
615
|
115
|
50
|
33
|
|
|
401
|
return $raw if !defined $raw || $raw eq Tstregex::C_EMPTY; |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# Remove leading/trailing whitespace |
|
618
|
115
|
|
|
|
|
498
|
$raw =~ s/^\s+|\s+$//g; |
|
619
|
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
# Identify opening delimiter (after an optional 'm') |
|
621
|
115
|
100
|
|
|
|
640
|
if ($raw =~ /^((?:m\s*)?)([^\w\s])(.*)$/s) |
|
622
|
|
|
|
|
|
|
{ |
|
623
|
50
|
|
|
|
|
186
|
my $prefix = $1; |
|
624
|
50
|
|
|
|
|
153
|
my $open = $2; |
|
625
|
50
|
|
|
|
|
92
|
my $body = $3; |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# Map symmetric pairs using ASCII constants |
|
628
|
50
|
|
|
|
|
259
|
my %sym_or_eq = |
|
629
|
|
|
|
|
|
|
( |
|
630
|
|
|
|
|
|
|
chr(ASCII_LBRACE) => chr(ASCII_RBRACE), |
|
631
|
|
|
|
|
|
|
chr(ASCII_LBRACK) => chr(ASCII_RBRACK), |
|
632
|
|
|
|
|
|
|
chr(ASCII_LPAREN) => chr(ASCII_RPAREN), |
|
633
|
|
|
|
|
|
|
chr(ASCII_LT) => chr(ASCII_GT), |
|
634
|
|
|
|
|
|
|
); |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
# Expected close is either the matching pair or the same character (e.g., m!!) |
|
637
|
50
|
|
66
|
|
|
164
|
my $expected_close = $sym_or_eq{$open} || $open; |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
# PROTECT CAPTURE GROUPS: |
|
640
|
|
|
|
|
|
|
# If the delimiter is a parenthesis but there is no 'm' prefix, |
|
641
|
|
|
|
|
|
|
# it is a capturing group, NOT an operator. Do not peel! |
|
642
|
50
|
100
|
66
|
|
|
186
|
if ($open eq chr(ASCII_LPAREN) && !$prefix) |
|
643
|
|
|
|
|
|
|
{ |
|
644
|
26
|
|
|
|
|
89
|
return $raw; |
|
645
|
|
|
|
|
|
|
} |
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
# Check if the very last character matches our expected closing delimiter |
|
648
|
24
|
100
|
|
|
|
68
|
if (substr($body, -1) eq $expected_close) |
|
649
|
|
|
|
|
|
|
{ |
|
650
|
1
|
|
|
|
|
6
|
return substr($body, 0, -1); |
|
651
|
|
|
|
|
|
|
} |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
# If 'm' was present but closing failed, return body (best effort) |
|
654
|
23
|
50
|
|
|
|
97
|
return $body if $prefix; |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
|
|
657
|
88
|
|
|
|
|
179
|
return $raw; |
|
658
|
|
|
|
|
|
|
} |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
sub _parse_lex_groups |
|
661
|
|
|
|
|
|
|
{ |
|
662
|
115
|
|
|
115
|
|
216
|
my ($regex) = @_; |
|
663
|
115
|
|
|
|
|
257
|
my $tokens = _get_lex_groups($regex); |
|
664
|
115
|
|
|
|
|
153
|
my @results; |
|
665
|
115
|
|
|
|
|
310
|
my $current = join(Tstregex::C_EMPTY, @$tokens); |
|
666
|
|
|
|
|
|
|
|
|
667
|
115
|
|
|
|
|
235
|
while (@$tokens) |
|
668
|
|
|
|
|
|
|
{ |
|
669
|
412
|
|
|
|
|
928
|
my $opens = () = $current =~ /(?
|
|
670
|
412
|
|
|
|
|
694
|
my $closes = () = $current =~ /(?
|
|
671
|
412
|
50
|
|
|
|
674
|
if ($opens >= $closes) |
|
672
|
|
|
|
|
|
|
{ |
|
673
|
412
|
|
|
|
|
704
|
my $v = $current . (')' x ($opens - $closes)); |
|
674
|
412
|
|
|
|
|
691
|
$v =~ s/(?
|
|
675
|
412
|
100
|
|
|
|
490
|
if (eval { qr/$v/ }) { push @results, $v; } |
|
|
412
|
|
|
|
|
5347
|
|
|
|
409
|
|
|
|
|
776
|
|
|
676
|
|
|
|
|
|
|
} |
|
677
|
412
|
|
|
|
|
825
|
my $last = pop @$tokens; |
|
678
|
412
|
50
|
|
|
|
1177
|
substr($current, -length($last)) = Tstregex::C_EMPTY if defined $last; |
|
679
|
|
|
|
|
|
|
} |
|
680
|
115
|
|
|
|
|
271
|
return \@results; |
|
681
|
|
|
|
|
|
|
} |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
# Lexical tokenizer for Perl Regex |
|
684
|
|
|
|
|
|
|
sub _get_lex_groups |
|
685
|
|
|
|
|
|
|
{ |
|
686
|
162
|
|
|
162
|
|
257
|
my ($regex) = @_; |
|
687
|
162
|
|
|
|
|
248
|
my @groups; |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
# --- START: class mismatch support (Added [.*?] to tokenizer) --- |
|
690
|
|
|
|
|
|
|
# my $re = qr/(\(\?\#.*?\))|(\(\?[:=!<>]+)|(\{\d+,?\d*\})|(\[.*?\])|(\\.)|([\(\)\|^\$\+\*\?])|(.)/x; |
|
691
|
|
|
|
|
|
|
# --- ENHANCED: Atomic Lookaround & Recursive Group Support --- |
|
692
|
|
|
|
|
|
|
# Group 1: Comments (?#...) |
|
693
|
|
|
|
|
|
|
# Group 2: Assertions and Groups (?=, (?:, (?<, etc. including nested parens |
|
694
|
|
|
|
|
|
|
# Group 3: Quantifiers {n,m} |
|
695
|
|
|
|
|
|
|
# Group 4: Character classes [...] |
|
696
|
|
|
|
|
|
|
# Group 5: Escaped characters \. |
|
697
|
|
|
|
|
|
|
# Group 6: Metacharacters ( ) | ^ $ + * ? |
|
698
|
|
|
|
|
|
|
# Group 7: Any other character |
|
699
|
|
|
|
|
|
|
# --- END: class mismatch support --- |
|
700
|
162
|
|
|
|
|
386
|
my $re = qr/ |
|
701
|
|
|
|
|
|
|
(\(\?\#.*?\)) |
|
702
|
|
|
|
|
|
|
| ( # START GROUP 2 |
|
703
|
|
|
|
|
|
|
\(\?[:=!<>]+ # Assertion header |
|
704
|
|
|
|
|
|
|
(?: # Content |
|
705
|
|
|
|
|
|
|
(?> [^()]+ ) # Non-paren characters (atomic) |
|
706
|
|
|
|
|
|
|
| |
|
707
|
|
|
|
|
|
|
(?2) # Recursive call to Group 2 |
|
708
|
|
|
|
|
|
|
)* |
|
709
|
|
|
|
|
|
|
\) # Matching closing paren |
|
710
|
|
|
|
|
|
|
) # END GROUP 2 |
|
711
|
|
|
|
|
|
|
| (\{\d+,?\d*\}) |
|
712
|
|
|
|
|
|
|
| (\[.*?\]) |
|
713
|
|
|
|
|
|
|
| (\\.) |
|
714
|
|
|
|
|
|
|
| ([\(\)\|^\$\+\*\?]) |
|
715
|
|
|
|
|
|
|
| (.) |
|
716
|
|
|
|
|
|
|
/x; |
|
717
|
|
|
|
|
|
|
|
|
718
|
162
|
|
|
|
|
1164
|
while ($regex =~ /$re/g) |
|
719
|
|
|
|
|
|
|
{ |
|
720
|
561
|
|
100
|
|
|
4274
|
my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7; |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
721
|
561
|
100
|
66
|
|
|
2478
|
if (defined $t && $t =~ /^[\+\*\?]$|^\{\d/ && @groups && $groups[-1] !~ /^[\(\)\|]$/) |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
722
|
|
|
|
|
|
|
{ |
|
723
|
71
|
|
|
|
|
297
|
$groups[-1] .= $t; |
|
724
|
|
|
|
|
|
|
} |
|
725
|
490
|
|
|
|
|
2271
|
else { push @groups, $t; } |
|
726
|
|
|
|
|
|
|
} |
|
727
|
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
# TODO: test that optimized code fragment instead; Much time spent here, but sensitive part.. |
|
729
|
|
|
|
|
|
|
# my $quantifiers = '+*?{'; |
|
730
|
|
|
|
|
|
|
# while ($regex =~ /$re/g) |
|
731
|
|
|
|
|
|
|
# { |
|
732
|
|
|
|
|
|
|
# my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7; |
|
733
|
|
|
|
|
|
|
# if (defined $t && @groups) |
|
734
|
|
|
|
|
|
|
# { |
|
735
|
|
|
|
|
|
|
# my $char = substr($t, 0, 1); |
|
736
|
|
|
|
|
|
|
# if (index($quantifiers, $char) != -1 && $groups[-1] !~ /^[\(\)\|]$/) |
|
737
|
|
|
|
|
|
|
# { |
|
738
|
|
|
|
|
|
|
# $groups[-1] .= $t; |
|
739
|
|
|
|
|
|
|
# next; |
|
740
|
|
|
|
|
|
|
# } |
|
741
|
|
|
|
|
|
|
# } |
|
742
|
|
|
|
|
|
|
# push @groups, $t; |
|
743
|
|
|
|
|
|
|
# } |
|
744
|
|
|
|
|
|
|
|
|
745
|
162
|
|
|
|
|
476
|
return \@groups; |
|
746
|
|
|
|
|
|
|
} |
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
} |
|
749
|
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
1; |
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
package main; |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
__DATA__ |