File Coverage

blib/lib/Tstregex.pm
Criterion Covered Total %
statement 168 298 56.3
branch 37 102 36.2
condition 36 80 45.0
subroutine 28 40 70.0
pod 9 11 81.8
total 278 531 52.3


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             ###############################################
4             # Author: Olivier Delouya - 2026
5             # File: Tstregex.pm (Hybrid Modulino)
6             # Content: Regex longest match diagnostic
7             # indent: Whitesmith (perltidy -bl -bli)
8             ###############################################
9              
10 15     15   1423104 use strict;
  15         27  
  15         425  
11 15     15   51 use warnings;
  15         68  
  15         1039  
12              
13             =head1 NAME
14              
15             Tstregex - A Diagnostic Tool that quickly shows the longest Regular Expression string match, highlighting the rejected part. The terminal command
16             tstregex '/^[a-z]*\d{3}$/' 'abc12a' shows: Tstregex - Diagnostic Tool ... shows: abc·B<12a>· (^[a-z]*·B<\d{3}$>·)
17              
18             # Above, the normal parts are the longuest matching substring when bold parts highlights the rejected substring
19             #(idem with regexp lexical groups between parenthesis)
20             # A Hybrid Regex Diagnostic Tool (single file Library module with API and command tool)
21              
22             =cut
23            
24             =head1 SYNOPSIS
25              
26             C<$tstregex 'regex' string1 string2 ... stringN>
27              
28             =head1 OPTIONS (CLI)
29              
30             =head2 -h --help
31              
32             show that help..
33              
34             =head2 -v --verbose
35              
36             shows key info on (un)matching..
37              
38             =head2 -d --diag
39              
40             =over
41              
42             =item Triggers the Enriched Diagnostic View. It displays:
43              
44             =item - The string with the failing part highlighted.
45              
46             =item - The exact token in the regex that caused the break.
47              
48             =item - A visual pointer (C<^--- HERE>) aligned with the regex syntax.
49              
50             =item - Execution time (useful for spotting ReDoS/Exponential backtracking).
51              
52             =back
53              
54             =head2 -a --assert
55              
56             Misc: performs a huge test suite various a large collection of regexp tests with Tstregex..
57              
58             =head1 Perl Module SYNOPSIS
59              
60             use Tstregex;
61             my $ctx = tstregex_init_desc('/^\d{3}/');
62             tstregex($ctx, '12a');
63             if (!tstregex_is_full_match($res))
64             {
65             my $token = tstregex_get_fail_token($res);
66             my $pos = tstregex_get_match_len($res);
67             print "Failure on token '$token' at column $pos\n";
68             }
69             Note that if your purpose is just to display the result as the command would do, you can call directly
70             the main with the appropriate (argc, argv) style parameters to avoid to spawn another perl process.
71             See the tstregex file stub for details..
72              
73             =head1 API
74              
75             =head2 tstregex_init_desc($raw_re)
76              
77             Pre-parses the regex, handles delimiters (m!!, //, etc.), extracts modifiers (i, s, m, x), and prepares the nibbling steps. Returns a context hash.
78              
79             =head2 tstregex($ctx, $string)
80              
81             Executes the diagnostic. Updates the context.
82              
83             =head2 tstregex_is_full_match
84              
85             Returns match status of input string (BOOL 0 OR 1)
86              
87             =head2 tstregex_get_match_portion
88              
89             Returns the matching portion in case of full match
90             (might be smaller than input string, depending on anchors..)
91              
92             =head2 tstregex_get_match_len
93              
94             Returns the matching substring length
95              
96             =head2 tstregex_get_fail_token
97              
98             Returns the failing token in the regexp
99              
100             =head2 tstregex_get_re_clean
101              
102             Returns the matching regexp subpart
103              
104             =head2 tstregex_get_re_raw
105              
106             Returns the internal representation of the regexp
107              
108             =head2 tstregex_get_prefix_offset
109              
110             Returns the offset of the original regexp in the raw regexp
111              
112             =head1 DESCRIPTION
113              
114             =over
115              
116             =item B is designed to solve the "Black Box" problem of Regular Expressions.
117              
118             =item When a complex regex fails, Perl usually just says "No Match".
119              
120             =item This tool identifies exactly B and B it failed by finding the longest possible partial match.
121              
122             =back
123              
124             =head1 EXAMPLE
125              
126             =over
127              
128             =item $ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a'
129              
130             =item abc123
131              
132             =item abcB<12a> (^[a-z]*B<\d{3}$>)
133              
134             =back
135              
136             I
137              
138             =head2 The "Nibbling" Engine
139              
140             The diagnostic logic uses a "Nibbling" (grignotage) strategy:
141              
142             =over 4
143              
144             =item 1. Decomposition
145              
146             The engine breaks down your regex into a hierarchy of valid sub-patterns (lexical groups, atoms, and quantifiers) from longest to shortest.
147              
148             =item 2. Longest Match Search
149              
150             =back
151              
152             =over 4
153              
154             It iteratively tests these sub-patterns against the input string.
155              
156             It's not just checking if the start matches, but what is the I sequence of instructions the engine could follow before hitting a wall.
157              
158             =back
159              
160             =over 4
161              
162             =item 3. Failure Point Identification
163              
164             Once the longest matching sub-pattern is found, the tool identifies the very next token in your regex syntax. This is your "Point of Failure".
165              
166             =back
167              
168             =head1 AUTHOR
169              
170             Olivier Delouya - 2026
171              
172             =head1 LICENSE
173              
174             Artistic Version 2
175              
176             =head1 SEE ALSO
177              
178             This tool belongs to the family of Perl regex development utilities. It is
179             intended for developers looking for a lightweight perl regex debugger,
180             providing automated regex diagnostics when a regular expression fails
181             or requires deep troubleshooting in the CLI terminal.
182              
183             =head1 DESCRIPTION
184              
185             Tstregex is a CLI tool designed for Perl regex debugging and diagnostics.
186             If you need to debug a Perl regular expression, find where a regex fails,
187             or pinpoint the exact character that broke your pattern, this module
188             automates the process.
189              
190             =head1 KEYWORDS
191              
192             perl regex debugger, debug regular expression perl, regex failure pinpoint,
193             match longest regex string, regular expression diagnostic tool,
194             perl regex troubleshooting.
195              
196             =cut
197              
198              
199             package main;
200             {
201 15     15   78 use strict;
  15         43  
  15         243  
202 15     15   58 use warnings;
  15         38  
  15         542  
203 15     15   62 use Carp qw(confess);
  15         18  
  15         928  
204             $SIG{__WARN__} = 'confess';
205             $SIG{__DIE__ } = 'confess';
206              
207 15     15   7209 use IO::Handle;
  15         80838  
  15         796  
208 15     15   8611 use Term::ANSIColor qw(:constants);
  15         120626  
  15         14271  
209 15     15   176 use Time::HiRes qw(gettimeofday tv_interval);
  15         39  
  15         167  
210 15     15   7074 use utf8;
  15         3050  
  15         73  
211              
212             # --- Constants & ASCII Codes ---
213             use constant
214             {
215 15         4282 C_m => 109, C_g => 103, C_i => 105, C_s => 115,
216             C_x => 120, C_SLASH => 47, C_SPACE => 32, C_ZERO => 48,
217             C_NINE => 57, C_UP_A => 65, C_UP_Z => 90, C_LOW_A => 97,
218             C_LOW_Z => 122, C_UNDSC => 95,
219             OP_PAR => 40, CL_PAR => 41, OP_BRK => 91, CL_BRK => 93,
220             OP_BRC => 123, CL_BRC => 125, OP_ANG => 60, CL_ANG => 62,
221             ESC => "\e",
222             CUU => "\e[A", # Cursor Up
223             UI_OFFSET=> 11, # Alignment offset for " Syntax: "
224 15     15   1136 };
  15         18  
225              
226             # --- ENCAPSULATED DEBUG ALIAS ---
227             BEGIN
228             {
229 15 50   15   21542 if ($INC{'perl5db.pl'})
230             {
231 0         0 require Data::Dumper;
232 0         0 require Term::ANSIColor;
233 15     15   77 no strict 'refs';
  15         18  
  15         439  
234 15     15   44 no warnings 'once';
  15         18  
  15         3808  
235              
236             my $debug_sub = sub
237             {
238 0         0 my @args = @_;
239              
240             # Automatically detect flattened hashes:
241             # If even number of arguments and the first one isn't a reference
242 0 0 0     0 if (scalar @args > 1 && scalar @args % 2 == 0 && !ref($args[0]))
      0        
243             {
244             # Wrap the flattened list into a temporary hashref
245 0         0 my %tmp_hash = @args;
246 0         0 @args = (\%tmp_hash);
247             }
248              
249 0         0 print "\n", Term::ANSIColor::BOLD(), Term::ANSIColor::BLUE(),
250             'DEBUG (tstregex): ', Term::ANSIColor::RESET(),
251             Data::Dumper::Dumper(@args);
252 0         0 };
253              
254             # Force injection into all relevant namespaces
255 0         0 foreach my $pkg ('main', 'Tstregex', 'DB')
256             {
257 0         0 *{"${pkg}::d"} = $debug_sub;
  0         0  
258             }
259              
260 0 0       0 my $cuu = defined &main::CUU ? main::CUU() : "\e[A";
261 0         0 print $cuu, Term::ANSIColor::BOLD(), Term::ANSIColor::CYAN(),
262             'INFO: ', Term::ANSIColor::RESET(),
263             "Alias 'd' ready (Auto-hash detection enabled)\n\n";
264             }
265             }
266            
267             exit(main(scalar(@ARGV), \@ARGV)) if(!caller);
268              
269             sub main
270             {
271 0     0   0 my ($argc, $argv) = @_;
272 0 0 0     0 if(!$argc || ($argc && $$argv[0] =~ /^-h|--help$/)) { help(); exit(0); }
  0   0     0  
  0         0  
273              
274 0         0 binmode STDOUT, ':utf8';
275 0         0 my ($mode_diag, $verbose, $assert) = (0)x2;
276 0         0 for(my $i=0; $i<$argc; $i++)
277             {
278 0 0 0     0 do { $mode_diag = 1; undef $$argv[$i]; next } if (!$mode_diag && $$argv[$i] =~ /^-d|--diag$/);
  0         0  
  0         0  
  0         0  
279 0 0 0     0 do { $verbose = 1; undef $$argv[$i]; next } if (!$verbose && $$argv[$i] =~ /^-v|--verbose$/);
  0         0  
  0         0  
  0         0  
280 0 0 0     0 do { $assert = 1; undef $$argv[$i]; next } if (!$assert && $$argv[$i] =~ /^-a|--assert$/);
  0         0  
  0         0  
  0         0  
281             }
282            
283 0 0       0 if ($assert)
284             {
285 0         0 print BOLD, BLUE, "--- Internal Test Suite (DATA Section) ---\n", RESET;
286 0         0 _run_internal_tests($mode_diag, $verbose);
287 0         0 exit(0);
288             }
289              
290 0         0 my @new_argv;
291 0         0 foreach(@$argv)
292             {
293 0 0       0 push @new_argv, $_ if(defined($_));
294             }
295 0         0 $argv = \@new_argv;
296 0         0 $argc = scalar @$argv;
297            
298 0         0 my $re_raw = shift @{$argv};
  0         0  
299 0         0 my $ctx = Tstregex::tstregex_init_desc($re_raw);
300 0         0 my $global_result = 0; # success! BE POSITIVE !!
301 0         0 foreach my $pattern (@{$argv})
  0         0  
302             {
303 0 0       0 my $t0 = [gettimeofday] if $mode_diag;
304 0         0 my $result = Tstregex::tstregex($ctx, $pattern);
305 0 0       0 $global_result = 1 if($result);
306 0 0       0 $mode_diag ? _display_enriched($pattern, $ctx, tv_interval($t0))
307             : _display_standard($pattern, $ctx);
308 0 0       0 if($verbose)
309             {
310 0 0       0 print $result? 'Match':'UNmatch', '! Match length: ', Tstregex::tstregex_get_match_len($ctx), '; ';
311 0 0       0 print $result? ('Match portion: ', Term::ANSIColor::UNDERLINE(), Tstregex::tstregex_get_match_portion($ctx))
312             : ('Fail token: ', Tstregex::tstregex_get_fail_token($ctx));
313 0         0 print Term::ANSIColor::RESET(), "\n";
314 0 0 0     0 print $ctx->{warning} if($verbose && $ctx->{warning} ne '');
315             }
316             }
317 0         0 return $global_result;
318             }
319            
320             sub help
321             {
322 0     0   0 print BOLD, WHITE, "Tstregex.pm - Longest match Regular Expression Diagnostic Tool (2026 - PerlOD)\n", RESET;
323 0         0 print "Usage:\n";
324 0         0 print " perl Tstregex.pm [options] 'regex' 'string1' ['string2' ...]\n\n";
325 0         0 print "Examples:\n";
326 0         0 print " perl Tstregex.pm '([0-3][0-9])/[0-1][0-9]/\\d{4}' '21/72/1985'\n";
327 0         0 print ' 21/', BOLD, '72/1985', RESET, ' ([0-3][0-9]/', BOLD, '[0-1][0-9]/\d{4}', RESET, ")\n\n";
328 0         0 print BOLD, 'DELIMITERS ', RESET, "are optional\n";
329 0         0 print " Supported: /.../, m!...!, m{...}. Modifiers (/i, /x, /s...) and captures are supported.\n\n";
330 0         0 print "Options:\n";
331 0         0 print "-h --help Shows that help\n";
332 0         0 print "-v --verbose Shows keys info on match/unmatch\n";
333 0         0 print "-d --diag Enriched diagnostic with timing and syntax pointers\n";
334 0         0 print "-a --assert Misc: shows a large test of regexp against tstregex..\n";
335             }
336              
337             sub _run_internal_tests
338             {
339 0     0   0 my ($mode_diag, $verbose) = @_;
340 0         0 my $fh = \*main::DATA;
341 0         0 seek($fh, 0, 0);
342            
343 0         0 my $found_data_token = 0;
344            
345 0         0 while (<$fh>)
346             {
347 0         0 chomp;
348             # PHASE 1: Skip everything until we hit the __DATA__ or __END__ marker
349             # This prevents the script from parsing its own source code
350 0 0       0 if (!$found_data_token)
351             {
352 0 0       0 $found_data_token = 1 if /^__(DATA|END)__/;
353 0         0 next;
354             }
355             else
356             {
357 0 0       0 last if /^__(DATA|END)__/;
358             }
359 0 0 0     0 next if /^\s*$/ || /^#/;
360 0         0 my ($re, @rest) = split(/\s+|:::\s*/, $_);
361 0 0       0 next unless $re;
362 0 0       0 my @strings = grep { $_ ne '0' && $_ ne '1' } @rest;
  0         0  
363              
364 0         0 print BOLD, YELLOW, 'Testing Regex: ', RESET, "$re\n";
365 0         0 my $ctx = Tstregex::tstregex_init_desc($re);
366 0 0 0     0 print 'Warning ', $ctx->{warning}, "\n" if($verbose && $ctx->{warning} ne '');
367 0         0 foreach my $s (@strings)
368             {
369 0         0 my $t0;
370 0 0       0 $t0 = [gettimeofday] if $mode_diag;
371 0         0 Tstregex::tstregex($ctx, $s);
372 0 0       0 $mode_diag ? _display_enriched($s, $ctx, tv_interval($t0))
373             : _display_standard($s, $ctx);
374             }
375 0         0 print '-' x 40, "\n";
376             }
377             }
378            
379             sub _get_re_val
380             {
381             return (
382 0     0   0 Tstregex::tstregex_get_match_len ($_[0]),
383             Tstregex::tstregex_get_fail_token($_[0]),
384             Tstregex::tstregex_get_re_clean ($_[0])
385             );
386             }
387              
388             sub _display_standard
389             {
390 0     0   0 my ($pattern, $ctx) = @_;
391 0 0       0 if (Tstregex::tstregex_is_full_match($ctx))
392             {
393 0         0 print "$pattern\n";
394 0         0 return undef;
395             }
396 0         0 my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
397 0         0 print substr($pattern, 0, $match_len); STDOUT->flush(); print STDERR BOLD, substr($pattern, $match_len), RESET;
  0         0  
  0         0  
398 0         0 my $off = length($re_clean) - length($fail_token);
399 0         0 print STDERR ' (', substr($re_clean, 0, $off), BOLD, $fail_token, RESET, ")\n";
400 0         0 undef;
401             }
402              
403             sub _display_enriched
404             {
405 0     0   0 my ($pattern, $ctx, $elapsed) = @_;
406 0         0 print BOLD, MAGENTA, '--- Diagnostic View ---', RESET, "\n";
407              
408 0 0       0 if (Tstregex::tstregex_is_full_match($ctx))
409             {
410 0         0 print GREEN, ' Result: ', RESET, "$pattern (FULL MATCH)\n";
411 0         0 return undef;
412             }
413 0         0 my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
414 0         0 my $prefix_off = Tstregex::tstregex_get_prefix_offset($ctx);
415 0         0 my $re_raw = Tstregex::tstregex_get_re_raw ($ctx);
416              
417 0         0 print YELLOW, ' Result: ', RESET, substr($pattern, 0, $match_len),
418             BOLD, WHITE, substr($pattern, $match_len), RESET;
419 0         0 print ' (at ', CYAN, $fail_token, RESET, ")\n";
420              
421 0         0 my $err_pos_in_clean = length($re_clean) - length($fail_token);
422 0         0 my $final_pointer_pos = $prefix_off + $err_pos_in_clean;
423              
424 0         0 print ' Syntax: ', WHITE, $re_raw, RESET, "\n";
425 0         0 print ' ', ' ' x $final_pointer_pos, BOLD, RED, '^--- HERE', RESET, "\n";
426 0         0 printf " Time: %.4fs\n\n", $elapsed;
427 0         0 undef;
428             }
429              
430             }
431              
432             1;
433              
434             package Tstregex;
435             {
436             our $VERSION = '1.14';
437 15     15   103 use Exporter qw(import);
  15         19  
  15         2920  
438              
439             our @EXPORT = qw(
440             tstregex
441             tstregex_init_desc
442             tstregex_get_match_len
443             tstregex_get_fail_token
444             tstregex_is_full_match
445             tstregex_get_re_clean
446             tstregex_get_prefix_offset
447             tstregex_get_re_raw
448             tstregex_get_match_portion
449             tstregex_get_info
450             );
451              
452             # --- PUBLIC API (The Getters) ---
453              
454 115     115 1 384 sub tstregex_get_match_len { return $_[0]->{match_len}; }
455 115     115 1 356 sub tstregex_get_fail_token { return $_[0]->{fail_token}; }
456 0     0 1 0 sub tstregex_is_full_match { return $_[0]->{full_match}; }
457 0     0 1 0 sub tstregex_get_re_clean { return $_[0]->{re_clean}; }
458 0     0 1 0 sub tstregex_get_prefix_offset { return $_[0]->{prefix_offset}; }
459 0     0 1 0 sub tstregex_get_re_raw { return $_[0]->{re_raw}; }
460 115   50 115 0 399 sub tstregex_get_captures { return $_[0]->{captures} // [];}
461 0     0 1 0 sub tstregex_get_match_portion { return $_[0]->{match_portion}; }
462 0     0 0 0 sub tstregex_get_info { return $_[0]->{warning}; }
463              
464             # Main diagnostic function
465 15     15   71 use constant;
  15         25  
  15         1174  
466             use constant
467             {
468 15         28143 C_EMPTY => '',
469             RE_EMPTY => qr/\0/,
470             ASCII_LPAREN => ord('('), # 40
471             ASCII_RPAREN => ord(')'), # 41
472             ASCII_LBRACE => ord('{'), # 123
473             ASCII_RBRACE => ord('}'), # 125
474             ASCII_LBRACK => ord('['), # 91
475             ASCII_RBRACK => ord(']'), # 93
476             ASCII_LT => ord('<'), # 60
477             ASCII_GT => ord('>'), # 62
478 15     15   75 };
  15         31  
479            
480             sub tstregex
481             {
482 115     115 1 412 my ($ctx, $pattern) = @_;
483             # FIX: re init start-state fields in case of multiple test patterns
484 115         182 $ctx -> {'fail_token'} = Tstregex::C_EMPTY;
485 115         155 $ctx -> {'match_portion'} = undef;
486 115         123 $ctx -> {'right_unmatch'} = undef;
487 115         130 $ctx -> {'match_len'} = 0;
488 115         115 $ctx -> {'full_match'} = 0;
489 115         124 $ctx -> {'left_unmatch'} = undef;
490 115         131 my $re_raw = $ctx->{re_raw};
491 115         128 my $org_pat = $pattern;
492 115         139 my $internal_offset = 0;
493              
494             # Handle prefix offset if pattern is wrapped like the RE
495 115 100 66     235 if ($ctx->{prefix_offset} > 0 && length($pattern) >= $ctx->{prefix_offset} + 1)
496             {
497 1         4 my $re_delim_char = _stringat($re_raw, $ctx->{prefix_offset} - 1);
498 1         3 my $pat_first_char = _stringat($pattern, 0);
499 1 50       2 if ($pat_first_char == $re_delim_char)
500             {
501 1         2 $pattern = substr($org_pat, $ctx->{prefix_offset});
502 1         2 chop $pattern;
503 1         2 $internal_offset = $ctx->{prefix_offset};
504             }
505             }
506              
507             # Fast track: check if it matches globally first
508 115 100       541 if ($pattern =~ $ctx->{re_compiled})
509             {
510 68         104 $ctx->{full_match} = 1;
511 68         90 $ctx->{match_len} = length($org_pat);
512 68         124 $ctx->{match_portion} = $&; # the exact sub portion that matched
513 68         110 $ctx->{left_unmatch} = $`; # the left part of matching sub part
514 68         109 $ctx->{right_unmatch} = $'; # the right part of matching sub part
515              
516             # --- Capture Groups Extraction ---
517             # We populate the captures array only on a successful global match
518 68         88 my @caps;
519              
520             # The special variable $#- contains the number of capture groups
521             # We start at 1 because $0 is the whole match
522 68         236 for my $i (1 .. $#-)
523             {
524 22 100       50 if (defined $-[$i])
525             {
526             # Extracting the substring using offsets from @- and @+
527 21         96 push @caps, substr($pattern, $-[$i], $+[$i] - $-[$i]);
528             }
529             else
530             {
531             # Optional group that participated but didn't catch text
532 1         2 push @caps, undef;
533             }
534             }
535 68         109 $ctx->{captures} = \@caps;
536              
537 68         150 return 1;
538             }
539              
540             # Nibbling phase: find the longest matching lexical group
541 47         102 my $match_reg = Tstregex::C_EMPTY;
542 47         82 foreach my $step (@{$ctx->{steps}})
  47         105  
543             {
544 114 100       1021 if ($pattern =~ qr/$step/)
545             {
546 35         48 $match_reg = $step;
547 35         52 last;
548             }
549             }
550              
551             # ** SENSITIVE **
552             # Fine-tune the match length character by character
553             # Append a \z to avoid Perl to skip final \n
554             # if Nibbling failed ($match_reg empty), get the full regex for fine-tuning.
555 47 100       131 my $target_re = ($match_reg ne Tstregex::C_EMPTY) ? $match_reg : $ctx->{re_clean};
556             # critical! add starting anchor (\A) to force coherency check from the first char..
557 47         112 my ($match_work, $warn) = _safe_qr("\\A$target_re\\z"); # (qr/\A$target_re\z/, undef); #;
558 47         143 for (my $i = length($pattern); $i >= 0; $i--)
559             {
560             # check if current prefix is valid according to the target
561 117 100       324 last if ($pattern =~ $match_work);
562 83         146 chop $pattern;
563             }
564 47         93 $ctx->{match_len} = length($pattern) + $internal_offset;
565              
566             # Identify the failing token for display
567 47 50       45 my $tail_re = (scalar @{$ctx->{steps}}) ? $ctx->{steps}->[0] : $ctx->{re_clean};
  47         146  
568 47         81 my $remaining_re = substr($tail_re, length($match_reg));
569              
570 47 50       85 if ($remaining_re ne Tstregex::C_EMPTY)
571             {
572             # get the first token for analyse
573 47         83 my $next_tokens = _get_lex_groups($remaining_re);
574 47   50     105 my $first_token = $next_tokens->[0] // Tstregex::C_EMPTY;
575 47         89 $ctx->{fail_token} = $remaining_re;
576 47 100       157 $ctx->{fail_token} = $first_token if ($first_token =~ /^(\\b|\^|\$)$/); # Anchor case (0 width): want detail (just \b, ^ or $)
577             }
578             else
579             {
580 0         0 $ctx->{fail_token} = Tstregex::C_EMPTY;
581             }
582              
583             # Ensure captures is empty/undef on failure
584 47         73 $ctx->{captures} = [];
585 47 50       151 return $ctx->{match_undef}? undef:0;
586             }
587              
588             # Context initialization and RE peeling
589             sub tstregex_init_desc
590             {
591 115     115 1 2142136 my ($re_raw) = @_;
592             # The Shield: Catching the 5.28 deprecation warnings and fatal errors
593             # We use a localized __WARN__ handler to catch the "Unescaped left brace"
594             # even if it's not a fatal error yet in 5.28
595            
596 115         243 my ($re_compiled, $re_clean, $prefix_off, $last_warning) = _unwrap_regex($re_raw);
597 115 50       259 my $match_undef = $re_compiled eq RE_EMPTY ? 1:0;
598            
599 115         188 my $steps = _parse_lex_groups($re_clean);
600             # {
601             # no warnings 'experimental::re_strict';
602             # use re 'strict';
603             # $steps = _parse_lex_groups($re_clean);
604             # };
605             return
606             {
607 115         891 re_raw => $re_raw, re_compiled => $re_compiled, re_clean => $re_clean,
608             steps => $steps, prefix_offset => $prefix_off, match_len => 0,
609             fail_token => Tstregex::C_EMPTY, full_match => 0,match_portion => undef,
610             match_undef => $match_undef, left_unmatch => undef, right_unmatch => undef,
611             warning => $last_warning,
612             };
613             }
614              
615             # Helper: get char code at position
616 2     2   3 sub _stringat($$) { return vec($_[0], $_[1], 8); }
617              
618             sub _unwrap_regex
619             {
620 115     115   189 my ($raw) = @_;
621 115 50 33     480 return (qr//, Tstregex::C_EMPTY, 0) if !defined $raw || $raw eq Tstregex::C_EMPTY;
622            
623 115         136 my $raw_org = $raw;
624 115         175 my $options = Tstregex::C_EMPTY;
625            
626             # 1. Extract trailing options (ismxg)
627 115         484 while ($raw =~ s/([ismxg])$//)
628             {
629 0         0 $options = $1 . $options;
630             }
631            
632             # 2. Delegate peeling to _strop
633 115         206 my $clean = _strop($raw);
634            
635             # 3. Automatic offset calculation (locate the "juice" within the original string)
636 115         225 my $off = index($raw_org, $clean);
637 115 50       262 $off = 0 if $off < 0;
638            
639             # 4. Secure Forge (Remove 'g' as it is irrelevant for qr//)
640 115         161 $options =~ tr/g//d;
641 115 50       222 my $re_str = $options ? "(?$options)$clean" : $clean;
642            
643 115         212 my ($re_ret, $warn) = _safe_qr($re_str);
644            
645 115         299 return ($re_ret, $clean, $off, $warn);
646             }
647            
648             sub _safe_qr
649             {
650 162     162   238 my ($re_str) = @_;
651 162         172 my ($re, $err);
652             {
653 162         173 local $@;
  162         161  
654 162     1   1211 local $SIG{__DIE__} = local $SIG{__WARN__} = sub { };
655             # dont catch the warning there, let the eval fail instead and get the message back in $@
656 162         243 $re = eval { qr/$re_str/ };
  162         2260  
657 162   50     832 $err = $@ // '';
658             }
659 162   50     481 return ($re // RE_EMPTY, $err);
660             }
661            
662             # _strop: strip operators
663             # Peels Perl operators (m!!, m{}, //) by checking extremities.
664             # It ensures only the core regex juice is returned.
665             sub _strop
666             {
667 115     115   148 my ($raw) = @_;
668 115 50 33     370 return $raw if !defined $raw || $raw eq Tstregex::C_EMPTY;
669            
670             # Remove leading/trailing whitespace
671 115         423 $raw =~ s/^\s+|\s+$//g;
672            
673             # Identify opening delimiter (after an optional 'm')
674 115 100       513 if ($raw =~ /^((?:m\s*)?)([^\w\s])(.*)$/s)
675             {
676 50         125 my $prefix = $1;
677 50         75 my $open = $2;
678 50         95 my $body = $3;
679            
680             # Map symmetric pairs using ASCII constants
681 50         220 my %sym_or_eq =
682             (
683             chr(ASCII_LBRACE) => chr(ASCII_RBRACE),
684             chr(ASCII_LBRACK) => chr(ASCII_RBRACK),
685             chr(ASCII_LPAREN) => chr(ASCII_RPAREN),
686             chr(ASCII_LT) => chr(ASCII_GT),
687             );
688            
689             # Expected close is either the matching pair or the same character (e.g., m!!)
690 50   66     165 my $expected_close = $sym_or_eq{$open} || $open;
691              
692             # PROTECT CAPTURE GROUPS:
693             # If the delimiter is a parenthesis but there is no 'm' prefix,
694             # it is a capturing group, NOT an operator. Do not peel!
695 50 100 66     164 if ($open eq chr(ASCII_LPAREN) && !$prefix)
696             {
697 26         71 return $raw;
698             }
699              
700             # Check if the very last character matches our expected closing delimiter
701 24 100       59 if (substr($body, -1) eq $expected_close)
702             {
703 1         5 return substr($body, 0, -1);
704             }
705            
706             # If 'm' was present but closing failed, return body (best effort)
707 23 50       66 return $body if $prefix;
708             }
709            
710 88         143 return $raw;
711             }
712            
713             sub _parse_lex_groups
714             {
715 115     115   206 my ($regex) = @_;
716 115         208 my $tokens = _get_lex_groups($regex);
717 115         128 my @results;
718 115         261 my $current = join(Tstregex::C_EMPTY, @$tokens);
719              
720 115         214 while (@$tokens)
721             {
722 412         710 my $opens = () = $current =~ /(?
723 412         588 my $closes = () = $current =~ /(?
724 412 50       559 if ($opens >= $closes)
725             {
726 412         584 my $v = $current . (')' x ($opens - $closes));
727 412         541 $v =~ s/(?
728 412 100       380 if (eval { qr/$v/ }) { push @results, $v; }
  412         4346  
  409         608  
729             }
730 412         648 my $last = pop @$tokens;
731 412 50       1003 substr($current, -length($last)) = Tstregex::C_EMPTY if defined $last;
732             }
733 115         218 return \@results;
734             }
735              
736             # Lexical tokenizer for Perl Regex
737             sub _get_lex_groups
738             {
739 162     162   208 my ($regex) = @_;
740 162         168 my @groups;
741              
742             # --- START: class mismatch support (Added [.*?] to tokenizer) ---
743             # my $re = qr/(\(\?\#.*?\))|(\(\?[:=!<>]+)|(\{\d+,?\d*\})|(\[.*?\])|(\\.)|([\(\)\|^\$\+\*\?])|(.)/x;
744             # --- ENHANCED: Atomic Lookaround & Recursive Group Support ---
745             # Group 1: Comments (?#...)
746             # Group 2: Assertions and Groups (?=, (?:, (?<, etc. including nested parens
747             # Group 3: Quantifiers {n,m}
748             # Group 4: Character classes [...]
749             # Group 5: Escaped characters \.
750             # Group 6: Metacharacters ( ) | ^ $ + * ?
751             # Group 7: Any other character
752             # --- END: class mismatch support ---
753 162         288 my $re = qr/
754             (\(\?\#.*?\))
755             | ( # START GROUP 2
756             \(\?[:=!<>]+ # Assertion header
757             (?: # Content
758             (?> [^()]+ ) # Non-paren characters (atomic)
759             |
760             (?2) # Recursive call to Group 2
761             )*
762             \) # Matching closing paren
763             ) # END GROUP 2
764             | (\{\d+,?\d*\})
765             | (\[.*?\])
766             | (\\.)
767             | ([\(\)\|^\$\+\*\?])
768             | (.)
769             /x;
770              
771 162         940 while ($regex =~ /$re/g)
772             {
773 561   100     3509 my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
      100        
      100        
      100        
      100        
      66        
774 561 100 66     1924 if (defined $t && $t =~ /^[\+\*\?]$|^\{\d/ && @groups && $groups[-1] !~ /^[\(\)\|]$/)
      66        
      100        
775             {
776 71         239 $groups[-1] .= $t;
777             }
778 490         1734 else { push @groups, $t; }
779             }
780            
781             # TODO: test that optimized code fragment instead; Much time spent here, but sensitive part..
782             # my $quantifiers = '+*?{';
783             # while ($regex =~ /$re/g)
784             # {
785             # my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
786             # if (defined $t && @groups)
787             # {
788             # my $char = substr($t, 0, 1);
789             # if (index($quantifiers, $char) != -1 && $groups[-1] !~ /^[\(\)\|]$/)
790             # {
791             # $groups[-1] .= $t;
792             # next;
793             # }
794             # }
795             # push @groups, $t;
796             # }
797            
798 162         404 return \@groups;
799             }
800              
801             }
802            
803             1;
804              
805             package main;
806              
807             __DATA__