File Coverage

blib/lib/Tstregex.pm
Criterion Covered Total %
statement 168 298 56.3
branch 37 102 36.2
condition 36 80 45.0
subroutine 28 40 70.0
pod 9 11 81.8
total 278 531 52.3


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2              
3             ###############################################
4             # Author: Olivier Delouya - 2026
5             # File: Tstregex.pm (Hybrid Modulino)
6             # Content: Regex longest match diagnostic
7             # indent: Whitesmith (perltidy -bl -bli)
8             ###############################################
9              
10 15     15   1560875 use strict;
  15         21  
  15         472  
11 15     15   53 use warnings;
  15         67  
  15         1140  
12              
13             =head1 NAME
14              
15             Tstregex - A Diagnostic Tool that quickly shows the longest Regular Expression string match, highlighting the rejected part. The terminal command
16             tstregex '/^[a-z]*\d{3}$/' 'abc12a' shows: abcE<#x1D5FE>E<#x1D5FF>E<#x1D5EE> (^[a-z]*\E<#x1D5ED>{E<#x1D5FC>}{E<#x1D5FD>}$)
17              
18             # Above, the normal parts are the longuest matching substring when bold parts highlights the rejected substring
19             #(idem with regexp lexical groups between parenthesis)
20             # A Hybrid Regex Diagnostic Tool (single file Library module with API and command tool)
21              
22             =cut
23            
24             =head1 SYNOPSIS
25              
26             C<$tstregex 'regex' string1 string2 ... stringN>
27              
28             =head1 OPTIONS (CLI)
29              
30             =head2 -h --help
31              
32             show that help..
33              
34             =head2 -v --verbose
35              
36             shows key info on (un)matching..
37              
38             =head2 -d --diag
39              
40             =over
41              
42             =item Triggers the Enriched Diagnostic View. It displays:
43              
44             =item - The string with the failing part highlighted.
45              
46             =item - The exact token in the regex that caused the break.
47              
48             =item - A visual pointer (C<^--- HERE>) aligned with the regex syntax.
49              
50             =item - Execution time (useful for spotting ReDoS/Exponential backtracking).
51              
52             =back
53              
54             =head2 -a --assert
55              
56             Misc: performs a huge test suite various a large collection of regexp tests with Tstregex..
57              
58             =head1 Perl Module SYNOPSIS
59              
60             use Tstregex;
61             my $ctx = tstregex_init_desc('/^\d{3}/');
62             tstregex($ctx, '12a');
63             if (!tstregex_is_full_match($res))
64             {
65             my $token = tstregex_get_fail_token($res);
66             my $pos = tstregex_get_match_len($res);
67             print "Failure on token '$token' at column $pos\n";
68             }
69             Note that if your purpose is just to display the result as the command would do, you can call directly
70             the main with the appropriate (argc, argv) style parameters to avoid to spawn another perl process.
71             See the tstregex file stub for details..
72              
73             =head1 API
74              
75             =head2 tstregex_init_desc($raw_re)
76              
77             Pre-parses the regex, handles delimiters (m!!, //, etc.), extracts modifiers (i, s, m, x), and prepares the nibbling steps. Returns a context hash.
78              
79             =head2 tstregex($ctx, $string)
80              
81             Executes the diagnostic. Updates the context.
82              
83             =head2 tstregex_is_full_match
84              
85             Returns match status of input string (BOOL 0 OR 1)
86              
87             =head2 tstregex_get_match_portion
88              
89             Returns the matching portion in case of full match
90             (might be smaller than input string, depending on anchors..)
91              
92             =head2 tstregex_get_match_len
93              
94             Returns the matching substring length
95              
96             =head2 tstregex_get_fail_token
97              
98             Returns the failing token in the regexp
99              
100             =head2 tstregex_get_re_clean
101              
102             Returns the matching regexp subpart
103              
104             =head2 tstregex_get_re_raw
105              
106             Returns the internal representation of the regexp
107              
108             =head2 tstregex_get_prefix_offset
109              
110             Returns the offset of the original regexp in the raw regexp
111              
112             =head1 DESCRIPTION
113              
114             =over
115              
116             =item B is designed to solve the "Black Box" problem of Regular Expressions.
117              
118             =item When a complex regex fails, Perl usually just says "No Match".
119              
120             =item This tool identifies exactly B and B it failed by finding the longest possible partial match.
121              
122             =back
123              
124             =head1 EXAMPLE
125              
126             =over
127              
128             =item $ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a'
129              
130             =item abc123
131              
132             =item abcB<12a> (^[a-z]*B<\d{3}$>)
133              
134             =back
135              
136             I
137              
138             =head2 The "Nibbling" Engine
139              
140             The diagnostic logic uses a "Nibbling" (grignotage) strategy:
141              
142             =over 4
143              
144             =item 1. Decomposition
145              
146             The engine breaks down your regex into a hierarchy of valid sub-patterns (lexical groups, atoms, and quantifiers) from longest to shortest.
147              
148             =item 2. Longest Match Search
149              
150             =back
151              
152             =over 4
153              
154             It iteratively tests these sub-patterns against the input string.
155              
156             It's not just checking if the start matches, but what is the I sequence of instructions the engine could follow before hitting a wall.
157              
158             =back
159              
160             =over 4
161              
162             =item 3. Failure Point Identification
163              
164             Once the longest matching sub-pattern is found, the tool identifies the very next token in your regex syntax. This is your "Point of Failure".
165              
166             =back
167              
168             =head1 AUTHOR
169              
170             Olivier Delouya - 2026
171              
172             =head1 LICENSE
173              
174             Artistic Version 2
175              
176             =head1 SEE ALSO
177              
178             This tool belongs to the family of Perl regex development utilities. It is
179             intended for developers looking for a lightweight perl regex debugger,
180             providing automated regex diagnostics when a regular expression fails
181             or requires deep troubleshooting in the CLI terminal.
182              
183             =head1 DESCRIPTION
184              
185             Tstregex is a CLI tool designed for Perl regex debugging and diagnostics.
186             If you need to debug a Perl regular expression, find where a regex fails,
187             or pinpoint the exact character that broke your pattern, this module
188             automates the process.
189              
190             =head1 KEYWORDS
191              
192             perl regex debugger, debug regular expression perl, regex failure pinpoint,
193             match longest regex string, regular expression diagnostic tool,
194             perl regex troubleshooting.
195              
196             =cut
197              
198              
199             package main;
200             {
201 15     15   77 use strict;
  15         34  
  15         253  
202 15     15   50 use warnings;
  15         66  
  15         645  
203 15     15   67 use Carp qw(confess);
  15         19  
  15         1030  
204             $SIG{__WARN__} = 'confess';
205             $SIG{__DIE__ } = 'confess';
206              
207 15     15   7609 use IO::Handle;
  15         84945  
  15         740  
208 15     15   8467 use Term::ANSIColor qw(:constants);
  15         125381  
  15         14166  
209 15     15   149 use Time::HiRes qw(gettimeofday tv_interval);
  15         45  
  15         120  
210 15     15   6987 use utf8;
  15         3186  
  15         121  
211              
212             # --- Constants & ASCII Codes ---
213             use constant
214             {
215 15         4554 C_m => 109, C_g => 103, C_i => 105, C_s => 115,
216             C_x => 120, C_SLASH => 47, C_SPACE => 32, C_ZERO => 48,
217             C_NINE => 57, C_UP_A => 65, C_UP_Z => 90, C_LOW_A => 97,
218             C_LOW_Z => 122, C_UNDSC => 95,
219             OP_PAR => 40, CL_PAR => 41, OP_BRK => 91, CL_BRK => 93,
220             OP_BRC => 123, CL_BRC => 125, OP_ANG => 60, CL_ANG => 62,
221             ESC => "\e",
222             CUU => "\e[A", # Cursor Up
223             UI_OFFSET=> 11, # Alignment offset for " Syntax: "
224 15     15   1149 };
  15         21  
225              
226             # --- ENCAPSULATED DEBUG ALIAS ---
227             BEGIN
228             {
229 15 50   15   22349 if ($INC{'perl5db.pl'})
230             {
231 0         0 require Data::Dumper;
232 0         0 require Term::ANSIColor;
233 15     15   87 no strict 'refs';
  15         21  
  15         486  
234 15     15   55 no warnings 'once';
  15         19  
  15         3909  
235              
236             my $debug_sub = sub
237             {
238 0         0 my @args = @_;
239              
240             # Automatically detect flattened hashes:
241             # If even number of arguments and the first one isn't a reference
242 0 0 0     0 if (scalar @args > 1 && scalar @args % 2 == 0 && !ref($args[0]))
      0        
243             {
244             # Wrap the flattened list into a temporary hashref
245 0         0 my %tmp_hash = @args;
246 0         0 @args = (\%tmp_hash);
247             }
248              
249 0         0 print "\n", Term::ANSIColor::BOLD(), Term::ANSIColor::BLUE(),
250             'DEBUG (tstregex): ', Term::ANSIColor::RESET(),
251             Data::Dumper::Dumper(@args);
252 0         0 };
253              
254             # Force injection into all relevant namespaces
255 0         0 foreach my $pkg ('main', 'Tstregex', 'DB')
256             {
257 0         0 *{"${pkg}::d"} = $debug_sub;
  0         0  
258             }
259              
260 0 0       0 my $cuu = defined &main::CUU ? main::CUU() : "\e[A";
261 0         0 print $cuu, Term::ANSIColor::BOLD(), Term::ANSIColor::CYAN(),
262             'INFO: ', Term::ANSIColor::RESET(),
263             "Alias 'd' ready (Auto-hash detection enabled)\n\n";
264             }
265             }
266            
267             exit(main(scalar(@ARGV), \@ARGV)) if(!caller);
268              
269             sub main
270             {
271 0     0   0 my ($argc, $argv) = @_;
272 0 0 0     0 if(!$argc || ($argc && $$argv[0] =~ /^-h|--help$/)) { help(); exit(0); }
  0   0     0  
  0         0  
273              
274 0         0 binmode STDOUT, ':utf8';
275 0         0 my ($mode_diag, $verbose, $assert) = (0)x2;
276 0         0 for(my $i=0; $i<$argc; $i++)
277             {
278 0 0 0     0 do { $mode_diag = 1; undef $$argv[$i]; next } if (!$mode_diag && $$argv[$i] =~ /^-d|--diag$/);
  0         0  
  0         0  
  0         0  
279 0 0 0     0 do { $verbose = 1; undef $$argv[$i]; next } if (!$verbose && $$argv[$i] =~ /^-v|--verbose$/);
  0         0  
  0         0  
  0         0  
280 0 0 0     0 do { $assert = 1; undef $$argv[$i]; next } if (!$assert && $$argv[$i] =~ /^-a|--assert$/);
  0         0  
  0         0  
  0         0  
281             }
282            
283 0 0       0 if ($assert)
284             {
285 0         0 print BOLD, BLUE, "--- Internal Test Suite (DATA Section) ---\n", RESET;
286 0         0 _run_internal_tests($mode_diag, $verbose);
287 0         0 exit(0);
288             }
289              
290 0         0 my @new_argv;
291 0         0 foreach(@$argv)
292             {
293 0 0       0 push @new_argv, $_ if(defined($_));
294             }
295 0         0 $argv = \@new_argv;
296 0         0 $argc = scalar @$argv;
297            
298 0         0 my $re_raw = shift @{$argv};
  0         0  
299 0         0 my $ctx = Tstregex::tstregex_init_desc($re_raw);
300 0         0 my $global_result = 0; # success! BE POSITIVE !!
301 0         0 foreach my $pattern (@{$argv})
  0         0  
302             {
303 0 0       0 my $t0 = [gettimeofday] if $mode_diag;
304 0         0 my $result = Tstregex::tstregex($ctx, $pattern);
305 0 0       0 $global_result = 1 if($result);
306 0 0       0 $mode_diag ? _display_enriched($pattern, $ctx, tv_interval($t0))
307             : _display_standard($pattern, $ctx);
308 0 0       0 if($verbose)
309             {
310 0 0       0 print $result? 'Match':'UNmatch', '! Match length: ', Tstregex::tstregex_get_match_len($ctx), '; ';
311 0 0       0 print $result? ('Match portion: ', Term::ANSIColor::UNDERLINE(), Tstregex::tstregex_get_match_portion($ctx))
312             : ('Fail token: ', Tstregex::tstregex_get_fail_token($ctx));
313 0         0 print Term::ANSIColor::RESET(), "\n";
314 0 0 0     0 print $ctx->{warning} if($verbose && $ctx->{warning} ne '');
315             }
316             }
317 0         0 return $global_result;
318             }
319            
320             sub help
321             {
322 0     0   0 print BOLD, WHITE, "Tstregex.pm - Longest match Regular Expression Diagnostic Tool (2026 - PerlOD)\n", RESET;
323 0         0 print "Usage:\n";
324 0         0 print " perl Tstregex.pm [options] 'regex' 'string1' ['string2' ...]\n\n";
325 0         0 print "Examples:\n";
326 0         0 print " perl Tstregex.pm '([0-3][0-9])/[0-1][0-9]/\\d{4}' '21/72/1985'\n";
327 0         0 print ' 21/', BOLD, '72/1985', RESET, ' ([0-3][0-9]/', BOLD, '[0-1][0-9]/\d{4}', RESET, ")\n\n";
328 0         0 print BOLD, 'DELIMITERS ', RESET, "are optional\n";
329 0         0 print " Supported: /.../, m!...!, m{...}. Modifiers (/i, /x, /s...) and captures are supported.\n\n";
330 0         0 print "Options:\n";
331 0         0 print "-h --help Shows that help\n";
332 0         0 print "-v --verbose Shows keys info on match/unmatch\n";
333 0         0 print "-d --diag Enriched diagnostic with timing and syntax pointers\n";
334 0         0 print "-a --assert Misc: shows a large test of regexp against tstregex..\n";
335             }
336              
337             sub _run_internal_tests
338             {
339 0     0   0 my ($mode_diag, $verbose) = @_;
340 0         0 my $fh = \*main::DATA;
341 0         0 seek($fh, 0, 0);
342            
343 0         0 my $found_data_token = 0;
344            
345 0         0 while (<$fh>)
346             {
347 0         0 chomp;
348             # PHASE 1: Skip everything until we hit the __DATA__ or __END__ marker
349             # This prevents the script from parsing its own source code
350 0 0       0 if (!$found_data_token)
351             {
352 0 0       0 $found_data_token = 1 if /^__(DATA|END)__/;
353 0         0 next;
354             }
355             else
356             {
357 0 0       0 last if /^__(DATA|END)__/;
358             }
359 0 0 0     0 next if /^\s*$/ || /^#/;
360 0         0 my ($re, @rest) = split(/\s+|:::\s*/, $_);
361 0 0       0 next unless $re;
362 0 0       0 my @strings = grep { $_ ne '0' && $_ ne '1' } @rest;
  0         0  
363              
364 0         0 print BOLD, YELLOW, 'Testing Regex: ', RESET, "$re\n";
365 0         0 my $ctx = Tstregex::tstregex_init_desc($re);
366 0 0 0     0 print 'Warning ', $ctx->{warning}, "\n" if($verbose && $ctx->{warning} ne '');
367 0         0 foreach my $s (@strings)
368             {
369 0         0 my $t0;
370 0 0       0 $t0 = [gettimeofday] if $mode_diag;
371 0         0 Tstregex::tstregex($ctx, $s);
372 0 0       0 $mode_diag ? _display_enriched($s, $ctx, tv_interval($t0))
373             : _display_standard($s, $ctx);
374             }
375 0         0 print '-' x 40, "\n";
376             }
377             }
378            
379             sub _get_re_val
380             {
381             return (
382 0     0   0 Tstregex::tstregex_get_match_len ($_[0]),
383             Tstregex::tstregex_get_fail_token($_[0]),
384             Tstregex::tstregex_get_re_clean ($_[0])
385             );
386             }
387              
388             sub _display_standard
389             {
390 0     0   0 my ($pattern, $ctx) = @_;
391 0 0       0 if (Tstregex::tstregex_is_full_match($ctx))
392             {
393 0         0 print "$pattern\n";
394 0         0 return undef;
395             }
396 0         0 my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
397 0         0 print substr($pattern, 0, $match_len); STDOUT->flush(); print STDERR BOLD, substr($pattern, $match_len), RESET;
  0         0  
  0         0  
398 0         0 my $off = length($re_clean) - length($fail_token);
399 0         0 print STDERR ' (', substr($re_clean, 0, $off), BOLD, $fail_token, RESET, ")\n";
400 0         0 undef;
401             }
402              
403             sub _display_enriched
404             {
405 0     0   0 my ($pattern, $ctx, $elapsed) = @_;
406 0         0 print BOLD, MAGENTA, '--- Diagnostic View ---', RESET, "\n";
407              
408 0 0       0 if (Tstregex::tstregex_is_full_match($ctx))
409             {
410 0         0 print GREEN, ' Result: ', RESET, "$pattern (FULL MATCH)\n";
411 0         0 return undef;
412             }
413 0         0 my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
414 0         0 my $prefix_off = Tstregex::tstregex_get_prefix_offset($ctx);
415 0         0 my $re_raw = Tstregex::tstregex_get_re_raw ($ctx);
416              
417 0         0 print YELLOW, ' Result: ', RESET, substr($pattern, 0, $match_len),
418             BOLD, WHITE, substr($pattern, $match_len), RESET;
419 0         0 print ' (at ', CYAN, $fail_token, RESET, ")\n";
420              
421 0         0 my $err_pos_in_clean = length($re_clean) - length($fail_token);
422 0         0 my $final_pointer_pos = $prefix_off + $err_pos_in_clean;
423              
424 0         0 print ' Syntax: ', WHITE, $re_raw, RESET, "\n";
425 0         0 print ' ', ' ' x $final_pointer_pos, BOLD, RED, '^--- HERE', RESET, "\n";
426 0         0 printf " Time: %.4fs\n\n", $elapsed;
427 0         0 undef;
428             }
429              
430             }
431              
432             1;
433              
434             package Tstregex;
435             {
436             our $VERSION = '1.13';
437 15     15   107 use Exporter qw(import);
  15         19  
  15         3362  
438              
439             our @EXPORT = qw(
440             tstregex
441             tstregex_init_desc
442             tstregex_get_match_len
443             tstregex_get_fail_token
444             tstregex_is_full_match
445             tstregex_get_re_clean
446             tstregex_get_prefix_offset
447             tstregex_get_re_raw
448             tstregex_get_match_portion
449             tstregex_get_info
450             );
451              
452             # --- PUBLIC API (The Getters) ---
453              
454 115     115 1 367 sub tstregex_get_match_len { return $_[0]->{match_len}; }
455 115     115 1 367 sub tstregex_get_fail_token { return $_[0]->{fail_token}; }
456 0     0 1 0 sub tstregex_is_full_match { return $_[0]->{full_match}; }
457 0     0 1 0 sub tstregex_get_re_clean { return $_[0]->{re_clean}; }
458 0     0 1 0 sub tstregex_get_prefix_offset { return $_[0]->{prefix_offset}; }
459 0     0 1 0 sub tstregex_get_re_raw { return $_[0]->{re_raw}; }
460 115   50 115 0 427 sub tstregex_get_captures { return $_[0]->{captures} // [];}
461 0     0 1 0 sub tstregex_get_match_portion { return $_[0]->{match_portion}; }
462 0     0 0 0 sub tstregex_get_info { return $_[0]->{warning}; }
463              
464             # Main diagnostic function
465 15     15   78 use constant;
  15         20  
  15         1268  
466             use constant
467             {
468 15         30878 C_EMPTY => '',
469             RE_EMPTY => qr/\0/,
470             ASCII_LPAREN => ord('('), # 40
471             ASCII_RPAREN => ord(')'), # 41
472             ASCII_LBRACE => ord('{'), # 123
473             ASCII_RBRACE => ord('}'), # 125
474             ASCII_LBRACK => ord('['), # 91
475             ASCII_RBRACK => ord(']'), # 93
476             ASCII_LT => ord('<'), # 60
477             ASCII_GT => ord('>'), # 62
478 15     15   116 };
  15         32  
479            
480             sub tstregex
481             {
482 115     115 1 422 my ($ctx, $pattern) = @_;
483             # FIX: re init start-state fields in case of multiple test patterns
484 115         188 $ctx -> {'fail_token'} = Tstregex::C_EMPTY;
485 115         134 $ctx -> {'match_portion'} = undef;
486 115         124 $ctx -> {'right_unmatch'} = undef;
487 115         126 $ctx -> {'match_len'} = 0;
488 115         145 $ctx -> {'full_match'} = 0;
489 115         124 $ctx -> {'left_unmatch'} = undef;
490 115         133 my $re_raw = $ctx->{re_raw};
491 115         125 my $org_pat = $pattern;
492 115         120 my $internal_offset = 0;
493              
494             # Handle prefix offset if pattern is wrapped like the RE
495 115 100 66     254 if ($ctx->{prefix_offset} > 0 && length($pattern) >= $ctx->{prefix_offset} + 1)
496             {
497 1         6 my $re_delim_char = _stringat($re_raw, $ctx->{prefix_offset} - 1);
498 1         3 my $pat_first_char = _stringat($pattern, 0);
499 1 50       4 if ($pat_first_char == $re_delim_char)
500             {
501 1         3 $pattern = substr($org_pat, $ctx->{prefix_offset});
502 1         4 chop $pattern;
503 1         3 $internal_offset = $ctx->{prefix_offset};
504             }
505             }
506              
507             # Fast track: check if it matches globally first
508 115 100       526 if ($pattern =~ $ctx->{re_compiled})
509             {
510 68         91 $ctx->{full_match} = 1;
511 68         116 $ctx->{match_len} = length($org_pat);
512 68         176 $ctx->{match_portion} = $&; # the exact sub portion that matched
513 68         136 $ctx->{left_unmatch} = $`; # the left part of matching sub part
514 68         143 $ctx->{right_unmatch} = $'; # the right part of matching sub part
515              
516             # --- Capture Groups Extraction ---
517             # We populate the captures array only on a successful global match
518 68         96 my @caps;
519              
520             # The special variable $#- contains the number of capture groups
521             # We start at 1 because $0 is the whole match
522 68         217 for my $i (1 .. $#-)
523             {
524 22 100       63 if (defined $-[$i])
525             {
526             # Extracting the substring using offsets from @- and @+
527 21         90 push @caps, substr($pattern, $-[$i], $+[$i] - $-[$i]);
528             }
529             else
530             {
531             # Optional group that participated but didn't catch text
532 1         2 push @caps, undef;
533             }
534             }
535 68         113 $ctx->{captures} = \@caps;
536              
537 68         159 return 1;
538             }
539              
540             # Nibbling phase: find the longest matching lexical group
541 47         75 my $match_reg = Tstregex::C_EMPTY;
542 47         65 foreach my $step (@{$ctx->{steps}})
  47         112  
543             {
544 114 100       1016 if ($pattern =~ qr/$step/)
545             {
546 35         47 $match_reg = $step;
547 35         51 last;
548             }
549             }
550              
551             # ** SENSITIVE **
552             # Fine-tune the match length character by character
553             # Append a \z to avoid Perl to skip final \n
554             # if Nibbling failed ($match_reg empty), get the full regex for fine-tuning.
555 47 100       132 my $target_re = ($match_reg ne Tstregex::C_EMPTY) ? $match_reg : $ctx->{re_clean};
556             # critical! add starting anchor (\A) to force coherency check from the first char..
557 47         149 my ($match_work, $warn) = _safe_qr("\\A$target_re\\z"); # (qr/\A$target_re\z/, undef); #;
558 47         147 for (my $i = length($pattern); $i >= 0; $i--)
559             {
560             # check if current prefix is valid according to the target
561 117 100       317 last if ($pattern =~ $match_work);
562 83         139 chop $pattern;
563             }
564 47         96 $ctx->{match_len} = length($pattern) + $internal_offset;
565              
566             # Identify the failing token for display
567 47 50       45 my $tail_re = (scalar @{$ctx->{steps}}) ? $ctx->{steps}->[0] : $ctx->{re_clean};
  47         124  
568 47         110 my $remaining_re = substr($tail_re, length($match_reg));
569              
570 47 50       73 if ($remaining_re ne Tstregex::C_EMPTY)
571             {
572             # get the first token for analyse
573 47         75 my $next_tokens = _get_lex_groups($remaining_re);
574 47   50     95 my $first_token = $next_tokens->[0] // Tstregex::C_EMPTY;
575 47         72 $ctx->{fail_token} = $remaining_re;
576 47 100       152 $ctx->{fail_token} = $first_token if ($first_token =~ /^(\\b|\^|\$)$/); # Anchor case (0 width): want detail (just \b, ^ or $)
577             }
578             else
579             {
580 0         0 $ctx->{fail_token} = Tstregex::C_EMPTY;
581             }
582              
583             # Ensure captures is empty/undef on failure
584 47         83 $ctx->{captures} = [];
585 47 50       146 return $ctx->{match_undef}? undef:0;
586             }
587              
588             # Context initialization and RE peeling
589             sub tstregex_init_desc
590             {
591 115     115 1 2231759 my ($re_raw) = @_;
592             # The Shield: Catching the 5.28 deprecation warnings and fatal errors
593             # We use a localized __WARN__ handler to catch the "Unescaped left brace"
594             # even if it's not a fatal error yet in 5.28
595            
596 115         228 my ($re_compiled, $re_clean, $prefix_off, $last_warning) = _unwrap_regex($re_raw);
597 115 50       245 my $match_undef = $re_compiled eq RE_EMPTY ? 1:0;
598            
599 115         222 my $steps = _parse_lex_groups($re_clean);
600             # {
601             # no warnings 'experimental::re_strict';
602             # use re 'strict';
603             # $steps = _parse_lex_groups($re_clean);
604             # };
605             return
606             {
607 115         943 re_raw => $re_raw, re_compiled => $re_compiled, re_clean => $re_clean,
608             steps => $steps, prefix_offset => $prefix_off, match_len => 0,
609             fail_token => Tstregex::C_EMPTY, full_match => 0,match_portion => undef,
610             match_undef => $match_undef, left_unmatch => undef, right_unmatch => undef,
611             warning => $last_warning,
612             };
613             }
614              
615             # Helper: get char code at position
616 2     2   6 sub _stringat($$) { return vec($_[0], $_[1], 8); }
617              
618             sub _unwrap_regex
619             {
620 115     115   159 my ($raw) = @_;
621 115 50 33     519 return (qr//, Tstregex::C_EMPTY, 0) if !defined $raw || $raw eq Tstregex::C_EMPTY;
622            
623 115         176 my $raw_org = $raw;
624 115         132 my $options = Tstregex::C_EMPTY;
625            
626             # 1. Extract trailing options (ismxg)
627 115         404 while ($raw =~ s/([ismxg])$//)
628             {
629 0         0 $options = $1 . $options;
630             }
631            
632             # 2. Delegate peeling to _strop
633 115         203 my $clean = _strop($raw);
634            
635             # 3. Automatic offset calculation (locate the "juice" within the original string)
636 115         211 my $off = index($raw_org, $clean);
637 115 50       262 $off = 0 if $off < 0;
638            
639             # 4. Secure Forge (Remove 'g' as it is irrelevant for qr//)
640 115         155 $options =~ tr/g//d;
641 115 50       207 my $re_str = $options ? "(?$options)$clean" : $clean;
642            
643 115         189 my ($re_ret, $warn) = _safe_qr($re_str);
644            
645 115         282 return ($re_ret, $clean, $off, $warn);
646             }
647            
648             sub _safe_qr
649             {
650 162     162   211 my ($re_str) = @_;
651 162         204 my ($re, $err);
652             {
653 162         141 local $@;
  162         163  
654 162     1   1200 local $SIG{__DIE__} = local $SIG{__WARN__} = sub { };
655             # dont catch the warning there, let the eval fail instead and get the message back in $@
656 162         273 $re = eval { qr/$re_str/ };
  162         2316  
657 162   50     829 $err = $@ // '';
658             }
659 162   50     472 return ($re // RE_EMPTY, $err);
660             }
661            
662             # _strop: strip operators
663             # Peels Perl operators (m!!, m{}, //) by checking extremities.
664             # It ensures only the core regex juice is returned.
665             sub _strop
666             {
667 115     115   151 my ($raw) = @_;
668 115 50 33     385 return $raw if !defined $raw || $raw eq Tstregex::C_EMPTY;
669            
670             # Remove leading/trailing whitespace
671 115         394 $raw =~ s/^\s+|\s+$//g;
672            
673             # Identify opening delimiter (after an optional 'm')
674 115 100       547 if ($raw =~ /^((?:m\s*)?)([^\w\s])(.*)$/s)
675             {
676 50         114 my $prefix = $1;
677 50         75 my $open = $2;
678 50         82 my $body = $3;
679            
680             # Map symmetric pairs using ASCII constants
681 50         214 my %sym_or_eq =
682             (
683             chr(ASCII_LBRACE) => chr(ASCII_RBRACE),
684             chr(ASCII_LBRACK) => chr(ASCII_RBRACK),
685             chr(ASCII_LPAREN) => chr(ASCII_RPAREN),
686             chr(ASCII_LT) => chr(ASCII_GT),
687             );
688            
689             # Expected close is either the matching pair or the same character (e.g., m!!)
690 50   66     176 my $expected_close = $sym_or_eq{$open} || $open;
691              
692             # PROTECT CAPTURE GROUPS:
693             # If the delimiter is a parenthesis but there is no 'm' prefix,
694             # it is a capturing group, NOT an operator. Do not peel!
695 50 100 66     129 if ($open eq chr(ASCII_LPAREN) && !$prefix)
696             {
697 26         71 return $raw;
698             }
699              
700             # Check if the very last character matches our expected closing delimiter
701 24 100       70 if (substr($body, -1) eq $expected_close)
702             {
703 1         5 return substr($body, 0, -1);
704             }
705            
706             # If 'm' was present but closing failed, return body (best effort)
707 23 50       85 return $body if $prefix;
708             }
709            
710 88         141 return $raw;
711             }
712            
713             sub _parse_lex_groups
714             {
715 115     115   218 my ($regex) = @_;
716 115         201 my $tokens = _get_lex_groups($regex);
717 115         119 my @results;
718 115         261 my $current = join(Tstregex::C_EMPTY, @$tokens);
719              
720 115         194 while (@$tokens)
721             {
722 412         730 my $opens = () = $current =~ /(?
723 412         536 my $closes = () = $current =~ /(?
724 412 50       538 if ($opens >= $closes)
725             {
726 412         566 my $v = $current . (')' x ($opens - $closes));
727 412         479 $v =~ s/(?
728 412 100       377 if (eval { qr/$v/ }) { push @results, $v; }
  412         4197  
  409         595  
729             }
730 412         553 my $last = pop @$tokens;
731 412 50       934 substr($current, -length($last)) = Tstregex::C_EMPTY if defined $last;
732             }
733 115         229 return \@results;
734             }
735              
736             # Lexical tokenizer for Perl Regex
737             sub _get_lex_groups
738             {
739 162     162   228 my ($regex) = @_;
740 162         176 my @groups;
741              
742             # --- START: class mismatch support (Added [.*?] to tokenizer) ---
743             # my $re = qr/(\(\?\#.*?\))|(\(\?[:=!<>]+)|(\{\d+,?\d*\})|(\[.*?\])|(\\.)|([\(\)\|^\$\+\*\?])|(.)/x;
744             # --- ENHANCED: Atomic Lookaround & Recursive Group Support ---
745             # Group 1: Comments (?#...)
746             # Group 2: Assertions and Groups (?=, (?:, (?<, etc. including nested parens
747             # Group 3: Quantifiers {n,m}
748             # Group 4: Character classes [...]
749             # Group 5: Escaped characters \.
750             # Group 6: Metacharacters ( ) | ^ $ + * ?
751             # Group 7: Any other character
752             # --- END: class mismatch support ---
753 162         285 my $re = qr/
754             (\(\?\#.*?\))
755             | ( # START GROUP 2
756             \(\?[:=!<>]+ # Assertion header
757             (?: # Content
758             (?> [^()]+ ) # Non-paren characters (atomic)
759             |
760             (?2) # Recursive call to Group 2
761             )*
762             \) # Matching closing paren
763             ) # END GROUP 2
764             | (\{\d+,?\d*\})
765             | (\[.*?\])
766             | (\\.)
767             | ([\(\)\|^\$\+\*\?])
768             | (.)
769             /x;
770              
771 162         967 while ($regex =~ /$re/g)
772             {
773 561   100     3265 my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
      100        
      100        
      100        
      100        
      66        
774 561 100 66     1903 if (defined $t && $t =~ /^[\+\*\?]$|^\{\d/ && @groups && $groups[-1] !~ /^[\(\)\|]$/)
      66        
      100        
775             {
776 71         250 $groups[-1] .= $t;
777             }
778 490         1795 else { push @groups, $t; }
779             }
780            
781             # TODO: test that optimized code fragment instead; Much time spent here, but sensitive part..
782             # my $quantifiers = '+*?{';
783             # while ($regex =~ /$re/g)
784             # {
785             # my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
786             # if (defined $t && @groups)
787             # {
788             # my $char = substr($t, 0, 1);
789             # if (index($quantifiers, $char) != -1 && $groups[-1] !~ /^[\(\)\|]$/)
790             # {
791             # $groups[-1] .= $t;
792             # next;
793             # }
794             # }
795             # push @groups, $t;
796             # }
797            
798 162         385 return \@groups;
799             }
800              
801             }
802            
803             1;
804              
805             package main;
806              
807             __DATA__