File Coverage

blib/lib/Tstregex.pm

Criterion	Covered	Total	%
statement	168	298	56.3
branch	37	102	36.2
condition	36	80	45.0
subroutine	28	40	70.0
pod	9	11	81.8
total	278	531	52.3

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							###############################################
4							# Author: Olivier Delouya - 2026
5							# File: Tstregex.pm (Hybrid Modulino)
6							# Content: Regex longest match diagnostic
7							# indent: Whitesmith (perltidy -bl -bli)
8							###############################################
9
10	15			15		1423104	use strict;
	15					27
	15					425
11	15			15		51	use warnings;
	15					68
	15					1039
12
13							=head1 NAME
14
15							Tstregex - A Diagnostic Tool that quickly shows the longest Regular Expression string match, highlighting the rejected part. The terminal command
16							tstregex '/^[a-z]\d{3}$/' 'abc12a' shows: Tstregex - Diagnostic Tool ... shows: abc�B<12a>� (^[a-z]�B<\d{3}$>�)
17
18							# Above, the normal parts are the longuest matching substring when bold parts highlights the rejected substring
19							#(idem with regexp lexical groups between parenthesis)
20							# A Hybrid Regex Diagnostic Tool (single file Library module with API and command tool)
21
22							=cut
23
24							=head1 SYNOPSIS
25
26							C<$tstregex 'regex' string1 string2 ... stringN>
27
28							=head1 OPTIONS (CLI)
29
30							=head2 -h --help
31
32							show that help..
33
34							=head2 -v --verbose
35
36							shows key info on (un)matching..
37
38							=head2 -d --diag
39
40							=over
41
42							=item Triggers the Enriched Diagnostic View. It displays:
43
44							=item - The string with the failing part highlighted.
45
46							=item - The exact token in the regex that caused the break.
47
48							=item - A visual pointer (C<^--- HERE>) aligned with the regex syntax.
49
50							=item - Execution time (useful for spotting ReDoS/Exponential backtracking).
51
52							=back
53
54							=head2 -a --assert
55
56							Misc: performs a huge test suite various a large collection of regexp tests with Tstregex..
57
58							=head1 Perl Module SYNOPSIS
59
60							use Tstregex;
61							my $ctx = tstregex_init_desc('/^\d{3}/');
62							tstregex($ctx, '12a');
63							if (!tstregex_is_full_match($res))
64							{
65							my $token = tstregex_get_fail_token($res);
66							my $pos = tstregex_get_match_len($res);
67							print "Failure on token '$token' at column $pos\n";
68							}
69							Note that if your purpose is just to display the result as the command would do, you can call directly
70							the main with the appropriate (argc, argv) style parameters to avoid to spawn another perl process.
71							See the tstregex file stub for details..
72
73							=head1 API
74
75							=head2 tstregex_init_desc($raw_re)
76
77							Pre-parses the regex, handles delimiters (m!!, //, etc.), extracts modifiers (i, s, m, x), and prepares the nibbling steps. Returns a context hash.
78
79							=head2 tstregex($ctx, $string)
80
81							Executes the diagnostic. Updates the context.
82
83							=head2 tstregex_is_full_match
84
85							Returns match status of input string (BOOL 0 OR 1)
86
87							=head2 tstregex_get_match_portion
88
89							Returns the matching portion in case of full match
90							(might be smaller than input string, depending on anchors..)
91
92							=head2 tstregex_get_match_len
93
94							Returns the matching substring length
95
96							=head2 tstregex_get_fail_token
97
98							Returns the failing token in the regexp
99
100							=head2 tstregex_get_re_clean
101
102							Returns the matching regexp subpart
103
104							=head2 tstregex_get_re_raw
105
106							Returns the internal representation of the regexp
107
108							=head2 tstregex_get_prefix_offset
109
110							Returns the offset of the original regexp in the raw regexp
111
112							=head1 DESCRIPTION
113
114							=over
115
116							=item B is designed to solve the "Black Box" problem of Regular Expressions.
117
118							=item When a complex regex fails, Perl usually just says "No Match".
119
120							=item This tool identifies exactly B and B it failed by finding the longest possible partial match.
121
122							=back
123
124							=head1 EXAMPLE
125
126							=over
127
128							=item $ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a'
129
130							=item abc123
131
132							=item abcB<12a> (^[a-z]*B<\d{3}$>)
133
134							=back
135
136							I
137
138							=head2 The "Nibbling" Engine
139
140							The diagnostic logic uses a "Nibbling" (grignotage) strategy:
141
142							=over 4
143
144							=item 1. Decomposition
145
146							The engine breaks down your regex into a hierarchy of valid sub-patterns (lexical groups, atoms, and quantifiers) from longest to shortest.
147
148							=item 2. Longest Match Search
149
150							=back
151
152							=over 4
153
154							It iteratively tests these sub-patterns against the input string.
155
156							It's not just checking if the start matches, but what is the I sequence of instructions the engine could follow before hitting a wall.
157
158							=back
159
160							=over 4
161
162							=item 3. Failure Point Identification
163
164							Once the longest matching sub-pattern is found, the tool identifies the very next token in your regex syntax. This is your "Point of Failure".
165
166							=back
167
168							=head1 AUTHOR
169
170							Olivier Delouya - 2026
171
172							=head1 LICENSE
173
174							Artistic Version 2
175
176							=head1 SEE ALSO
177
178							This tool belongs to the family of Perl regex development utilities. It is
179							intended for developers looking for a lightweight perl regex debugger,
180							providing automated regex diagnostics when a regular expression fails
181							or requires deep troubleshooting in the CLI terminal.
182
183							=head1 DESCRIPTION
184
185							Tstregex is a CLI tool designed for Perl regex debugging and diagnostics.
186							If you need to debug a Perl regular expression, find where a regex fails,
187							or pinpoint the exact character that broke your pattern, this module
188							automates the process.
189
190							=head1 KEYWORDS
191
192							perl regex debugger, debug regular expression perl, regex failure pinpoint,
193							match longest regex string, regular expression diagnostic tool,
194							perl regex troubleshooting.
195
196							=cut
197
198
199							package main;
200							{
201	15			15		78	use strict;
	15					43
	15					243
202	15			15		58	use warnings;
	15					38
	15					542
203	15			15		62	use Carp qw(confess);
	15					18
	15					928
204							$SIG{__WARN__} = 'confess';
205							$SIG{__DIE__ } = 'confess';
206
207	15			15		7209	use IO::Handle;
	15					80838
	15					796
208	15			15		8611	use Term::ANSIColor qw(:constants);
	15					120626
	15					14271
209	15			15		176	use Time::HiRes qw(gettimeofday tv_interval);
	15					39
	15					167
210	15			15		7074	use utf8;
	15					3050
	15					73
211
212							# --- Constants & ASCII Codes ---
213							use constant
214							{
215	15					4282	C_m => 109, C_g => 103, C_i => 105, C_s => 115,
216							C_x => 120, C_SLASH => 47, C_SPACE => 32, C_ZERO => 48,
217							C_NINE => 57, C_UP_A => 65, C_UP_Z => 90, C_LOW_A => 97,
218							C_LOW_Z => 122, C_UNDSC => 95,
219							OP_PAR => 40, CL_PAR => 41, OP_BRK => 91, CL_BRK => 93,
220							OP_BRC => 123, CL_BRC => 125, OP_ANG => 60, CL_ANG => 62,
221							ESC => "\e",
222							CUU => "\e[A", # Cursor Up
223							UI_OFFSET=> 11, # Alignment offset for " Syntax: "
224	15			15		1136	};
	15					18
225
226							# --- ENCAPSULATED DEBUG ALIAS ---
227							BEGIN
228							{
229	15	50		15		21542	if ($INC{'perl5db.pl'})
230							{
231	0					0	require Data::Dumper;
232	0					0	require Term::ANSIColor;
233	15			15		77	no strict 'refs';
	15					18
	15					439
234	15			15		44	no warnings 'once';
	15					18
	15					3808
235
236							my $debug_sub = sub
237							{
238	0					0	my @args = @_;
239
240							# Automatically detect flattened hashes:
241							# If even number of arguments and the first one isn't a reference
242	0	0	0			0	if (scalar @args > 1 && scalar @args % 2 == 0 && !ref($args[0]))
			0
243							{
244							# Wrap the flattened list into a temporary hashref
245	0					0	my %tmp_hash = @args;
246	0					0	@args = (\%tmp_hash);
247							}
248
249	0					0	print "\n", Term::ANSIColor::BOLD(), Term::ANSIColor::BLUE(),
250							'DEBUG (tstregex): ', Term::ANSIColor::RESET(),
251							Data::Dumper::Dumper(@args);
252	0					0	};
253
254							# Force injection into all relevant namespaces
255	0					0	foreach my $pkg ('main', 'Tstregex', 'DB')
256							{
257	0					0	*{"${pkg}::d"} = $debug_sub;
	0					0
258							}
259
260	0	0				0	my $cuu = defined &main::CUU ? main::CUU() : "\e[A";
261	0					0	print $cuu, Term::ANSIColor::BOLD(), Term::ANSIColor::CYAN(),
262							'INFO: ', Term::ANSIColor::RESET(),
263							"Alias 'd' ready (Auto-hash detection enabled)\n\n";
264							}
265							}
266
267							exit(main(scalar(@ARGV), \@ARGV)) if(!caller);
268
269							sub main
270							{
271	0			0		0	my ($argc, $argv) = @_;
272	0	0	0			0	if(!$argc \|\| ($argc && $$argv[0] =~ /^-h\|--help$/)) { help(); exit(0); }
	0		0			0
	0					0
273
274	0					0	binmode STDOUT, ':utf8';
275	0					0	my ($mode_diag, $verbose, $assert) = (0)x2;
276	0					0	for(my $i=0; $i<$argc; $i++)
277							{
278	0	0	0			0	do { $mode_diag = 1; undef $$argv[$i]; next } if (!$mode_diag && $$argv[$i] =~ /^-d\|--diag$/);
	0					0
	0					0
	0					0
279	0	0	0			0	do { $verbose = 1; undef $$argv[$i]; next } if (!$verbose && $$argv[$i] =~ /^-v\|--verbose$/);
	0					0
	0					0
	0					0
280	0	0	0			0	do { $assert = 1; undef $$argv[$i]; next } if (!$assert && $$argv[$i] =~ /^-a\|--assert$/);
	0					0
	0					0
	0					0
281							}
282
283	0	0				0	if ($assert)
284							{
285	0					0	print BOLD, BLUE, "--- Internal Test Suite (DATA Section) ---\n", RESET;
286	0					0	_run_internal_tests($mode_diag, $verbose);
287	0					0	exit(0);
288							}
289
290	0					0	my @new_argv;
291	0					0	foreach(@$argv)
292							{
293	0	0				0	push @new_argv, $_ if(defined($_));
294							}
295	0					0	$argv = \@new_argv;
296	0					0	$argc = scalar @$argv;
297
298	0					0	my $re_raw = shift @{$argv};
	0					0
299	0					0	my $ctx = Tstregex::tstregex_init_desc($re_raw);
300	0					0	my $global_result = 0; # success! BE POSITIVE !!
301	0					0	foreach my $pattern (@{$argv})
	0					0
302							{
303	0	0				0	my $t0 = [gettimeofday] if $mode_diag;
304	0					0	my $result = Tstregex::tstregex($ctx, $pattern);
305	0	0				0	$global_result = 1 if($result);
306	0	0				0	$mode_diag ? _display_enriched($pattern, $ctx, tv_interval($t0))
307							: _display_standard($pattern, $ctx);
308	0	0				0	if($verbose)
309							{
310	0	0				0	print $result? 'Match':'UNmatch', '! Match length: ', Tstregex::tstregex_get_match_len($ctx), '; ';
311	0	0				0	print $result? ('Match portion: ', Term::ANSIColor::UNDERLINE(), Tstregex::tstregex_get_match_portion($ctx))
312							: ('Fail token: ', Tstregex::tstregex_get_fail_token($ctx));
313	0					0	print Term::ANSIColor::RESET(), "\n";
314	0	0	0			0	print $ctx->{warning} if($verbose && $ctx->{warning} ne '');
315							}
316							}
317	0					0	return $global_result;
318							}
319
320							sub help
321							{
322	0			0		0	print BOLD, WHITE, "Tstregex.pm - Longest match Regular Expression Diagnostic Tool (2026 - PerlOD)\n", RESET;
323	0					0	print "Usage:\n";
324	0					0	print " perl Tstregex.pm [options] 'regex' 'string1' ['string2' ...]\n\n";
325	0					0	print "Examples:\n";
326	0					0	print " perl Tstregex.pm '([0-3][0-9])/[0-1][0-9]/\\d{4}' '21/72/1985'\n";
327	0					0	print ' 21/', BOLD, '72/1985', RESET, ' ([0-3][0-9]/', BOLD, '[0-1][0-9]/\d{4}', RESET, ")\n\n";
328	0					0	print BOLD, 'DELIMITERS ', RESET, "are optional\n";
329	0					0	print " Supported: /.../, m!...!, m{...}. Modifiers (/i, /x, /s...) and captures are supported.\n\n";
330	0					0	print "Options:\n";
331	0					0	print "-h --help Shows that help\n";
332	0					0	print "-v --verbose Shows keys info on match/unmatch\n";
333	0					0	print "-d --diag Enriched diagnostic with timing and syntax pointers\n";
334	0					0	print "-a --assert Misc: shows a large test of regexp against tstregex..\n";
335							}
336
337							sub _run_internal_tests
338							{
339	0			0		0	my ($mode_diag, $verbose) = @_;
340	0					0	my $fh = \*main::DATA;
341	0					0	seek($fh, 0, 0);
342
343	0					0	my $found_data_token = 0;
344
345	0					0	while (<$fh>)
346							{
347	0					0	chomp;
348							# PHASE 1: Skip everything until we hit the __DATA__ or __END__ marker
349							# This prevents the script from parsing its own source code
350	0	0				0	if (!$found_data_token)
351							{
352	0	0				0	$found_data_token = 1 if /^__(DATA\|END)__/;
353	0					0	next;
354							}
355							else
356							{
357	0	0				0	last if /^__(DATA\|END)__/;
358							}
359	0	0	0			0	next if /^\s*$/ \|\| /^#/;
360	0					0	my ($re, @rest) = split(/\s+\|:::\s*/, $_);
361	0	0				0	next unless $re;
362	0	0				0	my @strings = grep { $_ ne '0' && $_ ne '1' } @rest;
	0					0
363
364	0					0	print BOLD, YELLOW, 'Testing Regex: ', RESET, "$re\n";
365	0					0	my $ctx = Tstregex::tstregex_init_desc($re);
366	0	0	0			0	print 'Warning ', $ctx->{warning}, "\n" if($verbose && $ctx->{warning} ne '');
367	0					0	foreach my $s (@strings)
368							{
369	0					0	my $t0;
370	0	0				0	$t0 = [gettimeofday] if $mode_diag;
371	0					0	Tstregex::tstregex($ctx, $s);
372	0	0				0	$mode_diag ? _display_enriched($s, $ctx, tv_interval($t0))
373							: _display_standard($s, $ctx);
374							}
375	0					0	print '-' x 40, "\n";
376							}
377							}
378
379							sub _get_re_val
380							{
381							return (
382	0			0		0	Tstregex::tstregex_get_match_len ($_[0]),
383							Tstregex::tstregex_get_fail_token($_[0]),
384							Tstregex::tstregex_get_re_clean ($_[0])
385							);
386							}
387
388							sub _display_standard
389							{
390	0			0		0	my ($pattern, $ctx) = @_;
391	0	0				0	if (Tstregex::tstregex_is_full_match($ctx))
392							{
393	0					0	print "$pattern\n";
394	0					0	return undef;
395							}
396	0					0	my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
397	0					0	print substr($pattern, 0, $match_len); STDOUT->flush(); print STDERR BOLD, substr($pattern, $match_len), RESET;
	0					0
	0					0
398	0					0	my $off = length($re_clean) - length($fail_token);
399	0					0	print STDERR ' (', substr($re_clean, 0, $off), BOLD, $fail_token, RESET, ")\n";
400	0					0	undef;
401							}
402
403							sub _display_enriched
404							{
405	0			0		0	my ($pattern, $ctx, $elapsed) = @_;
406	0					0	print BOLD, MAGENTA, '--- Diagnostic View ---', RESET, "\n";
407
408	0	0				0	if (Tstregex::tstregex_is_full_match($ctx))
409							{
410	0					0	print GREEN, ' Result: ', RESET, "$pattern (FULL MATCH)\n";
411	0					0	return undef;
412							}
413	0					0	my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
414	0					0	my $prefix_off = Tstregex::tstregex_get_prefix_offset($ctx);
415	0					0	my $re_raw = Tstregex::tstregex_get_re_raw ($ctx);
416
417	0					0	print YELLOW, ' Result: ', RESET, substr($pattern, 0, $match_len),
418							BOLD, WHITE, substr($pattern, $match_len), RESET;
419	0					0	print ' (at ', CYAN, $fail_token, RESET, ")\n";
420
421	0					0	my $err_pos_in_clean = length($re_clean) - length($fail_token);
422	0					0	my $final_pointer_pos = $prefix_off + $err_pos_in_clean;
423
424	0					0	print ' Syntax: ', WHITE, $re_raw, RESET, "\n";
425	0					0	print ' ', ' ' x $final_pointer_pos, BOLD, RED, '^--- HERE', RESET, "\n";
426	0					0	printf " Time: %.4fs\n\n", $elapsed;
427	0					0	undef;
428							}
429
430							}
431
432							1;
433
434							package Tstregex;
435							{
436							our $VERSION = '1.14';
437	15			15		103	use Exporter qw(import);
	15					19
	15					2920
438
439							our @EXPORT = qw(
440							tstregex
441							tstregex_init_desc
442							tstregex_get_match_len
443							tstregex_get_fail_token
444							tstregex_is_full_match
445							tstregex_get_re_clean
446							tstregex_get_prefix_offset
447							tstregex_get_re_raw
448							tstregex_get_match_portion
449							tstregex_get_info
450							);
451
452							# --- PUBLIC API (The Getters) ---
453
454	115			115	1	384	sub tstregex_get_match_len { return $_[0]->{match_len}; }
455	115			115	1	356	sub tstregex_get_fail_token { return $_[0]->{fail_token}; }
456	0			0	1	0	sub tstregex_is_full_match { return $_[0]->{full_match}; }
457	0			0	1	0	sub tstregex_get_re_clean { return $_[0]->{re_clean}; }
458	0			0	1	0	sub tstregex_get_prefix_offset { return $_[0]->{prefix_offset}; }
459	0			0	1	0	sub tstregex_get_re_raw { return $_[0]->{re_raw}; }
460	115		50	115	0	399	sub tstregex_get_captures { return $_[0]->{captures} // [];}
461	0			0	1	0	sub tstregex_get_match_portion { return $_[0]->{match_portion}; }
462	0			0	0	0	sub tstregex_get_info { return $_[0]->{warning}; }
463
464							# Main diagnostic function
465	15			15		71	use constant;
	15					25
	15					1174
466							use constant
467							{
468	15					28143	C_EMPTY => '',
469							RE_EMPTY => qr/\0/,
470							ASCII_LPAREN => ord('('), # 40
471							ASCII_RPAREN => ord(')'), # 41
472							ASCII_LBRACE => ord('{'), # 123
473							ASCII_RBRACE => ord('}'), # 125
474							ASCII_LBRACK => ord('['), # 91
475							ASCII_RBRACK => ord(']'), # 93
476							ASCII_LT => ord('<'), # 60
477							ASCII_GT => ord('>'), # 62
478	15			15		75	};
	15					31
479
480							sub tstregex
481							{
482	115			115	1	412	my ($ctx, $pattern) = @_;
483							# FIX: re init start-state fields in case of multiple test patterns
484	115					182	$ctx -> {'fail_token'} = Tstregex::C_EMPTY;
485	115					155	$ctx -> {'match_portion'} = undef;
486	115					123	$ctx -> {'right_unmatch'} = undef;
487	115					130	$ctx -> {'match_len'} = 0;
488	115					115	$ctx -> {'full_match'} = 0;
489	115					124	$ctx -> {'left_unmatch'} = undef;
490	115					131	my $re_raw = $ctx->{re_raw};
491	115					128	my $org_pat = $pattern;
492	115					139	my $internal_offset = 0;
493
494							# Handle prefix offset if pattern is wrapped like the RE
495	115	100	66			235	if ($ctx->{prefix_offset} > 0 && length($pattern) >= $ctx->{prefix_offset} + 1)
496							{
497	1					4	my $re_delim_char = _stringat($re_raw, $ctx->{prefix_offset} - 1);
498	1					3	my $pat_first_char = _stringat($pattern, 0);
499	1	50				2	if ($pat_first_char == $re_delim_char)
500							{
501	1					2	$pattern = substr($org_pat, $ctx->{prefix_offset});
502	1					2	chop $pattern;
503	1					2	$internal_offset = $ctx->{prefix_offset};
504							}
505							}
506
507							# Fast track: check if it matches globally first
508	115	100				541	if ($pattern =~ $ctx->{re_compiled})
509							{
510	68					104	$ctx->{full_match} = 1;
511	68					90	$ctx->{match_len} = length($org_pat);
512	68					124	$ctx->{match_portion} = $&; # the exact sub portion that matched
513	68					110	$ctx->{left_unmatch} = $`; # the left part of matching sub part
514	68					109	$ctx->{right_unmatch} = $'; # the right part of matching sub part
515
516							# --- Capture Groups Extraction ---
517							# We populate the captures array only on a successful global match
518	68					88	my @caps;
519
520							# The special variable $#- contains the number of capture groups
521							# We start at 1 because $0 is the whole match
522	68					236	for my $i (1 .. $#-)
523							{
524	22	100				50	if (defined $-[$i])
525							{
526							# Extracting the substring using offsets from @- and @+
527	21					96	push @caps, substr($pattern, $-[$i], $+[$i] - $-[$i]);
528							}
529							else
530							{
531							# Optional group that participated but didn't catch text
532	1					2	push @caps, undef;
533							}
534							}
535	68					109	$ctx->{captures} = \@caps;
536
537	68					150	return 1;
538							}
539
540							# Nibbling phase: find the longest matching lexical group
541	47					102	my $match_reg = Tstregex::C_EMPTY;
542	47					82	foreach my $step (@{$ctx->{steps}})
	47					105
543							{
544	114	100				1021	if ($pattern =~ qr/$step/)
545							{
546	35					48	$match_reg = $step;
547	35					52	last;
548							}
549							}
550
551							# SENSITIVE
552							# Fine-tune the match length character by character
553							# Append a \z to avoid Perl to skip final \n
554							# if Nibbling failed ($match_reg empty), get the full regex for fine-tuning.
555	47	100				131	my $target_re = ($match_reg ne Tstregex::C_EMPTY) ? $match_reg : $ctx->{re_clean};
556							# critical! add starting anchor (\A) to force coherency check from the first char..
557	47					112	my ($match_work, $warn) = _safe_qr("\\A$target_re\\z"); # (qr/\A$target_re\z/, undef); #;
558	47					143	for (my $i = length($pattern); $i >= 0; $i--)
559							{
560							# check if current prefix is valid according to the target
561	117	100				324	last if ($pattern =~ $match_work);
562	83					146	chop $pattern;
563							}
564	47					93	$ctx->{match_len} = length($pattern) + $internal_offset;
565
566							# Identify the failing token for display
567	47	50				45	my $tail_re = (scalar @{$ctx->{steps}}) ? $ctx->{steps}->[0] : $ctx->{re_clean};
	47					146
568	47					81	my $remaining_re = substr($tail_re, length($match_reg));
569
570	47	50				85	if ($remaining_re ne Tstregex::C_EMPTY)
571							{
572							# get the first token for analyse
573	47					83	my $next_tokens = _get_lex_groups($remaining_re);
574	47		50			105	my $first_token = $next_tokens->[0] // Tstregex::C_EMPTY;
575	47					89	$ctx->{fail_token} = $remaining_re;
576	47	100				157	$ctx->{fail_token} = $first_token if ($first_token =~ /^(\\b\|\^\|\$)$/); # Anchor case (0 width): want detail (just \b, ^ or $)
577							}
578							else
579							{
580	0					0	$ctx->{fail_token} = Tstregex::C_EMPTY;
581							}
582
583							# Ensure captures is empty/undef on failure
584	47					73	$ctx->{captures} = [];
585	47	50				151	return $ctx->{match_undef}? undef:0;
586							}
587
588							# Context initialization and RE peeling
589							sub tstregex_init_desc
590							{
591	115			115	1	2142136	my ($re_raw) = @_;
592							# The Shield: Catching the 5.28 deprecation warnings and fatal errors
593							# We use a localized __WARN__ handler to catch the "Unescaped left brace"
594							# even if it's not a fatal error yet in 5.28
595
596	115					243	my ($re_compiled, $re_clean, $prefix_off, $last_warning) = _unwrap_regex($re_raw);
597	115	50				259	my $match_undef = $re_compiled eq RE_EMPTY ? 1:0;
598
599	115					188	my $steps = _parse_lex_groups($re_clean);
600							# {
601							# no warnings 'experimental::re_strict';
602							# use re 'strict';
603							# $steps = _parse_lex_groups($re_clean);
604							# };
605							return
606							{
607	115					891	re_raw => $re_raw, re_compiled => $re_compiled, re_clean => $re_clean,
608							steps => $steps, prefix_offset => $prefix_off, match_len => 0,
609							fail_token => Tstregex::C_EMPTY, full_match => 0,match_portion => undef,
610							match_undef => $match_undef, left_unmatch => undef, right_unmatch => undef,
611							warning => $last_warning,
612							};
613							}
614
615							# Helper: get char code at position
616	2			2		3	sub _stringat($$) { return vec($_[0], $_[1], 8); }
617
618							sub _unwrap_regex
619							{
620	115			115		189	my ($raw) = @_;
621	115	50	33			480	return (qr//, Tstregex::C_EMPTY, 0) if !defined $raw \|\| $raw eq Tstregex::C_EMPTY;
622
623	115					136	my $raw_org = $raw;
624	115					175	my $options = Tstregex::C_EMPTY;
625
626							# 1. Extract trailing options (ismxg)
627	115					484	while ($raw =~ s/([ismxg])$//)
628							{
629	0					0	$options = $1 . $options;
630							}
631
632							# 2. Delegate peeling to _strop
633	115					206	my $clean = _strop($raw);
634
635							# 3. Automatic offset calculation (locate the "juice" within the original string)
636	115					225	my $off = index($raw_org, $clean);
637	115	50				262	$off = 0 if $off < 0;
638
639							# 4. Secure Forge (Remove 'g' as it is irrelevant for qr//)
640	115					161	$options =~ tr/g//d;
641	115	50				222	my $re_str = $options ? "(?$options)$clean" : $clean;
642
643	115					212	my ($re_ret, $warn) = _safe_qr($re_str);
644
645	115					299	return ($re_ret, $clean, $off, $warn);
646							}
647
648							sub _safe_qr
649							{
650	162			162		238	my ($re_str) = @_;
651	162					172	my ($re, $err);
652							{
653	162					173	local $@;
	162					161
654	162			1		1211	local $SIG{__DIE__} = local $SIG{__WARN__} = sub { };
655							# dont catch the warning there, let the eval fail instead and get the message back in $@
656	162					243	$re = eval { qr/$re_str/ };
	162					2260
657	162		50			832	$err = $@ // '';
658							}
659	162		50			481	return ($re // RE_EMPTY, $err);
660							}
661
662							# _strop: strip operators
663							# Peels Perl operators (m!!, m{}, //) by checking extremities.
664							# It ensures only the core regex juice is returned.
665							sub _strop
666							{
667	115			115		148	my ($raw) = @_;
668	115	50	33			370	return $raw if !defined $raw \|\| $raw eq Tstregex::C_EMPTY;
669
670							# Remove leading/trailing whitespace
671	115					423	$raw =~ s/^\s+\|\s+$//g;
672
673							# Identify opening delimiter (after an optional 'm')
674	115	100				513	if ($raw =~ /^((?:m\s)?)([^\w\s])(.)$/s)
675							{
676	50					125	my $prefix = $1;
677	50					75	my $open = $2;
678	50					95	my $body = $3;
679
680							# Map symmetric pairs using ASCII constants
681	50					220	my %sym_or_eq =
682							(
683							chr(ASCII_LBRACE) => chr(ASCII_RBRACE),
684							chr(ASCII_LBRACK) => chr(ASCII_RBRACK),
685							chr(ASCII_LPAREN) => chr(ASCII_RPAREN),
686							chr(ASCII_LT) => chr(ASCII_GT),
687							);
688
689							# Expected close is either the matching pair or the same character (e.g., m!!)
690	50		66			165	my $expected_close = $sym_or_eq{$open} \|\| $open;
691
692							# PROTECT CAPTURE GROUPS:
693							# If the delimiter is a parenthesis but there is no 'm' prefix,
694							# it is a capturing group, NOT an operator. Do not peel!
695	50	100	66			164	if ($open eq chr(ASCII_LPAREN) && !$prefix)
696							{
697	26					71	return $raw;
698							}
699
700							# Check if the very last character matches our expected closing delimiter
701	24	100				59	if (substr($body, -1) eq $expected_close)
702							{
703	1					5	return substr($body, 0, -1);
704							}
705
706							# If 'm' was present but closing failed, return body (best effort)
707	23	50				66	return $body if $prefix;
708							}
709
710	88					143	return $raw;
711							}
712
713							sub _parse_lex_groups
714							{
715	115			115		206	my ($regex) = @_;
716	115					208	my $tokens = _get_lex_groups($regex);
717	115					128	my @results;
718	115					261	my $current = join(Tstregex::C_EMPTY, @$tokens);
719
720	115					214	while (@$tokens)
721							{
722	412					710	my $opens = () = $current =~ /(?
723	412					588	my $closes = () = $current =~ /(?
724	412	50				559	if ($opens >= $closes)
725							{
726	412					584	my $v = $current . (')' x ($opens - $closes));
727	412					541	$v =~ s/(?
728	412	100				380	if (eval { qr/$v/ }) { push @results, $v; }
	412					4346
	409					608
729							}
730	412					648	my $last = pop @$tokens;
731	412	50				1003	substr($current, -length($last)) = Tstregex::C_EMPTY if defined $last;
732							}
733	115					218	return \@results;
734							}
735
736							# Lexical tokenizer for Perl Regex
737							sub _get_lex_groups
738							{
739	162			162		208	my ($regex) = @_;
740	162					168	my @groups;
741
742							# --- START: class mismatch support (Added [.*?] to tokenizer) ---
743							# my $re = qr/($\?\#.?$)\|($\?[:=!<>]+)\|(\{\d+,?\d\})\|(\[.?\])\|(\\.)\|([\($\\|^\$\+\\?])\|(.)/x;
744							# --- ENHANCED: Atomic Lookaround & Recursive Group Support ---
745							# Group 1: Comments (?#...)
746							# Group 2: Assertions and Groups (?=, (?:, (?<, etc. including nested parens
747							# Group 3: Quantifiers {n,m}
748							# Group 4: Character classes [...]
749							# Group 5: Escaped characters \.
750							# Group 6: Metacharacters ( ) \| ^ $ + * ?
751							# Group 7: Any other character
752							# --- END: class mismatch support ---
753	162					288	my $re = qr/
754							($\?\#.*?$)
755							\| ( # START GROUP 2
756							\(\?[:=!<>]+ # Assertion header
757							(?: # Content
758							(?> [^()]+ ) # Non-paren characters (atomic)
759							\|
760							(?2) # Recursive call to Group 2
761							)*
762							\) # Matching closing paren
763							) # END GROUP 2
764							\| (\{\d+,?\d*\})
765							\| (\[.*?\])
766							\| (\\.)
767							\| ([\\|^\$\+\*\?])
768							\| (.)
769							/x;
770
771	162					940	while ($regex =~ /$re/g)
772							{
773	561		100			3509	my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
			100
			100
			100
			100
			66
774	561	100	66			1924	if (defined $t && $t =~ /^[\+\*\?]$\|^\{\d/ && @groups && $groups[-1] !~ /^[\\|]$/)
			66
			100
775							{
776	71					239	$groups[-1] .= $t;
777							}
778	490					1734	else { push @groups, $t; }
779							}
780
781							# TODO: test that optimized code fragment instead; Much time spent here, but sensitive part..
782							# my $quantifiers = '+*?{';
783							# while ($regex =~ /$re/g)
784							# {
785							# my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
786							# if (defined $t && @groups)
787							# {
788							# my $char = substr($t, 0, 1);
789							# if (index($quantifiers, $char) != -1 && $groups[-1] !~ /^[\\|]$/)
790							# {
791							# $groups[-1] .= $t;
792							# next;
793							# }
794							# }
795							# push @groups, $t;
796							# }
797
798	162					404	return \@groups;
799							}
800
801							}
802
803							1;
804
805							package main;
806
807							__DATA__