File Coverage

blib/lib/Tstregex.pm

Criterion	Covered	Total	%
statement	168	298	56.3
branch	37	102	36.2
condition	36	80	45.0
subroutine	28	40	70.0
pod	9	11	81.8
total	278	531	52.3

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							###############################################
4							# Author: Olivier Delouya - 2026
5							# File: Tstregex.pm (Hybrid Modulino)
6							# Content: Regex longest match diagnostic
7							# indent: Whitesmith (perltidy -bl -bli)
8							###############################################
9
10	15			15		1425954	use strict;
	15					23
	15					459
11	15			15		49	use warnings;
	15					72
	15					1056
12
13							=head1 NAME
14
15							Tstregex - A Diagnostic Tool that quickly shows the longest Regular Expression string match, highlighting the rejected part. The terminal command
16							tstregex '/^[a-z]\d{3}$/' 'abc12a' shows: abcB<12a> (^[a-z]B<\d{3}$>)
17
18							# Above, the normal parts are the longuest matching substring when bold parts highlights the rejected substring
19							#(idem with regexp lexical groups between parenthesis)
20							# A Hybrid Regex Diagnostic Tool (single file Library module with API and command tool)
21
22							=cut
23
24							=head1 SYNOPSIS
25
26							C<$tstregex 'regex' string1 string2 ... stringN>
27
28							=head1 OPTIONS (CLI)
29
30							=head2 -h --help
31
32							show that help..
33
34							=head2 -v --verbose
35
36							shows key info on (un)matching..
37
38							=head2 -d --diag
39
40							=over
41
42							=item Triggers the Enriched Diagnostic View. It displays:
43
44							=item - The string with the failing part highlighted.
45
46							=item - The exact token in the regex that caused the break.
47
48							=item - A visual pointer (C<^--- HERE>) aligned with the regex syntax.
49
50							=item - Execution time (useful for spotting ReDoS/Exponential backtracking).
51
52							=back
53
54							=head2 -a --assert
55
56							Misc: performs a huge test suite various a large collection of regexp tests with Tstregex..
57
58							=head1 Perl Module SYNOPSIS
59
60							use Tstregex;
61							my $ctx = tstregex_init_desc('/^\d{3}/');
62							tstregex($ctx, '12a');
63							if (!tstregex_is_full_match($res))
64							{
65							my $token = tstregex_get_fail_token($res);
66							my $pos = tstregex_get_match_len($res);
67							print "Failure on token '$token' at column $pos\n";
68							}
69							Note that if your purpose is just to display the result as the command would do, you can call directly
70							the main with the appropriate (argc, argv) style parameters to avoid to spawn another perl process.
71							See the tstregex file stub for details..
72
73							=head1 API
74
75							=head2 tstregex_init_desc($raw_re)
76
77							Pre-parses the regex, handles delimiters (m!!, //, etc.), extracts modifiers (i, s, m, x), and prepares the nibbling steps. Returns a context hash.
78
79							=head2 tstregex($ctx, $string)
80
81							Executes the diagnostic. Updates the context.
82
83							=head2 tstregex_is_full_match
84
85							Returns match status of input string (BOOL 0 OR 1)
86
87							=head2 tstregex_get_match_portion
88
89							Returns the matching portion in case of full match
90							(might be smaller than input string, depending on anchors..)
91
92							=head2 tstregex_get_match_len
93
94							Returns the matching substring length
95
96							=head2 tstregex_get_fail_token
97
98							Returns the failing token in the regexp
99
100							=head2 tstregex_get_re_clean
101
102							Returns the matching regexp subpart
103
104							=head2 tstregex_get_re_raw
105
106							Returns the internal representation of the regexp
107
108							=head2 tstregex_get_prefix_offset
109
110							Returns the offset of the original regexp in the raw regexp
111
112							=head1 DESCRIPTION
113
114							=over
115
116							=item B is designed to solve the "Black Box" problem of Regular Expressions.
117
118							=item When a complex regex fails, Perl usually just says "No Match".
119
120							=item This tool identifies exactly B and B it failed by finding the longest possible partial match.
121
122							=back
123
124							=head1 EXAMPLE
125
126							=over
127
128							=item $ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a'
129
130							=item abc123
131
132							=item abcB<12a> (^[a-z]*B<\d{3}$>)
133
134							=back
135
136							I
137
138							=head2 The "Nibbling" Engine
139
140							The diagnostic logic uses a "Nibbling" (grignotage) strategy:
141
142							=over 4
143
144							=item 1. Decomposition
145
146							The engine breaks down your regex into a hierarchy of valid sub-patterns (lexical groups, atoms, and quantifiers) from longest to shortest.
147
148							=item 2. Longest Match Search
149
150							=back
151
152							=over 4
153
154							It iteratively tests these sub-patterns against the input string.
155
156							It's not just checking if the start matches, but what is the I sequence of instructions the engine could follow before hitting a wall.
157
158							=back
159
160							=over 4
161
162							=item 3. Failure Point Identification
163
164							Once the longest matching sub-pattern is found, the tool identifies the very next token in your regex syntax. This is your "Point of Failure".
165
166							=back
167
168							=head1 AUTHOR
169
170							Olivier Delouya - 2026
171
172							=head1 LICENSE
173
174							Artistic Version 2
175
176							=cut
177
178
179							package main;
180							{
181	15			15		67	use strict;
	15					33
	15					301
182	15			15		47	use warnings;
	15					30
	15					660
183	15			15		65	use Carp qw(confess);
	15					18
	15					958
184							$SIG{__WARN__} = 'confess';
185							$SIG{__DIE__ } = 'confess';
186
187	15			15		7676	use IO::Handle;
	15					87756
	15					721
188	15			15		8568	use Term::ANSIColor qw(:constants);
	15					114932
	15					13301
189	15			15		170	use Time::HiRes qw(gettimeofday tv_interval);
	15					21
	15					139
190	15			15		6413	use utf8;
	15					2965
	15					89
191
192							# --- Constants & ASCII Codes ---
193							use constant
194							{
195	15					4246	C_m => 109, C_g => 103, C_i => 105, C_s => 115,
196							C_x => 120, C_SLASH => 47, C_SPACE => 32, C_ZERO => 48,
197							C_NINE => 57, C_UP_A => 65, C_UP_Z => 90, C_LOW_A => 97,
198							C_LOW_Z => 122, C_UNDSC => 95,
199							OP_PAR => 40, CL_PAR => 41, OP_BRK => 91, CL_BRK => 93,
200							OP_BRC => 123, CL_BRC => 125, OP_ANG => 60, CL_ANG => 62,
201							ESC => "\e",
202							CUU => "\e[A", # Cursor Up
203							UI_OFFSET=> 11, # Alignment offset for " Syntax: "
204	15			15		1135	};
	15					22
205
206							# --- ENCAPSULATED DEBUG ALIAS ---
207							BEGIN
208							{
209	15	50		15		21973	if ($INC{'perl5db.pl'})
210							{
211	0					0	require Data::Dumper;
212	0					0	require Term::ANSIColor;
213	15			15		99	no strict 'refs';
	15					19
	15					455
214	15			15		52	no warnings 'once';
	15					16
	15					3852
215
216							my $debug_sub = sub
217							{
218	0					0	my @args = @_;
219
220							# Automatically detect flattened hashes:
221							# If even number of arguments and the first one isn't a reference
222	0	0	0			0	if (scalar @args > 1 && scalar @args % 2 == 0 && !ref($args[0]))
			0
223							{
224							# Wrap the flattened list into a temporary hashref
225	0					0	my %tmp_hash = @args;
226	0					0	@args = (\%tmp_hash);
227							}
228
229	0					0	print "\n", Term::ANSIColor::BOLD(), Term::ANSIColor::BLUE(),
230							'DEBUG (tstregex): ', Term::ANSIColor::RESET(),
231							Data::Dumper::Dumper(@args);
232	0					0	};
233
234							# Force injection into all relevant namespaces
235	0					0	foreach my $pkg ('main', 'Tstregex', 'DB')
236							{
237	0					0	*{"${pkg}::d"} = $debug_sub;
	0					0
238							}
239
240	0	0				0	my $cuu = defined &main::CUU ? main::CUU() : "\e[A";
241	0					0	print $cuu, Term::ANSIColor::BOLD(), Term::ANSIColor::CYAN(),
242							'INFO: ', Term::ANSIColor::RESET(),
243							"Alias 'd' ready (Auto-hash detection enabled)\n\n";
244							}
245							}
246
247							exit(main(scalar(@ARGV), \@ARGV)) if(!caller);
248
249							sub main
250							{
251	0			0		0	my ($argc, $argv) = @_;
252	0	0	0			0	if(!$argc \|\| ($argc && $$argv[0] =~ /^-h\|--help$/)) { help(); exit(0); }
	0		0			0
	0					0
253
254	0					0	binmode STDOUT, ':utf8';
255	0					0	my ($mode_diag, $verbose, $assert) = (0)x2;
256	0					0	for(my $i=0; $i<$argc; $i++)
257							{
258	0	0	0			0	do { $mode_diag = 1; undef $$argv[$i]; next } if (!$mode_diag && $$argv[$i] =~ /^-d\|--diag$/);
	0					0
	0					0
	0					0
259	0	0	0			0	do { $verbose = 1; undef $$argv[$i]; next } if (!$verbose && $$argv[$i] =~ /^-v\|--verbose$/);
	0					0
	0					0
	0					0
260	0	0	0			0	do { $assert = 1; undef $$argv[$i]; next } if (!$assert && $$argv[$i] =~ /^-a\|--assert$/);
	0					0
	0					0
	0					0
261							}
262
263	0	0				0	if ($assert)
264							{
265	0					0	print BOLD, BLUE, "--- Internal Test Suite (DATA Section) ---\n", RESET;
266	0					0	_run_internal_tests($mode_diag, $verbose);
267	0					0	exit(0);
268							}
269
270	0					0	my @new_argv;
271	0					0	foreach(@$argv)
272							{
273	0	0				0	push @new_argv, $_ if(defined($_));
274							}
275	0					0	$argv = \@new_argv;
276	0					0	$argc = scalar @$argv;
277
278	0					0	my $re_raw = shift @{$argv};
	0					0
279	0					0	my $ctx = Tstregex::tstregex_init_desc($re_raw);
280	0					0	my $global_result = 0; # success! BE POSITIVE !!
281	0					0	foreach my $pattern (@{$argv})
	0					0
282							{
283	0	0				0	my $t0 = [gettimeofday] if $mode_diag;
284	0					0	my $result = Tstregex::tstregex($ctx, $pattern);
285	0	0				0	$global_result = 1 if($result);
286	0	0				0	$mode_diag ? _display_enriched($pattern, $ctx, tv_interval($t0))
287							: _display_standard($pattern, $ctx);
288	0	0				0	if($verbose)
289							{
290	0	0				0	print $result? 'Match':'UNmatch', '! Match length: ', Tstregex::tstregex_get_match_len($ctx), '; ';
291	0	0				0	print $result? ('Match portion: ', Term::ANSIColor::UNDERLINE(), Tstregex::tstregex_get_match_portion($ctx))
292							: ('Fail token: ', Tstregex::tstregex_get_fail_token($ctx));
293	0					0	print Term::ANSIColor::RESET(), "\n";
294	0	0	0			0	print $ctx->{warning} if($verbose && $ctx->{warning} ne '');
295							}
296							}
297	0					0	return $global_result;
298							}
299
300							sub _run_internal_tests
301							{
302	0			0		0	my ($mode_diag, $verbose) = @_;
303	0					0	my $fh = \*main::DATA;
304	0					0	seek($fh, 0, 0);
305
306	0					0	my $found_data_token = 0;
307
308	0					0	while (<$fh>)
309							{
310	0					0	chomp;
311							# PHASE 1: Skip everything until we hit the __DATA__ or __END__ marker
312							# This prevents the script from parsing its own source code
313	0	0				0	if (!$found_data_token)
314							{
315	0	0				0	$found_data_token = 1 if /^__(DATA\|END)__/;
316	0					0	next;
317							}
318							else
319							{
320	0	0				0	last if /^__(DATA\|END)__/;
321							}
322	0	0	0			0	next if /^\s*$/ \|\| /^#/;
323	0					0	my ($re, @rest) = split(/\s+\|:::\s*/, $_);
324	0	0				0	next unless $re;
325	0	0				0	my @strings = grep { $_ ne '0' && $_ ne '1' } @rest;
	0					0
326
327	0					0	print BOLD, YELLOW, 'Testing Regex: ', RESET, "$re\n";
328	0					0	my $ctx = Tstregex::tstregex_init_desc($re);
329	0	0	0			0	print 'Warning ', $ctx->{warning}, "\n" if($verbose && $ctx->{warning} ne '');
330	0					0	foreach my $s (@strings)
331							{
332	0					0	my $t0;
333	0	0				0	$t0 = [gettimeofday] if $mode_diag;
334	0					0	Tstregex::tstregex($ctx, $s);
335	0	0				0	$mode_diag ? _display_enriched($s, $ctx, tv_interval($t0))
336							: _display_standard($s, $ctx);
337							}
338	0					0	print '-' x 40, "\n";
339							}
340							}
341
342							sub _get_re_val
343							{
344							return (
345	0			0		0	Tstregex::tstregex_get_match_len ($_[0]),
346							Tstregex::tstregex_get_fail_token($_[0]),
347							Tstregex::tstregex_get_re_clean ($_[0])
348							);
349							}
350
351							sub _display_standard
352							{
353	0			0		0	my ($pattern, $ctx) = @_;
354	0	0				0	if (Tstregex::tstregex_is_full_match($ctx))
355							{
356	0					0	print "$pattern\n";
357	0					0	return undef;
358							}
359	0					0	my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
360	0					0	print substr($pattern, 0, $match_len); STDOUT->flush(); print STDERR BOLD, substr($pattern, $match_len), RESET;
	0					0
	0					0
361	0					0	my $off = length($re_clean) - length($fail_token);
362	0					0	print STDERR ' (', substr($re_clean, 0, $off), BOLD, $fail_token, RESET, ")\n";
363	0					0	undef;
364							}
365
366							sub _display_enriched
367							{
368	0			0		0	my ($pattern, $ctx, $elapsed) = @_;
369	0					0	print BOLD, MAGENTA, '--- Diagnostic View ---', RESET, "\n";
370
371	0	0				0	if (Tstregex::tstregex_is_full_match($ctx))
372							{
373	0					0	print GREEN, ' Result: ', RESET, "$pattern (FULL MATCH)\n";
374	0					0	return undef;
375							}
376	0					0	my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
377	0					0	my $prefix_off = Tstregex::tstregex_get_prefix_offset($ctx);
378	0					0	my $re_raw = Tstregex::tstregex_get_re_raw ($ctx);
379
380	0					0	print YELLOW, ' Result: ', RESET, substr($pattern, 0, $match_len),
381							BOLD, WHITE, substr($pattern, $match_len), RESET;
382	0					0	print ' (at ', CYAN, $fail_token, RESET, ")\n";
383
384	0					0	my $err_pos_in_clean = length($re_clean) - length($fail_token);
385	0					0	my $final_pointer_pos = $prefix_off + $err_pos_in_clean;
386
387	0					0	print ' Syntax: ', WHITE, $re_raw, RESET, "\n";
388	0					0	print ' ', ' ' x $final_pointer_pos, BOLD, RED, '^--- HERE', RESET, "\n";
389	0					0	printf " Time: %.4fs\n\n", $elapsed;
390	0					0	undef;
391							}
392
393							sub help
394							{
395	0			0		0	print BOLD, WHITE, "Tstregex.pm - Longest match Regular Expression Diagnostic Tool (2026 - PerlOD)\n", RESET;
396	0					0	print "Usage:\n";
397	0					0	print " perl Tstregex.pm [options] 'regex' 'string1' ['string2' ...]\n\n";
398	0					0	print "Examples:\n";
399	0					0	print " perl Tstregex.pm '([0-3][0-9])/[0-1][0-9]/\\d{4}' '21/72/1985'\n";
400	0					0	print ' 21/', BOLD, '72/1985', RESET, ' ([0-3][0-9]/', BOLD, '[0-1][0-9]/\d{4}', RESET, ")\n\n";
401	0					0	print BOLD, 'DELIMITERS ', RESET, "are optional\n";
402	0					0	print " Supported: /.../, m!...!, m{...}. Modifiers (/i, /x, /s...) and captures are supported.\n\n";
403	0					0	print "Options:\n";
404	0					0	print "-h --help Shows that help\n";
405	0					0	print "-v --verbose Shows keys info on match/unmatch\n";
406	0					0	print "-d --diag Enriched diagnostic with timing and syntax pointers\n";
407	0					0	print "-a --assert Misc: shows a large test of regexp against tstregex..\n";
408							}
409
410							}
411
412							1;
413
414							package Tstregex;
415							{
416							our $VERSION = '1.12';
417	15			15		102	use Exporter qw(import);
	15					16
	15					3070
418
419							our @EXPORT = qw(
420							tstregex
421							tstregex_init_desc
422							tstregex_get_match_len
423							tstregex_get_fail_token
424							tstregex_is_full_match
425							tstregex_get_re_clean
426							tstregex_get_prefix_offset
427							tstregex_get_re_raw
428							tstregex_get_match_portion
429							tstregex_get_info
430							);
431
432							# --- PUBLIC API (The Getters) ---
433
434	115			115	1	383	sub tstregex_get_match_len { return $_[0]->{match_len}; }
435	115			115	1	359	sub tstregex_get_fail_token { return $_[0]->{fail_token}; }
436	0			0	1	0	sub tstregex_is_full_match { return $_[0]->{full_match}; }
437	0			0	1	0	sub tstregex_get_re_clean { return $_[0]->{re_clean}; }
438	0			0	1	0	sub tstregex_get_prefix_offset { return $_[0]->{prefix_offset}; }
439	0			0	1	0	sub tstregex_get_re_raw { return $_[0]->{re_raw}; }
440	115		50	115	0	415	sub tstregex_get_captures { return $_[0]->{captures} // [];}
441	0			0	1	0	sub tstregex_get_match_portion { return $_[0]->{match_portion}; }
442	0			0	0	0	sub tstregex_get_info { return $_[0]->{warning}; }
443
444							# Main diagnostic function
445	15			15		69	use constant;
	15					17
	15					1237
446							use constant
447							{
448	15					29491	C_EMPTY => '',
449							RE_EMPTY => qr/\0/,
450							ASCII_LPAREN => ord('('), # 40
451							ASCII_RPAREN => ord(')'), # 41
452							ASCII_LBRACE => ord('{'), # 123
453							ASCII_RBRACE => ord('}'), # 125
454							ASCII_LBRACK => ord('['), # 91
455							ASCII_RBRACK => ord(']'), # 93
456							ASCII_LT => ord('<'), # 60
457							ASCII_GT => ord('>'), # 62
458	15			15		86	};
	15					44
459
460							sub tstregex
461							{
462	115			115	1	383	my ($ctx, $pattern) = @_;
463							# FIX: re init start-state fields in case of multiple test patterns
464	115					213	$ctx -> {'fail_token'} = Tstregex::C_EMPTY;
465	115					132	$ctx -> {'match_portion'} = undef;
466	115					132	$ctx -> {'right_unmatch'} = undef;
467	115					141	$ctx -> {'match_len'} = 0;
468	115					113	$ctx -> {'full_match'} = 0;
469	115					127	$ctx -> {'left_unmatch'} = undef;
470	115					133	my $re_raw = $ctx->{re_raw};
471	115					156	my $org_pat = $pattern;
472	115					137	my $internal_offset = 0;
473
474							# Handle prefix offset if pattern is wrapped like the RE
475	115	100	66			241	if ($ctx->{prefix_offset} > 0 && length($pattern) >= $ctx->{prefix_offset} + 1)
476							{
477	1					2	my $re_delim_char = _stringat($re_raw, $ctx->{prefix_offset} - 1);
478	1					3	my $pat_first_char = _stringat($pattern, 0);
479	1	50				2	if ($pat_first_char == $re_delim_char)
480							{
481	1					2	$pattern = substr($org_pat, $ctx->{prefix_offset});
482	1					3	chop $pattern;
483	1					1	$internal_offset = $ctx->{prefix_offset};
484							}
485							}
486
487							# Fast track: check if it matches globally first
488	115	100				592	if ($pattern =~ $ctx->{re_compiled})
489							{
490	68					86	$ctx->{full_match} = 1;
491	68					85	$ctx->{match_len} = length($org_pat);
492	68					129	$ctx->{match_portion} = $&; # the exact sub portion that matched
493	68					135	$ctx->{left_unmatch} = $`; # the left part of matching sub part
494	68					104	$ctx->{right_unmatch} = $'; # the right part of matching sub part
495
496							# --- Capture Groups Extraction ---
497							# We populate the captures array only on a successful global match
498	68					85	my @caps;
499
500							# The special variable $#- contains the number of capture groups
501							# We start at 1 because $0 is the whole match
502	68					230	for my $i (1 .. $#-)
503							{
504	22	100				58	if (defined $-[$i])
505							{
506							# Extracting the substring using offsets from @- and @+
507	21					84	push @caps, substr($pattern, $-[$i], $+[$i] - $-[$i]);
508							}
509							else
510							{
511							# Optional group that participated but didn't catch text
512	1					2	push @caps, undef;
513							}
514							}
515	68					129	$ctx->{captures} = \@caps;
516
517	68					155	return 1;
518							}
519
520							# Nibbling phase: find the longest matching lexical group
521	47					119	my $match_reg = Tstregex::C_EMPTY;
522	47					54	foreach my $step (@{$ctx->{steps}})
	47					123
523							{
524	114	100				965	if ($pattern =~ qr/$step/)
525							{
526	35					42	$match_reg = $step;
527	35					58	last;
528							}
529							}
530
531							# SENSITIVE
532							# Fine-tune the match length character by character
533							# Append a \z to avoid Perl to skip final \n
534							# if Nibbling failed ($match_reg empty), get the full regex for fine-tuning.
535	47	100				112	my $target_re = ($match_reg ne Tstregex::C_EMPTY) ? $match_reg : $ctx->{re_clean};
536							# critical! add starting anchor (\A) to force coherency check from the first char..
537	47					112	my ($match_work, $warn) = _safe_qr("\\A$target_re\\z"); # (qr/\A$target_re\z/, undef); #;
538	47					116	for (my $i = length($pattern); $i >= 0; $i--)
539							{
540							# check if current prefix is valid according to the target
541	117	100				308	last if ($pattern =~ $match_work);
542	83					152	chop $pattern;
543							}
544	47					105	$ctx->{match_len} = length($pattern) + $internal_offset;
545
546							# Identify the failing token for display
547	47	50				61	my $tail_re = (scalar @{$ctx->{steps}}) ? $ctx->{steps}->[0] : $ctx->{re_clean};
	47					114
548	47					88	my $remaining_re = substr($tail_re, length($match_reg));
549
550	47	50				82	if ($remaining_re ne Tstregex::C_EMPTY)
551							{
552							# get the first token for analyse
553	47					100	my $next_tokens = _get_lex_groups($remaining_re);
554	47		50			94	my $first_token = $next_tokens->[0] // Tstregex::C_EMPTY;
555	47					69	$ctx->{fail_token} = $remaining_re;
556	47	100				145	$ctx->{fail_token} = $first_token if ($first_token =~ /^(\\b\|\^\|\$)$/); # Anchor case (0 width): want detail (just \b, ^ or $)
557							}
558							else
559							{
560	0					0	$ctx->{fail_token} = Tstregex::C_EMPTY;
561							}
562
563							# Ensure captures is empty/undef on failure
564	47					93	$ctx->{captures} = [];
565	47	50				165	return $ctx->{match_undef}? undef:0;
566							}
567
568							# Context initialization and RE peeling
569							sub tstregex_init_desc
570							{
571	115			115	1	2160713	my ($re_raw) = @_;
572							# The Shield: Catching the 5.28 deprecation warnings and fatal errors
573							# We use a localized __WARN__ handler to catch the "Unescaped left brace"
574							# even if it's not a fatal error yet in 5.28
575
576	115					256	my ($re_compiled, $re_clean, $prefix_off, $last_warning) = _unwrap_regex($re_raw);
577	115	50				306	my $match_undef = $re_compiled eq RE_EMPTY ? 1:0;
578
579	115					239	my $steps = _parse_lex_groups($re_clean);
580							# {
581							# no warnings 'experimental::re_strict';
582							# use re 'strict';
583							# $steps = _parse_lex_groups($re_clean);
584							# };
585							return
586							{
587	115					910	re_raw => $re_raw, re_compiled => $re_compiled, re_clean => $re_clean,
588							steps => $steps, prefix_offset => $prefix_off, match_len => 0,
589							fail_token => Tstregex::C_EMPTY, full_match => 0,match_portion => undef,
590							match_undef => $match_undef, left_unmatch => undef, right_unmatch => undef,
591							warning => $last_warning,
592							};
593							}
594
595							# Helper: get char code at position
596	2			2		4	sub _stringat($$) { return vec($_[0], $_[1], 8); }
597
598							sub _unwrap_regex
599							{
600	115			115		168	my ($raw) = @_;
601	115	50	33			532	return (qr//, Tstregex::C_EMPTY, 0) if !defined $raw \|\| $raw eq Tstregex::C_EMPTY;
602
603	115					163	my $raw_org = $raw;
604	115					132	my $options = Tstregex::C_EMPTY;
605
606							# 1. Extract trailing options (ismxg)
607	115					474	while ($raw =~ s/([ismxg])$//)
608							{
609	0					0	$options = $1 . $options;
610							}
611
612							# 2. Delegate peeling to _strop
613	115					208	my $clean = _strop($raw);
614
615							# 3. Automatic offset calculation (locate the "juice" within the original string)
616	115					260	my $off = index($raw_org, $clean);
617	115	50				252	$off = 0 if $off < 0;
618
619							# 4. Secure Forge (Remove 'g' as it is irrelevant for qr//)
620	115					170	$options =~ tr/g//d;
621	115	50				242	my $re_str = $options ? "(?$options)$clean" : $clean;
622
623	115					216	my ($re_ret, $warn) = _safe_qr($re_str);
624
625	115					317	return ($re_ret, $clean, $off, $warn);
626							}
627
628							sub _safe_qr
629							{
630	162			162		234	my ($re_str) = @_;
631	162					191	my ($re, $err);
632							{
633	162					168	local $@;
	162					163
634	162			1		1242	local $SIG{__DIE__} = local $SIG{__WARN__} = sub { };
635							# dont catch the warning there, let the eval fail instead and get the message back in $@
636	162					262	$re = eval { qr/$re_str/ };
	162					2366
637	162		50			967	$err = $@ // '';
638							}
639	162		50			454	return ($re // RE_EMPTY, $err);
640							}
641
642							# _strop: strip operators
643							# Peels Perl operators (m!!, m{}, //) by checking extremities.
644							# It ensures only the core regex juice is returned.
645							sub _strop
646							{
647	115			115		162	my ($raw) = @_;
648	115	50	33			374	return $raw if !defined $raw \|\| $raw eq Tstregex::C_EMPTY;
649
650							# Remove leading/trailing whitespace
651	115					422	$raw =~ s/^\s+\|\s+$//g;
652
653							# Identify opening delimiter (after an optional 'm')
654	115	100				528	if ($raw =~ /^((?:m\s)?)([^\w\s])(.)$/s)
655							{
656	50					135	my $prefix = $1;
657	50					123	my $open = $2;
658	50					91	my $body = $3;
659
660							# Map symmetric pairs using ASCII constants
661	50					242	my %sym_or_eq =
662							(
663							chr(ASCII_LBRACE) => chr(ASCII_RBRACE),
664							chr(ASCII_LBRACK) => chr(ASCII_RBRACK),
665							chr(ASCII_LPAREN) => chr(ASCII_RPAREN),
666							chr(ASCII_LT) => chr(ASCII_GT),
667							);
668
669							# Expected close is either the matching pair or the same character (e.g., m!!)
670	50		66			203	my $expected_close = $sym_or_eq{$open} \|\| $open;
671
672							# PROTECT CAPTURE GROUPS:
673							# If the delimiter is a parenthesis but there is no 'm' prefix,
674							# it is a capturing group, NOT an operator. Do not peel!
675	50	100	66			168	if ($open eq chr(ASCII_LPAREN) && !$prefix)
676							{
677	26					74	return $raw;
678							}
679
680							# Check if the very last character matches our expected closing delimiter
681	24	100				67	if (substr($body, -1) eq $expected_close)
682							{
683	1					5	return substr($body, 0, -1);
684							}
685
686							# If 'm' was present but closing failed, return body (best effort)
687	23	50				99	return $body if $prefix;
688							}
689
690	88					180	return $raw;
691							}
692
693							sub _parse_lex_groups
694							{
695	115			115		225	my ($regex) = @_;
696	115					205	my $tokens = _get_lex_groups($regex);
697	115					140	my @results;
698	115					255	my $current = join(Tstregex::C_EMPTY, @$tokens);
699
700	115					244	while (@$tokens)
701							{
702	412					717	my $opens = () = $current =~ /(?
703	412					533	my $closes = () = $current =~ /(?
704	412	50				548	if ($opens >= $closes)
705							{
706	412					592	my $v = $current . (')' x ($opens - $closes));
707	412					512	$v =~ s/(?
708	412	100				406	if (eval { qr/$v/ }) { push @results, $v; }
	412					4244
	409					589
709							}
710	412					566	my $last = pop @$tokens;
711	412	50				922	substr($current, -length($last)) = Tstregex::C_EMPTY if defined $last;
712							}
713	115					205	return \@results;
714							}
715
716							# Lexical tokenizer for Perl Regex
717							sub _get_lex_groups
718							{
719	162			162		209	my ($regex) = @_;
720	162					181	my @groups;
721
722							# --- START: class mismatch support (Added [.*?] to tokenizer) ---
723							# my $re = qr/($\?\#.?$)\|($\?[:=!<>]+)\|(\{\d+,?\d\})\|(\[.?\])\|(\\.)\|([\($\\|^\$\+\\?])\|(.)/x;
724							# --- ENHANCED: Atomic Lookaround & Recursive Group Support ---
725							# Group 1: Comments (?#...)
726							# Group 2: Assertions and Groups (?=, (?:, (?<, etc. including nested parens
727							# Group 3: Quantifiers {n,m}
728							# Group 4: Character classes [...]
729							# Group 5: Escaped characters \.
730							# Group 6: Metacharacters ( ) \| ^ $ + * ?
731							# Group 7: Any other character
732							# --- END: class mismatch support ---
733	162					302	my $re = qr/
734							($\?\#.*?$)
735							\| ( # START GROUP 2
736							\(\?[:=!<>]+ # Assertion header
737							(?: # Content
738							(?> [^()]+ ) # Non-paren characters (atomic)
739							\|
740							(?2) # Recursive call to Group 2
741							)*
742							\) # Matching closing paren
743							) # END GROUP 2
744							\| (\{\d+,?\d*\})
745							\| (\[.*?\])
746							\| (\\.)
747							\| ([\\|^\$\+\*\?])
748							\| (.)
749							/x;
750
751	162					1002	while ($regex =~ /$re/g)
752							{
753	561		100			3587	my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
			100
			100
			100
			100
			66
754	561	100	66			1983	if (defined $t && $t =~ /^[\+\*\?]$\|^\{\d/ && @groups && $groups[-1] !~ /^[\\|]$/)
			66
			100
755							{
756	71					208	$groups[-1] .= $t;
757							}
758	490					1755	else { push @groups, $t; }
759							}
760
761							# TODO: test that optimized code fragment instead; Much time spent here, but sensitive part..
762							# my $quantifiers = '+*?{';
763							# while ($regex =~ /$re/g)
764							# {
765							# my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
766							# if (defined $t && @groups)
767							# {
768							# my $char = substr($t, 0, 1);
769							# if (index($quantifiers, $char) != -1 && $groups[-1] !~ /^[\\|]$/)
770							# {
771							# $groups[-1] .= $t;
772							# next;
773							# }
774							# }
775							# push @groups, $t;
776							# }
777
778	162					406	return \@groups;
779							}
780
781							}
782
783							1;
784
785							package main;
786
787							__DATA__