File Coverage

blib/lib/Tstregex.pm

Criterion	Covered	Total	%
statement	168	298	56.3
branch	37	102	36.2
condition	36	80	45.0
subroutine	28	40	70.0
pod	9	11	81.8
total	278	531	52.3

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							###############################################
4							# Author: Olivier Delouya - 2026
5							# File: Tstregex.pm (Hybrid Modulino)
6							# Content: Regex longest match diagnostic
7							# indent: Whitesmith (perltidy -bl -bli)
8							###############################################
9
10	15			15		1560875	use strict;
	15					21
	15					472
11	15			15		53	use warnings;
	15					67
	15					1140
12
13							=head1 NAME
14
15							Tstregex - A Diagnostic Tool that quickly shows the longest Regular Expression string match, highlighting the rejected part. The terminal command
16							tstregex '/^[a-z]\d{3}$/' 'abc12a' shows: abcE<#x1D5FE>E<#x1D5FF>E<#x1D5EE> (^[a-z]\E<#x1D5ED>{E<#x1D5FC>}{E<#x1D5FD>}$)
17
18							# Above, the normal parts are the longuest matching substring when bold parts highlights the rejected substring
19							#(idem with regexp lexical groups between parenthesis)
20							# A Hybrid Regex Diagnostic Tool (single file Library module with API and command tool)
21
22							=cut
23
24							=head1 SYNOPSIS
25
26							C<$tstregex 'regex' string1 string2 ... stringN>
27
28							=head1 OPTIONS (CLI)
29
30							=head2 -h --help
31
32							show that help..
33
34							=head2 -v --verbose
35
36							shows key info on (un)matching..
37
38							=head2 -d --diag
39
40							=over
41
42							=item Triggers the Enriched Diagnostic View. It displays:
43
44							=item - The string with the failing part highlighted.
45
46							=item - The exact token in the regex that caused the break.
47
48							=item - A visual pointer (C<^--- HERE>) aligned with the regex syntax.
49
50							=item - Execution time (useful for spotting ReDoS/Exponential backtracking).
51
52							=back
53
54							=head2 -a --assert
55
56							Misc: performs a huge test suite various a large collection of regexp tests with Tstregex..
57
58							=head1 Perl Module SYNOPSIS
59
60							use Tstregex;
61							my $ctx = tstregex_init_desc('/^\d{3}/');
62							tstregex($ctx, '12a');
63							if (!tstregex_is_full_match($res))
64							{
65							my $token = tstregex_get_fail_token($res);
66							my $pos = tstregex_get_match_len($res);
67							print "Failure on token '$token' at column $pos\n";
68							}
69							Note that if your purpose is just to display the result as the command would do, you can call directly
70							the main with the appropriate (argc, argv) style parameters to avoid to spawn another perl process.
71							See the tstregex file stub for details..
72
73							=head1 API
74
75							=head2 tstregex_init_desc($raw_re)
76
77							Pre-parses the regex, handles delimiters (m!!, //, etc.), extracts modifiers (i, s, m, x), and prepares the nibbling steps. Returns a context hash.
78
79							=head2 tstregex($ctx, $string)
80
81							Executes the diagnostic. Updates the context.
82
83							=head2 tstregex_is_full_match
84
85							Returns match status of input string (BOOL 0 OR 1)
86
87							=head2 tstregex_get_match_portion
88
89							Returns the matching portion in case of full match
90							(might be smaller than input string, depending on anchors..)
91
92							=head2 tstregex_get_match_len
93
94							Returns the matching substring length
95
96							=head2 tstregex_get_fail_token
97
98							Returns the failing token in the regexp
99
100							=head2 tstregex_get_re_clean
101
102							Returns the matching regexp subpart
103
104							=head2 tstregex_get_re_raw
105
106							Returns the internal representation of the regexp
107
108							=head2 tstregex_get_prefix_offset
109
110							Returns the offset of the original regexp in the raw regexp
111
112							=head1 DESCRIPTION
113
114							=over
115
116							=item B is designed to solve the "Black Box" problem of Regular Expressions.
117
118							=item When a complex regex fails, Perl usually just says "No Match".
119
120							=item This tool identifies exactly B and B it failed by finding the longest possible partial match.
121
122							=back
123
124							=head1 EXAMPLE
125
126							=over
127
128							=item $ perl lib/Tstregex.pm '/^[a-z]*\d{3}$/' 'abc123' 'abc12a'
129
130							=item abc123
131
132							=item abcB<12a> (^[a-z]*B<\d{3}$>)
133
134							=back
135
136							I
137
138							=head2 The "Nibbling" Engine
139
140							The diagnostic logic uses a "Nibbling" (grignotage) strategy:
141
142							=over 4
143
144							=item 1. Decomposition
145
146							The engine breaks down your regex into a hierarchy of valid sub-patterns (lexical groups, atoms, and quantifiers) from longest to shortest.
147
148							=item 2. Longest Match Search
149
150							=back
151
152							=over 4
153
154							It iteratively tests these sub-patterns against the input string.
155
156							It's not just checking if the start matches, but what is the I sequence of instructions the engine could follow before hitting a wall.
157
158							=back
159
160							=over 4
161
162							=item 3. Failure Point Identification
163
164							Once the longest matching sub-pattern is found, the tool identifies the very next token in your regex syntax. This is your "Point of Failure".
165
166							=back
167
168							=head1 AUTHOR
169
170							Olivier Delouya - 2026
171
172							=head1 LICENSE
173
174							Artistic Version 2
175
176							=head1 SEE ALSO
177
178							This tool belongs to the family of Perl regex development utilities. It is
179							intended for developers looking for a lightweight perl regex debugger,
180							providing automated regex diagnostics when a regular expression fails
181							or requires deep troubleshooting in the CLI terminal.
182
183							=head1 DESCRIPTION
184
185							Tstregex is a CLI tool designed for Perl regex debugging and diagnostics.
186							If you need to debug a Perl regular expression, find where a regex fails,
187							or pinpoint the exact character that broke your pattern, this module
188							automates the process.
189
190							=head1 KEYWORDS
191
192							perl regex debugger, debug regular expression perl, regex failure pinpoint,
193							match longest regex string, regular expression diagnostic tool,
194							perl regex troubleshooting.
195
196							=cut
197
198
199							package main;
200							{
201	15			15		77	use strict;
	15					34
	15					253
202	15			15		50	use warnings;
	15					66
	15					645
203	15			15		67	use Carp qw(confess);
	15					19
	15					1030
204							$SIG{__WARN__} = 'confess';
205							$SIG{__DIE__ } = 'confess';
206
207	15			15		7609	use IO::Handle;
	15					84945
	15					740
208	15			15		8467	use Term::ANSIColor qw(:constants);
	15					125381
	15					14166
209	15			15		149	use Time::HiRes qw(gettimeofday tv_interval);
	15					45
	15					120
210	15			15		6987	use utf8;
	15					3186
	15					121
211
212							# --- Constants & ASCII Codes ---
213							use constant
214							{
215	15					4554	C_m => 109, C_g => 103, C_i => 105, C_s => 115,
216							C_x => 120, C_SLASH => 47, C_SPACE => 32, C_ZERO => 48,
217							C_NINE => 57, C_UP_A => 65, C_UP_Z => 90, C_LOW_A => 97,
218							C_LOW_Z => 122, C_UNDSC => 95,
219							OP_PAR => 40, CL_PAR => 41, OP_BRK => 91, CL_BRK => 93,
220							OP_BRC => 123, CL_BRC => 125, OP_ANG => 60, CL_ANG => 62,
221							ESC => "\e",
222							CUU => "\e[A", # Cursor Up
223							UI_OFFSET=> 11, # Alignment offset for " Syntax: "
224	15			15		1149	};
	15					21
225
226							# --- ENCAPSULATED DEBUG ALIAS ---
227							BEGIN
228							{
229	15	50		15		22349	if ($INC{'perl5db.pl'})
230							{
231	0					0	require Data::Dumper;
232	0					0	require Term::ANSIColor;
233	15			15		87	no strict 'refs';
	15					21
	15					486
234	15			15		55	no warnings 'once';
	15					19
	15					3909
235
236							my $debug_sub = sub
237							{
238	0					0	my @args = @_;
239
240							# Automatically detect flattened hashes:
241							# If even number of arguments and the first one isn't a reference
242	0	0	0			0	if (scalar @args > 1 && scalar @args % 2 == 0 && !ref($args[0]))
			0
243							{
244							# Wrap the flattened list into a temporary hashref
245	0					0	my %tmp_hash = @args;
246	0					0	@args = (\%tmp_hash);
247							}
248
249	0					0	print "\n", Term::ANSIColor::BOLD(), Term::ANSIColor::BLUE(),
250							'DEBUG (tstregex): ', Term::ANSIColor::RESET(),
251							Data::Dumper::Dumper(@args);
252	0					0	};
253
254							# Force injection into all relevant namespaces
255	0					0	foreach my $pkg ('main', 'Tstregex', 'DB')
256							{
257	0					0	*{"${pkg}::d"} = $debug_sub;
	0					0
258							}
259
260	0	0				0	my $cuu = defined &main::CUU ? main::CUU() : "\e[A";
261	0					0	print $cuu, Term::ANSIColor::BOLD(), Term::ANSIColor::CYAN(),
262							'INFO: ', Term::ANSIColor::RESET(),
263							"Alias 'd' ready (Auto-hash detection enabled)\n\n";
264							}
265							}
266
267							exit(main(scalar(@ARGV), \@ARGV)) if(!caller);
268
269							sub main
270							{
271	0			0		0	my ($argc, $argv) = @_;
272	0	0	0			0	if(!$argc \|\| ($argc && $$argv[0] =~ /^-h\|--help$/)) { help(); exit(0); }
	0		0			0
	0					0
273
274	0					0	binmode STDOUT, ':utf8';
275	0					0	my ($mode_diag, $verbose, $assert) = (0)x2;
276	0					0	for(my $i=0; $i<$argc; $i++)
277							{
278	0	0	0			0	do { $mode_diag = 1; undef $$argv[$i]; next } if (!$mode_diag && $$argv[$i] =~ /^-d\|--diag$/);
	0					0
	0					0
	0					0
279	0	0	0			0	do { $verbose = 1; undef $$argv[$i]; next } if (!$verbose && $$argv[$i] =~ /^-v\|--verbose$/);
	0					0
	0					0
	0					0
280	0	0	0			0	do { $assert = 1; undef $$argv[$i]; next } if (!$assert && $$argv[$i] =~ /^-a\|--assert$/);
	0					0
	0					0
	0					0
281							}
282
283	0	0				0	if ($assert)
284							{
285	0					0	print BOLD, BLUE, "--- Internal Test Suite (DATA Section) ---\n", RESET;
286	0					0	_run_internal_tests($mode_diag, $verbose);
287	0					0	exit(0);
288							}
289
290	0					0	my @new_argv;
291	0					0	foreach(@$argv)
292							{
293	0	0				0	push @new_argv, $_ if(defined($_));
294							}
295	0					0	$argv = \@new_argv;
296	0					0	$argc = scalar @$argv;
297
298	0					0	my $re_raw = shift @{$argv};
	0					0
299	0					0	my $ctx = Tstregex::tstregex_init_desc($re_raw);
300	0					0	my $global_result = 0; # success! BE POSITIVE !!
301	0					0	foreach my $pattern (@{$argv})
	0					0
302							{
303	0	0				0	my $t0 = [gettimeofday] if $mode_diag;
304	0					0	my $result = Tstregex::tstregex($ctx, $pattern);
305	0	0				0	$global_result = 1 if($result);
306	0	0				0	$mode_diag ? _display_enriched($pattern, $ctx, tv_interval($t0))
307							: _display_standard($pattern, $ctx);
308	0	0				0	if($verbose)
309							{
310	0	0				0	print $result? 'Match':'UNmatch', '! Match length: ', Tstregex::tstregex_get_match_len($ctx), '; ';
311	0	0				0	print $result? ('Match portion: ', Term::ANSIColor::UNDERLINE(), Tstregex::tstregex_get_match_portion($ctx))
312							: ('Fail token: ', Tstregex::tstregex_get_fail_token($ctx));
313	0					0	print Term::ANSIColor::RESET(), "\n";
314	0	0	0			0	print $ctx->{warning} if($verbose && $ctx->{warning} ne '');
315							}
316							}
317	0					0	return $global_result;
318							}
319
320							sub help
321							{
322	0			0		0	print BOLD, WHITE, "Tstregex.pm - Longest match Regular Expression Diagnostic Tool (2026 - PerlOD)\n", RESET;
323	0					0	print "Usage:\n";
324	0					0	print " perl Tstregex.pm [options] 'regex' 'string1' ['string2' ...]\n\n";
325	0					0	print "Examples:\n";
326	0					0	print " perl Tstregex.pm '([0-3][0-9])/[0-1][0-9]/\\d{4}' '21/72/1985'\n";
327	0					0	print ' 21/', BOLD, '72/1985', RESET, ' ([0-3][0-9]/', BOLD, '[0-1][0-9]/\d{4}', RESET, ")\n\n";
328	0					0	print BOLD, 'DELIMITERS ', RESET, "are optional\n";
329	0					0	print " Supported: /.../, m!...!, m{...}. Modifiers (/i, /x, /s...) and captures are supported.\n\n";
330	0					0	print "Options:\n";
331	0					0	print "-h --help Shows that help\n";
332	0					0	print "-v --verbose Shows keys info on match/unmatch\n";
333	0					0	print "-d --diag Enriched diagnostic with timing and syntax pointers\n";
334	0					0	print "-a --assert Misc: shows a large test of regexp against tstregex..\n";
335							}
336
337							sub _run_internal_tests
338							{
339	0			0		0	my ($mode_diag, $verbose) = @_;
340	0					0	my $fh = \*main::DATA;
341	0					0	seek($fh, 0, 0);
342
343	0					0	my $found_data_token = 0;
344
345	0					0	while (<$fh>)
346							{
347	0					0	chomp;
348							# PHASE 1: Skip everything until we hit the __DATA__ or __END__ marker
349							# This prevents the script from parsing its own source code
350	0	0				0	if (!$found_data_token)
351							{
352	0	0				0	$found_data_token = 1 if /^__(DATA\|END)__/;
353	0					0	next;
354							}
355							else
356							{
357	0	0				0	last if /^__(DATA\|END)__/;
358							}
359	0	0	0			0	next if /^\s*$/ \|\| /^#/;
360	0					0	my ($re, @rest) = split(/\s+\|:::\s*/, $_);
361	0	0				0	next unless $re;
362	0	0				0	my @strings = grep { $_ ne '0' && $_ ne '1' } @rest;
	0					0
363
364	0					0	print BOLD, YELLOW, 'Testing Regex: ', RESET, "$re\n";
365	0					0	my $ctx = Tstregex::tstregex_init_desc($re);
366	0	0	0			0	print 'Warning ', $ctx->{warning}, "\n" if($verbose && $ctx->{warning} ne '');
367	0					0	foreach my $s (@strings)
368							{
369	0					0	my $t0;
370	0	0				0	$t0 = [gettimeofday] if $mode_diag;
371	0					0	Tstregex::tstregex($ctx, $s);
372	0	0				0	$mode_diag ? _display_enriched($s, $ctx, tv_interval($t0))
373							: _display_standard($s, $ctx);
374							}
375	0					0	print '-' x 40, "\n";
376							}
377							}
378
379							sub _get_re_val
380							{
381							return (
382	0			0		0	Tstregex::tstregex_get_match_len ($_[0]),
383							Tstregex::tstregex_get_fail_token($_[0]),
384							Tstregex::tstregex_get_re_clean ($_[0])
385							);
386							}
387
388							sub _display_standard
389							{
390	0			0		0	my ($pattern, $ctx) = @_;
391	0	0				0	if (Tstregex::tstregex_is_full_match($ctx))
392							{
393	0					0	print "$pattern\n";
394	0					0	return undef;
395							}
396	0					0	my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
397	0					0	print substr($pattern, 0, $match_len); STDOUT->flush(); print STDERR BOLD, substr($pattern, $match_len), RESET;
	0					0
	0					0
398	0					0	my $off = length($re_clean) - length($fail_token);
399	0					0	print STDERR ' (', substr($re_clean, 0, $off), BOLD, $fail_token, RESET, ")\n";
400	0					0	undef;
401							}
402
403							sub _display_enriched
404							{
405	0			0		0	my ($pattern, $ctx, $elapsed) = @_;
406	0					0	print BOLD, MAGENTA, '--- Diagnostic View ---', RESET, "\n";
407
408	0	0				0	if (Tstregex::tstregex_is_full_match($ctx))
409							{
410	0					0	print GREEN, ' Result: ', RESET, "$pattern (FULL MATCH)\n";
411	0					0	return undef;
412							}
413	0					0	my ($match_len, $fail_token, $re_clean ) = _get_re_val($ctx);
414	0					0	my $prefix_off = Tstregex::tstregex_get_prefix_offset($ctx);
415	0					0	my $re_raw = Tstregex::tstregex_get_re_raw ($ctx);
416
417	0					0	print YELLOW, ' Result: ', RESET, substr($pattern, 0, $match_len),
418							BOLD, WHITE, substr($pattern, $match_len), RESET;
419	0					0	print ' (at ', CYAN, $fail_token, RESET, ")\n";
420
421	0					0	my $err_pos_in_clean = length($re_clean) - length($fail_token);
422	0					0	my $final_pointer_pos = $prefix_off + $err_pos_in_clean;
423
424	0					0	print ' Syntax: ', WHITE, $re_raw, RESET, "\n";
425	0					0	print ' ', ' ' x $final_pointer_pos, BOLD, RED, '^--- HERE', RESET, "\n";
426	0					0	printf " Time: %.4fs\n\n", $elapsed;
427	0					0	undef;
428							}
429
430							}
431
432							1;
433
434							package Tstregex;
435							{
436							our $VERSION = '1.13';
437	15			15		107	use Exporter qw(import);
	15					19
	15					3362
438
439							our @EXPORT = qw(
440							tstregex
441							tstregex_init_desc
442							tstregex_get_match_len
443							tstregex_get_fail_token
444							tstregex_is_full_match
445							tstregex_get_re_clean
446							tstregex_get_prefix_offset
447							tstregex_get_re_raw
448							tstregex_get_match_portion
449							tstregex_get_info
450							);
451
452							# --- PUBLIC API (The Getters) ---
453
454	115			115	1	367	sub tstregex_get_match_len { return $_[0]->{match_len}; }
455	115			115	1	367	sub tstregex_get_fail_token { return $_[0]->{fail_token}; }
456	0			0	1	0	sub tstregex_is_full_match { return $_[0]->{full_match}; }
457	0			0	1	0	sub tstregex_get_re_clean { return $_[0]->{re_clean}; }
458	0			0	1	0	sub tstregex_get_prefix_offset { return $_[0]->{prefix_offset}; }
459	0			0	1	0	sub tstregex_get_re_raw { return $_[0]->{re_raw}; }
460	115		50	115	0	427	sub tstregex_get_captures { return $_[0]->{captures} // [];}
461	0			0	1	0	sub tstregex_get_match_portion { return $_[0]->{match_portion}; }
462	0			0	0	0	sub tstregex_get_info { return $_[0]->{warning}; }
463
464							# Main diagnostic function
465	15			15		78	use constant;
	15					20
	15					1268
466							use constant
467							{
468	15					30878	C_EMPTY => '',
469							RE_EMPTY => qr/\0/,
470							ASCII_LPAREN => ord('('), # 40
471							ASCII_RPAREN => ord(')'), # 41
472							ASCII_LBRACE => ord('{'), # 123
473							ASCII_RBRACE => ord('}'), # 125
474							ASCII_LBRACK => ord('['), # 91
475							ASCII_RBRACK => ord(']'), # 93
476							ASCII_LT => ord('<'), # 60
477							ASCII_GT => ord('>'), # 62
478	15			15		116	};
	15					32
479
480							sub tstregex
481							{
482	115			115	1	422	my ($ctx, $pattern) = @_;
483							# FIX: re init start-state fields in case of multiple test patterns
484	115					188	$ctx -> {'fail_token'} = Tstregex::C_EMPTY;
485	115					134	$ctx -> {'match_portion'} = undef;
486	115					124	$ctx -> {'right_unmatch'} = undef;
487	115					126	$ctx -> {'match_len'} = 0;
488	115					145	$ctx -> {'full_match'} = 0;
489	115					124	$ctx -> {'left_unmatch'} = undef;
490	115					133	my $re_raw = $ctx->{re_raw};
491	115					125	my $org_pat = $pattern;
492	115					120	my $internal_offset = 0;
493
494							# Handle prefix offset if pattern is wrapped like the RE
495	115	100	66			254	if ($ctx->{prefix_offset} > 0 && length($pattern) >= $ctx->{prefix_offset} + 1)
496							{
497	1					6	my $re_delim_char = _stringat($re_raw, $ctx->{prefix_offset} - 1);
498	1					3	my $pat_first_char = _stringat($pattern, 0);
499	1	50				4	if ($pat_first_char == $re_delim_char)
500							{
501	1					3	$pattern = substr($org_pat, $ctx->{prefix_offset});
502	1					4	chop $pattern;
503	1					3	$internal_offset = $ctx->{prefix_offset};
504							}
505							}
506
507							# Fast track: check if it matches globally first
508	115	100				526	if ($pattern =~ $ctx->{re_compiled})
509							{
510	68					91	$ctx->{full_match} = 1;
511	68					116	$ctx->{match_len} = length($org_pat);
512	68					176	$ctx->{match_portion} = $&; # the exact sub portion that matched
513	68					136	$ctx->{left_unmatch} = $`; # the left part of matching sub part
514	68					143	$ctx->{right_unmatch} = $'; # the right part of matching sub part
515
516							# --- Capture Groups Extraction ---
517							# We populate the captures array only on a successful global match
518	68					96	my @caps;
519
520							# The special variable $#- contains the number of capture groups
521							# We start at 1 because $0 is the whole match
522	68					217	for my $i (1 .. $#-)
523							{
524	22	100				63	if (defined $-[$i])
525							{
526							# Extracting the substring using offsets from @- and @+
527	21					90	push @caps, substr($pattern, $-[$i], $+[$i] - $-[$i]);
528							}
529							else
530							{
531							# Optional group that participated but didn't catch text
532	1					2	push @caps, undef;
533							}
534							}
535	68					113	$ctx->{captures} = \@caps;
536
537	68					159	return 1;
538							}
539
540							# Nibbling phase: find the longest matching lexical group
541	47					75	my $match_reg = Tstregex::C_EMPTY;
542	47					65	foreach my $step (@{$ctx->{steps}})
	47					112
543							{
544	114	100				1016	if ($pattern =~ qr/$step/)
545							{
546	35					47	$match_reg = $step;
547	35					51	last;
548							}
549							}
550
551							# SENSITIVE
552							# Fine-tune the match length character by character
553							# Append a \z to avoid Perl to skip final \n
554							# if Nibbling failed ($match_reg empty), get the full regex for fine-tuning.
555	47	100				132	my $target_re = ($match_reg ne Tstregex::C_EMPTY) ? $match_reg : $ctx->{re_clean};
556							# critical! add starting anchor (\A) to force coherency check from the first char..
557	47					149	my ($match_work, $warn) = _safe_qr("\\A$target_re\\z"); # (qr/\A$target_re\z/, undef); #;
558	47					147	for (my $i = length($pattern); $i >= 0; $i--)
559							{
560							# check if current prefix is valid according to the target
561	117	100				317	last if ($pattern =~ $match_work);
562	83					139	chop $pattern;
563							}
564	47					96	$ctx->{match_len} = length($pattern) + $internal_offset;
565
566							# Identify the failing token for display
567	47	50				45	my $tail_re = (scalar @{$ctx->{steps}}) ? $ctx->{steps}->[0] : $ctx->{re_clean};
	47					124
568	47					110	my $remaining_re = substr($tail_re, length($match_reg));
569
570	47	50				73	if ($remaining_re ne Tstregex::C_EMPTY)
571							{
572							# get the first token for analyse
573	47					75	my $next_tokens = _get_lex_groups($remaining_re);
574	47		50			95	my $first_token = $next_tokens->[0] // Tstregex::C_EMPTY;
575	47					72	$ctx->{fail_token} = $remaining_re;
576	47	100				152	$ctx->{fail_token} = $first_token if ($first_token =~ /^(\\b\|\^\|\$)$/); # Anchor case (0 width): want detail (just \b, ^ or $)
577							}
578							else
579							{
580	0					0	$ctx->{fail_token} = Tstregex::C_EMPTY;
581							}
582
583							# Ensure captures is empty/undef on failure
584	47					83	$ctx->{captures} = [];
585	47	50				146	return $ctx->{match_undef}? undef:0;
586							}
587
588							# Context initialization and RE peeling
589							sub tstregex_init_desc
590							{
591	115			115	1	2231759	my ($re_raw) = @_;
592							# The Shield: Catching the 5.28 deprecation warnings and fatal errors
593							# We use a localized __WARN__ handler to catch the "Unescaped left brace"
594							# even if it's not a fatal error yet in 5.28
595
596	115					228	my ($re_compiled, $re_clean, $prefix_off, $last_warning) = _unwrap_regex($re_raw);
597	115	50				245	my $match_undef = $re_compiled eq RE_EMPTY ? 1:0;
598
599	115					222	my $steps = _parse_lex_groups($re_clean);
600							# {
601							# no warnings 'experimental::re_strict';
602							# use re 'strict';
603							# $steps = _parse_lex_groups($re_clean);
604							# };
605							return
606							{
607	115					943	re_raw => $re_raw, re_compiled => $re_compiled, re_clean => $re_clean,
608							steps => $steps, prefix_offset => $prefix_off, match_len => 0,
609							fail_token => Tstregex::C_EMPTY, full_match => 0,match_portion => undef,
610							match_undef => $match_undef, left_unmatch => undef, right_unmatch => undef,
611							warning => $last_warning,
612							};
613							}
614
615							# Helper: get char code at position
616	2			2		6	sub _stringat($$) { return vec($_[0], $_[1], 8); }
617
618							sub _unwrap_regex
619							{
620	115			115		159	my ($raw) = @_;
621	115	50	33			519	return (qr//, Tstregex::C_EMPTY, 0) if !defined $raw \|\| $raw eq Tstregex::C_EMPTY;
622
623	115					176	my $raw_org = $raw;
624	115					132	my $options = Tstregex::C_EMPTY;
625
626							# 1. Extract trailing options (ismxg)
627	115					404	while ($raw =~ s/([ismxg])$//)
628							{
629	0					0	$options = $1 . $options;
630							}
631
632							# 2. Delegate peeling to _strop
633	115					203	my $clean = _strop($raw);
634
635							# 3. Automatic offset calculation (locate the "juice" within the original string)
636	115					211	my $off = index($raw_org, $clean);
637	115	50				262	$off = 0 if $off < 0;
638
639							# 4. Secure Forge (Remove 'g' as it is irrelevant for qr//)
640	115					155	$options =~ tr/g//d;
641	115	50				207	my $re_str = $options ? "(?$options)$clean" : $clean;
642
643	115					189	my ($re_ret, $warn) = _safe_qr($re_str);
644
645	115					282	return ($re_ret, $clean, $off, $warn);
646							}
647
648							sub _safe_qr
649							{
650	162			162		211	my ($re_str) = @_;
651	162					204	my ($re, $err);
652							{
653	162					141	local $@;
	162					163
654	162			1		1200	local $SIG{__DIE__} = local $SIG{__WARN__} = sub { };
655							# dont catch the warning there, let the eval fail instead and get the message back in $@
656	162					273	$re = eval { qr/$re_str/ };
	162					2316
657	162		50			829	$err = $@ // '';
658							}
659	162		50			472	return ($re // RE_EMPTY, $err);
660							}
661
662							# _strop: strip operators
663							# Peels Perl operators (m!!, m{}, //) by checking extremities.
664							# It ensures only the core regex juice is returned.
665							sub _strop
666							{
667	115			115		151	my ($raw) = @_;
668	115	50	33			385	return $raw if !defined $raw \|\| $raw eq Tstregex::C_EMPTY;
669
670							# Remove leading/trailing whitespace
671	115					394	$raw =~ s/^\s+\|\s+$//g;
672
673							# Identify opening delimiter (after an optional 'm')
674	115	100				547	if ($raw =~ /^((?:m\s)?)([^\w\s])(.)$/s)
675							{
676	50					114	my $prefix = $1;
677	50					75	my $open = $2;
678	50					82	my $body = $3;
679
680							# Map symmetric pairs using ASCII constants
681	50					214	my %sym_or_eq =
682							(
683							chr(ASCII_LBRACE) => chr(ASCII_RBRACE),
684							chr(ASCII_LBRACK) => chr(ASCII_RBRACK),
685							chr(ASCII_LPAREN) => chr(ASCII_RPAREN),
686							chr(ASCII_LT) => chr(ASCII_GT),
687							);
688
689							# Expected close is either the matching pair or the same character (e.g., m!!)
690	50		66			176	my $expected_close = $sym_or_eq{$open} \|\| $open;
691
692							# PROTECT CAPTURE GROUPS:
693							# If the delimiter is a parenthesis but there is no 'm' prefix,
694							# it is a capturing group, NOT an operator. Do not peel!
695	50	100	66			129	if ($open eq chr(ASCII_LPAREN) && !$prefix)
696							{
697	26					71	return $raw;
698							}
699
700							# Check if the very last character matches our expected closing delimiter
701	24	100				70	if (substr($body, -1) eq $expected_close)
702							{
703	1					5	return substr($body, 0, -1);
704							}
705
706							# If 'm' was present but closing failed, return body (best effort)
707	23	50				85	return $body if $prefix;
708							}
709
710	88					141	return $raw;
711							}
712
713							sub _parse_lex_groups
714							{
715	115			115		218	my ($regex) = @_;
716	115					201	my $tokens = _get_lex_groups($regex);
717	115					119	my @results;
718	115					261	my $current = join(Tstregex::C_EMPTY, @$tokens);
719
720	115					194	while (@$tokens)
721							{
722	412					730	my $opens = () = $current =~ /(?
723	412					536	my $closes = () = $current =~ /(?
724	412	50				538	if ($opens >= $closes)
725							{
726	412					566	my $v = $current . (')' x ($opens - $closes));
727	412					479	$v =~ s/(?
728	412	100				377	if (eval { qr/$v/ }) { push @results, $v; }
	412					4197
	409					595
729							}
730	412					553	my $last = pop @$tokens;
731	412	50				934	substr($current, -length($last)) = Tstregex::C_EMPTY if defined $last;
732							}
733	115					229	return \@results;
734							}
735
736							# Lexical tokenizer for Perl Regex
737							sub _get_lex_groups
738							{
739	162			162		228	my ($regex) = @_;
740	162					176	my @groups;
741
742							# --- START: class mismatch support (Added [.*?] to tokenizer) ---
743							# my $re = qr/($\?\#.?$)\|($\?[:=!<>]+)\|(\{\d+,?\d\})\|(\[.?\])\|(\\.)\|([\($\\|^\$\+\\?])\|(.)/x;
744							# --- ENHANCED: Atomic Lookaround & Recursive Group Support ---
745							# Group 1: Comments (?#...)
746							# Group 2: Assertions and Groups (?=, (?:, (?<, etc. including nested parens
747							# Group 3: Quantifiers {n,m}
748							# Group 4: Character classes [...]
749							# Group 5: Escaped characters \.
750							# Group 6: Metacharacters ( ) \| ^ $ + * ?
751							# Group 7: Any other character
752							# --- END: class mismatch support ---
753	162					285	my $re = qr/
754							($\?\#.*?$)
755							\| ( # START GROUP 2
756							\(\?[:=!<>]+ # Assertion header
757							(?: # Content
758							(?> [^()]+ ) # Non-paren characters (atomic)
759							\|
760							(?2) # Recursive call to Group 2
761							)*
762							\) # Matching closing paren
763							) # END GROUP 2
764							\| (\{\d+,?\d*\})
765							\| (\[.*?\])
766							\| (\\.)
767							\| ([\\|^\$\+\*\?])
768							\| (.)
769							/x;
770
771	162					967	while ($regex =~ /$re/g)
772							{
773	561		100			3265	my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
			100
			100
			100
			100
			66
774	561	100	66			1903	if (defined $t && $t =~ /^[\+\*\?]$\|^\{\d/ && @groups && $groups[-1] !~ /^[\\|]$/)
			66
			100
775							{
776	71					250	$groups[-1] .= $t;
777							}
778	490					1795	else { push @groups, $t; }
779							}
780
781							# TODO: test that optimized code fragment instead; Much time spent here, but sensitive part..
782							# my $quantifiers = '+*?{';
783							# while ($regex =~ /$re/g)
784							# {
785							# my $t = $1 // $2 // $3 // $4 // $5 // $6 // $7;
786							# if (defined $t && @groups)
787							# {
788							# my $char = substr($t, 0, 1);
789							# if (index($quantifiers, $char) != -1 && $groups[-1] !~ /^[\\|]$/)
790							# {
791							# $groups[-1] .= $t;
792							# next;
793							# }
794							# }
795							# push @groups, $t;
796							# }
797
798	162					385	return \@groups;
799							}
800
801							}
802
803							1;
804
805							package main;
806
807							__DATA__