File Coverage

blib/lib/Data/Random/String/Matches.pm

Criterion	Covered	Total	%
statement	421	445	94.6
branch	225	268	83.9
condition	77	123	62.6
subroutine	38	38	100.0
pod	10	10	100.0
total	771	884	87.2

line	stmt	bran	cond	sub	pod	time	code
1							package Data::Random::String::Matches;
2
3	32			32		3104840	use 5.010;
	32					174
4
5	32			32		196	use strict;
	32					67
	32					967
6	32			32		185	use warnings;
	32					65
	32					2272
7
8	32			32		220	use Carp qw(carp croak);
	32					76
	32					2500
9	32			32		17784	use Params::Get;
	32					461956
	32					1985
10	32			32		17414	use utf8;
	32					10156
	32					232
11
12							our $VERSION = '0.04';
13
14							=head1 NAME
15
16							Data::Random::String::Matches - Generate random strings matching a regex
17
18							=head1 SYNOPSIS
19
20							use Data::Random::String::Matches;
21
22							# Create a generator with regex and optional length
23							my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/, 7);
24
25							# Generate a matching string
26							my $str = $gen->generate();
27							print $str; # e.g., "XYZ1234"
28
29							# Alternation
30							my $gen2 = Data::Random::String::Matches->new(qr/(cat\|dog\|bird)/);
31							my $animal = $gen2->generate_smart(); # "cat", "dog", or "bird"
32
33							# Backreferences
34							my $gen3 = Data::Random::String::Matches->new(qr/(\w{3})-\1/);
35							my $str3 = $gen3->generate_smart(); # e.g., "abc-abc"
36
37							# Groups and quantifiers
38							my $gen4 = Data::Random::String::Matches->new(qr/(ha){2,4}/);
39							my $laugh = $gen4->generate_smart(); # "haha", "hahaha", or "hahahaha"
40
41							# Unicode
42							$gen = Data::Random::String::Matches->new(qr/\p{L}{5}/);
43
44							# Named captures
45							$gen = Data::Random::String::Matches->new(qr/(?\d{4})-\k/);
46
47							# Possessive
48							$gen = Data::Random::String::Matches->new(qr/\d++[A-Z]/);
49
50							# Lookaheads
51							$gen = Data::Random::String::Matches->new(qr/\d{3}(?=[A-Z])/);
52
53							# Combined
54							$gen = Data::Random::String::Matches->new(
55							qr/(?\p{Lu}{2})\d++\k(?=[A-Z])/
56							);
57
58							# Consistency with Legacy software
59							print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";
60
61							=head1 DESCRIPTION
62
63							This module generates random strings that match a given regular expression pattern.
64							It parses the regex pattern and intelligently builds matching strings, supporting
65							a wide range of regex features.
66
67							=head1 SUPPORTED REGEX FEATURES
68
69							=head2 Character Classes
70
71							=over 4
72
73							=item * Basic classes: C<[a-z]>, C<[A-Z]>, C<[0-9]>, C<[abc]>
74
75							=item * Negated classes: C<[^a-z]>
76
77							=item * Ranges: C<[a-zA-Z0-9]>
78
79							=item * Escape sequences in classes: C<[\d\w]>
80
81							=back
82
83							=head2 Escape Sequences
84
85							=over 4
86
87							=item * C<\d> - digit [0-9]
88
89							=item * C<\w> - word character [a-zA-Z0-9_]
90
91							=item * C<\s> - whitespace
92
93							=item * C<\D> - non-digit
94
95							=item * C<\W> - non-word character
96
97							=item * C<\t>, C<\n>, C<\r> - tab, newline, carriage return
98
99							=back
100
101							=head2 Quantifiers
102
103							=over 4
104
105							=item * C<{n}> - exactly n times
106
107							=item * C<{n,m}> - between n and m times
108
109							=item * C<{n,}> - n or more times
110
111							=item * C<+> - one or more (1-5 times)
112
113							=item * C<*> - zero or more (0-5 times)
114
115							=item * C - zero or one
116
117							=back
118
119							=head2 Grouping and Alternation
120
121							=over 4
122
123							=item * C<(...)> - capturing group
124
125							=item * C<(?:...)> - non-capturing group
126
127							=item * C<\|> - alternation (e.g., C)
128
129							=item * C<\1>, C<\2>, etc. - backreferences
130
131							=back
132
133							=head2 Other
134
135							=over 4
136
137							=item * C<.> - any character (printable ASCII)
138
139							=item * Literal characters
140
141							=item * C<^> and C<$> anchors (stripped during parsing)
142
143							=back
144
145							=head1 LIMITATIONS
146
147							=over 4
148
149							=item * Lookaheads and lookbehinds ((?=...), (?!...)) are not supported
150
151							=item * Named groups ((?...)) are not supported
152
153							=item * Possessive quantifiers (*+, ++) are not supported
154
155							=item * Unicode properties (\p{L}, \p{N}) are not supported
156
157							=item * Some complex nested patterns may not work correctly with smart parsing
158
159							=back
160
161							=head1 EXAMPLES
162
163							# Email-like pattern
164							my $gen = Data::Random::String::Matches->new(qr/[a-z]+@[a-z]+\.com/);
165
166							# API key pattern
167							my $gen = Data::Random::String::Matches->new(qr/^AIza[0-9A-Za-z_-]{35}$/);
168
169							# Phone number
170							my $gen = Data::Random::String::Matches->new(qr/\d{3}-\d{3}-\d{4}/);
171
172							# Repeated pattern
173							my $gen = Data::Random::String::Matches->new(qr/(\w{4})-\1/);
174
175							=head1 METHODS
176
177							=head2 new($regex, $length)
178
179							Creates a new generator. C<$regex> can be a compiled regex (qr//) or a string.
180							C<$length> is optional and defaults to 10 (used for fallback generation).
181
182							=cut
183
184							sub new {
185	215			215	1	1653540	my ($class, $regex, $length) = @_;
186
187	215	100				880	croak 'Regex pattern is required' unless defined $regex;
188
189							# Convert string to regex if needed
190	214	100				1377	my $regex_obj = ref($regex) eq 'Regexp' ? $regex : qr/$regex/;
191
192	214		100			2178	my $self = {
193							regex => $regex_obj,
194							regex_str => "$regex",
195							length => $length \|\| 10,
196							backrefs => {}, # Store backreferences
197							named_refs => {}, # Store named captures
198							};
199
200	214					954	return bless $self, $class;
201							}
202
203							=head2 generate($max_attempts)
204
205							Generates a random string matching the regex. First tries smart parsing, then
206							falls back to brute force if needed. Tries up to C<$max_attempts> times
207							(default 1000) before croaking.
208
209							=cut
210
211							sub generate {
212	4124			4124	1	54757	my ($self, $max_attempts) = @_;
213	4124		100			16316	$max_attempts //= 1000;
214
215	4124					8373	my $regex = $self->{regex};
216	4124					7782	my $length = $self->{length};
217
218							# First try the smart approach
219	4124					6948	my $str = eval { $self->_build_from_pattern($self->{regex_str}) };
	4124					10083
220	4124	100	100			45315	if (defined $str && $str =~ /^$regex$/) {
221	4122					17732	return $str;
222							}
223
224							# If smart approach failed, show warning in debug mode
225	2	0	33			26	if ($ENV{DEBUG_REGEX_GEN} && $@) {
226	0					0	warn "Smart generation failed: $@";
227							}
228
229							# Fall back to brute force with character set matching
230	2					8	for (1 .. $max_attempts) {
231	1010					3227	$str = $self->_random_string_smart($length);
232	1010	50				8136	return $str if $str =~ /^$regex$/;
233							}
234
235	2					280	croak "Failed to generate matching string after $max_attempts attempts. Pattern: $self->{regex_str}";
236							}
237
238							sub _random_string_smart {
239	1010			1010		2662	my ($self, $len) = @_;
240
241	1010					2355	my $regex_str = $self->{regex_str};
242
243							# Detect common patterns and generate appropriate characters
244	1010					1658	my @chars;
245
246	1010	50	33			10695	if ($regex_str =~ /\\d/ \|\| $regex_str =~ /\[0-9\]/ \|\| $regex_str =~ /\[\^[^\]]*[A-Za-z]/) {
		50	33
		50	33
		50	33
			33
247							# Digit patterns
248	0					0	@chars = ('0'..'9');
249							} elsif ($regex_str =~ /\[A-Z\]/ \|\| $regex_str =~ /\[A-Z[^\]]*\]/) {
250							# Uppercase patterns
251	0					0	@chars = ('A'..'Z');
252							} elsif ($regex_str =~ /\[a-z\]/ \|\| $regex_str =~ /\[a-z[^\]]*\]/) {
253							# Lowercase patterns
254	0					0	@chars = ('a'..'z');
255							} elsif ($regex_str =~ /\\w/ \|\| $regex_str =~ /\[a-zA-Z0-9_\]/) {
256							# Word characters
257	0					0	@chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
258							} else {
259							# Default to printable ASCII
260	1010					2880	@chars = map { chr($_) } (33 .. 126);
	94940					233045
261							}
262
263	1010					2644	my $str = '';
264	1010					10600	$str .= $chars[int(rand(@chars))] for (1 .. $len);
265
266	1010					20775	return $str;
267							}
268
269							=head2 generate_smart()
270
271							Parses the regex and builds a matching string directly. Faster and more reliable
272							than brute force, but may not handle all edge cases.
273
274							=cut
275
276							sub generate_smart {
277	132			132	1	22490	my $self = $_[0];
278	132					615	return $self->_build_from_pattern($self->{regex_str});
279							}
280
281							=head2 generate_many($count, $unique)
282
283							Generates multiple random strings matching the regex.
284
285							my @strings = $gen->generate_many(10); # 10 strings (may have duplicates)
286							my @strings = $gen->generate_many(10, 1); # 10 unique strings
287							my @strings = $gen->generate_many(10, 'unique'); # 10 unique strings
288
289							# Generate until you have 1000 unique codes
290							my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/);
291							my @codes = $gen->generate_many(1000, 'unique');
292
293							Parameters:
294
295							=over 4
296
297							=item * C<$count> - Number of strings to generate (required, must be positive)
298
299							=item * C<$unique> - If true, ensures all generated strings are unique. May return fewer
300							than C<$count> strings if uniqueness cannot be achieved within reasonable attempts.
301							Accepts any true value (1, 'unique', etc.)
302
303							=back
304
305							Returns: List of generated strings
306
307							Dies: If count is not a positive integer
308
309							Warns: If unable to generate the requested number of unique strings
310
311							=cut
312
313							sub generate_many {
314	24			24	1	9137	my ($self, $count, $unique) = @_;
315
316	24	100	100			220	croak 'Count must be a positive integer' unless defined $count && $count > 0;
317
318	21					45	my @results;
319
320	21	100				61	if ($unique) {
321							# Generate unique strings
322	6					12	my %seen;
323	6					15	my $attempts = 0;
324	6					15	my $max_attempts = $count * 100; # Reasonable limit
325
326	6		100			44	while (keys %seen < $count && $attempts < $max_attempts) {
327	1634					4321	my $str = $self->generate();
328	1634					4625	$seen{$str} = 1;
329	1634					6784	$attempts++;
330							}
331
332	6	100				21	if (keys %seen < $count) {
333	1					39	carp 'Only generated ', (scalar keys %seen), " unique strings out of $count requested";
334							}
335
336	6					1115	@results = keys %seen;
337							} else {
338							# Generate any strings (may have duplicates)
339	15					81	push @results, $self->generate() for (1 .. $count);
340							}
341
342	21					1421	return @results;
343							}
344
345							=head2 get_seed()
346
347							Gets the random seed for reproducible generation
348
349							=cut
350
351							sub get_seed {
352	2			2	1	783	my $self = shift;
353
354	2					13	return $self->{seed};
355							}
356
357							=head2 set_seed($seed)
358
359							Sets the random seed for reproducible generation
360
361							=cut
362
363							sub set_seed {
364	13			13	1	4154	my $self = shift;
365	13					54	my $params = Params::Get::get_params('seed', \@_);
366	12					279	my $seed = $params->{'seed'};
367
368	12	100				139	croak 'Seed must be defined' unless defined $seed;
369
370	11					26	srand($seed);
371	11					22	$self->{seed} = $seed;
372
373	11					45	return $self;
374							}
375
376							=head2 suggest_simpler_pattern()
377
378							Analyzes patterns and suggests improvements.
379
380							my $suggestion = $gen->suggest_simpler_pattern();
381
382							if ($suggestion) {
383							print "Reason: $suggestion->{reason}\n";
384							print "Better pattern: $suggestion->{pattern}\n" if $suggestion->{pattern};
385							print "Tips:\n";
386							print " - $_\n" for @{$suggestion->{tips}};
387							}
388
389							=cut
390
391							sub suggest_simpler_pattern {
392	21			21	1	119	my $self = $_[0];
393
394	21					54	my $pattern = $self->{regex_str};
395	21					90	my $info = $self->pattern_info();
396
397							# Check for patterns that are too complex
398	21	100				54	if ($info->{complexity} eq 'very_complex') {
399							return {
400	1					48	pattern => undef,
401							reason => 'Pattern is very complex. Consider breaking it into multiple simpler patterns.',
402							tips => [
403							'Split alternations into separate generators',
404							'Avoid deeply nested groups',
405							'Use fixed-length patterns when possible',
406							],
407							};
408							}
409
410							# Suggest removing unnecessary backreferences
411	20	50	66			113	if ($info->{features}{has_backreferences} && $pattern =~ /($\w+$).*\\\d+/) {
412	0					0	my $simpler = $pattern;
413							# Can't automatically simplify backreferences, but can suggest
414							return {
415	0					0	pattern => undef,
416							reason => 'Backreferences add complexity. Consider if you really need repeated groups.',
417							tips => [
418							'If the repeated part doesn\'t need to match, use two separate patterns',
419							'For validation, backreferences are great; for generation, they limit variation',
420							],
421							};
422							}
423
424							# Suggest fixed quantifiers instead of ranges
425	20	100				91	if ($pattern =~ /\{(\d+),(\d+)\}/) {
426	8					26	my ($min, $max) = ($1, $2);
427	8	100				27	if ($max - $min > 10) {
428	6					14	my $mid = int(($min + $max) / 2);
429	6					9	my $simpler = $pattern;
430	6					41	$simpler =~ s/\{\d+,\d+\}/\{$mid\}/;
431							return {
432	6					47	pattern => $simpler,
433							reason => "Large quantifier range {$min,$max} creates high variability. Consider fixed length {$mid}.",
434							tips => [
435							'Fixed lengths are faster to generate',
436							'If you need variety, generate multiple patterns with different fixed lengths',
437							],
438							};
439							}
440							}
441
442							# Suggest limiting alternations
443	14	100				39	if ($info->{features}{has_alternation}) {
444	2					15	my @alts = split /\\|/, $pattern;
445	2	100				6	if (@alts > 10) {
446							return {
447	1					12	pattern => undef,
448							reason => 'Too many alternations (' . scalar(@alts) . '). Consider splitting into multiple patterns.',
449							tips => [
450							'Create separate generators for different alternatives',
451							'Group similar patterns together',
452							'Use character classes [abc] instead of (a\|b\|c)',
453							],
454							};
455							}
456
457							# Check if alternations could be a character class
458	1	50				8	if ($pattern =~ /$([a-zA-Z])\\|([a-zA-Z])\\|([a-zA-Z])$/) {
459	1					7	my $chars = join('', $1, $2, $3);
460	1					3	my $simpler = $pattern;
461	1					7	$simpler =~ s/$[a-zA-Z]\\|[a-zA-Z]\\|[a-zA-Z]$/[$chars]/;
462							return {
463	1					10	pattern => $simpler,
464							reason => 'Single-character alternations can be simplified to character classes.',
465							tips => [
466							'Use [abc] instead of (a\|b\|c)',
467							'Character classes are faster to process',
468							],
469							};
470							}
471							}
472
473							# Suggest removing lookaheads/lookbehinds for generation
474	12	100	100			59	if ($info->{features}{has_lookahead} \|\| $info->{features}{has_lookbehind}) {
475	3					6	my $simpler = $pattern;
476	3					17	$simpler =~ s/$\?[=!].*?$//g; # Remove lookaheads
477	3					11	$simpler =~ s/$\?<[=!].*?$//g; # Remove lookbehinds
478
479	3	50				11	if ($simpler ne $pattern) {
480							return {
481	3					20	pattern => $simpler,
482							reason => 'Lookaheads/lookbehinds add complexity but don\'t contribute to generated strings.',
483							tips => [
484							'Lookaheads are great for validation, not generation',
485							'The simplified pattern generates the same strings',
486							],
487							};
488							}
489							}
490
491							# Check for Unicode when ASCII would work
492	9	100	66			31	if ($info->{features}{has_unicode} && $pattern =~ /\\p\{L\}/) {
493	2					5	my $simpler = $pattern;
494	2					7	$simpler =~ s/\\p\{L\}/[A-Za-z]/g;
495							return {
496	2					14	pattern => $simpler,
497							reason => 'Unicode \\p{L} can be simplified to [A-Za-z] if you only need ASCII letters.',
498							tips => [
499							'ASCII patterns are faster',
500							'Only use Unicode if you need non-ASCII characters',
501							],
502							};
503							}
504
505							# Check for overly long fixed strings
506	7	50				16	if ($pattern =~ /([a-zA-Z]{20,})/) {
507							return {
508	0					0	pattern => undef,
509							reason => 'Pattern contains very long fixed literal strings. Consider if you need such specific patterns.',
510							tips => [
511							'Use variables instead of long literals',
512							'Break into smaller patterns',
513							],
514							};
515							}
516
517							# Pattern seems reasonable
518	7					22	return undef;
519							}
520
521							=head2 validate($string)
522
523							Checks if a string matches the pattern without generating.
524
525							if ($gen->validate('1234')) {
526							print "Valid!\n";
527							}
528
529							=cut
530
531							sub validate {
532	69			69	1	5945	my $self = shift;
533	69					261	my $params = Params::Get::get_params('string', \@_);
534	68					1639	my $string = $params->{'string'};
535
536	68	100				294	croak('String must be defined') unless defined $string;
537
538	67					114	my $regex = $self->{regex};
539	67					1111	return $string =~ /^$regex$/;
540							}
541
542							=head2 pattern_info()
543
544							Returns detailed information about the pattern.
545
546							my $info = $gen->pattern_info();
547							print "Complexity: $info->{complexity}\n";
548							print "Min length: $info->{min_length}\n";
549							print "Has Unicode: ", $info->{features}{has_unicode} ? "Yes" : "No", "\n";
550
551							C analyzes a regular expression to produce a structured summary of its characteristics,
552							including estimated string lengths, detected features, and an overall complexity rating.
553							It first calls C<_estimate_length> to heuristically compute the minimum and maximum possible lengths of strings matching the pattern by scanning for literals,
554							character classes, and quantifiers.
555							It then detects the presence of advanced regex constructions such as alternation, lookahead or lookbehind assertions, named groups, and Unicode properties, storing them in a feature hash.
556							Finally, it calculates a rough "complexity" classification based on pattern length and detected features-returning a hash reference that describes the regex's structure, estimated lengths, and complexity level.
557
558							=cut
559
560							sub pattern_info {
561	41			41	1	6097	my $self = $_[0];
562
563	41	100				108	return $self->{'_pattern_info_cache'} if $self->{'_pattern_info_cache'};
564
565	38					72	my $pattern = $self->{'regex_str'};
566
567							# Calculate approximate min/max lengths
568	38					100	my ($min_len, $max_len) = $self->_estimate_length($pattern);
569
570							# Detect pattern features
571	38	100				908	my %features = (
		100
		100
		100
		100
		100
		100
572							has_alternation => ($pattern =~ /\\|/ ? 1 : 0),
573							has_backreferences => ($pattern =~ /(\\[1-9]\|\\k<)/ ? 1 : 0),
574							has_unicode => ($pattern =~ /\\p\{/ ? 1 : 0),
575							has_lookahead => ($pattern =~ /\(\?[=!]/ ? 1 : 0),
576							has_lookbehind => ($pattern =~ /\(\?<[=!]/ ? 1 : 0),
577							has_named_groups => ($pattern =~ /\(\?
578							has_possessive => ($pattern =~ /(?:[+?]\+\|\{\d+(?:,\d)?\}\+)/ ? 1 : 0),
579							);
580
581	38					205	my $info = {
582							pattern => $pattern,
583							min_length => $min_len,
584							max_length => $max_len,
585							estimated_length => int(($min_len + $max_len) / 2),
586							features => \%features,
587							complexity => $self->_calculate_complexity(\%features, $pattern),
588							};
589
590	38					140	$self->{'_pattern_info_cache'} = $info;
591
592	38					108	return $info;
593							}
594
595							sub _estimate_length {
596	38			38		88	my ($self, $pattern) = @_;
597
598							# Remove anchors and modifiers
599	38					371	$pattern =~ s/^$\?\^?[iumsx-]:(.)$$/$1/;
600	38					100	$pattern =~ s/^\^//;
601	38					74	$pattern =~ s/\$//;
602
603	38					59	my $min = 0;
604	38					58	my $max = 0;
605
606							# Simple heuristic - count fixed characters and quantifiers
607	38					59	my $last_was_atom = 0; # Handle cases like \d{3} where the quantifier modifies the atom count
608	38					249	while ($pattern =~ /([^+*?{}\[\]\\])\|\\[dwsWDN]\|\[([^\]]+)\]\|\{(\d+)(?:,(\d+))?\}/g) {
609	222	100	66			899	if (defined $1 \|\| (defined $2 && $2)) {
		100	100
610	156					236	$min++;
611	156					255	$max++;
612	156					498	$last_was_atom = 1;
613							} elsif (defined $3) {
614	36	100				107	if ($last_was_atom) {
615							# Replace the last atom’s contribution
616	16					66	$min += $3 - 1;
617	16	100				60	$max += (defined $4 ? $4 : $3) - 1;
618	16					75	$last_was_atom = 0;
619							} else {
620							# No preceding atom? assume standalone
621	20					46	$min += $3;
622	20	100				114	$max += defined $4 ? $4 : $3;
623							}
624							}
625							}
626
627							# Account for +, *, ?
628	38					119	my $plus_count = () = $pattern =~ /\+/g;
629	38					77	my $star_count = () = $pattern =~ /\*/g;
630	38					81	my $question_count = () = $pattern =~ /\?/g;
631
632	38					63	$min += $plus_count; # + means at least 1
633	38					72	$max += ($plus_count * 5) + ($star_count * 5); # Assume max 5 repetitions
634	38					60	$min -= $question_count; # ? makes things optional
635
636	38	50				85	$min = 0 if $min < 0;
637	38	50				77	$max = $min + 50 if $max < $min; # Ensure max >= min
638
639	38					151	return ($min, $max);
640							}
641
642							sub _calculate_complexity {
643	38			38		139	my ($self, $features, $pattern) = @_;
644
645	38					99	my $score = 0;
646
647							# Base complexity from pattern length
648	38					95	$score += length($pattern) / 10;
649
650							# Add complexity for features
651	38	100				97	$score += 2 if $features->{has_alternation};
652	38	100				95	$score += 3 if $features->{has_backreferences};
653	38	100				85	$score += 2 if $features->{has_unicode};
654	38	100				92	$score += 2 if $features->{has_lookahead};
655	38	100				85	$score += 2 if $features->{has_lookbehind};
656	38	100				86	$score += 1 if $features->{has_named_groups};
657	38	100				77	$score += 1 if $features->{has_possessive};
658
659							# Classify
660	38	100				215	return 'simple' if $score < 3;
661	15	100				102	return 'moderate' if $score < 7;
662	3	100				23	return 'complex' if $score < 12;
663	1					10	return 'very_complex';
664							}
665
666							sub _build_from_pattern {
667	4255			4255		8749	my ($self, $pattern) = @_;
668
669							# Reset backreferences for each generation
670	4255					10797	$self->{backrefs} = {};
671	4255					8317	$self->{named_refs} = {};
672	4255					7506	$self->{group_counter} = 0;
673
674							# Remove regex delimiters and modifiers
675							# Handle (?^:...), (?i:...), (?-i:...) etc
676	4255					35885	$pattern =~ s/^$\?\^?[iumsx-]:(.)$$/$1/;
677
678							# Remove anchors (they're handled by the regex match itself)
679	4255					9524	$pattern =~ s/^\^//;
680	4255					7491	$pattern =~ s/\$//;
681
682	4255					10489	return $self->_parse_sequence($pattern);
683							}
684
685							sub _parse_sequence {
686	4397			4397		8596	my ($self, $pattern) = @_;
687
688	4397					7273	my $result = '';
689	4397					6590	my $i = 0;
690	4397					6849	my $len = length($pattern);
691
692	4397					10392	while ($i < $len) {
693	5817					11507	my $char = substr($pattern, $i, 1);
694
695	5817	100	66			19022	if ($char eq '\\') {
		100
		100
		100
		50
		100
		100
696							# Escape sequence
697	940					1373	$i++;
698	940					1634	my $next = substr($pattern, $i, 1);
699
700	940	100	66			5732	if ($next =~ /[1-9]/) {
		100	66
		100
		100
		100
		100
		100
		100
		100
		100
		50
701							# Backreference
702	30					55	my $ref_num = $next;
703	30	50				82	if (exists $self->{backrefs}{$ref_num}) {
704	30					70	$result .= $self->{backrefs}{$ref_num};
705							} else {
706	0					0	croak "Backreference \\$ref_num used before group defined";
707							}
708							} elsif ($next eq 'k' && substr($pattern, $i+1, 1) eq '<') {
709							# Named backreference \k
710	7					18	my $end = index($pattern, '>', $i+2);
711	7					18	my $name = substr($pattern, $i+2, $end-$i-2);
712	7	50				18	if (exists $self->{named_refs}{$name}) {
713	7					16	$result .= $self->{named_refs}{$name};
714							} else {
715	0					0	croak "Named backreference \\k<$name> used before group defined";
716							}
717	7					11	$i = $end;
718							} elsif ($next eq 'p' && substr($pattern, $i+1, 1) eq '{') {
719							# Unicode property \p{L}, \p{N}, etc.
720	25					77	my $end = index($pattern, '}', $i+2);
721	25					60	my $prop = substr($pattern, $i+2, $end-$i-2);
722							my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
723	95			95		250	$self->_unicode_property_char($prop);
724	25					153	});
725	25					116	$result .= $generated;
726	25					50	$i = $new_i;
727							} elsif ($next eq 'd') {
728	730			2304		4135	my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { int(rand(10)) }, 1);
	2304					7387
729	730					2426	$result .= $generated;
730	730					1339	$i = $new_i;
731							} elsif ($next eq 'w') {
732							my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
733	115			115		1539	my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
734	115					1045	$chars[int(rand(@chars))];
735	35					301	}, 1);
736	35					183	$result .= $generated;
737	35					69	$i = $new_i;
738							} elsif ($next eq 's') {
739	2			2		17	my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { ' ' }, 1);
	2					10
740	2					11	$result .= $generated;
741	2					6	$i = $new_i;
742							} elsif ($next eq 'D') {
743							my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
744	6			6		26	my @chars = map { chr($_) } grep { chr($_) !~ /\d/ } (33..126);
	504					1013
	564					1403
745	6					139	$chars[int(rand(@chars))];
746	2					18	});
747	2					16	$result .= $generated;
748	2					7	$i = $new_i;
749							} elsif ($next eq 'W') {
750							my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
751	6			6		52	my @chars = map { chr($_) } grep { chr($_) !~ /\w/ } (33..126);
	186					371
	564					1477
752	6					80	$chars[int(rand(@chars))];
753	2					21	});
754	2					17	$result .= $generated;
755	2					6	$i = $new_i;
756							} elsif ($next eq 't') {
757	2					7	$result .= "\t";
758							} elsif ($next eq 'n') {
759	2					8	$result .= "\n";
760							} elsif ($next eq 'r') {
761	0					0	$result .= "\r";
762							} else {
763	103					140	$result .= $next;
764							}
765	940					2304	$i++;
766							} elsif ($char eq '[') {
767							# Character class
768	3935					9596	my $end = $self->_find_matching_bracket($pattern, $i);
769	3935	50				8802	croak 'Unmatched [' if $end == -1;
770
771	3935					8220	my $class = substr($pattern, $i+1, $end-$i-1);
772							my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
773	17179			17179		41866	$self->_random_from_class($class);
774	3935					22167	}, 1);
775	3935					16201	$result .= $generated;
776	3935					11943	$i = $new_i + 1;
777							} elsif ($char eq '(') {
778							# Group - could be various types
779	151					518	my $end = $self->_find_matching_paren($pattern, $i);
780	151	50				450	croak 'Unmatched (' if $end == -1;
781
782	151					503	my $group_content = substr($pattern, $i+1, $end-$i-1);
783
784							# Check for special group types
785	151					306	my $is_capturing = 1;
786	151					248	my $is_lookahead = 0;
787	151					288	my $is_lookbehind = 0;
788	151					312	my $is_negative = 0;
789	151					345	my $group_name = undef;
790
791	151	100				1078	if ($group_content =~ /^\?:/) {
		100
		100
		100
		100
		100
792							# Non-capturing group
793	2					8	$is_capturing = 0;
794	2					8	$group_content = substr($group_content, 2);
795							} elsif ($group_content =~ /^\?<([^>]+)>/) {
796							# Named capture (?...)
797	15					50	$group_name = $1;
798	15					55	$group_content = substr($group_content, length($1) + 3);
799							} elsif ($group_content =~ /^\?=/) {
800							# Positive lookahead (?=...)
801	6					20	$is_lookahead = 1;
802	6					17	$is_capturing = 0;
803	6					26	$group_content = substr($group_content, 2);
804							} elsif ($group_content =~ /^\?!/) {
805							# Negative lookahead (?!...)
806	1					4	$is_lookahead = 1;
807	1					4	$is_negative = 1;
808	1					4	$is_capturing = 0;
809	1					5	$group_content = substr($group_content, 2);
810							} elsif ($group_content =~ /^\?<=/) {
811							# Positive lookbehind (?<=...)
812	1					5	$is_lookbehind = 1;
813	1					3	$is_capturing = 0;
814	1					5	$group_content = substr($group_content, 3);
815							} elsif ($group_content =~ /^\?
816							# Negative lookbehind (?
817	1					5	$is_lookbehind = 1;
818	1					4	$is_negative = 1;
819	1					3	$is_capturing = 0;
820	1					6	$group_content = substr($group_content, 3);
821							}
822
823							# Handle lookaheads/lookbehinds
824	151	100				481	if ($is_lookahead) {
		100
825							# For positive lookahead, generate the pattern but don't advance
826							# For negative lookahead, avoid the pattern
827	7	100				34	if (!$is_negative) {
828							# Generate what the lookahead expects but don't consume it
829							# This is a simplification - we just note the constraint
830							}
831							# Lookaheads don't add to the result
832	7					22	$i = $end + 1;
833	7					35	next;
834							} elsif ($is_lookbehind) {
835							# Lookbehinds check what came before
836							# For generation, we can mostly ignore them
837	2					8	$i = $end + 1;
838	2					11	next;
839							}
840
841							# Check for alternation
842	142					237	my $generated;
843	142	100				448	if ($group_content =~ /\\|/) {
844	88					309	$generated = $self->_handle_alternation($group_content);
845							} else {
846	54					200	$generated = $self->_parse_sequence($group_content);
847							}
848
849							# Store backreference if capturing
850	142	100				392	if ($is_capturing) {
851	140					365	$self->{group_counter}++;
852	140					608	$self->{backrefs}{$self->{group_counter}} = $generated;
853
854	140	100				433	if (defined $group_name) {
855	15					88	$self->{named_refs}{$group_name} = $generated;
856							}
857							}
858
859							# Handle quantifier after group (including possessive)
860	142			154		679	my ($final_generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { $generated }, 1);
	154					500
861	142					424	$result .= $final_generated;
862	142					458	$i = $new_i + 1;
863							} elsif ($char eq '.') {
864							# Any character (except newline)
865							my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
866	2			2		10	my @chars = map { chr($_) } (33 .. 126);
	188					476
867	2					34	$chars[int(rand(@chars))];
868	2					16	});
869	2					14	$result .= $generated;
870	2					11	$i = $new_i + 1;
871							} elsif ($char eq '\|') {
872							# Alternation at top level - just return what we have
873							# (This is handled by _handle_alternation for groups)
874	0					0	last;
875							} elsif ($char =~ /[+*?]/ \|\| $char eq '{') {
876							# Quantifier without preceding element - shouldn't happen in valid regex
877	1					283	croak "$pattern: Quantifier '$char' without preceding element";
878							} elsif ($char =~ /[\w ]/) {
879							# Literal character
880	719			729		2631	my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { $char });
	729					1783
881	719					2009	$result .= $generated;
882	719					1623	$i = $new_i + 1;
883							} else {
884							# Other literal characters
885	69					132	$result .= $char;
886	69					145	$i++;
887							}
888							}
889
890	4396					13232	return $result;
891							}
892
893							sub _handle_quantifier {
894	5594			5594		13157	my ($self, $pattern, $pos, $generator, $check_possessive) = @_;
895	5594		100			13622	$check_possessive //= 1; # Default to checking for possessive
896
897	5594					10241	my $next = substr($pattern, $pos + 1, 1);
898	5594					8298	my $is_possessive = 0;
899
900							# Check for possessive quantifier (+)
901	5594	100	66			18594	if ($check_possessive && $pos + 2 < length($pattern)) {
902	3912					6713	my $after_next = substr($pattern, $pos + 2, 1);
903	3912	100	66			18686	if (($next =~ /[+*?]/ \|\| $next eq '}') && $after_next eq '+') {
			100
904	6					23	$is_possessive = 1;
905							}
906							}
907
908	5594	100				13579	if ($next eq '{') {
		100
		100
		100
909	3451					7024	my $end = index($pattern, '}', $pos + 2);
910	3451	50				6869	croak "Unmatched '{' at position $pos in pattern: $pattern" if ($end == -1);
911	3451					7219	my $quant = substr($pattern, $pos + 2, $end - $pos - 2);
912
913							# Check for possessive after }
914	3451	100	66			13017	if ($check_possessive && $end + 1 < length($pattern) && substr($pattern, $end + 1, 1) eq '+') {
			100
915	2					5	$is_possessive = 1;
916	2					5	$end++;
917							}
918
919	3451					5665	my $result = '';
920	3451	100				13682	if ($quant =~ /^(\d+)$/) {
		50
		0
921							# Exact: {n}
922	3392					19589	$result .= $generator->() for (1 .. $1);
923							} elsif ($quant =~ /^(\d+),(\d+)$/) {
924							# Range: {n,m}
925	59					295	my $count = $1 + int(rand($2 - $1 + 1));
926	59					186	$result .= $generator->() for (1 .. $count);
927							} elsif ($quant =~ /^(\d+),$/) {
928							# Minimum: {n,}
929	0					0	my $count = $1 + int(rand(5));
930	0					0	$result .= $generator->() for (1 .. $count);
931							}
932	3451					13284	return ($result, $end);
933							} elsif ($next eq '+') {
934							# One or more (possessive: ++)
935	22					53	my $actual_end = $pos + 1;
936	22	100				100	if ($is_possessive) {
937	4					11	$actual_end++;
938							}
939	22					203	my $count = 1 + int(rand(5));
940	22					54	my $result = '';
941	22					96	$result .= $generator->() for (1 .. $count);
942	22					139	return ($result, $actual_end);
943							} elsif ($next eq '*') {
944							# Zero or more (possessive: *+)
945	3					11	my $actual_end = $pos + 1;
946	3	100				15	if ($is_possessive) {
947	1					4	$actual_end++;
948							}
949	3					19	my $count = int(rand(6));
950	3					62	my $result = '';
951	3					27	$result .= $generator->() for (1 .. $count);
952	3					18	return ($result, $actual_end);
953							} elsif ($next eq '?') {
954							# Zero or one (possessive: ?+)
955	4					15	my $actual_end = $pos + 1;
956	4	100				22	if ($is_possessive) {
957	1					3	$actual_end++;
958							}
959	4	100				23	my $result = rand() < 0.5 ? $generator->() : '';
960	4					19	return ($result, $actual_end);
961							} else {
962							# No quantifier
963	2114					3977	return ($generator->(), $pos);
964							}
965							}
966
967							sub _handle_alternation {
968	88			88		211	my ($self, $pattern) = @_;
969
970							# Split on \| but respect groups
971	88					180	my @alternatives;
972	88					173	my $current = '';
973	88					132	my $depth = 0;
974
975	88					541	for my $char (split //, $pattern) {
976	1008	100	100			3347	if ($char eq '(') {
		100
		100
977	4					10	$depth++;
978	4					10	$current .= $char;
979							} elsif ($char eq ')') {
980	4					9	$depth--;
981	4					9	$current .= $char;
982							} elsif ($char eq '\|' && $depth == 0) {
983	162					393	push @alternatives, $current;
984	162					366	$current = '';
985							} else {
986	838					1568	$current .= $char;
987							}
988							}
989	88	50				396	push @alternatives, $current if length($current);
990
991							# Choose one alternative randomly
992	88					505	my $chosen = $alternatives[int(rand(@alternatives))];
993	88					447	return $self->_parse_sequence($chosen);
994							}
995
996							sub _find_matching_bracket {
997	3935			3935		8085	my ($self, $pattern, $start) = @_;
998
999	3935					6582	my $depth = 0;
1000	3935					9523	for (my $i = $start; $i < length($pattern); $i++) {
1001	25047					40565	my $char = substr($pattern, $i, 1);
1002	25047	100	33			95668	if ($char eq '[' && ($i == $start \|\| substr($pattern, $i-1, 1) ne '\\')) {
		100	66
			66
1003	3935					9455	$depth++;
1004							} elsif ($char eq ']' && substr($pattern, $i-1, 1) ne '\\') {
1005	3935					6001	$depth--;
1006	3935	50				12544	return $i if $depth == 0;
1007							}
1008							}
1009	0					0	return -1;
1010							}
1011
1012							sub _find_matching_paren {
1013	151			151		392	my ($self, $pattern, $start) = @_;
1014
1015	151					309	my $depth = 0;
1016	151					610	for (my $i = $start; $i < length($pattern); $i++) {
1017	1750					3323	my $char = substr($pattern, $i, 1);
1018	1750	100				3973	my $prev = $i > 0 ? substr($pattern, $i-1, 1) : '';
1019
1020	1750	100	66			8373	if ($char eq '(' && $prev ne '\\') {
		100	66
1021	156					512	$depth++;
1022							} elsif ($char eq ')' && $prev ne '\\') {
1023	156					269	$depth--;
1024	156	100				665	return $i if $depth == 0;
1025							}
1026							}
1027	0					0	return -1;
1028							}
1029
1030							sub _random_from_class {
1031	17179			17179		32983	my ($self, $class) = @_;
1032
1033	17179					24922	my @chars;
1034
1035							# Debugging this regex: qr/!#-'*+\\-\\.\\^_`\|~0-9A-Za-z/
1036							# which used to give this error: 'Argument "#" isn't numeric in range (or flop)'
1037	17179	50				37852	warn "DEBUG: class = '$class', length = ", length($class) if ($ENV{DEBUG_REGEX_GEN});
1038
1039							# Handle negation
1040	17179					24706	my $negate = 0;
1041	17179	100				39114	if (substr($class, 0, 1) eq '^') {
1042	10					20	$negate = 1;
1043	10					43	$class = substr($class, 1);
1044							}
1045
1046							# Parse character class with escape sequences
1047	17179					23444	my $i = 0;
1048	17179					32443	while ($i < length($class)) {
1049	33352					56977	my $char = substr($class, $i, 1);
1050
1051	33352	50				68176	warn "DEBUG: i=$i, char='$char' (ord=", ord($char), ')' if ($ENV{DEBUG_REGEX_GEN});
1052
1053	33352	100	100			121186	if ($char eq '\\') {
		100
1054	159					196	$i++;
1055	159					226	my $next = substr($class, $i, 1);
1056	159	50				315	warn "DEBUG: Escaped char: $next" if ($ENV{DEBUG_REGEX_GEN});
1057	159	100	66			493	if ($next eq 'd') {
		100
		50
		100
1058	16					75	push @chars, ('0'..'9');
1059							} elsif ($next eq 'w') {
1060	10					152	push @chars, ('a'..'z', 'A'..'Z', '0'..'9', '_');
1061							} elsif ($next eq 's') {
1062	0					0	push @chars, (' ', "\t", "\n");
1063							} elsif ($next eq 'p' && substr($class, $i+1, 1) eq '{') {
1064							# Unicode property in character class
1065	6					15	my $end = index($class, '}', $i+2);
1066	6					12	my $prop = substr($class, $i+2, $end-$i-2);
1067	6					15	push @chars, $self->_unicode_property_chars($prop);
1068	6					32	$i = $end;
1069							} else {
1070							# Escaped literal character (including \-, \., \^, etc.)
1071	127					199	push @chars, $next;
1072							}
1073							} elsif ($i + 2 < length($class) && substr($class, $i+1, 1) eq '-') {
1074							# Potential range
1075	30522					50852	my $end_char = substr($class, $i+2, 1);
1076
1077							# Check if end is escaped or if this is valid range
1078	30522	50	33			95406	if ($end_char eq '\\' \|\| $end_char eq ']') {
		50
1079							# Not a range, dash is literal
1080	0					0	push @chars, $char;
1081							} elsif (ord($end_char) >= ord($char)) {
1082							# Valid range - use ord/chr to avoid quote interpolation issues
1083	30522					45050	my $start_ord = ord($char);
1084	30522					41659	my $end_ord = ord($end_char);
1085	30522					65871	push @chars, map { chr($_) } ($start_ord .. $end_ord);
	560798					1072122
1086	30522					88498	$i += 2; # Will be incremented again by loop, total +3
1087							} else {
1088							# Invalid range order
1089	0					0	push @chars, $char;
1090							}
1091							} else {
1092	2671					5591	push @chars, $char;
1093							}
1094	33352					76237	$i++;
1095							}
1096
1097	17179	50				39108	warn 'DEBUG: Final chars array has ', scalar(@chars), ' elements' if ($ENV{DEBUG_REGEX_GEN});
1098
1099	17179	100				32980	if ($negate) {
1100	10					41	my %excluded = map { $_ => 1 } @chars;
	100					248
1101	10					41	@chars = grep { !$excluded{$_} } map { chr($_) } (33 .. 126);
	940					2140
	940					2101
1102							}
1103
1104	17179	50				122984	return @chars ? $chars[int(rand(@chars))] : 'X';
1105							}
1106
1107							sub _unicode_property_char {
1108	95			95		238	my ($self, $prop) = @_;
1109	95					208	my @chars = $self->_unicode_property_chars($prop);
1110	95	50				1243	return @chars ? $chars[int(rand(@chars))] : 'X';
1111							}
1112
1113							sub _unicode_property_chars {
1114	101			101		187	my ($self, $prop) = @_;
1115
1116							# Common Unicode properties
1117	101	100	100			379	if ($prop eq 'L' \|\| $prop eq 'Letter') {
		100	66
		100	66
		50	33
		0	0
		0	0
		0	0
		0	0
1118							# Letters, skip × and ÷ which are symbols
1119	88					1202	return ('a' .. 'z', 'A' .. 'Z', map { chr($_) } ((ord'À')..ord('Ö'), ord('Ø')..ord('ö'), ord('ø')..ord('ÿ')));
	5456					10911
1120							} elsif ($prop eq 'N' \|\| $prop eq 'Number') {
1121							# Numbers
1122							# return ('0' .. '9', map { chr($_) } (ord('①').. ord('⑳')));
1123	3					14	return ('0' .. '9');
1124							} elsif ($prop eq 'Lu' \|\| $prop eq 'Uppercase_Letter') {
1125							# Uppercase letters, skip × which is not a letter
1126	6					69	return ('A' .. 'Z', map { chr($_) } (ord('À') .. ord('Ö'), ord('Ø') .. ord('Þ')));
	180					452
1127							} elsif ($prop eq 'Ll' \|\| $prop eq 'Lowercase_Letter') {
1128							# Lowercase letters, skip ÷ which is not a letter
1129	4					44	return ('a' .. 'z', map { chr($_) } (ord('à') .. ord('ö'), ord('ø') .. ord('ÿ')));
	124					299
1130							} elsif ($prop eq 'P' \|\| $prop eq 'Punctuation') {
1131							# Punctuation
1132	0					0	return ('.', ',', '!', '?', ';', ':', '-', '—', '…');
1133							} elsif ($prop eq 'S' \|\| $prop eq 'Symbol') {
1134							# Symbols
1135	0					0	return ('$', '€', '£', '¥', '©', '®', '™', '°', '±', '×', '÷');
1136							} elsif ($prop eq 'Z' \|\| $prop eq 'Separator') {
1137							# Separators
1138	0					0	return (' ', "\t", "\n");
1139							} elsif ($prop eq 'Nd' \|\| $prop eq 'Decimal_Number') {
1140							# Decimal numbers
1141	0					0	return ('0'..'9');
1142							} else {
1143							# Unknown property - return letters as default
1144	0					0	return ('a'..'z', 'A'..'Z');
1145							}
1146							}
1147
1148							=head2 create_random_string
1149
1150							For consistency with L.
1151
1152							print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";
1153
1154							=cut
1155
1156							sub create_random_string
1157							{
1158	2			2	1	2212	my $class = shift;
1159	2					14	my $params = Params::Get::get_params(undef, @_);
1160
1161	2					97	my $regex = $params->{'regex'};
1162	2					5	my $length = $params->{'length'};
1163
1164	2					7	return $class->new($regex, $length)->generate();
1165							}
1166
1167							=head1 AUTHOR
1168
1169							Nigel Horne, C<< >>
1170
1171							=head1 SEE ALSO
1172
1173							=over 4
1174
1175							=item * Test coverage report: L
1176
1177							=item * L
1178
1179							=item * L
1180
1181							=back
1182
1183							=head1 LICENCE AND COPYRIGHT
1184
1185							Copyright 2025 Nigel Horne.
1186
1187							Usage is subject to licence terms.
1188
1189							The licence terms of this software are as follows:
1190
1191							=over 4
1192
1193							=item * Personal single user, single computer use: GPL2
1194
1195							=item * All other users (including Commercial, Charity, Educational, Government)
1196							must apply in writing for a licence for use from Nigel Horne at the
1197							above e-mail.
1198
1199							=back
1200
1201							=cut
1202
1203							1;