File Coverage

blib/lib/Mock/Data/Charset.pm
Criterion Covered Total %
statement 312 374 83.4
branch 190 258 73.6
condition 78 139 56.1
subroutine 40 46 86.9
pod 16 16 100.0
total 636 833 76.3


line stmt bran cond sub pod time code
1             package Mock::Data::Charset;
2 12     12   225851 use strict;
  12         30  
  12         464  
3 12     12   69 use warnings;
  12         24  
  12         634  
4 12     12   1260 use Mock::Data::Util qw( _parse_context _escape_str );
  12         38  
  12         141  
5             require Carp;
6             our @CARP_NOT= ('Mock::Data::Util');
7             require Mock::Data::Generator;
8             our @ISA= ( 'Mock::Data::Generator' );
9              
10             # ABSTRACT: Generator of strings from a set of characters
11             our $VERSION = '0.04'; # VERSION
12              
13              
14             our @generator_attrs= qw( str_len min_codepoint max_codepoint );
15              
16             sub new {
17 52     52 1 181254 my $class= shift;
18 52         123 my (%self, %parse);
19             # make the common case fast
20 52 100 100     268 if (@_ == 1 && !ref $_[0]) {
21 7         1341 qr/[$_[0]]/;
22 7         47 %self= ( notation => $_[0] );
23 7 50       25 if (ref $class) {
24 0   0     0 $self{generator_opts} ||= { %{ $class->{generator_opts} } };
  0         0  
25 0   0     0 $self{max_codepoint} //= $class->{max_codepoint};
26 0         0 $class= ref $class;
27             }
28 7         59 return bless \%self, $class;
29             }
30              
31 45 100       221 %self= @_ != 1? @_ : %{$_[0]};
  20         80  
32              
33             # Look for fields from the parser
34 45 100       221 $parse{classes}= delete $self{classes} if defined $self{classes};
35 45 100       129 $parse{codepoints}= delete $self{codepoints} if defined $self{codepoints};
36 45 50       121 $parse{codepoint_ranges}= delete $self{codepoint_ranges} if defined $self{codepoint_ranges};
37 45 50       134 $parse{negate}= delete $self{negate} if defined $self{negate};
38 45 100       153 if (defined $self{chars}) {
39 3         3 push @{$parse{codepoints}}, map ord, @{$self{chars}};
  3         5  
  3         10  
40 3         4 delete $self{chars};
41             }
42 45 50       155 if (defined $self{ranges}) {
43 0         0 push @{$parse{codepoint_ranges}},
44             map +( ref $_? ( ord $_->[0], ord $_->[1] ) : ord ),
45 0 0       0 @{$self{ranges}};
  0         0  
46 0         0 delete $self{ranges};
47             }
48              
49             # If called on an object, carry over some settings
50 45 50       111 if (ref $class) {
51 0 0 0     0 if (!keys %parse && !defined $self{notation} && !$self{members} && !$self{member_invlist}) {
      0        
      0        
52             @self{'_parse','notation','members','member_invlist'}=
53 0         0 @{$class}{'_parse','notation','members','member_invlist'};
  0         0  
54             }
55 0   0     0 $self{$_} //= $class->{$_} for @generator_attrs;
56 0         0 $class= ref $class;
57             }
58              
59 45 100 66     258 if (defined $self{notation} && !keys %parse) {
    100          
60             # want to trigger the syntax error exception now, not lazily later on
61 28         933 qr/[$self{notation}]/;
62             }
63             elsif (keys %parse) {
64 11         26 $self{_parse}= \%parse;
65             Carp::croak("Charset-building options (classes, chars, codepoints, ranges, codepoint_ranges, negate)"
66             ." cannot be combined with members, member_invlist or notation attributes")
67 11 50 33     97 if $self{members} or $self{member_invlist}; # allow notation to preserve original text
68             }
69             else {
70             # At least one of members, member_invlist, notation, or _parse must be specified
71             Carp::croak("Require at least one of members, member_invlist, notation, or charset-building options")
72 6 50 33     34 unless $self{members} or $self{member_invlist};
73             }
74            
75 45         492 return bless \%self, $class;
76             }
77              
78             sub _parse {
79             # If the '_parse' wasn't initialized, it can be derived from members or member_invlist or notation
80 31 100   31   163 $_[0]{_parse} || do {
81 20         35 my $self= shift;
82 20 50       79 if (defined $self->{notation}) {
    0          
    0          
83 20         88 $self->{_parse}= $self->parse($self->{notation});
84             }
85             elsif ($self->{members}) {
86 0         0 $self->{_parse}{codepoints}= [ map ord, @{$self->{members}} ];
  0         0  
87             }
88             elsif (my $inv= $self->{member_invlist}) {
89 0         0 my $i;
90 0         0 for ($i= 0; $i < $#$inv; $i+= 2) {
91 0 0       0 if ($inv->[$i] + 1 == $inv->[$i+1]) { push @{$self->{_parse}{codepoints}}, $inv->[$i] }
  0         0  
  0         0  
92 0         0 else { push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], $inv->[$i+1] - 1; }
  0         0  
93             }
94 0 0       0 if ($i == $#$inv) {
95 0   0     0 push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], ($self->max_codepoint || 0x10FFFF);
  0         0  
96             }
97             }
98 0         0 else { die "Unhandled lazy-build scenario" }
99 20         88 $self->{_parse};
100             };
101             }
102              
103              
104             sub notation {
105 16   66 16 1 2201 $_[0]{notation} //= _deparse_charset($_[0]->_parse);
106             }
107              
108              
109             sub min_codepoint {
110 122 50   122 1 290 $_[0]{min_codepoint}= $_[1] if @_ > 1;
111             $_[0]{min_codepoint}
112 122         307 }
113             sub max_codepoint {
114             $_[0]{max_codepoint}
115 161     161 1 396 }
116              
117              
118             sub str_len {
119 122 50   122 1 277 $_[0]{str_len}= $_[1] if @_ > 1;
120 122         323 $_[0]{str_len};
121             }
122              
123              
124             sub count {
125 978 100   978 1 466200 $_[0]{members}? scalar @{$_[0]{members}}
  84         224  
126             : $_[0]->_invlist_index->[-1];
127             }
128              
129              
130             sub members {
131 105   66 105 1 346 $_[0]{members} ||= $_[0]->_build_members;
132             }
133              
134             sub _build_members {
135 21     21   37 my $self= shift;
136 21         65 my $invlist= $self->member_invlist;
137 21         41 my @members;
138 21 50       54 if (@$invlist > 1) {
139             push @members, map chr, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1)
140 21         708 for 0 .. (($#$invlist-1)>>1);
141             }
142             # an odd number of elements means the list ends with an "include-all"
143 21 50       79 push @members, map chr, $invlist->[-1] .. 0x10FFFF
144             if 1 & @$invlist;
145 21         108 return \@members;
146             }
147              
148             sub Mock::Data::Charset::Util::expand_invlist {
149 6     6   13379 my $invlist= shift;
150 6         11 my @members;
151 6 100       22 if (@$invlist > 1) {
152             push @members, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1)
153 5         46 for 0 .. (($#$invlist-1)>>1);
154             }
155             # an odd number of elements means the list ends with an "include-all"
156 6 100       23 push @members, $invlist->[-1] .. 0x10FFFF
157             if 1 & @$invlist;
158 6         18 return \@members;
159             }
160              
161             # The index is private because there's not a good way to explain it to the user
162             sub _invlist_index {
163 2667     2667   5037 my $self= shift;
164 2667   66     11122 $self->{_invlist_index} ||= Mock::Data::Charset::Util::create_invlist_index($self->member_invlist);
165             }
166              
167             sub Mock::Data::Charset::Util::create_invlist_index {
168 29     29   6846 my $invlist= shift;
169 29         68 my $n_spans= (@$invlist + 1) >> 1;
170 29         54 my @index;
171 29         164 $#index= $n_spans-1;
172 29         191 my $total= 0;
173             $index[$_]= $total += $invlist->[$_*2+1] - $invlist->[$_*2]
174 29         3590 for 0 .. (@$invlist >> 1)-1;
175 29 100       114 if (@$invlist & 1) { # In the case that the final range is infinite
176 6         21 $index[$n_spans-1]= $total + 0x110000 - $invlist->[-1];
177             }
178 29         178 \@index;
179             }
180              
181              
182             sub member_invlist {
183 1833 50   1833 1 4882 if (@_ > 1) {
184 0         0 $_[0]{member_invlist}= $_[1];
185 0         0 delete $_[0]{_invlist_index};
186 0         0 delete $_[0]{members};
187 0         0 delete $_[0]{notation};
188             }
189 1833   66     6400 $_[0]{member_invlist} //= _build_member_invlist(@_);
190             }
191              
192             sub _build_member_invlist {
193 39     39   66 my $self= shift;
194 39         101 my $max_codepoint= $self->max_codepoint;
195             # If the search space is small, and there is already a regex notation, it is probably faster
196             # to iterate and let perl do the work than to parse the charset.
197 39         70 my $invlist;
198 39 100 66     199 if (!defined $max_codepoint || $max_codepoint > 1000 || !defined $self->{notation}) {
      100        
199 30   100     148 $max_codepoint ||= 0x10FFFF;
200 30         58 $invlist= eval {
201 30         97 _parsed_charset_to_invlist($self->_parse, $max_codepoint);
202             }# or main::diag $@
203             }
204 39   66     130 $invlist ||= _charset_invlist_brute_force($self->notation, $max_codepoint);
205             # If a user writes to the invlist, it will become out of sync with the Index,
206             # leading to confusing bugs.
207 39 50       381 if (Internals->can('SvREADONLY')) {
208 39         3844 Internals::SvREADONLY($_,1) for @$invlist;
209 39         110 Internals::SvREADONLY(@$invlist,1);
210             }
211 39         284 return $invlist;
212             }
213              
214             # Lazy-built string of all basic-multilingual-plane characters
215             our $_ascii_chars;
216             our $_unicode_chars;
217             sub _build_unicode_chars {
218 1 50   1   4 unless (defined $_unicode_chars) {
219             # Construct ranges of valid characters separated by NUL.
220             # Older perls die when the regex engine encounters an invalid character
221             # but newer perls just treat the invalid character as "not a member",
222             # unless the set is a negation in which case non-characters *are* a member.
223             # This makes the assumption that if a non-char isn't a member then \0 won't
224             # be either.
225 1         2 $_unicode_chars= '';
226 1         20555 $_unicode_chars .= chr($_) for 0 .. 0xD7FF;
227 1         8 $_unicode_chars .= "\0";
228 1         224 $_unicode_chars .= chr($_) for 0xFDF0 .. 0xFFFD;
229 1         8 for (1..16) {
230 16         122 $_unicode_chars .= "\0";
231 16         340209 $_unicode_chars .= chr($_) for ($_<<16) .. (($_<<16)|0xFFFD);
232             }
233             }
234 1         9 \$_unicode_chars;
235             }
236              
237             sub _charset_invlist_brute_force {
238 9     9   24 my ($set, $max_codepoint)= @_;
239 9 100       40 my $inv= (ord $set == ord '^')? substr($set,1) : '^'.$set;
240 9         32 my @invlist;
241            
242             # optimize common case
243 9 100       24 if ($max_codepoint < 256) {
244             # Find first character of every match and first character of every non-match
245             # and convert to codepoints.
246 8 100 66     1091 @invlist= map +(defined $_? ord($_) : ()),
247             ($_ascii_chars //= join('', map chr($_), 0..255))
248             =~ / ( [$set] ) (?> [$set]* ) (?: \z | ( [$inv] ) (?> [$inv]* ) )/gx;
249             }
250             else {
251 1 50       8 _build_unicode_chars() unless defined $_unicode_chars;
252             # This got complicated while trying to support perls that can't match against non-characters.
253             # The non-characters have been replaced by NULs, so need to capture the char before and after
254             # each transition in case one of them is a NUL.
255 1 50       1460 my @endpoints=
256             ($max_codepoint < 0x10FFFF? substr($_unicode_chars,0,$max_codepoint+1) : $_unicode_chars)
257             =~ /( [$set] ) ( [$set] )* ( \z | [$inv] ) ( [$inv] )* /gx;
258 1 50       6 if (@endpoints) {
259             # List is a multiple of 4 elements: (first-member,last-member,first-non-member,last-non-member)
260             # We're not interested in the span of non-members at the end, so just remove those.
261 1         3 pop @endpoints; pop @endpoints;
  1         22  
262             # Iterate every transition of member/nonmember, and use the second character if present
263             # and isn't a NUL, else use the first character and add 1.
264 1         13 push @invlist, ord $endpoints[0];
265 1         5 for (my $i= 1; $i < @endpoints; $i+= 2) {
266 19 100 66     103 if (defined $endpoints[$i+1] && ord $endpoints[$i+1]) {
    50          
267 18         27 push @invlist, ord $endpoints[$i+1];
268             } elsif (defined $endpoints[$i]) {
269 1         4 push @invlist, 1 + ord $endpoints[$i];
270             } else {
271 0         0 push @invlist, 1 + $invlist[-1];
272             }
273             }
274             # substr is an estimate, because string skips characters, so remove any spurrous
275             # codepoints beyond the max
276 1   66     9 pop @invlist while @invlist && $invlist[-1] > $max_codepoint;
277             }
278             }
279             # If an "infinite" range would be returned, but the user set a maximum codepoint,
280             # list the max codepoint as the end of the invlist.
281 9 100 66     95 if ($max_codepoint < 0x10FFFF and 1 & @invlist) {
282 4         10 push @invlist, $max_codepoint+1;
283             }
284 9         41 return \@invlist;
285             }
286              
287             sub _parsed_charset_to_invlist {
288 30     30   89 my ($parse, $max_codepoint)= @_;
289 30         53 my @invlists;
290             # convert the character list into an inversion list
291 30 100       165 if (defined (my $cp= $parse->{codepoints})) {
292 5         17 my @chars= sort { $a <=> $b } @$cp;
  6         18  
293 5         23 my @invlist= (shift @chars);
294 5         10 push @invlist, $invlist[0] + 1;
295 5         43 for (my $i= 0; $i <= $#chars; $i++) {
296             # If the next char is adjacent, extend the span
297 5 100       12 if ($invlist[-1] == $chars[$i]) {
298 2         7 ++$invlist[-1];
299             } else {
300 3         7 push @invlist, $chars[$i], $chars[$i]+1;
301             }
302             }
303 5         12 push @invlists, \@invlist;
304             }
305             # Each range is an inversion list already
306 30 100       90 if (my $r= $parse->{codepoint_ranges}) {
307 10         60 for (my $i= 0; $i < (@$r >> 1); $i++) {
308 11         42 my ($start, $limit)= ($r->[$i*2], $r->[$i*2+1]+1);
309             # Try to combine the range with the most recent inversion list, if possible,
310 11 100 66     55 if (@invlists && $invlists[-1][-1] < $start) {
    50 33        
311 1         3 push @{ $invlists[-1] }, $start, $limit;
  1         6  
312             } elsif (@invlists && $invlists[-1][0] > $limit) {
313 0         0 unshift @{ $invlists[-1] }, $start, $limit;
  0         0  
314             } else {
315             # else just start a new inversion list
316 10         47 push @invlists, [ $start, $limit ]
317             }
318             }
319             }
320             # Convert each character class to an inversion list.
321 30 100       91 if ($parse->{classes}) {
322             push @invlists, _class_invlist($_)
323 15         45 for @{ $parse->{classes} };
  15         74  
324             }
325 30         142479 my $invlist= Mock::Data::Charset::Util::merge_invlists(\@invlists, $max_codepoint);
326             # Perform negation of inversion list by either starting at char 0 or removing char 0
327 30 100       126 if ($parse->{negate}) {
328 1 50       5 if ($invlist->[0]) { unshift @$invlist, 0 }
  1         4  
329 0         0 else { shift @$invlist; }
330             }
331 30         386 return $invlist;
332             }
333              
334              
335             our $_compile;
336             sub compile {
337 4     4 1 18 local $_compile= 1;
338 4         48 shift->generate(@_);
339             }
340             sub generate {
341 122     122 1 267 my ($self, $mock)= (shift, shift);
342 122         337 my ($len, $cp_min, $cp_max, $member_count)
343             = ($self->str_len, $self->min_codepoint, $self->max_codepoint, $self->count);
344 122 100       302 if (@_) {
345 90 100       306 my %opts= ref $_[0] eq 'HASH'? %{ shift() } : ();
  87         410  
346 90 100 33     257 $len= @_? shift : $opts{str_len} // $opts{len} // $opts{size}; # allow some aliases for length
      33        
347 90   66     390 $cp_min= $opts{min_codepoint} // $cp_min;
348 90   66     336 $cp_max= $opts{max_codepoint} // $cp_max;
349             }
350 122 100 100     537 my ($memb_min, $memb_span)= !defined $cp_min && !defined $cp_max? (0,$member_count)
351             : $self->_codepoint_minmax_to_member_range($cp_min, $cp_max);
352              
353             # If compiling, $len will be a function, else it will be an integer
354 0     0   0 $len= !defined $len? ($_compile? sub { 1 } : 1 )
355 0     0   0 : !ref $len? ($_compile? sub { $len } : $len)
356             : ref $len eq 'ARRAY'? (
357 0     0   0 $_compile? sub { $len->[0] + int rand ($len->[1] - $len->[0]) }
358 122 50       485 : $len->[0] + int rand ($len->[1] - $len->[0])
    50          
    50          
    100          
    50          
    100          
    100          
    100          
359             )
360             : ref $len eq 'CODE'? ($_compile? $len : $len->($mock))
361             : Carp::croak("Unknown str_len specification '$len'");
362              
363             # If member list is small-ish, use faster direct array access
364 122 100 100     414 if ($self->{members} || $member_count < 500) {
365 98         211 my $members= $self->members;
366             return sub {
367 3     3   10 my $buf= '';
368             $buf .= $members->[$memb_min + int rand $memb_span]
369 3         14 for 1..$len->($_[0]);
370 3         49 return $buf;
371 98 100       234 } if $_compile;
372 94         147 my $buf= '';
373             $buf .= $members->[$memb_min + int rand $memb_span]
374 94         598 for 1..$len;
375 94         590 return $buf;
376             }
377             else {
378 24         82 my $invlist= $self->member_invlist;
379 24         58 my $index= $self->_invlist_index;
380             return sub {
381 0     0   0 my $ret= '';
382             $ret .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index)
383 0         0 for 1..$len->($_[0]);
384 24 50       64 } if $_compile;
385 24         45 my $buf= '';
386             $buf .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index)
387 24         131 for 1..$len;
388 24         181 return $buf;
389             }
390             }
391              
392             sub _codepoint_minmax_to_member_range {
393 27     27   49 my $self= shift;
394 27         62 my ($cp_min, $cp_max)= @_;
395             my $memb_min= !defined $cp_min? 0
396 27 100       72 : do {
397 6         15 my ($at, $ins)= _find_invlist_element($cp_min, $self->member_invlist, $self->_invlist_index);
398 6   33     20 $at // $ins
399             };
400             my $memb_lim= !defined $cp_max? $self->count
401 27 50       63 : do {
402 27         68 my ($at, $ins)= _find_invlist_element($cp_max, $self->member_invlist, $self->_invlist_index);
403 27 100       89 defined $at? $at + 1 : $ins;
404             };
405 27         71 return ($memb_min, $memb_lim-$memb_min);
406             }
407              
408              
409             sub parse {
410 36     36 1 315310 my ($self, $notation)= @_;
411 36 50       148 return { codepoints => [] } unless length $notation;
412 36 50       108 return { classes => ['All'] } if $notation eq '^';
413 36         88 $notation .= ']';
414             # parse function needs $_ to be the input string
415 36         189 pos($notation)= 0;
416 36         177 return _parse_charset() for $notation;
417             }
418              
419             our $have_prop_invlist;
420             our %_parse_charset_backslash= (
421             a => ord "\a",
422             b => ord "\b",
423             c => sub { die "Unimplemented: \\c" },
424             d => sub { push @{$_[0]{classes}}, 'digit'; undef; },
425             D => sub { push @{$_[0]{classes}}, '^digit'; undef; },
426             e => ord "\e",
427             f => ord "\f",
428             h => sub { push @{$_[0]{classes}}, 'horizspace'; undef; },
429             H => sub { push @{$_[0]{classes}}, '^horizspace'; undef; },
430             n => ord "\n",
431             N => \&_parse_charset_namedchar,
432             o => \&_parse_charset_oct,
433             p => \&_parse_charset_classname,
434             P => sub { _parse_charset_classname(shift, 1) },
435             r => ord "\r",
436             s => sub { push @{$_[0]{classes}}, 'space'; undef; },
437             S => sub { push @{$_[0]{classes}}, '^space'; undef; },
438             t => ord "\t",
439             v => sub { push @{$_[0]{classes}}, 'vertspace'; undef; },
440             V => sub { push @{$_[0]{classes}}, '^vertspace'; undef; },
441             w => sub { push @{$_[0]{classes}}, 'word'; undef; },
442             W => sub { push @{$_[0]{classes}}, '^word'; undef; },
443             x => \&_parse_charset_hex,
444             0 => \&_parse_charset_oct,
445             1 => \&_parse_charset_oct,
446             2 => \&_parse_charset_oct,
447             3 => \&_parse_charset_oct,
448             4 => \&_parse_charset_oct,
449             5 => \&_parse_charset_oct,
450             6 => \&_parse_charset_oct,
451             7 => \&_parse_charset_oct,
452             8 => \&_parse_charset_oct,
453             9 => \&_parse_charset_oct,
454             );
455             our %_class_invlist_cache= (
456             'Any' => [ 0 ],
457             '\\N' => [ 0, ord("\n"), 1+ord("\n") ],
458             );
459             sub _class_invlist {
460 26     26   72729 my $class= shift;
461 26 100       83 if (ord $class == ord '^') {
462 5         31 return Mock::Data::Charset::Util::negate_invlist(
463             _class_invlist(substr($class,1))
464             );
465             }
466 21   66     111 return $_class_invlist_cache{$class} ||= do {
467 9 100       30 $have_prop_invlist= do { require Unicode::UCD; !!Unicode::UCD->can('prop_invlist') }
  3         3743  
  3         90555  
468             unless defined $have_prop_invlist;
469 9 50       60 $have_prop_invlist? [ Unicode::UCD::prop_invlist($class) ]
470             : _charset_invlist_brute_force("\\p{$class}", 0x10FFFF);
471             };
472             }
473             sub _parse_charset_hex {
474 3 50   3   20 /\G( [0-9A-Fa-f]{2} | \{ ([0-9A-Fa-f]+) \} )/gcx
475             or die "Invalid hex escape at "._parse_context;
476 3 100       18 return hex(defined $2? $2 : $1);
477             }
478             sub _parse_charset_oct {
479 5     5   20 --pos; # The caller ate one of the characters we need to parse
480 5 50       33 /\G( [0-7]{3} | 0 | o\{ ([0-7]+) \} ) /gcx
481             or die "Invalid octal escape at "._parse_context;
482 5 100       63 return oct(defined $2? $2 : $1);
483             }
484             sub _parse_charset_namedchar {
485 2     2   1022 require charnames;
486 2 50       12974 /\G \{ ([^}]+) \} /gcx
487             # or die "Invalid named char following \\N at '".substr($_,pos,10)."'";
488             and return charnames::vianame($1);
489             # Plain "\N" means every character except \n
490 0         0 push @{ $_[0]{classes} }, '\\N';
  0         0  
491 0         0 return;
492             }
493             sub _parse_charset_classname {
494 7     7   19 my ($result, $negate)= @_;
495 7 50       45 /\G \{ ([^}]+) \} /gcx
496             or die "Invalid class name following \\p at "._parse_context;
497 7 100       17 push @{$result->{classes}}, lc($negate? "^$1" : $1);
  7         45  
498             undef
499 7         19 }
500             sub _parse_charset {
501 41     41   80 my $flags= shift;
502             # argument is in $_, starting from pos($_)
503 41         91 my %parse;
504             my @range;
505 41         132 $parse{codepoints}= \my @chars;
506 41 100       164 $parse{negate}= 1 if /\G \^ /gcx;
507 41 50       163 if (/\G]/gc) { push @chars, ord ']' }
  0         0  
508 41         69 while (1) {
509 133         230 my $cp; # literal codepoint to be added
510             # Check for special cases
511 133 100 50     608 if (/\G ( \\ | - | \[: | \] ) /gcx) {
    50 66        
      33        
512 88 100       357 if ($1 eq '\\') {
    100          
    100          
513 23 50       102 /\G(.)/gc or die "Unexpected end of input";
514 23   33     109 $cp= $_parse_charset_backslash{$1} || ord $1;
515 23 100       100 $cp= $cp->(\%parse)
516             if ref $cp;
517             }
518             elsif ($1 eq '-') {
519 19 100       48 if (@range == 1) {
520 18         61 push @range, ord '-';
521 18         77 next;
522             }
523             else {
524 1         3 $cp= ord '-';
525             }
526             }
527             elsif ($1 eq '[:') {
528 5 50       36 /\G ( [^:]+ ) :] /gcx
529             or die "Invalid character class at "._parse_context;
530 5         13 push @{$parse{classes}}, $1;
  5         34  
531             }
532             else {
533 41         90 last; # $1 eq ']';
534             }
535             }
536             elsif ($flags && ($flags->{x}||0) >= 2 && /\G[ \t]/gc) {
537 0         0 next; # ignore space and tab under /xx
538             }
539             else {
540 45 50       153 /\G(.)/gc or die "Unexpected end of input";
541 45         115 $cp= ord $1;
542             }
543             # If no single character was found, any range-in-progress needs converted to
544             # charcters
545 74 100       34403 if (!defined $cp) {
    100          
    100          
546 20         37 push @chars, @range;
547 20         41 @range= ();
548             }
549             # At this point, $cp will contain the next ordinal of the character to include,
550             # but it might also be starting or finishing a range.
551             elsif (@range == 1) {
552 8         46 push @chars, $range[0];
553 8         18 $range[0]= $cp;
554             }
555             elsif (@range == 2) {
556 17         60 push @{$parse{codepoint_ranges}}, $range[0], $cp;
  17         60  
557 17         45 @range= ();
558             }
559             else {
560 29         63 push @range, $cp;
561             }
562             #printf "# pos %d cp %d range %s %s include %s\n", pos $_, $cp, $range[0] // '(null)', $range[1] // '(null)', join(',', @include);
563             }
564 41         83 push @chars, @range;
565 41 100       91 if (@chars) {
566 13         52 @chars= sort { $a <=> $b } @chars;
  11         29  
567             } else {
568 28         60 delete $parse{codepoints};
569             }
570 41         236 return \%parse;
571             }
572              
573             sub _ord_to_safe_regex_char {
574 3 0   3   41 return chr($_[0]) =~ /[\w]/? chr $_[0]
    50          
575             : $_[0] <= 0xFF? sprintf('\x%02X',$_[0])
576             : sprintf('\x{%X}',$_[0])
577             }
578              
579             sub _deparse_charset {
580 1     1   2 my $parse= shift;
581 1         3 my $str= '';
582 1 50       5 if (my $cp= $parse->{codepoints}) {
583             $str .= _ord_to_safe_regex_char($_)
584 1         6 for @$cp;
585             }
586 1 50       17 if (my $r= $parse->{codepoint_ranges}) {
587 0         0 for (my $i= 0; $i < (@$r << 1); $i++) {
588 0         0 $str .= _ord_to_safe_regex_char($r->[$i*2]) . '-' . _ord_to_safe_regex_char($r->[$i*2+1]);
589             }
590             }
591 1 50       5 if (my $cl= $parse->{classes}) {
592             # TODO: reverse conversions to \h \v etc.
593 0         0 for (@$cl) {
594 0 0       0 $str .= $_ eq '\N'? '\0-\x09\x0B-\x{10FFFF}'
    0          
595             : ord == ord '^'? '\P{'.substr($_,1).'}'
596             : '\p{'.$_.'}';
597             }
598             }
599 1         14 return $str;
600             }
601              
602              
603             sub get_member {
604 856 100   856 1 5747 $_[0]{members}? $_[0]{members}[$_[1]]
605             : chr _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index);
606             }
607              
608             sub get_member_codepoint {
609 6 50   6 1 3396 $_[0]{members}? ord $_[0]{members}[$_[1]]
610             : _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index);
611             }
612              
613             sub _get_invlist_element {
614 1231     1231   3013 my ($ofs, $invlist, $invlist_index)= @_;
615 1231 50       3222 $ofs += @$invlist_index if $ofs < 0;
616 1231 50 33     5793 return undef if $ofs >= $invlist_index->[-1] || $ofs < 0;
617             # Binary Search to find the range that contains this numbered element
618 1231         3176 my ($min, $max, $mid)= (0, $#$invlist_index);
619 1231         2157 while (1) {
620 6798         10515 $mid= ($min+$max) >> 1;
621 6798 100 100     19893 if ($ofs >= $invlist_index->[$mid]) {
    100          
622 3368         5150 $min= $mid+1
623             }
624             elsif ($mid > 0 && $ofs < $invlist_index->[$mid-1]) {
625 2199         3594 $max= $mid-1
626             }
627             else {
628 1231 100       3392 $ofs -= $invlist_index->[$mid-1] if $mid > 0;
629 1231         7619 return $invlist->[$mid*2] + $ofs;
630             }
631             }
632             }
633              
634              
635             sub find_member {
636 861     861 1 4442 my ($self, $char)= @_;
637 861         2558 return _find_invlist_element(ord $char, $self->member_invlist, $self->_invlist_index);
638             }
639              
640             sub _find_invlist_element {
641 894     894   1933 my ($codepoint, $invlist, $index)= @_;
642             # Binary Search to find the range that contains this numbered element
643 894         2261 my ($min, $max, $mid)= (0, $#$invlist);
644 894         1484 while (1) {
645 6846         10143 $mid= ($min+$max) >> 1;
646 6846 100 100     29275 if ($mid > 0 && $codepoint < $invlist->[$mid]) {
    100 100        
647 2629         4264 $max= $mid-1
648             }
649             elsif ($mid < $#$invlist && $codepoint >= $invlist->[$mid+1]) {
650 3323         5533 $min= $mid+1;
651             }
652             else {
653 894 100       2339 return (undef, 0) unless $codepoint >= $invlist->[$mid];
654 893 100       2004 return $codepoint - $invlist->[$mid] unless $mid > 0;
655 885 100       7885 return $codepoint - $invlist->[$mid] + $index->[($mid >> 1) - 1] unless $mid & 1;
656             # if $mid is an odd number, the range is excluded, and there is no match
657 13 100       60 return undef unless wantarray;
658 11         57 return (undef, $index->[($mid-1)>>1]) # return insertion point as second val
659             }
660             }
661             }
662              
663              
664             sub negate {
665 0     0 1 0 my $self= shift;
666 0         0 my $neg= Mock::Data::Charset::Util::negate_invlist($self->member_invlist, $self->max_codepoint);
667 0         0 return $self->new(member_invlist => $neg);
668             }
669             sub Mock::Data::Charset::Util::negate_invlist {
670 5     5   3152 my ($invlist, $max_codepoint)= @_;
671             # Toggle first char of 0
672 5 50       674 $invlist= $invlist->[0]? [ 0, @$invlist ] : [ @{$invlist}[1..$#$invlist] ];
  0         0  
673             # If max_codepoint is defined, and was the final char, remove the range starting at max_codepoint+1
674 5 50 33     53 if (@$invlist & 1 and defined $max_codepoint and $invlist->[-1] == $max_codepoint+1) {
      33        
675 0         0 pop @$invlist;
676             }
677 5         34 return $invlist;
678             }
679              
680              
681             sub union {
682 0     0 1 0 my $self= $_[0];
683 0         0 my @invlists= @_;
684             ref eq 'ARRAY' || ($_= $_->member_invlist)
685 0   0     0 for @invlists;
686 0         0 my $combined= Mock::Data::Charset::Util::merge_invlists(\@invlists, $self->max_codepoint);
687 0         0 return $self->new(member_invlist => $combined);
688             }
689              
690             #=head2 merge_invlists
691             #
692             # my $combined_invlist= $charset->merge_invlist( \@list2, \@list3, ... );
693             # my $combined_invlist= merge_invlist( \@list1, \@list2, ... );
694             #
695             #Merge one or more inversion lists into a superset of all of them.
696             #If called as a method, the L is used as the first list.
697             #
698             #The return value is an inversion list, which can be wrapped in a Charset object by passing it
699             #as the C attribute.
700             #
701             #The current L applies to the result. If called as a plain function, the
702             #C is assumed to be the Unicode maximum of C<0x10FFFF>.
703             #
704             #=cut
705              
706             sub Mock::Data::Charset::Util::merge_invlists {
707 38     38   11765 my @invlists= @{shift()};
  38         104  
708 38   100     140 my $max_codepoint= shift // 0x10FFFF;
709              
710 38 50       98 return [] unless @invlists;
711 38 100       112 return [@{$invlists[0]}] unless @invlists > 1;
  24         1801  
712 14         30 my @combined= ();
713             # Repeatedly select the minimum range among the input lists and add it to the result
714 14         42 my @pos= (0)x@invlists;
715 14         44 while (@invlists) {
716 4005         9596 my ($min_ch, $min_i)= ($invlists[0][$pos[0]], 0);
717             # Find which inversion list contains the lowest range
718 4005         8711 for (my $i= 1; $i < @invlists; $i++) {
719 3996 100       11994 if ($invlists[$i][$pos[$i]] < $min_ch) {
720 978         1728 $min_ch= $invlists[$i][$pos[$i]];
721 978         2269 $min_i= $i;
722             }
723             }
724 4005 100       8006 last if $min_ch > $max_codepoint;
725             # Check for overlap of this new inclusion range with the previous
726 4003 100 100     12264 if (@combined && $combined[-1] >= $min_ch) {
727             # they overlap, so just replace the end-codepoint of the range
728             # if the new endpoint is larger
729 3772         7140 my $new_end= $invlists[$min_i][$pos[$min_i]+1];
730 3772 100 100     13414 $combined[-1]= $new_end if !defined $new_end || $combined[-1] < $new_end;
731             }
732             else {
733             # else, simply append the range
734 231         573 push @combined, @{$invlists[$min_i]}[$pos[$min_i] .. $pos[$min_i]+1];
  231         905  
735             }
736             # If the list is empty now, remove it from consideration
737 4003 100       6356 if (($pos[$min_i] += 2) >= @{$invlists[$min_i]}) {
  4003 50       13294  
738 21         48 splice @invlists, $min_i, 1;
739 21         39 splice @pos, $min_i, 1;
740             # If the invlist ends with an infinite range now, we are done
741 21 100       74 if (!defined $combined[-1]) {
742 6         15 pop @combined;
743 6         17 last;
744             }
745             }
746             # If this is the only list remaining, append the rest and done
747             elsif (@invlists == 1) {
748 0         0 push @combined, @{$invlists[0]}[$pos[0] .. $#{$invlists[0]}];
  0         0  
  0         0  
749 0         0 last;
750             }
751             }
752 14         50 while ($combined[-1] > $max_codepoint) {
753 1         4 pop @combined;
754             }
755             # If the list ends with inclusion, and the max_codepoint is less than unicode max,
756             # end the list with it.
757 14 100 100     66 if (1 & @combined and $max_codepoint < 0x10FFFF) {
758 1         4 push @combined, $max_codepoint+1;
759             }
760 14         66 return \@combined;
761             }
762              
763             1;
764              
765             __END__