line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Mock::Data::Charset; |
2
|
9
|
|
|
9
|
|
252426
|
use strict; |
|
9
|
|
|
|
|
27
|
|
|
9
|
|
|
|
|
282
|
|
3
|
9
|
|
|
9
|
|
46
|
use warnings; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
270
|
|
4
|
9
|
|
|
9
|
|
1020
|
use Mock::Data::Util qw( _parse_context _escape_str ); |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
59717
|
|
5
|
|
|
|
|
|
|
require Carp; |
6
|
|
|
|
|
|
|
our @CARP_NOT= ('Mock::Data::Util'); |
7
|
|
|
|
|
|
|
require Mock::Data::Generator; |
8
|
|
|
|
|
|
|
our @ISA= ( 'Mock::Data::Generator' ); |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ABSTRACT: Generator of strings from a set of characters |
11
|
|
|
|
|
|
|
our $VERSION = '0.02'; # VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @generator_attrs= qw( str_len min_codepoint max_codepoint ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub new { |
17
|
45
|
|
|
45
|
1
|
132949
|
my $class= shift; |
18
|
45
|
|
|
|
|
85
|
my (%self, %parse); |
19
|
|
|
|
|
|
|
# make the common case fast |
20
|
45
|
100
|
100
|
|
|
202
|
if (@_ == 1 && !ref $_[0]) { |
21
|
7
|
|
|
|
|
653
|
qr/[$_[0]]/; |
22
|
7
|
|
|
|
|
200
|
%self= ( notation => $_[0] ); |
23
|
7
|
50
|
|
|
|
18
|
if (ref $class) { |
24
|
0
|
|
0
|
|
|
0
|
$self{generator_opts} ||= { %{ $class->{generator_opts} } }; |
|
0
|
|
|
|
|
0
|
|
25
|
0
|
|
0
|
|
|
0
|
$self{max_codepoint} //= $class->{max_codepoint}; |
26
|
0
|
|
|
|
|
0
|
$class= ref $class; |
27
|
|
|
|
|
|
|
} |
28
|
7
|
|
|
|
|
47
|
return bless \%self, $class; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
38
|
100
|
|
|
|
146
|
%self= @_ != 1? @_ : %{$_[0]}; |
|
14
|
|
|
|
|
50
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Look for fields from the parser |
34
|
38
|
100
|
|
|
|
227
|
$parse{classes}= delete $self{classes} if defined $self{classes}; |
35
|
38
|
100
|
|
|
|
106
|
$parse{codepoints}= delete $self{codepoints} if defined $self{codepoints}; |
36
|
38
|
50
|
|
|
|
92
|
$parse{codepoint_ranges}= delete $self{codepoint_ranges} if defined $self{codepoint_ranges}; |
37
|
38
|
50
|
|
|
|
93
|
$parse{negate}= delete $self{negate} if defined $self{negate}; |
38
|
38
|
100
|
|
|
|
86
|
if (defined $self{chars}) { |
39
|
3
|
|
|
|
|
6
|
push @{$parse{codepoints}}, map ord, @{$self{chars}}; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
11
|
|
40
|
3
|
|
|
|
|
7
|
delete $self{chars}; |
41
|
|
|
|
|
|
|
} |
42
|
38
|
50
|
|
|
|
92
|
if (defined $self{ranges}) { |
43
|
0
|
|
|
|
|
0
|
push @{$parse{codepoint_ranges}}, |
44
|
|
|
|
|
|
|
map +( ref $_? ( ord $_->[0], ord $_->[1] ) : ord ), |
45
|
0
|
0
|
|
|
|
0
|
@{$self{ranges}}; |
|
0
|
|
|
|
|
0
|
|
46
|
0
|
|
|
|
|
0
|
delete $self{ranges}; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# If called on an object, carry over some settings |
50
|
38
|
50
|
|
|
|
91
|
if (ref $class) { |
51
|
0
|
0
|
0
|
|
|
0
|
if (!keys %parse && !defined $self{notation} && !$self{members} && !$self{member_invlist}) { |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
52
|
|
|
|
|
|
|
@self{'_parse','notation','members','member_invlist'}= |
53
|
0
|
|
|
|
|
0
|
@{$class}{'_parse','notation','members','member_invlist'}; |
|
0
|
|
|
|
|
0
|
|
54
|
|
|
|
|
|
|
} |
55
|
0
|
|
0
|
|
|
0
|
$self{$_} //= $class->{$_} for @generator_attrs; |
56
|
0
|
|
|
|
|
0
|
$class= ref $class; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
38
|
100
|
66
|
|
|
181
|
if (defined $self{notation} && !keys %parse) { |
|
|
100
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# want to trigger the syntax error exception now, not lazily later on |
61
|
21
|
|
|
|
|
431
|
qr/[$self{notation}]/; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
elsif (keys %parse) { |
64
|
11
|
|
|
|
|
27
|
$self{_parse}= \%parse; |
65
|
|
|
|
|
|
|
Carp::croak("Charset-building options (classes, chars, codepoints, ranges, codepoint_ranges, negate)" |
66
|
|
|
|
|
|
|
." cannot be combined with members, member_invlist or notation attributes") |
67
|
11
|
50
|
33
|
|
|
96
|
if $self{members} or $self{member_invlist}; # allow notation to preserve original text |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
else { |
70
|
|
|
|
|
|
|
# At least one of members, member_invlist, notation, or _parse must be specified |
71
|
|
|
|
|
|
|
Carp::croak("Require at least one of members, member_invlist, notation, or charset-building options") |
72
|
6
|
50
|
33
|
|
|
27
|
unless $self{members} or $self{member_invlist}; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
38
|
|
|
|
|
2135
|
return bless \%self, $class; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub _parse { |
79
|
|
|
|
|
|
|
# If the '_parse' wasn't initialized, it can be derived from members or member_invlist or notation |
80
|
26
|
100
|
|
26
|
|
98
|
$_[0]{_parse} || do { |
81
|
15
|
|
|
|
|
28
|
my $self= shift; |
82
|
15
|
50
|
|
|
|
36
|
if (defined $self->{notation}) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
83
|
15
|
|
|
|
|
52
|
$self->{_parse}= $self->parse($self->{notation}); |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
elsif ($self->{members}) { |
86
|
0
|
|
|
|
|
0
|
$self->{_parse}{codepoints}= [ map ord, @{$self->{members}} ]; |
|
0
|
|
|
|
|
0
|
|
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
elsif (my $inv= $self->{member_invlist}) { |
89
|
0
|
|
|
|
|
0
|
my $i; |
90
|
0
|
|
|
|
|
0
|
for ($i= 0; $i < $#$inv; $i+= 2) { |
91
|
0
|
0
|
|
|
|
0
|
if ($inv->[$i] + 1 == $inv->[$i+1]) { push @{$self->{_parse}{codepoints}}, $inv->[$i] } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
92
|
0
|
|
|
|
|
0
|
else { push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], $inv->[$i+1] - 1; } |
|
0
|
|
|
|
|
0
|
|
93
|
|
|
|
|
|
|
} |
94
|
0
|
0
|
|
|
|
0
|
if ($i == $#$inv) { |
95
|
0
|
|
0
|
|
|
0
|
push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], ($self->max_codepoint || 0x10FFFF); |
|
0
|
|
|
|
|
0
|
|
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
} |
98
|
0
|
|
|
|
|
0
|
else { die "Unhandled lazy-build scenario" } |
99
|
15
|
|
|
|
|
53
|
$self->{_parse}; |
100
|
|
|
|
|
|
|
}; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub notation { |
105
|
14
|
|
66
|
14
|
1
|
1286
|
$_[0]{notation} //= _deparse_charset($_[0]->_parse); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub min_codepoint { |
110
|
112
|
50
|
|
112
|
1
|
202
|
$_[0]{min_codepoint}= $_[1] if @_ > 1; |
111
|
|
|
|
|
|
|
$_[0]{min_codepoint} |
112
|
112
|
|
|
|
|
243
|
} |
113
|
|
|
|
|
|
|
sub max_codepoint { |
114
|
|
|
|
|
|
|
$_[0]{max_codepoint} |
115
|
146
|
|
|
146
|
1
|
288
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub str_len { |
119
|
112
|
50
|
|
112
|
1
|
231
|
$_[0]{str_len}= $_[1] if @_ > 1; |
120
|
112
|
|
|
|
|
293
|
$_[0]{str_len}; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
sub count { |
125
|
861
|
100
|
|
861
|
1
|
296991
|
$_[0]{members}? scalar @{$_[0]{members}} |
|
83
|
|
|
|
|
179
|
|
126
|
|
|
|
|
|
|
: $_[0]->_invlist_index->[-1]; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub members { |
131
|
103
|
|
66
|
103
|
1
|
313
|
$_[0]{members} ||= $_[0]->_build_members; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sub _build_members { |
135
|
20
|
|
|
20
|
|
34
|
my $self= shift; |
136
|
20
|
|
|
|
|
42
|
my $invlist= $self->member_invlist; |
137
|
20
|
|
|
|
|
33
|
my @members; |
138
|
20
|
50
|
|
|
|
55
|
if (@$invlist > 1) { |
139
|
|
|
|
|
|
|
push @members, map chr, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1) |
140
|
20
|
|
|
|
|
472
|
for 0 .. (($#$invlist-1)>>1); |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
# an odd number of elements means the list ends with an "include-all" |
143
|
20
|
50
|
|
|
|
63
|
push @members, map chr, $invlist->[-1] .. 0x10FFFF |
144
|
|
|
|
|
|
|
if 1 & @$invlist; |
145
|
20
|
|
|
|
|
75
|
return \@members; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::expand_invlist { |
149
|
6
|
|
|
6
|
|
10263
|
my $invlist= shift; |
150
|
6
|
|
|
|
|
11
|
my @members; |
151
|
6
|
100
|
|
|
|
19
|
if (@$invlist > 1) { |
152
|
|
|
|
|
|
|
push @members, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1) |
153
|
5
|
|
|
|
|
32
|
for 0 .. (($#$invlist-1)>>1); |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
# an odd number of elements means the list ends with an "include-all" |
156
|
6
|
100
|
|
|
|
19
|
push @members, $invlist->[-1] .. 0x10FFFF |
157
|
|
|
|
|
|
|
if 1 & @$invlist; |
158
|
6
|
|
|
|
|
14
|
return \@members; |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# The index is private because there's not a good way to explain it to the user |
162
|
|
|
|
|
|
|
sub _invlist_index { |
163
|
2329
|
|
|
2329
|
|
3427
|
my $self= shift; |
164
|
2329
|
|
66
|
|
|
6623
|
$self->{_invlist_index} ||= Mock::Data::Charset::Util::create_invlist_index($self->member_invlist); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::create_invlist_index { |
168
|
24
|
|
|
24
|
|
5012
|
my $invlist= shift; |
169
|
24
|
|
|
|
|
59
|
my $n_spans= (@$invlist + 1) >> 1; |
170
|
24
|
|
|
|
|
32
|
my @index; |
171
|
24
|
|
|
|
|
87
|
$#index= $n_spans-1; |
172
|
24
|
|
|
|
|
39
|
my $total= 0; |
173
|
|
|
|
|
|
|
$index[$_]= $total += $invlist->[$_*2+1] - $invlist->[$_*2] |
174
|
24
|
|
|
|
|
312
|
for 0 .. (@$invlist >> 1)-1; |
175
|
24
|
100
|
|
|
|
83
|
if (@$invlist & 1) { # In the case that the final range is infinite |
176
|
6
|
|
|
|
|
19
|
$index[$n_spans-1]= $total + 0x110000 - $invlist->[-1]; |
177
|
|
|
|
|
|
|
} |
178
|
24
|
|
|
|
|
117
|
\@index; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub member_invlist { |
183
|
1605
|
50
|
|
1605
|
1
|
3336
|
if (@_ > 1) { |
184
|
0
|
|
|
|
|
0
|
$_[0]{member_invlist}= $_[1]; |
185
|
0
|
|
|
|
|
0
|
delete $_[0]{_invlist_index}; |
186
|
0
|
|
|
|
|
0
|
delete $_[0]{members}; |
187
|
0
|
|
|
|
|
0
|
delete $_[0]{notation}; |
188
|
|
|
|
|
|
|
} |
189
|
1605
|
|
66
|
|
|
4013
|
$_[0]{member_invlist} //= _build_member_invlist(@_); |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub _build_member_invlist { |
193
|
34
|
|
|
34
|
|
63
|
my $self= shift; |
194
|
34
|
|
|
|
|
83
|
my $max_codepoint= $self->max_codepoint; |
195
|
|
|
|
|
|
|
# If the search space is small, and there is already a regex notation, it is probably faster |
196
|
|
|
|
|
|
|
# to iterate and let perl do the work than to parse the charset. |
197
|
34
|
|
|
|
|
51
|
my $invlist; |
198
|
34
|
100
|
66
|
|
|
216
|
if (!defined $max_codepoint || $max_codepoint > 1000 || !defined $self->{notation}) { |
|
|
|
100
|
|
|
|
|
199
|
25
|
|
100
|
|
|
110
|
$max_codepoint ||= 0x10FFFF; |
200
|
25
|
|
|
|
|
45
|
$invlist= eval { |
201
|
25
|
|
|
|
|
65
|
_parsed_charset_to_invlist($self->_parse, $max_codepoint); |
202
|
|
|
|
|
|
|
}# or main::diag $@ |
203
|
|
|
|
|
|
|
} |
204
|
34
|
|
66
|
|
|
169
|
$invlist ||= _charset_invlist_brute_force($self->notation, $max_codepoint); |
205
|
|
|
|
|
|
|
# If a user writes to the invlist, it will become out of sync with the Index, |
206
|
|
|
|
|
|
|
# leading to confusing bugs. |
207
|
34
|
50
|
|
|
|
241
|
if (Internals->can('SvREADONLY')) { |
208
|
34
|
|
|
|
|
446
|
Internals::SvREADONLY($_,1) for @$invlist; |
209
|
34
|
|
|
|
|
80
|
Internals::SvREADONLY(@$invlist,1); |
210
|
|
|
|
|
|
|
} |
211
|
34
|
|
|
|
|
171
|
return $invlist; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# Lazy-built string of all basic-multilingual-plane characters |
215
|
|
|
|
|
|
|
our $_ascii_chars; |
216
|
|
|
|
|
|
|
our $_unicode_chars; |
217
|
|
|
|
|
|
|
sub _build_unicode_chars { |
218
|
1
|
50
|
|
1
|
|
5
|
unless (defined $_unicode_chars) { |
219
|
|
|
|
|
|
|
# Construct ranges of valid characters separated by NUL. |
220
|
|
|
|
|
|
|
# Older perls die when the regex engine encounters an invalid character |
221
|
|
|
|
|
|
|
# but newer perls just treat the invalid character as "not a member", |
222
|
|
|
|
|
|
|
# unless the set is a negation in which case non-characters *are* a member. |
223
|
|
|
|
|
|
|
# This makes the assumption that if a non-char isn't a member then \0 won't |
224
|
|
|
|
|
|
|
# be either. |
225
|
1
|
|
|
|
|
3
|
$_unicode_chars= ''; |
226
|
1
|
|
|
|
|
10820
|
$_unicode_chars .= chr($_) for 0 .. 0xD7FF; |
227
|
1
|
|
|
|
|
12
|
$_unicode_chars .= "\0"; |
228
|
1
|
|
|
|
|
109
|
$_unicode_chars .= chr($_) for 0xFDF0 .. 0xFFFD; |
229
|
1
|
|
|
|
|
4
|
for (1..16) { |
230
|
16
|
|
|
|
|
177
|
$_unicode_chars .= "\0"; |
231
|
16
|
|
|
|
|
215436
|
$_unicode_chars .= chr($_) for ($_<<16) .. (($_<<16)|0xFFFD); |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
} |
234
|
1
|
|
|
|
|
9
|
\$_unicode_chars; |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub _charset_invlist_brute_force { |
238
|
9
|
|
|
9
|
|
26
|
my ($set, $max_codepoint)= @_; |
239
|
9
|
100
|
|
|
|
41
|
my $inv= (ord $set == ord '^')? substr($set,1) : '^'.$set; |
240
|
9
|
|
|
|
|
13
|
my @invlist; |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# optimize common case |
243
|
9
|
100
|
|
|
|
28
|
if ($max_codepoint < 256) { |
244
|
|
|
|
|
|
|
# Find first character of every match and first character of every non-match |
245
|
|
|
|
|
|
|
# and convert to codepoints. |
246
|
8
|
100
|
66
|
|
|
438
|
@invlist= map +(defined $_? ord($_) : ()), |
247
|
|
|
|
|
|
|
($_ascii_chars //= join('', map chr($_), 0..255)) |
248
|
|
|
|
|
|
|
=~ / ( [$set] ) (?> [$set]* ) (?: \z | ( [$inv] ) (?> [$inv]* ) )/gx; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
else { |
251
|
1
|
50
|
|
|
|
7
|
_build_unicode_chars() unless defined $_unicode_chars; |
252
|
|
|
|
|
|
|
# This got complicated while trying to support perls that can't match against non-characters. |
253
|
|
|
|
|
|
|
# The non-characters have been replaced by NULs, so need to capture the char before and after |
254
|
|
|
|
|
|
|
# each transition in case one of them is a NUL. |
255
|
1
|
50
|
|
|
|
5924
|
my @endpoints= |
256
|
|
|
|
|
|
|
($max_codepoint < 0x10FFFF? substr($_unicode_chars,0,$max_codepoint+1) : $_unicode_chars) |
257
|
|
|
|
|
|
|
=~ /( [$set] ) ( [$set] )* ( \z | [$inv] ) ( [$inv] )* /gx; |
258
|
1
|
50
|
|
|
|
9
|
if (@endpoints) { |
259
|
|
|
|
|
|
|
# List is a multiple of 4 elements: (first-member,last-member,first-non-member,last-non-member) |
260
|
|
|
|
|
|
|
# We're not interested in the span of non-members at the end, so just remove those. |
261
|
1
|
|
|
|
|
2
|
pop @endpoints; pop @endpoints; |
|
1
|
|
|
|
|
3
|
|
262
|
|
|
|
|
|
|
# Iterate every transition of member/nonmember, and use the second character if present |
263
|
|
|
|
|
|
|
# and isn't a NUL, else use the first character and add 1. |
264
|
1
|
|
|
|
|
7
|
push @invlist, ord $endpoints[0]; |
265
|
1
|
|
|
|
|
6
|
for (my $i= 1; $i < @endpoints; $i+= 2) { |
266
|
19
|
100
|
66
|
|
|
64
|
if (defined $endpoints[$i+1] && ord $endpoints[$i+1]) { |
|
|
50
|
|
|
|
|
|
267
|
18
|
|
|
|
|
41
|
push @invlist, ord $endpoints[$i+1]; |
268
|
|
|
|
|
|
|
} elsif (defined $endpoints[$i]) { |
269
|
1
|
|
|
|
|
5
|
push @invlist, 1 + ord $endpoints[$i]; |
270
|
|
|
|
|
|
|
} else { |
271
|
0
|
|
|
|
|
0
|
push @invlist, 1 + $invlist[-1]; |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
# substr is an estimate, because string skips characters, so remove any spurrous |
275
|
|
|
|
|
|
|
# codepoints beyond the max |
276
|
1
|
|
66
|
|
|
12
|
pop @invlist while @invlist && $invlist[-1] > $max_codepoint; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
# If an "infinite" range would be returned, but the user set a maximum codepoint, |
280
|
|
|
|
|
|
|
# list the max codepoint as the end of the invlist. |
281
|
9
|
100
|
66
|
|
|
916
|
if ($max_codepoint < 0x10FFFF and 1 & @invlist) { |
282
|
4
|
|
|
|
|
11
|
push @invlist, $max_codepoint+1; |
283
|
|
|
|
|
|
|
} |
284
|
9
|
|
|
|
|
41
|
return \@invlist; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
sub _parsed_charset_to_invlist { |
288
|
25
|
|
|
25
|
|
92
|
my ($parse, $max_codepoint)= @_; |
289
|
25
|
|
|
|
|
41
|
my @invlists; |
290
|
|
|
|
|
|
|
# convert the character list into an inversion list |
291
|
25
|
100
|
|
|
|
72
|
if (defined (my $cp= $parse->{codepoints})) { |
292
|
5
|
|
|
|
|
22
|
my @chars= sort { $a <=> $b } @$cp; |
|
6
|
|
|
|
|
18
|
|
293
|
5
|
|
|
|
|
13
|
my @invlist= (shift @chars); |
294
|
5
|
|
|
|
|
11
|
push @invlist, $invlist[0] + 1; |
295
|
5
|
|
|
|
|
19
|
for (my $i= 0; $i <= $#chars; $i++) { |
296
|
|
|
|
|
|
|
# If the next char is adjacent, extend the span |
297
|
5
|
100
|
|
|
|
20
|
if ($invlist[-1] == $chars[$i]) { |
298
|
2
|
|
|
|
|
7
|
++$invlist[-1]; |
299
|
|
|
|
|
|
|
} else { |
300
|
3
|
|
|
|
|
9
|
push @invlist, $chars[$i], $chars[$i]+1; |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
} |
303
|
5
|
|
|
|
|
16
|
push @invlists, \@invlist; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
# Each range is an inversion list already |
306
|
25
|
100
|
|
|
|
67
|
if (my $r= $parse->{codepoint_ranges}) { |
307
|
9
|
|
|
|
|
32
|
for (my $i= 0; $i < (@$r >> 1); $i++) { |
308
|
10
|
|
|
|
|
32
|
my ($start, $limit)= ($r->[$i*2], $r->[$i*2+1]+1); |
309
|
|
|
|
|
|
|
# Try to combine the range with the most recent inversion list, if possible, |
310
|
10
|
100
|
66
|
|
|
54
|
if (@invlists && $invlists[-1][-1] < $start) { |
|
|
50
|
33
|
|
|
|
|
311
|
1
|
|
|
|
|
4
|
push @{ $invlists[-1] }, $start, $limit; |
|
1
|
|
|
|
|
5
|
|
312
|
|
|
|
|
|
|
} elsif (@invlists && $invlists[-1][0] > $limit) { |
313
|
0
|
|
|
|
|
0
|
unshift @{ $invlists[-1] }, $start, $limit; |
|
0
|
|
|
|
|
0
|
|
314
|
|
|
|
|
|
|
} else { |
315
|
|
|
|
|
|
|
# else just start a new inversion list |
316
|
9
|
|
|
|
|
34
|
push @invlists, [ $start, $limit ] |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
# Convert each character class to an inversion list. |
321
|
25
|
100
|
|
|
|
59
|
if ($parse->{classes}) { |
322
|
|
|
|
|
|
|
push @invlists, _class_invlist($_) |
323
|
11
|
|
|
|
|
35
|
for @{ $parse->{classes} }; |
|
11
|
|
|
|
|
44
|
|
324
|
|
|
|
|
|
|
} |
325
|
25
|
|
|
|
|
2417
|
my $invlist= Mock::Data::Charset::Util::merge_invlists(\@invlists, $max_codepoint); |
326
|
|
|
|
|
|
|
# Perform negation of inversion list by either starting at char 0 or removing char 0 |
327
|
25
|
100
|
|
|
|
74
|
if ($parse->{negate}) { |
328
|
1
|
50
|
|
|
|
4
|
if ($invlist->[0]) { unshift @$invlist, 0 } |
|
1
|
|
|
|
|
4
|
|
329
|
0
|
|
|
|
|
0
|
else { shift @$invlist; } |
330
|
|
|
|
|
|
|
} |
331
|
25
|
|
|
|
|
140
|
return $invlist; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
our $_compile; |
336
|
|
|
|
|
|
|
sub compile { |
337
|
3
|
|
|
3
|
1
|
20
|
local $_compile= 1; |
338
|
3
|
|
|
|
|
12
|
shift->generate(@_); |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
sub generate { |
341
|
112
|
|
|
112
|
1
|
207
|
my ($self, $mock)= (shift, shift); |
342
|
112
|
|
|
|
|
230
|
my ($len, $cp_min, $cp_max, $member_count) |
343
|
|
|
|
|
|
|
= ($self->str_len, $self->min_codepoint, $self->max_codepoint, $self->count); |
344
|
112
|
100
|
|
|
|
245
|
if (@_) { |
345
|
82
|
100
|
|
|
|
187
|
my %opts= ref $_[0] eq 'HASH'? %{ shift() } : (); |
|
79
|
|
|
|
|
302
|
|
346
|
82
|
100
|
33
|
|
|
211
|
$len= @_? shift : $opts{str_len} // $opts{len} // $opts{size}; # allow some aliases for length |
|
|
|
33
|
|
|
|
|
347
|
82
|
|
66
|
|
|
243
|
$cp_min= $opts{min_codepoint} // $cp_min; |
348
|
82
|
|
66
|
|
|
241
|
$cp_max= $opts{max_codepoint} // $cp_max; |
349
|
|
|
|
|
|
|
} |
350
|
112
|
100
|
100
|
|
|
415
|
my ($memb_min, $memb_span)= !defined $cp_min && !defined $cp_max? (0,$member_count) |
351
|
|
|
|
|
|
|
: $self->_codepoint_minmax_to_member_range($cp_min, $cp_max); |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# If compiling, $len will be a function, else it will be an integer |
354
|
0
|
|
|
0
|
|
0
|
$len= !defined $len? ($_compile? sub { 1 } : 1 ) |
355
|
0
|
|
|
0
|
|
0
|
: !ref $len? ($_compile? sub { $len } : $len) |
356
|
|
|
|
|
|
|
: ref $len eq 'ARRAY'? ( |
357
|
0
|
|
|
0
|
|
0
|
$_compile? sub { $len->[0] + int rand ($len->[1] - $len->[0]) } |
358
|
112
|
50
|
|
|
|
353
|
: $len->[0] + int rand ($len->[1] - $len->[0]) |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
359
|
|
|
|
|
|
|
) |
360
|
|
|
|
|
|
|
: ref $len eq 'CODE'? ($_compile? $len : $len->($mock)) |
361
|
|
|
|
|
|
|
: Carp::croak("Unknown str_len specification '$len'"); |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# If member list is small-ish, use faster direct array access |
364
|
112
|
100
|
100
|
|
|
367
|
if ($self->{members} || $member_count < 500) { |
365
|
96
|
|
|
|
|
187
|
my $members= $self->members; |
366
|
|
|
|
|
|
|
return sub { |
367
|
3
|
|
|
3
|
|
9
|
my $buf= ''; |
368
|
|
|
|
|
|
|
$buf .= $members->[$memb_min + int rand $memb_span] |
369
|
3
|
|
|
|
|
10
|
for 1..$len->($_[0]); |
370
|
3
|
|
|
|
|
35
|
return $buf; |
371
|
96
|
100
|
|
|
|
218
|
} if $_compile; |
372
|
93
|
|
|
|
|
150
|
my $buf= ''; |
373
|
|
|
|
|
|
|
$buf .= $members->[$memb_min + int rand $memb_span] |
374
|
93
|
|
|
|
|
433
|
for 1..$len; |
375
|
93
|
|
|
|
|
402
|
return $buf; |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
else { |
378
|
16
|
|
|
|
|
34
|
my $invlist= $self->member_invlist; |
379
|
16
|
|
|
|
|
27
|
my $index= $self->_invlist_index; |
380
|
|
|
|
|
|
|
return sub { |
381
|
0
|
|
|
0
|
|
0
|
my $ret= ''; |
382
|
|
|
|
|
|
|
$ret .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index) |
383
|
0
|
|
|
|
|
0
|
for 1..$len->($_[0]); |
384
|
16
|
50
|
|
|
|
38
|
} if $_compile; |
385
|
16
|
|
|
|
|
27
|
my $buf= ''; |
386
|
|
|
|
|
|
|
$buf .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index) |
387
|
16
|
|
|
|
|
62
|
for 1..$len; |
388
|
16
|
|
|
|
|
66
|
return $buf; |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
sub _codepoint_minmax_to_member_range { |
393
|
27
|
|
|
27
|
|
42
|
my $self= shift; |
394
|
27
|
|
|
|
|
48
|
my ($cp_min, $cp_max)= @_; |
395
|
|
|
|
|
|
|
my $memb_min= !defined $cp_min? 0 |
396
|
27
|
100
|
|
|
|
53
|
: do { |
397
|
6
|
|
|
|
|
15
|
my ($at, $ins)= _find_invlist_element($cp_min, $self->member_invlist, $self->_invlist_index); |
398
|
6
|
|
33
|
|
|
19
|
$at // $ins |
399
|
|
|
|
|
|
|
}; |
400
|
|
|
|
|
|
|
my $memb_lim= !defined $cp_max? $self->count |
401
|
27
|
50
|
|
|
|
50
|
: do { |
402
|
27
|
|
|
|
|
57
|
my ($at, $ins)= _find_invlist_element($cp_max, $self->member_invlist, $self->_invlist_index); |
403
|
27
|
100
|
|
|
|
68
|
defined $at? $at + 1 : $ins; |
404
|
|
|
|
|
|
|
}; |
405
|
27
|
|
|
|
|
65
|
return ($memb_min, $memb_lim-$memb_min); |
406
|
|
|
|
|
|
|
} |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
sub parse { |
410
|
31
|
|
|
31
|
1
|
22332
|
my ($self, $notation)= @_; |
411
|
31
|
50
|
|
|
|
100
|
return { codepoints => [] } unless length $notation; |
412
|
31
|
50
|
|
|
|
123
|
return { classes => ['All'] } if $notation eq '^'; |
413
|
31
|
|
|
|
|
75
|
$notation .= ']'; |
414
|
|
|
|
|
|
|
# parse function needs $_ to be the input string |
415
|
31
|
|
|
|
|
98
|
pos($notation)= 0; |
416
|
31
|
|
|
|
|
140
|
return _parse_charset() for $notation; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
our $have_prop_invlist; |
420
|
|
|
|
|
|
|
our %_parse_charset_backslash= ( |
421
|
|
|
|
|
|
|
a => ord "\a", |
422
|
|
|
|
|
|
|
b => ord "\b", |
423
|
|
|
|
|
|
|
c => sub { die "Unimplemented: \\c" }, |
424
|
|
|
|
|
|
|
d => sub { push @{$_[0]{classes}}, 'digit'; undef; }, |
425
|
|
|
|
|
|
|
D => sub { push @{$_[0]{classes}}, '^digit'; undef; }, |
426
|
|
|
|
|
|
|
e => ord "\e", |
427
|
|
|
|
|
|
|
f => ord "\f", |
428
|
|
|
|
|
|
|
h => sub { push @{$_[0]{classes}}, 'horizspace'; undef; }, |
429
|
|
|
|
|
|
|
H => sub { push @{$_[0]{classes}}, '^horizspace'; undef; }, |
430
|
|
|
|
|
|
|
n => ord "\n", |
431
|
|
|
|
|
|
|
N => \&_parse_charset_namedchar, |
432
|
|
|
|
|
|
|
o => \&_parse_charset_oct, |
433
|
|
|
|
|
|
|
p => \&_parse_charset_classname, |
434
|
|
|
|
|
|
|
P => sub { _parse_charset_classname(shift, 1) }, |
435
|
|
|
|
|
|
|
r => ord "\r", |
436
|
|
|
|
|
|
|
s => sub { push @{$_[0]{classes}}, 'space'; undef; }, |
437
|
|
|
|
|
|
|
S => sub { push @{$_[0]{classes}}, '^space'; undef; }, |
438
|
|
|
|
|
|
|
t => ord "\t", |
439
|
|
|
|
|
|
|
v => sub { push @{$_[0]{classes}}, 'vertspace'; undef; }, |
440
|
|
|
|
|
|
|
V => sub { push @{$_[0]{classes}}, '^vertspace'; undef; }, |
441
|
|
|
|
|
|
|
w => sub { push @{$_[0]{classes}}, 'word'; undef; }, |
442
|
|
|
|
|
|
|
W => sub { push @{$_[0]{classes}}, '^word'; undef; }, |
443
|
|
|
|
|
|
|
x => \&_parse_charset_hex, |
444
|
|
|
|
|
|
|
0 => \&_parse_charset_oct, |
445
|
|
|
|
|
|
|
1 => \&_parse_charset_oct, |
446
|
|
|
|
|
|
|
2 => \&_parse_charset_oct, |
447
|
|
|
|
|
|
|
3 => \&_parse_charset_oct, |
448
|
|
|
|
|
|
|
4 => \&_parse_charset_oct, |
449
|
|
|
|
|
|
|
5 => \&_parse_charset_oct, |
450
|
|
|
|
|
|
|
6 => \&_parse_charset_oct, |
451
|
|
|
|
|
|
|
7 => \&_parse_charset_oct, |
452
|
|
|
|
|
|
|
8 => \&_parse_charset_oct, |
453
|
|
|
|
|
|
|
9 => \&_parse_charset_oct, |
454
|
|
|
|
|
|
|
); |
455
|
|
|
|
|
|
|
our %_class_invlist_cache= ( |
456
|
|
|
|
|
|
|
'Any' => [ 0 ], |
457
|
|
|
|
|
|
|
'\\N' => [ 0, ord("\n"), 1+ord("\n") ], |
458
|
|
|
|
|
|
|
); |
459
|
|
|
|
|
|
|
sub _class_invlist { |
460
|
22
|
|
|
22
|
|
10809
|
my $class= shift; |
461
|
22
|
100
|
|
|
|
51
|
if (ord $class == ord '^') { |
462
|
5
|
|
|
|
|
25
|
return Mock::Data::Charset::Util::negate_invlist( |
463
|
|
|
|
|
|
|
_class_invlist(substr($class,1)) |
464
|
|
|
|
|
|
|
); |
465
|
|
|
|
|
|
|
} |
466
|
17
|
|
66
|
|
|
82
|
return $_class_invlist_cache{$class} ||= do { |
467
|
8
|
100
|
|
|
|
26
|
$have_prop_invlist= do { require Unicode::UCD; !!Unicode::UCD->can('prop_invlist') } |
|
2
|
|
|
|
|
2673
|
|
|
2
|
|
|
|
|
46089
|
|
468
|
|
|
|
|
|
|
unless defined $have_prop_invlist; |
469
|
8
|
50
|
|
|
|
41
|
$have_prop_invlist? [ Unicode::UCD::prop_invlist($class) ] |
470
|
|
|
|
|
|
|
: _charset_invlist_brute_force("\\p{$class}", 0x10FFFF); |
471
|
|
|
|
|
|
|
}; |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
sub _parse_charset_hex { |
474
|
3
|
50
|
|
3
|
|
17
|
/\G( [0-9A-Fa-f]{2} | \{ ([0-9A-Fa-f]+) \} )/gcx |
475
|
|
|
|
|
|
|
or die "Invalid hex escape at "._parse_context; |
476
|
3
|
100
|
|
|
|
18
|
return hex(defined $2? $2 : $1); |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
sub _parse_charset_oct { |
479
|
5
|
|
|
5
|
|
17
|
--pos; # The caller ate one of the characters we need to parse |
480
|
5
|
50
|
|
|
|
31
|
/\G( [0-7]{3} | 0 | o\{ ([0-7]+) \} ) /gcx |
481
|
|
|
|
|
|
|
or die "Invalid octal escape at "._parse_context; |
482
|
5
|
100
|
|
|
|
30
|
return oct(defined $2? $2 : $1); |
483
|
|
|
|
|
|
|
} |
484
|
|
|
|
|
|
|
sub _parse_charset_namedchar { |
485
|
2
|
|
|
2
|
|
1245
|
require charnames; |
486
|
2
|
50
|
|
|
|
12322
|
/\G \{ ([^}]+) \} /gcx |
487
|
|
|
|
|
|
|
# or die "Invalid named char following \\N at '".substr($_,pos,10)."'"; |
488
|
|
|
|
|
|
|
and return charnames::vianame($1); |
489
|
|
|
|
|
|
|
# Plain "\N" means every character except \n |
490
|
0
|
|
|
|
|
0
|
push @{ $_[0]{classes} }, '\\N'; |
|
0
|
|
|
|
|
0
|
|
491
|
0
|
|
|
|
|
0
|
return; |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
sub _parse_charset_classname { |
494
|
7
|
|
|
7
|
|
20
|
my ($result, $negate)= @_; |
495
|
7
|
50
|
|
|
|
39
|
/\G \{ ([^}]+) \} /gcx |
496
|
|
|
|
|
|
|
or die "Invalid class name following \\p at "._parse_context; |
497
|
7
|
100
|
|
|
|
14
|
push @{$result->{classes}}, lc($negate? "^$1" : $1); |
|
7
|
|
|
|
|
39
|
|
498
|
|
|
|
|
|
|
undef |
499
|
7
|
|
|
|
|
18
|
} |
500
|
|
|
|
|
|
|
sub _parse_charset { |
501
|
36
|
|
|
36
|
|
68
|
my $flags= shift; |
502
|
|
|
|
|
|
|
# argument is in $_, starting from pos($_) |
503
|
36
|
|
|
|
|
65
|
my %parse; |
504
|
|
|
|
|
|
|
my @range; |
505
|
36
|
|
|
|
|
109
|
$parse{codepoints}= \my @chars; |
506
|
36
|
100
|
|
|
|
139
|
$parse{negate}= 1 if /\G \^ /gcx; |
507
|
36
|
50
|
|
|
|
97
|
if (/\G]/gc) { push @chars, ord ']' } |
|
0
|
|
|
|
|
0
|
|
508
|
36
|
|
|
|
|
64
|
while (1) { |
509
|
121
|
|
|
|
|
155
|
my $cp; # literal codepoint to be added |
510
|
|
|
|
|
|
|
# Check for special cases |
511
|
121
|
100
|
50
|
|
|
490
|
if (/\G ( \\ | - | \[: | \] ) /gcx) { |
|
|
50
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
512
|
78
|
100
|
|
|
|
288
|
if ($1 eq '\\') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
513
|
19
|
50
|
|
|
|
66
|
/\G(.)/gc or die "Unexpected end of input"; |
514
|
19
|
|
33
|
|
|
76
|
$cp= $_parse_charset_backslash{$1} || ord $1; |
515
|
19
|
100
|
|
|
|
99
|
$cp= $cp->(\%parse) |
516
|
|
|
|
|
|
|
if ref $cp; |
517
|
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
elsif ($1 eq '-') { |
519
|
18
|
100
|
|
|
|
48
|
if (@range == 1) { |
520
|
17
|
|
|
|
|
31
|
push @range, ord '-'; |
521
|
17
|
|
|
|
|
38
|
next; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
else { |
524
|
1
|
|
|
|
|
4
|
$cp= ord '-'; |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
elsif ($1 eq '[:') { |
528
|
5
|
50
|
|
|
|
34
|
/\G ( [^:]+ ) :] /gcx |
529
|
|
|
|
|
|
|
or die "Invalid character class at "._parse_context; |
530
|
5
|
|
|
|
|
12
|
push @{$parse{classes}}, $1; |
|
5
|
|
|
|
|
24
|
|
531
|
|
|
|
|
|
|
} |
532
|
|
|
|
|
|
|
else { |
533
|
36
|
|
|
|
|
112
|
last; # $1 eq ']'; |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
elsif ($flags && ($flags->{x}||0) >= 2 && /\G[ \t]/gc) { |
537
|
0
|
|
|
|
|
0
|
next; # ignore space and tab under /xx |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
else { |
540
|
43
|
50
|
|
|
|
130
|
/\G(.)/gc or die "Unexpected end of input"; |
541
|
43
|
|
|
|
|
81
|
$cp= ord $1; |
542
|
|
|
|
|
|
|
} |
543
|
|
|
|
|
|
|
# If no single character was found, any range-in-progress needs converted to |
544
|
|
|
|
|
|
|
# charcters |
545
|
68
|
100
|
|
|
|
13418
|
if (!defined $cp) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
546
|
16
|
|
|
|
|
34
|
push @chars, @range; |
547
|
16
|
|
|
|
|
25
|
@range= (); |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
# At this point, $cp will contain the next ordinal of the character to include, |
550
|
|
|
|
|
|
|
# but it might also be starting or finishing a range. |
551
|
|
|
|
|
|
|
elsif (@range == 1) { |
552
|
8
|
|
|
|
|
18
|
push @chars, $range[0]; |
553
|
8
|
|
|
|
|
16
|
$range[0]= $cp; |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
elsif (@range == 2) { |
556
|
16
|
|
|
|
|
22
|
push @{$parse{codepoint_ranges}}, $range[0], $cp; |
|
16
|
|
|
|
|
46
|
|
557
|
16
|
|
|
|
|
31
|
@range= (); |
558
|
|
|
|
|
|
|
} |
559
|
|
|
|
|
|
|
else { |
560
|
28
|
|
|
|
|
62
|
push @range, $cp; |
561
|
|
|
|
|
|
|
} |
562
|
|
|
|
|
|
|
#printf "# pos %d cp %d range %s %s include %s\n", pos $_, $cp, $range[0] // '(null)', $range[1] // '(null)', join(',', @include); |
563
|
|
|
|
|
|
|
} |
564
|
36
|
|
|
|
|
68
|
push @chars, @range; |
565
|
36
|
100
|
|
|
|
89
|
if (@chars) { |
566
|
13
|
|
|
|
|
65
|
@chars= sort { $a <=> $b } @chars; |
|
11
|
|
|
|
|
31
|
|
567
|
|
|
|
|
|
|
} else { |
568
|
23
|
|
|
|
|
48
|
delete $parse{codepoints}; |
569
|
|
|
|
|
|
|
} |
570
|
36
|
|
|
|
|
187
|
return \%parse; |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
sub _ord_to_safe_regex_char { |
574
|
3
|
0
|
|
3
|
|
33
|
return chr($_[0]) =~ /[\w]/? chr $_[0] |
|
|
50
|
|
|
|
|
|
575
|
|
|
|
|
|
|
: $_[0] <= 0xFF? sprintf('\x%02X',$_[0]) |
576
|
|
|
|
|
|
|
: sprintf('\x{%X}',$_[0]) |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
sub _deparse_charset { |
580
|
1
|
|
|
1
|
|
2
|
my $parse= shift; |
581
|
1
|
|
|
|
|
3
|
my $str= ''; |
582
|
1
|
50
|
|
|
|
5
|
if (my $cp= $parse->{codepoints}) { |
583
|
|
|
|
|
|
|
$str .= _ord_to_safe_regex_char($_) |
584
|
1
|
|
|
|
|
5
|
for @$cp; |
585
|
|
|
|
|
|
|
} |
586
|
1
|
50
|
|
|
|
4
|
if (my $r= $parse->{codepoint_ranges}) { |
587
|
0
|
|
|
|
|
0
|
for (my $i= 0; $i < (@$r << 1); $i++) { |
588
|
0
|
|
|
|
|
0
|
$str .= _ord_to_safe_regex_char($r->[$i*2]) . '-' . _ord_to_safe_regex_char($r->[$i*2+1]); |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
} |
591
|
1
|
50
|
|
|
|
5
|
if (my $cl= $parse->{classes}) { |
592
|
|
|
|
|
|
|
# TODO: reverse conversions to \h \v etc. |
593
|
0
|
|
|
|
|
0
|
for (@$cl) { |
594
|
0
|
0
|
|
|
|
0
|
$str .= $_ eq '\N'? '\0-\x09\x0B-\x{10FFFF}' |
|
|
0
|
|
|
|
|
|
595
|
|
|
|
|
|
|
: ord == ord '^'? '\P{'.substr($_,1).'}' |
596
|
|
|
|
|
|
|
: '\p{'.$_.'}'; |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
} |
599
|
1
|
|
|
|
|
6
|
return $str; |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
sub get_member { |
604
|
749
|
100
|
|
749
|
1
|
3564
|
$_[0]{members}? $_[0]{members}[$_[1]] |
605
|
|
|
|
|
|
|
: chr _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index); |
606
|
|
|
|
|
|
|
} |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
sub get_member_codepoint { |
609
|
6
|
50
|
|
6
|
1
|
2687
|
$_[0]{members}? ord $_[0]{members}[$_[1]] |
610
|
|
|
|
|
|
|
: _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index); |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
sub _get_invlist_element { |
614
|
1108
|
|
|
1108
|
|
1977
|
my ($ofs, $invlist, $invlist_index)= @_; |
615
|
1108
|
50
|
|
|
|
2112
|
$ofs += @$invlist_index if $ofs < 0; |
616
|
1108
|
50
|
33
|
|
|
3944
|
return undef if $ofs >= $invlist_index->[-1] || $ofs < 0; |
617
|
|
|
|
|
|
|
# Binary Search to find the range that contains this numbered element |
618
|
1108
|
|
|
|
|
2072
|
my ($min, $max, $mid)= (0, $#$invlist_index); |
619
|
1108
|
|
|
|
|
1594
|
while (1) { |
620
|
5772
|
|
|
|
|
7211
|
$mid= ($min+$max) >> 1; |
621
|
5772
|
100
|
100
|
|
|
13656
|
if ($ofs >= $invlist_index->[$mid]) { |
|
|
100
|
|
|
|
|
|
622
|
2758
|
|
|
|
|
3469
|
$min= $mid+1 |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
elsif ($mid > 0 && $ofs < $invlist_index->[$mid-1]) { |
625
|
1906
|
|
|
|
|
2594
|
$max= $mid-1 |
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
else { |
628
|
1108
|
100
|
|
|
|
2127
|
$ofs -= $invlist_index->[$mid-1] if $mid > 0; |
629
|
1108
|
|
|
|
|
4343
|
return $invlist->[$mid*2] + $ofs; |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
} |
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
sub find_member { |
636
|
754
|
|
|
754
|
1
|
2673
|
my ($self, $char)= @_; |
637
|
754
|
|
|
|
|
1509
|
return _find_invlist_element(ord $char, $self->member_invlist, $self->_invlist_index); |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
sub _find_invlist_element { |
641
|
787
|
|
|
787
|
|
1491
|
my ($codepoint, $invlist, $index)= @_; |
642
|
|
|
|
|
|
|
# Binary Search to find the range that contains this numbered element |
643
|
787
|
|
|
|
|
1389
|
my ($min, $max, $mid)= (0, $#$invlist); |
644
|
787
|
|
|
|
|
1193
|
while (1) { |
645
|
5739
|
|
|
|
|
6960
|
$mid= ($min+$max) >> 1; |
646
|
5739
|
100
|
100
|
|
|
18815
|
if ($mid > 0 && $codepoint < $invlist->[$mid]) { |
|
|
100
|
100
|
|
|
|
|
647
|
2331
|
|
|
|
|
3019
|
$max= $mid-1 |
648
|
|
|
|
|
|
|
} |
649
|
|
|
|
|
|
|
elsif ($mid < $#$invlist && $codepoint >= $invlist->[$mid+1]) { |
650
|
2621
|
|
|
|
|
3630
|
$min= $mid+1; |
651
|
|
|
|
|
|
|
} |
652
|
|
|
|
|
|
|
else { |
653
|
787
|
100
|
|
|
|
1543
|
return (undef, 0) unless $codepoint >= $invlist->[$mid]; |
654
|
786
|
100
|
|
|
|
1454
|
return $codepoint - $invlist->[$mid] unless $mid > 0; |
655
|
778
|
100
|
|
|
|
4728
|
return $codepoint - $invlist->[$mid] + $index->[($mid >> 1) - 1] unless $mid & 1; |
656
|
|
|
|
|
|
|
# if $mid is an odd number, the range is excluded, and there is no match |
657
|
13
|
100
|
|
|
|
46
|
return undef unless wantarray; |
658
|
11
|
|
|
|
|
40
|
return (undef, $index->[($mid-1)>>1]) # return insertion point as second val |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
} |
661
|
|
|
|
|
|
|
} |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
sub negate { |
665
|
0
|
|
|
0
|
1
|
0
|
my $self= shift; |
666
|
0
|
|
|
|
|
0
|
my $neg= Mock::Data::Charset::Util::negate_invlist($self->member_invlist, $self->max_codepoint); |
667
|
0
|
|
|
|
|
0
|
return $self->new(member_invlist => $neg); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::negate_invlist { |
670
|
5
|
|
|
5
|
|
1246
|
my ($invlist, $max_codepoint)= @_; |
671
|
|
|
|
|
|
|
# Toggle first char of 0 |
672
|
5
|
50
|
|
|
|
247
|
$invlist= $invlist->[0]? [ 0, @$invlist ] : [ @{$invlist}[1..$#$invlist] ]; |
|
0
|
|
|
|
|
0
|
|
673
|
|
|
|
|
|
|
# If max_codepoint is defined, and was the final char, remove the range starting at max_codepoint+1 |
674
|
5
|
50
|
33
|
|
|
38
|
if (@$invlist & 1 and defined $max_codepoint and $invlist->[-1] == $max_codepoint+1) { |
|
|
|
33
|
|
|
|
|
675
|
0
|
|
|
|
|
0
|
pop @$invlist; |
676
|
|
|
|
|
|
|
} |
677
|
5
|
|
|
|
|
23
|
return $invlist; |
678
|
|
|
|
|
|
|
} |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
sub union { |
682
|
0
|
|
|
0
|
1
|
0
|
my $self= $_[0]; |
683
|
0
|
|
|
|
|
0
|
my @invlists= @_; |
684
|
|
|
|
|
|
|
ref eq 'ARRAY' || ($_= $_->member_invlist) |
685
|
0
|
|
0
|
|
|
0
|
for @invlists; |
686
|
0
|
|
|
|
|
0
|
my $combined= Mock::Data::Charset::Util::merge_invlists(\@invlists, $self->max_codepoint); |
687
|
0
|
|
|
|
|
0
|
return $self->new(member_invlist => $combined); |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
#=head2 merge_invlists |
691
|
|
|
|
|
|
|
# |
692
|
|
|
|
|
|
|
# my $combined_invlist= $charset->merge_invlist( \@list2, \@list3, ... ); |
693
|
|
|
|
|
|
|
# my $combined_invlist= merge_invlist( \@list1, \@list2, ... ); |
694
|
|
|
|
|
|
|
# |
695
|
|
|
|
|
|
|
#Merge one or more inversion lists into a superset of all of them. |
696
|
|
|
|
|
|
|
#If called as a method, the L is used as the first list. |
697
|
|
|
|
|
|
|
# |
698
|
|
|
|
|
|
|
#The return value is an inversion list, which can be wrapped in a Charset object by passing it |
699
|
|
|
|
|
|
|
#as the C attribute. |
700
|
|
|
|
|
|
|
# |
701
|
|
|
|
|
|
|
#The current L applies to the result. If called as a plain function, the |
702
|
|
|
|
|
|
|
#C is assumed to be the Unicode maximum of C<0x10FFFF>. |
703
|
|
|
|
|
|
|
# |
704
|
|
|
|
|
|
|
#=cut |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::merge_invlists { |
707
|
33
|
|
|
33
|
|
9948
|
my @invlists= @{shift()}; |
|
33
|
|
|
|
|
78
|
|
708
|
33
|
|
100
|
|
|
99
|
my $max_codepoint= shift // 0x10FFFF; |
709
|
|
|
|
|
|
|
|
710
|
33
|
50
|
|
|
|
80
|
return [] unless @invlists; |
711
|
33
|
100
|
|
|
|
135
|
return [@{$invlists[0]}] unless @invlists > 1; |
|
19
|
|
|
|
|
101
|
|
712
|
14
|
|
|
|
|
26
|
my @combined= (); |
713
|
|
|
|
|
|
|
# Repeatedly select the minimum range among the input lists and add it to the result |
714
|
14
|
|
|
|
|
42
|
my @pos= (0)x@invlists; |
715
|
14
|
|
|
|
|
35
|
while (@invlists) { |
716
|
3293
|
|
|
|
|
5552
|
my ($min_ch, $min_i)= ($invlists[0][$pos[0]], 0); |
717
|
|
|
|
|
|
|
# Find which inversion list contains the lowest range |
718
|
3293
|
|
|
|
|
5627
|
for (my $i= 1; $i < @invlists; $i++) { |
719
|
3284
|
100
|
|
|
|
7549
|
if ($invlists[$i][$pos[$i]] < $min_ch) { |
720
|
814
|
|
|
|
|
1168
|
$min_ch= $invlists[$i][$pos[$i]]; |
721
|
814
|
|
|
|
|
1472
|
$min_i= $i; |
722
|
|
|
|
|
|
|
} |
723
|
|
|
|
|
|
|
} |
724
|
3293
|
100
|
|
|
|
5427
|
last if $min_ch > $max_codepoint; |
725
|
|
|
|
|
|
|
# Check for overlap of this new inclusion range with the previous |
726
|
3291
|
100
|
100
|
|
|
8048
|
if (@combined && $combined[-1] >= $min_ch) { |
727
|
|
|
|
|
|
|
# they overlap, so just replace the end-codepoint of the range |
728
|
|
|
|
|
|
|
# if the new endpoint is larger |
729
|
3111
|
|
|
|
|
4492
|
my $new_end= $invlists[$min_i][$pos[$min_i]+1]; |
730
|
3111
|
100
|
100
|
|
|
8754
|
$combined[-1]= $new_end if !defined $new_end || $combined[-1] < $new_end; |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
else { |
733
|
|
|
|
|
|
|
# else, simply append the range |
734
|
180
|
|
|
|
|
287
|
push @combined, @{$invlists[$min_i]}[$pos[$min_i] .. $pos[$min_i]+1]; |
|
180
|
|
|
|
|
427
|
|
735
|
|
|
|
|
|
|
} |
736
|
|
|
|
|
|
|
# If the list is empty now, remove it from consideration |
737
|
3291
|
100
|
|
|
|
4524
|
if (($pos[$min_i] += 2) >= @{$invlists[$min_i]}) { |
|
3291
|
50
|
|
|
|
8620
|
|
738
|
21
|
|
|
|
|
36
|
splice @invlists, $min_i, 1; |
739
|
21
|
|
|
|
|
30
|
splice @pos, $min_i, 1; |
740
|
|
|
|
|
|
|
# If the invlist ends with an infinite range now, we are done |
741
|
21
|
100
|
|
|
|
53
|
if (!defined $combined[-1]) { |
742
|
6
|
|
|
|
|
12
|
pop @combined; |
743
|
6
|
|
|
|
|
17
|
last; |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
# If this is the only list remaining, append the rest and done |
747
|
|
|
|
|
|
|
elsif (@invlists == 1) { |
748
|
0
|
|
|
|
|
0
|
push @combined, @{$invlists[0]}[$pos[0] .. $#{$invlists[0]}]; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
749
|
0
|
|
|
|
|
0
|
last; |
750
|
|
|
|
|
|
|
} |
751
|
|
|
|
|
|
|
} |
752
|
14
|
|
|
|
|
34
|
while ($combined[-1] > $max_codepoint) { |
753
|
1
|
|
|
|
|
3
|
pop @combined; |
754
|
|
|
|
|
|
|
} |
755
|
|
|
|
|
|
|
# If the list ends with inclusion, and the max_codepoint is less than unicode max, |
756
|
|
|
|
|
|
|
# end the list with it. |
757
|
14
|
100
|
100
|
|
|
53
|
if (1 & @combined and $max_codepoint < 0x10FFFF) { |
758
|
1
|
|
|
|
|
2
|
push @combined, $max_codepoint+1; |
759
|
|
|
|
|
|
|
} |
760
|
14
|
|
|
|
|
47
|
return \@combined; |
761
|
|
|
|
|
|
|
} |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
1; |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
__END__ |