| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Mock::Data::Charset; |
|
2
|
9
|
|
|
9
|
|
252426
|
use strict; |
|
|
9
|
|
|
|
|
27
|
|
|
|
9
|
|
|
|
|
282
|
|
|
3
|
9
|
|
|
9
|
|
46
|
use warnings; |
|
|
9
|
|
|
|
|
18
|
|
|
|
9
|
|
|
|
|
270
|
|
|
4
|
9
|
|
|
9
|
|
1020
|
use Mock::Data::Util qw( _parse_context _escape_str ); |
|
|
9
|
|
|
|
|
22
|
|
|
|
9
|
|
|
|
|
59717
|
|
|
5
|
|
|
|
|
|
|
require Carp; |
|
6
|
|
|
|
|
|
|
our @CARP_NOT= ('Mock::Data::Util'); |
|
7
|
|
|
|
|
|
|
require Mock::Data::Generator; |
|
8
|
|
|
|
|
|
|
our @ISA= ( 'Mock::Data::Generator' ); |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# ABSTRACT: Generator of strings from a set of characters |
|
11
|
|
|
|
|
|
|
our $VERSION = '0.02'; # VERSION |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @generator_attrs= qw( str_len min_codepoint max_codepoint ); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub new { |
|
17
|
45
|
|
|
45
|
1
|
132949
|
my $class= shift; |
|
18
|
45
|
|
|
|
|
85
|
my (%self, %parse); |
|
19
|
|
|
|
|
|
|
# make the common case fast |
|
20
|
45
|
100
|
100
|
|
|
202
|
if (@_ == 1 && !ref $_[0]) { |
|
21
|
7
|
|
|
|
|
653
|
qr/[$_[0]]/; |
|
22
|
7
|
|
|
|
|
200
|
%self= ( notation => $_[0] ); |
|
23
|
7
|
50
|
|
|
|
18
|
if (ref $class) { |
|
24
|
0
|
|
0
|
|
|
0
|
$self{generator_opts} ||= { %{ $class->{generator_opts} } }; |
|
|
0
|
|
|
|
|
0
|
|
|
25
|
0
|
|
0
|
|
|
0
|
$self{max_codepoint} //= $class->{max_codepoint}; |
|
26
|
0
|
|
|
|
|
0
|
$class= ref $class; |
|
27
|
|
|
|
|
|
|
} |
|
28
|
7
|
|
|
|
|
47
|
return bless \%self, $class; |
|
29
|
|
|
|
|
|
|
} |
|
30
|
|
|
|
|
|
|
|
|
31
|
38
|
100
|
|
|
|
146
|
%self= @_ != 1? @_ : %{$_[0]}; |
|
|
14
|
|
|
|
|
50
|
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Look for fields from the parser |
|
34
|
38
|
100
|
|
|
|
227
|
$parse{classes}= delete $self{classes} if defined $self{classes}; |
|
35
|
38
|
100
|
|
|
|
106
|
$parse{codepoints}= delete $self{codepoints} if defined $self{codepoints}; |
|
36
|
38
|
50
|
|
|
|
92
|
$parse{codepoint_ranges}= delete $self{codepoint_ranges} if defined $self{codepoint_ranges}; |
|
37
|
38
|
50
|
|
|
|
93
|
$parse{negate}= delete $self{negate} if defined $self{negate}; |
|
38
|
38
|
100
|
|
|
|
86
|
if (defined $self{chars}) { |
|
39
|
3
|
|
|
|
|
6
|
push @{$parse{codepoints}}, map ord, @{$self{chars}}; |
|
|
3
|
|
|
|
|
8
|
|
|
|
3
|
|
|
|
|
11
|
|
|
40
|
3
|
|
|
|
|
7
|
delete $self{chars}; |
|
41
|
|
|
|
|
|
|
} |
|
42
|
38
|
50
|
|
|
|
92
|
if (defined $self{ranges}) { |
|
43
|
0
|
|
|
|
|
0
|
push @{$parse{codepoint_ranges}}, |
|
44
|
|
|
|
|
|
|
map +( ref $_? ( ord $_->[0], ord $_->[1] ) : ord ), |
|
45
|
0
|
0
|
|
|
|
0
|
@{$self{ranges}}; |
|
|
0
|
|
|
|
|
0
|
|
|
46
|
0
|
|
|
|
|
0
|
delete $self{ranges}; |
|
47
|
|
|
|
|
|
|
} |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# If called on an object, carry over some settings |
|
50
|
38
|
50
|
|
|
|
91
|
if (ref $class) { |
|
51
|
0
|
0
|
0
|
|
|
0
|
if (!keys %parse && !defined $self{notation} && !$self{members} && !$self{member_invlist}) { |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
52
|
|
|
|
|
|
|
@self{'_parse','notation','members','member_invlist'}= |
|
53
|
0
|
|
|
|
|
0
|
@{$class}{'_parse','notation','members','member_invlist'}; |
|
|
0
|
|
|
|
|
0
|
|
|
54
|
|
|
|
|
|
|
} |
|
55
|
0
|
|
0
|
|
|
0
|
$self{$_} //= $class->{$_} for @generator_attrs; |
|
56
|
0
|
|
|
|
|
0
|
$class= ref $class; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
|
|
59
|
38
|
100
|
66
|
|
|
181
|
if (defined $self{notation} && !keys %parse) { |
|
|
|
100
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# want to trigger the syntax error exception now, not lazily later on |
|
61
|
21
|
|
|
|
|
431
|
qr/[$self{notation}]/; |
|
62
|
|
|
|
|
|
|
} |
|
63
|
|
|
|
|
|
|
elsif (keys %parse) { |
|
64
|
11
|
|
|
|
|
27
|
$self{_parse}= \%parse; |
|
65
|
|
|
|
|
|
|
Carp::croak("Charset-building options (classes, chars, codepoints, ranges, codepoint_ranges, negate)" |
|
66
|
|
|
|
|
|
|
." cannot be combined with members, member_invlist or notation attributes") |
|
67
|
11
|
50
|
33
|
|
|
96
|
if $self{members} or $self{member_invlist}; # allow notation to preserve original text |
|
68
|
|
|
|
|
|
|
} |
|
69
|
|
|
|
|
|
|
else { |
|
70
|
|
|
|
|
|
|
# At least one of members, member_invlist, notation, or _parse must be specified |
|
71
|
|
|
|
|
|
|
Carp::croak("Require at least one of members, member_invlist, notation, or charset-building options") |
|
72
|
6
|
50
|
33
|
|
|
27
|
unless $self{members} or $self{member_invlist}; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
38
|
|
|
|
|
2135
|
return bless \%self, $class; |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub _parse { |
|
79
|
|
|
|
|
|
|
# If the '_parse' wasn't initialized, it can be derived from members or member_invlist or notation |
|
80
|
26
|
100
|
|
26
|
|
98
|
$_[0]{_parse} || do { |
|
81
|
15
|
|
|
|
|
28
|
my $self= shift; |
|
82
|
15
|
50
|
|
|
|
36
|
if (defined $self->{notation}) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
83
|
15
|
|
|
|
|
52
|
$self->{_parse}= $self->parse($self->{notation}); |
|
84
|
|
|
|
|
|
|
} |
|
85
|
|
|
|
|
|
|
elsif ($self->{members}) { |
|
86
|
0
|
|
|
|
|
0
|
$self->{_parse}{codepoints}= [ map ord, @{$self->{members}} ]; |
|
|
0
|
|
|
|
|
0
|
|
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
elsif (my $inv= $self->{member_invlist}) { |
|
89
|
0
|
|
|
|
|
0
|
my $i; |
|
90
|
0
|
|
|
|
|
0
|
for ($i= 0; $i < $#$inv; $i+= 2) { |
|
91
|
0
|
0
|
|
|
|
0
|
if ($inv->[$i] + 1 == $inv->[$i+1]) { push @{$self->{_parse}{codepoints}}, $inv->[$i] } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
92
|
0
|
|
|
|
|
0
|
else { push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], $inv->[$i+1] - 1; } |
|
|
0
|
|
|
|
|
0
|
|
|
93
|
|
|
|
|
|
|
} |
|
94
|
0
|
0
|
|
|
|
0
|
if ($i == $#$inv) { |
|
95
|
0
|
|
0
|
|
|
0
|
push @{$self->{_parse}{codepoint_ranges}}, $inv->[$i], ($self->max_codepoint || 0x10FFFF); |
|
|
0
|
|
|
|
|
0
|
|
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
} |
|
98
|
0
|
|
|
|
|
0
|
else { die "Unhandled lazy-build scenario" } |
|
99
|
15
|
|
|
|
|
53
|
$self->{_parse}; |
|
100
|
|
|
|
|
|
|
}; |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub notation { |
|
105
|
14
|
|
66
|
14
|
1
|
1286
|
$_[0]{notation} //= _deparse_charset($_[0]->_parse); |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub min_codepoint { |
|
110
|
112
|
50
|
|
112
|
1
|
202
|
$_[0]{min_codepoint}= $_[1] if @_ > 1; |
|
111
|
|
|
|
|
|
|
$_[0]{min_codepoint} |
|
112
|
112
|
|
|
|
|
243
|
} |
|
113
|
|
|
|
|
|
|
sub max_codepoint { |
|
114
|
|
|
|
|
|
|
$_[0]{max_codepoint} |
|
115
|
146
|
|
|
146
|
1
|
288
|
} |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub str_len { |
|
119
|
112
|
50
|
|
112
|
1
|
231
|
$_[0]{str_len}= $_[1] if @_ > 1; |
|
120
|
112
|
|
|
|
|
293
|
$_[0]{str_len}; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
sub count { |
|
125
|
861
|
100
|
|
861
|
1
|
296991
|
$_[0]{members}? scalar @{$_[0]{members}} |
|
|
83
|
|
|
|
|
179
|
|
|
126
|
|
|
|
|
|
|
: $_[0]->_invlist_index->[-1]; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub members { |
|
131
|
103
|
|
66
|
103
|
1
|
313
|
$_[0]{members} ||= $_[0]->_build_members; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sub _build_members { |
|
135
|
20
|
|
|
20
|
|
34
|
my $self= shift; |
|
136
|
20
|
|
|
|
|
42
|
my $invlist= $self->member_invlist; |
|
137
|
20
|
|
|
|
|
33
|
my @members; |
|
138
|
20
|
50
|
|
|
|
55
|
if (@$invlist > 1) { |
|
139
|
|
|
|
|
|
|
push @members, map chr, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1) |
|
140
|
20
|
|
|
|
|
472
|
for 0 .. (($#$invlist-1)>>1); |
|
141
|
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
# an odd number of elements means the list ends with an "include-all" |
|
143
|
20
|
50
|
|
|
|
63
|
push @members, map chr, $invlist->[-1] .. 0x10FFFF |
|
144
|
|
|
|
|
|
|
if 1 & @$invlist; |
|
145
|
20
|
|
|
|
|
75
|
return \@members; |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::expand_invlist { |
|
149
|
6
|
|
|
6
|
|
10263
|
my $invlist= shift; |
|
150
|
6
|
|
|
|
|
11
|
my @members; |
|
151
|
6
|
100
|
|
|
|
19
|
if (@$invlist > 1) { |
|
152
|
|
|
|
|
|
|
push @members, $invlist->[$_*2] .. ($invlist->[$_*2+1]-1) |
|
153
|
5
|
|
|
|
|
32
|
for 0 .. (($#$invlist-1)>>1); |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
# an odd number of elements means the list ends with an "include-all" |
|
156
|
6
|
100
|
|
|
|
19
|
push @members, $invlist->[-1] .. 0x10FFFF |
|
157
|
|
|
|
|
|
|
if 1 & @$invlist; |
|
158
|
6
|
|
|
|
|
14
|
return \@members; |
|
159
|
|
|
|
|
|
|
} |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# The index is private because there's not a good way to explain it to the user |
|
162
|
|
|
|
|
|
|
sub _invlist_index { |
|
163
|
2329
|
|
|
2329
|
|
3427
|
my $self= shift; |
|
164
|
2329
|
|
66
|
|
|
6623
|
$self->{_invlist_index} ||= Mock::Data::Charset::Util::create_invlist_index($self->member_invlist); |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::create_invlist_index { |
|
168
|
24
|
|
|
24
|
|
5012
|
my $invlist= shift; |
|
169
|
24
|
|
|
|
|
59
|
my $n_spans= (@$invlist + 1) >> 1; |
|
170
|
24
|
|
|
|
|
32
|
my @index; |
|
171
|
24
|
|
|
|
|
87
|
$#index= $n_spans-1; |
|
172
|
24
|
|
|
|
|
39
|
my $total= 0; |
|
173
|
|
|
|
|
|
|
$index[$_]= $total += $invlist->[$_*2+1] - $invlist->[$_*2] |
|
174
|
24
|
|
|
|
|
312
|
for 0 .. (@$invlist >> 1)-1; |
|
175
|
24
|
100
|
|
|
|
83
|
if (@$invlist & 1) { # In the case that the final range is infinite |
|
176
|
6
|
|
|
|
|
19
|
$index[$n_spans-1]= $total + 0x110000 - $invlist->[-1]; |
|
177
|
|
|
|
|
|
|
} |
|
178
|
24
|
|
|
|
|
117
|
\@index; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub member_invlist { |
|
183
|
1605
|
50
|
|
1605
|
1
|
3336
|
if (@_ > 1) { |
|
184
|
0
|
|
|
|
|
0
|
$_[0]{member_invlist}= $_[1]; |
|
185
|
0
|
|
|
|
|
0
|
delete $_[0]{_invlist_index}; |
|
186
|
0
|
|
|
|
|
0
|
delete $_[0]{members}; |
|
187
|
0
|
|
|
|
|
0
|
delete $_[0]{notation}; |
|
188
|
|
|
|
|
|
|
} |
|
189
|
1605
|
|
66
|
|
|
4013
|
$_[0]{member_invlist} //= _build_member_invlist(@_); |
|
190
|
|
|
|
|
|
|
} |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub _build_member_invlist { |
|
193
|
34
|
|
|
34
|
|
63
|
my $self= shift; |
|
194
|
34
|
|
|
|
|
83
|
my $max_codepoint= $self->max_codepoint; |
|
195
|
|
|
|
|
|
|
# If the search space is small, and there is already a regex notation, it is probably faster |
|
196
|
|
|
|
|
|
|
# to iterate and let perl do the work than to parse the charset. |
|
197
|
34
|
|
|
|
|
51
|
my $invlist; |
|
198
|
34
|
100
|
66
|
|
|
216
|
if (!defined $max_codepoint || $max_codepoint > 1000 || !defined $self->{notation}) { |
|
|
|
|
100
|
|
|
|
|
|
199
|
25
|
|
100
|
|
|
110
|
$max_codepoint ||= 0x10FFFF; |
|
200
|
25
|
|
|
|
|
45
|
$invlist= eval { |
|
201
|
25
|
|
|
|
|
65
|
_parsed_charset_to_invlist($self->_parse, $max_codepoint); |
|
202
|
|
|
|
|
|
|
}# or main::diag $@ |
|
203
|
|
|
|
|
|
|
} |
|
204
|
34
|
|
66
|
|
|
169
|
$invlist ||= _charset_invlist_brute_force($self->notation, $max_codepoint); |
|
205
|
|
|
|
|
|
|
# If a user writes to the invlist, it will become out of sync with the Index, |
|
206
|
|
|
|
|
|
|
# leading to confusing bugs. |
|
207
|
34
|
50
|
|
|
|
241
|
if (Internals->can('SvREADONLY')) { |
|
208
|
34
|
|
|
|
|
446
|
Internals::SvREADONLY($_,1) for @$invlist; |
|
209
|
34
|
|
|
|
|
80
|
Internals::SvREADONLY(@$invlist,1); |
|
210
|
|
|
|
|
|
|
} |
|
211
|
34
|
|
|
|
|
171
|
return $invlist; |
|
212
|
|
|
|
|
|
|
} |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# Lazy-built string of all basic-multilingual-plane characters |
|
215
|
|
|
|
|
|
|
our $_ascii_chars; |
|
216
|
|
|
|
|
|
|
our $_unicode_chars; |
|
217
|
|
|
|
|
|
|
sub _build_unicode_chars { |
|
218
|
1
|
50
|
|
1
|
|
5
|
unless (defined $_unicode_chars) { |
|
219
|
|
|
|
|
|
|
# Construct ranges of valid characters separated by NUL. |
|
220
|
|
|
|
|
|
|
# Older perls die when the regex engine encounters an invalid character |
|
221
|
|
|
|
|
|
|
# but newer perls just treat the invalid character as "not a member", |
|
222
|
|
|
|
|
|
|
# unless the set is a negation in which case non-characters *are* a member. |
|
223
|
|
|
|
|
|
|
# This makes the assumption that if a non-char isn't a member then \0 won't |
|
224
|
|
|
|
|
|
|
# be either. |
|
225
|
1
|
|
|
|
|
3
|
$_unicode_chars= ''; |
|
226
|
1
|
|
|
|
|
10820
|
$_unicode_chars .= chr($_) for 0 .. 0xD7FF; |
|
227
|
1
|
|
|
|
|
12
|
$_unicode_chars .= "\0"; |
|
228
|
1
|
|
|
|
|
109
|
$_unicode_chars .= chr($_) for 0xFDF0 .. 0xFFFD; |
|
229
|
1
|
|
|
|
|
4
|
for (1..16) { |
|
230
|
16
|
|
|
|
|
177
|
$_unicode_chars .= "\0"; |
|
231
|
16
|
|
|
|
|
215436
|
$_unicode_chars .= chr($_) for ($_<<16) .. (($_<<16)|0xFFFD); |
|
232
|
|
|
|
|
|
|
} |
|
233
|
|
|
|
|
|
|
} |
|
234
|
1
|
|
|
|
|
9
|
\$_unicode_chars; |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub _charset_invlist_brute_force { |
|
238
|
9
|
|
|
9
|
|
26
|
my ($set, $max_codepoint)= @_; |
|
239
|
9
|
100
|
|
|
|
41
|
my $inv= (ord $set == ord '^')? substr($set,1) : '^'.$set; |
|
240
|
9
|
|
|
|
|
13
|
my @invlist; |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# optimize common case |
|
243
|
9
|
100
|
|
|
|
28
|
if ($max_codepoint < 256) { |
|
244
|
|
|
|
|
|
|
# Find first character of every match and first character of every non-match |
|
245
|
|
|
|
|
|
|
# and convert to codepoints. |
|
246
|
8
|
100
|
66
|
|
|
438
|
@invlist= map +(defined $_? ord($_) : ()), |
|
247
|
|
|
|
|
|
|
($_ascii_chars //= join('', map chr($_), 0..255)) |
|
248
|
|
|
|
|
|
|
=~ / ( [$set] ) (?> [$set]* ) (?: \z | ( [$inv] ) (?> [$inv]* ) )/gx; |
|
249
|
|
|
|
|
|
|
} |
|
250
|
|
|
|
|
|
|
else { |
|
251
|
1
|
50
|
|
|
|
7
|
_build_unicode_chars() unless defined $_unicode_chars; |
|
252
|
|
|
|
|
|
|
# This got complicated while trying to support perls that can't match against non-characters. |
|
253
|
|
|
|
|
|
|
# The non-characters have been replaced by NULs, so need to capture the char before and after |
|
254
|
|
|
|
|
|
|
# each transition in case one of them is a NUL. |
|
255
|
1
|
50
|
|
|
|
5924
|
my @endpoints= |
|
256
|
|
|
|
|
|
|
($max_codepoint < 0x10FFFF? substr($_unicode_chars,0,$max_codepoint+1) : $_unicode_chars) |
|
257
|
|
|
|
|
|
|
=~ /( [$set] ) ( [$set] )* ( \z | [$inv] ) ( [$inv] )* /gx; |
|
258
|
1
|
50
|
|
|
|
9
|
if (@endpoints) { |
|
259
|
|
|
|
|
|
|
# List is a multiple of 4 elements: (first-member,last-member,first-non-member,last-non-member) |
|
260
|
|
|
|
|
|
|
# We're not interested in the span of non-members at the end, so just remove those. |
|
261
|
1
|
|
|
|
|
2
|
pop @endpoints; pop @endpoints; |
|
|
1
|
|
|
|
|
3
|
|
|
262
|
|
|
|
|
|
|
# Iterate every transition of member/nonmember, and use the second character if present |
|
263
|
|
|
|
|
|
|
# and isn't a NUL, else use the first character and add 1. |
|
264
|
1
|
|
|
|
|
7
|
push @invlist, ord $endpoints[0]; |
|
265
|
1
|
|
|
|
|
6
|
for (my $i= 1; $i < @endpoints; $i+= 2) { |
|
266
|
19
|
100
|
66
|
|
|
64
|
if (defined $endpoints[$i+1] && ord $endpoints[$i+1]) { |
|
|
|
50
|
|
|
|
|
|
|
267
|
18
|
|
|
|
|
41
|
push @invlist, ord $endpoints[$i+1]; |
|
268
|
|
|
|
|
|
|
} elsif (defined $endpoints[$i]) { |
|
269
|
1
|
|
|
|
|
5
|
push @invlist, 1 + ord $endpoints[$i]; |
|
270
|
|
|
|
|
|
|
} else { |
|
271
|
0
|
|
|
|
|
0
|
push @invlist, 1 + $invlist[-1]; |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
# substr is an estimate, because string skips characters, so remove any spurrous |
|
275
|
|
|
|
|
|
|
# codepoints beyond the max |
|
276
|
1
|
|
66
|
|
|
12
|
pop @invlist while @invlist && $invlist[-1] > $max_codepoint; |
|
277
|
|
|
|
|
|
|
} |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
# If an "infinite" range would be returned, but the user set a maximum codepoint, |
|
280
|
|
|
|
|
|
|
# list the max codepoint as the end of the invlist. |
|
281
|
9
|
100
|
66
|
|
|
916
|
if ($max_codepoint < 0x10FFFF and 1 & @invlist) { |
|
282
|
4
|
|
|
|
|
11
|
push @invlist, $max_codepoint+1; |
|
283
|
|
|
|
|
|
|
} |
|
284
|
9
|
|
|
|
|
41
|
return \@invlist; |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
sub _parsed_charset_to_invlist { |
|
288
|
25
|
|
|
25
|
|
92
|
my ($parse, $max_codepoint)= @_; |
|
289
|
25
|
|
|
|
|
41
|
my @invlists; |
|
290
|
|
|
|
|
|
|
# convert the character list into an inversion list |
|
291
|
25
|
100
|
|
|
|
72
|
if (defined (my $cp= $parse->{codepoints})) { |
|
292
|
5
|
|
|
|
|
22
|
my @chars= sort { $a <=> $b } @$cp; |
|
|
6
|
|
|
|
|
18
|
|
|
293
|
5
|
|
|
|
|
13
|
my @invlist= (shift @chars); |
|
294
|
5
|
|
|
|
|
11
|
push @invlist, $invlist[0] + 1; |
|
295
|
5
|
|
|
|
|
19
|
for (my $i= 0; $i <= $#chars; $i++) { |
|
296
|
|
|
|
|
|
|
# If the next char is adjacent, extend the span |
|
297
|
5
|
100
|
|
|
|
20
|
if ($invlist[-1] == $chars[$i]) { |
|
298
|
2
|
|
|
|
|
7
|
++$invlist[-1]; |
|
299
|
|
|
|
|
|
|
} else { |
|
300
|
3
|
|
|
|
|
9
|
push @invlist, $chars[$i], $chars[$i]+1; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
} |
|
303
|
5
|
|
|
|
|
16
|
push @invlists, \@invlist; |
|
304
|
|
|
|
|
|
|
} |
|
305
|
|
|
|
|
|
|
# Each range is an inversion list already |
|
306
|
25
|
100
|
|
|
|
67
|
if (my $r= $parse->{codepoint_ranges}) { |
|
307
|
9
|
|
|
|
|
32
|
for (my $i= 0; $i < (@$r >> 1); $i++) { |
|
308
|
10
|
|
|
|
|
32
|
my ($start, $limit)= ($r->[$i*2], $r->[$i*2+1]+1); |
|
309
|
|
|
|
|
|
|
# Try to combine the range with the most recent inversion list, if possible, |
|
310
|
10
|
100
|
66
|
|
|
54
|
if (@invlists && $invlists[-1][-1] < $start) { |
|
|
|
50
|
33
|
|
|
|
|
|
311
|
1
|
|
|
|
|
4
|
push @{ $invlists[-1] }, $start, $limit; |
|
|
1
|
|
|
|
|
5
|
|
|
312
|
|
|
|
|
|
|
} elsif (@invlists && $invlists[-1][0] > $limit) { |
|
313
|
0
|
|
|
|
|
0
|
unshift @{ $invlists[-1] }, $start, $limit; |
|
|
0
|
|
|
|
|
0
|
|
|
314
|
|
|
|
|
|
|
} else { |
|
315
|
|
|
|
|
|
|
# else just start a new inversion list |
|
316
|
9
|
|
|
|
|
34
|
push @invlists, [ $start, $limit ] |
|
317
|
|
|
|
|
|
|
} |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
# Convert each character class to an inversion list. |
|
321
|
25
|
100
|
|
|
|
59
|
if ($parse->{classes}) { |
|
322
|
|
|
|
|
|
|
push @invlists, _class_invlist($_) |
|
323
|
11
|
|
|
|
|
35
|
for @{ $parse->{classes} }; |
|
|
11
|
|
|
|
|
44
|
|
|
324
|
|
|
|
|
|
|
} |
|
325
|
25
|
|
|
|
|
2417
|
my $invlist= Mock::Data::Charset::Util::merge_invlists(\@invlists, $max_codepoint); |
|
326
|
|
|
|
|
|
|
# Perform negation of inversion list by either starting at char 0 or removing char 0 |
|
327
|
25
|
100
|
|
|
|
74
|
if ($parse->{negate}) { |
|
328
|
1
|
50
|
|
|
|
4
|
if ($invlist->[0]) { unshift @$invlist, 0 } |
|
|
1
|
|
|
|
|
4
|
|
|
329
|
0
|
|
|
|
|
0
|
else { shift @$invlist; } |
|
330
|
|
|
|
|
|
|
} |
|
331
|
25
|
|
|
|
|
140
|
return $invlist; |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
our $_compile; |
|
336
|
|
|
|
|
|
|
sub compile { |
|
337
|
3
|
|
|
3
|
1
|
20
|
local $_compile= 1; |
|
338
|
3
|
|
|
|
|
12
|
shift->generate(@_); |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
sub generate { |
|
341
|
112
|
|
|
112
|
1
|
207
|
my ($self, $mock)= (shift, shift); |
|
342
|
112
|
|
|
|
|
230
|
my ($len, $cp_min, $cp_max, $member_count) |
|
343
|
|
|
|
|
|
|
= ($self->str_len, $self->min_codepoint, $self->max_codepoint, $self->count); |
|
344
|
112
|
100
|
|
|
|
245
|
if (@_) { |
|
345
|
82
|
100
|
|
|
|
187
|
my %opts= ref $_[0] eq 'HASH'? %{ shift() } : (); |
|
|
79
|
|
|
|
|
302
|
|
|
346
|
82
|
100
|
33
|
|
|
211
|
$len= @_? shift : $opts{str_len} // $opts{len} // $opts{size}; # allow some aliases for length |
|
|
|
|
33
|
|
|
|
|
|
347
|
82
|
|
66
|
|
|
243
|
$cp_min= $opts{min_codepoint} // $cp_min; |
|
348
|
82
|
|
66
|
|
|
241
|
$cp_max= $opts{max_codepoint} // $cp_max; |
|
349
|
|
|
|
|
|
|
} |
|
350
|
112
|
100
|
100
|
|
|
415
|
my ($memb_min, $memb_span)= !defined $cp_min && !defined $cp_max? (0,$member_count) |
|
351
|
|
|
|
|
|
|
: $self->_codepoint_minmax_to_member_range($cp_min, $cp_max); |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# If compiling, $len will be a function, else it will be an integer |
|
354
|
0
|
|
|
0
|
|
0
|
$len= !defined $len? ($_compile? sub { 1 } : 1 ) |
|
355
|
0
|
|
|
0
|
|
0
|
: !ref $len? ($_compile? sub { $len } : $len) |
|
356
|
|
|
|
|
|
|
: ref $len eq 'ARRAY'? ( |
|
357
|
0
|
|
|
0
|
|
0
|
$_compile? sub { $len->[0] + int rand ($len->[1] - $len->[0]) } |
|
358
|
112
|
50
|
|
|
|
353
|
: $len->[0] + int rand ($len->[1] - $len->[0]) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
) |
|
360
|
|
|
|
|
|
|
: ref $len eq 'CODE'? ($_compile? $len : $len->($mock)) |
|
361
|
|
|
|
|
|
|
: Carp::croak("Unknown str_len specification '$len'"); |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# If member list is small-ish, use faster direct array access |
|
364
|
112
|
100
|
100
|
|
|
367
|
if ($self->{members} || $member_count < 500) { |
|
365
|
96
|
|
|
|
|
187
|
my $members= $self->members; |
|
366
|
|
|
|
|
|
|
return sub { |
|
367
|
3
|
|
|
3
|
|
9
|
my $buf= ''; |
|
368
|
|
|
|
|
|
|
$buf .= $members->[$memb_min + int rand $memb_span] |
|
369
|
3
|
|
|
|
|
10
|
for 1..$len->($_[0]); |
|
370
|
3
|
|
|
|
|
35
|
return $buf; |
|
371
|
96
|
100
|
|
|
|
218
|
} if $_compile; |
|
372
|
93
|
|
|
|
|
150
|
my $buf= ''; |
|
373
|
|
|
|
|
|
|
$buf .= $members->[$memb_min + int rand $memb_span] |
|
374
|
93
|
|
|
|
|
433
|
for 1..$len; |
|
375
|
93
|
|
|
|
|
402
|
return $buf; |
|
376
|
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
else { |
|
378
|
16
|
|
|
|
|
34
|
my $invlist= $self->member_invlist; |
|
379
|
16
|
|
|
|
|
27
|
my $index= $self->_invlist_index; |
|
380
|
|
|
|
|
|
|
return sub { |
|
381
|
0
|
|
|
0
|
|
0
|
my $ret= ''; |
|
382
|
|
|
|
|
|
|
$ret .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index) |
|
383
|
0
|
|
|
|
|
0
|
for 1..$len->($_[0]); |
|
384
|
16
|
50
|
|
|
|
38
|
} if $_compile; |
|
385
|
16
|
|
|
|
|
27
|
my $buf= ''; |
|
386
|
|
|
|
|
|
|
$buf .= chr _get_invlist_element($memb_min + int rand($memb_span), $invlist, $index) |
|
387
|
16
|
|
|
|
|
62
|
for 1..$len; |
|
388
|
16
|
|
|
|
|
66
|
return $buf; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
sub _codepoint_minmax_to_member_range { |
|
393
|
27
|
|
|
27
|
|
42
|
my $self= shift; |
|
394
|
27
|
|
|
|
|
48
|
my ($cp_min, $cp_max)= @_; |
|
395
|
|
|
|
|
|
|
my $memb_min= !defined $cp_min? 0 |
|
396
|
27
|
100
|
|
|
|
53
|
: do { |
|
397
|
6
|
|
|
|
|
15
|
my ($at, $ins)= _find_invlist_element($cp_min, $self->member_invlist, $self->_invlist_index); |
|
398
|
6
|
|
33
|
|
|
19
|
$at // $ins |
|
399
|
|
|
|
|
|
|
}; |
|
400
|
|
|
|
|
|
|
my $memb_lim= !defined $cp_max? $self->count |
|
401
|
27
|
50
|
|
|
|
50
|
: do { |
|
402
|
27
|
|
|
|
|
57
|
my ($at, $ins)= _find_invlist_element($cp_max, $self->member_invlist, $self->_invlist_index); |
|
403
|
27
|
100
|
|
|
|
68
|
defined $at? $at + 1 : $ins; |
|
404
|
|
|
|
|
|
|
}; |
|
405
|
27
|
|
|
|
|
65
|
return ($memb_min, $memb_lim-$memb_min); |
|
406
|
|
|
|
|
|
|
} |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
sub parse { |
|
410
|
31
|
|
|
31
|
1
|
22332
|
my ($self, $notation)= @_; |
|
411
|
31
|
50
|
|
|
|
100
|
return { codepoints => [] } unless length $notation; |
|
412
|
31
|
50
|
|
|
|
123
|
return { classes => ['All'] } if $notation eq '^'; |
|
413
|
31
|
|
|
|
|
75
|
$notation .= ']'; |
|
414
|
|
|
|
|
|
|
# parse function needs $_ to be the input string |
|
415
|
31
|
|
|
|
|
98
|
pos($notation)= 0; |
|
416
|
31
|
|
|
|
|
140
|
return _parse_charset() for $notation; |
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
our $have_prop_invlist; |
|
420
|
|
|
|
|
|
|
our %_parse_charset_backslash= ( |
|
421
|
|
|
|
|
|
|
a => ord "\a", |
|
422
|
|
|
|
|
|
|
b => ord "\b", |
|
423
|
|
|
|
|
|
|
c => sub { die "Unimplemented: \\c" }, |
|
424
|
|
|
|
|
|
|
d => sub { push @{$_[0]{classes}}, 'digit'; undef; }, |
|
425
|
|
|
|
|
|
|
D => sub { push @{$_[0]{classes}}, '^digit'; undef; }, |
|
426
|
|
|
|
|
|
|
e => ord "\e", |
|
427
|
|
|
|
|
|
|
f => ord "\f", |
|
428
|
|
|
|
|
|
|
h => sub { push @{$_[0]{classes}}, 'horizspace'; undef; }, |
|
429
|
|
|
|
|
|
|
H => sub { push @{$_[0]{classes}}, '^horizspace'; undef; }, |
|
430
|
|
|
|
|
|
|
n => ord "\n", |
|
431
|
|
|
|
|
|
|
N => \&_parse_charset_namedchar, |
|
432
|
|
|
|
|
|
|
o => \&_parse_charset_oct, |
|
433
|
|
|
|
|
|
|
p => \&_parse_charset_classname, |
|
434
|
|
|
|
|
|
|
P => sub { _parse_charset_classname(shift, 1) }, |
|
435
|
|
|
|
|
|
|
r => ord "\r", |
|
436
|
|
|
|
|
|
|
s => sub { push @{$_[0]{classes}}, 'space'; undef; }, |
|
437
|
|
|
|
|
|
|
S => sub { push @{$_[0]{classes}}, '^space'; undef; }, |
|
438
|
|
|
|
|
|
|
t => ord "\t", |
|
439
|
|
|
|
|
|
|
v => sub { push @{$_[0]{classes}}, 'vertspace'; undef; }, |
|
440
|
|
|
|
|
|
|
V => sub { push @{$_[0]{classes}}, '^vertspace'; undef; }, |
|
441
|
|
|
|
|
|
|
w => sub { push @{$_[0]{classes}}, 'word'; undef; }, |
|
442
|
|
|
|
|
|
|
W => sub { push @{$_[0]{classes}}, '^word'; undef; }, |
|
443
|
|
|
|
|
|
|
x => \&_parse_charset_hex, |
|
444
|
|
|
|
|
|
|
0 => \&_parse_charset_oct, |
|
445
|
|
|
|
|
|
|
1 => \&_parse_charset_oct, |
|
446
|
|
|
|
|
|
|
2 => \&_parse_charset_oct, |
|
447
|
|
|
|
|
|
|
3 => \&_parse_charset_oct, |
|
448
|
|
|
|
|
|
|
4 => \&_parse_charset_oct, |
|
449
|
|
|
|
|
|
|
5 => \&_parse_charset_oct, |
|
450
|
|
|
|
|
|
|
6 => \&_parse_charset_oct, |
|
451
|
|
|
|
|
|
|
7 => \&_parse_charset_oct, |
|
452
|
|
|
|
|
|
|
8 => \&_parse_charset_oct, |
|
453
|
|
|
|
|
|
|
9 => \&_parse_charset_oct, |
|
454
|
|
|
|
|
|
|
); |
|
455
|
|
|
|
|
|
|
our %_class_invlist_cache= ( |
|
456
|
|
|
|
|
|
|
'Any' => [ 0 ], |
|
457
|
|
|
|
|
|
|
'\\N' => [ 0, ord("\n"), 1+ord("\n") ], |
|
458
|
|
|
|
|
|
|
); |
|
459
|
|
|
|
|
|
|
sub _class_invlist { |
|
460
|
22
|
|
|
22
|
|
10809
|
my $class= shift; |
|
461
|
22
|
100
|
|
|
|
51
|
if (ord $class == ord '^') { |
|
462
|
5
|
|
|
|
|
25
|
return Mock::Data::Charset::Util::negate_invlist( |
|
463
|
|
|
|
|
|
|
_class_invlist(substr($class,1)) |
|
464
|
|
|
|
|
|
|
); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
17
|
|
66
|
|
|
82
|
return $_class_invlist_cache{$class} ||= do { |
|
467
|
8
|
100
|
|
|
|
26
|
$have_prop_invlist= do { require Unicode::UCD; !!Unicode::UCD->can('prop_invlist') } |
|
|
2
|
|
|
|
|
2673
|
|
|
|
2
|
|
|
|
|
46089
|
|
|
468
|
|
|
|
|
|
|
unless defined $have_prop_invlist; |
|
469
|
8
|
50
|
|
|
|
41
|
$have_prop_invlist? [ Unicode::UCD::prop_invlist($class) ] |
|
470
|
|
|
|
|
|
|
: _charset_invlist_brute_force("\\p{$class}", 0x10FFFF); |
|
471
|
|
|
|
|
|
|
}; |
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
sub _parse_charset_hex { |
|
474
|
3
|
50
|
|
3
|
|
17
|
/\G( [0-9A-Fa-f]{2} | \{ ([0-9A-Fa-f]+) \} )/gcx |
|
475
|
|
|
|
|
|
|
or die "Invalid hex escape at "._parse_context; |
|
476
|
3
|
100
|
|
|
|
18
|
return hex(defined $2? $2 : $1); |
|
477
|
|
|
|
|
|
|
} |
|
478
|
|
|
|
|
|
|
sub _parse_charset_oct { |
|
479
|
5
|
|
|
5
|
|
17
|
--pos; # The caller ate one of the characters we need to parse |
|
480
|
5
|
50
|
|
|
|
31
|
/\G( [0-7]{3} | 0 | o\{ ([0-7]+) \} ) /gcx |
|
481
|
|
|
|
|
|
|
or die "Invalid octal escape at "._parse_context; |
|
482
|
5
|
100
|
|
|
|
30
|
return oct(defined $2? $2 : $1); |
|
483
|
|
|
|
|
|
|
} |
|
484
|
|
|
|
|
|
|
sub _parse_charset_namedchar { |
|
485
|
2
|
|
|
2
|
|
1245
|
require charnames; |
|
486
|
2
|
50
|
|
|
|
12322
|
/\G \{ ([^}]+) \} /gcx |
|
487
|
|
|
|
|
|
|
# or die "Invalid named char following \\N at '".substr($_,pos,10)."'"; |
|
488
|
|
|
|
|
|
|
and return charnames::vianame($1); |
|
489
|
|
|
|
|
|
|
# Plain "\N" means every character except \n |
|
490
|
0
|
|
|
|
|
0
|
push @{ $_[0]{classes} }, '\\N'; |
|
|
0
|
|
|
|
|
0
|
|
|
491
|
0
|
|
|
|
|
0
|
return; |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
sub _parse_charset_classname { |
|
494
|
7
|
|
|
7
|
|
20
|
my ($result, $negate)= @_; |
|
495
|
7
|
50
|
|
|
|
39
|
/\G \{ ([^}]+) \} /gcx |
|
496
|
|
|
|
|
|
|
or die "Invalid class name following \\p at "._parse_context; |
|
497
|
7
|
100
|
|
|
|
14
|
push @{$result->{classes}}, lc($negate? "^$1" : $1); |
|
|
7
|
|
|
|
|
39
|
|
|
498
|
|
|
|
|
|
|
undef |
|
499
|
7
|
|
|
|
|
18
|
} |
|
500
|
|
|
|
|
|
|
sub _parse_charset { |
|
501
|
36
|
|
|
36
|
|
68
|
my $flags= shift; |
|
502
|
|
|
|
|
|
|
# argument is in $_, starting from pos($_) |
|
503
|
36
|
|
|
|
|
65
|
my %parse; |
|
504
|
|
|
|
|
|
|
my @range; |
|
505
|
36
|
|
|
|
|
109
|
$parse{codepoints}= \my @chars; |
|
506
|
36
|
100
|
|
|
|
139
|
$parse{negate}= 1 if /\G \^ /gcx; |
|
507
|
36
|
50
|
|
|
|
97
|
if (/\G]/gc) { push @chars, ord ']' } |
|
|
0
|
|
|
|
|
0
|
|
|
508
|
36
|
|
|
|
|
64
|
while (1) { |
|
509
|
121
|
|
|
|
|
155
|
my $cp; # literal codepoint to be added |
|
510
|
|
|
|
|
|
|
# Check for special cases |
|
511
|
121
|
100
|
50
|
|
|
490
|
if (/\G ( \\ | - | \[: | \] ) /gcx) { |
|
|
|
50
|
66
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
512
|
78
|
100
|
|
|
|
288
|
if ($1 eq '\\') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
513
|
19
|
50
|
|
|
|
66
|
/\G(.)/gc or die "Unexpected end of input"; |
|
514
|
19
|
|
33
|
|
|
76
|
$cp= $_parse_charset_backslash{$1} || ord $1; |
|
515
|
19
|
100
|
|
|
|
99
|
$cp= $cp->(\%parse) |
|
516
|
|
|
|
|
|
|
if ref $cp; |
|
517
|
|
|
|
|
|
|
} |
|
518
|
|
|
|
|
|
|
elsif ($1 eq '-') { |
|
519
|
18
|
100
|
|
|
|
48
|
if (@range == 1) { |
|
520
|
17
|
|
|
|
|
31
|
push @range, ord '-'; |
|
521
|
17
|
|
|
|
|
38
|
next; |
|
522
|
|
|
|
|
|
|
} |
|
523
|
|
|
|
|
|
|
else { |
|
524
|
1
|
|
|
|
|
4
|
$cp= ord '-'; |
|
525
|
|
|
|
|
|
|
} |
|
526
|
|
|
|
|
|
|
} |
|
527
|
|
|
|
|
|
|
elsif ($1 eq '[:') { |
|
528
|
5
|
50
|
|
|
|
34
|
/\G ( [^:]+ ) :] /gcx |
|
529
|
|
|
|
|
|
|
or die "Invalid character class at "._parse_context; |
|
530
|
5
|
|
|
|
|
12
|
push @{$parse{classes}}, $1; |
|
|
5
|
|
|
|
|
24
|
|
|
531
|
|
|
|
|
|
|
} |
|
532
|
|
|
|
|
|
|
else { |
|
533
|
36
|
|
|
|
|
112
|
last; # $1 eq ']'; |
|
534
|
|
|
|
|
|
|
} |
|
535
|
|
|
|
|
|
|
} |
|
536
|
|
|
|
|
|
|
elsif ($flags && ($flags->{x}||0) >= 2 && /\G[ \t]/gc) { |
|
537
|
0
|
|
|
|
|
0
|
next; # ignore space and tab under /xx |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
else { |
|
540
|
43
|
50
|
|
|
|
130
|
/\G(.)/gc or die "Unexpected end of input"; |
|
541
|
43
|
|
|
|
|
81
|
$cp= ord $1; |
|
542
|
|
|
|
|
|
|
} |
|
543
|
|
|
|
|
|
|
# If no single character was found, any range-in-progress needs converted to |
|
544
|
|
|
|
|
|
|
# charcters |
|
545
|
68
|
100
|
|
|
|
13418
|
if (!defined $cp) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
546
|
16
|
|
|
|
|
34
|
push @chars, @range; |
|
547
|
16
|
|
|
|
|
25
|
@range= (); |
|
548
|
|
|
|
|
|
|
} |
|
549
|
|
|
|
|
|
|
# At this point, $cp will contain the next ordinal of the character to include, |
|
550
|
|
|
|
|
|
|
# but it might also be starting or finishing a range. |
|
551
|
|
|
|
|
|
|
elsif (@range == 1) { |
|
552
|
8
|
|
|
|
|
18
|
push @chars, $range[0]; |
|
553
|
8
|
|
|
|
|
16
|
$range[0]= $cp; |
|
554
|
|
|
|
|
|
|
} |
|
555
|
|
|
|
|
|
|
elsif (@range == 2) { |
|
556
|
16
|
|
|
|
|
22
|
push @{$parse{codepoint_ranges}}, $range[0], $cp; |
|
|
16
|
|
|
|
|
46
|
|
|
557
|
16
|
|
|
|
|
31
|
@range= (); |
|
558
|
|
|
|
|
|
|
} |
|
559
|
|
|
|
|
|
|
else { |
|
560
|
28
|
|
|
|
|
62
|
push @range, $cp; |
|
561
|
|
|
|
|
|
|
} |
|
562
|
|
|
|
|
|
|
#printf "# pos %d cp %d range %s %s include %s\n", pos $_, $cp, $range[0] // '(null)', $range[1] // '(null)', join(',', @include); |
|
563
|
|
|
|
|
|
|
} |
|
564
|
36
|
|
|
|
|
68
|
push @chars, @range; |
|
565
|
36
|
100
|
|
|
|
89
|
if (@chars) { |
|
566
|
13
|
|
|
|
|
65
|
@chars= sort { $a <=> $b } @chars; |
|
|
11
|
|
|
|
|
31
|
|
|
567
|
|
|
|
|
|
|
} else { |
|
568
|
23
|
|
|
|
|
48
|
delete $parse{codepoints}; |
|
569
|
|
|
|
|
|
|
} |
|
570
|
36
|
|
|
|
|
187
|
return \%parse; |
|
571
|
|
|
|
|
|
|
} |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
sub _ord_to_safe_regex_char { |
|
574
|
3
|
0
|
|
3
|
|
33
|
return chr($_[0]) =~ /[\w]/? chr $_[0] |
|
|
|
50
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
: $_[0] <= 0xFF? sprintf('\x%02X',$_[0]) |
|
576
|
|
|
|
|
|
|
: sprintf('\x{%X}',$_[0]) |
|
577
|
|
|
|
|
|
|
} |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
sub _deparse_charset { |
|
580
|
1
|
|
|
1
|
|
2
|
my $parse= shift; |
|
581
|
1
|
|
|
|
|
3
|
my $str= ''; |
|
582
|
1
|
50
|
|
|
|
5
|
if (my $cp= $parse->{codepoints}) { |
|
583
|
|
|
|
|
|
|
$str .= _ord_to_safe_regex_char($_) |
|
584
|
1
|
|
|
|
|
5
|
for @$cp; |
|
585
|
|
|
|
|
|
|
} |
|
586
|
1
|
50
|
|
|
|
4
|
if (my $r= $parse->{codepoint_ranges}) { |
|
587
|
0
|
|
|
|
|
0
|
for (my $i= 0; $i < (@$r << 1); $i++) { |
|
588
|
0
|
|
|
|
|
0
|
$str .= _ord_to_safe_regex_char($r->[$i*2]) . '-' . _ord_to_safe_regex_char($r->[$i*2+1]); |
|
589
|
|
|
|
|
|
|
} |
|
590
|
|
|
|
|
|
|
} |
|
591
|
1
|
50
|
|
|
|
5
|
if (my $cl= $parse->{classes}) { |
|
592
|
|
|
|
|
|
|
# TODO: reverse conversions to \h \v etc. |
|
593
|
0
|
|
|
|
|
0
|
for (@$cl) { |
|
594
|
0
|
0
|
|
|
|
0
|
$str .= $_ eq '\N'? '\0-\x09\x0B-\x{10FFFF}' |
|
|
|
0
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
: ord == ord '^'? '\P{'.substr($_,1).'}' |
|
596
|
|
|
|
|
|
|
: '\p{'.$_.'}'; |
|
597
|
|
|
|
|
|
|
} |
|
598
|
|
|
|
|
|
|
} |
|
599
|
1
|
|
|
|
|
6
|
return $str; |
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
sub get_member { |
|
604
|
749
|
100
|
|
749
|
1
|
3564
|
$_[0]{members}? $_[0]{members}[$_[1]] |
|
605
|
|
|
|
|
|
|
: chr _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index); |
|
606
|
|
|
|
|
|
|
} |
|
607
|
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
sub get_member_codepoint { |
|
609
|
6
|
50
|
|
6
|
1
|
2687
|
$_[0]{members}? ord $_[0]{members}[$_[1]] |
|
610
|
|
|
|
|
|
|
: _get_invlist_element($_[1], $_[0]->member_invlist, $_[0]->_invlist_index); |
|
611
|
|
|
|
|
|
|
} |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
sub _get_invlist_element { |
|
614
|
1108
|
|
|
1108
|
|
1977
|
my ($ofs, $invlist, $invlist_index)= @_; |
|
615
|
1108
|
50
|
|
|
|
2112
|
$ofs += @$invlist_index if $ofs < 0; |
|
616
|
1108
|
50
|
33
|
|
|
3944
|
return undef if $ofs >= $invlist_index->[-1] || $ofs < 0; |
|
617
|
|
|
|
|
|
|
# Binary Search to find the range that contains this numbered element |
|
618
|
1108
|
|
|
|
|
2072
|
my ($min, $max, $mid)= (0, $#$invlist_index); |
|
619
|
1108
|
|
|
|
|
1594
|
while (1) { |
|
620
|
5772
|
|
|
|
|
7211
|
$mid= ($min+$max) >> 1; |
|
621
|
5772
|
100
|
100
|
|
|
13656
|
if ($ofs >= $invlist_index->[$mid]) { |
|
|
|
100
|
|
|
|
|
|
|
622
|
2758
|
|
|
|
|
3469
|
$min= $mid+1 |
|
623
|
|
|
|
|
|
|
} |
|
624
|
|
|
|
|
|
|
elsif ($mid > 0 && $ofs < $invlist_index->[$mid-1]) { |
|
625
|
1906
|
|
|
|
|
2594
|
$max= $mid-1 |
|
626
|
|
|
|
|
|
|
} |
|
627
|
|
|
|
|
|
|
else { |
|
628
|
1108
|
100
|
|
|
|
2127
|
$ofs -= $invlist_index->[$mid-1] if $mid > 0; |
|
629
|
1108
|
|
|
|
|
4343
|
return $invlist->[$mid*2] + $ofs; |
|
630
|
|
|
|
|
|
|
} |
|
631
|
|
|
|
|
|
|
} |
|
632
|
|
|
|
|
|
|
} |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
sub find_member { |
|
636
|
754
|
|
|
754
|
1
|
2673
|
my ($self, $char)= @_; |
|
637
|
754
|
|
|
|
|
1509
|
return _find_invlist_element(ord $char, $self->member_invlist, $self->_invlist_index); |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
sub _find_invlist_element { |
|
641
|
787
|
|
|
787
|
|
1491
|
my ($codepoint, $invlist, $index)= @_; |
|
642
|
|
|
|
|
|
|
# Binary Search to find the range that contains this numbered element |
|
643
|
787
|
|
|
|
|
1389
|
my ($min, $max, $mid)= (0, $#$invlist); |
|
644
|
787
|
|
|
|
|
1193
|
while (1) { |
|
645
|
5739
|
|
|
|
|
6960
|
$mid= ($min+$max) >> 1; |
|
646
|
5739
|
100
|
100
|
|
|
18815
|
if ($mid > 0 && $codepoint < $invlist->[$mid]) { |
|
|
|
100
|
100
|
|
|
|
|
|
647
|
2331
|
|
|
|
|
3019
|
$max= $mid-1 |
|
648
|
|
|
|
|
|
|
} |
|
649
|
|
|
|
|
|
|
elsif ($mid < $#$invlist && $codepoint >= $invlist->[$mid+1]) { |
|
650
|
2621
|
|
|
|
|
3630
|
$min= $mid+1; |
|
651
|
|
|
|
|
|
|
} |
|
652
|
|
|
|
|
|
|
else { |
|
653
|
787
|
100
|
|
|
|
1543
|
return (undef, 0) unless $codepoint >= $invlist->[$mid]; |
|
654
|
786
|
100
|
|
|
|
1454
|
return $codepoint - $invlist->[$mid] unless $mid > 0; |
|
655
|
778
|
100
|
|
|
|
4728
|
return $codepoint - $invlist->[$mid] + $index->[($mid >> 1) - 1] unless $mid & 1; |
|
656
|
|
|
|
|
|
|
# if $mid is an odd number, the range is excluded, and there is no match |
|
657
|
13
|
100
|
|
|
|
46
|
return undef unless wantarray; |
|
658
|
11
|
|
|
|
|
40
|
return (undef, $index->[($mid-1)>>1]) # return insertion point as second val |
|
659
|
|
|
|
|
|
|
} |
|
660
|
|
|
|
|
|
|
} |
|
661
|
|
|
|
|
|
|
} |
|
662
|
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
sub negate { |
|
665
|
0
|
|
|
0
|
1
|
0
|
my $self= shift; |
|
666
|
0
|
|
|
|
|
0
|
my $neg= Mock::Data::Charset::Util::negate_invlist($self->member_invlist, $self->max_codepoint); |
|
667
|
0
|
|
|
|
|
0
|
return $self->new(member_invlist => $neg); |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::negate_invlist { |
|
670
|
5
|
|
|
5
|
|
1246
|
my ($invlist, $max_codepoint)= @_; |
|
671
|
|
|
|
|
|
|
# Toggle first char of 0 |
|
672
|
5
|
50
|
|
|
|
247
|
$invlist= $invlist->[0]? [ 0, @$invlist ] : [ @{$invlist}[1..$#$invlist] ]; |
|
|
0
|
|
|
|
|
0
|
|
|
673
|
|
|
|
|
|
|
# If max_codepoint is defined, and was the final char, remove the range starting at max_codepoint+1 |
|
674
|
5
|
50
|
33
|
|
|
38
|
if (@$invlist & 1 and defined $max_codepoint and $invlist->[-1] == $max_codepoint+1) { |
|
|
|
|
33
|
|
|
|
|
|
675
|
0
|
|
|
|
|
0
|
pop @$invlist; |
|
676
|
|
|
|
|
|
|
} |
|
677
|
5
|
|
|
|
|
23
|
return $invlist; |
|
678
|
|
|
|
|
|
|
} |
|
679
|
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
sub union { |
|
682
|
0
|
|
|
0
|
1
|
0
|
my $self= $_[0]; |
|
683
|
0
|
|
|
|
|
0
|
my @invlists= @_; |
|
684
|
|
|
|
|
|
|
ref eq 'ARRAY' || ($_= $_->member_invlist) |
|
685
|
0
|
|
0
|
|
|
0
|
for @invlists; |
|
686
|
0
|
|
|
|
|
0
|
my $combined= Mock::Data::Charset::Util::merge_invlists(\@invlists, $self->max_codepoint); |
|
687
|
0
|
|
|
|
|
0
|
return $self->new(member_invlist => $combined); |
|
688
|
|
|
|
|
|
|
} |
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
#=head2 merge_invlists |
|
691
|
|
|
|
|
|
|
# |
|
692
|
|
|
|
|
|
|
# my $combined_invlist= $charset->merge_invlist( \@list2, \@list3, ... ); |
|
693
|
|
|
|
|
|
|
# my $combined_invlist= merge_invlist( \@list1, \@list2, ... ); |
|
694
|
|
|
|
|
|
|
# |
|
695
|
|
|
|
|
|
|
#Merge one or more inversion lists into a superset of all of them. |
|
696
|
|
|
|
|
|
|
#If called as a method, the L is used as the first list. |
|
697
|
|
|
|
|
|
|
# |
|
698
|
|
|
|
|
|
|
#The return value is an inversion list, which can be wrapped in a Charset object by passing it |
|
699
|
|
|
|
|
|
|
#as the C attribute. |
|
700
|
|
|
|
|
|
|
# |
|
701
|
|
|
|
|
|
|
#The current L applies to the result. If called as a plain function, the |
|
702
|
|
|
|
|
|
|
#C is assumed to be the Unicode maximum of C<0x10FFFF>. |
|
703
|
|
|
|
|
|
|
# |
|
704
|
|
|
|
|
|
|
#=cut |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
sub Mock::Data::Charset::Util::merge_invlists { |
|
707
|
33
|
|
|
33
|
|
9948
|
my @invlists= @{shift()}; |
|
|
33
|
|
|
|
|
78
|
|
|
708
|
33
|
|
100
|
|
|
99
|
my $max_codepoint= shift // 0x10FFFF; |
|
709
|
|
|
|
|
|
|
|
|
710
|
33
|
50
|
|
|
|
80
|
return [] unless @invlists; |
|
711
|
33
|
100
|
|
|
|
135
|
return [@{$invlists[0]}] unless @invlists > 1; |
|
|
19
|
|
|
|
|
101
|
|
|
712
|
14
|
|
|
|
|
26
|
my @combined= (); |
|
713
|
|
|
|
|
|
|
# Repeatedly select the minimum range among the input lists and add it to the result |
|
714
|
14
|
|
|
|
|
42
|
my @pos= (0)x@invlists; |
|
715
|
14
|
|
|
|
|
35
|
while (@invlists) { |
|
716
|
3293
|
|
|
|
|
5552
|
my ($min_ch, $min_i)= ($invlists[0][$pos[0]], 0); |
|
717
|
|
|
|
|
|
|
# Find which inversion list contains the lowest range |
|
718
|
3293
|
|
|
|
|
5627
|
for (my $i= 1; $i < @invlists; $i++) { |
|
719
|
3284
|
100
|
|
|
|
7549
|
if ($invlists[$i][$pos[$i]] < $min_ch) { |
|
720
|
814
|
|
|
|
|
1168
|
$min_ch= $invlists[$i][$pos[$i]]; |
|
721
|
814
|
|
|
|
|
1472
|
$min_i= $i; |
|
722
|
|
|
|
|
|
|
} |
|
723
|
|
|
|
|
|
|
} |
|
724
|
3293
|
100
|
|
|
|
5427
|
last if $min_ch > $max_codepoint; |
|
725
|
|
|
|
|
|
|
# Check for overlap of this new inclusion range with the previous |
|
726
|
3291
|
100
|
100
|
|
|
8048
|
if (@combined && $combined[-1] >= $min_ch) { |
|
727
|
|
|
|
|
|
|
# they overlap, so just replace the end-codepoint of the range |
|
728
|
|
|
|
|
|
|
# if the new endpoint is larger |
|
729
|
3111
|
|
|
|
|
4492
|
my $new_end= $invlists[$min_i][$pos[$min_i]+1]; |
|
730
|
3111
|
100
|
100
|
|
|
8754
|
$combined[-1]= $new_end if !defined $new_end || $combined[-1] < $new_end; |
|
731
|
|
|
|
|
|
|
} |
|
732
|
|
|
|
|
|
|
else { |
|
733
|
|
|
|
|
|
|
# else, simply append the range |
|
734
|
180
|
|
|
|
|
287
|
push @combined, @{$invlists[$min_i]}[$pos[$min_i] .. $pos[$min_i]+1]; |
|
|
180
|
|
|
|
|
427
|
|
|
735
|
|
|
|
|
|
|
} |
|
736
|
|
|
|
|
|
|
# If the list is empty now, remove it from consideration |
|
737
|
3291
|
100
|
|
|
|
4524
|
if (($pos[$min_i] += 2) >= @{$invlists[$min_i]}) { |
|
|
3291
|
50
|
|
|
|
8620
|
|
|
738
|
21
|
|
|
|
|
36
|
splice @invlists, $min_i, 1; |
|
739
|
21
|
|
|
|
|
30
|
splice @pos, $min_i, 1; |
|
740
|
|
|
|
|
|
|
# If the invlist ends with an infinite range now, we are done |
|
741
|
21
|
100
|
|
|
|
53
|
if (!defined $combined[-1]) { |
|
742
|
6
|
|
|
|
|
12
|
pop @combined; |
|
743
|
6
|
|
|
|
|
17
|
last; |
|
744
|
|
|
|
|
|
|
} |
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
# If this is the only list remaining, append the rest and done |
|
747
|
|
|
|
|
|
|
elsif (@invlists == 1) { |
|
748
|
0
|
|
|
|
|
0
|
push @combined, @{$invlists[0]}[$pos[0] .. $#{$invlists[0]}]; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
749
|
0
|
|
|
|
|
0
|
last; |
|
750
|
|
|
|
|
|
|
} |
|
751
|
|
|
|
|
|
|
} |
|
752
|
14
|
|
|
|
|
34
|
while ($combined[-1] > $max_codepoint) { |
|
753
|
1
|
|
|
|
|
3
|
pop @combined; |
|
754
|
|
|
|
|
|
|
} |
|
755
|
|
|
|
|
|
|
# If the list ends with inclusion, and the max_codepoint is less than unicode max, |
|
756
|
|
|
|
|
|
|
# end the list with it. |
|
757
|
14
|
100
|
100
|
|
|
53
|
if (1 & @combined and $max_codepoint < 0x10FFFF) { |
|
758
|
1
|
|
|
|
|
2
|
push @combined, $max_codepoint+1; |
|
759
|
|
|
|
|
|
|
} |
|
760
|
14
|
|
|
|
|
47
|
return \@combined; |
|
761
|
|
|
|
|
|
|
} |
|
762
|
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
1; |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
__END__ |