line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Net::IDN::UTS46; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require 5.008005; # Unicode BiDi classes |
4
|
|
|
|
|
|
|
|
5
|
12
|
|
|
12
|
|
648272
|
use strict; |
|
12
|
|
|
|
|
82
|
|
|
12
|
|
|
|
|
374
|
|
6
|
12
|
|
|
12
|
|
1188
|
use utf8; |
|
12
|
|
|
|
|
49
|
|
|
12
|
|
|
|
|
64
|
|
7
|
12
|
|
|
12
|
|
285
|
use warnings; |
|
12
|
|
|
|
|
23
|
|
|
12
|
|
|
|
|
302
|
|
8
|
|
|
|
|
|
|
|
9
|
12
|
|
|
12
|
|
66
|
use Carp; |
|
12
|
|
|
|
|
23
|
|
|
12
|
|
|
|
|
1607
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = "2.499_20180929"; |
12
|
|
|
|
|
|
|
$VERSION = eval $VERSION; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = ('Exporter'); |
15
|
|
|
|
|
|
|
our @EXPORT = (); |
16
|
|
|
|
|
|
|
our @EXPORT_OK = ('uts46_to_ascii', 'uts46_to_unicode'); |
17
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => \@EXPORT_OK ); |
18
|
|
|
|
|
|
|
|
19
|
12
|
|
|
12
|
|
4838
|
use Unicode::Normalize (); |
|
12
|
|
|
|
|
16659
|
|
|
12
|
|
|
|
|
403
|
|
20
|
|
|
|
|
|
|
|
21
|
12
|
|
|
12
|
|
2803
|
use Net::IDN::Punycode 1.1 (':all'); |
|
12
|
|
|
|
|
224
|
|
|
12
|
|
|
|
|
1351
|
|
22
|
12
|
|
|
12
|
|
3078
|
use Net::IDN::Encode 2.100 (':_var'); |
|
12
|
|
|
|
|
222
|
|
|
12
|
|
|
|
|
1620
|
|
23
|
12
|
|
|
12
|
|
12796
|
use Net::IDN::UTS46::_Mapping 5.002 ('/^(Is|Map).*/'); # UTS #46 is only defined from Unicode 5.2.0 |
|
12
|
|
|
|
|
995
|
|
|
12
|
|
|
|
|
6432
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
sub uts46_to_unicode { |
26
|
7794
|
|
|
7794
|
1
|
30159
|
my ($label, %param) = @_; |
27
|
7794
|
50
|
|
|
|
21462
|
croak "Transitional processing is not defined for ToUnicode" if $param{'TransitionalProcessing'}; |
28
|
|
|
|
|
|
|
|
29
|
7794
|
|
|
|
|
18997
|
splice @_, 1, 0, undef; |
30
|
7794
|
|
|
|
|
28793
|
goto &_process; |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub uts46_to_ascii { |
34
|
12413
|
|
|
12413
|
1
|
4909640
|
my ($label, %param) = @_; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
splice @_, 1, 0, sub { |
37
|
2342
|
|
|
2342
|
|
4648
|
local $_ = shift; |
38
|
2342
|
100
|
|
|
|
7164
|
if(m/\P{ASCII}/) { |
39
|
1713
|
|
|
|
|
2834
|
eval { $_ = $IDNA_PREFIX . encode_punycode($_) }; |
|
1713
|
|
|
|
|
5158
|
|
40
|
1713
|
50
|
|
|
|
4124
|
croak "$@ [A3]" if $@; |
41
|
|
|
|
|
|
|
} |
42
|
2342
|
|
|
|
|
5028
|
return $_; |
43
|
12413
|
|
|
|
|
67032
|
}; |
44
|
12413
|
|
|
|
|
44182
|
goto &_process; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
*to_unicode = \&uts46_to_unicode; |
48
|
|
|
|
|
|
|
*to_ascii = \&uts46_to_ascii; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _process { |
51
|
20207
|
|
|
20207
|
|
51314
|
my ($label, $to_ascii, %param) = @_; |
52
|
12
|
|
|
12
|
|
3679
|
no warnings 'utf8'; |
|
12
|
|
|
|
|
26
|
|
|
12
|
|
|
|
|
2262
|
|
53
|
|
|
|
|
|
|
croak "The following parameter is invalid: $_" |
54
|
20207
|
|
|
|
|
49365
|
foreach(grep { !m/^(?:TransitionalProcessing|UseSTD3ASCIIRules|AllowUnassigned)$/ } keys %param); |
|
20181
|
|
|
|
|
131914
|
|
55
|
|
|
|
|
|
|
|
56
|
20207
|
100
|
|
|
|
60722
|
$param{'TransitionalProcessing'} = 0 unless exists $param{'TransitionalProcessing'}; |
57
|
20207
|
100
|
|
|
|
56069
|
$param{'UseSTD3ASCIIRules'} = 1 unless exists $param{'UseSTD3ASCIIRules'}; |
58
|
20207
|
100
|
|
|
|
51736
|
$param{'AllowUnassigned'} = 0 unless exists $param{'AllowUnassigned'}; |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# 1. Map |
61
|
|
|
|
|
|
|
# - disallowed |
62
|
|
|
|
|
|
|
# |
63
|
20207
|
100
|
|
|
|
44714
|
if($param{'AllowUnassigned'}) { |
64
|
2
|
50
|
|
|
|
22
|
$label =~ m/(\p{Is_DisallowedAssigned})/ and croak sprintf('disallowed character U+%04X', ord($1)); |
65
|
|
|
|
|
|
|
} else { |
66
|
20205
|
100
|
|
|
|
1128336
|
$label =~ m/(\p{IsDisallowed})/ and croak sprintf('disallowed character U+%04X', ord($1)); |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
12044
|
100
|
|
|
|
85962
|
if($param{'UseSTD3ASCIIRules'}) { |
70
|
12041
|
100
|
|
|
|
195389
|
$label =~ m/(\p{IsDisallowedSTD3Valid})/ and croak sprintf('disallowed_STD3_valid character U+%04X', ord($1)); |
71
|
10818
|
100
|
|
|
|
44067
|
$label =~ m/(\p{IsDisallowedSTD3Mapped})/ and croak sprintf('disallowed_STD3_mapped character U+%04X', ord($1)); |
72
|
|
|
|
|
|
|
}; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# - ignored |
75
|
|
|
|
|
|
|
# |
76
|
10787
|
|
|
|
|
39610
|
$label = MapIgnored($label); |
77
|
|
|
|
|
|
|
## $label = MapDisallowedSTD3Ignored($label) if(!$param{'UseSTD3ASCIIRules'}); |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# - mapped |
80
|
|
|
|
|
|
|
# |
81
|
10787
|
|
|
|
|
27539
|
$label = MapMapped($label); |
82
|
10787
|
100
|
|
|
|
27255
|
$label = MapDisallowedSTD3Mapped($label) if(!$param{'UseSTD3ASCIIRules'}); |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# - deviation |
85
|
10787
|
100
|
|
|
|
27745
|
$label = MapDeviation($label) if($param{'TransitionalProcessing'}); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# 2. Normalize |
88
|
|
|
|
|
|
|
# |
89
|
10787
|
|
|
|
|
89519
|
$label = Unicode::Normalize::NFC($label); |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# 3. Break |
92
|
|
|
|
|
|
|
# |
93
|
10787
|
|
|
|
|
46062
|
my @ll = split /\./, $label, -1; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
## IDNA test vectors: an empty label at the end (separating the root domain |
96
|
|
|
|
|
|
|
## "", if present) must be preserved. It is not checked for |
97
|
|
|
|
|
|
|
## the minumum length criteria and the dot separting it is |
98
|
|
|
|
|
|
|
## not included in the maximum length of the domain. |
99
|
|
|
|
|
|
|
## |
100
|
10787
|
100
|
100
|
|
|
58021
|
my $rooted = @ll && length($ll[$#ll]) < 1; pop @ll if $rooted; |
|
10787
|
|
|
|
|
25227
|
|
101
|
10787
|
|
|
|
|
17825
|
my $is_bidi = 0; |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# 4. Convert/Validate |
104
|
|
|
|
|
|
|
# |
105
|
10787
|
|
|
|
|
20374
|
foreach my $l (@ll) { |
106
|
16833
|
100
|
|
|
|
73465
|
if($l =~ m/^(?:(?i)$IDNA_PREFIX)(\p{ASCII}+)$/o) { |
107
|
8378
|
|
|
|
|
15778
|
eval { $l = decode_punycode($1); }; |
|
8378
|
|
|
|
|
26055
|
|
108
|
8378
|
100
|
|
|
|
20881
|
croak 'Invalid Punycode sequence [P4]' if $@; |
109
|
|
|
|
|
|
|
|
110
|
8373
|
50
|
|
|
|
31132
|
_validate_label($l, %param, |
111
|
|
|
|
|
|
|
'TransitionalProcessing' => 0, |
112
|
|
|
|
|
|
|
) unless $@; |
113
|
|
|
|
|
|
|
} else { |
114
|
8455
|
|
|
|
|
44826
|
_validate_label($l,%param,'_AssumeNFC' => 1); |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
9968
|
100
|
100
|
|
|
48767
|
$is_bidi = 1 if !$is_bidi && $l =~ m/[\p{Bc:R}\p{Bc:AL}\p{Bc:AN}]/; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
3922
|
|
|
|
|
7826
|
foreach my $l (@ll) { |
121
|
5801
|
100
|
|
|
|
17751
|
_validate_bidi($l,%param) if $is_bidi; |
122
|
4033
|
|
|
|
|
12400
|
_validate_contextj($l,%param); |
123
|
|
|
|
|
|
|
|
124
|
3661
|
100
|
|
|
|
9208
|
if(defined $to_ascii) { |
125
|
2342
|
|
|
|
|
6330
|
$l = $to_ascii->($l, %param); |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
## IDNA test vectors: labels have to be checked for the minimum length of 1 (but not for the |
129
|
|
|
|
|
|
|
## maximum length of 63) even in to_unicode. |
130
|
|
|
|
|
|
|
## |
131
|
3661
|
100
|
|
|
|
25142
|
croak "empty label [A4_2]" if length($l) < 1; |
132
|
3520
|
100
|
100
|
|
|
18329
|
croak "label too long [A4_2]" if length($l) > 63 and defined $to_ascii; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
1573
|
|
|
|
|
4429
|
my $domain = join('.', @ll); |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
## IDNA test vectors: domains have to be checked for the minimum length of 1 (but not for the |
138
|
|
|
|
|
|
|
## maximum length of 253 excluding a final dot) even in to_unicode. |
139
|
|
|
|
|
|
|
## |
140
|
1573
|
100
|
|
|
|
4380
|
croak "empty domain name [A4_1]" if length($domain) < 1; |
141
|
1568
|
100
|
100
|
|
|
5621
|
croak "domain name too long [A4_1]" if length($domain) > 253 and defined $to_ascii; |
142
|
|
|
|
|
|
|
|
143
|
1556
|
100
|
|
|
|
3207
|
$domain .= '.' if $rooted; |
144
|
|
|
|
|
|
|
|
145
|
1556
|
|
|
|
|
14807
|
return $domain; |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub _validate_label { |
149
|
16828
|
|
|
16828
|
|
57050
|
my($l,%param) = @_; |
150
|
12
|
|
|
12
|
|
30702
|
no warnings 'utf8'; |
|
12
|
|
|
|
|
29
|
|
|
12
|
|
|
|
|
1520
|
|
151
|
|
|
|
|
|
|
|
152
|
16828
|
100
|
66
|
|
|
83892
|
$l eq Unicode::Normalize::NFC($l) or croak "not in Unicode Normalization Form NFC [V1]" unless $param{'_AssumeNFC'}; |
153
|
|
|
|
|
|
|
|
154
|
16825
|
100
|
|
|
|
57068
|
$l =~ m/^..--/ and croak "contains U+002D HYPHEN-MINUS in both third and forth position [V2]"; |
155
|
16705
|
100
|
|
|
|
120871
|
$l =~ m/^-/ and croak "begins with U+002D HYPHEN-MINUS [V3]"; |
156
|
15973
|
100
|
|
|
|
106438
|
$l =~ m/-$/ and croak "ends with U+002D HYPHEN-MINUS [V3]"; |
157
|
15371
|
50
|
|
|
|
30995
|
$l =~ m/\./ and croak "contains U+0023 FULL STOP [V4]"; |
158
|
15371
|
100
|
|
|
|
274939
|
$l =~ m/^\p{IsMark}/ and croak "begins with General_Category=Mark [V5]"; |
159
|
|
|
|
|
|
|
|
160
|
13439
|
100
|
|
|
|
29041
|
unless($param{'AllowUnassigned'}) { |
161
|
13437
|
100
|
|
|
|
292828
|
$l =~m/(\p{Unassigned})/ and croak sprintf "contains unassigned character U+%04X [V6]", ord $1; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
11375
|
100
|
|
|
|
23792
|
if($param{'UseSTD3ASCIIRules'}) { |
165
|
11372
|
100
|
|
|
|
74805
|
$l =~m/(\p{IsDisallowedSTD3Valid})/ and croak sprintf "contains disallowed_STD3_valid character U+%04X [V6]", ord $1; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
10994
|
100
|
|
|
|
23519
|
if($param{'TransitionalProcessing'}) { |
169
|
2019
|
50
|
|
|
|
5112
|
$l =~ m/(\p{IsDeviation})/ and croak sprintf "contains deviation character U+%04X [V6]", ord $1; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
10994
|
50
|
|
|
|
28181
|
$l =~ m/(\p{IsIgnored})/ and croak sprintf "contains ignored character U+%04X [V6]", ord $1; |
173
|
10994
|
100
|
|
|
|
63102
|
$l =~ m/(\p{IsMapped}|\p{IsDisallowedSTD3Mapped})/ and croak sprintf "contains mapped character U+%04X [V6]", ord $1; |
174
|
10946
|
100
|
|
|
|
155723
|
$l =~ m/(\p{IsDisallowed})/ and croak sprintf "contains disallowed character U+%04X [V6]", ord $1; |
175
|
|
|
|
|
|
|
|
176
|
9968
|
|
|
|
|
63673
|
return 1; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# For perl versions < 5.11, there is a bug where Bc:L does not match some |
180
|
|
|
|
|
|
|
# character blocks that are not fully included in the main UnicodeData.txt file: |
181
|
|
|
|
|
|
|
# |
182
|
|
|
|
|
|
|
# 3400;;Lo;0;L;;;;;N;;;;; |
183
|
|
|
|
|
|
|
# 4DB5;;Lo;0;L;;;;;N;;;;; |
184
|
|
|
|
|
|
|
# 4E00;;Lo;0;L;;;;;N;;;;; |
185
|
|
|
|
|
|
|
# 9FBB;;Lo;0;L;;;;;N;;;;; |
186
|
|
|
|
|
|
|
# AC00;;Lo;0;L;;;;;N;;;;; |
187
|
|
|
|
|
|
|
# D7A3;;Lo;0;L;;;;;N;;;;; |
188
|
|
|
|
|
|
|
# 20000;;Lo;0;L;;;;;N;;;;; |
189
|
|
|
|
|
|
|
# 2A6D6;;Lo;0;L;;;;;N;;;;; |
190
|
|
|
|
|
|
|
# |
191
|
|
|
|
|
|
|
my $_RE_BidiClass_L = $] >= 5.011 ? '\p{Bc:L}' : '\p{Bc:L}\x{3400}-\x{4DB5}\x{4E00}-\x{9FBB}\x{AC00}-\x{D7A3}\x{20000}-\x{2A6D6}'; |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub _validate_bidi { |
194
|
3249
|
|
|
3249
|
|
9130
|
my($l,%param) = @_; |
195
|
12
|
|
|
12
|
|
25682
|
no warnings 'utf8'; |
|
12
|
|
|
|
|
28
|
|
|
12
|
|
|
|
|
1409
|
|
196
|
|
|
|
|
|
|
|
197
|
3249
|
100
|
|
|
|
8892
|
return 1 unless length($l); |
198
|
|
|
|
|
|
|
|
199
|
3200
|
100
|
|
|
|
12331
|
if( $l =~ m/^[$_RE_BidiClass_L]/o ) { # LTR (left-to-right) |
200
|
1009
|
100
|
|
|
|
52579
|
$l =~ m/[^$_RE_BidiClass_L\p{Bc:EN}\p{Bc:ES}\p{Bc:CS}\p{Bc:ET}\p{Bc:BN}\p{Bc:ON}\p{Bc:NSM}]/o and croak 'contains characters with wrong bidi class for LTR [B5]'; |
201
|
630
|
100
|
|
|
|
13489
|
$l =~ m/[$_RE_BidiClass_L\p{Bc:EN}][\p{Bc:NSM}\P{Assigned}]*$/o or croak 'ends with character of wrong bidi class for LTR [B6]'; |
202
|
552
|
|
|
|
|
2921
|
return 1; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
2191
|
100
|
|
|
|
7089
|
if( $l =~ m/^[\p{Bc:R}\p{Bc:AL}]/ ) { # RTL (right-to-left) |
206
|
1334
|
100
|
|
|
|
45099
|
$l =~ m/[^\p{Bc:R}\p{Bc:AL}\p{Bc:AN}\p{Bc:EN}\p{Bc:ES}\p{Bc:CS}\p{Bc:ET}\p{Bc:ON}\p{Bc:BN}\p{Bc:NSM}]/ and croak 'contains characters with wrong bidi class for RTL [B2]'; |
207
|
1012
|
100
|
|
|
|
16768
|
$l =~ m/[\p{Bc:R}\p{Bc:AL}\p{Bc:EN}\p{Bc:AN}][\p{Bc:NSM}\P{Assigned}]*$/ or croak 'ends with character of wrong bidi class for RTL [B3]'; |
208
|
922
|
100
|
|
|
|
8716
|
$l =~ m/\p{Bc:EN}.*\p{Bc:AN}|\p{Bc:AN}.*\p{Bc:EN}/ and croak 'contains characters with both bidi class EN and AN [B4]'; |
209
|
880
|
|
|
|
|
2353
|
return 1; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
857
|
|
|
|
|
110750
|
croak 'starts with character of wrong bidi class [B1]'; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# For perl versions < 5.11, some Unicode properties such as Ccc or Joining_Type |
216
|
|
|
|
|
|
|
# are not supported. Instead, we use a conrete list of characters; this is safe |
217
|
|
|
|
|
|
|
# because the Unicode version supported by theses perl versions will not be |
218
|
|
|
|
|
|
|
# updated. For newer perl versions, we use the Unicode property (which is |
219
|
|
|
|
|
|
|
# supported from 5.11), so we will always be up-to-date with the Unicode |
220
|
|
|
|
|
|
|
# version supported by our underlying perl. |
221
|
|
|
|
|
|
|
# |
222
|
|
|
|
|
|
|
my $_RE_Ccc_Virama = $] >= 5.011 ? qr/\p{Ccc:Virama}/ : qr/[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B4D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0DCA}\x{0E3A}\x{0F84}\x{1039}\x{103A}\x{1714}\x{1734}\x{17D2}\x{1A60}\x{1B44}\x{1BAA}\x{1BF2}\x{1BF3}\x{2D7F}\x{A806}\x{A8C4}\x{A953}\x{A9C0}\x{ABED}\x{00010A3F}\x{00011046}\x{000110B9}]/; |
223
|
|
|
|
|
|
|
my $_RE_JoiningType_L = $] >= 5.011 ? qr/\p{Joining_Type:L}/ : qr/(?!)/; |
224
|
|
|
|
|
|
|
my $_RE_JoiningType_R = $] >= 5.011 ? qr/\p{Joining_Type:R}/ : qr/[\x{0622}-\x{0625}\x{0627}\x{0629}\x{062F}-\x{0632}\x{0648}\x{0671}-\x{0673}\x{0675}-\x{0677}\x{0688}-\x{0699}\x{06C0}\x{06C3}-\x{06CB}\x{06CD}\x{06CF}\x{06D2}\x{06D3}\x{06D5}\x{06EE}\x{06EF}\x{0710}\x{0715}-\x{0719}\x{071E}\x{0728}\x{072A}\x{072C}\x{072F}\x{074D}\x{0759}-\x{075B}\x{076B}\x{076C}\x{0771}\x{0773}\x{0774}\x{0778}\x{0779}]/; |
225
|
|
|
|
|
|
|
my $_RE_JoiningType_D = $] >= 5.011 ? qr/\p{Joining_Type:D}/ : qr/[\x{0620}\x{0626}\x{0628}\x{062A}-\x{062E}\x{0633}-\x{063F}\x{0641}-\x{0647}\x{0649}\x{064A}\x{066E}\x{066F}\x{0678}-\x{0687}\x{069A}-\x{06BF}\x{06C1}\x{06C2}\x{06CC}\x{06CE}\x{06D0}\x{06D1}\x{06FA}-\x{06FC}\x{06FF}\x{0712}-\x{0714}\x{071A}-\x{071D}\x{071F}-\x{0727}\x{0729}\x{072B}\x{072D}\x{072E}\x{074E}-\x{0758}\x{075C}-\x{076A}\x{076D}-\x{0770}\x{0772}\x{0775}-\x{0777}\x{077A}-\x{077F}\x{07CA}-\x{07EA}]/; |
226
|
|
|
|
|
|
|
my $_RE_JoiningType_T = $] >= 5.011 ? qr/\p{Joining_Type:T}/ : qr/[\x{00AD}\x{0300}-\x{036F}\x{0483}-\x{0489}\x{0591}-\x{05BD}\x{05BF}\x{05C1}\x{05C2}\x{05C4}\x{05C5}\x{05C7}\x{0610}-\x{061A}\x{064B}-\x{065F}\x{0670}\x{06D6}-\x{06DC}\x{06DF}-\x{06E4}\x{06E7}\x{06E8}\x{06EA}-\x{06ED}\x{070F}\x{0711}\x{0730}-\x{074A}\x{07A6}-\x{07B0}\x{07EB}-\x{07F3}\x{0816}-\x{0819}\x{081B}-\x{0823}\x{0825}-\x{0827}\x{0829}-\x{082D}\x{0859}-\x{085B}\x{0900}-\x{0902}\x{093A}\x{093C}\x{0941}-\x{0948}\x{094D}\x{0951}-\x{0957}\x{0962}\x{0963}\x{0981}\x{09BC}\x{09C1}-\x{09C4}\x{09CD}\x{09E2}\x{09E3}\x{0A01}\x{0A02}\x{0A3C}\x{0A41}\x{0A42}\x{0A47}\x{0A48}\x{0A4B}-\x{0A4D}\x{0A51}\x{0A70}\x{0A71}\x{0A75}\x{0A81}\x{0A82}\x{0ABC}\x{0AC1}-\x{0AC5}\x{0AC7}\x{0AC8}\x{0ACD}\x{0AE2}\x{0AE3}\x{0B01}\x{0B3C}\x{0B3F}\x{0B41}-\x{0B44}\x{0B4D}\x{0B56}\x{0B62}\x{0B63}\x{0B82}\x{0BC0}\x{0BCD}\x{0C3E}-\x{0C40}\x{0C46}-\x{0C48}\x{0C4A}-\x{0C4D}\x{0C55}\x{0C56}\x{0C62}\x{0C63}\x{0CBC}\x{0CBF}\x{0CC6}\x{0CCC}\x{0CCD}\x{0CE2}\x{0CE3}\x{0D41}-\x{0D44}\x{0D4D}\x{0D62}\x{0D63}\x{0DCA}\x{0DD2}-\x{0DD4}\x{0DD6}\x{0E31}\x{0E34}-\x{0E3A}\x{0E47}-\x{0E4E}\x{0EB1}\x{0EB4}-\x{0EB9}\x{0EBB}\x{0EBC}\x{0EC8}-\x{0ECD}\x{0F18}\x{0F19}\x{0F35}\x{0F37}\x{0F39}\x{0F71}-\x{0F7E}\x{0F80}-\x{0F84}\x{0F86}\x{0F87}\x{0F8D}-\x{0F97}\x{0F99}-\x{0FBC}\x{0FC6}\x{102D}-\x{1030}\x{1032}-\x{1037}\x{1039}\x{103A}\x{103D}\x{103E}\x{1058}\x{1059}\x{105E}-\x{1060}\x{1071}-\x{1074}\x{1082}\x{1085}\x{1086}\x{108D}\x{109D}\x{135D}-\x{135F}\x{1712}-\x{1714}\x{1732}-\x{1734}\x{1752}\x{1753}\x{1772}\x{1773}\x{17B4}\x{17B5}\x{17B7}-\x{17BD}\x{17C6}\x{17C9}-\x{17D3}\x{17DD}\x{180B}-\x{180D}\x{18A9}\x{1920}-\x{1922}\x{1927}\x{1928}\x{1932}\x{1939}-\x{193B}\x{1A17}\x{1A18}\x{1A56}\x{1A58}-\x{1A5E}\x{1A60}\x{1A62}\x{1A65}-\x{1A6C}\x{1A73}-\x{1A7C}\x{1A7F}\x{1B00}-\x{1B03}\x{1B34}\x{1B36}-\x{1B3A}\x{1B3C}\x{1B42}\x{1B6B}-\x{1B73}\x{1B80}\x{1B81}\x{1BA2}-\x{1BA5}\x{1BA8}\x{1BA9}\x{1BE6}\x{1BE8}\x{1BE9}\x{1BED}\x{1BEF}-\x{1BF1}\x{1C2C}-\x{1C33}\x{1C36}\x{1C37}\x{1CD0}-\x{1CD2}\x{1CD4}-\x{1CE0}\x{1CE2}-\x{1CE8}\x{1CED}\x{1DC0}-\x{1DE6}\x{1DFC}-\x{1DFF}\x{200B}\x{200E}\x{200F}\x{202A}-\x{202E}\x{2060}-\x{2064}\x{206A}-\x{206F}\x{20D0}-\x{20F0}\x{2CEF}-\x{2CF1}\x{2D7F}\x{2DE0}-\x{2DFF}\x{302A}-\x{302F}\x{3099}\x{309A}\x{A66F}-\x{A672}\x{A67C}\x{A67D}\x{A6F0}\x{A6F1}\x{A802}\x{A806}\x{A80B}\x{A825}\x{A826}\x{A8C4}\x{A8E0}-\x{A8F1}\x{A926}-\x{A92D}\x{A947}-\x{A951}\x{A980}-\x{A982}\x{A9B3}\x{A9B6}-\x{A9B9}\x{A9BC}\x{AA29}-\x{AA2E}\x{AA31}\x{AA32}\x{AA35}\x{AA36}\x{AA43}\x{AA4C}\x{AAB0}\x{AAB2}-\x{AAB4}\x{AAB7}\x{AAB8}\x{AABE}\x{AABF}\x{AAC1}\x{ABE5}\x{ABE8}\x{ABED}\x{FB1E}\x{FE00}-\x{FE0F}\x{FE20}-\x{FE26}\x{FEFF}\x{FFF9}-\x{FFFB}\x{101FD}\x{10A01}-\x{10A03}\x{10A05}\x{10A06}\x{10A0C}-\x{10A0F}\x{10A38}-\x{10A3A}\x{10A3F}\x{11001}\x{11038}-\x{11046}\x{11080}\x{11081}\x{110B3}-\x{110B6}\x{110B9}\x{110BA}\x{110BD}\x{1D167}-\x{1D169}\x{1D173}-\x{1D182}\x{1D185}-\x{1D18B}\x{1D1AA}-\x{1D1AD}\x{1D242}-\x{1D244}\x{E0001}\x{E0020}-\x{E007F}\x{E0100}-\x{E01EF}]/; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub _validate_contextj { |
229
|
4033
|
|
|
4033
|
|
10660
|
my($l,%param) = @_; |
230
|
12
|
|
|
12
|
|
100803
|
no warnings 'utf8'; |
|
12
|
|
|
|
|
34
|
|
|
12
|
|
|
|
|
715
|
|
231
|
4033
|
100
|
66
|
|
|
16937
|
return 1 unless defined($l) && length($l); |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# catch ContextJ characters without defined rule (as of Unicode 6.0.0, this cannot match) |
234
|
|
|
|
|
|
|
# |
235
|
3892
|
50
|
|
|
|
13167
|
$l =~ m/([^\x{200C}\x{200D}\P{Join_Control}])/ and croak sprintf "contains CONTEXTJ character U+%04X without defined rule [C1]", ord($1); |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# RFC 5892, Appendix A.1. ZERO WIDTH NON-JOINER |
238
|
|
|
|
|
|
|
# Code point: |
239
|
|
|
|
|
|
|
# U+200C |
240
|
|
|
|
|
|
|
# |
241
|
|
|
|
|
|
|
# Overview: |
242
|
|
|
|
|
|
|
# This may occur in a formally cursive script (such as Arabic) in a |
243
|
|
|
|
|
|
|
# context where it breaks a cursive connection as required for |
244
|
|
|
|
|
|
|
# orthographic rules, as in the Persian language, for example. It |
245
|
|
|
|
|
|
|
# also may occur in Indic scripts in a consonant-conjunct context |
246
|
|
|
|
|
|
|
# (immediately following a virama), to control required display of |
247
|
|
|
|
|
|
|
# such conjuncts. |
248
|
|
|
|
|
|
|
# |
249
|
|
|
|
|
|
|
# |
250
|
|
|
|
|
|
|
# Lookup: |
251
|
|
|
|
|
|
|
# True |
252
|
|
|
|
|
|
|
# |
253
|
|
|
|
|
|
|
# Rule Set: |
254
|
|
|
|
|
|
|
# False; |
255
|
|
|
|
|
|
|
# If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
256
|
|
|
|
|
|
|
# If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C |
257
|
|
|
|
|
|
|
# (Joining_Type:T)*(Joining_Type:{R,D})) Then True; |
258
|
|
|
|
|
|
|
|
259
|
3892
|
100
|
100
|
|
|
51250
|
$l =~ m/ |
260
|
|
|
|
|
|
|
$_RE_Ccc_Virama |
261
|
|
|
|
|
|
|
\x{200C} |
262
|
|
|
|
|
|
|
| |
263
|
|
|
|
|
|
|
(?: $_RE_JoiningType_L | $_RE_JoiningType_D) $_RE_JoiningType_T* |
264
|
|
|
|
|
|
|
\x{200C} |
265
|
|
|
|
|
|
|
$_RE_JoiningType_T*(?: $_RE_JoiningType_R | $_RE_JoiningType_D) |
266
|
|
|
|
|
|
|
| |
267
|
|
|
|
|
|
|
(\x{200C}) |
268
|
|
|
|
|
|
|
/xo and defined($1) and croak sprintf "rule for CONTEXTJ character U+%04X not satisfied [C2]", ord($1); |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# RFC 5892, Appendix A.2. ZERO WIDTH JOINER |
271
|
|
|
|
|
|
|
# |
272
|
|
|
|
|
|
|
# Code point: |
273
|
|
|
|
|
|
|
# U+200D |
274
|
|
|
|
|
|
|
# |
275
|
|
|
|
|
|
|
# Overview: |
276
|
|
|
|
|
|
|
# This may occur in Indic scripts in a consonant-conjunct context |
277
|
|
|
|
|
|
|
# (immediately following a virama), to control required display of |
278
|
|
|
|
|
|
|
# such conjuncts. |
279
|
|
|
|
|
|
|
# |
280
|
|
|
|
|
|
|
# Lookup: |
281
|
|
|
|
|
|
|
# True |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# Rule Set: |
284
|
|
|
|
|
|
|
# False; |
285
|
|
|
|
|
|
|
# If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
286
|
|
|
|
|
|
|
|
287
|
3694
|
100
|
100
|
|
|
48282
|
$l =~ m/ |
288
|
|
|
|
|
|
|
$_RE_Ccc_Virama |
289
|
|
|
|
|
|
|
\x{200D} |
290
|
|
|
|
|
|
|
| |
291
|
|
|
|
|
|
|
(\x{200D}) |
292
|
|
|
|
|
|
|
/xo and defined($1) and croak sprintf "rule for CONTEXTJ character U+%04X not satisfied [C2]", ord($1); |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
1; |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
__END__ |