File Coverage

blib/lib/Net/IDN/UTS46.pm
Criterion Covered Total %
statement 124 124 100.0
branch 98 106 92.4
condition 22 24 91.6
subroutine 19 19 100.0
pod 2 2 100.0
total 265 275 96.3


line stmt bran cond sub pod time code
1             package Net::IDN::UTS46;
2              
3             require 5.008005; # Unicode BiDi classes
4              
5 12     12   638029 use strict;
  12         83  
  12         361  
6 12     12   1270 use utf8;
  12         101  
  12         88  
7 12     12   290 use warnings;
  12         23  
  12         321  
8              
9 12     12   67 use Carp;
  12         22  
  12         1682  
10              
11             our $VERSION = "2.500";
12             $VERSION = eval $VERSION;
13              
14             our @ISA = ('Exporter');
15             our @EXPORT = ();
16             our @EXPORT_OK = ('uts46_to_ascii', 'uts46_to_unicode');
17             our %EXPORT_TAGS = ( 'all' => \@EXPORT_OK );
18              
19 12     12   4992 use Unicode::Normalize ();
  12         17928  
  12         388  
20              
21 12     12   2825 use Net::IDN::Punycode 1.1 (':all');
  12         212  
  12         1690  
22 12     12   2962 use Net::IDN::Encode 2.100 (':_var');
  12         215  
  12         1676  
23 12     12   13040 use Net::IDN::UTS46::_Mapping 5.002 ('/^(Is|Map).*/'); # UTS #46 is only defined from Unicode 5.2.0
  12         944  
  12         6219  
24              
25             sub uts46_to_unicode {
26 7794     7794 1 22321 my ($label, %param) = @_;
27 7794 50       16935 croak "Transitional processing is not defined for ToUnicode" if $param{'TransitionalProcessing'};
28              
29 7794         15657 splice @_, 1, 0, undef;
30 7794         24238 goto &_process;
31             }
32              
33             sub uts46_to_ascii {
34 12413     12413 1 4911555 my ($label, %param) = @_;
35              
36             splice @_, 1, 0, sub {
37 2342     2342   4496 local $_ = shift;
38 2342 100       6814 if(m/\P{ASCII}/) {
39 1713         2874 eval { $_ = $IDNA_PREFIX . encode_punycode($_) };
  1713         8613  
40 1713 50       3861 croak "$@ [A3]" if $@;
41             }
42 2342         4894 return $_;
43 12413         73297 };
44 12413         45001 goto &_process;
45             }
46              
47             *to_unicode = \&uts46_to_unicode;
48             *to_ascii = \&uts46_to_ascii;
49              
50             sub _process {
51 20207     20207   48895 my ($label, $to_ascii, %param) = @_;
52 12     12   3770 no warnings 'utf8';
  12         27  
  12         2167  
53             croak "The following parameter is invalid: $_"
54 20207         45547 foreach(grep { !m/^(?:TransitionalProcessing|UseSTD3ASCIIRules|AllowUnassigned)$/ } keys %param);
  20181         124408  
55              
56 20207 100       56493 $param{'TransitionalProcessing'} = 0 unless exists $param{'TransitionalProcessing'};
57 20207 100       51740 $param{'UseSTD3ASCIIRules'} = 1 unless exists $param{'UseSTD3ASCIIRules'};
58 20207 100       46393 $param{'AllowUnassigned'} = 0 unless exists $param{'AllowUnassigned'};
59              
60             # 1. Map
61             # - disallowed
62             #
63 20207 100       41809 if($param{'AllowUnassigned'}) {
64 2 50       20 $label =~ m/(\p{Is_DisallowedAssigned})/ and croak sprintf('disallowed character U+%04X', ord($1));
65             } else {
66 20205 100       1026862 $label =~ m/(\p{IsDisallowed})/ and croak sprintf('disallowed character U+%04X', ord($1));
67             }
68              
69 12044 100       92606 if($param{'UseSTD3ASCIIRules'}) {
70 12041 100       179035 $label =~ m/(\p{IsDisallowedSTD3Valid})/ and croak sprintf('disallowed_STD3_valid character U+%04X', ord($1));
71 10818 100       39935 $label =~ m/(\p{IsDisallowedSTD3Mapped})/ and croak sprintf('disallowed_STD3_mapped character U+%04X', ord($1));
72             };
73              
74             # - ignored
75             #
76 10787         36221 $label = MapIgnored($label);
77             ## $label = MapDisallowedSTD3Ignored($label) if(!$param{'UseSTD3ASCIIRules'});
78              
79             # - mapped
80             #
81 10787         25839 $label = MapMapped($label);
82 10787 100       25604 $label = MapDisallowedSTD3Mapped($label) if(!$param{'UseSTD3ASCIIRules'});
83              
84             # - deviation
85 10787 100       25318 $label = MapDeviation($label) if($param{'TransitionalProcessing'});
86              
87             # 2. Normalize
88             #
89 10787         80551 $label = Unicode::Normalize::NFC($label);
90              
91             # 3. Break
92             #
93 10787         42860 my @ll = split /\./, $label, -1;
94              
95             ## IDNA test vectors: an empty label at the end (separating the root domain
96             ## "", if present) must be preserved. It is not checked for
97             ## the minumum length criteria and the dot separting it is
98             ## not included in the maximum length of the domain.
99             ##
100 10787 100 100     55359 my $rooted = @ll && length($ll[$#ll]) < 1; pop @ll if $rooted;
  10787         22649  
101 10787         16408 my $is_bidi = 0;
102              
103             # 4. Convert/Validate
104             #
105 10787         18874 foreach my $l (@ll) {
106 16833 100       67037 if($l =~ m/^(?:(?i)$IDNA_PREFIX)(\p{ASCII}+)$/o) {
107 8378         13634 eval { $l = decode_punycode($1); };
  8378         41898  
108 8378 100       19941 croak 'Invalid Punycode sequence [P4]' if $@;
109              
110 8373 50       27619 _validate_label($l, %param,
111             'TransitionalProcessing' => 0,
112             ) unless $@;
113             } else {
114 8455         41058 _validate_label($l,%param,'_AssumeNFC' => 1);
115             }
116              
117 9968 100 100     44796 $is_bidi = 1 if !$is_bidi && $l =~ m/[\p{Bc:R}\p{Bc:AL}\p{Bc:AN}]/;
118             }
119              
120 3922         7182 foreach my $l (@ll) {
121 5801 100       15152 _validate_bidi($l,%param) if $is_bidi;
122 4033         11523 _validate_contextj($l,%param);
123              
124 3661 100       8387 if(defined $to_ascii) {
125 2342         5819 $l = $to_ascii->($l, %param);
126             }
127              
128             ## IDNA test vectors: labels have to be checked for the minimum length of 1 (but not for the
129             ## maximum length of 63) even in to_unicode.
130             ##
131 3661 100       23272 croak "empty label [A4_2]" if length($l) < 1;
132 3520 100 100     17323 croak "label too long [A4_2]" if length($l) > 63 and defined $to_ascii;
133             }
134              
135 1573         4058 my $domain = join('.', @ll);
136              
137             ## IDNA test vectors: domains have to be checked for the minimum length of 1 (but not for the
138             ## maximum length of 253 excluding a final dot) even in to_unicode.
139             ##
140 1573 100       3942 croak "empty domain name [A4_1]" if length($domain) < 1;
141 1568 100 100     5168 croak "domain name too long [A4_1]" if length($domain) > 253 and defined $to_ascii;
142              
143 1556 100       3045 $domain .= '.' if $rooted;
144              
145 1556         13547 return $domain;
146             }
147              
148             sub _validate_label {
149 16828     16828   53256 my($l,%param) = @_;
150 12     12   30754 no warnings 'utf8';
  12         30  
  12         1572  
151              
152 16828 100 66     68075 $l eq Unicode::Normalize::NFC($l) or croak "not in Unicode Normalization Form NFC [V1]" unless $param{'_AssumeNFC'};
153              
154 16825 100       49517 $l =~ m/^..--/ and croak "contains U+002D HYPHEN-MINUS in both third and forth position [V2]";
155 16705 100       111718 $l =~ m/^-/ and croak "begins with U+002D HYPHEN-MINUS [V3]";
156 15973 100       95586 $l =~ m/-$/ and croak "ends with U+002D HYPHEN-MINUS [V3]";
157 15371 50       28568 $l =~ m/\./ and croak "contains U+0023 FULL STOP [V4]";
158 15371 100       253345 $l =~ m/^\p{IsMark}/ and croak "begins with General_Category=Mark [V5]";
159              
160 13439 100       26042 unless($param{'AllowUnassigned'}) {
161 13437 100       267164 $l =~m/(\p{Unassigned})/ and croak sprintf "contains unassigned character U+%04X [V6]", ord $1;
162             }
163              
164 11375 100       21424 if($param{'UseSTD3ASCIIRules'}) {
165 11372 100       68258 $l =~m/(\p{IsDisallowedSTD3Valid})/ and croak sprintf "contains disallowed_STD3_valid character U+%04X [V6]", ord $1;
166             }
167              
168 10994 100       20781 if($param{'TransitionalProcessing'}) {
169 2019 50       4689 $l =~ m/(\p{IsDeviation})/ and croak sprintf "contains deviation character U+%04X [V6]", ord $1;
170             }
171              
172 10994 50       25656 $l =~ m/(\p{IsIgnored})/ and croak sprintf "contains ignored character U+%04X [V6]", ord $1;
173 10994 100       54940 $l =~ m/(\p{IsMapped}|\p{IsDisallowedSTD3Mapped})/ and croak sprintf "contains mapped character U+%04X [V6]", ord $1;
174 10946 100       143234 $l =~ m/(\p{IsDisallowed})/ and croak sprintf "contains disallowed character U+%04X [V6]", ord $1;
175              
176 9968         60612 return 1;
177             }
178              
179             # For perl versions < 5.11, there is a bug where Bc:L does not match some
180             # character blocks that are not fully included in the main UnicodeData.txt file:
181             #
182             # 3400;;Lo;0;L;;;;;N;;;;;
183             # 4DB5;;Lo;0;L;;;;;N;;;;;
184             # 4E00;;Lo;0;L;;;;;N;;;;;
185             # 9FBB;;Lo;0;L;;;;;N;;;;;
186             # AC00;;Lo;0;L;;;;;N;;;;;
187             # D7A3;;Lo;0;L;;;;;N;;;;;
188             # 20000;;Lo;0;L;;;;;N;;;;;
189             # 2A6D6;;Lo;0;L;;;;;N;;;;;
190             #
191             my $_RE_BidiClass_L = $] >= 5.011 ? '\p{Bc:L}' : '\p{Bc:L}\x{3400}-\x{4DB5}\x{4E00}-\x{9FBB}\x{AC00}-\x{D7A3}\x{20000}-\x{2A6D6}';
192              
193             sub _validate_bidi {
194 3249     3249   8031 my($l,%param) = @_;
195 12     12   25374 no warnings 'utf8';
  12         28  
  12         1336  
196              
197 3249 100       8140 return 1 unless length($l);
198              
199 3200 100       10359 if( $l =~ m/^[$_RE_BidiClass_L]/o ) { # LTR (left-to-right)
200 1009 100       47919 $l =~ m/[^$_RE_BidiClass_L\p{Bc:EN}\p{Bc:ES}\p{Bc:CS}\p{Bc:ET}\p{Bc:BN}\p{Bc:ON}\p{Bc:NSM}]/o and croak 'contains characters with wrong bidi class for LTR [B5]';
201 630 100       12172 $l =~ m/[$_RE_BidiClass_L\p{Bc:EN}][\p{Bc:NSM}\P{Assigned}]*$/o or croak 'ends with character of wrong bidi class for LTR [B6]';
202 552         2593 return 1;
203             }
204              
205 2191 100       6471 if( $l =~ m/^[\p{Bc:R}\p{Bc:AL}]/ ) { # RTL (right-to-left)
206 1334 100       41673 $l =~ m/[^\p{Bc:R}\p{Bc:AL}\p{Bc:AN}\p{Bc:EN}\p{Bc:ES}\p{Bc:CS}\p{Bc:ET}\p{Bc:ON}\p{Bc:BN}\p{Bc:NSM}]/ and croak 'contains characters with wrong bidi class for RTL [B2]';
207 1012 100       14958 $l =~ m/[\p{Bc:R}\p{Bc:AL}\p{Bc:EN}\p{Bc:AN}][\p{Bc:NSM}\P{Assigned}]*$/ or croak 'ends with character of wrong bidi class for RTL [B3]';
208 922 100       7828 $l =~ m/\p{Bc:EN}.*\p{Bc:AN}|\p{Bc:AN}.*\p{Bc:EN}/ and croak 'contains characters with both bidi class EN and AN [B4]';
209 880         2140 return 1;
210             }
211              
212 857         99448 croak 'starts with character of wrong bidi class [B1]';
213             }
214              
215             # For perl versions < 5.11, some Unicode properties such as Ccc or Joining_Type
216             # are not supported. Instead, we use a conrete list of characters; this is safe
217             # because the Unicode version supported by theses perl versions will not be
218             # updated. For newer perl versions, we use the Unicode property (which is
219             # supported from 5.11), so we will always be up-to-date with the Unicode
220             # version supported by our underlying perl.
221             #
222             my $_RE_Ccc_Virama = $] >= 5.011 ? qr/\p{Ccc:Virama}/ : qr/[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B4D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0DCA}\x{0E3A}\x{0F84}\x{1039}\x{103A}\x{1714}\x{1734}\x{17D2}\x{1A60}\x{1B44}\x{1BAA}\x{1BF2}\x{1BF3}\x{2D7F}\x{A806}\x{A8C4}\x{A953}\x{A9C0}\x{ABED}\x{00010A3F}\x{00011046}\x{000110B9}]/;
223             my $_RE_JoiningType_L = $] >= 5.011 ? qr/\p{Joining_Type:L}/ : qr/(?!)/;
224             my $_RE_JoiningType_R = $] >= 5.011 ? qr/\p{Joining_Type:R}/ : qr/[\x{0622}-\x{0625}\x{0627}\x{0629}\x{062F}-\x{0632}\x{0648}\x{0671}-\x{0673}\x{0675}-\x{0677}\x{0688}-\x{0699}\x{06C0}\x{06C3}-\x{06CB}\x{06CD}\x{06CF}\x{06D2}\x{06D3}\x{06D5}\x{06EE}\x{06EF}\x{0710}\x{0715}-\x{0719}\x{071E}\x{0728}\x{072A}\x{072C}\x{072F}\x{074D}\x{0759}-\x{075B}\x{076B}\x{076C}\x{0771}\x{0773}\x{0774}\x{0778}\x{0779}]/;
225             my $_RE_JoiningType_D = $] >= 5.011 ? qr/\p{Joining_Type:D}/ : qr/[\x{0620}\x{0626}\x{0628}\x{062A}-\x{062E}\x{0633}-\x{063F}\x{0641}-\x{0647}\x{0649}\x{064A}\x{066E}\x{066F}\x{0678}-\x{0687}\x{069A}-\x{06BF}\x{06C1}\x{06C2}\x{06CC}\x{06CE}\x{06D0}\x{06D1}\x{06FA}-\x{06FC}\x{06FF}\x{0712}-\x{0714}\x{071A}-\x{071D}\x{071F}-\x{0727}\x{0729}\x{072B}\x{072D}\x{072E}\x{074E}-\x{0758}\x{075C}-\x{076A}\x{076D}-\x{0770}\x{0772}\x{0775}-\x{0777}\x{077A}-\x{077F}\x{07CA}-\x{07EA}]/;
226             my $_RE_JoiningType_T = $] >= 5.011 ? qr/\p{Joining_Type:T}/ : qr/[\x{00AD}\x{0300}-\x{036F}\x{0483}-\x{0489}\x{0591}-\x{05BD}\x{05BF}\x{05C1}\x{05C2}\x{05C4}\x{05C5}\x{05C7}\x{0610}-\x{061A}\x{064B}-\x{065F}\x{0670}\x{06D6}-\x{06DC}\x{06DF}-\x{06E4}\x{06E7}\x{06E8}\x{06EA}-\x{06ED}\x{070F}\x{0711}\x{0730}-\x{074A}\x{07A6}-\x{07B0}\x{07EB}-\x{07F3}\x{0816}-\x{0819}\x{081B}-\x{0823}\x{0825}-\x{0827}\x{0829}-\x{082D}\x{0859}-\x{085B}\x{0900}-\x{0902}\x{093A}\x{093C}\x{0941}-\x{0948}\x{094D}\x{0951}-\x{0957}\x{0962}\x{0963}\x{0981}\x{09BC}\x{09C1}-\x{09C4}\x{09CD}\x{09E2}\x{09E3}\x{0A01}\x{0A02}\x{0A3C}\x{0A41}\x{0A42}\x{0A47}\x{0A48}\x{0A4B}-\x{0A4D}\x{0A51}\x{0A70}\x{0A71}\x{0A75}\x{0A81}\x{0A82}\x{0ABC}\x{0AC1}-\x{0AC5}\x{0AC7}\x{0AC8}\x{0ACD}\x{0AE2}\x{0AE3}\x{0B01}\x{0B3C}\x{0B3F}\x{0B41}-\x{0B44}\x{0B4D}\x{0B56}\x{0B62}\x{0B63}\x{0B82}\x{0BC0}\x{0BCD}\x{0C3E}-\x{0C40}\x{0C46}-\x{0C48}\x{0C4A}-\x{0C4D}\x{0C55}\x{0C56}\x{0C62}\x{0C63}\x{0CBC}\x{0CBF}\x{0CC6}\x{0CCC}\x{0CCD}\x{0CE2}\x{0CE3}\x{0D41}-\x{0D44}\x{0D4D}\x{0D62}\x{0D63}\x{0DCA}\x{0DD2}-\x{0DD4}\x{0DD6}\x{0E31}\x{0E34}-\x{0E3A}\x{0E47}-\x{0E4E}\x{0EB1}\x{0EB4}-\x{0EB9}\x{0EBB}\x{0EBC}\x{0EC8}-\x{0ECD}\x{0F18}\x{0F19}\x{0F35}\x{0F37}\x{0F39}\x{0F71}-\x{0F7E}\x{0F80}-\x{0F84}\x{0F86}\x{0F87}\x{0F8D}-\x{0F97}\x{0F99}-\x{0FBC}\x{0FC6}\x{102D}-\x{1030}\x{1032}-\x{1037}\x{1039}\x{103A}\x{103D}\x{103E}\x{1058}\x{1059}\x{105E}-\x{1060}\x{1071}-\x{1074}\x{1082}\x{1085}\x{1086}\x{108D}\x{109D}\x{135D}-\x{135F}\x{1712}-\x{1714}\x{1732}-\x{1734}\x{1752}\x{1753}\x{1772}\x{1773}\x{17B4}\x{17B5}\x{17B7}-\x{17BD}\x{17C6}\x{17C9}-\x{17D3}\x{17DD}\x{180B}-\x{180D}\x{18A9}\x{1920}-\x{1922}\x{1927}\x{1928}\x{1932}\x{1939}-\x{193B}\x{1A17}\x{1A18}\x{1A56}\x{1A58}-\x{1A5E}\x{1A60}\x{1A62}\x{1A65}-\x{1A6C}\x{1A73}-\x{1A7C}\x{1A7F}\x{1B00}-\x{1B03}\x{1B34}\x{1B36}-\x{1B3A}\x{1B3C}\x{1B42}\x{1B6B}-\x{1B73}\x{1B80}\x{1B81}\x{1BA2}-\x{1BA5}\x{1BA8}\x{1BA9}\x{1BE6}\x{1BE8}\x{1BE9}\x{1BED}\x{1BEF}-\x{1BF1}\x{1C2C}-\x{1C33}\x{1C36}\x{1C37}\x{1CD0}-\x{1CD2}\x{1CD4}-\x{1CE0}\x{1CE2}-\x{1CE8}\x{1CED}\x{1DC0}-\x{1DE6}\x{1DFC}-\x{1DFF}\x{200B}\x{200E}\x{200F}\x{202A}-\x{202E}\x{2060}-\x{2064}\x{206A}-\x{206F}\x{20D0}-\x{20F0}\x{2CEF}-\x{2CF1}\x{2D7F}\x{2DE0}-\x{2DFF}\x{302A}-\x{302F}\x{3099}\x{309A}\x{A66F}-\x{A672}\x{A67C}\x{A67D}\x{A6F0}\x{A6F1}\x{A802}\x{A806}\x{A80B}\x{A825}\x{A826}\x{A8C4}\x{A8E0}-\x{A8F1}\x{A926}-\x{A92D}\x{A947}-\x{A951}\x{A980}-\x{A982}\x{A9B3}\x{A9B6}-\x{A9B9}\x{A9BC}\x{AA29}-\x{AA2E}\x{AA31}\x{AA32}\x{AA35}\x{AA36}\x{AA43}\x{AA4C}\x{AAB0}\x{AAB2}-\x{AAB4}\x{AAB7}\x{AAB8}\x{AABE}\x{AABF}\x{AAC1}\x{ABE5}\x{ABE8}\x{ABED}\x{FB1E}\x{FE00}-\x{FE0F}\x{FE20}-\x{FE26}\x{FEFF}\x{FFF9}-\x{FFFB}\x{101FD}\x{10A01}-\x{10A03}\x{10A05}\x{10A06}\x{10A0C}-\x{10A0F}\x{10A38}-\x{10A3A}\x{10A3F}\x{11001}\x{11038}-\x{11046}\x{11080}\x{11081}\x{110B3}-\x{110B6}\x{110B9}\x{110BA}\x{110BD}\x{1D167}-\x{1D169}\x{1D173}-\x{1D182}\x{1D185}-\x{1D18B}\x{1D1AA}-\x{1D1AD}\x{1D242}-\x{1D244}\x{E0001}\x{E0020}-\x{E007F}\x{E0100}-\x{E01EF}]/;
227              
228             sub _validate_contextj {
229 4033     4033   9684 my($l,%param) = @_;
230 12     12   101640 no warnings 'utf8';
  12         30  
  12         710  
231 4033 100 66     15275 return 1 unless defined($l) && length($l);
232              
233             # catch ContextJ characters without defined rule (as of Unicode 6.0.0, this cannot match)
234             #
235 3892 50       11918 $l =~ m/([^\x{200C}\x{200D}\P{Join_Control}])/ and croak sprintf "contains CONTEXTJ character U+%04X without defined rule [C1]", ord($1);
236              
237             # RFC 5892, Appendix A.1. ZERO WIDTH NON-JOINER
238             # Code point:
239             # U+200C
240             #
241             # Overview:
242             # This may occur in a formally cursive script (such as Arabic) in a
243             # context where it breaks a cursive connection as required for
244             # orthographic rules, as in the Persian language, for example. It
245             # also may occur in Indic scripts in a consonant-conjunct context
246             # (immediately following a virama), to control required display of
247             # such conjuncts.
248             #
249             #
250             # Lookup:
251             # True
252             #
253             # Rule Set:
254             # False;
255             # If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
256             # If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
257             # (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
258              
259 3892 100 100     45304 $l =~ m/
260             $_RE_Ccc_Virama
261             \x{200C}
262             |
263             (?: $_RE_JoiningType_L | $_RE_JoiningType_D) $_RE_JoiningType_T*
264             \x{200C}
265             $_RE_JoiningType_T*(?: $_RE_JoiningType_R | $_RE_JoiningType_D)
266             |
267             (\x{200C})
268             /xo and defined($1) and croak sprintf "rule for CONTEXTJ character U+%04X not satisfied [C2]", ord($1);
269              
270             # RFC 5892, Appendix A.2. ZERO WIDTH JOINER
271             #
272             # Code point:
273             # U+200D
274             #
275             # Overview:
276             # This may occur in Indic scripts in a consonant-conjunct context
277             # (immediately following a virama), to control required display of
278             # such conjuncts.
279             #
280             # Lookup:
281             # True
282              
283             # Rule Set:
284             # False;
285             # If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
286              
287 3694 100 100     43730 $l =~ m/
288             $_RE_Ccc_Virama
289             \x{200D}
290             |
291             (\x{200D})
292             /xo and defined($1) and croak sprintf "rule for CONTEXTJ character U+%04X not satisfied [C2]", ord($1);
293             }
294              
295             1;
296              
297             __END__