line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package utf8::all; |
2
|
18
|
|
|
18
|
|
645601
|
use strict; |
|
18
|
|
|
|
|
40
|
|
|
18
|
|
|
|
|
477
|
|
3
|
18
|
|
|
18
|
|
91
|
use warnings; |
|
18
|
|
|
|
|
37
|
|
|
18
|
|
|
|
|
656
|
|
4
|
18
|
|
|
18
|
|
339
|
use 5.010; # state |
|
18
|
|
|
|
|
63
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# ABSTRACT: turn on Unicode - all of it |
7
|
|
|
|
|
|
|
our $VERSION = '0.022'; # VERSION |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#pod =head1 SYNOPSIS |
10
|
|
|
|
|
|
|
#pod |
11
|
|
|
|
|
|
|
#pod use utf8::all; # Turn on UTF-8, all of it. |
12
|
|
|
|
|
|
|
#pod |
13
|
|
|
|
|
|
|
#pod open my $in, '<', 'contains-utf8'; # UTF-8 already turned on here |
14
|
|
|
|
|
|
|
#pod print length 'føø bār'; # 7 UTF-8 characters |
15
|
|
|
|
|
|
|
#pod my $utf8_arg = shift @ARGV; # @ARGV is UTF-8 too (only for main) |
16
|
|
|
|
|
|
|
#pod |
17
|
|
|
|
|
|
|
#pod =head1 DESCRIPTION |
18
|
|
|
|
|
|
|
#pod |
19
|
|
|
|
|
|
|
#pod The C |
20
|
|
|
|
|
|
|
#pod program text in the current lexical scope. This also means that you |
21
|
|
|
|
|
|
|
#pod can now use literal Unicode characters as part of strings, variable |
22
|
|
|
|
|
|
|
#pod names, and regular expressions. |
23
|
|
|
|
|
|
|
#pod |
24
|
|
|
|
|
|
|
#pod C goes further: |
25
|
|
|
|
|
|
|
#pod |
26
|
|
|
|
|
|
|
#pod =over 4 |
27
|
|
|
|
|
|
|
#pod |
28
|
|
|
|
|
|
|
#pod =item * |
29
|
|
|
|
|
|
|
#pod |
30
|
|
|
|
|
|
|
#pod L|charnames> are imported so C<\N{...}> sequences can be |
31
|
|
|
|
|
|
|
#pod used to compile Unicode characters based on names. |
32
|
|
|
|
|
|
|
#pod |
33
|
|
|
|
|
|
|
#pod =item * |
34
|
|
|
|
|
|
|
#pod |
35
|
|
|
|
|
|
|
#pod On Perl C or higher, the C |
36
|
|
|
|
|
|
|
#pod enabled. |
37
|
|
|
|
|
|
|
#pod |
38
|
|
|
|
|
|
|
#pod =item * |
39
|
|
|
|
|
|
|
#pod |
40
|
|
|
|
|
|
|
#pod C |
41
|
|
|
|
|
|
|
#pod C<5.16.0> and higher. |
42
|
|
|
|
|
|
|
#pod |
43
|
|
|
|
|
|
|
#pod =item * |
44
|
|
|
|
|
|
|
#pod |
45
|
|
|
|
|
|
|
#pod Filehandles are opened with UTF-8 encoding turned on by default |
46
|
|
|
|
|
|
|
#pod (including STDIN, STDOUT, STDERR). Meaning that they automatically |
47
|
|
|
|
|
|
|
#pod convert UTF-8 octets to characters and vice versa. If you I |
48
|
|
|
|
|
|
|
#pod want UTF-8 for a particular filehandle, you'll have to set C
|
49
|
|
|
|
|
|
|
#pod $filehandle>. |
50
|
|
|
|
|
|
|
#pod |
51
|
|
|
|
|
|
|
#pod =item * |
52
|
|
|
|
|
|
|
#pod |
53
|
|
|
|
|
|
|
#pod C<@ARGV> gets converted from UTF-8 octets to Unicode characters (when |
54
|
|
|
|
|
|
|
#pod C is used from the main package). This is similar to the |
55
|
|
|
|
|
|
|
#pod behaviour of the C<-CA> perl command-line switch (see L). |
56
|
|
|
|
|
|
|
#pod |
57
|
|
|
|
|
|
|
#pod =item * |
58
|
|
|
|
|
|
|
#pod |
59
|
|
|
|
|
|
|
#pod C, C, C (including the C and |
60
|
|
|
|
|
|
|
#pod backtick operators), and L|perlfunc/glob> (including the C<< <> |
61
|
|
|
|
|
|
|
#pod >> operator) now all work with and return Unicode characters instead |
62
|
|
|
|
|
|
|
#pod of (UTF-8) octets. |
63
|
|
|
|
|
|
|
#pod |
64
|
|
|
|
|
|
|
#pod =back |
65
|
|
|
|
|
|
|
#pod |
66
|
|
|
|
|
|
|
#pod =head2 Lexical Scope |
67
|
|
|
|
|
|
|
#pod |
68
|
|
|
|
|
|
|
#pod The pragma is lexically-scoped, so you can do the following if you had |
69
|
|
|
|
|
|
|
#pod some reason to: |
70
|
|
|
|
|
|
|
#pod |
71
|
|
|
|
|
|
|
#pod { |
72
|
|
|
|
|
|
|
#pod use utf8::all; |
73
|
|
|
|
|
|
|
#pod open my $out, '>', 'outfile'; |
74
|
|
|
|
|
|
|
#pod my $utf8_str = 'føø bār'; |
75
|
|
|
|
|
|
|
#pod print length $utf8_str, "\n"; # 7 |
76
|
|
|
|
|
|
|
#pod print $out $utf8_str; # out as utf8 |
77
|
|
|
|
|
|
|
#pod } |
78
|
|
|
|
|
|
|
#pod open my $in, '<', 'outfile'; # in as raw |
79
|
|
|
|
|
|
|
#pod my $text = do { local $/; <$in>}; |
80
|
|
|
|
|
|
|
#pod print length $text, "\n"; # 10, not 7! |
81
|
|
|
|
|
|
|
#pod |
82
|
|
|
|
|
|
|
#pod Instead of lexical scoping, you can also use C to turn |
83
|
|
|
|
|
|
|
#pod off the effects. |
84
|
|
|
|
|
|
|
#pod |
85
|
|
|
|
|
|
|
#pod Note that the effect on C<@ARGV> and the C, C, and |
86
|
|
|
|
|
|
|
#pod C file handles is always global! |
87
|
|
|
|
|
|
|
#pod |
88
|
|
|
|
|
|
|
#pod =head2 UTF-8 Errors |
89
|
|
|
|
|
|
|
#pod |
90
|
|
|
|
|
|
|
#pod C will handle invalid code points (i.e., utf-8 that does |
91
|
|
|
|
|
|
|
#pod not map to a valid unicode "character"), as a fatal error. |
92
|
|
|
|
|
|
|
#pod |
93
|
|
|
|
|
|
|
#pod For C, C, and C, one can change this |
94
|
|
|
|
|
|
|
#pod behaviour by setting the attribute L"$utf8::all::UTF8_CHECK">. |
95
|
|
|
|
|
|
|
#pod |
96
|
|
|
|
|
|
|
#pod =head1 COMPATIBILITY |
97
|
|
|
|
|
|
|
#pod |
98
|
|
|
|
|
|
|
#pod The filesystems of Dos, Windows, and OS/2 do not (fully) support |
99
|
|
|
|
|
|
|
#pod UTF-8. The C and C functions and C operators |
100
|
|
|
|
|
|
|
#pod will therefore not be replaced on these systems. |
101
|
|
|
|
|
|
|
#pod |
102
|
|
|
|
|
|
|
#pod =head1 SEE ALSO |
103
|
|
|
|
|
|
|
#pod |
104
|
|
|
|
|
|
|
#pod =over 4 |
105
|
|
|
|
|
|
|
#pod |
106
|
|
|
|
|
|
|
#pod =item * |
107
|
|
|
|
|
|
|
#pod |
108
|
|
|
|
|
|
|
#pod L for fully utf-8 aware File::Find functions. |
109
|
|
|
|
|
|
|
#pod |
110
|
|
|
|
|
|
|
#pod =item * |
111
|
|
|
|
|
|
|
#pod |
112
|
|
|
|
|
|
|
#pod L for fully utf-8 aware Cwd functions. |
113
|
|
|
|
|
|
|
#pod |
114
|
|
|
|
|
|
|
#pod =back |
115
|
|
|
|
|
|
|
#pod |
116
|
|
|
|
|
|
|
#pod =cut |
117
|
|
|
|
|
|
|
|
118
|
18
|
|
|
18
|
|
7139
|
use Import::Into; |
|
18
|
|
|
|
|
38561
|
|
|
18
|
|
|
|
|
518
|
|
119
|
18
|
|
|
18
|
|
6127
|
use parent qw(Encode charnames utf8 open warnings feature); |
|
18
|
|
|
|
|
4039
|
|
|
18
|
|
|
|
|
152
|
|
120
|
18
|
|
|
18
|
|
2236111
|
use Symbol qw(qualify_to_ref); |
|
18
|
|
|
|
|
12015
|
|
|
18
|
|
|
|
|
1384
|
|
121
|
18
|
|
|
18
|
|
139
|
use Config; |
|
18
|
|
|
|
|
50
|
|
|
18
|
|
|
|
|
1181
|
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Holds the pointers to the original version of redefined functions |
124
|
|
|
|
|
|
|
state %_orig_functions; |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Current (i.e., this) package |
127
|
|
|
|
|
|
|
my $current_package = __PACKAGE__; |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
require Carp; |
130
|
|
|
|
|
|
|
$Carp::Internal{$current_package}++; # To get warnings reported at correct caller level |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
#pod =attr $utf8::all::UTF8_CHECK |
133
|
|
|
|
|
|
|
#pod |
134
|
|
|
|
|
|
|
#pod By default C marks decoding errors as fatal (default value |
135
|
|
|
|
|
|
|
#pod for this setting is C). If you want, you can change this by |
136
|
|
|
|
|
|
|
#pod setting C<$utf8::all::UTF8_CHECK>. The value C reports |
137
|
|
|
|
|
|
|
#pod the encoding errors as warnings, and C will completely |
138
|
|
|
|
|
|
|
#pod ignore them. Please see L for details. Note: C is |
139
|
|
|
|
|
|
|
#pod I enforced. |
140
|
|
|
|
|
|
|
#pod |
141
|
|
|
|
|
|
|
#pod Important: Only controls the handling of decoding errors in C, |
142
|
|
|
|
|
|
|
#pod C, and C. |
143
|
|
|
|
|
|
|
#pod |
144
|
|
|
|
|
|
|
#pod =cut |
145
|
|
|
|
|
|
|
|
146
|
18
|
|
|
18
|
|
118
|
use Encode (); |
|
18
|
|
|
|
|
43
|
|
|
18
|
|
|
|
|
335
|
|
147
|
18
|
|
|
18
|
|
7346
|
use PerlIO::utf8_strict; |
|
18
|
|
|
|
|
7482
|
|
|
18
|
|
|
|
|
3903
|
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
our $UTF8_CHECK = Encode::FB_CROAK | Encode::LEAVE_SRC; # Die on encoding errors |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# UTF-8 Encoding object |
152
|
|
|
|
|
|
|
my $_UTF8 = Encode::find_encoding('UTF-8'); |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub import { |
155
|
|
|
|
|
|
|
# Enable features/pragmas in calling package |
156
|
26
|
|
|
26
|
|
1343
|
my $target = caller; |
157
|
|
|
|
|
|
|
|
158
|
26
|
|
|
|
|
229
|
'utf8'->import::into($target); |
159
|
26
|
|
|
|
|
6280
|
'open'->import::into($target, 'IO' => ':utf8_strict'); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# use open ':std' only works with some encodings. |
162
|
26
|
|
|
|
|
6613
|
state $have_encoded_std = 0; |
163
|
26
|
100
|
|
|
|
203
|
if (!$have_encoded_std++) { |
164
|
18
|
|
|
|
|
146
|
binmode STDERR, ':utf8_strict'; |
165
|
18
|
|
|
|
|
89
|
binmode STDOUT, ':utf8_strict'; |
166
|
18
|
|
|
|
|
88
|
binmode STDIN, ':utf8_strict'; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
26
|
|
|
|
|
164
|
'charnames'->import::into($target, qw{:full :short}); |
170
|
26
|
|
|
|
|
10362
|
'warnings'->import::into($target, qw{FATAL utf8}); |
171
|
26
|
50
|
|
|
|
6083
|
'feature'->import::into($target, qw{unicode_strings}) if $^V >= v5.11.0; |
172
|
26
|
50
|
|
|
|
7578
|
'feature'->import::into($target, qw{unicode_eval fc}) if $^V >= v5.16.0; |
173
|
|
|
|
|
|
|
|
174
|
26
|
50
|
|
|
|
5891
|
unless ($^O =~ /MSWin32|cygwin|dos|os2/) { |
175
|
18
|
|
|
18
|
|
194
|
no strict qw(refs); ## no critic (TestingAndDebugging::ProhibitNoStrict) |
|
18
|
|
|
|
|
44
|
|
|
18
|
|
|
|
|
665
|
|
176
|
18
|
|
|
18
|
|
98
|
no warnings qw(redefine); |
|
18
|
|
|
|
|
42
|
|
|
18
|
|
|
|
|
10913
|
|
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Replace readdir with utf8 aware version |
179
|
26
|
|
|
|
|
86
|
*{$target . '::readdir'} = \&_utf8_readdir; |
|
26
|
|
|
|
|
169
|
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Replace readdir with utf8 aware version |
182
|
26
|
|
|
|
|
74
|
*{$target . '::readlink'} = \&_utf8_readlink; |
|
26
|
|
|
|
|
148
|
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# Replace glob with utf8 aware version |
185
|
26
|
|
|
|
|
90
|
*{$target . '::glob'} = \&_utf8_glob; |
|
26
|
|
|
|
|
170
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# Set compiler hint to encode/decode in the redefined functions |
188
|
26
|
|
|
|
|
124
|
$^H{'utf8::all'} = 1; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# Make @ARGV utf-8 when called from the main package, unless perl was launched |
192
|
|
|
|
|
|
|
# with the -CA flag as this already has @ARGV decoded automatically. |
193
|
|
|
|
|
|
|
# -CA is active if the the fifth bit (32) of the ${^UNICODE} variable is set. |
194
|
|
|
|
|
|
|
# (see perlrun on the -C command switch for details about ${^UNICODE}) |
195
|
26
|
50
|
|
|
|
137
|
if (!(${^UNICODE} & 32)) { |
196
|
26
|
|
|
|
|
68
|
state $have_encoded_argv = 0; |
197
|
26
|
100
|
100
|
|
|
215
|
if ($target eq 'main' && !$have_encoded_argv++) { |
198
|
17
|
50
|
|
|
|
116
|
$UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC |
199
|
17
|
50
|
|
|
|
95
|
$_ = ($_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_) for @ARGV; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
26
|
|
|
|
|
9787
|
return; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub unimport { ## no critic (Subroutines::ProhibitBuiltinHomonyms) |
207
|
|
|
|
|
|
|
# Disable features/pragmas in calling package |
208
|
|
|
|
|
|
|
# Note: Does NOT undo the effect on @ARGV, |
209
|
|
|
|
|
|
|
# nor on the STDIN, STDOUT, and STDERR file handles! |
210
|
|
|
|
|
|
|
# These effects are always "global". |
211
|
|
|
|
|
|
|
|
212
|
1
|
|
|
1
|
|
8
|
my $target = caller; |
213
|
1
|
|
|
|
|
6
|
'utf8'->unimport::out_of($target); |
214
|
1
|
|
|
|
|
162
|
'open'->import::into($target, qw{IO :bytes}); |
215
|
|
|
|
|
|
|
|
216
|
1
|
|
|
|
|
192
|
$^H{'utf8::all'} = 0; # Reset compiler hint |
217
|
|
|
|
|
|
|
|
218
|
1
|
|
|
|
|
1535
|
return; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub _utf8_readdir(*) { ## no critic (Subroutines::ProhibitSubroutinePrototypes) |
222
|
6
|
|
|
6
|
|
24951
|
my $pre_handle = shift; |
223
|
6
|
|
|
|
|
74
|
my $hints = (caller 0)[10]; |
224
|
6
|
100
|
|
|
|
32
|
my $handle = ref($pre_handle) ? $pre_handle : qualify_to_ref($pre_handle, caller); |
225
|
6
|
100
|
|
|
|
64
|
if (not $hints->{'utf8::all'}) { |
226
|
1
|
|
|
|
|
10
|
return CORE::readdir($handle); |
227
|
|
|
|
|
|
|
} else { |
228
|
5
|
50
|
|
|
|
35
|
$UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC |
229
|
5
|
100
|
|
|
|
17
|
if (wantarray) { |
230
|
4
|
50
|
|
|
|
64
|
return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::readdir($handle); |
|
16
|
|
|
|
|
193
|
|
231
|
|
|
|
|
|
|
} else { |
232
|
1
|
|
|
|
|
4
|
my $r = CORE::readdir($handle); |
233
|
1
|
50
|
|
|
|
8
|
return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
sub _utf8_readlink(_) { ## no critic (Subroutines::ProhibitSubroutinePrototypes) |
239
|
4
|
|
|
4
|
|
2662
|
my $arg = shift; |
240
|
4
|
|
|
|
|
28
|
my $hints = (caller 0)[10]; |
241
|
4
|
50
|
|
|
|
124
|
if (not $hints->{'utf8::all'}) { |
242
|
0
|
|
|
|
|
0
|
return CORE::readlink($arg); |
243
|
|
|
|
|
|
|
} else { |
244
|
4
|
100
|
|
|
|
25
|
$UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC |
245
|
4
|
50
|
|
|
|
41
|
$arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg; |
246
|
3
|
|
|
|
|
88
|
my $r = CORE::readlink($arg); |
247
|
3
|
100
|
|
|
|
36
|
return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
sub _utf8_glob { |
252
|
9
|
|
|
9
|
|
3183
|
my $arg = $_[0]; # Making this a lexical somehow is important! |
253
|
9
|
|
|
|
|
49
|
my $hints = (caller 0)[10]; |
254
|
9
|
50
|
|
|
|
147
|
if (not $hints->{'utf8::all'}) { |
255
|
0
|
|
|
|
|
0
|
return CORE::glob($arg); |
256
|
|
|
|
|
|
|
} else { |
257
|
9
|
100
|
|
|
|
35
|
$UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC |
258
|
9
|
50
|
|
|
|
67
|
$arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg; |
259
|
8
|
100
|
|
|
|
69
|
if (wantarray) { |
260
|
3
|
50
|
|
|
|
69
|
return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::glob($arg); |
|
6
|
|
|
|
|
55
|
|
261
|
|
|
|
|
|
|
} else { |
262
|
5
|
|
|
|
|
78
|
my $r = CORE::glob($arg); |
263
|
5
|
100
|
|
|
|
36
|
return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
#pod =head1 INTERACTION WITH AUTODIE |
269
|
|
|
|
|
|
|
#pod |
270
|
|
|
|
|
|
|
#pod If you use L, which is a great idea, you need to use at least |
271
|
|
|
|
|
|
|
#pod version B<2.12>, released on L
|
272
|
|
|
|
|
|
|
#pod 2012|https://metacpan.org/source/PJF/autodie-2.12/Changes#L3>. |
273
|
|
|
|
|
|
|
#pod Otherwise, autodie obliterates the IO layers set by the L |
274
|
|
|
|
|
|
|
#pod pragma. See L |
275
|
|
|
|
|
|
|
#pod #54777|https://rt.cpan.org/Ticket/Display.html?id=54777> and L
|
276
|
|
|
|
|
|
|
#pod #7|https://github.com/doherty/utf8-all/issues/7>. |
277
|
|
|
|
|
|
|
#pod |
278
|
|
|
|
|
|
|
#pod =cut |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
1; |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
__END__ |