File Coverage

blib/lib/utf8/all.pm
Criterion Covered Total %
statement 96 98 97.9
branch 39 52 75.0
condition 9 12 75.0
subroutine 16 16 100.0
pod n/a
total 160 178 89.8


line stmt bran cond sub pod time code
1             package utf8::all;
2 22     22   2458095 use strict;
  22         1986  
  22         872  
3 22     22   217 use warnings;
  22         48  
  22         1246  
4 22     22   437 use 5.010; # state
  22         85  
5              
6             # ABSTRACT: turn on Unicode - all of it
7             our $VERSION = '0.026'; # VERSION
8              
9             #pod =head1 SYNOPSIS
10             #pod
11             #pod use utf8::all; # Turn on UTF-8, all of it.
12             #pod
13             #pod open my $in, '<', 'contains-utf8'; # UTF-8 already turned on here
14             #pod print length 'føø bār'; # 7 UTF-8 characters
15             #pod my $utf8_arg = shift @ARGV; # @ARGV is UTF-8 too (only for main)
16             #pod
17             #pod =head1 DESCRIPTION
18             #pod
19             #pod The C pragma tells the Perl parser to allow UTF-8 in the
20             #pod program text in the current lexical scope. This also means that you
21             #pod can now use literal Unicode characters as part of strings, variable
22             #pod names, and regular expressions.
23             #pod
24             #pod C goes further:
25             #pod
26             #pod =over 4
27             #pod
28             #pod =item *
29             #pod
30             #pod L|charnames> are imported so C<\N{...}> sequences can be
31             #pod used to compile Unicode characters based on names.
32             #pod
33             #pod =item *
34             #pod
35             #pod On Perl C or higher, the C is
36             #pod enabled.
37             #pod
38             #pod =item *
39             #pod
40             #pod C and C are enabled on Perl
41             #pod C<5.16.0> and higher.
42             #pod
43             #pod =item *
44             #pod
45             #pod Filehandles are opened with UTF-8 encoding turned on by default
46             #pod (including C, C, and C when C is
47             #pod used from the C
package). Meaning that they automatically
48             #pod convert UTF-8 octets to characters and vice versa. If you I
49             #pod want UTF-8 for a particular filehandle, you'll have to set C
50             #pod $filehandle>.
51             #pod
52             #pod =item *
53             #pod
54             #pod C<@ARGV> gets converted from UTF-8 octets to Unicode characters (when
55             #pod C is used from the C
package). This is similar to the
56             #pod behaviour of the C<-CA> perl command-line switch (see L).
57             #pod
58             #pod =item *
59             #pod
60             #pod C, C, C (including the C and
61             #pod backtick operators), and L|perlfunc/glob> (including the C<<
62             #pod <> >> operator) now all work with and return Unicode characters
63             #pod instead of (UTF-8) octets (again only when C is used from
64             #pod the C
package).
65             #pod
66             #pod =back
67             #pod
68             #pod =head2 Lexical Scope
69             #pod
70             #pod The pragma is lexically-scoped, so you can do the following if you had
71             #pod some reason to:
72             #pod
73             #pod {
74             #pod use utf8::all;
75             #pod open my $out, '>', 'outfile';
76             #pod my $utf8_str = 'føø bār';
77             #pod print length $utf8_str, "\n"; # 7
78             #pod print $out $utf8_str; # out as utf8
79             #pod }
80             #pod open my $in, '<', 'outfile'; # in as raw
81             #pod my $text = do { local $/; <$in>};
82             #pod print length $text, "\n"; # 10, not 7!
83             #pod
84             #pod Instead of lexical scoping, you can also use C to turn
85             #pod off the effects.
86             #pod
87             #pod Note that the effect on C<@ARGV> and the C, C, and
88             #pod C file handles is always global and can not be undone!
89             #pod
90             #pod =head2 Enabling/Disabling Global Features
91             #pod
92             #pod As described above, the default behaviour of C is to
93             #pod convert C<@ARGV> and to open the C, C, and C
94             #pod file handles with UTF-8 encoding, and override the C and
95             #pod C functions and C operators when C is used
96             #pod from the C
package.
97             #pod
98             #pod If you want to disable these features even when C is used
99             #pod from the C
package, add the option C (or
100             #pod C) to the use line. E.g.:
101             #pod
102             #pod use utf8::all 'NO-GLOBAL';
103             #pod
104             #pod If on the other hand you want to enable these global effects even when
105             #pod C was used from another package than C
, use the
106             #pod option C on the use line:
107             #pod
108             #pod use utf8::all 'GLOBAL';
109             #pod
110             #pod =head2 UTF-8 Errors
111             #pod
112             #pod C will handle invalid code points (i.e., utf-8 that does
113             #pod not map to a valid unicode "character"), as a fatal error.
114             #pod
115             #pod For C, C, and C, one can change this
116             #pod behaviour by setting the attribute L.
117             #pod
118             #pod =head1 COMPATIBILITY
119             #pod
120             #pod The filesystems of Dos, Windows, and OS/2 do not (fully) support
121             #pod UTF-8. The C and C functions and C operators
122             #pod will therefore not be replaced on these systems.
123             #pod
124             #pod =head1 SEE ALSO
125             #pod
126             #pod =over 4
127             #pod
128             #pod =item *
129             #pod
130             #pod L for fully utf-8 aware File::Find functions.
131             #pod
132             #pod =item *
133             #pod
134             #pod L for fully utf-8 aware Cwd functions.
135             #pod
136             #pod =back
137             #pod
138             #pod =cut
139              
140 22     22   10542 use Import::Into;
  22         78442  
  22         1065  
141 22     22   6030 use parent qw(Encode charnames utf8 open warnings feature);
  22         4252  
  22         142  
142 22     22   616063 use Symbol qw(qualify_to_ref);
  22         30015  
  22         2042  
143 22     22   204 use Config;
  22         39  
  22         1975  
144              
145             # Holds the pointers to the original version of redefined functions
146             state %_orig_functions;
147              
148             # Current (i.e., this) package
149             my $current_package = __PACKAGE__;
150              
151             require Carp;
152             $Carp::Internal{$current_package}++; # To get warnings reported at correct caller level
153              
154             #pod =attr $utf8::all::UTF8_CHECK
155             #pod
156             #pod By default C marks decoding errors as fatal (default value
157             #pod for this setting is C). If you want, you can change this by
158             #pod setting C<$utf8::all::UTF8_CHECK>. The value C reports
159             #pod the encoding errors as warnings, and C will completely
160             #pod ignore them. Please see L for details. Note: C is
161             #pod I enforced.
162             #pod
163             #pod Important: Only controls the handling of decoding errors in C,
164             #pod C, and C.
165             #pod
166             #pod =cut
167              
168 22     22   165 use Encode ();
  22         59  
  22         446  
169 22     22   9761 use PerlIO::utf8_strict;
  22         12874  
  22         8375  
170              
171             our $UTF8_CHECK = Encode::FB_CROAK | Encode::LEAVE_SRC; # Die on encoding errors
172              
173             # UTF-8 Encoding object
174             my $_UTF8 = Encode::find_encoding('UTF-8');
175              
176             sub import {
177             # Enable features/pragmas in calling package
178 29     29   1934 my $target = caller;
179              
180             # Enable global effects be default only when imported from main package
181 29         177 my $no_global = $target ne 'main';
182              
183             # Override global?
184 29 100 66     179 if (defined $_[1] && $_[1] =~ /^(?:(NO-)?GLOBAL|LEXICAL-ONLY)$/i) {
185 2         15 $no_global = $_[1] !~ /^GLOBAL$/i;
186 2         6 splice(@_, 1, 1); # Remove option from import's arguments
187             }
188              
189 29         232 'utf8'->import::into($target);
190 29         8189 'open'->import::into($target, 'IO' => ':utf8_strict');
191              
192             # use open ':std' only works with some encodings.
193 29         7891 state $have_encoded_std = 0;
194 29 100 100     249 unless ($no_global || $have_encoded_std++) {
195 18         182 binmode STDERR, ':utf8_strict';
196 18         87 binmode STDOUT, ':utf8_strict';
197 18         84 binmode STDIN, ':utf8_strict';
198             }
199              
200 29         211 'charnames'->import::into($target, qw{:full :short});
201 29         601698 'warnings'->import::into($target, qw{FATAL utf8});
202 29 50       10057 'feature'->import::into($target, qw{unicode_strings}) if $^V >= v5.11.0;
203 29 50       13721 'feature'->import::into($target, qw{unicode_eval fc}) if $^V >= v5.16.0;
204              
205 29 100 66     7011 unless ($no_global || $^O =~ /MSWin32|cygwin|dos|os2/) {
206 22     22   170 no strict qw(refs); ## no critic (TestingAndDebugging::ProhibitNoStrict)
  22         37  
  22         1159  
207 22     22   118 no warnings qw(redefine);
  22         42  
  22         21219  
208              
209             # Replace readdir with utf8 aware version
210 25         70 *{$target . '::readdir'} = \&_utf8_readdir;
  25         152  
211              
212             # Replace readdir with utf8 aware version
213 25         61 *{$target . '::readlink'} = \&_utf8_readlink;
  25         87  
214              
215             # Replace glob with utf8 aware version
216 25         56 *{$target . '::glob'} = \&_utf8_glob;
  25         92  
217              
218             # Set compiler hint to encode/decode in the redefined functions
219 25         118 $^H{'utf8::all'} = 1;
220             }
221              
222             # Make @ARGV utf-8 when, unless perl was launched with the -CA
223             # flag as this already has @ARGV decoded automatically. -CA is
224             # active if the the fifth bit (32) of the ${^UNICODE} variable is
225             # set. (see perlrun on the -C command switch for details about
226             # ${^UNICODE})
227 29 100 66     180 unless ($no_global || (${^UNICODE} & 32)) {
228 25         51 state $have_encoded_argv = 0;
229 25 100       89 if (!$have_encoded_argv++) {
230 18 50       91 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
231 18 50       114 $_ = ($_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_) for @ARGV;
232             }
233             }
234              
235 29         13534 return;
236             }
237              
238             sub unimport { ## no critic (Subroutines::ProhibitBuiltinHomonyms)
239             # Disable features/pragmas in calling package
240             # Note: Does NOT undo the effect on @ARGV,
241             # nor on the STDIN, STDOUT, and STDERR file handles!
242             # These effects are always "global".
243              
244 1     1   18 my $target = caller;
245 1         10 'utf8'->unimport::out_of($target);
246 1         298 'open'->import::into($target, qw{IO :bytes});
247              
248 1 50       367 unless ($^O =~ /MSWin32|cygwin|dos|os2/) {
249 1         5 $^H{'utf8::all'} = 0; # Reset compiler hint
250             }
251              
252 1         2960 return;
253             }
254              
255             sub _utf8_readdir(*) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
256 6     6   432235 my $pre_handle = shift;
257 6         78 my $hints = (caller 0)[10];
258 6 100       31 my $handle = ref($pre_handle) ? $pre_handle : qualify_to_ref($pre_handle, caller);
259 6 100       49 if (not $hints->{'utf8::all'}) {
260 1         22 return CORE::readdir($handle);
261             } else {
262 5 50       18 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
263 5 100       12 if (wantarray) {
264 4 50       136 return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::readdir($handle);
  16         125  
265             } else {
266 1         8 my $r = CORE::readdir($handle);
267 1 50       10 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
268             }
269             }
270             }
271              
272             sub _utf8_readlink(_) { ## no critic (Subroutines::ProhibitSubroutinePrototypes)
273 4     4   236439 my $arg = shift;
274 4         31 my $hints = (caller 0)[10];
275 4 50       195 if (not $hints->{'utf8::all'}) {
276 0         0 return CORE::readlink($arg);
277             } else {
278 4 100       15 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
279 4 50       59 $arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg;
280 3         138 my $r = CORE::readlink($arg);
281 3 100       38 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
282             }
283             }
284              
285             sub _utf8_glob {
286 9     9   325492 my $arg = $_[0]; # Making this a lexical somehow is important!
287 9         51 my $hints = (caller 0)[10];
288 9 50       190 if (not $hints->{'utf8::all'}) {
289 0         0 return CORE::glob($arg);
290             } else {
291 9 100       23 $UTF8_CHECK |= Encode::LEAVE_SRC if $UTF8_CHECK; # Enforce LEAVE_SRC
292 9 50       70 $arg = $arg ? $_UTF8->encode($arg, $UTF8_CHECK) : $arg;
293 8 100       79 if (wantarray) {
294 3 50       140 return map { $_ ? $_UTF8->decode($_, $UTF8_CHECK) : $_ } CORE::glob($arg);
  6         38  
295             } else {
296 5         142 my $r = CORE::glob($arg);
297 5 100       34 return $r ? $_UTF8->decode($r, $UTF8_CHECK) : $r;
298             }
299             }
300             }
301              
302             #pod =head1 INTERACTION WITH AUTODIE
303             #pod
304             #pod If you use L, which is a great idea, you need to use at least
305             #pod version B<2.12>, released on L
306             #pod 2012|https://metacpan.org/source/PJF/autodie-2.12/Changes#L3>.
307             #pod Otherwise, autodie obliterates the IO layers set by the L
308             #pod pragma. See L
309             #pod #54777|https://rt.cpan.org/Ticket/Display.html?id=54777> and L
310             #pod #7|https://github.com/doherty/utf8-all/issues/7>.
311             #pod
312             #pod =cut
313              
314             1;
315              
316             __END__