line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package String::UnicodeUTF8; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
54972
|
use strict; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
75
|
|
4
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
66
|
|
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
1744
|
use String::Unquotemeta (); |
|
2
|
|
|
|
|
476
|
|
|
2
|
|
|
|
|
39
|
|
7
|
2
|
|
|
2
|
|
1767
|
use Module::Want 0.6 (); |
|
2
|
|
|
|
|
2252
|
|
|
2
|
|
|
|
|
109
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
$String::UnicodeUTF8::VERSION = '0.21'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
sub import { |
12
|
22
|
100
|
|
22
|
|
23112
|
return 1 if @_ == 1; # no-op import() |
13
|
|
|
|
|
|
|
|
14
|
20
|
|
|
|
|
43
|
my $caller = caller(); |
15
|
|
|
|
|
|
|
|
16
|
2
|
|
|
2
|
|
12
|
no strict 'refs'; ## no critic |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
3235
|
|
17
|
20
|
|
|
|
|
65
|
for ( @_[ 1 .. $#_ ] ) { |
18
|
20
|
100
|
100
|
|
|
107
|
next if $_ eq 'import' || $_ eq '_pre_581_is_utf8_hack'; |
19
|
18
|
100
|
|
|
|
21
|
*{ $caller . '::' . $_ } = \&{$_} if defined &{$_}; |
|
17
|
|
|
|
|
103
|
|
|
17
|
|
|
|
|
36
|
|
|
18
|
|
|
|
|
84
|
|
20
|
|
|
|
|
|
|
} |
21
|
|
|
|
|
|
|
} |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# is_utf8() is confusing, it really means “is this a Unicode string”, not “is this a utf-8 bytes string”) |
24
|
|
|
|
|
|
|
*is_unicode = $] >= 5.008_001 ? \&utf8::is_utf8 : \&_pre_581_is_utf8_hack; # or just 'use 5.8.1;' and drop this ? |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my $pre_573_is_utf8_hack = $] >= 5.007_003 ? undef : {}; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub char_count { |
29
|
7
|
|
|
7
|
1
|
3981
|
return CORE::length( get_unicode( $_[0] ) ); |
30
|
|
|
|
|
|
|
} |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
sub bytes_size { |
33
|
7
|
|
|
7
|
1
|
72
|
return CORE::length( get_utf8( $_[0] ) ); |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub get_unicode { |
37
|
111
|
|
|
111
|
1
|
324
|
my ($string) = @_; |
38
|
|
|
|
|
|
|
|
39
|
111
|
100
|
|
|
|
341
|
if ( !is_unicode($string) ) { |
40
|
59
|
50
|
|
|
|
156
|
if ( defined &utf8::decode ) { |
41
|
59
|
|
|
|
|
139
|
utf8::decode($string); |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
else { # decode() a hacky way: |
44
|
0
|
|
|
|
|
0
|
$string = pack( "U*", unpack( "C0U*", $string ) ); # 5.6+ at least |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# if decode() did not fully do it (e.g. it only contained ascii characters and utf8::decode() was called) |
48
|
59
|
100
|
|
|
|
174
|
if ( !is_unicode($string) ) { |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# force strings without unicode characters to be unicode strings |
51
|
33
|
50
|
|
|
|
73
|
if ( defined &utf8::upgrade ) { |
52
|
33
|
|
|
|
|
65
|
utf8::upgrade($string); |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
else { # upgrade() the hacky way: (TODO: how?) |
55
|
0
|
|
|
|
|
0
|
require Carp; |
56
|
0
|
|
|
|
|
0
|
Carp::carp("pack() did not result in unicode string and there is no way to emulate utf8::upgrade"); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
111
|
50
|
|
|
|
234
|
$pre_573_is_utf8_hack->{$string} = '' if ref $pre_573_is_utf8_hack; |
62
|
111
|
|
|
|
|
534
|
return $string; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub get_utf8 { |
66
|
293
|
|
|
293
|
1
|
470
|
my ($string) = @_; |
67
|
293
|
100
|
|
|
|
740
|
if ( is_unicode($string) ) { |
68
|
63
|
50
|
|
|
|
140
|
if ( defined &utf8::encode ) { |
69
|
63
|
|
|
|
|
117
|
utf8::encode($string); |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
else { # encode() the hacky way: |
72
|
0
|
|
|
|
|
0
|
$string = pack( "C0U*", unpack( "U*", $string ) ); # 5.6+ at least |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
293
|
50
|
|
|
|
570
|
delete $pre_573_is_utf8_hack->{$string} if ref $pre_573_is_utf8_hack; |
77
|
293
|
|
|
|
|
1314
|
return $string; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# ? want to serialize these too ? |
81
|
|
|
|
|
|
|
# my %esc = ( "\n" => '\n', "\t" => '\t', "\r" => '\r', "\\" => '\\\\', "\a" => '\a', "\b" => '\b', "\f" => '\f' ); |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub escape_utf8_or_unicode { |
84
|
58
|
|
|
58
|
1
|
700
|
my ( $s, $quotemeta ) = @_; # undocumented second flag for internal use |
85
|
|
|
|
|
|
|
|
86
|
58
|
|
|
|
|
136
|
my $is_uni = is_unicode($s); # otherwise you'll get \xae\x{301} instead of \x{ae}\x{301} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# ick: patches uber welcome |
89
|
58
|
50
|
66
|
|
|
276
|
if ( $is_uni && $] < 5.008_001 && Module::Want::have_mod('Data::Dumper') ) { |
|
|
|
33
|
|
|
|
|
90
|
0
|
|
|
|
|
0
|
local $Data::Dumper::Terse = 1; |
91
|
0
|
|
|
|
|
0
|
$s = Data::Dumper::Dumper($s); |
92
|
0
|
|
|
|
|
0
|
$s =~ s/\A(["|'])//; |
93
|
0
|
|
|
|
|
0
|
my $quote = $1; |
94
|
0
|
|
|
|
|
0
|
$s =~ s/$quote\s*\z//; |
95
|
0
|
0
|
|
|
|
0
|
$s =~ s/'/\\'/g unless $quote eq "'"; |
96
|
0
|
|
|
|
|
0
|
return get_utf8($s); |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
58
|
|
|
|
|
268
|
$s =~ s{([^!#&()*+,\-.\/0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz{|}~ ])} |
100
|
|
|
|
|
|
|
{ |
101
|
104
|
|
|
|
|
191
|
my $chr = "$1"; |
102
|
104
|
|
|
|
|
131
|
my $n = ord($chr); |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# if ( exists $esc{$chr} ) { # more universal way ??? |
105
|
|
|
|
|
|
|
# $esc{$chr}; |
106
|
|
|
|
|
|
|
# } |
107
|
|
|
|
|
|
|
# els |
108
|
104
|
100
|
100
|
|
|
441
|
if ( $n < 32 || $n > 126 ) { |
|
|
50
|
|
|
|
|
|
109
|
83
|
100
|
66
|
|
|
641
|
sprintf( ( !$is_uni && $n < 255 ? '\x%02x' : '\x{%04x}' ), $n ); |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
elsif ($quotemeta) { |
112
|
21
|
|
|
|
|
85
|
quotemeta($chr); |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
else { |
115
|
0
|
|
|
|
|
0
|
$chr |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
}ge; |
118
|
|
|
|
|
|
|
|
119
|
58
|
|
|
|
|
132
|
return get_utf8($s); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
sub escape_utf8 { |
123
|
9
|
|
|
9
|
1
|
79
|
my ($string) = @_; |
124
|
9
|
|
|
|
|
25
|
$string = get_utf8($string); |
125
|
9
|
|
|
|
|
38
|
return escape_utf8_or_unicode($string); |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub escape_unicode { |
129
|
19
|
|
|
19
|
1
|
13352
|
my ($string) = @_; |
130
|
19
|
|
|
|
|
48
|
$string = get_unicode($string); |
131
|
19
|
|
|
|
|
43
|
return escape_utf8_or_unicode($string); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sub unescape_utf8_or_unicode { |
135
|
139
|
|
|
139
|
1
|
216
|
my ( $string, $unquotemeta ) = @_; # undocumented second flag for internal use |
136
|
139
|
100
|
|
|
|
462
|
my $is_uni = $string =~ m/\\x\{[0-9a-fA-f]+\}/ ? 1 : 0; |
137
|
|
|
|
|
|
|
|
138
|
139
|
|
|
|
|
462
|
$string =~ s/((?:\\x(?:[0-9a-fA-f]{2}|\{[0-9a-fA-f]+\}))+)/eval qq{"$1"}/eg; ## no critic |
|
58
|
|
|
|
|
4101
|
|
139
|
139
|
100
|
|
|
|
655
|
$string = String::Unquotemeta::unquotemeta($string) if $unquotemeta; |
140
|
139
|
100
|
|
|
|
1575
|
return get_unicode($string) if $is_uni; |
141
|
112
|
|
|
|
|
212
|
return get_utf8($string); |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub unescape_utf8 { |
145
|
2
|
|
|
2
|
1
|
4
|
my ($string) = @_; |
146
|
2
|
|
|
|
|
5
|
$string = unescape_utf8_or_unicode($string); |
147
|
2
|
|
|
|
|
5
|
return get_utf8($string); |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub unescape_unicode { |
151
|
2
|
|
|
2
|
1
|
5
|
my ($string) = @_; |
152
|
2
|
|
|
|
|
6
|
$string = unescape_utf8_or_unicode($string); |
153
|
2
|
|
|
|
|
5
|
return get_unicode($string); |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub quotemeta_bytes { # I ♥ perl\'s coolness |
157
|
7
|
|
|
7
|
1
|
92
|
my $utf8_quoted = quotemeta_utf8( $_[0] ); |
158
|
7
|
|
|
|
|
23
|
return unescape_utf8_or_unicode($utf8_quoted); |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
sub quotemeta_utf8 { # I \xe2\x99\xa5 perl\'s coolness |
162
|
14
|
|
|
14
|
1
|
92
|
my ($string) = @_; |
163
|
14
|
|
|
|
|
31
|
$string = get_utf8($string); |
164
|
14
|
|
|
|
|
35
|
return escape_utf8_or_unicode( $string, 1 ); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub quotemeta_unicode { # I \x{2665} perl\'s coolness |
168
|
7
|
|
|
7
|
1
|
77
|
my ($string) = @_; |
169
|
7
|
|
|
|
|
21
|
$string = get_unicode($string); |
170
|
7
|
|
|
|
|
19
|
return escape_utf8_or_unicode( $string, 1 ); |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub unquotemeta_bytes { |
174
|
42
|
|
|
42
|
1
|
7174
|
goto &unquotemeta_utf8; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub unquotemeta_utf8 { |
178
|
84
|
|
|
84
|
1
|
122
|
my ($escaped_string) = @_; |
179
|
84
|
|
|
|
|
172
|
$escaped_string = unescape_utf8_or_unicode( $escaped_string, 1 ); |
180
|
84
|
|
|
|
|
147
|
return get_utf8($escaped_string); |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub unquotemeta_unicode { |
184
|
42
|
|
|
42
|
1
|
78
|
my ($escaped_string) = @_; |
185
|
42
|
|
|
|
|
85
|
$escaped_string = unescape_utf8_or_unicode( $escaped_string, 1 ); |
186
|
42
|
|
|
|
|
93
|
return get_unicode($escaped_string); |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub _pre_581_is_utf8_hack { |
190
|
7
|
|
|
7
|
|
4929
|
my ($string) = @_; |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# strings with unicode characters that are unicode strings |
193
|
7
|
|
|
|
|
52
|
require bytes; |
194
|
7
|
100
|
|
|
|
30
|
return 1 if bytes::length($string) != CORE::length($string); |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# strings without unicode characters that are unicode strings |
197
|
5
|
50
|
|
|
|
2028
|
if ( Module::Want::have_mod('Encode') ) { |
198
|
5
|
100
|
|
|
|
252638
|
return 1 if Encode::is_utf8($string); |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
else { |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# So we have a string without unicode characters and no utf8::is_utf8() or Encode::is_utf8(), time to get hacky! |
203
|
0
|
0
|
0
|
|
|
0
|
if ( Module::Want::have_mod('B::Flags') && defined &B::svref_2object ) { # B::Flags brings in B *but* B::svref_2object can be compiled away in some specific circumstances |
204
|
0
|
0
|
|
|
|
0
|
return 1 if B::svref_2object( \$string )->flagspv() =~ m/UTF.?8/i; # works on 5.6! |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
else { |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# oi, still nothing is available at this point so time to get reeeeeaaaallly hacky! (patches very very welcome, this is a terrible last ditch effort) |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# not fool proof (same text–different-string/variable or [down|up]grade() outside of get_[utf8|unicode]) |
211
|
0
|
0
|
|
|
|
0
|
return 1 if exists $pre_573_is_utf8_hack->{$string}; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
4
|
|
|
|
|
43
|
return; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
1; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
__END__ |