File Coverage

blib/lib/Test/utf8.pm

Criterion	Covered	Total	%
statement	91	91	100.0
branch	12	12	100.0
condition	16	16	100.0
subroutine	21	21	100.0
pod	6	8	75.0
total	146	148	98.6

line	stmt	bran	cond	sub	pod	time	code
1							package Test::utf8;
2
3	7			7		451339	use 5.007003;
	7					53
4
5	7			7		39	use strict;
	7					13
	7					135
6	7			7		32	use warnings;
	7					16
	7					240
7
8	7			7		62	use base qw(Exporter);
	7					34
	7					1019
9
10	7			7		4159	use Encode;
	7					73779
	7					508
11	7			7		3880	use charnames ':full';
	7					221549
	7					53
12
13							our $VERSION = "1.02";
14
15							our @EXPORT = qw(
16							is_valid_string is_dodgy_utf8 is_sane_utf8
17							is_within_ascii is_within_latin1 is_within_latin_1
18							is_flagged_utf8 isnt_flagged_utf8
19							);
20
21							# A Regexp string to match valid UTF8 bytes
22							# this info comes from page 78 of "The Unicode Standard 4.0"
23							# published by the Unicode Consortium
24							our $valid_utf8_regexp = <<'REGEX' ;
25							[\x{00}-\x{7f}]
26							\| [\x{c2}-\x{df}][\x{80}-\x{bf}]
27							\| \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
28							\| [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
29							\| \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
30							\| [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
31							\| \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
32							\| [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
33							\| \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
34							REGEX
35
36							=head1 NAME
37
38							Test::utf8 - handy utf8 tests
39
40							=head1 SYNOPSIS
41
42							# check the string is good
43							is_valid_string($string); # check the string is valid
44							is_sane_utf8($string); # check not double encoded
45
46							# check the string has certain attributes
47							is_flagged_utf8($string1); # has utf8 flag set
48							is_within_ascii($string2); # only has ascii chars in it
49							isnt_within_ascii($string3); # has chars outside the ascii range
50							is_within_latin_1($string4); # only has latin-1 chars in it
51							isnt_within_ascii($string5); # has chars outside the latin-1 range
52
53							=head1 DESCRIPTION
54
55							This module is a collection of tests useful for dealing with utf8 strings in
56							Perl.
57
58							This module has two types of tests: The validity tests check if a string is
59							valid and not corrupt, whereas the characteristics tests will check that string
60							has a given set of characteristics.
61
62							=head2 Validity Tests
63
64							=over
65
66							=item is_valid_string($string, $testname)
67
68							Checks if the string is "valid", i.e. this passes and returns true unless
69							the internal utf8 flag hasn't been set on scalar that isn't made up of a valid
70							utf-8 byte sequence.
71
72							This should I happen and, in theory, this test should always pass. Unless
73							you (or a module you use) goes monkeying around inside a scalar using Encode's
74							private functions or XS code you shouldn't ever end up in a situation where
75							you've got a corrupt scalar. But if you do, and you do, then this function
76							should help you detect the problem.
77
78							To be clear, here's an example of the error case this can detect:
79
80							my $mark = "Mark";
81							my $leon = "L\x{e9}on";
82							is_valid_string($mark); # passes, not utf-8
83							is_valid_string($leon); # passes, not utf-8
84
85							my $iloveny = "I \x{2665} NY";
86							is_valid_string($iloveny); # passes, proper utf-8
87
88							my $acme = "L\x{c3}\x{a9}on";
89							Encode::_utf8_on($acme); # (please don't do things like this)
90							is_valid_string($acme); # passes, proper utf-8 byte sequence upgraded
91
92							Encode::_utf8_on($leon); # (this is why you don't do things like this)
93							is_valid_string($leon); # fails! the byte \x{e9} isn't valid utf-8
94
95							=cut
96
97							sub is_valid_string($;$)
98							{
99	12			12	1	28020	my $string = shift;
100	12		100			54	my $name = shift \|\| "valid string test";
101
102							# check we're a utf8 string - if not, we pass.
103	12	100				60	unless (Encode::is_utf8($string))
104	5					14	{ return _pass($name) }
105
106							# work out at what byte (if any) we have an invalid byte sequence
107							# and return the correct test result
108	7					19	my $pos = _invalid_sequence_at_byte($string);
109	7	100				19	if (_ok(!defined($pos), $name)) { return 1 }
	4					973
110	3					3428	_diag("malformed byte sequence starting at byte $pos");
111	3					8	return;
112							}
113
114							sub _invalid_sequence_at_byte($)
115							{
116	7			7		27	my $string = shift;
117
118							# examine the bytes that make up the string (not the chars)
119							# by turning off the utf8 flag (no, use bytes doesn't
120							# work, we're dealing with a regexp)
121	7					23	Encode::_utf8_off($string); ## no critic (ProtectPrivateSubs)
122
123							# work out the index of the first non matching byte
124	7					197	my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
125
126							# if we matched all the string return the empty list
127	7		100			31	my $pos = pos $string \|\| 0;
128	7	100				24	return if $pos == length($string);
129
130							# otherwise return the position we found
131	3					7	return $pos
132							}
133
134							=item is_sane_utf8($string, $name)
135
136							This test fails if the string contains something that looks like it
137							might be dodgy utf8, i.e. containing something that looks like the
138							multi-byte sequence for a latin-1 character but perl hasn't been
139							instructed to treat as such. Strings that are not utf8 always
140							automatically pass.
141
142							Some examples may help:
143
144							# This will pass as it's a normal latin-1 string
145							is_sane_utf8("Hello L\x{e9}eon");
146
147							# this will fail because the \x{c3}\x{a9} looks like the
148							# utf8 byte sequence for e-acute
149							my $string = "Hello L\x{c3}\x{a9}on";
150							is_sane_utf8($string);
151
152							# this will pass because the utf8 is correctly interpreted as utf8
153							Encode::_utf8_on($string)
154							is_sane_utf8($string);
155
156							Obviously this isn't a hundred percent reliable. The edge case where
157							this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
158							LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
159							WITH TILDE") followed by one of the latin-1 punctuation symbols.
160
161							# a capital letter A with tilde surrounded by smart quotes
162							# this will fail because it'll see the "\x{c2}\x{94}" and think
163							# it's actually the utf8 sequence for the end smart quote
164							is_sane_utf8("\x{93}\x{c2}\x{94}");
165
166							However, since this hardly comes up this test is reasonably reliable
167							in most cases. Still, care should be applied in cases where dynamic
168							data is placed next to latin-1 punctuation to avoid false negatives.
169
170							There exists two situations to cause this test to fail; The string
171							contains utf8 byte sequences and the string hasn't been flagged as
172							utf8 (this normally means that you got it from an external source like
173							a C library; When Perl needs to store a string internally as utf8 it
174							does it's own encoding and flagging transparently) or a utf8 flagged
175							string contains byte sequences that when translated to characters
176							themselves look like a utf8 byte sequence. The test diagnostics tells
177							you which is the case.
178
179							=cut
180
181							# build my regular expression out of the latin-1 bytes
182							# NOTE: This won't work if our locale is nonstandard will it?
183							my $re_bit = join "\|", map { Encode::encode("utf8",chr($_)) } (127..255);
184
185							sub is_sane_utf8($;$)
186							{
187	6			6	1	2365	my $string = shift;
188	6		100			40	my $name = shift \|\| "sane utf8";
189
190							# regexp in scalar context with 'g', meaning this loop will run for
191							# each match. Should only have to run it once, but will redo if
192							# the failing case turns out to be allowed in %allowed.
193	6					422	while ($string =~ /($re_bit)/o)
194							{
195							# work out what the double encoded string was
196	2					6	my $bytes = $1;
197
198	2					19	my $index = $+[0] - length($bytes);
199	2					10	my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
	4					20
200
201							# what character does that represent?
202	2					17	my $char = Encode::decode("utf8",$bytes);
203	2					112	my $ord = ord($char);
204	2					7	my $hex = sprintf '%00x', $ord;
205	2					8	$char = charnames::viacode($ord);
206
207							# print out diagnostic messages
208	2					12070	_fail($name);
209	2					2123	_diag(qq{Found dodgy chars "$codes" at char $index\n});
210	2	100				18	if (Encode::is_utf8($string))
211	1					3	{ _diag("Chars in utf8 string look like utf8 byte sequence.") }
212							else
213	1					3	{ _diag("String not flagged as utf8...was it meant to be?\n") }
214	2					13	_diag("Probably originally a $char char - codepoint $ord (dec),"
215							." $hex (hex)\n");
216
217	2					7	return 0;
218							}
219
220							# got this far, must have passed.
221	4					21	_ok(1,$name);
222	4					1040	return 1;
223							}
224
225							# historic name of method; deprecated
226	5			5	0	11506	sub is_dodgy_utf8 { goto &is_sane_utf8 }
227
228							=back
229
230							=head2 String Characteristic Tests
231
232							These routines allow you to check the range of characters in a string.
233							Note that these routines are blind to the actual encoding perl
234							internally uses to store the characters, they just check if the
235							string contains only characters that can be represented in the named
236							encoding:
237
238							=over
239
240							=item is_within_ascii
241
242							Tests that a string only contains characters that are in the ASCII
243							character set.
244
245							=cut
246
247							sub is_within_ascii($;$)
248							{
249	3			3	1	7019	my $string = shift;
250	3		100			14	my $name = shift \|\| "within ascii";
251
252							# look for anything that isn't ascii or pass
253	3	100				23	$string =~ /([^\x{00}-\x{7f}])/ or return _pass($name);
254
255							# explain why we failed
256	1					4	my $dec = ord($1);
257	1					5	my $hex = sprintf '%02x', $dec;
258
259	1					11	_fail($name);
260	1					1065	_diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
261
262	1					3	return 0;
263							}
264
265							=item is_within_latin_1
266
267							Tests that a string only contains characters that are in latin-1.
268
269							=cut
270
271							sub is_within_latin_1($;$)
272							{
273	4			4	1	2640	my $string = shift;
274	4		100			27	my $name = shift \|\| "within latin-1";
275
276							# look for anything that isn't ascii or pass
277	4	100				20	$string =~ /([^\x{00}-\x{ff}])/ or return _pass($name);
278
279							# explain why we failed
280	1					4	my $dec = ord($1);
281	1					12	my $hex = sprintf '%x', $dec;
282
283	1					4	_fail($name);
284	1					1062	_diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
285
286	1					3	return 0;
287							}
288
289	3			3	0	7041	sub is_within_latin1 { goto &is_within_latin_1 }
290
291							=back
292
293							Simply check if a scalar is or isn't flagged as utf8 by perl's
294							internals:
295
296							=over
297
298							=item is_flagged_utf8($string, $name)
299
300							Passes if the string is flagged by perl's internals as utf8, fails if
301							it's not.
302
303							=cut
304
305							sub is_flagged_utf8
306							{
307	3			3	1	6811	my $string = shift;
308	3		100			13	my $name = shift \|\| "flagged as utf8";
309	3					24	return _ok(Encode::is_utf8($string),$name);
310							}
311
312							=item isnt_flagged_utf8($string,$name)
313
314							The opposite of C, passes if and only if the string
315							isn't flagged as utf8 by perl's internals.
316
317							Note: you can refer to this function as C if you
318							really want to.
319
320							=cut
321
322							sub isnt_flagged_utf8($;$)
323							{
324	3			3	1	8516	my $string = shift;
325	3		100			13	my $name = shift \|\| "not flagged as utf8";
326	3					11	return _ok(!Encode::is_utf8($string), $name);
327							}
328
329							sub isn::t_flagged_utf8($;$)
330							{
331	3			3		8427	my $string = shift;
332	3		100			20	my $name = shift \|\| "not flagged as utf8";
333	3					12	return _ok(!Encode::is_utf8($string), $name);
334							}
335
336							=back
337
338							=head1 AUTHOR
339
340							Written by Mark Fowler B
341
342							=head1 COPYRIGHT
343
344							Copyright Mark Fowler 2004,2012. All rights reserved.
345
346							This program is free software; you can redistribute it
347							and/or modify it under the same terms as Perl itself.
348
349							=head1 BUGS
350
351							None known. Please report any to me via the CPAN RT system. See
352							http://rt.cpan.org/ for more details.
353
354							=head1 SEE ALSO
355
356							L for testing for double encoded HTML
357							entities.
358
359							=cut
360
361							##########
362
363							# shortcuts for Test::Builder.
364
365	7			7		9059	use Test::Builder;
	7					19
	7					1316
366							my $tester = Test::Builder->new();
367
368							sub _ok
369							{
370	34			34		66	local $Test::Builder::Level = $Test::Builder::Level + 1;
371	34					142	return $tester->ok(@_)
372							}
373							sub _diag
374							{
375	11			11		26	local $Test::Builder::Level = $Test::Builder::Level + 1;
376	11					38	$tester->diag(@_);
377	11					2839	return;
378							}
379
380							sub _fail
381							{
382	4			4		13	local $Test::Builder::Level = $Test::Builder::Level + 1;
383	4					12	return _ok(0,@_)
384							}
385
386							sub _pass
387							{
388	10			10		21	local $Test::Builder::Level = $Test::Builder::Level + 1;
389	10					28	return _ok(1,@_)
390							}
391
392
393							1;