File Coverage

blib/lib/IsUTF8.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch	10	14	71.4
condition			n/a
subroutine	5	5	100.0
pod	0	1	0.0
total	37	44	84.0

line	stmt	bran	sub	pod	time	code
1						package IsUTF8;
2
3						# $Id: IsUTF8.pm 2011 2006-06-21 08:23:10Z heiko $
4						# $URL: https://svn.schlittermann.de/pub/perl-unicode-detect/trunk/lib/IsUTF8.pm $
5						# © 2006
6						#
7
8						=head1 NAME
9
10						IsUTF8 - detects if UTF8 characters are present
11
12						=cut
13
14	1		1		28448	use strict;
	1				3
	1				43
15	1		1		6	use warnings;
	1				1
	1				45
16
17						our $VERSION = '0.2';
18
19	1		1		6	use Exporter;
	1				6
	1				646
20						our @ISA = qw/Exporter/;
21						our @EXPORT_OK = qw/&isUTF8/;
22
23						=head1 SYNOPSIS
24
25						use IsUTF8;
26						$result = IsUTF8::isUTF8;
27						$result = IsUTF8::isUTF8($line);
28
29						use IsUTF8 qw(isUTF8);
30						$result = isUTF8;
31						$result = isUTF8($line);
32
33						use IsUTF8 qw(isUTF8 debug);
34						$result = isUTF8;
35						$result = isUTF8($line);
36
37						if (not defined $result) {
38						print "Contains some characters with 8th bit set!";
39						}
40						if ($result == 0) {
41						print "Plain ASCII (0..127)";
42						}
43						if ($result) {
44						print "Contains UTF8";
45						}
46
47						=cut
48
49						my $debug = 0;
50
51						sub import(@) {
52	1		1		18	$debug = grep /^debug$/, @_;
53	1				7	@_ = grep !/^debug$/, @_;
54	1				170	goto &Exporter::import;
55						}
56
57						sub isUTF8(;$) {
58	8	100	8	0	15518	my $data = @_ ? $_[0] : $_;
59
60	8	50			26	$data =~ s/\s*$// if $debug;
61	8	50			18	print STDERR "test: $data\n" if $debug > 2;
62
63	8	100			85	if ($data =~ /(
64						[\xc0-\xdf][\x80-\xbf]
65						\| [\xe0-\xef][\x80-\xbf]{2}
66						\| [\xf0-\xf7][\x80-\xbf]{3} ) /x
67						) {
68	4	50			10	if ($debug) {
69	0				0	print STDERR "$data\n"
70						. " " x index($data, $1)
71						. "^\n";
72						}
73
74	4				12	return 1;
75
76						}
77
78	4	100			17	if ($data =~ /([\x80-\xff])/) {
79	2	50			8	if ($debug) {
80	0				0	print STDERR "$data\n",
81						. " " x index($data, $1)
82						. "^\n";
83						}
84
85	2				8	return undef;
86						}
87
88	2				5	return 0;
89						}
90
91						=head1 DESCRIPTION
92
93						This tests the given line and returns true if there is at least one
94						UTF8 character sequence. (Actually the tests returns after the first
95						sequence found.) C is returned if there is some other character
96						with the 8th bit set. C<0> is returned if there are only characters
97						from C<0x00> to C<0x7f>.
98
99						=head1 BACKGROUND
100
101						UTF8-Encoding looks like this:
102
103						1111.0x: 1111.0000-1111.0111 0xF0 - 0xF7, followed by 3 bytes
104						1110.xx: 1110.0000-1110.1111 0xE0 - 0xEF, followed by 2 bytes
105						110x.xx: 1100.0000-1101.1111 0xC0 - 0xDF, followed by 1 byte
106						10xx.xx: 1000.0000-1011.1111 0x80 - 0xBF (following byte as above)
107
108						=head1 SEE ALSO
109
110						L and L
111
112						=head1 BUGS
113
114						First release. Please do not rely on a stable API yet. If you're interested
115						in stabilizing, please tell me.
116
117						Probably. Not tested a lot!
118
119						=head1 AUTHOR
120
121						Heiko Schlittermann
122
123						=cut
124
125
126						1;