File Coverage

blib/lib/Search/Tools/UTF8.pm

Criterion	Covered	Total	%
statement	68	86	79.0
branch	21	36	58.3
condition	12	16	75.0
subroutine	15	15	100.0
pod	7	7	100.0
total	123	160	76.8

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::UTF8;
2	38			38		252401	use strict;
	38					97
	38					902
3	38			38		152	use warnings;
	38					56
	38					733
4	38			38		140	use Carp;
	38					58
	38					1624
5	38			38		3677	use Search::Tools; # XS stuff
	38					70
	38					820
6	38			38		13064	use Encode;
	38					246328
	38					2283
7	38			38		14968	use charnames ':full';
	38					939019
	38					213
8	38			38		10125	use Data::Dump qw( dump );
	38					39814
	38					1910
9	38			38		198	use base qw( Exporter );
	38					65
	38					47624
10							our @EXPORT = qw(
11							to_utf8
12							is_valid_utf8
13							is_flagged_utf8
14							is_perl_utf8_string
15							is_ascii
16							is_latin1
17							is_sane_utf8
18							find_bad_utf8
19							find_bad_ascii
20							find_bad_latin1
21							find_bad_latin1_report
22							byte_length
23							looks_like_cp1252
24							fix_cp1252_codepoints_in_utf8
25							debug_bytes
26							);
27
28							our $Debug = ( $ENV{PERL_DEBUG} && $ENV{PERL_DEBUG} > 2 ) ? 1 : 0;
29
30							our $VERSION = '1.006';
31
32							sub to_utf8 {
33	604			604	1	2158	my $str = shift;
34	604	50				1068	Carp::cluck("\$str is undefined") unless defined $str;
35
36	604		100			1163	my $charset = shift \|\| 'iso-8859-1';
37
38							# checks first
39	604	100				965	if ( is_flagged_utf8($str) ) {
40	393	50				617	$Debug and carp "string '$str' is flagged utf8 already";
41	393					1159	return $str;
42							}
43	211	100				1153	if ( is_ascii($str) ) {
44	193					660	Encode::_utf8_on($str);
45	193	50				398	$Debug and carp "string '$str' is ascii; utf8 flag turned on";
46	193					1196	return $str;
47							}
48	18	100				53	if ( is_valid_utf8($str) ) {
49
50							# we got here only because the flag was off and it wasn't ascii.
51							# however, is_valid_utf8() claims that it is valid internal UTF8,
52							# so just turn the flag on.
53	6					19	Encode::_utf8_on($str);
54	6	50				14	$Debug and carp "string '$str' is valid utf8; utf8 flag turned on";
55	6					18	return $str;
56							}
57
58							$Debug
59	12	50				33	and carp "converting $str from $charset -> utf8";
60	12					73	my $c = Encode::decode( $charset, $str );
61	12	50				1292	$Debug and carp "converted $c";
62
63	12	50				43	unless ( is_sane_utf8( $c, 1 ) ) {
64	0					0	carp "not sane: $c";
65							}
66
67	12					153	return $c;
68							}
69
70							sub is_flagged_utf8 {
71	611			611	1	1849	return Encode::is_utf8( $_[0] );
72							}
73
74							my $re_bit = join "\|",
75							map { Encode::encode( "utf8", chr($_) ) } ( 127 .. 255 );
76
77							#binmode STDERR, ":utf8";
78							#print STDERR $re_bit;
79
80							sub is_sane_utf8 {
81	15			15	1	46	my $string = shift;
82	15		50			52	my $warnings = shift \|\| $Debug \|\| 0;
83
84	15					25	my $is_insane = 0;
85	15					3154	while ( $string =~ /($re_bit)/go ) {
86
87							# work out what the double encoded string was
88	0					0	my $bytes = $1;
89
90	0					0	my $index = $+[0] - length($bytes);
91	0					0	my $codes = join '', map { sprintf '<%00x>', ord($_) } split //,
	0					0
92							$bytes;
93
94							# what character does that represent?
95	0					0	my $char = Encode::decode( "utf8", $bytes );
96	0					0	my $ord = ord($char);
97	0					0	my $hex = sprintf '%00x', $ord;
98	0					0	$char = charnames::viacode($ord);
99
100							# print out diagnostic messages
101	0	0				0	if ($warnings) {
102
103	0					0	warn(qq{Found dodgy chars "$codes" at char $index\n});
104	0	0				0	if ( Encode::is_utf8($string) ) {
105	0					0	warn("Chars in utf8 string look like utf8 byte sequence.");
106							}
107							else {
108	0					0	warn("String not flagged as utf8...was it meant to be?\n");
109							}
110	0					0	warn(
111							"Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n"
112							);
113
114							}
115	0					0	$is_insane++;
116							}
117
118	15	50				122	return $is_insane ? 0 : 1;
119							}
120
121							sub is_valid_utf8 {
122	39	100	100	39	1	4963	if ( is_latin1( $_[0] )
			100
123							&& !is_ascii( $_[0] )
124							&& !is_perl_utf8_string( $_[0] ) )
125							{
126	8					30	return 0;
127							}
128	31					155	return is_perl_utf8_string( $_[0] );
129							}
130
131							sub find_bad_latin1_report {
132	2			2	1	223	my $bad = find_bad_latin1(@_);
133	2	50				5	if ($bad) {
134
135							# explain why we failed
136	2					5	my $char = substr( $_[0], $bad, 1 );
137	2					3	my $dec = ord($char);
138	2					7	my $hex = sprintf '%x', $dec;
139	2					277	carp("byte $bad ($char) is not Latin1 (it's $dec dec / $hex hex)");
140							}
141	2					16	return $bad;
142							}
143
144							sub looks_like_cp1252 {
145	6	100	33	6	1	59	if ( !is_latin1( $_[0] )
			66
146							&& !is_ascii( $_[0] )
147							&& $_[0] =~ m/[\x80-\x9f]/ )
148							{
149	5					22	return 1;
150							}
151	1					5	return 0;
152							}
153
154							my %win1252 = (
155							"\x80" => "\x{20AC}", #EURO SIGN
156							"\x81" => '', #UNDEFINED
157							"\x82" => "\x{201A}", #SINGLE LOW-9 QUOTATION MARK
158							"\x83" => "\x{0192}", #LATIN SMALL LETTER F WITH HOOK
159							"\x84" => "\x{201E}", #DOUBLE LOW-9 QUOTATION MARK
160							"\x85" => "\x{2026}", #HORIZONTAL ELLIPSIS
161							"\x86" => "\x{2020}", #DAGGER
162							"\x87" => "\x{2021}", #DOUBLE DAGGER
163							"\x88" => "\x{02C6}", #MODIFIER LETTER CIRCUMFLEX ACCENT
164							"\x89" => "\x{2030}", #PER MILLE SIGN
165							"\x8A" => "\x{0160}", #LATIN CAPITAL LETTER S WITH CARON
166							"\x8B" => "\x{2039}", #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
167							"\x8C" => "\x{0152}", #LATIN CAPITAL LIGATURE OE
168							"\x8D" => '', #UNDEFINED
169							"\x8E" => "\x{017D}", #LATIN CAPITAL LETTER Z WITH CARON
170							"\x8F" => '', #UNDEFINED
171							"\x90" => '', #UNDEFINED
172							"\x91" => "\x{2018}", #LEFT SINGLE QUOTATION MARK
173							"\x92" => "\x{2019}", #RIGHT SINGLE QUOTATION MARK
174							"\x93" => "\x{201C}", #LEFT DOUBLE QUOTATION MARK
175							"\x94" => "\x{201D}", #RIGHT DOUBLE QUOTATION MARK
176							"\x95" => "\x{2022}", #BULLET
177							"\x96" => "\x{2013}", #EN DASH
178							"\x97" => "\x{2014}", #EM DASH
179							"\x98" => "\x{02DC}", #SMALL TILDE
180							"\x99" => "\x{2122}", #TRADE MARK SIGN
181							"\x9A" => "\x{0161}", #LATIN SMALL LETTER S WITH CARON
182							"\x9B" => "\x{203A}", #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
183							"\x9C" => "\x{0153}", #LATIN SMALL LIGATURE OE
184							"\x9D" => '', #UNDEFINED
185							"\x9E" => "\x{017E}", #LATIN SMALL LETTER Z WITH CARON
186							"\x9F" => "\x{0178}", #LATIN CAPITAL LETTER Y WITH DIAERESIS
187
188							);
189
190							# fix_latin (used in Transliterate) lacks the check for the
191							# prefixed \xc2 byte, but the UTF-8 encoding for these
192							# Windows codepoints has the leading \xc2 byte.
193							sub fix_cp1252_codepoints_in_utf8 {
194	1			1	1	217	my $buf = shift;
195	1	50				2	unless ( is_valid_utf8($buf) ) {
196	0					0	my $badbyte = find_bad_utf8($buf);
197	0					0	croak "bad UTF-8 byte(s) at $badbyte [ " . dump($buf) . " ]";
198							}
199	1	50				3	$Debug and warn "converting $buf\n";
200	1					2	my $bytes = Encode::encode_utf8( to_utf8($buf) );
201	1					14	$bytes =~ s/\xc2([\x80-\x9f])/$win1252{$1}/g;
202	1					2	return to_utf8($bytes);
203							}
204
205							1;
206
207							__END__