File Coverage

lib/Text/ASCII/Convert.pm

Criterion	Covered	Total	%
statement	23	23	100.0
branch	7	8	87.5
condition			n/a
subroutine	7	7	100.0
pod	0	2	0.0
total	37	40	92.5

line	stmt	bran	sub	pod	time	code
1						package Text::ASCII::Convert;
2	1		1		72491	use strict;
	1				3
	1				35
3	1		1		10	use warnings FATAL => 'all';
	1				3
	1				54
4	1		1		664	use Encode qw(decode);
	1				11259
	1				72
5	1		1		7	use base 'Exporter';
	1				2
	1				387
6						our @EXPORT = qw(convert_to_ascii);
7						our @EXPORT_OK = qw(convert_to_ascii);
8						our $VERSION = '0.21';
9						my %char_map;
10
11						=encoding utf8
12
13						=head1 NAME
14
15						Text::ASCII::Convert - Perl module to convert non-ASCII characters to their ASCII equivalents
16
17						=head1 SYNOPSIS
18
19						use Text::ASCII::Convert;
20
21						print convert_to_ascii("Ýou hãve a nèw vòice-mãil");
22
23						# prints "You have a new voice-mail"
24
25						=head1 DESCRIPTION
26
27						This module attempts to convert non-ASCII characters in a string to their closet ASCII homoglyph. The input
28						can be a string of Unicode characters or a string of UTF-8 octets. The output is always a string of ASCII characters
29						in the range 0x00 to 0x7F.
30
31						This is most useful for catching spam that uses non-ASCII characters to obfuscate words. For example,
32
33						Ýou hãve a nèw vòice-mãil
34						You havé Reꞓeìved an Enꞓryptéd Company Maíl
35
36						would be converted to
37
38						You have a new voice-mail
39						You have ReCeived an EnCrypted Company Mail
40
41						Unlike other transliteration software, this plugin converts non-ASCII characters
42						to their ASCII equivalents based on appearance instead of meaning. For example, the
43						German eszett character 'ß' is converted to the Roman letter 'B' instead of 'ss'
44						because it resembles a 'B' in appearance. Likewise, the Greek letter Sigma ('Σ') is
45						converted to 'E' and a lower case Omega ('ω') is converted to 'w' even though these
46						letters have different lexical meanings.
47
48						Not all non-ASCII characters are converted. For example, the Japanese Hiragana
49						character 'あ' is not converted because it does not resemble any ASCII character.
50						Characters that have no ASCII equivalent are replaced by spaces. To avoid long runs
51						of spaces, multiple spaces are collapsed into a single space. For example,
52
53						Find 💋💘Singles💋💘 in your Area
54
55						would be converted to
56
57						Find Singles in your Area
58
59						The plugin also removes zero-width characters such as the zero-width
60						space (U+200B) and zero-width non-joiner (U+200C) that are often used to
61						obfuscate words.
62
63						Control characters such as tabs, newlines, and carriage returns are retained.
64
65						=head1 AUTHORS
66
67						Kent Oyer
68
69						=head1 LICENSE AND COPYRIGHT
70
71						Copyright (C) 2023 MXGuardian LLC
72
73						This program is free software: you can redistribute it and/or modify
74						it under the terms of the GNU General Public License as published by
75						the Free Software Foundation, either version 3 of the License, or
76						(at your option) any later version.
77
78						This program is distributed in the hope that it will be useful,
79						but WITHOUT ANY WARRANTY; without even the implied warranty of
80						MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE
81						file included with this distribution for more information.
82
83						You should have received a copy of the GNU General Public License
84						along with this program. If not, see https://www.gnu.org/licenses/.
85
86						=cut
87
88						UNITCHECK {
89						# build character map from __DATA__ section
90						while () {
91						chomp;
92						my ($key,$value) = split /\s+/;
93						my $ascii = join('', map { chr(hex($_)) } split /\+/, $value);
94						$char_map{chr(hex($key))} = $ascii;
95						}
96						close DATA;
97						};
98
99						# Converts a string of Unicode characters (or UTF-8 encoded bytes) to a string of ASCII characters
100						# in the range 0x00 to 0x7F. Non-ASCII characters are replaced with their ASCII equivalents.
101						# Zero-width characters and combining marks are removed. Multiple spaces are collapsed into a single space.
102						#
103						sub convert_to_ascii {
104	6	100	6	0	2076	my $str = is_valid_utf_8($_[0]) ? decode('UTF-8', $_[0]) : $_[0];
105						# remove zero-width characters and combining marks
106	1		1		607	$str =~ s/[\xAD\x{034F}\x{200B}-\x{200F}\x{202A}\x{202B}\x{202C}\x{2060}\x{FEFF}]\|\p{Combining_Mark}//g;
	1				15
	1				28
	6				137
107						# replace non-ascii characters with ascii equivalents
108	6	100			28	$str =~ s/([^[:ascii:]])/defined($char_map{$1})?$char_map{$1}:' '/eg;
	53				230
109						# collapse spaces
110	6				53	$str =~ s/\x{20}+/ /g;
111	6				35	return $str;
112						}
113
114						# returns true if the provided string of octets represents a syntactically
115						# valid UTF-8 string, otherwise a false is returned.
116						# Copied from Mail::SpamAssassin::Util::is_valid_utf8
117						#
118						sub is_valid_utf_8 {
119	6	50	6	0	17	return undef if !defined $_[0];
120						#
121						# RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
122						# RFC 3629 section 4: Syntax of UTF-8 Byte Sequences
123						# UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
124						# UTF8-1 = %x00-7F
125						# UTF8-2 = %xC2-DF UTF8-tail
126						# UTF8-3 = %xE0 %xA0-BF UTF8-tail /
127						# %xE1-EC 2( UTF8-tail ) /
128						# %xED %x80-9F UTF8-tail /
129						# # U+D800..U+DFFF are utf16 surrogates, not legal utf8
130						# %xEE-EF 2( UTF8-tail )
131						# UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
132						# %xF1-F3 3( UTF8-tail ) /
133						# %xF4 %x80-8F 2( UTF8-tail )
134						# UTF8-tail = %x80-BF
135						#
136						# loose variant:
137						# [\x00-\x7F] \| [\xC0-\xDF][\x80-\xBF] \|
138						# [\xE0-\xEF][\x80-\xBF]{2} \| [\xF0-\xF4][\x80-\xBF]{3}
139						#
140	6	100			60	$_[0] =~ /^ (?: [\x00-\x7F] \|
141						[\xC2-\xDF] [\x80-\xBF] \|
142						\xE0 [\xA0-\xBF] [\x80-\xBF] \|
143						[\xE1-\xEC] [\x80-\xBF]{2} \|
144						\xED [\x80-\x9F] [\x80-\xBF] \|
145						[\xEE-\xEF] [\x80-\xBF]{2} \|
146						\xF0 [\x90-\xBF] [\x80-\xBF]{2} \|
147						[\xF1-\xF3] [\x80-\xBF]{3} \|
148						\xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0;
149						}
150
151
152						1;
153
154						__DATA__