File Coverage

lib/Text/ASCII/Convert.pm
Criterion Covered Total %
statement 23 23 100.0
branch 7 8 87.5
condition n/a
subroutine 7 7 100.0
pod 0 2 0.0
total 37 40 92.5


line stmt bran cond sub pod time code
1             package Text::ASCII::Convert;
2 1     1   72491 use strict;
  1         3  
  1         35  
3 1     1   10 use warnings FATAL => 'all';
  1         3  
  1         54  
4 1     1   664 use Encode qw(decode);
  1         11259  
  1         72  
5 1     1   7 use base 'Exporter';
  1         2  
  1         387  
6             our @EXPORT = qw(convert_to_ascii);
7             our @EXPORT_OK = qw(convert_to_ascii);
8             our $VERSION = '0.21';
9             my %char_map;
10              
11             =encoding utf8
12              
13             =head1 NAME
14              
15             Text::ASCII::Convert - Perl module to convert non-ASCII characters to their ASCII equivalents
16              
17             =head1 SYNOPSIS
18              
19             use Text::ASCII::Convert;
20              
21             print convert_to_ascii("Ýou hãve a nèw vòice-mãil");
22              
23             # prints "You have a new voice-mail"
24              
25             =head1 DESCRIPTION
26              
27             This module attempts to convert non-ASCII characters in a string to their closet ASCII homoglyph. The input
28             can be a string of Unicode characters or a string of UTF-8 octets. The output is always a string of ASCII characters
29             in the range 0x00 to 0x7F.
30              
31             This is most useful for catching spam that uses non-ASCII characters to obfuscate words. For example,
32              
33             Ýou hãve a nèw vòice-mãil
34             You havé Reꞓeìved an Enꞓryptéd Company Maíl
35              
36             would be converted to
37              
38             You have a new voice-mail
39             You have ReCeived an EnCrypted Company Mail
40              
41             Unlike other transliteration software, this plugin converts non-ASCII characters
42             to their ASCII equivalents based on appearance instead of meaning. For example, the
43             German eszett character 'ß' is converted to the Roman letter 'B' instead of 'ss'
44             because it resembles a 'B' in appearance. Likewise, the Greek letter Sigma ('Σ') is
45             converted to 'E' and a lower case Omega ('ω') is converted to 'w' even though these
46             letters have different lexical meanings.
47              
48             Not all non-ASCII characters are converted. For example, the Japanese Hiragana
49             character 'あ' is not converted because it does not resemble any ASCII character.
50             Characters that have no ASCII equivalent are replaced by spaces. To avoid long runs
51             of spaces, multiple spaces are collapsed into a single space. For example,
52              
53             Find 💋💘Singles💋💘 in your Area
54              
55             would be converted to
56              
57             Find Singles in your Area
58              
59             The plugin also removes zero-width characters such as the zero-width
60             space (U+200B) and zero-width non-joiner (U+200C) that are often used to
61             obfuscate words.
62              
63             Control characters such as tabs, newlines, and carriage returns are retained.
64              
65             =head1 AUTHORS
66              
67             Kent Oyer
68              
69             =head1 LICENSE AND COPYRIGHT
70              
71             Copyright (C) 2023 MXGuardian LLC
72              
73             This program is free software: you can redistribute it and/or modify
74             it under the terms of the GNU General Public License as published by
75             the Free Software Foundation, either version 3 of the License, or
76             (at your option) any later version.
77              
78             This program is distributed in the hope that it will be useful,
79             but WITHOUT ANY WARRANTY; without even the implied warranty of
80             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE
81             file included with this distribution for more information.
82              
83             You should have received a copy of the GNU General Public License
84             along with this program. If not, see https://www.gnu.org/licenses/.
85              
86             =cut
87              
88             UNITCHECK {
89             # build character map from __DATA__ section
90             while () {
91             chomp;
92             my ($key,$value) = split /\s+/;
93             my $ascii = join('', map { chr(hex($_)) } split /\+/, $value);
94             $char_map{chr(hex($key))} = $ascii;
95             }
96             close DATA;
97             };
98              
99             # Converts a string of Unicode characters (or UTF-8 encoded bytes) to a string of ASCII characters
100             # in the range 0x00 to 0x7F. Non-ASCII characters are replaced with their ASCII equivalents.
101             # Zero-width characters and combining marks are removed. Multiple spaces are collapsed into a single space.
102             #
103             sub convert_to_ascii {
104 6 100   6 0 2076 my $str = is_valid_utf_8($_[0]) ? decode('UTF-8', $_[0]) : $_[0];
105             # remove zero-width characters and combining marks
106 1     1   607 $str =~ s/[\xAD\x{034F}\x{200B}-\x{200F}\x{202A}\x{202B}\x{202C}\x{2060}\x{FEFF}]|\p{Combining_Mark}//g;
  1         15  
  1         28  
  6         137  
107             # replace non-ascii characters with ascii equivalents
108 6 100       28 $str =~ s/([^[:ascii:]])/defined($char_map{$1})?$char_map{$1}:' '/eg;
  53         230  
109             # collapse spaces
110 6         53 $str =~ s/\x{20}+/ /g;
111 6         35 return $str;
112             }
113              
114             # returns true if the provided string of octets represents a syntactically
115             # valid UTF-8 string, otherwise a false is returned.
116             # Copied from Mail::SpamAssassin::Util::is_valid_utf8
117             #
118             sub is_valid_utf_8 {
119 6 50   6 0 17 return undef if !defined $_[0];
120             #
121             # RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
122             # RFC 3629 section 4: Syntax of UTF-8 Byte Sequences
123             # UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
124             # UTF8-1 = %x00-7F
125             # UTF8-2 = %xC2-DF UTF8-tail
126             # UTF8-3 = %xE0 %xA0-BF UTF8-tail /
127             # %xE1-EC 2( UTF8-tail ) /
128             # %xED %x80-9F UTF8-tail /
129             # # U+D800..U+DFFF are utf16 surrogates, not legal utf8
130             # %xEE-EF 2( UTF8-tail )
131             # UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
132             # %xF1-F3 3( UTF8-tail ) /
133             # %xF4 %x80-8F 2( UTF8-tail )
134             # UTF8-tail = %x80-BF
135             #
136             # loose variant:
137             # [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] |
138             # [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3}
139             #
140 6 100       60 $_[0] =~ /^ (?: [\x00-\x7F] |
141             [\xC2-\xDF] [\x80-\xBF] |
142             \xE0 [\xA0-\xBF] [\x80-\xBF] |
143             [\xE1-\xEC] [\x80-\xBF]{2} |
144             \xED [\x80-\x9F] [\x80-\xBF] |
145             [\xEE-\xEF] [\x80-\xBF]{2} |
146             \xF0 [\x90-\xBF] [\x80-\xBF]{2} |
147             [\xF1-\xF3] [\x80-\xBF]{3} |
148             \xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0;
149             }
150              
151              
152             1;
153              
154             __DATA__