line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Catalyst::Plugin::Params::Demoronize; |
2
|
|
|
|
|
|
|
BEGIN { |
3
|
2
|
|
|
2
|
|
3737
|
$Catalyst::Plugin::Params::Demoronize::VERSION = '1.14'; |
4
|
|
|
|
|
|
|
} |
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
19
|
use strict; |
|
2
|
|
|
|
|
80
|
|
|
2
|
|
|
|
|
79
|
|
7
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
109
|
|
8
|
2
|
|
|
2
|
|
2234
|
use utf8; |
|
2
|
|
|
|
|
17
|
|
|
2
|
|
|
|
|
17
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 NAME |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Catalyst::Plugin::Params::Demoronize - convert common UTF-8 and Windows-1252 characters to their ASCII equivalents |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 SYNOPSIS |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# Be sure and use the Unicode plugin if you want to handle Unicode |
17
|
|
|
|
|
|
|
# replacement. |
18
|
|
|
|
|
|
|
use Catalyst qw(Unicode Demoronize); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Optionally enable replacement of common unicode "smart" characters. |
21
|
|
|
|
|
|
|
MyApp->config->{demoronize} = { replace_unicode => 1 } |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
To borrow a few passages from the documentation packaged |
26
|
|
|
|
|
|
|
with John Walker's demoronizer.pl: |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=over 4 |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
...as is usually the case when you encounter something |
31
|
|
|
|
|
|
|
shoddy in the vicinity of a computer, Microsoft incompetence |
32
|
|
|
|
|
|
|
and gratuitous incompatibility were to blame. Western |
33
|
|
|
|
|
|
|
language HTML documents are written in the ISO 8859-1 |
34
|
|
|
|
|
|
|
Latin-1 character set, with a specified set of escapes for |
35
|
|
|
|
|
|
|
special characters. Blithely ignoring this prescription, as |
36
|
|
|
|
|
|
|
usual, Microsoft use their own "extension" to Latin-1, in |
37
|
|
|
|
|
|
|
which a variety of characters which do not appear in Latin-1 |
38
|
|
|
|
|
|
|
are inserted in the range 0x82 through 0x95--this having the |
39
|
|
|
|
|
|
|
merit of being incompatible with both Latin-1 and Unicode, |
40
|
|
|
|
|
|
|
which reserve this region for additional control |
41
|
|
|
|
|
|
|
characters. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
These characters include open and close single and double |
44
|
|
|
|
|
|
|
quotes, em and en dashes, an ellipsis and a variety of other |
45
|
|
|
|
|
|
|
things you've been dying for, such as a capital Y umlaut and |
46
|
|
|
|
|
|
|
a florin symbol. Well, okay, you say, if Microsoft want to |
47
|
|
|
|
|
|
|
have their own little incompatible character set, why not? |
48
|
|
|
|
|
|
|
Because it doesn't stop there--in their inimitable fashion |
49
|
|
|
|
|
|
|
(who would want to?)--they aggressively pollute the Web |
50
|
|
|
|
|
|
|
pages of unknowing and innocent victims worldwide with these |
51
|
|
|
|
|
|
|
characters, with the result that the owners of these pages |
52
|
|
|
|
|
|
|
look like semi-literate morons when their pages are viewed |
53
|
|
|
|
|
|
|
on non-Microsoft platforms (or on Microsoft platforms, for |
54
|
|
|
|
|
|
|
that matter, if the user has selected as the browser's font |
55
|
|
|
|
|
|
|
one of the many TrueType fonts which do not include the |
56
|
|
|
|
|
|
|
incompatible Microsoft characters). |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
You see, "state of the art" Microsoft Office applications |
59
|
|
|
|
|
|
|
sport a nifty feature called "smart quotes." (Rule of |
60
|
|
|
|
|
|
|
thumb--every time Microsoft use the word "smart," be on the |
61
|
|
|
|
|
|
|
lookout for something dumb). This feature is on by default |
62
|
|
|
|
|
|
|
in both Word and PowerPoint, and can be disabled only by |
63
|
|
|
|
|
|
|
finding the little box buried among the dozens of |
64
|
|
|
|
|
|
|
bewildering option panels these products contain. If |
65
|
|
|
|
|
|
|
enabled, and you type the string, |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
"Halt," he cried, "this is the police!" |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
"smart quotes" transforms the ASCII quote characters |
70
|
|
|
|
|
|
|
automatically into the incompatible Microsoft opening and |
71
|
|
|
|
|
|
|
closing quotes. ASCII single and double quotes are |
72
|
|
|
|
|
|
|
similarly transformed (even though ASCII already contains |
73
|
|
|
|
|
|
|
apostrophe and single open quote characters), and double |
74
|
|
|
|
|
|
|
hyphens are replaced by the incompatible em dash symbol. |
75
|
|
|
|
|
|
|
What other horrors occur, I know not. If the user notices |
76
|
|
|
|
|
|
|
this happening at all, their reaction might be "Thank you |
77
|
|
|
|
|
|
|
Billy-boy--that looks ever so much nicer," not knowing |
78
|
|
|
|
|
|
|
they've been set up to look like a moron to folks all over |
79
|
|
|
|
|
|
|
the world. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=back |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
These characters are commonly inserted into form elements |
84
|
|
|
|
|
|
|
via cut-and-paste operations. In many cases, they are |
85
|
|
|
|
|
|
|
converted to UTF-8 by the browser. This plugin will replace |
86
|
|
|
|
|
|
|
both the unicode characters AND the Windows-1252 characters |
87
|
|
|
|
|
|
|
with sane ASCII equivalents. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head1 UNICODE |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Demoronize assumes that you are using L<Catalyst::Plugin::Unicode> |
92
|
|
|
|
|
|
|
to convert incoming parameters into Unicode characters. If you are |
93
|
|
|
|
|
|
|
not and enable optional C<replace_unicode>, you may have issues. |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=head1 CONFIG |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head2 replace_unicode |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
If this flag is enabled (it is off by default) then commonly substituted |
100
|
|
|
|
|
|
|
Unicode characters will be converted to their ASCII equivalents. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head2 replace_map |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
A map of Unicode characters and their ASCII equivalents that will be swapped. |
105
|
|
|
|
|
|
|
This can be overridden; the default map is the one defined in C<_demoronize> in this module's source. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=cut |
108
|
|
|
|
|
|
|
|
109
|
2
|
|
|
2
|
|
3383
|
use MRO::Compat; |
|
2
|
|
|
|
|
14655
|
|
|
2
|
|
|
|
|
79
|
|
110
|
2
|
|
|
2
|
|
3740
|
use Encode::ZapCP1252; |
|
2
|
|
|
|
|
51288
|
|
|
2
|
|
|
|
|
2015
|
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head1 METHODS |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=over 4 |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=item prepare_parameters |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
Converts parameters. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=cut |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# Catalyst hook. Runs the next prepare_parameters in the method chain (if
# any), then replaces every request parameter value with the result of
# _demoronize().
#
# Plain scalar values are converted directly; array-reference values (a
# parameter submitted more than once) are converted element by element into
# a fresh array. Any other reference type is left untouched.
#
# Returns whatever the next method in the chain returned (undef when there
# is none), so this plugin does not swallow an upstream return value.
sub prepare_parameters
{
    my $c = shift;

    my $retval = $c->maybe::next::method(@_);
    my $params = $c->req->params;

    foreach my $key (keys %$params) {
        # ref() yields '' for a non-reference, 'ARRAY' for an unblessed
        # array reference.
        my $type = ref $params->{$key};

        if ($type eq '') {
            $params->{$key} = $c->_demoronize($params->{$key});
        }
        elsif ($type eq 'ARRAY') {
            $params->{$key}
                = [ map { $c->_demoronize($_) } @{ $params->{$key} } ];
        }
    }

    return $retval;
}
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Returns a cleaned-up copy of a single parameter value.
#
#   $c   - the Catalyst context; its config->{demoronize} hash is consulted
#          (and created if missing).
#   $str - the value to clean; may be undef, in which case undef is returned.
#
# Step 1 always runs: zap_cp1252() (from Encode::ZapCP1252) is applied to
# the value. Step 2 runs only when config->{demoronize}{replace_unicode} is
# true: every key of config->{demoronize}{replace_map} found in the string
# is replaced by the corresponding value.
sub _demoronize
{
    my $c   = shift;
    my $str = shift;

    # Called in void context so that $str itself is rewritten.
    zap_cp1252($str);

    my $config = $c->config->{'demoronize'} ||= {};

    # Install the defaults only when the application has not configured its
    # own map; an unconditional assignment here would throw away the
    # override on every call.
    $config->{replace_map} ||= {
        '‚' => ',',    # 82, SINGLE LOW-9 QUOTATION MARK
        '„' => ',,',   # 84, DOUBLE LOW-9 QUOTATION MARK
        '…' => '...',  # 85, HORIZONTAL ELLIPSIS
        'ˆ' => '^',    # 88, MODIFIER LETTER CIRCUMFLEX ACCENT
        '‘' => '`',    # 91, LEFT SINGLE QUOTATION MARK
        '’' => "'",    # 92, RIGHT SINGLE QUOTATION MARK
        '“' => '"',    # 93, LEFT DOUBLE QUOTATION MARK
        '”' => '"',    # 94, RIGHT DOUBLE QUOTATION MARK
        '•' => '*',    # 95, BULLET
        '–' => '-',    # 96, EN DASH
        '—' => '-',    # 97, EM DASH
        '‹' => '<',    # 8B, SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        '›' => '>',    # 9B, SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    };

    # Nothing more to do for an undefined value or when Unicode replacement
    # has not been switched on.
    return $str
        unless defined $str && $config->{'replace_unicode'};

    my $map = $config->{replace_map};
    foreach my $replace (keys %$map) {
        # \Q...\E: map keys are literal characters, not regex fragments, so
        # a configured key such as '.' or '|' must not act as a metacharacter.
        $str =~ s/\Q$replace\E/$map->{$replace}/g;
    }

    return $str;
}
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=back |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 AUTHOR |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
Mike Eldridge |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=over 4 |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=item * Cory Watson |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=item * Chisel Wright |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=item * Michele Beltrame |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=back |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=cut |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
1; |
196
|
|
|
|
|
|
|
|