package Regexp::Common::profanity_us;

BEGIN {
    $Regexp::Common::profanity_us::VERSION = '4.112150';
}

use strict;
use warnings;

use Data::Dumper;
use Data::Section::Simple qw(get_data_section);
use Regexp::Common qw /pattern clean no_defaults/;

# Comparator to sort pattern alternatives longest-first, so that the
# regexp engine would prefer the most specific phrase; not currently
# called by the code below.
sub longest_first { length($b) <=> length($a) }

my ($profanity, @profanity);

# The word list lives under the 'profanity' heading of __DATA__.
my $data = Data::Section::Simple::get_data_section('profanity');
#die "data: $data";

my @line = split "\n", $data;

# Collect the word list, skipping comments and blank lines and
# trimming surrounding whitespace.
for (@line) {
    # warn $_;
    next if /^#/;
    next if /^\s*$/;
    s/^\s+//;
    s/\s+$//;
    push @profanity, $_;
}

#die Dumper("profanity", \@profanity);

pattern name   => [qw (profanity us normal label -dist=7)],
        create => sub {
            my ($self, $flags) = @_;

            # Each '-' in a word-list entry stands for "up to -dist
            # characters of anything", e.g. 'suck-my-cock' with the
            # default -dist=7 becomes 'suck.{0,7}my.{0,7}cock'.
            my $word_width = $flags->{-dist};
            my $any_char   = ".{0,$word_width}";

            # Note: this map rewrites @profanity in place, so the '-'
            # placeholders are consumed by the first pattern creation.
            @profanity = map { s/-/$any_char/g; $_ } @profanity;

            $profanity = join '|', @profanity;

            # (?k: ... ) is Regexp::Common's keep marker: under -keep
            # the entire profane match is captured as $1.
            $profanity = '(?k:' . $profanity . ')';
            #warn "$profanity";
            $profanity;
        },
        ;

1;

=pod

=head1 NAME

Regexp::Common::profanity_us -- provide regexes for U.S. profanity

=head1 SYNOPSIS

  use Regexp::Common qw /profanity_us/;

  my $RE = $RE{profanity}{us}{normal}{label}{-keep}{-dist=>3};

  while (<>) {
      warn "PROFANE" if /$RE/;
  }

Or, more simply:

  use Regexp::Profanity::US;

  $profane = profane ($string);
  @profane = profane_list($string);

=head1 OVERVIEW

Instead of a dry technical overview, I am going to explain the structure
of this module based on its history. I consult at a company that generates
customer leads, primarily by running websites that attract people
(lowering loan values, selling cars, buying real estate, and so on). For
some reason we get more than our fair share of profane leads, so I was
told to write a profanity checker.

For the data I was dealing with, the profanity most often appeared in the
email address or in the first or last name, so I naively started filtering
that sort of data with a set of regexps. Note that both names and email
addresses are unlike what you are reading now: they are not
whitespace-separated text, but labels.

Full support for profanity checking should therefore work in two entirely
different contexts: labels (email addresses, names) and text (what you are
reading now). Because open source is driven by demand and I have no need
for detecting profanity in text, only C<label> checking is implemented.
And I am sure you know the next sentence: "patches welcome" :)

=head2 Spelling Variations Dictated by Sound or Sight

=head3 Creative use of symbols to spell words (el33t sp3@k)

Now, within labels, you can see either normal ASCII or creative use of
symbols. Here are some normal profane labels:

  suckmycock@isp.com
  shitonastick

And here they are in ascii art:

  s\/cKmyc0k@aol.com
  sh|+0naST1ck

A CPAN module which does a great job of "drawing words" is L.
I thought I knew all of the ways that someone could "inflate" a letter so
that dirty words could bypass a profanity checker, but just look at all
these:

  %letter =
    ( a => [ "4", "@" ],
      c => "(",
      e => "3",
      g => "6",
      h => [ "|-|", "]-[" ],
      k => [ "|<", "]{" ],
      i => "!",
      l => [ "1", "|" ],
      m => [ "|V|", "|\\/|" ],
      n => "|\\|",
      o => "0",
      s => [ "5", "Z" ],
      t => [ "7", "+" ],
      u => "\\_/",
      v => "\\/",
      w => [ "vv", "\\/\\/" ],
      'y' => "j",
      z => "2",
    );
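
This module does not itself handle eleet spellings (they are on the wish
list below), but to make the table concrete, here is a minimal,
hypothetical sketch of how such a table could drive a matcher. The
C<eleet_letter> helper and the trimmed-down C<%letter> copy are
illustrative only, not part of this distribution:

  use strict;
  use warnings;

  # Trimmed-down copy of the table above; illustrative only.
  my %letter = (
      a => [ "4", "@" ],
      h => [ "|-|", "]-[" ],
      i => "!",
      s => [ "5", "Z" ],
      t => [ "7", "+" ],
  );

  # Build a sub-pattern matching one letter in plain or eleet form.
  sub eleet_letter {
      my ($char) = @_;
      my $subs   = $letter{$char};
      my @alts   = ($char, ref $subs ? @$subs : defined $subs ? $subs : ());
      return '(?:' . join('|', map { quotemeta } @alts) . ')';
  }

  # "shit" in any mix of plain and eleet glyphs, e.g. "sh!+" or "5hi7"
  my $eleet_shit = join '', map { eleet_letter($_) } split //, 'shit';
  print "eleet match\n" if 'sh!+' =~ /\A$eleet_shit\z/;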

=head3 Soundex respelling

Which of course brings me to the final way to take normal text and vary
it while keeping the same meaning: soundex. The way a word sounds can
lead to different spellings. For example, we have

  shitonastick

which we can soundex out as

  shitonuhstick

Or, given

  nigger

we can rewrite it as

  nigga
  nigguh
  niggah

There are two CPAN modules, L and L, which do this sort of thing, but
after they resolved "shit" and "shot" to the same soundex, I forgot
about them :).
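
For instance, with L<Text::Soundex> (one module of this kind), the
collision is easy to see:

  use Text::Soundex;

  # Both reduce to the same code, S300 -- the false positive that made
  # pure soundex matching unattractive for this module.
  print soundex('shit'), "\n";   # prints S300
  print soundex('shot'), "\n";   # prints S300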

So to conclude this OVERVIEW (or is that oV3r\/ieW :), this module does
profanity checking for

  labels and not text

and for

  normal and not eleet spelling

with a bit of hedging to support soundexing. (Only definitely obscene
words are searched for; ambiguous / contextual searching is left as an
exercise for the reader.)

In L<Regexp::Common> terminology (the infrastructure on which this
module is built), we have only the following regexp for your
string-matching ecstasy:

  $RE{profanity}{us}{normal}{label}

and patches are welcome for:

  $RE{profanity}{us}{label}{eleet}
  $RE{profanity}{us}{text}{normal}
  $RE{profanity}{us}{text}{eleet}

But do note this if you plan to implement I<text> parsing:
C<[^[:alpha:]]> rather than C<\b> should be used, because C<_> does not
form a word boundary, and so

  \bshit\b

will match

  shit head

and

  shit-head

but not

  shit_head

Another thing about text is that it may be resolved into labels by
splitting on whitespace. Thus, one could keep a single matching engine
and vary only the pre-processor.
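
To see the boundary problem concretely (a sketch only; this module does
no text parsing itself):

  # "_" is a word character, so \b finds no boundary in "shit_head".
  for my $s ('shit head', 'shit-head', 'shit_head') {
      print "$s: ", ($s =~ /\bshit\b/ ? "match" : "no match"), "\n";
  }

  # A character-class boundary catches all three:
  # /(?:^|[^[:alpha:]])shit(?:[^[:alpha:]]|$)/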

=head1 USAGE

Please consult the manual of L<Regexp::Common> for a general description
of the workings of this interface.

Do not use this module directly, but load it via I<Regexp::Common>.

This module reads one flag, C<-dist>, which sets the number of characters
that may appear between the components of an obscene phrase. For example,

  suck!!!my!!!cock

will be matched by the word-list entry

  suck-my-cock

as long as C<-dist> is set to 3 or greater, because this module rewrites
each C<-> as C<.{0,$dist}>, with C<$dist> defaulting to 7.

Why such a large default? So that the profanity list can omit filler
words such as "my" or "your". Take

  poop on your face

We have the word-list entry

  poop--face

which is transformed into

  poop.{0,7}.{0,7}face

and so matches the possible prepositions and adjectives between "poop"
and "face", as well as the hideous term "poopface". A runnable sketch of
this rewrite follows.

=head2 Capturing

Under C<-keep> (see L<Regexp::Common>):

=over 4

=item $1

captures the entire word

=back
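
For example, assuming C<shitonastick> is in the bundled word list, as
the OVERVIEW suggests:

  use Regexp::Common qw /profanity_us/;

  my $re = $RE{profanity}{us}{normal}{label}{-keep};

  if ('dearshitonastickdear' =~ /$re/) {
      print "caught: $1\n";   # $1 is the profane match itself
  }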

=head1 SEE ALSO

L<Regexp::Common> for a general description of how to use this interface.

L<Regexp::Common::profanity> for a slightly more European set of words.

L<Regexp::Profanity::US> for a pair of wrapper functions that use these
regexps.

=head1 AUTHOR

T. M. Brannon, tbone@cpan.org

I cannot pay enough thanks to Matthew Simon Cavalletto, evo@cpan.org,
who refactored this module completely of his own volition and in spite
of his hectic schedule. He turned this module from an unsophisticated
hack into something worth others using.

Useful brain picking came from William McKee of Knowmad Consulting on the
L mailing list.

=cut

__DATA__