line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This is a Bayesian-style probabilistic classifier, using an algorithm based on |
25
|
|
|
|
|
|
|
the one detailed in Paul Graham's I<A Plan For Spam> paper at: |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
http://www.paulgraham.com/spam.html |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
It also incorporates some other aspects taken from Graham Robinson's webpage |
30
|
|
|
|
|
|
|
on the subject at: |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
And the chi-square probability combiner as described here: |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
http://www.linuxjournal.com/print.php?sid=6467 |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
The results are incorporated into SpamAssassin as the BAYES_* rules. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 METHODS |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=cut |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
package Mail::SpamAssassin::Plugin::Bayes; |
45
|
|
|
|
|
|
|
|
46
|
21
|
|
|
21
|
|
153
|
use strict; |
|
21
|
|
|
|
|
47
|
|
|
21
|
|
|
|
|
720
|
|
47
|
21
|
|
|
21
|
|
120
|
use warnings; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
721
|
|
48
|
|
|
|
|
|
|
# use bytes; |
49
|
21
|
|
|
21
|
|
136
|
use re 'taint'; |
|
21
|
|
|
|
|
42
|
|
|
21
|
|
|
|
|
2047
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
BEGIN { |
52
|
21
|
|
|
|
|
155
|
eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 } |
|
21
|
|
|
|
|
1391
|
|
|
21
|
|
|
|
|
717
|
|
53
|
21
|
50
|
|
21
|
|
80
|
or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
21
|
|
|
21
|
|
165
|
use Mail::SpamAssassin; |
|
21
|
|
|
|
|
45
|
|
|
21
|
|
|
|
|
622
|
|
57
|
21
|
|
|
21
|
|
128
|
use Mail::SpamAssassin::Plugin; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
678
|
|
58
|
21
|
|
|
21
|
|
142
|
use Mail::SpamAssassin::PerMsgStatus; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
599
|
|
59
|
21
|
|
|
21
|
|
171
|
use Mail::SpamAssassin::Logger; |
|
21
|
|
|
|
|
47
|
|
|
21
|
|
|
|
|
1337
|
|
60
|
21
|
|
|
21
|
|
148
|
use Mail::SpamAssassin::Util qw(untaint_var); |
|
21
|
|
|
|
|
38
|
|
|
21
|
|
|
|
|
982
|
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# pick ONLY ONE of these combining implementations. |
63
|
21
|
|
|
21
|
|
7469
|
use Mail::SpamAssassin::Bayes::CombineChi; |
|
21
|
|
|
|
|
51
|
|
|
21
|
|
|
|
|
13201
|
|
64
|
|
|
|
|
|
|
# use Mail::SpamAssassin::Bayes::CombineNaiveBayes; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin); |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Which headers should we scan for tokens? Don't use all of them, as it's easy |
69
|
|
|
|
|
|
|
# to pick up spurious clues from some. What we now do is use all of them |
70
|
|
|
|
|
|
|
# *less* these well-known headers; that way we can pick up spammers' tracking |
71
|
|
|
|
|
|
|
# headers (which are obviously not well-known in advance!). |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# Received is handled specially |
74
|
|
|
|
|
|
|
our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise |
75
|
|
|
|
|
|
|
|Delivered-To |Delivery-Date |
76
|
|
|
|
|
|
|
|(?:X-)?Envelope-To |
77
|
|
|
|
|
|
|
|X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
|Subject # not worth a tiny gain vs. to db size increase |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# Date: can provide invalid cues if your spam corpus is |
82
|
|
|
|
|
|
|
# older/newer than ham |
83
|
|
|
|
|
|
|
|Date |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# List headers: ignore. a spamfiltering mailing list will |
86
|
|
|
|
|
|
|
# become a nonspam sign. |
87
|
|
|
|
|
|
|
|X-List|(?:X-)?Mailing-List |
88
|
|
|
|
|
|
|
|(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe |
89
|
|
|
|
|
|
|
|Unsubscribe|Host|Id|Manager|Admin|Comment |
90
|
|
|
|
|
|
|
|Name|Url) |
91
|
|
|
|
|
|
|
|X-Unsub(?:scribe)? |
92
|
|
|
|
|
|
|
|X-Mailman-Version |X-Been[Tt]here |X-Loop |
93
|
|
|
|
|
|
|
|Mail-Followup-To |
94
|
|
|
|
|
|
|
|X-eGroups-(?:Return|From) |
95
|
|
|
|
|
|
|
|X-MDMailing-List |
96
|
|
|
|
|
|
|
|X-XEmacs-List |
97
|
|
|
|
|
|
|
|X-Sympa-To |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# gatewayed through mailing list (thanks to Allen Smith) |
100
|
|
|
|
|
|
|
|(?:X-)?Resent-(?:From|To|Date) |
101
|
|
|
|
|
|
|
|(?:X-)?Original-(?:From|To|Date) |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Spamfilter/virus-scanner headers: too easy to chain from |
104
|
|
|
|
|
|
|
# these |
105
|
|
|
|
|
|
|
|X-MailScanner(?:-SpamCheck)? |
106
|
|
|
|
|
|
|
|X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? |
107
|
|
|
|
|
|
|
|X-Antispam |X-RBL-Warning |X-Mailscanner |
108
|
|
|
|
|
|
|
|X-MDaemon-Deliver-To |X-Virus-Scanned |
109
|
|
|
|
|
|
|
|X-Mass-Check-Id |
110
|
|
|
|
|
|
|
|X-Pyzor |X-DCC-\S{2,25}-Metrics |
111
|
|
|
|
|
|
|
|X-Filtered-B[Yy] |X-Scanned-By |X-Scanner |
112
|
|
|
|
|
|
|
|X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status |
113
|
|
|
|
|
|
|
|X-SpamCop-[^:]+ |
114
|
|
|
|
|
|
|
|X-SMTPD |(?:X-)?Spam-Apparently-To |
115
|
|
|
|
|
|
|
|SPAM |X-Perlmx-Spam |
116
|
|
|
|
|
|
|
|X-Bogosity |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# some noisy Outlook headers that add no good clues: |
119
|
|
|
|
|
|
|
|Content-Class |Thread-(?:Index|Topic) |
120
|
|
|
|
|
|
|
|X-Original[Aa]rrival[Tt]ime |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# Annotations from IMAP, POP, and MH: |
123
|
|
|
|
|
|
|
|(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded |
124
|
|
|
|
|
|
|
|Lines |Content-Length |
125
|
|
|
|
|
|
|
|X-UIDL? |X-IMAPbase |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# Annotations from Bugzilla |
128
|
|
|
|
|
|
|
|X-Bugzilla-[^:]+ |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# Annotations from VM: (thanks to Allen Smith) |
131
|
|
|
|
|
|
|
|X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified |
132
|
|
|
|
|
|
|
|Summary-Format|VHeader|v\d-Data|Message-Order) |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# Annotations from Gnus: |
135
|
|
|
|
|
|
|
| X-Gnus-Mail-Source |
136
|
|
|
|
|
|
|
| Xref |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
)}x; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Note only the presence of these headers, in order to reduce the |
141
|
|
|
|
|
|
|
# hapaxen they generate. |
142
|
|
|
|
|
|
|
our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face |
143
|
|
|
|
|
|
|
|X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint |
144
|
|
|
|
|
|
|
|D(?:KIM|omainKey)-Signature |
145
|
|
|
|
|
|
|
)}ix; |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# tweaks tested as of Nov 18 2002 by jm posted to -devel at |
148
|
|
|
|
|
|
|
# http://sourceforge.net/p/spamassassin/mailman/message/12977556/ |
149
|
|
|
|
|
|
|
# for results. The winners are now the default settings. |
150
|
21
|
|
|
21
|
|
171
|
use constant IGNORE_TITLE_CASE => 1; |
|
21
|
|
|
|
|
52
|
|
|
21
|
|
|
|
|
1145
|
|
151
|
21
|
|
|
21
|
|
129
|
use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0; |
|
21
|
|
|
|
|
46
|
|
|
21
|
|
|
|
|
940
|
|
152
|
21
|
|
|
21
|
|
130
|
use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1; |
|
21
|
|
|
|
|
44
|
|
|
21
|
|
|
|
|
922
|
|
153
|
21
|
|
|
21
|
|
144
|
use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; |
|
21
|
|
|
|
|
46
|
|
|
21
|
|
|
|
|
882
|
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# tweaks by jm on May 12 2003, see -devel email at |
156
|
|
|
|
|
|
|
# http://sourceforge.net/p/spamassassin/mailman/message/14844556/ |
157
|
21
|
|
|
21
|
|
121
|
use constant PRE_CHEW_ADDR_HEADERS => 1; |
|
21
|
|
|
|
|
43
|
|
|
21
|
|
|
|
|
844
|
|
158
|
21
|
|
|
21
|
|
119
|
use constant CHEW_BODY_URIS => 1; |
|
21
|
|
|
|
|
46
|
|
|
21
|
|
|
|
|
944
|
|
159
|
21
|
|
|
21
|
|
121
|
use constant CHEW_BODY_MAILADDRS => 1; |
|
21
|
|
|
|
|
45
|
|
|
21
|
|
|
|
|
839
|
|
160
|
21
|
|
|
21
|
|
116
|
use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; |
|
21
|
|
|
|
|
46
|
|
|
21
|
|
|
|
|
854
|
|
161
|
21
|
|
|
21
|
|
124
|
use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; |
|
21
|
|
|
|
|
43
|
|
|
21
|
|
|
|
|
802
|
|
162
|
21
|
|
|
21
|
|
110
|
use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
801
|
|
163
|
21
|
|
|
21
|
|
116
|
use constant IGNORE_MSGID_TOKENS => 0; |
|
21
|
|
|
|
|
43
|
|
|
21
|
|
|
|
|
816
|
|
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# tweaks of 12 March 2004, see bug 2129. |
166
|
21
|
|
|
21
|
|
126
|
use constant DECOMPOSE_BODY_TOKENS => 1; |
|
21
|
|
|
|
|
47
|
|
|
21
|
|
|
|
|
856
|
|
167
|
21
|
|
|
21
|
|
121
|
use constant MAP_HEADERS_MID => 1; |
|
21
|
|
|
|
|
50
|
|
|
21
|
|
|
|
|
870
|
|
168
|
21
|
|
|
21
|
|
113
|
use constant MAP_HEADERS_FROMTOCC => 1; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
844
|
|
169
|
21
|
|
|
21
|
|
124
|
use constant MAP_HEADERS_USERAGENT => 1; |
|
21
|
|
|
|
|
47
|
|
|
21
|
|
|
|
|
866
|
|
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 |
172
|
21
|
|
|
21
|
|
116
|
use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
987
|
|
173
|
21
|
|
|
21
|
|
130
|
use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; |
|
21
|
|
|
|
|
38
|
|
|
21
|
|
|
|
|
2628
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# We store header-mined tokens in the db with a "HHeaderName:val" format. |
176
|
|
|
|
|
|
|
# some headers may contain lots of gibberish tokens, so allow a little basic |
177
|
|
|
|
|
|
|
# compression by mapping the header name at least here. these are the headers |
178
|
|
|
|
|
|
|
# which appear with the most frequency in my db. note: this doesn't have to |
179
|
|
|
|
|
|
|
# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing |
180
|
|
|
|
|
|
|
# tokens from multiple different headers may impact accuracy, so might as well |
181
|
|
|
|
|
|
|
# avoid this if possible. These are the top ones from my corpus, BTW (jm). |
182
|
|
|
|
|
|
|
our %HEADER_NAME_COMPRESSION = ( |
183
|
|
|
|
|
|
|
'Message-Id' => '*m', |
184
|
|
|
|
|
|
|
'Message-ID' => '*M', |
185
|
|
|
|
|
|
|
'Received' => '*r', |
186
|
|
|
|
|
|
|
'User-Agent' => '*u', |
187
|
|
|
|
|
|
|
'References' => '*f', |
188
|
|
|
|
|
|
|
'In-Reply-To' => '*i', |
189
|
|
|
|
|
|
|
'From' => '*F', |
190
|
|
|
|
|
|
|
'Reply-To' => '*R', |
191
|
|
|
|
|
|
|
'Return-Path' => '*p', |
192
|
|
|
|
|
|
|
'Return-path' => '*rp', |
193
|
|
|
|
|
|
|
'X-Mailer' => '*x', |
194
|
|
|
|
|
|
|
'X-Authentication-Warning' => '*a', |
195
|
|
|
|
|
|
|
'Organization' => '*o', |
196
|
|
|
|
|
|
|
'Organisation' => '*o', |
197
|
|
|
|
|
|
|
'Content-Type' => '*ct', |
198
|
|
|
|
|
|
|
'Content-Disposition' => '*cd', |
199
|
|
|
|
|
|
|
'Content-Transfer-Encoding' => '*ce', |
200
|
|
|
|
|
|
|
'x-spam-relays-trusted' => '*RT', |
201
|
|
|
|
|
|
|
'x-spam-relays-untrusted' => '*RU', |
202
|
|
|
|
|
|
|
); |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# How many seconds should the opportunistic_expire lock be valid? |
205
|
|
|
|
|
|
|
our $OPPORTUNISTIC_LOCK_VALID = 300; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# Should we use the Robinson f(w) equation from |
208
|
|
|
|
|
|
|
# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ? |
209
|
|
|
|
|
|
|
# It gives better results, in that scores are more likely to distribute |
210
|
|
|
|
|
|
|
# into the <0.5 range for nonspam and >0.5 for spam. |
211
|
21
|
|
|
21
|
|
147
|
use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1; |
|
21
|
|
|
|
|
41
|
|
|
21
|
|
|
|
|
1103
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# How many of the most significant tokens should we use for the p(w) |
214
|
|
|
|
|
|
|
# calculation? |
215
|
21
|
|
|
21
|
|
132
|
use constant N_SIGNIFICANT_TOKENS => 150; |
|
21
|
|
|
|
|
52
|
|
|
21
|
|
|
|
|
1150
|
|
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# How many significant tokens are required for a classifier score to |
218
|
|
|
|
|
|
|
# be considered usable? |
219
|
21
|
|
|
21
|
|
133
|
use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; |
|
21
|
|
|
|
|
42
|
|
|
21
|
|
|
|
|
1079
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# How long a token should we hold onto? (note: German speakers typically |
222
|
|
|
|
|
|
|
# will require a longer token than English ones.) |
223
|
21
|
|
|
21
|
|
138
|
use constant MAX_TOKEN_LENGTH => 15; |
|
21
|
|
|
|
|
40
|
|
|
21
|
|
|
|
|
165996
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
########################################################################### |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub new { |
228
|
62
|
|
|
62
|
1
|
275
|
my $class = shift; |
229
|
62
|
|
|
|
|
226
|
my ($main) = @_; |
230
|
|
|
|
|
|
|
|
231
|
62
|
|
33
|
|
|
446
|
$class = ref($class) || $class; |
232
|
62
|
|
|
|
|
427
|
my $self = $class->SUPER::new($main); |
233
|
62
|
|
|
|
|
166
|
bless ($self, $class); |
234
|
|
|
|
|
|
|
|
235
|
62
|
|
|
|
|
228
|
$self->{main} = $main; |
236
|
62
|
|
|
|
|
181
|
$self->{conf} = $main->{conf}; |
237
|
62
|
|
|
|
|
237
|
$self->{use_ignores} = 1; |
238
|
|
|
|
|
|
|
|
239
|
62
|
|
|
|
|
357
|
$self->register_eval_rule("check_bayes"); |
240
|
62
|
|
|
|
|
553
|
$self; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
sub finish { |
244
|
40
|
|
|
40
|
1
|
107
|
my $self = shift; |
245
|
40
|
100
|
|
|
|
196
|
if ($self->{store}) { |
246
|
39
|
|
|
|
|
297
|
$self->{store}->untie_db(); |
247
|
|
|
|
|
|
|
} |
248
|
40
|
|
|
|
|
99
|
%{$self} = (); |
|
40
|
|
|
|
|
305
|
|
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
########################################################################### |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
# Plugin hook. |
254
|
|
|
|
|
|
|
# Return this implementation object, for callers that need to know |
255
|
|
|
|
|
|
|
# it. TODO: callers shouldn't *need* to know it! |
256
|
|
|
|
|
|
|
# used only in test suite to get access to {store}, internal APIs. |
257
|
|
|
|
|
|
|
# |
258
|
1160
|
|
|
1160
|
0
|
2272
|
sub learner_get_implementation { return shift; } |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
########################################################################### |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
# Plugin hook. |
263
|
|
|
|
|
|
|
# Called in the parent process shortly before forking off child processes. |
264
|
|
|
|
|
|
|
sub prefork_init { |
265
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
266
|
|
|
|
|
|
|
|
267
|
0
|
0
|
0
|
|
|
0
|
if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) { |
268
|
0
|
|
|
|
|
0
|
$self->{store}->prefork_init; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
########################################################################### |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Plugin hook. |
275
|
|
|
|
|
|
|
# Called in a child process shortly after being spawned. |
276
|
|
|
|
|
|
|
sub spamd_child_init { |
277
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
278
|
|
|
|
|
|
|
|
279
|
0
|
0
|
0
|
|
|
0
|
if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) { |
280
|
0
|
|
|
|
|
0
|
$self->{store}->spamd_child_init; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
########################################################################### |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# Plugin hook. |
287
|
|
|
|
|
|
|
sub check_bayes { |
288
|
0
|
|
|
0
|
0
|
0
|
my ($self, $pms, $fulltext, $min, $max) = @_; |
289
|
|
|
|
|
|
|
|
290
|
0
|
0
|
|
|
|
0
|
return 0 if (!$self->{conf}->{use_learner}); |
291
|
0
|
0
|
0
|
|
|
0
|
return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules}); |
292
|
|
|
|
|
|
|
|
293
|
0
|
0
|
|
|
|
0
|
if (!exists ($pms->{bayes_score})) { |
294
|
0
|
|
|
|
|
0
|
my $timer = $self->{main}->time_method("check_bayes"); |
295
|
0
|
|
|
|
|
0
|
$pms->{bayes_score} = $self->scan($pms, $pms->{msg}); |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
0
|
0
|
0
|
|
|
0
|
if (defined $pms->{bayes_score} && |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
299
|
|
|
|
|
|
|
($min == 0 || $pms->{bayes_score} > $min) && |
300
|
|
|
|
|
|
|
($max eq "undef" || $pms->{bayes_score} <= $max)) |
301
|
|
|
|
|
|
|
{ |
302
|
0
|
0
|
|
|
|
0
|
if ($self->{conf}->{detailed_bayes_score}) { |
303
|
|
|
|
|
|
|
$pms->test_log(sprintf ("score: %3.4f, hits: %s", |
304
|
|
|
|
|
|
|
$pms->{bayes_score}, |
305
|
0
|
|
|
|
|
0
|
$pms->{bayes_hits})); |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
else { |
308
|
0
|
|
|
|
|
0
|
$pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score})); |
309
|
|
|
|
|
|
|
} |
310
|
0
|
|
|
|
|
0
|
return 1; |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
|
313
|
0
|
|
|
|
|
0
|
return 0; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
########################################################################### |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Plugin hook. |
319
|
|
|
|
|
|
|
sub learner_close { |
320
|
6
|
|
|
6
|
1
|
12
|
my ($self, $params) = @_; |
321
|
6
|
|
|
|
|
13
|
my $quiet = $params->{quiet}; |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# do a sanity check here. Weird things happen if we remain tied |
324
|
|
|
|
|
|
|
# after compiling; for example, spamd will never see that the |
325
|
|
|
|
|
|
|
# number of messages has reached the bayes-scanning threshold. |
326
|
6
|
50
|
|
|
|
24
|
if ($self->{store}->db_readable()) { |
327
|
0
|
0
|
|
|
|
0
|
warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet; |
328
|
0
|
|
|
|
|
0
|
$self->{store}->untie_db(); |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
########################################################################### |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# read configuration items to control bayes behaviour. Called by |
335
|
|
|
|
|
|
|
# BayesStore::read_db_configs(). |
336
|
|
|
|
|
|
|
sub read_db_configs { |
337
|
42
|
|
|
42
|
0
|
82
|
my ($self) = @_; |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# use of hapaxes. Set on bayes object, since it controls prob |
340
|
|
|
|
|
|
|
# computation. |
341
|
42
|
|
|
|
|
167
|
$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes}; |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
########################################################################### |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
sub ignore_message { |
346
|
0
|
|
|
0
|
0
|
0
|
my ($self,$PMS) = @_; |
347
|
|
|
|
|
|
|
|
348
|
0
|
0
|
|
|
|
0
|
return 0 unless $self->{use_ignores}; |
349
|
|
|
|
|
|
|
|
350
|
0
|
|
|
|
|
0
|
my $ig_from = $self->{main}->call_plugins ("check_wb_list", |
351
|
|
|
|
|
|
|
{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); |
352
|
0
|
|
|
|
|
0
|
my $ig_to = $self->{main}->call_plugins ("check_wb_list", |
353
|
|
|
|
|
|
|
{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); |
354
|
|
|
|
|
|
|
|
355
|
0
|
|
0
|
|
|
0
|
my $ignore = $ig_from || $ig_to; |
356
|
|
|
|
|
|
|
|
357
|
0
|
0
|
|
|
|
0
|
dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore; |
358
|
|
|
|
|
|
|
|
359
|
0
|
|
|
|
|
0
|
return $ignore; |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
########################################################################### |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# Plugin hook. |
365
|
|
|
|
|
|
|
sub learn_message { |
366
|
10
|
|
|
10
|
1
|
33
|
my ($self, $params) = @_; |
367
|
10
|
|
|
|
|
27
|
my $isspam = $params->{isspam}; |
368
|
10
|
|
|
|
|
23
|
my $msg = $params->{msg}; |
369
|
10
|
|
|
|
|
24
|
my $id = $params->{id}; |
370
|
|
|
|
|
|
|
|
371
|
10
|
50
|
|
|
|
35
|
if (!$self->{conf}->{use_bayes}) { return; } |
|
0
|
|
|
|
|
0
|
|
372
|
|
|
|
|
|
|
|
373
|
10
|
|
|
|
|
66
|
my $msgdata = $self->get_body_from_msg ($msg); |
374
|
10
|
|
|
|
|
25
|
my $ret; |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
eval { |
377
|
10
|
|
|
|
|
57
|
local $SIG{'__DIE__'}; # do not run user die() traps in here |
378
|
10
|
|
|
|
|
48
|
my $timer = $self->{main}->time_method("b_learn"); |
379
|
|
|
|
|
|
|
|
380
|
10
|
|
|
|
|
21
|
my $ok; |
381
|
10
|
100
|
|
|
|
41
|
if ($self->{main}->{learn_to_journal}) { |
382
|
|
|
|
|
|
|
# If we're going to learn to journal, we'll try going r/o first... |
383
|
|
|
|
|
|
|
# If that fails for some reason, let's try going r/w. This happens |
384
|
|
|
|
|
|
|
# if the DB doesn't exist yet. |
385
|
2
|
|
33
|
|
|
15
|
$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); |
386
|
|
|
|
|
|
|
} else { |
387
|
8
|
|
|
|
|
49
|
$ok = $self->{store}->tie_db_writable(); |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
|
390
|
10
|
100
|
|
|
|
36
|
if ($ok) { |
391
|
8
|
|
|
|
|
46
|
$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id); |
392
|
|
|
|
|
|
|
|
393
|
8
|
50
|
|
|
|
52
|
if (!$self->{main}->{learn_caller_will_untie}) { |
394
|
8
|
|
|
|
|
58
|
$self->{store}->untie_db(); |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
} |
397
|
10
|
|
|
|
|
83
|
1; |
398
|
10
|
50
|
|
|
|
25
|
} or do { # if we died, untie the dbs. |
399
|
0
|
0
|
|
|
|
0
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
|
0
|
|
|
|
|
0
|
|
400
|
0
|
|
|
|
|
0
|
$self->{store}->untie_db(); |
401
|
0
|
|
|
|
|
0
|
die "bayes: (in learn) $eval_stat\n"; |
402
|
|
|
|
|
|
|
}; |
403
|
|
|
|
|
|
|
|
404
|
10
|
|
|
|
|
61
|
return $ret; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# this function is trapped by the wrapper above |
408
|
|
|
|
|
|
|
sub _learn_trapped { |
409
|
8
|
|
|
8
|
|
26
|
my ($self, $isspam, $msg, $msgdata, $msgid) = @_; |
410
|
8
|
|
|
|
|
27
|
my @msgid = ( $msgid ); |
411
|
|
|
|
|
|
|
|
412
|
8
|
50
|
|
|
|
28
|
if (!defined $msgid) { |
413
|
8
|
|
|
|
|
34
|
@msgid = $self->get_msgid($msg); |
414
|
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
|
|
416
|
8
|
|
|
|
|
44
|
foreach my $msgid_t ( @msgid ) { |
417
|
12
|
|
|
|
|
75
|
my $seen = $self->{store}->seen_get ($msgid_t); |
418
|
|
|
|
|
|
|
|
419
|
12
|
100
|
|
|
|
63
|
if (defined ($seen)) { |
420
|
4
|
100
|
66
|
|
|
72
|
if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { |
|
|
50
|
33
|
|
|
|
|
|
|
|
66
|
|
|
|
|
421
|
2
|
|
|
|
|
18
|
dbg("bayes: $msgid_t already learnt correctly, not learning twice"); |
422
|
2
|
|
|
|
|
9
|
return 0; |
423
|
|
|
|
|
|
|
} elsif ($seen !~ /^[hs]$/) { |
424
|
0
|
|
|
|
|
0
|
warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored"); |
425
|
|
|
|
|
|
|
} else { |
426
|
|
|
|
|
|
|
# bug 3704: If the message was already learned, don't try learning it again. |
427
|
|
|
|
|
|
|
# this prevents, for instance, manually learning as spam, then autolearning |
428
|
|
|
|
|
|
|
# as ham, or visa versa. |
429
|
2
|
50
|
|
|
|
15
|
if ($self->{main}->{learn_no_relearn}) { |
430
|
0
|
|
|
|
|
0
|
dbg("bayes: $msgid_t already learnt as opposite, not re-learning"); |
431
|
0
|
|
|
|
|
0
|
return 0; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
2
|
|
|
|
|
19
|
dbg("bayes: $msgid_t already learnt as opposite, forgetting first"); |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# kluge so that forget() won't untie the db on us ... |
437
|
2
|
|
|
|
|
8
|
my $orig = $self->{main}->{learn_caller_will_untie}; |
438
|
2
|
|
|
|
|
6
|
$self->{main}->{learn_caller_will_untie} = 1; |
439
|
|
|
|
|
|
|
|
440
|
2
|
|
|
|
|
17
|
my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg); |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# reset the value post-forget() ... |
443
|
2
|
|
|
|
|
11
|
$self->{main}->{learn_caller_will_untie} = $orig; |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# forget() gave us a fatal error, so propagate that up |
446
|
2
|
50
|
|
|
|
12
|
if ($fatal) { |
447
|
0
|
|
|
|
|
0
|
dbg("bayes: forget() returned a fatal error, so learn() will too"); |
448
|
0
|
|
|
|
|
0
|
return; |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# we're only going to have seen this once, so stop if it's been |
453
|
|
|
|
|
|
|
# seen already |
454
|
2
|
|
|
|
|
8
|
last; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
# Now that we're sure we haven't seen this message before ... |
459
|
6
|
|
|
|
|
25
|
$msgid = $msgid[0]; |
460
|
|
|
|
|
|
|
|
461
|
6
|
|
|
|
|
60
|
my $msgatime = $msg->receive_date(); |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# If the message atime comes back as being more than 1 day in the |
464
|
|
|
|
|
|
|
# future, something's messed up and we should revert to current time as |
465
|
|
|
|
|
|
|
# a safety measure. |
466
|
|
|
|
|
|
|
# |
467
|
6
|
50
|
|
|
|
36
|
$msgatime = time if ( $msgatime - time > 86400 ); |
468
|
|
|
|
|
|
|
|
469
|
6
|
|
|
|
|
50
|
my $tokens = $self->tokenize($msg, $msgdata); |
470
|
|
|
|
|
|
|
|
471
|
6
|
|
|
|
|
15
|
{ my $timer = $self->{main}->time_method('b_count_change'); |
|
6
|
|
|
|
|
41
|
|
472
|
6
|
100
|
|
|
|
30
|
if ($isspam) { |
473
|
4
|
|
|
|
|
59
|
$self->{store}->nspam_nham_change(1, 0); |
474
|
4
|
|
|
|
|
48
|
$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); |
475
|
|
|
|
|
|
|
} else { |
476
|
2
|
|
|
|
|
13
|
$self->{store}->nspam_nham_change(0, 1); |
477
|
2
|
|
|
|
|
19
|
$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); |
478
|
|
|
|
|
|
|
} |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
|
481
|
6
|
100
|
|
|
|
176
|
$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); |
482
|
6
|
|
|
|
|
56
|
$self->{store}->cleanup(); |
483
|
|
|
|
|
|
|
|
484
|
6
|
|
|
|
|
83
|
$self->{main}->call_plugins("bayes_learn", { toksref => $tokens, |
485
|
|
|
|
|
|
|
isspam => $isspam, |
486
|
|
|
|
|
|
|
msgid => $msgid, |
487
|
|
|
|
|
|
|
msgatime => $msgatime, |
488
|
|
|
|
|
|
|
}); |
489
|
|
|
|
|
|
|
|
490
|
6
|
|
|
|
|
72
|
dbg("bayes: learned '$msgid', atime: $msgatime"); |
491
|
|
|
|
|
|
|
|
492
|
6
|
|
|
|
|
410
|
1; |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
########################################################################### |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# Plugin hook. |
498
|
|
|
|
|
|
|
sub forget_message { |
499
|
4
|
|
|
4
|
1
|
15
|
my ($self, $params) = @_; |
500
|
4
|
|
|
|
|
9
|
my $msg = $params->{msg}; |
501
|
4
|
|
|
|
|
11
|
my $id = $params->{id}; |
502
|
|
|
|
|
|
|
|
503
|
4
|
50
|
|
|
|
16
|
if (!$self->{conf}->{use_bayes}) { return; } |
|
0
|
|
|
|
|
0
|
|
504
|
|
|
|
|
|
|
|
505
|
4
|
|
|
|
|
20
|
my $msgdata = $self->get_body_from_msg ($msg); |
506
|
4
|
|
|
|
|
13
|
my $ret; |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
# we still tie for writing here, since we write to the seen db |
509
|
|
|
|
|
|
|
# synchronously |
510
|
|
|
|
|
|
|
eval { |
511
|
4
|
|
|
|
|
21
|
local $SIG{'__DIE__'}; # do not run user die() traps in here |
512
|
4
|
|
|
|
|
19
|
my $timer = $self->{main}->time_method("b_learn"); |
513
|
|
|
|
|
|
|
|
514
|
4
|
|
|
|
|
10
|
my $ok; |
515
|
4
|
50
|
|
|
|
16
|
if ($self->{main}->{learn_to_journal}) { |
516
|
|
|
|
|
|
|
# If we're going to learn to journal, we'll try going r/o first... |
517
|
|
|
|
|
|
|
# If that fails for some reason, let's try going r/w. This happens |
518
|
|
|
|
|
|
|
# if the DB doesn't exist yet. |
519
|
0
|
|
0
|
|
|
0
|
$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); |
520
|
|
|
|
|
|
|
} else { |
521
|
4
|
|
|
|
|
20
|
$ok = $self->{store}->tie_db_writable(); |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
4
|
50
|
|
|
|
18
|
if ($ok) { |
525
|
4
|
|
|
|
|
21
|
$ret = $self->_forget_trapped ($msg, $msgdata, $id); |
526
|
|
|
|
|
|
|
|
527
|
4
|
100
|
|
|
|
34
|
if (!$self->{main}->{learn_caller_will_untie}) { |
528
|
2
|
|
|
|
|
15
|
$self->{store}->untie_db(); |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
} |
531
|
4
|
|
|
|
|
35
|
1; |
532
|
4
|
50
|
|
|
|
10
|
} or do { # if we died, untie the dbs. |
533
|
0
|
0
|
|
|
|
0
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
|
0
|
|
|
|
|
0
|
|
534
|
0
|
|
|
|
|
0
|
$self->{store}->untie_db(); |
535
|
0
|
|
|
|
|
0
|
die "bayes: (in forget) $eval_stat\n"; |
536
|
|
|
|
|
|
|
}; |
537
|
|
|
|
|
|
|
|
538
|
4
|
|
|
|
|
62
|
return $ret; |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
# this function is trapped by the wrapper above |
542
|
|
|
|
|
|
|
sub _forget_trapped { |
543
|
4
|
|
|
4
|
|
15
|
my ($self, $msg, $msgdata, $msgid) = @_; |
544
|
4
|
|
|
|
|
12
|
my @msgid = ( $msgid ); |
545
|
4
|
|
|
|
|
7
|
my $isspam; |
546
|
|
|
|
|
|
|
|
547
|
4
|
50
|
|
|
|
18
|
if (!defined $msgid) { |
548
|
4
|
|
|
|
|
13
|
@msgid = $self->get_msgid($msg); |
549
|
|
|
|
|
|
|
} |
550
|
|
|
|
|
|
|
|
551
|
4
|
|
|
|
|
25
|
while( $msgid = shift @msgid ) { |
552
|
4
|
|
|
|
|
25
|
my $seen = $self->{store}->seen_get ($msgid); |
553
|
|
|
|
|
|
|
|
554
|
4
|
50
|
|
|
|
29
|
if (defined ($seen)) { |
555
|
4
|
100
|
|
|
|
25
|
if ($seen eq 's') { |
|
|
50
|
|
|
|
|
|
556
|
2
|
|
|
|
|
8
|
$isspam = 1; |
557
|
|
|
|
|
|
|
} elsif ($seen eq 'h') { |
558
|
2
|
|
|
|
|
7
|
$isspam = 0; |
559
|
|
|
|
|
|
|
} else { |
560
|
0
|
|
|
|
|
0
|
dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); |
561
|
0
|
|
|
|
|
0
|
return 0; |
562
|
|
|
|
|
|
|
} |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
# messages should only be learned once, so stop if we find a msgid |
565
|
|
|
|
|
|
|
# which was seen before |
566
|
4
|
|
|
|
|
14
|
last; |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
else { |
569
|
0
|
|
|
|
|
0
|
dbg("bayes: forget: msgid $msgid not learnt, ignored"); |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# This message wasn't learnt before, so return |
574
|
4
|
50
|
|
|
|
18
|
if (!defined $isspam) { |
|
|
100
|
|
|
|
|
|
575
|
0
|
|
|
|
|
0
|
dbg("bayes: forget: no msgid from this message has been learnt, skipping message"); |
576
|
0
|
|
|
|
|
0
|
return 0; |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
elsif ($isspam) { |
579
|
2
|
|
|
|
|
11
|
$self->{store}->nspam_nham_change (-1, 0); |
580
|
|
|
|
|
|
|
} |
581
|
|
|
|
|
|
|
else { |
582
|
2
|
|
|
|
|
12
|
$self->{store}->nspam_nham_change (0, -1); |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
|
585
|
4
|
|
|
|
|
34
|
my $tokens = $self->tokenize($msg, $msgdata); |
586
|
|
|
|
|
|
|
|
587
|
4
|
100
|
|
|
|
20
|
if ($isspam) { |
588
|
2
|
|
|
|
|
16
|
$self->{store}->multi_tok_count_change (-1, 0, $tokens); |
589
|
|
|
|
|
|
|
} else { |
590
|
2
|
|
|
|
|
18
|
$self->{store}->multi_tok_count_change (0, -1, $tokens); |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
|
593
|
4
|
|
|
|
|
97
|
$self->{store}->seen_delete ($msgid); |
594
|
4
|
|
|
|
|
44
|
$self->{store}->cleanup(); |
595
|
|
|
|
|
|
|
|
596
|
4
|
|
|
|
|
58
|
$self->{main}->call_plugins("bayes_forget", { toksref => $tokens, |
597
|
|
|
|
|
|
|
isspam => $isspam, |
598
|
|
|
|
|
|
|
msgid => $msgid, |
599
|
|
|
|
|
|
|
}); |
600
|
|
|
|
|
|
|
|
601
|
4
|
|
|
|
|
330
|
1; |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
########################################################################### |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
# Plugin hook. |
607
|
|
|
|
|
|
|
sub learner_sync { |
608
|
2
|
|
|
2
|
1
|
9
|
my ($self, $params) = @_; |
609
|
2
|
50
|
|
|
|
12
|
if (!$self->{conf}->{use_bayes}) { return 0; } |
|
0
|
|
|
|
|
0
|
|
610
|
2
|
|
|
|
|
22
|
dbg("bayes: bayes journal sync starting"); |
611
|
2
|
|
|
|
|
37
|
$self->{store}->sync($params); |
612
|
2
|
|
|
|
|
14
|
dbg("bayes: bayes journal sync completed"); |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
########################################################################### |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# Plugin hook. |
618
|
|
|
|
|
|
|
sub learner_expire_old_training { |
619
|
0
|
|
|
0
|
1
|
0
|
my ($self, $params) = @_; |
620
|
0
|
0
|
|
|
|
0
|
if (!$self->{conf}->{use_bayes}) { return 0; } |
|
0
|
|
|
|
|
0
|
|
621
|
0
|
|
|
|
|
0
|
dbg("bayes: expiry starting"); |
622
|
0
|
|
|
|
|
0
|
my $timer = $self->{main}->time_method("expire_bayes"); |
623
|
0
|
|
|
|
|
0
|
$self->{store}->expire_old_tokens($params); |
624
|
0
|
|
|
|
|
0
|
dbg("bayes: expiry completed"); |
625
|
|
|
|
|
|
|
} |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
########################################################################### |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
# Plugin hook. |
630
|
|
|
|
|
|
|
# Check to make sure we can tie() the DB, and we have enough entries to do a scan |
631
|
|
|
|
|
|
|
# if we're told the caller will untie(), go ahead and leave the db tied. |
632
|
|
|
|
|
|
|
sub learner_is_scan_available { |
633
|
145
|
|
|
145
|
1
|
454
|
my ($self, $params) = @_; |
634
|
|
|
|
|
|
|
|
635
|
145
|
50
|
|
|
|
608
|
return 0 unless $self->{conf}->{use_bayes}; |
636
|
145
|
100
|
|
|
|
774
|
return 0 unless $self->{store}->tie_db_readonly(); |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
# We need the DB to stay tied, so if the journal sync occurs, don't untie! |
639
|
6
|
|
|
|
|
31
|
my $caller_untie = $self->{main}->{learn_caller_will_untie}; |
640
|
6
|
|
|
|
|
29
|
$self->{main}->{learn_caller_will_untie} = 1; |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# Do a journal sync if necessary. Do this before the nspam_nham_get() |
643
|
|
|
|
|
|
|
# call since the sync may cause an update in the number of messages |
644
|
|
|
|
|
|
|
# learnt. |
645
|
6
|
|
|
|
|
35
|
$self->_opportunistic_calls(1); |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
# Reset the variable appropriately |
648
|
6
|
|
|
|
|
25
|
$self->{main}->{learn_caller_will_untie} = $caller_untie; |
649
|
|
|
|
|
|
|
|
650
|
6
|
|
|
|
|
53
|
my ($ns, $nn) = $self->{store}->nspam_nham_get(); |
651
|
|
|
|
|
|
|
|
652
|
6
|
50
|
|
|
|
47
|
if ($ns < $self->{conf}->{bayes_min_spam_num}) { |
653
|
0
|
|
|
|
|
0
|
dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num}); |
654
|
0
|
0
|
|
|
|
0
|
if (!$self->{main}->{learn_caller_will_untie}) { |
655
|
0
|
|
|
|
|
0
|
$self->{store}->untie_db(); |
656
|
|
|
|
|
|
|
} |
657
|
0
|
|
|
|
|
0
|
return 0; |
658
|
|
|
|
|
|
|
} |
659
|
6
|
50
|
|
|
|
29
|
if ($nn < $self->{conf}->{bayes_min_ham_num}) { |
660
|
0
|
|
|
|
|
0
|
dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num}); |
661
|
0
|
0
|
|
|
|
0
|
if (!$self->{main}->{learn_caller_will_untie}) { |
662
|
0
|
|
|
|
|
0
|
$self->{store}->untie_db(); |
663
|
|
|
|
|
|
|
} |
664
|
0
|
|
|
|
|
0
|
return 0; |
665
|
|
|
|
|
|
|
} |
666
|
|
|
|
|
|
|
|
667
|
6
|
|
|
|
|
38
|
return 1; |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
########################################################################### |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
sub scan { |
673
|
4
|
|
|
4
|
0
|
18
|
my ($self, $permsgstatus, $msg) = @_; |
674
|
4
|
|
|
|
|
9
|
my $score; |
675
|
|
|
|
|
|
|
|
676
|
4
|
50
|
|
|
|
18
|
return unless $self->{conf}->{use_learner}; |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
# When we're doing a scan, we'll guarantee that we'll do the untie, |
679
|
|
|
|
|
|
|
# so override the global setting until we're done. |
680
|
4
|
|
|
|
|
10
|
my $caller_untie = $self->{main}->{learn_caller_will_untie}; |
681
|
4
|
|
|
|
|
9
|
$self->{main}->{learn_caller_will_untie} = 1; |
682
|
|
|
|
|
|
|
|
683
|
4
|
50
|
|
|
|
42
|
goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus)); |
684
|
|
|
|
|
|
|
|
685
|
4
|
50
|
|
|
|
21
|
goto skip unless $self->learner_is_scan_available(); |
686
|
|
|
|
|
|
|
|
687
|
4
|
|
|
|
|
21
|
my ($ns, $nn) = $self->{store}->nspam_nham_get(); |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() |
690
|
|
|
|
|
|
|
## $self->{raw_counts} = " ns=$ns nn=$nn "; |
691
|
|
|
|
|
|
|
## } |
692
|
|
|
|
|
|
|
|
693
|
4
|
|
|
|
|
35
|
dbg("bayes: corpus size: nspam = $ns, nham = $nn"); |
694
|
|
|
|
|
|
|
|
695
|
4
|
|
|
|
|
9
|
my $msgtokens; |
696
|
4
|
|
|
|
|
7
|
{ my $timer = $self->{main}->time_method('b_tokenize'); |
|
4
|
|
|
|
|
25
|
|
697
|
4
|
|
|
|
|
25
|
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); |
698
|
4
|
|
|
|
|
41
|
$msgtokens = $self->tokenize($msg, $msgdata); |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
|
701
|
4
|
|
|
|
|
12
|
my $tokensdata; |
702
|
4
|
|
|
|
|
11
|
{ my $timer = $self->{main}->time_method('b_tok_get_all'); |
|
4
|
|
|
|
|
25
|
|
703
|
4
|
|
|
|
|
11
|
$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); |
|
4
|
|
|
|
|
218
|
|
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
|
706
|
4
|
|
|
|
|
75
|
my $timer_compute_prob = $self->{main}->time_method('b_comp_prob'); |
707
|
|
|
|
|
|
|
|
708
|
4
|
|
|
|
|
26
|
my $probabilities_ref = |
709
|
|
|
|
|
|
|
$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn); |
710
|
|
|
|
|
|
|
|
711
|
4
|
|
|
|
|
11
|
my %pw; |
712
|
4
|
|
|
|
|
7
|
foreach my $tokendata (@{$tokensdata}) { |
|
4
|
|
|
|
|
22
|
|
713
|
1104
|
|
|
|
|
1618
|
my $prob = shift(@$probabilities_ref); |
714
|
1104
|
100
|
|
|
|
2015
|
next unless defined $prob; |
715
|
502
|
|
|
|
|
594
|
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; |
|
502
|
|
|
|
|
863
|
|
716
|
502
|
|
|
|
|
1666
|
$pw{$token} = { |
717
|
|
|
|
|
|
|
prob => $prob, |
718
|
|
|
|
|
|
|
spam_count => $tok_spam, |
719
|
|
|
|
|
|
|
ham_count => $tok_ham, |
720
|
|
|
|
|
|
|
atime => $atime |
721
|
|
|
|
|
|
|
}; |
722
|
|
|
|
|
|
|
} |
723
|
|
|
|
|
|
|
|
724
|
4
|
|
|
|
|
94
|
my @pw_keys = keys %pw; |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# If none of the tokens were found in the DB, we're going to skip |
727
|
|
|
|
|
|
|
# this message... |
728
|
4
|
50
|
|
|
|
16
|
if (!@pw_keys) { |
729
|
0
|
|
|
|
|
0
|
dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database"); |
730
|
0
|
|
|
|
|
0
|
goto skip; |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
|
733
|
4
|
|
|
|
|
14
|
my $tcount_total = keys %{$msgtokens}; |
|
4
|
|
|
|
|
12
|
|
734
|
4
|
|
|
|
|
11
|
my $tcount_learned = scalar @pw_keys; |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
# Figure out the message receive time (used as atime below) |
737
|
|
|
|
|
|
|
# If the message atime comes back as being in the future, something's |
738
|
|
|
|
|
|
|
# messed up and we should revert to current time as a safety measure. |
739
|
|
|
|
|
|
|
# |
740
|
4
|
|
|
|
|
29
|
my $msgatime = $msg->receive_date(); |
741
|
4
|
|
|
|
|
16
|
my $now = time; |
742
|
4
|
50
|
|
|
|
17
|
$msgatime = $now if ( $msgatime > $now ); |
743
|
|
|
|
|
|
|
|
744
|
4
|
|
|
|
|
18
|
my @touch_tokens; |
745
|
4
|
|
|
|
|
29
|
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; |
746
|
4
|
|
|
|
|
25
|
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; |
747
|
|
|
|
|
|
|
|
748
|
4
|
|
|
|
|
373
|
my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys); |
749
|
4
|
|
|
|
|
43
|
my $log_each_token = (would_log('dbg', 'bayes') > 1); |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
# now take the most significant tokens and calculate probs using |
752
|
|
|
|
|
|
|
# Robinson's formula. |
753
|
|
|
|
|
|
|
|
754
|
4
|
|
|
|
|
35
|
@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys; |
|
2963
|
|
|
|
|
4014
|
|
755
|
|
|
|
|
|
|
|
756
|
4
|
100
|
|
|
|
28
|
if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 } |
|
2
|
|
|
|
|
22
|
|
757
|
|
|
|
|
|
|
|
758
|
4
|
|
|
|
|
11
|
my @sorted; |
759
|
4
|
|
|
|
|
14
|
foreach my $tok (@pw_keys) { |
760
|
414
|
100
|
|
|
|
790
|
next if $tok_strength{$tok} < |
761
|
|
|
|
|
|
|
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; |
762
|
|
|
|
|
|
|
|
763
|
282
|
|
|
|
|
373
|
my $pw_tok = $pw{$tok}; |
764
|
282
|
|
|
|
|
376
|
my $pw_prob = $pw_tok->{prob}; |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
# What's more expensive, scanning headers for HAMMYTOKENS and |
767
|
|
|
|
|
|
|
# SPAMMYTOKENS tags that aren't there or collecting data that |
768
|
|
|
|
|
|
|
# won't be used? Just collecting the data is certainly simpler. |
769
|
|
|
|
|
|
|
# |
770
|
282
|
|
50
|
|
|
748
|
my $raw_token = $msgtokens->{$tok} || "(unknown)"; |
771
|
282
|
|
|
|
|
418
|
my $s = $pw_tok->{spam_count}; |
772
|
282
|
|
|
|
|
358
|
my $n = $pw_tok->{ham_count}; |
773
|
282
|
|
|
|
|
360
|
my $a = $pw_tok->{atime}; |
774
|
|
|
|
|
|
|
|
775
|
282
|
100
|
|
|
|
353
|
push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy }, |
|
282
|
|
|
|
|
919
|
|
776
|
|
|
|
|
|
|
[$raw_token, $pw_prob, $s, $n, $a] ); |
777
|
|
|
|
|
|
|
|
778
|
282
|
|
|
|
|
508
|
push(@sorted, $pw_prob); |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
# update the atime on this token, it proved useful |
781
|
282
|
|
|
|
|
387
|
push(@touch_tokens, $tok); |
782
|
|
|
|
|
|
|
|
783
|
282
|
50
|
|
|
|
793
|
if ($log_each_token) { |
784
|
0
|
|
|
|
|
0
|
dbg("bayes: token '$raw_token' => $pw_prob"); |
785
|
|
|
|
|
|
|
} |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
4
|
50
|
50
|
|
|
49
|
if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && |
789
|
|
|
|
|
|
|
$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) |
790
|
|
|
|
|
|
|
{ |
791
|
0
|
|
|
|
|
0
|
dbg("bayes: cannot use bayes on this message; not enough usable tokens found"); |
792
|
0
|
|
|
|
|
0
|
goto skip; |
793
|
|
|
|
|
|
|
} |
794
|
|
|
|
|
|
|
|
795
|
4
|
|
|
|
|
30
|
$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); |
796
|
4
|
|
|
|
|
23
|
undef $timer_compute_prob; # end a timing section |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
# Couldn't come up with a probability? |
799
|
4
|
50
|
|
|
|
16
|
goto skip unless defined $score; |
800
|
|
|
|
|
|
|
|
801
|
4
|
|
|
|
|
56
|
dbg("bayes: score = $score"); |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
# no need to call tok_touch_all unless there were significant |
804
|
|
|
|
|
|
|
# tokens and a score was returned |
805
|
|
|
|
|
|
|
# we don't really care about the return value here |
806
|
|
|
|
|
|
|
|
807
|
4
|
|
|
|
|
12
|
{ my $timer = $self->{main}->time_method('b_tok_touch_all'); |
|
4
|
|
|
|
|
28
|
|
808
|
4
|
|
|
|
|
62
|
$self->{store}->tok_touch_all(\@touch_tokens, $msgatime); |
809
|
|
|
|
|
|
|
} |
810
|
|
|
|
|
|
|
|
811
|
4
|
|
|
|
|
19
|
my $timer_finish = $self->{main}->time_method('b_finish'); |
812
|
|
|
|
|
|
|
|
813
|
4
|
|
|
|
|
16
|
$permsgstatus->{bayes_nspam} = $ns; |
814
|
4
|
|
|
|
|
12
|
$permsgstatus->{bayes_nham} = $nn; |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() |
817
|
|
|
|
|
|
|
## print "#Bayes-Raw-Counts: $self->{raw_counts}\n"; |
818
|
|
|
|
|
|
|
## } |
819
|
|
|
|
|
|
|
|
820
|
4
|
|
|
|
|
47
|
$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, |
821
|
|
|
|
|
|
|
probsref => \%pw, |
822
|
|
|
|
|
|
|
score => $score, |
823
|
|
|
|
|
|
|
msgatime => $msgatime, |
824
|
|
|
|
|
|
|
significant_tokens => \@touch_tokens, |
825
|
|
|
|
|
|
|
}); |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
skip: |
828
|
4
|
50
|
|
|
|
22
|
if (!defined $score) { |
829
|
0
|
|
|
|
|
0
|
dbg("bayes: not scoring message, returning undef"); |
830
|
|
|
|
|
|
|
} |
831
|
|
|
|
|
|
|
|
832
|
4
|
|
|
|
|
16
|
undef $timer_compute_prob; # end a timing section if still running |
833
|
4
|
50
|
|
|
|
13
|
if (!defined $timer_finish) { |
834
|
4
|
|
|
|
|
24
|
$timer_finish = $self->{main}->time_method('b_finish'); |
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
# Take any opportunistic actions we can take |
838
|
4
|
50
|
|
|
|
24
|
if ($self->{main}->{opportunistic_expire_check_only}) { |
839
|
|
|
|
|
|
|
# we're supposed to report on expiry only -- so do the |
840
|
|
|
|
|
|
|
# _opportunistic_calls() run for the journal only. |
841
|
0
|
|
|
|
|
0
|
$self->_opportunistic_calls(1); |
842
|
0
|
|
|
|
|
0
|
$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); |
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
else { |
845
|
4
|
|
|
|
|
20
|
$self->_opportunistic_calls(); |
846
|
|
|
|
|
|
|
} |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
# Do any cleanup we need to do |
849
|
4
|
|
|
|
|
25
|
$self->{store}->cleanup(); |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
# Reset the value accordingly |
852
|
4
|
|
|
|
|
20
|
$self->{main}->{learn_caller_will_untie} = $caller_untie; |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
# If our caller won't untie the db, we need to do it. |
855
|
4
|
50
|
|
|
|
19
|
if (!$caller_untie) { |
856
|
4
|
|
|
|
|
33
|
$self->{store}->untie_db(); |
857
|
|
|
|
|
|
|
} |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
$permsgstatus->set_tag ('BAYESTCHAMMY', |
860
|
4
|
50
|
|
|
|
19
|
($tinfo_hammy ? scalar @{$tinfo_hammy} : 0)); |
|
4
|
|
|
|
|
31
|
|
861
|
|
|
|
|
|
|
$permsgstatus->set_tag ('BAYESTCSPAMMY', |
862
|
4
|
50
|
|
|
|
15
|
($tinfo_spammy ? scalar @{$tinfo_spammy} : 0)); |
|
4
|
|
|
|
|
17
|
|
863
|
4
|
|
|
|
|
22
|
$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned); |
864
|
4
|
|
|
|
|
15
|
$permsgstatus->set_tag ('BAYESTC', $tcount_total); |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
$permsgstatus->set_tag ('HAMMYTOKENS', sub { |
867
|
0
|
|
|
0
|
|
0
|
my $pms = shift; |
868
|
|
|
|
|
|
|
$self->bayes_report_make_list |
869
|
0
|
|
|
|
|
0
|
($pms, $pms->{bayes_token_info_hammy}, shift); |
870
|
4
|
|
|
|
|
39
|
}); |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
$permsgstatus->set_tag ('SPAMMYTOKENS', sub { |
873
|
0
|
|
|
0
|
|
0
|
my $pms = shift; |
874
|
|
|
|
|
|
|
$self->bayes_report_make_list |
875
|
0
|
|
|
|
|
0
|
($pms, $pms->{bayes_token_info_spammy}, shift); |
876
|
4
|
|
|
|
|
73
|
}); |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
$permsgstatus->set_tag ('TOKENSUMMARY', sub { |
879
|
0
|
|
|
0
|
|
0
|
my $pms = shift; |
880
|
0
|
0
|
|
|
|
0
|
if ( defined $pms->{tag_data}{BAYESTC} ) |
881
|
|
|
|
|
|
|
{ |
882
|
|
|
|
|
|
|
my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED} |
883
|
|
|
|
|
|
|
- $pms->{tag_data}{BAYESTCSPAMMY} |
884
|
0
|
|
|
|
|
0
|
- $pms->{tag_data}{BAYESTCHAMMY}; |
885
|
|
|
|
|
|
|
my $tcount_new = $pms->{tag_data}{BAYESTC} |
886
|
0
|
|
|
|
|
0
|
- $pms->{tag_data}{BAYESTCLEARNED}; |
887
|
0
|
|
|
|
|
0
|
"Tokens: new, $tcount_new; " |
888
|
|
|
|
|
|
|
."hammy, $pms->{tag_data}{BAYESTCHAMMY}; " |
889
|
|
|
|
|
|
|
."neutral, $tcount_neutral; " |
890
|
|
|
|
|
|
|
."spammy, $pms->{tag_data}{BAYESTCSPAMMY}." |
891
|
|
|
|
|
|
|
} else { |
892
|
0
|
|
|
|
|
0
|
"Bayes not run."; |
893
|
|
|
|
|
|
|
} |
894
|
4
|
|
|
|
|
43
|
}); |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
|
897
|
4
|
|
|
|
|
749
|
return $score; |
898
|
|
|
|
|
|
|
} |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
########################################################################### |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
# Plugin hook. |
903
|
|
|
|
|
|
|
sub learner_dump_database { |
904
|
0
|
|
|
0
|
1
|
0
|
my ($self, $params) = @_; |
905
|
0
|
|
|
|
|
0
|
my $magic = $params->{magic}; |
906
|
0
|
|
|
|
|
0
|
my $toks = $params->{toks}; |
907
|
0
|
|
|
|
|
0
|
my $regex = $params->{regex}; |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
# allow dump to occur even if use_bayes disables everything else ... |
910
|
|
|
|
|
|
|
#return 0 unless $self->{conf}->{use_bayes}; |
911
|
0
|
0
|
|
|
|
0
|
return 0 unless $self->{store}->tie_db_readonly(); |
912
|
|
|
|
|
|
|
|
913
|
0
|
|
|
|
|
0
|
my @vars = $self->{store}->get_storage_variables(); |
914
|
|
|
|
|
|
|
|
915
|
0
|
|
|
|
|
0
|
my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; |
916
|
|
|
|
|
|
|
|
917
|
0
|
|
|
|
|
0
|
my $template = '%3.3f %10u %10u %10u %s'."\n"; |
918
|
|
|
|
|
|
|
|
919
|
0
|
0
|
|
|
|
0
|
if ( $magic ) { |
920
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version') |
921
|
|
|
|
|
|
|
or die "Error writing: $!"; |
922
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam') |
923
|
|
|
|
|
|
|
or die "Error writing: $!"; |
924
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $nh, 0, 'non-token data: nham') |
925
|
|
|
|
|
|
|
or die "Error writing: $!"; |
926
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens') |
927
|
|
|
|
|
|
|
or die "Error writing: $!"; |
928
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime') |
929
|
|
|
|
|
|
|
or die "Error writing: $!"; |
930
|
0
|
0
|
|
|
|
0
|
if ( $bv >= 2 ) { |
931
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime') |
932
|
|
|
|
|
|
|
or die "Error writing: $!"; |
933
|
|
|
|
|
|
|
} |
934
|
0
|
0
|
|
|
|
0
|
if ( $bv < 2 ) { |
935
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') |
936
|
|
|
|
|
|
|
or die "Error writing: $!"; |
937
|
|
|
|
|
|
|
} |
938
|
0
|
0
|
|
|
|
0
|
if ( $bv >= 2 ) { |
939
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') |
940
|
|
|
|
|
|
|
or die "Error writing: $!"; |
941
|
|
|
|
|
|
|
} |
942
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime') |
943
|
|
|
|
|
|
|
or die "Error writing: $!"; |
944
|
0
|
0
|
|
|
|
0
|
if ( $bv >= 2 ) { |
945
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta') |
946
|
|
|
|
|
|
|
or die "Error writing: $!"; |
947
|
|
|
|
|
|
|
|
948
|
0
|
0
|
|
|
|
0
|
printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count') |
949
|
|
|
|
|
|
|
or die "Error writing: $!"; |
950
|
|
|
|
|
|
|
} |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
|
953
|
0
|
0
|
|
|
|
0
|
if ( $toks ) { |
954
|
|
|
|
|
|
|
# let the store sort out the db_toks |
955
|
0
|
|
|
|
|
0
|
$self->{store}->dump_db_toks($template, $regex, @vars); |
956
|
|
|
|
|
|
|
} |
957
|
|
|
|
|
|
|
|
958
|
0
|
0
|
|
|
|
0
|
if (!$self->{main}->{learn_caller_will_untie}) { |
959
|
0
|
|
|
|
|
0
|
$self->{store}->untie_db(); |
960
|
|
|
|
|
|
|
} |
961
|
0
|
|
|
|
|
0
|
return 1; |
962
|
|
|
|
|
|
|
} |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
########################################################################### |
965
|
|
|
|
|
|
|
# TODO: these are NOT public, but the test suite needs to call them. |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
sub get_msgid { |
968
|
14
|
|
|
14
|
0
|
55
|
my ($self, $msg) = @_; |
969
|
|
|
|
|
|
|
|
970
|
14
|
|
|
|
|
25
|
my @msgid; |
971
|
|
|
|
|
|
|
|
972
|
14
|
|
|
|
|
54
|
my $msgid = $msg->get_header("Message-Id"); |
973
|
14
|
50
|
33
|
|
|
363
|
if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) { |
|
|
|
33
|
|
|
|
|
974
|
|
|
|
|
|
|
# remove \r and < and > prefix/suffixes |
975
|
14
|
|
|
|
|
56
|
chomp $msgid; |
976
|
14
|
|
|
|
|
86
|
$msgid =~ s/^<//; $msgid =~ s/>.*$//g; |
|
14
|
|
|
|
|
95
|
|
977
|
14
|
|
|
|
|
57
|
push(@msgid, $msgid); |
978
|
|
|
|
|
|
|
} |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
# Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
# Use sha1_hex(Date: and top N bytes of body) |
983
|
|
|
|
|
|
|
# where N is MIN(1024 bytes, 1/2 of body length) |
984
|
|
|
|
|
|
|
# |
985
|
14
|
|
|
|
|
61
|
my $date = $msg->get_header("Date"); |
986
|
14
|
50
|
33
|
|
|
143
|
$date = "None" if (!defined $date || $date eq ''); # No Date? |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
#Removed per bug 5185 |
989
|
|
|
|
|
|
|
#my @rcvd = $msg->get_header("Received"); |
990
|
|
|
|
|
|
|
#my $rcvd = $rcvd[$#rcvd]; |
991
|
|
|
|
|
|
|
#$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received? |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# Make a copy since pristine_body is a reference ... |
994
|
14
|
|
|
|
|
86
|
my $body = join('', $msg->get_pristine_body()); |
995
|
|
|
|
|
|
|
|
996
|
14
|
50
|
|
|
|
77
|
if (length($body) > 64) { # Small Body? |
997
|
14
|
50
|
|
|
|
60
|
my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) ); |
998
|
14
|
|
|
|
|
52
|
substr($body, $keep) = ''; |
999
|
|
|
|
|
|
|
} |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
#Stripping all CR and LF so that testing midstream from MTA and post delivery don't |
1002
|
|
|
|
|
|
|
#generate different id's simply because of LF<->CR<->CRLF changes. |
1003
|
14
|
|
|
|
|
290
|
$body =~ s/[\r\n]//g; |
1004
|
|
|
|
|
|
|
|
1005
|
14
|
|
|
|
|
268
|
unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated'); |
1006
|
|
|
|
|
|
|
|
1007
|
14
|
50
|
|
|
|
123
|
return wantarray ? @msgid : $msgid[0]; |
1008
|
|
|
|
|
|
|
} |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
sub get_body_from_msg { |
1011
|
20
|
|
|
20
|
0
|
72
|
my ($self, $msg) = @_; |
1012
|
|
|
|
|
|
|
|
1013
|
20
|
50
|
|
|
|
79
|
if (!ref $msg) { |
1014
|
|
|
|
|
|
|
# I have no idea why this seems to happen. TODO |
1015
|
0
|
|
|
|
|
0
|
warn "bayes: msg not a ref: '$msg'"; |
1016
|
0
|
|
|
|
|
0
|
return { }; |
1017
|
|
|
|
|
|
|
} |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
my $permsgstatus = |
1020
|
20
|
|
|
|
|
181
|
Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); |
1021
|
20
|
|
|
|
|
143
|
$msg->extract_message_metadata ($permsgstatus); |
1022
|
20
|
|
|
|
|
104
|
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); |
1023
|
20
|
|
|
|
|
103
|
$permsgstatus->finish(); |
1024
|
|
|
|
|
|
|
|
1025
|
20
|
50
|
|
|
|
87
|
if (!defined $msgdata) { |
1026
|
|
|
|
|
|
|
# why?! |
1027
|
0
|
|
|
|
|
0
|
warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n"; |
1028
|
0
|
|
|
|
|
0
|
return { }; |
1029
|
|
|
|
|
|
|
} |
1030
|
|
|
|
|
|
|
|
1031
|
20
|
|
|
|
|
97
|
return $msgdata; |
1032
|
|
|
|
|
|
|
} |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
sub _get_msgdata_from_permsgstatus { |
1035
|
24
|
|
|
24
|
|
71
|
my ($self, $pms) = @_; |
1036
|
|
|
|
|
|
|
|
1037
|
24
|
|
|
|
|
70
|
my $t_src = $self->{conf}->{bayes_token_sources}; |
1038
|
24
|
|
|
|
|
55
|
my $msgdata = { }; |
1039
|
|
|
|
|
|
|
$msgdata->{bayes_token_body} = |
1040
|
24
|
50
|
|
|
|
226
|
$pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible}; |
1041
|
|
|
|
|
|
|
$msgdata->{bayes_token_inviz} = |
1042
|
24
|
50
|
|
|
|
181
|
$pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible}; |
1043
|
|
|
|
|
|
|
$msgdata->{bayes_mimepart_digests} = |
1044
|
24
|
50
|
|
|
|
91
|
$pms->{msg}->get_mimepart_digests() if $t_src->{mimepart}; |
1045
|
24
|
|
|
|
|
77
|
@{$msgdata->{bayes_token_uris}} = |
1046
|
24
|
50
|
|
|
|
180
|
$pms->get_uri_list() if $t_src->{uri}; |
1047
|
24
|
|
|
|
|
75
|
return $msgdata; |
1048
|
|
|
|
|
|
|
} |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
########################################################################### |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
# The calling functions expect a uniq'ed array of tokens ... |
1053
|
|
|
|
|
|
|
sub tokenize { |
1054
|
16
|
|
|
16
|
0
|
68
|
my ($self, $msg, $msgdata) = @_; |
1055
|
|
|
|
|
|
|
|
1056
|
16
|
|
|
|
|
57
|
my $t_src = $self->{conf}->{bayes_token_sources}; |
1057
|
16
|
|
|
|
|
32
|
my @tokens; |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
# visible tokens from the body |
1060
|
16
|
50
|
|
|
|
75
|
if ($msgdata->{bayes_token_body}) { |
1061
|
|
|
|
|
|
|
my(@t) = map($self->_tokenize_line ($_, '', 1), |
1062
|
16
|
|
|
|
|
40
|
@{$msgdata->{bayes_token_body}} ); |
|
16
|
|
|
|
|
115
|
|
1063
|
16
|
|
|
|
|
579
|
dbg("bayes: tokenized body: %d tokens", scalar @t); |
1064
|
16
|
|
|
|
|
1944
|
push(@tokens, @t); |
1065
|
|
|
|
|
|
|
} |
1066
|
|
|
|
|
|
|
# the URI list |
1067
|
16
|
50
|
|
|
|
109
|
if ($msgdata->{bayes_token_uris}) { |
1068
|
|
|
|
|
|
|
my(@t) = map($self->_tokenize_line ($_, '', 2), |
1069
|
16
|
|
|
|
|
39
|
@{$msgdata->{bayes_token_uris}} ); |
|
16
|
|
|
|
|
93
|
|
1070
|
16
|
|
|
|
|
75
|
dbg("bayes: tokenized uri: %d tokens", scalar @t); |
1071
|
16
|
|
|
|
|
74
|
push(@tokens, @t); |
1072
|
|
|
|
|
|
|
} |
1073
|
|
|
|
|
|
|
# add invisible tokens |
1074
|
16
|
50
|
|
|
|
60
|
if ($msgdata->{bayes_token_inviz}) { |
1075
|
16
|
|
|
|
|
40
|
my $tokprefix; |
1076
|
16
|
|
|
|
|
27
|
if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' } |
|
16
|
|
|
|
|
71
|
|
1077
|
16
|
|
|
|
|
31
|
if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' } |
1078
|
16
|
50
|
|
|
|
48
|
if (defined $tokprefix) { |
1079
|
|
|
|
|
|
|
my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1), |
1080
|
16
|
|
|
|
|
46
|
@{$msgdata->{bayes_token_inviz}} ); |
|
16
|
|
|
|
|
50
|
|
1081
|
16
|
|
|
|
|
60
|
dbg("bayes: tokenized invisible: %d tokens", scalar @t); |
1082
|
16
|
|
|
|
|
46
|
push(@tokens, @t); |
1083
|
|
|
|
|
|
|
} |
1084
|
|
|
|
|
|
|
} |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
# add digests and Content-Type of all MIME parts |
1087
|
16
|
50
|
|
|
|
65
|
if ($msgdata->{bayes_mimepart_digests}) { |
1088
|
0
|
|
|
|
|
0
|
my %shorthand = ( # some frequent MIME part contents for human readability |
1089
|
|
|
|
|
|
|
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext', |
1090
|
|
|
|
|
|
|
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML', |
1091
|
|
|
|
|
|
|
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML', |
1092
|
|
|
|
|
|
|
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext', |
1093
|
|
|
|
|
|
|
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML', |
1094
|
|
|
|
|
|
|
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext', |
1095
|
|
|
|
|
|
|
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML', |
1096
|
|
|
|
|
|
|
); |
1097
|
|
|
|
|
|
|
my(@t) = map('MIME:' . ($shorthand{$_} || $_), |
1098
|
0
|
|
0
|
|
|
0
|
@{ $msgdata->{bayes_mimepart_digests} }); |
|
0
|
|
|
|
|
0
|
|
1099
|
0
|
|
|
|
|
0
|
dbg("bayes: tokenized mime parts: %d tokens", scalar @t); |
1100
|
0
|
|
|
|
|
0
|
dbg("bayes: mime-part token %s", $_) for @t; |
1101
|
0
|
|
|
|
|
0
|
push(@tokens, @t); |
1102
|
|
|
|
|
|
|
} |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
# Tokenize the headers |
1105
|
16
|
50
|
|
|
|
58
|
if ($t_src->{header}) { |
1106
|
16
|
|
|
|
|
45
|
my(@t); |
1107
|
16
|
|
|
|
|
75
|
my %hdrs = $self->_tokenize_headers ($msg); |
1108
|
16
|
|
|
|
|
121
|
while( my($prefix, $value) = each %hdrs ) { |
1109
|
200
|
|
|
|
|
650
|
push(@t, $self->_tokenize_line ($value, "H$prefix:", 0)); |
1110
|
|
|
|
|
|
|
} |
1111
|
16
|
|
|
|
|
89
|
dbg("bayes: tokenized header: %d tokens", scalar @t); |
1112
|
16
|
|
|
|
|
778
|
push(@tokens, @t); |
1113
|
|
|
|
|
|
|
} |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
# Go ahead and uniq the array, skip null tokens (can happen sometimes) |
1116
|
|
|
|
|
|
|
# generate an SHA1 hash and take the lower 40 bits as our token |
1117
|
16
|
|
|
|
|
63
|
my %tokens; |
1118
|
16
|
|
|
|
|
78
|
foreach my $token (@tokens) { |
1119
|
|
|
|
|
|
|
# dbg("bayes: token: %s", $token); |
1120
|
6714
|
50
|
|
|
|
41363
|
$tokens{substr(sha1($token), -5)} = $token if $token ne ''; |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
# return the keys == tokens ... |
1124
|
16
|
|
|
|
|
779
|
return \%tokens; |
1125
|
|
|
|
|
|
|
} |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
sub _tokenize_line { |
1128
|
532
|
|
|
532
|
|
910
|
my $self = $_[0]; |
1129
|
532
|
|
|
|
|
835
|
my $tokprefix = $_[2]; |
1130
|
532
|
|
|
|
|
733
|
my $region = $_[3]; |
1131
|
532
|
|
|
|
|
1222
|
local ($_) = $_[1]; |
1132
|
|
|
|
|
|
|
|
1133
|
532
|
|
|
|
|
805
|
my @rettokens; |
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, |
1136
|
|
|
|
|
|
|
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. |
1137
|
|
|
|
|
|
|
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" |
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; |
1140
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic) |
1142
|
532
|
|
|
|
|
2120
|
s{ ( [A-Za-z0-9,@*!_'"\$. -]+ | |
1143
|
|
|
|
|
|
|
[\xC0-\xDF][\x80-\xBF] | |
1144
|
|
|
|
|
|
|
[\xE0-\xEF][\x80-\xBF]{2} | |
1145
|
|
|
|
|
|
|
[\xF0-\xF4][\x80-\xBF]{3} | |
1146
|
4844
|
100
|
|
|
|
18609
|
[\xA1-\xFF] ) | . } |
1147
|
|
|
|
|
|
|
{ defined $1 ? $1 : ' ' }xsge; |
1148
|
|
|
|
|
|
|
# should we also turn NBSP ( \xC2\xA0 ) into space? |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
# DO split on "..." or "--" or "---"; common formatting error resulting in |
1151
|
|
|
|
|
|
|
# hapaxes. Keep the separator itself as a token, though, as long ones can |
1152
|
532
|
|
|
|
|
1674
|
# be good spamsigns. |
1153
|
532
|
|
|
|
|
922
|
s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs; |
1154
|
|
|
|
|
|
|
s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs; |
1155
|
532
|
|
|
|
|
689
|
|
1156
|
532
|
100
|
100
|
|
|
1754
|
if (IGNORE_TITLE_CASE) { |
1157
|
|
|
|
|
|
|
if ($region == 1 || $region == 2) { |
1158
|
|
|
|
|
|
|
# lower-case Title Case at start of a full-stop-delimited line (as would |
1159
|
332
|
|
|
|
|
2262
|
# be seen in a Western language). |
|
242
|
|
|
|
|
2395
|
|
1160
|
|
|
|
|
|
|
s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge; |
1161
|
|
|
|
|
|
|
} |
1162
|
|
|
|
|
|
|
} |
1163
|
532
|
|
|
|
|
1970
|
|
1164
|
|
|
|
|
|
|
my $magic_re = $self->{store}->get_magic_re(); |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
# Note that split() in scope of 'use bytes' results in words with utf8 flag |
1167
|
|
|
|
|
|
|
# cleared, even if the source string has perl characters semantics !!! |
1168
|
|
|
|
|
|
|
# Is this really still desirable? |
1169
|
532
|
|
|
|
|
4285
|
|
1170
|
8592
|
|
|
|
|
18878
|
foreach my $token (split) { |
1171
|
8592
|
|
|
|
|
15118
|
$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end |
1172
|
|
|
|
|
|
|
$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
# Skip false magic tokens |
1175
|
|
|
|
|
|
|
# TVD: we need to do a defined() check since SQL doesn't have magic |
1176
|
|
|
|
|
|
|
# tokens, so the SQL BayesStore returns undef. I really want a way |
1177
|
|
|
|
|
|
|
# of optimizing that out, but I haven't come up with anything yet. |
1178
|
8592
|
50
|
33
|
|
|
38282
|
# |
1179
|
|
|
|
|
|
|
next if ( defined $magic_re && $token =~ /$magic_re/ ); |
1180
|
|
|
|
|
|
|
|
1181
|
8592
|
|
|
|
|
18384
|
# *do* keep 3-byte tokens; there's some solid signs in there |
1182
|
|
|
|
|
|
|
my $len = length($token); |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
# but extend the stop-list. These are squarely in the gray |
1185
|
|
|
|
|
|
|
# area, and it just slows us down to record them. |
1186
|
|
|
|
|
|
|
# See http://wiki.apache.org/spamassassin/BayesStopList for more info. |
1187
|
8592
|
100
|
100
|
|
|
42245
|
# |
1188
|
|
|
|
|
|
|
next if $len < 3 || |
1189
|
|
|
|
|
|
|
($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i); |
1190
|
|
|
|
|
|
|
|
1191
|
5230
|
100
|
100
|
|
|
13661
|
# are we in the body? If so, apply some body-specific breakouts |
1192
|
3534
|
100
|
|
|
|
10045
|
if ($region == 1 || $region == 2) { |
|
|
100
|
|
|
|
|
|
1193
|
38
|
|
|
|
|
173
|
if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { |
1194
|
|
|
|
|
|
|
push (@rettokens, $self->_tokenize_mail_addrs ($token)); |
1195
|
|
|
|
|
|
|
} |
1196
|
110
|
|
|
|
|
364
|
elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { |
1197
|
110
|
|
|
|
|
245
|
push (@rettokens, "UD:".$token); # the full token |
|
110
|
|
|
|
|
588
|
|
1198
|
132
|
|
|
|
|
681
|
my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { |
1199
|
|
|
|
|
|
|
push (@rettokens, "UD:".$1); # UD = URL domain |
1200
|
|
|
|
|
|
|
} |
1201
|
|
|
|
|
|
|
} |
1202
|
|
|
|
|
|
|
} |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
# note: do not trim down overlong tokens if they contain '*'. This is |
1205
|
|
|
|
|
|
|
# used as part of split tokens such as "HTo:D*net" indicating that |
1206
|
|
|
|
|
|
|
# the domain ".net" appeared in the To header. |
1207
|
5230
|
100
|
100
|
|
|
12967
|
# |
1208
|
|
|
|
|
|
|
if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { |
1209
|
250
|
50
|
|
|
|
676
|
|
1210
|
|
|
|
|
|
|
if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) { |
1211
|
|
|
|
|
|
|
# Bug 7135 |
1212
|
0
|
|
|
|
|
0
|
# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences |
1213
|
|
|
|
|
|
|
my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] ) |
1214
|
0
|
0
|
|
|
|
0
|
[\x80-\xBF]{2} )/xsg; |
1215
|
0
|
|
|
|
|
0
|
if (@t) { |
1216
|
0
|
|
|
|
|
0
|
push (@rettokens, map($tokprefix.'u8:'.$_, @t)); |
1217
|
|
|
|
|
|
|
next; |
1218
|
|
|
|
|
|
|
} |
1219
|
|
|
|
|
|
|
} |
1220
|
250
|
|
|
|
|
330
|
|
1221
|
|
|
|
|
|
|
if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { |
1222
|
|
|
|
|
|
|
# Matt sez: "Could be asian? Autrijus suggested doing character ngrams, |
1223
|
|
|
|
|
|
|
# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan |
1224
|
|
|
|
|
|
|
# to me! (jm) |
1225
|
|
|
|
|
|
|
while ($token =~ s/^(..?)//) { |
1226
|
|
|
|
|
|
|
push (@rettokens, $tokprefix.'8:'.$1); |
1227
|
|
|
|
|
|
|
} |
1228
|
|
|
|
|
|
|
next; |
1229
|
|
|
|
|
|
|
} |
1230
|
250
|
100
|
100
|
|
|
1090
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1231
|
|
|
|
|
|
|
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) |
1232
|
|
|
|
|
|
|
|| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS) |
1233
|
|
|
|
|
|
|
|| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS)) |
1234
|
|
|
|
|
|
|
{ |
1235
|
|
|
|
|
|
|
# if (TOKENIZE_LONG_TOKENS_AS_SKIPS) |
1236
|
|
|
|
|
|
|
# Spambayes trick via Matt: Just retain 7 chars. Do not retain the |
1237
|
|
|
|
|
|
|
# length, it does not help; see jm's mail to -devel on Nov 20 2002 at |
1238
|
|
|
|
|
|
|
# http://sourceforge.net/p/spamassassin/mailman/message/12977605/ |
1239
|
|
|
|
|
|
|
# "sk:" stands for "skip". |
1240
|
|
|
|
|
|
|
# Bug 7141: retain seven UTF-8 chars (or other bytes), |
1241
|
232
|
|
|
|
|
1737
|
# if followed by at least two bytes |
1242
|
|
|
|
|
|
|
$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] | |
1243
|
|
|
|
|
|
|
[\xC0-\xDF][\x80-\xBF] | |
1244
|
|
|
|
|
|
|
[\xE0-\xEF][\x80-\xBF]{2} | |
1245
|
|
|
|
|
|
|
[\xF0-\xF4][\x80-\xBF]{3} | . ){7} )) |
1246
|
|
|
|
|
|
|
.{2,} \z }{sk:$1}xs; |
1247
|
|
|
|
|
|
|
## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes |
1248
|
|
|
|
|
|
|
} |
1249
|
|
|
|
|
|
|
} |
1250
|
|
|
|
|
|
|
|
1251
|
5230
|
100
|
100
|
|
|
12387
|
# decompose tokens? do this after shortening long tokens |
1252
|
3534
|
|
|
|
|
4649
|
if ($region == 1 || $region == 2) { |
1253
|
3534
|
100
|
|
|
|
7994
|
if (DECOMPOSE_BODY_TOKENS) { |
1254
|
358
|
|
|
|
|
732
|
if ($token =~ /[^\w:\*]/) { |
1255
|
358
|
|
|
|
|
1444
|
my $decompd = $token; # "Foo!" |
1256
|
358
|
|
|
|
|
1256
|
$decompd =~ s/[^\w:\*]//gs; |
1257
|
|
|
|
|
|
|
push (@rettokens, $tokprefix.$decompd); # "Foo" |
1258
|
|
|
|
|
|
|
} |
1259
|
3534
|
100
|
|
|
|
8690
|
|
1260
|
712
|
|
|
|
|
1524
|
if ($token =~ /[A-Z]/) { |
|
712
|
|
|
|
|
1413
|
|
1261
|
712
|
|
|
|
|
2002
|
my $decompd = $token; $decompd = lc $decompd; |
1262
|
|
|
|
|
|
|
push (@rettokens, $tokprefix.$decompd); # "foo!" |
1263
|
712
|
100
|
|
|
|
2404
|
|
1264
|
76
|
|
|
|
|
276
|
if ($token =~ /[^\w:\*]/) { |
1265
|
76
|
|
|
|
|
305
|
$decompd =~ s/[^\w:\*]//gs; |
1266
|
|
|
|
|
|
|
push (@rettokens, $tokprefix.$decompd); # "foo" |
1267
|
|
|
|
|
|
|
} |
1268
|
|
|
|
|
|
|
} |
1269
|
|
|
|
|
|
|
} |
1270
|
|
|
|
|
|
|
} |
1271
|
5230
|
|
|
|
|
21066
|
|
1272
|
|
|
|
|
|
|
push (@rettokens, $tokprefix.$token); |
1273
|
|
|
|
|
|
|
} |
1274
|
532
|
|
|
|
|
8313
|
|
1275
|
|
|
|
|
|
|
return @rettokens; |
1276
|
|
|
|
|
|
|
} |
1277
|
|
|
|
|
|
|
|
1278
|
16
|
|
|
16
|
|
52
|
sub _tokenize_headers { |
1279
|
|
|
|
|
|
|
my ($self, $msg) = @_; |
1280
|
16
|
|
|
|
|
37
|
|
1281
|
|
|
|
|
|
|
my %parsed; |
1282
|
|
|
|
|
|
|
|
1283
|
16
|
|
|
|
|
35
|
my %user_ignore; |
|
16
|
|
|
|
|
76
|
|
1284
|
|
|
|
|
|
|
$user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}}; |
1285
|
|
|
|
|
|
|
|
1286
|
16
|
|
|
|
|
41
|
# get headers in array context |
1287
|
|
|
|
|
|
|
my @hdrs; |
1288
|
16
|
|
|
|
|
102
|
my @rcvdlines; |
1289
|
|
|
|
|
|
|
for ($msg->get_all_headers()) { |
1290
|
202
|
100
|
|
|
|
607
|
# first, keep a copy of Received headers, so we can strip down to last 2 |
1291
|
52
|
|
|
|
|
154
|
if (/^Received:/i) { |
1292
|
52
|
|
|
|
|
109
|
push(@rcvdlines, $_); |
1293
|
|
|
|
|
|
|
next; |
1294
|
|
|
|
|
|
|
} |
1295
|
150
|
100
|
|
|
|
2112
|
# and now skip lines for headers we don't want (including all Received) |
1296
|
102
|
|
|
|
|
174
|
next if /^${IGNORED_HDRS}:/i; |
1297
|
102
|
|
|
|
|
306
|
next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; |
1298
|
|
|
|
|
|
|
push(@hdrs, $_); |
1299
|
16
|
|
|
|
|
133
|
} |
1300
|
|
|
|
|
|
|
push(@hdrs, $msg->get_all_metadata()); |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
# and re-add the last 2 received lines: usually a good source of |
1303
|
16
|
100
|
|
|
|
75
|
# spamware tokens and HELO names. |
|
14
|
|
|
|
|
59
|
|
1304
|
16
|
100
|
|
|
|
56
|
if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } |
|
14
|
|
|
|
|
49
|
|
1305
|
|
|
|
|
|
|
if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } |
1306
|
16
|
|
|
|
|
70
|
|
1307
|
194
|
50
|
|
|
|
688
|
for (@hdrs) { |
1308
|
194
|
|
|
|
|
881
|
next unless /\S/; |
1309
|
|
|
|
|
|
|
my ($hdr, $val) = split(/:/, $_, 2); |
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
# remove user-specified headers here, after Received, in case they |
1312
|
194
|
50
|
|
|
|
639
|
# want to ignore that too |
1313
|
|
|
|
|
|
|
next if exists $user_ignore{lc $hdr}; |
1314
|
|
|
|
|
|
|
|
1315
|
194
|
|
50
|
|
|
452
|
# Prep the header value |
1316
|
194
|
|
|
|
|
388
|
$val ||= ''; |
1317
|
|
|
|
|
|
|
chomp($val); |
1318
|
|
|
|
|
|
|
|
1319
|
194
|
100
|
|
|
|
1570
|
# special tokenization for some headers: |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1320
|
16
|
|
|
|
|
102
|
if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { |
1321
|
|
|
|
|
|
|
$val = $self->_pre_chew_message_id ($val); |
1322
|
|
|
|
|
|
|
} |
1323
|
|
|
|
|
|
|
elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) |
1324
|
|
|
|
|
|
|
(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) |
1325
|
60
|
|
|
|
|
184
|
{ |
1326
|
|
|
|
|
|
|
$val = $self->_pre_chew_addr_header ($val); |
1327
|
|
|
|
|
|
|
} |
1328
|
28
|
|
|
|
|
114
|
elsif ($hdr eq 'Received') { |
1329
|
|
|
|
|
|
|
$val = $self->_pre_chew_received ($val); |
1330
|
|
|
|
|
|
|
} |
1331
|
4
|
|
|
|
|
31
|
elsif ($hdr eq 'Content-Type') { |
1332
|
|
|
|
|
|
|
$val = $self->_pre_chew_content_type ($val); |
1333
|
|
|
|
|
|
|
} |
1334
|
2
|
|
|
|
|
11
|
elsif ($hdr eq 'MIME-Version') { |
1335
|
|
|
|
|
|
|
$val =~ s/1\.0//; # totally innocuous |
1336
|
|
|
|
|
|
|
} |
1337
|
0
|
|
|
|
|
0
|
elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { |
1338
|
|
|
|
|
|
|
$val = "1"; # just mark the presence, they create lots of hapaxen |
1339
|
|
|
|
|
|
|
} |
1340
|
194
|
|
|
|
|
359
|
|
1341
|
194
|
100
|
|
|
|
575
|
if (MAP_HEADERS_MID) { |
1342
|
16
|
|
|
|
|
59
|
if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { |
1343
|
|
|
|
|
|
|
$parsed{"*MI"} = $val; |
1344
|
|
|
|
|
|
|
} |
1345
|
194
|
|
|
|
|
292
|
} |
1346
|
194
|
100
|
|
|
|
574
|
if (MAP_HEADERS_FROMTOCC) { |
1347
|
32
|
|
|
|
|
104
|
if ($hdr =~ /^(?:From|To|Cc)$/i) { |
1348
|
|
|
|
|
|
|
$parsed{"*Ad"} = $val; |
1349
|
|
|
|
|
|
|
} |
1350
|
194
|
|
|
|
|
311
|
} |
1351
|
194
|
50
|
|
|
|
482
|
if (MAP_HEADERS_USERAGENT) { |
1352
|
0
|
|
|
|
|
0
|
if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { |
1353
|
|
|
|
|
|
|
$parsed{"*UA"} = $val; |
1354
|
|
|
|
|
|
|
} |
1355
|
|
|
|
|
|
|
} |
1356
|
|
|
|
|
|
|
|
1357
|
194
|
100
|
|
|
|
523
|
# replace hdr name with "compressed" version if possible |
1358
|
126
|
|
|
|
|
343
|
if (defined $HEADER_NAME_COMPRESSION{$hdr}) { |
1359
|
|
|
|
|
|
|
$hdr = $HEADER_NAME_COMPRESSION{$hdr}; |
1360
|
|
|
|
|
|
|
} |
1361
|
194
|
100
|
|
|
|
499
|
|
1362
|
26
|
|
|
|
|
106
|
if (exists $parsed{$hdr}) { |
1363
|
|
|
|
|
|
|
$parsed{$hdr} .= " ".$val; |
1364
|
168
|
|
|
|
|
550
|
} else { |
1365
|
|
|
|
|
|
|
$parsed{$hdr} = $val; |
1366
|
194
|
50
|
|
|
|
505
|
} |
1367
|
0
|
|
|
|
|
0
|
if (would_log('dbg', 'bayes') > 1) { |
1368
|
|
|
|
|
|
|
dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\""); |
1369
|
|
|
|
|
|
|
} |
1370
|
|
|
|
|
|
|
} |
1371
|
16
|
|
|
|
|
294
|
|
1372
|
|
|
|
|
|
|
return %parsed; |
1373
|
|
|
|
|
|
|
} |
1374
|
|
|
|
|
|
|
|
1375
|
4
|
|
|
4
|
|
20
|
sub _pre_chew_content_type { |
1376
|
|
|
|
|
|
|
my ($self, $val) = @_; |
1377
|
|
|
|
|
|
|
|
1378
|
4
|
50
|
|
|
|
27
|
# hopefully this will retain good bits without too many hapaxen |
1379
|
0
|
|
|
|
|
0
|
if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { |
1380
|
0
|
0
|
|
|
|
0
|
my $boundary = $1; |
1381
|
0
|
|
|
|
|
0
|
$boundary = '' if !defined $boundary; # avoid a warning |
1382
|
|
|
|
|
|
|
$boundary =~ s/[a-fA-F0-9]/H/gs; |
1383
|
0
|
|
|
|
|
0
|
# break up blocks of separator chars so they become their own tokens |
1384
|
0
|
|
|
|
|
0
|
$boundary =~ s/([-_\.=]+)/ $1 /gs; |
1385
|
|
|
|
|
|
|
$val .= $boundary; |
1386
|
|
|
|
|
|
|
} |
1387
|
|
|
|
|
|
|
|
1388
|
4
|
|
|
|
|
43
|
# stop-list words for Content-Type header: these wind up totally gray |
1389
|
|
|
|
|
|
|
$val =~ s/\b(?:text|charset)\b//; |
1390
|
4
|
|
|
|
|
18
|
|
1391
|
|
|
|
|
|
|
$val; |
1392
|
|
|
|
|
|
|
} |
1393
|
|
|
|
|
|
|
|
1394
|
16
|
|
|
16
|
|
69
|
sub _pre_chew_message_id { |
1395
|
|
|
|
|
|
|
my ($self, $val) = @_; |
1396
|
|
|
|
|
|
|
# we can (a) get rid of a lot of hapaxen and (b) increase the token |
1397
|
|
|
|
|
|
|
# specificity by pre-parsing some common formats. |
1398
|
|
|
|
|
|
|
|
1399
|
16
|
|
|
|
|
62
|
# Outlook Express format: |
1400
|
|
|
|
|
|
|
$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ |
1401
|
|
|
|
|
|
|
([0-9a-f]{4})[0-9a-f]{4}\$ |
1402
|
|
|
|
|
|
|
([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; |
1403
|
|
|
|
|
|
|
|
1404
|
16
|
|
|
|
|
40
|
# Exim: |
1405
|
|
|
|
|
|
|
$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; |
1406
|
|
|
|
|
|
|
|
1407
|
16
|
|
|
|
|
34
|
# Sendmail: |
1408
|
|
|
|
|
|
|
$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. |
1409
|
|
|
|
|
|
|
[A-F0-9]{10,12}\@//gx; |
1410
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
# try to split Message-ID segments on probable ID boundaries. Note that |
1412
|
|
|
|
|
|
|
# Outlook message-ids seem to contain a server identifier ID in the last |
1413
|
|
|
|
|
|
|
# 8 bytes before the @. Make sure this becomes its own token, it's a |
1414
|
16
|
|
|
|
|
118
|
# great spam-sign for a learning system! Be sure to split on ".". |
1415
|
16
|
|
|
|
|
60
|
$val =~ s/[^_A-Za-z0-9]/ /g; |
1416
|
|
|
|
|
|
|
$val; |
1417
|
|
|
|
|
|
|
} |
1418
|
|
|
|
|
|
|
|
1419
|
28
|
|
|
28
|
|
90
|
sub _pre_chew_received { |
1420
|
|
|
|
|
|
|
my ($self, $val) = @_; |
1421
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs |
1423
|
|
|
|
|
|
|
# and valid-format RFC-822/2822 dates |
1424
|
28
|
|
|
|
|
141
|
|
1425
|
28
|
|
|
|
|
118
|
$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail |
1426
|
28
|
|
|
|
|
173
|
$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail |
1427
|
28
|
|
|
|
|
87
|
$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail |
1428
|
|
|
|
|
|
|
$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim |
1429
|
28
|
|
|
|
|
289
|
|
1430
|
|
|
|
|
|
|
$val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? |
1431
|
|
|
|
|
|
|
[0-3\s]?[0-9]\s |
1432
|
|
|
|
|
|
|
(?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s |
1433
|
|
|
|
|
|
|
(?:19|20)?[0-9]{2}\s |
1434
|
|
|
|
|
|
|
[0-2][0-9](?:\:[0-5][0-9]){1,2}\s |
1435
|
|
|
|
|
|
|
(?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* |
1436
|
|
|
|
|
|
|
//gx; |
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for |
1439
|
|
|
|
|
|
|
# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens |
1440
|
|
|
|
|
|
|
# (on both sides) |
1441
|
|
|
|
|
|
|
# also make a dup with the full IP, as fodder for |
1442
|
28
|
|
|
|
|
218
|
# bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" |
1443
|
30
|
50
|
33
|
|
|
255
|
$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ |
|
|
|
33
|
|
|
|
|
1444
|
0
|
|
|
|
|
0
|
if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { |
1445
|
|
|
|
|
|
|
$1.$2.$3.$4. |
1446
|
|
|
|
|
|
|
" ip*".$1.$2.$3.$4." "; |
1447
|
30
|
|
|
|
|
416
|
} else { |
1448
|
|
|
|
|
|
|
$1.$2.$3. |
1449
|
|
|
|
|
|
|
" ip*".$1.$2.$3.$4." "; |
1450
|
|
|
|
|
|
|
} |
1451
|
|
|
|
|
|
|
}gex; |
1452
|
|
|
|
|
|
|
|
1453
|
|
|
|
|
|
|
# trim these: they turn out as the most common tokens, but with a |
1454
|
28
|
|
|
|
|
350
|
# prob of about .5. waste of space! |
1455
|
|
|
|
|
|
|
$val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; |
1456
|
28
|
|
|
|
|
117
|
|
1457
|
|
|
|
|
|
|
$val; |
1458
|
|
|
|
|
|
|
} |
1459
|
|
|
|
|
|
|
|
1460
|
60
|
|
|
60
|
|
188
|
sub _pre_chew_addr_header { |
1461
|
60
|
|
|
|
|
119
|
my ($self, $val) = @_; |
1462
|
|
|
|
|
|
|
local ($_); |
1463
|
60
|
|
|
|
|
260
|
|
1464
|
60
|
|
|
|
|
121
|
my @addrs = $self->{main}->find_all_addrs_in_line ($val); |
1465
|
60
|
|
|
|
|
127
|
my @toks; |
1466
|
48
|
|
|
|
|
135
|
foreach (@addrs) { |
1467
|
|
|
|
|
|
|
push (@toks, $self->_tokenize_mail_addrs ($_)); |
1468
|
60
|
|
|
|
|
354
|
} |
1469
|
|
|
|
|
|
|
return join (' ', @toks); |
1470
|
|
|
|
|
|
|
} |
1471
|
|
|
|
|
|
|
|
1472
|
86
|
|
|
86
|
|
267
|
sub _tokenize_mail_addrs { |
1473
|
|
|
|
|
|
|
my ($self, $addr) = @_; |
1474
|
86
|
50
|
|
|
|
468
|
|
1475
|
86
|
|
|
|
|
154
|
($addr =~ /(.+)\@(.+)$/) or return (); |
1476
|
86
|
|
|
|
|
404
|
my @toks; |
1477
|
86
|
|
|
|
|
208
|
push(@toks, "U*".$1, "D*".$2); |
|
86
|
|
|
|
|
465
|
|
|
76
|
|
|
|
|
376
|
|
1478
|
86
|
|
|
|
|
498
|
$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } |
1479
|
|
|
|
|
|
|
return @toks; |
1480
|
|
|
|
|
|
|
} |
1481
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
########################################################################### |
1484
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
# compute the probability that a token is spammish for each token |
1486
|
4
|
|
|
4
|
|
17
|
sub _compute_prob_for_all_tokens { |
1487
|
4
|
|
|
|
|
10
|
my ($self, $tokensdata, $ns, $nn) = @_; |
1488
|
|
|
|
|
|
|
my @probabilities; |
1489
|
4
|
50
|
33
|
|
|
41
|
|
1490
|
|
|
|
|
|
|
return if !$ns || !$nn; |
1491
|
4
|
|
|
|
|
12
|
|
1492
|
4
|
|
|
|
|
15
|
my $threshold = 1; # ignore low-freq tokens below this s+n threshold |
1493
|
|
|
|
|
|
|
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { |
1494
|
|
|
|
|
|
|
$threshold = 10; |
1495
|
4
|
50
|
|
|
|
17
|
} |
1496
|
0
|
|
|
|
|
0
|
if (!$self->{use_hapaxes}) { |
1497
|
|
|
|
|
|
|
$threshold = 2; |
1498
|
|
|
|
|
|
|
} |
1499
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
18
|
|
1500
|
1104
|
|
|
|
|
1434
|
foreach my $tokendata (@{$tokensdata}) { |
1501
|
1104
|
|
|
|
|
1486
|
my $s = $tokendata->[1]; # spam count |
1502
|
1104
|
|
|
|
|
1325
|
my $n = $tokendata->[2]; # ham count |
1503
|
|
|
|
|
|
|
my $prob; |
1504
|
21
|
|
|
21
|
|
245
|
|
|
21
|
|
|
|
|
51
|
|
|
21
|
|
|
|
|
26809
|
|
1505
|
1104
|
100
|
|
|
|
1815
|
no warnings 'uninitialized'; # treat undef as zero in addition |
1506
|
|
|
|
|
|
|
if ($s + $n >= $threshold) { |
1507
|
|
|
|
|
|
|
# ignoring low-freq tokens, also covers the (!$s && !$n) case |
1508
|
|
|
|
|
|
|
|
1509
|
|
|
|
|
|
|
# my $ratios = $s / $ns; |
1510
|
|
|
|
|
|
|
# my $ration = $n / $nn; |
1511
|
|
|
|
|
|
|
# $prob = $ratios / ($ration + $ratios); |
1512
|
502
|
|
|
|
|
1120
|
# |
1513
|
|
|
|
|
|
|
$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster |
1514
|
502
|
|
|
|
|
623
|
|
1515
|
|
|
|
|
|
|
if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { |
1516
|
|
|
|
|
|
|
# use Robinson's f(x) equation for low-n tokens, instead of just |
1517
|
502
|
|
|
|
|
620
|
# ignoring them |
1518
|
502
|
|
|
|
|
1088
|
my $robn = $s + $n; |
1519
|
|
|
|
|
|
|
$prob = |
1520
|
|
|
|
|
|
|
($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob)) |
1521
|
|
|
|
|
|
|
/ |
1522
|
|
|
|
|
|
|
($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn); |
1523
|
|
|
|
|
|
|
} |
1524
|
|
|
|
|
|
|
} |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
# 'log_raw_counts' is used to log the raw data for the Bayes equations |
1527
|
|
|
|
|
|
|
# during a mass-check, allowing the S and X constants to be optimized |
1528
|
|
|
|
|
|
|
# quickly without requiring re-tokenization of the messages for each |
1529
|
|
|
|
|
|
|
# attempt. There's really no need for this code to be uncommented in |
1530
|
|
|
|
|
|
|
# normal use, however. It has never been publicly documented, so |
1531
|
|
|
|
|
|
|
# commenting it out is fine. ;) |
1532
|
|
|
|
|
|
|
# |
1533
|
|
|
|
|
|
|
## if ($self->{log_raw_counts}) { |
1534
|
|
|
|
|
|
|
## $self->{raw_counts} .= " s=$s,n=$n "; |
1535
|
|
|
|
|
|
|
## } |
1536
|
1104
|
|
|
|
|
2043
|
|
1537
|
|
|
|
|
|
|
push(@probabilities, $prob); |
1538
|
4
|
|
|
|
|
26
|
} |
1539
|
|
|
|
|
|
|
return \@probabilities; |
1540
|
|
|
|
|
|
|
} |
1541
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
# compute the probability that a token is spammish |
1543
|
0
|
|
|
0
|
|
0
|
sub _compute_prob_for_token { |
1544
|
|
|
|
|
|
|
my ($self, $token, $ns, $nn, $s, $n) = @_; |
1545
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
# we allow the caller to give us the token information, just |
1547
|
0
|
0
|
0
|
|
|
0
|
# to save a potentially expensive lookup |
1548
|
0
|
|
|
|
|
0
|
if (!defined($s) || !defined($n)) { |
1549
|
|
|
|
|
|
|
($s, $n, undef) = $self->{store}->tok_get($token); |
1550
|
0
|
0
|
0
|
|
|
0
|
} |
1551
|
|
|
|
|
|
|
return if !$s && !$n; |
1552
|
0
|
|
|
|
|
0
|
|
1553
|
|
|
|
|
|
|
my $probabilities_ref = |
1554
|
|
|
|
|
|
|
$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn); |
1555
|
0
|
|
|
|
|
0
|
|
1556
|
|
|
|
|
|
|
return $probabilities_ref->[0]; |
1557
|
|
|
|
|
|
|
} |
1558
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
########################################################################### |
1560
|
|
|
|
|
|
|
# If a token is neither hammy nor spammy, return 0. |
1561
|
|
|
|
|
|
|
# For a spammy token, return the minimum number of additional ham messages |
1562
|
|
|
|
|
|
|
# it would have had to appear in to no longer be spammy. Hammy tokens |
1563
|
|
|
|
|
|
|
# are handled similarly. That's what the function does (at the time |
1564
|
|
|
|
|
|
|
# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly |
1565
|
|
|
|
|
|
|
# more useful if it returned the number of /additional/ ham messages |
1566
|
|
|
|
|
|
|
# a spammy token would have to appear in to no longer be spammy but I |
1567
|
|
|
|
|
|
|
# fear that might require the solution to a cubic equation, and I |
1568
|
|
|
|
|
|
|
# just don't have the time for that now. |
1569
|
|
|
|
|
|
|
|
1570
|
0
|
|
|
0
|
|
0
|
sub _compute_declassification_distance { |
1571
|
|
|
|
|
|
|
my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; |
1572
|
0
|
0
|
0
|
|
|
0
|
|
1573
|
|
|
|
|
|
|
return 0 if $ns == 0 && $nn == 0; |
1574
|
0
|
|
|
|
|
0
|
|
1575
|
0
|
0
|
|
|
|
0
|
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);} |
|
0
|
0
|
|
|
|
0
|
|
1576
|
|
|
|
|
|
|
if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);} |
1577
|
0
|
0
|
0
|
|
|
0
|
|
1578
|
0
|
0
|
|
|
|
0
|
return 0 if $Ns == 0 || $Nn == 0; |
1579
|
|
|
|
|
|
|
return 0 if abs( $prob - 0.5 ) < |
1580
|
|
|
|
|
|
|
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; |
1581
|
0
|
0
|
|
|
|
0
|
|
1582
|
0
|
|
|
|
|
0
|
my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn); |
1583
|
|
|
|
|
|
|
my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; |
1584
|
0
|
|
|
|
|
0
|
|
1585
|
|
|
|
|
|
|
return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na |
1586
|
|
|
|
|
|
|
unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS; |
1587
|
0
|
|
|
|
|
0
|
|
1588
|
0
|
|
|
|
|
0
|
my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT; |
1589
|
0
|
|
|
|
|
0
|
my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X; |
1590
|
0
|
|
|
|
|
0
|
my $a = $Nb * ( 1 - $p ); |
1591
|
0
|
|
|
|
|
0
|
my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb; |
1592
|
0
|
|
|
|
|
0
|
my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) ); |
1593
|
0
|
0
|
|
|
|
0
|
my $discrim = $b * $b - 4 * $a * $c; |
1594
|
0
|
|
|
|
|
0
|
my $disc_max_0 = $discrim < 0 ? 0 : $discrim; |
1595
|
|
|
|
|
|
|
my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na; |
1596
|
|
|
|
|
|
|
|
1597
|
0
|
0
|
|
|
|
0
|
# This shouldn't be necessary. Should not be < 1 |
1598
|
|
|
|
|
|
|
return $dd_exact < 1 ? 1 : int($dd_exact); |
1599
|
|
|
|
|
|
|
} |
1600
|
|
|
|
|
|
|
|
1601
|
|
|
|
|
|
|
########################################################################### |
1602
|
|
|
|
|
|
|
|
1603
|
10
|
|
|
10
|
|
40
|
sub _opportunistic_calls { |
1604
|
|
|
|
|
|
|
my($self, $journal_only) = @_; |
1605
|
|
|
|
|
|
|
|
1606
|
10
|
50
|
|
|
|
129
|
# If we're not already tied, abort. |
1607
|
0
|
|
|
|
|
0
|
if (!$self->{store}->db_readable()) { |
1608
|
0
|
|
|
|
|
0
|
dbg("bayes: opportunistic call attempt failed, DB not readable"); |
1609
|
|
|
|
|
|
|
return; |
1610
|
|
|
|
|
|
|
} |
1611
|
|
|
|
|
|
|
|
1612
|
10
|
|
|
|
|
77
|
# Is an expire or sync running? |
1613
|
10
|
50
|
33
|
|
|
39
|
my $running_expire = $self->{store}->get_running_expire_tok(); |
1614
|
0
|
|
|
|
|
0
|
if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { |
1615
|
0
|
|
|
|
|
0
|
dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token"); |
1616
|
|
|
|
|
|
|
return; |
1617
|
|
|
|
|
|
|
} |
1618
|
|
|
|
|
|
|
|
1619
|
10
|
50
|
66
|
|
|
145
|
# handle expiry and syncing |
|
|
50
|
|
|
|
|
|
1620
|
0
|
|
|
|
|
0
|
if (!$journal_only && $self->{store}->expiry_due()) { |
1621
|
|
|
|
|
|
|
dbg("bayes: opportunistic call found expiry due"); |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
# sync will bring the DB R/W as necessary, and the expire will remove |
1624
|
0
|
|
|
|
|
0
|
# the running_expire token, may untie as well. |
1625
|
|
|
|
|
|
|
$self->{main}->{bayes_scanner}->sync(1,1); |
1626
|
|
|
|
|
|
|
} |
1627
|
0
|
|
|
|
|
0
|
elsif ( $self->{store}->sync_due() ) { |
1628
|
|
|
|
|
|
|
dbg("bayes: opportunistic call found journal sync due"); |
1629
|
|
|
|
|
|
|
|
1630
|
0
|
|
|
|
|
0
|
# sync will bring the DB R/W as necessary, may untie as well |
1631
|
|
|
|
|
|
|
$self->{main}->{bayes_scanner}->sync(1,0); |
1632
|
|
|
|
|
|
|
|
1633
|
0
|
0
|
|
|
|
0
|
# We can only remove the running_expire token if we're doing R/W |
1634
|
0
|
|
|
|
|
0
|
if ($self->{store}->db_writable()) { |
1635
|
|
|
|
|
|
|
$self->{store}->remove_running_expire_tok(); |
1636
|
|
|
|
|
|
|
} |
1637
|
|
|
|
|
|
|
} |
1638
|
10
|
|
|
|
|
34
|
|
1639
|
|
|
|
|
|
|
return; |
1640
|
|
|
|
|
|
|
} |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
########################################################################### |
1643
|
|
|
|
|
|
|
|
1644
|
62
|
|
|
62
|
1
|
232
|
sub learner_new { |
1645
|
|
|
|
|
|
|
my ($self) = @_; |
1646
|
62
|
|
|
|
|
138
|
|
1647
|
62
|
|
|
|
|
271
|
my $store; |
1648
|
62
|
100
|
|
|
|
368
|
my $module = $self->{conf}->{bayes_store_module}; |
|
|
50
|
|
|
|
|
|
1649
|
56
|
|
|
|
|
191
|
if (!$module) { |
1650
|
|
|
|
|
|
|
$module = 'Mail::SpamAssassin::BayesStore::DBM'; |
1651
|
6
|
|
|
|
|
40
|
} elsif ($module =~ /^([_A-Za-z0-9:]+)$/) { |
1652
|
|
|
|
|
|
|
$module = untaint_var($module); |
1653
|
0
|
|
|
|
|
0
|
} else { |
1654
|
|
|
|
|
|
|
die "bayes: invalid module: $module\n"; |
1655
|
|
|
|
|
|
|
} |
1656
|
62
|
|
|
|
|
322
|
|
1657
|
62
|
|
|
|
|
225
|
dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module); |
1658
|
|
|
|
|
|
|
undef $self->{store}; # DESTROYs previous object, if any |
1659
|
|
|
|
|
|
|
eval ' |
1660
|
|
|
|
|
|
|
require '.$module.'; |
1661
|
|
|
|
|
|
|
$store = '.$module.'->new($self); |
1662
|
62
|
50
|
|
|
|
8504
|
1; |
1663
|
0
|
0
|
|
|
|
0
|
' or do { |
|
0
|
|
|
|
|
0
|
|
1664
|
0
|
|
|
|
|
0
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
1665
|
|
|
|
|
|
|
die "bayes: learner_new $module new() failed: $eval_stat\n"; |
1666
|
|
|
|
|
|
|
}; |
1667
|
62
|
|
|
|
|
488
|
|
1668
|
62
|
|
|
|
|
179
|
dbg("bayes: learner_new: got store=%s", $store); |
1669
|
|
|
|
|
|
|
$self->{store} = $store; |
1670
|
62
|
|
|
|
|
254
|
|
1671
|
|
|
|
|
|
|
$self; |
1672
|
|
|
|
|
|
|
} |
1673
|
|
|
|
|
|
|
|
1674
|
|
|
|
|
|
|
########################################################################### |
1675
|
|
|
|
|
|
|
|
1676
|
0
|
|
|
0
|
0
|
|
sub bayes_report_make_list { |
1677
|
0
|
0
|
|
|
|
|
my ($self, $pms, $info, $param) = @_; |
1678
|
|
|
|
|
|
|
return "Tokens not available." unless defined $info; |
1679
|
0
|
|
0
|
|
|
|
|
1680
|
|
|
|
|
|
|
my ($limit,$fmt_arg,$more) = split /,/, ($param || '5'); |
1681
|
0
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
my %formats = ( |
1683
|
|
|
|
|
|
|
short => '$t', |
1684
|
|
|
|
|
|
|
Short => 'Token: \"$t\"', |
1685
|
|
|
|
|
|
|
compact => '$p-$D--$t', |
1686
|
|
|
|
|
|
|
Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"', |
1687
|
|
|
|
|
|
|
medium => '$p-$D-$N--$t', |
1688
|
|
|
|
|
|
|
long => '$p-$d--${h}h-${s}s--${a}d--$t', |
1689
|
|
|
|
|
|
|
Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"' |
1690
|
|
|
|
|
|
|
); |
1691
|
0
|
0
|
|
|
|
|
|
1692
|
|
|
|
|
|
|
my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg}); |
1693
|
0
|
0
|
|
|
|
|
|
1694
|
|
|
|
|
|
|
return "Invalid format, must be one of: ".join(",",keys %formats) |
1695
|
|
|
|
|
|
|
unless defined $raw_fmt; |
1696
|
0
|
|
|
|
|
|
|
1697
|
0
|
0
|
|
|
|
|
my $fmt = '"'.$raw_fmt.'"'; |
1698
|
0
|
0
|
|
|
|
|
my $amt = $limit < @$info ? $limit : @$info; |
1699
|
|
|
|
|
|
|
return "" unless $amt; |
1700
|
0
|
|
|
|
|
|
|
1701
|
0
|
|
|
|
|
|
my $ns = $pms->{bayes_nspam}; |
1702
|
0
|
0
|
|
0
|
|
|
my $nh = $pms->{bayes_nham}; |
|
0
|
|
|
|
|
|
|
1703
|
0
|
|
|
|
|
|
my $digit = sub { $_[0] > 9 ? "+" : $_[0] }; |
1704
|
|
|
|
|
|
|
my $now = time; |
1705
|
|
|
|
|
|
|
|
1706
|
0
|
|
|
|
|
|
join ', ', map { |
1707
|
0
|
|
|
|
|
|
my($t,$prob,$s,$h,$u) = @$_; |
1708
|
0
|
|
|
|
|
|
my $a = int(($now - $u)/(3600 * 24)); |
1709
|
0
|
|
|
|
|
|
my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob); |
1710
|
0
|
|
|
|
|
|
my $p = sprintf "%.3f", $prob; |
1711
|
0
|
0
|
|
|
|
|
my $n = $s + $h; |
1712
|
0
|
|
|
|
|
|
my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h); |
1713
|
0
|
|
|
|
|
|
my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n); |
1714
|
0
|
|
|
|
|
|
eval $fmt; ## no critic |
|
0
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
} @{$info}[0..$amt-1]; |
1716
|
|
|
|
|
|
|
} |
1717
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
1; |