| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # <@LICENSE> | 
| 2 |  |  |  |  |  |  | # Licensed to the Apache Software Foundation (ASF) under one or more | 
| 3 |  |  |  |  |  |  | # contributor license agreements.  See the NOTICE file distributed with | 
| 4 |  |  |  |  |  |  | # this work for additional information regarding copyright ownership. | 
| 5 |  |  |  |  |  |  | # The ASF licenses this file to you under the Apache License, Version 2.0 | 
| 6 |  |  |  |  |  |  | # (the "License"); you may not use this file except in compliance with | 
| 7 |  |  |  |  |  |  | # the License.  You may obtain a copy of the License at: | 
| 8 |  |  |  |  |  |  | # | 
| 9 |  |  |  |  |  |  | #     http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  |  |  |  |  |  | # | 
| 11 |  |  |  |  |  |  | # Unless required by applicable law or agreed to in writing, software | 
| 12 |  |  |  |  |  |  | # distributed under the License is distributed on an "AS IS" BASIS, | 
| 13 |  |  |  |  |  |  | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
| 14 |  |  |  |  |  |  | # See the License for the specific language governing permissions and | 
| 15 |  |  |  |  |  |  | # limitations under the License. | 
| 16 |  |  |  |  |  |  | # </@LICENSE> | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | =head1 NAME | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | This is a Bayesian-style probabilistic classifier, using an algorithm based on | 
| 25 |  |  |  |  |  |  | the one detailed in Paul Graham's I<A Plan For Spam> paper at: | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | http://www.paulgraham.com/spam.html | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | It also incorporates some other aspects taken from Graham Robinson's webpage | 
| 30 |  |  |  |  |  |  | on the subject at: | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | And the chi-square probability combiner as described here: | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | http://www.linuxjournal.com/print.php?sid=6467 | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | The results are incorporated into SpamAssassin as the BAYES_* rules. | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | =head1 METHODS | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | =cut | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | use strict; | 
| 46 | 22 |  |  | 22 |  | 141 | use warnings; | 
|  | 22 |  |  |  |  | 44 |  | 
|  | 22 |  |  |  |  | 640 |  | 
| 47 | 22 |  |  | 22 |  | 107 | # use bytes; | 
|  | 22 |  |  |  |  | 39 |  | 
|  | 22 |  |  |  |  | 702 |  | 
| 48 |  |  |  |  |  |  | use re 'taint'; | 
| 49 | 22 |  |  | 22 |  | 124 |  | 
|  | 22 |  |  |  |  | 56 |  | 
|  | 22 |  |  |  |  | 1865 |  | 
| 50 |  |  |  |  |  |  | BEGIN { | 
| 51 |  |  |  |  |  |  | eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 } | 
| 52 | 22 |  |  |  |  | 114 | or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) } | 
|  | 22 |  |  |  |  | 1353 |  | 
|  | 22 |  |  |  |  | 554 |  | 
| 53 | 22 | 50 |  | 22 |  | 75 | } | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | use Mail::SpamAssassin; | 
| 56 | 22 |  |  | 22 |  | 124 | use Mail::SpamAssassin::Plugin; | 
|  | 22 |  |  |  |  | 46 |  | 
|  | 22 |  |  |  |  | 543 |  | 
| 57 | 22 |  |  | 22 |  | 108 | use Mail::SpamAssassin::PerMsgStatus; | 
|  | 22 |  |  |  |  | 60 |  | 
|  | 22 |  |  |  |  | 659 |  | 
| 58 | 22 |  |  | 22 |  | 126 | use Mail::SpamAssassin::Logger; | 
|  | 22 |  |  |  |  | 42 |  | 
|  | 22 |  |  |  |  | 599 |  | 
| 59 | 22 |  |  | 22 |  | 116 | use Mail::SpamAssassin::Util qw(untaint_var); | 
|  | 22 |  |  |  |  | 46 |  | 
|  | 22 |  |  |  |  | 1203 |  | 
| 60 | 22 |  |  | 22 |  | 195 |  | 
|  | 22 |  |  |  |  | 38 |  | 
|  | 22 |  |  |  |  | 996 |  | 
| 61 |  |  |  |  |  |  | # pick ONLY ONE of these combining implementations. | 
| 62 |  |  |  |  |  |  | use Mail::SpamAssassin::Bayes::CombineChi; | 
| 63 | 22 |  |  | 22 |  | 6640 | # use Mail::SpamAssassin::Bayes::CombineNaiveBayes; | 
|  | 22 |  |  |  |  | 57 |  | 
|  | 22 |  |  |  |  | 12009 |  | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | our @ISA = qw(Mail::SpamAssassin::Plugin); | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | # Which headers should we scan for tokens?  Don't use all of them, as it's easy | 
| 68 |  |  |  |  |  |  | # to pick up spurious clues from some.  What we now do is use all of them | 
| 69 |  |  |  |  |  |  | # *less* these well-known headers; that way we can pick up spammers' tracking | 
| 70 |  |  |  |  |  |  | # headers (which are obviously not well-known in advance!). | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | # Received is handled specially | 
| 73 |  |  |  |  |  |  | our $IGNORED_HDRS = qr{(?: (?:X-)?Sender    # misc noise | 
| 74 |  |  |  |  |  |  | |Delivered-To |Delivery-Date | 
| 75 |  |  |  |  |  |  | |(?:X-)?Envelope-To | 
| 76 |  |  |  |  |  |  | |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | |Subject      # not worth a tiny gain vs. to db size increase | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | # Date: can provide invalid cues if your spam corpus is | 
| 81 |  |  |  |  |  |  | # older/newer than ham | 
| 82 |  |  |  |  |  |  | |Date | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | # List headers: ignore. a spamfiltering mailing list will | 
| 85 |  |  |  |  |  |  | # become a nonspam sign. | 
| 86 |  |  |  |  |  |  | |X-List|(?:X-)?Mailing-List | 
| 87 |  |  |  |  |  |  | |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe | 
| 88 |  |  |  |  |  |  | |Unsubscribe|Host|Id|Manager|Admin|Comment | 
| 89 |  |  |  |  |  |  | |Name|Url) | 
| 90 |  |  |  |  |  |  | |X-Unsub(?:scribe)? | 
| 91 |  |  |  |  |  |  | |X-Mailman-Version |X-Been[Tt]here |X-Loop | 
| 92 |  |  |  |  |  |  | |Mail-Followup-To | 
| 93 |  |  |  |  |  |  | |X-eGroups-(?:Return|From) | 
| 94 |  |  |  |  |  |  | |X-MDMailing-List | 
| 95 |  |  |  |  |  |  | |X-XEmacs-List | 
| 96 |  |  |  |  |  |  | |X-Sympa-To | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | # gatewayed through mailing list (thanks to Allen Smith) | 
| 99 |  |  |  |  |  |  | |(?:X-)?Resent-(?:From|To|Date) | 
| 100 |  |  |  |  |  |  | |(?:X-)?Original-(?:From|To|Date) | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | # Spamfilter/virus-scanner headers: too easy to chain from | 
| 103 |  |  |  |  |  |  | # these | 
| 104 |  |  |  |  |  |  | |X-MailScanner(?:-SpamCheck)? | 
| 105 |  |  |  |  |  |  | |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? | 
| 106 |  |  |  |  |  |  | |X-Antispam |X-RBL-Warning |X-Mailscanner | 
| 107 |  |  |  |  |  |  | |X-MDaemon-Deliver-To |X-Virus-Scanned | 
| 108 |  |  |  |  |  |  | |X-Mass-Check-Id | 
| 109 |  |  |  |  |  |  | |X-Pyzor |X-DCC-\S{2,25}-Metrics | 
| 110 |  |  |  |  |  |  | |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner | 
| 111 |  |  |  |  |  |  | |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status | 
| 112 |  |  |  |  |  |  | |X-SpamCop-[^:]+ | 
| 113 |  |  |  |  |  |  | |X-SMTPD |(?:X-)?Spam-Apparently-To | 
| 114 |  |  |  |  |  |  | |SPAM |X-Perlmx-Spam | 
| 115 |  |  |  |  |  |  | |X-Bogosity | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | # some noisy Outlook headers that add no good clues: | 
| 118 |  |  |  |  |  |  | |Content-Class |Thread-(?:Index|Topic) | 
| 119 |  |  |  |  |  |  | |X-Original[Aa]rrival[Tt]ime | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | # Annotations from IMAP, POP, and MH: | 
| 122 |  |  |  |  |  |  | |(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded | 
| 123 |  |  |  |  |  |  | |Lines |Content-Length | 
| 124 |  |  |  |  |  |  | |X-UIDL? |X-IMAPbase | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  | # Annotations from Bugzilla | 
| 127 |  |  |  |  |  |  | |X-Bugzilla-[^:]+ | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | # Annotations from VM: (thanks to Allen Smith) | 
| 130 |  |  |  |  |  |  | |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified | 
| 131 |  |  |  |  |  |  | |Summary-Format|VHeader|v\d-Data|Message-Order) | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | # Annotations from Gnus: | 
| 134 |  |  |  |  |  |  | | X-Gnus-Mail-Source | 
| 135 |  |  |  |  |  |  | | Xref | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | )}x; | 
| 138 |  |  |  |  |  |  |  | 
| 139 |  |  |  |  |  |  | # Note only the presence of these headers, in order to reduce the | 
| 140 |  |  |  |  |  |  | # hapaxen they generate. | 
| 141 |  |  |  |  |  |  | our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face | 
| 142 |  |  |  |  |  |  | |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint | 
| 143 |  |  |  |  |  |  | |D(?:KIM|omainKey)-Signature | 
| 144 |  |  |  |  |  |  | )}ix; | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | # tweaks tested as of Nov 18 2002 by jm posted to -devel at | 
| 147 |  |  |  |  |  |  | # http://sourceforge.net/p/spamassassin/mailman/message/12977556/ | 
| 148 |  |  |  |  |  |  | # for results.  The winners are now the default settings. | 
| 149 |  |  |  |  |  |  | use constant IGNORE_TITLE_CASE => 1; | 
| 150 | 22 |  |  | 22 |  | 178 | use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0; | 
|  | 22 |  |  |  |  | 51 |  | 
|  | 22 |  |  |  |  | 1074 |  | 
| 151 | 22 |  |  | 22 |  | 113 | use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1; | 
|  | 22 |  |  |  |  | 41 |  | 
|  | 22 |  |  |  |  | 827 |  | 
| 152 | 22 |  |  | 22 |  | 135 | use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; | 
|  | 22 |  |  |  |  | 43 |  | 
|  | 22 |  |  |  |  | 791 |  | 
| 153 | 22 |  |  | 22 |  | 111 |  | 
|  | 22 |  |  |  |  | 39 |  | 
|  | 22 |  |  |  |  | 811 |  | 
| 154 |  |  |  |  |  |  | # tweaks by jm on May 12 2003, see -devel email at | 
| 155 |  |  |  |  |  |  | # http://sourceforge.net/p/spamassassin/mailman/message/14844556/ | 
| 156 |  |  |  |  |  |  | use constant PRE_CHEW_ADDR_HEADERS => 1; | 
| 157 | 22 |  |  | 22 |  | 150 | use constant CHEW_BODY_URIS => 1; | 
|  | 22 |  |  |  |  | 40 |  | 
|  | 22 |  |  |  |  | 758 |  | 
| 158 | 22 |  |  | 22 |  | 102 | use constant CHEW_BODY_MAILADDRS => 1; | 
|  | 22 |  |  |  |  | 40 |  | 
|  | 22 |  |  |  |  | 763 |  | 
| 159 | 22 |  |  | 22 |  | 105 | use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; | 
|  | 22 |  |  |  |  | 37 |  | 
|  | 22 |  |  |  |  | 755 |  | 
| 160 | 22 |  |  | 22 |  | 105 | use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; | 
|  | 22 |  |  |  |  | 44 |  | 
|  | 22 |  |  |  |  | 725 |  | 
| 161 | 22 |  |  | 22 |  | 105 | use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; | 
|  | 22 |  |  |  |  | 41 |  | 
|  | 22 |  |  |  |  | 709 |  | 
| 162 | 22 |  |  | 22 |  | 104 | use constant IGNORE_MSGID_TOKENS => 0; | 
|  | 22 |  |  |  |  | 37 |  | 
|  | 22 |  |  |  |  | 695 |  | 
| 163 | 22 |  |  | 22 |  | 100 |  | 
|  | 22 |  |  |  |  | 38 |  | 
|  | 22 |  |  |  |  | 708 |  | 
| 164 |  |  |  |  |  |  | # tweaks of 12 March 2004, see bug 2129. | 
| 165 |  |  |  |  |  |  | use constant DECOMPOSE_BODY_TOKENS => 1; | 
| 166 | 22 |  |  | 22 |  | 100 | use constant MAP_HEADERS_MID => 1; | 
|  | 22 |  |  |  |  | 41 |  | 
|  | 22 |  |  |  |  | 732 |  | 
| 167 | 22 |  |  | 22 |  | 102 | use constant MAP_HEADERS_FROMTOCC => 1; | 
|  | 22 |  |  |  |  | 40 |  | 
|  | 22 |  |  |  |  | 758 |  | 
| 168 | 22 |  |  | 22 |  | 103 | use constant MAP_HEADERS_USERAGENT => 1; | 
|  | 22 |  |  |  |  | 40 |  | 
|  | 22 |  |  |  |  | 705 |  | 
| 169 | 22 |  |  | 22 |  | 99 |  | 
|  | 22 |  |  |  |  | 41 |  | 
|  | 22 |  |  |  |  | 794 |  | 
| 170 |  |  |  |  |  |  | # tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 | 
| 171 |  |  |  |  |  |  | use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; | 
| 172 | 22 |  |  | 22 |  | 111 | use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; | 
|  | 22 |  |  |  |  | 40 |  | 
|  | 22 |  |  |  |  | 851 |  | 
| 173 | 22 |  |  | 22 |  | 126 |  | 
|  | 22 |  |  |  |  | 40 |  | 
|  | 22 |  |  |  |  | 2432 |  | 
| 174 |  |  |  |  |  |  | # We store header-mined tokens in the db with a "HHeaderName:val" format. | 
| 175 |  |  |  |  |  |  | # some headers may contain lots of gibberish tokens, so allow a little basic | 
| 176 |  |  |  |  |  |  | # compression by mapping the header name at least here.  these are the headers | 
| 177 |  |  |  |  |  |  | # which appear with the most frequency in my db.  note: this doesn't have to | 
| 178 |  |  |  |  |  |  | # be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing | 
| 179 |  |  |  |  |  |  | # tokens from multiple different headers may impact accuracy, so might as well | 
| 180 |  |  |  |  |  |  | # avoid this if possible. These are the top ones from my corpus, BTW (jm). | 
| 181 |  |  |  |  |  |  | our %HEADER_NAME_COMPRESSION = ( | 
| 182 |  |  |  |  |  |  | 'Message-Id'		=> '*m', | 
| 183 |  |  |  |  |  |  | 'Message-ID'		=> '*M', | 
| 184 |  |  |  |  |  |  | 'Received'		=> '*r', | 
| 185 |  |  |  |  |  |  | 'User-Agent'		=> '*u', | 
| 186 |  |  |  |  |  |  | 'References'		=> '*f', | 
| 187 |  |  |  |  |  |  | 'In-Reply-To'		=> '*i', | 
| 188 |  |  |  |  |  |  | 'From'		=> '*F', | 
| 189 |  |  |  |  |  |  | 'Reply-To'		=> '*R', | 
| 190 |  |  |  |  |  |  | 'Return-Path'		=> '*p', | 
| 191 |  |  |  |  |  |  | 'Return-path'		=> '*rp', | 
| 192 |  |  |  |  |  |  | 'X-Mailer'		=> '*x', | 
| 193 |  |  |  |  |  |  | 'X-Authentication-Warning' => '*a', | 
| 194 |  |  |  |  |  |  | 'Organization'	=> '*o', | 
| 195 |  |  |  |  |  |  | 'Organisation'        => '*o', | 
| 196 |  |  |  |  |  |  | 'Content-Type'	=> '*ct', | 
| 197 |  |  |  |  |  |  | 'Content-Disposition'	=> '*cd', | 
| 198 |  |  |  |  |  |  | 'Content-Transfer-Encoding' => '*ce', | 
| 199 |  |  |  |  |  |  | 'x-spam-relays-trusted' => '*RT', | 
| 200 |  |  |  |  |  |  | 'x-spam-relays-untrusted' => '*RU', | 
| 201 |  |  |  |  |  |  | ); | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | # How many seconds should the opportunistic_expire lock be valid? | 
| 204 |  |  |  |  |  |  | our $OPPORTUNISTIC_LOCK_VALID = 300; | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | # Should we use the Robinson f(w) equation from | 
| 207 |  |  |  |  |  |  | # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ? | 
| 208 |  |  |  |  |  |  | # It gives better results, in that scores are more likely to distribute | 
| 209 |  |  |  |  |  |  | # into the <0.5 range for nonspam and >0.5 for spam. | 
| 210 |  |  |  |  |  |  | use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1; | 
| 211 | 22 |  |  | 22 |  | 143 |  | 
|  | 22 |  |  |  |  | 43 |  | 
|  | 22 |  |  |  |  | 1034 |  | 
| 212 |  |  |  |  |  |  | # How many of the most significant tokens should we use for the p(w) | 
| 213 |  |  |  |  |  |  | # calculation? | 
| 214 |  |  |  |  |  |  | use constant N_SIGNIFICANT_TOKENS => 150; | 
| 215 | 22 |  |  | 22 |  | 123 |  | 
|  | 22 |  |  |  |  | 38 |  | 
|  | 22 |  |  |  |  | 1044 |  | 
| 216 |  |  |  |  |  |  | # How many significant tokens are required for a classifier score to | 
| 217 |  |  |  |  |  |  | # be considered usable? | 
| 218 |  |  |  |  |  |  | use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; | 
| 219 | 22 |  |  | 22 |  | 121 |  | 
|  | 22 |  |  |  |  | 47 |  | 
|  | 22 |  |  |  |  | 1021 |  | 
| 220 |  |  |  |  |  |  | # How long a token should we hold onto?  (note: German speakers typically | 
| 221 |  |  |  |  |  |  | # will require a longer token than English ones.) | 
| 222 |  |  |  |  |  |  | use constant MAX_TOKEN_LENGTH => 15; | 
| 223 | 22 |  |  | 22 |  | 127 |  | 
|  | 22 |  |  |  |  | 39 |  | 
|  | 22 |  |  |  |  | 154837 |  | 
| 224 |  |  |  |  |  |  | ########################################################################### | 
| 225 |  |  |  |  |  |  |  | 
| 226 |  |  |  |  |  |  | my $class = shift; | 
| 227 |  |  |  |  |  |  | my ($main) = @_; | 
| 228 | 63 |  |  | 63 | 1 | 218 |  | 
| 229 | 63 |  |  |  |  | 200 | $class = ref($class) || $class; | 
| 230 |  |  |  |  |  |  | my $self = $class->SUPER::new($main); | 
| 231 | 63 |  | 33 |  |  | 420 | bless ($self, $class); | 
| 232 | 63 |  |  |  |  | 348 |  | 
| 233 | 63 |  |  |  |  | 176 | $self->{main} = $main; | 
| 234 |  |  |  |  |  |  | $self->{conf} = $main->{conf}; | 
| 235 | 63 |  |  |  |  | 218 | $self->{use_ignores} = 1; | 
| 236 | 63 |  |  |  |  | 158 |  | 
| 237 | 63 |  |  |  |  | 180 | $self->register_eval_rule("check_bayes"); | 
| 238 |  |  |  |  |  |  | $self; | 
| 239 | 63 |  |  |  |  | 263 | } | 
| 240 | 63 |  |  |  |  | 519 |  | 
| 241 |  |  |  |  |  |  | my $self = shift; | 
| 242 |  |  |  |  |  |  | if ($self->{store}) { | 
| 243 |  |  |  |  |  |  | $self->{store}->untie_db(); | 
| 244 | 40 |  |  | 40 | 1 | 113 | } | 
| 245 | 40 | 100 |  |  |  | 171 | %{$self} = (); | 
| 246 | 39 |  |  |  |  | 259 | } | 
| 247 |  |  |  |  |  |  |  | 
| 248 | 40 |  |  |  |  | 88 | ########################################################################### | 
|  | 40 |  |  |  |  | 242 |  | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | # Plugin hook. | 
| 251 |  |  |  |  |  |  | # Return this implementation object, for callers that need to know | 
| 252 |  |  |  |  |  |  | # it.  TODO: callers shouldn't *need* to know it! | 
| 253 |  |  |  |  |  |  | # used only in test suite to get access to {store}, internal APIs. | 
| 254 |  |  |  |  |  |  | # | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | ########################################################################### | 
| 257 |  |  |  |  |  |  |  | 
| 258 | 1160 |  |  | 1160 | 0 | 1838 | # Plugin hook. | 
| 259 |  |  |  |  |  |  | # Called in the parent process shortly before forking off child processes. | 
| 260 |  |  |  |  |  |  | my ($self) = @_; | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) { | 
| 263 |  |  |  |  |  |  | $self->{store}->prefork_init; | 
| 264 |  |  |  |  |  |  | } | 
| 265 | 0 |  |  | 0 | 0 | 0 | } | 
| 266 |  |  |  |  |  |  |  | 
| 267 | 0 | 0 | 0 |  |  | 0 | ########################################################################### | 
| 268 | 0 |  |  |  |  | 0 |  | 
| 269 |  |  |  |  |  |  | # Plugin hook. | 
| 270 |  |  |  |  |  |  | # Called in a child process shortly after being spawned. | 
| 271 |  |  |  |  |  |  | my ($self) = @_; | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) { | 
| 274 |  |  |  |  |  |  | $self->{store}->spamd_child_init; | 
| 275 |  |  |  |  |  |  | } | 
| 276 |  |  |  |  |  |  | } | 
| 277 | 0 |  |  | 0 | 1 | 0 |  | 
| 278 |  |  |  |  |  |  | ########################################################################### | 
| 279 | 0 | 0 | 0 |  |  | 0 |  | 
| 280 | 0 |  |  |  |  | 0 | # Plugin hook. | 
| 281 |  |  |  |  |  |  | my ($self, $pms, $fulltext, $min, $max) = @_; | 
| 282 |  |  |  |  |  |  |  | 
| 283 |  |  |  |  |  |  | return 0 if (!$self->{conf}->{use_learner}); | 
| 284 |  |  |  |  |  |  | return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules}); | 
| 285 |  |  |  |  |  |  |  | 
| 286 |  |  |  |  |  |  | if (!exists ($pms->{bayes_score})) { | 
| 287 |  |  |  |  |  |  | my $timer = $self->{main}->time_method("check_bayes"); | 
| 288 | 0 |  |  | 0 | 0 | 0 | $pms->{bayes_score} = $self->scan($pms, $pms->{msg}); | 
| 289 |  |  |  |  |  |  | } | 
| 290 | 0 | 0 |  |  |  | 0 |  | 
| 291 | 0 | 0 | 0 |  |  | 0 | if (defined $pms->{bayes_score} && | 
| 292 |  |  |  |  |  |  | ($min == 0 || $pms->{bayes_score} > $min) && | 
| 293 | 0 | 0 |  |  |  | 0 | ($max eq "undef" || $pms->{bayes_score} <= $max)) | 
| 294 | 0 |  |  |  |  | 0 | { | 
| 295 | 0 |  |  |  |  | 0 | if ($self->{conf}->{detailed_bayes_score}) { | 
| 296 |  |  |  |  |  |  | $pms->test_log(sprintf ("score: %3.4f, hits: %s", | 
| 297 |  |  |  |  |  |  | $pms->{bayes_score}, | 
| 298 | 0 | 0 | 0 |  |  | 0 | $pms->{bayes_hits})); | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 299 |  |  |  |  |  |  | } | 
| 300 |  |  |  |  |  |  | else { | 
| 301 |  |  |  |  |  |  | $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score})); | 
| 302 | 0 | 0 |  |  |  | 0 | } | 
| 303 |  |  |  |  |  |  | return 1; | 
| 304 |  |  |  |  |  |  | } | 
| 305 | 0 |  |  |  |  | 0 |  | 
| 306 |  |  |  |  |  |  | return 0; | 
| 307 |  |  |  |  |  |  | } | 
| 308 | 0 |  |  |  |  | 0 |  | 
| 309 |  |  |  |  |  |  | ########################################################################### | 
| 310 | 0 |  |  |  |  | 0 |  | 
| 311 |  |  |  |  |  |  | # Plugin hook. | 
| 312 |  |  |  |  |  |  | my ($self, $params) = @_; | 
| 313 | 0 |  |  |  |  | 0 | my $quiet = $params->{quiet}; | 
| 314 |  |  |  |  |  |  |  | 
| 315 |  |  |  |  |  |  | # do a sanity check here.  Weird things happen if we remain tied | 
| 316 |  |  |  |  |  |  | # after compiling; for example, spamd will never see that the | 
| 317 |  |  |  |  |  |  | # number of messages has reached the bayes-scanning threshold. | 
| 318 |  |  |  |  |  |  | if ($self->{store}->db_readable()) { | 
| 319 |  |  |  |  |  |  | warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet; | 
| 320 | 6 |  |  | 6 | 1 | 13 | $self->{store}->untie_db(); | 
| 321 | 6 |  |  |  |  | 9 | } | 
| 322 |  |  |  |  |  |  | } | 
| 323 |  |  |  |  |  |  |  | 
| 324 |  |  |  |  |  |  | ########################################################################### | 
| 325 |  |  |  |  |  |  |  | 
| 326 | 6 | 50 |  |  |  | 20 | # read configuration items to control bayes behaviour.  Called by | 
| 327 | 0 | 0 |  |  |  | 0 | # BayesStore::read_db_configs(). | 
| 328 | 0 |  |  |  |  | 0 | my ($self) = @_; | 
| 329 |  |  |  |  |  |  |  | 
| 330 |  |  |  |  |  |  | # use of hapaxes.  Set on bayes object, since it controls prob | 
| 331 |  |  |  |  |  |  | # computation. | 
| 332 |  |  |  |  |  |  | $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes}; | 
| 333 |  |  |  |  |  |  | } | 
| 334 |  |  |  |  |  |  | ########################################################################### | 
| 335 |  |  |  |  |  |  |  | 
| 336 |  |  |  |  |  |  | my ($self,$PMS) = @_; | 
| 337 | 42 |  |  | 42 | 0 | 100 |  | 
| 338 |  |  |  |  |  |  | return 0 unless $self->{use_ignores}; | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  | my $ig_from = $self->{main}->call_plugins ("check_wb_list", | 
| 341 | 42 |  |  |  |  | 160 | { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); | 
| 342 |  |  |  |  |  |  | my $ig_to = $self->{main}->call_plugins ("check_wb_list", | 
| 343 |  |  |  |  |  |  | { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | my $ignore = $ig_from || $ig_to; | 
| 346 | 0 |  |  | 0 | 0 | 0 |  | 
| 347 |  |  |  |  |  |  | dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore; | 
| 348 | 0 | 0 |  |  |  | 0 |  | 
| 349 |  |  |  |  |  |  | return $ignore; | 
| 350 | 0 |  |  |  |  | 0 | } | 
| 351 |  |  |  |  |  |  |  | 
| 352 | 0 |  |  |  |  | 0 | ########################################################################### | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  | # Plugin hook. | 
| 355 | 0 |  | 0 |  |  | 0 | my ($self, $params) = @_; | 
| 356 |  |  |  |  |  |  | my $isspam = $params->{isspam}; | 
| 357 | 0 | 0 |  |  |  | 0 | my $msg = $params->{msg}; | 
| 358 |  |  |  |  |  |  | my $id = $params->{id}; | 
| 359 | 0 |  |  |  |  | 0 |  | 
| 360 |  |  |  |  |  |  | if (!$self->{conf}->{use_bayes}) { return; } | 
| 361 |  |  |  |  |  |  |  | 
| 362 |  |  |  |  |  |  | my $msgdata = $self->get_body_from_msg ($msg); | 
| 363 |  |  |  |  |  |  | my $ret; | 
| 364 |  |  |  |  |  |  |  | 
| 365 |  |  |  |  |  |  | eval { | 
| 366 | 10 |  |  | 10 | 1 | 28 | local $SIG{'__DIE__'};	# do not run user die() traps in here | 
| 367 | 10 |  |  |  |  | 27 | my $timer = $self->{main}->time_method("b_learn"); | 
| 368 | 10 |  |  |  |  | 18 |  | 
| 369 | 10 |  |  |  |  | 21 | my $ok; | 
| 370 |  |  |  |  |  |  | if ($self->{main}->{learn_to_journal}) { | 
| 371 | 10 | 50 |  |  |  | 46 | # If we're going to learn to journal, we'll try going r/o first... | 
|  | 0 |  |  |  |  | 0 |  | 
| 372 |  |  |  |  |  |  | # If that fails for some reason, let's try going r/w.  This happens | 
| 373 | 10 |  |  |  |  | 82 | # if the DB doesn't exist yet. | 
| 374 | 10 |  |  |  |  | 26 | $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); | 
| 375 |  |  |  |  |  |  | } else { | 
| 376 |  |  |  |  |  |  | $ok = $self->{store}->tie_db_writable(); | 
| 377 | 10 |  |  |  |  | 76 | } | 
| 378 | 10 |  |  |  |  | 57 |  | 
| 379 |  |  |  |  |  |  | if ($ok) { | 
| 380 | 10 |  |  |  |  | 24 | $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id); | 
| 381 | 10 | 100 |  |  |  | 36 |  | 
| 382 |  |  |  |  |  |  | if (!$self->{main}->{learn_caller_will_untie}) { | 
| 383 |  |  |  |  |  |  | $self->{store}->untie_db(); | 
| 384 |  |  |  |  |  |  | } | 
| 385 | 2 |  | 33 |  |  | 18 | } | 
| 386 |  |  |  |  |  |  | 1; | 
| 387 | 8 |  |  |  |  | 63 | } or do {		# if we died, untie the dbs. | 
| 388 |  |  |  |  |  |  | my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat; | 
| 389 |  |  |  |  |  |  | $self->{store}->untie_db(); | 
| 390 | 10 | 100 |  |  |  | 36 | die "bayes: (in learn) $eval_stat\n"; | 
| 391 | 8 |  |  |  |  | 50 | }; | 
| 392 |  |  |  |  |  |  |  | 
| 393 | 8 | 50 |  |  |  | 50 | return $ret; | 
| 394 | 8 |  |  |  |  | 59 | } | 
| 395 |  |  |  |  |  |  |  | 
| 396 |  |  |  |  |  |  | # this function is trapped by the wrapper above | 
| 397 | 10 |  |  |  |  | 97 | my ($self, $isspam, $msg, $msgdata, $msgid) = @_; | 
| 398 | 10 | 50 |  |  |  | 19 | my @msgid = ( $msgid ); | 
| 399 | 0 | 0 |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 400 | 0 |  |  |  |  | 0 | if (!defined $msgid) { | 
| 401 | 0 |  |  |  |  | 0 | @msgid = $self->get_msgid($msg); | 
| 402 |  |  |  |  |  |  | } | 
| 403 |  |  |  |  |  |  |  | 
| 404 | 10 |  |  |  |  | 69 | foreach my $msgid_t ( @msgid ) { | 
| 405 |  |  |  |  |  |  | my $seen = $self->{store}->seen_get ($msgid_t); | 
| 406 |  |  |  |  |  |  |  | 
| 407 |  |  |  |  |  |  | if (defined ($seen)) { | 
| 408 |  |  |  |  |  |  | if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { | 
| 409 | 8 |  |  | 8 |  | 30 | dbg("bayes: $msgid_t already learnt correctly, not learning twice"); | 
| 410 | 8 |  |  |  |  | 31 | return 0; | 
| 411 |  |  |  |  |  |  | } elsif ($seen !~ /^[hs]$/) { | 
| 412 | 8 | 50 |  |  |  | 25 | warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored"); | 
| 413 | 8 |  |  |  |  | 48 | } else { | 
| 414 |  |  |  |  |  |  | # bug 3704: If the message was already learned, don't try learning it again. | 
| 415 |  |  |  |  |  |  | # this prevents, for instance, manually learning as spam, then autolearning | 
| 416 | 8 |  |  |  |  | 36 | # as ham, or visa versa. | 
| 417 | 12 |  |  |  |  | 73 | if ($self->{main}->{learn_no_relearn}) { | 
| 418 |  |  |  |  |  |  | dbg("bayes: $msgid_t already learnt as opposite, not re-learning"); | 
| 419 | 12 | 100 |  |  |  | 66 | return 0; | 
| 420 | 4 | 100 | 66 |  |  | 83 | } | 
|  |  | 50 | 33 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
| 421 | 2 |  |  |  |  | 17 |  | 
| 422 | 2 |  |  |  |  | 10 | dbg("bayes: $msgid_t already learnt as opposite, forgetting first"); | 
| 423 |  |  |  |  |  |  |  | 
| 424 | 0 |  |  |  |  | 0 | # kluge so that forget() won't untie the db on us ... | 
| 425 |  |  |  |  |  |  | my $orig = $self->{main}->{learn_caller_will_untie}; | 
| 426 |  |  |  |  |  |  | $self->{main}->{learn_caller_will_untie} = 1; | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg); | 
| 429 | 2 | 50 |  |  |  | 13 |  | 
| 430 | 0 |  |  |  |  | 0 | # reset the value post-forget() ... | 
| 431 | 0 |  |  |  |  | 0 | $self->{main}->{learn_caller_will_untie} = $orig; | 
| 432 |  |  |  |  |  |  |  | 
| 433 |  |  |  |  |  |  | # forget() gave us a fatal error, so propagate that up | 
| 434 | 2 |  |  |  |  | 18 | if ($fatal) { | 
| 435 |  |  |  |  |  |  | dbg("bayes: forget() returned a fatal error, so learn() will too"); | 
| 436 |  |  |  |  |  |  | return; | 
| 437 | 2 |  |  |  |  | 13 | } | 
| 438 | 2 |  |  |  |  | 7 | } | 
| 439 |  |  |  |  |  |  |  | 
| 440 | 2 |  |  |  |  | 18 | # we're only going to have seen this once, so stop if it's been | 
| 441 |  |  |  |  |  |  | # seen already | 
| 442 |  |  |  |  |  |  | last; | 
| 443 | 2 |  |  |  |  | 12 | } | 
| 444 |  |  |  |  |  |  | } | 
| 445 |  |  |  |  |  |  |  | 
| 446 | 2 | 50 |  |  |  | 11 | # Now that we're sure we haven't seen this message before ... | 
| 447 | 0 |  |  |  |  | 0 | $msgid = $msgid[0]; | 
| 448 | 0 |  |  |  |  | 0 |  | 
| 449 |  |  |  |  |  |  | my $msgatime = $msg->receive_date(); | 
| 450 |  |  |  |  |  |  |  | 
| 451 |  |  |  |  |  |  | # If the message atime comes back as being more than 1 day in the | 
| 452 |  |  |  |  |  |  | # future, something's messed up and we should revert to current time as | 
| 453 |  |  |  |  |  |  | # a safety measure. | 
| 454 | 2 |  |  |  |  | 11 | # | 
| 455 |  |  |  |  |  |  | $msgatime = time if ( $msgatime - time > 86400 ); | 
| 456 |  |  |  |  |  |  |  | 
| 457 |  |  |  |  |  |  | my $tokens = $self->tokenize($msg, $msgdata); | 
| 458 |  |  |  |  |  |  |  | 
| 459 | 6 |  |  |  |  | 22 | { my $timer = $self->{main}->time_method('b_count_change'); | 
| 460 |  |  |  |  |  |  | if ($isspam) { | 
| 461 | 6 |  |  |  |  | 53 | $self->{store}->nspam_nham_change(1, 0); | 
| 462 |  |  |  |  |  |  | $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); | 
| 463 |  |  |  |  |  |  | } else { | 
| 464 |  |  |  |  |  |  | $self->{store}->nspam_nham_change(0, 1); | 
| 465 |  |  |  |  |  |  | $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); | 
| 466 |  |  |  |  |  |  | } | 
| 467 | 6 | 50 |  |  |  | 35 | } | 
| 468 |  |  |  |  |  |  |  | 
| 469 | 6 |  |  |  |  | 43 | $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); | 
| 470 |  |  |  |  |  |  | $self->{store}->cleanup(); | 
| 471 | 6 |  |  |  |  | 18 |  | 
|  | 6 |  |  |  |  | 67 |  | 
| 472 | 6 | 100 |  |  |  | 31 | $self->{main}->call_plugins("bayes_learn", { toksref => $tokens, | 
| 473 | 4 |  |  |  |  | 61 | isspam => $isspam, | 
| 474 | 4 |  |  |  |  | 44 | msgid => $msgid, | 
| 475 |  |  |  |  |  |  | msgatime => $msgatime, | 
| 476 | 2 |  |  |  |  | 19 | }); | 
| 477 | 2 |  |  |  |  | 19 |  | 
| 478 |  |  |  |  |  |  | dbg("bayes: learned '$msgid', atime: $msgatime"); | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  | 1; | 
| 481 | 6 | 100 |  |  |  | 240 | } | 
| 482 | 6 |  |  |  |  | 61 |  | 
| 483 |  |  |  |  |  |  | ########################################################################### | 
| 484 | 6 |  |  |  |  | 120 |  | 
| 485 |  |  |  |  |  |  | # Plugin hook. | 
| 486 |  |  |  |  |  |  | my ($self, $params) = @_; | 
| 487 |  |  |  |  |  |  | my $msg = $params->{msg}; | 
| 488 |  |  |  |  |  |  | my $id = $params->{id}; | 
| 489 |  |  |  |  |  |  |  | 
| 490 | 6 |  |  |  |  | 92 | if (!$self->{conf}->{use_bayes}) { return; } | 
| 491 |  |  |  |  |  |  |  | 
| 492 | 6 |  |  |  |  | 567 | my $msgdata = $self->get_body_from_msg ($msg); | 
| 493 |  |  |  |  |  |  | my $ret; | 
| 494 |  |  |  |  |  |  |  | 
| 495 |  |  |  |  |  |  | # we still tie for writing here, since we write to the seen db | 
| 496 |  |  |  |  |  |  | # synchronously | 
| 497 |  |  |  |  |  |  | eval { | 
| 498 |  |  |  |  |  |  | local $SIG{'__DIE__'};	# do not run user die() traps in here | 
| 499 | 4 |  |  | 4 | 1 | 14 | my $timer = $self->{main}->time_method("b_learn"); | 
| 500 | 4 |  |  |  |  | 10 |  | 
| 501 | 4 |  |  |  |  | 9 | my $ok; | 
| 502 |  |  |  |  |  |  | if ($self->{main}->{learn_to_journal}) { | 
| 503 | 4 | 50 |  |  |  | 21 | # If we're going to learn to journal, we'll try going r/o first... | 
|  | 0 |  |  |  |  | 0 |  | 
| 504 |  |  |  |  |  |  | # If that fails for some reason, let's try going r/w.  This happens | 
| 505 | 4 |  |  |  |  | 23 | # if the DB doesn't exist yet. | 
| 506 | 4 |  |  |  |  | 11 | $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); | 
| 507 |  |  |  |  |  |  | } else { | 
| 508 |  |  |  |  |  |  | $ok = $self->{store}->tie_db_writable(); | 
| 509 |  |  |  |  |  |  | } | 
| 510 |  |  |  |  |  |  |  | 
| 511 | 4 |  |  |  |  | 27 | if ($ok) { | 
| 512 | 4 |  |  |  |  | 23 | $ret = $self->_forget_trapped ($msg, $msgdata, $id); | 
| 513 |  |  |  |  |  |  |  | 
| 514 | 4 |  |  |  |  | 8 | if (!$self->{main}->{learn_caller_will_untie}) { | 
| 515 | 4 | 50 |  |  |  | 24 | $self->{store}->untie_db(); | 
| 516 |  |  |  |  |  |  | } | 
| 517 |  |  |  |  |  |  | } | 
| 518 |  |  |  |  |  |  | 1; | 
| 519 | 0 |  | 0 |  |  | 0 | } or do {		# if we died, untie the dbs. | 
| 520 |  |  |  |  |  |  | my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat; | 
| 521 | 4 |  |  |  |  | 32 | $self->{store}->untie_db(); | 
| 522 |  |  |  |  |  |  | die "bayes: (in forget) $eval_stat\n"; | 
| 523 |  |  |  |  |  |  | }; | 
| 524 | 4 | 50 |  |  |  | 17 |  | 
| 525 | 4 |  |  |  |  | 25 | return $ret; | 
| 526 |  |  |  |  |  |  | } | 
| 527 | 4 | 100 |  |  |  | 26 |  | 
| 528 | 2 |  |  |  |  | 17 | # this function is trapped by the wrapper above | 
| 529 |  |  |  |  |  |  | my ($self, $msg, $msgdata, $msgid) = @_; | 
| 530 |  |  |  |  |  |  | my @msgid = ( $msgid ); | 
| 531 | 4 |  |  |  |  | 43 | my $isspam; | 
| 532 | 4 | 50 |  |  |  | 10 |  | 
| 533 | 0 | 0 |  |  |  | 0 | if (!defined $msgid) { | 
|  | 0 |  |  |  |  | 0 |  | 
| 534 | 0 |  |  |  |  | 0 | @msgid = $self->get_msgid($msg); | 
| 535 | 0 |  |  |  |  | 0 | } | 
| 536 |  |  |  |  |  |  |  | 
| 537 |  |  |  |  |  |  | while( $msgid = shift @msgid ) { | 
| 538 | 4 |  |  |  |  | 33 | my $seen = $self->{store}->seen_get ($msgid); | 
| 539 |  |  |  |  |  |  |  | 
| 540 |  |  |  |  |  |  | if (defined ($seen)) { | 
| 541 |  |  |  |  |  |  | if ($seen eq 's') { | 
| 542 |  |  |  |  |  |  | $isspam = 1; | 
| 543 | 4 |  |  | 4 |  | 16 | } elsif ($seen eq 'h') { | 
| 544 | 4 |  |  |  |  | 16 | $isspam = 0; | 
| 545 | 4 |  |  |  |  | 9 | } else { | 
| 546 |  |  |  |  |  |  | dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); | 
| 547 | 4 | 50 |  |  |  | 16 | return 0; | 
| 548 | 4 |  |  |  |  | 21 | } | 
| 549 |  |  |  |  |  |  |  | 
| 550 |  |  |  |  |  |  | # messages should only be learned once, so stop if we find a msgid | 
| 551 | 4 |  |  |  |  | 24 | # which was seen before | 
| 552 | 4 |  |  |  |  | 27 | last; | 
| 553 |  |  |  |  |  |  | } | 
| 554 | 4 | 50 |  |  |  | 26 | else { | 
| 555 | 4 | 100 |  |  |  | 25 | dbg("bayes: forget: msgid $msgid not learnt, ignored"); | 
|  |  | 50 |  |  |  |  |  | 
| 556 | 2 |  |  |  |  | 7 | } | 
| 557 |  |  |  |  |  |  | } | 
| 558 | 2 |  |  |  |  | 7 |  | 
| 559 |  |  |  |  |  |  | # This message wasn't learnt before, so return | 
| 560 | 0 |  |  |  |  | 0 | if (!defined $isspam) { | 
| 561 | 0 |  |  |  |  | 0 | dbg("bayes: forget: no msgid from this message has been learnt, skipping message"); | 
| 562 |  |  |  |  |  |  | return 0; | 
| 563 |  |  |  |  |  |  | } | 
| 564 |  |  |  |  |  |  | elsif ($isspam) { | 
| 565 |  |  |  |  |  |  | $self->{store}->nspam_nham_change (-1, 0); | 
| 566 | 4 |  |  |  |  | 12 | } | 
| 567 |  |  |  |  |  |  | else { | 
| 568 |  |  |  |  |  |  | $self->{store}->nspam_nham_change (0, -1); | 
| 569 | 0 |  |  |  |  | 0 | } | 
| 570 |  |  |  |  |  |  |  | 
| 571 |  |  |  |  |  |  | my $tokens = $self->tokenize($msg, $msgdata); | 
| 572 |  |  |  |  |  |  |  | 
| 573 |  |  |  |  |  |  | if ($isspam) { | 
| 574 | 4 | 50 |  |  |  | 24 | $self->{store}->multi_tok_count_change (-1, 0, $tokens); | 
|  |  | 100 |  |  |  |  |  | 
| 575 | 0 |  |  |  |  | 0 | } else { | 
| 576 | 0 |  |  |  |  | 0 | $self->{store}->multi_tok_count_change (0, -1, $tokens); | 
| 577 |  |  |  |  |  |  | } | 
| 578 |  |  |  |  |  |  |  | 
| 579 | 2 |  |  |  |  | 13 | $self->{store}->seen_delete ($msgid); | 
| 580 |  |  |  |  |  |  | $self->{store}->cleanup(); | 
| 581 |  |  |  |  |  |  |  | 
| 582 | 2 |  |  |  |  | 16 | $self->{main}->call_plugins("bayes_forget", { toksref => $tokens, | 
| 583 |  |  |  |  |  |  | isspam => $isspam, | 
| 584 |  |  |  |  |  |  | msgid => $msgid, | 
| 585 | 4 |  |  |  |  | 39 | }); | 
| 586 |  |  |  |  |  |  |  | 
| 587 | 4 | 100 |  |  |  | 25 | 1; | 
| 588 | 2 |  |  |  |  | 80 | } | 
| 589 |  |  |  |  |  |  |  | 
| 590 | 2 |  |  |  |  | 24 | ########################################################################### | 
| 591 |  |  |  |  |  |  |  | 
| 592 |  |  |  |  |  |  | # Plugin hook. | 
| 593 | 4 |  |  |  |  | 198 | my ($self, $params) = @_; | 
| 594 | 4 |  |  |  |  | 36 | if (!$self->{conf}->{use_bayes}) { return 0; } | 
| 595 |  |  |  |  |  |  | dbg("bayes: bayes journal sync starting"); | 
| 596 | 4 |  |  |  |  | 59 | $self->{store}->sync($params); | 
| 597 |  |  |  |  |  |  | dbg("bayes: bayes journal sync completed"); | 
| 598 |  |  |  |  |  |  | } | 
| 599 |  |  |  |  |  |  |  | 
| 600 |  |  |  |  |  |  | ########################################################################### | 
| 601 | 4 |  |  |  |  | 413 |  | 
| 602 |  |  |  |  |  |  | # Plugin hook. | 
| 603 |  |  |  |  |  |  | my ($self, $params) = @_; | 
| 604 |  |  |  |  |  |  | if (!$self->{conf}->{use_bayes}) { return 0; } | 
| 605 |  |  |  |  |  |  | dbg("bayes: expiry starting"); | 
| 606 |  |  |  |  |  |  | my $timer = $self->{main}->time_method("expire_bayes"); | 
| 607 |  |  |  |  |  |  | $self->{store}->expire_old_tokens($params); | 
| 608 | 2 |  |  | 2 | 1 | 8 | dbg("bayes: expiry completed"); | 
| 609 | 2 | 50 |  |  |  | 11 | } | 
|  | 0 |  |  |  |  | 0 |  | 
| 610 | 2 |  |  |  |  | 15 |  | 
| 611 | 2 |  |  |  |  | 37 | ########################################################################### | 
| 612 | 2 |  |  |  |  | 10 |  | 
| 613 |  |  |  |  |  |  | # Plugin hook. | 
| 614 |  |  |  |  |  |  | # Check to make sure we can tie() the DB, and we have enough entries to do a scan | 
| 615 |  |  |  |  |  |  | # if we're told the caller will untie(), go ahead and leave the db tied. | 
| 616 |  |  |  |  |  |  | my ($self, $params) = @_; | 
| 617 |  |  |  |  |  |  |  | 
| 618 |  |  |  |  |  |  | return 0 unless $self->{conf}->{use_bayes}; | 
| 619 | 0 |  |  | 0 | 1 | 0 | return 0 unless $self->{store}->tie_db_readonly(); | 
| 620 | 0 | 0 |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 621 | 0 |  |  |  |  | 0 | # We need the DB to stay tied, so if the journal sync occurs, don't untie! | 
| 622 | 0 |  |  |  |  | 0 | my $caller_untie = $self->{main}->{learn_caller_will_untie}; | 
| 623 | 0 |  |  |  |  | 0 | $self->{main}->{learn_caller_will_untie} = 1; | 
| 624 | 0 |  |  |  |  | 0 |  | 
| 625 |  |  |  |  |  |  | # Do a journal sync if necessary.  Do this before the nspam_nham_get() | 
| 626 |  |  |  |  |  |  | # call since the sync may cause an update in the number of messages | 
| 627 |  |  |  |  |  |  | # learnt. | 
| 628 |  |  |  |  |  |  | $self->_opportunistic_calls(1); | 
| 629 |  |  |  |  |  |  |  | 
| 630 |  |  |  |  |  |  | # Reset the variable appropriately | 
| 631 |  |  |  |  |  |  | $self->{main}->{learn_caller_will_untie} = $caller_untie; | 
| 632 |  |  |  |  |  |  |  | 
| 633 | 146 |  |  | 146 | 1 | 414 | my ($ns, $nn) = $self->{store}->nspam_nham_get(); | 
| 634 |  |  |  |  |  |  |  | 
| 635 | 146 | 50 |  |  |  | 543 | if ($ns < $self->{conf}->{bayes_min_spam_num}) { | 
| 636 | 146 | 100 |  |  |  | 691 | dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num}); | 
| 637 |  |  |  |  |  |  | if (!$self->{main}->{learn_caller_will_untie}) { | 
| 638 |  |  |  |  |  |  | $self->{store}->untie_db(); | 
| 639 | 6 |  |  |  |  | 20 | } | 
| 640 | 6 |  |  |  |  | 25 | return 0; | 
| 641 |  |  |  |  |  |  | } | 
| 642 |  |  |  |  |  |  | if ($nn < $self->{conf}->{bayes_min_ham_num}) { | 
| 643 |  |  |  |  |  |  | dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num}); | 
| 644 |  |  |  |  |  |  | if (!$self->{main}->{learn_caller_will_untie}) { | 
| 645 | 6 |  |  |  |  | 29 | $self->{store}->untie_db(); | 
| 646 |  |  |  |  |  |  | } | 
| 647 |  |  |  |  |  |  | return 0; | 
| 648 | 6 |  |  |  |  | 19 | } | 
| 649 |  |  |  |  |  |  |  | 
| 650 | 6 |  |  |  |  | 55 | return 1; | 
| 651 |  |  |  |  |  |  | } | 
| 652 | 6 | 50 |  |  |  | 34 |  | 
| 653 | 0 |  |  |  |  | 0 | ########################################################################### | 
| 654 | 0 | 0 |  |  |  | 0 |  | 
| 655 | 0 |  |  |  |  | 0 | my ($self, $permsgstatus, $msg) = @_; | 
| 656 |  |  |  |  |  |  | my $score; | 
| 657 | 0 |  |  |  |  | 0 |  | 
| 658 |  |  |  |  |  |  | return unless $self->{conf}->{use_learner}; | 
| 659 | 6 | 50 |  |  |  | 22 |  | 
| 660 | 0 |  |  |  |  | 0 | # When we're doing a scan, we'll guarantee that we'll do the untie, | 
| 661 | 0 | 0 |  |  |  | 0 | # so override the global setting until we're done. | 
| 662 | 0 |  |  |  |  | 0 | my $caller_untie = $self->{main}->{learn_caller_will_untie}; | 
| 663 |  |  |  |  |  |  | $self->{main}->{learn_caller_will_untie} = 1; | 
| 664 | 0 |  |  |  |  | 0 |  | 
| 665 |  |  |  |  |  |  | goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus)); | 
| 666 |  |  |  |  |  |  |  | 
| 667 | 6 |  |  |  |  | 22 | goto skip unless $self->learner_is_scan_available(); | 
| 668 |  |  |  |  |  |  |  | 
| 669 |  |  |  |  |  |  | my ($ns, $nn) = $self->{store}->nspam_nham_get(); | 
| 670 |  |  |  |  |  |  |  | 
| 671 |  |  |  |  |  |  | ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() | 
| 672 |  |  |  |  |  |  | ## $self->{raw_counts} = " ns=$ns nn=$nn "; | 
| 673 | 4 |  |  | 4 | 0 | 21 | ## } | 
| 674 | 4 |  |  |  |  | 11 |  | 
| 675 |  |  |  |  |  |  | dbg("bayes: corpus size: nspam = $ns, nham = $nn"); | 
| 676 | 4 | 50 |  |  |  | 17 |  | 
| 677 |  |  |  |  |  |  | my $msgtokens; | 
| 678 |  |  |  |  |  |  | { my $timer = $self->{main}->time_method('b_tokenize'); | 
| 679 |  |  |  |  |  |  | my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); | 
| 680 | 4 |  |  |  |  | 10 | $msgtokens = $self->tokenize($msg, $msgdata); | 
| 681 | 4 |  |  |  |  | 12 | } | 
| 682 |  |  |  |  |  |  |  | 
| 683 | 4 | 50 |  |  |  | 31 | my $tokensdata; | 
| 684 |  |  |  |  |  |  | { my $timer = $self->{main}->time_method('b_tok_get_all'); | 
| 685 | 4 | 50 |  |  |  | 29 | $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); | 
| 686 |  |  |  |  |  |  | } | 
| 687 | 4 |  |  |  |  | 18 |  | 
| 688 |  |  |  |  |  |  | my $timer_compute_prob = $self->{main}->time_method('b_comp_prob'); | 
| 689 |  |  |  |  |  |  |  | 
| 690 |  |  |  |  |  |  | my $probabilities_ref = | 
| 691 |  |  |  |  |  |  | $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn); | 
| 692 |  |  |  |  |  |  |  | 
| 693 | 4 |  |  |  |  | 51 | my %pw; | 
| 694 |  |  |  |  |  |  | foreach my $tokendata (@{$tokensdata}) { | 
| 695 | 4 |  |  |  |  | 11 | my $prob = shift(@$probabilities_ref); | 
| 696 | 4 |  |  |  |  | 10 | next unless defined $prob; | 
|  | 4 |  |  |  |  | 18 |  | 
| 697 | 4 |  |  |  |  | 16 | my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; | 
| 698 | 4 |  |  |  |  | 24 | $pw{$token} = { | 
| 699 |  |  |  |  |  |  | prob => $prob, | 
| 700 |  |  |  |  |  |  | spam_count => $tok_spam, | 
| 701 | 4 |  |  |  |  | 13 | ham_count => $tok_ham, | 
| 702 | 4 |  |  |  |  | 9 | atime => $atime | 
|  | 4 |  |  |  |  | 31 |  | 
| 703 | 4 |  |  |  |  | 11 | }; | 
|  | 4 |  |  |  |  | 197 |  | 
| 704 |  |  |  |  |  |  | } | 
| 705 |  |  |  |  |  |  |  | 
| 706 | 4 |  |  |  |  | 54 | my @pw_keys = keys %pw; | 
| 707 |  |  |  |  |  |  |  | 
| 708 | 4 |  |  |  |  | 20 | # If none of the tokens were found in the DB, we're going to skip | 
| 709 |  |  |  |  |  |  | # this message... | 
| 710 |  |  |  |  |  |  | if (!@pw_keys) { | 
| 711 | 4 |  |  |  |  | 14 | dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database"); | 
| 712 | 4 |  |  |  |  | 13 | goto skip; | 
|  | 4 |  |  |  |  | 18 |  | 
| 713 | 1104 |  |  |  |  | 1149 | } | 
| 714 | 1104 | 100 |  |  |  | 1583 |  | 
| 715 | 502 |  |  |  |  | 477 | my $tcount_total = keys %{$msgtokens}; | 
|  | 502 |  |  |  |  | 665 |  | 
| 716 | 502 |  |  |  |  | 1353 | my $tcount_learned = scalar @pw_keys; | 
| 717 |  |  |  |  |  |  |  | 
| 718 |  |  |  |  |  |  | # Figure out the message receive time (used as atime below) | 
| 719 |  |  |  |  |  |  | # If the message atime comes back as being in the future, something's | 
| 720 |  |  |  |  |  |  | # messed up and we should revert to current time as a safety measure. | 
| 721 |  |  |  |  |  |  | # | 
| 722 |  |  |  |  |  |  | my $msgatime = $msg->receive_date(); | 
| 723 |  |  |  |  |  |  | my $now = time; | 
| 724 | 4 |  |  |  |  | 83 | $msgatime = $now if ( $msgatime > $now ); | 
| 725 |  |  |  |  |  |  |  | 
| 726 |  |  |  |  |  |  | my @touch_tokens; | 
| 727 |  |  |  |  |  |  | my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; | 
| 728 | 4 | 50 |  |  |  | 25 | my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; | 
| 729 | 0 |  |  |  |  | 0 |  | 
| 730 | 0 |  |  |  |  | 0 | my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys); | 
| 731 |  |  |  |  |  |  | my $log_each_token = (would_log('dbg', 'bayes') > 1); | 
| 732 |  |  |  |  |  |  |  | 
| 733 | 4 |  |  |  |  | 8 | # now take the most significant tokens and calculate probs using | 
|  | 4 |  |  |  |  | 11 |  | 
| 734 | 4 |  |  |  |  | 9 | # Robinson's formula. | 
| 735 |  |  |  |  |  |  |  | 
| 736 |  |  |  |  |  |  | @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys; | 
| 737 |  |  |  |  |  |  |  | 
| 738 |  |  |  |  |  |  | if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 } | 
| 739 |  |  |  |  |  |  |  | 
| 740 | 4 |  |  |  |  | 29 | my @sorted; | 
| 741 | 4 |  |  |  |  | 16 | foreach my $tok (@pw_keys) { | 
| 742 | 4 | 50 |  |  |  | 16 | next if $tok_strength{$tok} < | 
| 743 |  |  |  |  |  |  | $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; | 
| 744 | 4 |  |  |  |  | 9 |  | 
| 745 | 4 |  |  |  |  | 23 | my $pw_tok = $pw{$tok}; | 
| 746 | 4 |  |  |  |  | 16 | my $pw_prob = $pw_tok->{prob}; | 
| 747 |  |  |  |  |  |  |  | 
| 748 | 4 |  |  |  |  | 306 | # What's more expensive, scanning headers for HAMMYTOKENS and | 
| 749 | 4 |  |  |  |  | 47 | # SPAMMYTOKENS tags that aren't there or collecting data that | 
| 750 |  |  |  |  |  |  | # won't be used?  Just collecting the data is certainly simpler. | 
| 751 |  |  |  |  |  |  | # | 
| 752 |  |  |  |  |  |  | my $raw_token = $msgtokens->{$tok} || "(unknown)"; | 
| 753 |  |  |  |  |  |  | my $s = $pw_tok->{spam_count}; | 
| 754 | 4 |  |  |  |  | 31 | my $n = $pw_tok->{ham_count}; | 
|  | 2935 |  |  |  |  | 3018 |  | 
| 755 |  |  |  |  |  |  | my $a = $pw_tok->{atime}; | 
| 756 | 4 | 100 |  |  |  | 17 |  | 
|  | 2 |  |  |  |  | 12 |  | 
| 757 |  |  |  |  |  |  | push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy }, | 
| 758 | 4 |  |  |  |  | 9 | [$raw_token, $pw_prob, $s, $n, $a] ); | 
| 759 | 4 |  |  |  |  | 8 |  | 
| 760 | 414 | 100 |  |  |  | 653 | push(@sorted, $pw_prob); | 
| 761 |  |  |  |  |  |  |  | 
| 762 |  |  |  |  |  |  | # update the atime on this token, it proved useful | 
| 763 | 282 |  |  |  |  | 290 | push(@touch_tokens, $tok); | 
| 764 | 282 |  |  |  |  | 298 |  | 
| 765 |  |  |  |  |  |  | if ($log_each_token) { | 
| 766 |  |  |  |  |  |  | dbg("bayes: token '$raw_token' => $pw_prob"); | 
| 767 |  |  |  |  |  |  | } | 
| 768 |  |  |  |  |  |  | } | 
| 769 |  |  |  |  |  |  |  | 
| 770 | 282 |  | 50 |  |  | 599 | if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && | 
| 771 | 282 |  |  |  |  | 318 | $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) | 
| 772 | 282 |  |  |  |  | 288 | { | 
| 773 | 282 |  |  |  |  | 290 | dbg("bayes: cannot use bayes on this message; not enough usable tokens found"); | 
| 774 |  |  |  |  |  |  | goto skip; | 
| 775 | 282 | 100 |  |  |  | 275 | } | 
|  | 282 |  |  |  |  | 806 |  | 
| 776 |  |  |  |  |  |  |  | 
| 777 |  |  |  |  |  |  | $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); | 
| 778 | 282 |  |  |  |  | 391 | undef $timer_compute_prob;  # end a timing section | 
| 779 |  |  |  |  |  |  |  | 
| 780 |  |  |  |  |  |  | # Couldn't come up with a probability? | 
| 781 | 282 |  |  |  |  | 313 | goto skip unless defined $score; | 
| 782 |  |  |  |  |  |  |  | 
| 783 | 282 | 50 |  |  |  | 602 | dbg("bayes: score = $score"); | 
| 784 | 0 |  |  |  |  | 0 |  | 
| 785 |  |  |  |  |  |  | # no need to call tok_touch_all unless there were significant | 
| 786 |  |  |  |  |  |  | # tokens and a score was returned | 
| 787 |  |  |  |  |  |  | # we don't really care about the return value here | 
| 788 | 4 | 50 | 50 |  |  | 34 |  | 
| 789 |  |  |  |  |  |  | { my $timer = $self->{main}->time_method('b_tok_touch_all'); | 
| 790 |  |  |  |  |  |  | $self->{store}->tok_touch_all(\@touch_tokens, $msgatime); | 
| 791 | 0 |  |  |  |  | 0 | } | 
| 792 | 0 |  |  |  |  | 0 |  | 
| 793 |  |  |  |  |  |  | my $timer_finish = $self->{main}->time_method('b_finish'); | 
| 794 |  |  |  |  |  |  |  | 
| 795 | 4 |  |  |  |  | 33 | $permsgstatus->{bayes_nspam} = $ns; | 
| 796 | 4 |  |  |  |  | 13 | $permsgstatus->{bayes_nham} = $nn; | 
| 797 |  |  |  |  |  |  |  | 
| 798 |  |  |  |  |  |  | ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() | 
| 799 | 4 | 50 |  |  |  | 10 | ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n"; | 
| 800 |  |  |  |  |  |  | ## } | 
| 801 | 4 |  |  |  |  | 46 |  | 
| 802 |  |  |  |  |  |  | $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, | 
| 803 |  |  |  |  |  |  | probsref => \%pw, | 
| 804 |  |  |  |  |  |  | score => $score, | 
| 805 |  |  |  |  |  |  | msgatime => $msgatime, | 
| 806 |  |  |  |  |  |  | significant_tokens => \@touch_tokens, | 
| 807 | 4 |  |  |  |  | 7 | }); | 
|  | 4 |  |  |  |  | 20 |  | 
| 808 | 4 |  |  |  |  | 59 |  | 
| 809 |  |  |  |  |  |  | skip: | 
| 810 |  |  |  |  |  |  | if (!defined $score) { | 
| 811 | 4 |  |  |  |  | 18 | dbg("bayes: not scoring message, returning undef"); | 
| 812 |  |  |  |  |  |  | } | 
| 813 | 4 |  |  |  |  | 11 |  | 
| 814 | 4 |  |  |  |  | 28 | undef $timer_compute_prob;  # end a timing section if still running | 
| 815 |  |  |  |  |  |  | if (!defined $timer_finish) { | 
| 816 |  |  |  |  |  |  | $timer_finish = $self->{main}->time_method('b_finish'); | 
| 817 |  |  |  |  |  |  | } | 
| 818 |  |  |  |  |  |  |  | 
| 819 |  |  |  |  |  |  | # Take any opportunistic actions we can take | 
| 820 | 4 |  |  |  |  | 50 | if ($self->{main}->{opportunistic_expire_check_only}) { | 
| 821 |  |  |  |  |  |  | # we're supposed to report on expiry only -- so do the | 
| 822 |  |  |  |  |  |  | # _opportunistic_calls() run for the journal only. | 
| 823 |  |  |  |  |  |  | $self->_opportunistic_calls(1); | 
| 824 |  |  |  |  |  |  | $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); | 
| 825 |  |  |  |  |  |  | } | 
| 826 |  |  |  |  |  |  | else { | 
| 827 |  |  |  |  |  |  | $self->_opportunistic_calls(); | 
| 828 | 4 | 50 |  |  |  | 19 | } | 
| 829 | 0 |  |  |  |  | 0 |  | 
| 830 |  |  |  |  |  |  | # Do any cleanup we need to do | 
| 831 |  |  |  |  |  |  | $self->{store}->cleanup(); | 
| 832 | 4 |  |  |  |  | 15 |  | 
| 833 | 4 | 50 |  |  |  | 12 | # Reset the value accordingly | 
| 834 | 4 |  |  |  |  | 26 | $self->{main}->{learn_caller_will_untie} = $caller_untie; | 
| 835 |  |  |  |  |  |  |  | 
| 836 |  |  |  |  |  |  | # If our caller won't untie the db, we need to do it. | 
| 837 |  |  |  |  |  |  | if (!$caller_untie) { | 
| 838 | 4 | 50 |  |  |  | 16 | $self->{store}->untie_db(); | 
| 839 |  |  |  |  |  |  | } | 
| 840 |  |  |  |  |  |  |  | 
| 841 | 0 |  |  |  |  | 0 | $permsgstatus->set_tag ('BAYESTCHAMMY', | 
| 842 | 0 |  |  |  |  | 0 | ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0)); | 
| 843 |  |  |  |  |  |  | $permsgstatus->set_tag ('BAYESTCSPAMMY', | 
| 844 |  |  |  |  |  |  | ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0)); | 
| 845 | 4 |  |  |  |  | 20 | $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned); | 
| 846 |  |  |  |  |  |  | $permsgstatus->set_tag ('BAYESTC', $tcount_total); | 
| 847 |  |  |  |  |  |  |  | 
| 848 |  |  |  |  |  |  | $permsgstatus->set_tag ('HAMMYTOKENS', sub { | 
| 849 | 4 |  |  |  |  | 28 | my $pms = shift; | 
| 850 |  |  |  |  |  |  | $self->bayes_report_make_list | 
| 851 |  |  |  |  |  |  | ($pms, $pms->{bayes_token_info_hammy}, shift); | 
| 852 | 4 |  |  |  |  | 13 | }); | 
| 853 |  |  |  |  |  |  |  | 
| 854 |  |  |  |  |  |  | $permsgstatus->set_tag ('SPAMMYTOKENS', sub { | 
| 855 | 4 | 50 |  |  |  | 15 | my $pms = shift; | 
| 856 | 4 |  |  |  |  | 38 | $self->bayes_report_make_list | 
| 857 |  |  |  |  |  |  | ($pms, $pms->{bayes_token_info_spammy}, shift); | 
| 858 |  |  |  |  |  |  | }); | 
| 859 |  |  |  |  |  |  |  | 
| 860 | 4 | 50 |  |  |  | 17 | $permsgstatus->set_tag ('TOKENSUMMARY', sub { | 
|  | 4 |  |  |  |  | 26 |  | 
| 861 |  |  |  |  |  |  | my $pms = shift; | 
| 862 | 4 | 50 |  |  |  | 14 | if ( defined $pms->{tag_data}{BAYESTC} ) | 
|  | 4 |  |  |  |  | 22 |  | 
| 863 | 4 |  |  |  |  | 12 | { | 
| 864 | 4 |  |  |  |  | 13 | my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED} | 
| 865 |  |  |  |  |  |  | - $pms->{tag_data}{BAYESTCSPAMMY} | 
| 866 |  |  |  |  |  |  | - $pms->{tag_data}{BAYESTCHAMMY}; | 
| 867 | 0 |  |  | 0 |  | 0 | my $tcount_new = $pms->{tag_data}{BAYESTC} | 
| 868 |  |  |  |  |  |  | - $pms->{tag_data}{BAYESTCLEARNED}; | 
| 869 | 0 |  |  |  |  | 0 | "Tokens: new, $tcount_new; " | 
| 870 | 4 |  |  |  |  | 37 | ."hammy, $pms->{tag_data}{BAYESTCHAMMY}; " | 
| 871 |  |  |  |  |  |  | ."neutral, $tcount_neutral; " | 
| 872 |  |  |  |  |  |  | ."spammy, $pms->{tag_data}{BAYESTCSPAMMY}." | 
| 873 | 0 |  |  | 0 |  | 0 | } else { | 
| 874 |  |  |  |  |  |  | "Bayes not run."; | 
| 875 | 0 |  |  |  |  | 0 | } | 
| 876 | 4 |  |  |  |  | 21 | }); | 
| 877 |  |  |  |  |  |  |  | 
| 878 |  |  |  |  |  |  |  | 
| 879 | 0 |  |  | 0 |  | 0 | return $score; | 
| 880 | 0 | 0 |  |  |  | 0 | } | 
| 881 |  |  |  |  |  |  |  | 
| 882 |  |  |  |  |  |  | ########################################################################### | 
| 883 |  |  |  |  |  |  |  | 
| 884 | 0 |  |  |  |  | 0 | # Plugin hook. | 
| 885 |  |  |  |  |  |  | my ($self, $params) = @_; | 
| 886 | 0 |  |  |  |  | 0 | my $magic = $params->{magic}; | 
| 887 | 0 |  |  |  |  | 0 | my $toks = $params->{toks}; | 
| 888 |  |  |  |  |  |  | my $regex = $params->{regex}; | 
| 889 |  |  |  |  |  |  |  | 
| 890 |  |  |  |  |  |  | # allow dump to occur even if use_bayes disables everything else ... | 
| 891 |  |  |  |  |  |  | #return 0 unless $self->{conf}->{use_bayes}; | 
| 892 | 0 |  |  |  |  | 0 | return 0 unless $self->{store}->tie_db_readonly(); | 
| 893 |  |  |  |  |  |  |  | 
| 894 | 4 |  |  |  |  | 42 | my @vars = $self->{store}->get_storage_variables(); | 
| 895 |  |  |  |  |  |  |  | 
| 896 |  |  |  |  |  |  | my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; | 
| 897 | 4 |  |  |  |  | 548 |  | 
| 898 |  |  |  |  |  |  | my $template = '%3.3f %10u %10u %10u  %s'."\n"; | 
| 899 |  |  |  |  |  |  |  | 
| 900 |  |  |  |  |  |  | if ( $magic ) { | 
| 901 |  |  |  |  |  |  | printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version') | 
| 902 |  |  |  |  |  |  | or die "Error writing: $!"; | 
| 903 |  |  |  |  |  |  | printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam') | 
| 904 | 0 |  |  | 0 | 1 | 0 | or die "Error writing: $!"; | 
| 905 | 0 |  |  |  |  | 0 | printf($template, 0.0, 0, $nh, 0, 'non-token data: nham') | 
| 906 | 0 |  |  |  |  | 0 | or die "Error writing: $!"; | 
| 907 | 0 |  |  |  |  | 0 | printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens') | 
| 908 |  |  |  |  |  |  | or die "Error writing: $!"; | 
| 909 |  |  |  |  |  |  | printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime') | 
| 910 |  |  |  |  |  |  | or die "Error writing: $!"; | 
| 911 | 0 | 0 |  |  |  | 0 | if ( $bv >= 2 ) { | 
| 912 |  |  |  |  |  |  | printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime') | 
| 913 | 0 |  |  |  |  | 0 | or die "Error writing: $!"; | 
| 914 |  |  |  |  |  |  | } | 
| 915 | 0 |  |  |  |  | 0 | if ( $bv < 2 ) { | 
| 916 |  |  |  |  |  |  | printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') | 
| 917 | 0 |  |  |  |  | 0 | or die "Error writing: $!"; | 
| 918 |  |  |  |  |  |  | } | 
| 919 | 0 | 0 |  |  |  | 0 | if ( $bv >= 2 ) { | 
| 920 | 0 | 0 |  |  |  | 0 | printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') | 
| 921 |  |  |  |  |  |  | or die "Error writing: $!"; | 
| 922 | 0 | 0 |  |  |  | 0 | } | 
| 923 |  |  |  |  |  |  | printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime') | 
| 924 | 0 | 0 |  |  |  | 0 | or die "Error writing: $!"; | 
| 925 |  |  |  |  |  |  | if ( $bv >= 2 ) { | 
| 926 | 0 | 0 |  |  |  | 0 | printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta') | 
| 927 |  |  |  |  |  |  | or die "Error writing: $!"; | 
| 928 | 0 | 0 |  |  |  | 0 |  | 
| 929 |  |  |  |  |  |  | printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count') | 
| 930 | 0 | 0 |  |  |  | 0 | or die "Error writing: $!"; | 
| 931 | 0 | 0 |  |  |  | 0 | } | 
| 932 |  |  |  |  |  |  | } | 
| 933 |  |  |  |  |  |  |  | 
| 934 | 0 | 0 |  |  |  | 0 | if ( $toks ) { | 
| 935 | 0 | 0 |  |  |  | 0 | # let the store sort out the db_toks | 
| 936 |  |  |  |  |  |  | $self->{store}->dump_db_toks($template, $regex, @vars); | 
| 937 |  |  |  |  |  |  | } | 
| 938 | 0 | 0 |  |  |  | 0 |  | 
| 939 | 0 | 0 |  |  |  | 0 | if (!$self->{main}->{learn_caller_will_untie}) { | 
| 940 |  |  |  |  |  |  | $self->{store}->untie_db(); | 
| 941 |  |  |  |  |  |  | } | 
| 942 | 0 | 0 |  |  |  | 0 | return 1; | 
| 943 |  |  |  |  |  |  | } | 
| 944 | 0 | 0 |  |  |  | 0 |  | 
| 945 | 0 | 0 |  |  |  | 0 | ########################################################################### | 
| 946 |  |  |  |  |  |  | # TODO: these are NOT public, but the test suite needs to call them. | 
| 947 |  |  |  |  |  |  |  | 
| 948 | 0 | 0 |  |  |  | 0 | my ($self, $msg) = @_; | 
| 949 |  |  |  |  |  |  |  | 
| 950 |  |  |  |  |  |  | my @msgid; | 
| 951 |  |  |  |  |  |  |  | 
| 952 |  |  |  |  |  |  | my $msgid = $msg->get_header("Message-Id"); | 
| 953 | 0 | 0 |  |  |  | 0 | if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) { | 
| 954 |  |  |  |  |  |  | # remove \r and < and > prefix/suffixes | 
| 955 | 0 |  |  |  |  | 0 | chomp $msgid; | 
| 956 |  |  |  |  |  |  | $msgid =~ s/^<//; $msgid =~ s/>.*$//g; | 
| 957 |  |  |  |  |  |  | push(@msgid, $msgid); | 
| 958 | 0 | 0 |  |  |  | 0 | } | 
| 959 | 0 |  |  |  |  | 0 |  | 
| 960 |  |  |  |  |  |  | # Modified 2012-01-17  per bug 5185 to remove last received from msg_id calculation | 
| 961 | 0 |  |  |  |  | 0 |  | 
| 962 |  |  |  |  |  |  | # Use sha1_hex(Date: and top N bytes of body) | 
| 963 |  |  |  |  |  |  | # where N is MIN(1024 bytes, 1/2 of body length) | 
| 964 |  |  |  |  |  |  | # | 
| 965 |  |  |  |  |  |  | my $date = $msg->get_header("Date"); | 
| 966 |  |  |  |  |  |  | $date = "None" if (!defined $date || $date eq ''); # No Date? | 
| 967 |  |  |  |  |  |  |  | 
| 968 | 14 |  |  | 14 | 0 | 42 | #Removed per bug 5185 | 
| 969 |  |  |  |  |  |  | #my @rcvd = $msg->get_header("Received"); | 
| 970 | 14 |  |  |  |  | 28 | #my $rcvd = $rcvd[$#rcvd]; | 
| 971 |  |  |  |  |  |  | #$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received? | 
| 972 | 14 |  |  |  |  | 97 |  | 
| 973 | 14 | 50 | 33 |  |  | 315 | # Make a copy since pristine_body is a reference ... | 
|  |  |  | 33 |  |  |  |  | 
| 974 |  |  |  |  |  |  | my $body = join('', $msg->get_pristine_body()); | 
| 975 | 14 |  |  |  |  | 57 |  | 
| 976 | 14 |  |  |  |  | 92 | if (length($body) > 64) { # Small Body? | 
|  | 14 |  |  |  |  | 81 |  | 
| 977 | 14 |  |  |  |  | 52 | my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) ); | 
| 978 |  |  |  |  |  |  | substr($body, $keep) = ''; | 
| 979 |  |  |  |  |  |  | } | 
| 980 |  |  |  |  |  |  |  | 
| 981 |  |  |  |  |  |  | #Stripping all CR and LF so that testing midstream from MTA and post delivery don't | 
| 982 |  |  |  |  |  |  | #generate different id's simply because of LF<->CR<->CRLF changes. | 
| 983 |  |  |  |  |  |  | $body =~ s/[\r\n]//g; | 
| 984 |  |  |  |  |  |  |  | 
| 985 | 14 |  |  |  |  | 57 | unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated'); | 
| 986 | 14 | 50 | 33 |  |  | 140 |  | 
| 987 |  |  |  |  |  |  | return wantarray ? @msgid : $msgid[0]; | 
| 988 |  |  |  |  |  |  | } | 
| 989 |  |  |  |  |  |  |  | 
| 990 |  |  |  |  |  |  | my ($self, $msg) = @_; | 
| 991 |  |  |  |  |  |  |  | 
| 992 |  |  |  |  |  |  | if (!ref $msg) { | 
| 993 |  |  |  |  |  |  | # I have no idea why this seems to happen. TODO | 
| 994 | 14 |  |  |  |  | 96 | warn "bayes: msg not a ref: '$msg'"; | 
| 995 |  |  |  |  |  |  | return { }; | 
| 996 | 14 | 50 |  |  |  | 76 | } | 
| 997 | 14 | 50 |  |  |  | 61 |  | 
| 998 | 14 |  |  |  |  | 44 | my $permsgstatus = | 
| 999 |  |  |  |  |  |  | Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); | 
| 1000 |  |  |  |  |  |  | $msg->extract_message_metadata ($permsgstatus); | 
| 1001 |  |  |  |  |  |  | my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); | 
| 1002 |  |  |  |  |  |  | $permsgstatus->finish(); | 
| 1003 | 14 |  |  |  |  | 249 |  | 
| 1004 |  |  |  |  |  |  | if (!defined $msgdata) { | 
| 1005 | 14 |  |  |  |  | 295 | # why?! | 
| 1006 |  |  |  |  |  |  | warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n"; | 
| 1007 | 14 | 50 |  |  |  | 111 | return { }; | 
| 1008 |  |  |  |  |  |  | } | 
| 1009 |  |  |  |  |  |  |  | 
| 1010 |  |  |  |  |  |  | return $msgdata; | 
| 1011 | 20 |  |  | 20 | 0 | 65 | } | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 | 20 | 50 |  |  |  | 86 | my ($self, $pms) = @_; | 
| 1014 |  |  |  |  |  |  |  | 
| 1015 | 0 |  |  |  |  | 0 | my $t_src = $self->{conf}->{bayes_token_sources}; | 
| 1016 | 0 |  |  |  |  | 0 | my $msgdata = { }; | 
| 1017 |  |  |  |  |  |  | $msgdata->{bayes_token_body} = | 
| 1018 |  |  |  |  |  |  | $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible}; | 
| 1019 |  |  |  |  |  |  | $msgdata->{bayes_token_inviz} = | 
| 1020 | 20 |  |  |  |  | 205 | $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible}; | 
| 1021 | 20 |  |  |  |  | 137 | $msgdata->{bayes_mimepart_digests} = | 
| 1022 | 20 |  |  |  |  | 97 | $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart}; | 
| 1023 | 20 |  |  |  |  | 113 | @{$msgdata->{bayes_token_uris}} = | 
| 1024 |  |  |  |  |  |  | $pms->get_uri_list() if $t_src->{uri}; | 
| 1025 | 20 | 50 |  |  |  | 71 | return $msgdata; | 
| 1026 |  |  |  |  |  |  | } | 
| 1027 | 0 |  |  |  |  | 0 |  | 
| 1028 | 0 |  |  |  |  | 0 | ########################################################################### | 
| 1029 |  |  |  |  |  |  |  | 
| 1030 |  |  |  |  |  |  | # The calling functions expect a uniq'ed array of tokens ... | 
| 1031 | 20 |  |  |  |  | 123 | my ($self, $msg, $msgdata) = @_; | 
| 1032 |  |  |  |  |  |  |  | 
| 1033 |  |  |  |  |  |  | my $t_src = $self->{conf}->{bayes_token_sources}; | 
| 1034 |  |  |  |  |  |  |  | 
| 1035 | 24 |  |  | 24 |  | 63 | # visible tokens from the body | 
| 1036 |  |  |  |  |  |  | my @tokens_body; | 
| 1037 | 24 |  |  |  |  | 75 | if ($msgdata->{bayes_token_body}) { | 
| 1038 | 24 |  |  |  |  | 66 | foreach (@{$msgdata->{bayes_token_body}}) { | 
| 1039 |  |  |  |  |  |  | push(@tokens_body, $self->_tokenize_line ($_, '', 1)); | 
| 1040 | 24 | 50 |  |  |  | 189 | last if scalar @tokens_body >= 50000; | 
| 1041 |  |  |  |  |  |  | } | 
| 1042 | 24 | 50 |  |  |  | 158 | dbg("bayes: tokenized body: %d tokens", scalar @tokens_body); | 
| 1043 |  |  |  |  |  |  | } | 
| 1044 | 24 | 50 |  |  |  | 84 | # the URI list | 
| 1045 | 24 |  |  |  |  | 77 | my @tokens_uri; | 
| 1046 | 24 | 50 |  |  |  | 186 | if ($msgdata->{bayes_token_uris}) { | 
| 1047 | 24 |  |  |  |  | 79 | foreach (@{$msgdata->{bayes_token_uris}}) { | 
| 1048 |  |  |  |  |  |  | push(@tokens_uri, $self->_tokenize_line ($_, '', 2)); | 
| 1049 |  |  |  |  |  |  | last if scalar @tokens_uri >= 10000; | 
| 1050 |  |  |  |  |  |  | } | 
| 1051 |  |  |  |  |  |  | dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri); | 
| 1052 |  |  |  |  |  |  | } | 
| 1053 |  |  |  |  |  |  | # add invisible tokens | 
| 1054 | 16 |  |  | 16 | 0 | 58 | my @tokens_inviz; | 
| 1055 |  |  |  |  |  |  | if ($msgdata->{bayes_token_inviz}) { | 
| 1056 | 16 |  |  |  |  | 60 | my $tokprefix; | 
| 1057 |  |  |  |  |  |  | if (ADD_INVIZ_TOKENS_I_PREFIX)  { $tokprefix = 'I*:' } | 
| 1058 |  |  |  |  |  |  | if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' } | 
| 1059 | 16 |  |  |  |  | 36 | if (defined $tokprefix) { | 
| 1060 | 16 | 50 |  |  |  | 63 | foreach (@{$msgdata->{bayes_token_inviz}}) { | 
| 1061 | 16 |  |  |  |  | 30 | push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1)); | 
|  | 16 |  |  |  |  | 82 |  | 
| 1062 | 278 |  |  |  |  | 668 | last if scalar @tokens_inviz >= 50000; | 
| 1063 | 278 | 50 |  |  |  | 1119 | } | 
| 1064 |  |  |  |  |  |  | } | 
| 1065 | 16 |  |  |  |  | 136 | dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz); | 
| 1066 |  |  |  |  |  |  | } | 
| 1067 |  |  |  |  |  |  |  | 
| 1068 | 16 |  |  |  |  | 41 | # add digests and Content-Type of all MIME parts | 
| 1069 | 16 | 50 |  |  |  | 91 | my @tokens_mimepart; | 
| 1070 | 16 |  |  |  |  | 39 | if ($msgdata->{bayes_mimepart_digests}) { | 
|  | 16 |  |  |  |  | 65 |  | 
| 1071 | 54 |  |  |  |  | 135 | my %shorthand = (  # some frequent MIME part contents for human readability | 
| 1072 | 54 | 50 |  |  |  | 172 | 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext', | 
| 1073 |  |  |  |  |  |  | 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML', | 
| 1074 | 16 |  |  |  |  | 60 | 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml'  => 'Empty-XML', | 
| 1075 |  |  |  |  |  |  | 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext', | 
| 1076 |  |  |  |  |  |  | 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML', | 
| 1077 | 16 |  |  |  |  | 29 | '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext', | 
| 1078 | 16 | 50 |  |  |  | 57 | '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML', | 
| 1079 | 16 |  |  |  |  | 27 | ); | 
| 1080 | 16 |  |  |  |  | 26 | @tokens_mimepart = map('MIME:' . ($shorthand{$_} || $_), | 
|  | 16 |  |  |  |  | 49 |  | 
| 1081 | 16 |  |  |  |  | 33 | @{ $msgdata->{bayes_mimepart_digests} }); | 
| 1082 | 16 | 50 |  |  |  | 60 | dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart); | 
| 1083 | 16 |  |  |  |  | 31 | dbg("bayes: mime-part token %s", $_) for @tokens_mimepart; | 
|  | 16 |  |  |  |  | 56 |  | 
| 1084 | 0 |  |  |  |  | 0 | } | 
| 1085 | 0 | 0 |  |  |  | 0 |  | 
| 1086 |  |  |  |  |  |  | # Tokenize the headers | 
| 1087 |  |  |  |  |  |  | my @tokens_header; | 
| 1088 | 16 |  |  |  |  | 44 | if ($t_src->{header}) { | 
| 1089 |  |  |  |  |  |  | my %hdrs = $self->_tokenize_headers ($msg); | 
| 1090 |  |  |  |  |  |  | while( my($prefix, $value) = each %hdrs ) { | 
| 1091 |  |  |  |  |  |  | push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0)); | 
| 1092 | 16 |  |  |  |  | 33 | last if scalar @tokens_header >= 10000; | 
| 1093 | 16 | 50 |  |  |  | 58 | } | 
| 1094 | 0 |  |  |  |  | 0 | dbg("bayes: tokenized header: %d tokens", scalar @tokens_header); | 
| 1095 |  |  |  |  |  |  | } | 
| 1096 |  |  |  |  |  |  |  | 
| 1097 |  |  |  |  |  |  | # Go ahead and uniq the array, skip null tokens (can happen sometimes) | 
| 1098 |  |  |  |  |  |  | # generate an SHA1 hash and take the lower 40 bits as our token | 
| 1099 |  |  |  |  |  |  | my %tokens; | 
| 1100 |  |  |  |  |  |  | foreach my $token | 
| 1101 |  |  |  |  |  |  | (@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header) | 
| 1102 |  |  |  |  |  |  | { | 
| 1103 |  |  |  |  |  |  | # dbg("bayes: token: %s", $token); | 
| 1104 | 0 |  | 0 |  |  | 0 | $tokens{substr(sha1($token), -5)} = $token  if $token ne ''; | 
|  | 0 |  |  |  |  | 0 |  | 
| 1105 | 0 |  |  |  |  | 0 | } | 
| 1106 | 0 |  |  |  |  | 0 |  | 
| 1107 |  |  |  |  |  |  | # return the keys == tokens ... | 
| 1108 |  |  |  |  |  |  | return \%tokens; | 
| 1109 |  |  |  |  |  |  | } | 
| 1110 | 16 |  |  |  |  | 34 |  | 
| 1111 | 16 | 50 |  |  |  | 70 | my $self = $_[0]; | 
| 1112 | 16 |  |  |  |  | 67 | my $tokprefix = $_[2]; | 
| 1113 | 16 |  |  |  |  | 136 | my $region = $_[3]; | 
| 1114 | 200 |  |  |  |  | 548 | local ($_) = $_[1]; | 
| 1115 | 200 | 50 |  |  |  | 1020 |  | 
| 1116 |  |  |  |  |  |  | my @rettokens; | 
| 1117 | 16 |  |  |  |  | 101 |  | 
| 1118 |  |  |  |  |  |  | # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, | 
| 1119 |  |  |  |  |  |  | # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it. | 
| 1120 |  |  |  |  |  |  | # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" | 
| 1121 |  |  |  |  |  |  |  | 
| 1122 | 16 |  |  |  |  | 56 | ### (previous:)  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; | 
| 1123 | 16 |  |  |  |  | 77 |  | 
| 1124 |  |  |  |  |  |  | ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic) | 
| 1125 |  |  |  |  |  |  | s{ ( [A-Za-z0-9,@*!_'"\$. -]+  | | 
| 1126 |  |  |  |  |  |  | [\xC0-\xDF][\x80-\xBF]    | | 
| 1127 | 6714 | 50 |  |  |  | 35244 | [\xE0-\xEF][\x80-\xBF]{2} | | 
| 1128 |  |  |  |  |  |  | [\xF0-\xF4][\x80-\xBF]{3} | | 
| 1129 |  |  |  |  |  |  | [\xA1-\xFF] ) | . } | 
| 1130 |  |  |  |  |  |  | { defined $1 ? $1 : ' ' }xsge; | 
| 1131 | 16 |  |  |  |  | 879 | # should we also turn NBSP ( \xC2\xA0 ) into space? | 
| 1132 |  |  |  |  |  |  |  | 
| 1133 |  |  |  |  |  |  | # DO split on "..." or "--" or "---"; common formatting error resulting in | 
| 1134 |  |  |  |  |  |  | # hapaxes.  Keep the separator itself as a token, though, as long ones can | 
| 1135 | 532 |  |  | 532 |  | 701 | # be good spamsigns. | 
| 1136 | 532 |  |  |  |  | 711 | s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs; | 
| 1137 | 532 |  |  |  |  | 584 | s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs; | 
| 1138 | 532 |  |  |  |  | 1044 |  | 
| 1139 |  |  |  |  |  |  | if (IGNORE_TITLE_CASE) { | 
| 1140 | 532 |  |  |  |  | 586 | if ($region == 1 || $region == 2) { | 
| 1141 |  |  |  |  |  |  | # lower-case Title Case at start of a full-stop-delimited line (as would | 
| 1142 |  |  |  |  |  |  | # be seen in a Western language). | 
| 1143 |  |  |  |  |  |  | s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge; | 
| 1144 |  |  |  |  |  |  | } | 
| 1145 |  |  |  |  |  |  | } | 
| 1146 |  |  |  |  |  |  |  | 
| 1147 |  |  |  |  |  |  | my $magic_re = $self->{store}->get_magic_re(); | 
| 1148 |  |  |  |  |  |  |  | 
| 1149 | 532 |  |  |  |  | 1823 | # Note that split() in scope of 'use bytes' results in words with utf8 flag | 
| 1150 |  |  |  |  |  |  | # cleared, even if the source string has perl characters semantics !!! | 
| 1151 |  |  |  |  |  |  | # Is this really still desirable? | 
| 1152 |  |  |  |  |  |  |  | 
| 1153 | 4844 | 100 |  |  |  | 14792 | foreach my $token (split) { | 
| 1154 |  |  |  |  |  |  | $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end | 
| 1155 |  |  |  |  |  |  | $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens | 
| 1156 |  |  |  |  |  |  |  | 
| 1157 |  |  |  |  |  |  | # Skip false magic tokens | 
| 1158 |  |  |  |  |  |  | # TVD: we need to do a defined() check since SQL doesn't have magic | 
| 1159 | 532 |  |  |  |  | 1474 | # tokens, so the SQL BayesStore returns undef.  I really want a way | 
| 1160 | 532 |  |  |  |  | 756 | # of optimizing that out, but I haven't come up with anything yet. | 
| 1161 |  |  |  |  |  |  | # | 
| 1162 | 532 |  |  |  |  | 552 | next if ( defined $magic_re && $token =~ /$magic_re/ ); | 
| 1163 | 532 | 100 | 100 |  |  | 1388 |  | 
| 1164 |  |  |  |  |  |  | # *do* keep 3-byte tokens; there's some solid signs in there | 
| 1165 |  |  |  |  |  |  | my $len = length($token); | 
| 1166 | 332 |  |  |  |  | 1868 |  | 
|  | 242 |  |  |  |  | 2033 |  | 
| 1167 |  |  |  |  |  |  | # but extend the stop-list. These are squarely in the gray | 
| 1168 |  |  |  |  |  |  | # area, and it just slows us down to record them. | 
| 1169 |  |  |  |  |  |  | # See http://wiki.apache.org/spamassassin/BayesStopList for more info. | 
| 1170 | 532 |  |  |  |  | 1705 | # | 
| 1171 |  |  |  |  |  |  | next if $len < 3 || | 
| 1172 |  |  |  |  |  |  | ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i); | 
| 1173 |  |  |  |  |  |  |  | 
| 1174 |  |  |  |  |  |  | # are we in the body?  If so, apply some body-specific breakouts | 
| 1175 |  |  |  |  |  |  | if ($region == 1 || $region == 2) { | 
| 1176 | 532 |  |  |  |  | 4139 | if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { | 
| 1177 | 8592 |  |  |  |  | 15089 | push (@rettokens, $self->_tokenize_mail_addrs ($token)); | 
| 1178 | 8592 |  |  |  |  | 11820 | } | 
| 1179 |  |  |  |  |  |  | elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { | 
| 1180 |  |  |  |  |  |  | push (@rettokens, "UD:".$token); # the full token | 
| 1181 |  |  |  |  |  |  | my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { | 
| 1182 |  |  |  |  |  |  | push (@rettokens, "UD:".$1); # UD = URL domain | 
| 1183 |  |  |  |  |  |  | } | 
| 1184 |  |  |  |  |  |  | } | 
| 1185 | 8592 | 50 | 33 |  |  | 29609 | } | 
| 1186 |  |  |  |  |  |  |  | 
| 1187 |  |  |  |  |  |  | # note: do not trim down overlong tokens if they contain '*'.  This is | 
| 1188 | 8592 |  |  |  |  | 14048 | # used as part of split tokens such as "HTo:D*net" indicating that | 
| 1189 |  |  |  |  |  |  | # the domain ".net" appeared in the To header. | 
| 1190 |  |  |  |  |  |  | # | 
| 1191 |  |  |  |  |  |  | if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { | 
| 1192 |  |  |  |  |  |  |  | 
| 1193 |  |  |  |  |  |  | if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) { | 
| 1194 | 8592 | 100 | 100 |  |  | 34595 | # Bug 7135 | 
| 1195 |  |  |  |  |  |  | # collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences | 
| 1196 |  |  |  |  |  |  | my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] ) | 
| 1197 |  |  |  |  |  |  | [\x80-\xBF]{2} )/xsg; | 
| 1198 | 5230 | 100 | 100 |  |  | 11367 | if (@t) { | 
| 1199 | 3534 | 100 |  |  |  | 7816 | push (@rettokens, map($tokprefix.'u8:'.$_, @t)); | 
|  |  | 100 |  |  |  |  |  | 
| 1200 | 38 |  |  |  |  | 152 | next; | 
| 1201 |  |  |  |  |  |  | } | 
| 1202 |  |  |  |  |  |  | } | 
| 1203 | 110 |  |  |  |  | 306 |  | 
| 1204 | 110 |  |  |  |  | 164 | if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { | 
|  | 110 |  |  |  |  | 472 |  | 
| 1205 | 132 |  |  |  |  | 529 | # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, | 
| 1206 |  |  |  |  |  |  | # but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan | 
| 1207 |  |  |  |  |  |  | # to me! (jm) | 
| 1208 |  |  |  |  |  |  | while ($token =~ s/^(..?)//) { | 
| 1209 |  |  |  |  |  |  | push (@rettokens, $tokprefix.'8:'.$1); | 
| 1210 |  |  |  |  |  |  | } | 
| 1211 |  |  |  |  |  |  | next; | 
| 1212 |  |  |  |  |  |  | } | 
| 1213 |  |  |  |  |  |  |  | 
| 1214 | 5230 | 100 | 100 |  |  | 10121 | if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) | 
| 1215 |  |  |  |  |  |  | || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS) | 
| 1216 | 250 | 50 |  |  |  | 580 | || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS)) | 
| 1217 |  |  |  |  |  |  | { | 
| 1218 |  |  |  |  |  |  | # if (TOKENIZE_LONG_TOKENS_AS_SKIPS) | 
| 1219 | 0 |  |  |  |  | 0 | # Spambayes trick via Matt: Just retain 7 chars.  Do not retain the | 
| 1220 |  |  |  |  |  |  | # length, it does not help; see jm's mail to -devel on Nov 20 2002 at | 
| 1221 | 0 | 0 |  |  |  | 0 | # http://sourceforge.net/p/spamassassin/mailman/message/12977605/ | 
| 1222 | 0 |  |  |  |  | 0 | # "sk:" stands for "skip". | 
| 1223 | 0 |  |  |  |  | 0 | # Bug 7141: retain seven UTF-8 chars (or other bytes), | 
| 1224 |  |  |  |  |  |  | # if followed by at least two bytes | 
| 1225 |  |  |  |  |  |  | $token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF]      | | 
| 1226 |  |  |  |  |  |  | [\xC0-\xDF][\x80-\xBF]    | | 
| 1227 | 250 |  |  |  |  | 272 | [\xE0-\xEF][\x80-\xBF]{2} | | 
| 1228 |  |  |  |  |  |  | [\xF0-\xF4][\x80-\xBF]{3} | . ){7} )) | 
| 1229 |  |  |  |  |  |  | .{2,} \z }{sk:$1}xs; | 
| 1230 |  |  |  |  |  |  | ## (was:)  $token = "sk:".substr($token, 0, 7);  # seven bytes | 
| 1231 |  |  |  |  |  |  | } | 
| 1232 |  |  |  |  |  |  | } | 
| 1233 |  |  |  |  |  |  |  | 
| 1234 |  |  |  |  |  |  | # decompose tokens?  do this after shortening long tokens | 
| 1235 |  |  |  |  |  |  | if ($region == 1 || $region == 2) { | 
| 1236 |  |  |  |  |  |  | if (DECOMPOSE_BODY_TOKENS) { | 
| 1237 | 250 | 100 | 100 |  |  | 949 | if ($token =~ /[^\w:\*]/) { | 
|  |  |  | 100 |  |  |  |  | 
|  |  |  | 100 |  |  |  |  | 
|  |  |  | 50 |  |  |  |  | 
|  |  |  | 66 |  |  |  |  | 
| 1238 |  |  |  |  |  |  | my $decompd = $token;                        # "Foo!" | 
| 1239 |  |  |  |  |  |  | $decompd =~ s/[^\w:\*]//gs; | 
| 1240 |  |  |  |  |  |  | push (@rettokens, $tokprefix.$decompd);      # "Foo" | 
| 1241 |  |  |  |  |  |  | } | 
| 1242 |  |  |  |  |  |  |  | 
| 1243 |  |  |  |  |  |  | if ($token =~ /[A-Z]/) { | 
| 1244 |  |  |  |  |  |  | my $decompd = $token; $decompd = lc $decompd; | 
| 1245 |  |  |  |  |  |  | push (@rettokens, $tokprefix.$decompd);      # "foo!" | 
| 1246 |  |  |  |  |  |  |  | 
| 1247 |  |  |  |  |  |  | if ($token =~ /[^\w:\*]/) { | 
| 1248 | 232 |  |  |  |  | 1478 | $decompd =~ s/[^\w:\*]//gs; | 
| 1249 |  |  |  |  |  |  | push (@rettokens, $tokprefix.$decompd);    # "foo" | 
| 1250 |  |  |  |  |  |  | } | 
| 1251 |  |  |  |  |  |  | } | 
| 1252 |  |  |  |  |  |  | } | 
| 1253 |  |  |  |  |  |  | } | 
| 1254 |  |  |  |  |  |  |  | 
| 1255 |  |  |  |  |  |  | push (@rettokens, $tokprefix.$token); | 
| 1256 |  |  |  |  |  |  | } | 
| 1257 |  |  |  |  |  |  |  | 
| 1258 | 5230 | 100 | 100 |  |  | 9781 | return @rettokens; | 
| 1259 | 3534 |  |  |  |  | 3702 | } | 
| 1260 | 3534 | 100 |  |  |  | 6427 |  | 
| 1261 | 358 |  |  |  |  | 708 | my ($self, $msg) = @_; | 
| 1262 | 358 |  |  |  |  | 1178 |  | 
| 1263 | 358 |  |  |  |  | 990 | my %parsed; | 
| 1264 |  |  |  |  |  |  |  | 
| 1265 |  |  |  |  |  |  | my %user_ignore; | 
| 1266 | 3534 | 100 |  |  |  | 6969 | $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}}; | 
| 1267 | 712 |  |  |  |  | 1219 |  | 
|  | 712 |  |  |  |  | 1082 |  | 
| 1268 | 712 |  |  |  |  | 1670 | # get headers in array context | 
| 1269 |  |  |  |  |  |  | my @hdrs; | 
| 1270 | 712 | 100 |  |  |  | 1962 | my @rcvdlines; | 
| 1271 | 76 |  |  |  |  | 265 | for ($msg->get_all_headers()) { | 
| 1272 | 76 |  |  |  |  | 240 | # first, keep a copy of Received headers, so we can strip down to last 2 | 
| 1273 |  |  |  |  |  |  | if (/^Received:/i) { | 
| 1274 |  |  |  |  |  |  | push(@rcvdlines, $_); | 
| 1275 |  |  |  |  |  |  | next; | 
| 1276 |  |  |  |  |  |  | } | 
| 1277 |  |  |  |  |  |  | # and now skip lines for headers we don't want (including all Received) | 
| 1278 | 5230 |  |  |  |  | 17162 | next if /^${IGNORED_HDRS}:/i; | 
| 1279 |  |  |  |  |  |  | next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; | 
| 1280 |  |  |  |  |  |  | push(@hdrs, $_); | 
| 1281 | 532 |  |  |  |  | 5352 | } | 
| 1282 |  |  |  |  |  |  | push(@hdrs, $msg->get_all_metadata()); | 
| 1283 |  |  |  |  |  |  |  | 
| 1284 |  |  |  |  |  |  | # and re-add the last 2 received lines: usually a good source of | 
| 1285 | 16 |  |  | 16 |  | 50 | # spamware tokens and HELO names. | 
| 1286 |  |  |  |  |  |  | if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } | 
| 1287 | 16 |  |  |  |  | 52 | if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } | 
| 1288 |  |  |  |  |  |  |  | 
| 1289 |  |  |  |  |  |  | for (@hdrs) { | 
| 1290 | 16 |  |  |  |  | 29 | next unless /\S/; | 
|  | 16 |  |  |  |  | 101 |  | 
| 1291 |  |  |  |  |  |  | my ($hdr, $val) = split(/:/, $_, 2); | 
| 1292 |  |  |  |  |  |  |  | 
| 1293 | 16 |  |  |  |  | 35 | # remove user-specified headers here, after Received, in case they | 
| 1294 |  |  |  |  |  |  | # want to ignore that too | 
| 1295 | 16 |  |  |  |  | 145 | next if exists $user_ignore{lc $hdr}; | 
| 1296 |  |  |  |  |  |  |  | 
| 1297 | 202 | 100 |  |  |  | 484 | # Prep the header value | 
| 1298 | 52 |  |  |  |  | 132 | $val ||= ''; | 
| 1299 | 52 |  |  |  |  | 86 | chomp($val); | 
| 1300 |  |  |  |  |  |  |  | 
| 1301 |  |  |  |  |  |  | # special tokenization for some headers: | 
| 1302 | 150 | 100 |  |  |  | 1949 | if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { | 
| 1303 | 102 |  |  |  |  | 144 | $val = $self->_pre_chew_message_id ($val); | 
| 1304 | 102 |  |  |  |  | 242 | } | 
| 1305 |  |  |  |  |  |  | elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) | 
| 1306 | 16 |  |  |  |  | 130 | (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) | 
| 1307 |  |  |  |  |  |  | { | 
| 1308 |  |  |  |  |  |  | $val = $self->_pre_chew_addr_header ($val); | 
| 1309 |  |  |  |  |  |  | } | 
| 1310 | 16 | 100 |  |  |  | 80 | elsif ($hdr eq 'Received') { | 
|  | 14 |  |  |  |  | 42 |  | 
| 1311 | 16 | 100 |  |  |  | 67 | $val = $self->_pre_chew_received ($val); | 
|  | 14 |  |  |  |  | 50 |  | 
| 1312 |  |  |  |  |  |  | } | 
| 1313 | 16 |  |  |  |  | 55 | elsif ($hdr eq 'Content-Type') { | 
| 1314 | 194 | 50 |  |  |  | 593 | $val = $self->_pre_chew_content_type ($val); | 
| 1315 | 194 |  |  |  |  | 707 | } | 
| 1316 |  |  |  |  |  |  | elsif ($hdr eq 'MIME-Version') { | 
| 1317 |  |  |  |  |  |  | $val =~ s/1\.0//;		# totally innocuous | 
| 1318 |  |  |  |  |  |  | } | 
| 1319 | 194 | 50 |  |  |  | 528 | elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { | 
| 1320 |  |  |  |  |  |  | $val = "1"; # just mark the presence, they create lots of hapaxen | 
| 1321 |  |  |  |  |  |  | } | 
| 1322 | 194 |  | 50 |  |  | 368 |  | 
| 1323 | 194 |  |  |  |  | 300 | if (MAP_HEADERS_MID) { | 
| 1324 |  |  |  |  |  |  | if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { | 
| 1325 |  |  |  |  |  |  | $parsed{"*MI"} = $val; | 
| 1326 | 194 | 100 |  |  |  | 1377 | } | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 1327 | 16 |  |  |  |  | 73 | } | 
| 1328 |  |  |  |  |  |  | if (MAP_HEADERS_FROMTOCC) { | 
| 1329 |  |  |  |  |  |  | if ($hdr =~ /^(?:From|To|Cc)$/i) { | 
| 1330 |  |  |  |  |  |  | $parsed{"*Ad"} = $val; | 
| 1331 |  |  |  |  |  |  | } | 
| 1332 | 60 |  |  |  |  | 183 | } | 
| 1333 |  |  |  |  |  |  | if (MAP_HEADERS_USERAGENT) { | 
| 1334 |  |  |  |  |  |  | if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { | 
| 1335 | 28 |  |  |  |  | 102 | $parsed{"*UA"} = $val; | 
| 1336 |  |  |  |  |  |  | } | 
| 1337 |  |  |  |  |  |  | } | 
| 1338 | 4 |  |  |  |  | 32 |  | 
| 1339 |  |  |  |  |  |  | # replace hdr name with "compressed" version if possible | 
| 1340 |  |  |  |  |  |  | if (defined $HEADER_NAME_COMPRESSION{$hdr}) { | 
| 1341 | 2 |  |  |  |  | 9 | $hdr = $HEADER_NAME_COMPRESSION{$hdr}; | 
| 1342 |  |  |  |  |  |  | } | 
| 1343 |  |  |  |  |  |  |  | 
| 1344 | 0 |  |  |  |  | 0 | if (exists $parsed{$hdr}) { | 
| 1345 |  |  |  |  |  |  | $parsed{$hdr} .= " ".$val; | 
| 1346 |  |  |  |  |  |  | } else { | 
| 1347 | 194 |  |  |  |  | 278 | $parsed{$hdr} = $val; | 
| 1348 | 194 | 100 |  |  |  | 474 | } | 
| 1349 | 16 |  |  |  |  | 63 | if (would_log('dbg', 'bayes') > 1) { | 
| 1350 |  |  |  |  |  |  | dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\""); | 
| 1351 |  |  |  |  |  |  | } | 
| 1352 | 194 |  |  |  |  | 255 | } | 
| 1353 | 194 | 100 |  |  |  | 479 |  | 
| 1354 | 32 |  |  |  |  | 94 | return %parsed; | 
| 1355 |  |  |  |  |  |  | } | 
| 1356 |  |  |  |  |  |  |  | 
| 1357 | 194 |  |  |  |  | 247 | my ($self, $val) = @_; | 
| 1358 | 194 | 50 |  |  |  | 388 |  | 
| 1359 | 0 |  |  |  |  | 0 | # hopefully this will retain good bits without too many hapaxen | 
| 1360 |  |  |  |  |  |  | if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { | 
| 1361 |  |  |  |  |  |  | my $boundary = $1; | 
| 1362 |  |  |  |  |  |  | $boundary = ''  if !defined $boundary;  # avoid a warning | 
| 1363 |  |  |  |  |  |  | $boundary =~ s/[a-fA-F0-9]/H/gs; | 
| 1364 | 194 | 100 |  |  |  | 471 | # break up blocks of separator chars so they become their own tokens | 
| 1365 | 126 |  |  |  |  | 247 | $boundary =~ s/([-_\.=]+)/ $1 /gs; | 
| 1366 |  |  |  |  |  |  | $val .= $boundary; | 
| 1367 |  |  |  |  |  |  | } | 
| 1368 | 194 | 100 |  |  |  | 395 |  | 
| 1369 | 26 |  |  |  |  | 102 | # stop-list words for Content-Type header: these wind up totally gray | 
| 1370 |  |  |  |  |  |  | $val =~ s/\b(?:text|charset)\b//; | 
| 1371 | 168 |  |  |  |  | 477 |  | 
| 1372 |  |  |  |  |  |  | $val; | 
| 1373 | 194 | 50 |  |  |  | 450 | } | 
| 1374 | 0 |  |  |  |  | 0 |  | 
| 1375 |  |  |  |  |  |  | my ($self, $val) = @_; | 
| 1376 |  |  |  |  |  |  | # we can (a) get rid of a lot of hapaxen and (b) increase the token | 
| 1377 |  |  |  |  |  |  | # specificity by pre-parsing some common formats. | 
| 1378 | 16 |  |  |  |  | 283 |  | 
| 1379 |  |  |  |  |  |  | # Outlook Express format: | 
| 1380 |  |  |  |  |  |  | $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ | 
| 1381 |  |  |  |  |  |  | ([0-9a-f]{4})[0-9a-f]{4}\$ | 
| 1382 | 4 |  |  | 4 |  | 17 | ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; | 
| 1383 |  |  |  |  |  |  |  | 
| 1384 |  |  |  |  |  |  | # Exim: | 
| 1385 | 4 | 50 |  |  |  | 21 | $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; | 
| 1386 | 0 |  |  |  |  | 0 |  | 
| 1387 | 0 | 0 |  |  |  | 0 | # Sendmail: | 
| 1388 | 0 |  |  |  |  | 0 | $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. | 
| 1389 |  |  |  |  |  |  | [A-F0-9]{10,12}\@//gx; | 
| 1390 | 0 |  |  |  |  | 0 |  | 
| 1391 | 0 |  |  |  |  | 0 | # try to split Message-ID segments on probable ID boundaries. Note that | 
| 1392 |  |  |  |  |  |  | # Outlook message-ids seem to contain a server identifier ID in the last | 
| 1393 |  |  |  |  |  |  | # 8 bytes before the @.  Make sure this becomes its own token, it's a | 
| 1394 |  |  |  |  |  |  | # great spam-sign for a learning system!  Be sure to split on ".". | 
| 1395 | 4 |  |  |  |  | 24 | $val =~ s/[^_A-Za-z0-9]/ /g; | 
| 1396 |  |  |  |  |  |  | $val; | 
| 1397 | 4 |  |  |  |  | 15 | } | 
| 1398 |  |  |  |  |  |  |  | 
| 1399 |  |  |  |  |  |  | my ($self, $val) = @_; | 
| 1400 |  |  |  |  |  |  |  | 
| 1401 | 16 |  |  | 16 |  | 62 | # Thanks to Dan for these.  Trim out "useless" tokens; sendmail-ish IDs | 
| 1402 |  |  |  |  |  |  | # and valid-format RFC-822/2822 dates | 
| 1403 |  |  |  |  |  |  |  | 
| 1404 |  |  |  |  |  |  | $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs;  # Sendmail | 
| 1405 |  |  |  |  |  |  | $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs;  # Sendmail | 
| 1406 | 16 |  |  |  |  | 53 | $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs;    # Sendmail | 
| 1407 |  |  |  |  |  |  | $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim | 
| 1408 |  |  |  |  |  |  |  | 
| 1409 |  |  |  |  |  |  | $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? | 
| 1410 |  |  |  |  |  |  | [0-3\s]?[0-9]\s | 
| 1411 | 16 |  |  |  |  | 38 | (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s | 
| 1412 |  |  |  |  |  |  | (?:19|20)?[0-9]{2}\s | 
| 1413 |  |  |  |  |  |  | [0-2][0-9](?:\:[0-5][0-9]){1,2}\s | 
| 1414 | 16 |  |  |  |  | 32 | (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* | 
| 1415 |  |  |  |  |  |  | //gx; | 
| 1416 |  |  |  |  |  |  |  | 
| 1417 |  |  |  |  |  |  | # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for | 
| 1418 |  |  |  |  |  |  | # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens | 
| 1419 |  |  |  |  |  |  | # (on both sides) | 
| 1420 |  |  |  |  |  |  | # also make a dup with the full IP, as fodder for | 
| 1421 | 16 |  |  |  |  | 116 | # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" | 
| 1422 | 16 |  |  |  |  | 54 | $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ | 
| 1423 |  |  |  |  |  |  | if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { | 
| 1424 |  |  |  |  |  |  | $1.$2.$3.$4. | 
| 1425 |  |  |  |  |  |  | " ip*".$1.$2.$3.$4." "; | 
| 1426 | 28 |  |  | 28 |  | 168 | } else { | 
| 1427 |  |  |  |  |  |  | $1.$2.$3. | 
| 1428 |  |  |  |  |  |  | " ip*".$1.$2.$3.$4." "; | 
| 1429 |  |  |  |  |  |  | } | 
| 1430 |  |  |  |  |  |  | }gex; | 
| 1431 | 28 |  |  |  |  | 112 |  | 
| 1432 | 28 |  |  |  |  | 83 | # trim these: they turn out as the most common tokens, but with a | 
| 1433 | 28 |  |  |  |  | 174 | # prob of about .5.  waste of space! | 
| 1434 | 28 |  |  |  |  | 65 | $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; | 
| 1435 |  |  |  |  |  |  |  | 
| 1436 | 28 |  |  |  |  | 252 | $val; | 
| 1437 |  |  |  |  |  |  | } | 
| 1438 |  |  |  |  |  |  |  | 
| 1439 |  |  |  |  |  |  | my ($self, $val) = @_; | 
| 1440 |  |  |  |  |  |  | local ($_); | 
| 1441 |  |  |  |  |  |  |  | 
| 1442 |  |  |  |  |  |  | my @addrs = $self->{main}->find_all_addrs_in_line ($val); | 
| 1443 |  |  |  |  |  |  | my @toks; | 
| 1444 |  |  |  |  |  |  | foreach (@addrs) { | 
| 1445 |  |  |  |  |  |  | push (@toks, $self->_tokenize_mail_addrs ($_)); | 
| 1446 |  |  |  |  |  |  | } | 
| 1447 |  |  |  |  |  |  | return join (' ', @toks); | 
| 1448 |  |  |  |  |  |  | } | 
| 1449 | 28 |  |  |  |  | 210 |  | 
| 1450 | 30 | 50 | 33 |  |  | 244 | my ($self, $addr) = @_; | 
|  |  |  | 33 |  |  |  |  | 
| 1451 | 0 |  |  |  |  | 0 |  | 
| 1452 |  |  |  |  |  |  | ($addr =~ /(.+)\@(.+)$/) or return (); | 
| 1453 |  |  |  |  |  |  | my @toks; | 
| 1454 | 30 |  |  |  |  | 371 | push(@toks, "U*".$1, "D*".$2); | 
| 1455 |  |  |  |  |  |  | $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } | 
| 1456 |  |  |  |  |  |  | return @toks; | 
| 1457 |  |  |  |  |  |  | } | 
| 1458 |  |  |  |  |  |  |  | 
| 1459 |  |  |  |  |  |  |  | 
| 1460 |  |  |  |  |  |  | ########################################################################### | 
| 1461 | 28 |  |  |  |  | 293 |  | 
| 1462 |  |  |  |  |  |  | # compute the probability that a token is spammish for each token | 
| 1463 | 28 |  |  |  |  | 88 | my ($self, $tokensdata, $ns, $nn) = @_; | 
| 1464 |  |  |  |  |  |  | my @probabilities; | 
| 1465 |  |  |  |  |  |  |  | 
| 1466 |  |  |  |  |  |  | return if !$ns || !$nn; | 
| 1467 | 60 |  |  | 60 |  | 157 |  | 
| 1468 | 60 |  |  |  |  | 110 | my $threshold = 1;  # ignore low-freq tokens below this s+n threshold | 
| 1469 |  |  |  |  |  |  | if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { | 
| 1470 | 60 |  |  |  |  | 207 | $threshold = 10; | 
| 1471 | 60 |  |  |  |  | 78 | } | 
| 1472 | 60 |  |  |  |  | 116 | if (!$self->{use_hapaxes}) { | 
| 1473 | 48 |  |  |  |  | 111 | $threshold = 2; | 
| 1474 |  |  |  |  |  |  | } | 
| 1475 | 60 |  |  |  |  | 309 |  | 
| 1476 |  |  |  |  |  |  | foreach my $tokendata (@{$tokensdata}) { | 
| 1477 |  |  |  |  |  |  | my $s = $tokendata->[1];  # spam count | 
| 1478 |  |  |  |  |  |  | my $n = $tokendata->[2];  # ham count | 
| 1479 | 86 |  |  | 86 |  | 262 | my $prob; | 
| 1480 |  |  |  |  |  |  |  | 
| 1481 | 86 | 50 |  |  |  | 443 | no warnings 'uninitialized';  # treat undef as zero in addition | 
| 1482 | 86 |  |  |  |  | 146 | if ($s + $n >= $threshold) { | 
| 1483 | 86 |  |  |  |  | 368 | # ignoring low-freq tokens, also covers the (!$s && !$n) case | 
| 1484 | 86 |  |  |  |  | 176 |  | 
|  | 86 |  |  |  |  | 429 |  | 
|  | 76 |  |  |  |  | 343 |  | 
| 1485 | 86 |  |  |  |  | 430 | # my $ratios = $s / $ns; | 
| 1486 |  |  |  |  |  |  | # my $ration = $n / $nn; | 
| 1487 |  |  |  |  |  |  | # $prob = $ratios / ($ration + $ratios); | 
| 1488 |  |  |  |  |  |  | # | 
| 1489 |  |  |  |  |  |  | $prob = ($s * $nn) / ($n * $ns + $s * $nn);  # same thing, faster | 
| 1490 |  |  |  |  |  |  |  | 
| 1491 |  |  |  |  |  |  | if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { | 
| 1492 |  |  |  |  |  |  | # use Robinson's f(x) equation for low-n tokens, instead of just | 
| 1493 | 4 |  |  | 4 |  | 21 | # ignoring them | 
| 1494 | 4 |  |  |  |  | 9 | my $robn = $s + $n; | 
| 1495 |  |  |  |  |  |  | $prob = | 
| 1496 | 4 | 50 | 33 |  |  | 34 | ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob)) | 
| 1497 |  |  |  |  |  |  | / | 
| 1498 | 4 |  |  |  |  | 8 | ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn); | 
| 1499 | 4 |  |  |  |  | 11 | } | 
| 1500 |  |  |  |  |  |  | } | 
| 1501 |  |  |  |  |  |  |  | 
| 1502 | 4 | 50 |  |  |  | 21 | # 'log_raw_counts' is used to log the raw data for the Bayes equations | 
| 1503 | 0 |  |  |  |  | 0 | # during a mass-check, allowing the S and X constants to be optimized | 
| 1504 |  |  |  |  |  |  | # quickly without requiring re-tokenization of the messages for each | 
| 1505 |  |  |  |  |  |  | # attempt. There's really no need for this code to be uncommented in | 
| 1506 | 4 |  |  |  |  | 10 | # normal use, however.   It has never been publicly documented, so | 
|  | 4 |  |  |  |  | 20 |  | 
| 1507 | 1104 |  |  |  |  | 1089 | # commenting it out is fine. ;) | 
| 1508 | 1104 |  |  |  |  | 982 | # | 
| 1509 | 1104 |  |  |  |  | 1025 | ## if ($self->{log_raw_counts}) { | 
| 1510 |  |  |  |  |  |  | ## $self->{raw_counts} .= " s=$s,n=$n "; | 
| 1511 | 22 |  |  | 22 |  | 253 | ## } | 
|  | 22 |  |  |  |  | 83 |  | 
|  | 22 |  |  |  |  | 25493 |  | 
| 1512 | 1104 | 100 |  |  |  | 1494 |  | 
| 1513 |  |  |  |  |  |  | push(@probabilities, $prob); | 
| 1514 |  |  |  |  |  |  | } | 
| 1515 |  |  |  |  |  |  | return \@probabilities; | 
| 1516 |  |  |  |  |  |  | } | 
| 1517 |  |  |  |  |  |  |  | 
| 1518 |  |  |  |  |  |  | # compute the probability that a token is spammish | 
| 1519 | 502 |  |  |  |  | 561 | my ($self, $token, $ns, $nn, $s, $n) = @_; | 
| 1520 |  |  |  |  |  |  |  | 
| 1521 | 502 |  |  |  |  | 465 | # we allow the caller to give us the token information, just | 
| 1522 |  |  |  |  |  |  | # to save a potentially expensive lookup | 
| 1523 |  |  |  |  |  |  | if (!defined($s) || !defined($n)) { | 
| 1524 | 502 |  |  |  |  | 498 | ($s, $n, undef) = $self->{store}->tok_get($token); | 
| 1525 | 502 |  |  |  |  | 649 | } | 
| 1526 |  |  |  |  |  |  | return if !$s && !$n; | 
| 1527 |  |  |  |  |  |  |  | 
| 1528 |  |  |  |  |  |  | my $probabilities_ref = | 
| 1529 |  |  |  |  |  |  | $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn); | 
| 1530 |  |  |  |  |  |  |  | 
| 1531 |  |  |  |  |  |  | return $probabilities_ref->[0]; | 
| 1532 |  |  |  |  |  |  | } | 
| 1533 |  |  |  |  |  |  |  | 
| 1534 |  |  |  |  |  |  | ########################################################################### | 
| 1535 |  |  |  |  |  |  | # If a token is neither hammy nor spammy, return 0. | 
| 1536 |  |  |  |  |  |  | # For a spammy token, return the minimum number of additional ham messages | 
| 1537 |  |  |  |  |  |  | # it would have had to appear in to no longer be spammy.  Hammy tokens | 
| 1538 |  |  |  |  |  |  | # are handled similarly.  That's what the function does (at the time | 
| 1539 |  |  |  |  |  |  | # of this writing, 31 July 2003, 16:02:55 CDT).  It would be slightly | 
| 1540 |  |  |  |  |  |  | # more useful if it returned the number of /additional/ ham messages | 
| 1541 |  |  |  |  |  |  | # a spammy token would have to appear in to no longer be spammy but I | 
| 1542 |  |  |  |  |  |  | # fear that might require the solution to a cubic equation, and I | 
| 1543 | 1104 |  |  |  |  | 1637 | # just don't have the time for that now. | 
| 1544 |  |  |  |  |  |  |  | 
| 1545 | 4 |  |  |  |  | 17 | my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; | 
| 1546 |  |  |  |  |  |  |  | 
| 1547 |  |  |  |  |  |  | return 0 if $ns == 0 && $nn == 0; | 
| 1548 |  |  |  |  |  |  |  | 
| 1549 |  |  |  |  |  |  | if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);} | 
| 1550 | 0 |  |  | 0 |  | 0 | if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);} | 
| 1551 |  |  |  |  |  |  |  | 
| 1552 |  |  |  |  |  |  | return 0 if $Ns == 0 || $Nn == 0; | 
| 1553 |  |  |  |  |  |  | return 0 if abs( $prob - 0.5 ) < | 
| 1554 | 0 | 0 | 0 |  |  | 0 | $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; | 
| 1555 | 0 |  |  |  |  | 0 |  | 
| 1556 |  |  |  |  |  |  | my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn); | 
| 1557 | 0 | 0 | 0 |  |  | 0 | my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; | 
| 1558 |  |  |  |  |  |  |  | 
| 1559 | 0 |  |  |  |  | 0 | return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na | 
| 1560 |  |  |  |  |  |  | unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS; | 
| 1561 |  |  |  |  |  |  |  | 
| 1562 | 0 |  |  |  |  | 0 | my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT; | 
| 1563 |  |  |  |  |  |  | my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X; | 
| 1564 |  |  |  |  |  |  | my $a = $Nb * ( 1 - $p ); | 
| 1565 |  |  |  |  |  |  | my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb; | 
| 1566 |  |  |  |  |  |  | my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) ); | 
| 1567 |  |  |  |  |  |  | my $discrim = $b * $b - 4 * $a * $c; | 
| 1568 |  |  |  |  |  |  | my $disc_max_0 = $discrim < 0 ? 0 : $discrim; | 
| 1569 |  |  |  |  |  |  | my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na; | 
| 1570 |  |  |  |  |  |  |  | 
| 1571 |  |  |  |  |  |  | # This shouldn't be necessary.  Should not be < 1 | 
| 1572 |  |  |  |  |  |  | return $dd_exact < 1 ? 1 : int($dd_exact); | 
| 1573 |  |  |  |  |  |  | } | 
| 1574 |  |  |  |  |  |  |  | 
| 1575 |  |  |  |  |  |  | ########################################################################### | 
| 1576 |  |  |  |  |  |  |  | 
| 1577 | 0 |  |  | 0 |  | 0 | my($self, $journal_only) = @_; | 
| 1578 |  |  |  |  |  |  |  | 
| 1579 | 0 | 0 | 0 |  |  | 0 | # If we're not already tied, abort. | 
| 1580 |  |  |  |  |  |  | if (!$self->{store}->db_readable()) { | 
| 1581 | 0 |  |  |  |  | 0 | dbg("bayes: opportunistic call attempt failed, DB not readable"); | 
| 1582 | 0 | 0 |  |  |  | 0 | return; | 
|  | 0 | 0 |  |  |  | 0 |  | 
| 1583 |  |  |  |  |  |  | } | 
| 1584 | 0 | 0 | 0 |  |  | 0 |  | 
| 1585 | 0 | 0 |  |  |  | 0 | # Is an expire or sync running? | 
| 1586 |  |  |  |  |  |  | my $running_expire = $self->{store}->get_running_expire_tok(); | 
| 1587 |  |  |  |  |  |  | if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { | 
| 1588 | 0 | 0 |  |  |  | 0 | dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token"); | 
| 1589 | 0 |  |  |  |  | 0 | return; | 
| 1590 |  |  |  |  |  |  | } | 
| 1591 | 0 |  |  |  |  | 0 |  | 
| 1592 |  |  |  |  |  |  | # handle expiry and syncing | 
| 1593 |  |  |  |  |  |  | if (!$journal_only && $self->{store}->expiry_due()) { | 
| 1594 | 0 |  |  |  |  | 0 | dbg("bayes: opportunistic call found expiry due"); | 
| 1595 | 0 |  |  |  |  | 0 |  | 
| 1596 | 0 |  |  |  |  | 0 | # sync will bring the DB R/W as necessary, and the expire will remove | 
| 1597 | 0 |  |  |  |  | 0 | # the running_expire token, may untie as well. | 
| 1598 | 0 |  |  |  |  | 0 | $self->{main}->{bayes_scanner}->sync(1,1); | 
| 1599 | 0 |  |  |  |  | 0 | } | 
| 1600 | 0 | 0 |  |  |  | 0 | elsif ( $self->{store}->sync_due() ) { | 
| 1601 | 0 |  |  |  |  | 0 | dbg("bayes: opportunistic call found journal sync due"); | 
| 1602 |  |  |  |  |  |  |  | 
| 1603 |  |  |  |  |  |  | # sync will bring the DB R/W as necessary, may untie as well | 
| 1604 | 0 | 0 |  |  |  | 0 | $self->{main}->{bayes_scanner}->sync(1,0); | 
| 1605 |  |  |  |  |  |  |  | 
| 1606 |  |  |  |  |  |  | # We can only remove the running_expire token if we're doing R/W | 
| 1607 |  |  |  |  |  |  | if ($self->{store}->db_writable()) { | 
| 1608 |  |  |  |  |  |  | $self->{store}->remove_running_expire_tok(); | 
| 1609 |  |  |  |  |  |  | } | 
| 1610 | 10 |  |  | 10 |  | 40 | } | 
| 1611 |  |  |  |  |  |  |  | 
| 1612 |  |  |  |  |  |  | return; | 
| 1613 | 10 | 50 |  |  |  | 120 | } | 
| 1614 | 0 |  |  |  |  | 0 |  | 
| 1615 | 0 |  |  |  |  | 0 | ########################################################################### | 
| 1616 |  |  |  |  |  |  |  | 
| 1617 |  |  |  |  |  |  | my ($self) = @_; | 
| 1618 |  |  |  |  |  |  |  | 
| 1619 | 10 |  |  |  |  | 63 | my $store; | 
| 1620 | 10 | 50 | 33 |  |  | 49 | my $module = $self->{conf}->{bayes_store_module}; | 
| 1621 | 0 |  |  |  |  | 0 | if (!$module) { | 
| 1622 | 0 |  |  |  |  | 0 | $module = 'Mail::SpamAssassin::BayesStore::DBM'; | 
| 1623 |  |  |  |  |  |  | } elsif ($module =~ /^([_A-Za-z0-9:]+)$/) { | 
| 1624 |  |  |  |  |  |  | $module = untaint_var($module); | 
| 1625 |  |  |  |  |  |  | } else { | 
| 1626 | 10 | 50 | 66 |  |  | 137 | die "bayes: invalid module: $module\n"; | 
|  |  | 50 |  |  |  |  |  | 
| 1627 | 0 |  |  |  |  | 0 | } | 
| 1628 |  |  |  |  |  |  |  | 
| 1629 |  |  |  |  |  |  | dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module); | 
| 1630 |  |  |  |  |  |  | undef $self->{store};  # DESTROYs previous object, if any | 
| 1631 | 0 |  |  |  |  | 0 | eval ' | 
| 1632 |  |  |  |  |  |  | require '.$module.'; | 
| 1633 |  |  |  |  |  |  | $store = '.$module.'->new($self); | 
| 1634 | 0 |  |  |  |  | 0 | 1; | 
| 1635 |  |  |  |  |  |  | ' or do { | 
| 1636 |  |  |  |  |  |  | my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat; | 
| 1637 | 0 |  |  |  |  | 0 | die "bayes: learner_new $module new() failed: $eval_stat\n"; | 
| 1638 |  |  |  |  |  |  | }; | 
| 1639 |  |  |  |  |  |  |  | 
| 1640 | 0 | 0 |  |  |  | 0 | dbg("bayes: learner_new: got store=%s", $store); | 
| 1641 | 0 |  |  |  |  | 0 | $self->{store} = $store; | 
| 1642 |  |  |  |  |  |  |  | 
| 1643 |  |  |  |  |  |  | $self; | 
| 1644 |  |  |  |  |  |  | } | 
| 1645 | 10 |  |  |  |  | 35 |  | 
| 1646 |  |  |  |  |  |  | ########################################################################### | 
| 1647 |  |  |  |  |  |  |  | 
| 1648 |  |  |  |  |  |  | my ($self, $pms, $info, $param) = @_; | 
| 1649 |  |  |  |  |  |  | return "Tokens not available." unless defined $info; | 
| 1650 |  |  |  |  |  |  |  | 
| 1651 | 63 |  |  | 63 | 1 | 187 | my ($limit,$fmt_arg,$more) = split /,/, ($param || '5'); | 
| 1652 |  |  |  |  |  |  |  | 
| 1653 | 63 |  |  |  |  | 160 | my %formats = ( | 
| 1654 | 63 |  |  |  |  | 235 | short => '$t', | 
| 1655 | 63 | 100 |  |  |  | 376 | Short => 'Token: \"$t\"', | 
|  |  | 50 |  |  |  |  |  | 
| 1656 | 57 |  |  |  |  | 147 | compact => '$p-$D--$t', | 
| 1657 |  |  |  |  |  |  | Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"', | 
| 1658 | 6 |  |  |  |  | 44 | medium => '$p-$D-$N--$t', | 
| 1659 |  |  |  |  |  |  | long => '$p-$d--${h}h-${s}s--${a}d--$t', | 
| 1660 | 0 |  |  |  |  | 0 | Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"' | 
| 1661 |  |  |  |  |  |  | ); | 
| 1662 |  |  |  |  |  |  |  | 
| 1663 | 63 |  |  |  |  | 267 | my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg}); | 
| 1664 | 63 |  |  |  |  | 168 |  | 
| 1665 |  |  |  |  |  |  | return "Invalid format, must be one of: ".join(",",keys %formats) | 
| 1666 |  |  |  |  |  |  | unless defined $raw_fmt; | 
| 1667 |  |  |  |  |  |  |  | 
| 1668 |  |  |  |  |  |  | my $fmt = '"'.$raw_fmt.'"'; | 
| 1669 | 63 | 50 |  |  |  | 7525 | my $amt = $limit < @$info ? $limit : @$info; | 
| 1670 | 0 | 0 |  |  |  | 0 | return "" unless $amt; | 
|  | 0 |  |  |  |  | 0 |  | 
| 1671 | 0 |  |  |  |  | 0 |  | 
| 1672 |  |  |  |  |  |  | my $ns = $pms->{bayes_nspam}; | 
| 1673 |  |  |  |  |  |  | my $nh = $pms->{bayes_nham}; | 
| 1674 | 63 |  |  |  |  | 446 | my $digit = sub { $_[0] > 9 ? "+" : $_[0] }; | 
| 1675 | 63 |  |  |  |  | 148 | my $now = time; | 
| 1676 |  |  |  |  |  |  |  | 
| 1677 | 63 |  |  |  |  | 215 | join ', ', map { | 
| 1678 |  |  |  |  |  |  | my($t,$prob,$s,$h,$u) = @$_; | 
| 1679 |  |  |  |  |  |  | my $a = int(($now - $u)/(3600 * 24)); | 
| 1680 |  |  |  |  |  |  | my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob); | 
| 1681 |  |  |  |  |  |  | my $p = sprintf "%.3f", $prob; | 
| 1682 |  |  |  |  |  |  | my $n = $s + $h; | 
| 1683 | 0 |  |  | 0 | 0 |  | my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h); | 
| 1684 | 0 | 0 |  |  |  |  | my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n); | 
| 1685 |  |  |  |  |  |  | eval $fmt;  ## no critic | 
| 1686 | 0 |  | 0 |  |  |  | } @{$info}[0..$amt-1]; | 
| 1687 |  |  |  |  |  |  | } | 
| 1688 | 0 |  |  |  |  |  |  | 
| 1689 |  |  |  |  |  |  | 1; |