File Coverage

blib/lib/Mail/SpamAssassin/Plugin/BodyEval.pm

Criterion	Covered	Total	%
statement	30	132	22.7
branch	0	56	0.0
condition	1	45	2.2
subroutine	7	16	43.7
pod	1	8	12.5
total	39	257	15.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							package Mail::SpamAssassin::Plugin::BodyEval;
19
20	22			22		155	use Mail::SpamAssassin::Plugin;
	22					50
	22					710
21	22			22		116	use Mail::SpamAssassin::Logger;
	22					50
	22					1399
22	22			22		163	use Mail::SpamAssassin::Constants qw(:sa);
	22					66
	22					3191
23
24	22			22		199	use strict;
	22					59
	22					633
25	22			22		123	use warnings;
	22					37
	22					687
26							# use bytes;
27	22			22		136	use re 'taint';
	22					54
	22					36961
28
29							our @ISA = qw(Mail::SpamAssassin::Plugin);
30
31							# constructor: register the eval rule
32							sub new {
33	63			63	1	206	my $class = shift;
34	63					147	my $mailsaobject = shift;
35
36							# some boilerplate...
37	63		33			507	$class = ref($class) \|\| $class;
38	63					357	my $self = $class->SUPER::new($mailsaobject);
39	63					167	bless ($self, $class);
40
41							# the important bit!
42	63					310	$self->register_eval_rule("multipart_alternative_difference");
43	63					236	$self->register_eval_rule("multipart_alternative_difference_count");
44	63					219	$self->register_eval_rule("check_blank_line_ratio");
45	63					221	$self->register_eval_rule("tvd_vertical_words");
46	63					231	$self->register_eval_rule("check_stock_info");
47	63					214	$self->register_eval_rule("check_body_length");
48
49	63					503	return $self;
50							}
51
52							sub multipart_alternative_difference {
53	0			0	0		my ($self, $pms, $fulltext, $min, $max) = @_;
54
55	0	0					$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
56
57	0	0	0				if (($min == 0 \|\| $pms->{madiff} > $min) &&
			0
			0
58							($max eq "undef" \|\| $pms->{madiff} <= $max)) {
59	0						return 1;
60							}
61	0						return 0;
62							}
63
64							sub multipart_alternative_difference_count {
65	0			0	0		my ($self, $pms, $fulltext, $ratio, $minhtml) = @_;
66	0	0					$self->_multipart_alternative_difference($pms) unless (exists $pms->{madiff});
67	0	0					return 0 unless $pms->{madiff_html} > $minhtml;
68	0						return(($pms->{madiff_text} / $pms->{madiff_html}) > $ratio);
69							}
70
71							sub _multipart_alternative_difference {
72	0			0			my ($self, $pms) = @_;
73	0						$pms->{madiff} = 0;
74	0						$pms->{madiff_html} = 0;
75	0						$pms->{madiff_text} = 0;
76
77	0						my $msg = $pms->{msg};
78
79							# Find all multipart/alternative parts in the message
80	0						my @ma = $msg->find_parts(qr@^multipart/alternative\b@i);
81
82							# If there are no multipart/alternative sections, skip this test.
83	0	0					return if (!@ma);
84
85							# Figure out what the MIME content of the message looks like
86	0						my @content = $msg->content_summary();
87
88							# Exchange meeting requests come in as m/a text/html text/calendar,
89							# which we want to ignore because of the high FP rate it would cause.
90							#
91	0	0	0				if (@content == 3 && $content[2] eq 'text/calendar' &&
			0
			0
92							$content[1] eq 'text/html' &&
93							$content[0] eq 'multipart/alternative') {
94	0						return;
95							}
96
97							# Go through each of the multipart parts
98	0						foreach my $part (@ma) {
99	0						my %html;
100							my %text;
101
102							# limit our search to text-based parts
103	0						my @txt = $part->find_parts(qr@^text\b@i);
104	0						foreach my $text (@txt) {
105							# we only care about the rendered version of the part
106	0						my ($type, $rnd) = $text->rendered();
107	0	0					next unless defined $type;
108
109							# parse the rendered text into tokens. assume they are whitespace
110							# separated, and ignore anything that doesn't have a word-character
111							# in it (0-9a-zA-Z_) since those are probably things like bullet
112							# points, horizontal lines, etc. this assumes that punctuation
113							# in one part will be the same in other parts.
114							#
115	0	0					if ($type eq 'text/html') {
116	0						foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
117							#dbg("eval: HTML: $w");
118	0						$html{$w}++;
119							}
120
121							# If there are no words, mark if there's at least 1 image ...
122	0	0	0				if (!%html && exists $pms->{html}{inside}{img}) {
123							# Use "\n" as the mark since it can't ever occur normally
124	0						$html{"\n"}=1;
125							}
126							}
127							else {
128	0						foreach my $w (grep(/\w/, split(/\s+/, $rnd))) {
129							#dbg("eval: TEXT: $w");
130	0						$text{$w}++;
131							}
132							}
133							}
134
135							# How many HTML tokens do we have at the start?
136	0						my $orig = keys %html;
137	0	0					next if ($orig == 0);
138
139	0						$pms->{madiff_html} = $orig;
140	0						$pms->{madiff_text} = keys %text;
141	0						dbg('eval: text words: ' . $pms->{madiff_text} . ', html words: ' . $pms->{madiff_html});
142
143							# If the token appears at least as many times in the text part as
144							# in the html part, remove it from the list of html tokens.
145	0						while(my ($k,$v) = each %text) {
146	0	0	0				delete $html{$k} if (exists $html{$k} && $html{$k}-$text{$k} < 1);
147							}
148
149							#map { dbg("eval: LEFT: $_") } keys %html;
150
151							# In theory, the tokens should be the same in both text and html
152							# parts, so there would be 0 tokens left in the html token list, for
153							# a 0% difference rate. Calculate it here, and record the difference
154							# if it's been the highest so far in this message.
155	0						my $diff = scalar(keys %html)/$orig*100;
156	0	0					$pms->{madiff} = $diff if ($diff > $pms->{madiff});
157
158	0						dbg("eval: " . sprintf "madiff: left: %d, orig: %d, max-difference: %0.2f%%", scalar(keys %html), $orig, $pms->{madiff});
159							}
160
161	0						return;
162							}
163
164							sub check_blank_line_ratio {
165	0			0	0		my ($self, $pms, $fulltext, $min, $max, $minlines) = @_;
166
167	0	0	0				if (!defined $minlines \|\| $minlines < 1) {
168	0						$minlines = 1;
169							}
170
171	0						my $blank_line_ratio_ref = $pms->{blank_line_ratio};
172
173	0	0					if (! exists $blank_line_ratio_ref->{$minlines}) {
174	0						$fulltext = $pms->get_decoded_body_text_array();
175
176	0						my $blank = 0;
177	0						my $nlines = 0;
178	0						foreach my $chunk (@$fulltext) {
179	0						foreach (split(/^/m, $chunk, -1)) {
180	0						$nlines++;
181	0	0					$blank++ if !/\S/;
182							}
183							}
184
185							# report -1 if it's a blank message ...
186	0	0					$blank_line_ratio_ref->{$minlines} =
187							$nlines < $minlines ? -1 : 100 * $blank / $nlines;
188							}
189
190							return (($min == 0 && $blank_line_ratio_ref->{$minlines} <= $max) \|\|
191							($blank_line_ratio_ref->{$minlines} > $min &&
192	0		0				$blank_line_ratio_ref->{$minlines} <= $max));
193							}
194
195							sub tvd_vertical_words {
196	0			0	0		my ($self, $pms, $text, $min, $max) = @_;
197
198							# klugy
199	0	0					$max = 101 if ($max >= 100);
200
201	0	0					if (!defined $pms->{tvd_vertical_words}) {
202	0						$pms->{tvd_vertical_words} = -1;
203
204	0						foreach (@{$text}) {
	0
205	0						my $l = length $_;
206	0	0					next unless ($l > 5);
207	0						my $spaces = tr/ / /;
208	0						my $nonspaces = $l - $spaces;
209	0						my $pct;
210	0	0	0				if ($spaces > $nonspaces \|\| $nonspaces == 0) {
211	0						$pct = 100;
212							}
213							else {
214	0						$pct = int(100*$spaces/$nonspaces);
215							}
216	0	0					$pms->{tvd_vertical_words} = $pct if ($pct > $pms->{tvd_vertical_words});
217							}
218							}
219
220	0						dbg("eval: tvd_vertical_words value: $pms->{tvd_vertical_words} / min: $min / max: $max - value must be >= min and < max");
221	0	0	0				return 1 if ($pms->{tvd_vertical_words} >= $min && $pms->{tvd_vertical_words} < $max);
222							}
223
224							sub check_stock_info {
225	0			0	0		my ($self, $pms, $fulltext, $min) = @_;
226
227	0	0					$self->_check_stock_info($pms) unless (exists $pms->{stock_info});
228
229	0	0	0				if ($min == 0 \|\| $pms->{stock_info} >= $min) {
230	0						return 1;
231							}
232	0						return 0;
233							}
234
235							sub _check_stock_info {
236	0			0			my ($self, $pms) = @_;
237	0						$pms->{stock_info} = 0;
238
239							# Find all multipart/alternative parts in the message
240	0						my @parts = $pms->{msg}->find_parts(qr@^text/plain$@i);
241	0	0					return if (!@parts);
242
243							# Go through each of the multipart parts
244	0						my %hits;
245	0						my $part = $parts[0];
246	0						my ($type, $rnd) = $part->rendered();
247	0	0					return unless $type;
248
249							# bug 5644,5717: avoid pathological cases where a regexp takes massive amount
250							# of time by applying the regexp to limited-size text chunks, one at a time
251
252	0						foreach my $rnd_chunk (
253							Mail::SpamAssassin::Message::split_into_array_of_short_paragraphs($rnd))
254							{
255	0						foreach ( $rnd_chunk =~ /^\s([^:\s][^:\n]{2,29})\s:\s*\S/mg ) {
256	0						my $str = lc $_;
257	0						$str =~ tr/a-z//cd;
258							#$str =~ s/([a-z])0([a-z])/$1o$2/g;
259
260	0	0					if ($str =~ /(
261							^trad(?:e\|ing)date\|
262							company(?:name)?\|
263							s\w?(?:t\w?o\w?c\w?k\|y\w?m(?:\w?b\w?o\w?l)?)\|
264							t(?:arget\|icker)\|
265							(?:opening\|current)p(?:rice)?\|
266							p(?:rojected\|osition)\|
267							expectations\|
268							weeks?high\|
269							marketperformance\|
270							(?:year\|week\|month\|day\|price)(?:target\|estimates?)\|
271							sector\|
272							r(?:ecommendation\|ating)
273							)$/x) {
274	0						$hits{$1}++;
275	0						dbg("eval: stock info hit: $1");
276							}
277							}
278							}
279
280	0						$pms->{stock_info} = scalar keys %hits;
281	0						dbg("eval: stock info total: ".$pms->{stock_info});
282
283	0						return;
284							}
285
286							sub check_body_length {
287	0			0	0		my ($self, $pms, undef, $min) = @_;
288
289	0						my $body_length = $pms->{msg}->{pristine_body_length};
290	0						dbg("eval: body_length - %s - check for min of %s", $body_length, $min);
291
292	0	0	0				return (defined $body_length && $body_length <= $min) ? 1 : 0;
293							}
294
295							# ---------------------------------------------------------------------------
296
297							# capability checks for "if can()":
298							#
299	0			0	0		sub has_check_body_length { 1 }
300
301							1;