File Coverage

blib/lib/Mail/SpamAssassin/PerMsgStatus.pm

Criterion	Covered	Total	%
statement	951	1264	75.2
branch	284	558	50.9
condition	91	221	41.1
subroutine	111	125	88.8
pod	43	94	45.7
total	1480	2262	65.4

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::PerMsgStatus - per-message status (spam or not-spam)
21
22							=head1 SYNOPSIS
23
24							my $spamtest = new Mail::SpamAssassin ({
25							'rules_filename' => '/etc/spamassassin.rules',
26							'userprefs_filename' => $ENV{HOME}.'/.spamassassin/user_prefs'
27							});
28							my $mail = $spamtest->parse();
29
30							my $status = $spamtest->check ($mail);
31
32							my $rewritten_mail;
33							if ($status->is_spam()) {
34							$rewritten_mail = $status->rewrite_mail ();
35							}
36							...
37
38
39							=head1 DESCRIPTION
40
41							The Mail::SpamAssassin C<check()> method returns an object of this
42							class. This object encapsulates all the per-message state.
43
44							=head1 METHODS
45
46							=over 4
47
48							=cut
49
50							package Mail::SpamAssassin::PerMsgStatus;
51
52	40			40		282	use strict;
	40					87
	40					1360
53	40			40		226	use warnings;
	40					88
	40					1354
54	40			40		258	use re 'taint';
	40					99
	40					1830
55
56	40			40		260	use Errno qw(ENOENT);
	40					98
	40					2236
57	40			40		258	use Time::HiRes qw(time);
	40					86
	40					502
58
59	40			40		5715	use Mail::SpamAssassin::Constants qw(:sa);
	40					97
	40					5873
60	40			40		14013	use Mail::SpamAssassin::AsyncLoop;
	40					1451
	40					5703
61	40			40		297	use Mail::SpamAssassin::Conf;
	40					737
	40					1126
62	40			40		280	use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize is_fqdn_valid);
	40					77
	40					2327
63	40			40		19019	use Mail::SpamAssassin::Timeout;
	40					104
	40					1260
64	40			40		274	use Mail::SpamAssassin::Logger;
	40					75
	40					50638
65
66							our @ISA = qw();
67
68							# methods defined by the compiled ruleset; deleted in finish_tests()
69							our @TEMPORARY_METHODS;
70
71							# methods defined by register_plugin_eval_glue(); deleted in finish_tests()
72							our %TEMPORARY_EVAL_GLUE_METHODS;
73
74							###########################################################################
75
76							our %common_tags;
77
78							BEGIN {
79							%common_tags = (
80
81							YESNO => sub {
82	34					71	my $pms = shift;
83	34					181	$pms->_get_tag_value_for_yesno(@_);
84							},
85
86							YESNOCAPS => sub {
87	2					6	my $pms = shift;
88	2					11	uc $pms->_get_tag_value_for_yesno(@_);
89							},
90
91							SCORE => sub {
92	38					115	my $pms = shift;
93	38					164	$pms->_get_tag_value_for_score(@_);
94							},
95
96							HITS => sub {
97	0					0	my $pms = shift;
98	0					0	$pms->_get_tag_value_for_score(@_);
99							},
100
101							REQD => sub {
102	38					101	my $pms = shift;
103	38					152	$pms->_get_tag_value_for_required_score(@_);
104							},
105
106							VERSION => \&Mail::SpamAssassin::Version,
107
108	34					117	SUBVERSION => sub { $Mail::SpamAssassin::SUB_VERSION },
109
110							RULESVERSION => sub {
111	0					0	my $pms = shift;
112	0					0	my $conf = $pms->{conf};
113	0					0	my @fnames;
114							@fnames =
115	0	0				0	keys %{$conf->{update_version}} if $conf->{update_version};
	0					0
116	0	0				0	@fnames = sort @fnames if @fnames > 1;
117	0					0	join(',', map($conf->{update_version}{$_}, @fnames));
118							},
119
120							HOSTNAME => sub {
121	39					83	my $pms = shift;
122							$pms->{conf}->{report_hostname} \|\|
123	39	50				400	Mail::SpamAssassin::Util::fq_hostname();
124							},
125
126							REMOTEHOSTNAME => sub {
127	0					0	my $pms = shift;
128	0	0				0	$pms->{tag_data}->{'REMOTEHOSTNAME'} \|\| "localhost";
129							},
130
131							REMOTEHOSTADDR => sub {
132	0					0	my $pms = shift;
133	0	0				0	$pms->{tag_data}->{'REMOTEHOSTADDR'} \|\| "127.0.0.1";
134							},
135
136							LASTEXTERNALIP => sub {
137	0					0	my $pms = shift;
138	0					0	my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
139	0	0				0	$lasthop ? $lasthop->{ip} : '';
140							},
141
142							LASTEXTERNALRDNS => sub {
143	0					0	my $pms = shift;
144	0					0	my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
145	0	0				0	$lasthop ? $lasthop->{rdns} : '';
146							},
147
148							LASTEXTERNALHELO => sub {
149	0					0	my $pms = shift;
150	0					0	my $lasthop = $pms->{msg}->{metadata}->{relays_external}->[0];
151	0	0				0	$lasthop ? $lasthop->{helo} : '';
152							},
153
154							CONTACTADDRESS => sub {
155	4					10	my $pms = shift;
156	4					24	$pms->{conf}->{report_contact};
157							},
158
159							BAYES => sub {
160	0					0	my $pms = shift;
161							defined $pms->{bayes_score} ? sprintf("%3.4f", $pms->{bayes_score})
162	0	0				0	: "0.5";
163							},
164
165	0					0	DATE => sub { Mail::SpamAssassin::Util::time_to_rfc822_date() },
166
167							STARS => sub {
168	34					95	my $pms = shift;
169	34		50			156	my $arg = (shift \|\| "*");
170	34					111	my $length = int($pms->{score});
171	34	50				157	$length = 50 if $length > 50;
172							# avoid a perl 5.21 warning: "Negative repeat count does nothing"
173	34	100				174	$length > 0 ? $arg x $length : '';
174							},
175
176							AUTOLEARN => sub {
177	34					83	my $pms = shift;
178	34					173	$pms->get_autolearn_status();
179							},
180
181							AUTOLEARNSCORE => sub {
182	0					0	my $pms = shift;
183	0					0	$pms->get_autolearn_points();
184							},
185
186							TESTS => sub {
187	34					93	my $pms = shift;
188	34		50			203	my $arg = (shift \|\| ',');
189	34	100				87	join($arg, sort @{$pms->{test_names_hit}}) \|\| "none";
	34					268
190							},
191
192							SUBTESTS => sub {
193	0					0	my $pms = shift;
194	0		0			0	my $arg = (shift \|\| ',');
195	0	0				0	join($arg, sort @{$pms->{subtest_names_hit}}) \|\| "none";
	0					0
196							},
197
198							SUBTESTSCOLLAPSED => sub {
199	0					0	my $pms = shift;
200	0		0			0	my $arg = (shift \|\| ',');
201	0					0	my (@subtests) = $pms->get_names_of_subtests_hit("collapsed");
202	0	0				0	join($arg, sort @subtests) \|\| "none";
203							},
204
205							TESTSSCORES => sub {
206	0					0	my $pms = shift;
207	0		0			0	my $arg = (shift \|\| ",");
208	0					0	my $scores = $pms->{conf}->{scores};
209							join($arg, map($_ . "=" . ($scores->{$_} \|\| '0'),
210	0	0	0			0	sort @{$pms->{test_names_hit}})) \|\| "none";
	0					0
211							},
212
213							PREVIEW => sub {
214	4					11	my $pms = shift;
215	4					30	$pms->get_content_preview();
216							},
217
218							REPORT => sub {
219	3					8	my $pms = shift;
220	3		50			37	"\n" . ($pms->{tag_data}->{REPORT} \|\| "");
221							},
222
223							SUBJPREFIX => sub {
224	0					0	my $pms = shift;
225	0	0				0	($pms->{tag_data}->{SUBJPREFIX} \|\| "");
226							},
227
228							HEADER => sub {
229	0					0	my $pms = shift;
230	0					0	my $hdr = shift;
231	0	0				0	return if !$hdr;
232	0					0	$pms->get($hdr,undef);
233							},
234
235							TIMING => sub {
236	0					0	my $pms = shift;
237	0					0	$pms->{main}->timer_report();
238							},
239
240							ADDEDHEADERHAM => sub {
241	0					0	my $pms = shift;
242	0					0	$pms->_get_added_headers('headers_ham');
243							},
244
245							ADDEDHEADERSPAM => sub {
246	0					0	my $pms = shift;
247	0					0	$pms->_get_added_headers('headers_spam');
248							},
249
250							ADDEDHEADER => sub {
251	0					0	my $pms = shift;
252							$pms->_get_added_headers(
253	0	0				0	$pms->{is_spam} ? 'headers_spam' : 'headers_ham');
254							},
255
256	40			40		577978	);
257							}
258
259							sub new {
260	154			154	0	767	my $class = shift;
261	154		33			932	$class = ref($class) \|\| $class;
262	154					499	my ($main, $msg, $opts) = @_;
263
264							my $self = {
265							'main' => $main,
266							'msg' => $msg,
267							'score' => 0,
268							'test_log_msgs' => { },
269							'test_names_hit' => [ ],
270							'subtest_names_hit' => [ ],
271							'spamd_result_log_items' => [ ],
272							'tests_already_hit' => { },
273							'c' => { },
274							'tag_data' => { },
275							'rule_errors' => 0,
276							'disable_auto_learning' => 0,
277							'auto_learn_status' => undef,
278							'auto_learn_force_status' => undef,
279							'conf' => $main->{conf},
280							'async' => Mail::SpamAssassin::AsyncLoop->new($main),
281							'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
282	154					2083	'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
283							'uri_detail_list' => { },
284							'subjprefix' => "",
285							};
286
287							dbg("check: pms new, time limit in %.3f s",
288	154	50				1472	$self->{master_deadline} - time) if $self->{master_deadline};
289
290	154	50	66			792	if (defined $opts && $opts->{disable_auto_learning}) {
291	49					136	$self->{disable_auto_learning} = 1;
292							}
293
294							# used with "mass-check --loghits"
295	154	50				622	if ($self->{main}->{save_pattern_hits}) {
296	0					0	$self->{save_pattern_hits} = 1;
297	0					0	$self->{pattern_hits} = { };
298							}
299
300	154					350	delete $self->{should_log_rule_hits};
301	154					643	my $dbgcache = would_log('dbg', 'rules');
302	154	50	33			1017	if ($dbgcache \|\| $self->{save_pattern_hits}) {
303	0					0	$self->{should_log_rule_hits} = 1;
304							}
305
306							# known valid tags that might not get their entry in pms->{tag_data}
307							# in some circumstances
308	154					405	my $tag_data_ref = $self->{tag_data};
309	154					492	foreach (qw(SUMMARY REPORT SUBJPREFIX RBL)) { $tag_data_ref->{$_} = '' }
	616					1777
310	154					606	foreach (qw(AWL AWLMEAN AWLCOUNT AWLPRESCORE
311							DCCB DCCR DCCREP PYZOR DKIMIDENTITY DKIMDOMAIN
312							BAYESTC BAYESTCLEARNED BAYESTCSPAMMY BAYESTCHAMMY
313							HAMMYTOKENS SPAMMYTOKENS TOKENSUMMARY)) {
314	2618					6468	$tag_data_ref->{$_} = undef; # exist, but undefined
315							}
316
317	154					529	bless ($self, $class);
318	154					606	$self;
319							}
320
321							sub DESTROY {
322	151			151		3482825	my ($self) = shift;
323	151					365	local $@;
324	151					447	eval { $self->delete_fulltext_tmpfile() }; # Bug 5808
	151					764
325							}
326
327							###########################################################################
328
329							=item $status->check ()
330
331							Runs the SpamAssassin rules against the message pointed to by the object.
332
333							=cut
334
335							sub check {
336	97			97	1	278	my ($self) = shift;
337	97					293	my $master_deadline = $self->{master_deadline};
338	97	50				320	if (!$master_deadline) {
339	0					0	$self->check_timed(@_);
340							} else {
341	97					1111	my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
342	97			97		1171	my $err = $t->run(sub { $self->check_timed(@_) });
	97					675
343	96	50	33			1113	if (time > $master_deadline && !$self->{deadline_exceeded}) {
344	0					0	info("check: exceeded time limit in pms check");
345	0					0	$self->{deadline_exceeded} = 1;
346							}
347							}
348							}
349
350							sub check_timed {
351	97			97	0	343	my ($self) = @_;
352	97					258	local ($_);
353
354	97					332	$self->{learned_points} = 0;
355	97					298	$self->{body_only_points} = 0;
356	97					240	$self->{head_only_points} = 0;
357	97					194	$self->{score} = 0;
358
359							# clear NetSet cache before every check to prevent it growing too large
360	97					289	foreach my $nset_name (qw(internal_networks trusted_networks msa_networks)) {
361	291					816	my $netset = $self->{conf}->{$nset_name};
362	291	50				1321	$netset->ditch_cache() if $netset;
363							}
364
365	97					754	$self->{main}->call_plugins ("check_start", { permsgstatus => $self });
366
367							# in order of slowness; fastest first, slowest last.
368							# we do ALL the tests, even if a spam triggers lots of them early on.
369							# this lets us see ludicrously spammish mails (score: 40) etc., which
370							# we can then immediately submit to spamblocking services.
371							#
372							# TODO: change this to do whitelist/blacklists first? probably a plan
373							# NOTE: definitely need AWL stuff last, for regression-to-mean of score
374
375							# TVD: we may want to do more than just clearing out the headers, but ...
376	97					897	$self->{msg}->delete_header('X-Spam-.*');
377
378							# Resident Mail::SpamAssassin code will possibly never change score
379							# sets, even if bayes becomes available. So we should do a quick check
380							# to see if we should go from {0,1} to {2,3}. We of course don't need
381							# to do this switch if we're already using bayes ... ;)
382	97					565	my $set = $self->{conf}->get_score_set();
383	97	0	66			1144	if (($set & 2) == 0 && $self->{main}->{bayes_scanner} && $self->{main}->{bayes_scanner}->is_scan_available() && $self->{conf}->{use_bayes_rules}) {
			66
			33
384	0					0	dbg("check: scoreset $set but bayes is available, switching scoresets");
385	0					0	$self->{conf}->set_score_set ($set\|2);
386							}
387	97					579	dbg("check: using scoreset $set in M:S:Pms");
388							# The primary check functionality occurs via a plugin call. For more
389							# information, please see: Mail::SpamAssassin::Plugin::Check
390	97	100				489	if (!$self->{main}->call_plugins ("check_main", { permsgstatus => $self }))
391							{
392							# did anything happen? if not, this is fatal
393	1	50				12	if (!$self->{main}->have_plugin("check_main")) {
394	1					25	die "check: no loaded plugin implements 'check_main': cannot scan!\n".
395							"Check that the necessary '.pre' files are in the config directory.\n".
396							"At a minimum, v320.pre loads the Check plugin which is required.\n";
397							}
398							}
399
400							# delete temporary storage and memory allocation used during checking
401	96					762	$self->delete_fulltext_tmpfile();
402
403							# now that we've finished checking the mail, clear out this cache
404							# to avoid unforeseen side-effects.
405	96					1133	$self->{c} = { };
406
407							# Round the score to 3 decimal places to avoid rounding issues
408							# We assume required_score to be properly rounded already.
409							# add 0 to force it back to numeric representation instead of string.
410	96					1690	$self->{score} = (sprintf "%0.3f", $self->{score}) + 0;
411
412							dbg("check: is spam? score=".$self->{score}.
413	96					1030	" required=".$self->{conf}->{required_score});
414	96					587	dbg("check: tests=".$self->get_names_of_tests_hit());
415	96					589	dbg("check: subtests=".$self->get_names_of_subtests_hit("dbg"));
416	96					433	$self->{is_spam} = $self->is_spam();
417
418	96					508	$self->{main}->{resolver}->bgabort();
419	96					519	$self->{main}->call_plugins ("check_end", { permsgstatus => $self });
420
421	96					1720	1;
422							}
423
424							###########################################################################
425
426							=item $status->learn()
427
428							After a mail message has been checked, this method can be called. If the score
429							is outside a certain range around the threshold, ie. if the message is judged
430							more-or-less definitely spam or definitely non-spam, it will be fed into
431							SpamAssassin's learning systems (currently the naive Bayesian classifier),
432							so that future similar mails will be caught.
433
434							=cut
435
436							sub learn {
437	96			96	1	294	my ($self) = shift;
438	96					224	my $master_deadline = $self->{master_deadline};
439	96	50				260	if (!$master_deadline) {
440	0					0	$self->learn_timed(@_);
441							} else {
442	96					465	my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
443	96			96		953	my $err = $t->run(sub { $self->learn_timed(@_) });
	96					815
444	96	50	33			978	if (time > $master_deadline && !$self->{deadline_exceeded}) {
445	0					0	info("learn: exceeded time limit in pms learn");
446	0					0	$self->{deadline_exceeded} = 1;
447							}
448							}
449							}
450
451							sub learn_timed {
452	96			96	0	388	my ($self) = @_;
453
454	96	100	100			848	if (!$self->{conf}->{bayes_auto_learn} \|\|
			100
455							!$self->{conf}->{use_bayes} \|\|
456							$self->{disable_auto_learning})
457							{
458	84					274	$self->{auto_learn_status} = "disabled";
459	84					1083	return;
460							}
461
462	12					29	my ($isspam, $force_autolearn, $force_autolearn_names, $arrayref);
463	12					81	$arrayref = $self->{main}->call_plugins ("autolearn_discriminator", {
464							permsgstatus => $self
465							});
466
467	12					42	$isspam = $arrayref->[0];
468	12					24	$force_autolearn = $arrayref->[1];
469	12					24	$force_autolearn_names = $arrayref->[2];
470
471							#AUTOLEARN_FORCE FLAG INFORMATION
472	12	50	66			70	if (defined $force_autolearn and $force_autolearn > 0) {
473	0					0	$self->{auto_learn_force_status} = "yes";
474	0	0				0	if (defined $force_autolearn_names) {
475	0					0	$self->{auto_learn_force_status} .= " ($force_autolearn_names)";
476							}
477							} else {
478	12					33	$self->{auto_learn_force_status} = "no";
479							}
480
481	12	100				37	if (!defined $isspam) {
482	10					22	$self->{auto_learn_status} = 'no';
483	10					151	return;
484							}
485
486
487	2					11	my $timer = $self->{main}->time_method("learn");
488
489	2					13	$self->{main}->call_plugins ("autolearn", {
490							permsgstatus => $self,
491							isspam => $isspam
492							});
493
494							# bug 3704: temporarily override learn's ability to re-learn a message
495	2					19	my $orig_learner = $self->{main}->init_learner({ "no_relearn" => 1 });
496
497	2					5	my $eval_stat;
498							eval {
499	2					21	my $learnstatus = $self->{main}->learn ($self->{msg}, undef, $isspam, 0);
500	2	50				9	if ($learnstatus->did_learn()) {
501	0	0				0	$self->{auto_learn_status} = $isspam ? "spam" : "ham";
502							}
503							# This must wait until the did_learn call.
504	2					6	$learnstatus->finish();
505	2					8	$self->{main}->finish_learner(); # for now
506
507	2	50				7	if (exists $self->{main}->{bayes_scanner}) {
508	2					8	$self->{main}->{bayes_scanner}->force_close();
509							}
510	2					12	1;
511	2	50				6	} or do {
512	0	0				0	$eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
513							};
514
515							# reset learner options to their original values
516	2					11	$self->{main}->init_learner($orig_learner);
517
518	2	50				42	if (defined $eval_stat) {
519	0					0	dbg("learn: auto-learning failed: $eval_stat");
520	0					0	$self->{auto_learn_status} = "failed";
521							}
522							}
523
524							=item $score = $status->get_autolearn_points()
525
526							Return the message's score as computed for auto-learning. Certain tests are
527							ignored:
528
529							- rules with tflags set to 'learn' (the Bayesian rules)
530
531							- rules with tflags set to 'userconf' (user white/black-listing rules, etc)
532
533							- rules with tflags set to 'noautolearn'
534
535							Also note that auto-learning occurs using scores from either scoreset 0 or 1,
536							depending on what scoreset is used during message check. It is likely that the
537							message check and auto-learn scores will be different.
538
539							=cut
540
541							sub get_autolearn_points {
542	12			12	1	41	my ($self) = @_;
543	12					62	$self->_get_autolearn_points();
544	12					62	return $self->{autolearn_points};
545							}
546
547							=item $score = $status->get_head_only_points()
548
549							Return the message's score as computed for auto-learning, ignoring
550							all rules except for header-based ones.
551
552							=cut
553
554							sub get_head_only_points {
555	12			12	1	40	my ($self) = @_;
556	12					31	$self->_get_autolearn_points();
557	12					36	return $self->{head_only_points};
558							}
559
560							=item $score = $status->get_learned_points()
561
562							Return the message's score as computed for auto-learning, ignoring
563							all rules except for learning-based ones.
564
565							=cut
566
567							sub get_learned_points {
568	12			12	1	32	my ($self) = @_;
569	12					26	$self->_get_autolearn_points();
570	12					38	return $self->{learned_points};
571							}
572
573							=item $score = $status->get_body_only_points()
574
575							Return the message's score as computed for auto-learning, ignoring
576							all rules except for body-based ones.
577
578							=cut
579
580							sub get_body_only_points {
581	12			12	1	36	my ($self) = @_;
582	12					45	$self->_get_autolearn_points();
583	12					40	return $self->{body_only_points};
584							}
585
586							=item $score = $status->get_autolearn_force_status()
587
588							Return whether a message's score included any rules that are flagged as
589							autolearn_force.
590
591							=cut
592
593							sub get_autolearn_force_status {
594	12			12	1	30	my ($self) = @_;
595	12					32	$self->_get_autolearn_points();
596	12					27	return $self->{autolearn_force};
597							}
598
599							=item $rule_names = $status->get_autolearn_force_names()
600
601							Return a list of comma separated list of rule names if a message's
602							score included any rules that are flagged as autolearn_force.
603
604							=cut
605
606							sub get_autolearn_force_names {
607	12			12	1	30	my ($self) = @_;
608	12					27	my ($names);
609
610	12					39	$self->_get_autolearn_points();
611	12					30	$names = $self->{autolearn_force_names};
612
613	12	50				51	if (defined $names) {
614							#remove trailing comma
615	0					0	$names =~ s/,$//;
616							} else {
617	12					38	$names = "";
618							}
619
620	12					38	return $names;
621							}
622
623							sub _get_autolearn_points {
624	72			72		125	my ($self) = @_;
625
626	72	100				160	return if (exists $self->{autolearn_points});
627							# ensure it only gets computed once, even if we return early
628	12					41	$self->{autolearn_points} = 0;
629
630							# This function needs to use use sum($score[scoreset % 2]) not just {score}.
631							# otherwise we shift what we autolearn on and it gets really weird. - tvd
632	12					51	my $orig_scoreset = $self->{conf}->get_score_set();
633	12					25	my $new_scoreset = $orig_scoreset;
634	12					25	my $scores = $self->{conf}->{scores};
635
636	12	50				61	if (($orig_scoreset & 2) == 0) { # we don't need to recompute
637	12					75	dbg("learn: auto-learn: currently using scoreset $orig_scoreset");
638							}
639							else {
640	0					0	$new_scoreset = $orig_scoreset & ~2;
641	0					0	dbg("learn: auto-learn: currently using scoreset $orig_scoreset, recomputing score based on scoreset $new_scoreset");
642	0					0	$scores = $self->{conf}->{scoreset}->[$new_scoreset];
643							}
644
645	12					41	my $tflags = $self->{conf}->{tflags};
646	12					34	my $points = 0;
647
648							# Just in case this function is called multiple times, clear out the
649							# previous calculated values
650	12					37	$self->{learned_points} = 0;
651	12					22	$self->{body_only_points} = 0;
652	12					27	$self->{head_only_points} = 0;
653	12					26	$self->{autolearn_force} = 0;
654
655	12					19	foreach my $test (@{$self->{test_names_hit}}) {
	12					59
656							# According to the documentation, noautolearn, userconf, and learn
657							# rules are ignored for autolearning.
658	19	100				78	if (exists $tflags->{$test}) {
659	6	100				40	next if $tflags->{$test} =~ /\bnoautolearn\b/;
660	5	50				71	next if $tflags->{$test} =~ /\buserconf\b/;
661
662							# Keep track of the learn points for an additional autolearn check.
663							# Use the original scoreset since it'll be 0 in sets 0 and 1.
664	0	0				0	if ($tflags->{$test} =~ /\blearn\b/) {
665							# we're guaranteed that the score will be defined
666	0					0	$self->{learned_points} += $self->{conf}->{scoreset}->[$orig_scoreset]->{$test};
667	0					0	next;
668							}
669
670							#IF ANY RULES ARE AUTOLEARN FORCE, SET THAT FLAG
671	0	0				0	if ($tflags->{$test} =~ /\bautolearn_force\b/) {
672	0					0	$self->{autolearn_force}++;
673							#ADD RULE NAME TO LIST
674	0					0	$self->{autolearn_force_names}.="$test,";
675							}
676							}
677
678							# ignore tests with 0 score (or undefined) in this scoreset
679	13	50				55	next if !$scores->{$test};
680
681							# Go ahead and add points to the proper locations
682							# Changed logic because in testing, I was getting both head and body. Bug 5503
683	13	100				57	if ($self->{conf}->maybe_header_only ($test)) {
		50
684	9					36	$self->{head_only_points} += $scores->{$test};
685	9					94	dbg("learn: auto-learn: adding head_only points $scores->{$test}");
686							} elsif ($self->{conf}->maybe_body_only ($test)) {
687	4					13	$self->{body_only_points} += $scores->{$test};
688	4					79	dbg("learn: auto-learn: adding body_only points $scores->{$test}");
689							} else {
690	0					0	dbg("learn: auto-learn: not considered head or body scores: $scores->{$test}");
691							}
692
693	13					64	$points += $scores->{$test};
694							}
695
696							# Figure out the final value we'll use for autolearning
697	12					144	$points = (sprintf "%0.3f", $points) + 0;
698	12					148	dbg("learn: auto-learn: message score: ".$self->{score}.", computed score for autolearn: $points");
699
700	12					49	$self->{autolearn_points} = $points;
701							}
702
703							###########################################################################
704
705							=item $isspam = $status->is_spam ()
706
707							After a mail message has been checked, this method can be called. It will
708							return 1 for mail determined likely to be spam, 0 if it does not seem
709							spam-like.
710
711							=cut
712
713							sub is_spam {
714	99			99	1	254	my ($self) = @_;
715							# changed to test this so sub-tests can ask "is_spam" during a run
716	99					462	return ($self->{score} >= $self->{conf}->{required_score});
717							}
718
719							###########################################################################
720
721							=item $list = $status->get_names_of_tests_hit ()
722
723							After a mail message has been checked, this method can be called. It will
724							return a comma-separated string, listing all the symbolic test names
725							of the tests which were triggered by the mail.
726
727							=cut
728
729							sub get_names_of_tests_hit {
730	97			97	1	294	my ($self) = @_;
731
732	97					203	return join(',', sort @{$self->{test_names_hit}});
	97					787
733							}
734
735							=item $list = $status->get_names_of_tests_hit_with_scores_hash ()
736
737							After a mail message has been checked, this method can be called. It will
738							return a pointer to a hash for rule & score pairs for all the symbolic
739							test names and individual scores of the tests which were triggered by the mail.
740
741							=cut
742
743							sub get_names_of_tests_hit_with_scores_hash {
744	0			0	1	0	my ($self) = @_;
745
746							#BASED ON CODE FOR TESTSSCORES TAG
747	0					0	my $scores = $self->{conf}->{scores};
748	0					0	my %testsscores;
749	0		0			0	$testsscores{$_} = $scores->{$_} \|\| '0' for @{$self->{test_names_hit}};
	0					0
750	0					0	return \%testsscores;
751							}
752
753							=item $list = $status->get_names_of_tests_hit_with_scores ()
754
755							After a mail message has been checked, this method can be called. It will
756							return a comma-separated string of rule=score pairs for all the symbolic
757							test names and individual scores of the tests which were triggered by the mail.
758
759							=cut
760
761							sub get_names_of_tests_hit_with_scores {
762	0			0	1	0	my ($self) = @_;
763
764							#BASED ON CODE FOR TESTSSCORES TAG
765	0					0	my $scores = $self->{conf}->{scores};
766							return join(',', map($_ . '=' . ($scores->{$_} \|\| '0'),
767	0		0			0	sort @{$self->{test_names_hit}})) \|\| "none";
768							}
769
770
771							###########################################################################
772
773							=item $list = $status->get_names_of_subtests_hit ()
774
775							After a mail message has been checked, this method can be called. It will
776							return a comma-separated string, listing all the symbolic test names of the
777							meta-rule sub-tests which were triggered by the mail. Sub-tests are the
778							normally-hidden rules, which score 0 and have names beginning with two
779							underscores, used in meta rules.
780
781							If a parameter of collapsed or dbg is passed, the output will be a condensed
782							array of sub-tests with multiple hits reduced to one entry.
783
784							If the parameter of dbg is passed, the output will be a condensed string of
785							sub-tests with multiple hits reduced to one entry with the number of hits
786							in parentheses. Some information is also added at the end regarding the
787							multiple hits.
788
789							=cut
790
791							sub get_names_of_subtests_hit {
792	96			96	1	319	my ($self, $mode) = @_;
793
794	96	50	33			929	if (defined $mode && ($mode eq 'dbg' \|\| $mode eq 'collapsed')) {
			33
795							# This routine prints only one instance of a subrule hit with a count of how many times it hit if greater than 1
796	96					213	my $total_hits = scalar(@{$self->{subtest_names_hit}});
	96					308
797	96	100				568	return '' if !$total_hits;
798
799	42					85	my %subtest_names_hit;
800	42					89	$subtest_names_hit{$_}++ foreach @{$self->{subtest_names_hit}};
	42					316
801
802	42					291	my @subtests = sort keys %subtest_names_hit;
803	42					111	my $deduplicated_hits = scalar(@subtests);
804
805	42					89	my @result;
806	42					138	foreach my $rule (@subtests) {
807	84	50				249	if ($subtest_names_hit{$rule} > 1) {
808	0					0	push @result, "$rule($subtest_names_hit{$rule})";
809							} else {
810	84					203	push @result, $rule;
811							}
812							}
813
814	42	50				172	if ($mode eq 'dbg') {
815	42					406	return join(',', @result)." (Total Subtest Hits: $total_hits / Deduplicated Total Hits: $deduplicated_hits)";
816							} else {
817	0					0	return join(',', @result);
818							}
819							} else {
820							# Return the simpler string with duplicates and commas
821	0					0	return join(',', sort @{$self->{subtest_names_hit}});
	0					0
822							}
823							}
824
825							###########################################################################
826
827							=item $num = $status->get_score ()
828
829							After a mail message has been checked, this method can be called. It will
830							return the message's score.
831
832							=cut
833
834							sub get_score {
835	0			0	1	0	my ($self) = @_;
836	0					0	return $self->{score};
837							}
838
839							# left as backward compatibility
840							sub get_hits {
841	0			0	0	0	my ($self) = @_;
842	0					0	return $self->{score};
843							}
844
845							###########################################################################
846
847							=item $num = $status->get_required_score ()
848
849							After a mail message has been checked, this method can be called. It will
850							return the score required for a mail to be considered spam.
851
852							=cut
853
854							sub get_required_score {
855	0			0	1	0	my ($self) = @_;
856	0					0	return $self->{conf}->{required_score};
857							}
858
859							# left as backward compatibility
860							sub get_required_hits {
861	0			0	0	0	my ($self) = @_;
862	0					0	return $self->{conf}->{required_score};
863							}
864
865							###########################################################################
866
867							=item $num = $status->get_autolearn_status ()
868
869							After a mail message has been checked, this method can be called. It will
870							return one of the following strings depending on whether the mail was
871							auto-learned or not: "ham", "no", "spam", "disabled", "failed", "unavailable".
872
873							It also returns is flagged with auto_learn_force, it will also include the status
874							and the rules hit. For example: "autolearn_force=yes (AUTOLEARNTEST_BODY)"
875
876							=cut
877
878							sub get_autolearn_status {
879	34			34	1	98	my ($self) = @_;
880	34		50			150	my ($status) = $self->{auto_learn_status} \|\| "unavailable";
881
882	34	100				126	if (defined $self->{auto_learn_force_status}) {
883	2					9	$status .= " autolearn_force=".$self->{auto_learn_force_status};
884							}
885
886	34					105	return $status;
887							}
888
889							###########################################################################
890
891							=item $report = $status->get_report ()
892
893							Deliver a "spam report" on the checked mail message. This contains details of
894							how many spam detection rules it triggered.
895
896							The report is returned as a multi-line string, with the lines separated by
897							C<\n> characters.
898
899							=cut
900
901							sub get_report {
902	4			4	1	901	my ($self) = @_;
903
904	4	50				21	if (!exists $self->{'report'}) {
905	4					8	my $report;
906
907	4					20	my $timer = $self->{main}->time_method("get_report");
908	4					18	$report = $self->{conf}->{report_template};
909	4		50			19	$report \|\|= '(no report template found)';
910
911	4					21	$report = $self->_replace_tags($report);
912
913	4					298	$report =~ s/\n*$/\n\n/s;
914	4					19	$self->{report} = $report;
915							}
916
917	4					19	return $self->{report};
918							}
919
920							###########################################################################
921
922							=item $preview = $status->get_content_preview ()
923
924							Give a "preview" of the content.
925
926							This is returned as a multi-line string, with the lines separated by C<\n>
927							characters, containing a fully-decoded, safe, plain-text sample of the first
928							few lines of the message body.
929
930							=cut
931
932							sub get_content_preview {
933	4			4	1	12	my ($self) = @_;
934
935	4					13	my $str = '';
936	4					7	my @ary = @{$self->get_decoded_stripped_body_text_array()};
	4					17
937	4					13	shift @ary; # drop the subject line
938
939	4					11	my $numlines = 3;
940	4		33			50	while (length ($str) < 200 && @ary && $numlines-- > 0) {
			66
941	12					102	$str .= shift @ary;
942							}
943
944							# in case the last line was huge, trim it back to around 200 chars
945	4					16	local $1;
946	4					13	$str =~ s/^(.{200}).+$/$1 [...]/gm;
947	4					23	chomp ($str); $str .= "\n";
	4					12
948
949							# now, some tidy-ups that make things look a bit prettier
950	4					31	$str =~ s/-----Original Message-----.*$//gm;
951	4					15	$str =~ s/This is a multi-part message in MIME format\.//gs;
952	4					32	$str =~ s/[-_*.]{10,}//gs;
953	4					69	$str =~ s/\s+/ /gs;
954
955							# add "Content preview:" ourselves, so that the text aligns
956							# correctly with the template -- then trim it off. We don't
957							# have to get this exactly right, but it's nicer if we
958							# make a bit of an effort ;)
959	4					36	$str = Mail::SpamAssassin::Util::wrap($str, " ", "Content preview: ", 75, 1);
960	4					23	$str =~ s/^Content preview:\s+//gs;
961
962	4					29	return $str;
963							}
964
965							###########################################################################
966
967							=item $msg = $status->get_message()
968
969							Return the object representing the message being scanned.
970
971							=cut
972
973							sub get_message {
974	0			0	1	0	my ($self) = @_;
975	0					0	return $self->{msg};
976							}
977
978							###########################################################################
979
980							=item $status->rewrite_mail ()
981
982							Rewrite the mail message. This will at minimum add headers, and at
983							maximum MIME-encapsulate the message text, to reflect its spam or not-spam
984							status. The function will return a scalar of the rewritten message.
985
986							The actual modifications depend on the configuration (see
987							C<Mail::SpamAssassin::Conf> for more information).
988
989							The possible modifications are as follows:
990
991							=over 4
992
993							=item To:, From: and Subject: modification on spam mails
994
995							Depending on the configuration, the To: and From: lines can have a
996							user-defined RFC 2822 comment appended for spam mail. The subject line
997							may have a user-defined string prepended to it for spam mail.
998
999							=item X-Spam-* headers for all mails
1000
1001							Depending on the configuration, zero or more headers with names
1002							beginning with C<X-Spam-> will be added to mail depending on whether
1003							it is spam or ham.
1004
1005							=item spam message with report_safe
1006
1007							If report_safe is set to true (1), then spam messages are encapsulated
1008							into their own message/rfc822 MIME attachment without any modifications
1009							being made.
1010
1011							If report_safe is set to false (0), then the message will only have the
1012							above headers added/modified.
1013
1014							=back
1015
1016							=cut
1017
1018							sub rewrite_mail {
1019	34			34	1	299	my ($self) = @_;
1020
1021	34					163	my $timer = $self->{main}->time_method("rewrite_mail");
1022	34		100			213	my $msg = $self->{msg}->get_mbox_separator() \|\| '';
1023
1024	34	100	66			203	if ($self->{is_spam} && $self->{conf}->{report_safe}) {
1025	2					19	$msg .= $self->rewrite_report_safe();
1026							}
1027							else {
1028	32					172	$msg .= $self->rewrite_no_report_safe();
1029							}
1030
1031	34					211	return $msg;
1032							}
1033
1034							# Make the line endings in the passed string reference appropriate
1035							# for the original mail. Callers must note bug 5250: don't rewrite
1036							# the message body, since that will corrupt 8bit attachments/MIME parts.
1037							#
1038							sub _fixup_report_line_endings {
1039	36			36		131	my ($self, $strref) = @_;
1040	36	50				187	if ($self->{msg}->{line_ending} ne "\n") {
1041	0					0	$$strref =~ s/\r?\n/$self->{msg}->{line_ending}/gs;
1042							}
1043							}
1044
1045							sub _get_added_headers {
1046	34			34		143	my ($self, $which) = @_;
1047	34					113	my $str = '';
1048							# use string appends to put this back together -- I finally benchmarked it.
1049							# join() is 56% of the speed of just using string appends. ;)
1050	34					77	foreach my $hf_ref (@{$self->{conf}->{$which}}) {
	34					166
1051	168					895	my($hfname, $hfbody) = @$hf_ref;
1052	168					471	my $line = $self->_process_header($hfname,$hfbody);
1053	168					609	$line = $self->qp_encode_header($line);
1054	168					1016	$str .= "X-Spam-$hfname: $line\n";
1055							}
1056	34					237	return $str;
1057							};
1058
1059							# rewrite the message in report_safe mode
1060							# should not be called directly, use rewrite_mail instead
1061							#
1062							sub rewrite_report_safe {
1063	2			2	0	7	my ($self) = @_;
1064
1065	2					4	my $tag;
1066
1067							# This is the original message. We do not want to make any modifications so
1068							# we may recover it if necessary. It will be put into the new message as a
1069							# message/rfc822 MIME part.
1070	2					12	my $original = $self->{msg}->get_pristine();
1071
1072							# This is the new message.
1073	2					12	my $newmsg = '';
1074
1075							# the report charset
1076	2					6	my $report_charset = "; charset=iso-8859-1";
1077	2	50				11	if ($self->{conf}->{report_charset}) {
1078	0					0	$report_charset = "; charset=" . $self->{conf}->{report_charset};
1079							}
1080
1081							# the SpamAssassin report
1082	2					15	my $report = $self->get_report();
1083
1084							# If there are any wide characters, need to MIME-encode in UTF-8
1085							# TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
1086							# we could try converting to that charset if possible
1087	2	50	33			43	unless ($] < 5.008 \|\| utf8::downgrade($report, 1)) {
1088	0					0	$report_charset = "; charset=utf-8";
1089	0					0	utf8::encode($report);
1090							}
1091
1092							# get original headers, "pristine" if we can do it
1093	2					16	my $from = $self->{msg}->get_pristine_header("From");
1094	2					16	my $to = $self->{msg}->get_pristine_header("To");
1095	2					12	my $cc = $self->{msg}->get_pristine_header("Cc");
1096	2					10	my $subject = $self->{msg}->get_pristine_header("Subject");
1097	2					14	my $msgid = $self->{msg}->get_pristine_header('Message-Id');
1098	2					12	my $date = $self->{msg}->get_pristine_header("Date");
1099
1100							# It'd be nice to do this with a foreach loop, but with only three
1101							# possibilities right now, it's easier not to...
1102
1103	2	50				16	if (defined $self->{conf}->{rewrite_header}->{Subject}) {
1104							# Add a prefix to the subject if needed
1105	0	0	0			0	if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
1106	0					0	$tag = $self->_replace_tags($self->{subjprefix});
1107	0					0	$tag =~ s/\n/ /gs;
1108	0					0	$subject = $tag . $subject;
1109							}
1110							# Add a SPAM prefix
1111	0	0				0	$subject = "\n" if !defined $subject;
1112	0					0	$tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{Subject});
1113	0					0	$tag =~ s/\n/ /gs; # strip tag's newlines
1114	0					0	$subject =~ s/^(?:\Q${tag}\E )?/${tag} /g; # For some reason the tag may already be there!?
1115							}
1116
1117	2	50				9	if (defined $self->{conf}->{rewrite_header}->{To}) {
1118	0	0				0	$to = "\n" if !defined $to;
1119	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{To});
1120	0					0	$tag =~ s/\n/ /gs; # strip tag's newlines
1121	0					0	$to =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1122							}
1123
1124	2	50				11	if (defined $self->{conf}->{rewrite_header}->{From}) {
1125	0	0				0	$from = "\n" if !defined $from;
1126	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{From});
1127	0					0	$tag =~ s/\n+//gs; # strip tag's newlines
1128	0					0	$from =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1129							}
1130
1131							# add report headers to message
1132	2	50				16	$newmsg .= "From: $from" if defined $from;
1133	2	50				14	$newmsg .= "To: $to" if defined $to;
1134	2	50				9	$newmsg .= "Cc: $cc" if defined $cc;
1135	2	50				29	$newmsg .= "Subject: $subject" if defined $subject;
1136	2	50				14	$newmsg .= "Date: $date" if defined $date;
1137	2	50				11	$newmsg .= "Message-Id: $msgid" if defined $msgid;
1138	2					13	$newmsg .= $self->_get_added_headers('headers_spam');
1139
1140	2	50				15	if (defined $self->{conf}->{report_safe_copy_headers}) {
1141	2					10	my %already_added = map { $_ => 1 } qw/from to cc subject date message-id/;
	12					38
1142
1143	2					18	foreach my $hdr (@{$self->{conf}->{report_safe_copy_headers}}) {
	2					21
1144	0	0				0	next if exists $already_added{lc $hdr};
1145	0					0	my @hdrtext = $self->{msg}->get_pristine_header($hdr);
1146	0					0	$already_added{lc $hdr}++;
1147
1148	0	0				0	if (lc $hdr eq "received") { # add Received at the top ...
1149	0					0	my $rhdr = "";
1150	0					0	foreach (@hdrtext) {
1151	0					0	$rhdr .= "$hdr: $_";
1152							}
1153	0					0	$newmsg = "$rhdr$newmsg";
1154							}
1155							else {
1156	0					0	foreach (@hdrtext) {
1157	0					0	$newmsg .= "$hdr: $_";
1158							}
1159							}
1160							}
1161							}
1162
1163							# jm: add a SpamAssassin Received header to note markup time etc.
1164							# emulates the fetchmail style.
1165							# tvd: do this after report_safe_copy_headers so Received will be done correctly
1166	2					13	$newmsg = "Received: from localhost by " .
1167							Mail::SpamAssassin::Util::fq_hostname() . "\n" .
1168							"\twith SpamAssassin (version " .
1169							Mail::SpamAssassin::Version() . ");\n" .
1170							"\t" . Mail::SpamAssassin::Util::time_to_rfc822_date() . "\n" .
1171							$newmsg;
1172
1173							# MIME boundary
1174	2					33	my $boundary = "----------=_" . sprintf("%08X.%08X",time,int(rand(2 ** 32)));
1175
1176							# ensure it's unique, so we can't be attacked this way
1177	2					78	while ($original =~ /^\Q${boundary}\E(?:--)?$/m) {
1178	0					0	$boundary .= "/".sprintf("%08X",int(rand(2 ** 32)));
1179							}
1180
1181							# determine whether Content-Disposition should be "attachment" or "inline"
1182	2					8	my $disposition;
1183	2					15	my $ct = $self->{msg}->get_header("Content-Type");
1184	2	50	33			47	if (defined $ct && $ct ne '' && $ct !~ m{text/plain}i) {
			33
1185	2					7	$disposition = "attachment";
1186	2					12	$report .= $self->_replace_tags($self->{conf}->{unsafe_report_template});
1187							# if we wanted to defang the attachment, this would be the place
1188							}
1189							else {
1190	0					0	$disposition = "inline";
1191							}
1192
1193	2					7	my $type = "message/rfc822";
1194	2	50				21	$type = "text/plain" if $self->{conf}->{report_safe} > 1;
1195
1196	2					10	my $description = $self->{conf}->{'encapsulated_content_description'};
1197
1198							# Note: the message should end in blank line since mbox format wants
1199							# blank line at end and messages may be concatenated! In addition, the
1200							# x-spam-type parameter is fixed since we will use it later to recognize
1201							# original messages that can be extracted.
1202	2					44	$newmsg .= <<"EOM";
1203							MIME-Version: 1.0
1204							Content-Type: multipart/mixed; boundary="$boundary"
1205
1206							This is a multi-part message in MIME format.
1207
1208							--$boundary
1209							Content-Type: text/plain$report_charset
1210							Content-Disposition: inline
1211							Content-Transfer-Encoding: 8bit
1212
1213							$report
1214
1215							--$boundary
1216							Content-Type: $type; x-spam-type=original
1217							Content-Description: $description
1218							Content-Disposition: $disposition
1219							Content-Transfer-Encoding: 8bit
1220
1221							EOM
1222
1223	2					11	my $newmsgtrailer = "\n--$boundary--\n\n";
1224
1225							# now fix line endings in both headers, report_safe body parts,
1226							# and new MIME boundaries and structure
1227	2					15	$self->_fixup_report_line_endings(\$newmsg);
1228	2					7	$self->_fixup_report_line_endings(\$newmsgtrailer);
1229	2					37	$newmsg .= $original.$newmsgtrailer;
1230
1231	2					37	return $newmsg;
1232							}
1233
1234							# rewrite the message in non-report_safe mode (just headers)
1235							# should not be called directly, use rewrite_mail instead
1236							#
1237							sub rewrite_no_report_safe {
1238	32			32	0	88	my ($self) = @_;
1239
1240	32					84	my $ntag;
1241	32					64	my $pref_subject = 0;
1242
1243							# put the pristine headers into an array
1244							# skip the X-Spam- headers, but allow the X-Spam-Prev headers to remain.
1245							# since there may be a missing header/body
1246							#
1247	32					157	my @pristine_headers = split(/^/m, $self->{msg}->get_pristine_header());
1248	32					198	for (my $line = 0; $line <= $#pristine_headers; $line++) {
1249	120	50				596	next unless ($pristine_headers[$line] =~ /^X-Spam-(?!Prev-)/i);
1250	0					0	splice @pristine_headers, $line, 1 while ($pristine_headers[$line] =~ /^(?:X-Spam-(?!Prev-)\|[ \t])/i);
1251	0					0	$line--;
1252							}
1253	32					96	my $separator = '';
1254	32	50	33			433	if (@pristine_headers && $pristine_headers[$#pristine_headers] =~ /^\s*$/) {
1255	32					137	$separator = pop @pristine_headers;
1256							}
1257
1258	32					107	my $addition = 'headers_ham';
1259
1260	32	50				116	if($self->{is_spam})
1261							{
1262							# special-case: Subject lines. ensure one exists, if we're
1263							# supposed to mark it up.
1264	0					0	my $created_subject = 0;
1265	0					0	my $subject = $self->{msg}->get_pristine_header('Subject');
1266	0	0	0			0	if (!defined($subject) && $self->{is_spam}
			0
1267							&& exists $self->{conf}->{rewrite_header}->{'Subject'})
1268							{
1269	0					0	push(@pristine_headers, "Subject: \n");
1270	0					0	$created_subject = 1;
1271							}
1272
1273							# Deal with header rewriting
1274	0					0	foreach (@pristine_headers) {
1275							# if we're not going to do a rewrite, skip this header!
1276	0	0				0	next if (!/^(From\|Subject\|To):/i);
1277	0					0	my $hdr = ucfirst(lc($1));
1278	0	0				0	next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1279
1280							# pop the original version onto the end of the header array
1281	0	0				0	if ($created_subject) {
1282	0					0	push(@pristine_headers, "X-Spam-Prev-Subject: (nonexistent)\n");
1283							} else {
1284	0					0	push(@pristine_headers, "X-Spam-Prev-$_");
1285							}
1286
1287							# Figure out the rewrite piece
1288	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{$hdr});
1289	0					0	$tag =~ s/\n/ /gs;
1290
1291							# The tag should be a comment for this header ...
1292	0	0				0	$tag = "($tag)" if ($hdr =~ /^(?:From\|To)$/);
1293
1294	0	0	0			0	if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1295	0	0				0	if($self->{subjprefix} ne "") {
1296	0					0	$ntag = $self->_replace_tags($self->{subjprefix});
1297	0					0	$ntag =~ s/\n/ /gs;
1298	0					0	$ntag =~ s/\s+$//;
1299
1300	0					0	local $1;
1301	0	0				0	if(defined $ntag) {
1302	0					0	s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1303							}
1304							}
1305							}
1306	0					0	s/^([^:]+:)[ \t]*(?:\Q${tag}\E )?/$1 ${tag} /i;
1307							}
1308
1309	0					0	$addition = 'headers_spam';
1310							} else {
1311							# special-case: Subject lines. ensure one exists, if we're
1312							# supposed to mark it up.
1313	32					103	my $created_subject = 0;
1314	32					123	my $subject = $self->{msg}->get_pristine_header('Subject');
1315	32	50	33			280	if (!defined($subject)
1316							&& exists $self->{conf}->{rewrite_header}->{'Subject'})
1317							{
1318	0					0	push(@pristine_headers, "Subject: \n");
1319	0					0	$created_subject = 1;
1320							}
1321
1322							# Deal with header rewriting
1323	32					106	foreach (@pristine_headers) {
1324							# if we're not going to do a rewrite, skip this header!
1325	88	50				286	next if (!/^(Subject):/i);
1326	0					0	my $hdr = ucfirst(lc($1));
1327	0	0				0	next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1328
1329	0	0	0			0	if((defined $self->{subjprefix}) and (defined $self->{conf}->{rewrite_header}->{Subject})) {
1330	0	0				0	if($self->{subjprefix} ne "") {
1331	0					0	$ntag = $self->_replace_tags($self->{subjprefix});
1332	0					0	$ntag =~ s/\n/ /gs;
1333	0					0	$ntag =~ s/\s+$//;
1334
1335	0					0	local $1;
1336	0	0				0	if(defined $ntag) {
1337	0					0	s/^([^:]+:)[ \t]*(?:\Q${ntag}\E )?/$1 ${ntag} /i;
1338							}
1339							}
1340							}
1341							}
1342
1343							}
1344
1345							# Break the pristine header set into two blocks; $new_hdrs_pre is the stuff
1346							# that we want to ensure comes before any SpamAssassin markup headers,
1347							# like the Return-Path header (see bug 3409).
1348							#
1349							# all the rest of the message headers (as left in @pristine_headers), is
1350							# to be placed after the SpamAssassin markup hdrs. Once one of those headers
1351							# is seen, all further headers go into that set; it's assumed that it's an
1352							# old copy of the header, or attempted spoofing, if it crops up halfway
1353							# through the headers.
1354
1355	32					120	my $new_hdrs_pre = '';
1356	32	50	33			288	if (@pristine_headers && $pristine_headers[0] =~ /^Return-Path:/i) {
1357	0					0	$new_hdrs_pre .= shift(@pristine_headers);
1358	0		0			0	while (@pristine_headers && $pristine_headers[0] =~ /^[ \t]/) {
1359	0					0	$new_hdrs_pre .= shift(@pristine_headers);
1360							}
1361							}
1362	32					184	$new_hdrs_pre .= $self->_get_added_headers($addition);
1363
1364							# fix up line endings appropriately
1365	32					240	my $newmsg = $new_hdrs_pre . join('',@pristine_headers) . $separator;
1366	32					163	$self->_fixup_report_line_endings(\$newmsg);
1367
1368	32					166	return $newmsg.$self->{msg}->get_pristine_body();
1369							}
1370
1371							sub qp_encode_header {
1372	168			168	0	541	my ($self, $text) = @_;
1373
1374							# return unchanged if there are no 8-bit characters
1375	168	50				936	return $text if $text !~ tr/\x00-\x7F//c;
1376
1377	0					0	my $cs = 'ISO-8859-1';
1378	0	0				0	if ($self->{report_charset}) {
1379	0					0	$cs = $self->{report_charset};
1380							}
1381
1382	0					0	my @hexchars = split('', '0123456789abcdef');
1383	0					0	my $ord;
1384	0					0	local $1;
1385	0					0	$text =~ s{([\x80-\xff])}{
1386	0					0	$ord = ord $1;
1387	0					0	'='.$hexchars[($ord & 0xf0) >> 4].$hexchars[$ord & 0x0f]
1388							}ges;
1389
1390	0					0	$text = '=?'.$cs.'?Q?'.$text.'?=';
1391
1392	0					0	dbg("markup: encoding header in $cs: $text");
1393	0					0	return $text;
1394							}
1395
1396							sub _process_header {
1397	168			168		562	my ($self, $hdr_name, $hdr_data) = @_;
1398
1399	168					470	$hdr_data = $self->_replace_tags($hdr_data);
1400	168					488	$hdr_data =~ s/(?:\r?\n)+$//; # make sure there are no trailing newlines ...
1401
1402	168	50				492	if ($self->{conf}->{fold_headers}) {
1403	168	50				428	if ($hdr_data =~ /\n/) {
1404	0					0	$hdr_data =~ s/\s\n\s/\n\t/g;
1405	0					0	return $hdr_data;
1406							}
1407							else {
1408							# use '!!' instead of ': ' so it doesn't wrap on the space
1409	168					629	my $hdr = "X-Spam-$hdr_name!!$hdr_data";
1410	168					592	$hdr = Mail::SpamAssassin::Util::wrap($hdr, "\t", "", 79, 0, '(?<=[\s,])');
1411							# make sure there are no blank lines in headers
1412							# buggy wrap might not prefix blank lines with \t, so use \s* (bug 7672)
1413	168					518	$hdr =~ s/^\s*\n//gm;
1414	168					1157	return (split (/!!/, $hdr, 2))[1]; # just return the data part
1415							}
1416							}
1417							else {
1418	0					0	$hdr_data =~ s/\n/ /g; # Can't have newlines in headers, unless folded
1419	0					0	return $hdr_data;
1420							}
1421							}
1422
1423							sub _replace_tags {
1424	174			174		281	my $self = shift;
1425	174					392	my $text = shift;
1426
1427							# default to leaving the original string in place, if we cannot find
1428							# a tag for it (bug 4793)
1429	174					651	local($1,$2,$3);
1430	174					1376	$text =~ s{(_(\w+?)(?:$(.*?)$)?_)}{
1431	430					1174	my $full = $1;
1432	430					1027	my $tag = $2;
1433	430					666	my $result;
1434	430	50				860	if ($tag =~ /^ADDEDHEADER(?:HAM\|SPAM\|)\z/) {
1435							# Bug 6278: break infinite recursion through _get_added_headers and
1436							# _get_tag on an attempt to use such tag in add_header template
1437							} else {
1438	430					908	$result = $self->get_tag_raw($tag,$3);
1439	430	50				1195	$result = join(' ',@$result) if ref $result eq 'ARRAY';
1440							}
1441	430	50				2674	defined $result ? $result : $full;
1442							}ge;
1443
1444	174					874	return $text;
1445							}
1446
1447							###########################################################################
1448
1449							# public API for plugins
1450
1451							=item $status->action_depends_on_tags($tags, $code, @args)
1452
1453							Enqueue the supplied subroutine reference C<$code>, to become runnable when
1454							all the specified tags become available. The C<$tags> may be a simple
1455							scalar - a tag name, or a listref of tag names. The subroutine C<&$code>
1456							when called will be passed a C<permessagestatus> object as its first argument,
1457							followed by the supplied (optional) list C<@args> .
1458
1459							=cut
1460
1461							sub action_depends_on_tags {
1462	81			81	1	309	my($self, $tags, $code, @args) = @_;
1463
1464	81	50				328	ref $code eq 'CODE'
1465							or die "action_depends_on_tags: argument must be a subroutine ref";
1466
1467							# tag names on which the given action depends
1468	81	50				384	my @dep_tags = !ref $tags ? uc $tags : map(uc($_),@$tags);
1469
1470							# @{$self->{tagrun_subs}} list of all submitted subroutines
1471							# @{$self->{tagrun_actions}{$tag}} bitmask of action indices blocked by tag
1472							# $self->{tagrun_tagscnt}[$action_ind] count of tags still pending
1473
1474							# store action details, obtain its index
1475	81					199	push(@{$self->{tagrun_subs}}, [$code,@args]);
	81					377
1476	81					183	my $action_ind = $#{$self->{tagrun_subs}};
	81					237
1477
1478							# list dependency tag names which are not already satisfied
1479	81					134	my @blocking_tags;
1480	81					229	foreach (@dep_tags) {
1481	81					338	my $data = $self->get_tag($_);
1482	81	50	33			537	if (!defined $data \|\| $data eq '') {
1483	81					336	push @blocking_tags, $_;
1484							}
1485							}
1486
1487	81					321	$self->{tagrun_tagscnt}[$action_ind] = scalar @blocking_tags;
1488	81					427	$self->{tagrun_actions}{$_}[$action_ind] = 1 for @blocking_tags;
1489
1490	81	50				267	if (@blocking_tags) {
1491	81					382	dbg("check: tagrun - action %s blocking on tags %s",
1492							$action_ind, join(', ',@blocking_tags));
1493							} else {
1494	0					0	dbg("check: tagrun - tag %s was ready, action %s runnable immediately: %s",
1495							join(', ',@dep_tags), $action_ind, join(', ',$code,@args));
1496	0					0	&$code($self, @args);
1497							}
1498							}
1499
1500							# tag_is_ready() will be called by set_tag(), indicating that a given
1501							# tag just received its value, possibly unblocking an action routine
1502							# as declared by action_depends_on_tags().
1503							#
1504							# Well-behaving plugins should call set_tag() once when a tag is fully
1505							# assembled and ready. Multiple calls to set the same tag value are handled
1506							# gracefully, but may result in premature activation of a pending action.
1507							# Setting tag values by plugins should not be done directly but only through
1508							# the public API set_tag(), otherwise a pending action release may be missed.
1509							#
1510							sub tag_is_ready {
1511	897			897	0	1587	my($self, $tag) = @_;
1512	897					1405	$tag = uc $tag;
1513
1514	897	50				1882	if (would_log('dbg', 'check')) {
1515	0					0	my $tag_val = $self->{tag_data}{$tag};
1516	0	0				0	dbg("check: tagrun - tag %s is now ready, value: %s",
		0
1517							$tag, !defined $tag_val ? '<UNDEF>'
1518							: ref $tag_val ne 'ARRAY' ? $tag_val
1519							: 'ARY:[' . join(',',@$tag_val) . ']' );
1520							}
1521	897	50				3003	if (ref $self->{tagrun_actions}{$tag}) { # any action blocking on this tag?
1522	0					0	my $action_ind = 0;
1523	0					0	foreach my $action_pending (@{$self->{tagrun_actions}{$tag}}) {
	0					0
1524	0	0				0	if ($action_pending) {
1525	0					0	$self->{tagrun_actions}{$tag}[$action_ind] = 0;
1526	0	0				0	if ($self->{tagrun_tagscnt}[$action_ind] <= 0) {
		0
1527							# should not happen, warn and ignore
1528							warn "tagrun error: count for $action_ind is ".
1529	0					0	$self->{tagrun_tagscnt}[$action_ind]."\n";
1530							} elsif (! --($self->{tagrun_tagscnt}[$action_ind])) {
1531	0					0	my($code,@args) = @{$self->{tagrun_subs}[$action_ind]};
	0					0
1532	0					0	dbg("check: tagrun - tag %s unblocking the action %s: %s",
1533							$tag, $action_ind, join(', ',$code,@args));
1534	0					0	&$code($self, @args);
1535							}
1536							}
1537	0					0	$action_ind++;
1538							}
1539							}
1540							}
1541
1542							# debugging aid: show actions that are still pending, waiting for their
1543							# tags to receive a value
1544							#
1545							sub report_unsatisfied_actions {
1546	136			136	0	339	my($self) = @_;
1547	136					251	my @tags;
1548	136	100				505	@tags = keys %{$self->{tagrun_actions}} if ref $self->{tagrun_actions};
	98					415
1549	136					619	for my $tag (@tags) {
1550							my @pending_actions = grep($self->{tagrun_actions}{$tag}[$_],
1551	79					225	(0 .. $#{$self->{tagrun_actions}{$tag}}));
	79					425
1552	79	50				558	dbg("check: tagrun - tag %s is still blocking action %s",
1553							$tag, join(', ', @pending_actions)) if @pending_actions;
1554							}
1555							}
1556
1557							=item $status->set_tag($tagname, $value)
1558
1559							Set a template tag, as used in C<add_header>, report templates, etc.
1560							This API is intended for use by plugins. Tag names will be converted
1561							to an all-uppercase representation internally.
1562
1563							C<$value> can be a simple scalar (string or number), or a reference to an
1564							array, in which case the public method get_tag will join array elements
1565							using a space as a separator, returning a single string for backward
1566							compatibility.
1567
1568							C<$value> can also be a subroutine reference, which will be evaluated
1569							each time the template is expanded. The first argument passed by get_tag
1570							to a called subroutine will be a PerMsgStatus object (this module's object),
1571							followed by optional arguments provided a caller to get_tag.
1572
1573							Note that perl supports closures, which means that variables set in the
1574							caller's scope can be accessed inside this C<sub>. For example:
1575
1576							my $text = "hello world!";
1577							$status->set_tag("FOO", sub {
1578							my $pms = shift;
1579							return $text;
1580							});
1581
1582							See C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details
1583							on how template tags are used.
1584
1585							C<undef> will be returned if a tag by that name has not been defined.
1586
1587							=cut
1588
1589							sub set_tag {
1590	897			897	1	2267	my($self,$tag,$val) = @_;
1591	897					2928	$self->{tag_data}->{uc $tag} = $val;
1592	897					2217	$self->tag_is_ready($tag);
1593							}
1594
1595							# public API for plugins
1596
1597							=item $string = $status->get_tag($tagname)
1598
1599							Get the current value of a template tag, as used in C<add_header>, report
1600							templates, etc. This API is intended for use by plugins. Tag names will be
1601							converted to an all-uppercase representation internally. See
1602							C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details on
1603							tags.
1604
1605							C<undef> will be returned if a tag by that name has not been defined.
1606
1607							=cut
1608
1609							sub get_tag {
1610	95			95	1	334	my($self, $tag, @args) = @_;
1611
1612	95	50				323	return if !defined $tag;
1613	95					239	$tag = uc $tag;
1614	95					225	my $data;
1615	95	100				544	if (exists $common_tags{$tag}) {
		50
1616							# tag data from traditional pre-defined tag subroutines
1617	4					27	$data = $common_tags{$tag};
1618	4	50				34	$data = $data->($self,@args) if ref $data eq 'CODE';
1619	4	50				27	$data = join(' ',@$data) if ref $data eq 'ARRAY';
1620	4	50				20	$data = "" if !defined $data;
1621							} elsif (exists $self->{tag_data}->{$tag}) {
1622							# tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1623	91					267	$data = $self->{tag_data}->{$tag};
1624	91	50				284	$data = $data->($self,@args) if ref $data eq 'CODE';
1625	91	50				359	$data = join(' ',@$data) if ref $data eq 'ARRAY';
1626	91	50				372	$data = "" if !defined $data;
1627							}
1628	95					308	return $data;
1629							}
1630
1631							=item $string = $status->get_tag_raw($tagname, @args)
1632
1633							Similar to C<get_tag>, but keeps a tag name unchanged (does not uppercase it),
1634							and does not convert arrayref tag values into a single string.
1635
1636							=cut
1637
1638							sub get_tag_raw {
1639	430			430	1	1380	my($self, $tag, @args) = @_;
1640
1641	430	50				1265	return if !defined $tag;
1642	430					609	my $data;
1643	430	100				1429	if (exists $common_tags{$tag}) {
		50
1644							# tag data from traditional pre-defined tag subroutines
1645	362					860	$data = $common_tags{$tag};
1646	362	50				1683	$data = $data->($self,@args) if ref $data eq 'CODE';
1647	362	50				1043	$data = "" if !defined $data;
1648							} elsif (exists $self->{tag_data}->{$tag}) {
1649							# tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1650	68					212	$data = $self->{tag_data}->{$tag};
1651	68	50				1068	$data = $data->($self,@args) if ref $data eq 'CODE';
1652	68	50				229	$data = "" if !defined $data;
1653							}
1654	430					1277	return $data;
1655							}
1656
1657							###########################################################################
1658
1659							# public API for plugins
1660
1661							=item $status->set_spamd_result_item($subref)
1662
1663							Set an entry for the spamd result log line. C<$subref> should be a code
1664							reference for a subroutine which will return a string in C<'name=VALUE'>
1665							format, similar to the other entries in the spamd result line:
1666
1667							Jul 17 14:10:47 radish spamd[16670]: spamd: result: Y 22 - ALL_NATURAL,
1668							DATE_IN_FUTURE_03_06,DIET_1,DRUGS_ERECTILE,DRUGS_PAIN,
1669							TEST_FORGED_YAHOO_RCVD,TEST_INVALID_DATE,TEST_NOREALNAME,
1670							TEST_NORMAL_HTTP_TO_IP,UNDISC_RECIPS scantime=0.4,size=3138,user=jm,
1671							uid=1000,required_score=5.0,rhost=localhost,raddr=127.0.0.1,
1672							rport=33153,mid=<9PS291LhupY>,autolearn=spam
1673
1674							C<name> and C<VALUE> must not contain C<=> or C<,> characters, as it
1675							is important that these log lines are easy to parse.
1676
1677							The code reference will be called by spamd after the message has been scanned,
1678							and the C<PerMsgStatus::check()> method has returned.
1679
1680							=cut
1681
1682							sub set_spamd_result_item {
1683	81			81	1	267	my ($self, $ref) = @_;
1684	81					163	push @{$self->{spamd_result_log_items}}, $ref;
	81					342
1685							}
1686
1687							# called by spamd
1688							sub get_spamd_result_log_items {
1689	0			0	0	0	my ($self) = @_;
1690	0					0	my @ret;
1691	0					0	foreach my $ref (@{$self->{spamd_result_log_items}}) {
	0					0
1692	0					0	push @ret, &$ref;
1693							}
1694	0					0	return @ret;
1695							}
1696
1697							###########################################################################
1698
1699							sub _get_tag_value_for_yesno {
1700	36			36		126	my($self, $arg) = @_;
1701	36					76	my($arg_spam, $arg_ham);
1702	36	50				126	($arg_spam, $arg_ham) = split(/,/, $arg, 2) if defined $arg;
1703	36	50				233	return $self->{is_spam} ? (defined $arg_spam ? $arg_spam : 'Yes')
		50
		100
1704							: (defined $arg_ham ? $arg_ham : 'No');
1705							}
1706
1707							sub _get_tag_value_for_score {
1708	38			38		120	my ($self, $pad) = @_;
1709
1710	38					305	my $score = sprintf("%2.1f", $self->{score});
1711	38					166	my $rscore = $self->_get_tag_value_for_required_score();
1712
1713							#Change due to bug 6419 to use Util function for consistency with spamd
1714							#and PerMessageStatus
1715	38					274	$score = Mail::SpamAssassin::Util::get_tag_value_for_score($score, $rscore, $self->{is_spam});
1716
1717							#$pad IS PROVIDED BY THE _SCORE(PAD)_ tag
1718	38	50	33			199	if (defined $pad && $pad =~ /^(0+\| +)$/) {
1719	0					0	my $count = length($1) + 3 - length($score);
1720	0	0				0	$score = (substr($pad, 0, $count) . $score) if $count > 0;
1721							}
1722	38					123	return $score;
1723
1724							}
1725
1726							sub _get_tag_value_for_required_score {
1727	76			76		135	my $self = shift;
1728	76					457	return sprintf("%2.1f", $self->{conf}->{required_score});
1729							}
1730
1731
1732							###########################################################################
1733
1734							=item $status->finish ()
1735
1736							Indicate that this C<$status> object is finished with, and can be destroyed.
1737
1738							If you are using SpamAssassin in a persistent environment, or checking many
1739							mail messages from one C<Mail::SpamAssassin> factory, this method should be
1740							called to ensure Perl's garbage collection will clean up old status objects.
1741
1742							=cut
1743
1744							sub finish {
1745	136			136	1	25640	my ($self) = @_;
1746
1747	136					846	$self->{main}->call_plugins ("per_msg_finish", {
1748							permsgstatus => $self
1749							});
1750
1751	136					821	$self->report_unsatisfied_actions;
1752
1753							# Delete out all of the members of $self. This will remove any direct
1754							# circular references and let the memory get reclaimed while also being more
1755							# efficient than a foreach() loop over the keys.
1756	136					311	%{$self} = ();
	136					4785
1757							}
1758
1759							sub finish_tests {
1760	0			0	0	0	my ($conf) = @_;
1761	0					0	foreach my $method (@TEMPORARY_METHODS) {
1762	0	0				0	if (defined &{$method}) {
	0					0
1763	0					0	undef &{$method};
	0					0
1764							}
1765							}
1766	0					0	@TEMPORARY_METHODS = (); # clear for next time
1767	0					0	%TEMPORARY_EVAL_GLUE_METHODS = ();
1768							}
1769
1770
1771							=item $name = $status->get_current_eval_rule_name()
1772
1773							Return the name of the currently-running eval rule. C<undef> is
1774							returned if no eval rule is currently being run. Useful for plugins
1775							to determine the current rule name while inside an eval test function
1776							call.
1777
1778							=cut
1779
1780							sub get_current_eval_rule_name {
1781	0			0	1	0	my ($self) = @_;
1782	0					0	return $self->{current_rule_name};
1783							}
1784
1785							###########################################################################
1786
1787							sub extract_message_metadata {
1788	96			96	0	260	my ($self) = @_;
1789
1790	96					439	my $timer = $self->{main}->time_method("extract_message_metadata");
1791	96					637	$self->{msg}->extract_message_metadata($self);
1792
1793	96					522	foreach my $item (qw(
1794							relays_trusted relays_trusted_str num_relays_trusted
1795							relays_untrusted relays_untrusted_str num_relays_untrusted
1796							relays_internal relays_internal_str num_relays_internal
1797							relays_external relays_external_str num_relays_external
1798							num_relays_unparseable last_trusted_relay_index
1799							last_internal_relay_index
1800							))
1801							{
1802	1440					3368	$self->{$item} = $self->{msg}->{metadata}->{$item};
1803							}
1804
1805							# TODO: International domain names (UTF-8) must be converted to
1806							# ASCII-compatible encoding (ACE) for the purpose of setting the
1807							# SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
1808							# (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
1809							#
1810	96					200	{ local $1;
	96					349
1811	96					691	my $addr = $self->get('EnvelopeFrom:addr', undef);
1812							# collect a FQDN, ignoring potential trailing WSP
1813	96	100	66			512	if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1814	1					14	$self->set_tag('SENDERDOMAIN', lc $1);
1815							}
1816							# TODO: the get ':addr' only returns the first address; this should be
1817							# augmented to be able to return all addresses in a header field, multiple
1818							# addresses in a From header field are allowed according to RFC 5322
1819	96					412	$addr = $self->get('From:addr', undef);
1820	96	100	66			1129	if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1821	57					667	$self->set_tag('AUTHORDOMAIN', lc $1);
1822							}
1823							}
1824
1825	96					494	$self->set_tag('RELAYSTRUSTED', $self->{relays_trusted_str});
1826	96					313	$self->set_tag('RELAYSUNTRUSTED', $self->{relays_untrusted_str});
1827	96					316	$self->set_tag('RELAYSINTERNAL', $self->{relays_internal_str});
1828	96					309	$self->set_tag('RELAYSEXTERNAL', $self->{relays_external_str});
1829	96					468	$self->set_tag('LANGUAGES', $self->{msg}->get_metadata("X-Languages"));
1830
1831							# This should happen before we get called, but just in case.
1832	96	100				408	if (!defined $self->{msg}->{metadata}->{html}) {
1833	95					403	$self->get_decoded_stripped_body_text_array();
1834							}
1835	96					351	$self->{html} = $self->{msg}->{metadata}->{html};
1836
1837							# allow plugins to add more metadata, read the stuff that's there, etc.
1838	96					578	$self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
1839							}
1840
1841							###########################################################################
1842
1843							=item $status->get_decoded_body_text_array ()
1844
1845							Returns the message body, with B<base64> or B<quoted-printable> encodings
1846							decoded, and non-text parts or non-inline attachments stripped.
1847
1848							This is the same result text as used in 'rawbody' rules.
1849
1850							It is returned as an array of strings, with each string being a 2-4kB chunk
1851							of the body, split from boundaries if possible.
1852
1853							=cut
1854
1855							sub get_decoded_body_text_array {
1856	110			110	1	558	return $_[0]->{msg}->get_decoded_body_text_array();
1857							}
1858
1859							=item $status->get_decoded_stripped_body_text_array ()
1860
1861							Returns the message body, decoded (as described in
1862							get_decoded_body_text_array()), with HTML rendered, and with whitespace
1863							normalized.
1864
1865							This is the same result text as used in 'body' rules.
1866
1867							It will always render text/html.
1868
1869							It is returned as an array of strings, with each string representing one
1870							'paragraph'. Paragraphs, in plain-text mails, are double-newline-separated
1871							blocks of multi-line text.
1872
1873							=cut
1874
1875							sub get_decoded_stripped_body_text_array {
1876	339			339	1	1387	return $_[0]->{msg}->get_rendered_body_text_array();
1877							}
1878
1879							###########################################################################
1880
1881							=item $status->get (header_name [, default_value])
1882
1883							Returns a message header, pseudo-header, real name or address.
1884							C<header_name> is the name of a mail header, such as 'Subject', 'To',
1885							etc. If C<default_value> is given, it will be used if the requested
1886							C<header_name> does not exist.
1887
1888							Appending C<:raw> to the header name will inhibit decoding of quoted-printable
1889							or base-64 encoded strings.
1890
1891							Appending a modifier C<:addr> to a header field name will cause everything
1892							except the first email address to be removed from the header field. It is
1893							mainly applicable to header fields 'From', 'Sender', 'To', 'Cc' along with
1894							their 'Resent-*' counterparts, and the 'Return-Path'. For example, all of
1895							the following will result in "example@foo":
1896
1897							=over 4
1898
1899							=item example@foo
1900
1901							=item example@foo (Foo Blah)
1902
1903							=item example@foo, example@bar
1904
1905							=item display: example@foo (Foo Blah), example@bar ;
1906
1907							=item Foo Blah <example@foo>
1908
1909							=item "Foo Blah" <example@foo>
1910
1911							=item "'Foo Blah'" <example@foo>
1912
1913							=back
1914
1915							Appending a modifier C<:name> to a header field name will cause everything
1916							except the first display name to be removed from the header field. It is
1917							mainly applicable to header fields containing a single mail address: 'From',
1918							'Sender', along with their 'Resent-From' and 'Resent-Sender' counterparts.
1919							For example, all of the following will result in "Foo Blah". One level of
1920							single quotes is stripped too, as it is often seen.
1921
1922							=over 4
1923
1924							=item example@foo (Foo Blah)
1925
1926							=item example@foo (Foo Blah), example@bar
1927
1928							=item display: example@foo (Foo Blah), example@bar ;
1929
1930							=item Foo Blah <example@foo>
1931
1932							=item "Foo Blah" <example@foo>
1933
1934							=item "'Foo Blah'" <example@foo>
1935
1936							=back
1937
1938							There are several special pseudo-headers that can be specified:
1939
1940							=over 4
1941
1942							=item C<ALL> can be used to mean the text of all the message's headers.
1943							Each header is decoded and unfolded to single line, unless called with :raw.
1944
1945							=item C<ALL-TRUSTED> can be used to mean the text of all the message's headers
1946							that could only have been added by trusted relays.
1947
1948							=item C<ALL-INTERNAL> can be used to mean the text of all the message's headers
1949							that could only have been added by internal relays.
1950
1951							=item C<ALL-UNTRUSTED> can be used to mean the text of all the message's
1952							headers that may have been added by untrusted relays. To make this
1953							pseudo-header more useful for header rules the 'Received' header that was added
1954							by the last trusted relay is included, even though it can be trusted.
1955
1956							=item C<ALL-EXTERNAL> can be used to mean the text of all the message's headers
1957							that may have been added by external relays. Like C<ALL-UNTRUSTED> the
1958							'Received' header added by the last internal relay is included.
1959
1960							=item C<ToCc> can be used to mean the contents of both the 'To' and 'Cc'
1961							headers.
1962
1963							=item C<EnvelopeFrom> is the address used in the 'MAIL FROM:' phase of the SMTP
1964							transaction that delivered this message, if this data has been made available
1965							by the SMTP server.
1966
1967							=item C<MESSAGEID> is a symbol meaning all Message-Id's found in the message;
1968							some mailing list software moves the real 'Message-Id' to 'Resent-Message-Id'
1969							or 'X-Message-Id', then uses its own one in the 'Message-Id' header. The value
1970							returned for this symbol is the text from all 3 headers, separated by newlines.
1971
1972							=item C<X-Spam-Relays-Untrusted> is the generated metadata of untrusted relays
1973							the message has passed through
1974
1975							=item C<X-Spam-Relays-Trusted> is the generated metadata of trusted relays
1976							the message has passed through
1977
1978							=back
1979
1980							=cut
1981
1982							# only uses two arguments, ignores $defval
1983							sub _get {
1984	3494			3494		6762	my ($self, $request) = @_;
1985
1986	3494					4490	my $result;
1987	3494					4509	my $getaddr = 0;
1988	3494					4236	my $getname = 0;
1989	3494					4281	my $getraw = 0;
1990
1991							# special queries - process and strip modifiers
1992	3494	100				8504	if (index($request,':') >= 0) { # triage
1993	607					1500	local $1;
1994	607					4275	while ($request =~ s/:([^:]*)//) {
1995	607	100				2473	if ($1 eq 'raw') { $getraw = 1 }
	1	100				5
		50
1996	602					2155	elsif ($1 eq 'addr') { $getaddr = $getraw = 1 }
1997	4					18	elsif ($1 eq 'name') { $getname = 1 }
1998							}
1999							}
2000	3494					6120	my $request_lc = lc $request;
2001
2002							# ALL: entire pristine or semi-raw headers
2003	3494	100				21319	if ($request eq 'ALL') {
		50
		100
		50
		50
		100
		50
		50
		50
		50
		100
		100
2004							return ($getraw ? $self->{msg}->get_pristine_header()
2005	15	50				159	: $self->{msg}->get_all_headers(0));
2006							}
2007							# ALL-TRUSTED: entire trusted raw headers
2008							elsif ($request eq 'ALL-TRUSTED') {
2009							# '+1' since we added the received header even though it's not considered
2010							# trusted, so we know that those headers can be trusted too
2011							return $self->get_all_hdrs_in_rcvd_index_range(
2012	0					0	undef, $self->{last_trusted_relay_index}+1,
2013							undef, undef, $getraw);
2014							}
2015							# ALL-INTERNAL: entire internal raw headers
2016							elsif ($request eq 'ALL-INTERNAL') {
2017							# '+1' for the same reason as in ALL-TRUSTED above
2018							return $self->get_all_hdrs_in_rcvd_index_range(
2019	4					26	undef, $self->{last_internal_relay_index}+1,
2020							undef, undef, $getraw);
2021							}
2022							# ALL-UNTRUSTED: entire untrusted raw headers
2023							elsif ($request eq 'ALL-UNTRUSTED') {
2024							# '+1' for the same reason as in ALL-TRUSTED above
2025							return $self->get_all_hdrs_in_rcvd_index_range(
2026	0					0	$self->{last_trusted_relay_index}+1, undef,
2027							undef, undef, $getraw);
2028							}
2029							# ALL-EXTERNAL: entire external raw headers
2030							elsif ($request eq 'ALL-EXTERNAL') {
2031							# '+1' for the same reason as in ALL-TRUSTED above
2032							return $self->get_all_hdrs_in_rcvd_index_range(
2033	0					0	$self->{last_internal_relay_index}+1, undef,
2034							undef, undef, $getraw);
2035							}
2036							# EnvelopeFrom: the SMTP MAIL FROM: address
2037							elsif ($request_lc eq "\LEnvelopeFrom") {
2038	110					659	$result = $self->get_envelope_from();
2039							}
2040							# untrusted relays list, as string
2041							elsif ($request_lc eq "\LX-Spam-Relays-Untrusted") {
2042	0					0	$result = $self->{relays_untrusted_str};
2043							}
2044							# trusted relays list, as string
2045							elsif ($request_lc eq "\LX-Spam-Relays-Trusted") {
2046	0					0	$result = $self->{relays_trusted_str};
2047							}
2048							# external relays list, as string
2049							elsif ($request_lc eq "\LX-Spam-Relays-External") {
2050	0					0	$result = $self->{relays_external_str};
2051							}
2052							# internal relays list, as string
2053							elsif ($request_lc eq "\LX-Spam-Relays-Internal") {
2054	0					0	$result = $self->{relays_internal_str};
2055							}
2056							# ToCc: the combined recipients list
2057							elsif ($request_lc eq "\LToCc") {
2058	81					546	$result = join("\n", $self->{msg}->get_header('To', $getraw));
2059	81	100				373	if ($result ne '') {
2060	8					39	chomp $result;
2061	8	50				62	$result .= ", " if $result =~ /\S/;
2062							}
2063	81					338	$result .= join("\n", $self->{msg}->get_header('Cc', $getraw));
2064	81	100				459	$result = undef if $result eq '';
2065							}
2066							# MESSAGEID: handle lists which move the real message-id to another
2067							# header for resending.
2068							elsif ($request eq 'MESSAGEID') {
2069	42	50				547	$result = join("\n", grep { defined($_) && $_ ne '' }
2070							$self->{msg}->get_header('X-Message-Id', $getraw),
2071							$self->{msg}->get_header('Resent-Message-Id', $getraw),
2072							$self->{msg}->get_header('X-Original-Message-ID', $getraw),
2073	81					410	$self->{msg}->get_header('Message-Id', $getraw));
2074							}
2075							# a conventional header
2076							else {
2077							my @results = $getraw ? $self->{msg}->raw_header($request)
2078	3203	100				10249	: $self->{msg}->get_header($request);
2079							# dbg("message: get(%s)%s = %s",
2080							# $request, $getraw?'raw':'', join(", ",@results));
2081	3203	100				6154	if (@results) {
2082	357					1617	$result = join('', @results);
2083							} else { # metadata
2084	2846					7381	$result = $self->{msg}->get_metadata($request);
2085							}
2086							}
2087
2088							# special queries
2089	3475	100	100			10864	if (defined $result && ($getaddr \|\| $getname)) {
			100
2090	142					476	local $1;
2091	142					443	$result =~ s/^[^:]+:(.);\s$/$1/gs; # 'undisclosed-recipients: ;'
2092	142					1042	$result =~ s/\s+/ /g; # reduce whitespace
2093	142					656	$result =~ s/^\s+//; # leading whitespace
2094	142					687	$result =~ s/\s+$//; # trailing whitespace
2095
2096	142	100				438	if ($getaddr) {
		50
2097							# Get the email address out of the header
2098							# All of these should result in "jm@foo":
2099							# jm@foo
2100							# jm@foo (Foo Blah)
2101							# jm@foo, jm@bar
2102							# display: jm@foo (Foo Blah), jm@bar ;
2103							# Foo Blah <jm@foo>
2104							# "Foo Blah" <jm@foo>
2105							# "'Foo Blah'" <jm@foo>
2106							#
2107							# strip out the (comments)
2108	138					335	$result =~ s/\s$.?$//g;
2109							# strip out the "quoted text", unless it's the only thing in the string
2110	138	100				539	if ($result !~ /^".*"$/) {
2111	137					376	$result =~ s/(?<!<)"[^"]*"(?!\@)//g; #" emacs
2112							}
2113							# Foo Blah <jm@xxx> or <jm@xxx>
2114	138					342	local $1;
2115	138					446	$result =~ s/^[^"<]?<(.?)>.*$/$1/;
2116							# multiple addresses on one line? remove all but first
2117	138					469	$result =~ s/,.*$//;
2118							}
2119							elsif ($getname) {
2120							# Get the display name out of the header
2121							# All of these should result in "Foo Blah":
2122							#
2123							# jm@foo (Foo Blah)
2124							# (Foo Blah) jm@foo
2125							# jm@foo (Foo Blah), jm@bar
2126							# display: jm@foo (Foo Blah), jm@bar ;
2127							# Foo Blah <jm@foo>
2128							# "Foo Blah" <jm@foo>
2129							# "'Foo Blah'" <jm@foo>
2130							#
2131	4					9	local $1;
2132							# does not handle mailbox-list or address-list or quotes well, to be improved
2133	4	100	66			53	if ($result =~ /^ \s* " (.?) (?<!\\)" \s < [^<>]* >/sx \|\|
		50
2134							$result =~ /^ \s* (.?) \s < [^<>]* >/sx) {
2135	3					11	$result = $1; # display-name, RFC 5322
2136							# name-addr = [display-name] angle-addr
2137							# display-name = phrase
2138							# phrase = 1*word / obs-phrase
2139							# word = atom / quoted-string
2140							# obs-phrase = word *(word / "." / CFWS)
2141	3					27	$result =~ s{ " ( (?: [^"\\] \| \\. )* ) " }
	0					0
	0					0
	0					0
2142	3					17	{ my $s=$1; $s=~s{\\(.)}{$1}gs; $s }gsxe;
2143							$result =~ s/\\"/"/gs;
2144							} elsif ($result =~ /^ [^(,]? $ (.?) $ /sx) { # legacy form
2145	0					0	# nested comments are not handled, to be improved
2146							$result = $1;
2147	1					4	} else { # no display name
2148							$result = '';
2149	4					19	}
2150							$result =~ s/^ \s* ' \s* (.?) \s ' \s* \z/$1/sx;
2151							}
2152	3475					9708	}
2153							return $result;
2154							}
2155
2156							# optimized for speed
2157							# $_[0] is self
2158							# $_[1] is request
2159							# $_[2] is defval
2160	4048			4048	1	25475	sub get {
2161	4048					5129	my $cache = $_[0]->{c};
2162	4048	100				9121	my $found;
2163							if (exists $cache->{$_[1]}) {
2164							# return cache entry if it is known
2165	554					1403	# (measured hit/attempts rate on a production mailer is about 47%)
2166							$found = $cache->{$_[1]};
2167							} else {
2168	3494					6780	# fill in a cache entry
2169	3494					11537	$found = _get(@_);
2170							$cache->{$_[1]} = $found;
2171							}
2172							# if the requested header wasn't found, we should return a default value
2173							# as specified by the caller: if defval argument is present it represents
2174							# a default value even if undef; if defval argument is absent a default
2175	4048	100				26758	# value is an empty string for upwards compatibility
		100
2176							return (defined $found ? $found : @_ > 2 ? $_[2] : '');
2177							}
2178
2179							###########################################################################
2180
2181							# uri parsing from plain text:
2182							# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
2183							# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
2184							# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
2185							# When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
2186							# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
2187							# because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
2188							# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
2189							# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
2190							# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
2191							# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
2192							# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
2193							#
2194							# We get the entire URI that would be linkified before dealing with it, in order to do the right thing
2195							# with URI-encodings and redirecting URIs.
2196							#
2197							# The delimiters for start of a URI in TBird are @(`{\|[\"'<>,\s in OE they are ("<\s
2198							#
2199							# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
2200							# TBird's end delimiters are )`{}\|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
2201							# OE only uses space as a delimiter, but ignores [~!@#^&()_+`-={}\|[]:";'<>?,.] at the end.
2202							#
2203							# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
2204							# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
2205							# delimiters include ( and '
2206							#
2207							# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
2208
2209	130			130		379	sub _tbirdurire {
2210							my ($self) = @_;
2211
2212	130	50				497	# Cached?
2213							return $self->{tbirdurire} if exists $self->{tbirdurire};
2214
2215	130					354	# a hybrid of tbird and oe's version of uri parsing
2216	130					255	my $tbirdstartdelim = '><"\'`,{[(\|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2217	130					242	my $iso2022shift = "\x1b" . '\(.'; # bug 4522
2218	130					301	my $tbirdenddelim = '><"`}\]{[\|\s' . "\x1b\xa0"; # The \x1b as per bug 4522 # \xa0 (nbsp) added 7/2019
2219							my $nonASCII = '\x80-\xff';
2220
2221	130					605	# schemeless uri start delimiter, combo of most punctuations and delims above
2222							my $scstartdelim = qr/[\!\"\#\$\&\'\*\+\,\/\:\;\<\=\>\?\@\[\\\]\^\`\{\\|\}\~\s\x1b\xa0]/;
2223
2224							# bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
2225	130					425	# it's a common thing to find in text
2226	130					365	my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
2227							my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
2228
2229	130					403	# valid TLDs
2230							my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
2231
2232							# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
2233	130					337748	# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
2234	130					2474	my $urischemeless = qr/([a-z\d][a-z\d._-]{0,251}\.${tldsRE})\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,2048})?/i;
2235	130					2173	my $uriknownscheme = qr/(?:(?:https?\|ftp):\/\/\|(?:www\d{0,2}\|ftp)\.)[^$tbirdenddelim]{1,2048}/i;
2236							my $urimailscheme = qr/(?:mailto:[^$tbirdenddelimemail]{1,2048}\|[^$tbirdenddelimplusat]{1,251}\@[^$tbirdenddelimemail]{1,251})/i;
2237	130					344459
2238							$self->{tbirdurire} = qr/(?:\b\|(?<=$iso2022shift)\|(?<=[$tbirdstartdelim]))
2239							(?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]\|\z))) \|
2240							(?:($urimailscheme)(?=(?:[$tbirdenddelimemail]\|\z))) \|
2241							(?:(?:^\|(?<=$scstartdelim))($urischemeless)(?=(?:[$tbirdenddelim]\|\z))))/ix;
2242	130					1359
2243							return $self->{tbirdurire};
2244							}
2245
2246							=item $status->get_uri_list ()
2247
2248							Returns an array of all unique URIs found in the message. It takes
2249							a combination of the URIs found in the rendered (decoded and HTML
2250							stripped) body and the URIs found when parsing the HTML in the message.
2251							Will also set $status->{uri_list} (the array as returned by this function).
2252
2253							The returned array will include the "raw" URI as well as
2254							"slightly cooked" versions. For example, the single URI
2255							'http://%77w%77.example.com/' will get turned into:
2256							( 'http://%77w%77.example.com/', 'http://www.example.com/' )
2257
2258							=cut
2259
2260	130			130	1	523	sub get_uri_list {
2261							my ($self) = @_;
2262
2263	130	50				531	# use cached answer if available
2264	0					0	if (exists $self->{uri_list}) {
	0					0
2265							return @{$self->{uri_list}};
2266							}
2267	130					249
2268							my %uris;
2269							# $self->{redirect_num} = 0;
2270
2271	130					259	# get URIs from text/HTML parsing
	531					1091
2272	401	50				841	while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
2273	401					500	if ($info->{cleaned}) {
	401					748
2274	540					1173	foreach (@{$info->{cleaned}}) {
2275							$uris{$_} = 1;
2276
2277							# count redirection attempts and log it
2278							# if (my @http = m{\b(https?:/{0,2})}gi) {
2279							# $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
2280							# }
2281							}
2282							}
2283							}
2284	130					557
	130					461
2285							@{$self->{uri_list}} = keys %uris;
2286							# $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
2287	130					254
	130					797
2288							return @{$self->{uri_list}};
2289							}
2290
2291							=item $status->get_uri_detail_list ()
2292
2293							Returns a hash reference of all unique URIs found in the message and
2294							various data about where the URIs were found in the message. It takes a
2295							combination of the URIs found in the rendered (decoded and HTML stripped)
2296							body and the URIs found when parsing the HTML in the message. Will also
2297							set $status->{uri_detail_list} (the hash reference as returned by this
2298							function).
2299
2300							The hash format looks something like this:
2301
2302							raw_uri => {
2303							types => { a => 1, img => 1, parsed => 1, domainkeys => 1,
2304							unlinked => 1, schemeless => 1 },
2305							cleaned => [ canonicalized_uri ],
2306							anchor_text => [ "click here", "no click here" ],
2307							domains => { domain1 => 1, domain2 => 1 },
2308							hosts => { host1 => domain1, host2 => domain2 },
2309							}
2310
2311							C<raw_uri> is whatever the URI was in the message itself
2312							(http://spamassassin.apache%2Eorg/). Uris parsed from text will be prefixed
2313							with scheme if missing (http://, mailto: etc). HTML uris are as found.
2314
2315							C<types> is a hash of the HTML tags (lowercase) which referenced the
2316							raw_uri. I<parsed> is a faked type which specifies that the raw_uri was
2317							seen in the rendered text. I<domainkeys> is defined when raw_uri was found
2318							from DK/DKIM d= field. I<unlinked> is defined when it's assumed that MUA
2319							will not linkify uri (found in body without scheme or www. prefix).
2320							I<schemeless> is always added for uris without scheme, regardless of
2321							linkifying (i.e. email address found in body without mailto:).
2322
2323							C<cleaned> is an array of the raw and canonicalized version of the raw_uri
2324							(http://spamassassin.apache%2Eorg/, https://spamassassin.apache.org/).
2325
2326							C<anchor_text> is an array of the anchor text (text between <a> and
2327							</a>), if any, which linked to the URI.
2328
2329							C<domains> is a hash of the domains found in the canonicalized URIs.
2330
2331							C<hosts> is a hash of unstripped hostnames found in the canonicalized URIs
2332							as hash keys, with their domain part stored as a value of each hash entry.
2333
2334							=cut
2335
2336	535			535	1	934	sub get_uri_detail_list {
2337							my ($self) = @_;
2338
2339							# process only once, use unique uri_detail_list_run flag,
2340	535	100				1114	# in case add_uri_detail_list has already been called
2341	405					1183	if ($self->{uri_detail_list_run}) {
2342							return $self->{uri_detail_list};
2343	130					330	}
2344							$self->{uri_detail_list_run} = 1;
2345	130					565
2346							my $timer = $self->{main}->time_method("get_uri_detail_list");
2347
2348	130					700	# process text parsed uris
2349							$self->_process_text_uri_list();
2350	130					888	# process html uris
2351							$self->_process_html_uri_list();
2352	130					738	# process dkim uris
2353							$self->_process_dkim_uri_list();
2354	130					811
2355							return $self->{uri_detail_list};
2356							}
2357
2358	130			130		330	sub _process_text_uri_list {
2359							my ($self) = @_;
2360
2361	130					474	# Use decoded stripped body, which does not contain HTML
2362	130					577	my $textary = $self->get_decoded_stripped_body_text_array();
2363	130					328	my $tbirdurire = $self->_tbirdurire;
2364	130					672	my %seen;
2365							my $would_log_uri_all = would_log('dbg', 'uri-all') == 2; # cache
2366	130					643
2367							foreach my $text (@$textary) {
2368							# a workaround for [perl #69973] bug:
2369							# Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
2370							# Bug 6225, regexp and string should both be utf8, or none of them;
2371							# untainting string also seems to avoid the crash
2372							#
2373	669					1852	# Bug 6225: untaint the string in an attempt to work around a perl crash
2374							local $_ = untaint_var($text);
2375	669					1857
2376	669					67022	local($1,$2,$3);
2377	324		66			1526	while (/$tbirdurire/igo) {
2378	324					702	my $rawuri = $1\|\|$2\|\|$3;
2379	324	100				922	my $schost = $4;
		100
2380	324					785	my $rawtype = defined $1 ? 'scheme' : defined $2 ? 'mail' : 'schemeless';
2381	324					1940	$rawuri =~ s/(^[^(])\).$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceding it
2382							$rawuri =~ s/[-~!@#^&()_+=:;\'?,.]$//; # remove trailing string of punctuations that TBird ignores
2383	324	100				1143
2384	305					864	next if exists $seen{$rawuri};
2385							$seen{$rawuri} = 1;
2386	305	50				738
2387							dbg("uri: found rawuri from text ($rawtype): $rawuri") if $would_log_uri_all;
2388
2389	305	100	100			894	# Quick ignore if schemeless host not valid
2390							next if defined $schost && !is_fqdn_valid($schost, 1);
2391
2392							# Ignore cid: mid: as they can be mistaken for emails,
2393							# these should not be parsed from stripped body in any case.
2394	296	50				803	# Example: [cid:image001.png@01D4986E.E3459640]
2395							next if $rawuri =~ /^[cm]id:/i;
2396
2397	296	50				895	# Ignore empty uris
2398							next if $rawuri =~ /^\w+:\/{0,2}$/i;
2399	296					807
2400							my $types = {parsed => 1};
2401
2402							# If it's a hostname that was just sitting out in the
2403							# open, without a protocol, and not inside of an HTML tag,
2404							# the we should add the proper protocol in front, rather
2405	296					531	# than using the base URI.
2406	296	100				1031	my $uri = $rawuri;
2407	140	100				613	if ($uri !~ /^(?:https?\|ftp\|mailto):/i) {
		100
		100
2408	1					3	if ($uri =~ /^ftp\./i) {
2409							$uri = "ftp://$uri";
2410							}
2411	59					166	elsif ($uri =~ /^www\d{0,2}\./i) {
2412							$uri = "http://$uri";
2413							}
2414							elsif (index($uri, '@') != -1) {
2415							# This is not linkified by MUAs: foo@bar%2Ecom
2416							# This IS linkified: foo@bar%2Ebar.com
2417							# And this is linkified: foo@bar%2Ecom?foo.com&bar (woot??)
2418							# And this is linkified with Outlook: foo@bar%2Ecom&foo (woot??)
2419							# ...
2420							# Skip if no dot found after @, tested without urldecoding,
2421	29	50				158	# quick skip for crap like Vi@gra.
2422	29	50				96	next unless $uri =~ /\@.+?\./;
2423	29					69	next if index($uri, ' ') != -1; # ignore garbled
2424	29					85	$uri =~ s/^(?:skype\|e?-?mail)?:+//i; # strip common misparses
2425							$uri = "mailto:$uri";
2426							}
2427							else {
2428							# some spammers are using unschemed URIs to escape filters
2429							# flag that this is a URI that MUAs don't linkify so only use for RBLs
2430	51	50				124	# (TODO: why only use for RBLs?? why not uri rules? Use tflags to choose?)
2431	51					106	next if index($uri, '.') == -1; # skip unless dot found, garbage
2432	51					119	$uri = "http://$uri";
2433							$types->{unlinked} = 1;
2434							}
2435	140					296	# Mark any of those schemeless
2436							$types->{schemeless} = 1;
2437							}
2438	296	100				896
2439							if ($uri =~ /^mailto:/i) {
2440	36	50				109	# MUAs linkify and urldecode mailto:foo%40bar%2Fcom
2441							$uri = Mail::SpamAssassin::Util::url_encode($uri) if $uri =~ /\%[0-9a-f]{2}/i;
2442	36	100				143	# Skip unless @ found after decoding, then check tld is valid
2443	34					90	next unless $uri =~ /\@([^?&>]*)/;
	34					69
2444	34	100				133	my $host = $1; $host =~ s/(?:\%20)+$//; # strip trailing %20 from host
2445							next unless $self->{main}->{registryboundaries}->is_domain_valid($host);
2446							}
2447	291	50				628
2448							dbg("uri: parsed uri from text ($rawtype): $uri") if $would_log_uri_all;
2449	291					843
2450							$self->add_uri_detail_list($uri, $types, 'parsed', 1);
2451							}
2452							}
2453							}
2454
2455	130			130		411	sub _process_html_uri_list {
2456							my ($self) = @_;
2457
2458							# get URIs from HTML parsing
2459	130		100			1229	# use the metadata version since $self->{html} may not be setup
2460	130	50				559	my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} \|\| { };
2461							$self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
2462
2463	130					314	# canonicalize the HTML parsed URIs
	257					1212
2464	127	50				308	while(my($uri, $info) = each %{ $detail }) {
2465							if ($self->add_uri_detail_list($uri, $info->{types}, 'html', 0)) {
2466	127	100				274	# Need also to copy and uniq anchor text
2467	118					176	if (exists $info->{anchor_text}) {
2468	118					166	my %seen;
	118					437
	118					260
2469	118					188	foreach (grep { !$seen{$_}++ } @{$info->{anchor_text}}) {
	118					469
2470							push @{$self->{uri_detail_list}->{$uri}->{anchor_text}}, $_;
2471							}
2472							}
2473							}
2474							}
2475							}
2476
2477	130			130		369	sub _process_dkim_uri_list {
2478							my ($self) = @_;
2479
2480							# This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
2481							# this functionality
2482							# 2013-01-07
2483							# This functionality is re-enabled as a configuration option disabled by
2484							# default (bug 7087)
2485							# 2014-10-06
2486
2487	130	50				575	# Look for the domain in DK/DKIM headers
2488	130					547	if ($self->{conf}->{parse_dkim_uris}) {
	260					1209
2489							my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef ),
2490	130					697	$self->get('DKIM-Signature',undef) ));
2491	0					0	while ($dk =~ /\bd\s=\s([^;]+)/g) {
2492	0					0	my $d = $1;
2493							$d =~ s/\s+//g;
2494	0					0	# prefix with domainkeys: so it doesn't merge with identical keys
2495							$self->add_uri_detail_list("domainkeys:$d",
2496							{'domainkeys'=>1, 'nocanon'=>1, 'noclean'=>1},
2497							'domainkeys', 1);
2498							}
2499							}
2500							}
2501
2502							=item $status->add_uri_detail_list ($raw_uri, $types, $source, $valid_domain)
2503
2504							Adds values to internal uri_detail_list. When used from Plugins, recommended
2505							to call from parsed_metadata (along with register_method_priority, -10) so
2506							other Plugins calling get_uri_detail_list() will see it.
2507
2508							C<raw_uri> is the URI to be added. The only required parameter.
2509
2510							C<types> is an optional hash reference, contents are added to
2511							uri_detail_list->{types} (see get_uri_detail_list for known keys).
2512							I<parsed> is default is no hash given. I<nocanon> does not run
2513							uri_list_canonicalize (no redirector, uri fixing). I<noclean> skips adding
2514							uri_detail_list->{cleaned}, so it would not be used in "uri" rule checks,
2515							but domain/hosts would still be used for URIBL/RBL purposes.
2516
2517							C<source> is an optional simple string, only used for debug logging purposes
2518							to identify where uri originates from (default: "parsed").
2519
2520							C<valid_domain> is an optional boolean (0/1). If true, uri will not be
2521							added unless hostname/domain is in valid format and contains a valid TLD.
2522							(default: 0)
2523
2524							=cut
2525
2526	418			418	1	952	sub add_uri_detail_list {
2527							my ($self, $uri, $types, $source, $valid_domain) = @_;
2528	418	50				911
2529	418		50			822	$types = {'parsed' => 1} unless defined $types;
2530							$source \|\|= 'parsed';
2531	418					645
2532	418					672	my (%domains, %hosts, %cleaned);
2533							my $udl = $self->{uri_detail_list};
2534	418					1616
2535							dbg("uri: canonicalizing $source uri: $uri");
2536	418					642
2537	418	50				875	my @uris;
2538	0					0	if ($types->{nocanon}) {
2539							push @uris, $uri;
2540	418					1571	} else {
2541							@uris = uri_list_canonicalize($self->{conf}->{redirector_patterns}, $uri);
2542	418					976	}
2543							foreach my $cleanuri (@uris) {
2544	558	50				1216	# Make sure all the URIs are nice and short
2545	0					0	if (length($cleanuri) > MAX_URI_LENGTH) {
2546	0					0	$self->{'uri_truncated'} = 1;
2547							$cleanuri = substr($cleanuri, 0, MAX_URI_LENGTH);
2548	558					1914	}
2549	558					1068	dbg("uri: cleaned uri: $cleanuri");
2550	558					1937	$cleaned{$cleanuri} = 1;
2551	558	100				1323	my ($domain, $host) = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
2552	445					1636	if (defined $domain) {
2553	445					1042	dbg("uri: added host: $host domain: $domain");
2554	445					1313	$domains{$domain} = 1;
2555							$hosts{$host} = $domain;
2556							}
2557							}
2558
2559	418	50				873	# Bail out if no good uri found
2560							return unless %cleaned;
2561
2562	418	100	100			1579	# Bail out if no domains/hosts found?
2563							return if $valid_domain && !%domains;
2564
2565	402	50				876	# Merge cleaned
2566	402	100				1087	if (!$types->{noclean}) {
2567	1					3	if ($udl->{$uri}->{cleaned}) {
	1					4
2568							$cleaned{$_} = 1 foreach (@{$udl->{$uri}->{cleaned}});
2569	402					825	}
	402					1389
2570							@{$udl->{$uri}->{cleaned}} = keys %cleaned;
2571							}
2572
2573	402					1550	# Domains/hosts (there might not be any)
2574	402					1404	$udl->{$uri}->{domains}->{$_} = 1 foreach keys %domains;
2575							$udl->{$uri}->{hosts}->{$_} = $hosts{$_} foreach keys %hosts;
2576
2577	402					1771	# Types
2578							$udl->{$uri}->{types}->{$_} = 1 foreach keys %$types;
2579
2580	402					725	# Invalidate uri_list cache
2581							delete $self->{uri_list};
2582	402					5317
2583							return 1;
2584							}
2585
2586
2587							###########################################################################
2588
2589	0			0	0	0	sub ensure_rules_are_complete {
2590	0					0	my $self = shift;
2591							my $metarule = shift;
2592							# @_ is now the list of rules
2593	0					0
2594							foreach my $r (@_) {
2595	0	0				0	# dbg("rules: meta rule depends on net rule $r");
2596							next if ($self->is_rule_complete($r));
2597	0					0
2598	0					0	dbg("rules: meta rule $metarule depends on pending rule $r, blocking");
2599							my $timer = $self->{main}->time_method("wait_for_pending_rules");
2600	0					0
2601	0					0	my $start = time;
2602	0					0	$self->harvest_until_rule_completes($r);
2603							my $elapsed = sprintf "%.2f", time - $start;
2604	0	0				0
		0
2605	0					0	if (!$self->is_rule_complete($r)) {
2606							dbg("rules: rule $r is still not complete; exited early?");
2607							}
2608	0					0	elsif ($elapsed > 0) {
2609							my $txt = "rules: $r took $elapsed seconds to complete, for $metarule";
2610	0	0				0	# Info only if something took over 1 sec to wait, prevent log flood
	0					0
	0					0
2611							if ($elapsed >= 1) { info($txt); } else { dbg($txt); }
2612							}
2613							}
2614							}
2615
2616							###########################################################################
2617
2618							# use a separate sub here, for brevity
2619							# called out of generated eval
2620	0			0	0	0	sub handle_eval_rule_errors {
2621	0					0	my ($self, $rulename) = @_;
2622	0					0	warn "rules: failed to run $rulename test, skipping:\n\t($@)\n";
2623							$self->{rule_errors}++;
2624							}
2625
2626	3773			3773	0	7254	sub register_plugin_eval_glue {
2627							my ($self, $function) = @_;
2628	3773	50				7675
2629	0					0	if (!$function) {
2630	0					0	warn "rules: empty function name";
2631							return;
2632							}
2633
2634	3773	100				46879	# only need to call this once per fn (globally)
2635	283					692	return if exists $TEMPORARY_EVAL_GLUE_METHODS{$function};
2636							$TEMPORARY_EVAL_GLUE_METHODS{$function} = undef;
2637
2638	283	50				1021	# return if it's not an eval_plugin function
2639							return if (!exists $self->{conf}->{eval_plugins}->{$function});
2640
2641							# return if it's been registered already
2642	283	50	33			2076	return if ($self->can ($function) &&
	0					0
2643							defined &{'Mail::SpamAssassin::PerMsgStatus::'.$function});
2644	283					907
2645							my $evalstr = <<"ENDOFEVAL";
2646							{
2647							package Mail::SpamAssassin::PerMsgStatus;
2648
2649							sub $function {
2650							my (\$self) = shift;
2651							my \$plugin = \$self->{conf}->{eval_plugins}->{$function};
2652							return \$plugin->$function (\$self, \@_);
2653							}
2654
2655							1;
2656							}
2657							ENDOFEVAL
2658	283	50		466	1	31832	eval $evalstr . '; 1' ## no critic
	466			101	1	1190
	466			81	0	1138
	466			81	0	1669
	101			4	0	371
	101			77	0	322
	101			93	1	601
	81			81	1	379
	81			81	0	313
	81			81	1	415
	81			133	0	315
	81			93	0	275
	81			81	0	391
	4			555	0	12
	4			81	0	21
	4			81	0	21
	77			4	0	285
	77			4	0	255
	77			4	0	439
	81			4	0	343
	81			4	0	269
	81			4	0	766
	81			28	0	384
	81			4	0	262
	81			4	0	492
	81			4	0	295
	81			0	0	308
	81			77	0	533
	81			77	0	323
	81			77	0	288
	81			77	0	427
	81			77	0	360
	81			77	0	320
	81			77	0	425
	81					310
	81					307
	81					526
	81					304
	81					372
	81					399
	543					1375
	543					1255
	543					1944
	81					321
	81					269
	81					492
	81					317
	81					258
	81					440
	4					13
	4					24
	4					24
	4					13
	4					16
	4					40
	4					17
	4					17
	4					19
	4					16
	4					29
	4					24
	4					11
	4					17
	4					29
	4					19
	4					21
	4					28
	28					77
	28					74
	28					105
	4					19
	4					20
	4					31
	4					19
	4					22
	4					31
	4					14
	4					16
	4					37
	0					0
	0					0
	0					0
	77					330
	77					299
	77					456
	77					282
	77					276
	77					395
	77					296
	77					269
	77					454
	77					316
	89					359
	89					463
	89					367
	77					286
	77					479
	77					299
	77					261
	77					447
	77					307
	77					303
	89					460
2659	0	0				0	or do {
	0					0
2660	0					0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
2661	0					0	warn "rules: failed to run header tests, skipping some: $eval_stat\n";
2662							$self->{rule_errors}++;
2663							};
2664
2665	283					4107	# ensure this method is deleted if finish_tests() is called
2666							push (@TEMPORARY_METHODS, $function);
2667							}
2668
2669							###########################################################################
2670
2671							# note: only eval tests should store state in $self->{test_log_msgs};
2672							# pattern tests do not.
2673							#
2674							# the clearing of the test state is now inlined as:
2675							#
2676							# %{$self->{test_log_msgs}} = (); # clear test state
2677							#
2678							# except for this public API for plugin use:
2679
2680							=item $status->clear_test_state()
2681
2682							Clear test state, including test log messages from C<$status-E<gt>test_log()>.
2683
2684							=cut
2685
2686	12			0	1	39	sub clear_test_state {
2687	12					55	my ($self) = @_;
	12					58
2688							%{$self->{test_log_msgs}} = ();
2689							}
2690
2691							# internal API, called only by got_hit()
2692							# TODO: refactor and merge this into that function
2693	128			128		459	sub _handle_hit {
2694							my ($self, $rule, $score, $area, $ruletype, $desc) = @_;
2695	128					1090
2696							$self->{main}->call_plugins ("hit_rule", {
2697							permsgstatus => $self,
2698							rulename => $rule,
2699							ruletype => $ruletype,
2700							score => $score
2701							});
2702
2703	128	100				729	# ignore meta-match sub-rules.
	88					197
	76					215
	76					210
2704							if ($rule =~ /^__/) { push(@{$self->{subtest_names_hit}}, $rule); return; }
2705
2706	56	50				294	# this should not happen; warn about it
2707	4					15	if (!defined $score) {
2708	16					92	warn "rules: score undef for rule '$rule' in '$area' '$desc'";
2709							return;
2710							}
2711
2712	56	50				279	# this should not happen; warn about NaN (bug 3364)
2713	4					19	if ($score != $score) {
2714	4					31	warn "rules: score '$score' for rule '$rule' in '$area' '$desc'";
2715							return;
2716							}
2717
2718	56					268	# Add the rule hit to the score
2719							$self->{score} += $score;
2720	68					223
	56					200
2721	56		100			321	push(@{$self->{test_names_hit}}, $rule);
2722							$area \|\|= '';
2723	56	100	66			401
2724	17					90	if ($score >= 10 \|\| $score <= -10) {
2725							$score = sprintf("%4.0f", $score);
2726							}
2727	65					955	else {
2728							$score = sprintf("%4.1f", $score);
2729							}
2730
2731							# save both summaries
2732							# TODO: this is slower than necessary, if we only need one
2733							$self->{tag_data}->{REPORT} .= sprintf ("* %s %s %s%s\n%s",
2734							$score, $rule, $area,
2735							$self->_wrap_desc($desc,
2736							4+length($rule)+length($score)+length($area), "* "),
2737	56	100				530	($self->{test_log_msgs}->{TERSE} ?
2738							"* " . $self->{test_log_msgs}->{TERSE} : ''));
2739
2740							$self->{tag_data}->{SUMMARY} .= sprintf ("%s %-22s %s%s\n%s",
2741							$score, $rule, $area,
2742							$self->_wrap_desc($desc,
2743	66		100			517	3+length($rule)+length($score)+length($area), " " x 28),
2744	142	50	33			788	($self->{test_log_msgs}->{LONG} \|\| ''));
2745	98					217	if((defined $self->{subjprefix}) and ($self->{subjprefix} ne "")) {
2746							$self->{tag_data}->{SUBJPREFIX} = $self->{subjprefix};
2747							}
2748							}
2749
2750	202			104		818	sub _wrap_desc {
2751							my ($self, $desc, $firstlinelength, $prefix) = @_;
2752	118					422
2753	108					522	my $firstline = " " x $firstlinelength;
2754	108					577	my $wrapped = Mail::SpamAssassin::Util::wrap($desc, $prefix, $firstline, $self->{conf}->{report_wrap_width}, 0);
2755	108					1020	$wrapped =~ s/^\s+//s;
2756							$wrapped;
2757							}
2758
2759							###########################################################################
2760
2761							=item $status->got_hit ($rulename, $desc_prepend [, name => value, ...])
2762
2763							Register a hit against a rule in the ruleset.
2764
2765							There are two mandatory arguments. These are C<$rulename>, the name of the rule
2766							that fired, and C<$desc_prepend>, which is a short string that will be
2767							prepended to the rules C<describe> string in output reports.
2768
2769							In addition, callers can supplement that with the following optional
2770							data:
2771
2772							=over 4
2773
2774							=item score => $num
2775
2776							Optional: the score to use for the rule hit. If unspecified,
2777							the value from the C<Mail::SpamAssassin::Conf> object's C<{scores}>
2778							hash will be used (a configured score), and in its absence the
2779							C<defscore> option value.
2780
2781							=item defscore => $num
2782
2783							Optional: the score to use for the rule hit if neither the
2784							option C<score> is provided, nor a configured score value is provided.
2785
2786							=item value => $num
2787
2788							Optional: the value to assign to the rule; the default value is C<1>.
2789							I<tflags multiple> rules use values of greater than 1 to indicate
2790							multiple hits. This value is accessible to meta rules.
2791
2792							=item ruletype => $type
2793
2794							Optional, but recommended: the rule type string. This is used in the
2795							C<hit_rule> plugin call, called by this method. If unset, I<'unknown'> is
2796							used.
2797
2798							=item tflags => $string
2799
2800							Optional: a string, i.e. a space-separated list of additional tflags
2801							to be appended to an existing list of flags in $self->{conf}->{tflags},
2802							such as: "nice noautolearn multiple". No syntax checks are performed.
2803
2804							=item description => $string
2805
2806							Optional: a custom rule description string. This is used in the
2807							C<hit_rule> plugin call, called by this method. If unset, the static
2808							description is used.
2809
2810							=back
2811
2812							Backward compatibility: the two mandatory arguments have been part of this API
2813							since SpamAssassin 2.x. The optional I<name=<gt>value> pairs, however, are a
2814							new addition in SpamAssassin 3.2.0.
2815
2816							=cut
2817
2818	132			132	1	2705	sub got_hit {
2819							my ($self, $rule, $area, %params) = @_;
2820	132					355
2821							my $conf_ref = $self->{conf};
2822	170					259
2823	170					305	my $dynamic_score_provided;
2824	170	100				390	my $score = $params{score};
2825	50					151	if (defined $score) { # overrides any configured scores
2826							$dynamic_score_provided = 1;
2827	178					666	} else {
2828	178	50				662	$score = $conf_ref->{scores}->{$rule};
2829							$score = $params{defscore} if !defined $score;
2830							}
2831
2832							# adding a hit does nothing if we don't have a score -- we probably
2833	136	50				408	# shouldn't have run it in the first place
2834	8					29	if (!$score) {
	8					16
2835	42					131	%{$self->{test_log_msgs}} = ();
2836							return;
2837							}
2838
2839							# ensure that rule values always result in an increase
2840	128					286	# of $self->{tests_already_hit}->{$rule}:
2841	128	100	66			385	my $value = $params{value};
	164					319
2842							if (!$value \|\| $value <= 0) { $value = 1 }
2843	128					270
2844	128					225	my $tflags_ref = $conf_ref->{tflags};
2845	170	50	33			507	my $tflags_add = $params{tflags};
2846							if (defined $tflags_add && $tflags_add ne '') {
2847	42	0	0			65	$_ = (!defined $_ \|\| $_ eq '') ? $tflags_add : ($_ . ' ' . $tflags_add)
2848							for $tflags_ref->{$rule};
2849							};
2850	170		100			680
2851							my $already_hit = $self->{tests_already_hit}->{$rule} \|\| 0;
2852	170	100	50			564	# don't count hits multiple times, unless 'tflags multiple' is on
			66
2853	42					186	if ($already_hit && ($tflags_ref->{$rule}\|\|'') !~ /\bmultiple\b/) {
	0					0
2854	42					305	%{$self->{test_log_msgs}} = ();
2855							return;
2856							}
2857	170					514
2858							$self->{tests_already_hit}->{$rule} = $already_hit + $value;
2859
2860	170		100			556	# default ruletype, if not specified:
2861							$params{ruletype} \|\|= 'unknown';
2862	170	100				626
2863	0					0	if ($dynamic_score_provided) { # copy it to static for proper reporting
2864	84					234	$conf_ref->{scoreset}->[$_]->{$rule} = $score for (0..3);
2865							$conf_ref->{scores}->{$rule} = $score;
2866							}
2867	212					415
2868	212	50				550	my $rule_descr = $params{description};
2869	136					530	if (defined $rule_descr) {
2870							$conf_ref->{descriptions}->{$rule} = $rule_descr; # save dynamic descr.
2871	264					5552	} else {
2872							$rule_descr = $conf_ref->get_description_for_rule($rule); # static
2873							}
2874							# Bug 6880 Set Rule Description to something that says no rule
2875	186	100	66			1326	#$rule_descr = $rule if !defined $rule_descr \|\| $rule_descr eq '';
2876							$rule_descr = "No description available." if !defined $rule_descr \|\| $rule_descr eq '';
2877	186	50				573
2878	58					84	if(defined $self->{conf}->{rewrite_header}->{Subject}) {
2879	58	0				92	my $rule_subjprefix = $conf_ref->{subjprefix}->{$rule};
2880	58					120	if (defined $rule_subjprefix) {
2881	2	0				5	dbg("subjprefix: setting Subject prefix to $rule_subjprefix");
2882	68					179	if($self->{subjprefix} !~ /\Q$rule_subjprefix\E/) {
2883							$self->{subjprefix} .= $rule_subjprefix . " "; # save dynamic subject prefix.
2884							}
2885							}
2886							}
2887
2888							$self->_handle_hit($rule,
2889							$score,
2890							$area,
2891	196					752	$params{ruletype},
2892							$rule_descr);
2893
2894	186					479	# take care of duplicate rules, too (bug 5206)
2895	128	50	33			406	my $dups = $conf_ref->{duplicate_rules}->{$rule};
	12					29
2896	12					64	if ($dups && @{$dups}) {
	70					156
2897	58					163	foreach my $dup (@{$dups}) {
2898							$self->got_hit($dup, $area, %params);
2899							}
2900							}
2901	194					366
	208					452
2902	208					2039	%{$self->{test_log_msgs}} = (); # clear test logs
2903							return 1;
2904							}
2905
2906							###########################################################################
2907
2908							# TODO: this needs API doc
2909	118			4	0	285	sub test_log {
2910	12					41	my ($self, $msg) = @_;
2911	80					297	local $1;
2912	70					253	while ($msg =~ s/^(.{30,48})\s//) {
2913							$self->_test_log_line ($1);
2914	20					98	}
2915							$self->_test_log_line ($msg);
2916							}
2917
2918	20			4		66	sub _test_log_line {
2919							my ($self, $msg) = @_;
2920	20					114
2921	50	50				99	$self->{test_log_msgs}->{TERSE} .= sprintf ("[%s]\n", $msg);
2922	50					193	if (length($msg) > 47) {
2923							$self->{test_log_msgs}->{LONG} .= sprintf ("%78s\n", "[$msg]");
2924	50					102	} else {
2925							$self->{test_log_msgs}->{LONG} .= sprintf ("%27s [%s]\n", "", $msg);
2926							}
2927							}
2928
2929							###########################################################################
2930
2931							# helper for get(). Do not call directly, as get() caches its results
2932							# and this does not!
2933	96			98	0	309	sub get_envelope_from {
2934							my ($self) = @_;
2935
2936							# bug 2142:
2937							# Get the SMTP MAIL FROM:, aka. the "envelope sender", if our
2938							# calling app has helpfully marked up the source message
2939							# with it. Various MTAs and calling apps each have their
2940							# own idea of what header to use for this! see
2941	96					171
2942							my $envf;
2943
2944							# Rely on the 'envelope-sender-header' header if the user has configured one.
2945							# Assume that because they have configured it, their MTA will always add it.
2946	144	50				492	# This will prevent us falling through and picking up inappropriate headers.
2947							if (defined $self->{conf}->{envelope_sender_header}) {
2948	50					101	# make sure we get the most recent copy - there can be only one EnvelopeSender.
2949							$envf = $self->get($self->{conf}->{envelope_sender_header}.":addr",undef);
2950	0	0	0			0	# ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
			0
2951							goto ok if defined $envf && ($envf =~ /\@/ \|\| $envf eq '');
2952	50	0				154	# Warn them if it's configured, but not there or not usable.
2953	50					220	if (defined $envf) {
2954							chomp $envf;
2955	50					153	dbg("message: envelope_sender_header '%s: %s' is not an FQDN, ignoring",
2956							$self->{conf}->{envelope_sender_header}, $envf);
2957							} else {
2958	0					0	dbg("message: envelope_sender_header '%s' not found in message",
2959							$self->{conf}->{envelope_sender_header});
2960							}
2961	0					0	# Couldn't get envelope-sender using the configured header.
2962							return;
2963							}
2964
2965							# User hasn't given us a header to trust, so try to guess the sender.
2966
2967							# use the "envelope-sender" string found in the Received headers,
2968							# if possible... use the last untrusted header, in case there's
2969	94					226	# trusted headers.
2970	106	100				390	my $lasthop = $self->{relays_untrusted}->[0];
2971							if (!defined $lasthop) {
2972							# no untrusted headers? in that case, the message is ALL_TRUSTED.
2973	98					257	# use the first trusted header (ie. the oldest, originating one).
2974							$lasthop = $self->{relays_trusted}->[-1];
2975							}
2976	144	100				575
2977	83					222	if (defined $lasthop) {
2978							$envf = $lasthop->{envfrom};
2979							# TODO FIXME: Received.pm puts both null senders and absence-of-sender
2980	95	50	33			335	# into the relays array as '', so we can't distinguish them :(
2981	12					42	if ($envf && ($envf =~ /\@/)) {
2982							goto ok;
2983							}
2984							}
2985
2986							# WARNING: a lot of list software adds an X-Sender for the original env-from
2987							# (including Yahoo! Groups). Unfortunately, fetchmail will pick it up and
2988							# reuse it as the env-from for its delivery -- even though the list
2989							# software had used a different env-from in the intervening delivery. Hence,
2990							# if this header is present, and there's a fetchmail sig in the Received
2991							# lines, we cannot trust any Envelope-From headers, since they're likely to
2992							# be incorrect fetchmail guesses.
2993	106	50				440
2994	0					0	if ($self->get("X-Sender") =~ /\@/) {
2995	12	0				56	my $rcvd = join(' ', $self->get("Received"));
2996	62					187	if ($rcvd =~ /\(fetchmail/) {
2997	62					269	dbg("message: X-Sender and fetchmail signatures found, cannot trust envelope-from");
2998							return;
2999							}
3000							}
3001
3002	156	50				1063	# procmailrc notes this (we now recommend adding it to Received instead)
3003							if ($envf = $self->get("X-Envelope-From")) {
3004							# heuristic: this could have been relayed via a list which then used
3005	54	0				115	# a new Envelope-from. check
3006	54					132	if ($self->get("ALL") =~ /^Received:.*?^X-Envelope-From:/smi) {
3007	54					189	dbg("message: X-Envelope-From header found after 1 or more Received lines, cannot trust envelope-from");
3008							return;
3009	33					189	} else {
3010							goto ok;
3011							}
3012							}
3013
3014	156	50				532	# qmail, new-inject(1)
3015							if ($envf = $self->get("Envelope-Sender")) {
3016							# heuristic: this could have been relayed via a list which then used
3017	57	0				180	# a new Envelope-from. check
3018	57					291	if ($self->get("ALL") =~ /^Received:.*?^Envelope-Sender:/smi) {
3019							dbg("message: Envelope-Sender header found after 1 or more Received lines, cannot trust envelope-from");
3020	57					146	} else {
3021							goto ok;
3022							}
3023							}
3024
3025							# Postfix, sendmail, amavisd-new, ...
3026							# RFC 2821 requires it:
3027							# When the delivery SMTP server makes the "final delivery" of a
3028							# message, it inserts a return-path line at the beginning of the mail
3029							# data. This use of return-path is required; mail systems MUST support
3030							# it. The return-path line preserves the information in the <reverse-
3031	106	100				476	# path> from the MAIL command.
3032							if ($envf = $self->get("Return-Path")) {
3033							# heuristic: this could have been relayed via a list which then used
3034	46	50				220	# a new Envelope-from. check
3035	4					11	if ($self->get("ALL") =~ /^Received:.*?^Return-Path:/smi) {
3036							dbg("message: Return-Path header found after 1 or more Received lines, cannot trust envelope-from");
3037	5					16	} else {
3038							goto ok;
3039							}
3040							}
3041
3042	97					367	# give up.
3043							return;
3044	1					4
3045							ok:
3046	1					9	$envf =~ s/^<*//s; # remove <
3047							$envf =~ s/>\s\z//s; # remove >, whitespace, newlines
3048	1					4
3049							return $envf;
3050							}
3051
3052							###########################################################################
3053
3054							# helper for get(ALL-*). get() caches its results, so don't call this
3055							# directly unless you need a range of headers not covered by the ALL-*
3056							# psuedo-headers!
3057
3058							# Get all the headers found between an index range of received headers, the
3059							# index doesn't care if we could parse the received headers or not.
3060							# Use undef for the $start_rcvd or $end_rcvd numbers to start/end with the
3061							# first/last header in the message, otherwise indicate the index number you
3062							# want to start/end at. Set $include_start_rcvd or $include_end_rcvd to 0 to
3063							# indicate you don't want to include the received header found at the start or
3064							# end indexes... basically toggles between [s,e], [s,e), (s,e], (s,e).
3065	0			4	0	0	sub get_all_hdrs_in_rcvd_index_range {
3066							my ($self, $start_rcvd, $end_rcvd, $include_start_rcvd, $include_end_rcvd, $getraw) = @_;
3067
3068	0	50	33			0	# prevent bad input causing us to return the first header found
3069							return if (defined $end_rcvd && $end_rcvd < 0);
3070	0	50				0
3071	0	50				0	$include_start_rcvd = 1 unless defined $include_start_rcvd;
3072							$include_end_rcvd = 1 unless defined $include_end_rcvd;
3073	4					11
3074	4					14	my $cur_rcvd_index = -1; # none found yet
3075							my $result = '';
3076	4					9
3077	4	50				13	my @hdrs;
3078	0					0	if ($getraw) {
3079							@hdrs = $self->{msg}->get_pristine_header() =~ /^([^ \t].*?\n)(?![ \t])/smgi;
3080	0					0	} else {
3081							@hdrs = split(/^/m, $self->{msg}->get_all_headers(0));
3082							}
3083	0					0
3084	4	50				29	foreach my $hdr (@hdrs) {
3085	0					0	if ($hdr =~ /^Received:/i) {
3086	0	0	0			0	$cur_rcvd_index++;
			0
3087							next if (defined $start_rcvd && !$include_start_rcvd &&
3088	0	0	0			0	$start_rcvd == $cur_rcvd_index);
			0
3089							last if (defined $end_rcvd && !$include_end_rcvd &&
3090							$end_rcvd == $cur_rcvd_index);
3091	0	50	33			0	}
		0	33
			33
			0
3092							if ((!defined $start_rcvd \|\| $start_rcvd <= $cur_rcvd_index) &&
3093	4					18	(!defined $end_rcvd \|\| $cur_rcvd_index < $end_rcvd)) {
3094							$result .= $hdr;
3095							}
3096	0					0	elsif (defined $end_rcvd && $cur_rcvd_index == $end_rcvd) {
3097	0					0	$result .= $hdr;
3098							last;
3099							}
3100	0	50				0	}
3101							return ($result eq '' ? undef : $result);
3102							}
3103
3104							###########################################################################
3105	0			88	0	0
3106							sub sa_die { Mail::SpamAssassin::sa_die(@_); }
3107
3108							###########################################################################
3109
3110							=item $status->create_fulltext_tmpfile (fulltext_ref)
3111
3112							This function creates a temporary file containing the passed scalar
3113							reference data (typically the full/pristine text of the message).
3114							This is typically used by external programs like pyzor and dccproc, to
3115							avoid hangs due to buffering issues. Methods that need this, should
3116							call $self->create_fulltext_tmpfile($fulltext) to retrieve the temporary
3117							filename; it will be created if it has not already been.
3118
3119							Note: This can only be called once until $status->delete_fulltext_tmpfile() is
3120							called.
3121
3122							=cut
3123
3124	4			4	1	26	sub create_fulltext_tmpfile {
3125							my ($self, $fulltext) = @_;
3126	0	50				0
3127	0					0	if (defined $self->{fulltext_tmpfile}) {
3128							return $self->{fulltext_tmpfile};
3129							}
3130	0					0
3131	4	50				13	my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
3132							$tmpfh or die "failed to create a temporary file";
3133
3134							# PerlIO's buffered print writes in 8 kB chunks - which can be slow.
3135							# print $tmpfh $$fulltext or die "error writing to $tmpf: $!";
3136							#
3137							# reducing the number of writes and bypassing extra buffering in PerlIO
3138	0					0	# speeds up writing of larger text by a factor of 2
3139	0					0	my $nwrites;
3140	0					0	for (my $ofs = 0; $ofs < length($$fulltext); $ofs += $nwrites) {
3141	4	50				23	$nwrites = $tmpfh->syswrite($$fulltext, length($$fulltext)-$ofs, $ofs);
3142							defined $nwrites or die "error writing to $tmpf: $!";
3143	0	50				0	}
3144							close $tmpfh or die "error closing $tmpf: $!";
3145	0					0
3146							$self->{fulltext_tmpfile} = $tmpf;
3147	0					0
3148							dbg("check: create_fulltext_tmpfile, written %d bytes to file %s",
3149							length($$fulltext), $tmpf);
3150	4					14
3151							return $self->{fulltext_tmpfile};
3152							}
3153
3154							=item $status->delete_fulltext_tmpfile ()
3155
3156							Will cleanup after a $status->create_fulltext_tmpfile() call. Deletes the
3157							temporary file and uncaches the filename.
3158
3159							=cut
3160
3161	191			191	1	479	sub delete_fulltext_tmpfile {
3162	191	100				1149	my ($self) = @_;
3163	4	50				14	if (defined $self->{fulltext_tmpfile}) {
3164	4					12	if (!unlink $self->{fulltext_tmpfile}) {
3165							my $msg = sprintf("cannot unlink %s: %s", $self->{fulltext_tmpfile}, $!);
3166	4	0				7	# don't fuss too much if file is missing, perhaps it wasn't even created
	4					9
	4					12
3167							if ($! == ENOENT) { warn $msg } else { die $msg }
3168	0					0	}
3169							$self->{fulltext_tmpfile} = undef;
3170							}
3171							}
3172
3173							###########################################################################
3174
3175	237			275	0	590	sub all_from_addrs {
3176							my ($self) = @_;
3177	237	100				692
	171					364
	154					1044
3178							if (exists $self->{all_from_addrs}) { return @{$self->{all_from_addrs}}; }
3179	79					157
3180							my @addrs;
3181
3182	79					262	# Resent- headers take priority, if present. see bug 672
3183	96	50	33			534	my $resent = $self->get('Resent-From',undef);
3184	17					53	if (defined $resent && $resent =~ /\S/) {
3185							@addrs = $self->{main}->find_all_addrs_in_line ($resent);
3186							}
3187							else {
3188							# bug 2292: Used to use find_all_addrs_in_line() with the same
3189							# headers, but the would catch addresses in comments which caused
3190							# FNs for things like whitelist_from. Since all of these are From
3191							# headers, there should only be 1 address in each anyway (not exactly
3192							# true, RFC 2822 allows multiple addresses in a From header field),
3193							# so use the :addr code...
3194							# bug 3366: some addresses come in as 'foo@bar...', which is invalid.
3195							# so deal with the multiple periods.
3196	79					293	## no critic
	39					237
	43					265
	395					926
3197							@addrs = map { tr/././s; $_ } grep { $_ ne '' }
3198							($self->get('From:addr'), # std
3199							$self->get('Envelope-Sender:addr'), # qmail: new-inject(1)
3200							$self->get('Resent-Sender:addr'), # procmailrc manpage
3201							$self->get('X-Envelope-From:addr'), # procmailrc manpage
3202							$self->get('EnvelopeFrom:addr')); # SMTP envelope
3203							# http://www.cs.tut.fi/~jkorpela/headers.html is useful here
3204							}
3205
3206	83					328	# Remove duplicate addresses
	43					219
3207	79					271	my %addrs = map { $_ => 1 } @addrs;
3208							@addrs = keys %addrs;
3209	83					492
3210	83					380	dbg("eval: all '*From' addrs: " . join(" ", @addrs));
3211	83					755	$self->{all_from_addrs} = \@addrs;
3212							return @addrs;
3213							}
3214
3215							=item all_from_addrs_domains
3216
3217							This function returns all the various from addresses in a message using all_from_addrs()
3218							and then returns only the domain names.
3219
3220							=cut
3221
3222	4			50	1	26	sub all_from_addrs_domains {
3223							my ($self) = @_;
3224	4	0				62
3225	4					278	if (exists $self->{all_from_addrs_domains}) {
	4					65
3226							return @{$self->{all_from_addrs_domains}};
3227							}
3228
3229							#TEST POINT - my @addrs = ("test.voipquotes2.net","test.voipquotes2.co.uk");
3230	4					32	#Start with all the normal from addrs
3231							my @addrs = all_from_addrs($self);
3232	4					29
3233							dbg("eval: all '*From' addrs domains (before): " . join(" ", @addrs));
3234
3235							#Take just the domain with a dummy localpart
3236	4					27	#removing invalid and duplicate domains
3237	12					28	my(%addrs_seen, @addrs_filtered);
3238	12					57	foreach my $a (@addrs) {
3239	4	0	0			235	my $domain = $self->{main}->{registryboundaries}->uri_to_domain($a);
3240	0					0	next if !defined $domain \|\| $addrs_seen{lc $domain}++;
3241							push(@addrs_filtered, 'dummy@'.$domain);
3242							}
3243	0					0
3244							dbg("eval: all '*From' addrs domains (after uri to domain): " .
3245							join(" ", @addrs_filtered));
3246	0					0
3247							$self->{all_from_addrs_domains} = \@addrs_filtered;
3248	0					0
3249							return @addrs_filtered;
3250							}
3251
3252	160			240	0	393	sub all_to_addrs {
3253							my ($self) = @_;
3254	168	100				496
	89					217
	85					1460
3255							if (exists $self->{all_to_addrs}) { return @{$self->{all_to_addrs}}; }
3256	87					185
3257							my @addrs;
3258
3259	83					269	# Resent- headers take priority, if present. see bug 672
3260	83	50				383	my $resent = join('', $self->get('Resent-To'), $self->get('Resent-Cc'));
3261	4					21	if ($resent =~ /\S/) {
3262							@addrs = $self->{main}->find_all_addrs_in_line($resent);
3263							} else {
3264							# OK, a fetchmail trick: try to find the recipient address from
3265							# the most recent 3 Received lines. This is required for sendmail,
3266							# since it does not add a helpful header like exim, qmail
3267							# or Postfix do.
3268	79					236	#
3269	83					377	my $rcvd = $self->get('Received');
3270	83					312	$rcvd =~ s/\n[ \t]+/ /gs;
3271							$rcvd =~ s/\n+/\n/gs;
3272	83					421
	99					256
3273	83					178	my @rcvdlines = split(/\n/, $rcvd, 4); pop @rcvdlines; # forget last one
3274	83					272	my @rcvdaddrs;
3275	47	100				233	foreach my $line (@rcvdlines) {
	6					33
3276							if ($line =~ / for (\S+\@\S+);/) { push (@rcvdaddrs, $1); }
3277							}
3278
3279	83					440	@addrs = $self->{main}->find_all_addrs_in_line (
3280							join('',
3281							join(" ", @rcvdaddrs)."\n",
3282							$self->get('To'), # std
3283							$self->get('Apparently-To'), # sendmail, from envelope
3284							$self->get('Delivered-To'), # Postfix, poss qmail
3285							$self->get('Envelope-Recipients'), # qmail: new-inject(1)
3286							$self->get('Apparently-Resent-To'), # procmailrc manpage
3287							$self->get('X-Envelope-To'), # procmailrc manpage
3288							$self->get('Envelope-To'), # exim
3289							$self->get('X-Delivered-To'), # procmail quick start
3290							$self->get('X-Original-To'), # procmail quick start
3291							$self->get('X-Rcpt-To'), # procmail quick start
3292							$self->get('X-Real-To'), # procmail quick start
3293							$self->get('Cc'))); # std
3294							# those are taken from various sources; thanks to Nancy McGough, who
3295							# noted some in <http://www.ii.com/internet/robots/procmail/qs/#envelope>
3296							}
3297	83					621
3298	79					287	dbg("eval: all '*To' addrs: " . join(" ", @addrs));
3299	79					1958	$self->{all_to_addrs} = \@addrs;
3300							return @addrs;
3301
3302							# http://www.cs.tut.fi/~jkorpela/headers.html is useful here, also
3303							# http://www.exim.org/pipermail/exim-users/Week-of-Mon-20001009/021672.html
3304							}
3305
3306							###########################################################################
3307
3308							1;
3309							__END__
3310
3311							=back
3312
3313							=head1 SEE ALSO
3314
3315							Mail::SpamAssassin(3)
3316							spamassassin(1)
3317