File Coverage

blib/lib/Mail/SpamAssassin/PerMsgStatus.pm

Criterion	Covered	Total	%
statement	827	1188	69.6
branch	246	510	48.2
condition	85	217	39.1
subroutine	90	118	76.2
pod	47	89	52.8
total	1295	2122	61.0

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::PerMsgStatus - per-message status (spam or not-spam)
21
22							=head1 SYNOPSIS
23
24							my $spamtest = new Mail::SpamAssassin ({
25							'rules_filename' => '/etc/spamassassin.rules',
26							'userprefs_filename' => $ENV{HOME}.'/.spamassassin/user_prefs'
27							});
28							my $mail = $spamtest->parse();
29
30							my $status = $spamtest->check ($mail);
31
32							my $rewritten_mail;
33							if ($status->is_spam()) {
34							$rewritten_mail = $status->rewrite_mail ();
35							}
36							...
37
38
39							=head1 DESCRIPTION
40
41							The Mail::SpamAssassin C<check()> method returns an object of this
42							class. This object encapsulates all the per-message state.
43
44							=head1 METHODS
45
46							=over 4
47
48							=cut
49
50							package Mail::SpamAssassin::PerMsgStatus;
51
52	40			40		274	use strict;
	40					90
	40					1308
53	40			40		237	use warnings;
	40					106
	40					1493
54	40			40		260	use re 'taint';
	40					100
	40					1424
55
56	40			40		286	use Errno qw(ENOENT);
	40					97
	40					2186
57	40			40		286	use Time::HiRes qw(time);
	40					552
	40					415
58
59	40			40		5227	use Mail::SpamAssassin::Constants qw(:sa);
	40					101
	40					5629
60	40			40		12797	use Mail::SpamAssassin::AsyncLoop;
	40					122
	40					1264
61	40			40		922	use Mail::SpamAssassin::Conf;
	40					94
	40					1096
62	40			40		233	use Mail::SpamAssassin::Util qw(untaint_var uri_list_canonicalize);
	40					75
	40					2235
63	40			40		13100	use Mail::SpamAssassin::Timeout;
	40					99
	40					1280
64	40			40		251	use Mail::SpamAssassin::Logger;
	40					76
	40					42880
65
66							our @ISA = qw();
67
68							# methods defined by the compiled ruleset; deleted in finish_tests()
69							our @TEMPORARY_METHODS;
70
71							# methods defined by register_plugin_eval_glue(); deleted in finish_tests()
72							our %TEMPORARY_EVAL_GLUE_METHODS;
73
74							###########################################################################
75
76							our %common_tags;
77
78							BEGIN {
79							%common_tags = (
80
81							YESNO => sub {
82	34					79	my $pms = shift;
83	34					168	$pms->_get_tag_value_for_yesno(@_);
84							},
85
86							YESNOCAPS => sub {
87	2					5	my $pms = shift;
88	2					16	uc $pms->_get_tag_value_for_yesno(@_);
89							},
90
91							SCORE => sub {
92	38					86	my $pms = shift;
93	38					176	$pms->_get_tag_value_for_score(@_);
94							},
95
96							HITS => sub {
97	0					0	my $pms = shift;
98	0					0	$pms->_get_tag_value_for_score(@_);
99							},
100
101							REQD => sub {
102	38					89	my $pms = shift;
103	38					133	$pms->_get_tag_value_for_required_score(@_);
104							},
105
106							VERSION => \&Mail::SpamAssassin::Version,
107
108	34					143	SUBVERSION => sub { $Mail::SpamAssassin::SUB_VERSION },
109
110							RULESVERSION => sub {
111	0					0	my $pms = shift;
112	0					0	my $conf = $pms->{conf};
113	0					0	my @fnames;
114							@fnames =
115	0	0				0	keys %{$conf->{update_version}} if $conf->{update_version};
	0					0
116	0	0				0	@fnames = sort @fnames if @fnames > 1;
117	0					0	join(',', map($conf->{update_version}{$_}, @fnames));
118							},
119
120							HOSTNAME => sub {
121	38					99	my $pms = shift;
122							$pms->{conf}->{report_hostname} \|\|
123	38	50				397	Mail::SpamAssassin::Util::fq_hostname();
124							},
125
126							REMOTEHOSTNAME => sub {
127	0					0	my $pms = shift;
128	0	0				0	$pms->{tag_data}->{'REMOTEHOSTNAME'} \|\| "localhost";
129							},
130
131							REMOTEHOSTADDR => sub {
132	0					0	my $pms = shift;
133	0	0				0	$pms->{tag_data}->{'REMOTEHOSTADDR'} \|\| "127.0.0.1";
134							},
135
136							LASTEXTERNALIP => sub {
137	0					0	my $pms = shift;
138	0					0	my $lasthop = $pms->{relays_external}->[0];
139	0	0				0	$lasthop ? $lasthop->{ip} : '';
140							},
141
142							LASTEXTERNALRDNS => sub {
143	0					0	my $pms = shift;
144	0					0	my $lasthop = $pms->{relays_external}->[0];
145	0	0				0	$lasthop ? $lasthop->{rdns} : '';
146							},
147
148							LASTEXTERNALHELO => sub {
149	0					0	my $pms = shift;
150	0					0	my $lasthop = $pms->{relays_external}->[0];
151	0	0				0	$lasthop ? $lasthop->{helo} : '';
152							},
153
154							CONTACTADDRESS => sub {
155	4					9	my $pms = shift;
156	4					31	$pms->{conf}->{report_contact};
157							},
158
159							BAYES => sub {
160	0					0	my $pms = shift;
161							defined $pms->{bayes_score} ? sprintf("%3.4f", $pms->{bayes_score})
162	0	0				0	: "0.5";
163							},
164
165	0					0	DATE => sub { Mail::SpamAssassin::Util::time_to_rfc822_date() },
166
167							STARS => sub {
168	34					67	my $pms = shift;
169	34		50			112	my $arg = (shift \|\| "*");
170	34					112	my $length = int($pms->{score});
171	34	50				129	$length = 50 if $length > 50;
172							# avoid a perl 5.21 warning: "Negative repeat count does nothing"
173	34	100				143	$length > 0 ? $arg x $length : '';
174							},
175
176							AUTOLEARN => sub {
177	34					84	my $pms = shift;
178	34					168	$pms->get_autolearn_status();
179							},
180
181							AUTOLEARNSCORE => sub {
182	0					0	my $pms = shift;
183	0					0	$pms->get_autolearn_points();
184							},
185
186							TESTS => sub {
187	34					83	my $pms = shift;
188	34		50			194	my $arg = (shift \|\| ',');
189	34	100				70	join($arg, sort @{$pms->{test_names_hit}}) \|\| "none";
	34					211
190							},
191
192							SUBTESTS => sub {
193	0					0	my $pms = shift;
194	0		0			0	my $arg = (shift \|\| ',');
195	0	0				0	join($arg, sort @{$pms->{subtest_names_hit}}) \|\| "none";
	0					0
196							},
197
198							TESTSSCORES => sub {
199	0					0	my $pms = shift;
200	0		0			0	my $arg = (shift \|\| ",");
201	0					0	my $scores = $pms->{conf}->{scores};
202							join($arg, map($_ . "=" . ($scores->{$_} \|\| '0'),
203	0	0	0			0	sort @{$pms->{test_names_hit}})) \|\| "none";
	0					0
204							},
205
206							PREVIEW => sub {
207	4					10	my $pms = shift;
208	4					27	$pms->get_content_preview();
209							},
210
211							REPORT => sub {
212	3					7	my $pms = shift;
213	3		50			29	"\n" . ($pms->{tag_data}->{REPORT} \|\| "");
214							},
215
216							HEADER => sub {
217	0					0	my $pms = shift;
218	0					0	my $hdr = shift;
219	0	0				0	return if !$hdr;
220	0					0	$pms->get($hdr,undef);
221							},
222
223							TIMING => sub {
224	0					0	my $pms = shift;
225	0					0	$pms->{main}->timer_report();
226							},
227
228							ADDEDHEADERHAM => sub {
229	0					0	my $pms = shift;
230	0					0	$pms->_get_added_headers('headers_ham');
231							},
232
233							ADDEDHEADERSPAM => sub {
234	0					0	my $pms = shift;
235	0					0	$pms->_get_added_headers('headers_spam');
236							},
237
238							ADDEDHEADER => sub {
239	0					0	my $pms = shift;
240							$pms->_get_added_headers(
241	0	0				0	$pms->{is_spam} ? 'headers_spam' : 'headers_ham');
242							},
243
244	40			40		522755	);
245							}
246
247							sub new {
248	142			142	0	649	my $class = shift;
249	142		33			907	$class = ref($class) \|\| $class;
250	142					472	my ($main, $msg, $opts) = @_;
251
252							my $self = {
253							'main' => $main,
254							'msg' => $msg,
255							'score' => 0,
256							'test_log_msgs' => { },
257							'test_names_hit' => [ ],
258							'subtest_names_hit' => [ ],
259							'spamd_result_log_items' => [ ],
260							'tests_already_hit' => { },
261							'c' => { },
262							'tag_data' => { },
263							'rule_errors' => 0,
264							'disable_auto_learning' => 0,
265							'auto_learn_status' => undef,
266							'auto_learn_force_status' => undef,
267							'conf' => $main->{conf},
268							'async' => Mail::SpamAssassin::AsyncLoop->new($main),
269							'master_deadline' => $msg->{master_deadline}, # dflt inherited from msg
270	142					2054	'deadline_exceeded' => 0, # time limit exceeded, skipping further tests
271							};
272							#$self->{main}->{use_rule_subs} = 1;
273
274							dbg("check: pms new, time limit in %.3f s",
275	142	50				1259	$self->{master_deadline} - time) if $self->{master_deadline};
276
277	142	50	66			1101	if (defined $opts && $opts->{disable_auto_learning}) {
278	49					205	$self->{disable_auto_learning} = 1;
279							}
280
281							# used with "mass-check --loghits"
282	142	50				553	if ($self->{main}->{save_pattern_hits}) {
283	0					0	$self->{save_pattern_hits} = 1;
284	0					0	$self->{pattern_hits} = { };
285							}
286
287	142					308	delete $self->{should_log_rule_hits};
288	142					730	my $dbgcache = would_log('dbg', 'rules');
289	142	50	33			1212	if ($dbgcache \|\| $self->{save_pattern_hits}) {
290	0					0	$self->{should_log_rule_hits} = 1;
291							}
292
293							# known valid tags that might not get their entry in pms->{tag_data}
294							# in some circumstances
295	142					382	my $tag_data_ref = $self->{tag_data};
296	142					413	foreach (qw(SUMMARY REPORT RBL)) { $tag_data_ref->{$_} = '' }
	426					1449
297	142					673	foreach (qw(AWL AWLMEAN AWLCOUNT AWLPRESCORE
298							DCCB DCCR DCCREP PYZOR DKIMIDENTITY DKIMDOMAIN
299							BAYESTC BAYESTCLEARNED BAYESTCSPAMMY BAYESTCHAMMY
300							HAMMYTOKENS SPAMMYTOKENS TOKENSUMMARY)) {
301	2414					5539	$tag_data_ref->{$_} = undef; # exist, but undefined
302							}
303
304	142					519	bless ($self, $class);
305	142					501	$self;
306							}
307
308							sub DESTROY {
309	141			141		3111560	my ($self) = shift;
310	141					344	local $@;
311	141					389	eval { $self->delete_fulltext_tmpfile() }; # Bug 5808
	141					671
312							}
313
314							###########################################################################
315
316							=item $status->check ()
317
318							Runs the SpamAssassin rules against the message pointed to by the object.
319
320							=cut
321
322							sub check {
323	97			97	1	258	my ($self) = shift;
324	97					337	my $master_deadline = $self->{master_deadline};
325	97	50				322	if (!$master_deadline) {
326	0					0	$self->check_timed(@_);
327							} else {
328	97					1371	my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
329	97			97		1198	my $err = $t->run(sub { $self->check_timed(@_) });
	97					537
330	96	50	33			1128	if (time > $master_deadline && !$self->{deadline_exceeded}) {
331	0					0	info("check: exceeded time limit in pms check");
332	0					0	$self->{deadline_exceeded} = 1;
333							}
334							}
335							}
336
337							sub check_timed {
338	97			97	0	304	my ($self) = @_;
339	97					216	local ($_);
340
341	97					370	$self->{learned_points} = 0;
342	97					286	$self->{body_only_points} = 0;
343	97					267	$self->{head_only_points} = 0;
344	97					215	$self->{score} = 0;
345
346							# clear NetSet cache before every check to prevent it growing too large
347	97					326	foreach my $nset_name (qw(internal_networks trusted_networks msa_networks)) {
348	291					902	my $netset = $self->{conf}->{$nset_name};
349	291	50				1347	$netset->ditch_cache() if $netset;
350							}
351
352	97					911	$self->{main}->call_plugins ("check_start", { permsgstatus => $self });
353
354							# in order of slowness; fastest first, slowest last.
355							# we do ALL the tests, even if a spam triggers lots of them early on.
356							# this lets us see ludicrously spammish mails (score: 40) etc., which
357							# we can then immediately submit to spamblocking services.
358							#
359							# TODO: change this to do whitelist/blacklists first? probably a plan
360							# NOTE: definitely need AWL stuff last, for regression-to-mean of score
361
362							# TVD: we may want to do more than just clearing out the headers, but ...
363	97					900	$self->{msg}->delete_header('X-Spam-.*');
364
365							# Resident Mail::SpamAssassin code will possibly never change score
366							# sets, even if bayes becomes available. So we should do a quick check
367							# to see if we should go from {0,1} to {2,3}. We of course don't need
368							# to do this switch if we're already using bayes ... ;)
369	97					1074	my $set = $self->{conf}->get_score_set();
370	97	0	66			1378	if (($set & 2) == 0 && $self->{main}->{bayes_scanner} && $self->{main}->{bayes_scanner}->is_scan_available() && $self->{conf}->{use_bayes_rules}) {
			66
			33
371	0					0	dbg("check: scoreset $set but bayes is available, switching scoresets");
372	0					0	$self->{conf}->set_score_set ($set\|2);
373							}
374	97					566	dbg("check: using scoreset $set in M:S:Pms");
375							# The primary check functionality occurs via a plugin call. For more
376							# information, please see: Mail::SpamAssassin::Plugin::Check
377	97	100				482	if (!$self->{main}->call_plugins ("check_main", { permsgstatus => $self }))
378							{
379							# did anything happen? if not, this is fatal
380	1	50				4	if (!$self->{main}->have_plugin("check_main")) {
381	1					21	die "check: no loaded plugin implements 'check_main': cannot scan!\n".
382							"Check that the necessary '.pre' files are in the config directory.\n".
383							"At a minimum, v320.pre loads the Check plugin which is required.\n";
384							}
385							}
386
387							# delete temporary storage and memory allocation used during checking
388	96					760	$self->delete_fulltext_tmpfile();
389
390							# now that we've finished checking the mail, clear out this cache
391							# to avoid unforeseen side-effects.
392	96					1256	$self->{c} = { };
393
394							# Round the score to 3 decimal places to avoid rounding issues
395							# We assume required_score to be properly rounded already.
396							# add 0 to force it back to numeric representation instead of string.
397	96					1739	$self->{score} = (sprintf "%0.3f", $self->{score}) + 0;
398
399							dbg("check: is spam? score=".$self->{score}.
400	96					974	" required=".$self->{conf}->{required_score});
401	96					658	dbg("check: tests=".$self->get_names_of_tests_hit());
402	96					574	dbg("check: subtests=".$self->get_names_of_subtests_hit());
403	96					502	$self->{is_spam} = $self->is_spam();
404
405	96					558	$self->{main}->{resolver}->bgabort();
406	96					571	$self->{main}->call_plugins ("check_end", { permsgstatus => $self });
407
408	96					1742	1;
409							}
410
411							###########################################################################
412
413							=item $status->learn()
414
415							After a mail message has been checked, this method can be called. If the score
416							is outside a certain range around the threshold, ie. if the message is judged
417							more-or-less definitely spam or definitely non-spam, it will be fed into
418							SpamAssassin's learning systems (currently the naive Bayesian classifier),
419							so that future similar mails will be caught.
420
421							=cut
422
423							sub learn {
424	96			96	1	247	my ($self) = shift;
425	96					218	my $master_deadline = $self->{master_deadline};
426	96	50				272	if (!$master_deadline) {
427	0					0	$self->learn_timed(@_);
428							} else {
429	96					516	my $t = Mail::SpamAssassin::Timeout->new({ deadline => $master_deadline });
430	96			96		972	my $err = $t->run(sub { $self->learn_timed(@_) });
	96					691
431	96	50	33			998	if (time > $master_deadline && !$self->{deadline_exceeded}) {
432	0					0	info("learn: exceeded time limit in pms learn");
433	0					0	$self->{deadline_exceeded} = 1;
434							}
435							}
436							}
437
438							sub learn_timed {
439	96			96	0	250	my ($self) = @_;
440
441	96	100	100			711	if (!$self->{conf}->{bayes_auto_learn} \|\|
			100
442							!$self->{conf}->{use_bayes} \|\|
443							$self->{disable_auto_learning})
444							{
445	84					327	$self->{auto_learn_status} = "disabled";
446	84					1195	return;
447							}
448
449	12					34	my ($isspam, $force_autolearn, $force_autolearn_names, $arrayref);
450	12					90	$arrayref = $self->{main}->call_plugins ("autolearn_discriminator", {
451							permsgstatus => $self
452							});
453
454	12					35	$isspam = $arrayref->[0];
455	12					29	$force_autolearn = $arrayref->[1];
456	12					23	$force_autolearn_names = $arrayref->[2];
457
458							#AUTOLEARN_FORCE FLAG INFORMATION
459	12	50	66			61	if (defined $force_autolearn and $force_autolearn > 0) {
460	0					0	$self->{auto_learn_force_status} = "yes";
461	0	0				0	if (defined $force_autolearn_names) {
462	0					0	$self->{auto_learn_force_status} .= " ($force_autolearn_names)";
463							}
464							} else {
465	12					34	$self->{auto_learn_force_status} = "no";
466							}
467
468	12	100				48	if (!defined $isspam) {
469	10					23	$self->{auto_learn_status} = 'no';
470	10					138	return;
471							}
472
473
474	2					7	my $timer = $self->{main}->time_method("learn");
475
476	2					12	$self->{main}->call_plugins ("autolearn", {
477							permsgstatus => $self,
478							isspam => $isspam
479							});
480
481							# bug 3704: temporarily override learn's ability to re-learn a message
482	2					14	my $orig_learner = $self->{main}->init_learner({ "no_relearn" => 1 });
483
484	2					2	my $eval_stat;
485							eval {
486	2					18	my $learnstatus = $self->{main}->learn ($self->{msg}, undef, $isspam, 0);
487	2	50				5	if ($learnstatus->did_learn()) {
488	0	0				0	$self->{auto_learn_status} = $isspam ? "spam" : "ham";
489							}
490							# This must wait until the did_learn call.
491	2					7	$learnstatus->finish();
492	2					14	$self->{main}->finish_learner(); # for now
493
494	2	50				10	if (exists $self->{main}->{bayes_scanner}) {
495	2					10	$self->{main}->{bayes_scanner}->force_close();
496							}
497	2					17	1;
498	2	50				5	} or do {
499	0	0				0	$eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
500							};
501
502							# reset learner options to their original values
503	2					9	$self->{main}->init_learner($orig_learner);
504
505	2	50				32	if (defined $eval_stat) {
506	0					0	dbg("learn: auto-learning failed: $eval_stat");
507	0					0	$self->{auto_learn_status} = "failed";
508							}
509							}
510
511							=item $score = $status->get_autolearn_points()
512
513							Return the message's score as computed for auto-learning. Certain tests are
514							ignored:
515
516							- rules with tflags set to 'learn' (the Bayesian rules)
517
518							- rules with tflags set to 'userconf' (user white/black-listing rules, etc)
519
520							- rules with tflags set to 'noautolearn'
521
522							Also note that auto-learning occurs using scores from either scoreset 0 or 1,
523							depending on what scoreset is used during message check. It is likely that the
524							message check and auto-learn scores will be different.
525
526							=cut
527
528							sub get_autolearn_points {
529	12			12	1	27	my ($self) = @_;
530	12					71	$self->_get_autolearn_points();
531	12					40	return $self->{autolearn_points};
532							}
533
534							=item $score = $status->get_head_only_points()
535
536							Return the message's score as computed for auto-learning, ignoring
537							all rules except for header-based ones.
538
539							=cut
540
541							sub get_head_only_points {
542	12			12	1	25	my ($self) = @_;
543	12					34	$self->_get_autolearn_points();
544	12					30	return $self->{head_only_points};
545							}
546
547							=item $score = $status->get_learned_points()
548
549							Return the message's score as computed for auto-learning, ignoring
550							all rules except for learning-based ones.
551
552							=cut
553
554							sub get_learned_points {
555	12			12	1	29	my ($self) = @_;
556	12					38	$self->_get_autolearn_points();
557	12					29	return $self->{learned_points};
558							}
559
560							=item $score = $status->get_body_only_points()
561
562							Return the message's score as computed for auto-learning, ignoring
563							all rules except for body-based ones.
564
565							=cut
566
567							sub get_body_only_points {
568	12			12	1	38	my ($self) = @_;
569	12					37	$self->_get_autolearn_points();
570	12					32	return $self->{body_only_points};
571							}
572
573							=item $score = $status->get_autolearn_force_status()
574
575							Return whether a message's score included any rules that are flagged as
576							autolearn_force.
577
578							=cut
579
580							sub get_autolearn_force_status {
581	12			12	1	34	my ($self) = @_;
582	12					36	$self->_get_autolearn_points();
583	12					31	return $self->{autolearn_force};
584							}
585
586							=item $rule_names = $status->get_autolearn_force_names()
587
588							Return a list of comma separated list of rule names if a message's
589							score included any rules that are flagged as autolearn_force.
590
591							=cut
592
593							sub get_autolearn_force_names {
594	12			12	1	24	my ($self) = @_;
595	12					29	my ($names);
596
597	12					38	$self->_get_autolearn_points();
598	12					25	$names = $self->{autolearn_force_names};
599
600	12	50				40	if (defined $names) {
601							#remove trailing comma
602	0					0	$names =~ s/,$//;
603							} else {
604	12					19	$names = "";
605							}
606
607	12					30	return $names;
608							}
609
610							sub _get_autolearn_points {
611	72			72		101	my ($self) = @_;
612
613	72	100				134	return if (exists $self->{autolearn_points});
614							# ensure it only gets computed once, even if we return early
615	12					38	$self->{autolearn_points} = 0;
616
617							# This function needs to use use sum($score[scoreset % 2]) not just {score}.
618							# otherwise we shift what we autolearn on and it gets really wierd. - tvd
619	12					48	my $orig_scoreset = $self->{conf}->get_score_set();
620	12					23	my $new_scoreset = $orig_scoreset;
621	12					31	my $scores = $self->{conf}->{scores};
622
623	12	50				51	if (($orig_scoreset & 2) == 0) { # we don't need to recompute
624	12					50	dbg("learn: auto-learn: currently using scoreset $orig_scoreset");
625							}
626							else {
627	0					0	$new_scoreset = $orig_scoreset & ~2;
628	0					0	dbg("learn: auto-learn: currently using scoreset $orig_scoreset, recomputing score based on scoreset $new_scoreset");
629	0					0	$scores = $self->{conf}->{scoreset}->[$new_scoreset];
630							}
631
632	12					38	my $tflags = $self->{conf}->{tflags};
633	12					32	my $points = 0;
634
635							# Just in case this function is called multiple times, clear out the
636							# previous calculated values
637	12					33	$self->{learned_points} = 0;
638	12					23	$self->{body_only_points} = 0;
639	12					21	$self->{head_only_points} = 0;
640	12					29	$self->{autolearn_force} = 0;
641
642	12					20	foreach my $test (@{$self->{test_names_hit}}) {
	12					61
643							# According to the documentation, noautolearn, userconf, and learn
644							# rules are ignored for autolearning.
645	19	100				86	if (exists $tflags->{$test}) {
646	6	100				23	next if $tflags->{$test} =~ /\bnoautolearn\b/;
647	5	50				37	next if $tflags->{$test} =~ /\buserconf\b/;
648
649							# Keep track of the learn points for an additional autolearn check.
650							# Use the original scoreset since it'll be 0 in sets 0 and 1.
651	0	0				0	if ($tflags->{$test} =~ /\blearn\b/) {
652							# we're guaranteed that the score will be defined
653	0					0	$self->{learned_points} += $self->{conf}->{scoreset}->[$orig_scoreset]->{$test};
654	0					0	next;
655							}
656
657							#IF ANY RULES ARE AUTOLEARN FORCE, SET THAT FLAG
658	0	0				0	if ($tflags->{$test} =~ /\bautolearn_force\b/) {
659	0					0	$self->{autolearn_force}++;
660							#ADD RULE NAME TO LIST
661	0					0	$self->{autolearn_force_names}.="$test,";
662							}
663							}
664
665							# ignore tests with 0 score (or undefined) in this scoreset
666	13	50				47	next if !$scores->{$test};
667
668							# Go ahead and add points to the proper locations
669							# Changed logic because in testing, I was getting both head and body. Bug 5503
670	13	100				68	if ($self->{conf}->maybe_header_only ($test)) {
		50
671	9					31	$self->{head_only_points} += $scores->{$test};
672	9					68	dbg("learn: auto-learn: adding head_only points $scores->{$test}");
673							} elsif ($self->{conf}->maybe_body_only ($test)) {
674	4					13	$self->{body_only_points} += $scores->{$test};
675	4					29	dbg("learn: auto-learn: adding body_only points $scores->{$test}");
676							} else {
677	0					0	dbg("learn: auto-learn: not considered head or body scores: $scores->{$test}");
678							}
679
680	13					43	$points += $scores->{$test};
681							}
682
683							# Figure out the final value we'll use for autolearning
684	12					134	$points = (sprintf "%0.3f", $points) + 0;
685	12					114	dbg("learn: auto-learn: message score: ".$self->{score}.", computed score for autolearn: $points");
686
687	12					40	$self->{autolearn_points} = $points;
688							}
689
690							###########################################################################
691
692							=item $isspam = $status->is_spam ()
693
694							After a mail message has been checked, this method can be called. It will
695							return 1 for mail determined likely to be spam, 0 if it does not seem
696							spam-like.
697
698							=cut
699
700							sub is_spam {
701	99			99	1	282	my ($self) = @_;
702							# changed to test this so sub-tests can ask "is_spam" during a run
703	99					548	return ($self->{score} >= $self->{conf}->{required_score});
704							}
705
706							###########################################################################
707
708							=item $list = $status->get_names_of_tests_hit ()
709
710							After a mail message has been checked, this method can be called. It will
711							return a comma-separated string, listing all the symbolic test names
712							of the tests which were triggered by the mail.
713
714							=cut
715
716							sub get_names_of_tests_hit {
717	97			97	1	291	my ($self) = @_;
718
719	97					212	return join(',', sort @{$self->{test_names_hit}});
	97					716
720							}
721
722							=item $list = $status->get_names_of_tests_hit_with_scores_hash ()
723
724							After a mail message has been checked, this method can be called. It will
725							return a pointer to a hash for rule & score pairs for all the symbolic
726							test names and individual scores of the tests which were triggered by the mail.
727
728							=cut
729
730							sub get_names_of_tests_hit_with_scores_hash {
731	0			0	1	0	my ($self) = @_;
732
733							#BASED ON CODE FOR TESTSSCORES TAG
734	0					0	my $scores = $self->{conf}->{scores};
735	0					0	my %testsscores;
736	0		0			0	$testsscores{$_} = $scores->{$_} \|\| '0' for @{$self->{test_names_hit}};
	0					0
737	0					0	return \%testsscores;
738							}
739
740							=item $list = $status->get_names_of_tests_hit_with_scores ()
741
742							After a mail message has been checked, this method can be called. It will
743							return a comma-separated string of rule=score pairs for all the symbolic
744							test names and individual scores of the tests which were triggered by the mail.
745
746							=cut
747
748							sub get_names_of_tests_hit_with_scores {
749	0			0	1	0	my ($self) = @_;
750
751							#BASED ON CODE FOR TESTSSCORES TAG
752	0					0	my $scores = $self->{conf}->{scores};
753							return join(',', map($_ . '=' . ($scores->{$_} \|\| '0'),
754	0		0			0	sort @{$self->{test_names_hit}})) \|\| "none";
755							}
756
757
758							###########################################################################
759
760							=item $list = $status->get_names_of_subtests_hit ()
761
762							After a mail message has been checked, this method can be called. It will
763							return a comma-separated string, listing all the symbolic test names of the
764							meta-rule sub-tests which were triggered by the mail. Sub-tests are the
765							normally-hidden rules, which score 0 and have names beginning with two
766							underscores, used in meta rules.
767
768							=cut
769
770							sub get_names_of_subtests_hit {
771	96			96	1	256	my ($self) = @_;
772
773	96					262	return join(',', sort @{$self->{subtest_names_hit}});
	96					737
774							}
775
776							###########################################################################
777
778							=item $num = $status->get_score ()
779
780							After a mail message has been checked, this method can be called. It will
781							return the message's score.
782
783							=cut
784
785							sub get_score {
786	0			0	1	0	my ($self) = @_;
787	0					0	return $self->{score};
788							}
789
790							# left as backward compatibility
791							sub get_hits {
792	0			0	0	0	my ($self) = @_;
793	0					0	return $self->{score};
794							}
795
796							###########################################################################
797
798							=item $num = $status->get_required_score ()
799
800							After a mail message has been checked, this method can be called. It will
801							return the score required for a mail to be considered spam.
802
803							=cut
804
805							sub get_required_score {
806	0			0	1	0	my ($self) = @_;
807	0					0	return $self->{conf}->{required_score};
808							}
809
810							# left as backward compatibility
811							sub get_required_hits {
812	0			0	0	0	my ($self) = @_;
813	0					0	return $self->{conf}->{required_score};
814							}
815
816							###########################################################################
817
818							=item $num = $status->get_autolearn_status ()
819
820							After a mail message has been checked, this method can be called. It will
821							return one of the following strings depending on whether the mail was
822							auto-learned or not: "ham", "no", "spam", "disabled", "failed", "unavailable".
823
824							It also returns is flagged with auto_learn_force, it will also include the status
825							and the rules hit. For example: "autolearn_force=yes (AUTOLEARNTEST_BODY)"
826
827							=cut
828
829							sub get_autolearn_status {
830	34			34	1	82	my ($self) = @_;
831	34		50			139	my ($status) = $self->{auto_learn_status} \|\| "unavailable";
832
833	34	100				126	if (defined $self->{auto_learn_force_status}) {
834	2					9	$status .= " autolearn_force=".$self->{auto_learn_force_status};
835							}
836
837	34					83	return $status;
838							}
839
840							###########################################################################
841
842							=item $report = $status->get_report ()
843
844							Deliver a "spam report" on the checked mail message. This contains details of
845							how many spam detection rules it triggered.
846
847							The report is returned as a multi-line string, with the lines separated by
848							C<\n> characters.
849
850							=cut
851
852							sub get_report {
853	4			4	1	1132	my ($self) = @_;
854
855	4	50				19	if (!exists $self->{'report'}) {
856	4					9	my $report;
857
858	4					19	my $timer = $self->{main}->time_method("get_report");
859	4					18	$report = $self->{conf}->{report_template};
860	4		50			26	$report \|\|= '(no report template found)';
861
862	4					26	$report = $self->_replace_tags($report);
863
864	4					329	$report =~ s/\n*$/\n\n/s;
865	4					20	$self->{report} = $report;
866							}
867
868	4					16	return $self->{report};
869							}
870
871							###########################################################################
872
873							=item $preview = $status->get_content_preview ()
874
875							Give a "preview" of the content.
876
877							This is returned as a multi-line string, with the lines separated by C<\n>
878							characters, containing a fully-decoded, safe, plain-text sample of the first
879							few lines of the message body.
880
881							=cut
882
883							sub get_content_preview {
884	4			4	1	21	my ($self) = @_;
885
886	4					12	my $str = '';
887	4					20	my $ary = $self->get_decoded_stripped_body_text_array();
888	4					10	shift @{$ary}; # drop the subject line
	4					11
889
890	4					11	my $numlines = 3;
891	4		33			38	while (length ($str) < 200 && @{$ary} && $numlines-- > 0) {
	16		66			106
892	12					21	$str .= shift @{$ary};
	12					42
893							}
894	4					9	undef $ary;
895
896							# in case the last line was huge, trim it back to around 200 chars
897	4					25	local $1;
898	4					12	$str =~ s/^(.{200}).+$/$1 [...]/gm;
899	4					12	chomp ($str); $str .= "\n";
	4					9
900
901							# now, some tidy-ups that make things look a bit prettier
902	4					27	$str =~ s/-----Original Message-----.*$//gm;
903	4					18	$str =~ s/This is a multi-part message in MIME format\.//gs;
904	4					24	$str =~ s/[-_*.]{10,}//gs;
905	4					67	$str =~ s/\s+/ /gs;
906
907							# add "Content preview:" ourselves, so that the text aligns
908							# correctly with the template -- then trim it off. We don't
909							# have to get this exactly right, but it's nicer if we
910							# make a bit of an effort ;)
911	4					27	$str = Mail::SpamAssassin::Util::wrap($str, " ", "Content preview: ", 75, 1);
912	4					39	$str =~ s/^Content preview:\s+//gs;
913
914	4					22	return $str;
915							}
916
917							###########################################################################
918
919							=item $msg = $status->get_message()
920
921							Return the object representing the message being scanned.
922
923							=cut
924
925							sub get_message {
926	0			0	1	0	my ($self) = @_;
927	0					0	return $self->{msg};
928							}
929
930							###########################################################################
931
932							=item $status->rewrite_mail ()
933
934							Rewrite the mail message. This will at minimum add headers, and at
935							maximum MIME-encapsulate the message text, to reflect its spam or not-spam
936							status. The function will return a scalar of the rewritten message.
937
938							The actual modifications depend on the configuration (see
939							C<Mail::SpamAssassin::Conf> for more information).
940
941							The possible modifications are as follows:
942
943							=over 4
944
945							=item To:, From: and Subject: modification on spam mails
946
947							Depending on the configuration, the To: and From: lines can have a
948							user-defined RFC 2822 comment appended for spam mail. The subject line
949							may have a user-defined string prepended to it for spam mail.
950
951							=item X-Spam-* headers for all mails
952
953							Depending on the configuration, zero or more headers with names
954							beginning with C<X-Spam-> will be added to mail depending on whether
955							it is spam or ham.
956
957							=item spam message with report_safe
958
959							If report_safe is set to true (1), then spam messages are encapsulated
960							into their own message/rfc822 MIME attachment without any modifications
961							being made.
962
963							If report_safe is set to false (0), then the message will only have the
964							above headers added/modified.
965
966							=back
967
968							=cut
969
970							sub rewrite_mail {
971	34			34	1	336	my ($self) = @_;
972
973	34					142	my $timer = $self->{main}->time_method("rewrite_mail");
974	34		100			220	my $msg = $self->{msg}->get_mbox_separator() \|\| '';
975
976	34	100	66			171	if ($self->{is_spam} && $self->{conf}->{report_safe}) {
977	2					20	$msg .= $self->rewrite_report_safe();
978							}
979							else {
980	32					173	$msg .= $self->rewrite_no_report_safe();
981							}
982
983	34					150	return $msg;
984							}
985
986							# Make the line endings in the passed string reference appropriate
987							# for the original mail. Callers must note bug 5250: don't rewrite
988							# the message body, since that will corrupt 8bit attachments/MIME parts.
989							#
990							sub _fixup_report_line_endings {
991	36			36		124	my ($self, $strref) = @_;
992	36	50				175	if ($self->{msg}->{line_ending} ne "\n") {
993	0					0	$$strref =~ s/\r?\n/$self->{msg}->{line_ending}/gs;
994							}
995							}
996
997							sub _get_added_headers {
998	34			34		96	my ($self, $which) = @_;
999	34					81	my $str = '';
1000							# use string appends to put this back together -- I finally benchmarked it.
1001							# join() is 56% of the speed of just using string appends. ;)
1002	34					83	foreach my $hf_ref (@{$self->{conf}->{$which}}) {
	34					179
1003	168					579	my($hfname, $hfbody) = @$hf_ref;
1004	168					435	my $line = $self->_process_header($hfname,$hfbody);
1005	168					511	$line = $self->qp_encode_header($line);
1006	168					687	$str .= "X-Spam-$hfname: $line\n";
1007							}
1008	34					116	return $str;
1009							};
1010
1011							# rewrite the message in report_safe mode
1012							# should not be called directly, use rewrite_mail instead
1013							#
1014							sub rewrite_report_safe {
1015	2			2	0	9	my ($self) = @_;
1016
1017							# This is the original message. We do not want to make any modifications so
1018							# we may recover it if necessary. It will be put into the new message as a
1019							# message/rfc822 MIME part.
1020	2					12	my $original = $self->{msg}->get_pristine();
1021
1022							# This is the new message.
1023	2					7	my $newmsg = '';
1024
1025							# the report charset
1026	2					10	my $report_charset = "; charset=iso-8859-1";
1027	2	50				13	if ($self->{conf}->{report_charset}) {
1028	0					0	$report_charset = "; charset=" . $self->{conf}->{report_charset};
1029							}
1030
1031							# the SpamAssassin report
1032	2					13	my $report = $self->get_report();
1033
1034							# If there are any wide characters, need to MIME-encode in UTF-8
1035							# TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
1036							# we could try converting to that charset if possible
1037	2	50	33			25	unless ($] < 5.008 \|\| utf8::downgrade($report, 1)) {
1038	0					0	$report_charset = "; charset=utf-8";
1039	0					0	utf8::encode($report);
1040							}
1041
1042							# get original headers, "pristine" if we can do it
1043	2					24	my $from = $self->{msg}->get_pristine_header("From");
1044	2					9	my $to = $self->{msg}->get_pristine_header("To");
1045	2					12	my $cc = $self->{msg}->get_pristine_header("Cc");
1046	2					11	my $subject = $self->{msg}->get_pristine_header("Subject");
1047	2					11	my $msgid = $self->{msg}->get_pristine_header('Message-Id');
1048	2					19	my $date = $self->{msg}->get_pristine_header("Date");
1049
1050							# It'd be nice to do this with a foreach loop, but with only three
1051							# possibilities right now, it's easier not to...
1052
1053	2	50				14	if (defined $self->{conf}->{rewrite_header}->{Subject}) {
1054	0	0				0	$subject = "\n" if !defined $subject;
1055	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{Subject});
1056	0					0	$tag =~ s/\n/ /gs; # strip tag's newlines
1057	0					0	$subject =~ s/^(?:\Q${tag}\E )?/${tag} /g; # For some reason the tag may already be there!?
1058							}
1059
1060	2	50				9	if (defined $self->{conf}->{rewrite_header}->{To}) {
1061	0	0				0	$to = "\n" if !defined $to;
1062	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{To});
1063	0					0	$tag =~ s/\n/ /gs; # strip tag's newlines
1064	0					0	$to =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1065							}
1066
1067	2	50				8	if (defined $self->{conf}->{rewrite_header}->{From}) {
1068	0	0				0	$from = "\n" if !defined $from;
1069	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{From});
1070	0					0	$tag =~ s/\n+//gs; # strip tag's newlines
1071	0					0	$from =~ s/(?:\t\Q(${tag})\E)?$/\t(${tag})/;
1072							}
1073
1074							# add report headers to message
1075	2	50				13	$newmsg .= "From: $from" if defined $from;
1076	2	50				11	$newmsg .= "To: $to" if defined $to;
1077	2	50				8	$newmsg .= "Cc: $cc" if defined $cc;
1078	2	50				12	$newmsg .= "Subject: $subject" if defined $subject;
1079	2	50				20	$newmsg .= "Date: $date" if defined $date;
1080	2	50				23	$newmsg .= "Message-Id: $msgid" if defined $msgid;
1081	2					20	$newmsg .= $self->_get_added_headers('headers_spam');
1082
1083	2	50				14	if (defined $self->{conf}->{report_safe_copy_headers}) {
1084	2					9	my %already_added = map { $_ => 1 } qw/from to cc subject date message-id/;
	12					44
1085
1086	2					6	foreach my $hdr (@{$self->{conf}->{report_safe_copy_headers}}) {
	2					12
1087	0	0				0	next if exists $already_added{lc $hdr};
1088	0					0	my @hdrtext = $self->{msg}->get_pristine_header($hdr);
1089	0					0	$already_added{lc $hdr}++;
1090
1091	0	0				0	if (lc $hdr eq "received") { # add Received at the top ...
1092	0					0	my $rhdr = "";
1093	0					0	foreach (@hdrtext) {
1094	0					0	$rhdr .= "$hdr: $_";
1095							}
1096	0					0	$newmsg = "$rhdr$newmsg";
1097							}
1098							else {
1099	0					0	foreach (@hdrtext) {
1100	0					0	$newmsg .= "$hdr: $_";
1101							}
1102							}
1103							}
1104							}
1105
1106							# jm: add a SpamAssassin Received header to note markup time etc.
1107							# emulates the fetchmail style.
1108							# tvd: do this after report_safe_copy_headers so Received will be done correctly
1109	2					8	$newmsg = "Received: from localhost by " .
1110							Mail::SpamAssassin::Util::fq_hostname() . "\n" .
1111							"\twith SpamAssassin (version " .
1112							Mail::SpamAssassin::Version() . ");\n" .
1113							"\t" . Mail::SpamAssassin::Util::time_to_rfc822_date() . "\n" .
1114							$newmsg;
1115
1116							# MIME boundary
1117	2					20	my $boundary = "----------=_" . sprintf("%08X.%08X",time,int(rand(2 ** 32)));
1118
1119							# ensure it's unique, so we can't be attacked this way
1120	2					60	while ($original =~ /^\Q${boundary}\E(?:--)?$/m) {
1121	0					0	$boundary .= "/".sprintf("%08X",int(rand(2 ** 32)));
1122							}
1123
1124							# determine whether Content-Disposition should be "attachment" or "inline"
1125	2					6	my $disposition;
1126	2					13	my $ct = $self->{msg}->get_header("Content-Type");
1127	2	50	33			73	if (defined $ct && $ct ne '' && $ct !~ m{text/plain}i) {
			33
1128	2					11	$disposition = "attachment";
1129	2					11	$report .= $self->_replace_tags($self->{conf}->{unsafe_report_template});
1130							# if we wanted to defang the attachment, this would be the place
1131							}
1132							else {
1133	0					0	$disposition = "inline";
1134							}
1135
1136	2					7	my $type = "message/rfc822";
1137	2	50				18	$type = "text/plain" if $self->{conf}->{report_safe} > 1;
1138
1139	2					10	my $description = $self->{conf}->{'encapsulated_content_description'};
1140
1141							# Note: the message should end in blank line since mbox format wants
1142							# blank line at end and messages may be concatenated! In addition, the
1143							# x-spam-type parameter is fixed since we will use it later to recognize
1144							# original messages that can be extracted.
1145	2					37	$newmsg .= <<"EOM";
1146							MIME-Version: 1.0
1147							Content-Type: multipart/mixed; boundary="$boundary"
1148
1149							This is a multi-part message in MIME format.
1150
1151							--$boundary
1152							Content-Type: text/plain$report_charset
1153							Content-Disposition: inline
1154							Content-Transfer-Encoding: 8bit
1155
1156							$report
1157
1158							--$boundary
1159							Content-Type: $type; x-spam-type=original
1160							Content-Description: $description
1161							Content-Disposition: $disposition
1162							Content-Transfer-Encoding: 8bit
1163
1164							EOM
1165
1166	2					12	my $newmsgtrailer = "\n--$boundary--\n\n";
1167
1168							# now fix line endings in both headers, report_safe body parts,
1169							# and new MIME boundaries and structure
1170	2					12	$self->_fixup_report_line_endings(\$newmsg);
1171	2					6	$self->_fixup_report_line_endings(\$newmsgtrailer);
1172	2					44	$newmsg .= $original.$newmsgtrailer;
1173
1174	2					33	return $newmsg;
1175							}
1176
1177							# rewrite the message in non-report_safe mode (just headers)
1178							# should not be called directly, use rewrite_mail instead
1179							#
1180							sub rewrite_no_report_safe {
1181	32			32	0	87	my ($self) = @_;
1182
1183							# put the pristine headers into an array
1184							# skip the X-Spam- headers, but allow the X-Spam-Prev headers to remain.
1185							# since there may be a missing header/body
1186							#
1187	32					189	my @pristine_headers = split(/^/m, $self->{msg}->get_pristine_header());
1188	32					208	for (my $line = 0; $line <= $#pristine_headers; $line++) {
1189	120	50				488	next unless ($pristine_headers[$line] =~ /^X-Spam-(?!Prev-)/i);
1190	0					0	splice @pristine_headers, $line, 1 while ($pristine_headers[$line] =~ /^(?:X-Spam-(?!Prev-)\|[ \t])/i);
1191	0					0	$line--;
1192							}
1193	32					101	my $separator = '';
1194	32	50	33			349	if (@pristine_headers && $pristine_headers[$#pristine_headers] =~ /^\s*$/) {
1195	32					91	$separator = pop @pristine_headers;
1196							}
1197
1198	32					85	my $addition = 'headers_ham';
1199
1200	32	50				120	if($self->{is_spam})
1201							{
1202							# special-case: Subject lines. ensure one exists, if we're
1203							# supposed to mark it up.
1204	0					0	my $created_subject = 0;
1205	0					0	my $subject = $self->{msg}->get_pristine_header('Subject');
1206	0	0	0			0	if (!defined($subject) && $self->{is_spam}
			0
1207							&& exists $self->{conf}->{rewrite_header}->{'Subject'})
1208							{
1209	0					0	push(@pristine_headers, "Subject: \n");
1210	0					0	$created_subject = 1;
1211							}
1212
1213							# Deal with header rewriting
1214	0					0	foreach (@pristine_headers) {
1215							# if we're not going to do a rewrite, skip this header!
1216	0	0				0	next if (!/^(From\|Subject\|To):/i);
1217	0					0	my $hdr = ucfirst(lc($1));
1218	0	0				0	next if (!defined $self->{conf}->{rewrite_header}->{$hdr});
1219
1220							# pop the original version onto the end of the header array
1221	0	0				0	if ($created_subject) {
1222	0					0	push(@pristine_headers, "X-Spam-Prev-Subject: (nonexistent)\n");
1223							} else {
1224	0					0	push(@pristine_headers, "X-Spam-Prev-$_");
1225							}
1226
1227							# Figure out the rewrite piece
1228	0					0	my $tag = $self->_replace_tags($self->{conf}->{rewrite_header}->{$hdr});
1229	0					0	$tag =~ s/\n/ /gs;
1230
1231							# The tag should be a comment for this header ...
1232	0	0				0	$tag = "($tag)" if ($hdr =~ /^(?:From\|To)$/);
1233
1234	0					0	local $1;
1235	0					0	s/^([^:]+:)[ \t]*(?:\Q${tag}\E )?/$1 ${tag} /i;
1236							}
1237
1238	0					0	$addition = 'headers_spam';
1239							}
1240
1241							# Break the pristine header set into two blocks; $new_hdrs_pre is the stuff
1242							# that we want to ensure comes before any SpamAssassin markup headers,
1243							# like the Return-Path header (see bug 3409).
1244							#
1245							# all the rest of the message headers (as left in @pristine_headers), is
1246							# to be placed after the SpamAssassin markup hdrs. Once one of those headers
1247							# is seen, all further headers go into that set; it's assumed that it's an
1248							# old copy of the header, or attempted spoofing, if it crops up halfway
1249							# through the headers.
1250
1251	32					82	my $new_hdrs_pre = '';
1252	32	50	33			298	if (@pristine_headers && $pristine_headers[0] =~ /^Return-Path:/i) {
1253	0					0	$new_hdrs_pre .= shift(@pristine_headers);
1254	0		0			0	while (@pristine_headers && $pristine_headers[0] =~ /^[ \t]/) {
1255	0					0	$new_hdrs_pre .= shift(@pristine_headers);
1256							}
1257							}
1258	32					195	$new_hdrs_pre .= $self->_get_added_headers($addition);
1259
1260							# fix up line endings appropriately
1261	32					158	my $newmsg = $new_hdrs_pre . join('',@pristine_headers) . $separator;
1262	32					195	$self->_fixup_report_line_endings(\$newmsg);
1263
1264	32					164	return $newmsg.$self->{msg}->get_pristine_body();
1265							}
1266
1267							sub qp_encode_header {
1268	168			168	0	375	my ($self, $text) = @_;
1269
1270							# return unchanged if there are no 8-bit characters
1271	168	50				596	return $text if $text !~ tr/\x00-\x7F//c;
1272
1273	0					0	my $cs = 'ISO-8859-1';
1274	0	0				0	if ($self->{report_charset}) {
1275	0					0	$cs = $self->{report_charset};
1276							}
1277
1278	0					0	my @hexchars = split('', '0123456789abcdef');
1279	0					0	my $ord;
1280	0					0	local $1;
1281	0					0	$text =~ s{([\x80-\xff])}{
1282	0					0	$ord = ord $1;
1283	0					0	'='.$hexchars[($ord & 0xf0) >> 4].$hexchars[$ord & 0x0f]
1284							}ges;
1285
1286	0					0	$text = '=?'.$cs.'?Q?'.$text.'?=';
1287
1288	0					0	dbg("markup: encoding header in $cs: $text");
1289	0					0	return $text;
1290							}
1291
1292							sub _process_header {
1293	168			168		373	my ($self, $hdr_name, $hdr_data) = @_;
1294
1295	168					412	$hdr_data = $self->_replace_tags($hdr_data);
1296	168					358	$hdr_data =~ s/(?:\r?\n)+$//; # make sure there are no trailing newlines ...
1297
1298	168	50				433	if ($self->{conf}->{fold_headers}) {
1299	168	50				352	if ($hdr_data =~ /\n/) {
1300	0					0	$hdr_data =~ s/\s\n\s/\n\t/g;
1301	0					0	return $hdr_data;
1302							}
1303							else {
1304							# use '!!' instead of ': ' so it doesn't wrap on the space
1305	168					421	my $hdr = "X-Spam-$hdr_name!!$hdr_data";
1306	168					530	$hdr = Mail::SpamAssassin::Util::wrap($hdr, "\t", "", 79, 0, '(?<=[\s,])');
1307	168					397	$hdr =~ s/^\t\n//gm;
1308	168					688	return (split (/!!/, $hdr, 2))[1]; # just return the data part
1309							}
1310							}
1311							else {
1312	0					0	$hdr_data =~ s/\n/ /g; # Can't have newlines in headers, unless folded
1313	0					0	return $hdr_data;
1314							}
1315							}
1316
1317							sub _replace_tags {
1318	174			174		256	my $self = shift;
1319	174					283	my $text = shift;
1320
1321							# default to leaving the original string in place, if we cannot find
1322							# a tag for it (bug 4793)
1323	174					632	local($1,$2,$3);
1324	174					1298	$text =~ s{(_(\w+?)(?:$(.*?)$)?_)}{
1325	430					1021	my $full = $1;
1326	430					690	my $tag = $2;
1327	430					529	my $result;
1328	430	50				749	if ($tag =~ /^ADDEDHEADER(?:HAM\|SPAM\|)\z/) {
1329							# Bug 6278: break infinite recursion through _get_added_headers and
1330							# _get_tag on an attempt to use such tag in add_header template
1331							} else {
1332	430					966	$result = $self->get_tag_raw($tag,$3);
1333	430	50				839	$result = join(' ',@$result) if ref $result eq 'ARRAY';
1334							}
1335	430	50				2170	defined $result ? $result : $full;
1336							}ge;
1337
1338	174					713	return $text;
1339							}
1340
1341							###########################################################################
1342
1343							# public API for plugins
1344
1345							=item $status->action_depends_on_tags($tags, $code, @args)
1346
1347							Enqueue the supplied subroutine reference C<$code>, to become runnable when
1348							all the specified tags become available. The C<$tags> may be a simple
1349							scalar - a tag name, or a listref of tag names. The subroutine C<&$code>
1350							when called will be passed a C<permessagestatus> object as its first argument,
1351							followed by the supplied (optional) list C<@args> .
1352
1353							=cut
1354
1355							sub action_depends_on_tags {
1356	0			0	1	0	my($self, $tags, $code, @args) = @_;
1357
1358	0	0				0	ref $code eq 'CODE'
1359							or die "action_depends_on_tags: argument must be a subroutine ref";
1360
1361							# tag names on which the given action depends
1362	0	0				0	my @dep_tags = !ref $tags ? uc $tags : map(uc($_),@$tags);
1363
1364							# @{$self->{tagrun_subs}} list of all submitted subroutines
1365							# @{$self->{tagrun_actions}{$tag}} bitmask of action indices blocked by tag
1366							# $self->{tagrun_tagscnt}[$action_ind] count of tags still pending
1367
1368							# store action details, obtain its index
1369	0					0	push(@{$self->{tagrun_subs}}, [$code,@args]);
	0					0
1370	0					0	my $action_ind = $#{$self->{tagrun_subs}};
	0					0
1371
1372							# list dependency tag names which are not already satistied
1373							my @blocking_tags =
1374	0		0			0	grep(!defined $self->{tag_data}{$_} \|\| $self->{tag_data}{$_} eq '',
1375							@dep_tags);
1376
1377	0					0	$self->{tagrun_tagscnt}[$action_ind] = scalar @blocking_tags;
1378	0					0	$self->{tagrun_actions}{$_}[$action_ind] = 1 for @blocking_tags;
1379
1380	0	0				0	if (@blocking_tags) {
1381	0					0	dbg("check: tagrun - action %s blocking on tags %s",
1382							$action_ind, join(', ',@blocking_tags));
1383							} else {
1384	0					0	dbg("check: tagrun - tag %s was ready, action %s runnable immediately: %s",
1385							join(', ',@dep_tags), $action_ind, join(', ',$code,@args));
1386	0					0	&$code($self, @args);
1387							}
1388							}
1389
1390							# tag_is_ready() will be called by set_tag(), indicating that a given
1391							# tag just received its value, possibly unblocking an action routine
1392							# as declared by action_depends_on_tags().
1393							#
1394							# Well-behaving plugins should call set_tag() once when a tag is fully
1395							# assembled and ready. Multiple calls to set the same tag value are handled
1396							# gracefully, but may result in premature activation of a pending action.
1397							# Setting tag values by plugins should not be done directly but only through
1398							# the public API set_tag(), otherwise a pending action release may be missed.
1399							#
1400							sub tag_is_ready {
1401	658			658	0	1212	my($self, $tag) = @_;
1402	658					993	$tag = uc $tag;
1403
1404	658	50				1413	if (would_log('dbg', 'check')) {
1405	0					0	my $tag_val = $self->{tag_data}{$tag};
1406	0	0				0	dbg("check: tagrun - tag %s is now ready, value: %s",
		0
1407							$tag, !defined $tag_val ? '<UNDEF>'
1408							: ref $tag_val ne 'ARRAY' ? $tag_val
1409							: 'ARY:[' . join(',',@$tag_val) . ']' );
1410							}
1411	658	50				2022	if (ref $self->{tagrun_actions}{$tag}) { # any action blocking on this tag?
1412	0					0	my $action_ind = 0;
1413	0					0	foreach my $action_pending (@{$self->{tagrun_actions}{$tag}}) {
	0					0
1414	0	0				0	if ($action_pending) {
1415	0					0	$self->{tagrun_actions}{$tag}[$action_ind] = 0;
1416	0	0				0	if ($self->{tagrun_tagscnt}[$action_ind] <= 0) {
		0
1417							# should not happen, warn and ignore
1418							warn "tagrun error: count for $action_ind is ".
1419	0					0	$self->{tagrun_tagscnt}[$action_ind]."\n";
1420							} elsif (! --($self->{tagrun_tagscnt}[$action_ind])) {
1421	0					0	my($code,@args) = @{$self->{tagrun_subs}[$action_ind]};
	0					0
1422	0					0	dbg("check: tagrun - tag %s unblocking the action %s: %s",
1423							$tag, $action_ind, join(', ',$code,@args));
1424	0					0	&$code($self, @args);
1425							}
1426							}
1427	0					0	$action_ind++;
1428							}
1429							}
1430							}
1431
1432							# debugging aid: show actions that are still pending, waiting for their
1433							# tags to receive a value
1434							#
1435							sub report_unsatisfied_actions {
1436	124			124	0	334	my($self) = @_;
1437	124					296	my @tags;
1438	124	100				874	@tags = keys %{$self->{tagrun_actions}} if ref $self->{tagrun_actions};
	98					376
1439	124					469	for my $tag (@tags) {
1440							my @pending_actions = grep($self->{tagrun_actions}{$tag}[$_],
1441	0					0	(0 .. $#{$self->{tagrun_actions}{$tag}}));
	0					0
1442	0	0				0	dbg("check: tagrun - tag %s is still blocking action %s",
1443							$tag, join(', ', @pending_actions)) if @pending_actions;
1444							}
1445							}
1446
1447							=item $status->set_tag($tagname, $value)
1448
1449							Set a template tag, as used in C<add_header>, report templates, etc.
1450							This API is intended for use by plugins. Tag names will be converted
1451							to an all-uppercase representation internally.
1452
1453							C<$value> can be a simple scalar (string or number), or a reference to an
1454							array, in which case the public method get_tag will join array elements
1455							using a space as a separator, returning a single string for backward
1456							compatibility.
1457
1458							C<$value> can also be a subroutine reference, which will be evaluated
1459							each time the template is expanded. The first argument passed by get_tag
1460							to a called subroutine will be a PerMsgStatus object (this module's object),
1461							followed by optional arguments provided a caller to get_tag.
1462
1463							Note that perl supports closures, which means that variables set in the
1464							caller's scope can be accessed inside this C<sub>. For example:
1465
1466							my $text = "hello world!";
1467							$status->set_tag("FOO", sub {
1468							my $pms = shift;
1469							return $text;
1470							});
1471
1472							See C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details
1473							on how template tags are used.
1474
1475							C<undef> will be returned if a tag by that name has not been defined.
1476
1477							=cut
1478
1479							sub set_tag {
1480	658			658	1	1452	my($self,$tag,$val) = @_;
1481	658					2370	$self->{tag_data}->{uc $tag} = $val;
1482	658					1492	$self->tag_is_ready($tag);
1483							}
1484
1485							# public API for plugins
1486
1487							=item $string = $status->get_tag($tagname)
1488
1489							Get the current value of a template tag, as used in C<add_header>, report
1490							templates, etc. This API is intended for use by plugins. Tag names will be
1491							converted to an all-uppercase representation internally. See
1492							C<Mail::SpamAssassin::Conf>'s C<TEMPLATE TAGS> section for more details on
1493							tags.
1494
1495							C<undef> will be returned if a tag by that name has not been defined.
1496
1497							=cut
1498
1499							sub get_tag {
1500	9			9	1	69	my($self, $tag, @args) = @_;
1501
1502	9	50				35	return if !defined $tag;
1503	9					30	$tag = uc $tag;
1504	9					17	my $data;
1505	9	100				76	if (exists $common_tags{$tag}) {
		50
1506							# tag data from traditional pre-defined tag subroutines
1507	3					9	$data = $common_tags{$tag};
1508	3	50				18	$data = $data->($self,@args) if ref $data eq 'CODE';
1509	3	50				23	$data = join(' ',@$data) if ref $data eq 'ARRAY';
1510	3	50				13	$data = "" if !defined $data;
1511							} elsif (exists $self->{tag_data}->{$tag}) {
1512							# tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1513	6					20	$data = $self->{tag_data}->{$tag};
1514	6	50				26	$data = $data->($self,@args) if ref $data eq 'CODE';
1515	6	50				25	$data = join(' ',@$data) if ref $data eq 'ARRAY';
1516	6	50				43	$data = "" if !defined $data;
1517							}
1518	9					39	return $data;
1519							}
1520
1521							=item $string = $status->get_tag_raw($tagname, @args)
1522
1523							Similar to C<get_tag>, but keeps a tag name unchanged (does not uppercase it),
1524							and does not convert arrayref tag values into a single string.
1525
1526							=cut
1527
1528							sub get_tag_raw {
1529	430			430	1	997	my($self, $tag, @args) = @_;
1530
1531	430	50				835	return if !defined $tag;
1532	430					553	my $data;
1533	430	100				1141	if (exists $common_tags{$tag}) {
		50
1534							# tag data from traditional pre-defined tag subroutines
1535	362					654	$data = $common_tags{$tag};
1536	362	50				1345	$data = $data->($self,@args) if ref $data eq 'CODE';
1537	362	50				768	$data = "" if !defined $data;
1538							} elsif (exists $self->{tag_data}->{$tag}) {
1539							# tag data comes from $self->{tag_data}->{TAG}, typically from plugins
1540	68					165	$data = $self->{tag_data}->{$tag};
1541	68	50				216	$data = $data->($self,@args) if ref $data eq 'CODE';
1542	68	50				241	$data = "" if !defined $data;
1543							}
1544	430					887	return $data;
1545							}
1546
1547							###########################################################################
1548
1549							# public API for plugins
1550
1551							=item $status->set_spamd_result_item($subref)
1552
1553							Set an entry for the spamd result log line. C<$subref> should be a code
1554							reference for a subroutine which will return a string in C<'name=VALUE'>
1555							format, similar to the other entries in the spamd result line:
1556
1557							Jul 17 14:10:47 radish spamd[16670]: spamd: result: Y 22 - ALL_NATURAL,
1558							DATE_IN_FUTURE_03_06,DIET_1,DRUGS_ERECTILE,DRUGS_PAIN,
1559							TEST_FORGED_YAHOO_RCVD,TEST_INVALID_DATE,TEST_NOREALNAME,
1560							TEST_NORMAL_HTTP_TO_IP,UNDISC_RECIPS scantime=0.4,size=3138,user=jm,
1561							uid=1000,required_score=5.0,rhost=localhost,raddr=127.0.0.1,
1562							rport=33153,mid=<9PS291LhupY>,autolearn=spam
1563
1564							C<name> and C<VALUE> must not contain C<=> or C<,> characters, as it
1565							is important that these log lines are easy to parse.
1566
1567							The code reference will be called by spamd after the message has been scanned,
1568							and the C<PerMsgStatus::check()> method has returned.
1569
1570							=cut
1571
1572							sub set_spamd_result_item {
1573	0			0	1	0	my ($self, $ref) = @_;
1574	0					0	push @{$self->{spamd_result_log_items}}, $ref;
	0					0
1575							}
1576
1577							# called by spamd
1578							sub get_spamd_result_log_items {
1579	0			0	0	0	my ($self) = @_;
1580	0					0	my @ret;
1581	0					0	foreach my $ref (@{$self->{spamd_result_log_items}}) {
	0					0
1582	0					0	push @ret, &$ref;
1583							}
1584	0					0	return @ret;
1585							}
1586
1587							###########################################################################
1588
1589							sub _get_tag_value_for_yesno {
1590	36			36		123	my($self, $arg) = @_;
1591	36					84	my($arg_spam, $arg_ham);
1592	36	50				124	($arg_spam, $arg_ham) = split(/,/, $arg, 2) if defined $arg;
1593	36	50				183	return $self->{is_spam} ? (defined $arg_spam ? $arg_spam : 'Yes')
		50
		100
1594							: (defined $arg_ham ? $arg_ham : 'No');
1595							}
1596
1597							sub _get_tag_value_for_score {
1598	38			38		325	my ($self, $pad) = @_;
1599
1600	38					357	my $score = sprintf("%2.1f", $self->{score});
1601	38					155	my $rscore = $self->_get_tag_value_for_required_score();
1602
1603							#Change due to bug 6419 to use Util function for consistency with spamd
1604							#and PerMessageStatus
1605	38					251	$score = Mail::SpamAssassin::Util::get_tag_value_for_score($score, $rscore, $self->{is_spam});
1606
1607							#$pad IS PROVIDED BY THE _SCORE(PAD)_ tag
1608	38	50	33			158	if (defined $pad && $pad =~ /^(0+\| +)$/) {
1609	0					0	my $count = length($1) + 3 - length($score);
1610	0	0				0	$score = (substr($pad, 0, $count) . $score) if $count > 0;
1611							}
1612	38					102	return $score;
1613
1614							}
1615
1616							sub _get_tag_value_for_required_score {
1617	76			76		135	my $self = shift;
1618	76					409	return sprintf("%2.1f", $self->{conf}->{required_score});
1619							}
1620
1621
1622							###########################################################################
1623
1624							=item $status->finish ()
1625
1626							Indicate that this C<$status> object is finished with, and can be destroyed.
1627
1628							If you are using SpamAssassin in a persistent environment, or checking many
1629							mail messages from one C<Mail::SpamAssassin> factory, this method should be
1630							called to ensure Perl's garbage collection will clean up old status objects.
1631
1632							=cut
1633
1634							sub finish {
1635	124			124	1	27284	my ($self) = @_;
1636
1637	124					768	$self->{main}->call_plugins ("per_msg_finish", {
1638							permsgstatus => $self
1639							});
1640
1641	124					774	$self->report_unsatisfied_actions;
1642
1643							# Delete out all of the members of $self. This will remove any direct
1644							# circular references and let the memory get reclaimed while also being more
1645							# efficient than a foreach() loop over the keys.
1646	124					256	%{$self} = ();
	124					3226
1647							}
1648
1649							sub finish_tests {
1650	0			0	0	0	my ($conf) = @_;
1651	0					0	foreach my $method (@TEMPORARY_METHODS) {
1652	0	0				0	if (defined &{$method}) {
	0					0
1653	0					0	undef &{$method};
	0					0
1654							}
1655							}
1656	0					0	@TEMPORARY_METHODS = (); # clear for next time
1657	0					0	%TEMPORARY_EVAL_GLUE_METHODS = ();
1658							}
1659
1660
1661							=item $name = $status->get_current_eval_rule_name()
1662
1663							Return the name of the currently-running eval rule. C<undef> is
1664							returned if no eval rule is currently being run. Useful for plugins
1665							to determine the current rule name while inside an eval test function
1666							call.
1667
1668							=cut
1669
1670							sub get_current_eval_rule_name {
1671	0			0	1	0	my ($self) = @_;
1672	0					0	return $self->{current_rule_name};
1673							}
1674
1675							###########################################################################
1676
1677							sub extract_message_metadata {
1678	96			96	0	593	my ($self) = @_;
1679
1680	96					457	my $timer = $self->{main}->time_method("extract_message_metadata");
1681	96					732	$self->{msg}->extract_message_metadata($self);
1682
1683	96					606	foreach my $item (qw(
1684							relays_trusted relays_trusted_str num_relays_trusted
1685							relays_untrusted relays_untrusted_str num_relays_untrusted
1686							relays_internal relays_internal_str num_relays_internal
1687							relays_external relays_external_str num_relays_external
1688							num_relays_unparseable last_trusted_relay_index
1689							last_internal_relay_index
1690							))
1691							{
1692	1440					3270	$self->{$item} = $self->{msg}->{metadata}->{$item};
1693							}
1694
1695							# TODO: International domain names (UTF-8) must be converted to
1696							# ASCII-compatible encoding (ACE) for the purpose of setting the
1697							# SENDERDOMAIN and AUTHORDOMAIN tags (and probably for other uses too).
1698							# (explicitly required for DMARC, draft-kucherawy-dmarc-base sect. 5.6.1)
1699							#
1700	96					306	{ local $1;
	96					327
1701	96					782	my $addr = $self->get('EnvelopeFrom:addr', undef);
1702							# collect a FQDN, ignoring potential trailing WSP
1703	96	100	66			396	if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1704	1					5	$self->set_tag('SENDERDOMAIN', lc $1);
1705							}
1706							# TODO: the get ':addr' only returns the first address; this should be
1707							# augmented to be able to return all addresses in a header field, multiple
1708							# addresses in a From header field are allowed according to RFC 5322
1709	96					312	$addr = $self->get('From:addr', undef);
1710	96	100	66			1019	if (defined $addr && $addr =~ /\@([^@. \t]+\.[^@ \t]+?)[ \t]*\z/s) {
1711	57					582	$self->set_tag('AUTHORDOMAIN', lc $1);
1712							}
1713							}
1714
1715	96					600	$self->set_tag('RELAYSTRUSTED', $self->{relays_trusted_str});
1716	96					302	$self->set_tag('RELAYSUNTRUSTED', $self->{relays_untrusted_str});
1717	96					343	$self->set_tag('RELAYSINTERNAL', $self->{relays_internal_str});
1718	96					542	$self->set_tag('RELAYSEXTERNAL', $self->{relays_external_str});
1719	96					321	my $lasthop = $self->{relays_external}->[0];
1720	96	100				300	if ($lasthop) {
1721	14					51	$self->set_tag('LASTEXTERNALIP', $lasthop->{ip});
1722							}
1723	96					590	$self->set_tag('LANGUAGES', $self->{msg}->get_metadata("X-Languages"));
1724
1725							# This should happen before we get called, but just in case.
1726	96	100				452	if (!defined $self->{msg}->{metadata}->{html}) {
1727	95					516	$self->get_decoded_stripped_body_text_array();
1728							}
1729	96					391	$self->{html} = $self->{msg}->{metadata}->{html};
1730
1731							# allow plugins to add more metadata, read the stuff that's there, etc.
1732	96					628	$self->{main}->call_plugins ("parsed_metadata", { permsgstatus => $self });
1733							}
1734
1735							###########################################################################
1736
1737							=item $status->get_decoded_body_text_array ()
1738
1739							Returns the message body, with B<base64> or B<quoted-printable> encodings
1740							decoded, and non-text parts or non-inline attachments stripped.
1741
1742							It is returned as an array of strings, with each string representing
1743							one newline-separated line of the body.
1744
1745							=cut
1746
1747							sub get_decoded_body_text_array {
1748	98			98	1	414	return $_[0]->{msg}->get_decoded_body_text_array();
1749							}
1750
1751							=item $status->get_decoded_stripped_body_text_array ()
1752
1753							Returns the message body, decoded (as described in
1754							get_decoded_body_text_array()), with HTML rendered, and with whitespace
1755							normalized.
1756
1757							It will always render text/html, and will use a heuristic to determine if other
1758							text/* parts should be considered text/html.
1759
1760							It is returned as an array of strings, with each string representing one
1761							'paragraph'. Paragraphs, in plain-text mails, are double-newline-separated
1762							blocks of multi-line text.
1763
1764							=cut
1765
1766							sub get_decoded_stripped_body_text_array {
1767	327			327	1	1345	return $_[0]->{msg}->get_rendered_body_text_array();
1768							}
1769
1770							###########################################################################
1771
1772							=item $status->get (header_name [, default_value])
1773
1774							Returns a message header, pseudo-header, real name or address.
1775							C<header_name> is the name of a mail header, such as 'Subject', 'To',
1776							etc. If C<default_value> is given, it will be used if the requested
1777							C<header_name> does not exist.
1778
1779							Appending C<:raw> to the header name will inhibit decoding of quoted-printable
1780							or base-64 encoded strings.
1781
1782							Appending a modifier C<:addr> to a header field name will cause everything
1783							except the first email address to be removed from the header field. It is
1784							mainly applicable to header fields 'From', 'Sender', 'To', 'Cc' along with
1785							their 'Resent-*' counterparts, and the 'Return-Path'. For example, all of
1786							the following will result in "example@foo":
1787
1788							=over 4
1789
1790							=item example@foo
1791
1792							=item example@foo (Foo Blah)
1793
1794							=item example@foo, example@bar
1795
1796							=item display: example@foo (Foo Blah), example@bar ;
1797
1798							=item Foo Blah <example@foo>
1799
1800							=item "Foo Blah" <example@foo>
1801
1802							=item "'Foo Blah'" <example@foo>
1803
1804							=back
1805
1806							Appending a modifier C<:name> to a header field name will cause everything
1807							except the first display name to be removed from the header field. It is
1808							mainly applicable to header fields containing a single mail address: 'From',
1809							'Sender', along with their 'Resent-From' and 'Resent-Sender' counterparts.
1810							For example, all of the following will result in "Foo Blah". One level of
1811							single quotes is stripped too, as it is often seen.
1812
1813							=over 4
1814
1815							=item example@foo (Foo Blah)
1816
1817							=item example@foo (Foo Blah), example@bar
1818
1819							=item display: example@foo (Foo Blah), example@bar ;
1820
1821							=item Foo Blah <example@foo>
1822
1823							=item "Foo Blah" <example@foo>
1824
1825							=item "'Foo Blah'" <example@foo>
1826
1827							=back
1828
1829							There are several special pseudo-headers that can be specified:
1830
1831							=over 4
1832
1833							=item C<ALL> can be used to mean the text of all the message's headers.
1834
1835							=item C<ALL-TRUSTED> can be used to mean the text of all the message's headers
1836							that could only have been added by trusted relays.
1837
1838							=item C<ALL-INTERNAL> can be used to mean the text of all the message's headers
1839							that could only have been added by internal relays.
1840
1841							=item C<ALL-UNTRUSTED> can be used to mean the text of all the message's
1842							headers that may have been added by untrusted relays. To make this
1843							pseudo-header more useful for header rules the 'Received' header that was added
1844							by the last trusted relay is included, even though it can be trusted.
1845
1846							=item C<ALL-EXTERNAL> can be used to mean the text of all the message's headers
1847							that may have been added by external relays. Like C<ALL-UNTRUSTED> the
1848							'Received' header added by the last internal relay is included.
1849
1850							=item C<ToCc> can be used to mean the contents of both the 'To' and 'Cc'
1851							headers.
1852
1853							=item C<EnvelopeFrom> is the address used in the 'MAIL FROM:' phase of the SMTP
1854							transaction that delivered this message, if this data has been made available
1855							by the SMTP server.
1856
1857							=item C<MESSAGEID> is a symbol meaning all Message-Id's found in the message;
1858							some mailing list software moves the real 'Message-Id' to 'Resent-Message-Id'
1859							or 'X-Message-Id', then uses its own one in the 'Message-Id' header. The value
1860							returned for this symbol is the text from all 3 headers, separated by newlines.
1861
1862							=item C<X-Spam-Relays-Untrusted> is the generated metadata of untrusted relays
1863							the message has passed through
1864
1865							=item C<X-Spam-Relays-Trusted> is the generated metadata of trusted relays
1866							the message has passed through
1867
1868							=back
1869
1870							=cut
1871
1872							# only uses two arguments, ignores $defval
1873							sub _get {
1874	3368			3368		5933	my ($self, $request) = @_;
1875
1876	3368					3765	my $result;
1877	3368					4131	my $getaddr = 0;
1878	3368					3733	my $getname = 0;
1879	3368					3796	my $getraw = 0;
1880
1881							# special queries - process and strip modifiers
1882	3368	100				7095	if (index($request,':') >= 0) { # triage
1883	596					1455	local $1;
1884	596					3726	while ($request =~ s/:([^:]*)//) {
1885	596	100				2170	if ($1 eq 'raw') { $getraw = 1 }
	12	50				42
		0
1886	584					1843	elsif ($1 eq 'addr') { $getaddr = $getraw = 1 }
1887	0					0	elsif ($1 eq 'name') { $getname = 1 }
1888							}
1889							}
1890	3368					5368	my $request_lc = lc $request;
1891
1892							# ALL: entire pristine or semi-raw headers
1893	3368	100				16354	if ($request eq 'ALL') {
		50
		50
		50
		50
		100
		50
		50
		50
		50
		100
		100
1894							$result = $getraw ? $self->{msg}->get_pristine_header()
1895	11	50				78	: $self->{msg}->get_all_headers(1);
1896							}
1897							# ALL-TRUSTED: entire trusted raw headers
1898							elsif ($request eq 'ALL-TRUSTED') {
1899							# '+1' since we added the received header even though it's not considered
1900							# trusted, so we know that those headers can be trusted too
1901							return $self->get_all_hdrs_in_rcvd_index_range(
1902	0					0	undef, $self->{last_trusted_relay_index}+1);
1903							}
1904							# ALL-INTERNAL: entire internal raw headers
1905							elsif ($request eq 'ALL-INTERNAL') {
1906							# '+1' for the same reason as in ALL-TRUSTED above
1907							return $self->get_all_hdrs_in_rcvd_index_range(
1908	0					0	undef, $self->{last_internal_relay_index}+1);
1909							}
1910							# ALL-UNTRUSTED: entire untrusted raw headers
1911							elsif ($request eq 'ALL-UNTRUSTED') {
1912							# '+1' for the same reason as in ALL-TRUSTED above
1913							return $self->get_all_hdrs_in_rcvd_index_range(
1914	0					0	$self->{last_trusted_relay_index}+1, undef);
1915							}
1916							# ALL-EXTERNAL: entire external raw headers
1917							elsif ($request eq 'ALL-EXTERNAL') {
1918							# '+1' for the same reason as in ALL-TRUSTED above
1919							return $self->get_all_hdrs_in_rcvd_index_range(
1920	0					0	$self->{last_internal_relay_index}+1, undef);
1921							}
1922							# EnvelopeFrom: the SMTP MAIL FROM: address
1923							elsif ($request_lc eq "\LEnvelopeFrom") {
1924	110					720	$result = $self->get_envelope_from();
1925							}
1926							# untrusted relays list, as string
1927							elsif ($request_lc eq "\LX-Spam-Relays-Untrusted") {
1928	0					0	$result = $self->{relays_untrusted_str};
1929							}
1930							# trusted relays list, as string
1931							elsif ($request_lc eq "\LX-Spam-Relays-Trusted") {
1932	0					0	$result = $self->{relays_trusted_str};
1933							}
1934							# external relays list, as string
1935							elsif ($request_lc eq "\LX-Spam-Relays-External") {
1936	0					0	$result = $self->{relays_external_str};
1937							}
1938							# internal relays list, as string
1939							elsif ($request_lc eq "\LX-Spam-Relays-Internal") {
1940	0					0	$result = $self->{relays_internal_str};
1941							}
1942							# ToCc: the combined recipients list
1943							elsif ($request_lc eq "\LToCc") {
1944	77					372	$result = join("\n", $self->{msg}->get_header('To', $getraw));
1945	77	100				309	if ($result ne '') {
1946	5					17	chomp $result;
1947	5	50				30	$result .= ", " if $result =~ /\S/;
1948							}
1949	77					289	$result .= join("\n", $self->{msg}->get_header('Cc', $getraw));
1950	77	100				390	$result = undef if $result eq '';
1951							}
1952							# MESSAGEID: handle lists which move the real message-id to another
1953							# header for resending.
1954							elsif ($request eq 'MESSAGEID') {
1955	38	50				358	$result = join("\n", grep { defined($_) && $_ ne '' }
1956							$self->{msg}->get_header('X-Message-Id', $getraw),
1957							$self->{msg}->get_header('Resent-Message-Id', $getraw),
1958							$self->{msg}->get_header('X-Original-Message-ID', $getraw),
1959	77					374	$self->{msg}->get_header('Message-Id', $getraw));
1960							}
1961							# a conventional header
1962							else {
1963							my @results = $getraw ? $self->{msg}->raw_header($request)
1964	3093	100				8696	: $self->{msg}->get_header($request);
1965							# dbg("message: get(%s)%s = %s",
1966							# $request, $getraw?'raw':'', join(", ",@results));
1967	3093	100				4926	if (@results) {
1968	330					1093	$result = join('', @results);
1969							} else { # metadata
1970	2763					6353	$result = $self->{msg}->get_metadata($request);
1971							}
1972							}
1973
1974							# special queries
1975	3368	100	66			7739	if (defined $result && ($getaddr \|\| $getname)) {
			66
1976	132					436	local $1;
1977	132					383	$result =~ s/^[^:]+:(.);\s$/$1/gs; # 'undisclosed-recipients: ;'
1978	132					818	$result =~ s/\s+/ /g; # reduce whitespace
1979	132					580	$result =~ s/^\s+//; # leading whitespace
1980	132					721	$result =~ s/\s+$//; # trailing whitespace
1981
1982	132	50				391	if ($getaddr) {
		0
1983							# Get the email address out of the header
1984							# All of these should result in "jm@foo":
1985							# jm@foo
1986							# jm@foo (Foo Blah)
1987							# jm@foo, jm@bar
1988							# display: jm@foo (Foo Blah), jm@bar ;
1989							# Foo Blah <jm@foo>
1990							# "Foo Blah" <jm@foo>
1991							# "'Foo Blah'" <jm@foo>
1992							#
1993							# strip out the (comments)
1994	132					315	$result =~ s/\s$.?$//g;
1995							# strip out the "quoted text", unless it's the only thing in the string
1996	132	100				412	if ($result !~ /^".*"$/) {
1997	131					402	$result =~ s/(?<!<)"[^"]*"(?!\@)//g; #" emacs
1998							}
1999							# Foo Blah <jm@xxx> or <jm@xxx>
2000	132					349	local $1;
2001	132					364	$result =~ s/^[^"<]?<(.?)>.*$/$1/;
2002							# multiple addresses on one line? remove all but first
2003	132					375	$result =~ s/,.*$//;
2004							}
2005							elsif ($getname) {
2006							# Get the display name out of the header
2007							# All of these should result in "Foo Blah":
2008							#
2009							# jm@foo (Foo Blah)
2010							# (Foo Blah) jm@foo
2011							# jm@foo (Foo Blah), jm@bar
2012							# display: jm@foo (Foo Blah), jm@bar ;
2013							# Foo Blah <jm@foo>
2014							# "Foo Blah" <jm@foo>
2015							# "'Foo Blah'" <jm@foo>
2016							#
2017	0					0	local $1;
2018							# does not handle mailbox-list or address-list well, to be improved
2019	0	0				0	if ($result =~ /^ \s* (.?) \s < [^<>]* >/sx) {
		0
2020	0					0	$result = $1; # display-name, RFC 5322
2021							# name-addr = [display-name] angle-addr
2022							# display-name = phrase
2023							# phrase = 1*word / obs-phrase
2024							# word = atom / quoted-string
2025							# obs-phrase = word *(word / "." / CFWS)
2026	0					0	$result =~ s{ " ( (?: [^"\\] \| \\. )* ) " }
	0					0
	0					0
	0					0
2027							{ my $s=$1; $s=~s{\\(.)}{$1}gs; $s }gsxe;
2028							} elsif ($result =~ /^ [^(,]? $ (.?) $ /sx) { # legacy form
2029	0					0	# nested comments are not handled, to be improved
2030							$result = $1;
2031	0					0	} else { # no display name
2032							$result = '';
2033	0					0	}
2034							$result =~ s/^ \s* ' \s* (.?) \s ' \s* \z/$1/sx;
2035							}
2036	3368					5523	}
2037							return $result;
2038							}
2039
2040							# optimized for speed
2041							# $_[0] is self
2042							# $_[1] is request
2043							# $_[2] is defval
2044	3894			3894	1	17277	sub get {
2045	3894					4426	my $cache = $_[0]->{c};
2046	3894	100				7309	my $found;
2047							if (exists $cache->{$_[1]}) {
2048							# return cache entry if it is known
2049	526					978	# (measured hit/attempts rate on a production mailer is about 47%)
2050							$found = $cache->{$_[1]};
2051							} else {
2052	3368					5734	# fill in a cache entry
2053	3368					9433	$found = _get(@_);
2054							$cache->{$_[1]} = $found;
2055							}
2056							# if the requested header wasn't found, we should return a default value
2057							# as specified by the caller: if defval argument is present it represents
2058							# a default value even if undef; if defval argument is absent a default
2059	3894	100				21414	# value is an empty string for upwards compatibility
		100
2060							return (defined $found ? $found : @_ > 2 ? $_[2] : '');
2061							}
2062
2063							###########################################################################
2064
2065							# uri parsing from plain text:
2066							# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
2067							# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
2068							# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
2069							# When we extract a domain and look it up in an RBL, an FP on deciding that the text is a URI is not much
2070							# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
2071							# because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
2072							# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
2073							# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
2074							# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
2075							# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
2076							# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
2077							#
2078							# We get the entire URI that would be linkified before dealing with it, in order to do the right thing
2079							# with URI-encodings and redirecting URIs.
2080							#
2081							# The delimiters for start of a URI in TBird are @(`{\|[\"'<>,\s in OE they are ("<\s
2082							#
2083							# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
2084							# TBird's end delimiters are )`{}\|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
2085							# OE only uses space as a delimiter, but ignores [~!@#^&()_+`-={}\|[]:";'<>?,.] at the end.
2086							#
2087							# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
2088							# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
2089							# delimiters include ( and '
2090							#
2091							# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
2092
2093	130			130		372	sub _tbirdurire {
2094							my ($self) = @_;
2095
2096	130	50				509	# Cached?
2097							return $self->{tbirdurire} if $self->{tbirdurire};
2098
2099	130					353	# a hybrid of tbird and oe's version of uri parsing
2100	130					272	my $tbirdstartdelim = '><"\'`,{[(\|\s' . "\x1b"; # The \x1b as per bug 4522
2101	130					261	my $iso2022shift = "\x1b" . '\(.'; # bug 4522
2102	130					361	my $tbirdenddelim = '><"`}\]{[\|\s' . "\x1b"; # The \x1b as per bug 4522
2103							my $nonASCII = '\x80-\xff';
2104
2105							# bug 7100: we allow a comma to delimit the end of an email address because it will never appear in a domain name, and
2106	130					373	# it's a common thing to find in text
2107	130					321	my $tbirdenddelimemail = $tbirdenddelim . ',(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
2108							my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
2109
2110	130					475	# valid TLDs
2111							my $tldsRE = $self->{main}->{registryboundaries}->{valid_tlds_re};
2112
2113							# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
2114	130					31780	# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
2115	130					1425	my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
2116	130					1338	my $uriknownscheme = qr/(?:(?:(?:(?:https?)\|(?:ftp)):(?:\/\/)?)\|(?:(?:www\d{0,2}\|ftp)\.))[^$tbirdenddelim]{1,251}/io;
2117							my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
2118	130					30238
2119							$self->{tbirdurire} = qr/(?:\b\|(?<=$iso2022shift)\|(?<=[$tbirdstartdelim]))
2120							(?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]\|\z))) \|
2121							(?:($urimailscheme)(?=(?:[$tbirdenddelimemail]\|\z))) \|
2122							(?:\b($urischemeless)(?=(?:[$tbirdenddelim]\|\z))))/xo;
2123	130					642
2124							return $self->{tbirdurire};
2125							}
2126
2127							=item $status->get_uri_list ()
2128
2129							Returns an array of all unique URIs found in the message. It takes
2130							a combination of the URIs found in the rendered (decoded and HTML
2131							stripped) body and the URIs found when parsing the HTML in the message.
2132							Will also set $status->{uri_list} (the array as returned by this function).
2133
2134							The returned array will include the "raw" URI as well as
2135							"slightly cooked" versions. For example, the single URI
2136							'http://%77w%77.example.com/' will get turned into:
2137							( 'http://%77w%77.example.com/', 'http://www.example.com/' )
2138
2139							=cut
2140
2141	130			130	1	477	sub get_uri_list {
2142							my ($self) = @_;
2143
2144	130	50				529	# use cached answer if available
2145	0					0	if (defined $self->{uri_list}) {
	0					0
2146							return @{$self->{uri_list}};
2147							}
2148	130					235
2149							my @uris;
2150							# $self->{redirect_num} = 0;
2151
2152	130					272	# get URIs from HTML parsing
	1383					2071
2153	1253	50				2034	while(my($uri, $info) = each %{ $self->get_uri_detail_list() }) {
2154	1253					1161	if ($info->{cleaned}) {
	1253					1654
2155	1675					2471	foreach (@{$info->{cleaned}}) {
2156							push(@uris, $_);
2157
2158							# count redirection attempts and log it
2159							# if (my @http = m{\b(https?:/{0,2})}gi) {
2160							# $self->{redirect_num} = $#http if ($#http > $self->{redirect_num});
2161							# }
2162							}
2163							}
2164							}
2165	130					469
2166							$self->{uri_list} = \@uris;
2167							# $self->set_tag('URILIST', @uris == 1 ? $uris[0] : \@uris) if @uris;
2168	130					923
2169							return @uris;
2170							}
2171
2172							=item $status->get_uri_detail_list ()
2173
2174							Returns a hash reference of all unique URIs found in the message and
2175							various data about where the URIs were found in the message. It takes a
2176							combination of the URIs found in the rendered (decoded and HTML stripped)
2177							body and the URIs found when parsing the HTML in the message. Will also
2178							set $status->{uri_detail_list} (the hash reference as returned by this
2179							function). This function will also set $status->{uri_domain_count} (count of
2180							unique domains).
2181
2182							The hash format looks something like this:
2183
2184							raw_uri => {
2185							types => { a => 1, img => 1, parsed => 1 },
2186							cleaned => [ canonicalized_uri ],
2187							anchor_text => [ "click here", "no click here" ],
2188							domains => { domain1 => 1, domain2 => 1 },
2189							}
2190
2191							C<raw_uri> is whatever the URI was in the message itself
2192							(http://spamassassin.apache%2Eorg/).
2193
2194							C<types> is a hash of the HTML tags (lowercase) which referenced
2195							the raw_uri. I<parsed> is a faked type which specifies that the
2196							raw_uri was seen in the rendered text.
2197
2198							C<cleaned> is an array of the raw and canonicalized version of the raw_uri
2199							(http://spamassassin.apache%2Eorg/, http://spamassassin.apache.org/).
2200
2201							C<anchor_text> is an array of the anchor text (text between <a> and
2202							</a>), if any, which linked to the URI.
2203
2204							C<domains> is a hash of the domains found in the canonicalized URIs.
2205
2206							C<hosts> is a hash of unstripped hostnames found in the canonicalized URIs
2207							as hash keys, with their domain part stored as a value of each hash entry.
2208
2209							=cut
2210
2211	1387			1387	1	1703	sub get_uri_detail_list {
2212							my ($self) = @_;
2213
2214	1387	100				2133	# use cached answer if available
2215	1257					3065	if (defined $self->{uri_detail_list}) {
2216							return $self->{uri_detail_list};
2217							}
2218	130					553
2219							my $timer = $self->{main}->time_method("get_uri_detail_list");
2220	130					503
2221							$self->{uri_domain_count} = 0;
2222
2223	130					736	# do this so we're sure metadata->html is setup
	1306					2530
2224							my %parsed = map { $_ => 'parsed' } $self->_get_parsed_uri_list();
2225
2226
2227							# This parses of DKIM for URIs disagrees with documentation and bug 6700 votes to disable
2228							# this functionality
2229							# 2013-01-07
2230							# This functionality is re-enabled as a configuration option disabled by
2231							# default (bug 7087)
2232							# 2014-10-06
2233
2234	130	50				613	# Look for the domain in DK/DKIM headers
2235	130					550	if ( $self->{conf}->{parse_dkim_uris} ) {
	260					777
2236							my $dk = join(" ", grep {defined} ( $self->get('DomainKey-Signature',undef),
2237	130					560	$self->get('DKIM-Signature',undef) ));
2238	0					0	while ($dk =~ /\bd\s=\s([^;]+)/g) {
2239	0					0	my $dom = $1;
2240	0					0	$dom =~ s/\s+//g;
2241							$parsed{$dom} = 'domainkeys';
2242							}
2243							}
2244
2245							# get URIs from HTML parsing
2246	130		100			968	# use the metadata version since $self->{html} may not be setup
2247	130	50				609	my $detail = $self->{msg}->{metadata}->{html}->{uri_detail} \|\| { };
2248							$self->{'uri_truncated'} = 1 if $self->{msg}->{metadata}->{html}->{uri_truncated};
2249
2250	130					375	# don't keep dereferencing ...
2251							my $redirector_patterns = $self->{conf}->{redirector_patterns};
2252
2253	130					340	# canonicalize the HTML parsed URIs
	257					1026
2254	127					304	while(my($uri, $info) = each %{ $detail }) {
2255	127					282	my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
2256							$info->{cleaned} = \@tmp;
2257	127					187
2258	239					626	foreach (@tmp) {
2259	239	100	66			997	my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
			100
2260							if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
2261	111					263	# unstripped full host name as a key, and its domain part as a value
2262	111	50	33			444	$info->{hosts}->{$host} = $domain;
			33
2263	111					186	if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}) {
2264	111					203	$info->{domains}->{$domain} = 1; # stripped to domain boundary
2265							$self->{uri_domain_count}++;
2266							}
2267							}
2268							}
2269	127	50				300
2270	0					0	if (would_log('dbg', 'uri') == 2) {
2271	0					0	dbg("uri: html uri found, $uri");
2272	0					0	foreach my $nuri (@tmp) {
2273							dbg("uri: cleaned html uri, $nuri");
2274	0	0	0			0	}
2275	0					0	if ($info->{hosts} && $info->{domains}) {
	0					0
2276	0					0	for my $host (keys %{$info->{hosts}}) {
2277							dbg("uri: html host %s, domain %s", $host, $info->{hosts}->{$host});
2278							}
2279							}
2280							}
2281							}
2282
2283	130					685	# canonicalize the text parsed URIs
2284	1126					3477	while (my($uri, $type) = each %parsed) {
2285	1126					1372	$detail->{$uri}->{types}->{$type} = 1;
2286							my $info = $detail->{$uri};
2287	1126					1219
2288							my @uris;
2289	1126	50				1701
2290	1126	50				1543	if (!exists $info->{cleaned}) {
2291	1126					2151	if ($type eq 'parsed') {
2292							@uris = uri_list_canonicalize($redirector_patterns, $uri);
2293							}
2294	0					0	else {
2295							@uris = ( $uri );
2296	1126					2045	}
2297							$info->{cleaned} = \@uris;
2298	1126					1640
2299	1436					3571	foreach (@uris) {
2300	1436	100	66			6527	my($domain,$host) = $self->{main}->{registryboundaries}->uri_to_domain($_);
			100
2301							if (defined $host && $host ne '' && !$info->{hosts}->{$host}) {
2302	1126					2430	# unstripped full host name as a key, and its domain part as a value
2303	1126	50	33			4549	$info->{hosts}->{$host} = $domain;
			33
2304	1126					2164	if (defined $domain && $domain ne '' && !$info->{domains}->{$domain}){
2305	1126					1900	$info->{domains}->{$domain} = 1;
2306							$self->{uri_domain_count}++;
2307							}
2308							}
2309							}
2310							}
2311	1126	50				2464
2312	0					0	if (would_log('dbg', 'uri') == 2) {
2313	0					0	dbg("uri: parsed uri found of type $type, $uri");
2314	0					0	foreach my $nuri (@uris) {
2315							dbg("uri: cleaned parsed uri, $nuri");
2316	0	0	0			0	}
2317	0					0	if ($info->{hosts} && $info->{domains}) {
	0					0
2318	0					0	for my $host (keys %{$info->{hosts}}) {
2319							dbg("uri: parsed host %s, domain %s", $host, $info->{hosts}->{$host});
2320							}
2321							}
2322							}
2323							}
2324
2325	130					437	# setup the cache
2326							$self->{uri_detail_list} = $detail;
2327	130					886
2328							return $detail;
2329							}
2330
2331	130			130		358	sub _get_parsed_uri_list {
2332							my ($self) = @_;
2333
2334	130	50				588	# use cached answer if available
2335							unless (defined $self->{parsed_uri_list}) {
2336							# TVD: we used to use decoded_body which is fine, except then we'll
2337							# try parsing URLs out of HTML, which is what the HTML code is going
2338							# to do (note: we know the HTML parsing occurs, because we call for the
2339							# rendered text which does HTML parsing...) trying to get URLs out of
2340							# HTML w/out parsing causes issues, so let's not do it.
2341							# also, if we allow $textary to be passed in, we need to invalidate
2342	130					488	# the cache first. fyi.
2343	130					400	my $textary = $self->get_decoded_stripped_body_text_array();
2344							my $redirector_patterns = $self->{conf}->{redirector_patterns};
2345	130					500
2346	130					0	my ($rulename, $pat, @uris);
2347	130					572	my $text;
2348							my $tbirdurire = $self->_tbirdurire;
2349	130					582
2350							for my $entry (@$textary) {
2351
2352							# a workaround for [perl #69973] bug:
2353							# Invalid and tainted utf-8 char crashes perl 5.10.1 in regexp evaluation
2354							# Bug 6225, regexp and string should both be utf8, or none of them;
2355							# untainting string also seems to avoid the crash
2356							#
2357	672					1714	# Bug 6225: untaint the string in an attempt to work around a perl crash
2358							local $_ = untaint_var($entry);
2359	672					1623
2360	672					62668	local($1,$2,$3);
2361	831		66			2791	while (/$tbirdurire/igo) {
2362	831					1348	my $rawuri = $1\|\|$2\|\|$3;
2363	831					3148	$rawuri =~ s/(^[^(])\).$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
2364							$rawuri =~ s/[-~!@#^&()_+=:;\'?,.]$//; # remove trailing string of punctuations that TBird ignores
2365	831	100				1758	# skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
2366							next if $rawuri =~ /^(?:(?:https?\|ftp\|mailto):(?:\/\/)?)?[a-z\d.-]*\.\./i;
2367
2368							# If it's a hostname that was just sitting out in the
2369							# open, without a protocol, and not inside of an HTML tag,
2370							# the we should add the proper protocol in front, rather
2371	829					1043	# than using the base URI.
2372	829					871	my $uri = $rawuri;
2373	829	100				1620	my $rblonly;
2374	664	100				1761	if ($uri !~ /^(?:https?\|ftp\|mailto\|javascript\|file):/i) {
		100
		100
2375	1					3	if ($uri =~ /^ftp\./i) {
2376							$uri = "ftp://$uri";
2377							}
2378	316					542	elsif ($uri =~ /^www\d{0,2}\./i) {
2379							$uri = "http://$uri";
2380							}
2381	34					98	elsif ($uri =~ /\@/) {
2382							$uri = "mailto:$uri";
2383							}
2384							else {
2385	313					340	# some spammers are using unschemed URIs to escape filters
2386	313					524	$rblonly = 1; # flag that this is a URI that MUAs don't linkify so only use for RBLs
2387							$uri = "http://$uri";
2388							}
2389							}
2390	829	100				1682
2391							if ($uri =~ /^mailto:/i) {
2392	39	50				100	# skip a mail link that does not have a valid TLD or other than one @ after decoding any URLEncoded characters
2393	39	100				140	$uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]\|[3-6][0-9a-fA-F]\|7[0-9a-eA-E])/);
2394	37					126	next if ($uri !~ /^[^@]+@[^@]+$/);
2395	37	100				105	my $domuri = $self->{main}->{registryboundaries}->uri_to_domain($uri);
2396	35					425	next unless $domuri;
2397	35	100				114	push (@uris, $rawuri);
2398							push (@uris, $uri) unless ($rawuri eq $uri);
2399							}
2400	825	100				2945
2401							next unless ($uri =~/^(?:https?\|ftp):/i); # at this point only valid if one or the other of these
2402	790					1768
2403	790					1004	my @tmp = uri_list_canonicalize($redirector_patterns, $uri);
2404	790					1245	my $goodurifound = 0;
2405	800					2254	foreach my $cleanuri (@tmp) {
2406	800	100				1436	my $domain = $self->{main}->{registryboundaries}->uri_to_domain($cleanuri);
2407							if ($domain) {
2408	777	100				1091	# bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
2409	312					564	if ($rblonly) {
2410	312					1301	local $1;
2411							$cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/i;
2412	777					1186	}
2413	777					1151	push (@uris, $cleanuri);
2414							$goodurifound = 1;
2415							}
2416	790	100				1177	}
2417	774	100				5699	next unless $goodurifound;
2418							push @uris, $rawuri unless $rblonly;
2419							}
2420							}
2421
2422	130					567	# Make sure all the URIs are nice and short
2423	1306	50				1772	foreach my $uri ( @uris ) {
2424	0					0	if (length $uri > MAX_URI_LENGTH) {
2425	0					0	$self->{'uri_truncated'} = 1;
2426							$uri = substr $uri, 0, MAX_URI_LENGTH;
2427							}
2428							}
2429
2430	130					614	# setup the cache and return
2431							$self->{parsed_uri_list} = \@uris;
2432							}
2433	130					260
	130					720
2434							return @{$self->{parsed_uri_list}};
2435							}
2436
2437							###########################################################################
2438
2439	0			0	0	0	sub ensure_rules_are_complete {
2440	0					0	my $self = shift;
2441							my $metarule = shift;
2442							# @_ is now the list of rules
2443	0					0
2444							foreach my $r (@_) {
2445	0	0				0	# dbg("rules: meta rule depends on net rule $r");
2446							next if ($self->is_rule_complete($r));
2447	0					0
2448	0					0	dbg("rules: meta rule $metarule depends on pending rule $r, blocking");
2449							my $timer = $self->{main}->time_method("wait_for_pending_rules");
2450	0					0
2451	0					0	my $start = time;
2452	0					0	$self->harvest_until_rule_completes($r);
2453							my $elapsed = time - $start;
2454	0	0				0
		0
2455	0					0	if (!$self->is_rule_complete($r)) {
2456							dbg("rules: rule $r is still not complete; exited early?");
2457							}
2458	0					0	elsif ($elapsed > 0) {
2459							info("rules: $r took $elapsed seconds to complete, for $metarule");
2460							}
2461							}
2462							}
2463
2464							###########################################################################
2465
2466							# use a separate sub here, for brevity
2467							# called out of generated eval
2468	0			0	0	0	sub handle_eval_rule_errors {
2469	0					0	my ($self, $rulename) = @_;
2470	0					0	warn "rules: failed to run $rulename test, skipping:\n\t($@)\n";
2471							$self->{rule_errors}++;
2472							}
2473
2474	3281			3281	0	6058	sub register_plugin_eval_glue {
2475							my ($self, $function) = @_;
2476	3281	50				5724
2477	0					0	if (!$function) {
2478	0					0	warn "rules: empty function name";
2479							return;
2480							}
2481
2482	3281	100				39208	# only need to call this once per fn (globally)
2483	220					483	return if exists $TEMPORARY_EVAL_GLUE_METHODS{$function};
2484							$TEMPORARY_EVAL_GLUE_METHODS{$function} = undef;
2485
2486	220	50				848	# return if it's not an eval_plugin function
2487							return if (!exists $self->{conf}->{eval_plugins}->{$function});
2488
2489							# return if it's been registered already
2490	220	50	33			1582	return if ($self->can ($function) &&
	0					0
2491							defined &{'Mail::SpamAssassin::PerMsgStatus::'.$function});
2492	220					737
2493							my $evalstr = <<"ENDOFEVAL";
2494							{
2495							package Mail::SpamAssassin::PerMsgStatus;
2496
2497							sub $function {
2498							my (\$self) = shift;
2499							my \$plugin = \$self->{conf}->{eval_plugins}->{$function};
2500							return \$plugin->$function (\$self, \@_);
2501							}
2502
2503							1;
2504							}
2505							ENDOFEVAL
2506	220	50		462	1	24229	eval $evalstr . '; 1' ## no critic
	462			77	1	1102
	462			77	0	1034
	462			77	0	1509
	77			0	0	317
	77			77	0	303
	77			89	1	442
	77			77	1	266
	77			77	0	318
	77			165	1	445
	77			159	1	289
	77			123	1	284
	77			145	0	453
	0			593	0	0
	0			119	0	0
	0			122	0	0
	77			4	1	291
	77			0	1	270
	77			0	0	454
	77			0	1	293
	77			8	0	310
	77			0	0	549
	77			0	0	346
	77			0	0	294
	77			77	0	482
	77			77	0	267
	77			77	0	286
	77			77	0	549
	165			77	0	473
	165			77	0	477
	165					630
	119					347
	119					356
	119					521
	111					394
	111					489
	111					596
	77					320
	77					288
	77					376
	573					1404
	539					1304
	539					1751
	111					372
	77					303
	77					567
	111					361
	111					318
	111					638
	34					174
	34					112
	0					0
	34					234
	34					133
	34					121
	68					161
	68					141
	68					183
	68					240
	68					461
	42					79
	42					64
	42					51
	42					62
	42					77
	0					0
	42					144
	42					77
	42					78
	0					0
	0					0
	0					0
	42					58
	119					402
	119					340
	119					515
	119					327
	119					350
	77					466
	119					436
	119					429
	85					517
	85					303
	85					299
	123					565
	123					482
	123					372
	77					465
	77					300
	111					313
	111					506
2507	0	0				0	or do {
	0					0
2508	0					0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
2509	0					0	warn "rules: failed to run header tests, skipping some: $eval_stat\n";
2510							$self->{rule_errors}++;
2511							};
2512
2513	220					2802	# ensure this method is deleted if finish_tests() is called
2514							push (@TEMPORARY_METHODS, $function);
2515							}
2516
2517							###########################################################################
2518
2519							# note: only eval tests should store state in $self->{test_log_msgs};
2520							# pattern tests do not.
2521							#
2522							# the clearing of the test state is now inlined as:
2523							#
2524							# %{$self->{test_log_msgs}} = (); # clear test state
2525							#
2526							# except for this public API for plugin use:
2527
2528							=item $status->clear_test_state()
2529
2530							Clear test state, including test log messages from C<$status-E<gt>test_log()>.
2531
2532							=cut
2533
2534	0			0	1	0	sub clear_test_state {
2535	34					95	my ($self) = @_;
	34					96
2536							%{$self->{test_log_msgs}} = ();
2537							}
2538
2539							# internal API, called only by get_hit()
2540							# TODO: refactor and merge this into that function
2541	174			128		503	sub _handle_hit {
2542							my ($self, $rule, $score, $area, $ruletype, $desc) = @_;
2543	174					1012
2544							$self->{main}->call_plugins ("hit_rule", {
2545							permsgstatus => $self,
2546							rulename => $rule,
2547							ruletype => $ruletype,
2548							score => $score
2549							});
2550
2551	174	100				790	# ignore meta-match sub-rules.
	88					207
	76					206
	76					156
2552							if ($rule =~ /^__/) { push(@{$self->{subtest_names_hit}}, $rule); return; }
2553
2554	52	50				185	# this should not happen; warn about it
2555	46					111	if (!defined $score) {
2556	34					82	warn "rules: score undef for rule '$rule' in '$area' '$desc'";
2557							return;
2558							}
2559
2560	86	50				312	# this should not happen; warn about NaN (bug 3364)
2561	42					63	if ($score != $score) {
2562	42					106	warn "rules: score '$score' for rule '$rule' in '$area' '$desc'";
2563							return;
2564							}
2565
2566	106					299	# Add the rule hit to the score
2567							$self->{score} += $score;
2568	55					123
	94					276
2569	97		100			417	push(@{$self->{test_names_hit}}, $rule);
2570							$area \|\|= '';
2571	97	100	66			438
2572	58					137	if ($score >= 10 \|\| $score <= -10) {
2573							$score = sprintf("%4.0f", $score);
2574							}
2575	51					819	else {
2576							$score = sprintf("%4.1f", $score);
2577							}
2578
2579							# save both summaries
2580							# TODO: this is slower than necessary, if we only need one
2581							$self->{tag_data}->{REPORT} .= sprintf ("* %s %s %s%s\n%s",
2582							$score, $rule, $area,
2583							$self->_wrap_desc($desc,
2584							4+length($rule)+length($score)+length($area), "* "),
2585	97	100				555	($self->{test_log_msgs}->{TERSE} ?
2586							"* " . $self->{test_log_msgs}->{TERSE} : ''));
2587
2588							$self->{tag_data}->{SUMMARY} .= sprintf ("%s %-22s %s%s\n%s",
2589							$score, $rule, $area,
2590							$self->_wrap_desc($desc,
2591	56		100			346	3+length($rule)+length($score)+length($area), " " x 28),
2592							($self->{test_log_msgs}->{LONG} \|\| ''));
2593							}
2594
2595	120			104		378	sub _wrap_desc {
2596							my ($self, $desc, $firstlinelength, $prefix) = @_;
2597	118					378
2598	104					484	my $firstline = " " x $firstlinelength;
2599	114					524	my $wrapped = Mail::SpamAssassin::Util::wrap($desc, $prefix, $firstline, $self->{conf}->{report_wrap_width}, 0);
2600	106					845	$wrapped =~ s/^\s+//s;
2601							$wrapped;
2602							}
2603
2604							###########################################################################
2605
2606							=item $status->got_hit ($rulename, $desc_prepend [, name => value, ...])
2607
2608							Register a hit against a rule in the ruleset.
2609
2610							There are two mandatory arguments. These are C<$rulename>, the name of the rule
2611							that fired, and C<$desc_prepend>, which is a short string that will be
2612							prepended to the rules C<describe> string in output reports.
2613
2614							In addition, callers can supplement that with the following optional
2615							data:
2616
2617							=over 4
2618
2619							=item score => $num
2620
2621							Optional: the score to use for the rule hit. If unspecified,
2622							the value from the C<Mail::SpamAssassin::Conf> object's C<{scores}>
2623							hash will be used (a configured score), and in its absence the
2624							C<defscore> option value.
2625
2626							=item defscore => $num
2627
2628							Optional: the score to use for the rule hit if neither the
2629							option C<score> is provided, nor a configured score value is provided.
2630
2631							=item value => $num
2632
2633							Optional: the value to assign to the rule; the default value is C<1>.
2634							I<tflags multiple> rules use values of greater than 1 to indicate
2635							multiple hits. This value is accessible to meta rules.
2636
2637							=item ruletype => $type
2638
2639							Optional, but recommended: the rule type string. This is used in the
2640							C<hit_rule> plugin call, called by this method. If unset, I<'unknown'> is
2641							used.
2642
2643							=item tflags => $string
2644
2645							Optional: a string, i.e. a space-separated list of additional tflags
2646							to be appended to an existing list of flags in $self->{conf}->{tflags},
2647							such as: "nice noautolearn multiple". No syntax checks are performed.
2648
2649							=item description => $string
2650
2651							Optional: a custom rule description string. This is used in the
2652							C<hit_rule> plugin call, called by this method. If unset, the static
2653							description is used.
2654
2655							=back
2656
2657							Backward compatibility: the two mandatory arguments have been part of this API
2658							since SpamAssassin 2.x. The optional I<name=<gt>value> pairs, however, are a
2659							new addition in SpamAssassin 3.2.0.
2660
2661							=cut
2662
2663	138			128	1	2789	sub got_hit {
2664							my ($self, $rule, $area, %params) = @_;
2665	138					450
2666							my $conf_ref = $self->{conf};
2667	138					248
2668	128					273	my $dynamic_score_provided;
2669	132	50				291	my $score = $params{score};
2670	4					16	if (defined $score) { # overrides any configured scores
2671							$dynamic_score_provided = 1;
2672	132					281	} else {
2673	132	50				328	$score = $conf_ref->{scores}->{$rule};
2674							$score = $params{defscore} if !defined $score;
2675							}
2676
2677							# adding a hit does nothing if we don't have a score -- we probably
2678	128	50				291	# shouldn't have run it in the first place
2679	0					0	if (!$score) {
	0					0
2680	4					24	%{$self->{test_log_msgs}} = ();
2681							return;
2682							}
2683
2684							# ensure that rule values always result in an increase
2685	128					235	# of $self->{tests_already_hit}->{$rule}:
2686	128	100	66			408	my $value = $params{value};
	122					190
2687							if (!$value \|\| $value <= 0) { $value = 1 }
2688	128					228
2689	132					237	my $tflags_ref = $conf_ref->{tflags};
2690	128	50	33			348	my $tflags_add = $params{tflags};
2691							if (defined $tflags_add && $tflags_add ne '') {
2692	0	0	0			0	$_ = (!defined $_ \|\| $_ eq '') ? $tflags_add : ($_ . ' ' . $tflags_add)
2693							for $tflags_ref->{$rule};
2694							};
2695	128		100			527
2696							my $already_hit = $self->{tests_already_hit}->{$rule} \|\| 0;
2697	128	100	50			342	# don't count hits multiple times, unless 'tflags multiple' is on
			66
2698	4					15	if ($already_hit && ($tflags_ref->{$rule}\|\|'') !~ /\bmultiple\b/) {
	0					0
2699	0					0	%{$self->{test_log_msgs}} = ();
2700							return;
2701							}
2702	128					331
2703							$self->{tests_already_hit}->{$rule} = $already_hit + $value;
2704
2705	132		100			387	# default ruletype, if not specified:
2706							$params{ruletype} \|\|= 'unknown';
2707	128	50				319
2708	0					0	if ($dynamic_score_provided) { # copy it to static for proper reporting
2709	0					0	$conf_ref->{scoreset}->[$_]->{$rule} = $score for (0..3);
2710							$conf_ref->{scores}->{$rule} = $score;
2711							}
2712	132					287
2713	128	50				262	my $rule_descr = $params{description};
2714	0					0	if (defined $rule_descr) {
2715							$conf_ref->{descriptions}->{$rule} = $rule_descr; # save dynamic descr.
2716	128					481	} else {
2717							$rule_descr = $conf_ref->get_description_for_rule($rule); # static
2718							}
2719							# Bug 6880 Set Rule Description to something that says no rule
2720	168	100	66			817	#$rule_descr = $rule if !defined $rule_descr \|\| $rule_descr eq '';
2721							$rule_descr = "No description available." if !defined $rule_descr \|\| $rule_descr eq '';
2722
2723							$self->_handle_hit($rule,
2724							$score,
2725							$area,
2726	168					5665	$params{ruletype},
2727							$rule_descr);
2728
2729	128					344	# take care of duplicate rules, too (bug 5206)
2730	128	50	33			356	my $dups = $conf_ref->{duplicate_rules}->{$rule};
	0					0
2731	0					0	if ($dups && @{$dups}) {
	0					0
2732	0					0	foreach my $dup (@{$dups}) {
2733							$self->got_hit($dup, $area, %params);
2734							}
2735							}
2736	140					645
	140					309
2737	128					1788	%{$self->{test_log_msgs}} = (); # clear test logs
2738							return 1;
2739							}
2740
2741							###########################################################################
2742
2743							# TODO: this needs API doc
2744	0			0	0	0	sub test_log {
2745	12					34	my ($self, $msg) = @_;
2746	12					63	local $1;
2747	12					70	while ($msg =~ s/^(.{30,48})\s//) {
2748							$self->_test_log_line ($1);
2749	0					0	}
2750							$self->_test_log_line ($msg);
2751							}
2752
2753	12			0		61	sub _test_log_line {
2754							my ($self, $msg) = @_;
2755	22					58
2756	22	50				60	$self->{test_log_msgs}->{TERSE} .= sprintf ("[%s]\n", $msg);
2757	60					114	if (length($msg) > 47) {
2758							$self->{test_log_msgs}->{LONG} .= sprintf ("%78s\n", "[$msg]");
2759	12					87	} else {
2760							$self->{test_log_msgs}->{LONG} .= sprintf ("%27s [%s]\n", "", $msg);
2761							}
2762							}
2763
2764							###########################################################################
2765
2766							# helper for get(). Do not call directly, as get() caches its results
2767							# and this does not!
2768	116			94	0	324	sub get_envelope_from {
2769							my ($self) = @_;
2770
2771							# bug 2142:
2772							# Get the SMTP MAIL FROM:, aka. the "envelope sender", if our
2773							# calling app has helpfully marked up the source message
2774							# with it. Various MTAs and calling apps each have their
2775							# own idea of what header to use for this! see
2776	106					244
2777							my $envf;
2778
2779							# Rely on the 'envelope-sender-header' header if the user has configured one.
2780							# Assume that because they have configured it, their MTA will always add it.
2781	106	50				450	# This will prevent us falling through and picking up inappropriate headers.
2782							if (defined $self->{conf}->{envelope_sender_header}) {
2783	12					40	# make sure we get the most recent copy - there can be only one EnvelopeSender.
2784							$envf = $self->get($self->{conf}->{envelope_sender_header}.":addr",undef);
2785	12	0	0			91	# ok if it contains an "@" sign, or is "" (ie. "<>" without the < and >)
			0
2786							goto ok if defined $envf && ($envf =~ /\@/ \|\| $envf eq '');
2787	0	0				0	# Warn them if it's configured, but not there or not usable.
2788	0					0	if (defined $envf) {
2789							chomp $envf;
2790	0					0	dbg("message: envelope_sender_header '%s: %s' is not an FQDN, ignoring",
2791							$self->{conf}->{envelope_sender_header}, $envf);
2792							} else {
2793	0					0	dbg("message: envelope_sender_header '%s' not found in message",
2794							$self->{conf}->{envelope_sender_header});
2795							}
2796	8					22	# Couldn't get envelope-sender using the configured header.
2797							return;
2798							}
2799
2800							# User hasn't given us a header to trust, so try to guess the sender.
2801
2802							# use the "envelope-sender" string found in the Received headers,
2803							# if possible... use the last untrusted header, in case there's
2804	102					330	# trusted headers.
2805	94	100				280	my $lasthop = $self->{relays_untrusted}->[0];
2806							if (!defined $lasthop) {
2807							# no untrusted headers? in that case, the message is ALL_TRUSTED.
2808	86					235	# use the first trusted header (ie. the oldest, originating one).
2809							$lasthop = $self->{relays_trusted}->[-1];
2810							}
2811	94	100				266
2812	33					98	if (defined $lasthop) {
2813							$envf = $lasthop->{envfrom};
2814							# TODO FIXME: Received.pm puts both null senders and absence-of-sender
2815	33	50	33			143	# into the relays array as '', so we can't distinguish them :(
2816	0					0	if ($envf && ($envf =~ /\@/)) {
2817							goto ok;
2818							}
2819							}
2820
2821							# WARNING: a lot of list software adds an X-Sender for the original env-from
2822							# (including Yahoo! Groups). Unfortunately, fetchmail will pick it up and
2823							# reuse it as the env-from for its delivery -- even though the list
2824							# software had used a different env-from in the intervening delivery. Hence,
2825							# if this header is present, and there's a fetchmail sig in the Received
2826							# lines, we cannot trust any Envelope-From headers, since they're likely to
2827							# be incorrect fetchmail guesses.
2828	94	50				429
2829	0					0	if ($self->get("X-Sender") =~ /\@/) {
2830	12	0				38	my $rcvd = join(' ', $self->get("Received"));
2831	12					47	if ($rcvd =~ /\(fetchmail/) {
2832	0					0	dbg("message: X-Sender and fetchmail signatures found, cannot trust envelope-from");
2833							return;
2834							}
2835							}
2836
2837	94	50				353	# procmailrc notes this (we now recommend adding it to Received instead)
2838							if ($envf = $self->get("X-Envelope-From")) {
2839							# heuristic: this could have been relayed via a list which then used
2840	12	0				28	# a new Envelope-from. check
2841	12					40	if ($self->get("ALL:raw") =~ /^Received:.*^X-Envelope-From:/smi) {
2842	12					45	dbg("message: X-Envelope-From header found after 1 or more Received lines, cannot trust envelope-from");
2843							return;
2844	0					0	} else {
2845							goto ok;
2846							}
2847							}
2848
2849	106	50				447	# qmail, new-inject(1)
2850							if ($envf = $self->get("Envelope-Sender")) {
2851							# heuristic: this could have been relayed via a list which then used
2852	12	0				80	# a new Envelope-from. check
2853	12					141	if ($self->get("ALL:raw") =~ /^Received:.*^Envelope-Sender:/smi) {
2854							dbg("message: Envelope-Sender header found after 1 or more Received lines, cannot trust envelope-from");
2855	12					65	} else {
2856							goto ok;
2857							}
2858							}
2859
2860							# Postfix, sendmail, amavisd-new, ...
2861							# RFC 2821 requires it:
2862							# When the delivery SMTP server makes the "final delivery" of a
2863							# message, it inserts a return-path line at the beginning of the mail
2864							# data. This use of return-path is required; mail systems MUST support
2865							# it. The return-path line preserves the information in the <reverse-
2866	106	100				345	# path> from the MAIL command.
2867							if ($envf = $self->get("Return-Path")) {
2868							# heuristic: this could have been relayed via a list which then used
2869	13	50				28	# a new Envelope-from. check
2870	12					49	if ($self->get("ALL:raw") =~ /^Received:.*^Return-Path:/smi) {
2871							dbg("message: Return-Path header found after 1 or more Received lines, cannot trust envelope-from");
2872	31					145	} else {
2873							goto ok;
2874							}
2875							}
2876
2877	113					311	# give up.
2878							return;
2879	13					81
2880							ok:
2881	13					89	$envf =~ s/^<*//s; # remove <
2882							$envf =~ s/>\s\z//s; # remove >, whitespace, newlines
2883	13					40
2884							return $envf;
2885							}
2886
2887							###########################################################################
2888
2889							# helper for get(ALL-*). get() caches its results, so don't call this
2890							# directly unless you need a range of headers not covered by the ALL-*
2891							# psuedo-headers!
2892
2893							# Get all the headers found between an index range of received headers, the
2894							# index doesn't care if we could parse the received headers or not.
2895							# Use undef for the $start_rcvd or $end_rcvd numbers to start/end with the
2896							# first/last header in the message, otherwise indicate the index number you
2897							# want to start/end at. Set $include_start_rcvd or $include_end_rcvd to 0 to
2898							# indicate you don't want to include the received header found at the start or
2899							# end indexes... basically toggles between [s,e], [s,e), (s,e], (s,e).
2900	12			0	0	69	sub get_all_hdrs_in_rcvd_index_range {
2901							my ($self, $start_rcvd, $end_rcvd, $include_start_rcvd, $include_end_rcvd) = @_;
2902
2903	0	0	0			0	# prevent bad input causing us to return the first header found
2904							return if (defined $end_rcvd && $end_rcvd < 0);
2905	0	0				0
2906	0	0				0	$include_start_rcvd = 1 unless defined $include_start_rcvd;
2907							$include_end_rcvd = 1 unless defined $include_end_rcvd;
2908	0					0
2909	0					0	my $cur_rcvd_index = -1; # none found yet
2910							my $result = '';
2911	0					0
2912	0	0				0	foreach my $hdr (split(/^/m, $self->{msg}->get_pristine_header())) {
2913	0					0	if ($hdr =~ /^Received:/i) {
2914	0	0	0			0	$cur_rcvd_index++;
			0
2915							next if (defined $start_rcvd && !$include_start_rcvd &&
2916	0	0	0			0	$start_rcvd == $cur_rcvd_index);
			0
2917							last if (defined $end_rcvd && !$include_end_rcvd &&
2918							$end_rcvd == $cur_rcvd_index);
2919	0	0	0			0	}
		0	0
			0
			0
2920							if ((!defined $start_rcvd \|\| $start_rcvd <= $cur_rcvd_index) &&
2921	0					0	(!defined $end_rcvd \|\| $cur_rcvd_index < $end_rcvd)) {
2922							$result .= $hdr."\n";
2923							}
2924	0					0	elsif (defined $end_rcvd && $cur_rcvd_index == $end_rcvd) {
2925	0					0	$result .= $hdr."\n";
2926							last;
2927							}
2928	0	0				0	}
2929							return ($result eq '' ? undef : $result);
2930							}
2931
2932							###########################################################################
2933	0			0	0	0
2934							sub sa_die { Mail::SpamAssassin::sa_die(@_); }
2935
2936							###########################################################################
2937
2938							=item $status->create_fulltext_tmpfile (fulltext_ref)
2939
2940							This function creates a temporary file containing the passed scalar
2941							reference data (typically the full/pristine text of the message).
2942							This is typically used by external programs like pyzor and dccproc, to
2943							avoid hangs due to buffering issues. Methods that need this, should
2944							call $self->create_fulltext_tmpfile($fulltext) to retrieve the temporary
2945							filename; it will be created if it has not already been.
2946
2947							Note: This can only be called once until $status->delete_fulltext_tmpfile() is
2948							called.
2949
2950							=cut
2951
2952	0			0	1	0	sub create_fulltext_tmpfile {
2953							my ($self, $fulltext) = @_;
2954	0	0				0
2955	0					0	if (defined $self->{fulltext_tmpfile}) {
2956							return $self->{fulltext_tmpfile};
2957							}
2958	0					0
2959	0	0				0	my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
2960							$tmpfh or die "failed to create a temporary file";
2961
2962							# PerlIO's buffered print writes in 8 kB chunks - which can be slow.
2963							# print $tmpfh $$fulltext or die "error writing to $tmpf: $!";
2964							#
2965							# reducing the number of writes and bypassing extra buffering in PerlIO
2966	0					0	# speeds up writing of larger text by a factor of 2
2967	0					0	my $nwrites;
2968	0					0	for (my $ofs = 0; $ofs < length($$fulltext); $ofs += $nwrites) {
2969	0	0				0	$nwrites = $tmpfh->syswrite($$fulltext, length($$fulltext)-$ofs, $ofs);
2970							defined $nwrites or die "error writing to $tmpf: $!";
2971	0	0				0	}
2972							close $tmpfh or die "error closing $tmpf: $!";
2973	0					0
2974							$self->{fulltext_tmpfile} = $tmpf;
2975	0					0
2976							dbg("check: create_fulltext_tmpfile, written %d bytes to file %s",
2977							length($$fulltext), $tmpf);
2978	0					0
2979							return $self->{fulltext_tmpfile};
2980							}
2981
2982							=item $status->delete_fulltext_tmpfile ()
2983
2984							Will cleanup after a $status->create_fulltext_tmpfile() call. Deletes the
2985							temporary file and uncaches the filename.
2986
2987							=cut
2988
2989	189			189	1	494	sub delete_fulltext_tmpfile {
2990	189	50				1534	my ($self) = @_;
2991	0	0				0	if (defined $self->{fulltext_tmpfile}) {
2992	0					0	if (!unlink $self->{fulltext_tmpfile}) {
2993							my $msg = sprintf("cannot unlink %s: %s", $self->{fulltext_tmpfile}, $!);
2994	0	0				0	# don't fuss too much if file is missing, perhaps it wasn't even created
	0					0
	0					0
2995							if ($! == ENOENT) { warn $msg } else { die $msg }
2996	0					0	}
2997							$self->{fulltext_tmpfile} = undef;
2998							}
2999							}
3000
3001							###########################################################################
3002
3003	233			233	0	432	sub all_from_addrs {
3004							my ($self) = @_;
3005	233	100				605
	154					226
	154					977
3006							if (exists $self->{all_from_addrs}) { return @{$self->{all_from_addrs}}; }
3007	79					129
3008							my @addrs;
3009
3010	79					261	# Resent- headers take priority, if present. see bug 672
3011	79	50	33			377	my $resent = $self->get('Resent-From',undef);
3012	0					0	if (defined $resent && $resent =~ /\S/) {
3013							@addrs = $self->{main}->find_all_addrs_in_line ($resent);
3014							}
3015							else {
3016							# bug 2292: Used to use find_all_addrs_in_line() with the same
3017							# headers, but the would catch addresses in comments which caused
3018							# FNs for things like whitelist_from. Since all of these are From
3019							# headers, there should only be 1 address in each anyway (not exactly
3020							# true, RFC 2822 allows multiple addresses in a From header field),
3021							# so use the :addr code...
3022							# bug 3366: some addresses come in as 'foo@bar...', which is invalid.
3023							# so deal with the multiple periods.
3024	79					281	## no critic
	39					154
	39					176
	395					832
3025							@addrs = map { tr/././s; $_ } grep { $_ ne '' }
3026							($self->get('From:addr'), # std
3027							$self->get('Envelope-Sender:addr'), # qmail: new-inject(1)
3028							$self->get('Resent-Sender:addr'), # procmailrc manpage
3029							$self->get('X-Envelope-From:addr'), # procmailrc manpage
3030							$self->get('EnvelopeFrom:addr')); # SMTP envelope
3031							# http://www.cs.tut.fi/~jkorpela/headers.html is useful here
3032							}
3033
3034	79					261	# Remove duplicate addresses
	39					171
3035	79					277	my %addrs = map { $_ => 1 } @addrs;
3036							@addrs = keys %addrs;
3037	79					477
3038	79					375	dbg("eval: all '*From' addrs: " . join(" ", @addrs));
3039	79					515	$self->{all_from_addrs} = \@addrs;
3040							return @addrs;
3041							}
3042
3043							=item all_from_addrs_domains
3044
3045							This function returns all the various from addresses in a message using all_from_addrs()
3046							and then returns only the domain names.
3047
3048							=cut
3049
3050	0			0	1	0	sub all_from_addrs_domains {
3051							my ($self) = @_;
3052	0	0				0
3053	0					0	if (exists $self->{all_from_addrs_domains}) {
	0					0
3054							return @{$self->{all_from_addrs_domains}};
3055							}
3056
3057							#TEST POINT - my @addrs = ("test.voipquotes2.net","test.voipquotes2.co.uk");
3058	0					0	#Start with all the normal from addrs
3059							my @addrs = all_from_addrs($self);
3060	0					0
3061							dbg("eval: all '*From' addrs domains (before): " . join(" ", @addrs));
3062
3063							#Take just the domain with a dummy localpart
3064	0					0	#removing invalid and duplicate domains
3065	0					0	my(%addrs_seen, @addrs_filtered);
3066	0					0	foreach my $a (@addrs) {
3067	0	0	0			0	my $domain = $self->{main}->{registryboundaries}->uri_to_domain($a);
3068	0					0	next if !defined $domain \|\| $addrs_seen{lc $domain}++;
3069							push(@addrs_filtered, 'dummy@'.$domain);
3070							}
3071	0					0
3072							dbg("eval: all '*From' addrs domains (after uri to domain): " .
3073							join(" ", @addrs_filtered));
3074	0					0
3075							$self->{all_from_addrs_domains} = \@addrs_filtered;
3076	0					0
3077							return @addrs_filtered;
3078							}
3079
3080	156			156	0	337	sub all_to_addrs {
3081							my ($self) = @_;
3082	156	100				531
	77					155
	77					1283
3083							if (exists $self->{all_to_addrs}) { return @{$self->{all_to_addrs}}; }
3084	79					151
3085							my @addrs;
3086
3087	79					277	# Resent- headers take priority, if present. see bug 672
3088	79	50				290	my $resent = join('', $self->get('Resent-To'), $self->get('Resent-Cc'));
3089	0					0	if ($resent =~ /\S/) {
3090							@addrs = $self->{main}->find_all_addrs_in_line($resent);
3091							} else {
3092							# OK, a fetchmail trick: try to find the recipient address from
3093							# the most recent 3 Received lines. This is required for sendmail,
3094							# since it does not add a helpful header like exim, qmail
3095							# or Postfix do.
3096	79					244	#
3097	79					275	my $rcvd = $self->get('Received');
3098	79					295	$rcvd =~ s/\n[ \t]+/ /gs;
3099							$rcvd =~ s/\n+/\n/gs;
3100	79					366
	79					173
3101	79					131	my @rcvdlines = split(/\n/, $rcvd, 4); pop @rcvdlines; # forget last one
3102	79					262	my @rcvdaddrs;
3103	43	100				227	foreach my $line (@rcvdlines) {
	2					7
3104							if ($line =~ / for (\S+\@\S+);/) { push (@rcvdaddrs, $1); }
3105							}
3106
3107	79					398	@addrs = $self->{main}->find_all_addrs_in_line (
3108							join('',
3109							join(" ", @rcvdaddrs)."\n",
3110							$self->get('To'), # std
3111							$self->get('Apparently-To'), # sendmail, from envelope
3112							$self->get('Delivered-To'), # Postfix, poss qmail
3113							$self->get('Envelope-Recipients'), # qmail: new-inject(1)
3114							$self->get('Apparently-Resent-To'), # procmailrc manpage
3115							$self->get('X-Envelope-To'), # procmailrc manpage
3116							$self->get('Envelope-To'), # exim
3117							$self->get('X-Delivered-To'), # procmail quick start
3118							$self->get('X-Original-To'), # procmail quick start
3119							$self->get('X-Rcpt-To'), # procmail quick start
3120							$self->get('X-Real-To'), # procmail quick start
3121							$self->get('Cc'))); # std
3122							# those are taken from various sources; thanks to Nancy McGough, who
3123							# noted some in <http://www.ii.com/internet/robots/procmail/qs/#envelope>
3124							}
3125	79					508
3126	79					267	dbg("eval: all '*To' addrs: " . join(" ", @addrs));
3127	79					1781	$self->{all_to_addrs} = \@addrs;
3128							return @addrs;
3129
3130							# http://www.cs.tut.fi/~jkorpela/headers.html is useful here, also
3131							# http://www.exim.org/pipermail/exim-users/Week-of-Mon-20001009/021672.html
3132							}
3133
3134							###########################################################################
3135
3136							1;
3137							__END__
3138
3139							=back
3140
3141							=head1 SEE ALSO
3142
3143							C<Mail::SpamAssassin>
3144							C<spamassassin>
3145