File Coverage

blib/lib/Mail/SpamAssassin/Message/Node.pm

Criterion	Covered	Total	%
statement	263	420	62.6
branch	97	212	45.7
condition	40	85	47.0
subroutine	25	30	83.3
pod	16	18	88.8
total	441	765	57.6

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts
21
22							=head1 DESCRIPTION
23
24							This module will encapsulate an email message and allow access to
25							the various MIME message parts.
26
27							=head1 PUBLIC METHODS
28
29							=over 4
30
31							=cut
32
33							package Mail::SpamAssassin::Message::Node;
34
35	40			40		286	use strict;
	40					90
	40					1435
36	40			40		241	use warnings;
	40					95
	40					1337
37	40			40		261	use re 'taint';
	40					125
	40					1669
38
39							require 5.008001; # needs utf8::is_utf8()
40
41	40			40		250	use Mail::SpamAssassin;
	40					95
	40					1235
42	40			40		285	use Mail::SpamAssassin::Constants qw(:sa);
	40					80
	40					6421
43	40			40		12762	use Mail::SpamAssassin::HTML;
	40					130
	40					1544
44	40			40		274	use Mail::SpamAssassin::Logger;
	40					81
	40					5604
45
46							our($enc_utf8, $enc_w1252, $have_encode_detector);
47							BEGIN {
48	40					23905	eval { require Encode }
49	40	50		40		157	and do { $enc_utf8 = Encode::find_encoding('UTF-8');
	40					383246
50	40					6379	$enc_w1252 = Encode::find_encoding('Windows-1252') };
51	40					213008	eval { require Encode::Detect::Detector }
52	40	50				15296	and do { $have_encode_detector = 1 };
	0					0
53							};
54
55							=item new()
56
57							Generates an empty Node object and returns it. Typically only called
58							by functions in Message.
59
60							=cut
61
62							sub new {
63	193			193	1	536	my $class = shift;
64	193		33			908	$class = ref($class) \|\| $class;
65
66	193					1249	my $self = {
67							headers => {},
68							raw_headers => {},
69							header_order => []
70							};
71
72							# deal with any parameters
73	193					544	my($opts) = @_;
74	193		50			1053	$self->{normalize} = $opts->{'normalize'} \|\| 0;
75
76	193					595	bless($self,$class);
77	193					630	$self;
78							}
79
80							=item find_parts()
81
82							Used to search the tree for specific MIME parts. An array of matching
83							Node objects (pointers into the tree) is returned. The parameters that
84							can be passed in are (in order, all scalars):
85
86							Regexp - Used to match against each part's Content-Type header,
87							specifically the type and not the rest of the header. ie: "Content-type:
88							text/html; encoding=quoted-printable" has a type of "text/html". If no
89							regexp is specified, find_parts() will return an empty array.
90
91							Only_leaves - By default, find_parts() will return any part that matches
92							the regexp, including multipart. If you only want to see leaves of the
93							tree (ie: parts that aren't multipart), set this to true (1).
94
95							Recursive - By default, when find_parts() finds a multipart which has
96							parts underneath it, it will recurse through all sub-children. If set to 0,
97							only look at the part and any direct children of the part.
98
99							=cut
100
101							# Used to find any MIME parts whose simple content-type matches a given regexp
102							# Searches it's own and any children parts. Returns an array of MIME
103							# objects which match. Our callers may expect the default behavior which is a
104							# depth-first array of parts.
105							#
106							sub find_parts {
107	255			255	1	729	my ($self, $re, $onlyleaves, $recursive) = @_;
108
109							# Didn't pass an RE? Just abort.
110	255	50	33			1544	return () unless defined $re && $re ne '';
111
112	255	50				761	$onlyleaves = 0 unless defined $onlyleaves;
113
114	255					432	my $depth;
115	255	50	33			733	if (defined $recursive && $recursive == 0) {
116	0					0	$depth = 1;
117							}
118
119	255					426	my @ret;
120	255					577	my @search = ( $self );
121
122	255					773	while (my $part = shift @search) {
123							# If this object matches, mark it for return.
124	287					1010	my $amialeaf = $part->is_leaf();
125
126	287	100	66			3148	if ( $part->{'type'} =~ /$re/ && (!$onlyleaves \|\| $amialeaf) ) {
			100
127	269					685	push(@ret, $part);
128							}
129
130	287	50	33			1209	if ( !$amialeaf && (!defined $depth \|\| $depth > 0)) {
			66
131	18	50				34	$depth-- if defined $depth;
132	18					29	unshift(@search, @{$part->{'body_parts'}});
	18					64
133							}
134							}
135
136	255					978	return @ret;
137							}
138
139							=item header()
140
141							Stores and retrieves headers from a specific MIME part. The first
142							parameter is the header name. If there is no other parameter, the header
143							is retrieved. If there is a second parameter, the header is stored.
144
145							Header names are case-insensitive and are stored in both raw and
146							decoded form. Using header(), only the decoded form is retrievable.
147
148							For retrieval, if header() is called in an array context, an array will
149							be returned with each header entry in a different element. In a scalar
150							context, the last specific header is returned.
151
152							ie: If 'Subject' is specified as the header, and there are 2 Subject
153							headers in a message, the last/bottom one in the message is returned in
154							scalar context or both are returned in array context.
155
156							=cut
157
158							# Store or retrieve headers from a given MIME object
159							#
160							sub header {
161	5970			5970	1	8455	my $self = shift;
162	5970					9585	my $rawkey = shift;
163
164	5970	50				13904	return unless defined $rawkey;
165
166							# we're going to do things case insensitively
167	5970					12288	my $key = lc($rawkey);
168
169							# Trim whitespace off of the header keys
170	5970					13844	$key =~ s/^\s+//;
171	5970					11090	$key =~ s/\s+$//;
172
173	5970	100				11531	if (@_) {
174	898					1657	my $raw_value = shift;
175	898	50				2127	return unless defined $raw_value;
176
177	898					1336	push @{ $self->{'header_order'} }, $rawkey;
	898					2422
178	898	100				2601	if ( !exists $self->{'headers'}->{$key} ) {
179	824					2632	$self->{'headers'}->{$key} = [];
180	824					1975	$self->{'raw_headers'}->{$key} = [];
181							}
182
183	898					1836	my $dec_value = $raw_value;
184	898					2646	$dec_value =~ s/\n[ \t]+/ /gs;
185	898					4431	$dec_value =~ s/\s+$//s;
186	898					2832	$dec_value =~ s/^\s+//s;
187	898					1392	push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key);
	898					2927
188
189	898					1593	push @{ $self->{'raw_headers'}->{$key} }, $raw_value;
	898					2461
190
191	898					4727	return $self->{'headers'}->{$key}->[-1];
192							}
193
194	5072	100				8944	if (wantarray) {
195	4763	100				19029	return unless exists $self->{'headers'}->{$key};
196	1007					1522	return @{ $self->{'headers'}->{$key} };
	1007					6202
197							}
198							else {
199	309	100				2500	return '' unless exists $self->{'headers'}->{$key};
200	46					363	return $self->{'headers'}->{$key}->[-1];
201							}
202							}
203
204							=item raw_header()
205
206							Retrieves the raw version of headers from a specific MIME part. The only
207							parameter is the header name. Header names are case-insensitive.
208
209							For retrieval, if raw_header() is called in an array context, an array
210							will be returned with each header entry in a different element. In a
211							scalar context, the last specific header is returned.
212
213							ie: If 'Subject' is specified as the header, and there are 2 Subject
214							headers in a message, the last/bottom one in the message is returned in
215							scalar context or both are returned in array context.
216
217							=cut
218
219							# Retrieve raw headers from a given MIME object
220							#
221							sub raw_header {
222	493			493	1	859	my $self = shift;
223	493					1029	my $key = lc(shift);
224
225							# Trim whitespace off of the header keys
226	493					1327	$key =~ s/^\s+//;
227	493					1076	$key =~ s/\s+$//;
228
229	493	50				985	if (wantarray) {
230	493	100				2042	return unless exists $self->{'raw_headers'}->{$key};
231	128					253	return @{ $self->{'raw_headers'}->{$key} };
	128					893
232							}
233							else {
234	0	0				0	return '' unless exists $self->{'raw_headers'}->{$key};
235	0					0	return $self->{'raw_headers'}->{$key}->[-1];
236							}
237							}
238
239							=item add_body_part()
240
241							Adds a Node child object to the current node object.
242
243							=cut
244
245							# Add a MIME child part to ourselves
246							sub add_body_part {
247	48			48	1	100	my($self, $part) = @_;
248
249	48					178	dbg("message: added part, type: ".$part->{'type'});
250	48					81	push @{ $self->{'body_parts'} }, $part;
	48					132
251							}
252
253							=item is_leaf()
254
255							Returns true if the tree node in question is a leaf of the tree (ie:
256							has no children of its own). Note: This function may return odd results
257							unless the message has been mime parsed via _do_parse()!
258
259							=cut
260
261							sub is_leaf {
262	287			287	1	744	my($self) = @_;
263	287					903	return !exists $self->{'body_parts'};
264							}
265
266							=item raw()
267
268							Return a reference to the the raw array. Treat this as READ ONLY.
269
270							=cut
271
272							sub raw {
273	0			0	1	0	my $self = shift;
274
275							# Ok, if we're called we are expected to return an array.
276							# so if it's a file reference, read in the message into an array...
277							#
278							# NOTE: that "ref undef" works, so don't bother checking for a defined var
279							# first.
280	0	0				0	if (ref $self->{'raw'} eq 'GLOB') {
281	0					0	my $fd = $self->{'raw'};
282	0	0				0	seek($fd, 0, 0) or die "message: cannot rewind file: $!";
283
284							# dbg("message: (raw) reading mime part from a temporary file");
285	0					0	my($nread,$raw_str); $raw_str = '';
	0					0
286	0					0	while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { }
287	0	0				0	defined $nread or die "error reading: $!";
288	0					0	my @array = split(/^/m, $raw_str, -1);
289
290	0	0				0	dbg("message: empty message read") if $raw_str eq '';
291	0					0	return \@array;
292							}
293
294	0					0	return $self->{'raw'};
295							}
296
297							=item decode()
298
299							If necessary, decode the part text as base64 or quoted-printable.
300							The decoded text will be returned as a scalar string. An optional length
301							parameter can be passed in which limits how much decoded data is returned.
302							If the scalar isn't needed, call with "0" as a parameter.
303
304							=cut
305
306							sub decode {
307	252			252	1	3385	my($self, $bytes) = @_;
308
309	252	100				741	if ( !exists $self->{'decoded'} ) {
310							# Someone is looking for a decoded part where there is no raw data
311							# (multipart or subparsed message, etc.) Just return undef.
312	144	50				564	return if !exists $self->{'raw'};
313
314	144					264	my $raw;
315
316							# if the part is held in a temp file, read it into the scalar
317	144	100				624	if (ref $self->{'raw'} eq 'GLOB') {
318	4					9	my $fd = $self->{'raw'};
319	4	50				43	seek($fd, 0, 0) or die "message: cannot rewind file: $!";
320
321							# dbg("message: (decode) reading mime part from a temporary file");
322	4					10	my($nread,$raw_str); $raw = '';
	4					9
323	4					210	while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { }
324	4	50				18	defined $nread or die "error reading: $!";
325
326	4	50				18	dbg("message: empty message read from a temp file") if $raw eq '';
327							}
328							else {
329							# create a new scalar from the raw array in memory
330	140					306	$raw = join('', @{$self->{'raw'}});
	140					1484
331							}
332
333	144		100			658	my $encoding = lc $self->header('content-transfer-encoding') \|\| '';
334
335	144	100				753	if ( $encoding eq 'quoted-printable' ) {
		100
336	6					30	dbg("message: decoding quoted-printable");
337	6					40	$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
338	6					39	$self->{'decoded'} =~ s/\015\012/\012/gs;
339							}
340							elsif ( $encoding eq 'base64' ) {
341	5					21	dbg("message: decoding base64");
342
343							# if it's not defined or is 0, do the whole thing, otherwise only decode
344							# a portion
345	5	50				10	if ($bytes) {
346	0					0	return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
347							}
348							else {
349							# Generate the decoded output
350	5					20	$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
351							}
352
353	5	100				32	if ( $self->{'type'} =~ m@^(?:text\|message)\b/@i ) {
354	1					35	$self->{'decoded'} =~ s/\015\012/\012/gs;
355							}
356							}
357							else {
358							# Encoding is one of 7bit, 8bit, binary or x-something
359	133	100				370	if ( $encoding ) {
360	31					176	dbg("message: decoding other encoding type ($encoding), ignoring");
361							}
362							else {
363	102					371	dbg("message: no encoding detected");
364							}
365	133					838	$self->{'decoded'} = $raw;
366							}
367							}
368
369	252	100	66			1442	if ( !defined $bytes \|\| $bytes ) {
370	248	50				747	if ( !defined $bytes ) {
371							# force a copy
372	248					2470	return '' . $self->{'decoded'};
373							}
374							else {
375	0					0	return substr($self->{'decoded'}, 0, $bytes);
376							}
377							}
378							}
379
380							# Detect endianness of UTF-16 encoded data
381							sub detect_utf16 {
382	0			0	0	0	my $data = $_[0]; # could not avoid copying large strings
383	0					0	my $utf16le_clues = 0;
384	0					0	my $utf16be_clues = 0;
385	0					0	my $sum_h_e = 0;
386	0					0	my $sum_h_o = 0;
387	0					0	my $sum_l_e = 0;
388	0					0	my $sum_l_o = 0;
389	0					0	my $decoder = undef;
390
391	0					0	my @msg_h = unpack 'H' x length( $data ), $data;
392	0					0	my @msg_l = unpack 'h' x length( $data ), $data;
393
394	0					0	for( my $i = 0; $i < length( $data ); $i+=2 ) {
395	0					0	my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
396	0					0	$sum_h_e += hex $msg_h[$i];
397	0					0	$sum_h_o += hex $msg_h[$i+1];
398	0					0	$sum_l_e += hex $msg_l[$i];
399	0					0	$sum_l_o += hex $msg_l[$i+1];
400	0	0				0	if( $check_char =~ /20 00/ ) {
401							# UTF-16LE space char detected
402	0					0	$utf16le_clues++;
403							}
404	0	0				0	if( $check_char =~ /00 20/ ) {
405							# UTF-16BE space char detected
406	0					0	$utf16be_clues++;
407							}
408							}
409
410							# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
411	0	0				0	$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
412
413							# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
414	0	0				0	$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
415
416	0	0				0	if( $utf16le_clues > $utf16be_clues ) {
		0
417	0					0	dbg( "message: detect_utf16: UTF-16LE" );
418	0					0	$decoder = Encode::find_encoding("UTF-16LE");
419							} elsif( $utf16le_clues > $utf16be_clues ) {
420	0					0	dbg( "message: detect_utf16: UTF-16BE" );
421	0					0	$decoder = Encode::find_encoding("UTF-16BE");
422							} else {
423	0					0	dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
424							}
425
426	0					0	return $decoder;
427							}
428
429							# Look at a text scalar and determine whether it should be rendered
430							# as text/html.
431							#
432							# This is not a public function.
433							#
434							sub _html_render {
435	0	0		0		0	if ($_[0] =~ m/^(.{0,18}?<(?:body\|head\|html\|img\|pre\|table\|title)(?:\s.{0,18}?)?>)/is)
436							{
437	0					0	my $pad = $1;
438	0					0	my $count = 0;
439	0					0	$count += ($pad =~ tr/\n//d) * 2;
440	0					0	$count += ($pad =~ tr/\n//cd);
441	0					0	return ($count < 24);
442							}
443	0					0	return 0;
444							}
445
446							# Decode character set of a given text to perl characters (Unicode),
447							# then encode into UTF-8 octets if requested.
448							#
449							sub _normalize {
450							# my $data = $_[0]; # avoid copying large strings
451	10			10		23	my $charset_declared = $_[1];
452	10					20	my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets
453
454	10	50				38	warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
455							if utf8::is_utf8($_[0]);
456
457							# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
458	10					29	my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
459
460	10	50	33			128	if (!defined $charset_declared \|\| $charset_declared eq '') {
461	0					0	$charset_declared = 'us-ascii';
462							}
463
464							# number of characters with code above 127
465	10					31	my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
466
467	10	50	33			95	if (!$cnt_8bits &&
468							$charset_declared =~
469							/^(?: (?:US-)?ASCII \| ANSI[_ ]? X3\.4- (?:1986\|1968) \|
470							ISO646-US )\z/xsi)
471							{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
472	0					0	dbg("message: kept, charset is US-ASCII as declared");
473	0					0	return $_[0]; # is all-ASCII, no need for decoding
474							}
475
476	10	100	66			93	if (!$cnt_8bits &&
477							$charset_declared =~
478							/^(?: ISO[ -]?8859 (?: - \d{1,2} )? \| Windows-\d{4} \|
479							UTF-?8 \| (KOI8\|EUC)-[A-Z]{1,2} \|
480							Big5 \| GBK \| GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
481							{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
482	8					26	dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared);
483	8					71	return $_[0]; # is all-ASCII, no need for decoding
484							}
485
486							# Try first to strictly decode based on a declared character set.
487
488	2					5	my $rv;
489	2	50	33			20	if ($charset_declared =~ /^UTF-?8\z/i) {
		50
		50
		50
490							# attempt decoding as strict UTF-8 (flags: FB_CROAK \| LEAVE_SRC)
491	0	0				0	if (eval { $rv = $enc_utf8->decode($_[0], 1\|8); defined $rv }) {
	0					0
	0					0
492	0					0	dbg("message: decoded as declared charset UTF-8");
493	0	0				0	return $_[0] if !$return_decoded;
494	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
495	0					0	return $rv; # decoded
496							} else {
497	0					0	my $err = '';
498	0	0				0	if ($@) {
499	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
	0					0
	0					0
500	0					0	$err = " ($err)";
501							}
502	0					0	dbg("message: failed decoding as declared charset UTF-8 ($err)");
503							}
504
505							} elsif ($charset_declared =~ /^UTF[ -]?16/i) {
506							# Handle cases where spammers use UTF-16 encoding without including a BOM
507							# or declaring endianness as reported at:
508							# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
509
510	0					0	my $decoder = detect_utf16( $_[0] );
511	0	0				0	if (eval { $rv = $decoder->decode($_[0], 1\|8); defined $rv }) {
	0					0
	0					0
512	0					0	dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name);
513	0	0				0	return $_[0] if !$return_decoded;
514	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
515	0					0	return $rv; # decoded
516							} else {
517	0					0	my $err = '';
518	0	0				0	if ($@) {
519	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
	0					0
	0					0
520	0					0	$err = " ($err)";
521							}
522	0					0	dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err);
523							};
524
525							} elsif ($cnt_8bits &&
526	0					0	eval { $rv = $enc_utf8->decode($_[0], 1\|8); defined $rv }) {
	0					0
527	0					0	dbg("message: decoded as charset UTF-8, declared %s", $charset_declared);
528	0	0				0	return $_[0] if !$return_decoded;
529	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
530	0					0	return $rv; # decoded
531
532							} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) {
533							# declared as US-ASCII but contains 8-bit characters, makes no sense
534							# to attempt decoding first as strict US-ASCII as we know it would fail
535
536							} else {
537							# try decoding as a declared character set
538
539							# -> http://en.wikipedia.org/wiki/Windows-1252
540							# Windows-1252 character encoding is a superset of ISO 8859-1, but differs
541							# from the IANA's ISO-8859-1 by using displayable characters rather than
542							# control characters in the 80 to 9F (hex) range. [...]
543							# It is very common to mislabel Windows-1252 text with the charset label
544							# ISO-8859-1. A common result was that all the quotes and apostrophes
545							# (produced by "smart quotes" in word-processing software) were replaced
546							# with question marks or boxes on non-Windows operating systems, making
547							# text difficult to read. Most modern web browsers and e-mail clients
548							# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
549							# such mislabeling. This is now standard behavior in the draft HTML 5
550							# specification, which requires that documents advertised as ISO-8859-1
551							# actually be parsed with the Windows-1252 encoding.
552							#
553	2					4	my($chset, $decoder);
554	2	50				15	if ($charset_declared =~ /^(?: ISO-?8859-1 \| Windows-1252 \| CP1252 )\z/xi) {
555	0					0	$chset = 'Windows-1252'; $decoder = $enc_w1252;
	0					0
556							} else {
557	2					7	$chset = $charset_declared; $decoder = Encode::find_encoding($chset);
	2					14
558	2	50	33			8317	if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
559	0					0	$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
560	0	0				0	dbg("message: no decoder for a declared charset %s, using GBK",
561							$chset) if $decoder;
562							}
563							}
564	2	50				7	if (!$decoder) {
565	0					0	dbg("message: failed decoding, no decoder for a declared charset %s",
566							$chset);
567							} else {
568	2					5	my $err = '';
569	2					3	eval { $rv = $decoder->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	2					9
570	2	50				232	if ($@) {
571	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
	0					0
	0					0
572	0					0	$err = " ($err)";
573							}
574	2	50				11	if (lc $chset eq lc $charset_declared) {
575	2	50				12	dbg("message: %s as declared charset %s%s",
576							defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err);
577							} else {
578	0	0				0	dbg("message: %s as charset %s, declared %s%s",
579							defined $rv ? 'decoded' : 'failed decoding',
580							$chset, $charset_declared, $err);
581							}
582							}
583							}
584
585							# If the above failed, check if it is US-ASCII, possibly extended by few
586							# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
587							# some popular punctuation or special characters from Windows-1252 in
588							# the \x80-\x9F range (which is unassigned in ISO-8859-*).
589							# Note that Windows-1252 is a proper superset of ISO-8859-1.
590							#
591	2	50	33			13	if (!defined $rv && !$cnt_8bits) {
		50	33
			0
592	0					0	dbg("message: kept, guessed charset is US-ASCII, declared %s",
593							$charset_declared);
594	0					0	return $_[0]; # is all-ASCII, no need for decoding
595
596							} elsif (!defined $rv && $enc_w1252 &&
597							# ASCII NBSP (c) SHY ' " ... '".- TM
598							#$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
599							# Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection,
600							# Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!)
601							$_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c)
602							{ # ASCII + NBSP + SHY + some punctuation characters
603							# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
604							# consider also: AE (r), 80 Euro
605	0					0	my $err = '';
606	0					0	eval { $rv = $enc_w1252->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	0					0
607	0	0				0	if ($@) {
608	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
	0					0
	0					0
609	0					0	$err = " ($err)";
610							}
611							# the above can't fail, but keep code general just in case
612	0	0				0	dbg("message: %s as guessed charset %s, declared %s%s",
613							defined $rv ? 'decoded' : 'failed decoding',
614							'Windows-1252', $charset_declared, $err);
615							}
616
617							# If we were unsuccessful so far, try some guesswork
618							# based on Encode::Detect::Detector .
619
620	2	50				6	if (defined $rv) {
		0
621							# done, no need for guesswork
622							} elsif (!$have_encode_detector) {
623	0					0	dbg("message: Encode::Detect::Detector not available, declared %s failed",
624							$charset_declared);
625							} else {
626	0					0	my $charset_detected = Encode::Detect::Detector::detect($_[0]);
627	0	0	0			0	if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
628	0					0	my $decoder = Encode::find_encoding($charset_detected);
629	0	0	0			0	if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
630	0					0	$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
631	0	0				0	dbg("message: no decoder for a detected charset %s, using GBK",
632							$charset_detected) if $decoder;
633							}
634	0	0				0	if (!$decoder) {
635	0					0	dbg("message: failed decoding, no decoder for a detected charset %s",
636							$charset_detected);
637							} else {
638	0					0	my $err = '';
639	0					0	eval { $rv = $decoder->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	0					0
640	0	0				0	if ($@) {
641	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
	0					0
	0					0
642	0					0	$err = " ($err)";
643							}
644	0	0				0	dbg("message: %s as detected charset %s, declared %s%s",
645							defined $rv ? 'decoded' : 'failed decoding',
646							$charset_detected, $charset_declared, $err);
647							}
648							}
649							}
650
651	2	50				7	if (!defined $rv) { # all decoding attempts failed so far, probably garbage
652							# go for Windows-1252 which can't fail
653	0					0	my $err = '';
654	0					0	eval { $rv = $enc_w1252->decode($_[0]) };
	0					0
655	0	0				0	if ($@) {
656	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
	0					0
	0					0
657	0					0	$err = " ($err)";
658							}
659	0	0				0	dbg("message: %s as last-resort charset %s, declared %s%s",
660							defined $rv ? 'decoded' : 'failed decoding',
661							'Windows-1252', $charset_declared, $err);
662							}
663
664	2	50				5	if (!defined $rv) { # just in case - all decoding attempts failed so far
665	0					0	return $_[0]; # garbage-in / garbage-out, return unchanged octets
666							}
667							# decoding octets to characters was successful
668	2	50				6	if (!$return_decoded) {
669							# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
670	2					6	utf8::encode($rv); # encode Unicode characters to UTF-8 octets
671							}
672	2					5	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
673	2					15	return $rv;
674							}
675
676							=item rendered()
677
678							render_text() takes the given text/* type MIME part, and attempts to
679							render it into a text scalar. It will always render text/html, and will
680							use a heuristic to determine if other text/* parts should be considered
681							text/html. Two scalars are returned: the rendered type (either text/html
682							or whatever the original type was), and the rendered text.
683
684							=cut
685
686							sub rendered {
687	141			141	1	387	my ($self) = @_;
688
689	141	100				550	if (!exists $self->{rendered}) {
690							# We only know how to render text/plain and text/html ...
691							# Note: for bug 4843, make sure to skip text/calendar parts
692							# we also want to skip things like text/x-vcard
693							# text/x-aol is ignored here, but looks like text/html ...
694	125	50				1254	return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain\|html)$/i );
695
696	125					718	my $text = $self->decode; # QP and Base64 decoding, bytes
697	125					537	my $text_len = length($text); # num of bytes in original charset encoding
698
699							# render text/html always
700	125	100	100			1479	if ($text ne '' && $self->{'type'} =~ m{^text/html$}i)
701							{
702	7					48	$self->{rendered_type} = 'text/html';
703
704							# will input text to HTML::Parser be provided as Unicode characters?
705	7					19	my $character_semantics = 0; # $text is in bytes
706	7	50	33			113	if ($self->{normalize} && $enc_utf8) { # charset decoding requested
		100	100
707							# Provide input to HTML::Parser as Unicode characters
708							# which avoids a HTML::Parser bug in utf8_mode
709							# https://rt.cpan.org/Public/Bug/Display.html?id=99755
710							# Note: the above bug was fixed in HTML-Parser 3.72, January 2016.
711							# Avoid unnecessary step of encoding-then-decoding by telling
712							# subroutine _normalize() to return Unicode text. See Bug 7133
713							#
714	0					0	$character_semantics = 1; # $text will be in characters
715	0					0	$text = _normalize($text, $self->{charset}, 1); # bytes to chars
716							} elsif (!defined $self->{charset} \|\|
717							$self->{charset} =~ /^(?:US-ASCII\|UTF-8)\z/i) {
718							# With some luck input can be interpreted as UTF-8, do not warn.
719							# It is still possible to hit the HTML::Parses utf8_mode bug however.
720							} else {
721							dbg("message: 'normalize_charset' is off, encoding will likely ".
722	1					11	"be misinterpreted; declared charset: %s", $self->{charset});
723							}
724							# the 0 requires decoded HTML results to be in bytes (not characters)
725	7					135	my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
726
727	7					63	$html->parse($text); # parse+render text
728
729							# resulting HTML-decoded text is in bytes, likely encoded as UTF-8
730	7					38	$self->{rendered} = $html->get_rendered_text();
731	7					40	$self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
732	7					31	$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
733	7					42	$self->{html_results} = $html->get_results();
734
735							# end-of-document result values that require looking at the text
736	7					25	my $r = $self->{html_results}; # temporary reference for brevity
737
738							# count the number of spaces in the rendered text (likely UTF-8 octets)
739	7					45	my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//;
740							# we may want to add the count of other Unicode whitespace characters
741
742	7					27	$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8)
743	7					31	$r->{non_space_len} = $r->{html_length} - $space;
744	7	50				200	$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len;
745							}
746
747							else { # plain text
748	118	0	33			529	if ($self->{normalize} && $enc_utf8) {
749							# request transcoded result as UTF-8 octets!
750	0					0	$text = _normalize($text, $self->{charset}, 0);
751							}
752	118					510	$self->{rendered_type} = $self->{type};
753	118					615	$self->{rendered} = $self->{'visible_rendered'} = $text;
754	118					543	$self->{'invisible_rendered'} = '';
755							}
756							}
757
758	141					711	return ($self->{rendered_type}, $self->{rendered});
759							}
760
761							=item set_rendered($text, $type)
762
763							Set the rendered text and type for the given part. If type is not
764							specified, and text is a defined value, a default of 'text/plain' is used.
765							This can be used, for instance, to render non-text parts using plugins.
766
767							=cut
768
769							sub set_rendered {
770	0			0	1	0	my ($self, $text, $type) = @_;
771
772	0	0	0			0	$type = 'text/plain' if (!defined $type && defined $text);
773
774	0					0	$self->{'rendered_type'} = $type;
775	0					0	$self->{'rendered'} = $self->{'visible_rendered'} = $text;
776	0	0				0	$self->{'invisible_rendered'} = defined $text ? '' : undef;
777							}
778
779							=item visible_rendered()
780
781							Render and return the visible text in this part.
782
783							=cut
784
785							sub visible_rendered {
786	8			8	1	27	my ($self) = @_;
787	8					68	$self->rendered(); # ignore return, we want just this:
788	8					42	return ($self->{rendered_type}, $self->{visible_rendered});
789							}
790
791							=item invisible_rendered()
792
793							Render and return the invisible text in this part.
794
795							=cut
796
797							sub invisible_rendered {
798	8			8	1	25	my ($self) = @_;
799	8					31	$self->rendered(); # ignore return, we want just this:
800	8					29	return ($self->{rendered_type}, $self->{invisible_rendered});
801							}
802
803							=item content_summary()
804
805							Returns an array of scalars describing the mime parts of the message.
806							Note: This function requires that the message be parsed first!
807
808							=cut
809
810							# return an array with scalars describing mime parts
811							sub content_summary {
812	13			13	1	231	my($self) = @_;
813
814	13					53	my @ret = ( [ $self->{'type'} ] );
815	13					21	my @search;
816
817	13	100				32	if (exists $self->{'body_parts'}) {
818	11					15	my $count = @{$self->{'body_parts'}};
	11					21
819	11					34	for(my $i=0; $i<$count; $i++) {
820	24					73	push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]);
821							}
822							}
823
824	13					32	while(my $part = shift @search) {
825	45					53	my($index, $part) = @{$part};
	45					81
826	45					63	push(@{$ret[$index]}, $part->{'type'});
	45					102
827	45	100				147	if (exists $part->{'body_parts'}) {
828	11					16	unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}});
	21					57
	11					19
829							}
830							}
831
832	13					27	return map { join(",", @{$_}) } @ret;
	37					45
	37					183
833							}
834
835							=item delete_header()
836
837							Delete the specified header (decoded and raw) from the Node information.
838
839							=cut
840
841							sub delete_header {
842	505			505	1	1125	my($self, $hdr) = @_;
843
844	505					719	foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) {
	505					7514
845	0					0	delete $self->{'headers'}->{$_};
846	0					0	delete $self->{'raw_headers'}->{$_};
847							}
848
849	505					1217	my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}});
	505					5841
850	505					1971	$self->{'header_order'} = \@neworder;
851							}
852
853							# decode a header appropriately. don't bother adding it to the pod documents.
854							sub __decode_header {
855	10			10		50	my ( $encoding, $cte, $data ) = @_;
856
857	10	100				51	if ( $cte eq 'B' ) {
		50
858							# base 64 encoded
859	2					11	$data = Mail::SpamAssassin::Util::base64_decode($data);
860							}
861							elsif ( $cte eq 'Q' ) {
862							# quoted printable
863
864							# the RFC states that in the encoded text, "_" is equal to "=20"
865	8					20	$data =~ s/_/=20/g;
866
867	8					22	$data = Mail::SpamAssassin::Util::qp_decode($data);
868							}
869							else {
870							# not possible since the input has already been limited to 'B' and 'Q'
871	0					0	die "message: unknown encoding type '$cte' in RFC2047 header";
872							}
873	10					40	return _normalize($data, $encoding, 0); # transcode to UTF-8 octets
874							}
875
876							# Decode base64 and quoted-printable in headers according to RFC2047.
877							#
878							sub _decode_header {
879	898			898		2843	my($header_field_body, $header_field_name) = @_;
880
881	898	100	66			5232	return '' unless defined $header_field_body && $header_field_body ne '';
882
883							# deal with folding and cream the newlines and such
884	848					1944	$header_field_body =~ s/\n[ \t]+/\n /g;
885	848					1381	$header_field_body =~ s/\015?\012//gs;
886
887	848	100				3487	if ($header_field_name =~
888							/^ (?: Received \| (?:Resent-)? (?: Message-ID \| Date ) \|
889							MIME-Version \| References \| In-Reply-To \| List-.* ) \z /xsi ) {
890							# Bug 6945: some header fields must not be processed for MIME encoding
891							# Bug 7466: leave out the Content-*
892
893							} else {
894	515					1909	local($1,$2,$3);
895
896							# Multiple encoded sections must ignore the interim whitespace.
897							# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
898							# separated by whitespace.
899	515					1438	1 while $header_field_body =~
900							s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+
901							( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) }
902							{$1$2}xsg;
903
904							# transcode properly encoded RFC 2047 substrings into UTF-8 octets,
905							# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532)
906	515					1432	# or plain US-ASCII
907	10					54	$header_field_body =~
908							s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) }
909							{ __decode_header($1, uc($2), $3) }xsge;
910							}
911	848					3059
912							# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body);
913							return $header_field_body;
914							}
915
916							=item get_header()
917
918							Retrieve a specific header. Will have a newline at the end and will be
919							unfolded. The first parameter is the header name (case-insensitive),
920							and the second parameter (optional) is whether or not to return the
921							raw header.
922
923							If get_header() is called in an array context, an array will be returned
924							with each header entry in a different element. In a scalar context,
925							the last specific header is returned.
926
927							ie: If 'Subject' is specified as the header, and there are 2 Subject
928							headers in a message, the last/bottom one in the message is returned in
929							scalar context or both are returned in array context.
930
931							Btw, returning the last header field (not the first) happens to be consistent
932							with DKIM signatures, which search for and cover multiple header fields
933							bottom-up according to the 'h' tag. Let's keep it this way.
934
935							=cut
936	4405			4405	1	9261
937	4405		50			16140	sub get_header {
938							my ($self, $hdr, $raw) = @_;
939							$raw \|\|= 0;
940
941							# And now pick up all the entries into a list
942							# This is assumed to include a newline at the end ...
943							# This is also assumed to have removed continuation bits ...
944	4405					5674
945	4405	50				7683	# Deal with the possibility that header() or raw_header() returns undef
946	0	0				0	my @hdrs;
947	0					0	if ( $raw ) {
948							if (@hdrs = $self->raw_header($hdr)) {
949							s/\015?\012\s+/ /gs for @hdrs;
950							}
951	4405	100				9102	}
952	844					3549	else {
953							if (@hdrs = $self->header($hdr)) {
954							$_ .= "\n" for @hdrs;
955							}
956	4405	100				8703	}
957	3856					10981
958							if (wantarray) {
959							return @hdrs;
960	549	100				2819	}
961							else {
962							return @hdrs ? $hdrs[-1] : undef;
963							}
964							}
965
966							=item get_all_headers()
967
968							Retrieve all headers. Each header will have a newline at the end and
969							will be unfolded. The first parameter (optional) is whether or not to
970							return the raw headers, and the second parameter (optional) is whether
971							or not to include the mbox separator.
972
973							If get_all_header() is called in an array context, an array will be
974							returned with each header entry in a different element. In a scalar
975							context, the headers are returned in a single scalar.
976
977							=back
978
979							=cut
980
981	45			45	1	154	# build it and it will not bomb
982	45		50			280	sub get_all_headers {
983	45		100			191	my ($self, $raw, $include_mbox) = @_;
984							$raw \|\|= 0;
985	45					106	$include_mbox \|\|= 0;
986
987							my @lines;
988	45					84
989	45					78	# precalculate destination positions based on order of appearance
990	45					75	my $i = 0;
	45					181
991	520					674	my %locations;
	520					1722
992							for my $k (@{$self->{header_order}}) {
993							push(@{$locations{lc($k)}}, $i++);
994							}
995	45					102
996	45					91	# process headers in order of first appearance
997	45					390	my $header;
	902					1476
998							my $size = 0;
999							HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] }
1000							keys %locations)
1001	396					703	{
1002	396					782	# get all same-name headers and poke into correct position
1003	520					741	my $positions = $locations{$name};
	520					807
1004	520					1248	for my $contents ($self->get_header($name, $raw)) {
1005	520	50				1241	my $position = shift @{$positions};
1006	0					0	$size += length($name) + length($contents) + 2;
1007	0					0	if ($size > MAX_HEADER_LENGTH) {
1008							$self->{'truncated_header'} = 1;
1009	520					2328	last HEADER;
1010							}
1011							$lines[$position] = $self->{header_order}->[$position].": ".$contents;
1012							}
1013							}
1014	45	50				193
	0					0
1015							# skip undefined lines if we truncated
1016	45	50	66			191	@lines = grep { defined $_ } @lines if $self->{'truncated_header'};
1017
1018	45	100				731	splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} );
1019
1020							return wantarray ? @lines : join ('', @lines);
1021							}
1022				0	0
1023							# legacy public API; now a no-op.
1024							sub finish { }
1025
1026							# ---------------------------------------------------------------------------
1027
1028							1;
1029							__END__