File Coverage

blib/lib/Mail/SpamAssassin/Message/Node.pm

Criterion	Covered	Total	%
statement	263	420	62.6
branch	97	212	45.7
condition	40	85	47.0
subroutine	25	30	83.3
pod	16	18	88.8
total	441	765	57.6

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts
21
22							=head1 DESCRIPTION
23
24							This module will encapsulate an email message and allow access to
25							the various MIME message parts.
26
27							=head1 PUBLIC METHODS
28
29							=over 4
30
31							=cut
32
33
34							use strict;
35	41			41		253	use warnings;
	41					72
	41					1195
36	41			41		219	use re 'taint';
	41					77
	41					1325
37	41			41		228
	41					94
	41					1607
38							require 5.008001; # needs utf8::is_utf8()
39
40							use Mail::SpamAssassin;
41	41			41		237	use Mail::SpamAssassin::Constants qw(:sa);
	41					74
	41					1047
42	41			41		212	use Mail::SpamAssassin::HTML;
	41					77
	41					6042
43	41			41		11179	use Mail::SpamAssassin::Logger;
	41					140
	41					1490
44	41			41		248
	41					73
	41					6026
45							our($enc_utf8, $enc_w1252, $have_encode_detector);
46							BEGIN {
47							eval { require Encode }
48	41					20823	and do { $enc_utf8 = Encode::find_encoding('UTF-8');
49	41	50		41		124	$enc_w1252 = Encode::find_encoding('Windows-1252') };
	41					340379
50	41					5746	eval { require Encode::Detect::Detector }
51	41					190808	and do { $have_encode_detector = 1 };
52	41	50				12972	};
	0					0
53
54							=item new()
55
56							Generates an empty Node object and returns it. Typically only called
57							by functions in Message.
58
59							=cut
60
61							my $class = shift;
62							$class = ref($class) \|\| $class;
63	196			196	1	438
64	196		33			802	my $self = {
65							headers => {},
66	196					1136	raw_headers => {},
67							header_order => []
68							};
69
70							# deal with any parameters
71							my($opts) = @_;
72							$self->{normalize} = $opts->{'normalize'} \|\| 0;
73	196					438
74	196		50			928	bless($self,$class);
75							$self;
76	196					544	}
77	196					534
78							=item find_parts()
79
80							Used to search the tree for specific MIME parts. An array of matching
81							Node objects (pointers into the tree) is returned. The parameters that
82							can be passed in are (in order, all scalars):
83
84							Regexp - Used to match against each part's Content-Type header,
85							specifically the type and not the rest of the header. ie: "Content-type:
86							text/html; encoding=quoted-printable" has a type of "text/html". If no
87							regexp is specified, find_parts() will return an empty array.
88
89							Only_leaves - By default, find_parts() will return any part that matches
90							the regexp, including multipart. If you only want to see leaves of the
91							tree (ie: parts that aren't multipart), set this to true (1).
92
93							Recursive - By default, when find_parts() finds a multipart which has
94							parts underneath it, it will recurse through all sub-children. If set to 0,
95							only look at the part and any direct children of the part.
96
97							=cut
98
99							# Used to find any MIME parts whose simple content-type matches a given regexp
100							# Searches it's own and any children parts. Returns an array of MIME
101							# objects which match. Our callers may expect the default behavior which is a
102							# depth-first array of parts.
103							#
104							my ($self, $re, $onlyleaves, $recursive) = @_;
105
106							# Didn't pass an RE? Just abort.
107	256			256	1	649	return () unless defined $re && $re ne '';
108
109							$onlyleaves = 0 unless defined $onlyleaves;
110	256	50	33			1363
111							my $depth;
112	256	50				558	if (defined $recursive && $recursive == 0) {
113							$depth = 1;
114	256					420	}
115	256	50	33			637
116	0					0	my @ret;
117							my @search = ( $self );
118
119	256					390	while (my $part = shift @search) {
120	256					541	# If this object matches, mark it for return.
121							my $amialeaf = $part->is_leaf();
122	256					676
123							if ( $part->{'type'} =~ /$re/ && (!$onlyleaves \|\| $amialeaf) ) {
124	290					897	push(@ret, $part);
125							}
126	290	100	66			2909
			100
127	271					517	if ( !$amialeaf && (!defined $depth \|\| $depth > 0)) {
128							$depth-- if defined $depth;
129							unshift(@search, @{$part->{'body_parts'}});
130	290	50	33			1059	}
			66
131	19	50				41	}
132	19					25
	19					64
133							return @ret;
134							}
135
136	256					995	=item header()
137
138							Stores and retrieves headers from a specific MIME part. The first
139							parameter is the header name. If there is no other parameter, the header
140							is retrieved. If there is a second parameter, the header is stored.
141
142							Header names are case-insensitive and are stored in both raw and
143							decoded form. Using header(), only the decoded form is retrievable.
144
145							For retrieval, if header() is called in an array context, an array will
146							be returned with each header entry in a different element. In a scalar
147							context, the last specific header is returned.
148
149							ie: If 'Subject' is specified as the header, and there are 2 Subject
150							headers in a message, the last/bottom one in the message is returned in
151							scalar context or both are returned in array context.
152
153							=cut
154
155							# Store or retrieve headers from a given MIME object
156							#
157							my $self = shift;
158							my $rawkey = shift;
159
160							return unless defined $rawkey;
161	5663			5663	1	6921
162	5663					7399	# we're going to do things case insensitively
163							my $key = lc($rawkey);
164	5663	50				11217
165							# Trim whitespace off of the header keys
166							$key =~ s/^\s+//;
167	5663					10021	$key =~ s/\s+$//;
168
169							if (@_) {
170	5663					11025	my $raw_value = shift;
171	5663					8979	return unless defined $raw_value;
172
173	5663	100				9572	push @{ $self->{'header_order'} }, $rawkey;
174	909					1578	if ( !exists $self->{'headers'}->{$key} ) {
175	909	50				1797	$self->{'headers'}->{$key} = [];
176							$self->{'raw_headers'}->{$key} = [];
177	909					1123	}
	909					2059
178	909	100				2317
179	835					2315	my $dec_value = $raw_value;
180	835					1764	$dec_value =~ s/\n[ \t]+/ /gs;
181							$dec_value =~ s/\s+$//s;
182							$dec_value =~ s/^\s+//s;
183	909					1494	push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key);
184	909					2282
185	909					3755	push @{ $self->{'raw_headers'}->{$key} }, $raw_value;
186	909					2371
187	909					1258	return $self->{'headers'}->{$key}->[-1];
	909					2431
188							}
189	909					1390
	909					2051
190							if (wantarray) {
191	909					4000	return unless exists $self->{'headers'}->{$key};
192							return @{ $self->{'headers'}->{$key} };
193							}
194	4754	100				7358	else {
195	4441	100				14736	return '' unless exists $self->{'headers'}->{$key};
196	1002					1210	return $self->{'headers'}->{$key}->[-1];
	1002					5415
197							}
198							}
199	313	100				2295
200	48					318	=item raw_header()
201
202							Retrieves the raw version of headers from a specific MIME part. The only
203							parameter is the header name. Header names are case-insensitive.
204
205							For retrieval, if raw_header() is called in an array context, an array
206							will be returned with each header entry in a different element. In a
207							scalar context, the last specific header is returned.
208
209							ie: If 'Subject' is specified as the header, and there are 2 Subject
210							headers in a message, the last/bottom one in the message is returned in
211							scalar context or both are returned in array context.
212
213							=cut
214
215							# Retrieve raw headers from a given MIME object
216							#
217							my $self = shift;
218							my $key = lc(shift);
219
220							# Trim whitespace off of the header keys
221							$key =~ s/^\s+//;
222	633			633	1	855	$key =~ s/\s+$//;
223	633					992
224							if (wantarray) {
225							return unless exists $self->{'raw_headers'}->{$key};
226	633					1248	return @{ $self->{'raw_headers'}->{$key} };
227	633					1015	}
228							else {
229	633	50				1133	return '' unless exists $self->{'raw_headers'}->{$key};
230	633	100				1948	return $self->{'raw_headers'}->{$key}->[-1];
231	139					237	}
	139					807
232							}
233
234	0	0				0	=item add_body_part()
235	0					0
236							Adds a Node child object to the current node object.
237
238							=cut
239
240							# Add a MIME child part to ourselves
241							my($self, $part) = @_;
242
243							dbg("message: added part, type: ".$part->{'type'});
244							push @{ $self->{'body_parts'} }, $part;
245							}
246
247	50			50	1	93	=item is_leaf()
248
249	50					164	Returns true if the tree node in question is a leaf of the tree (ie:
250	50					58	has no children of its own). Note: This function may return odd results
	50					116
251							unless the message has been mime parsed via _do_parse()!
252
253							=cut
254
255							my($self) = @_;
256							return !exists $self->{'body_parts'};
257							}
258
259							=item raw()
260
261							Return a reference to the raw array. Treat this as READ ONLY.
262	290			290	1	509
263	290					787	=cut
264
265							my $self = shift;
266
267							# Ok, if we're called we are expected to return an array.
268							# so if it's a file reference, read in the message into an array...
269							#
270							# NOTE: that "ref undef" works, so don't bother checking for a defined var
271							# first.
272							if (ref $self->{'raw'} eq 'GLOB') {
273	0			0	1	0	my $fd = $self->{'raw'};
274							seek($fd, 0, 0) or die "message: cannot rewind file: $!";
275
276							# dbg("message: (raw) reading mime part from a temporary file");
277							my($nread,$raw_str); $raw_str = '';
278							while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { }
279							defined $nread or die "error reading: $!";
280	0	0				0	my @array = split(/^/m, $raw_str, -1);
281	0					0
282	0	0				0	dbg("message: empty message read") if $raw_str eq '';
283							return \@array;
284							}
285	0					0
	0					0
286	0					0	return $self->{'raw'};
287	0	0				0	}
288	0					0
289							=item decode()
290	0	0				0
291	0					0	If necessary, decode the part text as base64 or quoted-printable.
292							The decoded text will be returned as a scalar string. An optional length
293							parameter can be passed in which limits how much decoded data is returned.
294	0					0	If the scalar isn't needed, call with "0" as a parameter.
295
296							=cut
297
298							my($self, $bytes) = @_;
299
300							if ( !exists $self->{'decoded'} ) {
301							# Someone is looking for a decoded part where there is no raw data
302							# (multipart or subparsed message, etc.) Just return undef.
303							return if !exists $self->{'raw'};
304
305							my $raw;
306
307	254			254	1	3692	# if the part is held in a temp file, read it into the scalar
308							if (ref $self->{'raw'} eq 'GLOB') {
309	254	100				678	my $fd = $self->{'raw'};
310							seek($fd, 0, 0) or die "message: cannot rewind file: $!";
311
312	146	50				414	# dbg("message: (decode) reading mime part from a temporary file");
313							my($nread,$raw_str); $raw = '';
314	146					234	while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { }
315							defined $nread or die "error reading: $!";
316
317	146	100				473	dbg("message: empty message read from a temp file") if $raw eq '';
318	4					5	}
319	4	50				36	else {
320							# create a new scalar from the raw array in memory
321							$raw = join('', @{$self->{'raw'}});
322	4					9	}
	4					5
323	4					234
324	4	50				16	my $encoding = lc $self->header('content-transfer-encoding') \|\| '';
325
326	4	50				16	if ( $encoding eq 'quoted-printable' ) {
327							dbg("message: decoding quoted-printable");
328							$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
329							$self->{'decoded'} =~ s/\015\012/\012/gs;
330	142					260	}
	142					1450
331							elsif ( $encoding eq 'base64' ) {
332							dbg("message: decoding base64");
333	146		100			559
334							# if it's not defined or is 0, do the whole thing, otherwise only decode
335	146	100				711	# a portion
		100
336	6					25	if ($bytes) {
337	6					39	return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
338	6					36	}
339							else {
340							# Generate the decoded output
341	5					17	$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
342							}
343
344							if ( $self->{'type'} =~ m@^(?:text\|message)\b/@i ) {
345	5	50				7	$self->{'decoded'} =~ s/\015\012/\012/gs;
346	0					0	}
347							}
348							else {
349							# Encoding is one of 7bit, 8bit, binary or x-something
350	5					19	if ( $encoding ) {
351							dbg("message: decoding other encoding type ($encoding), ignoring");
352							}
353	5	100				26	else {
354	1					21	dbg("message: no encoding detected");
355							}
356							$self->{'decoded'} = $raw;
357							}
358							}
359	135	100				316
360	33					197	if ( !defined $bytes \|\| $bytes ) {
361							if ( !defined $bytes ) {
362							# force a copy
363	102					351	return '' . $self->{'decoded'};
364							}
365	135					727	else {
366							return substr($self->{'decoded'}, 0, $bytes);
367							}
368							}
369	254	100	66			1244	}
370	250	50				599
371							# Detect endianness of UTF-16 encoded data
372	250					2079	my $data = $_[0]; # could not avoid copying large strings
373							my $utf16le_clues = 0;
374							my $utf16be_clues = 0;
375	0					0	my $sum_h_e = 0;
376							my $sum_h_o = 0;
377							my $sum_l_e = 0;
378							my $sum_l_o = 0;
379							my $decoder = undef;
380
381							my @msg_h = unpack 'H' x length( $data ), $data;
382	0			0	0	0	my @msg_l = unpack 'h' x length( $data ), $data;
383	0					0
384	0					0	for( my $i = 0; $i < length( $data ); $i+=2 ) {
385	0					0	my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
386	0					0	$sum_h_e += hex $msg_h[$i];
387	0					0	$sum_h_o += hex $msg_h[$i+1];
388	0					0	$sum_l_e += hex $msg_l[$i];
389	0					0	$sum_l_o += hex $msg_l[$i+1];
390							if( $check_char =~ /20 00/ ) {
391	0					0	# UTF-16LE space char detected
392	0					0	$utf16le_clues++;
393							}
394	0					0	if( $check_char =~ /00 20/ ) {
395	0					0	# UTF-16BE space char detected
396	0					0	$utf16be_clues++;
397	0					0	}
398	0					0	}
399	0					0
400	0	0				0	# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
401							$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
402	0					0
403							# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
404	0	0				0	$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
405
406	0					0	if( $utf16le_clues > $utf16be_clues ) {
407							dbg( "message: detect_utf16: UTF-16LE" );
408							$decoder = Encode::find_encoding("UTF-16LE");
409							} elsif( $utf16le_clues > $utf16be_clues ) {
410							dbg( "message: detect_utf16: UTF-16BE" );
411	0	0				0	$decoder = Encode::find_encoding("UTF-16BE");
412							} else {
413							dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
414	0	0				0	}
415
416	0	0				0	return $decoder;
		0
417	0					0	}
418	0					0
419							# Look at a text scalar and determine whether it should be rendered
420	0					0	# as text/html.
421	0					0	#
422							# This is not a public function.
423	0					0	#
424							if ($_[0] =~ m/^(.{0,18}?<(?:body\|head\|html\|img\|pre\|table\|title)(?:\s.{0,18}?)?>)/is)
425							{
426	0					0	my $pad = $1;
427							my $count = 0;
428							$count += ($pad =~ tr/\n//d) * 2;
429							$count += ($pad =~ tr/\n//cd);
430							return ($count < 24);
431							}
432							return 0;
433							}
434
435	0	0		0		0	# Decode character set of a given text to perl characters (Unicode),
436							# then encode into UTF-8 octets if requested.
437	0					0	#
438	0					0	# my $data = $_[0]; # avoid copying large strings
439	0					0	my $charset_declared = $_[1];
440	0					0	my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets
441	0					0
442							warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
443	0					0	if utf8::is_utf8($_[0]);
444
445							# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
446							my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
447
448							if (!defined $charset_declared \|\| $charset_declared eq '') {
449							$charset_declared = 'us-ascii';
450							}
451	10			10		19
452	10					13	# number of characters with code above 127
453							my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
454	10	50				35
455							if (!$cnt_8bits &&
456							$charset_declared =~
457							/^(?: (?:US-)?ASCII \| ANSI[_ ]? X3\.4- (?:1986\|1968) \|
458	10					39	ISO646-US )\z/xsi)
459							{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
460	10	50	33			51	dbg("message: kept, charset is US-ASCII as declared");
461	0					0	return $_[0]; # is all-ASCII, no need for decoding
462							}
463
464							if (!$cnt_8bits &&
465	10					23	$charset_declared =~
466							/^(?: ISO[ -]?8859 (?: - \d{1,2} )? \| Windows-\d{4} \|
467	10	50	33			68	UTF-?8 \| (KOI8\|EUC)-[A-Z]{1,2} \|
468							Big5 \| GBK \| GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
469							{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
470							dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared);
471							return $_[0]; # is all-ASCII, no need for decoding
472	0					0	}
473	0					0
474							# Try first to strictly decode based on a declared character set.
475
476	10	100	66			74	my $rv;
477							if ($charset_declared =~ /^UTF-?8\z/i) {
478							# attempt decoding as strict UTF-8 (flags: FB_CROAK \| LEAVE_SRC)
479							if (eval { $rv = $enc_utf8->decode($_[0], 1\|8); defined $rv }) {
480							dbg("message: decoded as declared charset UTF-8");
481							return $_[0] if !$return_decoded;
482	8					24	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
483	8					58	return $rv; # decoded
484							} else {
485							my $err = '';
486							if ($@) {
487							$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
488	2					9	$err = " ($err)";
489	2	50	33			20	}
		50
		50
		50
490							dbg("message: failed decoding as declared charset UTF-8 ($err)");
491	0	0				0	}
	0					0
	0					0
492	0					0
493	0	0				0	} elsif ($charset_declared =~ /^UTF[ -]?16/i) {
494	0					0	# Handle cases where spammers use UTF-16 encoding without including a BOM
495	0					0	# or declaring endianness as reported at:
496							# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
497	0					0
498	0	0				0	my $decoder = detect_utf16( $_[0] );
499	0					0	if (eval { $rv = $decoder->decode($_[0], 1\|8); defined $rv }) {
	0					0
	0					0
500	0					0	dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name);
501							return $_[0] if !$return_decoded;
502	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
503							return $rv; # decoded
504							} else {
505							my $err = '';
506							if ($@) {
507							$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
508							$err = " ($err)";
509							}
510	0					0	dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err);
511	0	0				0	};
	0					0
	0					0
512	0					0
513	0	0				0	} elsif ($cnt_8bits &&
514	0					0	eval { $rv = $enc_utf8->decode($_[0], 1\|8); defined $rv }) {
515	0					0	dbg("message: decoded as charset UTF-8, declared %s", $charset_declared);
516							return $_[0] if !$return_decoded;
517	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
518	0	0				0	return $rv; # decoded
519	0					0
	0					0
	0					0
520	0					0	} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) {
521							# declared as US-ASCII but contains 8-bit characters, makes no sense
522	0					0	# to attempt decoding first as strict US-ASCII as we know it would fail
523
524							} else {
525							# try decoding as a declared character set
526	0					0
	0					0
527	0					0	# -> http://en.wikipedia.org/wiki/Windows-1252
528	0	0				0	# Windows-1252 character encoding is a superset of ISO 8859-1, but differs
529	0					0	# from the IANA's ISO-8859-1 by using displayable characters rather than
530	0					0	# control characters in the 80 to 9F (hex) range. [...]
531							# It is very common to mislabel Windows-1252 text with the charset label
532							# ISO-8859-1. A common result was that all the quotes and apostrophes
533							# (produced by "smart quotes" in word-processing software) were replaced
534							# with question marks or boxes on non-Windows operating systems, making
535							# text difficult to read. Most modern web browsers and e-mail clients
536							# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
537							# such mislabeling. This is now standard behavior in the draft HTML 5
538							# specification, which requires that documents advertised as ISO-8859-1
539							# actually be parsed with the Windows-1252 encoding.
540							#
541							my($chset, $decoder);
542							if ($charset_declared =~ /^(?: ISO-?8859-1 \| Windows-1252 \| CP1252 )\z/xi) {
543							$chset = 'Windows-1252'; $decoder = $enc_w1252;
544							} else {
545							$chset = $charset_declared; $decoder = Encode::find_encoding($chset);
546							if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
547							$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
548							dbg("message: no decoder for a declared charset %s, using GBK",
549							$chset) if $decoder;
550							}
551							}
552							if (!$decoder) {
553	2					7	dbg("message: failed decoding, no decoder for a declared charset %s",
554	2	50				13	$chset);
555	0					0	} else {
	0					0
556							my $err = '';
557	2					4	eval { $rv = $decoder->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	2					14
558	2	50	33			6805	if ($@) {
559	0					0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
560	0	0				0	$err = " ($err)";
561							}
562							if (lc $chset eq lc $charset_declared) {
563							dbg("message: %s as declared charset %s%s",
564	2	50				5	defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err);
565	0					0	} else {
566							dbg("message: %s as charset %s, declared %s%s",
567							defined $rv ? 'decoded' : 'failed decoding',
568	2					4	$chset, $charset_declared, $err);
569	2					4	}
	2					7
570	2	50				196	}
571	0					0	}
	0					0
	0					0
572	0					0
573							# If the above failed, check if it is US-ASCII, possibly extended by few
574	2	50				9	# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
575	2	50				9	# some popular punctuation or special characters from Windows-1252 in
576							# the \x80-\x9F range (which is unassigned in ISO-8859-*).
577							# Note that Windows-1252 is a proper superset of ISO-8859-1.
578	0	0				0	#
579							if (!defined $rv && !$cnt_8bits) {
580							dbg("message: kept, guessed charset is US-ASCII, declared %s",
581							$charset_declared);
582							return $_[0]; # is all-ASCII, no need for decoding
583
584							} elsif (!defined $rv && $enc_w1252 &&
585							# ASCII NBSP (c) SHY ' " ... '".- TM
586							#$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
587							# Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection,
588							# Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!)
589							$_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c)
590							{ # ASCII + NBSP + SHY + some punctuation characters
591	2	50	33			11	# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
		50	33
			0
592	0					0	# consider also: AE (r), 80 Euro
593							my $err = '';
594	0					0	eval { $rv = $enc_w1252->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
595							if ($@) {
596							$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
597							$err = " ($err)";
598							}
599							# the above can't fail, but keep code general just in case
600							dbg("message: %s as guessed charset %s, declared %s%s",
601							defined $rv ? 'decoded' : 'failed decoding',
602							'Windows-1252', $charset_declared, $err);
603							}
604
605	0					0	# If we were unsuccessful so far, try some guesswork
606	0					0	# based on Encode::Detect::Detector .
	0					0
607	0	0				0
608	0					0	if (defined $rv) {
	0					0
	0					0
609	0					0	# done, no need for guesswork
610							} elsif (!$have_encode_detector) {
611							dbg("message: Encode::Detect::Detector not available, declared %s failed",
612	0	0				0	$charset_declared);
613							} else {
614							my $charset_detected = Encode::Detect::Detector::detect($_[0]);
615							if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
616							my $decoder = Encode::find_encoding($charset_detected);
617							if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
618							$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
619							dbg("message: no decoder for a detected charset %s, using GBK",
620	2	50				5	$charset_detected) if $decoder;
		0
621							}
622							if (!$decoder) {
623	0					0	dbg("message: failed decoding, no decoder for a detected charset %s",
624							$charset_detected);
625							} else {
626	0					0	my $err = '';
627	0	0	0			0	eval { $rv = $decoder->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
628	0					0	if ($@) {
629	0	0	0			0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
630	0					0	$err = " ($err)";
631	0	0				0	}
632							dbg("message: %s as detected charset %s, declared %s%s",
633							defined $rv ? 'decoded' : 'failed decoding',
634	0	0				0	$charset_detected, $charset_declared, $err);
635	0					0	}
636							}
637							}
638	0					0
639	0					0	if (!defined $rv) { # all decoding attempts failed so far, probably garbage
	0					0
640	0	0				0	# go for Windows-1252 which can't fail
641	0					0	my $err = '';
	0					0
	0					0
642	0					0	eval { $rv = $enc_w1252->decode($_[0]) };
643							if ($@) {
644	0	0				0	$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.) at ./$1/;
645							$err = " ($err)";
646							}
647							dbg("message: %s as last-resort charset %s, declared %s%s",
648							defined $rv ? 'decoded' : 'failed decoding',
649							'Windows-1252', $charset_declared, $err);
650							}
651	2	50				5
652							if (!defined $rv) { # just in case - all decoding attempts failed so far
653	0					0	return $_[0]; # garbage-in / garbage-out, return unchanged octets
654	0					0	}
	0					0
655	0	0				0	# decoding octets to characters was successful
656	0					0	if (!$return_decoded) {
	0					0
	0					0
657	0					0	# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
658							utf8::encode($rv); # encode Unicode characters to UTF-8 octets
659	0	0				0	}
660							$rv .= $data_taint; # carry taintedness over, avoid Encode bug
661							return $rv;
662							}
663
664	2	50				3	=item rendered()
665	0					0
666							render_text() takes the given text/* type MIME part, and attempts to
667							render it into a text scalar. It will always render text/html, and will
668	2	50				5	use a heuristic to determine if other text/* parts should be considered
669							text/html. Two scalars are returned: the rendered type (either text/html
670	2					4	or whatever the original type was), and the rendered text.
671
672	2					3	=cut
673	2					14
674							my ($self) = @_;
675
676							if (!exists $self->{rendered}) {
677							# We only know how to render text/plain and text/html ...
678							# Note: for bug 4843, make sure to skip text/calendar parts
679							# we also want to skip things like text/x-vcard
680							# text/x-aol is ignored here, but looks like text/html ...
681							return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain\|html)$/i );
682
683							my $text = $self->decode; # QP and Base64 decoding, bytes
684							my $text_len = length($text); # num of bytes in original charset encoding
685
686							# render text/html always
687	143			143	1	442	if ($text ne '' && $self->{'type'} =~ m{^text/html$}i)
688							{
689	143	100				515	$self->{rendered_type} = 'text/html';
690
691							# will input text to HTML::Parser be provided as Unicode characters?
692							my $character_semantics = 0; # $text is in bytes
693							if ($self->{normalize} && $enc_utf8) { # charset decoding requested
694	127	50				1160	# Provide input to HTML::Parser as Unicode characters
695							# which avoids a HTML::Parser bug in utf8_mode
696	127					615	# https://rt.cpan.org/Public/Bug/Display.html?id=99755
697	127					522	# Note: the above bug was fixed in HTML-Parser 3.72, January 2016.
698							# Avoid unnecessary step of encoding-then-decoding by telling
699							# subroutine _normalize() to return Unicode text. See Bug 7133
700	127	100	100			1385	#
701							$character_semantics = 1; # $text will be in characters
702	8					37	$text = _normalize($text, $self->{charset}, 1); # bytes to chars
703							} elsif (!defined $self->{charset} \|\|
704							$self->{charset} =~ /^(?:US-ASCII\|UTF-8)\z/i) {
705	8					23	# With some luck input can be interpreted as UTF-8, do not warn.
706	8	50	33			178	# It is still possible to hit the HTML::Parses utf8_mode bug however.
		100	100
707							} else {
708							dbg("message: 'normalize_charset' is off, encoding will likely ".
709							"be misinterpreted; declared charset: %s", $self->{charset});
710							}
711							# the 0 requires decoded HTML results to be in bytes (not characters)
712							my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
713
714	0					0	$html->parse($text); # parse+render text
715	0					0
716							# resulting HTML-decoded text is in bytes, likely encoded as UTF-8
717							$self->{rendered} = $html->get_rendered_text();
718							$self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
719							$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
720							$self->{html_results} = $html->get_results();
721
722	1					13	# end-of-document result values that require looking at the text
723							my $r = $self->{html_results}; # temporary reference for brevity
724
725	8					185	# count the number of spaces in the rendered text (likely UTF-8 octets)
726							my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//;
727	8					70	# we may want to add the count of other Unicode whitespace characters
728
729							$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8)
730	8					40	$r->{non_space_len} = $r->{html_length} - $space;
731	8					49	$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len;
732	8					34	}
733	8					45
734							else { # plain text
735							if ($self->{normalize} && $enc_utf8) {
736	8					19	# request transcoded result as UTF-8 octets!
737							$text = _normalize($text, $self->{charset}, 0);
738							}
739	8					42	$self->{rendered_type} = $self->{type};
740							$self->{rendered} = $self->{'visible_rendered'} = $text;
741							$self->{'invisible_rendered'} = '';
742	8					31	}
743	8					26	}
744	8	50				185
745							return ($self->{rendered_type}, $self->{rendered});
746							}
747
748	119	0	33			451	=item set_rendered($text, $type)
749
750	0					0	Set the rendered text and type for the given part. If type is not
751							specified, and text is a defined value, a default of 'text/plain' is used.
752	119					426	This can be used, for instance, to render non-text parts using plugins.
753	119					532
754	119					422	=cut
755
756							my ($self, $text, $type) = @_;
757
758	143					599	$type = 'text/plain' if (!defined $type && defined $text);
759
760							$self->{'rendered_type'} = $type;
761							$self->{'rendered'} = $self->{'visible_rendered'} = $text;
762							$self->{'invisible_rendered'} = defined $text ? '' : undef;
763							}
764
765							=item visible_rendered()
766
767							Render and return the visible text in this part.
768
769							=cut
770	0			0	1	0
771							my ($self) = @_;
772	0	0	0			0	$self->rendered(); # ignore return, we want just this:
773							return ($self->{rendered_type}, $self->{visible_rendered});
774	0					0	}
775	0					0
776	0	0				0	=item invisible_rendered()
777
778							Render and return the invisible text in this part.
779
780							=cut
781
782							my ($self) = @_;
783							$self->rendered(); # ignore return, we want just this:
784							return ($self->{rendered_type}, $self->{invisible_rendered});
785							}
786	8			8	1	21
787	8					55	=item content_summary()
788	8					38
789							Returns an array of scalars describing the mime parts of the message.
790							Note: This function requires that the message be parsed first!
791
792							=cut
793
794							# return an array with scalars describing mime parts
795							my($self) = @_;
796
797							my @ret = ( [ $self->{'type'} ] );
798	8			8	1	20	my @search;
799	8					26
800	8					24	if (exists $self->{'body_parts'}) {
801							my $count = @{$self->{'body_parts'}};
802							for(my $i=0; $i<$count; $i++) {
803							push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]);
804							}
805							}
806
807							while(my $part = shift @search) {
808							my($index, $part) = @{$part};
809							push(@{$ret[$index]}, $part->{'type'});
810							if (exists $part->{'body_parts'}) {
811							unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}});
812	13			13	1	211	}
813							}
814	13					48
815	13					14	return map { join(",", @{$_}) } @ret;
816							}
817	13	100				27
818	11					13	=item delete_header()
	11					15
819	11					24
820	24					64	Delete the specified header (decoded and raw) from the Node information.
821
822							=cut
823
824	13					27	my($self, $hdr) = @_;
825	45					48
	45					68
826	45					41	foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) {
	45					83
827	45	100				125	delete $self->{'headers'}->{$_};
828	11					13	delete $self->{'raw_headers'}->{$_};
	21					48
	11					18
829							}
830
831							my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}});
832	13					18	$self->{'header_order'} = \@neworder;
	37					49
	37					178
833							}
834
835							# decode a header appropriately. don't bother adding it to the pod documents.
836							my ( $encoding, $cte, $data ) = @_;
837
838							if ( $cte eq 'B' ) {
839							# base 64 encoded
840							$data = Mail::SpamAssassin::Util::base64_decode($data);
841							}
842	505			505	1	929	elsif ( $cte eq 'Q' ) {
843							# quoted printable
844	505					641
	505					6907
845	0					0	# the RFC states that in the encoded text, "_" is equal to "=20"
846	0					0	$data =~ s/_/=20/g;
847
848							$data = Mail::SpamAssassin::Util::qp_decode($data);
849	505					1028	}
	505					5010
850	505					1678	else {
851							# not possible since the input has already been limited to 'B' and 'Q'
852							die "message: unknown encoding type '$cte' in RFC2047 header";
853							}
854							return _normalize($data, $encoding, 0); # transcode to UTF-8 octets
855	10			10		43	}
856
857	10	100				39	# Decode base64 and quoted-printable in headers according to RFC2047.
		50
858							#
859	2					13	my($header_field_body, $header_field_name) = @_;
860
861							return '' unless defined $header_field_body && $header_field_body ne '';
862
863							# deal with folding and cream the newlines and such
864							$header_field_body =~ s/\n[ \t]+/\n /g;
865	8					16	$header_field_body =~ s/\015?\012//gs;
866
867	8					18	if ($header_field_name =~
868							/^ (?: Received \| (?:Resent-)? (?: Message-ID \| Date ) \|
869							MIME-Version \| References \| In-Reply-To \| List-.* ) \z /xsi ) {
870							# Bug 6945: some header fields must not be processed for MIME encoding
871	0					0	# Bug 7466: leave out the Content-*
872
873	10					28	} else {
874							local($1,$2,$3);
875
876							# Multiple encoded sections must ignore the interim whitespace.
877							# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
878							# separated by whitespace.
879	909			909		2543	1 while $header_field_body =~
880							s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+
881	909	100	66			4498	( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) }
882							{$1$2}xsg;
883
884	859					1550	# transcode properly encoded RFC 2047 substrings into UTF-8 octets,
885	859					1097	# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532)
886							# or plain US-ASCII
887	859	100				3009	$header_field_body =~
888							s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) }
889							{ __decode_header($1, uc($2), $3) }xsge;
890							}
891
892							# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body);
893							return $header_field_body;
894	523					1777	}
895
896							=item get_header()
897
898							Retrieve a specific header. Will have a newline at the end and will be
899	523					1245	unfolded. The first parameter is the header name (case-insensitive),
900							and the second parameter (optional) is whether or not to return the
901							raw header.
902
903							If get_header() is called in an array context, an array will be returned
904							with each header entry in a different element. In a scalar context,
905							the last specific header is returned.
906	523					1201
907	10					39	ie: If 'Subject' is specified as the header, and there are 2 Subject
908							headers in a message, the last/bottom one in the message is returned in
909							scalar context or both are returned in array context.
910
911	859					2691	Btw, returning the last header field (not the first) happens to be consistent
912							with DKIM signatures, which search for and cover multiple header fields
913							bottom-up according to the 'h' tag. Let's keep it this way.
914
915							=cut
916
917							my ($self, $hdr, $raw) = @_;
918							$raw \|\|= 0;
919
920							# And now pick up all the entries into a list
921							# This is assumed to include a newline at the end ...
922							# This is also assumed to have removed continuation bits ...
923
924							# Deal with the possibility that header() or raw_header() returns undef
925							my @hdrs;
926							if ( $raw ) {
927							if (@hdrs = $self->raw_header($hdr)) {
928							s/\015?\012\s+/ /gs for @hdrs;
929							}
930							}
931							else {
932							if (@hdrs = $self->header($hdr)) {
933							$_ .= "\n" for @hdrs;
934							}
935							}
936	4078			4078	1	6951
937	4078		50			12097	if (wantarray) {
938							return @hdrs;
939							}
940							else {
941							return @hdrs ? $hdrs[-1] : undef;
942							}
943							}
944	4078					4404
945	4078	50				6029	=item get_all_headers()
946	0	0				0
947	0					0	Retrieve all headers. Each header will have a newline at the end and
948							will be unfolded. The first parameter (optional) is whether or not to
949							return the raw headers, and the second parameter (optional) is whether
950							or not to include the mbox separator.
951	4078	100				7423
952	834					3018	If get_all_header() is called in an array context, an array will be
953							returned with each header entry in a different element. In a scalar
954							context, the headers are returned in a single scalar.
955
956	4078	100				6112	=back
957	3528					8352
958							=cut
959
960	550	100				2265	# build it and it will not bomb
961							my ($self, $raw, $include_mbox) = @_;
962							$raw \|\|= 0;
963							$include_mbox \|\|= 0;
964
965							my @lines;
966
967							# precalculate destination positions based on order of appearance
968							my $i = 0;
969							my %locations;
970							for my $k (@{$self->{header_order}}) {
971							push(@{$locations{lc($k)}}, $i++);
972							}
973
974							# process headers in order of first appearance
975							my $header;
976							my $size = 0;
977							HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] }
978							keys %locations)
979							{
980							# get all same-name headers and poke into correct position
981	45			45	1	135	my $positions = $locations{$name};
982	45		50			290	for my $contents ($self->get_header($name, $raw)) {
983	45		100			192	my $position = shift @{$positions};
984							$size += length($name) + length($contents) + 2;
985	45					62	if ($size > MAX_HEADER_LENGTH) {
986							$self->{'truncated_header'} = 1;
987							last HEADER;
988	45					70	}
989	45					65	$lines[$position] = $self->{header_order}->[$position].": ".$contents;
990	45					97	}
	45					227
991	520					589	}
	520					1400
992
993							# skip undefined lines if we truncated
994							@lines = grep { defined $_ } @lines if $self->{'truncated_header'};
995	45					83
996	45					80	splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} );
997	45					408
	880					1525
998							return wantarray ? @lines : join ('', @lines);
999							}
1000
1001	396					600	# legacy public API; now a no-op.
1002	396					653
1003	520					636	# ---------------------------------------------------------------------------
	520					700
1004	520					1024
1005	520	50				1022	1;