File Coverage

blib/lib/Mail/SpamAssassin/Message/Node.pm

Criterion	Covered	Total	%
statement	266	387	68.7
branch	97	200	48.5
condition	42	88	47.7
subroutine	27	31	87.1
pod	16	18	88.8
total	448	724	61.8

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts
21
22							=head1 DESCRIPTION
23
24							This module will encapsulate an email message and allow access to
25							the various MIME message parts.
26
27							=head1 PUBLIC METHODS
28
29							=over 4
30
31							=cut
32
33							package Mail::SpamAssassin::Message::Node;
34
35	40			40		256	use strict;
	40					94
	40					1385
36	40			40		227	use warnings;
	40					109
	40					1371
37	40			40		224	use re 'taint';
	40					74
	40					1645
38
39							require 5.008001; # needs utf8::is_utf8()
40
41	40			40		227	use Mail::SpamAssassin;
	40					89
	40					1148
42	40			40		229	use Mail::SpamAssassin::Constants qw(:sa);
	40					103
	40					5924
43	40			40		12376	use Mail::SpamAssassin::HTML;
	40					162
	40					1577
44	40			40		276	use Mail::SpamAssassin::Logger;
	40					73
	40					6102
45
46							our($enc_utf8, $enc_w1252, $have_encode_detector);
47							BEGIN {
48	40					1553	eval { require Encode }
49	40	50		40		156	and do { $enc_utf8 = Encode::find_encoding('UTF-8');
	40					31445
50	40					9538	$enc_w1252 = Encode::find_encoding('Windows-1252') };
51	40					61914	eval { require Encode::Detect::Detector }
52	40	50				17540	and do { $have_encode_detector = 1 };
	0					0
53							};
54
55							=item new()
56
57							Generates an empty Node object and returns it. Typically only called
58							by functions in Message.
59
60							=cut
61
62							sub new {
63	181			181	1	438	my $class = shift;
64	181		33			804	$class = ref($class) \|\| $class;
65
66	181					1457	my $self = {
67							headers => {},
68							raw_headers => {},
69							header_order => []
70							};
71
72							# deal with any parameters
73	181					448	my($opts) = @_;
74	181		50			828	$self->{normalize} = $opts->{'normalize'} \|\| 0;
75
76	181					511	bless($self,$class);
77	181					569	$self;
78							}
79
80							=item find_parts()
81
82							Used to search the tree for specific MIME parts. An array of matching
83							Node objects (pointers into the tree) is returned. The parameters that
84							can be passed in are (in order, all scalars):
85
86							Regexp - Used to match against each part's Content-Type header,
87							specifically the type and not the rest of the header. ie: "Content-type:
88							text/html; encoding=quoted-printable" has a type of "text/html". If no
89							regexp is specified, find_parts() will return an empty array.
90
91							Only_leaves - By default, find_parts() will return any part that matches
92							the regexp, including multipart. If you only want to see leaves of the
93							tree (ie: parts that aren't multipart), set this to true (1).
94
95							Recursive - By default, when find_parts() finds a multipart which has
96							parts underneath it, it will recurse through all sub-children. If set to 0,
97							only look at the part and any direct children of the part.
98
99							=cut
100
101							# Used to find any MIME parts whose simple content-type matches a given regexp
102							# Searches it's own and any children parts. Returns an array of MIME
103							# objects which match. Our callers may expect the default behavior which is a
104							# depth-first array of parts.
105							#
106							sub find_parts {
107	231			231	1	741	my ($self, $re, $onlyleaves, $recursive) = @_;
108
109							# Didn't pass an RE? Just abort.
110	231	50	33			1402	return () unless defined $re && $re ne '';
111
112	231	50				679	$onlyleaves = 0 unless defined $onlyleaves;
113
114	231					412	my $depth;
115	231	50	33			717	if (defined $recursive && $recursive == 0) {
116	0					0	$depth = 1;
117							}
118
119	231					380	my @ret;
120	231					495	my @search = ( $self );
121
122	231					737	while (my $part = shift @search) {
123							# If this object matches, mark it for return.
124	263					925	my $amialeaf = $part->is_leaf();
125
126	263	100	66			2847	if ( $part->{'type'} =~ /$re/ && (!$onlyleaves \|\| $amialeaf) ) {
			100
127	245					605	push(@ret, $part);
128							}
129
130	263	50	33			1005	if ( !$amialeaf && (!defined $depth \|\| $depth > 0)) {
			66
131	18	50				105	$depth-- if defined $depth;
132	18					27	unshift(@search, @{$part->{'body_parts'}});
	18					48
133							}
134							}
135
136	231					848	return @ret;
137							}
138
139							=item header()
140
141							Stores and retrieves headers from a specific MIME part. The first
142							parameter is the header name. If there is no other parameter, the header
143							is retrieved. If there is a second parameter, the header is stored.
144
145							Header names are case-insensitive and are stored in both raw and
146							decoded form. Using header(), only the decoded form is retrievable.
147
148							For retrieval, if header() is called in an array context, an array will
149							be returned with each header entry in a different element. In a scalar
150							context, the last specific header is returned.
151
152							ie: If 'Subject' is specified as the header, and there are 2 Subject
153							headers in a message, the last/bottom one in the message is returned in
154							scalar context or both are returned in array context.
155
156							=cut
157
158							# Store or retrieve headers from a given MIME object
159							#
160							sub header {
161	5494			5494	1	7094	my $self = shift;
162	5494					6955	my $rawkey = shift;
163
164	5494	50				9074	return unless defined $rawkey;
165
166							# we're going to do things case insensitively
167	5494					8236	my $key = lc($rawkey);
168
169							# Trim whitespace off of the header keys
170	5494					11073	$key =~ s/^\s+//;
171	5494					9003	$key =~ s/\s+$//;
172
173	5494	100				9428	if (@_) {
174	800					1069	my $raw_value = shift;
175	800	50				1278	return unless defined $raw_value;
176
177	800					1003	push @{ $self->{'header_order'} }, $rawkey;
	800					1591
178	800	100				1821	if ( !exists $self->{'headers'}->{$key} ) {
179	726					2150	$self->{'headers'}->{$key} = [];
180	726					1429	$self->{'raw_headers'}->{$key} = [];
181							}
182
183	800					1125	my $dec_value = $raw_value;
184	800					1774	$dec_value =~ s/\n[ \t]+/ /gs;
185	800					3214	$dec_value =~ s/\s+$//s;
186	800					2059	$dec_value =~ s/^\s+//s;
187	800					1072	push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key);
	800					2340
188
189	800					1088	push @{ $self->{'raw_headers'}->{$key} }, $raw_value;
	800					1580
190
191	800					2175	return $self->{'headers'}->{$key}->[-1];
192							}
193
194	4694	100				7500	if (wantarray) {
195	4409	100				13714	return unless exists $self->{'headers'}->{$key};
196	797					1082	return @{ $self->{'headers'}->{$key} };
	797					3359
197							}
198							else {
199	285	100				2431	return '' unless exists $self->{'headers'}->{$key};
200	34					217	return $self->{'headers'}->{$key}->[-1];
201							}
202							}
203
204							=item raw_header()
205
206							Retrieves the raw version of headers from a specific MIME part. The only
207							parameter is the header name. Header names are case-insensitive.
208
209							For retrieval, if raw_header() is called in an array context, an array
210							will be returned with each header entry in a different element. In a
211							scalar context, the last specific header is returned.
212
213							ie: If 'Subject' is specified as the header, and there are 2 Subject
214							headers in a message, the last/bottom one in the message is returned in
215							scalar context or both are returned in array context.
216
217							=cut
218
219							# Retrieve raw headers from a given MIME object
220							#
221							sub raw_header {
222	475			475	1	787	my $self = shift;
223	475					831	my $key = lc(shift);
224
225							# Trim whitespace off of the header keys
226	475					996	$key =~ s/^\s+//;
227	475					958	$key =~ s/\s+$//;
228
229	475	50				1008	if (wantarray) {
230	475	100				1716	return unless exists $self->{'raw_headers'}->{$key};
231	122					245	return @{ $self->{'raw_headers'}->{$key} };
	122					595
232							}
233							else {
234	0	0				0	return '' unless exists $self->{'raw_headers'}->{$key};
235	0					0	return $self->{'raw_headers'}->{$key}->[-1];
236							}
237							}
238
239							=item add_body_part()
240
241							Adds a Node child object to the current node object.
242
243							=cut
244
245							# Add a MIME child part to ourselves
246							sub add_body_part {
247	48			48	1	69	my($self, $part) = @_;
248
249	48					125	dbg("message: added part, type: ".$part->{'type'});
250	48					50	push @{ $self->{'body_parts'} }, $part;
	48					96
251							}
252
253							=item is_leaf()
254
255							Returns true if the tree node in question is a leaf of the tree (ie:
256							has no children of its own). Note: This function may return odd results
257							unless the message has been mime parsed via _do_parse()!
258
259							=cut
260
261							sub is_leaf {
262	263			263	1	568	my($self) = @_;
263	263					777	return !exists $self->{'body_parts'};
264							}
265
266							=item raw()
267
268							Return a reference to the the raw array. Treat this as READ ONLY.
269
270							=cut
271
272							sub raw {
273	0			0	1	0	my $self = shift;
274
275							# Ok, if we're called we are expected to return an array.
276							# so if it's a file reference, read in the message into an array...
277							#
278							# NOTE: that "ref undef" works, so don't bother checking for a defined var
279							# first.
280	0	0				0	if (ref $self->{'raw'} eq 'GLOB') {
281	0					0	my $fd = $self->{'raw'};
282	0	0				0	seek($fd, 0, 0) or die "message: cannot rewind file: $!";
283
284							# dbg("message: (raw) reading mime part from a temporary file");
285	0					0	my($nread,$raw_str); $raw_str = '';
	0					0
286	0					0	while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { }
287	0	0				0	defined $nread or die "error reading: $!";
288	0					0	my @array = split(/^/m, $raw_str, -1);
289
290	0	0				0	dbg("message: empty message read") if $raw_str eq '';
291	0					0	return \@array;
292							}
293
294	0					0	return $self->{'raw'};
295							}
296
297							=item decode()
298
299							If necessary, decode the part text as base64 or quoted-printable.
300							The decoded text will be returned as a scalar string. An optional length
301							parameter can be passed in which limits how much decoded data is returned.
302							If the scalar isn't needed, call with "0" as a parameter.
303
304							=cut
305
306							sub decode {
307	228			228	1	2932	my($self, $bytes) = @_;
308
309	228	100				760	if ( !exists $self->{'decoded'} ) {
310							# Someone is looking for a decoded part where there is no raw data
311							# (multipart or subparsed message, etc.) Just return undef.
312	132	50				478	return if !exists $self->{'raw'};
313
314	132					262	my $raw;
315
316							# if the part is held in a temp file, read it into the scalar
317	132	100				527	if (ref $self->{'raw'} eq 'GLOB') {
318	4					9	my $fd = $self->{'raw'};
319	4	50				44	seek($fd, 0, 0) or die "message: cannot rewind file: $!";
320
321							# dbg("message: (decode) reading mime part from a temporary file");
322	4					10	my($nread,$raw_str); $raw = '';
	4					8
323	4					197	while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { }
324	4	50				14	defined $nread or die "error reading: $!";
325
326	4	50				11	dbg("message: empty message read from a temp file") if $raw eq '';
327							}
328							else {
329							# create a new scalar from the raw array in memory
330	128					282	$raw = join('', @{$self->{'raw'}});
	128					931
331							}
332
333	132		100			448	my $encoding = lc $self->header('content-transfer-encoding') \|\| '';
334
335	132	100				668	if ( $encoding eq 'quoted-printable' ) {
		100
336	6					29	dbg("message: decoding quoted-printable");
337	6					42	$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw);
338	6					36	$self->{'decoded'} =~ s/\015\012/\012/gs;
339							}
340							elsif ( $encoding eq 'base64' ) {
341	5					19	dbg("message: decoding base64");
342
343							# if it's not defined or is 0, do the whole thing, otherwise only decode
344							# a portion
345	5	50				13	if ($bytes) {
346	0					0	return Mail::SpamAssassin::Util::base64_decode($raw, $bytes);
347							}
348							else {
349							# Generate the decoded output
350	5					20	$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw);
351							}
352
353	5	100				32	if ( $self->{'type'} =~ m@^(?:text\|message)\b/@i ) {
354	1					23	$self->{'decoded'} =~ s/\015\012/\012/gs;
355							}
356							}
357							else {
358							# Encoding is one of 7bit, 8bit, binary or x-something
359	121	100				328	if ( $encoding ) {
360	19					101	dbg("message: decoding other encoding type ($encoding), ignoring");
361							}
362							else {
363	102					365	dbg("message: no encoding detected");
364							}
365	121					619	$self->{'decoded'} = $raw;
366							}
367							}
368
369	228	100	66			1491	if ( !defined $bytes \|\| $bytes ) {
370	224	50				577	if ( !defined $bytes ) {
371							# force a copy
372	224					2207	return '' . $self->{'decoded'};
373							}
374							else {
375	0					0	return substr($self->{'decoded'}, 0, $bytes);
376							}
377							}
378							}
379
380							# Detect endianness of UTF-16 encoded data
381							sub detect_utf16 {
382	40			40		22692	use Data::Dumper;
	40					201504
	40					132897
383	0			0	0	0	my $data = $_[0]; # could not avoid copying large strings
384	0					0	my $utf16le_clues = 0;
385	0					0	my $utf16be_clues = 0;
386	0					0	my $sum_h_e = 0;
387	0					0	my $sum_h_o = 0;
388	0					0	my $sum_l_e = 0;
389	0					0	my $sum_l_o = 0;
390	0					0	my $decoder = undef;
391
392	0					0	my @msg_h = unpack 'H' x length( $data ), $data;
393	0					0	my @msg_l = unpack 'h' x length( $data ), $data;
394
395	0					0	for( my $i = 0; $i < length( $data ); $i+=2 ) {
396	0					0	my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] );
397	0					0	$sum_h_e += hex $msg_h[$i];
398	0					0	$sum_h_o += hex $msg_h[$i+1];
399	0					0	$sum_l_e += hex $msg_l[$i];
400	0					0	$sum_l_o += hex $msg_l[$i+1];
401	0	0				0	if( $check_char =~ /20 00/ ) {
402							# UTF-16LE space char detected
403	0					0	$utf16le_clues++;
404							}
405	0	0				0	if( $check_char =~ /00 20/ ) {
406							# UTF-16BE space char detected
407	0					0	$utf16be_clues++;
408							}
409							}
410
411							# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE
412	0	0				0	$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 );
413
414							# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE
415	0	0				0	$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) );
416
417	0	0				0	if( $utf16le_clues > $utf16be_clues ) {
		0
418	0					0	dbg( "message: detect_utf16: UTF-16LE" );
419	0					0	$decoder = Encode::find_encoding("UTF-16LE");
420							} elsif( $utf16le_clues > $utf16be_clues ) {
421	0					0	dbg( "message: detect_utf16: UTF-16BE" );
422	0					0	$decoder = Encode::find_encoding("UTF-16BE");
423							} else {
424	0					0	dbg( "message: detect_utf16: Could not detect UTF-16 endianness" );
425							}
426
427	0					0	return $decoder;
428							}
429
430							# Look at a text scalar and determine whether it should be rendered
431							# as text/html.
432							#
433							# This is not a public function.
434							#
435							sub _html_render {
436	105	50		105		783	if ($_[0] =~ m/^(.{0,18}?<(?:body\|head\|html\|img\|pre\|table\|title)(?:\s.{0,18}?)?>)/is)
437							{
438	0					0	my $pad = $1;
439	0					0	my $count = 0;
440	0					0	$count += ($pad =~ tr/\n//d) * 2;
441	0					0	$count += ($pad =~ tr/\n//cd);
442	0					0	return ($count < 24);
443							}
444	105					558	return 0;
445							}
446
447							# Decode character set of a given text to perl characters (Unicode),
448							# then encode into UTF-8 octets if requested.
449							#
450							sub _normalize {
451							# my $data = $_[0]; # avoid copying large strings
452	10			10		20	my $charset_declared = $_[1];
453	10					12	my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets
454
455	10	50				35	warn "message: _normalize() was given characters, expected bytes: $_[0]\n"
456							if utf8::is_utf8($_[0]);
457
458							# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879]
459	10					29	my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data
460
461	10	50	33			54	if (!defined $charset_declared \|\| $charset_declared eq '') {
462	0					0	$charset_declared = 'us-ascii';
463							}
464
465							# number of characters with code above 127
466	10					25	my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c;
467
468	10	50	33			74	if (!$cnt_8bits &&
469							$charset_declared =~
470							/^(?: (?:US-)?ASCII \| ANSI[_ ]? X3\.4- (?:1986\|1968) \|
471							ISO646-US )\z/xsi)
472							{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is
473	0					0	dbg("message: kept, charset is US-ASCII as declared");
474	0					0	return $_[0]; # is all-ASCII, no need for decoding
475							}
476
477	10	100	66			103	if (!$cnt_8bits &&
478							$charset_declared =~
479							/^(?: ISO[ -]?8859 (?: - \d{1,2} )? \| Windows-\d{4} \|
480							UTF-?8 \| (KOI8\|EUC)-[A-Z]{1,2} \|
481							Big5 \| GBK \| GB[ -]?18030 (?:-20\d\d)? )\z/xsi)
482							{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII
483	8					24	dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared);
484	8					36	return $_[0]; # is all-ASCII, no need for decoding
485							}
486
487							# Try first to strictly decode based on a declared character set.
488
489	2					5	my $rv;
490	2	50	33			29	if ($charset_declared =~ /^UTF-?8\z/i) {
		50
		50
		50
491							# attempt decoding as strict UTF-8 (flags: FB_CROAK \| LEAVE_SRC)
492	0	0				0	if (eval { $rv = $enc_utf8->decode($_[0], 1\|8); defined $rv }) {
	0					0
	0					0
493	0					0	dbg("message: decoded as declared charset UTF-8");
494	0	0				0	return $_[0] if !$return_decoded;
495	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
496	0					0	return $rv; # decoded
497							} else {
498	0					0	dbg("message: failed decoding as declared charset UTF-8");
499							};
500
501							} elsif ($charset_declared =~ /^UTF[ -]?16/i) {
502							# Handle cases where spammers use UTF-16 encoding without including a BOM
503							# or declaring endianness as reported at:
504							# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252
505
506	0					0	my $decoder = detect_utf16( $_[0] );
507	0	0				0	if (eval { $rv = $decoder->decode($_[0], 1\|8); defined $rv }) {
	0					0
	0					0
508	0					0	dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name);
509	0	0				0	return $_[0] if !$return_decoded;
510	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
511	0					0	return $rv; # decoded
512							} else {
513	0					0	dbg("message: failed decoding as declared charset $charset_declared");
514							};
515
516							} elsif ($cnt_8bits &&
517	0					0	eval { $rv = $enc_utf8->decode($_[0], 1\|8); defined $rv }) {
	0					0
518	0					0	dbg("message: decoded as charset UTF-8, declared %s", $charset_declared);
519	0	0				0	return $_[0] if !$return_decoded;
520	0					0	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
521	0					0	return $rv; # decoded
522
523							} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) {
524							# declared as US-ASCII but contains 8-bit characters, makes no sense
525							# to attempt decoding first as strict US-ASCII as we know it would fail
526
527							} else {
528							# try decoding as a declared character set
529
530							# -> http://en.wikipedia.org/wiki/Windows-1252
531							# Windows-1252 character encoding is a superset of ISO 8859-1, but differs
532							# from the IANA's ISO-8859-1 by using displayable characters rather than
533							# control characters in the 80 to 9F (hex) range. [...]
534							# It is very common to mislabel Windows-1252 text with the charset label
535							# ISO-8859-1. A common result was that all the quotes and apostrophes
536							# (produced by "smart quotes" in word-processing software) were replaced
537							# with question marks or boxes on non-Windows operating systems, making
538							# text difficult to read. Most modern web browsers and e-mail clients
539							# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate
540							# such mislabeling. This is now standard behavior in the draft HTML 5
541							# specification, which requires that documents advertised as ISO-8859-1
542							# actually be parsed with the Windows-1252 encoding.
543							#
544	2					7	my($chset, $decoder);
545	2	50				12	if ($charset_declared =~ /^(?: ISO-?8859-1 \| Windows-1252 \| CP1252 )\z/xi) {
546	0					0	$chset = 'Windows-1252'; $decoder = $enc_w1252;
	0					0
547							} else {
548	2					5	$chset = $charset_declared; $decoder = Encode::find_encoding($chset);
	2					31
549	2	50	33			17108	if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
550	0					0	$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
551	0	0				0	dbg("message: no decoder for a declared charset %s, using GBK",
552							$chset) if $decoder;
553							}
554							}
555	2	50				9	if (!$decoder) {
556	0					0	dbg("message: failed decoding, no decoder for a declared charset %s",
557							$chset);
558							} else {
559	2					3	eval { $rv = $decoder->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	2					14
560	2	50				254	if (lc $chset eq lc $charset_declared) {
561	2	50				14	dbg("message: %s as declared charset %s",
562							defined $rv ? 'decoded' : 'failed decoding', $charset_declared);
563							} else {
564	0	0				0	dbg("message: %s as charset %s, declared %s",
565							defined $rv ? 'decoded' : 'failed decoding',
566							$chset, $charset_declared);
567							}
568							}
569							}
570
571							# If the above failed, check if it is US-ASCII, possibly extended by few
572							# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing
573							# some popular punctuation or special characters from Windows-1252 in
574							# the \x80-\x9F range (which is unassigned in ISO-8859-*).
575							# Note that Windows-1252 is a proper superset of ISO-8859-1.
576							#
577	2	50	33			18	if (!defined $rv && !$cnt_8bits) {
		50	33
			0
578	0					0	dbg("message: kept, guessed charset is US-ASCII, declared %s",
579							$charset_declared);
580	0					0	return $_[0]; # is all-ASCII, no need for decoding
581
582							} elsif (!defined $rv && $enc_w1252 &&
583							# ASCII NBSP (c) SHY ' " ... '".- TM
584							$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c)
585							{ # ASCII + NBSP + SHY + some punctuation characters
586							# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too
587							# consider also: AE (r), 80 Euro
588	0					0	eval { $rv = $enc_w1252->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	0					0
589							# the above can't fail, but keep code general just in case
590	0	0				0	dbg("message: %s as guessed charset %s, declared %s",
591							defined $rv ? 'decoded' : 'failed decoding',
592							'Windows-1252', $charset_declared);
593							}
594
595							# If we were unsuccessful so far, try some guesswork
596							# based on Encode::Detect::Detector .
597
598	2	50				8	if (defined $rv) {
		0
599							# done, no need for guesswork
600							} elsif (!$have_encode_detector) {
601	0					0	dbg("message: Encode::Detect::Detector not available, declared %s failed",
602							$charset_declared);
603							} else {
604	0					0	my $charset_detected = Encode::Detect::Detector::detect($_[0]);
605	0	0	0			0	if ($charset_detected && lc $charset_detected ne lc $charset_declared) {
606	0					0	my $decoder = Encode::find_encoding($charset_detected);
607	0	0	0			0	if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) {
608	0					0	$decoder = Encode::find_encoding('GBK'); # a subset of GB18030
609	0	0				0	dbg("message: no decoder for a detected charset %s, using GBK",
610							$charset_detected) if $decoder;
611							}
612	0	0				0	if (!$decoder) {
613	0					0	dbg("message: failed decoding, no decoder for a detected charset %s",
614							$charset_detected);
615							} else {
616	0					0	eval { $rv = $decoder->decode($_[0], 1\|8) }; # FB_CROAK \| LEAVE_SRC
	0					0
617	0	0				0	dbg("message: %s as detected charset %s, declared %s",
618							defined $rv ? 'decoded' : 'failed decoding',
619							$charset_detected, $charset_declared);
620							}
621							}
622							}
623
624	2	50				8	if (!defined $rv) { # all decoding attempts failed so far, probably garbage
625							# go for Windows-1252 which can't fail
626	0					0	eval { $rv = $enc_w1252->decode($_[0]) };
	0					0
627	0	0				0	dbg("message: %s as last-resort charset %s, declared %s",
628							defined $rv ? 'decoded' : 'failed decoding',
629							'Windows-1252', $charset_declared);
630							}
631
632	2	50				7	if (!defined $rv) { # just in case - all decoding attempts failed so far
633	0					0	return $_[0]; # garbage-in / garbage-out, return unchanged octets
634							}
635							# decoding octets to characters was successful
636	2	50				7	if (!$return_decoded) {
637							# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg
638	2					7	utf8::encode($rv); # encode Unicode characters to UTF-8 octets
639							}
640	2					6	$rv .= $data_taint; # carry taintedness over, avoid Encode bug
641	2					17	return $rv;
642							}
643
644							=item rendered()
645
646							render_text() takes the given text/* type MIME part, and attempts to
647							render it into a text scalar. It will always render text/html, and will
648							use a heuristic to determine if other text/* parts should be considered
649							text/html. Two scalars are returned: the rendered type (either text/html
650							or whatever the original type was), and the rendered text.
651
652							=cut
653
654							sub rendered {
655	129			129	1	398	my ($self) = @_;
656
657	129	100				545	if (!exists $self->{rendered}) {
658							# We only know how to render text/plain and text/html ...
659							# Note: for bug 4843, make sure to skip text/calendar parts
660							# we also want to skip things like text/x-vcard
661							# text/x-aol is ignored here, but looks like text/html ...
662	113	50				1065	return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain\|html)$/i );
663
664	113					649	my $text = $self->decode; # QP and Base64 decoding, bytes
665	113					398	my $text_len = length($text); # num of bytes in original charset encoding
666
667							# render text/html always, or any other text\|text/plain part as text/html
668							# based on a heuristic which simulates a certain common mail client
669	113	100	66			2466	if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i \|\|
			100
670							($self->{'type'} =~ m{^text/plain$}i &&
671							_html_render(substr($text, 0, 23)))))
672							{
673	7					62	$self->{rendered_type} = 'text/html';
674
675							# will input text to HTML::Parser be provided as Unicode characters?
676	7					38	my $character_semantics = 0; # $text is in bytes
677	7	50	33			90	if ($self->{normalize} && $enc_utf8) { # charset decoding requested
		100	100
678							# Provide input to HTML::Parser as Unicode characters
679							# which avoids a HTML::Parser bug in utf8_mode
680							# https://rt.cpan.org/Public/Bug/Display.html?id=99755
681							# Note: the above bug was fixed in HTML-Parser 3.72, January 2016.
682							# Avoid unnecessary step of encoding-then-decoding by telling
683							# subroutine _normalize() to return Unicode text. See Bug 7133
684							#
685	0					0	$character_semantics = 1; # $text will be in characters
686	0					0	$text = _normalize($text, $self->{charset}, 1); # bytes to chars
687							} elsif (!defined $self->{charset} \|\|
688							$self->{charset} =~ /^(?:US-ASCII\|UTF-8)\z/i) {
689							# With some luck input can be interpreted as UTF-8, do not warn.
690							# It is still possible to hit the HTML::Parses utf8_mode bug however.
691							} else {
692							dbg("message: 'normalize_charset' is off, encoding will likely ".
693	1					6	"be misinterpreted; declared charset: %s", $self->{charset});
694							}
695							# the 0 requires decoded HTML results to be in bytes (not characters)
696	7					166	my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
697
698	7					74	$html->parse($text); # parse+render text
699
700							# resulting HTML-decoded text is in bytes, likely encoded as UTF-8
701	7					52	$self->{rendered} = $html->get_rendered_text();
702	7					31	$self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
703	7					38	$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
704	7					37	$self->{html_results} = $html->get_results();
705
706							# end-of-document result values that require looking at the text
707	7					19	my $r = $self->{html_results}; # temporary reference for brevity
708
709							# count the number of spaces in the rendered text (likely UTF-8 octets)
710	7					40	my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//;
711							# we may want to add the count of other Unicode whitespace characters
712
713	7					27	$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8)
714	7					24	$r->{non_space_len} = $r->{html_length} - $space;
715	7	50				174	$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len;
716							}
717
718							else { # plain text
719	106	0	33			412	if ($self->{normalize} && $enc_utf8) {
720							# request transcoded result as UTF-8 octets!
721	0					0	$text = _normalize($text, $self->{charset}, 0);
722							}
723	106					462	$self->{rendered_type} = $self->{type};
724	106					493	$self->{rendered} = $self->{'visible_rendered'} = $text;
725	106					434	$self->{'invisible_rendered'} = '';
726							}
727							}
728
729	129					665	return ($self->{rendered_type}, $self->{rendered});
730							}
731
732							=item set_rendered($text, $type)
733
734							Set the rendered text and type for the given part. If type is not
735							specified, and text is a defined value, a default of 'text/plain' is used.
736							This can be used, for instance, to render non-text parts using plugins.
737
738							=cut
739
740							sub set_rendered {
741	0			0	1	0	my ($self, $text, $type) = @_;
742
743	0	0	0			0	$type = 'text/plain' if (!defined $type && defined $text);
744
745	0					0	$self->{'rendered_type'} = $type;
746	0					0	$self->{'rendered'} = $self->{'visible_rendered'} = $text;
747	0	0				0	$self->{'invisible_rendered'} = defined $text ? '' : undef;
748							}
749
750							=item visible_rendered()
751
752							Render and return the visible text in this part.
753
754							=cut
755
756							sub visible_rendered {
757	8			8	1	30	my ($self) = @_;
758	8					78	$self->rendered(); # ignore return, we want just this:
759	8					39	return ($self->{rendered_type}, $self->{visible_rendered});
760							}
761
762							=item invisible_rendered()
763
764							Render and return the invisible text in this part.
765
766							=cut
767
768							sub invisible_rendered {
769	8			8	1	23	my ($self) = @_;
770	8					44	$self->rendered(); # ignore return, we want just this:
771	8					23	return ($self->{rendered_type}, $self->{invisible_rendered});
772							}
773
774							=item content_summary()
775
776							Returns an array of scalars describing the mime parts of the message.
777							Note: This function requires that the message be parsed first!
778
779							=cut
780
781							# return an array with scalars describing mime parts
782							sub content_summary {
783	13			13	1	203	my($self) = @_;
784
785	13					49	my @ret = ( [ $self->{'type'} ] );
786	13					17	my @search;
787
788	13	100				32	if (exists $self->{'body_parts'}) {
789	11					15	my $count = @{$self->{'body_parts'}};
	11					22
790	11					28	for(my $i=0; $i<$count; $i++) {
791	24					62	push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]);
792							}
793							}
794
795	13					33	while(my $part = shift @search) {
796	45					40	my($index, $part) = @{$part};
	45					57
797	45					41	push(@{$ret[$index]}, $part->{'type'});
	45					70
798	45	100				94	if (exists $part->{'body_parts'}) {
799	11					13	unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}});
	21					44
	11					20
800							}
801							}
802
803	13					22	return map { join(",", @{$_}) } @ret;
	37					39
	37					104
804							}
805
806							=item delete_header()
807
808							Delete the specified header (decoded and raw) from the Node information.
809
810							=cut
811
812							sub delete_header {
813	505			505	1	1057	my($self, $hdr) = @_;
814
815	505					692	foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) {
	505					7528
816	0					0	delete $self->{'headers'}->{$_};
817	0					0	delete $self->{'raw_headers'}->{$_};
818							}
819
820	505					1167	my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}});
	505					5474
821	505					1783	$self->{'header_order'} = \@neworder;
822							}
823
824							# decode a header appropriately. don't bother adding it to the pod documents.
825							sub __decode_header {
826	10			10		30	my ( $encoding, $cte, $data ) = @_;
827
828	10	100				27	if ( $cte eq 'B' ) {
		50
829							# base 64 encoded
830	2					14	$data = Mail::SpamAssassin::Util::base64_decode($data);
831							}
832							elsif ( $cte eq 'Q' ) {
833							# quoted printable
834
835							# the RFC states that in the encoded text, "_" is equal to "=20"
836	8					16	$data =~ s/_/=20/g;
837
838	8					17	$data = Mail::SpamAssassin::Util::qp_decode($data);
839							}
840							else {
841							# not possible since the input has already been limited to 'B' and 'Q'
842	0					0	die "message: unknown encoding type '$cte' in RFC2047 header";
843							}
844	10					53	return _normalize($data, $encoding, 0); # transcode to UTF-8 octets
845							}
846
847							# Decode base64 and quoted-printable in headers according to RFC2047.
848							#
849							sub _decode_header {
850	800			800		1667	my($header_field_body, $header_field_name) = @_;
851
852	800	100	66			3151	return '' unless defined $header_field_body && $header_field_body ne '';
853
854							# deal with folding and cream the newlines and such
855	750					1322	$header_field_body =~ s/\n[ \t]+/\n /g;
856	750					1044	$header_field_body =~ s/\015?\012//gs;
857
858	750	100				3543	if ($header_field_name =~
859							/^ (?: Received \| (?:Resent-)? (?: Message-ID \| Date ) \|
860							MIME-Version \| References \| In-Reply-To \| List-.* ) \z /xsi ) {
861							# Bug 6945: some header fields must not be processed for MIME encoding
862							# Bug 7466: leave out the Content-*
863
864							} else {
865	453					1420	local($1,$2,$3);
866
867							# Multiple encoded sections must ignore the interim whitespace.
868							# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
869							# separated by whitespace.
870	453					1063	1 while $header_field_body =~
871							s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+
872							( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) }
873							{$1$2}xsg;
874
875							# transcode properly encoded RFC 2047 substrings into UTF-8 octets,
876							# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532)
877	453					1055	# or plain US-ASCII
878	10					38	$header_field_body =~
879							s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) }
880							{ __decode_header($1, uc($2), $3) }xsge;
881							}
882	750					1596
883							# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body);
884							return $header_field_body;
885							}
886
887							=item get_header()
888
889							Retrieve a specific header. Will have a newline at the end and will be
890							unfolded. The first parameter is the header name (case-insensitive),
891							and the second parameter (optional) is whether or not to return the
892							raw header.
893
894							If get_header() is called in an array context, an array will be returned
895							with each header entry in a different element. In a scalar context,
896							the last specific header is returned.
897
898							ie: If 'Subject' is specified as the header, and there are 2 Subject
899							headers in a message, the last/bottom one in the message is returned in
900							scalar context or both are returned in array context.
901
902							Btw, returning the last header field (not the first) happens to be consistent
903							with DKIM signatures, which search for and cover multiple header fields
904							bottom-up according to the 'h' tag. Let's keep it this way.
905
906							=cut
907	4075			4075	1	7161
908	4075		50			13154	sub get_header {
909							my ($self, $hdr, $raw) = @_;
910							$raw \|\|= 0;
911
912							# And now pick up all the entries into a list
913							# This is assumed to include a newline at the end ...
914							# This is also assumed to have removed continuation bits ...
915	4075					4715
916	4075	50				6280	# Deal with the possibility that header() or raw_header() returns undef
917	0	0				0	my @hdrs;
918	0					0	if ( $raw ) {
919							if (@hdrs = $self->raw_header($hdr)) {
920							s/\015?\012\s+/ /gs for @hdrs;
921							}
922	4075	100				7554	}
923	658					2285	else {
924							if (@hdrs = $self->header($hdr)) {
925							$_ .= "\n" for @hdrs;
926							}
927	4075	100				6151	}
928	3586					8295
929							if (wantarray) {
930							return @hdrs;
931	489	100				1838	}
932							else {
933							return @hdrs ? $hdrs[-1] : undef;
934							}
935							}
936
937							=item get_all_headers()
938
939							Retrieve all headers. Each header will have a newline at the end and
940							will be unfolded. The first parameter (optional) is whether or not to
941							return the raw headers, and the second parameter (optional) is whether
942							or not to include the mbox separator.
943
944							If get_all_header() is called in an array context, an array will be
945							returned with each header entry in a different element. In a scalar
946							context, the headers are returned in a single scalar.
947
948							=back
949
950							=cut
951
952	26			26	1	66	# build it and it will not bomb
953	26		50			155	sub get_all_headers {
954	26		100			85	my ($self, $raw, $include_mbox) = @_;
955							$raw \|\|= 0;
956	26					41	$include_mbox \|\|= 0;
957
958							my @lines;
959	26					41
960	26					37	# precalculate destination positions based on order of appearance
961	26					39	my $i = 0;
	26					108
962	332					365	my %locations;
	332					804
963							for my $k (@{$self->{header_order}}) {
964							push(@{$locations{lc($k)}}, $i++);
965							}
966	26					42
967	26					45	# process headers in order of first appearance
968	26					215	my $header;
	572					784
969							my $size = 0;
970							HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] }
971							keys %locations)
972	250					339	{
973	250					428	# get all same-name headers and poke into correct position
974	332					375	my $positions = $locations{$name};
	332					409
975	332					451	for my $contents ($self->get_header($name, $raw)) {
976	332	50				552	my $position = shift @{$positions};
977	0					0	$size += length($name) + length($contents) + 2;
978	0					0	if ($size > MAX_HEADER_LENGTH) {
979							$self->{'truncated_header'} = 1;
980	332					878	last HEADER;
981							}
982							$lines[$position] = $self->{header_order}->[$position].":".$contents;
983							}
984							}
985	26	50				103
	0					0
986							# skip undefined lines if we truncated
987	26	50	66			141	@lines = grep { defined $_ } @lines if $self->{'truncated_header'};
988
989	26	100				327	splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} );
990
991							return wantarray ? @lines : join ('', @lines);
992							}
993				0	0
994							# legacy public API; now a no-op.
995							sub finish { }
996
997							# ---------------------------------------------------------------------------
998
999							1;
1000							__END__