| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# <@LICENSE> |
|
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
|
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
|
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
|
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
|
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
|
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
|
15
|
|
|
|
|
|
|
# limitations under the License. |
|
16
|
|
|
|
|
|
|
# </@LICENSE> |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This module will encapsulate an email message and allow access to |
|
25
|
|
|
|
|
|
|
the various MIME message parts. |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head1 PUBLIC METHODS |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=over 4 |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
use strict; |
|
35
|
41
|
|
|
41
|
|
253
|
use warnings; |
|
|
41
|
|
|
|
|
72
|
|
|
|
41
|
|
|
|
|
1195
|
|
|
36
|
41
|
|
|
41
|
|
219
|
use re 'taint'; |
|
|
41
|
|
|
|
|
77
|
|
|
|
41
|
|
|
|
|
1325
|
|
|
37
|
41
|
|
|
41
|
|
228
|
|
|
|
41
|
|
|
|
|
94
|
|
|
|
41
|
|
|
|
|
1607
|
|
|
38
|
|
|
|
|
|
|
require 5.008001; # needs utf8::is_utf8() |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
use Mail::SpamAssassin; |
|
41
|
41
|
|
|
41
|
|
237
|
use Mail::SpamAssassin::Constants qw(:sa); |
|
|
41
|
|
|
|
|
74
|
|
|
|
41
|
|
|
|
|
1047
|
|
|
42
|
41
|
|
|
41
|
|
212
|
use Mail::SpamAssassin::HTML; |
|
|
41
|
|
|
|
|
77
|
|
|
|
41
|
|
|
|
|
6042
|
|
|
43
|
41
|
|
|
41
|
|
11179
|
use Mail::SpamAssassin::Logger; |
|
|
41
|
|
|
|
|
140
|
|
|
|
41
|
|
|
|
|
1490
|
|
|
44
|
41
|
|
|
41
|
|
248
|
|
|
|
41
|
|
|
|
|
73
|
|
|
|
41
|
|
|
|
|
6026
|
|
|
45
|
|
|
|
|
|
|
our($enc_utf8, $enc_w1252, $have_encode_detector); |
|
46
|
|
|
|
|
|
|
BEGIN { |
|
47
|
|
|
|
|
|
|
eval { require Encode } |
|
48
|
41
|
|
|
|
|
20823
|
and do { $enc_utf8 = Encode::find_encoding('UTF-8'); |
|
49
|
41
|
50
|
|
41
|
|
124
|
$enc_w1252 = Encode::find_encoding('Windows-1252') }; |
|
|
41
|
|
|
|
|
340379
|
|
|
50
|
41
|
|
|
|
|
5746
|
eval { require Encode::Detect::Detector } |
|
51
|
41
|
|
|
|
|
190808
|
and do { $have_encode_detector = 1 }; |
|
52
|
41
|
50
|
|
|
|
12972
|
}; |
|
|
0
|
|
|
|
|
0
|
|
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item new() |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Generates an empty Node object and returns it. Typically only called |
|
57
|
|
|
|
|
|
|
by functions in Message. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
my $class = shift; |
|
62
|
|
|
|
|
|
|
$class = ref($class) || $class; |
|
63
|
196
|
|
|
196
|
1
|
438
|
|
|
64
|
196
|
|
33
|
|
|
802
|
my $self = { |
|
65
|
|
|
|
|
|
|
headers => {}, |
|
66
|
196
|
|
|
|
|
1136
|
raw_headers => {}, |
|
67
|
|
|
|
|
|
|
header_order => [] |
|
68
|
|
|
|
|
|
|
}; |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# deal with any parameters |
|
71
|
|
|
|
|
|
|
my($opts) = @_; |
|
72
|
|
|
|
|
|
|
$self->{normalize} = $opts->{'normalize'} || 0; |
|
73
|
196
|
|
|
|
|
438
|
|
|
74
|
196
|
|
50
|
|
|
928
|
bless($self,$class); |
|
75
|
|
|
|
|
|
|
$self; |
|
76
|
196
|
|
|
|
|
544
|
} |
|
77
|
196
|
|
|
|
|
534
|
|
|
78
|
|
|
|
|
|
|
=item find_parts() |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Used to search the tree for specific MIME parts. An array of matching |
|
81
|
|
|
|
|
|
|
Node objects (pointers into the tree) is returned. The parameters that |
|
82
|
|
|
|
|
|
|
can be passed in are (in order, all scalars): |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Regexp - Used to match against each part's Content-Type header, |
|
85
|
|
|
|
|
|
|
specifically the type and not the rest of the header. ie: "Content-type: |
|
86
|
|
|
|
|
|
|
text/html; encoding=quoted-printable" has a type of "text/html". If no |
|
87
|
|
|
|
|
|
|
regexp is specified, find_parts() will return an empty array. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Only_leaves - By default, find_parts() will return any part that matches |
|
90
|
|
|
|
|
|
|
the regexp, including multipart. If you only want to see leaves of the |
|
91
|
|
|
|
|
|
|
tree (ie: parts that aren't multipart), set this to true (1). |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Recursive - By default, when find_parts() finds a multipart which has |
|
94
|
|
|
|
|
|
|
parts underneath it, it will recurse through all sub-children. If set to 0, |
|
95
|
|
|
|
|
|
|
only look at the part and any direct children of the part. |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Used to find any MIME parts whose simple content-type matches a given regexp |
|
100
|
|
|
|
|
|
|
# Searches it's own and any children parts. Returns an array of MIME |
|
101
|
|
|
|
|
|
|
# objects which match. Our callers may expect the default behavior which is a |
|
102
|
|
|
|
|
|
|
# depth-first array of parts. |
|
103
|
|
|
|
|
|
|
# |
|
104
|
|
|
|
|
|
|
my ($self, $re, $onlyleaves, $recursive) = @_; |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Didn't pass an RE? Just abort. |
|
107
|
256
|
|
|
256
|
1
|
649
|
return () unless defined $re && $re ne ''; |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
$onlyleaves = 0 unless defined $onlyleaves; |
|
110
|
256
|
50
|
33
|
|
|
1363
|
|
|
111
|
|
|
|
|
|
|
my $depth; |
|
112
|
256
|
50
|
|
|
|
558
|
if (defined $recursive && $recursive == 0) { |
|
113
|
|
|
|
|
|
|
$depth = 1; |
|
114
|
256
|
|
|
|
|
420
|
} |
|
115
|
256
|
50
|
33
|
|
|
637
|
|
|
116
|
0
|
|
|
|
|
0
|
my @ret; |
|
117
|
|
|
|
|
|
|
my @search = ( $self ); |
|
118
|
|
|
|
|
|
|
|
|
119
|
256
|
|
|
|
|
390
|
while (my $part = shift @search) { |
|
120
|
256
|
|
|
|
|
541
|
# If this object matches, mark it for return. |
|
121
|
|
|
|
|
|
|
my $amialeaf = $part->is_leaf(); |
|
122
|
256
|
|
|
|
|
676
|
|
|
123
|
|
|
|
|
|
|
if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) { |
|
124
|
290
|
|
|
|
|
897
|
push(@ret, $part); |
|
125
|
|
|
|
|
|
|
} |
|
126
|
290
|
100
|
66
|
|
|
2909
|
|
|
|
|
|
100
|
|
|
|
|
|
127
|
271
|
|
|
|
|
517
|
if ( !$amialeaf && (!defined $depth || $depth > 0)) { |
|
128
|
|
|
|
|
|
|
$depth-- if defined $depth; |
|
129
|
|
|
|
|
|
|
unshift(@search, @{$part->{'body_parts'}}); |
|
130
|
290
|
50
|
33
|
|
|
1059
|
} |
|
|
|
|
66
|
|
|
|
|
|
131
|
19
|
50
|
|
|
|
41
|
} |
|
132
|
19
|
|
|
|
|
25
|
|
|
|
19
|
|
|
|
|
64
|
|
|
133
|
|
|
|
|
|
|
return @ret; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
|
|
|
|
|
|
|
|
136
|
256
|
|
|
|
|
995
|
=item header() |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Stores and retrieves headers from a specific MIME part. The first |
|
139
|
|
|
|
|
|
|
parameter is the header name. If there is no other parameter, the header |
|
140
|
|
|
|
|
|
|
is retrieved. If there is a second parameter, the header is stored. |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Header names are case-insensitive and are stored in both raw and |
|
143
|
|
|
|
|
|
|
decoded form. Using header(), only the decoded form is retrievable. |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
For retrieval, if header() is called in an array context, an array will |
|
146
|
|
|
|
|
|
|
be returned with each header entry in a different element. In a scalar |
|
147
|
|
|
|
|
|
|
context, the last specific header is returned. |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
|
150
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
|
151
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Store or retrieve headers from a given MIME object |
|
156
|
|
|
|
|
|
|
# |
|
157
|
|
|
|
|
|
|
my $self = shift; |
|
158
|
|
|
|
|
|
|
my $rawkey = shift; |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
return unless defined $rawkey; |
|
161
|
5663
|
|
|
5663
|
1
|
6921
|
|
|
162
|
5663
|
|
|
|
|
7399
|
# we're going to do things case insensitively |
|
163
|
|
|
|
|
|
|
my $key = lc($rawkey); |
|
164
|
5663
|
50
|
|
|
|
11217
|
|
|
165
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
|
166
|
|
|
|
|
|
|
$key =~ s/^\s+//; |
|
167
|
5663
|
|
|
|
|
10021
|
$key =~ s/\s+$//; |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
if (@_) { |
|
170
|
5663
|
|
|
|
|
11025
|
my $raw_value = shift; |
|
171
|
5663
|
|
|
|
|
8979
|
return unless defined $raw_value; |
|
172
|
|
|
|
|
|
|
|
|
173
|
5663
|
100
|
|
|
|
9572
|
push @{ $self->{'header_order'} }, $rawkey; |
|
174
|
909
|
|
|
|
|
1578
|
if ( !exists $self->{'headers'}->{$key} ) { |
|
175
|
909
|
50
|
|
|
|
1797
|
$self->{'headers'}->{$key} = []; |
|
176
|
|
|
|
|
|
|
$self->{'raw_headers'}->{$key} = []; |
|
177
|
909
|
|
|
|
|
1123
|
} |
|
|
909
|
|
|
|
|
2059
|
|
|
178
|
909
|
100
|
|
|
|
2317
|
|
|
179
|
835
|
|
|
|
|
2315
|
my $dec_value = $raw_value; |
|
180
|
835
|
|
|
|
|
1764
|
$dec_value =~ s/\n[ \t]+/ /gs; |
|
181
|
|
|
|
|
|
|
$dec_value =~ s/\s+$//s; |
|
182
|
|
|
|
|
|
|
$dec_value =~ s/^\s+//s; |
|
183
|
909
|
|
|
|
|
1494
|
push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key); |
|
184
|
909
|
|
|
|
|
2282
|
|
|
185
|
909
|
|
|
|
|
3755
|
push @{ $self->{'raw_headers'}->{$key} }, $raw_value; |
|
186
|
909
|
|
|
|
|
2371
|
|
|
187
|
909
|
|
|
|
|
1258
|
return $self->{'headers'}->{$key}->[-1]; |
|
|
909
|
|
|
|
|
2431
|
|
|
188
|
|
|
|
|
|
|
} |
|
189
|
909
|
|
|
|
|
1390
|
|
|
|
909
|
|
|
|
|
2051
|
|
|
190
|
|
|
|
|
|
|
if (wantarray) { |
|
191
|
909
|
|
|
|
|
4000
|
return unless exists $self->{'headers'}->{$key}; |
|
192
|
|
|
|
|
|
|
return @{ $self->{'headers'}->{$key} }; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
4754
|
100
|
|
|
|
7358
|
else { |
|
195
|
4441
|
100
|
|
|
|
14736
|
return '' unless exists $self->{'headers'}->{$key}; |
|
196
|
1002
|
|
|
|
|
1210
|
return $self->{'headers'}->{$key}->[-1]; |
|
|
1002
|
|
|
|
|
5415
|
|
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
} |
|
199
|
313
|
100
|
|
|
|
2295
|
|
|
200
|
48
|
|
|
|
|
318
|
=item raw_header() |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Retrieves the raw version of headers from a specific MIME part. The only |
|
203
|
|
|
|
|
|
|
parameter is the header name. Header names are case-insensitive. |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
For retrieval, if raw_header() is called in an array context, an array |
|
206
|
|
|
|
|
|
|
will be returned with each header entry in a different element. In a |
|
207
|
|
|
|
|
|
|
scalar context, the last specific header is returned. |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
|
210
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
|
211
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=cut |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Retrieve raw headers from a given MIME object |
|
216
|
|
|
|
|
|
|
# |
|
217
|
|
|
|
|
|
|
my $self = shift; |
|
218
|
|
|
|
|
|
|
my $key = lc(shift); |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
|
221
|
|
|
|
|
|
|
$key =~ s/^\s+//; |
|
222
|
633
|
|
|
633
|
1
|
855
|
$key =~ s/\s+$//; |
|
223
|
633
|
|
|
|
|
992
|
|
|
224
|
|
|
|
|
|
|
if (wantarray) { |
|
225
|
|
|
|
|
|
|
return unless exists $self->{'raw_headers'}->{$key}; |
|
226
|
633
|
|
|
|
|
1248
|
return @{ $self->{'raw_headers'}->{$key} }; |
|
227
|
633
|
|
|
|
|
1015
|
} |
|
228
|
|
|
|
|
|
|
else { |
|
229
|
633
|
50
|
|
|
|
1133
|
return '' unless exists $self->{'raw_headers'}->{$key}; |
|
230
|
633
|
100
|
|
|
|
1948
|
return $self->{'raw_headers'}->{$key}->[-1]; |
|
231
|
139
|
|
|
|
|
237
|
} |
|
|
139
|
|
|
|
|
807
|
|
|
232
|
|
|
|
|
|
|
} |
|
233
|
|
|
|
|
|
|
|
|
234
|
0
|
0
|
|
|
|
0
|
=item add_body_part() |
|
235
|
0
|
|
|
|
|
0
|
|
|
236
|
|
|
|
|
|
|
Adds a Node child object to the current node object. |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=cut |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Add a MIME child part to ourselves |
|
241
|
|
|
|
|
|
|
my($self, $part) = @_; |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
dbg("message: added part, type: ".$part->{'type'}); |
|
244
|
|
|
|
|
|
|
push @{ $self->{'body_parts'} }, $part; |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
|
|
247
|
50
|
|
|
50
|
1
|
93
|
=item is_leaf() |
|
248
|
|
|
|
|
|
|
|
|
249
|
50
|
|
|
|
|
164
|
Returns true if the tree node in question is a leaf of the tree (ie: |
|
250
|
50
|
|
|
|
|
58
|
has no children of its own). Note: This function may return odd results |
|
|
50
|
|
|
|
|
116
|
|
|
251
|
|
|
|
|
|
|
unless the message has been mime parsed via _do_parse()! |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=cut |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
my($self) = @_; |
|
256
|
|
|
|
|
|
|
return !exists $self->{'body_parts'}; |
|
257
|
|
|
|
|
|
|
} |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=item raw() |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Return a reference to the raw array. Treat this as READ ONLY. |
|
262
|
290
|
|
|
290
|
1
|
509
|
|
|
263
|
290
|
|
|
|
|
787
|
=cut |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
my $self = shift; |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# Ok, if we're called we are expected to return an array. |
|
268
|
|
|
|
|
|
|
# so if it's a file reference, read in the message into an array... |
|
269
|
|
|
|
|
|
|
# |
|
270
|
|
|
|
|
|
|
# NOTE: that "ref undef" works, so don't bother checking for a defined var |
|
271
|
|
|
|
|
|
|
# first. |
|
272
|
|
|
|
|
|
|
if (ref $self->{'raw'} eq 'GLOB') { |
|
273
|
0
|
|
|
0
|
1
|
0
|
my $fd = $self->{'raw'}; |
|
274
|
|
|
|
|
|
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# dbg("message: (raw) reading mime part from a temporary file"); |
|
277
|
|
|
|
|
|
|
my($nread,$raw_str); $raw_str = ''; |
|
278
|
|
|
|
|
|
|
while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { } |
|
279
|
|
|
|
|
|
|
defined $nread or die "error reading: $!"; |
|
280
|
0
|
0
|
|
|
|
0
|
my @array = split(/^/m, $raw_str, -1); |
|
281
|
0
|
|
|
|
|
0
|
|
|
282
|
0
|
0
|
|
|
|
0
|
dbg("message: empty message read") if $raw_str eq ''; |
|
283
|
|
|
|
|
|
|
return \@array; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
286
|
0
|
|
|
|
|
0
|
return $self->{'raw'}; |
|
287
|
0
|
0
|
|
|
|
0
|
} |
|
288
|
0
|
|
|
|
|
0
|
|
|
289
|
|
|
|
|
|
|
=item decode() |
|
290
|
0
|
0
|
|
|
|
0
|
|
|
291
|
0
|
|
|
|
|
0
|
If necessary, decode the part text as base64 or quoted-printable. |
|
292
|
|
|
|
|
|
|
The decoded text will be returned as a scalar string. An optional length |
|
293
|
|
|
|
|
|
|
parameter can be passed in which limits how much decoded data is returned. |
|
294
|
0
|
|
|
|
|
0
|
If the scalar isn't needed, call with "0" as a parameter. |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=cut |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
my($self, $bytes) = @_; |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
if ( !exists $self->{'decoded'} ) { |
|
301
|
|
|
|
|
|
|
# Someone is looking for a decoded part where there is no raw data |
|
302
|
|
|
|
|
|
|
# (multipart or subparsed message, etc.) Just return undef. |
|
303
|
|
|
|
|
|
|
return if !exists $self->{'raw'}; |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
my $raw; |
|
306
|
|
|
|
|
|
|
|
|
307
|
254
|
|
|
254
|
1
|
3692
|
# if the part is held in a temp file, read it into the scalar |
|
308
|
|
|
|
|
|
|
if (ref $self->{'raw'} eq 'GLOB') { |
|
309
|
254
|
100
|
|
|
|
678
|
my $fd = $self->{'raw'}; |
|
310
|
|
|
|
|
|
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
|
311
|
|
|
|
|
|
|
|
|
312
|
146
|
50
|
|
|
|
414
|
# dbg("message: (decode) reading mime part from a temporary file"); |
|
313
|
|
|
|
|
|
|
my($nread,$raw_str); $raw = ''; |
|
314
|
146
|
|
|
|
|
234
|
while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { } |
|
315
|
|
|
|
|
|
|
defined $nread or die "error reading: $!"; |
|
316
|
|
|
|
|
|
|
|
|
317
|
146
|
100
|
|
|
|
473
|
dbg("message: empty message read from a temp file") if $raw eq ''; |
|
318
|
4
|
|
|
|
|
5
|
} |
|
319
|
4
|
50
|
|
|
|
36
|
else { |
|
320
|
|
|
|
|
|
|
# create a new scalar from the raw array in memory |
|
321
|
|
|
|
|
|
|
$raw = join('', @{$self->{'raw'}}); |
|
322
|
4
|
|
|
|
|
9
|
} |
|
|
4
|
|
|
|
|
5
|
|
|
323
|
4
|
|
|
|
|
234
|
|
|
324
|
4
|
50
|
|
|
|
16
|
my $encoding = lc $self->header('content-transfer-encoding') || ''; |
|
325
|
|
|
|
|
|
|
|
|
326
|
4
|
50
|
|
|
|
16
|
if ( $encoding eq 'quoted-printable' ) { |
|
327
|
|
|
|
|
|
|
dbg("message: decoding quoted-printable"); |
|
328
|
|
|
|
|
|
|
$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw); |
|
329
|
|
|
|
|
|
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
|
330
|
142
|
|
|
|
|
260
|
} |
|
|
142
|
|
|
|
|
1450
|
|
|
331
|
|
|
|
|
|
|
elsif ( $encoding eq 'base64' ) { |
|
332
|
|
|
|
|
|
|
dbg("message: decoding base64"); |
|
333
|
146
|
|
100
|
|
|
559
|
|
|
334
|
|
|
|
|
|
|
# if it's not defined or is 0, do the whole thing, otherwise only decode |
|
335
|
146
|
100
|
|
|
|
711
|
# a portion |
|
|
|
100
|
|
|
|
|
|
|
336
|
6
|
|
|
|
|
25
|
if ($bytes) { |
|
337
|
6
|
|
|
|
|
39
|
return Mail::SpamAssassin::Util::base64_decode($raw, $bytes); |
|
338
|
6
|
|
|
|
|
36
|
} |
|
339
|
|
|
|
|
|
|
else { |
|
340
|
|
|
|
|
|
|
# Generate the decoded output |
|
341
|
5
|
|
|
|
|
17
|
$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw); |
|
342
|
|
|
|
|
|
|
} |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) { |
|
345
|
5
|
50
|
|
|
|
7
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
|
346
|
0
|
|
|
|
|
0
|
} |
|
347
|
|
|
|
|
|
|
} |
|
348
|
|
|
|
|
|
|
else { |
|
349
|
|
|
|
|
|
|
# Encoding is one of 7bit, 8bit, binary or x-something |
|
350
|
5
|
|
|
|
|
19
|
if ( $encoding ) { |
|
351
|
|
|
|
|
|
|
dbg("message: decoding other encoding type ($encoding), ignoring"); |
|
352
|
|
|
|
|
|
|
} |
|
353
|
5
|
100
|
|
|
|
26
|
else { |
|
354
|
1
|
|
|
|
|
21
|
dbg("message: no encoding detected"); |
|
355
|
|
|
|
|
|
|
} |
|
356
|
|
|
|
|
|
|
$self->{'decoded'} = $raw; |
|
357
|
|
|
|
|
|
|
} |
|
358
|
|
|
|
|
|
|
} |
|
359
|
135
|
100
|
|
|
|
316
|
|
|
360
|
33
|
|
|
|
|
197
|
if ( !defined $bytes || $bytes ) { |
|
361
|
|
|
|
|
|
|
if ( !defined $bytes ) { |
|
362
|
|
|
|
|
|
|
# force a copy |
|
363
|
102
|
|
|
|
|
351
|
return '' . $self->{'decoded'}; |
|
364
|
|
|
|
|
|
|
} |
|
365
|
135
|
|
|
|
|
727
|
else { |
|
366
|
|
|
|
|
|
|
return substr($self->{'decoded'}, 0, $bytes); |
|
367
|
|
|
|
|
|
|
} |
|
368
|
|
|
|
|
|
|
} |
|
369
|
254
|
100
|
66
|
|
|
1244
|
} |
|
370
|
250
|
50
|
|
|
|
599
|
|
|
371
|
|
|
|
|
|
|
# Detect endianness of UTF-16 encoded data |
|
372
|
250
|
|
|
|
|
2079
|
my $data = $_[0]; # could not avoid copying large strings |
|
373
|
|
|
|
|
|
|
my $utf16le_clues = 0; |
|
374
|
|
|
|
|
|
|
my $utf16be_clues = 0; |
|
375
|
0
|
|
|
|
|
0
|
my $sum_h_e = 0; |
|
376
|
|
|
|
|
|
|
my $sum_h_o = 0; |
|
377
|
|
|
|
|
|
|
my $sum_l_e = 0; |
|
378
|
|
|
|
|
|
|
my $sum_l_o = 0; |
|
379
|
|
|
|
|
|
|
my $decoder = undef; |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
my @msg_h = unpack 'H' x length( $data ), $data; |
|
382
|
0
|
|
|
0
|
0
|
0
|
my @msg_l = unpack 'h' x length( $data ), $data; |
|
383
|
0
|
|
|
|
|
0
|
|
|
384
|
0
|
|
|
|
|
0
|
for( my $i = 0; $i < length( $data ); $i+=2 ) { |
|
385
|
0
|
|
|
|
|
0
|
my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] ); |
|
386
|
0
|
|
|
|
|
0
|
$sum_h_e += hex $msg_h[$i]; |
|
387
|
0
|
|
|
|
|
0
|
$sum_h_o += hex $msg_h[$i+1]; |
|
388
|
0
|
|
|
|
|
0
|
$sum_l_e += hex $msg_l[$i]; |
|
389
|
0
|
|
|
|
|
0
|
$sum_l_o += hex $msg_l[$i+1]; |
|
390
|
|
|
|
|
|
|
if( $check_char =~ /20 00/ ) { |
|
391
|
0
|
|
|
|
|
0
|
# UTF-16LE space char detected |
|
392
|
0
|
|
|
|
|
0
|
$utf16le_clues++; |
|
393
|
|
|
|
|
|
|
} |
|
394
|
0
|
|
|
|
|
0
|
if( $check_char =~ /00 20/ ) { |
|
395
|
0
|
|
|
|
|
0
|
# UTF-16BE space char detected |
|
396
|
0
|
|
|
|
|
0
|
$utf16be_clues++; |
|
397
|
0
|
|
|
|
|
0
|
} |
|
398
|
0
|
|
|
|
|
0
|
} |
|
399
|
0
|
|
|
|
|
0
|
|
|
400
|
0
|
0
|
|
|
|
0
|
# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE |
|
401
|
|
|
|
|
|
|
$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 ); |
|
402
|
0
|
|
|
|
|
0
|
|
|
403
|
|
|
|
|
|
|
# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE |
|
404
|
0
|
0
|
|
|
|
0
|
$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) ); |
|
405
|
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
0
|
if( $utf16le_clues > $utf16be_clues ) { |
|
407
|
|
|
|
|
|
|
dbg( "message: detect_utf16: UTF-16LE" ); |
|
408
|
|
|
|
|
|
|
$decoder = Encode::find_encoding("UTF-16LE"); |
|
409
|
|
|
|
|
|
|
} elsif( $utf16le_clues > $utf16be_clues ) { |
|
410
|
|
|
|
|
|
|
dbg( "message: detect_utf16: UTF-16BE" ); |
|
411
|
0
|
0
|
|
|
|
0
|
$decoder = Encode::find_encoding("UTF-16BE"); |
|
412
|
|
|
|
|
|
|
} else { |
|
413
|
|
|
|
|
|
|
dbg( "message: detect_utf16: Could not detect UTF-16 endianness" ); |
|
414
|
0
|
0
|
|
|
|
0
|
} |
|
415
|
|
|
|
|
|
|
|
|
416
|
0
|
0
|
|
|
|
0
|
return $decoder; |
|
|
|
0
|
|
|
|
|
|
|
417
|
0
|
|
|
|
|
0
|
} |
|
418
|
0
|
|
|
|
|
0
|
|
|
419
|
|
|
|
|
|
|
# Look at a text scalar and determine whether it should be rendered |
|
420
|
0
|
|
|
|
|
0
|
# as text/html. |
|
421
|
0
|
|
|
|
|
0
|
# |
|
422
|
|
|
|
|
|
|
# This is not a public function. |
|
423
|
0
|
|
|
|
|
0
|
# |
|
424
|
|
|
|
|
|
|
if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is) |
|
425
|
|
|
|
|
|
|
{ |
|
426
|
0
|
|
|
|
|
0
|
my $pad = $1; |
|
427
|
|
|
|
|
|
|
my $count = 0; |
|
428
|
|
|
|
|
|
|
$count += ($pad =~ tr/\n//d) * 2; |
|
429
|
|
|
|
|
|
|
$count += ($pad =~ tr/\n//cd); |
|
430
|
|
|
|
|
|
|
return ($count < 24); |
|
431
|
|
|
|
|
|
|
} |
|
432
|
|
|
|
|
|
|
return 0; |
|
433
|
|
|
|
|
|
|
} |
|
434
|
|
|
|
|
|
|
|
|
435
|
0
|
0
|
|
0
|
|
0
|
# Decode character set of a given text to perl characters (Unicode), |
|
436
|
|
|
|
|
|
|
# then encode into UTF-8 octets if requested. |
|
437
|
0
|
|
|
|
|
0
|
# |
|
438
|
0
|
|
|
|
|
0
|
# my $data = $_[0]; # avoid copying large strings |
|
439
|
0
|
|
|
|
|
0
|
my $charset_declared = $_[1]; |
|
440
|
0
|
|
|
|
|
0
|
my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets |
|
441
|
0
|
|
|
|
|
0
|
|
|
442
|
|
|
|
|
|
|
warn "message: _normalize() was given characters, expected bytes: $_[0]\n" |
|
443
|
0
|
|
|
|
|
0
|
if utf8::is_utf8($_[0]); |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879] |
|
446
|
|
|
|
|
|
|
my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
if (!defined $charset_declared || $charset_declared eq '') { |
|
449
|
|
|
|
|
|
|
$charset_declared = 'us-ascii'; |
|
450
|
|
|
|
|
|
|
} |
|
451
|
10
|
|
|
10
|
|
19
|
|
|
452
|
10
|
|
|
|
|
13
|
# number of characters with code above 127 |
|
453
|
|
|
|
|
|
|
my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c; |
|
454
|
10
|
50
|
|
|
|
35
|
|
|
455
|
|
|
|
|
|
|
if (!$cnt_8bits && |
|
456
|
|
|
|
|
|
|
$charset_declared =~ |
|
457
|
|
|
|
|
|
|
/^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) | |
|
458
|
10
|
|
|
|
|
39
|
ISO646-US )\z/xsi) |
|
459
|
|
|
|
|
|
|
{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is |
|
460
|
10
|
50
|
33
|
|
|
51
|
dbg("message: kept, charset is US-ASCII as declared"); |
|
461
|
0
|
|
|
|
|
0
|
return $_[0]; # is all-ASCII, no need for decoding |
|
462
|
|
|
|
|
|
|
} |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
if (!$cnt_8bits && |
|
465
|
10
|
|
|
|
|
23
|
$charset_declared =~ |
|
466
|
|
|
|
|
|
|
/^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} | |
|
467
|
10
|
50
|
33
|
|
|
68
|
UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} | |
|
468
|
|
|
|
|
|
|
Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi) |
|
469
|
|
|
|
|
|
|
{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII |
|
470
|
|
|
|
|
|
|
dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared); |
|
471
|
|
|
|
|
|
|
return $_[0]; # is all-ASCII, no need for decoding |
|
472
|
0
|
|
|
|
|
0
|
} |
|
473
|
0
|
|
|
|
|
0
|
|
|
474
|
|
|
|
|
|
|
# Try first to strictly decode based on a declared character set. |
|
475
|
|
|
|
|
|
|
|
|
476
|
10
|
100
|
66
|
|
|
74
|
my $rv; |
|
477
|
|
|
|
|
|
|
if ($charset_declared =~ /^UTF-?8\z/i) { |
|
478
|
|
|
|
|
|
|
# attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC) |
|
479
|
|
|
|
|
|
|
if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
|
480
|
|
|
|
|
|
|
dbg("message: decoded as declared charset UTF-8"); |
|
481
|
|
|
|
|
|
|
return $_[0] if !$return_decoded; |
|
482
|
8
|
|
|
|
|
24
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
|
483
|
8
|
|
|
|
|
58
|
return $rv; # decoded |
|
484
|
|
|
|
|
|
|
} else { |
|
485
|
|
|
|
|
|
|
my $err = ''; |
|
486
|
|
|
|
|
|
|
if ($@) { |
|
487
|
|
|
|
|
|
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
488
|
2
|
|
|
|
|
9
|
$err = " ($err)"; |
|
489
|
2
|
50
|
33
|
|
|
20
|
} |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
dbg("message: failed decoding as declared charset UTF-8 ($err)"); |
|
491
|
0
|
0
|
|
|
|
0
|
} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
492
|
0
|
|
|
|
|
0
|
|
|
493
|
0
|
0
|
|
|
|
0
|
} elsif ($charset_declared =~ /^UTF[ -]?16/i) { |
|
494
|
0
|
|
|
|
|
0
|
# Handle cases where spammers use UTF-16 encoding without including a BOM |
|
495
|
0
|
|
|
|
|
0
|
# or declaring endianness as reported at: |
|
496
|
|
|
|
|
|
|
# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252 |
|
497
|
0
|
|
|
|
|
0
|
|
|
498
|
0
|
0
|
|
|
|
0
|
my $decoder = detect_utf16( $_[0] ); |
|
499
|
0
|
|
|
|
|
0
|
if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) { |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
500
|
0
|
|
|
|
|
0
|
dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name); |
|
501
|
|
|
|
|
|
|
return $_[0] if !$return_decoded; |
|
502
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
|
503
|
|
|
|
|
|
|
return $rv; # decoded |
|
504
|
|
|
|
|
|
|
} else { |
|
505
|
|
|
|
|
|
|
my $err = ''; |
|
506
|
|
|
|
|
|
|
if ($@) { |
|
507
|
|
|
|
|
|
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
508
|
|
|
|
|
|
|
$err = " ($err)"; |
|
509
|
|
|
|
|
|
|
} |
|
510
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err); |
|
511
|
0
|
0
|
|
|
|
0
|
}; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
512
|
0
|
|
|
|
|
0
|
|
|
513
|
0
|
0
|
|
|
|
0
|
} elsif ($cnt_8bits && |
|
514
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
|
515
|
0
|
|
|
|
|
0
|
dbg("message: decoded as charset UTF-8, declared %s", $charset_declared); |
|
516
|
|
|
|
|
|
|
return $_[0] if !$return_decoded; |
|
517
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
|
518
|
0
|
0
|
|
|
|
0
|
return $rv; # decoded |
|
519
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
520
|
0
|
|
|
|
|
0
|
} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) { |
|
521
|
|
|
|
|
|
|
# declared as US-ASCII but contains 8-bit characters, makes no sense |
|
522
|
0
|
|
|
|
|
0
|
# to attempt decoding first as strict US-ASCII as we know it would fail |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
} else { |
|
525
|
|
|
|
|
|
|
# try decoding as a declared character set |
|
526
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
527
|
0
|
|
|
|
|
0
|
# -> http://en.wikipedia.org/wiki/Windows-1252 |
|
528
|
0
|
0
|
|
|
|
0
|
# Windows-1252 character encoding is a superset of ISO 8859-1, but differs |
|
529
|
0
|
|
|
|
|
0
|
# from the IANA's ISO-8859-1 by using displayable characters rather than |
|
530
|
0
|
|
|
|
|
0
|
# control characters in the 80 to 9F (hex) range. [...] |
|
531
|
|
|
|
|
|
|
# It is very common to mislabel Windows-1252 text with the charset label |
|
532
|
|
|
|
|
|
|
# ISO-8859-1. A common result was that all the quotes and apostrophes |
|
533
|
|
|
|
|
|
|
# (produced by "smart quotes" in word-processing software) were replaced |
|
534
|
|
|
|
|
|
|
# with question marks or boxes on non-Windows operating systems, making |
|
535
|
|
|
|
|
|
|
# text difficult to read. Most modern web browsers and e-mail clients |
|
536
|
|
|
|
|
|
|
# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate |
|
537
|
|
|
|
|
|
|
# such mislabeling. This is now standard behavior in the draft HTML 5 |
|
538
|
|
|
|
|
|
|
# specification, which requires that documents advertised as ISO-8859-1 |
|
539
|
|
|
|
|
|
|
# actually be parsed with the Windows-1252 encoding. |
|
540
|
|
|
|
|
|
|
# |
|
541
|
|
|
|
|
|
|
my($chset, $decoder); |
|
542
|
|
|
|
|
|
|
if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) { |
|
543
|
|
|
|
|
|
|
$chset = 'Windows-1252'; $decoder = $enc_w1252; |
|
544
|
|
|
|
|
|
|
} else { |
|
545
|
|
|
|
|
|
|
$chset = $charset_declared; $decoder = Encode::find_encoding($chset); |
|
546
|
|
|
|
|
|
|
if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
|
547
|
|
|
|
|
|
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
|
548
|
|
|
|
|
|
|
dbg("message: no decoder for a declared charset %s, using GBK", |
|
549
|
|
|
|
|
|
|
$chset) if $decoder; |
|
550
|
|
|
|
|
|
|
} |
|
551
|
|
|
|
|
|
|
} |
|
552
|
|
|
|
|
|
|
if (!$decoder) { |
|
553
|
2
|
|
|
|
|
7
|
dbg("message: failed decoding, no decoder for a declared charset %s", |
|
554
|
2
|
50
|
|
|
|
13
|
$chset); |
|
555
|
0
|
|
|
|
|
0
|
} else { |
|
|
0
|
|
|
|
|
0
|
|
|
556
|
|
|
|
|
|
|
my $err = ''; |
|
557
|
2
|
|
|
|
|
4
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
|
2
|
|
|
|
|
14
|
|
|
558
|
2
|
50
|
33
|
|
|
6805
|
if ($@) { |
|
559
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
560
|
0
|
0
|
|
|
|
0
|
$err = " ($err)"; |
|
561
|
|
|
|
|
|
|
} |
|
562
|
|
|
|
|
|
|
if (lc $chset eq lc $charset_declared) { |
|
563
|
|
|
|
|
|
|
dbg("message: %s as declared charset %s%s", |
|
564
|
2
|
50
|
|
|
|
5
|
defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err); |
|
565
|
0
|
|
|
|
|
0
|
} else { |
|
566
|
|
|
|
|
|
|
dbg("message: %s as charset %s, declared %s%s", |
|
567
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
|
568
|
2
|
|
|
|
|
4
|
$chset, $charset_declared, $err); |
|
569
|
2
|
|
|
|
|
4
|
} |
|
|
2
|
|
|
|
|
7
|
|
|
570
|
2
|
50
|
|
|
|
196
|
} |
|
571
|
0
|
|
|
|
|
0
|
} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
572
|
0
|
|
|
|
|
0
|
|
|
573
|
|
|
|
|
|
|
# If the above failed, check if it is US-ASCII, possibly extended by few |
|
574
|
2
|
50
|
|
|
|
9
|
# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing |
|
575
|
2
|
50
|
|
|
|
9
|
# some popular punctuation or special characters from Windows-1252 in |
|
576
|
|
|
|
|
|
|
# the \x80-\x9F range (which is unassigned in ISO-8859-*). |
|
577
|
|
|
|
|
|
|
# Note that Windows-1252 is a proper superset of ISO-8859-1. |
|
578
|
0
|
0
|
|
|
|
0
|
# |
|
579
|
|
|
|
|
|
|
if (!defined $rv && !$cnt_8bits) { |
|
580
|
|
|
|
|
|
|
dbg("message: kept, guessed charset is US-ASCII, declared %s", |
|
581
|
|
|
|
|
|
|
$charset_declared); |
|
582
|
|
|
|
|
|
|
return $_[0]; # is all-ASCII, no need for decoding |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
} elsif (!defined $rv && $enc_w1252 && |
|
585
|
|
|
|
|
|
|
# ASCII NBSP (c) SHY ' " ... '".- TM |
|
586
|
|
|
|
|
|
|
#$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c) |
|
587
|
|
|
|
|
|
|
# Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection, |
|
588
|
|
|
|
|
|
|
# Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!) |
|
589
|
|
|
|
|
|
|
$_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c) |
|
590
|
|
|
|
|
|
|
{ # ASCII + NBSP + SHY + some punctuation characters |
|
591
|
2
|
50
|
33
|
|
|
11
|
# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too |
|
|
|
50
|
33
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
592
|
0
|
|
|
|
|
0
|
# consider also: AE (r), 80 Euro |
|
593
|
|
|
|
|
|
|
my $err = ''; |
|
594
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
595
|
|
|
|
|
|
|
if ($@) { |
|
596
|
|
|
|
|
|
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
597
|
|
|
|
|
|
|
$err = " ($err)"; |
|
598
|
|
|
|
|
|
|
} |
|
599
|
|
|
|
|
|
|
# the above can't fail, but keep code general just in case |
|
600
|
|
|
|
|
|
|
dbg("message: %s as guessed charset %s, declared %s%s", |
|
601
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
|
602
|
|
|
|
|
|
|
'Windows-1252', $charset_declared, $err); |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
|
|
605
|
0
|
|
|
|
|
0
|
# If we were unsuccessful so far, try some guesswork |
|
606
|
0
|
|
|
|
|
0
|
# based on Encode::Detect::Detector . |
|
|
0
|
|
|
|
|
0
|
|
|
607
|
0
|
0
|
|
|
|
0
|
|
|
608
|
0
|
|
|
|
|
0
|
if (defined $rv) { |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
609
|
0
|
|
|
|
|
0
|
# done, no need for guesswork |
|
610
|
|
|
|
|
|
|
} elsif (!$have_encode_detector) { |
|
611
|
|
|
|
|
|
|
dbg("message: Encode::Detect::Detector not available, declared %s failed", |
|
612
|
0
|
0
|
|
|
|
0
|
$charset_declared); |
|
613
|
|
|
|
|
|
|
} else { |
|
614
|
|
|
|
|
|
|
my $charset_detected = Encode::Detect::Detector::detect($_[0]); |
|
615
|
|
|
|
|
|
|
if ($charset_detected && lc $charset_detected ne lc $charset_declared) { |
|
616
|
|
|
|
|
|
|
my $decoder = Encode::find_encoding($charset_detected); |
|
617
|
|
|
|
|
|
|
if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
|
618
|
|
|
|
|
|
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
|
619
|
|
|
|
|
|
|
dbg("message: no decoder for a detected charset %s, using GBK", |
|
620
|
2
|
50
|
|
|
|
5
|
$charset_detected) if $decoder; |
|
|
|
0
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
} |
|
622
|
|
|
|
|
|
|
if (!$decoder) { |
|
623
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding, no decoder for a detected charset %s", |
|
624
|
|
|
|
|
|
|
$charset_detected); |
|
625
|
|
|
|
|
|
|
} else { |
|
626
|
0
|
|
|
|
|
0
|
my $err = ''; |
|
627
|
0
|
0
|
0
|
|
|
0
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
628
|
0
|
|
|
|
|
0
|
if ($@) { |
|
629
|
0
|
0
|
0
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
630
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
|
631
|
0
|
0
|
|
|
|
0
|
} |
|
632
|
|
|
|
|
|
|
dbg("message: %s as detected charset %s, declared %s%s", |
|
633
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
|
634
|
0
|
0
|
|
|
|
0
|
$charset_detected, $charset_declared, $err); |
|
635
|
0
|
|
|
|
|
0
|
} |
|
636
|
|
|
|
|
|
|
} |
|
637
|
|
|
|
|
|
|
} |
|
638
|
0
|
|
|
|
|
0
|
|
|
639
|
0
|
|
|
|
|
0
|
if (!defined $rv) { # all decoding attempts failed so far, probably garbage |
|
|
0
|
|
|
|
|
0
|
|
|
640
|
0
|
0
|
|
|
|
0
|
# go for Windows-1252 which can't fail |
|
641
|
0
|
|
|
|
|
0
|
my $err = ''; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
642
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0]) }; |
|
643
|
|
|
|
|
|
|
if ($@) { |
|
644
|
0
|
0
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
645
|
|
|
|
|
|
|
$err = " ($err)"; |
|
646
|
|
|
|
|
|
|
} |
|
647
|
|
|
|
|
|
|
dbg("message: %s as last-resort charset %s, declared %s%s", |
|
648
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
|
649
|
|
|
|
|
|
|
'Windows-1252', $charset_declared, $err); |
|
650
|
|
|
|
|
|
|
} |
|
651
|
2
|
50
|
|
|
|
5
|
|
|
652
|
|
|
|
|
|
|
if (!defined $rv) { # just in case - all decoding attempts failed so far |
|
653
|
0
|
|
|
|
|
0
|
return $_[0]; # garbage-in / garbage-out, return unchanged octets |
|
654
|
0
|
|
|
|
|
0
|
} |
|
|
0
|
|
|
|
|
0
|
|
|
655
|
0
|
0
|
|
|
|
0
|
# decoding octets to characters was successful |
|
656
|
0
|
|
|
|
|
0
|
if (!$return_decoded) { |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
657
|
0
|
|
|
|
|
0
|
# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg |
|
658
|
|
|
|
|
|
|
utf8::encode($rv); # encode Unicode characters to UTF-8 octets |
|
659
|
0
|
0
|
|
|
|
0
|
} |
|
660
|
|
|
|
|
|
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
|
661
|
|
|
|
|
|
|
return $rv; |
|
662
|
|
|
|
|
|
|
} |
|
663
|
|
|
|
|
|
|
|
|
664
|
2
|
50
|
|
|
|
3
|
=item rendered() |
|
665
|
0
|
|
|
|
|
0
|
|
|
666
|
|
|
|
|
|
|
render_text() takes the given text/* type MIME part, and attempts to |
|
667
|
|
|
|
|
|
|
render it into a text scalar. It will always render text/html, and will |
|
668
|
2
|
50
|
|
|
|
5
|
use a heuristic to determine if other text/* parts should be considered |
|
669
|
|
|
|
|
|
|
text/html. Two scalars are returned: the rendered type (either text/html |
|
670
|
2
|
|
|
|
|
4
|
or whatever the original type was), and the rendered text. |
|
671
|
|
|
|
|
|
|
|
|
672
|
2
|
|
|
|
|
3
|
=cut |
|
673
|
2
|
|
|
|
|
14
|
|
|
674
|
|
|
|
|
|
|
my ($self) = @_; |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
if (!exists $self->{rendered}) { |
|
677
|
|
|
|
|
|
|
# We only know how to render text/plain and text/html ... |
|
678
|
|
|
|
|
|
|
# Note: for bug 4843, make sure to skip text/calendar parts |
|
679
|
|
|
|
|
|
|
# we also want to skip things like text/x-vcard |
|
680
|
|
|
|
|
|
|
# text/x-aol is ignored here, but looks like text/html ... |
|
681
|
|
|
|
|
|
|
return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
my $text = $self->decode; # QP and Base64 decoding, bytes |
|
684
|
|
|
|
|
|
|
my $text_len = length($text); # num of bytes in original charset encoding |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
# render text/html always |
|
687
|
143
|
|
|
143
|
1
|
442
|
if ($text ne '' && $self->{'type'} =~ m{^text/html$}i) |
|
688
|
|
|
|
|
|
|
{ |
|
689
|
143
|
100
|
|
|
|
515
|
$self->{rendered_type} = 'text/html'; |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# will input text to HTML::Parser be provided as Unicode characters? |
|
692
|
|
|
|
|
|
|
my $character_semantics = 0; # $text is in bytes |
|
693
|
|
|
|
|
|
|
if ($self->{normalize} && $enc_utf8) { # charset decoding requested |
|
694
|
127
|
50
|
|
|
|
1160
|
# Provide input to HTML::Parser as Unicode characters |
|
695
|
|
|
|
|
|
|
# which avoids a HTML::Parser bug in utf8_mode |
|
696
|
127
|
|
|
|
|
615
|
# https://rt.cpan.org/Public/Bug/Display.html?id=99755 |
|
697
|
127
|
|
|
|
|
522
|
# Note: the above bug was fixed in HTML-Parser 3.72, January 2016. |
|
698
|
|
|
|
|
|
|
# Avoid unnecessary step of encoding-then-decoding by telling |
|
699
|
|
|
|
|
|
|
# subroutine _normalize() to return Unicode text. See Bug 7133 |
|
700
|
127
|
100
|
100
|
|
|
1385
|
# |
|
701
|
|
|
|
|
|
|
$character_semantics = 1; # $text will be in characters |
|
702
|
8
|
|
|
|
|
37
|
$text = _normalize($text, $self->{charset}, 1); # bytes to chars |
|
703
|
|
|
|
|
|
|
} elsif (!defined $self->{charset} || |
|
704
|
|
|
|
|
|
|
$self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) { |
|
705
|
8
|
|
|
|
|
23
|
# With some luck input can be interpreted as UTF-8, do not warn. |
|
706
|
8
|
50
|
33
|
|
|
178
|
# It is still possible to hit the HTML::Parses utf8_mode bug however. |
|
|
|
100
|
100
|
|
|
|
|
|
707
|
|
|
|
|
|
|
} else { |
|
708
|
|
|
|
|
|
|
dbg("message: 'normalize_charset' is off, encoding will likely ". |
|
709
|
|
|
|
|
|
|
"be misinterpreted; declared charset: %s", $self->{charset}); |
|
710
|
|
|
|
|
|
|
} |
|
711
|
|
|
|
|
|
|
# the 0 requires decoded HTML results to be in bytes (not characters) |
|
712
|
|
|
|
|
|
|
my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object |
|
713
|
|
|
|
|
|
|
|
|
714
|
0
|
|
|
|
|
0
|
$html->parse($text); # parse+render text |
|
715
|
0
|
|
|
|
|
0
|
|
|
716
|
|
|
|
|
|
|
# resulting HTML-decoded text is in bytes, likely encoded as UTF-8 |
|
717
|
|
|
|
|
|
|
$self->{rendered} = $html->get_rendered_text(); |
|
718
|
|
|
|
|
|
|
$self->{visible_rendered} = $html->get_rendered_text(invisible => 0); |
|
719
|
|
|
|
|
|
|
$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1); |
|
720
|
|
|
|
|
|
|
$self->{html_results} = $html->get_results(); |
|
721
|
|
|
|
|
|
|
|
|
722
|
1
|
|
|
|
|
13
|
# end-of-document result values that require looking at the text |
|
723
|
|
|
|
|
|
|
my $r = $self->{html_results}; # temporary reference for brevity |
|
724
|
|
|
|
|
|
|
|
|
725
|
8
|
|
|
|
|
185
|
# count the number of spaces in the rendered text (likely UTF-8 octets) |
|
726
|
|
|
|
|
|
|
my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//; |
|
727
|
8
|
|
|
|
|
70
|
# we may want to add the count of other Unicode whitespace characters |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8) |
|
730
|
8
|
|
|
|
|
40
|
$r->{non_space_len} = $r->{html_length} - $space; |
|
731
|
8
|
|
|
|
|
49
|
$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len; |
|
732
|
8
|
|
|
|
|
34
|
} |
|
733
|
8
|
|
|
|
|
45
|
|
|
734
|
|
|
|
|
|
|
else { # plain text |
|
735
|
|
|
|
|
|
|
if ($self->{normalize} && $enc_utf8) { |
|
736
|
8
|
|
|
|
|
19
|
# request transcoded result as UTF-8 octets! |
|
737
|
|
|
|
|
|
|
$text = _normalize($text, $self->{charset}, 0); |
|
738
|
|
|
|
|
|
|
} |
|
739
|
8
|
|
|
|
|
42
|
$self->{rendered_type} = $self->{type}; |
|
740
|
|
|
|
|
|
|
$self->{rendered} = $self->{'visible_rendered'} = $text; |
|
741
|
|
|
|
|
|
|
$self->{'invisible_rendered'} = ''; |
|
742
|
8
|
|
|
|
|
31
|
} |
|
743
|
8
|
|
|
|
|
26
|
} |
|
744
|
8
|
50
|
|
|
|
185
|
|
|
745
|
|
|
|
|
|
|
return ($self->{rendered_type}, $self->{rendered}); |
|
746
|
|
|
|
|
|
|
} |
|
747
|
|
|
|
|
|
|
|
|
748
|
119
|
0
|
33
|
|
|
451
|
=item set_rendered($text, $type) |
|
749
|
|
|
|
|
|
|
|
|
750
|
0
|
|
|
|
|
0
|
Set the rendered text and type for the given part. If type is not |
|
751
|
|
|
|
|
|
|
specified, and text is a defined value, a default of 'text/plain' is used. |
|
752
|
119
|
|
|
|
|
426
|
This can be used, for instance, to render non-text parts using plugins. |
|
753
|
119
|
|
|
|
|
532
|
|
|
754
|
119
|
|
|
|
|
422
|
=cut |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
my ($self, $text, $type) = @_; |
|
757
|
|
|
|
|
|
|
|
|
758
|
143
|
|
|
|
|
599
|
$type = 'text/plain' if (!defined $type && defined $text); |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
$self->{'rendered_type'} = $type; |
|
761
|
|
|
|
|
|
|
$self->{'rendered'} = $self->{'visible_rendered'} = $text; |
|
762
|
|
|
|
|
|
|
$self->{'invisible_rendered'} = defined $text ? '' : undef; |
|
763
|
|
|
|
|
|
|
} |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item visible_rendered() |
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Render and return the visible text in this part. |
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=cut |
|
770
|
0
|
|
|
0
|
1
|
0
|
|
|
771
|
|
|
|
|
|
|
my ($self) = @_; |
|
772
|
0
|
0
|
0
|
|
|
0
|
$self->rendered(); # ignore return, we want just this: |
|
773
|
|
|
|
|
|
|
return ($self->{rendered_type}, $self->{visible_rendered}); |
|
774
|
0
|
|
|
|
|
0
|
} |
|
775
|
0
|
|
|
|
|
0
|
|
|
776
|
0
|
0
|
|
|
|
0
|
=item invisible_rendered() |
|
777
|
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
Render and return the invisible text in this part. |
|
779
|
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
=cut |
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
my ($self) = @_; |
|
783
|
|
|
|
|
|
|
$self->rendered(); # ignore return, we want just this: |
|
784
|
|
|
|
|
|
|
return ($self->{rendered_type}, $self->{invisible_rendered}); |
|
785
|
|
|
|
|
|
|
} |
|
786
|
8
|
|
|
8
|
1
|
21
|
|
|
787
|
8
|
|
|
|
|
55
|
=item content_summary() |
|
788
|
8
|
|
|
|
|
38
|
|
|
789
|
|
|
|
|
|
|
Returns an array of scalars describing the mime parts of the message. |
|
790
|
|
|
|
|
|
|
Note: This function requires that the message be parsed first! |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=cut |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
# return an array with scalars describing mime parts |
|
795
|
|
|
|
|
|
|
my($self) = @_; |
|
796
|
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
my @ret = ( [ $self->{'type'} ] ); |
|
798
|
8
|
|
|
8
|
1
|
20
|
my @search; |
|
799
|
8
|
|
|
|
|
26
|
|
|
800
|
8
|
|
|
|
|
24
|
if (exists $self->{'body_parts'}) { |
|
801
|
|
|
|
|
|
|
my $count = @{$self->{'body_parts'}}; |
|
802
|
|
|
|
|
|
|
for(my $i=0; $i<$count; $i++) { |
|
803
|
|
|
|
|
|
|
push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]); |
|
804
|
|
|
|
|
|
|
} |
|
805
|
|
|
|
|
|
|
} |
|
806
|
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
while(my $part = shift @search) { |
|
808
|
|
|
|
|
|
|
my($index, $part) = @{$part}; |
|
809
|
|
|
|
|
|
|
push(@{$ret[$index]}, $part->{'type'}); |
|
810
|
|
|
|
|
|
|
if (exists $part->{'body_parts'}) { |
|
811
|
|
|
|
|
|
|
unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}}); |
|
812
|
13
|
|
|
13
|
1
|
211
|
} |
|
813
|
|
|
|
|
|
|
} |
|
814
|
13
|
|
|
|
|
48
|
|
|
815
|
13
|
|
|
|
|
14
|
return map { join(",", @{$_}) } @ret; |
|
816
|
|
|
|
|
|
|
} |
|
817
|
13
|
100
|
|
|
|
27
|
|
|
818
|
11
|
|
|
|
|
13
|
=item delete_header() |
|
|
11
|
|
|
|
|
15
|
|
|
819
|
11
|
|
|
|
|
24
|
|
|
820
|
24
|
|
|
|
|
64
|
Delete the specified header (decoded and raw) from the Node information. |
|
821
|
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
=cut |
|
823
|
|
|
|
|
|
|
|
|
824
|
13
|
|
|
|
|
27
|
my($self, $hdr) = @_; |
|
825
|
45
|
|
|
|
|
48
|
|
|
|
45
|
|
|
|
|
68
|
|
|
826
|
45
|
|
|
|
|
41
|
foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) { |
|
|
45
|
|
|
|
|
83
|
|
|
827
|
45
|
100
|
|
|
|
125
|
delete $self->{'headers'}->{$_}; |
|
828
|
11
|
|
|
|
|
13
|
delete $self->{'raw_headers'}->{$_}; |
|
|
21
|
|
|
|
|
48
|
|
|
|
11
|
|
|
|
|
18
|
|
|
829
|
|
|
|
|
|
|
} |
|
830
|
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}}); |
|
832
|
13
|
|
|
|
|
18
|
$self->{'header_order'} = \@neworder; |
|
|
37
|
|
|
|
|
49
|
|
|
|
37
|
|
|
|
|
178
|
|
|
833
|
|
|
|
|
|
|
} |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
# decode a header appropriately. don't bother adding it to the pod documents. |
|
836
|
|
|
|
|
|
|
my ( $encoding, $cte, $data ) = @_; |
|
837
|
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
if ( $cte eq 'B' ) { |
|
839
|
|
|
|
|
|
|
# base 64 encoded |
|
840
|
|
|
|
|
|
|
$data = Mail::SpamAssassin::Util::base64_decode($data); |
|
841
|
|
|
|
|
|
|
} |
|
842
|
505
|
|
|
505
|
1
|
929
|
elsif ( $cte eq 'Q' ) { |
|
843
|
|
|
|
|
|
|
# quoted printable |
|
844
|
505
|
|
|
|
|
641
|
|
|
|
505
|
|
|
|
|
6907
|
|
|
845
|
0
|
|
|
|
|
0
|
# the RFC states that in the encoded text, "_" is equal to "=20" |
|
846
|
0
|
|
|
|
|
0
|
$data =~ s/_/=20/g; |
|
847
|
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
$data = Mail::SpamAssassin::Util::qp_decode($data); |
|
849
|
505
|
|
|
|
|
1028
|
} |
|
|
505
|
|
|
|
|
5010
|
|
|
850
|
505
|
|
|
|
|
1678
|
else { |
|
851
|
|
|
|
|
|
|
# not possible since the input has already been limited to 'B' and 'Q' |
|
852
|
|
|
|
|
|
|
die "message: unknown encoding type '$cte' in RFC2047 header"; |
|
853
|
|
|
|
|
|
|
} |
|
854
|
|
|
|
|
|
|
return _normalize($data, $encoding, 0); # transcode to UTF-8 octets |
|
855
|
10
|
|
|
10
|
|
43
|
} |
|
856
|
|
|
|
|
|
|
|
|
857
|
10
|
100
|
|
|
|
39
|
# Decode base64 and quoted-printable in headers according to RFC2047. |
|
|
|
50
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
# |
|
859
|
2
|
|
|
|
|
13
|
my($header_field_body, $header_field_name) = @_; |
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
return '' unless defined $header_field_body && $header_field_body ne ''; |
|
862
|
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
# deal with folding and cream the newlines and such |
|
864
|
|
|
|
|
|
|
$header_field_body =~ s/\n[ \t]+/\n /g; |
|
865
|
8
|
|
|
|
|
16
|
$header_field_body =~ s/\015?\012//gs; |
|
866
|
|
|
|
|
|
|
|
|
867
|
8
|
|
|
|
|
18
|
if ($header_field_name =~ |
|
868
|
|
|
|
|
|
|
/^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) | |
|
869
|
|
|
|
|
|
|
MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) { |
|
870
|
|
|
|
|
|
|
# Bug 6945: some header fields must not be processed for MIME encoding |
|
871
|
0
|
|
|
|
|
0
|
# Bug 7466: leave out the Content-* |
|
872
|
|
|
|
|
|
|
|
|
873
|
10
|
|
|
|
|
28
|
} else { |
|
874
|
|
|
|
|
|
|
local($1,$2,$3); |
|
875
|
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# Multiple encoded sections must ignore the interim whitespace. |
|
877
|
|
|
|
|
|
|
# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE |
|
878
|
|
|
|
|
|
|
# separated by whitespace. |
|
879
|
909
|
|
|
909
|
|
2543
|
1 while $header_field_body =~ |
|
880
|
|
|
|
|
|
|
s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+ |
|
881
|
909
|
100
|
66
|
|
|
4498
|
( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) } |
|
882
|
|
|
|
|
|
|
{$1$2}xsg; |
|
883
|
|
|
|
|
|
|
|
|
884
|
859
|
|
|
|
|
1550
|
# transcode properly encoded RFC 2047 substrings into UTF-8 octets, |
|
885
|
859
|
|
|
|
|
1097
|
# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532) |
|
886
|
|
|
|
|
|
|
# or plain US-ASCII |
|
887
|
859
|
100
|
|
|
|
3009
|
$header_field_body =~ |
|
888
|
|
|
|
|
|
|
s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) } |
|
889
|
|
|
|
|
|
|
{ __decode_header($1, uc($2), $3) }xsge; |
|
890
|
|
|
|
|
|
|
} |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body); |
|
893
|
|
|
|
|
|
|
return $header_field_body; |
|
894
|
523
|
|
|
|
|
1777
|
} |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=item get_header() |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
Retrieve a specific header. Will have a newline at the end and will be |
|
899
|
523
|
|
|
|
|
1245
|
unfolded. The first parameter is the header name (case-insensitive), |
|
900
|
|
|
|
|
|
|
and the second parameter (optional) is whether or not to return the |
|
901
|
|
|
|
|
|
|
raw header. |
|
902
|
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
If get_header() is called in an array context, an array will be returned |
|
904
|
|
|
|
|
|
|
with each header entry in a different element. In a scalar context, |
|
905
|
|
|
|
|
|
|
the last specific header is returned. |
|
906
|
523
|
|
|
|
|
1201
|
|
|
907
|
10
|
|
|
|
|
39
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
|
908
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
|
909
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
|
910
|
|
|
|
|
|
|
|
|
911
|
859
|
|
|
|
|
2691
|
Btw, returning the last header field (not the first) happens to be consistent |
|
912
|
|
|
|
|
|
|
with DKIM signatures, which search for and cover multiple header fields |
|
913
|
|
|
|
|
|
|
bottom-up according to the 'h' tag. Let's keep it this way. |
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
=cut |
|
916
|
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
my ($self, $hdr, $raw) = @_; |
|
918
|
|
|
|
|
|
|
$raw ||= 0; |
|
919
|
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
# And now pick up all the entries into a list |
|
921
|
|
|
|
|
|
|
# This is assumed to include a newline at the end ... |
|
922
|
|
|
|
|
|
|
# This is also assumed to have removed continuation bits ... |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
# Deal with the possibility that header() or raw_header() returns undef |
|
925
|
|
|
|
|
|
|
my @hdrs; |
|
926
|
|
|
|
|
|
|
if ( $raw ) { |
|
927
|
|
|
|
|
|
|
if (@hdrs = $self->raw_header($hdr)) { |
|
928
|
|
|
|
|
|
|
s/\015?\012\s+/ /gs for @hdrs; |
|
929
|
|
|
|
|
|
|
} |
|
930
|
|
|
|
|
|
|
} |
|
931
|
|
|
|
|
|
|
else { |
|
932
|
|
|
|
|
|
|
if (@hdrs = $self->header($hdr)) { |
|
933
|
|
|
|
|
|
|
$_ .= "\n" for @hdrs; |
|
934
|
|
|
|
|
|
|
} |
|
935
|
|
|
|
|
|
|
} |
|
936
|
4078
|
|
|
4078
|
1
|
6951
|
|
|
937
|
4078
|
|
50
|
|
|
12097
|
if (wantarray) { |
|
938
|
|
|
|
|
|
|
return @hdrs; |
|
939
|
|
|
|
|
|
|
} |
|
940
|
|
|
|
|
|
|
else { |
|
941
|
|
|
|
|
|
|
return @hdrs ? $hdrs[-1] : undef; |
|
942
|
|
|
|
|
|
|
} |
|
943
|
|
|
|
|
|
|
} |
|
944
|
4078
|
|
|
|
|
4404
|
|
|
945
|
4078
|
50
|
|
|
|
6029
|
=item get_all_headers() |
|
946
|
0
|
0
|
|
|
|
0
|
|
|
947
|
0
|
|
|
|
|
0
|
Retrieve all headers. Each header will have a newline at the end and |
|
948
|
|
|
|
|
|
|
will be unfolded. The first parameter (optional) is whether or not to |
|
949
|
|
|
|
|
|
|
return the raw headers, and the second parameter (optional) is whether |
|
950
|
|
|
|
|
|
|
or not to include the mbox separator. |
|
951
|
4078
|
100
|
|
|
|
7423
|
|
|
952
|
834
|
|
|
|
|
3018
|
If get_all_header() is called in an array context, an array will be |
|
953
|
|
|
|
|
|
|
returned with each header entry in a different element. In a scalar |
|
954
|
|
|
|
|
|
|
context, the headers are returned in a single scalar. |
|
955
|
|
|
|
|
|
|
|
|
956
|
4078
|
100
|
|
|
|
6112
|
=back |
|
957
|
3528
|
|
|
|
|
8352
|
|
|
958
|
|
|
|
|
|
|
=cut |
|
959
|
|
|
|
|
|
|
|
|
960
|
550
|
100
|
|
|
|
2265
|
# build it and it will not bomb |
|
961
|
|
|
|
|
|
|
my ($self, $raw, $include_mbox) = @_; |
|
962
|
|
|
|
|
|
|
$raw ||= 0; |
|
963
|
|
|
|
|
|
|
$include_mbox ||= 0; |
|
964
|
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
my @lines; |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
# precalculate destination positions based on order of appearance |
|
968
|
|
|
|
|
|
|
my $i = 0; |
|
969
|
|
|
|
|
|
|
my %locations; |
|
970
|
|
|
|
|
|
|
for my $k (@{$self->{header_order}}) { |
|
971
|
|
|
|
|
|
|
push(@{$locations{lc($k)}}, $i++); |
|
972
|
|
|
|
|
|
|
} |
|
973
|
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# process headers in order of first appearance |
|
975
|
|
|
|
|
|
|
my $header; |
|
976
|
|
|
|
|
|
|
my $size = 0; |
|
977
|
|
|
|
|
|
|
HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] } |
|
978
|
|
|
|
|
|
|
keys %locations) |
|
979
|
|
|
|
|
|
|
{ |
|
980
|
|
|
|
|
|
|
# get all same-name headers and poke into correct position |
|
981
|
45
|
|
|
45
|
1
|
135
|
my $positions = $locations{$name}; |
|
982
|
45
|
|
50
|
|
|
290
|
for my $contents ($self->get_header($name, $raw)) { |
|
983
|
45
|
|
100
|
|
|
192
|
my $position = shift @{$positions}; |
|
984
|
|
|
|
|
|
|
$size += length($name) + length($contents) + 2; |
|
985
|
45
|
|
|
|
|
62
|
if ($size > MAX_HEADER_LENGTH) { |
|
986
|
|
|
|
|
|
|
$self->{'truncated_header'} = 1; |
|
987
|
|
|
|
|
|
|
last HEADER; |
|
988
|
45
|
|
|
|
|
70
|
} |
|
989
|
45
|
|
|
|
|
65
|
$lines[$position] = $self->{header_order}->[$position].": ".$contents; |
|
990
|
45
|
|
|
|
|
97
|
} |
|
|
45
|
|
|
|
|
227
|
|
|
991
|
520
|
|
|
|
|
589
|
} |
|
|
520
|
|
|
|
|
1400
|
|
|
992
|
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# skip undefined lines if we truncated |
|
994
|
|
|
|
|
|
|
@lines = grep { defined $_ } @lines if $self->{'truncated_header'}; |
|
995
|
45
|
|
|
|
|
83
|
|
|
996
|
45
|
|
|
|
|
80
|
splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} ); |
|
997
|
45
|
|
|
|
|
408
|
|
|
|
880
|
|
|
|
|
1525
|
|
|
998
|
|
|
|
|
|
|
return wantarray ? @lines : join ('', @lines); |
|
999
|
|
|
|
|
|
|
} |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
396
|
|
|
|
|
600
|
# legacy public API; now a no-op. |
|
1002
|
396
|
|
|
|
|
653
|
|
|
1003
|
520
|
|
|
|
|
636
|
# --------------------------------------------------------------------------- |
|
|
520
|
|
|
|
|
700
|
|
|
1004
|
520
|
|
|
|
|
1024
|
|
|
1005
|
520
|
50
|
|
|
|
1022
|
1; |