line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This module will encapsulate an email message and allow access to |
25
|
|
|
|
|
|
|
the various MIME message parts. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head1 PUBLIC METHODS |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=over 4 |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
package Mail::SpamAssassin::Message::Node; |
34
|
|
|
|
|
|
|
|
35
|
40
|
|
|
40
|
|
286
|
use strict; |
|
40
|
|
|
|
|
90
|
|
|
40
|
|
|
|
|
1435
|
|
36
|
40
|
|
|
40
|
|
241
|
use warnings; |
|
40
|
|
|
|
|
95
|
|
|
40
|
|
|
|
|
1337
|
|
37
|
40
|
|
|
40
|
|
261
|
use re 'taint'; |
|
40
|
|
|
|
|
125
|
|
|
40
|
|
|
|
|
1669
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
require 5.008001; # needs utf8::is_utf8() |
40
|
|
|
|
|
|
|
|
41
|
40
|
|
|
40
|
|
250
|
use Mail::SpamAssassin; |
|
40
|
|
|
|
|
95
|
|
|
40
|
|
|
|
|
1235
|
|
42
|
40
|
|
|
40
|
|
285
|
use Mail::SpamAssassin::Constants qw(:sa); |
|
40
|
|
|
|
|
80
|
|
|
40
|
|
|
|
|
6421
|
|
43
|
40
|
|
|
40
|
|
12762
|
use Mail::SpamAssassin::HTML; |
|
40
|
|
|
|
|
130
|
|
|
40
|
|
|
|
|
1544
|
|
44
|
40
|
|
|
40
|
|
274
|
use Mail::SpamAssassin::Logger; |
|
40
|
|
|
|
|
81
|
|
|
40
|
|
|
|
|
5604
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
our($enc_utf8, $enc_w1252, $have_encode_detector); |
47
|
|
|
|
|
|
|
BEGIN { |
48
|
40
|
|
|
|
|
23905
|
eval { require Encode } |
49
|
40
|
50
|
|
40
|
|
157
|
and do { $enc_utf8 = Encode::find_encoding('UTF-8'); |
|
40
|
|
|
|
|
383246
|
|
50
|
40
|
|
|
|
|
6379
|
$enc_w1252 = Encode::find_encoding('Windows-1252') }; |
51
|
40
|
|
|
|
|
213008
|
eval { require Encode::Detect::Detector } |
52
|
40
|
50
|
|
|
|
15296
|
and do { $have_encode_detector = 1 }; |
|
0
|
|
|
|
|
0
|
|
53
|
|
|
|
|
|
|
}; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item new() |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Generates an empty Node object and returns it. Typically only called |
58
|
|
|
|
|
|
|
by functions in Message. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub new { |
63
|
193
|
|
|
193
|
1
|
536
|
my $class = shift; |
64
|
193
|
|
33
|
|
|
908
|
$class = ref($class) || $class; |
65
|
|
|
|
|
|
|
|
66
|
193
|
|
|
|
|
1249
|
my $self = { |
67
|
|
|
|
|
|
|
headers => {}, |
68
|
|
|
|
|
|
|
raw_headers => {}, |
69
|
|
|
|
|
|
|
header_order => [] |
70
|
|
|
|
|
|
|
}; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# deal with any parameters |
73
|
193
|
|
|
|
|
544
|
my($opts) = @_; |
74
|
193
|
|
50
|
|
|
1053
|
$self->{normalize} = $opts->{'normalize'} || 0; |
75
|
|
|
|
|
|
|
|
76
|
193
|
|
|
|
|
595
|
bless($self,$class); |
77
|
193
|
|
|
|
|
630
|
$self; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item find_parts() |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Used to search the tree for specific MIME parts. An array of matching |
83
|
|
|
|
|
|
|
Node objects (pointers into the tree) is returned. The parameters that |
84
|
|
|
|
|
|
|
can be passed in are (in order, all scalars): |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Regexp - Used to match against each part's Content-Type header, |
87
|
|
|
|
|
|
|
specifically the type and not the rest of the header. ie: "Content-type: |
88
|
|
|
|
|
|
|
text/html; encoding=quoted-printable" has a type of "text/html". If no |
89
|
|
|
|
|
|
|
regexp is specified, find_parts() will return an empty array. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Only_leaves - By default, find_parts() will return any part that matches |
92
|
|
|
|
|
|
|
the regexp, including multipart. If you only want to see leaves of the |
93
|
|
|
|
|
|
|
tree (ie: parts that aren't multipart), set this to true (1). |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Recursive - By default, when find_parts() finds a multipart which has |
96
|
|
|
|
|
|
|
parts underneath it, it will recurse through all sub-children. If set to 0, |
97
|
|
|
|
|
|
|
only look at the part and any direct children of the part. |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=cut |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Used to find any MIME parts whose simple content-type matches a given regexp |
102
|
|
|
|
|
|
|
# Searches it's own and any children parts. Returns an array of MIME |
103
|
|
|
|
|
|
|
# objects which match. Our callers may expect the default behavior which is a |
104
|
|
|
|
|
|
|
# depth-first array of parts. |
105
|
|
|
|
|
|
|
# |
106
|
|
|
|
|
|
|
sub find_parts { |
107
|
255
|
|
|
255
|
1
|
729
|
my ($self, $re, $onlyleaves, $recursive) = @_; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Didn't pass an RE? Just abort. |
110
|
255
|
50
|
33
|
|
|
1544
|
return () unless defined $re && $re ne ''; |
111
|
|
|
|
|
|
|
|
112
|
255
|
50
|
|
|
|
761
|
$onlyleaves = 0 unless defined $onlyleaves; |
113
|
|
|
|
|
|
|
|
114
|
255
|
|
|
|
|
432
|
my $depth; |
115
|
255
|
50
|
33
|
|
|
733
|
if (defined $recursive && $recursive == 0) { |
116
|
0
|
|
|
|
|
0
|
$depth = 1; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
255
|
|
|
|
|
426
|
my @ret; |
120
|
255
|
|
|
|
|
577
|
my @search = ( $self ); |
121
|
|
|
|
|
|
|
|
122
|
255
|
|
|
|
|
773
|
while (my $part = shift @search) { |
123
|
|
|
|
|
|
|
# If this object matches, mark it for return. |
124
|
287
|
|
|
|
|
1010
|
my $amialeaf = $part->is_leaf(); |
125
|
|
|
|
|
|
|
|
126
|
287
|
100
|
66
|
|
|
3148
|
if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) { |
|
|
|
100
|
|
|
|
|
127
|
269
|
|
|
|
|
685
|
push(@ret, $part); |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
287
|
50
|
33
|
|
|
1209
|
if ( !$amialeaf && (!defined $depth || $depth > 0)) { |
|
|
|
66
|
|
|
|
|
131
|
18
|
50
|
|
|
|
34
|
$depth-- if defined $depth; |
132
|
18
|
|
|
|
|
29
|
unshift(@search, @{$part->{'body_parts'}}); |
|
18
|
|
|
|
|
64
|
|
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
255
|
|
|
|
|
978
|
return @ret; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=item header() |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Stores and retrieves headers from a specific MIME part. The first |
142
|
|
|
|
|
|
|
parameter is the header name. If there is no other parameter, the header |
143
|
|
|
|
|
|
|
is retrieved. If there is a second parameter, the header is stored. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Header names are case-insensitive and are stored in both raw and |
146
|
|
|
|
|
|
|
decoded form. Using header(), only the decoded form is retrievable. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
For retrieval, if header() is called in an array context, an array will |
149
|
|
|
|
|
|
|
be returned with each header entry in a different element. In a scalar |
150
|
|
|
|
|
|
|
context, the last specific header is returned. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
153
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
154
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=cut |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Store or retrieve headers from a given MIME object |
159
|
|
|
|
|
|
|
# |
160
|
|
|
|
|
|
|
sub header { |
161
|
5970
|
|
|
5970
|
1
|
8455
|
my $self = shift; |
162
|
5970
|
|
|
|
|
9585
|
my $rawkey = shift; |
163
|
|
|
|
|
|
|
|
164
|
5970
|
50
|
|
|
|
13904
|
return unless defined $rawkey; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# we're going to do things case insensitively |
167
|
5970
|
|
|
|
|
12288
|
my $key = lc($rawkey); |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
170
|
5970
|
|
|
|
|
13844
|
$key =~ s/^\s+//; |
171
|
5970
|
|
|
|
|
11090
|
$key =~ s/\s+$//; |
172
|
|
|
|
|
|
|
|
173
|
5970
|
100
|
|
|
|
11531
|
if (@_) { |
174
|
898
|
|
|
|
|
1657
|
my $raw_value = shift; |
175
|
898
|
50
|
|
|
|
2127
|
return unless defined $raw_value; |
176
|
|
|
|
|
|
|
|
177
|
898
|
|
|
|
|
1336
|
push @{ $self->{'header_order'} }, $rawkey; |
|
898
|
|
|
|
|
2422
|
|
178
|
898
|
100
|
|
|
|
2601
|
if ( !exists $self->{'headers'}->{$key} ) { |
179
|
824
|
|
|
|
|
2632
|
$self->{'headers'}->{$key} = []; |
180
|
824
|
|
|
|
|
1975
|
$self->{'raw_headers'}->{$key} = []; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
898
|
|
|
|
|
1836
|
my $dec_value = $raw_value; |
184
|
898
|
|
|
|
|
2646
|
$dec_value =~ s/\n[ \t]+/ /gs; |
185
|
898
|
|
|
|
|
4431
|
$dec_value =~ s/\s+$//s; |
186
|
898
|
|
|
|
|
2832
|
$dec_value =~ s/^\s+//s; |
187
|
898
|
|
|
|
|
1392
|
push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key); |
|
898
|
|
|
|
|
2927
|
|
188
|
|
|
|
|
|
|
|
189
|
898
|
|
|
|
|
1593
|
push @{ $self->{'raw_headers'}->{$key} }, $raw_value; |
|
898
|
|
|
|
|
2461
|
|
190
|
|
|
|
|
|
|
|
191
|
898
|
|
|
|
|
4727
|
return $self->{'headers'}->{$key}->[-1]; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
5072
|
100
|
|
|
|
8944
|
if (wantarray) { |
195
|
4763
|
100
|
|
|
|
19029
|
return unless exists $self->{'headers'}->{$key}; |
196
|
1007
|
|
|
|
|
1522
|
return @{ $self->{'headers'}->{$key} }; |
|
1007
|
|
|
|
|
6202
|
|
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
else { |
199
|
309
|
100
|
|
|
|
2500
|
return '' unless exists $self->{'headers'}->{$key}; |
200
|
46
|
|
|
|
|
363
|
return $self->{'headers'}->{$key}->[-1]; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=item raw_header() |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
Retrieves the raw version of headers from a specific MIME part. The only |
207
|
|
|
|
|
|
|
parameter is the header name. Header names are case-insensitive. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
For retrieval, if raw_header() is called in an array context, an array |
210
|
|
|
|
|
|
|
will be returned with each header entry in a different element. In a |
211
|
|
|
|
|
|
|
scalar context, the last specific header is returned. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
214
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
215
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=cut |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Retrieve raw headers from a given MIME object |
220
|
|
|
|
|
|
|
# |
221
|
|
|
|
|
|
|
sub raw_header { |
222
|
493
|
|
|
493
|
1
|
859
|
my $self = shift; |
223
|
493
|
|
|
|
|
1029
|
my $key = lc(shift); |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
226
|
493
|
|
|
|
|
1327
|
$key =~ s/^\s+//; |
227
|
493
|
|
|
|
|
1076
|
$key =~ s/\s+$//; |
228
|
|
|
|
|
|
|
|
229
|
493
|
50
|
|
|
|
985
|
if (wantarray) { |
230
|
493
|
100
|
|
|
|
2042
|
return unless exists $self->{'raw_headers'}->{$key}; |
231
|
128
|
|
|
|
|
253
|
return @{ $self->{'raw_headers'}->{$key} }; |
|
128
|
|
|
|
|
893
|
|
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
else { |
234
|
0
|
0
|
|
|
|
0
|
return '' unless exists $self->{'raw_headers'}->{$key}; |
235
|
0
|
|
|
|
|
0
|
return $self->{'raw_headers'}->{$key}->[-1]; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=item add_body_part() |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Adds a Node child object to the current node object. |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=cut |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# Add a MIME child part to ourselves |
246
|
|
|
|
|
|
|
sub add_body_part { |
247
|
48
|
|
|
48
|
1
|
100
|
my($self, $part) = @_; |
248
|
|
|
|
|
|
|
|
249
|
48
|
|
|
|
|
178
|
dbg("message: added part, type: ".$part->{'type'}); |
250
|
48
|
|
|
|
|
81
|
push @{ $self->{'body_parts'} }, $part; |
|
48
|
|
|
|
|
132
|
|
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=item is_leaf() |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Returns true if the tree node in question is a leaf of the tree (ie: |
256
|
|
|
|
|
|
|
has no children of its own). Note: This function may return odd results |
257
|
|
|
|
|
|
|
unless the message has been mime parsed via _do_parse()! |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=cut |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub is_leaf { |
262
|
287
|
|
|
287
|
1
|
744
|
my($self) = @_; |
263
|
287
|
|
|
|
|
903
|
return !exists $self->{'body_parts'}; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item raw() |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
Return a reference to the the raw array. Treat this as READ ONLY. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=cut |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
sub raw { |
273
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# Ok, if we're called we are expected to return an array. |
276
|
|
|
|
|
|
|
# so if it's a file reference, read in the message into an array... |
277
|
|
|
|
|
|
|
# |
278
|
|
|
|
|
|
|
# NOTE: that "ref undef" works, so don't bother checking for a defined var |
279
|
|
|
|
|
|
|
# first. |
280
|
0
|
0
|
|
|
|
0
|
if (ref $self->{'raw'} eq 'GLOB') { |
281
|
0
|
|
|
|
|
0
|
my $fd = $self->{'raw'}; |
282
|
0
|
0
|
|
|
|
0
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# dbg("message: (raw) reading mime part from a temporary file"); |
285
|
0
|
|
|
|
|
0
|
my($nread,$raw_str); $raw_str = ''; |
|
0
|
|
|
|
|
0
|
|
286
|
0
|
|
|
|
|
0
|
while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { } |
287
|
0
|
0
|
|
|
|
0
|
defined $nread or die "error reading: $!"; |
288
|
0
|
|
|
|
|
0
|
my @array = split(/^/m, $raw_str, -1); |
289
|
|
|
|
|
|
|
|
290
|
0
|
0
|
|
|
|
0
|
dbg("message: empty message read") if $raw_str eq ''; |
291
|
0
|
|
|
|
|
0
|
return \@array; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
0
|
|
|
|
|
0
|
return $self->{'raw'}; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=item decode() |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
If necessary, decode the part text as base64 or quoted-printable. |
300
|
|
|
|
|
|
|
The decoded text will be returned as a scalar string. An optional length |
301
|
|
|
|
|
|
|
parameter can be passed in which limits how much decoded data is returned. |
302
|
|
|
|
|
|
|
If the scalar isn't needed, call with "0" as a parameter. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=cut |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
sub decode { |
307
|
252
|
|
|
252
|
1
|
3385
|
my($self, $bytes) = @_; |
308
|
|
|
|
|
|
|
|
309
|
252
|
100
|
|
|
|
741
|
if ( !exists $self->{'decoded'} ) { |
310
|
|
|
|
|
|
|
# Someone is looking for a decoded part where there is no raw data |
311
|
|
|
|
|
|
|
# (multipart or subparsed message, etc.) Just return undef. |
312
|
144
|
50
|
|
|
|
564
|
return if !exists $self->{'raw'}; |
313
|
|
|
|
|
|
|
|
314
|
144
|
|
|
|
|
264
|
my $raw; |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# if the part is held in a temp file, read it into the scalar |
317
|
144
|
100
|
|
|
|
624
|
if (ref $self->{'raw'} eq 'GLOB') { |
318
|
4
|
|
|
|
|
9
|
my $fd = $self->{'raw'}; |
319
|
4
|
50
|
|
|
|
43
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
# dbg("message: (decode) reading mime part from a temporary file"); |
322
|
4
|
|
|
|
|
10
|
my($nread,$raw_str); $raw = ''; |
|
4
|
|
|
|
|
9
|
|
323
|
4
|
|
|
|
|
210
|
while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { } |
324
|
4
|
50
|
|
|
|
18
|
defined $nread or die "error reading: $!"; |
325
|
|
|
|
|
|
|
|
326
|
4
|
50
|
|
|
|
18
|
dbg("message: empty message read from a temp file") if $raw eq ''; |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
else { |
329
|
|
|
|
|
|
|
# create a new scalar from the raw array in memory |
330
|
140
|
|
|
|
|
306
|
$raw = join('', @{$self->{'raw'}}); |
|
140
|
|
|
|
|
1484
|
|
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
|
333
|
144
|
|
100
|
|
|
658
|
my $encoding = lc $self->header('content-transfer-encoding') || ''; |
334
|
|
|
|
|
|
|
|
335
|
144
|
100
|
|
|
|
753
|
if ( $encoding eq 'quoted-printable' ) { |
|
|
100
|
|
|
|
|
|
336
|
6
|
|
|
|
|
30
|
dbg("message: decoding quoted-printable"); |
337
|
6
|
|
|
|
|
40
|
$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw); |
338
|
6
|
|
|
|
|
39
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
elsif ( $encoding eq 'base64' ) { |
341
|
5
|
|
|
|
|
21
|
dbg("message: decoding base64"); |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# if it's not defined or is 0, do the whole thing, otherwise only decode |
344
|
|
|
|
|
|
|
# a portion |
345
|
5
|
50
|
|
|
|
10
|
if ($bytes) { |
346
|
0
|
|
|
|
|
0
|
return Mail::SpamAssassin::Util::base64_decode($raw, $bytes); |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
else { |
349
|
|
|
|
|
|
|
# Generate the decoded output |
350
|
5
|
|
|
|
|
20
|
$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw); |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
5
|
100
|
|
|
|
32
|
if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) { |
354
|
1
|
|
|
|
|
35
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
else { |
358
|
|
|
|
|
|
|
# Encoding is one of 7bit, 8bit, binary or x-something |
359
|
133
|
100
|
|
|
|
370
|
if ( $encoding ) { |
360
|
31
|
|
|
|
|
176
|
dbg("message: decoding other encoding type ($encoding), ignoring"); |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
else { |
363
|
102
|
|
|
|
|
371
|
dbg("message: no encoding detected"); |
364
|
|
|
|
|
|
|
} |
365
|
133
|
|
|
|
|
838
|
$self->{'decoded'} = $raw; |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
252
|
100
|
66
|
|
|
1442
|
if ( !defined $bytes || $bytes ) { |
370
|
248
|
50
|
|
|
|
747
|
if ( !defined $bytes ) { |
371
|
|
|
|
|
|
|
# force a copy |
372
|
248
|
|
|
|
|
2470
|
return '' . $self->{'decoded'}; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
else { |
375
|
0
|
|
|
|
|
0
|
return substr($self->{'decoded'}, 0, $bytes); |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# Detect endianness of UTF-16 encoded data |
381
|
|
|
|
|
|
|
sub detect_utf16 { |
382
|
0
|
|
|
0
|
0
|
0
|
my $data = $_[0]; # could not avoid copying large strings |
383
|
0
|
|
|
|
|
0
|
my $utf16le_clues = 0; |
384
|
0
|
|
|
|
|
0
|
my $utf16be_clues = 0; |
385
|
0
|
|
|
|
|
0
|
my $sum_h_e = 0; |
386
|
0
|
|
|
|
|
0
|
my $sum_h_o = 0; |
387
|
0
|
|
|
|
|
0
|
my $sum_l_e = 0; |
388
|
0
|
|
|
|
|
0
|
my $sum_l_o = 0; |
389
|
0
|
|
|
|
|
0
|
my $decoder = undef; |
390
|
|
|
|
|
|
|
|
391
|
0
|
|
|
|
|
0
|
my @msg_h = unpack 'H' x length( $data ), $data; |
392
|
0
|
|
|
|
|
0
|
my @msg_l = unpack 'h' x length( $data ), $data; |
393
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
0
|
for( my $i = 0; $i < length( $data ); $i+=2 ) { |
395
|
0
|
|
|
|
|
0
|
my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] ); |
396
|
0
|
|
|
|
|
0
|
$sum_h_e += hex $msg_h[$i]; |
397
|
0
|
|
|
|
|
0
|
$sum_h_o += hex $msg_h[$i+1]; |
398
|
0
|
|
|
|
|
0
|
$sum_l_e += hex $msg_l[$i]; |
399
|
0
|
|
|
|
|
0
|
$sum_l_o += hex $msg_l[$i+1]; |
400
|
0
|
0
|
|
|
|
0
|
if( $check_char =~ /20 00/ ) { |
401
|
|
|
|
|
|
|
# UTF-16LE space char detected |
402
|
0
|
|
|
|
|
0
|
$utf16le_clues++; |
403
|
|
|
|
|
|
|
} |
404
|
0
|
0
|
|
|
|
0
|
if( $check_char =~ /00 20/ ) { |
405
|
|
|
|
|
|
|
# UTF-16BE space char detected |
406
|
0
|
|
|
|
|
0
|
$utf16be_clues++; |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE |
411
|
0
|
0
|
|
|
|
0
|
$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 ); |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE |
414
|
0
|
0
|
|
|
|
0
|
$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) ); |
415
|
|
|
|
|
|
|
|
416
|
0
|
0
|
|
|
|
0
|
if( $utf16le_clues > $utf16be_clues ) { |
|
|
0
|
|
|
|
|
|
417
|
0
|
|
|
|
|
0
|
dbg( "message: detect_utf16: UTF-16LE" ); |
418
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding("UTF-16LE"); |
419
|
|
|
|
|
|
|
} elsif( $utf16le_clues > $utf16be_clues ) { |
420
|
0
|
|
|
|
|
0
|
dbg( "message: detect_utf16: UTF-16BE" ); |
421
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding("UTF-16BE"); |
422
|
|
|
|
|
|
|
} else { |
423
|
0
|
|
|
|
|
0
|
dbg( "message: detect_utf16: Could not detect UTF-16 endianness" ); |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
|
426
|
0
|
|
|
|
|
0
|
return $decoder; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
# Look at a text scalar and determine whether it should be rendered |
430
|
|
|
|
|
|
|
# as text/html. |
431
|
|
|
|
|
|
|
# |
432
|
|
|
|
|
|
|
# This is not a public function. |
433
|
|
|
|
|
|
|
# |
434
|
|
|
|
|
|
|
sub _html_render { |
435
|
0
|
0
|
|
0
|
|
0
|
if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is) |
436
|
|
|
|
|
|
|
{ |
437
|
0
|
|
|
|
|
0
|
my $pad = $1; |
438
|
0
|
|
|
|
|
0
|
my $count = 0; |
439
|
0
|
|
|
|
|
0
|
$count += ($pad =~ tr/\n//d) * 2; |
440
|
0
|
|
|
|
|
0
|
$count += ($pad =~ tr/\n//cd); |
441
|
0
|
|
|
|
|
0
|
return ($count < 24); |
442
|
|
|
|
|
|
|
} |
443
|
0
|
|
|
|
|
0
|
return 0; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
# Decode character set of a given text to perl characters (Unicode), |
447
|
|
|
|
|
|
|
# then encode into UTF-8 octets if requested. |
448
|
|
|
|
|
|
|
# |
449
|
|
|
|
|
|
|
sub _normalize { |
450
|
|
|
|
|
|
|
# my $data = $_[0]; # avoid copying large strings |
451
|
10
|
|
|
10
|
|
23
|
my $charset_declared = $_[1]; |
452
|
10
|
|
|
|
|
20
|
my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets |
453
|
|
|
|
|
|
|
|
454
|
10
|
50
|
|
|
|
38
|
warn "message: _normalize() was given characters, expected bytes: $_[0]\n" |
455
|
|
|
|
|
|
|
if utf8::is_utf8($_[0]); |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879] |
458
|
10
|
|
|
|
|
29
|
my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data |
459
|
|
|
|
|
|
|
|
460
|
10
|
50
|
33
|
|
|
128
|
if (!defined $charset_declared || $charset_declared eq '') { |
461
|
0
|
|
|
|
|
0
|
$charset_declared = 'us-ascii'; |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# number of characters with code above 127 |
465
|
10
|
|
|
|
|
31
|
my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c; |
466
|
|
|
|
|
|
|
|
467
|
10
|
50
|
33
|
|
|
95
|
if (!$cnt_8bits && |
468
|
|
|
|
|
|
|
$charset_declared =~ |
469
|
|
|
|
|
|
|
/^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) | |
470
|
|
|
|
|
|
|
ISO646-US )\z/xsi) |
471
|
|
|
|
|
|
|
{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is |
472
|
0
|
|
|
|
|
0
|
dbg("message: kept, charset is US-ASCII as declared"); |
473
|
0
|
|
|
|
|
0
|
return $_[0]; # is all-ASCII, no need for decoding |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
10
|
100
|
66
|
|
|
93
|
if (!$cnt_8bits && |
477
|
|
|
|
|
|
|
$charset_declared =~ |
478
|
|
|
|
|
|
|
/^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} | |
479
|
|
|
|
|
|
|
UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} | |
480
|
|
|
|
|
|
|
Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi) |
481
|
|
|
|
|
|
|
{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII |
482
|
8
|
|
|
|
|
26
|
dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared); |
483
|
8
|
|
|
|
|
71
|
return $_[0]; # is all-ASCII, no need for decoding |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
# Try first to strictly decode based on a declared character set. |
487
|
|
|
|
|
|
|
|
488
|
2
|
|
|
|
|
5
|
my $rv; |
489
|
2
|
50
|
33
|
|
|
20
|
if ($charset_declared =~ /^UTF-?8\z/i) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC) |
491
|
0
|
0
|
|
|
|
0
|
if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
492
|
0
|
|
|
|
|
0
|
dbg("message: decoded as declared charset UTF-8"); |
493
|
0
|
0
|
|
|
|
0
|
return $_[0] if !$return_decoded; |
494
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
495
|
0
|
|
|
|
|
0
|
return $rv; # decoded |
496
|
|
|
|
|
|
|
} else { |
497
|
0
|
|
|
|
|
0
|
my $err = ''; |
498
|
0
|
0
|
|
|
|
0
|
if ($@) { |
499
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
500
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
501
|
|
|
|
|
|
|
} |
502
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding as declared charset UTF-8 ($err)"); |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
} elsif ($charset_declared =~ /^UTF[ -]?16/i) { |
506
|
|
|
|
|
|
|
# Handle cases where spammers use UTF-16 encoding without including a BOM |
507
|
|
|
|
|
|
|
# or declaring endianness as reported at: |
508
|
|
|
|
|
|
|
# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252 |
509
|
|
|
|
|
|
|
|
510
|
0
|
|
|
|
|
0
|
my $decoder = detect_utf16( $_[0] ); |
511
|
0
|
0
|
|
|
|
0
|
if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
512
|
0
|
|
|
|
|
0
|
dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name); |
513
|
0
|
0
|
|
|
|
0
|
return $_[0] if !$return_decoded; |
514
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
515
|
0
|
|
|
|
|
0
|
return $rv; # decoded |
516
|
|
|
|
|
|
|
} else { |
517
|
0
|
|
|
|
|
0
|
my $err = ''; |
518
|
0
|
0
|
|
|
|
0
|
if ($@) { |
519
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
520
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
521
|
|
|
|
|
|
|
} |
522
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err); |
523
|
|
|
|
|
|
|
}; |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
} elsif ($cnt_8bits && |
526
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
527
|
0
|
|
|
|
|
0
|
dbg("message: decoded as charset UTF-8, declared %s", $charset_declared); |
528
|
0
|
0
|
|
|
|
0
|
return $_[0] if !$return_decoded; |
529
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
530
|
0
|
|
|
|
|
0
|
return $rv; # decoded |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) { |
533
|
|
|
|
|
|
|
# declared as US-ASCII but contains 8-bit characters, makes no sense |
534
|
|
|
|
|
|
|
# to attempt decoding first as strict US-ASCII as we know it would fail |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
} else { |
537
|
|
|
|
|
|
|
# try decoding as a declared character set |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
# -> http://en.wikipedia.org/wiki/Windows-1252 |
540
|
|
|
|
|
|
|
# Windows-1252 character encoding is a superset of ISO 8859-1, but differs |
541
|
|
|
|
|
|
|
# from the IANA's ISO-8859-1 by using displayable characters rather than |
542
|
|
|
|
|
|
|
# control characters in the 80 to 9F (hex) range. [...] |
543
|
|
|
|
|
|
|
# It is very common to mislabel Windows-1252 text with the charset label |
544
|
|
|
|
|
|
|
# ISO-8859-1. A common result was that all the quotes and apostrophes |
545
|
|
|
|
|
|
|
# (produced by "smart quotes" in word-processing software) were replaced |
546
|
|
|
|
|
|
|
# with question marks or boxes on non-Windows operating systems, making |
547
|
|
|
|
|
|
|
# text difficult to read. Most modern web browsers and e-mail clients |
548
|
|
|
|
|
|
|
# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate |
549
|
|
|
|
|
|
|
# such mislabeling. This is now standard behavior in the draft HTML 5 |
550
|
|
|
|
|
|
|
# specification, which requires that documents advertised as ISO-8859-1 |
551
|
|
|
|
|
|
|
# actually be parsed with the Windows-1252 encoding. |
552
|
|
|
|
|
|
|
# |
553
|
2
|
|
|
|
|
4
|
my($chset, $decoder); |
554
|
2
|
50
|
|
|
|
15
|
if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) { |
555
|
0
|
|
|
|
|
0
|
$chset = 'Windows-1252'; $decoder = $enc_w1252; |
|
0
|
|
|
|
|
0
|
|
556
|
|
|
|
|
|
|
} else { |
557
|
2
|
|
|
|
|
7
|
$chset = $charset_declared; $decoder = Encode::find_encoding($chset); |
|
2
|
|
|
|
|
14
|
|
558
|
2
|
50
|
33
|
|
|
8317
|
if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
559
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
560
|
0
|
0
|
|
|
|
0
|
dbg("message: no decoder for a declared charset %s, using GBK", |
561
|
|
|
|
|
|
|
$chset) if $decoder; |
562
|
|
|
|
|
|
|
} |
563
|
|
|
|
|
|
|
} |
564
|
2
|
50
|
|
|
|
7
|
if (!$decoder) { |
565
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding, no decoder for a declared charset %s", |
566
|
|
|
|
|
|
|
$chset); |
567
|
|
|
|
|
|
|
} else { |
568
|
2
|
|
|
|
|
5
|
my $err = ''; |
569
|
2
|
|
|
|
|
3
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
2
|
|
|
|
|
9
|
|
570
|
2
|
50
|
|
|
|
232
|
if ($@) { |
571
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
572
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
573
|
|
|
|
|
|
|
} |
574
|
2
|
50
|
|
|
|
11
|
if (lc $chset eq lc $charset_declared) { |
575
|
2
|
50
|
|
|
|
12
|
dbg("message: %s as declared charset %s%s", |
576
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err); |
577
|
|
|
|
|
|
|
} else { |
578
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as charset %s, declared %s%s", |
579
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
580
|
|
|
|
|
|
|
$chset, $charset_declared, $err); |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
# If the above failed, check if it is US-ASCII, possibly extended by few |
586
|
|
|
|
|
|
|
# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing |
587
|
|
|
|
|
|
|
# some popular punctuation or special characters from Windows-1252 in |
588
|
|
|
|
|
|
|
# the \x80-\x9F range (which is unassigned in ISO-8859-*). |
589
|
|
|
|
|
|
|
# Note that Windows-1252 is a proper superset of ISO-8859-1. |
590
|
|
|
|
|
|
|
# |
591
|
2
|
50
|
33
|
|
|
13
|
if (!defined $rv && !$cnt_8bits) { |
|
|
50
|
33
|
|
|
|
|
|
|
|
0
|
|
|
|
|
592
|
0
|
|
|
|
|
0
|
dbg("message: kept, guessed charset is US-ASCII, declared %s", |
593
|
|
|
|
|
|
|
$charset_declared); |
594
|
0
|
|
|
|
|
0
|
return $_[0]; # is all-ASCII, no need for decoding |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
} elsif (!defined $rv && $enc_w1252 && |
597
|
|
|
|
|
|
|
# ASCII NBSP (c) SHY ' " ... '".- TM |
598
|
|
|
|
|
|
|
#$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c) |
599
|
|
|
|
|
|
|
# Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection, |
600
|
|
|
|
|
|
|
# Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!) |
601
|
|
|
|
|
|
|
$_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c) |
602
|
|
|
|
|
|
|
{ # ASCII + NBSP + SHY + some punctuation characters |
603
|
|
|
|
|
|
|
# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too |
604
|
|
|
|
|
|
|
# consider also: AE (r), 80 Euro |
605
|
0
|
|
|
|
|
0
|
my $err = ''; |
606
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
0
|
|
|
|
|
0
|
|
607
|
0
|
0
|
|
|
|
0
|
if ($@) { |
608
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
609
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
610
|
|
|
|
|
|
|
} |
611
|
|
|
|
|
|
|
# the above can't fail, but keep code general just in case |
612
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as guessed charset %s, declared %s%s", |
613
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
614
|
|
|
|
|
|
|
'Windows-1252', $charset_declared, $err); |
615
|
|
|
|
|
|
|
} |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# If we were unsuccessful so far, try some guesswork |
618
|
|
|
|
|
|
|
# based on Encode::Detect::Detector . |
619
|
|
|
|
|
|
|
|
620
|
2
|
50
|
|
|
|
6
|
if (defined $rv) { |
|
|
0
|
|
|
|
|
|
621
|
|
|
|
|
|
|
# done, no need for guesswork |
622
|
|
|
|
|
|
|
} elsif (!$have_encode_detector) { |
623
|
0
|
|
|
|
|
0
|
dbg("message: Encode::Detect::Detector not available, declared %s failed", |
624
|
|
|
|
|
|
|
$charset_declared); |
625
|
|
|
|
|
|
|
} else { |
626
|
0
|
|
|
|
|
0
|
my $charset_detected = Encode::Detect::Detector::detect($_[0]); |
627
|
0
|
0
|
0
|
|
|
0
|
if ($charset_detected && lc $charset_detected ne lc $charset_declared) { |
628
|
0
|
|
|
|
|
0
|
my $decoder = Encode::find_encoding($charset_detected); |
629
|
0
|
0
|
0
|
|
|
0
|
if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
630
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
631
|
0
|
0
|
|
|
|
0
|
dbg("message: no decoder for a detected charset %s, using GBK", |
632
|
|
|
|
|
|
|
$charset_detected) if $decoder; |
633
|
|
|
|
|
|
|
} |
634
|
0
|
0
|
|
|
|
0
|
if (!$decoder) { |
635
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding, no decoder for a detected charset %s", |
636
|
|
|
|
|
|
|
$charset_detected); |
637
|
|
|
|
|
|
|
} else { |
638
|
0
|
|
|
|
|
0
|
my $err = ''; |
639
|
0
|
|
|
|
|
0
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
0
|
|
|
|
|
0
|
|
640
|
0
|
0
|
|
|
|
0
|
if ($@) { |
641
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
642
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
643
|
|
|
|
|
|
|
} |
644
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as detected charset %s, declared %s%s", |
645
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
646
|
|
|
|
|
|
|
$charset_detected, $charset_declared, $err); |
647
|
|
|
|
|
|
|
} |
648
|
|
|
|
|
|
|
} |
649
|
|
|
|
|
|
|
} |
650
|
|
|
|
|
|
|
|
651
|
2
|
50
|
|
|
|
7
|
if (!defined $rv) { # all decoding attempts failed so far, probably garbage |
652
|
|
|
|
|
|
|
# go for Windows-1252 which can't fail |
653
|
0
|
|
|
|
|
0
|
my $err = ''; |
654
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0]) }; |
|
0
|
|
|
|
|
0
|
|
655
|
0
|
0
|
|
|
|
0
|
if ($@) { |
656
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
657
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
658
|
|
|
|
|
|
|
} |
659
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as last-resort charset %s, declared %s%s", |
660
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
661
|
|
|
|
|
|
|
'Windows-1252', $charset_declared, $err); |
662
|
|
|
|
|
|
|
} |
663
|
|
|
|
|
|
|
|
664
|
2
|
50
|
|
|
|
5
|
if (!defined $rv) { # just in case - all decoding attempts failed so far |
665
|
0
|
|
|
|
|
0
|
return $_[0]; # garbage-in / garbage-out, return unchanged octets |
666
|
|
|
|
|
|
|
} |
667
|
|
|
|
|
|
|
# decoding octets to characters was successful |
668
|
2
|
50
|
|
|
|
6
|
if (!$return_decoded) { |
669
|
|
|
|
|
|
|
# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg |
670
|
2
|
|
|
|
|
6
|
utf8::encode($rv); # encode Unicode characters to UTF-8 octets |
671
|
|
|
|
|
|
|
} |
672
|
2
|
|
|
|
|
5
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
673
|
2
|
|
|
|
|
15
|
return $rv; |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item rendered() |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
render_text() takes the given text/* type MIME part, and attempts to |
679
|
|
|
|
|
|
|
render it into a text scalar. It will always render text/html, and will |
680
|
|
|
|
|
|
|
use a heuristic to determine if other text/* parts should be considered |
681
|
|
|
|
|
|
|
text/html. Two scalars are returned: the rendered type (either text/html |
682
|
|
|
|
|
|
|
or whatever the original type was), and the rendered text. |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=cut |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
sub rendered { |
687
|
141
|
|
|
141
|
1
|
387
|
my ($self) = @_; |
688
|
|
|
|
|
|
|
|
689
|
141
|
100
|
|
|
|
550
|
if (!exists $self->{rendered}) { |
690
|
|
|
|
|
|
|
# We only know how to render text/plain and text/html ... |
691
|
|
|
|
|
|
|
# Note: for bug 4843, make sure to skip text/calendar parts |
692
|
|
|
|
|
|
|
# we also want to skip things like text/x-vcard |
693
|
|
|
|
|
|
|
# text/x-aol is ignored here, but looks like text/html ... |
694
|
125
|
50
|
|
|
|
1254
|
return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); |
695
|
|
|
|
|
|
|
|
696
|
125
|
|
|
|
|
718
|
my $text = $self->decode; # QP and Base64 decoding, bytes |
697
|
125
|
|
|
|
|
537
|
my $text_len = length($text); # num of bytes in original charset encoding |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
# render text/html always |
700
|
125
|
100
|
100
|
|
|
1479
|
if ($text ne '' && $self->{'type'} =~ m{^text/html$}i) |
701
|
|
|
|
|
|
|
{ |
702
|
7
|
|
|
|
|
48
|
$self->{rendered_type} = 'text/html'; |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
# will input text to HTML::Parser be provided as Unicode characters? |
705
|
7
|
|
|
|
|
19
|
my $character_semantics = 0; # $text is in bytes |
706
|
7
|
50
|
33
|
|
|
113
|
if ($self->{normalize} && $enc_utf8) { # charset decoding requested |
|
|
100
|
100
|
|
|
|
|
707
|
|
|
|
|
|
|
# Provide input to HTML::Parser as Unicode characters |
708
|
|
|
|
|
|
|
# which avoids a HTML::Parser bug in utf8_mode |
709
|
|
|
|
|
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=99755 |
710
|
|
|
|
|
|
|
# Note: the above bug was fixed in HTML-Parser 3.72, January 2016. |
711
|
|
|
|
|
|
|
# Avoid unnecessary step of encoding-then-decoding by telling |
712
|
|
|
|
|
|
|
# subroutine _normalize() to return Unicode text. See Bug 7133 |
713
|
|
|
|
|
|
|
# |
714
|
0
|
|
|
|
|
0
|
$character_semantics = 1; # $text will be in characters |
715
|
0
|
|
|
|
|
0
|
$text = _normalize($text, $self->{charset}, 1); # bytes to chars |
716
|
|
|
|
|
|
|
} elsif (!defined $self->{charset} || |
717
|
|
|
|
|
|
|
$self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) { |
718
|
|
|
|
|
|
|
# With some luck input can be interpreted as UTF-8, do not warn. |
719
|
|
|
|
|
|
|
# It is still possible to hit the HTML::Parses utf8_mode bug however. |
720
|
|
|
|
|
|
|
} else { |
721
|
|
|
|
|
|
|
dbg("message: 'normalize_charset' is off, encoding will likely ". |
722
|
1
|
|
|
|
|
11
|
"be misinterpreted; declared charset: %s", $self->{charset}); |
723
|
|
|
|
|
|
|
} |
724
|
|
|
|
|
|
|
# the 0 requires decoded HTML results to be in bytes (not characters) |
725
|
7
|
|
|
|
|
135
|
my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object |
726
|
|
|
|
|
|
|
|
727
|
7
|
|
|
|
|
63
|
$html->parse($text); # parse+render text |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# resulting HTML-decoded text is in bytes, likely encoded as UTF-8 |
730
|
7
|
|
|
|
|
38
|
$self->{rendered} = $html->get_rendered_text(); |
731
|
7
|
|
|
|
|
40
|
$self->{visible_rendered} = $html->get_rendered_text(invisible => 0); |
732
|
7
|
|
|
|
|
31
|
$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1); |
733
|
7
|
|
|
|
|
42
|
$self->{html_results} = $html->get_results(); |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
# end-of-document result values that require looking at the text |
736
|
7
|
|
|
|
|
25
|
my $r = $self->{html_results}; # temporary reference for brevity |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# count the number of spaces in the rendered text (likely UTF-8 octets) |
739
|
7
|
|
|
|
|
45
|
my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//; |
740
|
|
|
|
|
|
|
# we may want to add the count of other Unicode whitespace characters |
741
|
|
|
|
|
|
|
|
742
|
7
|
|
|
|
|
27
|
$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8) |
743
|
7
|
|
|
|
|
31
|
$r->{non_space_len} = $r->{html_length} - $space; |
744
|
7
|
50
|
|
|
|
200
|
$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len; |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
else { # plain text |
748
|
118
|
0
|
33
|
|
|
529
|
if ($self->{normalize} && $enc_utf8) { |
749
|
|
|
|
|
|
|
# request transcoded result as UTF-8 octets! |
750
|
0
|
|
|
|
|
0
|
$text = _normalize($text, $self->{charset}, 0); |
751
|
|
|
|
|
|
|
} |
752
|
118
|
|
|
|
|
510
|
$self->{rendered_type} = $self->{type}; |
753
|
118
|
|
|
|
|
615
|
$self->{rendered} = $self->{'visible_rendered'} = $text; |
754
|
118
|
|
|
|
|
543
|
$self->{'invisible_rendered'} = ''; |
755
|
|
|
|
|
|
|
} |
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
|
758
|
141
|
|
|
|
|
711
|
return ($self->{rendered_type}, $self->{rendered}); |
759
|
|
|
|
|
|
|
} |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=item set_rendered($text, $type) |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
Set the rendered text and type for the given part. If type is not |
764
|
|
|
|
|
|
|
specified, and text is a defined value, a default of 'text/plain' is used. |
765
|
|
|
|
|
|
|
This can be used, for instance, to render non-text parts using plugins. |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
=cut |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
sub set_rendered { |
770
|
0
|
|
|
0
|
1
|
0
|
my ($self, $text, $type) = @_; |
771
|
|
|
|
|
|
|
|
772
|
0
|
0
|
0
|
|
|
0
|
$type = 'text/plain' if (!defined $type && defined $text); |
773
|
|
|
|
|
|
|
|
774
|
0
|
|
|
|
|
0
|
$self->{'rendered_type'} = $type; |
775
|
0
|
|
|
|
|
0
|
$self->{'rendered'} = $self->{'visible_rendered'} = $text; |
776
|
0
|
0
|
|
|
|
0
|
$self->{'invisible_rendered'} = defined $text ? '' : undef; |
777
|
|
|
|
|
|
|
} |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=item visible_rendered() |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
Render and return the visible text in this part. |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=cut |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
sub visible_rendered { |
786
|
8
|
|
|
8
|
1
|
27
|
my ($self) = @_; |
787
|
8
|
|
|
|
|
68
|
$self->rendered(); # ignore return, we want just this: |
788
|
8
|
|
|
|
|
42
|
return ($self->{rendered_type}, $self->{visible_rendered}); |
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
=item invisible_rendered() |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
Render and return the invisible text in this part. |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
=cut |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
sub invisible_rendered { |
798
|
8
|
|
|
8
|
1
|
25
|
my ($self) = @_; |
799
|
8
|
|
|
|
|
31
|
$self->rendered(); # ignore return, we want just this: |
800
|
8
|
|
|
|
|
29
|
return ($self->{rendered_type}, $self->{invisible_rendered}); |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
=item content_summary() |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
Returns an array of scalars describing the mime parts of the message. |
806
|
|
|
|
|
|
|
Note: This function requires that the message be parsed first! |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=cut |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
# return an array with scalars describing mime parts |
811
|
|
|
|
|
|
|
sub content_summary { |
812
|
13
|
|
|
13
|
1
|
231
|
my($self) = @_; |
813
|
|
|
|
|
|
|
|
814
|
13
|
|
|
|
|
53
|
my @ret = ( [ $self->{'type'} ] ); |
815
|
13
|
|
|
|
|
21
|
my @search; |
816
|
|
|
|
|
|
|
|
817
|
13
|
100
|
|
|
|
32
|
if (exists $self->{'body_parts'}) { |
818
|
11
|
|
|
|
|
15
|
my $count = @{$self->{'body_parts'}}; |
|
11
|
|
|
|
|
21
|
|
819
|
11
|
|
|
|
|
34
|
for(my $i=0; $i<$count; $i++) { |
820
|
24
|
|
|
|
|
73
|
push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]); |
821
|
|
|
|
|
|
|
} |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
|
824
|
13
|
|
|
|
|
32
|
while(my $part = shift @search) { |
825
|
45
|
|
|
|
|
53
|
my($index, $part) = @{$part}; |
|
45
|
|
|
|
|
81
|
|
826
|
45
|
|
|
|
|
63
|
push(@{$ret[$index]}, $part->{'type'}); |
|
45
|
|
|
|
|
102
|
|
827
|
45
|
100
|
|
|
|
147
|
if (exists $part->{'body_parts'}) { |
828
|
11
|
|
|
|
|
16
|
unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}}); |
|
21
|
|
|
|
|
57
|
|
|
11
|
|
|
|
|
19
|
|
829
|
|
|
|
|
|
|
} |
830
|
|
|
|
|
|
|
} |
831
|
|
|
|
|
|
|
|
832
|
13
|
|
|
|
|
27
|
return map { join(",", @{$_}) } @ret; |
|
37
|
|
|
|
|
45
|
|
|
37
|
|
|
|
|
183
|
|
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
=item delete_header() |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
Delete the specified header (decoded and raw) from the Node information. |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
=cut |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
sub delete_header { |
842
|
505
|
|
|
505
|
1
|
1125
|
my($self, $hdr) = @_; |
843
|
|
|
|
|
|
|
|
844
|
505
|
|
|
|
|
719
|
foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) { |
|
505
|
|
|
|
|
7514
|
|
845
|
0
|
|
|
|
|
0
|
delete $self->{'headers'}->{$_}; |
846
|
0
|
|
|
|
|
0
|
delete $self->{'raw_headers'}->{$_}; |
847
|
|
|
|
|
|
|
} |
848
|
|
|
|
|
|
|
|
849
|
505
|
|
|
|
|
1217
|
my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}}); |
|
505
|
|
|
|
|
5841
|
|
850
|
505
|
|
|
|
|
1971
|
$self->{'header_order'} = \@neworder; |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
# decode a header appropriately. don't bother adding it to the pod documents. |
854
|
|
|
|
|
|
|
sub __decode_header { |
855
|
10
|
|
|
10
|
|
50
|
my ( $encoding, $cte, $data ) = @_; |
856
|
|
|
|
|
|
|
|
857
|
10
|
100
|
|
|
|
51
|
if ( $cte eq 'B' ) { |
|
|
50
|
|
|
|
|
|
858
|
|
|
|
|
|
|
# base 64 encoded |
859
|
2
|
|
|
|
|
11
|
$data = Mail::SpamAssassin::Util::base64_decode($data); |
860
|
|
|
|
|
|
|
} |
861
|
|
|
|
|
|
|
elsif ( $cte eq 'Q' ) { |
862
|
|
|
|
|
|
|
# quoted printable |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
# the RFC states that in the encoded text, "_" is equal to "=20" |
865
|
8
|
|
|
|
|
20
|
$data =~ s/_/=20/g; |
866
|
|
|
|
|
|
|
|
867
|
8
|
|
|
|
|
22
|
$data = Mail::SpamAssassin::Util::qp_decode($data); |
868
|
|
|
|
|
|
|
} |
869
|
|
|
|
|
|
|
else { |
870
|
|
|
|
|
|
|
# not possible since the input has already been limited to 'B' and 'Q' |
871
|
0
|
|
|
|
|
0
|
die "message: unknown encoding type '$cte' in RFC2047 header"; |
872
|
|
|
|
|
|
|
} |
873
|
10
|
|
|
|
|
40
|
return _normalize($data, $encoding, 0); # transcode to UTF-8 octets |
874
|
|
|
|
|
|
|
} |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# Decode base64 and quoted-printable in headers according to RFC2047. |
877
|
|
|
|
|
|
|
# |
878
|
|
|
|
|
|
|
sub _decode_header { |
879
|
898
|
|
|
898
|
|
2843
|
my($header_field_body, $header_field_name) = @_; |
880
|
|
|
|
|
|
|
|
881
|
898
|
100
|
66
|
|
|
5232
|
return '' unless defined $header_field_body && $header_field_body ne ''; |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
# deal with folding and cream the newlines and such |
884
|
848
|
|
|
|
|
1944
|
$header_field_body =~ s/\n[ \t]+/\n /g; |
885
|
848
|
|
|
|
|
1381
|
$header_field_body =~ s/\015?\012//gs; |
886
|
|
|
|
|
|
|
|
887
|
848
|
100
|
|
|
|
3487
|
if ($header_field_name =~ |
888
|
|
|
|
|
|
|
/^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) | |
889
|
|
|
|
|
|
|
MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) { |
890
|
|
|
|
|
|
|
# Bug 6945: some header fields must not be processed for MIME encoding |
891
|
|
|
|
|
|
|
# Bug 7466: leave out the Content-* |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
} else { |
894
|
515
|
|
|
|
|
1909
|
local($1,$2,$3); |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
# Multiple encoded sections must ignore the interim whitespace. |
897
|
|
|
|
|
|
|
# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE |
898
|
|
|
|
|
|
|
# separated by whitespace. |
899
|
515
|
|
|
|
|
1438
|
1 while $header_field_body =~ |
900
|
|
|
|
|
|
|
s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+ |
901
|
|
|
|
|
|
|
( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) } |
902
|
|
|
|
|
|
|
{$1$2}xsg; |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
# transcode properly encoded RFC 2047 substrings into UTF-8 octets, |
905
|
|
|
|
|
|
|
# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532) |
906
|
515
|
|
|
|
|
1432
|
# or plain US-ASCII |
907
|
10
|
|
|
|
|
54
|
$header_field_body =~ |
908
|
|
|
|
|
|
|
s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) } |
909
|
|
|
|
|
|
|
{ __decode_header($1, uc($2), $3) }xsge; |
910
|
|
|
|
|
|
|
} |
911
|
848
|
|
|
|
|
3059
|
|
912
|
|
|
|
|
|
|
# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body); |
913
|
|
|
|
|
|
|
return $header_field_body; |
914
|
|
|
|
|
|
|
} |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
=item get_header() |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
Retrieve a specific header. Will have a newline at the end and will be |
919
|
|
|
|
|
|
|
unfolded. The first parameter is the header name (case-insensitive), |
920
|
|
|
|
|
|
|
and the second parameter (optional) is whether or not to return the |
921
|
|
|
|
|
|
|
raw header. |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
If get_header() is called in an array context, an array will be returned |
924
|
|
|
|
|
|
|
with each header entry in a different element. In a scalar context, |
925
|
|
|
|
|
|
|
the last specific header is returned. |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
928
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
929
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
Btw, returning the last header field (not the first) happens to be consistent |
932
|
|
|
|
|
|
|
with DKIM signatures, which search for and cover multiple header fields |
933
|
|
|
|
|
|
|
bottom-up according to the 'h' tag. Let's keep it this way. |
934
|
|
|
|
|
|
|
|
935
|
|
|
|
|
|
|
=cut |
936
|
4405
|
|
|
4405
|
1
|
9261
|
|
937
|
4405
|
|
50
|
|
|
16140
|
sub get_header { |
938
|
|
|
|
|
|
|
my ($self, $hdr, $raw) = @_; |
939
|
|
|
|
|
|
|
$raw ||= 0; |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
# And now pick up all the entries into a list |
942
|
|
|
|
|
|
|
# This is assumed to include a newline at the end ... |
943
|
|
|
|
|
|
|
# This is also assumed to have removed continuation bits ... |
944
|
4405
|
|
|
|
|
5674
|
|
945
|
4405
|
50
|
|
|
|
7683
|
# Deal with the possibility that header() or raw_header() returns undef |
946
|
0
|
0
|
|
|
|
0
|
my @hdrs; |
947
|
0
|
|
|
|
|
0
|
if ( $raw ) { |
948
|
|
|
|
|
|
|
if (@hdrs = $self->raw_header($hdr)) { |
949
|
|
|
|
|
|
|
s/\015?\012\s+/ /gs for @hdrs; |
950
|
|
|
|
|
|
|
} |
951
|
4405
|
100
|
|
|
|
9102
|
} |
952
|
844
|
|
|
|
|
3549
|
else { |
953
|
|
|
|
|
|
|
if (@hdrs = $self->header($hdr)) { |
954
|
|
|
|
|
|
|
$_ .= "\n" for @hdrs; |
955
|
|
|
|
|
|
|
} |
956
|
4405
|
100
|
|
|
|
8703
|
} |
957
|
3856
|
|
|
|
|
10981
|
|
958
|
|
|
|
|
|
|
if (wantarray) { |
959
|
|
|
|
|
|
|
return @hdrs; |
960
|
549
|
100
|
|
|
|
2819
|
} |
961
|
|
|
|
|
|
|
else { |
962
|
|
|
|
|
|
|
return @hdrs ? $hdrs[-1] : undef; |
963
|
|
|
|
|
|
|
} |
964
|
|
|
|
|
|
|
} |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
=item get_all_headers() |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
Retrieve all headers. Each header will have a newline at the end and |
969
|
|
|
|
|
|
|
will be unfolded. The first parameter (optional) is whether or not to |
970
|
|
|
|
|
|
|
return the raw headers, and the second parameter (optional) is whether |
971
|
|
|
|
|
|
|
or not to include the mbox separator. |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
If get_all_header() is called in an array context, an array will be |
974
|
|
|
|
|
|
|
returned with each header entry in a different element. In a scalar |
975
|
|
|
|
|
|
|
context, the headers are returned in a single scalar. |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
=back |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
=cut |
980
|
|
|
|
|
|
|
|
981
|
45
|
|
|
45
|
1
|
154
|
# build it and it will not bomb |
982
|
45
|
|
50
|
|
|
280
|
sub get_all_headers { |
983
|
45
|
|
100
|
|
|
191
|
my ($self, $raw, $include_mbox) = @_; |
984
|
|
|
|
|
|
|
$raw ||= 0; |
985
|
45
|
|
|
|
|
106
|
$include_mbox ||= 0; |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
my @lines; |
988
|
45
|
|
|
|
|
84
|
|
989
|
45
|
|
|
|
|
78
|
# precalculate destination positions based on order of appearance |
990
|
45
|
|
|
|
|
75
|
my $i = 0; |
|
45
|
|
|
|
|
181
|
|
991
|
520
|
|
|
|
|
674
|
my %locations; |
|
520
|
|
|
|
|
1722
|
|
992
|
|
|
|
|
|
|
for my $k (@{$self->{header_order}}) { |
993
|
|
|
|
|
|
|
push(@{$locations{lc($k)}}, $i++); |
994
|
|
|
|
|
|
|
} |
995
|
45
|
|
|
|
|
102
|
|
996
|
45
|
|
|
|
|
91
|
# process headers in order of first appearance |
997
|
45
|
|
|
|
|
390
|
my $header; |
|
902
|
|
|
|
|
1476
|
|
998
|
|
|
|
|
|
|
my $size = 0; |
999
|
|
|
|
|
|
|
HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] } |
1000
|
|
|
|
|
|
|
keys %locations) |
1001
|
396
|
|
|
|
|
703
|
{ |
1002
|
396
|
|
|
|
|
782
|
# get all same-name headers and poke into correct position |
1003
|
520
|
|
|
|
|
741
|
my $positions = $locations{$name}; |
|
520
|
|
|
|
|
807
|
|
1004
|
520
|
|
|
|
|
1248
|
for my $contents ($self->get_header($name, $raw)) { |
1005
|
520
|
50
|
|
|
|
1241
|
my $position = shift @{$positions}; |
1006
|
0
|
|
|
|
|
0
|
$size += length($name) + length($contents) + 2; |
1007
|
0
|
|
|
|
|
0
|
if ($size > MAX_HEADER_LENGTH) { |
1008
|
|
|
|
|
|
|
$self->{'truncated_header'} = 1; |
1009
|
520
|
|
|
|
|
2328
|
last HEADER; |
1010
|
|
|
|
|
|
|
} |
1011
|
|
|
|
|
|
|
$lines[$position] = $self->{header_order}->[$position].": ".$contents; |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
} |
1014
|
45
|
50
|
|
|
|
193
|
|
|
0
|
|
|
|
|
0
|
|
1015
|
|
|
|
|
|
|
# skip undefined lines if we truncated |
1016
|
45
|
50
|
66
|
|
|
191
|
@lines = grep { defined $_ } @lines if $self->{'truncated_header'}; |
1017
|
|
|
|
|
|
|
|
1018
|
45
|
100
|
|
|
|
731
|
splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} ); |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
return wantarray ? @lines : join ('', @lines); |
1021
|
|
|
|
|
|
|
} |
1022
|
|
|
|
0
|
0
|
|
|
1023
|
|
|
|
|
|
|
# legacy public API; now a no-op. |
1024
|
|
|
|
|
|
|
sub finish { } |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
1; |
1029
|
|
|
|
|
|
|
__END__ |