line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This module will encapsulate an email message and allow access to |
25
|
|
|
|
|
|
|
the various MIME message parts. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head1 PUBLIC METHODS |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=over 4 |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
use strict; |
35
|
41
|
|
|
41
|
|
253
|
use warnings; |
|
41
|
|
|
|
|
72
|
|
|
41
|
|
|
|
|
1195
|
|
36
|
41
|
|
|
41
|
|
219
|
use re 'taint'; |
|
41
|
|
|
|
|
77
|
|
|
41
|
|
|
|
|
1325
|
|
37
|
41
|
|
|
41
|
|
228
|
|
|
41
|
|
|
|
|
94
|
|
|
41
|
|
|
|
|
1607
|
|
38
|
|
|
|
|
|
|
require 5.008001; # needs utf8::is_utf8() |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
use Mail::SpamAssassin; |
41
|
41
|
|
|
41
|
|
237
|
use Mail::SpamAssassin::Constants qw(:sa); |
|
41
|
|
|
|
|
74
|
|
|
41
|
|
|
|
|
1047
|
|
42
|
41
|
|
|
41
|
|
212
|
use Mail::SpamAssassin::HTML; |
|
41
|
|
|
|
|
77
|
|
|
41
|
|
|
|
|
6042
|
|
43
|
41
|
|
|
41
|
|
11179
|
use Mail::SpamAssassin::Logger; |
|
41
|
|
|
|
|
140
|
|
|
41
|
|
|
|
|
1490
|
|
44
|
41
|
|
|
41
|
|
248
|
|
|
41
|
|
|
|
|
73
|
|
|
41
|
|
|
|
|
6026
|
|
45
|
|
|
|
|
|
|
our($enc_utf8, $enc_w1252, $have_encode_detector); |
46
|
|
|
|
|
|
|
BEGIN { |
47
|
|
|
|
|
|
|
eval { require Encode } |
48
|
41
|
|
|
|
|
20823
|
and do { $enc_utf8 = Encode::find_encoding('UTF-8'); |
49
|
41
|
50
|
|
41
|
|
124
|
$enc_w1252 = Encode::find_encoding('Windows-1252') }; |
|
41
|
|
|
|
|
340379
|
|
50
|
41
|
|
|
|
|
5746
|
eval { require Encode::Detect::Detector } |
51
|
41
|
|
|
|
|
190808
|
and do { $have_encode_detector = 1 }; |
52
|
41
|
50
|
|
|
|
12972
|
}; |
|
0
|
|
|
|
|
0
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item new() |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Generates an empty Node object and returns it. Typically only called |
57
|
|
|
|
|
|
|
by functions in Message. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
my $class = shift; |
62
|
|
|
|
|
|
|
$class = ref($class) || $class; |
63
|
196
|
|
|
196
|
1
|
438
|
|
64
|
196
|
|
33
|
|
|
802
|
my $self = { |
65
|
|
|
|
|
|
|
headers => {}, |
66
|
196
|
|
|
|
|
1136
|
raw_headers => {}, |
67
|
|
|
|
|
|
|
header_order => [] |
68
|
|
|
|
|
|
|
}; |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# deal with any parameters |
71
|
|
|
|
|
|
|
my($opts) = @_; |
72
|
|
|
|
|
|
|
$self->{normalize} = $opts->{'normalize'} || 0; |
73
|
196
|
|
|
|
|
438
|
|
74
|
196
|
|
50
|
|
|
928
|
bless($self,$class); |
75
|
|
|
|
|
|
|
$self; |
76
|
196
|
|
|
|
|
544
|
} |
77
|
196
|
|
|
|
|
534
|
|
78
|
|
|
|
|
|
|
=item find_parts() |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Used to search the tree for specific MIME parts. An array of matching |
81
|
|
|
|
|
|
|
Node objects (pointers into the tree) is returned. The parameters that |
82
|
|
|
|
|
|
|
can be passed in are (in order, all scalars): |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Regexp - Used to match against each part's Content-Type header, |
85
|
|
|
|
|
|
|
specifically the type and not the rest of the header. ie: "Content-type: |
86
|
|
|
|
|
|
|
text/html; encoding=quoted-printable" has a type of "text/html". If no |
87
|
|
|
|
|
|
|
regexp is specified, find_parts() will return an empty array. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Only_leaves - By default, find_parts() will return any part that matches |
90
|
|
|
|
|
|
|
the regexp, including multipart. If you only want to see leaves of the |
91
|
|
|
|
|
|
|
tree (ie: parts that aren't multipart), set this to true (1). |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Recursive - By default, when find_parts() finds a multipart which has |
94
|
|
|
|
|
|
|
parts underneath it, it will recurse through all sub-children. If set to 0, |
95
|
|
|
|
|
|
|
only look at the part and any direct children of the part. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=cut |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Used to find any MIME parts whose simple content-type matches a given regexp |
100
|
|
|
|
|
|
|
# Searches it's own and any children parts. Returns an array of MIME |
101
|
|
|
|
|
|
|
# objects which match. Our callers may expect the default behavior which is a |
102
|
|
|
|
|
|
|
# depth-first array of parts. |
103
|
|
|
|
|
|
|
# |
104
|
|
|
|
|
|
|
my ($self, $re, $onlyleaves, $recursive) = @_; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Didn't pass an RE? Just abort. |
107
|
256
|
|
|
256
|
1
|
649
|
return () unless defined $re && $re ne ''; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
$onlyleaves = 0 unless defined $onlyleaves; |
110
|
256
|
50
|
33
|
|
|
1363
|
|
111
|
|
|
|
|
|
|
my $depth; |
112
|
256
|
50
|
|
|
|
558
|
if (defined $recursive && $recursive == 0) { |
113
|
|
|
|
|
|
|
$depth = 1; |
114
|
256
|
|
|
|
|
420
|
} |
115
|
256
|
50
|
33
|
|
|
637
|
|
116
|
0
|
|
|
|
|
0
|
my @ret; |
117
|
|
|
|
|
|
|
my @search = ( $self ); |
118
|
|
|
|
|
|
|
|
119
|
256
|
|
|
|
|
390
|
while (my $part = shift @search) { |
120
|
256
|
|
|
|
|
541
|
# If this object matches, mark it for return. |
121
|
|
|
|
|
|
|
my $amialeaf = $part->is_leaf(); |
122
|
256
|
|
|
|
|
676
|
|
123
|
|
|
|
|
|
|
if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) { |
124
|
290
|
|
|
|
|
897
|
push(@ret, $part); |
125
|
|
|
|
|
|
|
} |
126
|
290
|
100
|
66
|
|
|
2909
|
|
|
|
|
100
|
|
|
|
|
127
|
271
|
|
|
|
|
517
|
if ( !$amialeaf && (!defined $depth || $depth > 0)) { |
128
|
|
|
|
|
|
|
$depth-- if defined $depth; |
129
|
|
|
|
|
|
|
unshift(@search, @{$part->{'body_parts'}}); |
130
|
290
|
50
|
33
|
|
|
1059
|
} |
|
|
|
66
|
|
|
|
|
131
|
19
|
50
|
|
|
|
41
|
} |
132
|
19
|
|
|
|
|
25
|
|
|
19
|
|
|
|
|
64
|
|
133
|
|
|
|
|
|
|
return @ret; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
256
|
|
|
|
|
995
|
=item header() |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Stores and retrieves headers from a specific MIME part. The first |
139
|
|
|
|
|
|
|
parameter is the header name. If there is no other parameter, the header |
140
|
|
|
|
|
|
|
is retrieved. If there is a second parameter, the header is stored. |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Header names are case-insensitive and are stored in both raw and |
143
|
|
|
|
|
|
|
decoded form. Using header(), only the decoded form is retrievable. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
For retrieval, if header() is called in an array context, an array will |
146
|
|
|
|
|
|
|
be returned with each header entry in a different element. In a scalar |
147
|
|
|
|
|
|
|
context, the last specific header is returned. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
150
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
151
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Store or retrieve headers from a given MIME object |
156
|
|
|
|
|
|
|
# |
157
|
|
|
|
|
|
|
my $self = shift; |
158
|
|
|
|
|
|
|
my $rawkey = shift; |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
return unless defined $rawkey; |
161
|
5663
|
|
|
5663
|
1
|
6921
|
|
162
|
5663
|
|
|
|
|
7399
|
# we're going to do things case insensitively |
163
|
|
|
|
|
|
|
my $key = lc($rawkey); |
164
|
5663
|
50
|
|
|
|
11217
|
|
165
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
166
|
|
|
|
|
|
|
$key =~ s/^\s+//; |
167
|
5663
|
|
|
|
|
10021
|
$key =~ s/\s+$//; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
if (@_) { |
170
|
5663
|
|
|
|
|
11025
|
my $raw_value = shift; |
171
|
5663
|
|
|
|
|
8979
|
return unless defined $raw_value; |
172
|
|
|
|
|
|
|
|
173
|
5663
|
100
|
|
|
|
9572
|
push @{ $self->{'header_order'} }, $rawkey; |
174
|
909
|
|
|
|
|
1578
|
if ( !exists $self->{'headers'}->{$key} ) { |
175
|
909
|
50
|
|
|
|
1797
|
$self->{'headers'}->{$key} = []; |
176
|
|
|
|
|
|
|
$self->{'raw_headers'}->{$key} = []; |
177
|
909
|
|
|
|
|
1123
|
} |
|
909
|
|
|
|
|
2059
|
|
178
|
909
|
100
|
|
|
|
2317
|
|
179
|
835
|
|
|
|
|
2315
|
my $dec_value = $raw_value; |
180
|
835
|
|
|
|
|
1764
|
$dec_value =~ s/\n[ \t]+/ /gs; |
181
|
|
|
|
|
|
|
$dec_value =~ s/\s+$//s; |
182
|
|
|
|
|
|
|
$dec_value =~ s/^\s+//s; |
183
|
909
|
|
|
|
|
1494
|
push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key); |
184
|
909
|
|
|
|
|
2282
|
|
185
|
909
|
|
|
|
|
3755
|
push @{ $self->{'raw_headers'}->{$key} }, $raw_value; |
186
|
909
|
|
|
|
|
2371
|
|
187
|
909
|
|
|
|
|
1258
|
return $self->{'headers'}->{$key}->[-1]; |
|
909
|
|
|
|
|
2431
|
|
188
|
|
|
|
|
|
|
} |
189
|
909
|
|
|
|
|
1390
|
|
|
909
|
|
|
|
|
2051
|
|
190
|
|
|
|
|
|
|
if (wantarray) { |
191
|
909
|
|
|
|
|
4000
|
return unless exists $self->{'headers'}->{$key}; |
192
|
|
|
|
|
|
|
return @{ $self->{'headers'}->{$key} }; |
193
|
|
|
|
|
|
|
} |
194
|
4754
|
100
|
|
|
|
7358
|
else { |
195
|
4441
|
100
|
|
|
|
14736
|
return '' unless exists $self->{'headers'}->{$key}; |
196
|
1002
|
|
|
|
|
1210
|
return $self->{'headers'}->{$key}->[-1]; |
|
1002
|
|
|
|
|
5415
|
|
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
} |
199
|
313
|
100
|
|
|
|
2295
|
|
200
|
48
|
|
|
|
|
318
|
=item raw_header() |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Retrieves the raw version of headers from a specific MIME part. The only |
203
|
|
|
|
|
|
|
parameter is the header name. Header names are case-insensitive. |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
For retrieval, if raw_header() is called in an array context, an array |
206
|
|
|
|
|
|
|
will be returned with each header entry in a different element. In a |
207
|
|
|
|
|
|
|
scalar context, the last specific header is returned. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
210
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
211
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=cut |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Retrieve raw headers from a given MIME object |
216
|
|
|
|
|
|
|
# |
217
|
|
|
|
|
|
|
my $self = shift; |
218
|
|
|
|
|
|
|
my $key = lc(shift); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
221
|
|
|
|
|
|
|
$key =~ s/^\s+//; |
222
|
633
|
|
|
633
|
1
|
855
|
$key =~ s/\s+$//; |
223
|
633
|
|
|
|
|
992
|
|
224
|
|
|
|
|
|
|
if (wantarray) { |
225
|
|
|
|
|
|
|
return unless exists $self->{'raw_headers'}->{$key}; |
226
|
633
|
|
|
|
|
1248
|
return @{ $self->{'raw_headers'}->{$key} }; |
227
|
633
|
|
|
|
|
1015
|
} |
228
|
|
|
|
|
|
|
else { |
229
|
633
|
50
|
|
|
|
1133
|
return '' unless exists $self->{'raw_headers'}->{$key}; |
230
|
633
|
100
|
|
|
|
1948
|
return $self->{'raw_headers'}->{$key}->[-1]; |
231
|
139
|
|
|
|
|
237
|
} |
|
139
|
|
|
|
|
807
|
|
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
0
|
0
|
|
|
|
0
|
=item add_body_part() |
235
|
0
|
|
|
|
|
0
|
|
236
|
|
|
|
|
|
|
Adds a Node child object to the current node object. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=cut |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Add a MIME child part to ourselves |
241
|
|
|
|
|
|
|
my($self, $part) = @_; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
dbg("message: added part, type: ".$part->{'type'}); |
244
|
|
|
|
|
|
|
push @{ $self->{'body_parts'} }, $part; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
50
|
|
|
50
|
1
|
93
|
=item is_leaf() |
248
|
|
|
|
|
|
|
|
249
|
50
|
|
|
|
|
164
|
Returns true if the tree node in question is a leaf of the tree (ie: |
250
|
50
|
|
|
|
|
58
|
has no children of its own). Note: This function may return odd results |
|
50
|
|
|
|
|
116
|
|
251
|
|
|
|
|
|
|
unless the message has been mime parsed via _do_parse()! |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=cut |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
my($self) = @_; |
256
|
|
|
|
|
|
|
return !exists $self->{'body_parts'}; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=item raw() |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Return a reference to the raw array. Treat this as READ ONLY. |
262
|
290
|
|
|
290
|
1
|
509
|
|
263
|
290
|
|
|
|
|
787
|
=cut |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
my $self = shift; |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# Ok, if we're called we are expected to return an array. |
268
|
|
|
|
|
|
|
# so if it's a file reference, read in the message into an array... |
269
|
|
|
|
|
|
|
# |
270
|
|
|
|
|
|
|
# NOTE: that "ref undef" works, so don't bother checking for a defined var |
271
|
|
|
|
|
|
|
# first. |
272
|
|
|
|
|
|
|
if (ref $self->{'raw'} eq 'GLOB') { |
273
|
0
|
|
|
0
|
1
|
0
|
my $fd = $self->{'raw'}; |
274
|
|
|
|
|
|
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# dbg("message: (raw) reading mime part from a temporary file"); |
277
|
|
|
|
|
|
|
my($nread,$raw_str); $raw_str = ''; |
278
|
|
|
|
|
|
|
while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { } |
279
|
|
|
|
|
|
|
defined $nread or die "error reading: $!"; |
280
|
0
|
0
|
|
|
|
0
|
my @array = split(/^/m, $raw_str, -1); |
281
|
0
|
|
|
|
|
0
|
|
282
|
0
|
0
|
|
|
|
0
|
dbg("message: empty message read") if $raw_str eq ''; |
283
|
|
|
|
|
|
|
return \@array; |
284
|
|
|
|
|
|
|
} |
285
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
286
|
0
|
|
|
|
|
0
|
return $self->{'raw'}; |
287
|
0
|
0
|
|
|
|
0
|
} |
288
|
0
|
|
|
|
|
0
|
|
289
|
|
|
|
|
|
|
=item decode() |
290
|
0
|
0
|
|
|
|
0
|
|
291
|
0
|
|
|
|
|
0
|
If necessary, decode the part text as base64 or quoted-printable. |
292
|
|
|
|
|
|
|
The decoded text will be returned as a scalar string. An optional length |
293
|
|
|
|
|
|
|
parameter can be passed in which limits how much decoded data is returned. |
294
|
0
|
|
|
|
|
0
|
If the scalar isn't needed, call with "0" as a parameter. |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=cut |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
my($self, $bytes) = @_; |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
if ( !exists $self->{'decoded'} ) { |
301
|
|
|
|
|
|
|
# Someone is looking for a decoded part where there is no raw data |
302
|
|
|
|
|
|
|
# (multipart or subparsed message, etc.) Just return undef. |
303
|
|
|
|
|
|
|
return if !exists $self->{'raw'}; |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
my $raw; |
306
|
|
|
|
|
|
|
|
307
|
254
|
|
|
254
|
1
|
3692
|
# if the part is held in a temp file, read it into the scalar |
308
|
|
|
|
|
|
|
if (ref $self->{'raw'} eq 'GLOB') { |
309
|
254
|
100
|
|
|
|
678
|
my $fd = $self->{'raw'}; |
310
|
|
|
|
|
|
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
311
|
|
|
|
|
|
|
|
312
|
146
|
50
|
|
|
|
414
|
# dbg("message: (decode) reading mime part from a temporary file"); |
313
|
|
|
|
|
|
|
my($nread,$raw_str); $raw = ''; |
314
|
146
|
|
|
|
|
234
|
while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { } |
315
|
|
|
|
|
|
|
defined $nread or die "error reading: $!"; |
316
|
|
|
|
|
|
|
|
317
|
146
|
100
|
|
|
|
473
|
dbg("message: empty message read from a temp file") if $raw eq ''; |
318
|
4
|
|
|
|
|
5
|
} |
319
|
4
|
50
|
|
|
|
36
|
else { |
320
|
|
|
|
|
|
|
# create a new scalar from the raw array in memory |
321
|
|
|
|
|
|
|
$raw = join('', @{$self->{'raw'}}); |
322
|
4
|
|
|
|
|
9
|
} |
|
4
|
|
|
|
|
5
|
|
323
|
4
|
|
|
|
|
234
|
|
324
|
4
|
50
|
|
|
|
16
|
my $encoding = lc $self->header('content-transfer-encoding') || ''; |
325
|
|
|
|
|
|
|
|
326
|
4
|
50
|
|
|
|
16
|
if ( $encoding eq 'quoted-printable' ) { |
327
|
|
|
|
|
|
|
dbg("message: decoding quoted-printable"); |
328
|
|
|
|
|
|
|
$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw); |
329
|
|
|
|
|
|
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
330
|
142
|
|
|
|
|
260
|
} |
|
142
|
|
|
|
|
1450
|
|
331
|
|
|
|
|
|
|
elsif ( $encoding eq 'base64' ) { |
332
|
|
|
|
|
|
|
dbg("message: decoding base64"); |
333
|
146
|
|
100
|
|
|
559
|
|
334
|
|
|
|
|
|
|
# if it's not defined or is 0, do the whole thing, otherwise only decode |
335
|
146
|
100
|
|
|
|
711
|
# a portion |
|
|
100
|
|
|
|
|
|
336
|
6
|
|
|
|
|
25
|
if ($bytes) { |
337
|
6
|
|
|
|
|
39
|
return Mail::SpamAssassin::Util::base64_decode($raw, $bytes); |
338
|
6
|
|
|
|
|
36
|
} |
339
|
|
|
|
|
|
|
else { |
340
|
|
|
|
|
|
|
# Generate the decoded output |
341
|
5
|
|
|
|
|
17
|
$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw); |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) { |
345
|
5
|
50
|
|
|
|
7
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
346
|
0
|
|
|
|
|
0
|
} |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
else { |
349
|
|
|
|
|
|
|
# Encoding is one of 7bit, 8bit, binary or x-something |
350
|
5
|
|
|
|
|
19
|
if ( $encoding ) { |
351
|
|
|
|
|
|
|
dbg("message: decoding other encoding type ($encoding), ignoring"); |
352
|
|
|
|
|
|
|
} |
353
|
5
|
100
|
|
|
|
26
|
else { |
354
|
1
|
|
|
|
|
21
|
dbg("message: no encoding detected"); |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
$self->{'decoded'} = $raw; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
} |
359
|
135
|
100
|
|
|
|
316
|
|
360
|
33
|
|
|
|
|
197
|
if ( !defined $bytes || $bytes ) { |
361
|
|
|
|
|
|
|
if ( !defined $bytes ) { |
362
|
|
|
|
|
|
|
# force a copy |
363
|
102
|
|
|
|
|
351
|
return '' . $self->{'decoded'}; |
364
|
|
|
|
|
|
|
} |
365
|
135
|
|
|
|
|
727
|
else { |
366
|
|
|
|
|
|
|
return substr($self->{'decoded'}, 0, $bytes); |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
254
|
100
|
66
|
|
|
1244
|
} |
370
|
250
|
50
|
|
|
|
599
|
|
371
|
|
|
|
|
|
|
# Detect endianness of UTF-16 encoded data |
372
|
250
|
|
|
|
|
2079
|
my $data = $_[0]; # could not avoid copying large strings |
373
|
|
|
|
|
|
|
my $utf16le_clues = 0; |
374
|
|
|
|
|
|
|
my $utf16be_clues = 0; |
375
|
0
|
|
|
|
|
0
|
my $sum_h_e = 0; |
376
|
|
|
|
|
|
|
my $sum_h_o = 0; |
377
|
|
|
|
|
|
|
my $sum_l_e = 0; |
378
|
|
|
|
|
|
|
my $sum_l_o = 0; |
379
|
|
|
|
|
|
|
my $decoder = undef; |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
my @msg_h = unpack 'H' x length( $data ), $data; |
382
|
0
|
|
|
0
|
0
|
0
|
my @msg_l = unpack 'h' x length( $data ), $data; |
383
|
0
|
|
|
|
|
0
|
|
384
|
0
|
|
|
|
|
0
|
for( my $i = 0; $i < length( $data ); $i+=2 ) { |
385
|
0
|
|
|
|
|
0
|
my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] ); |
386
|
0
|
|
|
|
|
0
|
$sum_h_e += hex $msg_h[$i]; |
387
|
0
|
|
|
|
|
0
|
$sum_h_o += hex $msg_h[$i+1]; |
388
|
0
|
|
|
|
|
0
|
$sum_l_e += hex $msg_l[$i]; |
389
|
0
|
|
|
|
|
0
|
$sum_l_o += hex $msg_l[$i+1]; |
390
|
|
|
|
|
|
|
if( $check_char =~ /20 00/ ) { |
391
|
0
|
|
|
|
|
0
|
# UTF-16LE space char detected |
392
|
0
|
|
|
|
|
0
|
$utf16le_clues++; |
393
|
|
|
|
|
|
|
} |
394
|
0
|
|
|
|
|
0
|
if( $check_char =~ /00 20/ ) { |
395
|
0
|
|
|
|
|
0
|
# UTF-16BE space char detected |
396
|
0
|
|
|
|
|
0
|
$utf16be_clues++; |
397
|
0
|
|
|
|
|
0
|
} |
398
|
0
|
|
|
|
|
0
|
} |
399
|
0
|
|
|
|
|
0
|
|
400
|
0
|
0
|
|
|
|
0
|
# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE |
401
|
|
|
|
|
|
|
$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 ); |
402
|
0
|
|
|
|
|
0
|
|
403
|
|
|
|
|
|
|
# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE |
404
|
0
|
0
|
|
|
|
0
|
$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) ); |
405
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
0
|
if( $utf16le_clues > $utf16be_clues ) { |
407
|
|
|
|
|
|
|
dbg( "message: detect_utf16: UTF-16LE" ); |
408
|
|
|
|
|
|
|
$decoder = Encode::find_encoding("UTF-16LE"); |
409
|
|
|
|
|
|
|
} elsif( $utf16le_clues > $utf16be_clues ) { |
410
|
|
|
|
|
|
|
dbg( "message: detect_utf16: UTF-16BE" ); |
411
|
0
|
0
|
|
|
|
0
|
$decoder = Encode::find_encoding("UTF-16BE"); |
412
|
|
|
|
|
|
|
} else { |
413
|
|
|
|
|
|
|
dbg( "message: detect_utf16: Could not detect UTF-16 endianness" ); |
414
|
0
|
0
|
|
|
|
0
|
} |
415
|
|
|
|
|
|
|
|
416
|
0
|
0
|
|
|
|
0
|
return $decoder; |
|
|
0
|
|
|
|
|
|
417
|
0
|
|
|
|
|
0
|
} |
418
|
0
|
|
|
|
|
0
|
|
419
|
|
|
|
|
|
|
# Look at a text scalar and determine whether it should be rendered |
420
|
0
|
|
|
|
|
0
|
# as text/html. |
421
|
0
|
|
|
|
|
0
|
# |
422
|
|
|
|
|
|
|
# This is not a public function. |
423
|
0
|
|
|
|
|
0
|
# |
424
|
|
|
|
|
|
|
if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is) |
425
|
|
|
|
|
|
|
{ |
426
|
0
|
|
|
|
|
0
|
my $pad = $1; |
427
|
|
|
|
|
|
|
my $count = 0; |
428
|
|
|
|
|
|
|
$count += ($pad =~ tr/\n//d) * 2; |
429
|
|
|
|
|
|
|
$count += ($pad =~ tr/\n//cd); |
430
|
|
|
|
|
|
|
return ($count < 24); |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
return 0; |
433
|
|
|
|
|
|
|
} |
434
|
|
|
|
|
|
|
|
435
|
0
|
0
|
|
0
|
|
0
|
# Decode character set of a given text to perl characters (Unicode), |
436
|
|
|
|
|
|
|
# then encode into UTF-8 octets if requested. |
437
|
0
|
|
|
|
|
0
|
# |
438
|
0
|
|
|
|
|
0
|
# my $data = $_[0]; # avoid copying large strings |
439
|
0
|
|
|
|
|
0
|
my $charset_declared = $_[1]; |
440
|
0
|
|
|
|
|
0
|
my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets |
441
|
0
|
|
|
|
|
0
|
|
442
|
|
|
|
|
|
|
warn "message: _normalize() was given characters, expected bytes: $_[0]\n" |
443
|
0
|
|
|
|
|
0
|
if utf8::is_utf8($_[0]); |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879] |
446
|
|
|
|
|
|
|
my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
if (!defined $charset_declared || $charset_declared eq '') { |
449
|
|
|
|
|
|
|
$charset_declared = 'us-ascii'; |
450
|
|
|
|
|
|
|
} |
451
|
10
|
|
|
10
|
|
19
|
|
452
|
10
|
|
|
|
|
13
|
# number of characters with code above 127 |
453
|
|
|
|
|
|
|
my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c; |
454
|
10
|
50
|
|
|
|
35
|
|
455
|
|
|
|
|
|
|
if (!$cnt_8bits && |
456
|
|
|
|
|
|
|
$charset_declared =~ |
457
|
|
|
|
|
|
|
/^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) | |
458
|
10
|
|
|
|
|
39
|
ISO646-US )\z/xsi) |
459
|
|
|
|
|
|
|
{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is |
460
|
10
|
50
|
33
|
|
|
51
|
dbg("message: kept, charset is US-ASCII as declared"); |
461
|
0
|
|
|
|
|
0
|
return $_[0]; # is all-ASCII, no need for decoding |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
if (!$cnt_8bits && |
465
|
10
|
|
|
|
|
23
|
$charset_declared =~ |
466
|
|
|
|
|
|
|
/^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} | |
467
|
10
|
50
|
33
|
|
|
68
|
UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} | |
468
|
|
|
|
|
|
|
Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi) |
469
|
|
|
|
|
|
|
{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII |
470
|
|
|
|
|
|
|
dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared); |
471
|
|
|
|
|
|
|
return $_[0]; # is all-ASCII, no need for decoding |
472
|
0
|
|
|
|
|
0
|
} |
473
|
0
|
|
|
|
|
0
|
|
474
|
|
|
|
|
|
|
# Try first to strictly decode based on a declared character set. |
475
|
|
|
|
|
|
|
|
476
|
10
|
100
|
66
|
|
|
74
|
my $rv; |
477
|
|
|
|
|
|
|
if ($charset_declared =~ /^UTF-?8\z/i) { |
478
|
|
|
|
|
|
|
# attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC) |
479
|
|
|
|
|
|
|
if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
480
|
|
|
|
|
|
|
dbg("message: decoded as declared charset UTF-8"); |
481
|
|
|
|
|
|
|
return $_[0] if !$return_decoded; |
482
|
8
|
|
|
|
|
24
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
483
|
8
|
|
|
|
|
58
|
return $rv; # decoded |
484
|
|
|
|
|
|
|
} else { |
485
|
|
|
|
|
|
|
my $err = ''; |
486
|
|
|
|
|
|
|
if ($@) { |
487
|
|
|
|
|
|
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
488
|
2
|
|
|
|
|
9
|
$err = " ($err)"; |
489
|
2
|
50
|
33
|
|
|
20
|
} |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
490
|
|
|
|
|
|
|
dbg("message: failed decoding as declared charset UTF-8 ($err)"); |
491
|
0
|
0
|
|
|
|
0
|
} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
492
|
0
|
|
|
|
|
0
|
|
493
|
0
|
0
|
|
|
|
0
|
} elsif ($charset_declared =~ /^UTF[ -]?16/i) { |
494
|
0
|
|
|
|
|
0
|
# Handle cases where spammers use UTF-16 encoding without including a BOM |
495
|
0
|
|
|
|
|
0
|
# or declaring endianness as reported at: |
496
|
|
|
|
|
|
|
# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252 |
497
|
0
|
|
|
|
|
0
|
|
498
|
0
|
0
|
|
|
|
0
|
my $decoder = detect_utf16( $_[0] ); |
499
|
0
|
|
|
|
|
0
|
if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
500
|
0
|
|
|
|
|
0
|
dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name); |
501
|
|
|
|
|
|
|
return $_[0] if !$return_decoded; |
502
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
503
|
|
|
|
|
|
|
return $rv; # decoded |
504
|
|
|
|
|
|
|
} else { |
505
|
|
|
|
|
|
|
my $err = ''; |
506
|
|
|
|
|
|
|
if ($@) { |
507
|
|
|
|
|
|
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
508
|
|
|
|
|
|
|
$err = " ($err)"; |
509
|
|
|
|
|
|
|
} |
510
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding as declared charset %s%s", $charset_declared, $err); |
511
|
0
|
0
|
|
|
|
0
|
}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
512
|
0
|
|
|
|
|
0
|
|
513
|
0
|
0
|
|
|
|
0
|
} elsif ($cnt_8bits && |
514
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
515
|
0
|
|
|
|
|
0
|
dbg("message: decoded as charset UTF-8, declared %s", $charset_declared); |
516
|
|
|
|
|
|
|
return $_[0] if !$return_decoded; |
517
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
518
|
0
|
0
|
|
|
|
0
|
return $rv; # decoded |
519
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
520
|
0
|
|
|
|
|
0
|
} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) { |
521
|
|
|
|
|
|
|
# declared as US-ASCII but contains 8-bit characters, makes no sense |
522
|
0
|
|
|
|
|
0
|
# to attempt decoding first as strict US-ASCII as we know it would fail |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
} else { |
525
|
|
|
|
|
|
|
# try decoding as a declared character set |
526
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
527
|
0
|
|
|
|
|
0
|
# -> http://en.wikipedia.org/wiki/Windows-1252 |
528
|
0
|
0
|
|
|
|
0
|
# Windows-1252 character encoding is a superset of ISO 8859-1, but differs |
529
|
0
|
|
|
|
|
0
|
# from the IANA's ISO-8859-1 by using displayable characters rather than |
530
|
0
|
|
|
|
|
0
|
# control characters in the 80 to 9F (hex) range. [...] |
531
|
|
|
|
|
|
|
# It is very common to mislabel Windows-1252 text with the charset label |
532
|
|
|
|
|
|
|
# ISO-8859-1. A common result was that all the quotes and apostrophes |
533
|
|
|
|
|
|
|
# (produced by "smart quotes" in word-processing software) were replaced |
534
|
|
|
|
|
|
|
# with question marks or boxes on non-Windows operating systems, making |
535
|
|
|
|
|
|
|
# text difficult to read. Most modern web browsers and e-mail clients |
536
|
|
|
|
|
|
|
# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate |
537
|
|
|
|
|
|
|
# such mislabeling. This is now standard behavior in the draft HTML 5 |
538
|
|
|
|
|
|
|
# specification, which requires that documents advertised as ISO-8859-1 |
539
|
|
|
|
|
|
|
# actually be parsed with the Windows-1252 encoding. |
540
|
|
|
|
|
|
|
# |
541
|
|
|
|
|
|
|
my($chset, $decoder); |
542
|
|
|
|
|
|
|
if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) { |
543
|
|
|
|
|
|
|
$chset = 'Windows-1252'; $decoder = $enc_w1252; |
544
|
|
|
|
|
|
|
} else { |
545
|
|
|
|
|
|
|
$chset = $charset_declared; $decoder = Encode::find_encoding($chset); |
546
|
|
|
|
|
|
|
if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
547
|
|
|
|
|
|
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
548
|
|
|
|
|
|
|
dbg("message: no decoder for a declared charset %s, using GBK", |
549
|
|
|
|
|
|
|
$chset) if $decoder; |
550
|
|
|
|
|
|
|
} |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
if (!$decoder) { |
553
|
2
|
|
|
|
|
7
|
dbg("message: failed decoding, no decoder for a declared charset %s", |
554
|
2
|
50
|
|
|
|
13
|
$chset); |
555
|
0
|
|
|
|
|
0
|
} else { |
|
0
|
|
|
|
|
0
|
|
556
|
|
|
|
|
|
|
my $err = ''; |
557
|
2
|
|
|
|
|
4
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
2
|
|
|
|
|
14
|
|
558
|
2
|
50
|
33
|
|
|
6805
|
if ($@) { |
559
|
0
|
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
560
|
0
|
0
|
|
|
|
0
|
$err = " ($err)"; |
561
|
|
|
|
|
|
|
} |
562
|
|
|
|
|
|
|
if (lc $chset eq lc $charset_declared) { |
563
|
|
|
|
|
|
|
dbg("message: %s as declared charset %s%s", |
564
|
2
|
50
|
|
|
|
5
|
defined $rv ? 'decoded' : 'failed decoding', $charset_declared, $err); |
565
|
0
|
|
|
|
|
0
|
} else { |
566
|
|
|
|
|
|
|
dbg("message: %s as charset %s, declared %s%s", |
567
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
568
|
2
|
|
|
|
|
4
|
$chset, $charset_declared, $err); |
569
|
2
|
|
|
|
|
4
|
} |
|
2
|
|
|
|
|
7
|
|
570
|
2
|
50
|
|
|
|
196
|
} |
571
|
0
|
|
|
|
|
0
|
} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
572
|
0
|
|
|
|
|
0
|
|
573
|
|
|
|
|
|
|
# If the above failed, check if it is US-ASCII, possibly extended by few |
574
|
2
|
50
|
|
|
|
9
|
# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing |
575
|
2
|
50
|
|
|
|
9
|
# some popular punctuation or special characters from Windows-1252 in |
576
|
|
|
|
|
|
|
# the \x80-\x9F range (which is unassigned in ISO-8859-*). |
577
|
|
|
|
|
|
|
# Note that Windows-1252 is a proper superset of ISO-8859-1. |
578
|
0
|
0
|
|
|
|
0
|
# |
579
|
|
|
|
|
|
|
if (!defined $rv && !$cnt_8bits) { |
580
|
|
|
|
|
|
|
dbg("message: kept, guessed charset is US-ASCII, declared %s", |
581
|
|
|
|
|
|
|
$charset_declared); |
582
|
|
|
|
|
|
|
return $_[0]; # is all-ASCII, no need for decoding |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
} elsif (!defined $rv && $enc_w1252 && |
585
|
|
|
|
|
|
|
# ASCII NBSP (c) SHY ' " ... '".- TM |
586
|
|
|
|
|
|
|
#$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c) |
587
|
|
|
|
|
|
|
# Bug 7656: Include latin1 diacritic letters to Windows-1252 autodetection, |
588
|
|
|
|
|
|
|
# Encode::Detect::Detector might identify them as Windows-1255 (Hebrew!) |
589
|
|
|
|
|
|
|
$_[0] !~ tr/\x00-\x7f\xa0\xa9\xad\x82\x84\x85\x91-\x97\x99\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe//c) |
590
|
|
|
|
|
|
|
{ # ASCII + NBSP + SHY + some punctuation characters |
591
|
2
|
50
|
33
|
|
|
11
|
# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too |
|
|
50
|
33
|
|
|
|
|
|
|
|
0
|
|
|
|
|
592
|
0
|
|
|
|
|
0
|
# consider also: AE (r), 80 Euro |
593
|
|
|
|
|
|
|
my $err = ''; |
594
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
595
|
|
|
|
|
|
|
if ($@) { |
596
|
|
|
|
|
|
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
597
|
|
|
|
|
|
|
$err = " ($err)"; |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
# the above can't fail, but keep code general just in case |
600
|
|
|
|
|
|
|
dbg("message: %s as guessed charset %s, declared %s%s", |
601
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
602
|
|
|
|
|
|
|
'Windows-1252', $charset_declared, $err); |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
|
605
|
0
|
|
|
|
|
0
|
# If we were unsuccessful so far, try some guesswork |
606
|
0
|
|
|
|
|
0
|
# based on Encode::Detect::Detector . |
|
0
|
|
|
|
|
0
|
|
607
|
0
|
0
|
|
|
|
0
|
|
608
|
0
|
|
|
|
|
0
|
if (defined $rv) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
609
|
0
|
|
|
|
|
0
|
# done, no need for guesswork |
610
|
|
|
|
|
|
|
} elsif (!$have_encode_detector) { |
611
|
|
|
|
|
|
|
dbg("message: Encode::Detect::Detector not available, declared %s failed", |
612
|
0
|
0
|
|
|
|
0
|
$charset_declared); |
613
|
|
|
|
|
|
|
} else { |
614
|
|
|
|
|
|
|
my $charset_detected = Encode::Detect::Detector::detect($_[0]); |
615
|
|
|
|
|
|
|
if ($charset_detected && lc $charset_detected ne lc $charset_declared) { |
616
|
|
|
|
|
|
|
my $decoder = Encode::find_encoding($charset_detected); |
617
|
|
|
|
|
|
|
if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
618
|
|
|
|
|
|
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
619
|
|
|
|
|
|
|
dbg("message: no decoder for a detected charset %s, using GBK", |
620
|
2
|
50
|
|
|
|
5
|
$charset_detected) if $decoder; |
|
|
0
|
|
|
|
|
|
621
|
|
|
|
|
|
|
} |
622
|
|
|
|
|
|
|
if (!$decoder) { |
623
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding, no decoder for a detected charset %s", |
624
|
|
|
|
|
|
|
$charset_detected); |
625
|
|
|
|
|
|
|
} else { |
626
|
0
|
|
|
|
|
0
|
my $err = ''; |
627
|
0
|
0
|
0
|
|
|
0
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
628
|
0
|
|
|
|
|
0
|
if ($@) { |
629
|
0
|
0
|
0
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
630
|
0
|
|
|
|
|
0
|
$err = " ($err)"; |
631
|
0
|
0
|
|
|
|
0
|
} |
632
|
|
|
|
|
|
|
dbg("message: %s as detected charset %s, declared %s%s", |
633
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
634
|
0
|
0
|
|
|
|
0
|
$charset_detected, $charset_declared, $err); |
635
|
0
|
|
|
|
|
0
|
} |
636
|
|
|
|
|
|
|
} |
637
|
|
|
|
|
|
|
} |
638
|
0
|
|
|
|
|
0
|
|
639
|
0
|
|
|
|
|
0
|
if (!defined $rv) { # all decoding attempts failed so far, probably garbage |
|
0
|
|
|
|
|
0
|
|
640
|
0
|
0
|
|
|
|
0
|
# go for Windows-1252 which can't fail |
641
|
0
|
|
|
|
|
0
|
my $err = ''; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
642
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0]) }; |
643
|
|
|
|
|
|
|
if ($@) { |
644
|
0
|
0
|
|
|
|
0
|
$err = $@; $err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/; |
645
|
|
|
|
|
|
|
$err = " ($err)"; |
646
|
|
|
|
|
|
|
} |
647
|
|
|
|
|
|
|
dbg("message: %s as last-resort charset %s, declared %s%s", |
648
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
649
|
|
|
|
|
|
|
'Windows-1252', $charset_declared, $err); |
650
|
|
|
|
|
|
|
} |
651
|
2
|
50
|
|
|
|
5
|
|
652
|
|
|
|
|
|
|
if (!defined $rv) { # just in case - all decoding attempts failed so far |
653
|
0
|
|
|
|
|
0
|
return $_[0]; # garbage-in / garbage-out, return unchanged octets |
654
|
0
|
|
|
|
|
0
|
} |
|
0
|
|
|
|
|
0
|
|
655
|
0
|
0
|
|
|
|
0
|
# decoding octets to characters was successful |
656
|
0
|
|
|
|
|
0
|
if (!$return_decoded) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
657
|
0
|
|
|
|
|
0
|
# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg |
658
|
|
|
|
|
|
|
utf8::encode($rv); # encode Unicode characters to UTF-8 octets |
659
|
0
|
0
|
|
|
|
0
|
} |
660
|
|
|
|
|
|
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
661
|
|
|
|
|
|
|
return $rv; |
662
|
|
|
|
|
|
|
} |
663
|
|
|
|
|
|
|
|
664
|
2
|
50
|
|
|
|
3
|
=item rendered() |
665
|
0
|
|
|
|
|
0
|
|
666
|
|
|
|
|
|
|
render_text() takes the given text/* type MIME part, and attempts to |
667
|
|
|
|
|
|
|
render it into a text scalar. It will always render text/html, and will |
668
|
2
|
50
|
|
|
|
5
|
use a heuristic to determine if other text/* parts should be considered |
669
|
|
|
|
|
|
|
text/html. Two scalars are returned: the rendered type (either text/html |
670
|
2
|
|
|
|
|
4
|
or whatever the original type was), and the rendered text. |
671
|
|
|
|
|
|
|
|
672
|
2
|
|
|
|
|
3
|
=cut |
673
|
2
|
|
|
|
|
14
|
|
674
|
|
|
|
|
|
|
my ($self) = @_; |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
if (!exists $self->{rendered}) { |
677
|
|
|
|
|
|
|
# We only know how to render text/plain and text/html ... |
678
|
|
|
|
|
|
|
# Note: for bug 4843, make sure to skip text/calendar parts |
679
|
|
|
|
|
|
|
# we also want to skip things like text/x-vcard |
680
|
|
|
|
|
|
|
# text/x-aol is ignored here, but looks like text/html ... |
681
|
|
|
|
|
|
|
return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
my $text = $self->decode; # QP and Base64 decoding, bytes |
684
|
|
|
|
|
|
|
my $text_len = length($text); # num of bytes in original charset encoding |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
# render text/html always |
687
|
143
|
|
|
143
|
1
|
442
|
if ($text ne '' && $self->{'type'} =~ m{^text/html$}i) |
688
|
|
|
|
|
|
|
{ |
689
|
143
|
100
|
|
|
|
515
|
$self->{rendered_type} = 'text/html'; |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# will input text to HTML::Parser be provided as Unicode characters? |
692
|
|
|
|
|
|
|
my $character_semantics = 0; # $text is in bytes |
693
|
|
|
|
|
|
|
if ($self->{normalize} && $enc_utf8) { # charset decoding requested |
694
|
127
|
50
|
|
|
|
1160
|
# Provide input to HTML::Parser as Unicode characters |
695
|
|
|
|
|
|
|
# which avoids a HTML::Parser bug in utf8_mode |
696
|
127
|
|
|
|
|
615
|
# https://rt.cpan.org/Public/Bug/Display.html?id=99755 |
697
|
127
|
|
|
|
|
522
|
# Note: the above bug was fixed in HTML-Parser 3.72, January 2016. |
698
|
|
|
|
|
|
|
# Avoid unnecessary step of encoding-then-decoding by telling |
699
|
|
|
|
|
|
|
# subroutine _normalize() to return Unicode text. See Bug 7133 |
700
|
127
|
100
|
100
|
|
|
1385
|
# |
701
|
|
|
|
|
|
|
$character_semantics = 1; # $text will be in characters |
702
|
8
|
|
|
|
|
37
|
$text = _normalize($text, $self->{charset}, 1); # bytes to chars |
703
|
|
|
|
|
|
|
} elsif (!defined $self->{charset} || |
704
|
|
|
|
|
|
|
$self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) { |
705
|
8
|
|
|
|
|
23
|
# With some luck input can be interpreted as UTF-8, do not warn. |
706
|
8
|
50
|
33
|
|
|
178
|
# It is still possible to hit the HTML::Parses utf8_mode bug however. |
|
|
100
|
100
|
|
|
|
|
707
|
|
|
|
|
|
|
} else { |
708
|
|
|
|
|
|
|
dbg("message: 'normalize_charset' is off, encoding will likely ". |
709
|
|
|
|
|
|
|
"be misinterpreted; declared charset: %s", $self->{charset}); |
710
|
|
|
|
|
|
|
} |
711
|
|
|
|
|
|
|
# the 0 requires decoded HTML results to be in bytes (not characters) |
712
|
|
|
|
|
|
|
my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object |
713
|
|
|
|
|
|
|
|
714
|
0
|
|
|
|
|
0
|
$html->parse($text); # parse+render text |
715
|
0
|
|
|
|
|
0
|
|
716
|
|
|
|
|
|
|
# resulting HTML-decoded text is in bytes, likely encoded as UTF-8 |
717
|
|
|
|
|
|
|
$self->{rendered} = $html->get_rendered_text(); |
718
|
|
|
|
|
|
|
$self->{visible_rendered} = $html->get_rendered_text(invisible => 0); |
719
|
|
|
|
|
|
|
$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1); |
720
|
|
|
|
|
|
|
$self->{html_results} = $html->get_results(); |
721
|
|
|
|
|
|
|
|
722
|
1
|
|
|
|
|
13
|
# end-of-document result values that require looking at the text |
723
|
|
|
|
|
|
|
my $r = $self->{html_results}; # temporary reference for brevity |
724
|
|
|
|
|
|
|
|
725
|
8
|
|
|
|
|
185
|
# count the number of spaces in the rendered text (likely UTF-8 octets) |
726
|
|
|
|
|
|
|
my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//; |
727
|
8
|
|
|
|
|
70
|
# we may want to add the count of other Unicode whitespace characters |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8) |
730
|
8
|
|
|
|
|
40
|
$r->{non_space_len} = $r->{html_length} - $space; |
731
|
8
|
|
|
|
|
49
|
$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len; |
732
|
8
|
|
|
|
|
34
|
} |
733
|
8
|
|
|
|
|
45
|
|
734
|
|
|
|
|
|
|
else { # plain text |
735
|
|
|
|
|
|
|
if ($self->{normalize} && $enc_utf8) { |
736
|
8
|
|
|
|
|
19
|
# request transcoded result as UTF-8 octets! |
737
|
|
|
|
|
|
|
$text = _normalize($text, $self->{charset}, 0); |
738
|
|
|
|
|
|
|
} |
739
|
8
|
|
|
|
|
42
|
$self->{rendered_type} = $self->{type}; |
740
|
|
|
|
|
|
|
$self->{rendered} = $self->{'visible_rendered'} = $text; |
741
|
|
|
|
|
|
|
$self->{'invisible_rendered'} = ''; |
742
|
8
|
|
|
|
|
31
|
} |
743
|
8
|
|
|
|
|
26
|
} |
744
|
8
|
50
|
|
|
|
185
|
|
745
|
|
|
|
|
|
|
return ($self->{rendered_type}, $self->{rendered}); |
746
|
|
|
|
|
|
|
} |
747
|
|
|
|
|
|
|
|
748
|
119
|
0
|
33
|
|
|
451
|
=item set_rendered($text, $type) |
749
|
|
|
|
|
|
|
|
750
|
0
|
|
|
|
|
0
|
Set the rendered text and type for the given part. If type is not |
751
|
|
|
|
|
|
|
specified, and text is a defined value, a default of 'text/plain' is used. |
752
|
119
|
|
|
|
|
426
|
This can be used, for instance, to render non-text parts using plugins. |
753
|
119
|
|
|
|
|
532
|
|
754
|
119
|
|
|
|
|
422
|
=cut |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
my ($self, $text, $type) = @_; |
757
|
|
|
|
|
|
|
|
758
|
143
|
|
|
|
|
599
|
$type = 'text/plain' if (!defined $type && defined $text); |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
$self->{'rendered_type'} = $type; |
761
|
|
|
|
|
|
|
$self->{'rendered'} = $self->{'visible_rendered'} = $text; |
762
|
|
|
|
|
|
|
$self->{'invisible_rendered'} = defined $text ? '' : undef; |
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item visible_rendered() |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
Render and return the visible text in this part. |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
=cut |
770
|
0
|
|
|
0
|
1
|
0
|
|
771
|
|
|
|
|
|
|
my ($self) = @_; |
772
|
0
|
0
|
0
|
|
|
0
|
$self->rendered(); # ignore return, we want just this: |
773
|
|
|
|
|
|
|
return ($self->{rendered_type}, $self->{visible_rendered}); |
774
|
0
|
|
|
|
|
0
|
} |
775
|
0
|
|
|
|
|
0
|
|
776
|
0
|
0
|
|
|
|
0
|
=item invisible_rendered() |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
Render and return the invisible text in this part. |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
=cut |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
my ($self) = @_; |
783
|
|
|
|
|
|
|
$self->rendered(); # ignore return, we want just this: |
784
|
|
|
|
|
|
|
return ($self->{rendered_type}, $self->{invisible_rendered}); |
785
|
|
|
|
|
|
|
} |
786
|
8
|
|
|
8
|
1
|
21
|
|
787
|
8
|
|
|
|
|
55
|
=item content_summary() |
788
|
8
|
|
|
|
|
38
|
|
789
|
|
|
|
|
|
|
Returns an array of scalars describing the mime parts of the message. |
790
|
|
|
|
|
|
|
Note: This function requires that the message be parsed first! |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=cut |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
# return an array with scalars describing mime parts |
795
|
|
|
|
|
|
|
my($self) = @_; |
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
my @ret = ( [ $self->{'type'} ] ); |
798
|
8
|
|
|
8
|
1
|
20
|
my @search; |
799
|
8
|
|
|
|
|
26
|
|
800
|
8
|
|
|
|
|
24
|
if (exists $self->{'body_parts'}) { |
801
|
|
|
|
|
|
|
my $count = @{$self->{'body_parts'}}; |
802
|
|
|
|
|
|
|
for(my $i=0; $i<$count; $i++) { |
803
|
|
|
|
|
|
|
push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]); |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
} |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
while(my $part = shift @search) { |
808
|
|
|
|
|
|
|
my($index, $part) = @{$part}; |
809
|
|
|
|
|
|
|
push(@{$ret[$index]}, $part->{'type'}); |
810
|
|
|
|
|
|
|
if (exists $part->{'body_parts'}) { |
811
|
|
|
|
|
|
|
unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}}); |
812
|
13
|
|
|
13
|
1
|
211
|
} |
813
|
|
|
|
|
|
|
} |
814
|
13
|
|
|
|
|
48
|
|
815
|
13
|
|
|
|
|
14
|
return map { join(",", @{$_}) } @ret; |
816
|
|
|
|
|
|
|
} |
817
|
13
|
100
|
|
|
|
27
|
|
818
|
11
|
|
|
|
|
13
|
=item delete_header() |
|
11
|
|
|
|
|
15
|
|
819
|
11
|
|
|
|
|
24
|
|
820
|
24
|
|
|
|
|
64
|
Delete the specified header (decoded and raw) from the Node information. |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
=cut |
823
|
|
|
|
|
|
|
|
824
|
13
|
|
|
|
|
27
|
my($self, $hdr) = @_; |
825
|
45
|
|
|
|
|
48
|
|
|
45
|
|
|
|
|
68
|
|
826
|
45
|
|
|
|
|
41
|
foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) { |
|
45
|
|
|
|
|
83
|
|
827
|
45
|
100
|
|
|
|
125
|
delete $self->{'headers'}->{$_}; |
828
|
11
|
|
|
|
|
13
|
delete $self->{'raw_headers'}->{$_}; |
|
21
|
|
|
|
|
48
|
|
|
11
|
|
|
|
|
18
|
|
829
|
|
|
|
|
|
|
} |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}}); |
832
|
13
|
|
|
|
|
18
|
$self->{'header_order'} = \@neworder; |
|
37
|
|
|
|
|
49
|
|
|
37
|
|
|
|
|
178
|
|
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
# decode a header appropriately. don't bother adding it to the pod documents. |
836
|
|
|
|
|
|
|
my ( $encoding, $cte, $data ) = @_; |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
if ( $cte eq 'B' ) { |
839
|
|
|
|
|
|
|
# base 64 encoded |
840
|
|
|
|
|
|
|
$data = Mail::SpamAssassin::Util::base64_decode($data); |
841
|
|
|
|
|
|
|
} |
842
|
505
|
|
|
505
|
1
|
929
|
elsif ( $cte eq 'Q' ) { |
843
|
|
|
|
|
|
|
# quoted printable |
844
|
505
|
|
|
|
|
641
|
|
|
505
|
|
|
|
|
6907
|
|
845
|
0
|
|
|
|
|
0
|
# the RFC states that in the encoded text, "_" is equal to "=20" |
846
|
0
|
|
|
|
|
0
|
$data =~ s/_/=20/g; |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
$data = Mail::SpamAssassin::Util::qp_decode($data); |
849
|
505
|
|
|
|
|
1028
|
} |
|
505
|
|
|
|
|
5010
|
|
850
|
505
|
|
|
|
|
1678
|
else { |
851
|
|
|
|
|
|
|
# not possible since the input has already been limited to 'B' and 'Q' |
852
|
|
|
|
|
|
|
die "message: unknown encoding type '$cte' in RFC2047 header"; |
853
|
|
|
|
|
|
|
} |
854
|
|
|
|
|
|
|
return _normalize($data, $encoding, 0); # transcode to UTF-8 octets |
855
|
10
|
|
|
10
|
|
43
|
} |
856
|
|
|
|
|
|
|
|
857
|
10
|
100
|
|
|
|
39
|
# Decode base64 and quoted-printable in headers according to RFC2047. |
|
|
50
|
|
|
|
|
|
858
|
|
|
|
|
|
|
# |
859
|
2
|
|
|
|
|
13
|
my($header_field_body, $header_field_name) = @_; |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
return '' unless defined $header_field_body && $header_field_body ne ''; |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
# deal with folding and cream the newlines and such |
864
|
|
|
|
|
|
|
$header_field_body =~ s/\n[ \t]+/\n /g; |
865
|
8
|
|
|
|
|
16
|
$header_field_body =~ s/\015?\012//gs; |
866
|
|
|
|
|
|
|
|
867
|
8
|
|
|
|
|
18
|
if ($header_field_name =~ |
868
|
|
|
|
|
|
|
/^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) | |
869
|
|
|
|
|
|
|
MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) { |
870
|
|
|
|
|
|
|
# Bug 6945: some header fields must not be processed for MIME encoding |
871
|
0
|
|
|
|
|
0
|
# Bug 7466: leave out the Content-* |
872
|
|
|
|
|
|
|
|
873
|
10
|
|
|
|
|
28
|
} else { |
874
|
|
|
|
|
|
|
local($1,$2,$3); |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# Multiple encoded sections must ignore the interim whitespace. |
877
|
|
|
|
|
|
|
# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE |
878
|
|
|
|
|
|
|
# separated by whitespace. |
879
|
909
|
|
|
909
|
|
2543
|
1 while $header_field_body =~ |
880
|
|
|
|
|
|
|
s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+ |
881
|
909
|
100
|
66
|
|
|
4498
|
( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) } |
882
|
|
|
|
|
|
|
{$1$2}xsg; |
883
|
|
|
|
|
|
|
|
884
|
859
|
|
|
|
|
1550
|
# transcode properly encoded RFC 2047 substrings into UTF-8 octets, |
885
|
859
|
|
|
|
|
1097
|
# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532) |
886
|
|
|
|
|
|
|
# or plain US-ASCII |
887
|
859
|
100
|
|
|
|
3009
|
$header_field_body =~ |
888
|
|
|
|
|
|
|
s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) } |
889
|
|
|
|
|
|
|
{ __decode_header($1, uc($2), $3) }xsge; |
890
|
|
|
|
|
|
|
} |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body); |
893
|
|
|
|
|
|
|
return $header_field_body; |
894
|
523
|
|
|
|
|
1777
|
} |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=item get_header() |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
Retrieve a specific header. Will have a newline at the end and will be |
899
|
523
|
|
|
|
|
1245
|
unfolded. The first parameter is the header name (case-insensitive), |
900
|
|
|
|
|
|
|
and the second parameter (optional) is whether or not to return the |
901
|
|
|
|
|
|
|
raw header. |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
If get_header() is called in an array context, an array will be returned |
904
|
|
|
|
|
|
|
with each header entry in a different element. In a scalar context, |
905
|
|
|
|
|
|
|
the last specific header is returned. |
906
|
523
|
|
|
|
|
1201
|
|
907
|
10
|
|
|
|
|
39
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
908
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
909
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
910
|
|
|
|
|
|
|
|
911
|
859
|
|
|
|
|
2691
|
Btw, returning the last header field (not the first) happens to be consistent |
912
|
|
|
|
|
|
|
with DKIM signatures, which search for and cover multiple header fields |
913
|
|
|
|
|
|
|
bottom-up according to the 'h' tag. Let's keep it this way. |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
=cut |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
my ($self, $hdr, $raw) = @_; |
918
|
|
|
|
|
|
|
$raw ||= 0; |
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
# And now pick up all the entries into a list |
921
|
|
|
|
|
|
|
# This is assumed to include a newline at the end ... |
922
|
|
|
|
|
|
|
# This is also assumed to have removed continuation bits ... |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
# Deal with the possibility that header() or raw_header() returns undef |
925
|
|
|
|
|
|
|
my @hdrs; |
926
|
|
|
|
|
|
|
if ( $raw ) { |
927
|
|
|
|
|
|
|
if (@hdrs = $self->raw_header($hdr)) { |
928
|
|
|
|
|
|
|
s/\015?\012\s+/ /gs for @hdrs; |
929
|
|
|
|
|
|
|
} |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
else { |
932
|
|
|
|
|
|
|
if (@hdrs = $self->header($hdr)) { |
933
|
|
|
|
|
|
|
$_ .= "\n" for @hdrs; |
934
|
|
|
|
|
|
|
} |
935
|
|
|
|
|
|
|
} |
936
|
4078
|
|
|
4078
|
1
|
6951
|
|
937
|
4078
|
|
50
|
|
|
12097
|
if (wantarray) { |
938
|
|
|
|
|
|
|
return @hdrs; |
939
|
|
|
|
|
|
|
} |
940
|
|
|
|
|
|
|
else { |
941
|
|
|
|
|
|
|
return @hdrs ? $hdrs[-1] : undef; |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
} |
944
|
4078
|
|
|
|
|
4404
|
|
945
|
4078
|
50
|
|
|
|
6029
|
=item get_all_headers() |
946
|
0
|
0
|
|
|
|
0
|
|
947
|
0
|
|
|
|
|
0
|
Retrieve all headers. Each header will have a newline at the end and |
948
|
|
|
|
|
|
|
will be unfolded. The first parameter (optional) is whether or not to |
949
|
|
|
|
|
|
|
return the raw headers, and the second parameter (optional) is whether |
950
|
|
|
|
|
|
|
or not to include the mbox separator. |
951
|
4078
|
100
|
|
|
|
7423
|
|
952
|
834
|
|
|
|
|
3018
|
If get_all_header() is called in an array context, an array will be |
953
|
|
|
|
|
|
|
returned with each header entry in a different element. In a scalar |
954
|
|
|
|
|
|
|
context, the headers are returned in a single scalar. |
955
|
|
|
|
|
|
|
|
956
|
4078
|
100
|
|
|
|
6112
|
=back |
957
|
3528
|
|
|
|
|
8352
|
|
958
|
|
|
|
|
|
|
=cut |
959
|
|
|
|
|
|
|
|
960
|
550
|
100
|
|
|
|
2265
|
# build it and it will not bomb |
961
|
|
|
|
|
|
|
my ($self, $raw, $include_mbox) = @_; |
962
|
|
|
|
|
|
|
$raw ||= 0; |
963
|
|
|
|
|
|
|
$include_mbox ||= 0; |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
my @lines; |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
# precalculate destination positions based on order of appearance |
968
|
|
|
|
|
|
|
my $i = 0; |
969
|
|
|
|
|
|
|
my %locations; |
970
|
|
|
|
|
|
|
for my $k (@{$self->{header_order}}) { |
971
|
|
|
|
|
|
|
push(@{$locations{lc($k)}}, $i++); |
972
|
|
|
|
|
|
|
} |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# process headers in order of first appearance |
975
|
|
|
|
|
|
|
my $header; |
976
|
|
|
|
|
|
|
my $size = 0; |
977
|
|
|
|
|
|
|
HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] } |
978
|
|
|
|
|
|
|
keys %locations) |
979
|
|
|
|
|
|
|
{ |
980
|
|
|
|
|
|
|
# get all same-name headers and poke into correct position |
981
|
45
|
|
|
45
|
1
|
135
|
my $positions = $locations{$name}; |
982
|
45
|
|
50
|
|
|
290
|
for my $contents ($self->get_header($name, $raw)) { |
983
|
45
|
|
100
|
|
|
192
|
my $position = shift @{$positions}; |
984
|
|
|
|
|
|
|
$size += length($name) + length($contents) + 2; |
985
|
45
|
|
|
|
|
62
|
if ($size > MAX_HEADER_LENGTH) { |
986
|
|
|
|
|
|
|
$self->{'truncated_header'} = 1; |
987
|
|
|
|
|
|
|
last HEADER; |
988
|
45
|
|
|
|
|
70
|
} |
989
|
45
|
|
|
|
|
65
|
$lines[$position] = $self->{header_order}->[$position].": ".$contents; |
990
|
45
|
|
|
|
|
97
|
} |
|
45
|
|
|
|
|
227
|
|
991
|
520
|
|
|
|
|
589
|
} |
|
520
|
|
|
|
|
1400
|
|
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# skip undefined lines if we truncated |
994
|
|
|
|
|
|
|
@lines = grep { defined $_ } @lines if $self->{'truncated_header'}; |
995
|
45
|
|
|
|
|
83
|
|
996
|
45
|
|
|
|
|
80
|
splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} ); |
997
|
45
|
|
|
|
|
408
|
|
|
880
|
|
|
|
|
1525
|
|
998
|
|
|
|
|
|
|
return wantarray ? @lines : join ('', @lines); |
999
|
|
|
|
|
|
|
} |
1000
|
|
|
|
|
|
|
|
1001
|
396
|
|
|
|
|
600
|
# legacy public API; now a no-op. |
1002
|
396
|
|
|
|
|
653
|
|
1003
|
520
|
|
|
|
|
636
|
# --------------------------------------------------------------------------- |
|
520
|
|
|
|
|
700
|
|
1004
|
520
|
|
|
|
|
1024
|
|
1005
|
520
|
50
|
|
|
|
1022
|
1; |