line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Message::Node - decode, render, and make available MIME message parts |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This module will encapsulate an email message and allow access to |
25
|
|
|
|
|
|
|
the various MIME message parts. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head1 PUBLIC METHODS |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=over 4 |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
package Mail::SpamAssassin::Message::Node; |
34
|
|
|
|
|
|
|
|
35
|
40
|
|
|
40
|
|
256
|
use strict; |
|
40
|
|
|
|
|
94
|
|
|
40
|
|
|
|
|
1385
|
|
36
|
40
|
|
|
40
|
|
227
|
use warnings; |
|
40
|
|
|
|
|
109
|
|
|
40
|
|
|
|
|
1371
|
|
37
|
40
|
|
|
40
|
|
224
|
use re 'taint'; |
|
40
|
|
|
|
|
74
|
|
|
40
|
|
|
|
|
1645
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
require 5.008001; # needs utf8::is_utf8() |
40
|
|
|
|
|
|
|
|
41
|
40
|
|
|
40
|
|
227
|
use Mail::SpamAssassin; |
|
40
|
|
|
|
|
89
|
|
|
40
|
|
|
|
|
1148
|
|
42
|
40
|
|
|
40
|
|
229
|
use Mail::SpamAssassin::Constants qw(:sa); |
|
40
|
|
|
|
|
103
|
|
|
40
|
|
|
|
|
5924
|
|
43
|
40
|
|
|
40
|
|
12376
|
use Mail::SpamAssassin::HTML; |
|
40
|
|
|
|
|
162
|
|
|
40
|
|
|
|
|
1577
|
|
44
|
40
|
|
|
40
|
|
276
|
use Mail::SpamAssassin::Logger; |
|
40
|
|
|
|
|
73
|
|
|
40
|
|
|
|
|
6102
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
our($enc_utf8, $enc_w1252, $have_encode_detector); |
47
|
|
|
|
|
|
|
BEGIN { |
48
|
40
|
|
|
|
|
1553
|
eval { require Encode } |
49
|
40
|
50
|
|
40
|
|
156
|
and do { $enc_utf8 = Encode::find_encoding('UTF-8'); |
|
40
|
|
|
|
|
31445
|
|
50
|
40
|
|
|
|
|
9538
|
$enc_w1252 = Encode::find_encoding('Windows-1252') }; |
51
|
40
|
|
|
|
|
61914
|
eval { require Encode::Detect::Detector } |
52
|
40
|
50
|
|
|
|
17540
|
and do { $have_encode_detector = 1 }; |
|
0
|
|
|
|
|
0
|
|
53
|
|
|
|
|
|
|
}; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item new() |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Generates an empty Node object and returns it. Typically only called |
58
|
|
|
|
|
|
|
by functions in Message. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub new { |
63
|
181
|
|
|
181
|
1
|
438
|
my $class = shift; |
64
|
181
|
|
33
|
|
|
804
|
$class = ref($class) || $class; |
65
|
|
|
|
|
|
|
|
66
|
181
|
|
|
|
|
1457
|
my $self = { |
67
|
|
|
|
|
|
|
headers => {}, |
68
|
|
|
|
|
|
|
raw_headers => {}, |
69
|
|
|
|
|
|
|
header_order => [] |
70
|
|
|
|
|
|
|
}; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# deal with any parameters |
73
|
181
|
|
|
|
|
448
|
my($opts) = @_; |
74
|
181
|
|
50
|
|
|
828
|
$self->{normalize} = $opts->{'normalize'} || 0; |
75
|
|
|
|
|
|
|
|
76
|
181
|
|
|
|
|
511
|
bless($self,$class); |
77
|
181
|
|
|
|
|
569
|
$self; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=item find_parts() |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Used to search the tree for specific MIME parts. An array of matching |
83
|
|
|
|
|
|
|
Node objects (pointers into the tree) is returned. The parameters that |
84
|
|
|
|
|
|
|
can be passed in are (in order, all scalars): |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Regexp - Used to match against each part's Content-Type header, |
87
|
|
|
|
|
|
|
specifically the type and not the rest of the header. ie: "Content-type: |
88
|
|
|
|
|
|
|
text/html; encoding=quoted-printable" has a type of "text/html". If no |
89
|
|
|
|
|
|
|
regexp is specified, find_parts() will return an empty array. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Only_leaves - By default, find_parts() will return any part that matches |
92
|
|
|
|
|
|
|
the regexp, including multipart. If you only want to see leaves of the |
93
|
|
|
|
|
|
|
tree (ie: parts that aren't multipart), set this to true (1). |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Recursive - By default, when find_parts() finds a multipart which has |
96
|
|
|
|
|
|
|
parts underneath it, it will recurse through all sub-children. If set to 0, |
97
|
|
|
|
|
|
|
only look at the part and any direct children of the part. |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=cut |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Used to find any MIME parts whose simple content-type matches a given regexp |
102
|
|
|
|
|
|
|
# Searches it's own and any children parts. Returns an array of MIME |
103
|
|
|
|
|
|
|
# objects which match. Our callers may expect the default behavior which is a |
104
|
|
|
|
|
|
|
# depth-first array of parts. |
105
|
|
|
|
|
|
|
# |
106
|
|
|
|
|
|
|
sub find_parts { |
107
|
231
|
|
|
231
|
1
|
741
|
my ($self, $re, $onlyleaves, $recursive) = @_; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Didn't pass an RE? Just abort. |
110
|
231
|
50
|
33
|
|
|
1402
|
return () unless defined $re && $re ne ''; |
111
|
|
|
|
|
|
|
|
112
|
231
|
50
|
|
|
|
679
|
$onlyleaves = 0 unless defined $onlyleaves; |
113
|
|
|
|
|
|
|
|
114
|
231
|
|
|
|
|
412
|
my $depth; |
115
|
231
|
50
|
33
|
|
|
717
|
if (defined $recursive && $recursive == 0) { |
116
|
0
|
|
|
|
|
0
|
$depth = 1; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
231
|
|
|
|
|
380
|
my @ret; |
120
|
231
|
|
|
|
|
495
|
my @search = ( $self ); |
121
|
|
|
|
|
|
|
|
122
|
231
|
|
|
|
|
737
|
while (my $part = shift @search) { |
123
|
|
|
|
|
|
|
# If this object matches, mark it for return. |
124
|
263
|
|
|
|
|
925
|
my $amialeaf = $part->is_leaf(); |
125
|
|
|
|
|
|
|
|
126
|
263
|
100
|
66
|
|
|
2847
|
if ( $part->{'type'} =~ /$re/ && (!$onlyleaves || $amialeaf) ) { |
|
|
|
100
|
|
|
|
|
127
|
245
|
|
|
|
|
605
|
push(@ret, $part); |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
263
|
50
|
33
|
|
|
1005
|
if ( !$amialeaf && (!defined $depth || $depth > 0)) { |
|
|
|
66
|
|
|
|
|
131
|
18
|
50
|
|
|
|
105
|
$depth-- if defined $depth; |
132
|
18
|
|
|
|
|
27
|
unshift(@search, @{$part->{'body_parts'}}); |
|
18
|
|
|
|
|
48
|
|
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
231
|
|
|
|
|
848
|
return @ret; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=item header() |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Stores and retrieves headers from a specific MIME part. The first |
142
|
|
|
|
|
|
|
parameter is the header name. If there is no other parameter, the header |
143
|
|
|
|
|
|
|
is retrieved. If there is a second parameter, the header is stored. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Header names are case-insensitive and are stored in both raw and |
146
|
|
|
|
|
|
|
decoded form. Using header(), only the decoded form is retrievable. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
For retrieval, if header() is called in an array context, an array will |
149
|
|
|
|
|
|
|
be returned with each header entry in a different element. In a scalar |
150
|
|
|
|
|
|
|
context, the last specific header is returned. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
153
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
154
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=cut |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Store or retrieve headers from a given MIME object |
159
|
|
|
|
|
|
|
# |
160
|
|
|
|
|
|
|
sub header { |
161
|
5494
|
|
|
5494
|
1
|
7094
|
my $self = shift; |
162
|
5494
|
|
|
|
|
6955
|
my $rawkey = shift; |
163
|
|
|
|
|
|
|
|
164
|
5494
|
50
|
|
|
|
9074
|
return unless defined $rawkey; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# we're going to do things case insensitively |
167
|
5494
|
|
|
|
|
8236
|
my $key = lc($rawkey); |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
170
|
5494
|
|
|
|
|
11073
|
$key =~ s/^\s+//; |
171
|
5494
|
|
|
|
|
9003
|
$key =~ s/\s+$//; |
172
|
|
|
|
|
|
|
|
173
|
5494
|
100
|
|
|
|
9428
|
if (@_) { |
174
|
800
|
|
|
|
|
1069
|
my $raw_value = shift; |
175
|
800
|
50
|
|
|
|
1278
|
return unless defined $raw_value; |
176
|
|
|
|
|
|
|
|
177
|
800
|
|
|
|
|
1003
|
push @{ $self->{'header_order'} }, $rawkey; |
|
800
|
|
|
|
|
1591
|
|
178
|
800
|
100
|
|
|
|
1821
|
if ( !exists $self->{'headers'}->{$key} ) { |
179
|
726
|
|
|
|
|
2150
|
$self->{'headers'}->{$key} = []; |
180
|
726
|
|
|
|
|
1429
|
$self->{'raw_headers'}->{$key} = []; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
800
|
|
|
|
|
1125
|
my $dec_value = $raw_value; |
184
|
800
|
|
|
|
|
1774
|
$dec_value =~ s/\n[ \t]+/ /gs; |
185
|
800
|
|
|
|
|
3214
|
$dec_value =~ s/\s+$//s; |
186
|
800
|
|
|
|
|
2059
|
$dec_value =~ s/^\s+//s; |
187
|
800
|
|
|
|
|
1072
|
push @{ $self->{'headers'}->{$key} }, _decode_header($dec_value,$key); |
|
800
|
|
|
|
|
2340
|
|
188
|
|
|
|
|
|
|
|
189
|
800
|
|
|
|
|
1088
|
push @{ $self->{'raw_headers'}->{$key} }, $raw_value; |
|
800
|
|
|
|
|
1580
|
|
190
|
|
|
|
|
|
|
|
191
|
800
|
|
|
|
|
2175
|
return $self->{'headers'}->{$key}->[-1]; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
4694
|
100
|
|
|
|
7500
|
if (wantarray) { |
195
|
4409
|
100
|
|
|
|
13714
|
return unless exists $self->{'headers'}->{$key}; |
196
|
797
|
|
|
|
|
1082
|
return @{ $self->{'headers'}->{$key} }; |
|
797
|
|
|
|
|
3359
|
|
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
else { |
199
|
285
|
100
|
|
|
|
2431
|
return '' unless exists $self->{'headers'}->{$key}; |
200
|
34
|
|
|
|
|
217
|
return $self->{'headers'}->{$key}->[-1]; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=item raw_header() |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
Retrieves the raw version of headers from a specific MIME part. The only |
207
|
|
|
|
|
|
|
parameter is the header name. Header names are case-insensitive. |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
For retrieval, if raw_header() is called in an array context, an array |
210
|
|
|
|
|
|
|
will be returned with each header entry in a different element. In a |
211
|
|
|
|
|
|
|
scalar context, the last specific header is returned. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
214
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
215
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=cut |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Retrieve raw headers from a given MIME object |
220
|
|
|
|
|
|
|
# |
221
|
|
|
|
|
|
|
sub raw_header { |
222
|
475
|
|
|
475
|
1
|
787
|
my $self = shift; |
223
|
475
|
|
|
|
|
831
|
my $key = lc(shift); |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Trim whitespace off of the header keys |
226
|
475
|
|
|
|
|
996
|
$key =~ s/^\s+//; |
227
|
475
|
|
|
|
|
958
|
$key =~ s/\s+$//; |
228
|
|
|
|
|
|
|
|
229
|
475
|
50
|
|
|
|
1008
|
if (wantarray) { |
230
|
475
|
100
|
|
|
|
1716
|
return unless exists $self->{'raw_headers'}->{$key}; |
231
|
122
|
|
|
|
|
245
|
return @{ $self->{'raw_headers'}->{$key} }; |
|
122
|
|
|
|
|
595
|
|
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
else { |
234
|
0
|
0
|
|
|
|
0
|
return '' unless exists $self->{'raw_headers'}->{$key}; |
235
|
0
|
|
|
|
|
0
|
return $self->{'raw_headers'}->{$key}->[-1]; |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=item add_body_part() |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Adds a Node child object to the current node object. |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=cut |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# Add a MIME child part to ourselves |
246
|
|
|
|
|
|
|
sub add_body_part { |
247
|
48
|
|
|
48
|
1
|
69
|
my($self, $part) = @_; |
248
|
|
|
|
|
|
|
|
249
|
48
|
|
|
|
|
125
|
dbg("message: added part, type: ".$part->{'type'}); |
250
|
48
|
|
|
|
|
50
|
push @{ $self->{'body_parts'} }, $part; |
|
48
|
|
|
|
|
96
|
|
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=item is_leaf() |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Returns true if the tree node in question is a leaf of the tree (ie: |
256
|
|
|
|
|
|
|
has no children of its own). Note: This function may return odd results |
257
|
|
|
|
|
|
|
unless the message has been mime parsed via _do_parse()! |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=cut |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub is_leaf { |
262
|
263
|
|
|
263
|
1
|
568
|
my($self) = @_; |
263
|
263
|
|
|
|
|
777
|
return !exists $self->{'body_parts'}; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item raw() |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
Return a reference to the the raw array. Treat this as READ ONLY. |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
=cut |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
sub raw { |
273
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# Ok, if we're called we are expected to return an array. |
276
|
|
|
|
|
|
|
# so if it's a file reference, read in the message into an array... |
277
|
|
|
|
|
|
|
# |
278
|
|
|
|
|
|
|
# NOTE: that "ref undef" works, so don't bother checking for a defined var |
279
|
|
|
|
|
|
|
# first. |
280
|
0
|
0
|
|
|
|
0
|
if (ref $self->{'raw'} eq 'GLOB') { |
281
|
0
|
|
|
|
|
0
|
my $fd = $self->{'raw'}; |
282
|
0
|
0
|
|
|
|
0
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
# dbg("message: (raw) reading mime part from a temporary file"); |
285
|
0
|
|
|
|
|
0
|
my($nread,$raw_str); $raw_str = ''; |
|
0
|
|
|
|
|
0
|
|
286
|
0
|
|
|
|
|
0
|
while ( $nread=sysread($fd, $raw_str, 16384, length $raw_str) ) { } |
287
|
0
|
0
|
|
|
|
0
|
defined $nread or die "error reading: $!"; |
288
|
0
|
|
|
|
|
0
|
my @array = split(/^/m, $raw_str, -1); |
289
|
|
|
|
|
|
|
|
290
|
0
|
0
|
|
|
|
0
|
dbg("message: empty message read") if $raw_str eq ''; |
291
|
0
|
|
|
|
|
0
|
return \@array; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
0
|
|
|
|
|
0
|
return $self->{'raw'}; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=item decode() |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
If necessary, decode the part text as base64 or quoted-printable. |
300
|
|
|
|
|
|
|
The decoded text will be returned as a scalar string. An optional length |
301
|
|
|
|
|
|
|
parameter can be passed in which limits how much decoded data is returned. |
302
|
|
|
|
|
|
|
If the scalar isn't needed, call with "0" as a parameter. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=cut |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
sub decode { |
307
|
228
|
|
|
228
|
1
|
2932
|
my($self, $bytes) = @_; |
308
|
|
|
|
|
|
|
|
309
|
228
|
100
|
|
|
|
760
|
if ( !exists $self->{'decoded'} ) { |
310
|
|
|
|
|
|
|
# Someone is looking for a decoded part where there is no raw data |
311
|
|
|
|
|
|
|
# (multipart or subparsed message, etc.) Just return undef. |
312
|
132
|
50
|
|
|
|
478
|
return if !exists $self->{'raw'}; |
313
|
|
|
|
|
|
|
|
314
|
132
|
|
|
|
|
262
|
my $raw; |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# if the part is held in a temp file, read it into the scalar |
317
|
132
|
100
|
|
|
|
527
|
if (ref $self->{'raw'} eq 'GLOB') { |
318
|
4
|
|
|
|
|
9
|
my $fd = $self->{'raw'}; |
319
|
4
|
50
|
|
|
|
44
|
seek($fd, 0, 0) or die "message: cannot rewind file: $!"; |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
# dbg("message: (decode) reading mime part from a temporary file"); |
322
|
4
|
|
|
|
|
10
|
my($nread,$raw_str); $raw = ''; |
|
4
|
|
|
|
|
8
|
|
323
|
4
|
|
|
|
|
197
|
while ( $nread=sysread($fd, $raw, 16384, length $raw) ) { } |
324
|
4
|
50
|
|
|
|
14
|
defined $nread or die "error reading: $!"; |
325
|
|
|
|
|
|
|
|
326
|
4
|
50
|
|
|
|
11
|
dbg("message: empty message read from a temp file") if $raw eq ''; |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
else { |
329
|
|
|
|
|
|
|
# create a new scalar from the raw array in memory |
330
|
128
|
|
|
|
|
282
|
$raw = join('', @{$self->{'raw'}}); |
|
128
|
|
|
|
|
931
|
|
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
|
333
|
132
|
|
100
|
|
|
448
|
my $encoding = lc $self->header('content-transfer-encoding') || ''; |
334
|
|
|
|
|
|
|
|
335
|
132
|
100
|
|
|
|
668
|
if ( $encoding eq 'quoted-printable' ) { |
|
|
100
|
|
|
|
|
|
336
|
6
|
|
|
|
|
29
|
dbg("message: decoding quoted-printable"); |
337
|
6
|
|
|
|
|
42
|
$self->{'decoded'} = Mail::SpamAssassin::Util::qp_decode($raw); |
338
|
6
|
|
|
|
|
36
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
elsif ( $encoding eq 'base64' ) { |
341
|
5
|
|
|
|
|
19
|
dbg("message: decoding base64"); |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# if it's not defined or is 0, do the whole thing, otherwise only decode |
344
|
|
|
|
|
|
|
# a portion |
345
|
5
|
50
|
|
|
|
13
|
if ($bytes) { |
346
|
0
|
|
|
|
|
0
|
return Mail::SpamAssassin::Util::base64_decode($raw, $bytes); |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
else { |
349
|
|
|
|
|
|
|
# Generate the decoded output |
350
|
5
|
|
|
|
|
20
|
$self->{'decoded'} = Mail::SpamAssassin::Util::base64_decode($raw); |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
5
|
100
|
|
|
|
32
|
if ( $self->{'type'} =~ m@^(?:text|message)\b/@i ) { |
354
|
1
|
|
|
|
|
23
|
$self->{'decoded'} =~ s/\015\012/\012/gs; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
else { |
358
|
|
|
|
|
|
|
# Encoding is one of 7bit, 8bit, binary or x-something |
359
|
121
|
100
|
|
|
|
328
|
if ( $encoding ) { |
360
|
19
|
|
|
|
|
101
|
dbg("message: decoding other encoding type ($encoding), ignoring"); |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
else { |
363
|
102
|
|
|
|
|
365
|
dbg("message: no encoding detected"); |
364
|
|
|
|
|
|
|
} |
365
|
121
|
|
|
|
|
619
|
$self->{'decoded'} = $raw; |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
228
|
100
|
66
|
|
|
1491
|
if ( !defined $bytes || $bytes ) { |
370
|
224
|
50
|
|
|
|
577
|
if ( !defined $bytes ) { |
371
|
|
|
|
|
|
|
# force a copy |
372
|
224
|
|
|
|
|
2207
|
return '' . $self->{'decoded'}; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
else { |
375
|
0
|
|
|
|
|
0
|
return substr($self->{'decoded'}, 0, $bytes); |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# Detect endianness of UTF-16 encoded data |
381
|
|
|
|
|
|
|
sub detect_utf16 { |
382
|
40
|
|
|
40
|
|
22692
|
use Data::Dumper; |
|
40
|
|
|
|
|
201504
|
|
|
40
|
|
|
|
|
132897
|
|
383
|
0
|
|
|
0
|
0
|
0
|
my $data = $_[0]; # could not avoid copying large strings |
384
|
0
|
|
|
|
|
0
|
my $utf16le_clues = 0; |
385
|
0
|
|
|
|
|
0
|
my $utf16be_clues = 0; |
386
|
0
|
|
|
|
|
0
|
my $sum_h_e = 0; |
387
|
0
|
|
|
|
|
0
|
my $sum_h_o = 0; |
388
|
0
|
|
|
|
|
0
|
my $sum_l_e = 0; |
389
|
0
|
|
|
|
|
0
|
my $sum_l_o = 0; |
390
|
0
|
|
|
|
|
0
|
my $decoder = undef; |
391
|
|
|
|
|
|
|
|
392
|
0
|
|
|
|
|
0
|
my @msg_h = unpack 'H' x length( $data ), $data; |
393
|
0
|
|
|
|
|
0
|
my @msg_l = unpack 'h' x length( $data ), $data; |
394
|
|
|
|
|
|
|
|
395
|
0
|
|
|
|
|
0
|
for( my $i = 0; $i < length( $data ); $i+=2 ) { |
396
|
0
|
|
|
|
|
0
|
my $check_char = sprintf( "%01X%01X %01X%01X", hex $msg_h[$i], hex $msg_l[$i], hex $msg_h[$i+1], hex $msg_l[$i+1] ); |
397
|
0
|
|
|
|
|
0
|
$sum_h_e += hex $msg_h[$i]; |
398
|
0
|
|
|
|
|
0
|
$sum_h_o += hex $msg_h[$i+1]; |
399
|
0
|
|
|
|
|
0
|
$sum_l_e += hex $msg_l[$i]; |
400
|
0
|
|
|
|
|
0
|
$sum_l_o += hex $msg_l[$i+1]; |
401
|
0
|
0
|
|
|
|
0
|
if( $check_char =~ /20 00/ ) { |
402
|
|
|
|
|
|
|
# UTF-16LE space char detected |
403
|
0
|
|
|
|
|
0
|
$utf16le_clues++; |
404
|
|
|
|
|
|
|
} |
405
|
0
|
0
|
|
|
|
0
|
if( $check_char =~ /00 20/ ) { |
406
|
|
|
|
|
|
|
# UTF-16BE space char detected |
407
|
0
|
|
|
|
|
0
|
$utf16be_clues++; |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# If we have 4x as many non-null characters in the odd bytes, we're probably UTF-16LE |
412
|
0
|
0
|
|
|
|
0
|
$utf16le_clues++ if( ($sum_h_e + $sum_l_e) > ($sum_h_o + $sum_l_o)*4 ); |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# If we have 4x as many non-null characters in the even bytes, we're probably UTF-16BE |
415
|
0
|
0
|
|
|
|
0
|
$utf16be_clues++ if( ($sum_h_o + $sum_l_o)*4 > ($sum_h_e + $sum_l_e) ); |
416
|
|
|
|
|
|
|
|
417
|
0
|
0
|
|
|
|
0
|
if( $utf16le_clues > $utf16be_clues ) { |
|
|
0
|
|
|
|
|
|
418
|
0
|
|
|
|
|
0
|
dbg( "message: detect_utf16: UTF-16LE" ); |
419
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding("UTF-16LE"); |
420
|
|
|
|
|
|
|
} elsif( $utf16le_clues > $utf16be_clues ) { |
421
|
0
|
|
|
|
|
0
|
dbg( "message: detect_utf16: UTF-16BE" ); |
422
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding("UTF-16BE"); |
423
|
|
|
|
|
|
|
} else { |
424
|
0
|
|
|
|
|
0
|
dbg( "message: detect_utf16: Could not detect UTF-16 endianness" ); |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
0
|
|
|
|
|
0
|
return $decoder; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Look at a text scalar and determine whether it should be rendered |
431
|
|
|
|
|
|
|
# as text/html. |
432
|
|
|
|
|
|
|
# |
433
|
|
|
|
|
|
|
# This is not a public function. |
434
|
|
|
|
|
|
|
# |
435
|
|
|
|
|
|
|
sub _html_render { |
436
|
105
|
50
|
|
105
|
|
783
|
if ($_[0] =~ m/^(.{0,18}?<(?:body|head|html|img|pre|table|title)(?:\s.{0,18}?)?>)/is) |
437
|
|
|
|
|
|
|
{ |
438
|
0
|
|
|
|
|
0
|
my $pad = $1; |
439
|
0
|
|
|
|
|
0
|
my $count = 0; |
440
|
0
|
|
|
|
|
0
|
$count += ($pad =~ tr/\n//d) * 2; |
441
|
0
|
|
|
|
|
0
|
$count += ($pad =~ tr/\n//cd); |
442
|
0
|
|
|
|
|
0
|
return ($count < 24); |
443
|
|
|
|
|
|
|
} |
444
|
105
|
|
|
|
|
558
|
return 0; |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
# Decode character set of a given text to perl characters (Unicode), |
448
|
|
|
|
|
|
|
# then encode into UTF-8 octets if requested. |
449
|
|
|
|
|
|
|
# |
450
|
|
|
|
|
|
|
sub _normalize { |
451
|
|
|
|
|
|
|
# my $data = $_[0]; # avoid copying large strings |
452
|
10
|
|
|
10
|
|
20
|
my $charset_declared = $_[1]; |
453
|
10
|
|
|
|
|
12
|
my $return_decoded = $_[2]; # true: Unicode characters, false: UTF-8 octets |
454
|
|
|
|
|
|
|
|
455
|
10
|
50
|
|
|
|
35
|
warn "message: _normalize() was given characters, expected bytes: $_[0]\n" |
456
|
|
|
|
|
|
|
if utf8::is_utf8($_[0]); |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
# workaround for Encode::decode taint laundering bug [rt.cpan.org #84879] |
459
|
10
|
|
|
|
|
29
|
my $data_taint = substr($_[0], 0, 0); # empty string, tainted like $data |
460
|
|
|
|
|
|
|
|
461
|
10
|
50
|
33
|
|
|
54
|
if (!defined $charset_declared || $charset_declared eq '') { |
462
|
0
|
|
|
|
|
0
|
$charset_declared = 'us-ascii'; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# number of characters with code above 127 |
466
|
10
|
|
|
|
|
25
|
my $cnt_8bits = $_[0] =~ tr/\x00-\x7F//c; |
467
|
|
|
|
|
|
|
|
468
|
10
|
50
|
33
|
|
|
74
|
if (!$cnt_8bits && |
469
|
|
|
|
|
|
|
$charset_declared =~ |
470
|
|
|
|
|
|
|
/^(?: (?:US-)?ASCII | ANSI[_ ]? X3\.4- (?:1986|1968) | |
471
|
|
|
|
|
|
|
ISO646-US )\z/xsi) |
472
|
|
|
|
|
|
|
{ # declared as US-ASCII (a.k.a. ANSI X3.4-1986) and it really is |
473
|
0
|
|
|
|
|
0
|
dbg("message: kept, charset is US-ASCII as declared"); |
474
|
0
|
|
|
|
|
0
|
return $_[0]; # is all-ASCII, no need for decoding |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
10
|
100
|
66
|
|
|
103
|
if (!$cnt_8bits && |
478
|
|
|
|
|
|
|
$charset_declared =~ |
479
|
|
|
|
|
|
|
/^(?: ISO[ -]?8859 (?: - \d{1,2} )? | Windows-\d{4} | |
480
|
|
|
|
|
|
|
UTF-?8 | (KOI8|EUC)-[A-Z]{1,2} | |
481
|
|
|
|
|
|
|
Big5 | GBK | GB[ -]?18030 (?:-20\d\d)? )\z/xsi) |
482
|
|
|
|
|
|
|
{ # declared as extended ASCII, but it is actually a plain 7-bit US-ASCII |
483
|
8
|
|
|
|
|
24
|
dbg("message: kept, charset is US-ASCII, declared %s", $charset_declared); |
484
|
8
|
|
|
|
|
36
|
return $_[0]; # is all-ASCII, no need for decoding |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
# Try first to strictly decode based on a declared character set. |
488
|
|
|
|
|
|
|
|
489
|
2
|
|
|
|
|
5
|
my $rv; |
490
|
2
|
50
|
33
|
|
|
29
|
if ($charset_declared =~ /^UTF-?8\z/i) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
491
|
|
|
|
|
|
|
# attempt decoding as strict UTF-8 (flags: FB_CROAK | LEAVE_SRC) |
492
|
0
|
0
|
|
|
|
0
|
if (eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
493
|
0
|
|
|
|
|
0
|
dbg("message: decoded as declared charset UTF-8"); |
494
|
0
|
0
|
|
|
|
0
|
return $_[0] if !$return_decoded; |
495
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
496
|
0
|
|
|
|
|
0
|
return $rv; # decoded |
497
|
|
|
|
|
|
|
} else { |
498
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding as declared charset UTF-8"); |
499
|
|
|
|
|
|
|
}; |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
} elsif ($charset_declared =~ /^UTF[ -]?16/i) { |
502
|
|
|
|
|
|
|
# Handle cases where spammers use UTF-16 encoding without including a BOM |
503
|
|
|
|
|
|
|
# or declaring endianness as reported at: |
504
|
|
|
|
|
|
|
# https://bz.apache.org/SpamAssassin/show_bug.cgi?id=7252 |
505
|
|
|
|
|
|
|
|
506
|
0
|
|
|
|
|
0
|
my $decoder = detect_utf16( $_[0] ); |
507
|
0
|
0
|
|
|
|
0
|
if (eval { $rv = $decoder->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
508
|
0
|
|
|
|
|
0
|
dbg("message: declared charset %s decoded as charset %s", $charset_declared, $decoder->name); |
509
|
0
|
0
|
|
|
|
0
|
return $_[0] if !$return_decoded; |
510
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
511
|
0
|
|
|
|
|
0
|
return $rv; # decoded |
512
|
|
|
|
|
|
|
} else { |
513
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding as declared charset $charset_declared"); |
514
|
|
|
|
|
|
|
}; |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
} elsif ($cnt_8bits && |
517
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_utf8->decode($_[0], 1|8); defined $rv }) { |
|
0
|
|
|
|
|
0
|
|
518
|
0
|
|
|
|
|
0
|
dbg("message: decoded as charset UTF-8, declared %s", $charset_declared); |
519
|
0
|
0
|
|
|
|
0
|
return $_[0] if !$return_decoded; |
520
|
0
|
|
|
|
|
0
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
521
|
0
|
|
|
|
|
0
|
return $rv; # decoded |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
} elsif ($charset_declared =~ /^(?:US-)?ASCII\z/i) { |
524
|
|
|
|
|
|
|
# declared as US-ASCII but contains 8-bit characters, makes no sense |
525
|
|
|
|
|
|
|
# to attempt decoding first as strict US-ASCII as we know it would fail |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
} else { |
528
|
|
|
|
|
|
|
# try decoding as a declared character set |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# -> http://en.wikipedia.org/wiki/Windows-1252 |
531
|
|
|
|
|
|
|
# Windows-1252 character encoding is a superset of ISO 8859-1, but differs |
532
|
|
|
|
|
|
|
# from the IANA's ISO-8859-1 by using displayable characters rather than |
533
|
|
|
|
|
|
|
# control characters in the 80 to 9F (hex) range. [...] |
534
|
|
|
|
|
|
|
# It is very common to mislabel Windows-1252 text with the charset label |
535
|
|
|
|
|
|
|
# ISO-8859-1. A common result was that all the quotes and apostrophes |
536
|
|
|
|
|
|
|
# (produced by "smart quotes" in word-processing software) were replaced |
537
|
|
|
|
|
|
|
# with question marks or boxes on non-Windows operating systems, making |
538
|
|
|
|
|
|
|
# text difficult to read. Most modern web browsers and e-mail clients |
539
|
|
|
|
|
|
|
# treat the MIME charset ISO-8859-1 as Windows-1252 to accommodate |
540
|
|
|
|
|
|
|
# such mislabeling. This is now standard behavior in the draft HTML 5 |
541
|
|
|
|
|
|
|
# specification, which requires that documents advertised as ISO-8859-1 |
542
|
|
|
|
|
|
|
# actually be parsed with the Windows-1252 encoding. |
543
|
|
|
|
|
|
|
# |
544
|
2
|
|
|
|
|
7
|
my($chset, $decoder); |
545
|
2
|
50
|
|
|
|
12
|
if ($charset_declared =~ /^(?: ISO-?8859-1 | Windows-1252 | CP1252 )\z/xi) { |
546
|
0
|
|
|
|
|
0
|
$chset = 'Windows-1252'; $decoder = $enc_w1252; |
|
0
|
|
|
|
|
0
|
|
547
|
|
|
|
|
|
|
} else { |
548
|
2
|
|
|
|
|
5
|
$chset = $charset_declared; $decoder = Encode::find_encoding($chset); |
|
2
|
|
|
|
|
31
|
|
549
|
2
|
50
|
33
|
|
|
17108
|
if (!$decoder && $chset =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
550
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
551
|
0
|
0
|
|
|
|
0
|
dbg("message: no decoder for a declared charset %s, using GBK", |
552
|
|
|
|
|
|
|
$chset) if $decoder; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
} |
555
|
2
|
50
|
|
|
|
9
|
if (!$decoder) { |
556
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding, no decoder for a declared charset %s", |
557
|
|
|
|
|
|
|
$chset); |
558
|
|
|
|
|
|
|
} else { |
559
|
2
|
|
|
|
|
3
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
2
|
|
|
|
|
14
|
|
560
|
2
|
50
|
|
|
|
254
|
if (lc $chset eq lc $charset_declared) { |
561
|
2
|
50
|
|
|
|
14
|
dbg("message: %s as declared charset %s", |
562
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', $charset_declared); |
563
|
|
|
|
|
|
|
} else { |
564
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as charset %s, declared %s", |
565
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
566
|
|
|
|
|
|
|
$chset, $charset_declared); |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# If the above failed, check if it is US-ASCII, possibly extended by few |
572
|
|
|
|
|
|
|
# NBSP or SHY characters from ISO-8859-* or Windows-1252, or containing |
573
|
|
|
|
|
|
|
# some popular punctuation or special characters from Windows-1252 in |
574
|
|
|
|
|
|
|
# the \x80-\x9F range (which is unassigned in ISO-8859-*). |
575
|
|
|
|
|
|
|
# Note that Windows-1252 is a proper superset of ISO-8859-1. |
576
|
|
|
|
|
|
|
# |
577
|
2
|
50
|
33
|
|
|
18
|
if (!defined $rv && !$cnt_8bits) { |
|
|
50
|
33
|
|
|
|
|
|
|
|
0
|
|
|
|
|
578
|
0
|
|
|
|
|
0
|
dbg("message: kept, guessed charset is US-ASCII, declared %s", |
579
|
|
|
|
|
|
|
$charset_declared); |
580
|
0
|
|
|
|
|
0
|
return $_[0]; # is all-ASCII, no need for decoding |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
} elsif (!defined $rv && $enc_w1252 && |
583
|
|
|
|
|
|
|
# ASCII NBSP (c) SHY ' " ... '".- TM |
584
|
|
|
|
|
|
|
$_[0] !~ tr/\x00-\x7F\xA0\xA9\xAD\x82\x84\x85\x91-\x97\x99//c) |
585
|
|
|
|
|
|
|
{ # ASCII + NBSP + SHY + some punctuation characters |
586
|
|
|
|
|
|
|
# NBSP (A0) and SHY (AD) are at the same position in ISO-8859-* too |
587
|
|
|
|
|
|
|
# consider also: AE (r), 80 Euro |
588
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
0
|
|
|
|
|
0
|
|
589
|
|
|
|
|
|
|
# the above can't fail, but keep code general just in case |
590
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as guessed charset %s, declared %s", |
591
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
592
|
|
|
|
|
|
|
'Windows-1252', $charset_declared); |
593
|
|
|
|
|
|
|
} |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# If we were unsuccessful so far, try some guesswork |
596
|
|
|
|
|
|
|
# based on Encode::Detect::Detector . |
597
|
|
|
|
|
|
|
|
598
|
2
|
50
|
|
|
|
8
|
if (defined $rv) { |
|
|
0
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# done, no need for guesswork |
600
|
|
|
|
|
|
|
} elsif (!$have_encode_detector) { |
601
|
0
|
|
|
|
|
0
|
dbg("message: Encode::Detect::Detector not available, declared %s failed", |
602
|
|
|
|
|
|
|
$charset_declared); |
603
|
|
|
|
|
|
|
} else { |
604
|
0
|
|
|
|
|
0
|
my $charset_detected = Encode::Detect::Detector::detect($_[0]); |
605
|
0
|
0
|
0
|
|
|
0
|
if ($charset_detected && lc $charset_detected ne lc $charset_declared) { |
606
|
0
|
|
|
|
|
0
|
my $decoder = Encode::find_encoding($charset_detected); |
607
|
0
|
0
|
0
|
|
|
0
|
if (!$decoder && $charset_detected =~ /^GB[ -]?18030(?:-20\d\d)?\z/i) { |
608
|
0
|
|
|
|
|
0
|
$decoder = Encode::find_encoding('GBK'); # a subset of GB18030 |
609
|
0
|
0
|
|
|
|
0
|
dbg("message: no decoder for a detected charset %s, using GBK", |
610
|
|
|
|
|
|
|
$charset_detected) if $decoder; |
611
|
|
|
|
|
|
|
} |
612
|
0
|
0
|
|
|
|
0
|
if (!$decoder) { |
613
|
0
|
|
|
|
|
0
|
dbg("message: failed decoding, no decoder for a detected charset %s", |
614
|
|
|
|
|
|
|
$charset_detected); |
615
|
|
|
|
|
|
|
} else { |
616
|
0
|
|
|
|
|
0
|
eval { $rv = $decoder->decode($_[0], 1|8) }; # FB_CROAK | LEAVE_SRC |
|
0
|
|
|
|
|
0
|
|
617
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as detected charset %s, declared %s", |
618
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
619
|
|
|
|
|
|
|
$charset_detected, $charset_declared); |
620
|
|
|
|
|
|
|
} |
621
|
|
|
|
|
|
|
} |
622
|
|
|
|
|
|
|
} |
623
|
|
|
|
|
|
|
|
624
|
2
|
50
|
|
|
|
8
|
if (!defined $rv) { # all decoding attempts failed so far, probably garbage |
625
|
|
|
|
|
|
|
# go for Windows-1252 which can't fail |
626
|
0
|
|
|
|
|
0
|
eval { $rv = $enc_w1252->decode($_[0]) }; |
|
0
|
|
|
|
|
0
|
|
627
|
0
|
0
|
|
|
|
0
|
dbg("message: %s as last-resort charset %s, declared %s", |
628
|
|
|
|
|
|
|
defined $rv ? 'decoded' : 'failed decoding', |
629
|
|
|
|
|
|
|
'Windows-1252', $charset_declared); |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
2
|
50
|
|
|
|
7
|
if (!defined $rv) { # just in case - all decoding attempts failed so far |
633
|
0
|
|
|
|
|
0
|
return $_[0]; # garbage-in / garbage-out, return unchanged octets |
634
|
|
|
|
|
|
|
} |
635
|
|
|
|
|
|
|
# decoding octets to characters was successful |
636
|
2
|
50
|
|
|
|
7
|
if (!$return_decoded) { |
637
|
|
|
|
|
|
|
# utf8::encode() is much faster than $enc_utf8->encode on utf8-flagged arg |
638
|
2
|
|
|
|
|
7
|
utf8::encode($rv); # encode Unicode characters to UTF-8 octets |
639
|
|
|
|
|
|
|
} |
640
|
2
|
|
|
|
|
6
|
$rv .= $data_taint; # carry taintedness over, avoid Encode bug |
641
|
2
|
|
|
|
|
17
|
return $rv; |
642
|
|
|
|
|
|
|
} |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
=item rendered() |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
render_text() takes the given text/* type MIME part, and attempts to |
647
|
|
|
|
|
|
|
render it into a text scalar. It will always render text/html, and will |
648
|
|
|
|
|
|
|
use a heuristic to determine if other text/* parts should be considered |
649
|
|
|
|
|
|
|
text/html. Two scalars are returned: the rendered type (either text/html |
650
|
|
|
|
|
|
|
or whatever the original type was), and the rendered text. |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=cut |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
sub rendered { |
655
|
129
|
|
|
129
|
1
|
398
|
my ($self) = @_; |
656
|
|
|
|
|
|
|
|
657
|
129
|
100
|
|
|
|
545
|
if (!exists $self->{rendered}) { |
658
|
|
|
|
|
|
|
# We only know how to render text/plain and text/html ... |
659
|
|
|
|
|
|
|
# Note: for bug 4843, make sure to skip text/calendar parts |
660
|
|
|
|
|
|
|
# we also want to skip things like text/x-vcard |
661
|
|
|
|
|
|
|
# text/x-aol is ignored here, but looks like text/html ... |
662
|
113
|
50
|
|
|
|
1065
|
return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); |
663
|
|
|
|
|
|
|
|
664
|
113
|
|
|
|
|
649
|
my $text = $self->decode; # QP and Base64 decoding, bytes |
665
|
113
|
|
|
|
|
398
|
my $text_len = length($text); # num of bytes in original charset encoding |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
# render text/html always, or any other text|text/plain part as text/html |
668
|
|
|
|
|
|
|
# based on a heuristic which simulates a certain common mail client |
669
|
113
|
100
|
66
|
|
|
2466
|
if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i || |
|
|
|
100
|
|
|
|
|
670
|
|
|
|
|
|
|
($self->{'type'} =~ m{^text/plain$}i && |
671
|
|
|
|
|
|
|
_html_render(substr($text, 0, 23))))) |
672
|
|
|
|
|
|
|
{ |
673
|
7
|
|
|
|
|
62
|
$self->{rendered_type} = 'text/html'; |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
# will input text to HTML::Parser be provided as Unicode characters? |
676
|
7
|
|
|
|
|
38
|
my $character_semantics = 0; # $text is in bytes |
677
|
7
|
50
|
33
|
|
|
90
|
if ($self->{normalize} && $enc_utf8) { # charset decoding requested |
|
|
100
|
100
|
|
|
|
|
678
|
|
|
|
|
|
|
# Provide input to HTML::Parser as Unicode characters |
679
|
|
|
|
|
|
|
# which avoids a HTML::Parser bug in utf8_mode |
680
|
|
|
|
|
|
|
# https://rt.cpan.org/Public/Bug/Display.html?id=99755 |
681
|
|
|
|
|
|
|
# Note: the above bug was fixed in HTML-Parser 3.72, January 2016. |
682
|
|
|
|
|
|
|
# Avoid unnecessary step of encoding-then-decoding by telling |
683
|
|
|
|
|
|
|
# subroutine _normalize() to return Unicode text. See Bug 7133 |
684
|
|
|
|
|
|
|
# |
685
|
0
|
|
|
|
|
0
|
$character_semantics = 1; # $text will be in characters |
686
|
0
|
|
|
|
|
0
|
$text = _normalize($text, $self->{charset}, 1); # bytes to chars |
687
|
|
|
|
|
|
|
} elsif (!defined $self->{charset} || |
688
|
|
|
|
|
|
|
$self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) { |
689
|
|
|
|
|
|
|
# With some luck input can be interpreted as UTF-8, do not warn. |
690
|
|
|
|
|
|
|
# It is still possible to hit the HTML::Parses utf8_mode bug however. |
691
|
|
|
|
|
|
|
} else { |
692
|
|
|
|
|
|
|
dbg("message: 'normalize_charset' is off, encoding will likely ". |
693
|
1
|
|
|
|
|
6
|
"be misinterpreted; declared charset: %s", $self->{charset}); |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
# the 0 requires decoded HTML results to be in bytes (not characters) |
696
|
7
|
|
|
|
|
166
|
my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object |
697
|
|
|
|
|
|
|
|
698
|
7
|
|
|
|
|
74
|
$html->parse($text); # parse+render text |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# resulting HTML-decoded text is in bytes, likely encoded as UTF-8 |
701
|
7
|
|
|
|
|
52
|
$self->{rendered} = $html->get_rendered_text(); |
702
|
7
|
|
|
|
|
31
|
$self->{visible_rendered} = $html->get_rendered_text(invisible => 0); |
703
|
7
|
|
|
|
|
38
|
$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1); |
704
|
7
|
|
|
|
|
37
|
$self->{html_results} = $html->get_results(); |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
# end-of-document result values that require looking at the text |
707
|
7
|
|
|
|
|
19
|
my $r = $self->{html_results}; # temporary reference for brevity |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
# count the number of spaces in the rendered text (likely UTF-8 octets) |
710
|
7
|
|
|
|
|
40
|
my $space = $self->{rendered} =~ tr/ \t\n\r\x0b//; |
711
|
|
|
|
|
|
|
# we may want to add the count of other Unicode whitespace characters |
712
|
|
|
|
|
|
|
|
713
|
7
|
|
|
|
|
27
|
$r->{html_length} = length $self->{rendered}; # bytes (likely UTF-8) |
714
|
7
|
|
|
|
|
24
|
$r->{non_space_len} = $r->{html_length} - $space; |
715
|
7
|
50
|
|
|
|
174
|
$r->{ratio} = ($text_len - $r->{html_length}) / $text_len if $text_len; |
716
|
|
|
|
|
|
|
} |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
else { # plain text |
719
|
106
|
0
|
33
|
|
|
412
|
if ($self->{normalize} && $enc_utf8) { |
720
|
|
|
|
|
|
|
# request transcoded result as UTF-8 octets! |
721
|
0
|
|
|
|
|
0
|
$text = _normalize($text, $self->{charset}, 0); |
722
|
|
|
|
|
|
|
} |
723
|
106
|
|
|
|
|
462
|
$self->{rendered_type} = $self->{type}; |
724
|
106
|
|
|
|
|
493
|
$self->{rendered} = $self->{'visible_rendered'} = $text; |
725
|
106
|
|
|
|
|
434
|
$self->{'invisible_rendered'} = ''; |
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
} |
728
|
|
|
|
|
|
|
|
729
|
129
|
|
|
|
|
665
|
return ($self->{rendered_type}, $self->{rendered}); |
730
|
|
|
|
|
|
|
} |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
=item set_rendered($text, $type) |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
Set the rendered text and type for the given part. If type is not |
735
|
|
|
|
|
|
|
specified, and text is a defined value, a default of 'text/plain' is used. |
736
|
|
|
|
|
|
|
This can be used, for instance, to render non-text parts using plugins. |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
=cut |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
sub set_rendered { |
741
|
0
|
|
|
0
|
1
|
0
|
my ($self, $text, $type) = @_; |
742
|
|
|
|
|
|
|
|
743
|
0
|
0
|
0
|
|
|
0
|
$type = 'text/plain' if (!defined $type && defined $text); |
744
|
|
|
|
|
|
|
|
745
|
0
|
|
|
|
|
0
|
$self->{'rendered_type'} = $type; |
746
|
0
|
|
|
|
|
0
|
$self->{'rendered'} = $self->{'visible_rendered'} = $text; |
747
|
0
|
0
|
|
|
|
0
|
$self->{'invisible_rendered'} = defined $text ? '' : undef; |
748
|
|
|
|
|
|
|
} |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=item visible_rendered() |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
Render and return the visible text in this part. |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=cut |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
sub visible_rendered { |
757
|
8
|
|
|
8
|
1
|
30
|
my ($self) = @_; |
758
|
8
|
|
|
|
|
78
|
$self->rendered(); # ignore return, we want just this: |
759
|
8
|
|
|
|
|
39
|
return ($self->{rendered_type}, $self->{visible_rendered}); |
760
|
|
|
|
|
|
|
} |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=item invisible_rendered() |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
Render and return the invisible text in this part. |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=cut |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
sub invisible_rendered { |
769
|
8
|
|
|
8
|
1
|
23
|
my ($self) = @_; |
770
|
8
|
|
|
|
|
44
|
$self->rendered(); # ignore return, we want just this: |
771
|
8
|
|
|
|
|
23
|
return ($self->{rendered_type}, $self->{invisible_rendered}); |
772
|
|
|
|
|
|
|
} |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=item content_summary() |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
Returns an array of scalars describing the mime parts of the message. |
777
|
|
|
|
|
|
|
Note: This function requires that the message be parsed first! |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=cut |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
# return an array with scalars describing mime parts |
782
|
|
|
|
|
|
|
sub content_summary { |
783
|
13
|
|
|
13
|
1
|
203
|
my($self) = @_; |
784
|
|
|
|
|
|
|
|
785
|
13
|
|
|
|
|
49
|
my @ret = ( [ $self->{'type'} ] ); |
786
|
13
|
|
|
|
|
17
|
my @search; |
787
|
|
|
|
|
|
|
|
788
|
13
|
100
|
|
|
|
32
|
if (exists $self->{'body_parts'}) { |
789
|
11
|
|
|
|
|
15
|
my $count = @{$self->{'body_parts'}}; |
|
11
|
|
|
|
|
22
|
|
790
|
11
|
|
|
|
|
28
|
for(my $i=0; $i<$count; $i++) { |
791
|
24
|
|
|
|
|
62
|
push(@search, [ $i+1, $self->{'body_parts'}->[$i] ]); |
792
|
|
|
|
|
|
|
} |
793
|
|
|
|
|
|
|
} |
794
|
|
|
|
|
|
|
|
795
|
13
|
|
|
|
|
33
|
while(my $part = shift @search) { |
796
|
45
|
|
|
|
|
40
|
my($index, $part) = @{$part}; |
|
45
|
|
|
|
|
57
|
|
797
|
45
|
|
|
|
|
41
|
push(@{$ret[$index]}, $part->{'type'}); |
|
45
|
|
|
|
|
70
|
|
798
|
45
|
100
|
|
|
|
94
|
if (exists $part->{'body_parts'}) { |
799
|
11
|
|
|
|
|
13
|
unshift(@search, map { [ $index, $_ ] } @{$part->{'body_parts'}}); |
|
21
|
|
|
|
|
44
|
|
|
11
|
|
|
|
|
20
|
|
800
|
|
|
|
|
|
|
} |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
|
803
|
13
|
|
|
|
|
22
|
return map { join(",", @{$_}) } @ret; |
|
37
|
|
|
|
|
39
|
|
|
37
|
|
|
|
|
104
|
|
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=item delete_header() |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
Delete the specified header (decoded and raw) from the Node information. |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
=cut |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
sub delete_header { |
813
|
505
|
|
|
505
|
1
|
1057
|
my($self, $hdr) = @_; |
814
|
|
|
|
|
|
|
|
815
|
505
|
|
|
|
|
692
|
foreach ( grep(/^${hdr}$/i, keys %{$self->{'headers'}}) ) { |
|
505
|
|
|
|
|
7528
|
|
816
|
0
|
|
|
|
|
0
|
delete $self->{'headers'}->{$_}; |
817
|
0
|
|
|
|
|
0
|
delete $self->{'raw_headers'}->{$_}; |
818
|
|
|
|
|
|
|
} |
819
|
|
|
|
|
|
|
|
820
|
505
|
|
|
|
|
1167
|
my @neworder = grep(!/^${hdr}$/i, @{$self->{'header_order'}}); |
|
505
|
|
|
|
|
5474
|
|
821
|
505
|
|
|
|
|
1783
|
$self->{'header_order'} = \@neworder; |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
# decode a header appropriately. don't bother adding it to the pod documents. |
825
|
|
|
|
|
|
|
sub __decode_header { |
826
|
10
|
|
|
10
|
|
30
|
my ( $encoding, $cte, $data ) = @_; |
827
|
|
|
|
|
|
|
|
828
|
10
|
100
|
|
|
|
27
|
if ( $cte eq 'B' ) { |
|
|
50
|
|
|
|
|
|
829
|
|
|
|
|
|
|
# base 64 encoded |
830
|
2
|
|
|
|
|
14
|
$data = Mail::SpamAssassin::Util::base64_decode($data); |
831
|
|
|
|
|
|
|
} |
832
|
|
|
|
|
|
|
elsif ( $cte eq 'Q' ) { |
833
|
|
|
|
|
|
|
# quoted printable |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
# the RFC states that in the encoded text, "_" is equal to "=20" |
836
|
8
|
|
|
|
|
16
|
$data =~ s/_/=20/g; |
837
|
|
|
|
|
|
|
|
838
|
8
|
|
|
|
|
17
|
$data = Mail::SpamAssassin::Util::qp_decode($data); |
839
|
|
|
|
|
|
|
} |
840
|
|
|
|
|
|
|
else { |
841
|
|
|
|
|
|
|
# not possible since the input has already been limited to 'B' and 'Q' |
842
|
0
|
|
|
|
|
0
|
die "message: unknown encoding type '$cte' in RFC2047 header"; |
843
|
|
|
|
|
|
|
} |
844
|
10
|
|
|
|
|
53
|
return _normalize($data, $encoding, 0); # transcode to UTF-8 octets |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
# Decode base64 and quoted-printable in headers according to RFC2047. |
848
|
|
|
|
|
|
|
# |
849
|
|
|
|
|
|
|
sub _decode_header { |
850
|
800
|
|
|
800
|
|
1667
|
my($header_field_body, $header_field_name) = @_; |
851
|
|
|
|
|
|
|
|
852
|
800
|
100
|
66
|
|
|
3151
|
return '' unless defined $header_field_body && $header_field_body ne ''; |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
# deal with folding and cream the newlines and such |
855
|
750
|
|
|
|
|
1322
|
$header_field_body =~ s/\n[ \t]+/\n /g; |
856
|
750
|
|
|
|
|
1044
|
$header_field_body =~ s/\015?\012//gs; |
857
|
|
|
|
|
|
|
|
858
|
750
|
100
|
|
|
|
3543
|
if ($header_field_name =~ |
859
|
|
|
|
|
|
|
/^ (?: Received | (?:Resent-)? (?: Message-ID | Date ) | |
860
|
|
|
|
|
|
|
MIME-Version | References | In-Reply-To | List-.* ) \z /xsi ) { |
861
|
|
|
|
|
|
|
# Bug 6945: some header fields must not be processed for MIME encoding |
862
|
|
|
|
|
|
|
# Bug 7466: leave out the Content-* |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
} else { |
865
|
453
|
|
|
|
|
1420
|
local($1,$2,$3); |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
# Multiple encoded sections must ignore the interim whitespace. |
868
|
|
|
|
|
|
|
# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE |
869
|
|
|
|
|
|
|
# separated by whitespace. |
870
|
453
|
|
|
|
|
1063
|
1 while $header_field_body =~ |
871
|
|
|
|
|
|
|
s{ ( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) \s+ |
872
|
|
|
|
|
|
|
( = \? [A-Za-z0-9_-]+ \? [bqBQ] \? [^?]* \? = ) } |
873
|
|
|
|
|
|
|
{$1$2}xsg; |
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
# transcode properly encoded RFC 2047 substrings into UTF-8 octets, |
876
|
|
|
|
|
|
|
# leave everything else unchanged as it is supposed to be UTF-8 (RFC 6532) |
877
|
453
|
|
|
|
|
1055
|
# or plain US-ASCII |
878
|
10
|
|
|
|
|
38
|
$header_field_body =~ |
879
|
|
|
|
|
|
|
s{ (?: = \? ([A-Za-z0-9_-]+) \? ([bqBQ]) \? ([^?]*) \? = ) } |
880
|
|
|
|
|
|
|
{ __decode_header($1, uc($2), $3) }xsge; |
881
|
|
|
|
|
|
|
} |
882
|
750
|
|
|
|
|
1596
|
|
883
|
|
|
|
|
|
|
# dbg("message: _decode_header %s: %s", $header_field_name, $header_field_body); |
884
|
|
|
|
|
|
|
return $header_field_body; |
885
|
|
|
|
|
|
|
} |
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
=item get_header() |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
Retrieve a specific header. Will have a newline at the end and will be |
890
|
|
|
|
|
|
|
unfolded. The first parameter is the header name (case-insensitive), |
891
|
|
|
|
|
|
|
and the second parameter (optional) is whether or not to return the |
892
|
|
|
|
|
|
|
raw header. |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
If get_header() is called in an array context, an array will be returned |
895
|
|
|
|
|
|
|
with each header entry in a different element. In a scalar context, |
896
|
|
|
|
|
|
|
the last specific header is returned. |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
ie: If 'Subject' is specified as the header, and there are 2 Subject |
899
|
|
|
|
|
|
|
headers in a message, the last/bottom one in the message is returned in |
900
|
|
|
|
|
|
|
scalar context or both are returned in array context. |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
Btw, returning the last header field (not the first) happens to be consistent |
903
|
|
|
|
|
|
|
with DKIM signatures, which search for and cover multiple header fields |
904
|
|
|
|
|
|
|
bottom-up according to the 'h' tag. Let's keep it this way. |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
=cut |
907
|
4075
|
|
|
4075
|
1
|
7161
|
|
908
|
4075
|
|
50
|
|
|
13154
|
sub get_header { |
909
|
|
|
|
|
|
|
my ($self, $hdr, $raw) = @_; |
910
|
|
|
|
|
|
|
$raw ||= 0; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
# And now pick up all the entries into a list |
913
|
|
|
|
|
|
|
# This is assumed to include a newline at the end ... |
914
|
|
|
|
|
|
|
# This is also assumed to have removed continuation bits ... |
915
|
4075
|
|
|
|
|
4715
|
|
916
|
4075
|
50
|
|
|
|
6280
|
# Deal with the possibility that header() or raw_header() returns undef |
917
|
0
|
0
|
|
|
|
0
|
my @hdrs; |
918
|
0
|
|
|
|
|
0
|
if ( $raw ) { |
919
|
|
|
|
|
|
|
if (@hdrs = $self->raw_header($hdr)) { |
920
|
|
|
|
|
|
|
s/\015?\012\s+/ /gs for @hdrs; |
921
|
|
|
|
|
|
|
} |
922
|
4075
|
100
|
|
|
|
7554
|
} |
923
|
658
|
|
|
|
|
2285
|
else { |
924
|
|
|
|
|
|
|
if (@hdrs = $self->header($hdr)) { |
925
|
|
|
|
|
|
|
$_ .= "\n" for @hdrs; |
926
|
|
|
|
|
|
|
} |
927
|
4075
|
100
|
|
|
|
6151
|
} |
928
|
3586
|
|
|
|
|
8295
|
|
929
|
|
|
|
|
|
|
if (wantarray) { |
930
|
|
|
|
|
|
|
return @hdrs; |
931
|
489
|
100
|
|
|
|
1838
|
} |
932
|
|
|
|
|
|
|
else { |
933
|
|
|
|
|
|
|
return @hdrs ? $hdrs[-1] : undef; |
934
|
|
|
|
|
|
|
} |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
=item get_all_headers() |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
Retrieve all headers. Each header will have a newline at the end and |
940
|
|
|
|
|
|
|
will be unfolded. The first parameter (optional) is whether or not to |
941
|
|
|
|
|
|
|
return the raw headers, and the second parameter (optional) is whether |
942
|
|
|
|
|
|
|
or not to include the mbox separator. |
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
If get_all_header() is called in an array context, an array will be |
945
|
|
|
|
|
|
|
returned with each header entry in a different element. In a scalar |
946
|
|
|
|
|
|
|
context, the headers are returned in a single scalar. |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
=back |
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
=cut |
951
|
|
|
|
|
|
|
|
952
|
26
|
|
|
26
|
1
|
66
|
# build it and it will not bomb |
953
|
26
|
|
50
|
|
|
155
|
sub get_all_headers { |
954
|
26
|
|
100
|
|
|
85
|
my ($self, $raw, $include_mbox) = @_; |
955
|
|
|
|
|
|
|
$raw ||= 0; |
956
|
26
|
|
|
|
|
41
|
$include_mbox ||= 0; |
957
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
my @lines; |
959
|
26
|
|
|
|
|
41
|
|
960
|
26
|
|
|
|
|
37
|
# precalculate destination positions based on order of appearance |
961
|
26
|
|
|
|
|
39
|
my $i = 0; |
|
26
|
|
|
|
|
108
|
|
962
|
332
|
|
|
|
|
365
|
my %locations; |
|
332
|
|
|
|
|
804
|
|
963
|
|
|
|
|
|
|
for my $k (@{$self->{header_order}}) { |
964
|
|
|
|
|
|
|
push(@{$locations{lc($k)}}, $i++); |
965
|
|
|
|
|
|
|
} |
966
|
26
|
|
|
|
|
42
|
|
967
|
26
|
|
|
|
|
45
|
# process headers in order of first appearance |
968
|
26
|
|
|
|
|
215
|
my $header; |
|
572
|
|
|
|
|
784
|
|
969
|
|
|
|
|
|
|
my $size = 0; |
970
|
|
|
|
|
|
|
HEADER: for my $name (sort { $locations{$a}->[0] <=> $locations{$b}->[0] } |
971
|
|
|
|
|
|
|
keys %locations) |
972
|
250
|
|
|
|
|
339
|
{ |
973
|
250
|
|
|
|
|
428
|
# get all same-name headers and poke into correct position |
974
|
332
|
|
|
|
|
375
|
my $positions = $locations{$name}; |
|
332
|
|
|
|
|
409
|
|
975
|
332
|
|
|
|
|
451
|
for my $contents ($self->get_header($name, $raw)) { |
976
|
332
|
50
|
|
|
|
552
|
my $position = shift @{$positions}; |
977
|
0
|
|
|
|
|
0
|
$size += length($name) + length($contents) + 2; |
978
|
0
|
|
|
|
|
0
|
if ($size > MAX_HEADER_LENGTH) { |
979
|
|
|
|
|
|
|
$self->{'truncated_header'} = 1; |
980
|
332
|
|
|
|
|
878
|
last HEADER; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
$lines[$position] = $self->{header_order}->[$position].":".$contents; |
983
|
|
|
|
|
|
|
} |
984
|
|
|
|
|
|
|
} |
985
|
26
|
50
|
|
|
|
103
|
|
|
0
|
|
|
|
|
0
|
|
986
|
|
|
|
|
|
|
# skip undefined lines if we truncated |
987
|
26
|
50
|
66
|
|
|
141
|
@lines = grep { defined $_ } @lines if $self->{'truncated_header'}; |
988
|
|
|
|
|
|
|
|
989
|
26
|
100
|
|
|
|
327
|
splice @lines, 0, 0, $self->{mbox_sep} if ( $include_mbox && exists $self->{mbox_sep} ); |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
return wantarray ? @lines : join ('', @lines); |
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
0
|
0
|
|
|
994
|
|
|
|
|
|
|
# legacy public API; now a no-op. |
995
|
|
|
|
|
|
|
sub finish { } |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
1; |
1000
|
|
|
|
|
|
|
__END__ |