| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# <@LICENSE> |
|
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
|
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
|
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
|
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
|
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
|
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
|
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
|
15
|
|
|
|
|
|
|
# limitations under the License. |
|
16
|
|
|
|
|
|
|
# </@LICENSE> |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
loadplugin Mail::SpamAssassin::Plugin::PDFInfo |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This plugin helps detect spam using attached PDF files |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=over 4 |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=item See "Usage:" below - more documentation see 20_pdfinfo.cf |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
Original info kept for history. For later changes see SVN repo |
|
35
|
|
|
|
|
|
|
------------------------------------------------------- |
|
36
|
|
|
|
|
|
|
PDFInfo Plugin for SpamAssassin |
|
37
|
|
|
|
|
|
|
Version: 0.8 |
|
38
|
|
|
|
|
|
|
Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $ |
|
39
|
|
|
|
|
|
|
Created: 2007-08-10 |
|
40
|
|
|
|
|
|
|
Modified: 2007-08-10 |
|
41
|
|
|
|
|
|
|
By: Dallas Engelken |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Changes: |
|
44
|
|
|
|
|
|
|
0.8 - added .fdf detection (thanks John Lundin) [axb] |
|
45
|
|
|
|
|
|
|
0.7 - fixed empty body/pdf count buglet(thanks Jeremy) [axb] |
|
46
|
|
|
|
|
|
|
0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc. |
|
47
|
|
|
|
|
|
|
- fixed issue on perl 5.6.1 where pdf_match_details() failed to call |
|
48
|
|
|
|
|
|
|
_find_pdf_mime_parts(), resulting in no detection of pdf mime parts. |
|
49
|
|
|
|
|
|
|
- quoted-printable support - requires MIME::QuotedPrint (which should be in everyone's |
|
50
|
|
|
|
|
|
|
install as a part of the MIME-Base64 package which is a SA req) |
|
51
|
|
|
|
|
|
|
- added simple pdf_is_empty_body() function which counts the body bytes minus the |
|
52
|
|
|
|
|
|
|
subject line. can add optional <bytes> param if you need to allow for a few bytes. |
|
53
|
|
|
|
|
|
|
0.5 - fix warns for undef $pdf_tags |
|
54
|
|
|
|
|
|
|
- remove { } and \ before running eval in pdf_match_details to avoid eval error |
|
55
|
|
|
|
|
|
|
0.4 - added pdf_is_encrypted() function |
|
56
|
|
|
|
|
|
|
- added option to look for image HxW on same line |
|
57
|
|
|
|
|
|
|
0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data |
|
58
|
|
|
|
|
|
|
- renamed pdf_image_named() to pdf_named() |
|
59
|
|
|
|
|
|
|
- PDF images are encapsulated and have no names. We are matching the PDF file name. |
|
60
|
|
|
|
|
|
|
- renamed pdf_image_name_regex() to pdf_name_regex() |
|
61
|
|
|
|
|
|
|
- PDF images are encapsulated and have no names. We are matching the PDF file name. |
|
62
|
|
|
|
|
|
|
- changed pdf_image_count() a bit and added pdf_count(). |
|
63
|
|
|
|
|
|
|
- pdf_count() checks how many pdf attachments there are on the mail |
|
64
|
|
|
|
|
|
|
- pdf_image_count() checks how many images are found within all pdfs in the mail. |
|
65
|
|
|
|
|
|
|
- removed the restriction of the pdf containing an image in order to md5 it. |
|
66
|
|
|
|
|
|
|
- added pdf_match_details() function to check the following 'details' |
|
67
|
|
|
|
|
|
|
- author: Author of PDF if specified |
|
68
|
|
|
|
|
|
|
- producer: Software used to produce PDF |
|
69
|
|
|
|
|
|
|
- creator: Software used to produce PDF, usually similar to producer |
|
70
|
|
|
|
|
|
|
- title: Title of PDF |
|
71
|
|
|
|
|
|
|
- created: Creation Date |
|
72
|
|
|
|
|
|
|
- modified: Last Modified |
|
73
|
|
|
|
|
|
|
0.2 - support PDF octet-stream |
|
74
|
|
|
|
|
|
|
0.1 - just ported over the imageinfo code, and renamed to pdfinfo. |
|
75
|
|
|
|
|
|
|
- removed all support for png, gif, and jpg from the code. |
|
76
|
|
|
|
|
|
|
- prepended pdf_ to all function names to avoid conflicts with ImageInfo in SA 3.2. |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
Usage: |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
pdf_count() |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
body RULENAME eval:pdf_count(<min>,[max]) |
|
83
|
|
|
|
|
|
|
min: required, message contains at least x pdf mime parts |
|
84
|
|
|
|
|
|
|
max: optional, if specified, must not contain more than x pdf mime parts |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
pdf_image_count() |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
body RULENAME eval:pdf_image_count(<min>,[max]) |
|
89
|
|
|
|
|
|
|
min: required, message contains at least x images in pdf attachments. |
|
90
|
|
|
|
|
|
|
max: optional, if specified, must not contain more than x pdf images |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
pdf_pixel_coverage() |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
body RULENAME eval:pdf_pixel_coverage(<min>,[max]) |
|
95
|
|
|
|
|
|
|
min: required, message contains at least this much pixel area |
|
96
|
|
|
|
|
|
|
max: optional, if specified, message must not contain more than this much pixel area |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
pdf_named() |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
body RULENAME eval:pdf_named(<string>) |
|
101
|
|
|
|
|
|
|
string: exact file name match, if you need partial match, see pdf_name_regex() |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
pdf_name_regex() |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
body RULENAME eval:pdf_name_regex(<regex>) |
|
106
|
|
|
|
|
|
|
regex: regular expression, see examples in ruleset |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
pdf_match_md5() |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
body RULENAME eval:pdf_match_md5(<string>) |
|
111
|
|
|
|
|
|
|
string: 32-byte md5 hex |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
pdf_match_fuzzy_md5() |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
body RULENAME eval:pdf_match_fuzzy_md5(<string>) |
|
116
|
|
|
|
|
|
|
string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5 |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
pdf_match_details() |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
body RULENAME eval:pdf_match_details(<detail>,<regex>); |
|
121
|
|
|
|
|
|
|
detail: author, creator, created, modified, producer, title |
|
122
|
|
|
|
|
|
|
regex: regular expression, see examples in ruleset |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
pdf_is_encrypted() |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
body RULENAME eval:pdf_is_encrypted() |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
pdf_is_empty_body() |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
body RULENAME eval:pdf_is_empty_body(<bytes>) |
|
131
|
|
|
|
|
|
|
bytes: maximum byte count to allow and still consider it empty |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
NOTE: See the ruleset for more examples that are not documented here. |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=back |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# ------------------------------------------------------- |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
package Mail::SpamAssassin::Plugin::PDFInfo; |
|
142
|
|
|
|
|
|
|
|
|
143
|
19
|
|
|
19
|
|
160
|
use Mail::SpamAssassin::Plugin; |
|
|
19
|
|
|
|
|
50
|
|
|
|
19
|
|
|
|
|
680
|
|
|
144
|
19
|
|
|
19
|
|
123
|
use Mail::SpamAssassin::Logger; |
|
|
19
|
|
|
|
|
48
|
|
|
|
19
|
|
|
|
|
1213
|
|
|
145
|
19
|
|
|
19
|
|
145
|
use Mail::SpamAssassin::Util qw(compile_regexp); |
|
|
19
|
|
|
|
|
61
|
|
|
|
19
|
|
|
|
|
984
|
|
|
146
|
19
|
|
|
19
|
|
137
|
use strict; |
|
|
19
|
|
|
|
|
46
|
|
|
|
19
|
|
|
|
|
486
|
|
|
147
|
19
|
|
|
19
|
|
107
|
use warnings; |
|
|
19
|
|
|
|
|
53
|
|
|
|
19
|
|
|
|
|
663
|
|
|
148
|
|
|
|
|
|
|
# use bytes; |
|
149
|
19
|
|
|
19
|
|
173
|
use Digest::MD5 qw(md5_hex); |
|
|
19
|
|
|
|
|
44
|
|
|
|
19
|
|
|
|
|
1417
|
|
|
150
|
19
|
|
|
19
|
|
9270
|
use MIME::QuotedPrint; |
|
|
19
|
|
|
|
|
5136
|
|
|
|
19
|
|
|
|
|
72583
|
|
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin); |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Constructor: builds the plugin object through the parent class and
# registers every eval rule name this plugin offers to the ruleset.
#
#   $class        - class name, or an existing object to take the class from
#   $mailsaobject - passed straight through to the parent constructor
#
# Returns the new plugin object.
sub new {
  my ($class, $mailsaobject) = @_;

  # accept being invoked on either a class name or an instance
  $class = ref($class) || $class;
  my $self = bless($class->SUPER::new($mailsaobject), $class);

  # registered one by one, in exactly this order
  my @eval_rules = qw(
    pdf_count
    pdf_image_count
    pdf_pixel_coverage
    pdf_image_size_exact
    pdf_image_size_range
    pdf_named
    pdf_name_regex
    pdf_image_to_text_ratio
    pdf_match_md5
    pdf_match_fuzzy_md5
    pdf_match_details
    pdf_is_encrypted
    pdf_is_empty_body
  );
  foreach my $rule (@eval_rules) {
    $self->register_eval_rule($rule);
  }

  return $self;
}
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# ----------------------------------------- |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Dispatch table of detail extractors, keyed by the short type name that
# _find_pdf_mime_parts() settles on. Only 'pdf' is handled.
my %get_details = (

  # Scan one decoded PDF attachment line by line and record the findings
  # under $pms->{pdfinfo} and, through _set_tag(), in the PDF* tags:
  #   - PDF version and attachment file name
  #   - dimensions, count and total pixel area of embedded images
  #   - producer / creator / author / title / creation and modification dates
  #   - whether an /Encrypt entry was seen
  #   - md5 of the whole data plus two "fuzzy" md5s (early non-volatile
  #     lines, and the sequence of /Tag names)
  #
  # Arguments: the plugin object, the per-message status object and the
  # MIME part to examine. Returns early, recording nothing, when the data
  # does not start with a "?PDF-x.y" header. No meaningful return value.
  'pdf' => sub {
    my ($self, $pms, $part) = @_;

    # Every part is decoded through $part->decode(). The former
    # "quoted-printable" branch compared the part's MIME content type
    # against an encoding name (so it could never be taken) and, had it
    # run, would have decoded an empty string.
    # NOTE(review): this relies on decode() honouring the part's
    # Content-Transfer-Encoding for quoted-printable parts too - confirm
    # against Mail::SpamAssassin::Message::Node.
    my $data = $part->decode();
    return unless defined $data;

    # the header is recognised from the first 8 bytes only
    my $index = substr($data, 0, 8);

    return unless ($index =~ /.PDF\-(\d\.\d)/);
    my $version = $1;
    $self->_set_tag($pms, 'PDFVERSION', $version);
    # dbg("pdfinfo: pdf version = $version");

    my ($height, $width, $fuzzy_data, $pdf_tags);
    my ($producer, $created, $modified, $title, $creator, $author) = ('unknown','0','0','untitled','unknown','unknown');
    my ($md5, $fuzzy_md5) = ('', '');
    my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);

    my $name = $part->{'name'} || '';
    $self->_set_tag($pms, 'PDFNAME', $name);

    my $no_more_fuzzy = 0;
    my $got_image = 0;
    my $encrypted = 0;

    while($data =~ /([^\n]+)/g) {
      # dbg("pdfinfo: line=$1");
      my $line = $1;

      $line_count++;

      # lines containing high bytes will have no data we need, so save some cycles
      next if ($line =~ /[\x80-\xff]/);

      # fuzzy md5 #1: the first lines of the file, minus comment lines and
      # the size-dependent lines (/Height, /Width, /MediaBox, /CropBox, "cm")
      if (!$no_more_fuzzy && $line_count < 70) {
        if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
          $line =~ s/\s+$//;  # strip off whitespace at end.
          $fuzzy_data .= $line;
        }
      }

      # fuzzy md5 #2: concatenation of the tag names that start a line
      if ($line =~ m/^\/([A-Za-z]+)/) {
        $pdf_tags .= $1;
      }

      $got_image=1 if ($line =~ m/\/Image/);
      $encrypted=1 if ($line =~ m/^\/Encrypt/);

      # once we hit the first stream, we stop collecting data for fuzzy md5
      $no_more_fuzzy = 1 if ($line =~ m/stream/);

      # From a v1.3 pdf
      # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
      # [12234] dbg: pdfinfo: line=/Width 630
      # [12234] dbg: pdfinfo: line=/Height 149
      if ($got_image) {
        if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
          $width = $1;
          $height = $2;
        }
        elsif ($line =~ /^\/Width\s(\d+)/) {
          $width = $1;
        }
        elsif ($line =~ /^\/Height\s(\d+)/) {
          $height = $1;
        }
        elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
          $width = $1;
          $height = $2;
        }
      }

      # did pdf contain image data?
      if ($got_image && $width && $height) {
        $no_more_fuzzy = 1;
        my $area = $width * $height;
        $total_height += $height;
        $total_width += $width;
        $total_area += $area;
        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
        $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
        dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
        $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
        $height=0; $width=0;  # reset and check for next image
        $got_image = 0;
      }

      # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
      # [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
      # [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
      # [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
      # [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
      # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
      # or all on same line inside xml - v1.6+
      # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>

      if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
        $producer = $1;
      }
      if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
        $created = $1;
      }
      if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
        $modified = $1;
      }
      if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
        $title = $1;
        # Title=\376\377\000w\000w\000n\000g
        # Title=wwng
        # NOTE(review): the capture above stops at the first backslash, so
        # this substitution looks like it can never remove anything -
        # compare with the /Author pattern below.
        $title =~ s/\\\d{3}//g;
      }
      if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
        $creator = $1;
      }
      if ($line =~ /\/Author\s?\(([^\)]+)/) {
        $author = $1;
        # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
        # Author=HP_Administrator
        $author =~ s/\\\d{3}//g;
      }
    }

    # store the file name so we can check pdf_named() or pdf_name_regex() later.
    $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

    # store encrypted flag. It is sticky: once one PDF of the message is
    # encrypted, a later unencrypted PDF must not clear it again.
    $pms->{pdfinfo}->{encrypted} ||= $encrypted;

    # if we had multiple images in the pdf, we need to store the total HxW as well.
    # If it was a single Image PDF, then this value will already be in the hash.
    $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);

    if ($total_area) {
      $pms->{pdfinfo}->{pc_pdf} = $total_area;
      $self->_set_tag($pms, 'PDFIMGAREA', $total_area);
      dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)");
    }

    dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");

    $md5 = uc(md5_hex($data)) if $data;
    $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
    my $tags_md5;
    $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;

    dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));

    # we dont need tags for these.
    $pms->{pdfinfo}->{details}->{created} = $created if $created;
    $pms->{pdfinfo}->{details}->{modified} = $modified if $modified;

    if ($producer) {
      $pms->{pdfinfo}->{details}->{producer} = $producer;
      $self->_set_tag($pms, 'PDFPRODUCER', $producer);
    }
    if ($title) {
      $pms->{pdfinfo}->{details}->{title} = $title;
      $self->_set_tag($pms, 'PDFTITLE', $title);
    }
    if ($creator) {
      $pms->{pdfinfo}->{details}->{creator} = $creator;
      $self->_set_tag($pms, 'PDFCREATOR', $creator);
    }
    if ($author) {
      $pms->{pdfinfo}->{details}->{author} = $author;
      $self->_set_tag($pms, 'PDFAUTHOR', $author);
    }
    if ($md5) {
      $pms->{pdfinfo}->{md5}->{$md5} = 1;
      # the PDFMD5 tag carries the exact md5, not the fuzzy one
      $self->_set_tag($pms, 'PDFMD5', $md5);
    }
    if ($fuzzy_md5) {
      $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
      $self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
    }
    if ($tags_md5) {
      $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
      $self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
    }
  },

);
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# ---------------------------------------- |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# Record a value under a template tag for the message being scanned.
#
#   $pms   - per-message status object; tags are kept in $pms->{tag_data}
#   $tag   - tag name, e.g. 'PDFCOUNT'
#   $value - value to store
#
# When the tag already holds something, the new value is appended after a
# single space, so a tag accumulates one entry per call. Calls with a false
# tag name or an undefined/empty value are ignored.
sub _set_tag {
  my ($self, $pms, $tag, $value) = @_;

  # Validate before logging so an undefined argument never reaches the
  # interpolated debug string. $value is tested for defined-and-non-empty
  # rather than for truth, because a truth test also discards a legitimate
  # "0" such as a zero PDF or image count.
  return unless $tag;
  return unless (defined $value && length $value);

  dbg("pdfinfo: set_tag called for $tag $value");

  if (exists $pms->{tag_data}->{$tag}) {
    $pms->{tag_data}->{$tag} .= " $value";  # append value
  }
  else {
    $pms->{tag_data}->{$tag} = $value;
  }
}
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# ---------------------------------------- |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# Locate the PDF attachments of a message and run the detail extractor on
# each one, filling $pms->{pdfinfo} for the eval rules in this file.
#
#   $pms - per-message status object; $pms->{msg} supplies the MIME parts
#
# Sets $pms->{pdfinfo}->{no_parts} when no candidate part exists, so later
# calls and the eval rules can bail out quickly. Returns 0 when that flag
# is already set; otherwise the return value is whatever the final
# _set_tag() call yields (the callers in this file ignore it).
sub _find_pdf_mime_parts {
  my ($self,$pms) = @_;

  # bail early if message does not have pdf parts
  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});

  # initialize
  $pms->{'pdfinfo'}->{"pc_pdf"} = 0;
  $pms->{'pdfinfo'}->{"count_pdf"} = 0;
  $pms->{'pdfinfo'}->{"count_pdf_images"} = 0;

  my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
  my $part_count = scalar @parts;

  dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");

  # cache this so we can easily bail
  $pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;

  foreach my $p (@parts) {
    # The match must run in list context to yield the captured subtype
    # ("pdf" / "octet-stream"). In scalar context it only yields the
    # true/false match result, which made the "type is pdf" test below
    # impossible to satisfy.
    my ($type) = $p->{'type'} =~ m@/([\w\-]+)$@;
    my $name = $p->{'name'} || '';

    my $cte = lc( $p->get_header('content-transfer-encoding') || '' );

    dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");

    # make sure its a cte we support
    next unless ($cte =~ /^(?:base64|quoted\-printable)$/);

    # filename must end with .pdf, or application type can be pdf
    # sometimes windows muas will wrap a pdf up inside a .dat file
    # v0.8 - Added .fdf phoney PDF detection
    next unless ($name =~ /\.[fp]df$/ || (defined $type && $type eq 'pdf'));

    # if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
    $type='pdf';

    if ($type && exists $get_details{$type}) {
       $get_details{$type}->($self, $pms, $p);
       $pms->{'pdfinfo'}->{"count_$type"} ++;
    }
  }

  $self->_set_tag($pms, 'PDFCOUNT',  $pms->{'pdfinfo'}->{"count_pdf"});
  $self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});

}
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# ---------------------------------------- |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Eval rule: does any PDF attachment carry exactly this file name?
#
#   $name - file name to look up among the recorded PDF names
#
# Returns 1 on a match and 0 otherwise; returns nothing when $name is
# not defined.
sub pdf_named {
  my ($self, $pms, $body, $name) = @_;
  defined $name or return;

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};

  my $info = $pms->{'pdfinfo'};
  return 0 if !exists $info->{"names_pdf"};

  return (exists $info->{"names_pdf"}->{$name}) ? 1 : 0;
}
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# ----------------------------------------- |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Eval rule: does any PDF attachment file name match the given regular
# expression?
#
#   $re - regular expression source text, compiled with compile_regexp()
#
# Returns 1 at the first matching name, 0 when nothing matches or the
# expression does not compile (which is logged via info()); returns
# nothing when $re is not defined.
sub pdf_name_regex {
  my ($self, $pms, $body, $re) = @_;
  defined $re or return;

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};

  my $info = $pms->{'pdfinfo'};
  return 0 if !exists $info->{"names_pdf"};

  my ($rec, $err) = compile_regexp($re, 2);
  unless ($rec) {
    info("pdfinfo: invalid regexp '$re': $err");
    return 0;
  }

  for my $fname (keys %{$info->{"names_pdf"}}) {
    next unless $fname =~ $rec;
    dbg("pdfinfo: pdf_name_regex hit on $fname");
    return 1;
  }

  return 0;
}
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
# ----------------------------------------- |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Eval rule: reports the "encrypted" flag recorded while the message's
# PDF attachments were scanned.
#
# Returns 0 when the message has no candidate parts, otherwise the stored
# value of $pms->{pdfinfo}->{encrypted}.
sub pdf_is_encrypted {
  my ($self, $pms, $body) = @_;

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  if (exists $pms->{'pdfinfo'}->{'no_parts'}) {
    return 0;
  }

  return $pms->{'pdfinfo'}->{'encrypted'};
}
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# ----------------------------------------- |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
# Eval rule: compares the number of PDF attachments against a range.
#
#   $min - required lower bound
#   $max - optional upper bound
#
# Returns 0 when the message has no candidate parts or no count was
# recorded, otherwise the verdict of result_check() on the count;
# returns nothing when $min is not defined.
sub pdf_count {
  my ($self, $pms, $body, $min, $max) = @_;
  defined $min or return;

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};

  my $info = $pms->{'pdfinfo'};
  return 0 if !exists $info->{"count_pdf"};

  return result_check($min, $max, $info->{"count_pdf"});
}
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# ----------------------------------------- |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
# Eval rule: compares the number of images found inside the message's
# PDF attachments against a range.
#
#   $min - required lower bound
#   $max - optional upper bound
#
# Returns 0 when the message has no candidate parts or no image count was
# recorded, otherwise the verdict of result_check() on the count;
# returns nothing when $min is not defined.
sub pdf_image_count {
  my ($self, $pms, $body, $min, $max) = @_;
  defined $min or return;

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};

  my $info = $pms->{'pdfinfo'};
  return 0 if !exists $info->{"count_pdf_images"};

  return result_check($min, $max, $info->{"count_pdf_images"});
}
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
# ----------------------------------------- |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
# Eval rule: compares the recorded image pixel area of the message's PDF
# attachments ($pms->{pdfinfo}->{pc_pdf}) against a range.
#
#   $min - required lower bound
#   $max - optional upper bound
#
# Returns 0 when the message has no candidate parts or no area was
# recorded, otherwise the verdict of result_check() on the area;
# returns nothing when $min is not defined.
sub pdf_pixel_coverage {
  my ($self, $pms, $body, $min, $max) = @_;
  defined $min or return;

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};

  my $info = $pms->{'pdfinfo'};
  return 0 if !exists $info->{"pc_pdf"};

  # dbg("pdfinfo: pc_$type: $min, ".($max ? $max:'').", $type, ".$pms->{'pdfinfo'}->{"pc_pdf"});
  return result_check($min, $max, $info->{"pc_pdf"});
}
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# ----------------------------------------- |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# Eval rule: divides the length of the body text by the recorded PDF image
# pixel area and compares the quotient against a range.
#
#   $body - array ref of body lines; their joined length is the text length
#   $min  - required lower bound
#   $max  - required upper bound
#
# Returns 0 when the message has no candidate parts, no area was recorded,
# or either the text length or the area is not positive; otherwise the
# verdict of result_check(). Returns nothing unless both bounds are
# defined.
sub pdf_image_to_text_ratio {
  my ($self, $pms, $body, $min, $max) = @_;
  return if (!defined $min || !defined $max);

  # scan the message for PDF parts the first time any rule needs them
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};

  my $info = $pms->{'pdfinfo'};
  return 0 if !exists $info->{"pc_pdf"};

  # depending on how you call this eval (body vs rawbody),
  # the $textlen will differ.
  my $textlen = length(join('', @$body));

  # both operands of the division must be positive
  return 0 if !($textlen > 0);
  return 0 if !(exists $info->{"pc_pdf"} && $info->{"pc_pdf"} > 0);

  my $ratio = $textlen / $info->{"pc_pdf"};
  dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
  return result_check($min, $max, $ratio, 1);
}
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# ----------------------------------------- |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# Eval rule: true when the message carries at least one PDF attachment
# and (almost) no body text.
#
#   $body - array ref of body lines; the first element is treated as the
#           subject line and is not counted
#   $min  - optional byte allowance (default 0); the body still counts as
#           empty while the counted bytes do not exceed it
#
# Whitespace-only lines and lines starting with "Subject" are not counted.
# Returns 1 or 0 and stores the verdict in $pms->{pdfinfo}->{no_body_text}.
# NOTE(review): only a stored positive verdict is reused, and it is reused
# regardless of the $min passed on later calls.
sub pdf_is_empty_body {
  my ($self,$pms,$body,$min) = @_;

  $min ||= 0;  # default to 0 bytes

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};

  # check for cached result
  return 1 if $pms->{'pdfinfo'}->{"no_body_text"};

  # Skip the subject line by index. Shifting it off would modify the
  # caller's array, so each further call would discard one more line of
  # the body.
  my $bytes = 0;
  foreach my $idx (1 .. $#{$body}) {
    my $line = $body->[$idx];
    next unless ($line =~ m/\S/);
    next if ($line =~ m/^Subject/);
    $bytes += length($line);
  }

  dbg("pdfinfo: is_empty_body = $bytes bytes");

  if ($bytes == 0 || ($bytes <= $min)) {
    $pms->{'pdfinfo'}->{"no_body_text"} = 1;
    return 1;
  }

  # cache it and return 0
  $pms->{'pdfinfo'}->{"no_body_text"} = 0;
  return 0;
}
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# ----------------------------------------- |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# Eval rule: true when a PDF dimension entry "<height>x<width>" is recorded
# under $pms->{pdfinfo}->{dems_pdf}.
#
#   $height, $width - both required; the sub returns nothing (undef in
#                     scalar context) when either is missing
#
# Returns 1 on an exact match, 0 otherwise.
sub pdf_image_size_exact {
  my ($self, $pms, $body, $height, $width) = @_;

  if (!defined $height || !defined $width) {
    return;
  }

  # collect the pdf data on first use
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  # nothing to compare against
  if (exists $pms->{'pdfinfo'}->{'no_parts'}) {
    return 0;
  }
  if (!exists $pms->{'pdfinfo'}->{"dems_pdf"}) {
    return 0;
  }

  my $wanted = join('x', $height, $width);
  return (exists $pms->{'pdfinfo'}->{"dems_pdf"}->{$wanted}) ? 1 : 0;
}
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
# ----------------------------------------- |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# Eval rule: true when at least one "<height>x<width>" key under
# $pms->{pdfinfo}->{dems_pdf} lies inside the requested bounds.
#
#   $minh, $minw - required lower bounds (inclusive); the sub returns
#                  nothing (undef in scalar context) when either is missing
#   $maxh, $maxw - optional upper bounds (inclusive); an undefined bound is
#                  not checked
#
# Returns 1 as soon as one entry fits, 0 when none does.
sub pdf_image_size_range {
  my ($self, $pms, $body, $minh, $minw, $maxh, $maxw) = @_;

  if (!defined $minh || !defined $minw) {
    return;
  }

  # collect the pdf data on first use
  unless (exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};
  return 0 if !exists $pms->{'pdfinfo'}->{"dems_pdf"};

  for my $dims (keys %{ $pms->{'pdfinfo'}->{"dems_pdf"} }) {
    my ($height, $width) = split(/x/, $dims);

    # below either lower bound: not a candidate
    my $too_small = ($height < $minh) || ($width < $minw);
    next if $too_small;

    # above an upper bound that was actually supplied: not a candidate
    my $too_big = (defined $maxh && $height > $maxh)
               || (defined $maxw && $width > $maxw);
    next if $too_big;

    # every bound satisfied
    return 1;
  }

  return 0;
}
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
# ----------------------------------------- |
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
# Eval rule: true when the given MD5 checksum is a key of
# $pms->{pdfinfo}->{md5}.
#
#   $md5 - checksum to look for; it is upper-cased before the lookup.
#          The sub returns nothing (undef in scalar context) when it is
#          not defined.
#
# Returns 1 when the checksum is present, 0 otherwise.
sub pdf_match_md5 {
  my ($self, $pms, $body, $md5) = @_;

  if (!defined $md5) {
    return;
  }

  # collect the pdf data on first use
  $self->_find_pdf_mime_parts($pms) if !exists $pms->{'pdfinfo'};

  if (exists $pms->{'pdfinfo'}->{'no_parts'}) {
    return 0;
  }
  if (!exists $pms->{'pdfinfo'}->{"md5"}) {
    return 0;
  }

  # lookups are done on the upper-case form only
  my $wanted = uc($md5);
  return (exists $pms->{'pdfinfo'}->{"md5"}->{$wanted}) ? 1 : 0;
}
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
# ----------------------------------------- |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
# Eval rule: true when the given checksum is a key of
# $pms->{pdfinfo}->{fuzzy_md5}.
#
#   $md5 - checksum to look for; it is upper-cased before the lookup.
#          The sub returns nothing (undef in scalar context) when it is
#          not defined.
#
# Returns 1 when the checksum is present, 0 otherwise.
sub pdf_match_fuzzy_md5 {
  my ($self, $pms, $body, $md5) = @_;

  defined $md5 or return;

  # collect the pdf data on first use
  unless (exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if exists $pms->{'pdfinfo'}->{'no_parts'};
  return 0 if !exists $pms->{'pdfinfo'}->{"fuzzy_md5"};

  # lookups are done on the upper-case form only
  my $key = uc($md5);
  if (exists $pms->{'pdfinfo'}->{"fuzzy_md5"}->{$key}) {
    return 1;
  }

  return 0;
}
|
706
|
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# ----------------------------------------- |
|
708
|
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
# Eval rule: true when the value stored for $detail under
# $pms->{pdfinfo}->{details} matches the regular expression $regex.
#
#   $detail - key to look up in the details hash
#   $regex  - pattern source; it is compiled with compile_regexp($regex, 2)
#             (NOTE(review): the meaning of the second argument is defined
#             by that helper - confirm against its documentation)
#
# Returns 1 on a match and 0 on no match, missing PDF data or an invalid
# pattern.  Returns nothing (undef in scalar context) when an argument is
# missing or no value is stored for $detail.
sub pdf_match_details {
  my ($self, $pms, $body, $detail, $regex) = @_;

  return unless ($detail && $regex);

  # make sure we have pdf data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{'details'});

  my $check_value = $pms->{pdfinfo}->{details}->{$detail};

  # Only a missing or empty value means "nothing to match".  A plain truth
  # test would also discard the legitimate value "0".
  return unless (defined($check_value) && length($check_value));

  my ($rec, $err) = compile_regexp($regex, 2);
  if (!$rec) {
    info("pdfinfo: invalid regexp '$regex': $err");
    return 0;
  }

  if ($check_value =~ $rec) {
    dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
    return 1;
  }

  return 0;
}
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
# ----------------------------------------- |
|
738
|
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
# Range test shared by the eval rules.
#
#   $min        - lower bound (inclusive)
#   $max        - upper bound; undef means "no upper bound"
#   $value      - value to test; undef never matches
#   $nomaxequal - when true, the upper bound becomes exclusive
#                 ($value == $max is rejected)
#
# Returns 1 when $value lies inside the range, 0 otherwise.
sub result_check {
  my ($min, $max, $value, $nomaxequal) = @_;

  return 0 unless defined $value;
  return 0 if ($value < $min);

  # Both upper-bound tests apply only when a bound was given.  Comparing
  # against an undefined $max would warn and would treat it as 0, wrongly
  # rejecting $value == 0 in exclusive mode.
  if (defined $max) {
    return 0 if ($value > $max);
    return 0 if ($nomaxequal && $value == $max);
  }

  return 1;
}
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
# ----------------------------------------- |
|
749
|
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
1; |
|
751
|
|
|
|
|
|
|
|