line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 SYNOPSIS |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
loadplugin Mail::SpamAssassin::Plugin::PDFInfo |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 DESCRIPTION |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This plugin helps detect spam using attached PDF files |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=over 4 |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=item See "Usage:" below - more documentation see 20_pdfinfo.cf |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
Original info kept for history. For later changes see SVN repo |
35
|
|
|
|
|
|
|
------------------------------------------------------- |
36
|
|
|
|
|
|
|
PDFInfo Plugin for SpamAssassin |
37
|
|
|
|
|
|
|
Version: 0.8 |
38
|
|
|
|
|
|
|
Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $ |
39
|
|
|
|
|
|
|
Created: 2007-08-10 |
40
|
|
|
|
|
|
|
Modified: 2007-08-10 |
41
|
|
|
|
|
|
|
By: Dallas Engelken |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Changes: |
44
|
|
|
|
|
|
|
0.8 - added .fdf detection (thanks John Lundin) [axb] |
45
|
|
|
|
|
|
|
0.7 - fixed empty body/pdf count buglet(thanks Jeremy) [axb] |
46
|
|
|
|
|
|
|
0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc. |
47
|
|
|
|
|
|
|
- fixed issue on perl 5.6.1 where pdf_match_details() failed to call |
48
|
|
|
|
|
|
|
_find_pdf_mime_parts(), resulting in no detection of pdf mime parts. |
49
|
|
|
|
|
|
|
- quoted-printable support - requires MIME::QuotedPrint (which should be in everyones |
50
|
|
|
|
|
|
|
install as a part of the MIME-Base64 package which is a SA req) |
51
|
|
|
|
|
|
|
- added simple pdf_is_empty_body() function which counts the body bytes minus the |
52
|
|
|
|
|
|
|
subject line. can add optional <bytes> param if you need to allow for a few bytes. |
53
|
|
|
|
|
|
|
0.5 - fix warns for undef $pdf_tags |
54
|
|
|
|
|
|
|
- remove { } and \ before running eval in pdf_match_details to avoid eval error |
55
|
|
|
|
|
|
|
0.4 - added pdf_is_encrypted() function |
56
|
|
|
|
|
|
|
- added option to look for image HxW on same line |
57
|
|
|
|
|
|
|
0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data |
58
|
|
|
|
|
|
|
- renamed pdf_image_named() to pdf_named() |
59
|
|
|
|
|
|
|
- PDF images are encapsulated and have no names. We are matching the PDF file name. |
60
|
|
|
|
|
|
|
- renamed pdf_image_name_regex() to pdf_name_regex() |
61
|
|
|
|
|
|
|
- PDF images are encapsulated and have no names. We are matching the PDF file name. |
62
|
|
|
|
|
|
|
- changed pdf_image_count() a bit and added pdf_count(). |
63
|
|
|
|
|
|
|
- pdf_count() checks how many pdf attachments there are on the mail |
64
|
|
|
|
|
|
|
- pdf_image_count() checks how many images are found within all pdfs in the mail. |
65
|
|
|
|
|
|
|
- removed the restriction of the pdf containing an image in order to md5 it. |
66
|
|
|
|
|
|
|
- added pdf_match_details() function to check the following 'details' |
67
|
|
|
|
|
|
|
- author: Author of PDF if specified |
68
|
|
|
|
|
|
|
- producer: Software used to produce PDF |
69
|
|
|
|
|
|
|
- creator: Software used to produce PDF, usually similar to producer |
70
|
|
|
|
|
|
|
- title: Title of PDF |
71
|
|
|
|
|
|
|
- created: Creation Date |
72
|
|
|
|
|
|
|
- modified: Last Modified |
73
|
|
|
|
|
|
|
0.2 - support PDF octet-stream |
74
|
|
|
|
|
|
|
0.1 - just ported over the imageinfo code, and renamed to pdfinfo. |
75
|
|
|
|
|
|
|
- removed all support for png, gif, and jpg from the code. |
76
|
|
|
|
|
|
|
- prepended pdf_ to all function names to avoid conflicts with ImageInfo in SA 3.2. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
Usage: |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
pdf_count() |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
body RULENAME eval:pdf_count(<min>,[max]) |
83
|
|
|
|
|
|
|
min: required, message contains at least x pdf mime parts |
84
|
|
|
|
|
|
|
max: optional, if specified, must not contain more than x pdf mime parts |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
pdf_image_count() |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
body RULENAME eval:pdf_image_count(<min>,[max]) |
89
|
|
|
|
|
|
|
min: required, message contains at least x images in pdf attachments. |
90
|
|
|
|
|
|
|
max: optional, if specified, must not contain more than x pdf images |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
pdf_pixel_coverage() |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
body RULENAME eval:pdf_pixel_coverage(<min>,[max]) |
95
|
|
|
|
|
|
|
min: required, message contains at least this much pixel area |
96
|
|
|
|
|
|
|
max: optional, if specified, message must not contain more than this much pixel area |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
pdf_named() |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
body RULENAME eval:pdf_named(<string>) |
101
|
|
|
|
|
|
|
string: exact file name match, if you need partial match, see pdf_name_regex() |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
pdf_name_regex() |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
body RULENAME eval:pdf_name_regex(<regex>) |
106
|
|
|
|
|
|
|
regex: regular expression, see examples in ruleset |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
pdf_match_md5() |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
body RULENAME eval:pdf_match_md5(<string>) |
111
|
|
|
|
|
|
|
string: 32-byte md5 hex |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
pdf_match_fuzzy_md5() |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
body RULENAME eval:pdf_match_fuzzy_md5(<string>) |
116
|
|
|
|
|
|
|
string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5 |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
pdf_match_details() |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
body RULENAME eval:pdf_match_details(<detail>,<regex>); |
121
|
|
|
|
|
|
|
detail: author, creator, created, modified, producer, title |
122
|
|
|
|
|
|
|
regex: regular expression, see examples in ruleset |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
pdf_is_encrypted() |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
body RULENAME eval:pdf_is_encrypted() |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
pdf_is_empty_body() |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
body RULENAME eval:pdf_is_empty_body(<bytes>) |
131
|
|
|
|
|
|
|
bytes: maximum byte count to allow and still consider it empty |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
NOTE: See the ruleset for more examples that are not documented here. |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=back |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# ------------------------------------------------------- |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
package Mail::SpamAssassin::Plugin::PDFInfo; |
142
|
|
|
|
|
|
|
|
143
|
19
|
|
|
19
|
|
160
|
use Mail::SpamAssassin::Plugin; |
|
19
|
|
|
|
|
50
|
|
|
19
|
|
|
|
|
680
|
|
144
|
19
|
|
|
19
|
|
123
|
use Mail::SpamAssassin::Logger; |
|
19
|
|
|
|
|
48
|
|
|
19
|
|
|
|
|
1213
|
|
145
|
19
|
|
|
19
|
|
145
|
use Mail::SpamAssassin::Util qw(compile_regexp); |
|
19
|
|
|
|
|
61
|
|
|
19
|
|
|
|
|
984
|
|
146
|
19
|
|
|
19
|
|
137
|
use strict; |
|
19
|
|
|
|
|
46
|
|
|
19
|
|
|
|
|
486
|
|
147
|
19
|
|
|
19
|
|
107
|
use warnings; |
|
19
|
|
|
|
|
53
|
|
|
19
|
|
|
|
|
663
|
|
148
|
|
|
|
|
|
|
# use bytes; |
149
|
19
|
|
|
19
|
|
173
|
use Digest::MD5 qw(md5_hex); |
|
19
|
|
|
|
|
44
|
|
|
19
|
|
|
|
|
1417
|
|
150
|
19
|
|
|
19
|
|
9270
|
use MIME::QuotedPrint; |
|
19
|
|
|
|
|
5136
|
|
|
19
|
|
|
|
|
72583
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin); |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# constructor: register the eval rule |
155
|
|
|
|
|
|
|
sub new {
  # Build the plugin object through the parent class constructor, then
  # register every eval rule this plugin offers to rule files.
  my ($class, $mailsaobject) = @_;

  # accept being invoked on either a class name or an existing instance
  $class = ref($class) || $class;
  my $self = bless($class->SUPER::new($mailsaobject), $class);

  # eval rule names, registered in this order
  my @eval_rules = qw(
    pdf_count
    pdf_image_count
    pdf_pixel_coverage
    pdf_image_size_exact
    pdf_image_size_range
    pdf_named
    pdf_name_regex
    pdf_image_to_text_ratio
    pdf_match_md5
    pdf_match_fuzzy_md5
    pdf_match_details
    pdf_is_encrypted
    pdf_is_empty_body
  );
  foreach my $rule (@eval_rules) {
    $self->register_eval_rule($rule);
  }

  return $self;
}
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# ----------------------------------------- |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Dispatch table of detail extractors keyed by short type name.  Each handler
# is invoked as $handler->($self, $pms, $part).
my %get_details = (
  # Scan one PDF attachment line by line and record what is found:
  #   - $pms->{pdfinfo}: image dimensions (dems_pdf), image count, total
  #     pixel area (pc_pdf), file name (names_pdf), encrypted flag, document
  #     details (author, creator, ...), md5 and fuzzy md5 sums
  #   - template tags PDFVERSION, PDFNAME, PDFIMGDIM, PDFIMGAREA, PDFPRODUCER,
  #     PDFTITLE, PDFCREATOR, PDFAUTHOR, PDFMD5, PDFMD5FUZZY1, PDFMD5FUZZY2
  # Returns early (nothing recorded) when the data does not start with a
  # PDF version header.
  'pdf' => sub {
    my ($self, $pms, $part) = @_;

    # The previous code looked at $part->{'type'} for 'quoted-printable' and
    # then ran decode_qp() on an empty string, so that branch could only ever
    # yield no data.  Always let the part decode itself instead.
    # NOTE(review): decode() is expected to undo the part's own transfer
    # encoding (base64 or quoted-printable) - confirm against the
    # Mail::SpamAssassin::Message::Node documentation.
    my $data = $part->decode();
    return unless (defined $data);

    my $index = substr($data, 0, 8);

    return unless ($index =~ /.PDF\-(\d\.\d)/);
    my $version = $1;
    $self->_set_tag($pms, 'PDFVERSION', $version);
    # dbg("pdfinfo: pdf version = $version");

    my ($height, $width, $fuzzy_data, $pdf_tags);
    my ($producer, $created, $modified, $title, $creator, $author) = ('unknown','0','0','untitled','unknown','unknown');
    my ($md5, $fuzzy_md5) = ('', '');
    my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);

    my $name = $part->{'name'} || '';
    $self->_set_tag($pms, 'PDFNAME', $name);

    my $no_more_fuzzy = 0;
    my $got_image = 0;
    my $encrypted = 0;

    while ($data =~ /([^\n]+)/g) {
      # dbg("pdfinfo: line=$1");
      my $line = $1;

      $line_count++;

      # lines containing high bytes will have no data we need, so save some cycles
      next if ($line =~ /[\x80-\xff]/);

      # fuzzy md5 #1 input: the first lines of the file, minus comments and
      # the size-dependent lines (Height/Width/MediaBox/CropBox/"cm" matrix)
      if (!$no_more_fuzzy && $line_count < 70) {
        if ($line !~ m/^\%/ && $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ && $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
          $line =~ s/\s+$//;  # strip off whitespace at end.
          $fuzzy_data .= $line;
        }
      }

      # fuzzy md5 #2 input: the sequence of leading /Tag names
      if ($line =~ m/^\/([A-Za-z]+)/) {
        $pdf_tags .= $1;
      }

      $got_image = 1 if ($line =~ m/\/Image/);
      $encrypted = 1 if ($line =~ m/^\/Encrypt/);

      # once we hit the first stream, we stop collecting data for fuzzy md5
      $no_more_fuzzy = 1 if ($line =~ m/stream/);

      # From a v1.3 pdf
      # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
      # [12234] dbg: pdfinfo: line=/Width 630
      # [12234] dbg: pdfinfo: line=/Height 149
      if ($got_image) {
        if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
          $width = $1;
          $height = $2;
        }
        elsif ($line =~ /^\/Width\s(\d+)/) {
          $width = $1;
        }
        elsif ($line =~ /^\/Height\s(\d+)/) {
          $height = $1;
        }
        elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
          $width = $1;
          $height = $2;
        }
      }

      # did pdf contain image data?
      if ($got_image && $width && $height) {
        $no_more_fuzzy = 1;
        my $area = $width * $height;
        $total_height += $height;
        $total_width += $width;
        $total_area += $area;
        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
        $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
        dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
        $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
        $height=0; $width=0;  # reset and check for next image
        $got_image = 0;
      }

      # [5310] dbg: pdfinfo: line=<</Producer(GPL Ghostscript 8.15)
      # [5310] dbg: pdfinfo: line=/CreationDate(D:20070703144220)
      # [5310] dbg: pdfinfo: line=/ModDate(D:20070703144220)
      # [5310] dbg: pdfinfo: line=/Title(Microsoft Word - Document1)
      # [5310] dbg: pdfinfo: line=/Creator(PScript5.dll Version 5.2)
      # [5310] dbg: pdfinfo: line=/Author(colet)>>endobj
      # or all on same line inside xml - v1.6+
      # <</CreationDate(D:20070226165054-06'00')/Creator( Adobe Photoshop CS2 Windows)/Producer(Adobe Photoshop for Windows -- Image Conversion Plug-in)/ModDate(D:20070226165100-06'00')>>

      if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
        $producer = $1;
      }
      if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
        $created = $1;
      }
      if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
        $modified = $1;
      }
      if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
        $title = $1;
        # Title=\376\377\000w\000w\000n\000g
        # Title=wwng
        $title =~ s/\\\d{3}//g;
      }
      if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
        $creator = $1;
      }
      if ($line =~ /\/Author\s?\(([^\)]+)/) {
        $author = $1;
        # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
        # Author=HP_Administrator
        $author =~ s/\\\d{3}//g;
      }
    }

    # store the file name so we can check pdf_named() or pdf_name_regex() later.
    $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

    # store encrypted flag.  Sticky across attachments: a later unencrypted
    # PDF must not clear the flag set by an earlier encrypted one.
    $pms->{pdfinfo}->{encrypted} ||= $encrypted;

    # if we had multiple images in the pdf, we need to store the total HxW as well.
    # If it was a single Image PDF, then this value will already be in the hash.
    $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1 if ($total_height && $total_width);

    if ($total_area) {
      $pms->{pdfinfo}->{pc_pdf} = $total_area;
      $self->_set_tag($pms, 'PDFIMGAREA', $total_area);
      dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)");
    }

    dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");

    $md5 = uc(md5_hex($data)) if $data;
    $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
    my $tags_md5;
    $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;

    dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));

    # we dont need tags for these.
    $pms->{pdfinfo}->{details}->{created} = $created if $created;
    $pms->{pdfinfo}->{details}->{modified} = $modified if $modified;

    if ($producer) {
      $pms->{pdfinfo}->{details}->{producer} = $producer;
      $self->_set_tag($pms, 'PDFPRODUCER', $producer);
    }
    if ($title) {
      $pms->{pdfinfo}->{details}->{title} = $title;
      $self->_set_tag($pms, 'PDFTITLE', $title);
    }
    if ($creator) {
      $pms->{pdfinfo}->{details}->{creator} = $creator;
      $self->_set_tag($pms, 'PDFCREATOR', $creator);
    }
    if ($author) {
      $pms->{pdfinfo}->{details}->{author} = $author;
      $self->_set_tag($pms, 'PDFAUTHOR', $author);
    }
    if ($md5) {
      $pms->{pdfinfo}->{md5}->{$md5} = 1;
      # the exact md5 goes in PDFMD5; the fuzzy sums have their own tags below
      $self->_set_tag($pms, 'PDFMD5', $md5);
    }
    if ($fuzzy_md5) {
      $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
      $self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
    }
    if ($tags_md5) {
      $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
      $self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
    }
  },

);
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# ---------------------------------------- |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# Record a template tag value in $pms->{tag_data}.
#   $pms   - per-message status object holding the tag_data hash
#   $tag   - tag name
#   $value - tag value; calls with a false tag or value (undef, '', 0) are
#            logged but store nothing
# When the tag already has a value, the new one is appended after a space,
# so a tag can accumulate one entry per attachment.
sub _set_tag {

  my ($self, $pms, $tag, $value) = @_;

  # log before validating, but never interpolate undef (would emit
  # "uninitialized value" warnings under 'use warnings')
  dbg("pdfinfo: set_tag called for ".(defined $tag ? $tag : '')." ".(defined $value ? $value : ''));
  return unless ($tag && $value);

  if (exists $pms->{tag_data}->{$tag}) {
    $pms->{tag_data}->{$tag} .= " $value";  # append value
  }
  else {
    $pms->{tag_data}->{$tag} = $value;
  }
}
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# ---------------------------------------- |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# Locate the PDF attachments of a message and run the 'pdf' detail extractor
# on each one.  Initialises the counters in $pms->{pdfinfo}, caches a
# 'no_parts' marker when no candidate MIME part exists, and finally sets the
# PDFCOUNT and PDFIMGCOUNT tags.
#   $pms - per-message status object; $pms->{msg} must provide find_parts()
# Returns 0 straight away when 'no_parts' is already cached.
sub _find_pdf_mime_parts {
  my ($self,$pms) = @_;

  # bail early if message does not have pdf parts
  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});

  # initialize
  $pms->{'pdfinfo'}->{"pc_pdf"} = 0;
  $pms->{'pdfinfo'}->{"count_pdf"} = 0;
  $pms->{'pdfinfo'}->{"count_pdf_images"} = 0;

  my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
  my $part_count = scalar @parts;

  dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");

  # cache this so we can easily bail
  $pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;

  foreach my $p (@parts) {
    # List-context match: $type receives the captured subtype ("pdf",
    # "octet-stream").  In scalar context the match only yields a true/false
    # value, which made the "$type eq 'pdf'" test below impossible to satisfy.
    my ($type) = $p->{'type'} =~ m@/([\w\-]+)$@;
    my $name = $p->{'name'} || '';

    my $cte = lc( $p->get_header('content-transfer-encoding') || '' );

    dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");

    # make sure its a cte we support
    next unless ($cte =~ /^(?:base64|quoted\-printable)$/);

    # filename must end with .pdf, or application type can be pdf
    # sometimes windows muas will wrap a pdf up inside a .dat file
    # v0.8 - Added .fdf phoney PDF detection
    next unless ($name =~ /\.[fp]df$/ || (defined $type && $type eq 'pdf'));

    # if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
    $type='pdf';

    if ($type && exists $get_details{$type}) {
      $get_details{$type}->($self, $pms, $p);
      $pms->{'pdfinfo'}->{"count_$type"} ++;
    }
  }

  $self->_set_tag($pms, 'PDFCOUNT', $pms->{'pdfinfo'}->{"count_pdf"});
  $self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});

}
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# ---------------------------------------- |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Eval rule: hit when a scanned PDF attachment has exactly the file name
# $name.  Returns 1 on a hit, 0 on a miss, and nothing when $name is missing.
sub pdf_named {
  my ($self,$pms,$body,$name) = @_;
  return unless (defined $name);

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 unless (exists $info->{"names_pdf"});

  return (exists $info->{"names_pdf"}->{$name}) ? 1 : 0;
}
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# ----------------------------------------- |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Eval rule: hit when the file name of any scanned PDF attachment matches
# the regular expression $re.  Returns 1 on a hit, 0 on a miss or an invalid
# expression, and nothing when $re is missing.
sub pdf_name_regex {
  my ($self,$pms,$body,$re) = @_;
  return unless (defined $re);

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 unless (exists $info->{"names_pdf"});

  my ($rec, $err) = compile_regexp($re, 2);
  unless ($rec) {
    info("pdfinfo: invalid regexp '$re': $err");
    return 0;
  }

  # the first matching file name decides the result
  for my $pdf_name (keys %{$info->{"names_pdf"}}) {
    next unless ($pdf_name =~ $rec);
    dbg("pdfinfo: pdf_name_regex hit on $pdf_name");
    return 1;
  }

  return 0;
}
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
# ----------------------------------------- |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Eval rule: hit when the scan recorded an encrypted PDF in
# $pms->{pdfinfo}->{encrypted}.  Always returns a defined 1 or 0; the flag
# is absent when candidate parts existed but none was parsed as a PDF.
sub pdf_is_encrypted {
  my ($self,$pms,$body) = @_;

  # make sure we have pdf data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});

  # normalise a missing flag to 0 instead of handing undef to the caller
  return $pms->{'pdfinfo'}->{'encrypted'} ? 1 : 0;
}
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# ----------------------------------------- |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
# Eval rule: compare the number of PDF attachments counted by the scan
# against $min and the optional $max, via result_check().  Returns nothing
# when $min is missing and 0 when there is no count to compare.
sub pdf_count {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless defined $min;

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 if (!exists $info->{"count_pdf"});

  my $found = $info->{"count_pdf"};
  return result_check($min, $max, $found);
}
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# ----------------------------------------- |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
# Eval rule: compare the number of images found inside all scanned PDFs
# against $min and the optional $max, via result_check().  Returns nothing
# when $min is missing and 0 when there is no count to compare.
sub pdf_image_count {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless defined $min;

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 if (!exists $info->{"count_pdf_images"});

  my $found = $info->{"count_pdf_images"};
  return result_check($min, $max, $found);
}
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
# ----------------------------------------- |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
# Eval rule: compare the total image pixel area recorded by the scan
# ($pms->{pdfinfo}->{pc_pdf}) against $min and the optional $max, via
# result_check().  Returns nothing when $min is missing and 0 when there is
# no area to compare.
sub pdf_pixel_coverage {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless (defined $min);

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 if (!exists $info->{"pc_pdf"});

  # dbg("pdfinfo: pc_$type: $min, ".($max ? $max:'').", $type, ".$pms->{'pdfinfo'}->{"pc_pdf"});
  my $area = $info->{"pc_pdf"};
  return result_check($min, $max, $area);
}
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# ----------------------------------------- |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# Eval rule: divide the length of the body text by the total PDF image
# pixel area and compare that ratio against $min and $max (both required),
# via result_check().  Returns nothing when a bound is missing and 0 when
# either the text or the image area is empty.
sub pdf_image_to_text_ratio {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless (defined $min && defined $max);

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 unless (exists $info->{"pc_pdf"});

  # depending on how you call this eval (body vs rawbody),
  # the $textlen will differ.
  my $textlen = length(join('',@$body));
  return 0 unless ($textlen > 0);

  my $pixels = $info->{"pc_pdf"};
  return 0 unless ($pixels > 0);

  my $ratio = $textlen / $pixels;
  dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
  return result_check($min, $max, $ratio, 1);
}
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# ----------------------------------------- |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# Eval rule: hit when the message carries at least one PDF and the body
# holds no more than $min bytes of text.
#   $body - array ref of body lines; element 0 is treated as the subject
#           line and is not counted
#   $min  - maximum byte count still considered "empty" (default 0)
# Lines that are blank or start with "Subject" are not counted.
# Returns 1 on a hit, 0 otherwise.  The last result is also stored in
# $pms->{pdfinfo}->{no_body_text}.
sub pdf_is_empty_body {
  my ($self,$pms,$body,$min) = @_;

  $min ||= 0;  # default to 0 bytes

  # make sure we have pdf data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};

  # The result depends on $min, so it is recomputed on every call instead of
  # trusting a flag cached by a call that may have used another threshold.

  # Skip element 0 (the subject line) by index.  The array belongs to the
  # caller, so it must not be shifted: that would drop one more line each
  # time this rule runs.
  my $bytes = 0;
  foreach my $idx (1 .. $#{$body}) {
    my $line = $body->[$idx];
    next unless ($line =~ m/\S/);
    next if ($line =~ m/^Subject/);
    $bytes += length($line);
  }

  dbg("pdfinfo: is_empty_body = $bytes bytes");

  my $is_empty = ($bytes == 0 || ($bytes <= $min)) ? 1 : 0;

  # keep recording the outcome under the key earlier versions used
  $pms->{'pdfinfo'}->{"no_body_text"} = $is_empty;
  return $is_empty;
}
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# ----------------------------------------- |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# Eval rule: hit when the scan recorded an image (or per-PDF total) whose
# dimensions are exactly $height x $width.  Returns 1 on a hit, 0 on a miss,
# and nothing when either dimension is missing.
sub pdf_image_size_exact {
  my ($self,$pms,$body,$height,$width) = @_;
  return unless (defined $height && defined $width);

  # scan the message on first use; later calls reuse $pms->{pdfinfo}
  $self->_find_pdf_mime_parts($pms) unless (exists $pms->{'pdfinfo'});

  my $info = $pms->{'pdfinfo'};
  return 0 if (exists $info->{'no_parts'});
  return 0 unless (exists $info->{"dems_pdf"});

  # dimensions are stored as "HEIGHTxWIDTH" keys
  my $wanted = join('x', $height, $width);
  return (exists $info->{"dems_pdf"}->{$wanted}) ? 1 : 0;
}
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
# ----------------------------------------- |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
sub pdf_image_size_range {
  # Eval rule: does any "<height>x<width>" key in the 'dems_pdf' table of
  # $pms->{'pdfinfo'} fall inside the requested bounds?
  #
  #   $minh, $minw - required lower bounds (inclusive)
  #   $maxh, $maxw - optional upper bounds (inclusive); an undefined bound
  #                  is simply not applied
  #   $body        - accepted but not used by this check
  #
  # Returns 1 as soon as one entry fits, 0 if none does.  When either
  # minimum is undefined nothing is returned (empty list / undef).
  my ($self, $pms, $body, $minh, $minw, $maxh, $maxw) = @_;
  return if !defined $minh || !defined $minw;

  # no 'pdfinfo' entry yet: let _find_pdf_mime_parts() look at the message first
  $self->_find_pdf_mime_parts($pms) unless exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}{'no_parts'};
  return 0 if !exists $pms->{'pdfinfo'}{'dems_pdf'};

  for my $dims (keys %{ $pms->{'pdfinfo'}{'dems_pdf'} }) {
    my ($img_h, $img_w) = split(/x/, $dims);

    # smaller than the minimum box in either direction
    next if $img_h < $minh || $img_w < $minw;

    # larger than the maximum box in either direction (when one was given)
    next if (defined $maxh && $img_h > $maxh)
         || (defined $maxw && $img_w > $maxw);

    # survived every bound: this entry matches
    return 1;
  }

  return 0;
}
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
# ----------------------------------------- |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
sub pdf_match_md5 {
  # Eval rule: is the given digest one of the keys in the 'md5' table of
  # $pms->{'pdfinfo'}?
  #
  #   $md5  - digest string from the rule; compared after upper-casing
  #   $body - accepted but not used by this check
  #
  # Returns 1 on a hit and 0 on a miss.  When $md5 is undefined nothing is
  # returned (empty list / undef).
  my ($self, $pms, $body, $md5) = @_;
  defined $md5 or return;

  # the table is only ever probed with the upper-cased form
  my $wanted = uc $md5;

  # no 'pdfinfo' entry yet: let _find_pdf_mime_parts() look at the message first
  unless (exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if exists $pms->{'pdfinfo'}{'no_parts'};
  return 0 if !exists $pms->{'pdfinfo'}{'md5'};

  return exists $pms->{'pdfinfo'}{'md5'}{$wanted} ? 1 : 0;
}
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
# ----------------------------------------- |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
sub pdf_match_fuzzy_md5 {
  # Eval rule: same lookup as the plain MD5 check, but against the
  # 'fuzzy_md5' table of $pms->{'pdfinfo'}.
  #
  #   $md5  - digest string from the rule; compared after upper-casing
  #   $body - accepted but not used by this check
  #
  # Returns 1 when the digest is a key of the table, 0 when it is not.
  # When $md5 is undefined nothing is returned (empty list / undef).
  my ($self, $pms, $body, $md5) = @_;
  return if !defined $md5;

  # keys are probed in upper case only
  my $digest_key = uc($md5);

  # no 'pdfinfo' entry yet: let _find_pdf_mime_parts() look at the message first
  $self->_find_pdf_mime_parts($pms) if !exists $pms->{'pdfinfo'};

  if (exists $pms->{'pdfinfo'}{'no_parts'}) {
    return 0;
  }
  if (!exists $pms->{'pdfinfo'}{'fuzzy_md5'}) {
    return 0;
  }

  return 1 if exists $pms->{'pdfinfo'}{'fuzzy_md5'}{$digest_key};
  return 0;
}
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# ----------------------------------------- |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
sub pdf_match_details {
  # Eval rule: match one entry of the 'details' table in $pms->{'pdfinfo'}
  # against a regular expression supplied by the rule.
  #
  #   $detail - key to look up in the 'details' table
  #   $regex  - pattern text, handed to compile_regexp()
  #   $body   - accepted but not used by this check
  #
  # Returns 1 when the stored value matches and 0 when it does not, when
  # there is no 'details' table, or when the pattern fails to compile.
  # Nothing (empty list / undef) is returned when $detail or $regex is
  # false, or when the stored value for $detail is false.
  my ($self, $pms, $body, $detail, $regex) = @_;
  return if !$detail || !$regex;

  # no 'pdfinfo' entry yet: let _find_pdf_mime_parts() look at the message first
  $self->_find_pdf_mime_parts($pms) if !exists $pms->{'pdfinfo'};

  return 0 if exists $pms->{'pdfinfo'}{'no_parts'};
  return 0 if !exists $pms->{'pdfinfo'}{'details'};

  my $check_value = $pms->{pdfinfo}{details}{$detail};
  return if !$check_value;

  # NOTE(review): the meaning of the second argument (2) is defined by
  # compile_regexp(), which lives outside this block - confirm there.
  my ($compiled, $err) = compile_regexp($regex, 2);
  unless ($compiled) {
    info("pdfinfo: invalid regexp '$regex': $err");
    return 0;
  }

  return 0 if $check_value !~ $compiled;

  dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
  return 1;
}
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
# ----------------------------------------- |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
sub result_check {
  # Plain helper (not a method): decide whether $value lies in the range
  # described by $min and $max.
  #
  #   $min        - lower bound, inclusive
  #   $max        - upper bound, inclusive by default; undef means no upper
  #                 bound at all
  #   $value      - number to test; undef never matches
  #   $nomaxequal - when true, the upper bound becomes exclusive
  #                 ($value == $max is rejected)
  #
  # Returns 1 when $value is in range, 0 otherwise.
  my ($min, $max, $value, $nomaxequal) = @_;

  return 0 unless defined $value;
  return 0 if $value < $min;

  # Both upper-bound tests only make sense when a maximum was supplied.
  # Comparing against an undefined $max would warn and would wrongly
  # reject a $value of 0 when $nomaxequal is set.
  if (defined $max) {
    return 0 if $value > $max;
    return 0 if $nomaxequal && $value == $max;
  }

  return 1;
}
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
# ----------------------------------------- |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
1; |
751
|
|
|
|
|
|
|
|