line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package SWISH::Filter; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
30865
|
use 5.005; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
78
|
|
4
|
2
|
|
|
2
|
|
12
|
use strict; |
|
2
|
|
|
|
|
37
|
|
|
2
|
|
|
|
|
75
|
|
5
|
2
|
|
|
2
|
|
12
|
use File::Basename; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
229
|
|
6
|
2
|
|
|
2
|
|
11
|
use Carp; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
156
|
|
7
|
2
|
|
|
2
|
|
1109
|
use SWISH::Filter::MIMETypes; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
124
|
|
8
|
2
|
|
|
2
|
|
1456
|
use SWISH::Filter::Document; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
73
|
|
9
|
2
|
|
|
2
|
|
1585
|
use SWISH::Filters::Base; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
96
|
|
10
|
|
|
|
|
|
|
use Module::Pluggable |
11
|
2
|
|
|
|
|
16
|
search_path => 'SWISH::Filters', |
12
|
|
|
|
|
|
|
except => 'SWISH::Filters::Base', |
13
|
|
|
|
|
|
|
sub_name => 'filters_found', |
14
|
|
|
|
|
|
|
require => 1, |
15
|
2
|
|
|
2
|
|
1754
|
instantiate => 'new'; |
|
2
|
|
|
|
|
43853
|
|
16
|
|
|
|
|
|
|
|
17
|
2
|
|
|
2
|
|
174
|
use vars qw/ $VERSION %extra_methods /; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
11601
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
$VERSION = '0.190'; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Define the available parameters |
22
|
|
|
|
|
|
|
%extra_methods = map { $_ => 1 } qw( meta_data name user_data ); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# For testing only |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
if ( $0 =~ 'Filter.pm' && @ARGV >= 2 && shift =~ /^test/i ) { |
27
|
|
|
|
|
|
|
die "Please use the 'swish-filter-test' program.\n"; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 NAME |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
SWISH::Filter - filter documents for indexing with Swish-e |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 SYNOPSIS |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
use SWISH::Filter; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# load available filters into memory |
39
|
|
|
|
|
|
|
my $filter = SWISH::Filter->new; |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# convert a document |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $doc = $filter->convert( |
45
|
|
|
|
|
|
|
document => \$scalar_ref, # path or ref to a doc |
46
|
|
|
|
|
|
|
content_type => $content_type, # content type if doc reference |
47
|
|
|
|
|
|
|
name => $real_path, # optional name for this file (useful for debugging) |
48
|
|
|
|
|
|
|
user_data => $whatever, # optional data to make available to filters |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
return unless $doc; # empty doc, zero size, or no filters installed |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Was the document converted by a filter? |
54
|
|
|
|
|
|
|
my $was_filtered = $doc->was_filtered; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Skip if the file is not text |
57
|
|
|
|
|
|
|
return if $doc->is_binary; |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Print out the doc |
60
|
|
|
|
|
|
|
my $doc_ref = $doc->fetch_doc; |
61
|
|
|
|
|
|
|
print $$doc_ref; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Fetch the final content type of the document |
64
|
|
|
|
|
|
|
my $content_type = $doc->content_type; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Fetch Swish-e parser type (TXT*, XML*, HTML*, or undefined) |
67
|
|
|
|
|
|
|
my $doc_type = $doc->swish_parser_type; |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 DESCRIPTION |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
SWISH::Filter provides a unified way to convert documents into a type that |
72
|
|
|
|
|
|
|
Swish-e can index. Individual filters are installed as separate subclasses (modules). |
73
|
|
|
|
|
|
|
For example, there might be a filter that converts from PDF format to HTML |
74
|
|
|
|
|
|
|
format. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
SWISH::Filter is a framework that relies on other packages to do the heavy lifting |
77
|
|
|
|
|
|
|
of converting non-text documents to text. B
|
78
|
|
|
|
|
|
|
programs or Perl modules may need to be installed to use SWISH::Filter to filter |
79
|
|
|
|
|
|
|
documents.> For example, to filter PDF documents you must install the C |
80
|
|
|
|
|
|
|
package. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
The filters are automatically loaded when Cnew()> is |
83
|
|
|
|
|
|
|
called. Filters define a type and priority that determines the processing |
84
|
|
|
|
|
|
|
order of the filter. Filters are processed in this sort order until a filter |
85
|
|
|
|
|
|
|
accepts the document for filtering. The filter uses the document's content type |
86
|
|
|
|
|
|
|
to determine if the filter should handle the current document. The |
87
|
|
|
|
|
|
|
content-type is determined by the files suffix if not supplied by the calling |
88
|
|
|
|
|
|
|
program. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The individual filters are not designed to be used as separate modules. All |
91
|
|
|
|
|
|
|
access to the filters is through this SWISH::Filter module. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Normally, once a document is filtered processing stops. Filters can filter the |
94
|
|
|
|
|
|
|
document and then set a flag saying that filtering should continue (for example |
95
|
|
|
|
|
|
|
a filter that uncompresses a MS Word document before passing on to the filter |
96
|
|
|
|
|
|
|
that converts from MS Word to text). All this should be transparent to the end |
97
|
|
|
|
|
|
|
user. So, filters can be pipe-lined. |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
The idea of SWISH::Filter is that new filters can be created, and then |
100
|
|
|
|
|
|
|
downloaded and installed to provide new filtering capabilities. For example, |
101
|
|
|
|
|
|
|
if you needed to index MS Excel documents you might be able to download a |
102
|
|
|
|
|
|
|
filter from the Swish-e site and magically next time you run indexing MS Excel |
103
|
|
|
|
|
|
|
docs would be indexed. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
The SWISH::Filter setup can be used with -S prog or -S http. It works best |
106
|
|
|
|
|
|
|
with the -S prog method because the filter modules only need to be loaded and |
107
|
|
|
|
|
|
|
compiled one time. The -S prog program F will automatically use |
108
|
|
|
|
|
|
|
SWISH::Filter when spidering with default settings (using "default" as the |
109
|
|
|
|
|
|
|
first parameter to spider.pl). |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
The -S http indexing method uses a Perl helper script called F. |
112
|
|
|
|
|
|
|
F has been updated to work with SWISH::Filter, but (unlike |
113
|
|
|
|
|
|
|
spider.pl) does not contain a "use lib" line to point to the location of |
114
|
|
|
|
|
|
|
SWISH::Filter. This means that by default F will B use |
115
|
|
|
|
|
|
|
SWISH::Filter for filtering. The reason for this is because F |
116
|
|
|
|
|
|
|
runs for every URL fetched, and loading the Filters for each document can be |
117
|
|
|
|
|
|
|
slow. The recommended way of spidering is using -S prog with spider.pl, but if |
118
|
|
|
|
|
|
|
-S http is desired the way to enable SWISH::Filter is to set PERL5LIB before |
119
|
|
|
|
|
|
|
running swish so that F will be able to locate the SWISH::Filter |
120
|
|
|
|
|
|
|
module. Here's one way to set the PERL5LIB with the bash shell: |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
$ export PERL5LIB=`swish-filter-test -path` |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head1 METHODS |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head2 new( %I ) |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
new() creates a SWISH::Filter object. You may pass in options as a list or a hash reference. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head3 Options |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
There is currently only one option that can be passed in to new(): |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=over 4 |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item ignore_filters |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Pass in a reference to a list of filter names to ignore. For example, if you have two filters installed |
141
|
|
|
|
|
|
|
"Pdf2HTML" and "Pdf2XML" and want to avoid using "Pdf2XML": |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my $filter = SWISH::Filter->new( ignore_filters => ['Pdf2XML']; |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=back |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=cut |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub new { |
150
|
1
|
|
|
1
|
1
|
912
|
my $class = shift; |
151
|
1
|
|
33
|
|
|
14
|
$class = ref($class) || $class; |
152
|
|
|
|
|
|
|
|
153
|
1
|
0
|
|
|
|
6
|
my %attr = ref $_[0] ? %{ $_[0] } : @_ if @_; |
|
0
|
50
|
|
|
|
0
|
|
154
|
|
|
|
|
|
|
|
155
|
1
|
|
|
|
|
4
|
my $self = bless {}, $class; |
156
|
|
|
|
|
|
|
|
157
|
1
|
|
|
|
|
7
|
$self->{skip_filters} = {}; |
158
|
|
|
|
|
|
|
|
159
|
1
|
50
|
|
|
|
6
|
$self->ignore_filters( delete $attr{ignore_filters} ) |
160
|
|
|
|
|
|
|
if $attr{ignore_filters}; |
161
|
|
|
|
|
|
|
|
162
|
1
|
|
|
|
|
5
|
warn "Unknown SWISH::Filter->new() config setting '$_'\n" for keys %attr; |
163
|
|
|
|
|
|
|
|
164
|
1
|
|
|
|
|
10
|
$self->{mimetypes} = SWISH::Filter::MIMETypes->new; |
165
|
|
|
|
|
|
|
|
166
|
1
|
|
|
|
|
5
|
$self->create_filter_list(%attr); |
167
|
|
|
|
|
|
|
|
168
|
1
|
|
50
|
|
|
8
|
$self->{doc_class} ||= 'SWISH::Filter::Document'; |
169
|
|
|
|
|
|
|
|
170
|
1
|
|
|
|
|
8
|
return $self; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub ignore_filters { |
174
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $filters ) = @_; |
175
|
|
|
|
|
|
|
|
176
|
0
|
0
|
|
|
|
0
|
unless ($filters) { |
177
|
0
|
0
|
|
|
|
0
|
return unless $self->{ignore_filter_list}; |
178
|
0
|
|
|
|
|
0
|
return @{ $self->{ignore_filter_list} }; |
|
0
|
|
|
|
|
0
|
|
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
0
|
|
|
|
|
0
|
@{ $self->{ignore_filter_list} } = @$filters; |
|
0
|
|
|
|
|
0
|
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# create lookup hash for filters to skip |
184
|
0
|
|
|
|
|
0
|
$self->{skip_filters} = { map { $_, 1 } @$filters }; |
|
0
|
|
|
|
|
0
|
|
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head2 doc_class |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
If you subclass SWISH::Filter::Document with your own class, indicate your class |
190
|
|
|
|
|
|
|
name in the new() method with the C param. The return value of doc_class() |
191
|
|
|
|
|
|
|
is used in convert() for instatiating the Document object. The default value is |
192
|
|
|
|
|
|
|
C. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub doc_class { |
197
|
0
|
|
|
0
|
1
|
0
|
return $_[0]->{doc_class}; |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head2 convert |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This method filters a document. Returns an object belonging to doc_class() |
203
|
|
|
|
|
|
|
on success. If passed an empty document, a filename that cannot be read off disk, or |
204
|
|
|
|
|
|
|
if no filters have been loaded, returns undef. |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
See the SWISH::Filter::Document documentation. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
You must pass in a hash (or hash reference) of parameters to the convert() method. The |
209
|
|
|
|
|
|
|
possible parameters are: |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=over 8 |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=item document |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
This can be either a path to a file, or a scalar reference to a document in memory. |
216
|
|
|
|
|
|
|
This is required. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=item content_type |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
The MIME type of the document. This is only required when passing in a scalar |
221
|
|
|
|
|
|
|
reference to a document. The content type string is what the filters use to |
222
|
|
|
|
|
|
|
match a document type. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
When passing in a file name and C is not set, then the content type will |
225
|
|
|
|
|
|
|
be determined from the file's extension by using the MIME::Types Perl module (available on CPAN). |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=item name |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Optional name to pass in to filters that will be used in error and warning messages. |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=item user_data |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
Optional data structure that all filters may access. |
234
|
|
|
|
|
|
|
This can be fetched in a filter by: |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
my $user_data = $doc_object->user_data; |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
And used in the filter as: |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
if ( ref $user_data && $user_data->{pdf2html}{title} ) { |
241
|
|
|
|
|
|
|
... |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
It's up to the filter author to use a unique first-level hash key for a given filter. |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item meta_data |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Optional data structure intended for meta name/content pairs for HTML |
249
|
|
|
|
|
|
|
or XML output. See SWISH::Filter::Document for discussion of this data. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=back |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Example of using the convert() method: |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
$doc_object = $filter->convert( |
256
|
|
|
|
|
|
|
document => $doc_ref, |
257
|
|
|
|
|
|
|
content-type => 'application/pdf', |
258
|
|
|
|
|
|
|
); |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=cut |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
sub convert { |
263
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
264
|
0
|
0
|
|
|
|
0
|
my %attr = ref $_[0] ? %{ $_[0] } : @_ if @_; |
|
0
|
0
|
|
|
|
0
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Any filters? |
267
|
0
|
0
|
|
|
|
0
|
return unless $self->filter_list; |
268
|
|
|
|
|
|
|
|
269
|
0
|
|
0
|
|
|
0
|
my $doc = delete $attr{document} |
270
|
|
|
|
|
|
|
|| die |
271
|
|
|
|
|
|
|
"Failed to supply document attribute 'document' when calling filter()\n"; |
272
|
|
|
|
|
|
|
|
273
|
0
|
|
|
|
|
0
|
my $content_type = delete $attr{content_type}; |
274
|
|
|
|
|
|
|
|
275
|
0
|
0
|
|
|
|
0
|
if ( ref $content_type ) { |
276
|
0
|
|
|
|
|
0
|
my $type = $self->decode_content_type($$content_type); |
277
|
|
|
|
|
|
|
|
278
|
0
|
0
|
|
|
|
0
|
unless ($type) { |
279
|
0
|
|
|
|
|
0
|
warn |
280
|
|
|
|
|
|
|
"Failed to set content type for file reference '$$content_type'\n"; |
281
|
0
|
|
|
|
|
0
|
return; |
282
|
|
|
|
|
|
|
} |
283
|
0
|
|
|
|
|
0
|
$content_type = $type; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
0
|
0
|
|
|
|
0
|
if ( ref $doc ) { |
287
|
0
|
0
|
|
|
|
0
|
die |
288
|
|
|
|
|
|
|
"Must supply a content type when passing in a reference to a document\n" |
289
|
|
|
|
|
|
|
unless $content_type; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
else { |
292
|
0
|
|
0
|
|
|
0
|
$content_type ||= $self->decode_content_type($doc); |
293
|
0
|
0
|
|
|
|
0
|
unless ($content_type) { |
294
|
0
|
|
|
|
|
0
|
warn "Failed to set content type for document '$doc'\n"; |
295
|
0
|
|
|
|
|
0
|
return; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
0
|
|
0
|
|
|
0
|
$attr{name} ||= $doc; # Set default name of document |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
|
301
|
0
|
|
|
|
|
0
|
$self->mywarn( |
302
|
|
|
|
|
|
|
"\n>> Starting to process new document: $attr{name} -> $content_type" |
303
|
|
|
|
|
|
|
); |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
## Create a new document object |
306
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
0
|
my $doc_object = $self->doc_class->new( $doc, $content_type ); |
308
|
0
|
0
|
|
|
|
0
|
return unless $doc_object; # fails on empty doc or doc not readable |
309
|
|
|
|
|
|
|
|
310
|
0
|
|
|
|
|
0
|
$self->_set_extra_methods( $doc_object, {%attr} ); |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# Now run through the filters |
313
|
0
|
|
|
|
|
0
|
for my $filter ( $self->filter_list ) { |
314
|
|
|
|
|
|
|
|
315
|
0
|
|
|
|
|
0
|
$self->mywarn(" ++Checking filter [$filter] for $content_type"); |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# can this filter handle this content type? |
318
|
0
|
0
|
|
|
|
0
|
next unless $filter->can_filter_mimetype( $doc_object->content_type ); |
319
|
|
|
|
|
|
|
|
320
|
0
|
|
|
|
|
0
|
my $start_content_type = $doc_object->content_type; |
321
|
0
|
|
|
|
|
0
|
my ( $filtered_doc, $metadata ); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# run the filter |
324
|
0
|
|
|
|
|
0
|
eval { |
325
|
0
|
|
|
|
|
0
|
local $SIG{__DIE__}; |
326
|
0
|
|
|
|
|
0
|
( $filtered_doc, $metadata ) = $filter->filter($doc_object); |
327
|
|
|
|
|
|
|
}; |
328
|
|
|
|
|
|
|
|
329
|
0
|
0
|
|
|
|
0
|
if ($@) { |
330
|
0
|
|
|
|
|
0
|
$self->mywarn( |
331
|
|
|
|
|
|
|
"Problems with filter '$filter'. Filter disabled:\n -> $@"); |
332
|
0
|
|
|
|
|
0
|
$self->filter_list( |
333
|
0
|
|
|
|
|
0
|
[ grep { $_ != $filter } $self->filter_list ] ); |
334
|
0
|
|
|
|
|
0
|
next; |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
|
337
|
0
|
0
|
|
|
|
0
|
$self->mywarn( " ++ $content_type " |
338
|
|
|
|
|
|
|
. ( $filtered_doc ? '*WAS*' : 'was not' ) |
339
|
|
|
|
|
|
|
. " filtered by $filter\n" ); |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# save the working filters in this list |
342
|
|
|
|
|
|
|
|
343
|
0
|
0
|
|
|
|
0
|
if ($filtered_doc) { # either a file name or a reference to the doc |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
# Track chain of filters |
346
|
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
0
|
push @{ $doc_object->{filters_used} }, |
|
0
|
|
|
|
|
0
|
|
348
|
|
|
|
|
|
|
{ |
349
|
|
|
|
|
|
|
name => $filter, |
350
|
|
|
|
|
|
|
start_content_type => $start_content_type, |
351
|
|
|
|
|
|
|
end_content_type => $doc_object->content_type, |
352
|
|
|
|
|
|
|
}; |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
# and save it (filename or reference) |
355
|
0
|
|
|
|
|
0
|
$doc_object->cur_doc($filtered_doc); |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
# set meta_data explicitly since %attr only has what we originally had |
358
|
0
|
|
|
|
|
0
|
$doc_object->set_meta_data($metadata); |
359
|
0
|
|
|
|
|
0
|
delete $attr{'meta_data'}; |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# All done? |
362
|
0
|
0
|
|
|
|
0
|
last unless $doc_object->continue(0); |
363
|
|
|
|
|
|
|
|
364
|
0
|
|
|
|
|
0
|
$self->_set_extra_methods( $doc_object, {%attr} ); |
365
|
|
|
|
|
|
|
|
366
|
0
|
|
|
|
|
0
|
$content_type = $doc_object->content_type(); |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
0
|
0
|
|
|
|
0
|
$doc_object->dump_filters_used if $ENV{FILTER_DEBUG}; |
371
|
|
|
|
|
|
|
|
372
|
0
|
|
|
|
|
0
|
return $doc_object; |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
sub _set_extra_methods { |
377
|
0
|
|
|
0
|
|
0
|
my ( $self, $doc_object, $attr ) = @_; |
378
|
|
|
|
|
|
|
|
379
|
0
|
|
|
|
|
0
|
local $SIG{__DIE__}; |
380
|
0
|
|
|
|
|
0
|
local $SIG{__WARN__}; |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# Look for left over config settings that we do not know about |
383
|
|
|
|
|
|
|
|
384
|
0
|
|
|
|
|
0
|
for my $setting ( keys %extra_methods ) { |
385
|
0
|
0
|
|
|
|
0
|
next unless $attr->{$setting}; |
386
|
0
|
|
|
|
|
0
|
my $method = "set_" . $setting; |
387
|
0
|
|
|
|
|
0
|
$doc_object->$method( delete $attr->{$setting} ); |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# if given a document name then use that in error messages |
390
|
|
|
|
|
|
|
|
391
|
0
|
0
|
|
|
|
0
|
if ( $setting eq 'name' ) { |
392
|
|
|
|
|
|
|
$SIG{__DIE__} |
393
|
0
|
|
|
0
|
|
0
|
= sub { die "$$ Error- ", $doc_object->name, ": ", @_ }; |
|
0
|
|
|
|
|
0
|
|
394
|
|
|
|
|
|
|
$SIG{__WARN__} |
395
|
0
|
|
|
0
|
|
0
|
= sub { warn "$$ Warning - ", $doc_object->name, ": ", @_ }; |
|
0
|
|
|
|
|
0
|
|
396
|
|
|
|
|
|
|
} |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
0
|
|
|
|
|
0
|
warn "Unknown filter config setting '$_'\n" for keys %$attr; |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head2 mywarn |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Internal method used for writing warning messages to STDERR if |
406
|
|
|
|
|
|
|
$ENV{FILTER_DEBUG} is set. Set the environment variable FILTER_DEBUG before |
407
|
|
|
|
|
|
|
running to see extra messages while processing. |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=cut |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
sub mywarn { |
412
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
413
|
|
|
|
|
|
|
|
414
|
0
|
0
|
|
|
|
0
|
print STDERR @_, "\n" if $ENV{FILTER_DEBUG}; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
=head2 filter_list |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
Returns a list of filter objects installed. |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=cut |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub filter_list { |
424
|
1
|
|
|
1
|
1
|
2
|
my ( $self, $filter_ref ) = @_; |
425
|
|
|
|
|
|
|
|
426
|
1
|
50
|
|
|
|
4
|
unless ($filter_ref) { |
427
|
0
|
0
|
|
|
|
0
|
return ref $self->{filters} ? @{ $self->{filters} } : (); |
|
0
|
|
|
|
|
0
|
|
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
1
|
|
|
|
|
4
|
$self->{filters} = $filter_ref; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Creates the list of filters |
434
|
|
|
|
|
|
|
sub create_filter_list { |
435
|
1
|
|
|
1
|
0
|
2
|
my $self = shift; |
436
|
1
|
|
|
|
|
3
|
my %attr = @_; |
437
|
|
|
|
|
|
|
|
438
|
1
|
|
|
|
|
6
|
my @filters = grep {defined} $self->filters_found(%attr); |
|
1
|
|
|
|
|
35
|
|
439
|
|
|
|
|
|
|
|
440
|
1
|
50
|
|
|
|
9
|
unless (@filters) { |
441
|
0
|
|
|
|
|
0
|
warn "No SWISH filters found\n"; |
442
|
0
|
|
|
|
|
0
|
return; |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# Now sort the filters in order. |
446
|
1
|
0
|
|
|
|
4
|
@filters = sort { $a->type <=> $b->type || $a->priority <=> $b->priority } |
|
0
|
|
|
|
|
0
|
|
447
|
|
|
|
|
|
|
@filters; |
448
|
1
|
|
|
|
|
7
|
$self->filter_list( \@filters ); |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=head2 can_filter( I ) |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
This is useful for testing to see if a mimetype might be handled by SWISH::Filter |
454
|
|
|
|
|
|
|
wihtout having to pass in a document. Helpful if doing HEAD requests. |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
Returns an array of filters that can handle this type of document |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
my %can_filter = (); # memoize |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
sub can_filter { |
463
|
0
|
|
|
0
|
1
|
|
my ( $self, $content_type ) = @_; |
464
|
|
|
|
|
|
|
|
465
|
0
|
0
|
|
|
|
|
unless ($content_type) { |
466
|
0
|
|
|
|
|
|
carp "Failed to pass in a content type to can_filter() method"; |
467
|
0
|
|
|
|
|
|
return; |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
|
470
|
0
|
0
|
|
|
|
|
if ( exists $can_filter{$content_type} ) { |
471
|
0
|
|
|
|
|
|
return @{ $can_filter{$content_type} }; |
|
0
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
else { |
474
|
0
|
|
|
|
|
|
$can_filter{$content_type} = []; |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
|
477
|
0
|
|
|
|
|
|
for my $filter ( $self->filter_list ) { |
478
|
0
|
0
|
|
|
|
|
if ( $filter->can_filter_mimetype($content_type) ) { |
479
|
0
|
|
|
|
|
|
push @{ $can_filter{$content_type} }, $filter; |
|
0
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
0
|
|
|
|
|
|
return @{ $can_filter{$content_type} }; |
|
0
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=head2 decode_content_type( I ) |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Returns MIME type for I if known. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=cut |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
sub decode_content_type { |
493
|
0
|
|
|
0
|
1
|
|
my ( $self, $file ) = @_; |
494
|
|
|
|
|
|
|
|
495
|
0
|
0
|
|
|
|
|
return unless $file; |
496
|
|
|
|
|
|
|
|
497
|
0
|
|
|
|
|
|
return $self->{mimetypes}->get_mime_type($file); |
498
|
|
|
|
|
|
|
} |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=head1 WRITING FILTERS |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
Filters are standard perl modules that are installed into the C name space. |
503
|
|
|
|
|
|
|
Filters are not complicated -- see the core SWISH::Filters::* modules for examples. |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
Each filter defines the content-types (or mimetypes) that it can handle. These |
506
|
|
|
|
|
|
|
are specified as a list of regular expressions to match against the document's |
507
|
|
|
|
|
|
|
content-type. If one of the mimetypes of a filter match the incoming |
508
|
|
|
|
|
|
|
document's content-type the filter is called. The filter can then either |
509
|
|
|
|
|
|
|
filter the content or return undefined indicating that it decided not to |
510
|
|
|
|
|
|
|
filter the document for some reason. If the document is converted the filter |
511
|
|
|
|
|
|
|
returns either a reference to a scalar of the content or a file name where the |
512
|
|
|
|
|
|
|
content is stored. The filter also must change the content-type of the document |
513
|
|
|
|
|
|
|
to reflect the new document. |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
Filters typically use external programs or modules to do that actual work of |
516
|
|
|
|
|
|
|
converting a document from one type to another. For example, programs in the |
517
|
|
|
|
|
|
|
Xpdf packages are used for converting PDF files. The filter can (and should) |
518
|
|
|
|
|
|
|
test for those programs in its new() method. |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
Filters also can define a type and priority. These attributes are used |
521
|
|
|
|
|
|
|
to set the order filters are tested for a content-type match. This allows |
522
|
|
|
|
|
|
|
you to have more than one filter that can work on the same content-type. A lower |
523
|
|
|
|
|
|
|
priority value is given preference over a higher priority value. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
If a filter calls die() then the filter is removed from the chain and will not be |
526
|
|
|
|
|
|
|
called again I. Calling die when running with -S http or |
527
|
|
|
|
|
|
|
-S fs has no effect since the program is run once per document. |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
Once a filter returns something other than undef no more filters will be |
530
|
|
|
|
|
|
|
called. If the filter calls $filter-Eset_continue then processing will |
531
|
|
|
|
|
|
|
continue as if the file was not filtered. For example, a filter can uncompress |
532
|
|
|
|
|
|
|
data and then set $filter-Eset_continue and let other filters process the |
533
|
|
|
|
|
|
|
document. |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
A filter may define the following methods (required methods are indicated): |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=over 4 |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=item new() B |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
This method returns either an object which provides access to the filter, or undefined |
543
|
|
|
|
|
|
|
if the filter is not to be used. |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
The new() method is a good place to check for required modules or helper programs. |
546
|
|
|
|
|
|
|
Returning undefined prevents the filter from being included in the filter chain. |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
The new method must return a blessed hash reference. The only required attribute |
549
|
|
|
|
|
|
|
is B. This attribute must contain a reference to an array of regular |
550
|
|
|
|
|
|
|
expressions used for matching the content-type of the document passed in. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
Example: |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
sub new { |
555
|
|
|
|
|
|
|
my ( $class ) = @_; |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# List of regular expressions |
558
|
|
|
|
|
|
|
my @mimetypes = ( |
559
|
|
|
|
|
|
|
qr[application/(x-)?msword], |
560
|
|
|
|
|
|
|
qr[application/worddoc], |
561
|
|
|
|
|
|
|
); |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
my %settings = ( |
564
|
|
|
|
|
|
|
mimetypes => \@mimetypes, |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# Optional settings |
567
|
|
|
|
|
|
|
priority => 20, |
568
|
|
|
|
|
|
|
type => 2, |
569
|
|
|
|
|
|
|
); |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
return bless \%settings, $class; |
572
|
|
|
|
|
|
|
} |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
The attribute "mimetypes" returns an array reference to a list of regular |
575
|
|
|
|
|
|
|
expressions. Those patterns are matched against each document's content type. |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=item filter() B |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
This is the function that does the work of converting a document from one content type |
580
|
|
|
|
|
|
|
to another. The function is passed the document object. See document object methods |
581
|
|
|
|
|
|
|
listed below for what methods may be called on a document. |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
The function can return undefined (or any false value) to indicate that the |
584
|
|
|
|
|
|
|
filter did not want to process the document. Other filters will then be tested for |
585
|
|
|
|
|
|
|
a content type match. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
If the document is filtered then the filter must set the new document's content |
588
|
|
|
|
|
|
|
type (if it changed) and return either a file name where the document can be found or |
589
|
|
|
|
|
|
|
a reference to a scalar containing the document. |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
The filter() method may also return a second value for storing metadata. The value |
592
|
|
|
|
|
|
|
is typically a hash ref of name/value pairs. This value can then |
593
|
|
|
|
|
|
|
be accessed via the meta_data() method in the SWISH::Filter::Document class. |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
=item type() |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
Returns a number. Filters are sorted (for processing in a specific order) |
598
|
|
|
|
|
|
|
and this number is simply the primary key used in sorting. If not specified |
599
|
|
|
|
|
|
|
the filter's type used for sorting is 2. |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
This is an optional method. You can also set the type in your new() constructor |
602
|
|
|
|
|
|
|
as shown above. |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=item priority() |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Returns a number. Filters are sorted (for processing in a specific order) |
608
|
|
|
|
|
|
|
and this number is simply the secondary key used in sorting. If not specified |
609
|
|
|
|
|
|
|
the filter's priority is 50. |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
This is an optional method. You can also set the priority in your new() constructor |
612
|
|
|
|
|
|
|
as shown above. |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=back |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
Again, the point of the type() and priority() methods is to allow setting the sort order |
618
|
|
|
|
|
|
|
of the filters. Useful if you have two filters for filtering the same content-type, |
619
|
|
|
|
|
|
|
but prefer to use one over the other. Neither are required. |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=head1 EXAMPLE FILTER |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
Here's a module to convert MS Word documents using the program "catdoc": |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
package SWISH::Filters::Doc2txt; |
627
|
|
|
|
|
|
|
use vars qw/ $VERSION /; |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
$VERSION = '0.190'; |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
sub new { |
633
|
|
|
|
|
|
|
my ( $class ) = @_; |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
my $self = bless { |
636
|
|
|
|
|
|
|
mimetypes => [ qr!application/(x-)?msword! ], |
637
|
|
|
|
|
|
|
priority => 50, |
638
|
|
|
|
|
|
|
}, $class; |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# check for helpers |
642
|
|
|
|
|
|
|
return $self->set_programs( 'catdoc' ); |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
sub filter { |
648
|
|
|
|
|
|
|
my ( $self, $doc ) = @_; |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
my $content = $self->run_catdoc( $doc->fetch_filename ) || return; |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
# update the document's content type |
653
|
|
|
|
|
|
|
$filter->set_content_type( 'text/plain' ); |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
# return the document |
656
|
|
|
|
|
|
|
return \$content; |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
1; |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
The new() constructor creates a blessed hash which contains an array reference |
661
|
|
|
|
|
|
|
of mimetypes patterns that this filter accepts. The priority sets this |
662
|
|
|
|
|
|
|
filter to run after any other filters that might handle the same type of content. |
663
|
|
|
|
|
|
|
The F function says that we need to call a program called "catdoc". |
664
|
|
|
|
|
|
|
The function either returns $self or undefined if catdoc could not be found. |
665
|
|
|
|
|
|
|
The F function creates a new method for running catdoc. |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
The filter function runs catdoc passing in the name of the file (if the file is in memory |
668
|
|
|
|
|
|
|
a temporary file is created). That F function was created by the |
669
|
|
|
|
|
|
|
F call above. |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=cut |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
1; |
675
|
|
|
|
|
|
|
__END__ |