| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package SWISH::Filter; |
|
2
|
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
20143
|
use 5.005; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
64
|
|
|
4
|
2
|
|
|
2
|
|
8
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
58
|
|
|
5
|
2
|
|
|
2
|
|
9
|
use File::Basename; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
164
|
|
|
6
|
2
|
|
|
2
|
|
9
|
use Carp; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
107
|
|
|
7
|
2
|
|
|
2
|
|
645
|
use SWISH::Filter::MIMETypes; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
77
|
|
|
8
|
2
|
|
|
2
|
|
894
|
use SWISH::Filter::Document; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
58
|
|
|
9
|
2
|
|
|
2
|
|
769
|
use SWISH::Filters::Base; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
72
|
|
|
10
|
|
|
|
|
|
|
use Module::Pluggable |
|
11
|
2
|
|
|
|
|
14
|
search_path => 'SWISH::Filters', |
|
12
|
|
|
|
|
|
|
except => 'SWISH::Filters::Base', |
|
13
|
|
|
|
|
|
|
sub_name => 'filters_found', |
|
14
|
|
|
|
|
|
|
require => 1, |
|
15
|
2
|
|
|
2
|
|
935
|
instantiate => 'new'; |
|
|
2
|
|
|
|
|
14274
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
2
|
|
|
2
|
|
154
|
use vars qw/ $VERSION %extra_methods /; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
2506
|
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
$VERSION = '0.191'; |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Define the available parameters |
|
22
|
|
|
|
|
|
|
%extra_methods = map { $_ => 1 } qw( meta_data name user_data ); |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# For testing only |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
if ( $0 =~ 'Filter.pm' && @ARGV >= 2 && shift =~ /^test/i ) { |
|
27
|
|
|
|
|
|
|
die "Please use the 'swish-filter-test' program.\n"; |
|
28
|
|
|
|
|
|
|
} |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 NAME |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
SWISH::Filter - filter documents for indexing with Swish-e |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
use SWISH::Filter; |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# load available filters into memory |
|
39
|
|
|
|
|
|
|
my $filter = SWISH::Filter->new; |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# convert a document |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $doc = $filter->convert( |
|
45
|
|
|
|
|
|
|
document => \$scalar_ref, # path or ref to a doc |
|
46
|
|
|
|
|
|
|
content_type => $content_type, # content type if doc reference |
|
47
|
|
|
|
|
|
|
name => $real_path, # optional name for this file (useful for debugging) |
|
48
|
|
|
|
|
|
|
user_data => $whatever, # optional data to make available to filters |
|
49
|
|
|
|
|
|
|
); |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
return unless $doc; # empty doc, zero size, or no filters installed |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# Was the document converted by a filter? |
|
54
|
|
|
|
|
|
|
my $was_filtered = $doc->was_filtered; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Skip if the file is not text |
|
57
|
|
|
|
|
|
|
return if $doc->is_binary; |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Print out the doc |
|
60
|
|
|
|
|
|
|
my $doc_ref = $doc->fetch_doc; |
|
61
|
|
|
|
|
|
|
print $$doc_ref; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# Fetch the final content type of the document |
|
64
|
|
|
|
|
|
|
my $content_type = $doc->content_type; |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Fetch Swish-e parser type (TXT*, XML*, HTML*, or undefined) |
|
67
|
|
|
|
|
|
|
my $doc_type = $doc->swish_parser_type; |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
SWISH::Filter provides a unified way to convert documents into a type that |
|
72
|
|
|
|
|
|
|
Swish-e can index. Individual filters are installed as separate subclasses (modules). |
|
73
|
|
|
|
|
|
|
For example, there might be a filter that converts from PDF format to HTML |
|
74
|
|
|
|
|
|
|
format. |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
SWISH::Filter is a framework that relies on other packages to do the heavy lifting |
|
77
|
|
|
|
|
|
|
of converting non-text documents to text. B
|
|
78
|
|
|
|
|
|
|
programs or Perl modules may need to be installed to use SWISH::Filter to filter |
|
79
|
|
|
|
|
|
|
documents.> For example, to filter PDF documents you must install the C |
|
80
|
|
|
|
|
|
|
package. |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
The filters are automatically loaded when Cnew()> is |
|
83
|
|
|
|
|
|
|
called. Filters define a type and priority that determines the processing |
|
84
|
|
|
|
|
|
|
order of the filter. Filters are processed in this sort order until a filter |
|
85
|
|
|
|
|
|
|
accepts the document for filtering. The filter uses the document's content type |
|
86
|
|
|
|
|
|
|
to determine if the filter should handle the current document. The |
|
87
|
|
|
|
|
|
|
content-type is determined by the files suffix if not supplied by the calling |
|
88
|
|
|
|
|
|
|
program. |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The individual filters are not designed to be used as separate modules. All |
|
91
|
|
|
|
|
|
|
access to the filters is through this SWISH::Filter module. |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Normally, once a document is filtered processing stops. Filters can filter the |
|
94
|
|
|
|
|
|
|
document and then set a flag saying that filtering should continue (for example |
|
95
|
|
|
|
|
|
|
a filter that uncompresses a MS Word document before passing on to the filter |
|
96
|
|
|
|
|
|
|
that converts from MS Word to text). All this should be transparent to the end |
|
97
|
|
|
|
|
|
|
user. So, filters can be pipe-lined. |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
The idea of SWISH::Filter is that new filters can be created, and then |
|
100
|
|
|
|
|
|
|
downloaded and installed to provide new filtering capabilities. For example, |
|
101
|
|
|
|
|
|
|
if you needed to index MS Excel documents you might be able to download a |
|
102
|
|
|
|
|
|
|
filter from the Swish-e site and magically next time you run indexing MS Excel |
|
103
|
|
|
|
|
|
|
docs would be indexed. |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
The SWISH::Filter setup can be used with -S prog or -S http. It works best |
|
106
|
|
|
|
|
|
|
with the -S prog method because the filter modules only need to be loaded and |
|
107
|
|
|
|
|
|
|
compiled one time. The -S prog program F will automatically use |
|
108
|
|
|
|
|
|
|
SWISH::Filter when spidering with default settings (using "default" as the |
|
109
|
|
|
|
|
|
|
first parameter to spider.pl). |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
The -S http indexing method uses a Perl helper script called F. |
|
112
|
|
|
|
|
|
|
F has been updated to work with SWISH::Filter, but (unlike |
|
113
|
|
|
|
|
|
|
spider.pl) does not contain a "use lib" line to point to the location of |
|
114
|
|
|
|
|
|
|
SWISH::Filter. This means that by default F will B use |
|
115
|
|
|
|
|
|
|
SWISH::Filter for filtering. The reason for this is because F |
|
116
|
|
|
|
|
|
|
runs for every URL fetched, and loading the Filters for each document can be |
|
117
|
|
|
|
|
|
|
slow. The recommended way of spidering is using -S prog with spider.pl, but if |
|
118
|
|
|
|
|
|
|
-S http is desired the way to enable SWISH::Filter is to set PERL5LIB before |
|
119
|
|
|
|
|
|
|
running swish so that F will be able to locate the SWISH::Filter |
|
120
|
|
|
|
|
|
|
module. Here's one way to set the PERL5LIB with the bash shell: |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
$ export PERL5LIB=`swish-filter-test -path` |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head1 METHODS |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head2 new( %I ) |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
new() creates a SWISH::Filter object. You may pass in options as a list or a hash reference. |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head3 Options |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
There is currently only one option that can be passed in to new(): |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=over 4 |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item ignore_filters |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Pass in a reference to a list of filter names to ignore. For example, if you have two filters installed |
|
141
|
|
|
|
|
|
|
"Pdf2HTML" and "Pdf2XML" and want to avoid using "Pdf2XML": |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my $filter = SWISH::Filter->new( ignore_filters => ['Pdf2XML']; |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=back |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=cut |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub new { |
|
150
|
1
|
|
|
1
|
1
|
313
|
my $class = shift; |
|
151
|
1
|
|
33
|
|
|
10
|
$class = ref($class) || $class; |
|
152
|
|
|
|
|
|
|
|
|
153
|
1
|
0
|
|
|
|
3
|
my %attr = ref $_[0] ? %{ $_[0] } : @_ if @_; |
|
|
0
|
50
|
|
|
|
0
|
|
|
154
|
|
|
|
|
|
|
|
|
155
|
1
|
|
|
|
|
2
|
my $self = bless {}, $class; |
|
156
|
|
|
|
|
|
|
|
|
157
|
1
|
|
|
|
|
5
|
$self->{skip_filters} = {}; |
|
158
|
|
|
|
|
|
|
|
|
159
|
1
|
50
|
|
|
|
3
|
$self->ignore_filters( delete $attr{ignore_filters} ) |
|
160
|
|
|
|
|
|
|
if $attr{ignore_filters}; |
|
161
|
|
|
|
|
|
|
|
|
162
|
1
|
|
|
|
|
3
|
warn "Unknown SWISH::Filter->new() config setting '$_'\n" for keys %attr; |
|
163
|
|
|
|
|
|
|
|
|
164
|
1
|
|
|
|
|
6
|
$self->{mimetypes} = SWISH::Filter::MIMETypes->new; |
|
165
|
|
|
|
|
|
|
|
|
166
|
1
|
|
|
|
|
2
|
$self->create_filter_list(%attr); |
|
167
|
|
|
|
|
|
|
|
|
168
|
1
|
|
50
|
|
|
9
|
$self->{doc_class} ||= 'SWISH::Filter::Document'; |
|
169
|
|
|
|
|
|
|
|
|
170
|
1
|
|
|
|
|
8
|
return $self; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub ignore_filters { |
|
174
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $filters ) = @_; |
|
175
|
|
|
|
|
|
|
|
|
176
|
0
|
0
|
|
|
|
0
|
unless ($filters) { |
|
177
|
0
|
0
|
|
|
|
0
|
return unless $self->{ignore_filter_list}; |
|
178
|
0
|
|
|
|
|
0
|
return @{ $self->{ignore_filter_list} }; |
|
|
0
|
|
|
|
|
0
|
|
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
0
|
|
|
|
|
0
|
@{ $self->{ignore_filter_list} } = @$filters; |
|
|
0
|
|
|
|
|
0
|
|
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# create lookup hash for filters to skip |
|
184
|
0
|
|
|
|
|
0
|
$self->{skip_filters} = { map { $_, 1 } @$filters }; |
|
|
0
|
|
|
|
|
0
|
|
|
185
|
|
|
|
|
|
|
} |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head2 doc_class |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
If you subclass SWISH::Filter::Document with your own class, indicate your class |
|
190
|
|
|
|
|
|
|
name in the new() method with the C param. The return value of doc_class() |
|
191
|
|
|
|
|
|
|
is used in convert() for instatiating the Document object. The default value is |
|
192
|
|
|
|
|
|
|
C. |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub doc_class { |
|
197
|
0
|
|
|
0
|
1
|
0
|
return $_[0]->{doc_class}; |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=head2 convert |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
This method filters a document. Returns an object belonging to doc_class() |
|
203
|
|
|
|
|
|
|
on success. If passed an empty document, a filename that cannot be read off disk, or |
|
204
|
|
|
|
|
|
|
if no filters have been loaded, returns undef. |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
See the SWISH::Filter::Document documentation. |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
You must pass in a hash (or hash reference) of parameters to the convert() method. The |
|
209
|
|
|
|
|
|
|
possible parameters are: |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=over 8 |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=item document |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
This can be either a path to a file, or a scalar reference to a document in memory. |
|
216
|
|
|
|
|
|
|
This is required. |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=item content_type |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
The MIME type of the document. This is only required when passing in a scalar |
|
221
|
|
|
|
|
|
|
reference to a document. The content type string is what the filters use to |
|
222
|
|
|
|
|
|
|
match a document type. |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
When passing in a file name and C is not set, then the content type will |
|
225
|
|
|
|
|
|
|
be determined from the file's extension by using the MIME::Types Perl module (available on CPAN). |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=item name |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Optional name to pass in to filters that will be used in error and warning messages. |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=item user_data |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
Optional data structure that all filters may access. |
|
234
|
|
|
|
|
|
|
This can be fetched in a filter by: |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
my $user_data = $doc_object->user_data; |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
And used in the filter as: |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
if ( ref $user_data && $user_data->{pdf2html}{title} ) { |
|
241
|
|
|
|
|
|
|
... |
|
242
|
|
|
|
|
|
|
} |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
It's up to the filter author to use a unique first-level hash key for a given filter. |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item meta_data |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Optional data structure intended for meta name/content pairs for HTML |
|
249
|
|
|
|
|
|
|
or XML output. See SWISH::Filter::Document for discussion of this data. |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=back |
|
252
|
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Example of using the convert() method: |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
$doc_object = $filter->convert( |
|
256
|
|
|
|
|
|
|
document => $doc_ref, |
|
257
|
|
|
|
|
|
|
content-type => 'application/pdf', |
|
258
|
|
|
|
|
|
|
); |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=cut |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
sub convert { |
|
263
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
264
|
0
|
0
|
|
|
|
0
|
my %attr = ref $_[0] ? %{ $_[0] } : @_ if @_; |
|
|
0
|
0
|
|
|
|
0
|
|
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Any filters? |
|
267
|
0
|
0
|
|
|
|
0
|
return unless $self->filter_list; |
|
268
|
|
|
|
|
|
|
|
|
269
|
0
|
|
0
|
|
|
0
|
my $doc = delete $attr{document} |
|
270
|
|
|
|
|
|
|
|| die |
|
271
|
|
|
|
|
|
|
"Failed to supply document attribute 'document' when calling filter()\n"; |
|
272
|
|
|
|
|
|
|
|
|
273
|
0
|
|
|
|
|
0
|
my $content_type = delete $attr{content_type}; |
|
274
|
|
|
|
|
|
|
|
|
275
|
0
|
0
|
|
|
|
0
|
if ( ref $content_type ) { |
|
276
|
0
|
|
|
|
|
0
|
my $type = $self->decode_content_type($$content_type); |
|
277
|
|
|
|
|
|
|
|
|
278
|
0
|
0
|
|
|
|
0
|
unless ($type) { |
|
279
|
0
|
|
|
|
|
0
|
warn |
|
280
|
|
|
|
|
|
|
"Failed to set content type for file reference '$$content_type'\n"; |
|
281
|
0
|
|
|
|
|
0
|
return; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
0
|
|
|
|
|
0
|
$content_type = $type; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
|
|
286
|
0
|
0
|
|
|
|
0
|
if ( ref $doc ) { |
|
287
|
0
|
0
|
|
|
|
0
|
die |
|
288
|
|
|
|
|
|
|
"Must supply a content type when passing in a reference to a document\n" |
|
289
|
|
|
|
|
|
|
unless $content_type; |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
else { |
|
292
|
0
|
|
0
|
|
|
0
|
$content_type ||= $self->decode_content_type($doc); |
|
293
|
0
|
0
|
|
|
|
0
|
unless ($content_type) { |
|
294
|
0
|
|
|
|
|
0
|
warn "Failed to set content type for document '$doc'\n"; |
|
295
|
0
|
|
|
|
|
0
|
return; |
|
296
|
|
|
|
|
|
|
} |
|
297
|
|
|
|
|
|
|
|
|
298
|
0
|
|
0
|
|
|
0
|
$attr{name} ||= $doc; # Set default name of document |
|
299
|
|
|
|
|
|
|
} |
|
300
|
|
|
|
|
|
|
|
|
301
|
0
|
|
|
|
|
0
|
$self->mywarn( |
|
302
|
|
|
|
|
|
|
"\n>> Starting to process new document: $attr{name} -> $content_type" |
|
303
|
|
|
|
|
|
|
); |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
## Create a new document object |
|
306
|
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
0
|
my $doc_object = $self->doc_class->new( $doc, $content_type ); |
|
308
|
0
|
0
|
|
|
|
0
|
return unless $doc_object; # fails on empty doc or doc not readable |
|
309
|
|
|
|
|
|
|
|
|
310
|
0
|
|
|
|
|
0
|
$self->_set_extra_methods( $doc_object, {%attr} ); |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
# Now run through the filters |
|
313
|
0
|
|
|
|
|
0
|
for my $filter ( $self->filter_list ) { |
|
314
|
|
|
|
|
|
|
|
|
315
|
0
|
|
|
|
|
0
|
$self->mywarn(" ++Checking filter [$filter] for $content_type"); |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# can this filter handle this content type? |
|
318
|
0
|
0
|
|
|
|
0
|
next unless $filter->can_filter_mimetype( $doc_object->content_type ); |
|
319
|
|
|
|
|
|
|
|
|
320
|
0
|
|
|
|
|
0
|
my $start_content_type = $doc_object->content_type; |
|
321
|
0
|
|
|
|
|
0
|
my ( $filtered_doc, $metadata ); |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# run the filter |
|
324
|
0
|
|
|
|
|
0
|
eval { |
|
325
|
0
|
|
|
|
|
0
|
local $SIG{__DIE__}; |
|
326
|
0
|
|
|
|
|
0
|
( $filtered_doc, $metadata ) = $filter->filter($doc_object); |
|
327
|
|
|
|
|
|
|
}; |
|
328
|
|
|
|
|
|
|
|
|
329
|
0
|
0
|
|
|
|
0
|
if ($@) { |
|
330
|
0
|
|
|
|
|
0
|
$self->mywarn( |
|
331
|
|
|
|
|
|
|
"Problems with filter '$filter'. Filter disabled:\n -> $@"); |
|
332
|
0
|
|
|
|
|
0
|
$self->filter_list( |
|
333
|
0
|
|
|
|
|
0
|
[ grep { $_ != $filter } $self->filter_list ] ); |
|
334
|
0
|
|
|
|
|
0
|
next; |
|
335
|
|
|
|
|
|
|
} |
|
336
|
|
|
|
|
|
|
|
|
337
|
0
|
0
|
|
|
|
0
|
$self->mywarn( " ++ $content_type " |
|
338
|
|
|
|
|
|
|
. ( $filtered_doc ? '*WAS*' : 'was not' ) |
|
339
|
|
|
|
|
|
|
. " filtered by $filter\n" ); |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# save the working filters in this list |
|
342
|
|
|
|
|
|
|
|
|
343
|
0
|
0
|
|
|
|
0
|
if ($filtered_doc) { # either a file name or a reference to the doc |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
# Track chain of filters |
|
346
|
|
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
0
|
push @{ $doc_object->{filters_used} }, |
|
|
0
|
|
|
|
|
0
|
|
|
348
|
|
|
|
|
|
|
{ |
|
349
|
|
|
|
|
|
|
name => $filter, |
|
350
|
|
|
|
|
|
|
start_content_type => $start_content_type, |
|
351
|
|
|
|
|
|
|
end_content_type => $doc_object->content_type, |
|
352
|
|
|
|
|
|
|
}; |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
# and save it (filename or reference) |
|
355
|
0
|
|
|
|
|
0
|
$doc_object->cur_doc($filtered_doc); |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
# set meta_data explicitly since %attr only has what we originally had |
|
358
|
0
|
|
|
|
|
0
|
$doc_object->set_meta_data($metadata); |
|
359
|
0
|
|
|
|
|
0
|
delete $attr{'meta_data'}; |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# All done? |
|
362
|
0
|
0
|
|
|
|
0
|
last unless $doc_object->continue(0); |
|
363
|
|
|
|
|
|
|
|
|
364
|
0
|
|
|
|
|
0
|
$self->_set_extra_methods( $doc_object, {%attr} ); |
|
365
|
|
|
|
|
|
|
|
|
366
|
0
|
|
|
|
|
0
|
$content_type = $doc_object->content_type(); |
|
367
|
|
|
|
|
|
|
} |
|
368
|
|
|
|
|
|
|
} |
|
369
|
|
|
|
|
|
|
|
|
370
|
0
|
0
|
|
|
|
0
|
$doc_object->dump_filters_used if $ENV{FILTER_DEBUG}; |
|
371
|
|
|
|
|
|
|
|
|
372
|
0
|
|
|
|
|
0
|
return $doc_object; |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
sub _set_extra_methods { |
|
377
|
0
|
|
|
0
|
|
0
|
my ( $self, $doc_object, $attr ) = @_; |
|
378
|
|
|
|
|
|
|
|
|
379
|
0
|
|
|
|
|
0
|
local $SIG{__DIE__}; |
|
380
|
0
|
|
|
|
|
0
|
local $SIG{__WARN__}; |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# Look for left over config settings that we do not know about |
|
383
|
|
|
|
|
|
|
|
|
384
|
0
|
|
|
|
|
0
|
for my $setting ( keys %extra_methods ) { |
|
385
|
0
|
0
|
|
|
|
0
|
next unless $attr->{$setting}; |
|
386
|
0
|
|
|
|
|
0
|
my $method = "set_" . $setting; |
|
387
|
0
|
|
|
|
|
0
|
$doc_object->$method( delete $attr->{$setting} ); |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# if given a document name then use that in error messages |
|
390
|
|
|
|
|
|
|
|
|
391
|
0
|
0
|
|
|
|
0
|
if ( $setting eq 'name' ) { |
|
392
|
|
|
|
|
|
|
$SIG{__DIE__} |
|
393
|
0
|
|
|
0
|
|
0
|
= sub { die "$$ Error- ", $doc_object->name, ": ", @_ }; |
|
|
0
|
|
|
|
|
0
|
|
|
394
|
|
|
|
|
|
|
$SIG{__WARN__} |
|
395
|
0
|
|
|
0
|
|
0
|
= sub { warn "$$ Warning - ", $doc_object->name, ": ", @_ }; |
|
|
0
|
|
|
|
|
0
|
|
|
396
|
|
|
|
|
|
|
} |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
|
|
399
|
0
|
|
|
|
|
0
|
warn "Unknown filter config setting '$_'\n" for keys %$attr; |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
} |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head2 mywarn |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Internal method used for writing warning messages to STDERR if |
|
406
|
|
|
|
|
|
|
$ENV{FILTER_DEBUG} is set. Set the environment variable FILTER_DEBUG before |
|
407
|
|
|
|
|
|
|
running to see extra messages while processing. |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=cut |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
sub mywarn { |
|
412
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
|
413
|
|
|
|
|
|
|
|
|
414
|
0
|
0
|
|
|
|
0
|
print STDERR @_, "\n" if $ENV{FILTER_DEBUG}; |
|
415
|
|
|
|
|
|
|
} |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
=head2 filter_list |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
Returns a list of filter objects installed. |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
=cut |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub filter_list { |
|
424
|
1
|
|
|
1
|
1
|
3
|
my ( $self, $filter_ref ) = @_; |
|
425
|
|
|
|
|
|
|
|
|
426
|
1
|
50
|
|
|
|
4
|
unless ($filter_ref) { |
|
427
|
0
|
0
|
|
|
|
0
|
return ref $self->{filters} ? @{ $self->{filters} } : (); |
|
|
0
|
|
|
|
|
0
|
|
|
428
|
|
|
|
|
|
|
} |
|
429
|
|
|
|
|
|
|
|
|
430
|
1
|
|
|
|
|
4
|
$self->{filters} = $filter_ref; |
|
431
|
|
|
|
|
|
|
} |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Creates the list of filters |
|
434
|
|
|
|
|
|
|
sub create_filter_list { |
|
435
|
1
|
|
|
1
|
0
|
2
|
my $self = shift; |
|
436
|
1
|
|
|
|
|
2
|
my %attr = @_; |
|
437
|
|
|
|
|
|
|
|
|
438
|
1
|
|
|
|
|
2
|
my @filters = grep {defined} $self->filters_found(%attr); |
|
|
1
|
|
|
|
|
31
|
|
|
439
|
|
|
|
|
|
|
|
|
440
|
1
|
50
|
|
|
|
8
|
unless (@filters) { |
|
441
|
0
|
|
|
|
|
0
|
warn "No SWISH filters found\n"; |
|
442
|
0
|
|
|
|
|
0
|
return; |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# Now sort the filters in order. |
|
446
|
1
|
0
|
|
|
|
3
|
@filters = sort { $a->type <=> $b->type || $a->priority <=> $b->priority } |
|
|
0
|
|
|
|
|
0
|
|
|
447
|
|
|
|
|
|
|
@filters; |
|
448
|
1
|
|
|
|
|
7
|
$self->filter_list( \@filters ); |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=head2 can_filter( I ) |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
This is useful for testing to see if a mimetype might be handled by SWISH::Filter |
|
454
|
|
|
|
|
|
|
wihtout having to pass in a document. Helpful if doing HEAD requests. |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
Returns an array of filters that can handle this type of document |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
my %can_filter = (); # memoize |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
sub can_filter { |
|
463
|
0
|
|
|
0
|
1
|
|
my ( $self, $content_type ) = @_; |
|
464
|
|
|
|
|
|
|
|
|
465
|
0
|
0
|
|
|
|
|
unless ($content_type) { |
|
466
|
0
|
|
|
|
|
|
carp "Failed to pass in a content type to can_filter() method"; |
|
467
|
0
|
|
|
|
|
|
return; |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
0
|
0
|
|
|
|
|
if ( exists $can_filter{$content_type} ) { |
|
471
|
0
|
|
|
|
|
|
return @{ $can_filter{$content_type} }; |
|
|
0
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
else { |
|
474
|
0
|
|
|
|
|
|
$can_filter{$content_type} = []; |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
|
|
477
|
0
|
|
|
|
|
|
for my $filter ( $self->filter_list ) { |
|
478
|
0
|
0
|
|
|
|
|
if ( $filter->can_filter_mimetype($content_type) ) { |
|
479
|
0
|
|
|
|
|
|
push @{ $can_filter{$content_type} }, $filter; |
|
|
0
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
} |
|
482
|
|
|
|
|
|
|
|
|
483
|
0
|
|
|
|
|
|
return @{ $can_filter{$content_type} }; |
|
|
0
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=head2 decode_content_type( I ) |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Returns MIME type for I if known. |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=cut |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
sub decode_content_type { |
|
493
|
0
|
|
|
0
|
1
|
|
my ( $self, $file ) = @_; |
|
494
|
|
|
|
|
|
|
|
|
495
|
0
|
0
|
|
|
|
|
return unless $file; |
|
496
|
|
|
|
|
|
|
|
|
497
|
0
|
|
|
|
|
|
return $self->{mimetypes}->get_mime_type($file); |
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
=head1 WRITING FILTERS |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
Filters are standard perl modules that are installed into the C name space. |
|
503
|
|
|
|
|
|
|
Filters are not complicated -- see the core SWISH::Filters::* modules for examples. |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
Each filter defines the content-types (or mimetypes) that it can handle. These |
|
506
|
|
|
|
|
|
|
are specified as a list of regular expressions to match against the document's |
|
507
|
|
|
|
|
|
|
content-type. If one of the mimetypes of a filter match the incoming |
|
508
|
|
|
|
|
|
|
document's content-type the filter is called. The filter can then either |
|
509
|
|
|
|
|
|
|
filter the content or return undefined indicating that it decided not to |
|
510
|
|
|
|
|
|
|
filter the document for some reason. If the document is converted the filter |
|
511
|
|
|
|
|
|
|
returns either a reference to a scalar of the content or a file name where the |
|
512
|
|
|
|
|
|
|
content is stored. The filter also must change the content-type of the document |
|
513
|
|
|
|
|
|
|
to reflect the new document. |
|
514
|
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
Filters typically use external programs or modules to do that actual work of |
|
516
|
|
|
|
|
|
|
converting a document from one type to another. For example, programs in the |
|
517
|
|
|
|
|
|
|
Xpdf packages are used for converting PDF files. The filter can (and should) |
|
518
|
|
|
|
|
|
|
test for those programs in its new() method. |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
Filters also can define a type and priority. These attributes are used |
|
521
|
|
|
|
|
|
|
to set the order filters are tested for a content-type match. This allows |
|
522
|
|
|
|
|
|
|
you to have more than one filter that can work on the same content-type. A lower |
|
523
|
|
|
|
|
|
|
priority value is given preference over a higher priority value. |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
If a filter calls die() then the filter is removed from the chain and will not be |
|
526
|
|
|
|
|
|
|
called again I. Calling die when running with -S http or |
|
527
|
|
|
|
|
|
|
-S fs has no effect since the program is run once per document. |
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
Once a filter returns something other than undef no more filters will be |
|
530
|
|
|
|
|
|
|
called. If the filter calls $filter-Eset_continue then processing will |
|
531
|
|
|
|
|
|
|
continue as if the file was not filtered. For example, a filter can uncompress |
|
532
|
|
|
|
|
|
|
data and then set $filter-Eset_continue and let other filters process the |
|
533
|
|
|
|
|
|
|
document. |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
A filter may define the following methods (required methods are indicated): |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=over 4 |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=item new() B |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
This method returns either an object which provides access to the filter, or undefined |
|
543
|
|
|
|
|
|
|
if the filter is not to be used. |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
The new() method is a good place to check for required modules or helper programs. |
|
546
|
|
|
|
|
|
|
Returning undefined prevents the filter from being included in the filter chain. |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
The new method must return a blessed hash reference. The only required attribute |
|
549
|
|
|
|
|
|
|
is B. This attribute must contain a reference to an array of regular |
|
550
|
|
|
|
|
|
|
expressions used for matching the content-type of the document passed in. |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
Example: |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
sub new { |
|
555
|
|
|
|
|
|
|
my ( $class ) = @_; |
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# List of regular expressions |
|
558
|
|
|
|
|
|
|
my @mimetypes = ( |
|
559
|
|
|
|
|
|
|
qr[application/(x-)?msword], |
|
560
|
|
|
|
|
|
|
qr[application/worddoc], |
|
561
|
|
|
|
|
|
|
); |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
my %settings = ( |
|
564
|
|
|
|
|
|
|
mimetypes => \@mimetypes, |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# Optional settings |
|
567
|
|
|
|
|
|
|
priority => 20, |
|
568
|
|
|
|
|
|
|
type => 2, |
|
569
|
|
|
|
|
|
|
); |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
return bless \%settings, $class; |
|
572
|
|
|
|
|
|
|
} |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
The attribute "mimetypes" returns an array reference to a list of regular |
|
575
|
|
|
|
|
|
|
expressions. Those patterns are matched against each document's content type. |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=item filter() B |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
This is the function that does the work of converting a document from one content type |
|
580
|
|
|
|
|
|
|
to another. The function is passed the document object. See document object methods |
|
581
|
|
|
|
|
|
|
listed below for what methods may be called on a document. |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
The function can return undefined (or any false value) to indicate that the |
|
584
|
|
|
|
|
|
|
filter did not want to process the document. Other filters will then be tested for |
|
585
|
|
|
|
|
|
|
a content type match. |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
If the document is filtered then the filter must set the new document's content |
|
588
|
|
|
|
|
|
|
type (if it changed) and return either a file name where the document can be found or |
|
589
|
|
|
|
|
|
|
a reference to a scalar containing the document. |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
The filter() method may also return a second value for storing metadata. The value |
|
592
|
|
|
|
|
|
|
is typically a hash ref of name/value pairs. This value can then |
|
593
|
|
|
|
|
|
|
be accessed via the meta_data() method in the SWISH::Filter::Document class. |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
=item type() |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
Returns a number. Filters are sorted (for processing in a specific order) |
|
598
|
|
|
|
|
|
|
and this number is simply the primary key used in sorting. If not specified |
|
599
|
|
|
|
|
|
|
the filter's type used for sorting is 2. |
|
600
|
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
This is an optional method. You can also set the type in your new() constructor |
|
602
|
|
|
|
|
|
|
as shown above. |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=item priority() |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Returns a number. Filters are sorted (for processing in a specific order) |
|
608
|
|
|
|
|
|
|
and this number is simply the secondary key used in sorting. If not specified |
|
609
|
|
|
|
|
|
|
the filter's priority is 50. |
|
610
|
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
This is an optional method. You can also set the priority in your new() constructor |
|
612
|
|
|
|
|
|
|
as shown above. |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=back |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
Again, the point of the type() and priority() methods is to allow setting the sort order |
|
618
|
|
|
|
|
|
|
of the filters. Useful if you have two filters for filtering the same content-type, |
|
619
|
|
|
|
|
|
|
but prefer to use one over the other. Neither are required. |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=head1 EXAMPLE FILTER |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
Here's a module to convert MS Word documents using the program "catdoc": |
|
625
|
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
package SWISH::Filters::Doc2txt; |
|
627
|
|
|
|
|
|
|
use vars qw/ $VERSION /; |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
$VERSION = '0.191'; |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
sub new { |
|
633
|
|
|
|
|
|
|
my ( $class ) = @_; |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
my $self = bless { |
|
636
|
|
|
|
|
|
|
mimetypes => [ qr!application/(x-)?msword! ], |
|
637
|
|
|
|
|
|
|
priority => 50, |
|
638
|
|
|
|
|
|
|
}, $class; |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# check for helpers |
|
642
|
|
|
|
|
|
|
return $self->set_programs( 'catdoc' ); |
|
643
|
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
} |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
sub filter { |
|
648
|
|
|
|
|
|
|
my ( $self, $doc ) = @_; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
my $content = $self->run_catdoc( $doc->fetch_filename ) || return; |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
# update the document's content type |
|
653
|
|
|
|
|
|
|
$filter->set_content_type( 'text/plain' ); |
|
654
|
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
# return the document |
|
656
|
|
|
|
|
|
|
return \$content; |
|
657
|
|
|
|
|
|
|
} |
|
658
|
|
|
|
|
|
|
1; |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
The new() constructor creates a blessed hash which contains an array reference |
|
661
|
|
|
|
|
|
|
of mimetypes patterns that this filter accepts. The priority sets this |
|
662
|
|
|
|
|
|
|
filter to run after any other filters that might handle the same type of content. |
|
663
|
|
|
|
|
|
|
The F function says that we need to call a program called "catdoc". |
|
664
|
|
|
|
|
|
|
The function either returns $self or undefined if catdoc could not be found. |
|
665
|
|
|
|
|
|
|
The F function creates a new method for running catdoc. |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
The filter function runs catdoc passing in the name of the file (if the file is in memory |
|
668
|
|
|
|
|
|
|
a temporary file is created). That F function was created by the |
|
669
|
|
|
|
|
|
|
F call above. |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=cut |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
1; |
|
675
|
|
|
|
|
|
|
__END__ |