| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# $Id: /mirror/perl/File-Extract/trunk/lib/File/Extract.pm 9350 2007-11-18T13:33:38.729170Z daisuke $ |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# Copyright (c) 2005-2007 Daisuke Maki |
|
4
|
|
|
|
|
|
|
# All rights reserved. |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package File::Extract; |
|
7
|
2
|
|
|
2
|
|
63552
|
use strict; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
114
|
|
|
8
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
69
|
|
|
9
|
2
|
|
|
2
|
|
12
|
use base qw(Class::Data::Inheritable); |
|
|
2
|
|
|
|
|
8
|
|
|
|
2
|
|
|
|
|
2348
|
|
|
10
|
2
|
|
|
2
|
|
2549
|
use File::MMagic::XS qw(:compat); |
|
|
2
|
|
|
|
|
2711
|
|
|
|
2
|
|
|
|
|
15
|
|
|
11
|
2
|
|
|
2
|
|
26326
|
use File::Temp(); |
|
|
2
|
|
|
|
|
80591
|
|
|
|
2
|
|
|
|
|
1790
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.07000'; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
sub new |
|
15
|
|
|
|
|
|
|
{ |
|
16
|
1
|
|
|
1
|
1
|
4
|
my $class = shift; |
|
17
|
1
|
|
|
|
|
5
|
my %args = @_; |
|
18
|
|
|
|
|
|
|
|
|
19
|
1
|
|
50
|
|
|
8
|
my $encoding = $args{output_encoding} || 'utf8'; |
|
20
|
0
|
|
|
|
|
0
|
my @encodings = $args{encodings} ? |
|
21
|
1
|
0
|
|
|
|
6
|
(ref($args{encodings}) eq 'ARRAY' ? @{$args{encodings}} : $args{encodings}) : (); |
|
|
|
50
|
|
|
|
|
|
|
22
|
0
|
|
|
|
|
0
|
my $self = bless { |
|
23
|
|
|
|
|
|
|
filters => $args{filters}, |
|
24
|
|
|
|
|
|
|
processors => $args{processors}, |
|
25
|
|
|
|
|
|
|
magic => |
|
26
|
|
|
|
|
|
|
$args{file_mmagic_args} ? |
|
27
|
1
|
50
|
|
|
|
14
|
File::MMagic::XS->new(%{$args{file_mmagic_args}}) : |
|
28
|
|
|
|
|
|
|
File::MMagic::XS->new(), |
|
29
|
|
|
|
|
|
|
encodings => \@encodings, |
|
30
|
|
|
|
|
|
|
output_encoding => $encoding |
|
31
|
|
|
|
|
|
|
}, $class; |
|
32
|
|
|
|
|
|
|
|
|
33
|
1
|
|
|
|
|
612
|
return $self; |
|
34
|
|
|
|
|
|
|
} |
|
35
|
|
|
|
|
|
|
|
|
36
|
0
|
|
|
0
|
1
|
0
|
sub magic { shift->{magic} } |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub register_processor |
|
39
|
|
|
|
|
|
|
{ |
|
40
|
12
|
|
|
12
|
1
|
24
|
my $class = shift; |
|
41
|
12
|
|
|
|
|
23
|
my $pkg = shift; |
|
42
|
|
|
|
|
|
|
|
|
43
|
12
|
50
|
|
|
|
1144
|
eval "require $pkg" or die; |
|
44
|
12
|
|
|
|
|
134
|
my $mime = $pkg->mime_type; |
|
45
|
12
|
|
50
|
|
|
96
|
$class->RegisteredProcessors->{$mime} ||= []; |
|
46
|
12
|
|
|
|
|
391
|
push @{$class->RegisteredProcessors->{$mime}}, $pkg; |
|
|
12
|
|
|
|
|
204
|
|
|
47
|
|
|
|
|
|
|
} |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub register_filter |
|
50
|
|
|
|
|
|
|
{ |
|
51
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
|
52
|
0
|
|
|
|
|
0
|
my $pkg = shift; |
|
53
|
|
|
|
|
|
|
|
|
54
|
0
|
0
|
|
|
|
0
|
eval "require $pkg" or die; |
|
55
|
0
|
|
|
|
|
0
|
my $mime = $pkg->mime_type; |
|
56
|
0
|
|
0
|
|
|
0
|
$class->RegisteredFilter->{$mime} ||= []; |
|
57
|
0
|
|
|
|
|
0
|
push @{$class->RegisteredFilter->{$mime}}, $pkg; |
|
|
0
|
|
|
|
|
0
|
|
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub _processors |
|
61
|
|
|
|
|
|
|
{ |
|
62
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
63
|
1
|
|
|
|
|
6
|
my $mime = shift; |
|
64
|
|
|
|
|
|
|
|
|
65
|
1
|
|
|
|
|
7
|
my $processors; |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# First, check if we have instance specific processors |
|
68
|
1
|
|
|
|
|
4
|
$processors = $self->{processors}{$mime}; |
|
69
|
1
|
50
|
|
|
|
14
|
if ($processors) { |
|
70
|
0
|
|
|
|
|
0
|
return @$processors; |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
1
|
|
|
|
|
27
|
$processors = ref($self)->RegisteredProcessors->{$mime}; |
|
74
|
1
|
50
|
|
|
|
22
|
if ($processors) { |
|
75
|
1
|
|
|
|
|
8
|
return @$processors; |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
0
|
|
|
|
|
0
|
return (); |
|
79
|
|
|
|
|
|
|
} |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub _filters |
|
82
|
|
|
|
|
|
|
{ |
|
83
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
84
|
1
|
|
|
|
|
3
|
my $mime = shift; |
|
85
|
|
|
|
|
|
|
|
|
86
|
1
|
|
|
|
|
2
|
my $filters; |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# First, check if we have instance specific filters |
|
89
|
1
|
|
|
|
|
4
|
$filters = $self->{filters}{$mime}; |
|
90
|
1
|
50
|
|
|
|
4
|
if ($filters) { |
|
91
|
1
|
|
|
|
|
8
|
return @$filters; |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
|
|
94
|
0
|
|
|
|
|
0
|
$filters = ref($self)->RegisteredFilters->{$mime}; |
|
95
|
0
|
0
|
|
|
|
0
|
if ($filters) { |
|
96
|
0
|
|
|
|
|
0
|
return @$filters; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
|
|
99
|
0
|
|
|
|
|
0
|
return (); |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub extract |
|
103
|
|
|
|
|
|
|
{ |
|
104
|
1
|
|
|
1
|
1
|
7
|
my $self = shift; |
|
105
|
1
|
|
|
|
|
5
|
my $file = shift; |
|
106
|
|
|
|
|
|
|
|
|
107
|
1
|
|
|
|
|
9
|
my $magic = $self->{magic}; |
|
108
|
1
|
|
|
|
|
275
|
my $mime = $magic->checktype_filename($file); |
|
109
|
1
|
50
|
|
|
|
7
|
return unless $mime; |
|
110
|
1
|
|
|
|
|
2
|
my $o_mime = $mime; |
|
111
|
|
|
|
|
|
|
|
|
112
|
1
|
|
|
|
|
2
|
my $tmp; |
|
113
|
1
|
|
|
|
|
3
|
my $source = $file; |
|
114
|
1
|
50
|
|
|
|
7
|
if (my @filters = $self->_filters($mime)) { |
|
115
|
|
|
|
|
|
|
# Filters are applied one after the other, even if that may cause the |
|
116
|
|
|
|
|
|
|
# underlying MIME type to change (i.e. maybe you are crazy enough to |
|
117
|
|
|
|
|
|
|
# apply a filter that changes a plain text file to HTML -- god knows |
|
118
|
|
|
|
|
|
|
# why ;). This may be a bit confusing, since text extractors are |
|
119
|
|
|
|
|
|
|
# applied from the MIME type of the resulting file. |
|
120
|
1
|
|
|
|
|
3
|
foreach my $f (@filters) { |
|
121
|
1
|
|
|
|
|
11
|
$tmp = File::Temp->new(UNLINK => 1); |
|
122
|
1
|
|
|
|
|
933
|
$f->filter(file => $source, output => $tmp); |
|
123
|
1
|
|
|
|
|
65
|
$source = $tmp->filename; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
1
|
|
|
|
|
4253
|
$tmp->flush; |
|
127
|
1
|
|
|
|
|
211
|
$mime = $magic->checktype_filename($source); |
|
128
|
1
|
50
|
|
|
|
13
|
return unless $mime; |
|
129
|
|
|
|
|
|
|
} |
|
130
|
|
|
|
|
|
|
|
|
131
|
1
|
50
|
|
|
|
107
|
if (my @processors = $self->_processors($mime)) { |
|
132
|
1
|
|
|
|
|
6
|
foreach my $pkg (@processors) { |
|
133
|
1
|
|
|
|
|
42
|
my $p = $pkg->new( |
|
134
|
|
|
|
|
|
|
encodings => $self->{encodings}, |
|
135
|
|
|
|
|
|
|
output_encoding => $self->{output_encoding} |
|
136
|
|
|
|
|
|
|
); |
|
137
|
1
|
|
|
|
|
5
|
my $r = eval { $p->extract($source) }; |
|
|
1
|
|
|
|
|
16
|
|
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Restore the original mime type of the source file. This is |
|
140
|
|
|
|
|
|
|
# required because we might have passed through several filters |
|
141
|
1
|
50
|
|
|
|
54
|
if ($r) { |
|
142
|
1
|
50
|
|
|
|
7
|
if ($source ne $file) { |
|
143
|
1
|
|
|
|
|
9
|
$r->filename($file); |
|
144
|
1
|
|
|
|
|
70
|
$r->mime_type($o_mime); |
|
145
|
|
|
|
|
|
|
} |
|
146
|
1
|
|
|
|
|
104
|
return $r; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
return undef; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
BEGIN |
|
155
|
|
|
|
|
|
|
{ |
|
156
|
2
|
|
|
2
|
|
27
|
__PACKAGE__->mk_classdata('RegisteredFilters'); |
|
157
|
2
|
|
|
|
|
54
|
__PACKAGE__->mk_classdata('RegisteredProcessors'); |
|
158
|
2
|
|
|
|
|
47
|
__PACKAGE__->RegisteredFilters({}); |
|
159
|
2
|
|
|
|
|
22
|
__PACKAGE__->RegisteredProcessors({}); |
|
160
|
|
|
|
|
|
|
|
|
161
|
2
|
|
|
|
|
15
|
my @p = qw( |
|
162
|
|
|
|
|
|
|
File::Extract::Excel |
|
163
|
|
|
|
|
|
|
File::Extract::HTML |
|
164
|
|
|
|
|
|
|
File::Extract::MP3 |
|
165
|
|
|
|
|
|
|
File::Extract::PDF |
|
166
|
|
|
|
|
|
|
File::Extract::Plain |
|
167
|
|
|
|
|
|
|
File::Extract::RTF |
|
168
|
|
|
|
|
|
|
); |
|
169
|
2
|
|
|
|
|
4
|
foreach my $p (@p) { |
|
170
|
12
|
|
|
|
|
147
|
__PACKAGE__->register_processor($p); |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
1; |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
__END__ |