line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: /mirror/perl/File-Extract/trunk/lib/File/Extract.pm 9350 2007-11-18T13:33:38.729170Z daisuke $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Copyright (c) 2005-2007 Daisuke Maki |
4
|
|
|
|
|
|
|
# All rights reserved. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package File::Extract; |
7
|
2
|
|
|
2
|
|
63552
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
114
|
|
8
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
69
|
|
9
|
2
|
|
|
2
|
|
12
|
use base qw(Class::Data::Inheritable); |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
2348
|
|
10
|
2
|
|
|
2
|
|
2549
|
use File::MMagic::XS qw(:compat); |
|
2
|
|
|
|
|
2711
|
|
|
2
|
|
|
|
|
15
|
|
11
|
2
|
|
|
2
|
|
26326
|
use File::Temp(); |
|
2
|
|
|
|
|
80591
|
|
|
2
|
|
|
|
|
1790
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.07000'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
sub new |
15
|
|
|
|
|
|
|
{ |
16
|
1
|
|
|
1
|
1
|
4
|
my $class = shift; |
17
|
1
|
|
|
|
|
5
|
my %args = @_; |
18
|
|
|
|
|
|
|
|
19
|
1
|
|
50
|
|
|
8
|
my $encoding = $args{output_encoding} || 'utf8'; |
20
|
0
|
|
|
|
|
0
|
my @encodings = $args{encodings} ? |
21
|
1
|
0
|
|
|
|
6
|
(ref($args{encodings}) eq 'ARRAY' ? @{$args{encodings}} : $args{encodings}) : (); |
|
|
50
|
|
|
|
|
|
22
|
0
|
|
|
|
|
0
|
my $self = bless { |
23
|
|
|
|
|
|
|
filters => $args{filters}, |
24
|
|
|
|
|
|
|
processors => $args{processors}, |
25
|
|
|
|
|
|
|
magic => |
26
|
|
|
|
|
|
|
$args{file_mmagic_args} ? |
27
|
1
|
50
|
|
|
|
14
|
File::MMagic::XS->new(%{$args{file_mmagic_args}}) : |
28
|
|
|
|
|
|
|
File::MMagic::XS->new(), |
29
|
|
|
|
|
|
|
encodings => \@encodings, |
30
|
|
|
|
|
|
|
output_encoding => $encoding |
31
|
|
|
|
|
|
|
}, $class; |
32
|
|
|
|
|
|
|
|
33
|
1
|
|
|
|
|
612
|
return $self; |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
0
|
|
|
0
|
1
|
0
|
sub magic { shift->{magic} } |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub register_processor |
39
|
|
|
|
|
|
|
{ |
40
|
12
|
|
|
12
|
1
|
24
|
my $class = shift; |
41
|
12
|
|
|
|
|
23
|
my $pkg = shift; |
42
|
|
|
|
|
|
|
|
43
|
12
|
50
|
|
|
|
1144
|
eval "require $pkg" or die; |
44
|
12
|
|
|
|
|
134
|
my $mime = $pkg->mime_type; |
45
|
12
|
|
50
|
|
|
96
|
$class->RegisteredProcessors->{$mime} ||= []; |
46
|
12
|
|
|
|
|
391
|
push @{$class->RegisteredProcessors->{$mime}}, $pkg; |
|
12
|
|
|
|
|
204
|
|
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub register_filter |
50
|
|
|
|
|
|
|
{ |
51
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
52
|
0
|
|
|
|
|
0
|
my $pkg = shift; |
53
|
|
|
|
|
|
|
|
54
|
0
|
0
|
|
|
|
0
|
eval "require $pkg" or die; |
55
|
0
|
|
|
|
|
0
|
my $mime = $pkg->mime_type; |
56
|
0
|
|
0
|
|
|
0
|
$class->RegisteredFilter->{$mime} ||= []; |
57
|
0
|
|
|
|
|
0
|
push @{$class->RegisteredFilter->{$mime}}, $pkg; |
|
0
|
|
|
|
|
0
|
|
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub _processors |
61
|
|
|
|
|
|
|
{ |
62
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
63
|
1
|
|
|
|
|
6
|
my $mime = shift; |
64
|
|
|
|
|
|
|
|
65
|
1
|
|
|
|
|
7
|
my $processors; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# First, check if we have instance specific processors |
68
|
1
|
|
|
|
|
4
|
$processors = $self->{processors}{$mime}; |
69
|
1
|
50
|
|
|
|
14
|
if ($processors) { |
70
|
0
|
|
|
|
|
0
|
return @$processors; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
|
73
|
1
|
|
|
|
|
27
|
$processors = ref($self)->RegisteredProcessors->{$mime}; |
74
|
1
|
50
|
|
|
|
22
|
if ($processors) { |
75
|
1
|
|
|
|
|
8
|
return @$processors; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
0
|
|
|
|
|
0
|
return (); |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub _filters |
82
|
|
|
|
|
|
|
{ |
83
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
84
|
1
|
|
|
|
|
3
|
my $mime = shift; |
85
|
|
|
|
|
|
|
|
86
|
1
|
|
|
|
|
2
|
my $filters; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# First, check if we have instance specific filters |
89
|
1
|
|
|
|
|
4
|
$filters = $self->{filters}{$mime}; |
90
|
1
|
50
|
|
|
|
4
|
if ($filters) { |
91
|
1
|
|
|
|
|
8
|
return @$filters; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
0
|
|
|
|
|
0
|
$filters = ref($self)->RegisteredFilters->{$mime}; |
95
|
0
|
0
|
|
|
|
0
|
if ($filters) { |
96
|
0
|
|
|
|
|
0
|
return @$filters; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
0
|
|
|
|
|
0
|
return (); |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub extract |
103
|
|
|
|
|
|
|
{ |
104
|
1
|
|
|
1
|
1
|
7
|
my $self = shift; |
105
|
1
|
|
|
|
|
5
|
my $file = shift; |
106
|
|
|
|
|
|
|
|
107
|
1
|
|
|
|
|
9
|
my $magic = $self->{magic}; |
108
|
1
|
|
|
|
|
275
|
my $mime = $magic->checktype_filename($file); |
109
|
1
|
50
|
|
|
|
7
|
return unless $mime; |
110
|
1
|
|
|
|
|
2
|
my $o_mime = $mime; |
111
|
|
|
|
|
|
|
|
112
|
1
|
|
|
|
|
2
|
my $tmp; |
113
|
1
|
|
|
|
|
3
|
my $source = $file; |
114
|
1
|
50
|
|
|
|
7
|
if (my @filters = $self->_filters($mime)) { |
115
|
|
|
|
|
|
|
# Filters are applied one after the other, even if that may cause the |
116
|
|
|
|
|
|
|
# underlying MIME type to change (i.e. maybe you are crazy enough to |
117
|
|
|
|
|
|
|
# apply a filter that changes a plain text file to HTML -- god knows |
118
|
|
|
|
|
|
|
# why ;). This may be a bit confusing, since text extractors are |
119
|
|
|
|
|
|
|
# applied from the MIME type of the resulting file. |
120
|
1
|
|
|
|
|
3
|
foreach my $f (@filters) { |
121
|
1
|
|
|
|
|
11
|
$tmp = File::Temp->new(UNLINK => 1); |
122
|
1
|
|
|
|
|
933
|
$f->filter(file => $source, output => $tmp); |
123
|
1
|
|
|
|
|
65
|
$source = $tmp->filename; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
1
|
|
|
|
|
4253
|
$tmp->flush; |
127
|
1
|
|
|
|
|
211
|
$mime = $magic->checktype_filename($source); |
128
|
1
|
50
|
|
|
|
13
|
return unless $mime; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
1
|
50
|
|
|
|
107
|
if (my @processors = $self->_processors($mime)) { |
132
|
1
|
|
|
|
|
6
|
foreach my $pkg (@processors) { |
133
|
1
|
|
|
|
|
42
|
my $p = $pkg->new( |
134
|
|
|
|
|
|
|
encodings => $self->{encodings}, |
135
|
|
|
|
|
|
|
output_encoding => $self->{output_encoding} |
136
|
|
|
|
|
|
|
); |
137
|
1
|
|
|
|
|
5
|
my $r = eval { $p->extract($source) }; |
|
1
|
|
|
|
|
16
|
|
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Restore the original mime type of the source file. This is |
140
|
|
|
|
|
|
|
# required because we might have passed through several filters |
141
|
1
|
50
|
|
|
|
54
|
if ($r) { |
142
|
1
|
50
|
|
|
|
7
|
if ($source ne $file) { |
143
|
1
|
|
|
|
|
9
|
$r->filename($file); |
144
|
1
|
|
|
|
|
70
|
$r->mime_type($o_mime); |
145
|
|
|
|
|
|
|
} |
146
|
1
|
|
|
|
|
104
|
return $r; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
return undef; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
BEGIN |
155
|
|
|
|
|
|
|
{ |
156
|
2
|
|
|
2
|
|
27
|
__PACKAGE__->mk_classdata('RegisteredFilters'); |
157
|
2
|
|
|
|
|
54
|
__PACKAGE__->mk_classdata('RegisteredProcessors'); |
158
|
2
|
|
|
|
|
47
|
__PACKAGE__->RegisteredFilters({}); |
159
|
2
|
|
|
|
|
22
|
__PACKAGE__->RegisteredProcessors({}); |
160
|
|
|
|
|
|
|
|
161
|
2
|
|
|
|
|
15
|
my @p = qw( |
162
|
|
|
|
|
|
|
File::Extract::Excel |
163
|
|
|
|
|
|
|
File::Extract::HTML |
164
|
|
|
|
|
|
|
File::Extract::MP3 |
165
|
|
|
|
|
|
|
File::Extract::PDF |
166
|
|
|
|
|
|
|
File::Extract::Plain |
167
|
|
|
|
|
|
|
File::Extract::RTF |
168
|
|
|
|
|
|
|
); |
169
|
2
|
|
|
|
|
4
|
foreach my $p (@p) { |
170
|
12
|
|
|
|
|
147
|
__PACKAGE__->register_processor($p); |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
1; |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
__END__ |