| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package App::optex::textconv::msdoc; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.13'; |
|
4
|
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
12
|
use v5.14; |
|
|
1
|
|
|
|
|
4
|
|
|
6
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
29
|
|
|
7
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
78
|
|
|
8
|
1
|
|
|
1
|
|
627
|
use utf8; |
|
|
1
|
|
|
|
|
16
|
|
|
|
1
|
|
|
|
|
5
|
|
|
9
|
1
|
|
|
1
|
|
31
|
use Encode; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
68
|
|
|
10
|
1
|
|
|
1
|
|
6
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
44
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
474
|
use App::optex v0.3; |
|
|
1
|
|
|
|
|
112
|
|
|
|
1
|
|
|
|
|
28
|
|
|
13
|
1
|
|
|
1
|
|
6
|
use App::optex::textconv::Converter 'import'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
6
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw(to_text get_list); |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @CONVERTER = ( |
|
18
|
|
|
|
|
|
|
[ qr/\.doc[xm]$/ => \&to_text ], |
|
19
|
|
|
|
|
|
|
[ qr/\.ppt[xm]$/ => \&to_text ], |
|
20
|
|
|
|
|
|
|
[ qr/\.xls[xm]$/ => \&to_text ], |
|
21
|
|
|
|
|
|
|
); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub xml2text { |
|
24
|
0
|
|
|
0
|
0
|
|
local $_ = shift; |
|
25
|
0
|
|
|
|
|
|
my $type = shift; |
|
26
|
0
|
|
|
|
|
|
my $xml_re = qr/<\?xml\b[^>]*\?>\s*/; |
|
27
|
0
|
0
|
|
|
|
|
return $_ unless /$xml_re/; |
|
28
|
|
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
|
my @xml = grep { length } split /$xml_re/; |
|
|
0
|
|
|
|
|
|
|
|
30
|
0
|
|
|
|
|
|
my @text = map { _xml2text($_, $type) } @xml; |
|
|
0
|
|
|
|
|
|
|
|
31
|
0
|
|
|
|
|
|
join "\n", @text; |
|
32
|
|
|
|
|
|
|
} |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my %param = ( |
|
35
|
|
|
|
|
|
|
docx => { space => 2, separator => "" }, |
|
36
|
|
|
|
|
|
|
docm => { space => 2, separator => "" }, |
|
37
|
|
|
|
|
|
|
xlsx => { space => 1, separator => "\t" }, |
|
38
|
|
|
|
|
|
|
xlsm => { space => 1, separator => "\t" }, |
|
39
|
|
|
|
|
|
|
pptx => { space => 1, separator => "" }, |
|
40
|
|
|
|
|
|
|
pptm => { space => 1, separator => "" }, |
|
41
|
|
|
|
|
|
|
); |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
my $replace_reference = do { |
|
44
|
|
|
|
|
|
|
my %hash = qw( amp & lt < gt > ); |
|
45
|
|
|
|
|
|
|
my @keys = keys %hash; |
|
46
|
|
|
|
|
|
|
my $re = do { local $" = '|'; qr/&(@keys);/ }; |
|
47
|
|
|
|
|
|
|
sub { s/$re/$hash{$1}/g } |
|
48
|
|
|
|
|
|
|
}; |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _xml2text { |
|
51
|
0
|
|
|
0
|
|
|
local $_ = shift; |
|
52
|
0
|
|
|
|
|
|
my $type = shift; |
|
53
|
0
|
0
|
|
|
|
|
my $param = $param{$type} or die; |
|
54
|
|
|
|
|
|
|
|
|
55
|
0
|
|
|
|
|
|
my @p; |
|
56
|
0
|
|
|
|
|
|
while (m{<(?[apw]:p|si)\b[^>]*>(?.*?)\g{tag}>}sg) { |
|
57
|
1
|
|
|
1
|
|
1489
|
my $p = $+{para}; |
|
|
1
|
|
|
|
|
1394
|
|
|
|
1
|
|
|
|
|
408
|
|
|
|
0
|
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
my @s; |
|
59
|
0
|
|
|
|
|
|
while ($p =~ m{ |
|
60
|
|
|
|
|
|
|
(? | ) |
|
61
|
|
|
|
|
|
|
| |
|
62
|
|
|
|
|
|
|
<(?(?:[apw]:)?t)\b[^>]*> (?[^<]*?) \g{tag}> |
|
63
|
|
|
|
|
|
|
}xsg) { |
|
64
|
0
|
0
|
|
|
|
|
if ($+{tab}) { |
|
65
|
0
|
|
|
|
|
|
push @s, " "; |
|
66
|
|
|
|
|
|
|
} else { |
|
67
|
0
|
0
|
|
|
|
|
push @s, $+{text} if $+{text} ne ''; |
|
68
|
|
|
|
|
|
|
} |
|
69
|
|
|
|
|
|
|
} |
|
70
|
0
|
0
|
|
|
|
|
@s or next; |
|
71
|
0
|
|
|
|
|
|
push @p, join($param->{separator}, @s) . ("\n" x $param->{space}); |
|
72
|
|
|
|
|
|
|
} |
|
73
|
0
|
|
|
|
|
|
my $text = join '', @p; |
|
74
|
0
|
|
|
|
|
|
$replace_reference->() for $text; |
|
75
|
0
|
|
|
|
|
|
$text; |
|
76
|
|
|
|
|
|
|
} |
|
77
|
|
|
|
|
|
|
|
|
78
|
1
|
|
|
1
|
|
1160
|
use Archive::Zip 1.37 qw( :ERROR_CODES :CONSTANTS ); |
|
|
1
|
|
|
|
|
82810
|
|
|
|
1
|
|
|
|
|
690
|
|
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub to_text { |
|
81
|
0
|
|
|
0
|
0
|
|
my $file = shift; |
|
82
|
0
|
0
|
|
|
|
|
my $type = ($file =~ /\.((?:doc|xls|ppt)[xm])$/)[0] or return; |
|
83
|
0
|
0
|
|
|
|
|
my $zip = Archive::Zip->new($file) or die; |
|
84
|
0
|
|
|
|
|
|
my @contents; |
|
85
|
0
|
|
|
|
|
|
for my $entry (get_list($zip, $type)) { |
|
86
|
0
|
0
|
|
|
|
|
my $member = $zip->memberNamed($entry) or next; |
|
87
|
0
|
0
|
|
|
|
|
my $xml = $member->contents or next; |
|
88
|
0
|
0
|
|
|
|
|
my $text = xml2text $xml, $type or next; |
|
89
|
0
|
0
|
|
|
|
|
$file = encode 'utf8', $file if utf8::is_utf8($file); |
|
90
|
0
|
|
|
|
|
|
push @contents, "[ \"$file\" $entry ]\n\n$text"; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
0
|
|
|
|
|
|
join "\n", @contents; |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub get_list { |
|
96
|
0
|
|
|
0
|
0
|
|
my($zip, $type) = @_; |
|
97
|
0
|
0
|
|
|
|
|
if ($type =~ /^doc[xm]$/) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
98
|
0
|
|
|
|
|
|
map { "word/$_.xml" } qw(document endnotes footnotes); |
|
|
0
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
elsif ($type =~ /^xls[xm]$/) { |
|
101
|
0
|
|
|
|
|
|
map { "xl/$_.xml" } qw(sharedStrings); |
|
|
0
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
} |
|
103
|
|
|
|
|
|
|
elsif ($type =~ /^ppt[xm]$/) { |
|
104
|
0
|
|
|
|
|
|
map { $_->[0] } |
|
105
|
0
|
|
|
|
|
|
sort { $a->[1] <=> $b->[1] } |
|
106
|
0
|
0
|
|
|
|
|
map { m{(ppt/slides/slide(\d+)\.xml)$} ? [ $1, $2 ] : () } |
|
|
0
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
$zip->memberNames; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
1; |