line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package App::optex::textconv::ooxml::regex; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '0.1401'; |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
12
|
use v5.14; |
|
1
|
|
|
|
|
4
|
|
6
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
24
|
|
7
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
62
|
|
8
|
1
|
|
|
1
|
|
681
|
use utf8; |
|
1
|
|
|
|
|
16
|
|
|
1
|
|
|
|
|
6
|
|
9
|
1
|
|
|
1
|
|
31
|
use Encode; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
70
|
|
10
|
1
|
|
|
1
|
|
7
|
use Data::Dumper; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
43
|
|
11
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
6
|
use App::optex v0.3; |
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
23
|
|
13
|
1
|
|
|
1
|
|
5
|
use App::optex::textconv::Converter 'import'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
16
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw(to_text get_list); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our @CONVERTER = ( |
18
|
|
|
|
|
|
|
[ qr/\.doc[xm]$/ => \&to_text ], |
19
|
|
|
|
|
|
|
[ qr/\.ppt[xm]$/ => \&to_text ], |
20
|
|
|
|
|
|
|
[ qr/\.xls[xm]$/ => \&to_text ], |
21
|
|
|
|
|
|
|
); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub xml2text { |
24
|
0
|
|
|
0
|
0
|
|
local $_ = shift; |
25
|
0
|
|
|
|
|
|
my $type = shift; |
26
|
0
|
|
|
|
|
|
my $xml_re = qr/<\?xml\b[^>]*\?>\s*/; |
27
|
0
|
0
|
|
|
|
|
return $_ unless /$xml_re/; |
28
|
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
|
my @xml = grep { length } split /$xml_re/; |
|
0
|
|
|
|
|
|
|
30
|
0
|
|
|
|
|
|
my @text = map { _xml2text($_, $type) } @xml; |
|
0
|
|
|
|
|
|
|
31
|
0
|
|
|
|
|
|
join "\n", @text; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my %param = ( |
35
|
|
|
|
|
|
|
docx => { space => 2, separator => "" }, |
36
|
|
|
|
|
|
|
docm => { space => 2, separator => "" }, |
37
|
|
|
|
|
|
|
xlsx => { space => 1, separator => "\t" }, |
38
|
|
|
|
|
|
|
xlsm => { space => 1, separator => "\t" }, |
39
|
|
|
|
|
|
|
pptx => { space => 1, separator => "" }, |
40
|
|
|
|
|
|
|
pptm => { space => 1, separator => "" }, |
41
|
|
|
|
|
|
|
); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
my $replace_reference = do { |
44
|
|
|
|
|
|
|
my %hash = qw( amp & lt < gt > ); |
45
|
|
|
|
|
|
|
my @keys = keys %hash; |
46
|
|
|
|
|
|
|
my $re = do { local $" = '|'; qr/&(@keys);/ }; |
47
|
|
|
|
|
|
|
sub { s/$re/$hash{$1}/g } |
48
|
|
|
|
|
|
|
}; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _xml2text { |
51
|
0
|
|
|
0
|
|
|
local $_ = shift; |
52
|
0
|
|
|
|
|
|
my $type = shift; |
53
|
0
|
0
|
|
|
|
|
my $param = $param{$type} or die; |
54
|
|
|
|
|
|
|
|
55
|
0
|
|
|
|
|
|
my @p; |
56
|
0
|
|
|
|
|
|
while (m{<(?[apw]:p|si)\b[^>]*>(?.*?)\g{tag}>}sg) { |
57
|
1
|
|
|
1
|
|
475
|
my $p = $+{para}; |
|
1
|
|
|
|
|
376
|
|
|
1
|
|
|
|
|
246
|
|
|
0
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
my @s; |
59
|
0
|
|
|
|
|
|
while ($p =~ m{ |
60
|
|
|
|
|
|
|
(? | ) |
61
|
|
|
|
|
|
|
| |
62
|
|
|
|
|
|
|
<(?(?:[apw]:)?t)\b[^>]*> (?[^<]*?) \g{tag}> |
63
|
|
|
|
|
|
|
}xsg) { |
64
|
0
|
0
|
|
|
|
|
if ($+{tab}) { |
65
|
0
|
|
|
|
|
|
push @s, " "; |
66
|
|
|
|
|
|
|
} else { |
67
|
0
|
0
|
|
|
|
|
push @s, $+{text} if $+{text} ne ''; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
} |
70
|
0
|
0
|
|
|
|
|
@s or next; |
71
|
0
|
|
|
|
|
|
push @p, join($param->{separator}, @s) . ("\n" x $param->{space}); |
72
|
|
|
|
|
|
|
} |
73
|
0
|
|
|
|
|
|
my $text = join '', @p; |
74
|
0
|
|
|
|
|
|
$replace_reference->() for $text; |
75
|
0
|
|
|
|
|
|
$text; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
1
|
|
|
1
|
|
1542
|
use Archive::Zip 1.37 qw( :ERROR_CODES :CONSTANTS ); |
|
1
|
|
|
|
|
79709
|
|
|
1
|
|
|
|
|
680
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub to_text { |
81
|
0
|
|
|
0
|
0
|
|
my $file = shift; |
82
|
0
|
0
|
|
|
|
|
my $type = ($file =~ /\.((?:doc|xls|ppt)[xm])$/)[0] or return; |
83
|
0
|
0
|
|
|
|
|
my $zip = Archive::Zip->new($file) or die; |
84
|
0
|
|
|
|
|
|
my @contents; |
85
|
0
|
|
|
|
|
|
for my $entry (get_list($zip, $type)) { |
86
|
0
|
0
|
|
|
|
|
my $member = $zip->memberNamed($entry) or next; |
87
|
0
|
0
|
|
|
|
|
my $xml = $member->contents or next; |
88
|
0
|
0
|
|
|
|
|
my $text = xml2text $xml, $type or next; |
89
|
0
|
0
|
|
|
|
|
$file = encode 'utf8', $file if utf8::is_utf8($file); |
90
|
0
|
|
|
|
|
|
push @contents, "[ \"$file\" $entry ]\n\n$text"; |
91
|
|
|
|
|
|
|
} |
92
|
0
|
|
|
|
|
|
join "\n", @contents; |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub get_list { |
96
|
0
|
|
|
0
|
0
|
|
my($zip, $type) = @_; |
97
|
0
|
0
|
|
|
|
|
if ($type =~ /^doc[xm]$/) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
98
|
0
|
|
|
|
|
|
map { "word/$_.xml" } qw(document endnotes footnotes); |
|
0
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
elsif ($type =~ /^xls[xm]$/) { |
101
|
0
|
|
|
|
|
|
map { "xl/$_.xml" } qw(sharedStrings); |
|
0
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
elsif ($type =~ /^ppt[xm]$/) { |
104
|
0
|
|
|
|
|
|
map { $_->[0] } |
105
|
0
|
|
|
|
|
|
sort { $a->[1] <=> $b->[1] } |
106
|
0
|
0
|
|
|
|
|
map { m{(ppt/slides/slide(\d+)\.xml)$} ? [ $1, $2 ] : () } |
|
0
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
$zip->memberNames; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
1; |