line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Excel::ValueReader::XLSX; |
2
|
1
|
|
|
1
|
|
179625
|
use utf8; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
3
|
1
|
|
|
1
|
|
533
|
use Moose; |
|
1
|
|
|
|
|
660258
|
|
|
1
|
|
|
|
|
7
|
|
4
|
1
|
|
|
1
|
|
7894
|
use Module::Load qw/load/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
5
|
1
|
|
|
1
|
|
605
|
use Date::Calc qw/Add_Delta_Days/; |
|
1
|
|
|
|
|
6793
|
|
|
1
|
|
|
|
|
134
|
|
6
|
1
|
|
|
1
|
|
8
|
use POSIX qw/strftime modf/; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
7
|
1
|
|
|
1
|
|
1554
|
use Carp qw/croak/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
42
|
|
8
|
1
|
|
|
1
|
|
7
|
use feature 'state'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1953
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our $VERSION = '1.09'; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
#====================================================================== |
13
|
|
|
|
|
|
|
# ATTRIBUTES |
14
|
|
|
|
|
|
|
#====================================================================== |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# PUBLIC ATTRIBUTES |
17
|
|
|
|
|
|
|
has 'xlsx' => (is => 'ro', isa => 'Str', required => 1); # path of xlsx file |
18
|
|
|
|
|
|
|
has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class |
19
|
|
|
|
|
|
|
has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y'); |
20
|
|
|
|
|
|
|
has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S'); |
21
|
|
|
|
|
|
|
has 'datetime_format' => (is => 'ro', isa => 'Str', |
22
|
|
|
|
|
|
|
builder => '_datetime_format', lazy => 1); |
23
|
|
|
|
|
|
|
has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]', |
24
|
|
|
|
|
|
|
builder => '_date_formatter', lazy => 1); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED |
29
|
|
|
|
|
|
|
has 'backend' => (is => 'ro', isa => 'Object', init_arg => undef, |
30
|
|
|
|
|
|
|
builder => '_backend', lazy => 1, |
31
|
|
|
|
|
|
|
handles => [qw/values base_year sheets/]); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
#====================================================================== |
34
|
|
|
|
|
|
|
# BUILDING |
35
|
|
|
|
|
|
|
#====================================================================== |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path) |
38
|
|
|
|
|
|
|
around BUILDARGS => sub { |
39
|
|
|
|
|
|
|
my $orig = shift; |
40
|
|
|
|
|
|
|
my $class = shift; |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
if ( @_ == 1 && !ref $_[0] ) { |
43
|
|
|
|
|
|
|
return $class->$orig(xlsx => $_[0]); |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
else { |
46
|
|
|
|
|
|
|
return $class->$orig(@_); |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
}; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
#====================================================================== |
53
|
|
|
|
|
|
|
# ATTRIBUTE CONSTRUCTORS |
54
|
|
|
|
|
|
|
#====================================================================== |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub _backend { |
58
|
12
|
|
|
12
|
|
26
|
my $self = shift; |
59
|
|
|
|
|
|
|
|
60
|
12
|
|
|
|
|
318
|
my $backend_class = ref($self) . '::Backend::' . $self->using; |
61
|
12
|
|
|
|
|
66
|
load $backend_class; |
62
|
|
|
|
|
|
|
|
63
|
12
|
|
|
|
|
945
|
return $backend_class->new(frontend => $self); |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub _datetime_format { |
67
|
10
|
|
|
10
|
|
20
|
my ($self) = @_; |
68
|
10
|
|
|
|
|
236
|
return $self->date_format . ' ' . $self->time_format; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub _date_formatter { |
72
|
10
|
|
|
10
|
|
26
|
my ($self) = @_; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# local copies of the various formats so that we can build a closure |
75
|
10
|
|
|
|
|
261
|
my @formats = (undef, # 0 -- error |
76
|
|
|
|
|
|
|
$self->date_format, # 1 -- just a date |
77
|
|
|
|
|
|
|
$self->time_format, # 2 -- just a time |
78
|
|
|
|
|
|
|
$self->datetime_format); # 3 -- date and time |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
my $strftime_formatter = sub { |
81
|
156
|
|
|
156
|
|
363
|
my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# choose the proper format for strftime |
84
|
156
|
|
|
|
|
197
|
my $ix = 0; # index into the @formats array |
85
|
156
|
100
|
|
|
|
462
|
$ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion |
86
|
156
|
100
|
|
|
|
304
|
$ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion |
87
|
156
|
50
|
|
|
|
293
|
my $strftime_format = $formats[$ix] |
88
|
|
|
|
|
|
|
or die "cell with unexpected Excel date format : $xl_date_format"; |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# formatting through strftime |
91
|
156
|
|
|
|
|
4849
|
my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900); |
92
|
|
|
|
|
|
|
|
93
|
156
|
|
|
|
|
602
|
return $formatted_date; |
94
|
10
|
|
|
|
|
57
|
}; |
95
|
|
|
|
|
|
|
|
96
|
10
|
|
|
|
|
260
|
return $strftime_formatter; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
#====================================================================== |
101
|
|
|
|
|
|
|
# GENERAL METHODS |
102
|
|
|
|
|
|
|
#====================================================================== |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub sheet_names { |
106
|
2
|
|
|
2
|
1
|
4026
|
my ($self) = @_; |
107
|
|
|
|
|
|
|
|
108
|
2
|
|
|
|
|
13
|
my $sheets = $self->sheets; # arrayref of shape {$name => $sheet_position} |
109
|
|
|
|
|
|
|
|
110
|
2
|
|
|
|
|
16
|
my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets; |
|
18
|
|
|
|
|
56
|
|
111
|
2
|
|
|
|
|
11
|
return @sorted_names; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub A1_to_num { # convert Excel A1 reference format to a number |
116
|
2404
|
|
|
2404
|
0
|
25250
|
my ($self, $string) = @_; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# ordinal number for character just before 'A' |
119
|
2404
|
|
|
|
|
3384
|
state $base = ord('A') - 1; |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# iterate on 'digits' (letters of the A1 cell reference) |
122
|
2404
|
|
|
|
|
3197
|
my $num = 0; |
123
|
2404
|
|
|
|
|
5457
|
foreach my $digit (map {ord($_) - $base} split //, $string) { |
|
2404
|
|
|
|
|
6646
|
|
124
|
2404
|
|
|
|
|
4486
|
$num = $num*26 + $digit; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
2404
|
|
|
|
|
7619
|
return $num; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
sub formatted_date { |
132
|
156
|
|
|
156
|
0
|
16606
|
my ($self, $val, $date_format, $date_formatter) = @_; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# separate date (integer part) from time (fractional part) |
135
|
156
|
|
|
|
|
555
|
my ($time, $n_days) = modf($val); |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Convert $n_days into a date in Date::Calc format (year, month, day). |
138
|
|
|
|
|
|
|
# The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while |
139
|
|
|
|
|
|
|
# in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system, |
140
|
|
|
|
|
|
|
# Excel treats 1900 as a leap year. |
141
|
156
|
|
|
|
|
482
|
my $base_year = $self->base_year; |
142
|
156
|
100
|
|
|
|
321
|
if ($base_year == 1900) { |
143
|
114
|
|
|
|
|
261
|
my $is_after_february_1900 = $n_days > 60; |
144
|
114
|
100
|
|
|
|
379
|
$n_days -= $is_after_february_1900 ? 2 : 1; |
145
|
|
|
|
|
|
|
} |
146
|
156
|
|
|
|
|
515
|
my @d = Add_Delta_Days($base_year, 1, 1, $n_days); |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# decode the fractional part (the time) into hours, minutes, seconds, milliseconds |
149
|
156
|
|
|
|
|
217
|
my @t; |
150
|
156
|
|
|
|
|
240
|
foreach my $subdivision (24, 60, 60, 1000) { |
151
|
624
|
|
|
|
|
787
|
$time *= $subdivision; |
152
|
624
|
|
|
|
|
1124
|
($time, my $time_portion) = modf($time); |
153
|
624
|
|
|
|
|
858
|
push @t, $time_portion; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second |
157
|
156
|
|
|
|
|
393
|
my ($h, $m, $s, $ms) = @t; |
158
|
156
|
100
|
|
|
|
276
|
if ($ms == 999) { |
159
|
2
|
|
|
|
|
7
|
$s += 1, $ms = 0; |
160
|
2
|
50
|
|
|
|
8
|
if ($s == 60) { |
161
|
0
|
|
|
|
|
0
|
$m += 1, $s = 0; |
162
|
0
|
0
|
|
|
|
0
|
if ($m == 60) { |
163
|
0
|
|
|
|
|
0
|
$h += 1, $m = 0; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
# NOTE : because of this hack, theoretically we could end up with a value |
168
|
|
|
|
|
|
|
# like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different |
169
|
|
|
|
|
|
|
# in its rendering. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# call the date_formatter subroutine |
172
|
156
|
50
|
33
|
|
|
4487
|
$date_formatter //= $self->date_formatter |
173
|
|
|
|
|
|
|
or die ref($self) . " has no date_formatter subroutine"; |
174
|
156
|
|
|
|
|
423
|
my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms); |
175
|
|
|
|
|
|
|
|
176
|
156
|
|
|
|
|
612
|
return $formatted_date; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
#====================================================================== |
180
|
|
|
|
|
|
|
# METHODS FOR PARSING EXCEL TABLES |
181
|
|
|
|
|
|
|
#====================================================================== |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
sub table_names { |
185
|
2
|
|
|
2
|
1
|
9095
|
my ($self) = @_; |
186
|
|
|
|
|
|
|
|
187
|
2
|
|
|
|
|
73
|
my $table_info = $self->backend->table_info; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# sort on table id (field [1] in table_info arrayrefs) |
190
|
2
|
|
|
|
|
15
|
my @table_names = sort {$table_info->{$a}[1] <=> $table_info->{$b}->[1]} keys %$table_info; |
|
16
|
|
|
|
|
40
|
|
191
|
|
|
|
|
|
|
|
192
|
2
|
|
|
|
|
17
|
return @table_names; |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# info fields returned from the backend parsing methods |
197
|
|
|
|
|
|
|
my @table_info_fields = qw/sheet table_id ref columns no_headers/; |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
# the same fields are also the valid args for the method call |
201
|
|
|
|
|
|
|
my $is_valid_arg = "^(" . join("|", @table_info_fields) . ")\$"; |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
sub table { |
204
|
10
|
|
|
10
|
1
|
9061
|
my $self = shift; |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# syntactic sugar : ->table('foo') is treated as ->table(name => 'foo') |
207
|
10
|
50
|
|
|
|
49
|
my %args = @_ == 1 ? (name => $_[0]) : @_; |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# if called with a table name, derive all other args from internal workbook info |
210
|
10
|
50
|
|
|
|
48
|
if (my $table_name = delete $args{name}) { |
211
|
10
|
|
50
|
|
|
77
|
!$args{$_} or croak "table() : arg '$_' is incompatible with 'name'" for @table_info_fields; |
212
|
10
|
50
|
|
|
|
120
|
@args{@table_info_fields} = @{$self->backend->table_info->{$table_name}} |
|
10
|
|
|
|
|
482
|
|
213
|
|
|
|
|
|
|
or croak "no table info for table: $table_name"; |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# check args |
217
|
10
|
|
|
|
|
34
|
my @invalid_args = grep {!/$is_valid_arg/} keys %args; |
|
50
|
|
|
|
|
587
|
|
218
|
10
|
50
|
|
|
|
31
|
croak "invalid args to table(): " . join ", ", @invalid_args if @invalid_args; |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# get raw values from the sheet |
221
|
10
|
|
|
|
|
60
|
my $values = $self->values($args{sheet}); |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# restrict values to the table subrange (if applicable) |
224
|
10
|
50
|
|
|
|
225
|
$values = $self->_subrange($values, $args{ref}) if $args{ref}; |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# take headers from first row if not already given in $args{columns} |
227
|
10
|
|
33
|
|
|
33
|
$args{columns} //= $values->[0]; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# if this table has headers (which is almost always the case), drop the header row |
230
|
10
|
100
|
|
|
|
26
|
shift @$values unless $args{no_headers}; |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# build a table of hashes. This could be done with a simple map(), but using a loop |
233
|
|
|
|
|
|
|
# avoids to store 2 copies of cell values in memory : @$values is shifted when @table is pushed. |
234
|
10
|
|
|
|
|
35
|
my @cols = @{$args{columns}}; |
|
10
|
|
|
|
|
31
|
|
235
|
10
|
50
|
|
|
|
22
|
croak "table contains undefined columns" if grep {!defined $_} @cols; |
|
32
|
|
|
|
|
66
|
|
236
|
10
|
|
|
|
|
21
|
my @rows; |
237
|
10
|
|
|
|
|
27
|
while (my $vals = shift @$values) { |
238
|
150
|
|
|
|
|
164
|
my %row; |
239
|
150
|
|
|
|
|
574
|
@row{@cols} = @$vals; |
240
|
150
|
|
|
|
|
435
|
push @rows, \%row; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# in scalar context, just return the rows. In list context, also return the column names |
244
|
10
|
100
|
|
|
|
74
|
return wantarray ? (\@cols, \@rows) : \@rows; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub _subrange { |
250
|
10
|
|
|
10
|
|
32
|
my ($self, $values, $ref) = @_; |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# parse rows and columns from the $ref string (of shape like for example "A1:D34") |
253
|
10
|
50
|
|
|
|
101
|
my ($col1, $row1, $col2, $row2) = $ref =~ /^([A-Z]+)(\d+):([A-Z]+)(\d+)$/ |
254
|
|
|
|
|
|
|
or croak "_subrange : invalid ref: $ref"; |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# restrict to the row range |
257
|
10
|
100
|
100
|
|
|
55
|
if ($row1 > 1 || $row2 < @$values){ |
258
|
8
|
|
|
|
|
32
|
$values = [ @$values[$row1-1 .. $row2-1] ]; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# restrict to the column range |
262
|
10
|
|
|
|
|
24
|
my @col_nums = map {$self->A1_to_num($_) - 1} ($col1, $col2); |
|
20
|
|
|
|
|
40
|
|
263
|
10
|
100
|
|
|
|
27
|
if ($col_nums[0] > 1){ # THINK : should check if $colnum2 is smaller that the max row size ?? |
264
|
6
|
|
|
|
|
12
|
my @col_range = ($col_nums[0] .. $col_nums[1]); |
265
|
6
|
|
|
|
|
13
|
$values = [map { [ @$_[@col_range] ]} @$values]; |
|
16
|
|
|
|
|
44
|
|
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
10
|
|
|
|
|
42
|
return $values; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
1; |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
__END__ |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head1 NAME |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head1 SYNOPSIS |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename); |
285
|
|
|
|
|
|
|
# .. or with syntactic sugar : |
286
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new($filename); |
287
|
|
|
|
|
|
|
# .. or with LibXML backend : |
288
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename, |
289
|
|
|
|
|
|
|
using => 'LibXML'); |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
foreach my $sheet_name ($reader->sheet_names) { |
292
|
|
|
|
|
|
|
my $grid = $reader->values($sheet_name); |
293
|
|
|
|
|
|
|
my $n_rows = @$grid; |
294
|
|
|
|
|
|
|
print "sheet $sheet_name has $n_rows rows; ", |
295
|
|
|
|
|
|
|
"first cell contains : ", $grid->[0][0]; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
foreach my $table_name ($reader->table_names) { |
299
|
|
|
|
|
|
|
my ($columns, $rows) = $reader->table($table_name); |
300
|
|
|
|
|
|
|
my $n_data_rows = @$rows; |
301
|
|
|
|
|
|
|
my $n_columns = @$columns; |
302
|
|
|
|
|
|
|
print "table $table_name has $n_data_rows rows and $n_columns columns; ", |
303
|
|
|
|
|
|
|
"column 'foo' in first row contains : ", $rows->[0]{foo}; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=head1 DESCRIPTION |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
This module reads the contents of an Excel file in XLSX format. |
309
|
|
|
|
|
|
|
Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Spreadsheet::XLSX>, |
310
|
|
|
|
|
|
|
there is no support for reading formulas, formats or other Excel internal |
311
|
|
|
|
|
|
|
information; all you get are plain values -- but you get them much |
312
|
|
|
|
|
|
|
faster ! Besides, this module also has support for parsing Excel tables. |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
This front module has two different backends for extracting values : |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
=over |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=item Regex (default) |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
this backend uses regular expressions to parse the XML content. |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
=item LibXML |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
this backend uses L<XML::LibXML::Reader> to parse the XML content. |
325
|
|
|
|
|
|
|
It is probably safer but about three times slower than the Regex backend |
326
|
|
|
|
|
|
|
(but still much faster than L<Spreadsheet::ParseXLSX>). |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=back |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=head1 METHODS |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=head2 new |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename, |
336
|
|
|
|
|
|
|
using => $backend, |
337
|
|
|
|
|
|
|
%date_formatting_options); |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed. |
340
|
|
|
|
|
|
|
The C<using> argument is optional; it specifies the backend to be used for parsing; |
341
|
|
|
|
|
|
|
default is 'Regex'. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
As syntactic sugar, a shorter form is admitted : |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new($filename); |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Optional parameters for formatting date and time values |
348
|
|
|
|
|
|
|
are described in the L</DATE AND TIME FORMATS> section below. |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=head2 sheet_names |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
my @sheets = $reader->sheet_names; |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
Returns the list of worksheet names, in the same order as in the Excel file. |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
=head2 values |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
my $grid = $reader->values($sheet); |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
Returns a bidimensional array of scalars, corresponding to cell |
362
|
|
|
|
|
|
|
values in the specified worksheet. The C<$sheet> argument can be either |
363
|
|
|
|
|
|
|
a sheet name or a sheet position (starting at 1). |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
Unlike the original Excel cells, positions in the grid are zero-based, |
366
|
|
|
|
|
|
|
so for example the content of cell B3 is in C<< $grid->[1][2] >>. |
367
|
|
|
|
|
|
|
The grid is sparse : the size of each row depends on the |
368
|
|
|
|
|
|
|
position of the last non-empty cell in that row. |
369
|
|
|
|
|
|
|
Thanks to Perl's auto-vivification mechanism, any attempt to access |
370
|
|
|
|
|
|
|
a non-existent cell will automatically create the corresponding cell |
371
|
|
|
|
|
|
|
within the grid. The number of rows and columns in the grid can be computed |
372
|
|
|
|
|
|
|
like this : |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
my $nb_rows = @$grid; |
375
|
|
|
|
|
|
|
my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=head2 table_names |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
my @table_names = $reader->table_names; |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
Returns the list of names of table registered in this workbook. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=head2 table |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name) |
387
|
|
|
|
|
|
|
# or |
388
|
|
|
|
|
|
|
my ($columns, $rows) = $reader->table(name => $table_name); |
389
|
|
|
|
|
|
|
# or |
390
|
|
|
|
|
|
|
my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $range] |
391
|
|
|
|
|
|
|
[, columns => \@columns] |
392
|
|
|
|
|
|
|
[, no_headers => 1] |
393
|
|
|
|
|
|
|
); |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
In its simplest form, this method returns the content of an Excel table referenced by its table name |
396
|
|
|
|
|
|
|
(in Excel, the table name appears and can be modified through the Table tools / Design tab). |
397
|
|
|
|
|
|
|
The table name is passed either through the named argument C<name>, or positionally as unique argument |
398
|
|
|
|
|
|
|
to the method. |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
Rows are returned as hashrefs, where keys of the hashes correspond to column names |
401
|
|
|
|
|
|
|
in the table. In scalar context, the method just returns an arrayref to the list of rows. In list |
402
|
|
|
|
|
|
|
context, the method returns a pair, where the first element is an arrayref of column names, and the |
403
|
|
|
|
|
|
|
second element is an arrayref to the list of rows. |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Instead of specifying a table name, it is also possible to give a sheet name or sheet number. |
406
|
|
|
|
|
|
|
By default, this considers the whole sheet content as a single table, where column names |
407
|
|
|
|
|
|
|
are on the first row. However, additional arguments can be supplied to change the default |
408
|
|
|
|
|
|
|
behaviour : |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=over |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=item ref |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
a specific range of cells within the sheet that contain the table rows and columns. |
415
|
|
|
|
|
|
|
The range must be expressed using traditional Excel notation, |
416
|
|
|
|
|
|
|
like for example C<"C9:E23"> (colums 3 to 5, rows 9 to 23). |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=item columns |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
an arrayref containing the list of column names. |
421
|
|
|
|
|
|
|
If absent, column names will be taken from the first row in the table. |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=item no_headers |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
if true, the first row in the table will be treated as a regular data row, instead |
426
|
|
|
|
|
|
|
of being treated as a list of column names. In that case, since column names cannot |
427
|
|
|
|
|
|
|
be inferred from cell values in the first row, the C<columns> argument to the method |
428
|
|
|
|
|
|
|
must be present. |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=back |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=head1 AUXILIARY METHODS |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=head1 A1_to_num |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
my $col_num = $reader->A1_to_num('A'); # 1 |
438
|
|
|
|
|
|
|
$col_num = $reader->A1_to_num('AZ'); # 52 |
439
|
|
|
|
|
|
|
$col_num = $reader->A1_to_num('AA'); # 26 |
440
|
|
|
|
|
|
|
$col_num = $reader->A1_to_num('ABC'); # 731 |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation) |
443
|
|
|
|
|
|
|
into the corresponding numeric value. |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 formatted_date |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
my $date = $reader->formatted_date($numeric_date, $excel_date_format); |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Given a numeric date, this method returns a string date formatted according |
451
|
|
|
|
|
|
|
to the I<date formatter> routine explained in the next section. The C<$excel_date_format> |
452
|
|
|
|
|
|
|
argument should be the Excel format string for that specific cell; it is used |
453
|
|
|
|
|
|
|
only for for deciding if the numeric value should be presented as a date, as a time, |
454
|
|
|
|
|
|
|
or both. Optionally, a custom date formatter callback could be passed as third argument. |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=head1 DATE AND TIME FORMATS |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=head2 Date and time handling |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
In Excel, date and times values are stored as numeric values, where the integer part |
462
|
|
|
|
|
|
|
represents the date, and the fractional part represents the time. What distinguishes |
463
|
|
|
|
|
|
|
such numbers from ordinary numbers is the I<numeric format> applied to the cells |
464
|
|
|
|
|
|
|
where they appear. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
Numeric formats in Excel are complex to reproduce, in particular |
467
|
|
|
|
|
|
|
because they are locale-dependent; therefore the present module does not attempt |
468
|
|
|
|
|
|
|
to faithfully interpret Excel formats. It just infers from formats which |
469
|
|
|
|
|
|
|
cells should be presented as date and/or time values. All such values are then |
470
|
|
|
|
|
|
|
presented through the same I<date_formatter> routine. The default formatter |
471
|
|
|
|
|
|
|
is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter> |
472
|
|
|
|
|
|
|
parameter (explained below). |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=head2 Parameters for the default strftime formatter |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
When using the default strftime formatter, the following parameters may be passed |
477
|
|
|
|
|
|
|
to the constructor : |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=over |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=item date_format |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>. |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
=item time_format |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>. |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=item datetime_format |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
The L<POSIX/strftime> format for representing date and time together. |
492
|
|
|
|
|
|
|
The default is the concatenation of C<date_format> and C<time_format>, with |
493
|
|
|
|
|
|
|
a space inbetween. |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=back |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=head2 Writing a custom formatter |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
A custom algorithm for date formatting can be specified as a parameter to the constructor |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename, |
503
|
|
|
|
|
|
|
date_formatter => sub {...}); |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
If this parameter is C<undef>, date formatting is canceled and therefore date and |
506
|
|
|
|
|
|
|
time values will be presented as plain numbers. |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
If not C<undef>, the date formatting routine will we called as : |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
$date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond); |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
where |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=over |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=item * |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
C<$excel_date_format> is the Excel numbering format associated to that cell, like for example |
519
|
|
|
|
|
|
|
C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description. |
520
|
|
|
|
|
|
|
This is useful to decide if the value should be presented as a date, a time, or both. |
521
|
|
|
|
|
|
|
The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should |
522
|
|
|
|
|
|
|
be presented as a date; if the format contains C<h> or C<s>, it should be presented |
523
|
|
|
|
|
|
|
as a time. The letter C<m> is not taken into consideration because it is ambiguous : |
524
|
|
|
|
|
|
|
depending on the position in the format string, it may represent either a "month" or a "minute". |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=item * |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904, |
529
|
|
|
|
|
|
|
see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>) is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the |
530
|
|
|
|
|
|
|
C<year> component wil always be above this value. |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=item * |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
C<month> is the numeric value of the month, starting at 1 |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=item * |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
C<day> is the numeric value of the day in month, starting at 1 |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=item * |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding |
543
|
|
|
|
|
|
|
numeric values. |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=back |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=head1 CAVEATS |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
=over |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=item * |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
This module was optimized for speed, not for completeness of |
555
|
|
|
|
|
|
|
OOXML-SpreadsheetML support; so there may be some edge cases where the |
556
|
|
|
|
|
|
|
output is incorrect with respect to the original Excel data. |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=item * |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
Embedded newline characters in strings are stored in Excel as C<< \r\n >>, |
561
|
|
|
|
|
|
|
following the old Windows convention. When retrieved through the C<Regex> |
562
|
|
|
|
|
|
|
backend, the result contains the original C<< \r >> and C<< \n >> characters; |
563
|
|
|
|
|
|
|
but when retrieved through the LibXML, C<< \r >> are silently removed by the |
564
|
|
|
|
|
|
|
C<XML::LibXML> package. |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=back |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
=head1 SEE ALSO |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
The official reference for OOXML-SpreadsheetML format is in |
571
|
|
|
|
|
|
|
L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>. |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
Introductory material on XLSX file structure can be found at |
574
|
|
|
|
|
|
|
L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>. |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
The CPAN module L<Data::XLSX::Parser> is claimed to be in alpha stage; |
577
|
|
|
|
|
|
|
it seems to be working but the documentation is insufficient -- I had |
578
|
|
|
|
|
|
|
to inspect the test suite to understand how to use it. |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
Another unpublished but working module for parsing Excel files in Perl |
581
|
|
|
|
|
|
|
can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>. |
582
|
|
|
|
|
|
|
Some test cases were borrowed from that distribution. |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
Conversions from and to Excel internal date format can also be performed |
585
|
|
|
|
|
|
|
through the L<DateTime::Format::Excel> module. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=head1 BENCHMARKS |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
Below are some benchmarks computed with the program C<benchmark.pl> in |
590
|
|
|
|
|
|
|
this distribution. The task was to parse an Excel file of five worksheets |
591
|
|
|
|
|
|
|
with about 62600 rows in total, and report the number of rows per sheet. |
592
|
|
|
|
|
|
|
Reported figures are in seconds. |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
Excel::ValueReader::XLSX::Regex 11 elapsed, 10 cpu, 0 system |
595
|
|
|
|
|
|
|
Excel::ValueReader::XLSX::LibXML 35 elapsed, 34 cpu, 0 system |
596
|
|
|
|
|
|
|
[unpublished] Excel::Reader::XLSX 39 elapsed, 37 cpu, 0 system |
597
|
|
|
|
|
|
|
Spreadsheet::ParseXLSX 244 elapsed, 240 cpu, 1 system |
598
|
|
|
|
|
|
|
Data::XLSX::Parser 37 elapsed, 35 cpu, 0 system |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
These figures show that the regex version is about 3 times faster |
601
|
|
|
|
|
|
|
than the LibXML version, and about 22 times faster than |
602
|
|
|
|
|
|
|
L<Spreadsheet::ParseXLSX>. Tests with a bigger file of about 90000 rows |
603
|
|
|
|
|
|
|
showed similar ratios. |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
Modules |
606
|
|
|
|
|
|
|
C<Excel::Reader::XLSX> (unpublished) and L<Data::XLSX::Parser> |
607
|
|
|
|
|
|
|
are based on L<XML::LibXML> like L<Excel::ValueReader::XLSX::Backend::LibXML>; |
608
|
|
|
|
|
|
|
execution times for those three modules are very close. |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=over |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
=item * |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
David Flink signaled (and fixed) a bug about strings with embedded newline characters |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
=back |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
=head1 AUTHOR |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
Laurent Dami, E<lt>dami at cpan.orgE<gt> |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
Copyright 2020-2022 by Laurent Dami. |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
630
|
|
|
|
|
|
|
it under the same terms as Perl itself. |