| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Excel::ValueReader::XLSX; |
|
2
|
2
|
|
|
2
|
|
369617
|
use 5.12.1; |
|
|
2
|
|
|
|
|
10
|
|
|
3
|
2
|
|
|
2
|
|
13
|
use utf8; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
19
|
|
|
4
|
2
|
|
|
2
|
|
2812
|
use Moose; |
|
|
2
|
|
|
|
|
1319106
|
|
|
|
2
|
|
|
|
|
16
|
|
|
5
|
2
|
|
|
2
|
|
21901
|
use MooseX::StrictConstructor; |
|
|
2
|
|
|
|
|
89902
|
|
|
|
2
|
|
|
|
|
11
|
|
|
6
|
2
|
|
|
2
|
|
26607
|
use Moose::Util::TypeConstraints qw/union/; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
26
|
|
|
7
|
2
|
|
|
2
|
|
1305
|
use Module::Load qw/load/; |
|
|
2
|
|
|
|
|
26
|
|
|
|
2
|
|
|
|
|
28
|
|
|
8
|
2
|
|
|
2
|
|
1748
|
use Date::Calc qw/Add_Delta_Days/; |
|
|
2
|
|
|
|
|
19477
|
|
|
|
2
|
|
|
|
|
281
|
|
|
9
|
2
|
|
|
2
|
|
1439
|
use POSIX qw/strftime modf/; |
|
|
2
|
|
|
|
|
18155
|
|
|
|
2
|
|
|
|
|
16
|
|
|
10
|
2
|
|
|
2
|
|
4646
|
use Carp qw/croak/; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
130
|
|
|
11
|
2
|
|
|
2
|
|
934
|
use Iterator::Simple qw/iter/; |
|
|
2
|
|
|
|
|
3357
|
|
|
|
2
|
|
|
|
|
5757
|
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
#====================================================================== |
|
14
|
|
|
|
|
|
|
# GLOBALS |
|
15
|
|
|
|
|
|
|
#====================================================================== |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '1.17'; |
|
18
|
|
|
|
|
|
|
our %A1_to_num_memoized; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
#====================================================================== |
|
21
|
|
|
|
|
|
|
# TYPES AND ATTRIBUTES |
|
22
|
|
|
|
|
|
|
#====================================================================== |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# TYPES |
|
25
|
|
|
|
|
|
|
my $XlsxSource = union([qw/Str FileHandle/]); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# PUBLIC ATTRIBUTES |
|
28
|
|
|
|
|
|
|
has 'xlsx' => (is => 'ro', isa => $XlsxSource, required => 1); # path of xlsx file |
|
29
|
|
|
|
|
|
|
has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class |
|
30
|
|
|
|
|
|
|
has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y'); |
|
31
|
|
|
|
|
|
|
has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S'); |
|
32
|
|
|
|
|
|
|
has 'datetime_format' => (is => 'ro', isa => 'Str', builder => '_datetime_format', lazy => 1); |
|
33
|
|
|
|
|
|
|
has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]', builder => '_date_formatter', lazy => 1); |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED |
|
36
|
|
|
|
|
|
|
has 'backend' => (is => 'ro', isa => 'Object', builder => '_backend', lazy => 1, |
|
37
|
|
|
|
|
|
|
init_arg => undef, handles => [qw/base_year sheets active_sheet table_info/]); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
#====================================================================== |
|
40
|
|
|
|
|
|
|
# BUILDING |
|
41
|
|
|
|
|
|
|
#====================================================================== |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path) |
|
44
|
|
|
|
|
|
|
around BUILDARGS => sub { |
|
45
|
|
|
|
|
|
|
my $orig = shift; |
|
46
|
|
|
|
|
|
|
my $class = shift; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
unshift @_, 'xlsx' if scalar(@_) % 2 and $XlsxSource->check($_[0]); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
$class->$orig(@_); |
|
51
|
|
|
|
|
|
|
}; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
#====================================================================== |
|
55
|
|
|
|
|
|
|
# ATTRIBUTE CONSTRUCTORS |
|
56
|
|
|
|
|
|
|
#====================================================================== |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub _backend { |
|
59
|
30
|
|
|
30
|
|
63
|
my $self = shift; |
|
60
|
|
|
|
|
|
|
|
|
61
|
30
|
|
|
|
|
1089
|
my $backend_class = ref($self) . '::Backend::' . $self->using; |
|
62
|
30
|
|
|
|
|
238
|
load $backend_class; |
|
63
|
|
|
|
|
|
|
|
|
64
|
30
|
|
|
|
|
2912
|
return $backend_class->new(frontend => $self); |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
sub _datetime_format { |
|
68
|
24
|
|
|
24
|
|
63
|
my ($self) = @_; |
|
69
|
24
|
|
|
|
|
856
|
return $self->date_format . ' ' . $self->time_format; |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub _date_formatter { |
|
73
|
24
|
|
|
24
|
|
60
|
my ($self) = @_; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# local copies of the various formats so that we can build a closure |
|
76
|
24
|
|
|
|
|
871
|
my @formats = (undef, # 0 -- error |
|
77
|
|
|
|
|
|
|
$self->date_format, # 1 -- just a date |
|
78
|
|
|
|
|
|
|
$self->time_format, # 2 -- just a time |
|
79
|
|
|
|
|
|
|
$self->datetime_format); # 3 -- date and time |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
my $strftime_formatter = sub { |
|
82
|
312
|
|
|
312
|
|
906
|
my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_; |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# choose the proper format for strftime |
|
85
|
312
|
|
|
|
|
470
|
my $ix = 0; # index into the @formats array |
|
86
|
312
|
100
|
|
|
|
1161
|
$ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion |
|
87
|
312
|
100
|
|
|
|
852
|
$ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion |
|
88
|
312
|
50
|
|
|
|
983
|
my $strftime_format = $formats[$ix] |
|
89
|
|
|
|
|
|
|
or die "cell with unexpected Excel date format : $xl_date_format"; |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# formatting through strftime |
|
92
|
312
|
|
|
|
|
26850
|
my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900); |
|
93
|
|
|
|
|
|
|
|
|
94
|
312
|
|
|
|
|
1246
|
return $formatted_date; |
|
95
|
24
|
|
|
|
|
208
|
}; |
|
96
|
|
|
|
|
|
|
|
|
97
|
24
|
|
|
|
|
891
|
return $strftime_formatter; |
|
98
|
|
|
|
|
|
|
} |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
#====================================================================== |
|
102
|
|
|
|
|
|
|
# GENERAL METHODS |
|
103
|
|
|
|
|
|
|
#====================================================================== |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
sub sheet_names { |
|
107
|
6
|
|
|
6
|
1
|
16356
|
my ($self) = @_; |
|
108
|
|
|
|
|
|
|
|
|
109
|
6
|
|
|
|
|
42
|
my $sheets = $self->sheets; # hashref of shape {$name => $sheet_position} |
|
110
|
|
|
|
|
|
|
|
|
111
|
6
|
|
|
|
|
61
|
my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets; |
|
|
81
|
|
|
|
|
161
|
|
|
112
|
6
|
|
|
|
|
45
|
return @sorted_names; |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
|
|
116
|
44
|
|
|
44
|
1
|
81341
|
sub values { my ($self, $sheet) = @_; $self->backend->_values($sheet, 0)} |
|
|
44
|
|
|
|
|
1924
|
|
|
117
|
44
|
|
|
44
|
1
|
78212
|
sub ivalues { my ($self, $sheet) = @_; $self->backend->_values($sheet, 1)} |
|
|
44
|
|
|
|
|
2008
|
|
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub formatted_date { |
|
120
|
312
|
|
|
312
|
1
|
21596
|
my ($self, $val, $date_format, $date_formatter) = @_; |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# separate date (integer part) from time (fractional part) |
|
123
|
312
|
|
|
|
|
1356
|
my ($time, $n_days) = modf($val); |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Convert $n_days into a date in Date::Calc format (year, month, day). |
|
126
|
|
|
|
|
|
|
# The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while |
|
127
|
|
|
|
|
|
|
# in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system, |
|
128
|
|
|
|
|
|
|
# Excel treats 1900 as a leap year. |
|
129
|
312
|
|
|
|
|
1225
|
my $base_year = $self->base_year; |
|
130
|
312
|
100
|
|
|
|
816
|
if ($base_year == 1900) { |
|
131
|
228
|
|
|
|
|
515
|
my $is_after_february_1900 = $n_days > 60; |
|
132
|
228
|
100
|
|
|
|
549
|
$n_days -= $is_after_february_1900 ? 2 : 1; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
312
|
|
|
|
|
1434
|
my @d = Add_Delta_Days($base_year, 1, 1, $n_days); |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# decode the fractional part (the time) into hours, minutes, seconds, milliseconds |
|
137
|
312
|
|
|
|
|
511
|
my @t; |
|
138
|
312
|
|
|
|
|
636
|
foreach my $subdivision (24, 60, 60, 1000) { |
|
139
|
1248
|
|
|
|
|
1846
|
$time *= $subdivision; |
|
140
|
1248
|
|
|
|
|
2643
|
($time, my $time_portion) = modf($time); |
|
141
|
1248
|
|
|
|
|
2381
|
push @t, $time_portion; |
|
142
|
|
|
|
|
|
|
} |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second |
|
145
|
312
|
|
|
|
|
656
|
my ($h, $m, $s, $ms) = @t; |
|
146
|
312
|
100
|
|
|
|
712
|
if ($ms == 999) { |
|
147
|
4
|
|
|
|
|
15
|
$s += 1, $ms = 0; |
|
148
|
4
|
50
|
|
|
|
15
|
if ($s == 60) { |
|
149
|
0
|
|
|
|
|
0
|
$m += 1, $s = 0; |
|
150
|
0
|
0
|
|
|
|
0
|
if ($m == 60) { |
|
151
|
0
|
|
|
|
|
0
|
$h += 1, $m = 0; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
# NOTE : because of this hack, theoretically we could end up with a value |
|
156
|
|
|
|
|
|
|
# like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different |
|
157
|
|
|
|
|
|
|
# in its rendering. |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# call the date_formatter subroutine |
|
160
|
312
|
50
|
33
|
|
|
12959
|
$date_formatter //= $self->date_formatter |
|
161
|
|
|
|
|
|
|
or die ref($self) . " has no date_formatter subroutine"; |
|
162
|
312
|
|
|
|
|
828
|
my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms); |
|
163
|
|
|
|
|
|
|
|
|
164
|
312
|
|
|
|
|
1597
|
return $formatted_date; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
#====================================================================== |
|
168
|
|
|
|
|
|
|
# METHODS FOR PARSING EXCEL TABLES |
|
169
|
|
|
|
|
|
|
#====================================================================== |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub table_names { |
|
173
|
4
|
|
|
4
|
1
|
13492
|
my ($self) = @_; |
|
174
|
|
|
|
|
|
|
|
|
175
|
4
|
|
|
|
|
32
|
my $table_info = $self->table_info; |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# sort on table id (field [1] in table_info arrayrefs) |
|
178
|
4
|
|
|
|
|
44
|
my @table_names = sort {$table_info->{$a}{id} <=> $table_info->{$b}{id}} keys %$table_info; |
|
|
38
|
|
|
|
|
128
|
|
|
179
|
|
|
|
|
|
|
|
|
180
|
4
|
|
|
|
|
37
|
return @table_names; |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
|
|
183
|
20
|
|
|
20
|
1
|
23482
|
sub table {shift->_table(0, @_);} # 0 = does not want an iterator, just a regular arrayref |
|
184
|
20
|
|
|
20
|
0
|
25345
|
sub itable {shift->_table(1, @_);} # 1 = does want an iterator |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
my %valid_table_arg = map {$_ => 1} qw/name sheet ref columns no_headers with_totals want_records/; |
|
187
|
|
|
|
|
|
|
sub _table { |
|
188
|
40
|
|
|
40
|
|
92
|
my $self = shift; |
|
189
|
40
|
|
|
|
|
70
|
my $want_iterator = shift; |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# syntactic sugar : ->table('foo') is treated as ->table(name => 'foo') |
|
192
|
40
|
100
|
|
|
|
217
|
unshift @_, 'name' if scalar(@_) % 2; |
|
193
|
40
|
|
|
|
|
201
|
my %args = @_; |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# check for invalid args |
|
196
|
40
|
|
|
|
|
142
|
my @invalid_args = grep {!$valid_table_arg{$_}} keys %args; |
|
|
56
|
|
|
|
|
213
|
|
|
197
|
40
|
50
|
|
|
|
146
|
croak "invalid table args: ", join(", ", @invalid_args) if @invalid_args; |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# defaults |
|
200
|
40
|
|
100
|
|
|
243
|
$args{want_records} //= 1; |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# if called with a table name, derive positional args from the internal workbook info |
|
203
|
40
|
100
|
|
|
|
125
|
if (my $table_name = delete $args{name}) { |
|
204
|
32
|
50
|
|
|
|
177
|
my $table_info = $self->table_info->{$table_name} |
|
205
|
|
|
|
|
|
|
or croak sprintf "Excel file '%s' contains no table named '%s'", $self->xlsx, $table_name; |
|
206
|
32
|
|
66
|
|
|
655
|
$args{$_} //= $table_info->{$_} for keys %$table_info; |
|
207
|
|
|
|
|
|
|
} |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# get values from the sheet |
|
210
|
|
|
|
|
|
|
my ($sheet_ref, $vals_or_it) = $want_iterator ? $self->ivalues($args{sheet}) |
|
211
|
40
|
100
|
|
|
|
217
|
: $self->values($args{sheet}); |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# table boundaries |
|
214
|
40
|
|
|
|
|
522
|
my ($scol1, $srow1, $scol2, $srow2) = $self->range_from_ref($sheet_ref); |
|
215
|
|
|
|
|
|
|
my ($tcol1, $trow1, $tcol2, $trow2) = $args{ref} ? $self->range_from_ref($args{ref}) |
|
216
|
40
|
100
|
|
|
|
173
|
: ($scol1, $srow1, $scol2, $srow2); |
|
217
|
40
|
|
|
|
|
110
|
my $skip_initial_rows = $trow1 - $srow1; |
|
218
|
40
|
|
|
|
|
91
|
my $keep_rows = $trow2 - $trow1 + 1; |
|
219
|
40
|
|
|
|
|
64
|
my $skip_initial_cols = $tcol1 - $scol1; |
|
220
|
40
|
|
|
|
|
107
|
my $keep_cols = $tcol2 - $tcol1 + 1; |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# if a totals row is present, skip it, unless the 'with_totals' arg is present |
|
223
|
40
|
100
|
100
|
|
|
172
|
$keep_rows -=1 if $args{has_totals} and !$args{with_totals}; |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# skip initial rows if the table does not start at top row |
|
226
|
40
|
100
|
|
|
|
103
|
if ($skip_initial_rows) { |
|
227
|
20
|
100
|
|
|
|
48
|
if ($want_iterator) {$vals_or_it->() while $skip_initial_rows-- > 0;} |
|
|
10
|
|
|
|
|
50
|
|
|
228
|
10
|
|
|
|
|
40
|
else {splice @$vals_or_it, 0, $skip_initial_rows} |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# read headers from first row -- even if this may be redundant with the 'columns' list declared in the table description |
|
232
|
40
|
|
|
|
|
115
|
my $headers; |
|
233
|
40
|
100
|
|
|
|
115
|
unless ($args{no_headers}) { |
|
234
|
36
|
|
|
|
|
63
|
$keep_rows--; |
|
235
|
36
|
100
|
|
|
|
106
|
$headers = $want_iterator ? $vals_or_it->() : shift @$vals_or_it; |
|
236
|
36
|
100
|
|
|
|
101
|
splice @$headers, 0, $skip_initial_cols if $skip_initial_cols; |
|
237
|
36
|
|
|
|
|
95
|
splice @$headers, $keep_cols; |
|
238
|
|
|
|
|
|
|
} |
|
239
|
40
|
|
66
|
|
|
121
|
$args{columns} //= $headers; |
|
240
|
40
|
50
|
|
|
|
60
|
croak "table contains undefined columns" if grep {!defined $_} @{$args{columns}}; |
|
|
128
|
|
|
|
|
251
|
|
|
|
40
|
|
|
|
|
133
|
|
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# dual closure : can be used as an iterator or can compute all values in one call |
|
243
|
40
|
|
|
|
|
70
|
my @rows; |
|
244
|
|
|
|
|
|
|
my $get_values = sub { |
|
245
|
348
|
|
|
348
|
|
1951
|
while (1) { |
|
246
|
656
|
100
|
66
|
|
|
2035
|
$keep_rows-- |
|
|
|
100
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
and my $vals = $want_iterator ? $vals_or_it->() : shift @$vals_or_it |
|
248
|
|
|
|
|
|
|
or return; # no more records -- end of iterator |
|
249
|
616
|
100
|
|
|
|
1109
|
splice @$vals, 0, $skip_initial_cols if $skip_initial_cols; |
|
250
|
616
|
|
|
|
|
961
|
splice @$vals, $keep_cols; |
|
251
|
616
|
100
|
|
|
|
1053
|
my $row = $args{want_records} ? do {my %r; @r{@{$args{columns}}} = @$vals; \%r} |
|
|
608
|
|
|
|
|
755
|
|
|
|
608
|
|
|
|
|
861
|
|
|
|
608
|
|
|
|
|
3424
|
|
|
|
608
|
|
|
|
|
1152
|
|
|
252
|
|
|
|
|
|
|
: $vals; |
|
253
|
616
|
100
|
|
|
|
1064
|
if ($want_iterator) {return $row} # current iteration successful |
|
|
308
|
|
|
|
|
1219
|
|
|
254
|
308
|
|
|
|
|
531
|
else {push @rows, $row}; |
|
255
|
|
|
|
|
|
|
} |
|
256
|
40
|
|
|
|
|
257
|
}; |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
# return either an iterator or the accumulated table records |
|
259
|
|
|
|
|
|
|
return ($args{columns}, $want_iterator ? iter($get_values) |
|
260
|
40
|
100
|
|
|
|
162
|
: do {$get_values->(); \@rows}); |
|
|
20
|
|
|
|
|
48
|
|
|
|
20
|
|
|
|
|
479
|
|
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
sub A1_to_num { # convert Excel A1 reference format to a number |
|
265
|
15
|
|
|
15
|
1
|
42
|
my ($self, $A1) = @_; |
|
266
|
15
|
|
|
|
|
28
|
my $num = 0; |
|
267
|
15
|
|
|
|
|
56
|
foreach my $digit (unpack "C*", $A1) { |
|
268
|
17
|
|
|
|
|
44
|
$num = $num*26 + $digit-64; |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
|
|
271
|
15
|
|
|
|
|
66
|
return $num; |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub range_from_ref { # convert a range reference like 'C9:E21' into ($col1, $row1, $col2, $row2) |
|
276
|
76
|
|
|
76
|
1
|
163
|
my ($self, $range) = @_; |
|
277
|
|
|
|
|
|
|
|
|
278
|
76
|
50
|
|
|
|
555
|
$range =~ /^([A-Z]+)(\d+) # mandatory 1st col and row |
|
279
|
|
|
|
|
|
|
(?: # .. optionally followed by |
|
280
|
|
|
|
|
|
|
: # colon |
|
281
|
|
|
|
|
|
|
([A-Z]+)(\d+) # and 2nd col and row |
|
282
|
|
|
|
|
|
|
)? |
|
283
|
|
|
|
|
|
|
$/x |
|
284
|
|
|
|
|
|
|
or croak "->range_from_ref($range) : invalid ref"; |
|
285
|
|
|
|
|
|
|
|
|
286
|
76
|
|
33
|
|
|
390
|
my @range = ($A1_to_num_memoized{$1} //= $self->A1_to_num($1), $2); # col, row of topleft cell |
|
287
|
76
|
50
|
33
|
|
|
400
|
push @range, ($3 ? ($A1_to_num_memoized{$3} //= $self->A1_to_num($3), $4) # col, row of bottomright cell, or .. |
|
288
|
|
|
|
|
|
|
: @range); # .. copy of topleft cell |
|
289
|
76
|
|
|
|
|
336
|
return @range; |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
1; |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
__END__ |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head1 NAME |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle); |
|
310
|
|
|
|
|
|
|
# .. or with syntactic sugar : |
|
311
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new($filename_or_handle); |
|
312
|
|
|
|
|
|
|
# .. or with LibXML backend : |
|
313
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle, |
|
314
|
|
|
|
|
|
|
using => 'LibXML'); |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
foreach my $sheet_name ($reader->sheet_names) { |
|
317
|
|
|
|
|
|
|
my $grid = $reader->values($sheet_name); |
|
318
|
|
|
|
|
|
|
my $n_rows = @$grid; |
|
319
|
|
|
|
|
|
|
print "sheet $sheet_name has $n_rows rows; ", |
|
320
|
|
|
|
|
|
|
"first cell contains : ", $grid->[0][0]; |
|
321
|
|
|
|
|
|
|
} |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
foreach my $table_name ($reader->table_names) { |
|
324
|
|
|
|
|
|
|
my ($columns, $records) = $reader->table($table_name); |
|
325
|
|
|
|
|
|
|
my $n_records = @$records; |
|
326
|
|
|
|
|
|
|
my $n_columns = @$columns; |
|
327
|
|
|
|
|
|
|
print "table $table_name has $n_records records and $n_columns columns; ", |
|
328
|
|
|
|
|
|
|
"column 'foo' in first row contains : ", $records->[0]{foo}; |
|
329
|
|
|
|
|
|
|
} |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
my $first_grid = $reader->values(1); # the arg can be a sheet index instead of a sheet name |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# iterator version of ->values() |
|
334
|
|
|
|
|
|
|
my $iterator = $reader->ivalues($sheet_name); |
|
335
|
|
|
|
|
|
|
while (my $row = $iterator->()) { process_row($row) } |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# iterator version of ->table() |
|
338
|
|
|
|
|
|
|
my ($columns, $iterator) = $reader->itable($table_name); |
|
339
|
|
|
|
|
|
|
while (my $record = $iterator->()) { process_record($record) } |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 Purpose |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
This module reads the contents of an Excel file in XLSX format. |
|
346
|
|
|
|
|
|
|
Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Data::XLSX::Parser>, |
|
347
|
|
|
|
|
|
|
this module has no support for reading formulas, formats or other Excel internal |
|
348
|
|
|
|
|
|
|
information; all you get are plain values -- but you get them much faster ! |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Besides, this API has some features not found in concurrent parsers : |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
=over |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=item * |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
has support for parsing Excel tables |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=item * |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
iterator methods for getting one row at a time from a worksheet or from a table -- very useful for sparing |
|
361
|
|
|
|
|
|
|
memory when dealing with large Excel files. |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=back |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=head2 Backends |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
Two different backends may be used for extracting values : |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
=over |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=item Regex |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
using regular expressions to parse the XML content. |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=item LibXML |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
using L<XML::LibXML::Reader> to parse the XML content. |
|
379
|
|
|
|
|
|
|
It is probably safer but two to three times slower than the Regex backend |
|
380
|
|
|
|
|
|
|
(but still much faster than L<Spreadsheet::ParseXLSX>). |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=back |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
The default is the C<Regex> backend. |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 Sheet numbering |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Although worksheets are usually accessed by name, they may also |
|
389
|
|
|
|
|
|
|
be accessed by numerical indices, I<starting at value 1>. |
|
390
|
|
|
|
|
|
|
Some other Perl parsing modules use a different convention, where the first sheet has index 0. |
|
391
|
|
|
|
|
|
|
Here index 1 was chosen to be consistent with the common API for "collections" in |
|
392
|
|
|
|
|
|
|
Microsoft Office object model. |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=head1 NOTE ON ITERATORS |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Methods L</ivalues> and L</itable> return I<iterators>. |
|
398
|
|
|
|
|
|
|
Each call to the iterator produces a new data row from the Excel content, until reaching |
|
399
|
|
|
|
|
|
|
the end of data where the iterator returns C<undef>. Following the L<Iterator::Simple> protocol, |
|
400
|
|
|
|
|
|
|
iterators support three different but semantically equivalent syntaxes : |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
while (my $row = $iterator->()) { process($row) } |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
while (my $row = $iterator->next) { process($row) } |
|
405
|
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
while (<$iterator>) { process($_) } |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
Working with iterators is especially interesting when dealing with large Excel files, because |
|
409
|
|
|
|
|
|
|
rows can be processed one at a time instead of being loaded all at once in memory. For example |
|
410
|
|
|
|
|
|
|
a typical pattern for loading the Excel content into a database would be : |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
my $iter = $valuereader->ivalues('MySheet'); |
|
413
|
|
|
|
|
|
|
my $sth = $dbh->prepare("INSERT INTO MYTABLE(col1, col2, col3) VALUES (?, ?, ?)"); |
|
414
|
|
|
|
|
|
|
while (my $row = $iter->()) { |
|
415
|
|
|
|
|
|
|
$sth->execute(@$row); |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
As another example, suppose a large population table, from which we want to produce a list of |
|
419
|
|
|
|
|
|
|
list of minor girls. This can be done with a combination of iterator operations : |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
use Iterator::Simple qw/igrep imap/; |
|
422
|
|
|
|
|
|
|
use Iterator::Simple::Util qw/ireduce/; |
|
423
|
|
|
|
|
|
|
my $minor_girls = ireduce {"$a, $b"} # successive results joined with ", " |
|
424
|
|
|
|
|
|
|
imap {"$_->{firstname} $_->{lastname}"} # produce a flat string from an input record with first/last name |
|
425
|
|
|
|
|
|
|
igrep {$_->{gender} eq 'F' && $_->{age} < 18} # filter input records |
|
426
|
|
|
|
|
|
|
$valuereader->itable('Population'); # source iterator |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=head1 METHODS |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head2 new |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle, %options); |
|
433
|
|
|
|
|
|
|
# .. or with syntactic sugar : |
|
434
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new($filename_or_handle, %options); |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed, |
|
437
|
|
|
|
|
|
|
or to an open filehandle. |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
Options are : |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=over |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=item C<using> |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
The backend to be used for parsing; default is 'Regex'. |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=item C<date_format>, C<time_format>, C<datetime_format>, C<date_formatter> |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
Parameters for formatting date and time values; these are described in the L</DATE AND TIME FORMATS> section below. |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=back |
|
452
|
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=head2 sheet_names |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
my @sheets = $reader->sheet_names; |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
Returns the list of worksheet names, in the same order as in the Excel file. |
|
460
|
|
|
|
|
|
|
The first name in the list corresponds to sheet number 1. |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
=head2 active_sheet |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
my $active_sheet_number = $reader->active_sheet; |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
Returns the numerical index (starting at 1) of the sheet that was active when the file was last saved. |
|
467
|
|
|
|
|
|
|
May return C<undef>. |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
=head2 values |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
my ($ref, $grid) = $reader->values($sheet); |
|
472
|
|
|
|
|
|
|
# or |
|
473
|
|
|
|
|
|
|
my $grid = $reader->values($sheet); |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
Returns a pair where |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=over |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=item * |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
the first item is a string that describes the range of the sheet, |
|
482
|
|
|
|
|
|
|
in Excel A1 format (like for example C<A1:F12> |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=item * |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
the second item is a bidimensional array of scalars (in other words, |
|
487
|
|
|
|
|
|
|
an arrayref of arrayrefs of scalars), corresponding to cell |
|
488
|
|
|
|
|
|
|
values in the specified worksheet. |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
=back |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
The C<$sheet> argument can be either a sheet name or a sheet position (starting at 1). |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
When called in scalar context, this method only returns the grid of values. |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
Unlike the original Excel cells, positions in the grid are zero-based, |
|
498
|
|
|
|
|
|
|
so for example the content of cell B3 is in C<< $grid->[1][2] >>. |
|
499
|
|
|
|
|
|
|
The grid is sparse : the size of each row depends on the |
|
500
|
|
|
|
|
|
|
position of the last non-empty cell in that row. |
|
501
|
|
|
|
|
|
|
Thanks to Perl's auto-vivification mechanism, any attempt to access |
|
502
|
|
|
|
|
|
|
a non-existent cell will automatically create the corresponding cell |
|
503
|
|
|
|
|
|
|
within the grid. The number of rows and columns in the grid can be computed |
|
504
|
|
|
|
|
|
|
like this : |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
my $nb_rows = @$grid; |
|
507
|
|
|
|
|
|
|
my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
Alternatively, these numbers can also be obtained through the L</range_from_ref> method. |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=head2 ivalues |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
my ($ref, $iterator) = $reader->ivalues($sheet); |
|
515
|
|
|
|
|
|
|
# or |
|
516
|
|
|
|
|
|
|
my $iterator = $reader->ivalues($sheet); |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
while (my $row = $iterator->()) { |
|
519
|
|
|
|
|
|
|
say join ", ", @$row; |
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
Like the L</values> method, except that it returns an iterator instead of a fully populated data grid. |
|
523
|
|
|
|
|
|
|
Data rows are retrieved through successive calls to the iterator. |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=head2 table_names |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
my @table_names = $reader->table_names; |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
Returns the list of names of tables registered in this workbook. |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=head2 table |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name) |
|
535
|
|
|
|
|
|
|
# or |
|
536
|
|
|
|
|
|
|
my ($columns, $rows) = $reader->table(name => $table_name); |
|
537
|
|
|
|
|
|
|
# or |
|
538
|
|
|
|
|
|
|
my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $ref] |
|
539
|
|
|
|
|
|
|
[, columns => \@columns] |
|
540
|
|
|
|
|
|
|
[, no_headers => 1] |
|
541
|
|
|
|
|
|
|
[, with_totals => 1] |
|
542
|
|
|
|
|
|
|
[, want_records => 0] |
|
543
|
|
|
|
|
|
|
); |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
In its simplest form, this method returns the content of an Excel table referenced by its table name |
|
546
|
|
|
|
|
|
|
(in Excel, the table name appears and can be modified through the ribbon tab |
|
547
|
|
|
|
|
|
|
entry "Table tools / Design"). |
|
548
|
|
|
|
|
|
|
The table name is passed either through the named argument C<name>, or positionally as unique argument |
|
549
|
|
|
|
|
|
|
to the method. |
|
550
|
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
In list context, the method returns a pair, where the first element is |
|
552
|
|
|
|
|
|
|
an arrayref of column names, and the second element is an arrayref of rows. |
|
553
|
|
|
|
|
|
|
In scalar context, the method just returns the arrayref of rows. |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
Rows are normally returned as hashrefs, where keys of the hashes correspond to column names |
|
556
|
|
|
|
|
|
|
in the table. Under option C<< want_records => 0>>, rows are returned as arrayrefs, and it is up |
|
557
|
|
|
|
|
|
|
to the client to make the correspondance with column names in C<$columns>. |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
Instead of specifying a table name, it is also possible to give a sheet name or sheet number. |
|
560
|
|
|
|
|
|
|
By default, this considers the whole sheet content as a single table, where column names |
|
561
|
|
|
|
|
|
|
are on the first row. However, additional arguments can be supplied to change the default |
|
562
|
|
|
|
|
|
|
behaviour : |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=over |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=item ref |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
a specific range of cells within the sheet that contain the table rows and columns. |
|
569
|
|
|
|
|
|
|
The range must be expressed using traditional Excel notation, |
|
570
|
|
|
|
|
|
|
like for example C<"C9:E23"> (columns 3 to 5, rows 9 to 23). |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=item columns |
|
573
|
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
an arrayref containing the list of column names. |
|
575
|
|
|
|
|
|
|
If absent, column names will be taken from the first row in the table. |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=item no_headers |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
if true, the first row in the table will be treated as a regular data row, instead |
|
580
|
|
|
|
|
|
|
of being treated as a list of column names. In that case, since column names cannot |
|
581
|
|
|
|
|
|
|
be inferred from cell values in the first row, the C<columns> argument to the method |
|
582
|
|
|
|
|
|
|
must be present. |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
=item with_totals |
|
585
|
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
For tables that have a "totals" row (turned on by a specific checkbox |
|
587
|
|
|
|
|
|
|
in the Excel ribbon), this row is normally not included in the result. |
|
588
|
|
|
|
|
|
|
To include it as a final row, pass a true value to the C<with_totals> option. |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=back |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
=head1 AUXILIARY METHODS |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
=head2 A1_to_num |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
my $col_num = $reader->A1_to_num('A'); # 1 |
|
598
|
|
|
|
|
|
|
$col_num = $reader->A1_to_num('AZ'); # 52 |
|
599
|
|
|
|
|
|
|
$col_num = $reader->A1_to_num('AA'); # 26 |
|
600
|
|
|
|
|
|
|
$col_num = $reader->A1_to_num('ABC'); # 731 |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation) |
|
603
|
|
|
|
|
|
|
into the corresponding numeric value. |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
The module also has a global hash C<$Excel::ValueReader::XLSX::A1_to_num_memoized> where |
|
606
|
|
|
|
|
|
|
results from such conversions are memoized. |
|
607
|
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
=head2 range_from_ref |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
my ($col1, $row1, $col2, $row2) = $reader->range_from_ref("C4:BB123"); |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
Returns the coordinates of the topleft and bottomright cells corresponding to a given Excel range. |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=head2 table_info |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
my $info = $reader->table_info->{$table_name}; |
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
Returns information about an Excel table in the form of a hashref with keys |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
=over |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=item name |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
the name of the table |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=item ref |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
the range of the table, in Excel notation (e.g "G6:Z44") |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=item columns |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
an arrayref of column names |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
=item id |
|
636
|
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
numerical id of the table |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=item sheet |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
numerical id of the sheet to which the table belongs |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
=item no_headers |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
boolean flag corresponding to the negation of the checkbox "Headers row" in Excel. |
|
646
|
|
|
|
|
|
|
By default tables have a header row, both in Excel and in this module. |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=item has_totals |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
boolean flag corresponding to the checkbox "Totals row" in Excel. |
|
651
|
|
|
|
|
|
|
By default tables have no totals row, both in Excel and in this module. |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=back |
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
=head2 formatted_date |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
my $date = $reader->formatted_date($numeric_date, $excel_date_format); |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
Given a numeric date, this method returns a string date formatted according |
|
662
|
|
|
|
|
|
|
to the I<date formatter> routine explained in the next section. The C<$excel_date_format> |
|
663
|
|
|
|
|
|
|
argument should be the Excel format string for that specific cell; it is used |
|
664
|
|
|
|
|
|
|
only for for deciding if the numeric value should be presented as a date, as a time, |
|
665
|
|
|
|
|
|
|
or both. Optionally, a custom date formatter callback could be passed as third argument. |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=head1 DATE AND TIME FORMATS |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
=head2 Date and time handling |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
In Excel, date and times values are stored as numeric values, where the integer part |
|
677
|
|
|
|
|
|
|
represents the date, and the fractional part represents the time. What distinguishes |
|
678
|
|
|
|
|
|
|
such numbers from ordinary numbers is the I<numeric format> applied to the cells |
|
679
|
|
|
|
|
|
|
where they appear. |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
Numeric formats in Excel are complex to reproduce, in particular |
|
682
|
|
|
|
|
|
|
because they are locale-dependent; therefore the present module does not attempt |
|
683
|
|
|
|
|
|
|
to faithfully interpret Excel formats. It just infers from formats which |
|
684
|
|
|
|
|
|
|
cells should be presented as date and/or time values. All such values are then |
|
685
|
|
|
|
|
|
|
presented through the same I<date_formatter> routine. The default formatter |
|
686
|
|
|
|
|
|
|
is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter> |
|
687
|
|
|
|
|
|
|
parameter (explained below). |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=head2 Parameters for the default strftime formatter |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
When using the default strftime formatter, the following parameters may be passed |
|
692
|
|
|
|
|
|
|
to the constructor : |
|
693
|
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
=over |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
=item date_format |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>. |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=item time_format |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>. |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
=item datetime_format |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
The L<POSIX/strftime> format for representing date and time together. |
|
707
|
|
|
|
|
|
|
The default is the concatenation of C<date_format> and C<time_format>, with |
|
708
|
|
|
|
|
|
|
a space in between. |
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=back |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
=head2 Writing a custom formatter |
|
714
|
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
A custom algorithm for date formatting can be specified as a parameter to the constructor |
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename, |
|
718
|
|
|
|
|
|
|
date_formatter => sub {...}); |
|
719
|
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
If this parameter is C<undef>, date formatting is canceled and therefore date and |
|
721
|
|
|
|
|
|
|
time values will be presented as plain numbers. |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
If not C<undef>, the date formatting routine will we called as : |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
$date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond); |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
where |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
=over |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item * |
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
C<$excel_date_format> is the Excel numbering format associated to that cell, like for example |
|
734
|
|
|
|
|
|
|
C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description. |
|
735
|
|
|
|
|
|
|
This is useful to decide if the value should be presented as a date, a time, or both. |
|
736
|
|
|
|
|
|
|
The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should |
|
737
|
|
|
|
|
|
|
be presented as a date; if the format contains C<h> or C<s>, it should be presented |
|
738
|
|
|
|
|
|
|
as a time. The letter C<m> is not taken into consideration because it is ambiguous : |
|
739
|
|
|
|
|
|
|
depending on the position in the format string, it may represent either a "month" or a "minute". |
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=item * |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904, |
|
744
|
|
|
|
|
|
|
see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>) |
|
745
|
|
|
|
|
|
|
is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the |
|
746
|
|
|
|
|
|
|
C<year> component will always be above this value. |
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
=item * |
|
749
|
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
C<month> is the numeric value of the month, starting at 1 |
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
=item * |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
C<day> is the numeric value of the day in month, starting at 1 |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
=item * |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding |
|
759
|
|
|
|
|
|
|
numeric values. |
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=back |
|
762
|
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
=head1 CAVEATS |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=over |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=item * |
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
This module was optimized for speed, not for completeness of |
|
771
|
|
|
|
|
|
|
OOXML-SpreadsheetML support; so there may be some edge cases where the |
|
772
|
|
|
|
|
|
|
output is incorrect with respect to the original Excel data. |
|
773
|
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=item * |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
Embedded newline characters in strings are stored in Excel as C<< \r\n >>, |
|
777
|
|
|
|
|
|
|
following the old Windows convention. When retrieved through the C<Regex> |
|
778
|
|
|
|
|
|
|
backend, the result contains the original C<< \r >> and C<< \n >> characters; |
|
779
|
|
|
|
|
|
|
but when retrieved through the C<LibXML> backend, C<< \r >> are silently removed by the |
|
780
|
|
|
|
|
|
|
C<XML::LibXML> package. |
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=back |
|
783
|
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
The official reference for OOXML-SpreadsheetML format is in |
|
787
|
|
|
|
|
|
|
L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>. |
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
Introductory material on XLSX file structure can be found at |
|
790
|
|
|
|
|
|
|
L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>. |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Concurrent modules L<Spreadsheet::ParseXLSX> or L<Data::XLSX::Parser>. |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Another unpublished but working module for parsing Excel files in Perl |
|
795
|
|
|
|
|
|
|
can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>. |
|
796
|
|
|
|
|
|
|
Some test cases were borrowed from that distribution. |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
Conversions from and to Excel internal date format can also be performed |
|
799
|
|
|
|
|
|
|
through the L<DateTime::Format::Excel> module. |
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
=head1 BENCHMARKS |
|
802
|
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
Below are some comparative figures. The task |
|
804
|
|
|
|
|
|
|
computed here was to read a large Excel file with 800131 rows of 7 columns, |
|
805
|
|
|
|
|
|
|
and report the total number of rows. Reported figures are in seconds. |
|
806
|
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
Spreadsheet::ParseXLSX 1272 elapsed, 870 cpu, 4 system |
|
808
|
|
|
|
|
|
|
Data::XLSX::Parser 125 elapsed, 107 cpu, 1 system |
|
809
|
|
|
|
|
|
|
Excel::ValueReader::XLSX::Regex 40 elapsed, 32 cpu, 0 system |
|
810
|
|
|
|
|
|
|
Excel::ValueReader::XLSX::Regex, iterator 34 elapsed, 30 cpu, 0 system |
|
811
|
|
|
|
|
|
|
Excel::ValueReader::XLSX::LibXML 101 elapsed, 83 cpu, 0 system |
|
812
|
|
|
|
|
|
|
Excel::ValueReader::XLSX::LibXML, iterator 91 elapsed, 80 cpu, 0 system |
|
813
|
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
=head1 ACKNOWLEDGMENTS |
|
815
|
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
=over |
|
817
|
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
=item * |
|
819
|
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
David Flink signaled (and fixed) a bug about strings with embedded newline characters, and |
|
821
|
|
|
|
|
|
|
signaled that the 'r' attribute in cells is optional. |
|
822
|
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
=item * |
|
824
|
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
Ulibuck signaled bugs several minor bugs on the LibXML backend. |
|
826
|
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
=item * |
|
828
|
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
H.Merijn Brand suggested additions to the API and several improvements to the code source. |
|
830
|
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=item * |
|
832
|
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
Ephraim Stevens signaled a bug in the table() method with 'ref' param. |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
=back |
|
836
|
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
=head1 AUTHOR |
|
839
|
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
Laurent Dami, E<lt>dami at cpan.orgE<gt> |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
843
|
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
Copyright 2020-2025 by Laurent Dami. |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
847
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=cut |
|
850
|
|
|
|
|
|
|
|