File Coverage

blib/lib/Excel/ValueReader/XLSX.pm

Criterion	Covered	Total	%
statement	141	144	97.9
branch	50	60	83.3
condition	14	23	60.8
subroutine	25	25	100.0
pod	8	9	88.8
total	238	261	91.1

line	stmt	bran	cond	sub	pod	time	code
1							package Excel::ValueReader::XLSX;
2	2			2		369617	use 5.12.1;
	2					10
3	2			2		13	use utf8;
	2					3
	2					19
4	2			2		2812	use Moose;
	2					1319106
	2					16
5	2			2		21901	use MooseX::StrictConstructor;
	2					89902
	2					11
6	2			2		26607	use Moose::Util::TypeConstraints qw/union/;
	2					5
	2					26
7	2			2		1305	use Module::Load qw/load/;
	2					26
	2					28
8	2			2		1748	use Date::Calc qw/Add_Delta_Days/;
	2					19477
	2					281
9	2			2		1439	use POSIX qw/strftime modf/;
	2					18155
	2					16
10	2			2		4646	use Carp qw/croak/;
	2					6
	2					130
11	2			2		934	use Iterator::Simple qw/iter/;
	2					3357
	2					5757
12
13							#======================================================================
14							# GLOBALS
15							#======================================================================
16
17							our $VERSION = '1.17';
18							our %A1_to_num_memoized;
19
20							#======================================================================
21							# TYPES AND ATTRIBUTES
22							#======================================================================
23
24							# TYPES
25							my $XlsxSource = union([qw/Str FileHandle/]);
26
27							# PUBLIC ATTRIBUTES
28							has 'xlsx' => (is => 'ro', isa => $XlsxSource, required => 1); # path of xlsx file
29							has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class
30							has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y');
31							has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S');
32							has 'datetime_format' => (is => 'ro', isa => 'Str', builder => '_datetime_format', lazy => 1);
33							has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]', builder => '_date_formatter', lazy => 1);
34
35							# ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED
36							has 'backend' => (is => 'ro', isa => 'Object', builder => '_backend', lazy => 1,
37							init_arg => undef, handles => [qw/base_year sheets active_sheet table_info/]);
38
39							#======================================================================
40							# BUILDING
41							#======================================================================
42
43							# syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path)
44							around BUILDARGS => sub {
45							my $orig = shift;
46							my $class = shift;
47
48							unshift @_, 'xlsx' if scalar(@_) % 2 and $XlsxSource->check($_[0]);
49
50							$class->$orig(@_);
51							};
52
53
54							#======================================================================
55							# ATTRIBUTE CONSTRUCTORS
56							#======================================================================
57
58							sub _backend {
59	30			30		63	my $self = shift;
60
61	30					1089	my $backend_class = ref($self) . '::Backend::' . $self->using;
62	30					238	load $backend_class;
63
64	30					2912	return $backend_class->new(frontend => $self);
65							}
66
67							sub _datetime_format {
68	24			24		63	my ($self) = @_;
69	24					856	return $self->date_format . ' ' . $self->time_format;
70							}
71
72							sub _date_formatter {
73	24			24		60	my ($self) = @_;
74
75							# local copies of the various formats so that we can build a closure
76	24					871	my @formats = (undef, # 0 -- error
77							$self->date_format, # 1 -- just a date
78							$self->time_format, # 2 -- just a time
79							$self->datetime_format); # 3 -- date and time
80
81							my $strftime_formatter = sub {
82	312			312		906	my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_;
83
84							# choose the proper format for strftime
85	312					470	my $ix = 0; # index into the @formats array
86	312	100				1161	$ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion
87	312	100				852	$ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion
88	312	50				983	my $strftime_format = $formats[$ix]
89							or die "cell with unexpected Excel date format : $xl_date_format";
90
91							# formatting through strftime
92	312					26850	my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900);
93
94	312					1246	return $formatted_date;
95	24					208	};
96
97	24					891	return $strftime_formatter;
98							}
99
100
101							#======================================================================
102							# GENERAL METHODS
103							#======================================================================
104
105
106							sub sheet_names {
107	6			6	1	16356	my ($self) = @_;
108
109	6					42	my $sheets = $self->sheets; # hashref of shape {$name => $sheet_position}
110
111	6					61	my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets;
	81					161
112	6					45	return @sorted_names;
113							}
114
115
116	44			44	1	81341	sub values { my ($self, $sheet) = @_; $self->backend->_values($sheet, 0)}
	44					1924
117	44			44	1	78212	sub ivalues { my ($self, $sheet) = @_; $self->backend->_values($sheet, 1)}
	44					2008
118
119							sub formatted_date {
120	312			312	1	21596	my ($self, $val, $date_format, $date_formatter) = @_;
121
122							# separate date (integer part) from time (fractional part)
123	312					1356	my ($time, $n_days) = modf($val);
124
125							# Convert $n_days into a date in Date::Calc format (year, month, day).
126							# The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while
127							# in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system,
128							# Excel treats 1900 as a leap year.
129	312					1225	my $base_year = $self->base_year;
130	312	100				816	if ($base_year == 1900) {
131	228					515	my $is_after_february_1900 = $n_days > 60;
132	228	100				549	$n_days -= $is_after_february_1900 ? 2 : 1;
133							}
134	312					1434	my @d = Add_Delta_Days($base_year, 1, 1, $n_days);
135
136							# decode the fractional part (the time) into hours, minutes, seconds, milliseconds
137	312					511	my @t;
138	312					636	foreach my $subdivision (24, 60, 60, 1000) {
139	1248					1846	$time *= $subdivision;
140	1248					2643	($time, my $time_portion) = modf($time);
141	1248					2381	push @t, $time_portion;
142							}
143
144							# dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second
145	312					656	my ($h, $m, $s, $ms) = @t;
146	312	100				712	if ($ms == 999) {
147	4					15	$s += 1, $ms = 0;
148	4	50				15	if ($s == 60) {
149	0					0	$m += 1, $s = 0;
150	0	0				0	if ($m == 60) {
151	0					0	$h += 1, $m = 0;
152							}
153							}
154							}
155							# NOTE : because of this hack, theoretically we could end up with a value
156							# like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different
157							# in its rendering.
158
159							# call the date_formatter subroutine
160	312	50	33			12959	$date_formatter //= $self->date_formatter
161							or die ref($self) . " has no date_formatter subroutine";
162	312					828	my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms);
163
164	312					1597	return $formatted_date;
165							}
166
167							#======================================================================
168							# METHODS FOR PARSING EXCEL TABLES
169							#======================================================================
170
171
172							sub table_names {
173	4			4	1	13492	my ($self) = @_;
174
175	4					32	my $table_info = $self->table_info;
176
177							# sort on table id (field [1] in table_info arrayrefs)
178	4					44	my @table_names = sort {$table_info->{$a}{id} <=> $table_info->{$b}{id}} keys %$table_info;
	38					128
179
180	4					37	return @table_names;
181							}
182
183	20			20	1	23482	sub table {shift->_table(0, @_);} # 0 = does not want an iterator, just a regular arrayref
184	20			20	0	25345	sub itable {shift->_table(1, @_);} # 1 = does want an iterator
185
186							my %valid_table_arg = map {$_ => 1} qw/name sheet ref columns no_headers with_totals want_records/;
187							sub _table {
188	40			40		92	my $self = shift;
189	40					70	my $want_iterator = shift;
190
191							# syntactic sugar : ->table('foo') is treated as ->table(name => 'foo')
192	40	100				217	unshift @_, 'name' if scalar(@_) % 2;
193	40					201	my %args = @_;
194
195							# check for invalid args
196	40					142	my @invalid_args = grep {!$valid_table_arg{$_}} keys %args;
	56					213
197	40	50				146	croak "invalid table args: ", join(", ", @invalid_args) if @invalid_args;
198
199							# defaults
200	40		100			243	$args{want_records} //= 1;
201
202							# if called with a table name, derive positional args from the internal workbook info
203	40	100				125	if (my $table_name = delete $args{name}) {
204	32	50				177	my $table_info = $self->table_info->{$table_name}
205							or croak sprintf "Excel file '%s' contains no table named '%s'", $self->xlsx, $table_name;
206	32		66			655	$args{$_} //= $table_info->{$_} for keys %$table_info;
207							}
208
209							# get values from the sheet
210							my ($sheet_ref, $vals_or_it) = $want_iterator ? $self->ivalues($args{sheet})
211	40	100				217	: $self->values($args{sheet});
212
213							# table boundaries
214	40					522	my ($scol1, $srow1, $scol2, $srow2) = $self->range_from_ref($sheet_ref);
215							my ($tcol1, $trow1, $tcol2, $trow2) = $args{ref} ? $self->range_from_ref($args{ref})
216	40	100				173	: ($scol1, $srow1, $scol2, $srow2);
217	40					110	my $skip_initial_rows = $trow1 - $srow1;
218	40					91	my $keep_rows = $trow2 - $trow1 + 1;
219	40					64	my $skip_initial_cols = $tcol1 - $scol1;
220	40					107	my $keep_cols = $tcol2 - $tcol1 + 1;
221
222							# if a totals row is present, skip it, unless the 'with_totals' arg is present
223	40	100	100			172	$keep_rows -=1 if $args{has_totals} and !$args{with_totals};
224
225							# skip initial rows if the table does not start at top row
226	40	100				103	if ($skip_initial_rows) {
227	20	100				48	if ($want_iterator) {$vals_or_it->() while $skip_initial_rows-- > 0;}
	10					50
228	10					40	else {splice @$vals_or_it, 0, $skip_initial_rows}
229							}
230
231							# read headers from first row -- even if this may be redundant with the 'columns' list declared in the table description
232	40					115	my $headers;
233	40	100				115	unless ($args{no_headers}) {
234	36					63	$keep_rows--;
235	36	100				106	$headers = $want_iterator ? $vals_or_it->() : shift @$vals_or_it;
236	36	100				101	splice @$headers, 0, $skip_initial_cols if $skip_initial_cols;
237	36					95	splice @$headers, $keep_cols;
238							}
239	40		66			121	$args{columns} //= $headers;
240	40	50				60	croak "table contains undefined columns" if grep {!defined $_} @{$args{columns}};
	128					251
	40					133
241
242							# dual closure : can be used as an iterator or can compute all values in one call
243	40					70	my @rows;
244							my $get_values = sub {
245	348			348		1951	while (1) {
246	656	100	66			2035	$keep_rows--
		100
247							and my $vals = $want_iterator ? $vals_or_it->() : shift @$vals_or_it
248							or return; # no more records -- end of iterator
249	616	100				1109	splice @$vals, 0, $skip_initial_cols if $skip_initial_cols;
250	616					961	splice @$vals, $keep_cols;
251	616	100				1053	my $row = $args{want_records} ? do {my %r; @r{@{$args{columns}}} = @$vals; \%r}
	608					755
	608					861
	608					3424
	608					1152
252							: $vals;
253	616	100				1064	if ($want_iterator) {return $row} # current iteration successful
	308					1219
254	308					531	else {push @rows, $row};
255							}
256	40					257	};
257
258							# return either an iterator or the accumulated table records
259							return ($args{columns}, $want_iterator ? iter($get_values)
260	40	100				162	: do {$get_values->(); \@rows});
	20					48
	20					479
261							}
262
263
264							sub A1_to_num { # convert Excel A1 reference format to a number
265	15			15	1	42	my ($self, $A1) = @_;
266	15					28	my $num = 0;
267	15					56	foreach my $digit (unpack "C*", $A1) {
268	17					44	$num = $num*26 + $digit-64;
269							}
270
271	15					66	return $num;
272							}
273
274
275							sub range_from_ref { # convert a range reference like 'C9:E21' into ($col1, $row1, $col2, $row2)
276	76			76	1	163	my ($self, $range) = @_;
277
278	76	50				555	$range =~ /^([A-Z]+)(\d+) # mandatory 1st col and row
279							(?: # .. optionally followed by
280							: # colon
281							([A-Z]+)(\d+) # and 2nd col and row
282							)?
283							$/x
284							or croak "->range_from_ref($range) : invalid ref";
285
286	76		33			390	my @range = ($A1_to_num_memoized{$1} //= $self->A1_to_num($1), $2); # col, row of topleft cell
287	76	50	33			400	push @range, ($3 ? ($A1_to_num_memoized{$3} //= $self->A1_to_num($3), $4) # col, row of bottomright cell, or ..
288							: @range); # .. copy of topleft cell
289	76					336	return @range;
290							}
291
292
293
294
295
296
297							1;
298
299
300							__END__
301
302							=head1 NAME
303
304
305							Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast
306
307							=head1 SYNOPSIS
308
309							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle);
310							# .. or with syntactic sugar :
311							my $reader = Excel::ValueReader::XLSX->new($filename_or_handle);
312							# .. or with LibXML backend :
313							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle,
314							using => 'LibXML');
315
316							foreach my $sheet_name ($reader->sheet_names) {
317							my $grid = $reader->values($sheet_name);
318							my $n_rows = @$grid;
319							print "sheet $sheet_name has $n_rows rows; ",
320							"first cell contains : ", $grid->[0][0];
321							}
322
323							foreach my $table_name ($reader->table_names) {
324							my ($columns, $records) = $reader->table($table_name);
325							my $n_records = @$records;
326							my $n_columns = @$columns;
327							print "table $table_name has $n_records records and $n_columns columns; ",
328							"column 'foo' in first row contains : ", $records->[0]{foo};
329							}
330
331							my $first_grid = $reader->values(1); # the arg can be a sheet index instead of a sheet name
332
333							# iterator version of ->values()
334							my $iterator = $reader->ivalues($sheet_name);
335							while (my $row = $iterator->()) { process_row($row) }
336
337							# iterator version of ->table()
338							my ($columns, $iterator) = $reader->itable($table_name);
339							while (my $record = $iterator->()) { process_record($record) }
340
341							=head1 DESCRIPTION
342
343							=head2 Purpose
344
345							This module reads the contents of an Excel file in XLSX format.
346							Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Data::XLSX::Parser>,
347							this module has no support for reading formulas, formats or other Excel internal
348							information; all you get are plain values -- but you get them much faster !
349
350							Besides, this API has some features not found in concurrent parsers :
351
352							=over
353
354							=item *
355
356							has support for parsing Excel tables
357
358							=item *
359
360							iterator methods for getting one row at a time from a worksheet or from a table -- very useful for sparing
361							memory when dealing with large Excel files.
362
363							=back
364
365
366							=head2 Backends
367
368							Two different backends may be used for extracting values :
369
370							=over
371
372							=item Regex
373
374							using regular expressions to parse the XML content.
375
376							=item LibXML
377
378							using L<XML::LibXML::Reader> to parse the XML content.
379							It is probably safer but two to three times slower than the Regex backend
380							(but still much faster than L<Spreadsheet::ParseXLSX>).
381
382							=back
383
384							The default is the C<Regex> backend.
385
386							=head2 Sheet numbering
387
388							Although worksheets are usually accessed by name, they may also
389							be accessed by numerical indices, I<starting at value 1>.
390							Some other Perl parsing modules use a different convention, where the first sheet has index 0.
391							Here index 1 was chosen to be consistent with the common API for "collections" in
392							Microsoft Office object model.
393
394
395							=head1 NOTE ON ITERATORS
396
397							Methods L</ivalues> and L</itable> return I<iterators>.
398							Each call to the iterator produces a new data row from the Excel content, until reaching
399							the end of data where the iterator returns C<undef>. Following the L<Iterator::Simple> protocol,
400							iterators support three different but semantically equivalent syntaxes :
401
402							while (my $row = $iterator->()) { process($row) }
403
404							while (my $row = $iterator->next) { process($row) }
405
406							while (<$iterator>) { process($_) }
407
408							Working with iterators is especially interesting when dealing with large Excel files, because
409							rows can be processed one at a time instead of being loaded all at once in memory. For example
410							a typical pattern for loading the Excel content into a database would be :
411
412							my $iter = $valuereader->ivalues('MySheet');
413							my $sth = $dbh->prepare("INSERT INTO MYTABLE(col1, col2, col3) VALUES (?, ?, ?)");
414							while (my $row = $iter->()) {
415							$sth->execute(@$row);
416							}
417
418							As another example, suppose a large population table, from which we want to produce a list of
419							list of minor girls. This can be done with a combination of iterator operations :
420
421							use Iterator::Simple qw/igrep imap/;
422							use Iterator::Simple::Util qw/ireduce/;
423							my $minor_girls = ireduce {"$a, $b"} # successive results joined with ", "
424							imap {"$_->{firstname} $_->{lastname}"} # produce a flat string from an input record with first/last name
425							igrep {$_->{gender} eq 'F' && $_->{age} < 18} # filter input records
426							$valuereader->itable('Population'); # source iterator
427
428							=head1 METHODS
429
430							=head2 new
431
432							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle, %options);
433							# .. or with syntactic sugar :
434							my $reader = Excel::ValueReader::XLSX->new($filename_or_handle, %options);
435
436							The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed,
437							or to an open filehandle.
438
439							Options are :
440
441							=over
442
443							=item C<using>
444
445							The backend to be used for parsing; default is 'Regex'.
446
447							=item C<date_format>, C<time_format>, C<datetime_format>, C<date_formatter>
448
449							Parameters for formatting date and time values; these are described in the L</DATE AND TIME FORMATS> section below.
450
451							=back
452
453
454
455							=head2 sheet_names
456
457							my @sheets = $reader->sheet_names;
458
459							Returns the list of worksheet names, in the same order as in the Excel file.
460							The first name in the list corresponds to sheet number 1.
461
462							=head2 active_sheet
463
464							my $active_sheet_number = $reader->active_sheet;
465
466							Returns the numerical index (starting at 1) of the sheet that was active when the file was last saved.
467							May return C<undef>.
468
469							=head2 values
470
471							my ($ref, $grid) = $reader->values($sheet);
472							# or
473							my $grid = $reader->values($sheet);
474
475							Returns a pair where
476
477							=over
478
479							=item *
480
481							the first item is a string that describes the range of the sheet,
482							in Excel A1 format (like for example C<A1:F12>
483
484							=item *
485
486							the second item is a bidimensional array of scalars (in other words,
487							an arrayref of arrayrefs of scalars), corresponding to cell
488							values in the specified worksheet.
489
490
491							=back
492
493							The C<$sheet> argument can be either a sheet name or a sheet position (starting at 1).
494
495							When called in scalar context, this method only returns the grid of values.
496
497							Unlike the original Excel cells, positions in the grid are zero-based,
498							so for example the content of cell B3 is in C<< $grid->[1][2] >>.
499							The grid is sparse : the size of each row depends on the
500							position of the last non-empty cell in that row.
501							Thanks to Perl's auto-vivification mechanism, any attempt to access
502							a non-existent cell will automatically create the corresponding cell
503							within the grid. The number of rows and columns in the grid can be computed
504							like this :
505
506							my $nb_rows = @$grid;
507							my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max
508
509							Alternatively, these numbers can also be obtained through the L</range_from_ref> method.
510
511
512							=head2 ivalues
513
514							my ($ref, $iterator) = $reader->ivalues($sheet);
515							# or
516							my $iterator = $reader->ivalues($sheet);
517
518							while (my $row = $iterator->()) {
519							say join ", ", @$row;
520							}
521
522							Like the L</values> method, except that it returns an iterator instead of a fully populated data grid.
523							Data rows are retrieved through successive calls to the iterator.
524
525
526							=head2 table_names
527
528							my @table_names = $reader->table_names;
529
530							Returns the list of names of tables registered in this workbook.
531
532							=head2 table
533
534							my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name)
535							# or
536							my ($columns, $rows) = $reader->table(name => $table_name);
537							# or
538							my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $ref]
539							[, columns => \@columns]
540							[, no_headers => 1]
541							[, with_totals => 1]
542							[, want_records => 0]
543							);
544
545							In its simplest form, this method returns the content of an Excel table referenced by its table name
546							(in Excel, the table name appears and can be modified through the ribbon tab
547							entry "Table tools / Design").
548							The table name is passed either through the named argument C<name>, or positionally as unique argument
549							to the method.
550
551							In list context, the method returns a pair, where the first element is
552							an arrayref of column names, and the second element is an arrayref of rows.
553							In scalar context, the method just returns the arrayref of rows.
554
555							Rows are normally returned as hashrefs, where keys of the hashes correspond to column names
556							in the table. Under option C<< want_records => 0>>, rows are returned as arrayrefs, and it is up
557							to the client to make the correspondance with column names in C<$columns>.
558
559							Instead of specifying a table name, it is also possible to give a sheet name or sheet number.
560							By default, this considers the whole sheet content as a single table, where column names
561							are on the first row. However, additional arguments can be supplied to change the default
562							behaviour :
563
564							=over
565
566							=item ref
567
568							a specific range of cells within the sheet that contain the table rows and columns.
569							The range must be expressed using traditional Excel notation,
570							like for example C<"C9:E23"> (columns 3 to 5, rows 9 to 23).
571
572							=item columns
573
574							an arrayref containing the list of column names.
575							If absent, column names will be taken from the first row in the table.
576
577							=item no_headers
578
579							if true, the first row in the table will be treated as a regular data row, instead
580							of being treated as a list of column names. In that case, since column names cannot
581							be inferred from cell values in the first row, the C<columns> argument to the method
582							must be present.
583
584							=item with_totals
585
586							For tables that have a "totals" row (turned on by a specific checkbox
587							in the Excel ribbon), this row is normally not included in the result.
588							To include it as a final row, pass a true value to the C<with_totals> option.
589
590							=back
591
592
593							=head1 AUXILIARY METHODS
594
595							=head2 A1_to_num
596
597							my $col_num = $reader->A1_to_num('A'); # 1
598							$col_num = $reader->A1_to_num('AZ'); # 52
599							$col_num = $reader->A1_to_num('AA'); # 26
600							$col_num = $reader->A1_to_num('ABC'); # 731
601
602							Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation)
603							into the corresponding numeric value.
604
605							The module also has a global hash C<$Excel::ValueReader::XLSX::A1_to_num_memoized> where
606							results from such conversions are memoized.
607
608							=head2 range_from_ref
609
610							my ($col1, $row1, $col2, $row2) = $reader->range_from_ref("C4:BB123");
611
612							Returns the coordinates of the topleft and bottomright cells corresponding to a given Excel range.
613
614
615							=head2 table_info
616
617							my $info = $reader->table_info->{$table_name};
618
619							Returns information about an Excel table in the form of a hashref with keys
620
621							=over
622
623							=item name
624
625							the name of the table
626
627							=item ref
628
629							the range of the table, in Excel notation (e.g "G6:Z44")
630
631							=item columns
632
633							an arrayref of column names
634
635							=item id
636
637							numerical id of the table
638
639							=item sheet
640
641							numerical id of the sheet to which the table belongs
642
643							=item no_headers
644
645							boolean flag corresponding to the negation of the checkbox "Headers row" in Excel.
646							By default tables have a header row, both in Excel and in this module.
647
648							=item has_totals
649
650							boolean flag corresponding to the checkbox "Totals row" in Excel.
651							By default tables have no totals row, both in Excel and in this module.
652
653
654							=back
655
656
657							=head2 formatted_date
658
659							my $date = $reader->formatted_date($numeric_date, $excel_date_format);
660
661							Given a numeric date, this method returns a string date formatted according
662							to the I<date formatter> routine explained in the next section. The C<$excel_date_format>
663							argument should be the Excel format string for that specific cell; it is used
664							only for for deciding if the numeric value should be presented as a date, as a time,
665							or both. Optionally, a custom date formatter callback could be passed as third argument.
666
667
668
669
670
671
672							=head1 DATE AND TIME FORMATS
673
674							=head2 Date and time handling
675
676							In Excel, date and times values are stored as numeric values, where the integer part
677							represents the date, and the fractional part represents the time. What distinguishes
678							such numbers from ordinary numbers is the I<numeric format> applied to the cells
679							where they appear.
680
681							Numeric formats in Excel are complex to reproduce, in particular
682							because they are locale-dependent; therefore the present module does not attempt
683							to faithfully interpret Excel formats. It just infers from formats which
684							cells should be presented as date and/or time values. All such values are then
685							presented through the same I<date_formatter> routine. The default formatter
686							is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter>
687							parameter (explained below).
688
689							=head2 Parameters for the default strftime formatter
690
691							When using the default strftime formatter, the following parameters may be passed
692							to the constructor :
693
694							=over
695
696							=item date_format
697
698							The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>.
699
700							=item time_format
701
702							The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>.
703
704							=item datetime_format
705
706							The L<POSIX/strftime> format for representing date and time together.
707							The default is the concatenation of C<date_format> and C<time_format>, with
708							a space in between.
709
710							=back
711
712
713							=head2 Writing a custom formatter
714
715							A custom algorithm for date formatting can be specified as a parameter to the constructor
716
717							my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
718							date_formatter => sub {...});
719
720							If this parameter is C<undef>, date formatting is canceled and therefore date and
721							time values will be presented as plain numbers.
722
723							If not C<undef>, the date formatting routine will we called as :
724
725							$date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond);
726
727							where
728
729							=over
730
731							=item *
732
733							C<$excel_date_format> is the Excel numbering format associated to that cell, like for example
734							C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description.
735							This is useful to decide if the value should be presented as a date, a time, or both.
736							The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should
737							be presented as a date; if the format contains C<h> or C<s>, it should be presented
738							as a time. The letter C<m> is not taken into consideration because it is ambiguous :
739							depending on the position in the format string, it may represent either a "month" or a "minute".
740
741							=item *
742
743							C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904,
744							see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>)
745							is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the
746							C<year> component will always be above this value.
747
748							=item *
749
750							C<month> is the numeric value of the month, starting at 1
751
752							=item *
753
754							C<day> is the numeric value of the day in month, starting at 1
755
756							=item *
757
758							C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding
759							numeric values.
760
761							=back
762
763
764							=head1 CAVEATS
765
766							=over
767
768							=item *
769
770							This module was optimized for speed, not for completeness of
771							OOXML-SpreadsheetML support; so there may be some edge cases where the
772							output is incorrect with respect to the original Excel data.
773
774							=item *
775
776							Embedded newline characters in strings are stored in Excel as C<< \r\n >>,
777							following the old Windows convention. When retrieved through the C<Regex>
778							backend, the result contains the original C<< \r >> and C<< \n >> characters;
779							but when retrieved through the C<LibXML> backend, C<< \r >> are silently removed by the
780							C<XML::LibXML> package.
781
782							=back
783
784							=head1 SEE ALSO
785
786							The official reference for OOXML-SpreadsheetML format is in
787							L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>.
788
789							Introductory material on XLSX file structure can be found at
790							L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>.
791
792							Concurrent modules L<Spreadsheet::ParseXLSX> or L<Data::XLSX::Parser>.
793
794							Another unpublished but working module for parsing Excel files in Perl
795							can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>.
796							Some test cases were borrowed from that distribution.
797
798							Conversions from and to Excel internal date format can also be performed
799							through the L<DateTime::Format::Excel> module.
800
801							=head1 BENCHMARKS
802
803							Below are some comparative figures. The task
804							computed here was to read a large Excel file with 800131 rows of 7 columns,
805							and report the total number of rows. Reported figures are in seconds.
806
807							Spreadsheet::ParseXLSX 1272 elapsed, 870 cpu, 4 system
808							Data::XLSX::Parser 125 elapsed, 107 cpu, 1 system
809							Excel::ValueReader::XLSX::Regex 40 elapsed, 32 cpu, 0 system
810							Excel::ValueReader::XLSX::Regex, iterator 34 elapsed, 30 cpu, 0 system
811							Excel::ValueReader::XLSX::LibXML 101 elapsed, 83 cpu, 0 system
812							Excel::ValueReader::XLSX::LibXML, iterator 91 elapsed, 80 cpu, 0 system
813
814							=head1 ACKNOWLEDGMENTS
815
816							=over
817
818							=item *
819
820							David Flink signaled (and fixed) a bug about strings with embedded newline characters, and
821							signaled that the 'r' attribute in cells is optional.
822
823							=item *
824
825							Ulibuck signaled bugs several minor bugs on the LibXML backend.
826
827							=item *
828
829							H.Merijn Brand suggested additions to the API and several improvements to the code source.
830
831							=item *
832
833							Ephraim Stevens signaled a bug in the table() method with 'ref' param.
834
835							=back
836
837
838							=head1 AUTHOR
839
840							Laurent Dami, E<lt>dami at cpan.orgE<gt>
841
842							=head1 COPYRIGHT AND LICENSE
843
844							Copyright 2020-2025 by Laurent Dami.
845
846							This library is free software; you can redistribute it and/or modify
847							it under the same terms as Perl itself.
848
849							=cut
850