File Coverage

blib/lib/Excel/ValueReader/XLSX.pm
Criterion Covered Total %
statement 141 144 97.9
branch 50 60 83.3
condition 14 23 60.8
subroutine 25 25 100.0
pod 8 9 88.8
total 238 261 91.1


line stmt bran cond sub pod time code
1             package Excel::ValueReader::XLSX;
2 2     2   369617 use 5.12.1;
  2         10  
3 2     2   13 use utf8;
  2         3  
  2         19  
4 2     2   2812 use Moose;
  2         1319106  
  2         16  
5 2     2   21901 use MooseX::StrictConstructor;
  2         89902  
  2         11  
6 2     2   26607 use Moose::Util::TypeConstraints qw/union/;
  2         5  
  2         26  
7 2     2   1305 use Module::Load qw/load/;
  2         26  
  2         28  
8 2     2   1748 use Date::Calc qw/Add_Delta_Days/;
  2         19477  
  2         281  
9 2     2   1439 use POSIX qw/strftime modf/;
  2         18155  
  2         16  
10 2     2   4646 use Carp qw/croak/;
  2         6  
  2         130  
11 2     2   934 use Iterator::Simple qw/iter/;
  2         3357  
  2         5757  
12              
13             #======================================================================
14             # GLOBALS
15             #======================================================================
16              
17             our $VERSION = '1.17';
18             our %A1_to_num_memoized;
19              
20             #======================================================================
21             # TYPES AND ATTRIBUTES
22             #======================================================================
23              
24             # TYPES
25             my $XlsxSource = union([qw/Str FileHandle/]);
26              
27             # PUBLIC ATTRIBUTES
28             has 'xlsx' => (is => 'ro', isa => $XlsxSource, required => 1); # path of xlsx file
29             has 'using' => (is => 'ro', isa => 'Str', default => 'Regex'); # name of backend class
30             has 'date_format' => (is => 'ro', isa => 'Str', default => '%d.%m.%Y');
31             has 'time_format' => (is => 'ro', isa => 'Str', default => '%H:%M:%S');
32             has 'datetime_format' => (is => 'ro', isa => 'Str', builder => '_datetime_format', lazy => 1);
33             has 'date_formatter' => (is => 'ro', isa => 'Maybe[CodeRef]', builder => '_date_formatter', lazy => 1);
34              
35             # ATTRIBUTES USED INTERNALLY, NOT DOCUMENTED
36             has 'backend' => (is => 'ro', isa => 'Object', builder => '_backend', lazy => 1,
37             init_arg => undef, handles => [qw/base_year sheets active_sheet table_info/]);
38              
39             #======================================================================
40             # BUILDING
41             #======================================================================
42              
43             # syntactic sugar for supporting ->new($path) instead of ->new(xlsx => $path)
44             around BUILDARGS => sub {
45             my $orig = shift;
46             my $class = shift;
47              
48             unshift @_, 'xlsx' if scalar(@_) % 2 and $XlsxSource->check($_[0]);
49              
50             $class->$orig(@_);
51             };
52              
53              
54             #======================================================================
55             # ATTRIBUTE CONSTRUCTORS
56             #======================================================================
57              
58             sub _backend {
59 30     30   63 my $self = shift;
60              
61 30         1089 my $backend_class = ref($self) . '::Backend::' . $self->using;
62 30         238 load $backend_class;
63              
64 30         2912 return $backend_class->new(frontend => $self);
65             }
66              
67             sub _datetime_format {
68 24     24   63 my ($self) = @_;
69 24         856 return $self->date_format . ' ' . $self->time_format;
70             }
71              
72             sub _date_formatter {
73 24     24   60 my ($self) = @_;
74              
75             # local copies of the various formats so that we can build a closure
76 24         871 my @formats = (undef, # 0 -- error
77             $self->date_format, # 1 -- just a date
78             $self->time_format, # 2 -- just a time
79             $self->datetime_format); # 3 -- date and time
80              
81             my $strftime_formatter = sub {
82 312     312   906 my ($xl_date_format, $y, $m, $d, $h, $min, $s, $ms) = @_;
83              
84             # choose the proper format for strftime
85 312         470 my $ix = 0; # index into the @formats array
86 312 100       1161 $ix += 1 if $xl_date_format =~ /[dy]/; # the Excel format contains a date portion
87 312 100       852 $ix += 2 if $xl_date_format =~ /[hs]/; # the Excel format contains a time portion
88 312 50       983 my $strftime_format = $formats[$ix]
89             or die "cell with unexpected Excel date format : $xl_date_format";
90              
91             # formatting through strftime
92 312         26850 my $formatted_date = strftime($strftime_format, $s, $min, $h, $d, $m-1, $y-1900);
93              
94 312         1246 return $formatted_date;
95 24         208 };
96              
97 24         891 return $strftime_formatter;
98             }
99              
100              
101             #======================================================================
102             # GENERAL METHODS
103             #======================================================================
104              
105              
106             sub sheet_names {
107 6     6 1 16356 my ($self) = @_;
108              
109 6         42 my $sheets = $self->sheets; # hashref of shape {$name => $sheet_position}
110              
111 6         61 my @sorted_names = sort {$sheets->{$a} <=> $sheets->{$b}} keys %$sheets;
  81         161  
112 6         45 return @sorted_names;
113             }
114              
115              
116 44     44 1 81341 sub values { my ($self, $sheet) = @_; $self->backend->_values($sheet, 0)}
  44         1924  
117 44     44 1 78212 sub ivalues { my ($self, $sheet) = @_; $self->backend->_values($sheet, 1)}
  44         2008  
118              
119             sub formatted_date {
120 312     312 1 21596 my ($self, $val, $date_format, $date_formatter) = @_;
121              
122             # separate date (integer part) from time (fractional part)
123 312         1356 my ($time, $n_days) = modf($val);
124              
125             # Convert $n_days into a date in Date::Calc format (year, month, day).
126             # The algorithm is quite odd because in the 1900 system, 01.01.1900 == 0 while
127             # in the 1904 system, 01.01.1904 == 1; furthermore, in the 1900 system,
128             # Excel treats 1900 as a leap year.
129 312         1225 my $base_year = $self->base_year;
130 312 100       816 if ($base_year == 1900) {
131 228         515 my $is_after_february_1900 = $n_days > 60;
132 228 100       549 $n_days -= $is_after_february_1900 ? 2 : 1;
133             }
134 312         1434 my @d = Add_Delta_Days($base_year, 1, 1, $n_days);
135              
136             # decode the fractional part (the time) into hours, minutes, seconds, milliseconds
137 312         511 my @t;
138 312         636 foreach my $subdivision (24, 60, 60, 1000) {
139 1248         1846 $time *= $subdivision;
140 1248         2643 ($time, my $time_portion) = modf($time);
141 1248         2381 push @t, $time_portion;
142             }
143              
144             # dirty hack to deal with float imprecisions : if 999 millisecs, round to the next second
145 312         656 my ($h, $m, $s, $ms) = @t;
146 312 100       712 if ($ms == 999) {
147 4         15 $s += 1, $ms = 0;
148 4 50       15 if ($s == 60) {
149 0         0 $m += 1, $s = 0;
150 0 0       0 if ($m == 60) {
151 0         0 $h += 1, $m = 0;
152             }
153             }
154             }
155             # NOTE : because of this hack, theoretically we could end up with a value
156             # like 01.01.2000 24:00:00, semantically equal to 02.01.2000 00:00:00 but different
157             # in its rendering.
158              
159             # call the date_formatter subroutine
160 312 50 33     12959 $date_formatter //= $self->date_formatter
161             or die ref($self) . " has no date_formatter subroutine";
162 312         828 my $formatted_date = $date_formatter->($date_format, @d, $h, $m, $s, $ms);
163              
164 312         1597 return $formatted_date;
165             }
166              
167             #======================================================================
168             # METHODS FOR PARSING EXCEL TABLES
169             #======================================================================
170              
171              
172             sub table_names {
173 4     4 1 13492 my ($self) = @_;
174              
175 4         32 my $table_info = $self->table_info;
176              
177             # sort on table id (field [1] in table_info arrayrefs)
178 4         44 my @table_names = sort {$table_info->{$a}{id} <=> $table_info->{$b}{id}} keys %$table_info;
  38         128  
179              
180 4         37 return @table_names;
181             }
182              
183 20     20 1 23482 sub table {shift->_table(0, @_);} # 0 = does not want an iterator, just a regular arrayref
184 20     20 0 25345 sub itable {shift->_table(1, @_);} # 1 = does want an iterator
185              
186             my %valid_table_arg = map {$_ => 1} qw/name sheet ref columns no_headers with_totals want_records/;
187             sub _table {
188 40     40   92 my $self = shift;
189 40         70 my $want_iterator = shift;
190              
191             # syntactic sugar : ->table('foo') is treated as ->table(name => 'foo')
192 40 100       217 unshift @_, 'name' if scalar(@_) % 2;
193 40         201 my %args = @_;
194              
195             # check for invalid args
196 40         142 my @invalid_args = grep {!$valid_table_arg{$_}} keys %args;
  56         213  
197 40 50       146 croak "invalid table args: ", join(", ", @invalid_args) if @invalid_args;
198              
199             # defaults
200 40   100     243 $args{want_records} //= 1;
201              
202             # if called with a table name, derive positional args from the internal workbook info
203 40 100       125 if (my $table_name = delete $args{name}) {
204 32 50       177 my $table_info = $self->table_info->{$table_name}
205             or croak sprintf "Excel file '%s' contains no table named '%s'", $self->xlsx, $table_name;
206 32   66     655 $args{$_} //= $table_info->{$_} for keys %$table_info;
207             }
208              
209             # get values from the sheet
210             my ($sheet_ref, $vals_or_it) = $want_iterator ? $self->ivalues($args{sheet})
211 40 100       217 : $self->values($args{sheet});
212              
213             # table boundaries
214 40         522 my ($scol1, $srow1, $scol2, $srow2) = $self->range_from_ref($sheet_ref);
215             my ($tcol1, $trow1, $tcol2, $trow2) = $args{ref} ? $self->range_from_ref($args{ref})
216 40 100       173 : ($scol1, $srow1, $scol2, $srow2);
217 40         110 my $skip_initial_rows = $trow1 - $srow1;
218 40         91 my $keep_rows = $trow2 - $trow1 + 1;
219 40         64 my $skip_initial_cols = $tcol1 - $scol1;
220 40         107 my $keep_cols = $tcol2 - $tcol1 + 1;
221              
222             # if a totals row is present, skip it, unless the 'with_totals' arg is present
223 40 100 100     172 $keep_rows -=1 if $args{has_totals} and !$args{with_totals};
224              
225             # skip initial rows if the table does not start at top row
226 40 100       103 if ($skip_initial_rows) {
227 20 100       48 if ($want_iterator) {$vals_or_it->() while $skip_initial_rows-- > 0;}
  10         50  
228 10         40 else {splice @$vals_or_it, 0, $skip_initial_rows}
229             }
230              
231             # read headers from first row -- even if this may be redundant with the 'columns' list declared in the table description
232 40         115 my $headers;
233 40 100       115 unless ($args{no_headers}) {
234 36         63 $keep_rows--;
235 36 100       106 $headers = $want_iterator ? $vals_or_it->() : shift @$vals_or_it;
236 36 100       101 splice @$headers, 0, $skip_initial_cols if $skip_initial_cols;
237 36         95 splice @$headers, $keep_cols;
238             }
239 40   66     121 $args{columns} //= $headers;
240 40 50       60 croak "table contains undefined columns" if grep {!defined $_} @{$args{columns}};
  128         251  
  40         133  
241              
242             # dual closure : can be used as an iterator or can compute all values in one call
243 40         70 my @rows;
244             my $get_values = sub {
245 348     348   1951 while (1) {
246 656 100 66     2035 $keep_rows--
    100          
247             and my $vals = $want_iterator ? $vals_or_it->() : shift @$vals_or_it
248             or return; # no more records -- end of iterator
249 616 100       1109 splice @$vals, 0, $skip_initial_cols if $skip_initial_cols;
250 616         961 splice @$vals, $keep_cols;
251 616 100       1053 my $row = $args{want_records} ? do {my %r; @r{@{$args{columns}}} = @$vals; \%r}
  608         755  
  608         861  
  608         3424  
  608         1152  
252             : $vals;
253 616 100       1064 if ($want_iterator) {return $row} # current iteration successful
  308         1219  
254 308         531 else {push @rows, $row};
255             }
256 40         257 };
257              
258             # return either an iterator or the accumulated table records
259             return ($args{columns}, $want_iterator ? iter($get_values)
260 40 100       162 : do {$get_values->(); \@rows});
  20         48  
  20         479  
261             }
262              
263              
264             sub A1_to_num { # convert Excel A1 reference format to a number
265 15     15 1 42 my ($self, $A1) = @_;
266 15         28 my $num = 0;
267 15         56 foreach my $digit (unpack "C*", $A1) {
268 17         44 $num = $num*26 + $digit-64;
269             }
270              
271 15         66 return $num;
272             }
273              
274              
275             sub range_from_ref { # convert a range reference like 'C9:E21' into ($col1, $row1, $col2, $row2)
276 76     76 1 163 my ($self, $range) = @_;
277              
278 76 50       555 $range =~ /^([A-Z]+)(\d+) # mandatory 1st col and row
279             (?: # .. optionally followed by
280             : # colon
281             ([A-Z]+)(\d+) # and 2nd col and row
282             )?
283             $/x
284             or croak "->range_from_ref($range) : invalid ref";
285              
286 76   33     390 my @range = ($A1_to_num_memoized{$1} //= $self->A1_to_num($1), $2); # col, row of topleft cell
287 76 50 33     400 push @range, ($3 ? ($A1_to_num_memoized{$3} //= $self->A1_to_num($3), $4) # col, row of bottomright cell, or ..
288             : @range); # .. copy of topleft cell
289 76         336 return @range;
290             }
291              
292              
293              
294              
295              
296              
297             1;
298              
299              
300             __END__
301              
302             =head1 NAME
303              
304              
305             Excel::ValueReader::XLSX - extracting values from Excel workbooks in XLSX format, fast
306              
307             =head1 SYNOPSIS
308              
309             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle);
310             # .. or with syntactic sugar :
311             my $reader = Excel::ValueReader::XLSX->new($filename_or_handle);
312             # .. or with LibXML backend :
313             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle,
314             using => 'LibXML');
315            
316             foreach my $sheet_name ($reader->sheet_names) {
317             my $grid = $reader->values($sheet_name);
318             my $n_rows = @$grid;
319             print "sheet $sheet_name has $n_rows rows; ",
320             "first cell contains : ", $grid->[0][0];
321             }
322            
323             foreach my $table_name ($reader->table_names) {
324             my ($columns, $records) = $reader->table($table_name);
325             my $n_records = @$records;
326             my $n_columns = @$columns;
327             print "table $table_name has $n_records records and $n_columns columns; ",
328             "column 'foo' in first row contains : ", $records->[0]{foo};
329             }
330            
331             my $first_grid = $reader->values(1); # the arg can be a sheet index instead of a sheet name
332            
333             # iterator version of ->values()
334             my $iterator = $reader->ivalues($sheet_name);
335             while (my $row = $iterator->()) { process_row($row) }
336              
337             # iterator version of ->table()
338             my ($columns, $iterator) = $reader->itable($table_name);
339             while (my $record = $iterator->()) { process_record($record) }
340              
341             =head1 DESCRIPTION
342              
343             =head2 Purpose
344              
345             This module reads the contents of an Excel file in XLSX format.
346             Unlike other modules like L<Spreadsheet::ParseXLSX> or L<Data::XLSX::Parser>,
347             this module has no support for reading formulas, formats or other Excel internal
348             information; all you get are plain values -- but you get them much faster !
349              
350             Besides, this API has some features not found in concurrent parsers :
351              
352             =over
353              
354             =item *
355              
356             has support for parsing Excel tables
357              
358             =item *
359              
360             iterator methods for getting one row at a time from a worksheet or from a table -- very useful for sparing
361             memory when dealing with large Excel files.
362              
363             =back
364              
365              
366             =head2 Backends
367              
368             Two different backends may be used for extracting values :
369              
370             =over
371              
372             =item Regex
373              
374             using regular expressions to parse the XML content.
375              
376             =item LibXML
377              
378             using L<XML::LibXML::Reader> to parse the XML content.
379             It is probably safer but two to three times slower than the Regex backend
380             (but still much faster than L<Spreadsheet::ParseXLSX>).
381              
382             =back
383              
384             The default is the C<Regex> backend.
385              
386             =head2 Sheet numbering
387              
388             Although worksheets are usually accessed by name, they may also
389             be accessed by numerical indices, I<starting at value 1>.
390             Some other Perl parsing modules use a different convention, where the first sheet has index 0.
391             Here index 1 was chosen to be consistent with the common API for "collections" in
392             Microsoft Office object model.
393              
394              
395             =head1 NOTE ON ITERATORS
396              
397             Methods L</ivalues> and L</itable> return I<iterators>.
398             Each call to the iterator produces a new data row from the Excel content, until reaching
399             the end of data where the iterator returns C<undef>. Following the L<Iterator::Simple> protocol,
400             iterators support three different but semantically equivalent syntaxes :
401              
402             while (my $row = $iterator->()) { process($row) }
403            
404             while (my $row = $iterator->next) { process($row) }
405            
406             while (<$iterator>) { process($_) }
407              
408             Working with iterators is especially interesting when dealing with large Excel files, because
409             rows can be processed one at a time instead of being loaded all at once in memory. For example
410             a typical pattern for loading the Excel content into a database would be :
411              
412             my $iter = $valuereader->ivalues('MySheet');
413             my $sth = $dbh->prepare("INSERT INTO MYTABLE(col1, col2, col3) VALUES (?, ?, ?)");
414             while (my $row = $iter->()) {
415             $sth->execute(@$row);
416             }
417            
418             As another example, suppose a large population table, from which we want to produce a list of
419             list of minor girls. This can be done with a combination of iterator operations :
420              
421             use Iterator::Simple qw/igrep imap/;
422             use Iterator::Simple::Util qw/ireduce/;
423             my $minor_girls = ireduce {"$a, $b"} # successive results joined with ", "
424             imap {"$_->{firstname} $_->{lastname}"} # produce a flat string from an input record with first/last name
425             igrep {$_->{gender} eq 'F' && $_->{age} < 18} # filter input records
426             $valuereader->itable('Population'); # source iterator
427              
428             =head1 METHODS
429              
430             =head2 new
431              
432             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename_or_handle, %options);
433             # .. or with syntactic sugar :
434             my $reader = Excel::ValueReader::XLSX->new($filename_or_handle, %options);
435              
436             The C<xlsx> argument is mandatory and points to the C<.xlsx> file to be parsed,
437             or to an open filehandle.
438              
439             Options are :
440              
441             =over
442              
443             =item C<using>
444              
445             The backend to be used for parsing; default is 'Regex'.
446              
447             =item C<date_format>, C<time_format>, C<datetime_format>, C<date_formatter>
448              
449             Parameters for formatting date and time values; these are described in the L</DATE AND TIME FORMATS> section below.
450              
451             =back
452              
453              
454              
455             =head2 sheet_names
456              
457             my @sheets = $reader->sheet_names;
458              
459             Returns the list of worksheet names, in the same order as in the Excel file.
460             The first name in the list corresponds to sheet number 1.
461              
462             =head2 active_sheet
463              
464             my $active_sheet_number = $reader->active_sheet;
465              
466             Returns the numerical index (starting at 1) of the sheet that was active when the file was last saved.
467             May return C<undef>.
468              
469             =head2 values
470              
471             my ($ref, $grid) = $reader->values($sheet);
472             # or
473             my $grid = $reader->values($sheet);
474              
475             Returns a pair where
476              
477             =over
478              
479             =item *
480              
481             the first item is a string that describes the range of the sheet,
482             in Excel A1 format (like for example C<A1:F12>
483              
484             =item *
485              
486             the second item is a bidimensional array of scalars (in other words,
487             an arrayref of arrayrefs of scalars), corresponding to cell
488             values in the specified worksheet.
489              
490              
491             =back
492              
493             The C<$sheet> argument can be either a sheet name or a sheet position (starting at 1).
494              
495             When called in scalar context, this method only returns the grid of values.
496              
497             Unlike the original Excel cells, positions in the grid are zero-based,
498             so for example the content of cell B3 is in C<< $grid->[1][2] >>.
499             The grid is sparse : the size of each row depends on the
500             position of the last non-empty cell in that row.
501             Thanks to Perl's auto-vivification mechanism, any attempt to access
502             a non-existent cell will automatically create the corresponding cell
503             within the grid. The number of rows and columns in the grid can be computed
504             like this :
505              
506             my $nb_rows = @$grid;
507             my $nb_cols = max map {scalar @$_} @$grid; # must import List::Util::max
508              
509             Alternatively, these numbers can also be obtained through the L</range_from_ref> method.
510              
511              
512             =head2 ivalues
513              
514             my ($ref, $iterator) = $reader->ivalues($sheet);
515             # or
516             my $iterator = $reader->ivalues($sheet);
517            
518             while (my $row = $iterator->()) {
519             say join ", ", @$row;
520             }
521              
522             Like the L</values> method, except that it returns an iterator instead of a fully populated data grid.
523             Data rows are retrieved through successive calls to the iterator.
524              
525              
526             =head2 table_names
527              
528             my @table_names = $reader->table_names;
529              
530             Returns the list of names of tables registered in this workbook.
531              
532             =head2 table
533              
534             my $rows = $reader->table(name => $table_name); # or just : $reader->table($table_name)
535             # or
536             my ($columns, $rows) = $reader->table(name => $table_name);
537             # or
538             my ($columns, $rows) = $reader->table(sheet => $sheet [, ref => $ref]
539             [, columns => \@columns]
540             [, no_headers => 1]
541             [, with_totals => 1]
542             [, want_records => 0]
543             );
544              
545             In its simplest form, this method returns the content of an Excel table referenced by its table name
546             (in Excel, the table name appears and can be modified through the ribbon tab
547             entry "Table tools / Design").
548             The table name is passed either through the named argument C<name>, or positionally as unique argument
549             to the method.
550              
551             In list context, the method returns a pair, where the first element is
552             an arrayref of column names, and the second element is an arrayref of rows.
553             In scalar context, the method just returns the arrayref of rows.
554              
555             Rows are normally returned as hashrefs, where keys of the hashes correspond to column names
556             in the table. Under option C<< want_records => 0>>, rows are returned as arrayrefs, and it is up
557             to the client to make the correspondance with column names in C<$columns>.
558              
559             Instead of specifying a table name, it is also possible to give a sheet name or sheet number.
560             By default, this considers the whole sheet content as a single table, where column names
561             are on the first row. However, additional arguments can be supplied to change the default
562             behaviour :
563              
564             =over
565              
566             =item ref
567              
568             a specific range of cells within the sheet that contain the table rows and columns.
569             The range must be expressed using traditional Excel notation,
570             like for example C<"C9:E23"> (columns 3 to 5, rows 9 to 23).
571              
572             =item columns
573              
574             an arrayref containing the list of column names.
575             If absent, column names will be taken from the first row in the table.
576              
577             =item no_headers
578              
579             if true, the first row in the table will be treated as a regular data row, instead
580             of being treated as a list of column names. In that case, since column names cannot
581             be inferred from cell values in the first row, the C<columns> argument to the method
582             must be present.
583              
584             =item with_totals
585              
586             For tables that have a "totals" row (turned on by a specific checkbox
587             in the Excel ribbon), this row is normally not included in the result.
588             To include it as a final row, pass a true value to the C<with_totals> option.
589              
590             =back
591              
592              
593             =head1 AUXILIARY METHODS
594              
595             =head2 A1_to_num
596              
597             my $col_num = $reader->A1_to_num('A'); # 1
598             $col_num = $reader->A1_to_num('AZ'); # 52
599             $col_num = $reader->A1_to_num('AA'); # 26
600             $col_num = $reader->A1_to_num('ABC'); # 731
601              
602             Converts a column expressed as a sequence of capital letters (in Excel's "A1" notation)
603             into the corresponding numeric value.
604              
605             The module also has a global hash C<$Excel::ValueReader::XLSX::A1_to_num_memoized> where
606             results from such conversions are memoized.
607              
608             =head2 range_from_ref
609              
610             my ($col1, $row1, $col2, $row2) = $reader->range_from_ref("C4:BB123");
611              
612             Returns the coordinates of the topleft and bottomright cells corresponding to a given Excel range.
613              
614              
615             =head2 table_info
616              
617             my $info = $reader->table_info->{$table_name};
618              
619             Returns information about an Excel table in the form of a hashref with keys
620              
621             =over
622              
623             =item name
624              
625             the name of the table
626              
627             =item ref
628              
629             the range of the table, in Excel notation (e.g "G6:Z44")
630              
631             =item columns
632              
633             an arrayref of column names
634              
635             =item id
636              
637             numerical id of the table
638              
639             =item sheet
640              
641             numerical id of the sheet to which the table belongs
642              
643             =item no_headers
644              
645             boolean flag corresponding to the negation of the checkbox "Headers row" in Excel.
646             By default tables have a header row, both in Excel and in this module.
647              
648             =item has_totals
649              
650             boolean flag corresponding to the checkbox "Totals row" in Excel.
651             By default tables have no totals row, both in Excel and in this module.
652              
653              
654             =back
655              
656              
657             =head2 formatted_date
658              
659             my $date = $reader->formatted_date($numeric_date, $excel_date_format);
660              
661             Given a numeric date, this method returns a string date formatted according
662             to the I<date formatter> routine explained in the next section. The C<$excel_date_format>
663             argument should be the Excel format string for that specific cell; it is used
664             only for for deciding if the numeric value should be presented as a date, as a time,
665             or both. Optionally, a custom date formatter callback could be passed as third argument.
666              
667              
668              
669              
670              
671              
672             =head1 DATE AND TIME FORMATS
673              
674             =head2 Date and time handling
675              
676             In Excel, date and times values are stored as numeric values, where the integer part
677             represents the date, and the fractional part represents the time. What distinguishes
678             such numbers from ordinary numbers is the I<numeric format> applied to the cells
679             where they appear.
680              
681             Numeric formats in Excel are complex to reproduce, in particular
682             because they are locale-dependent; therefore the present module does not attempt
683             to faithfully interpret Excel formats. It just infers from formats which
684             cells should be presented as date and/or time values. All such values are then
685             presented through the same I<date_formatter> routine. The default formatter
686             is based on L<POSIX/strftime>; other behaviours may be specified through the C<date_formatter>
687             parameter (explained below).
688              
689             =head2 Parameters for the default strftime formatter
690              
691             When using the default strftime formatter, the following parameters may be passed
692             to the constructor :
693              
694             =over
695              
696             =item date_format
697              
698             The L<POSIX/strftime> format for representing dates. The default is C<%d.%m.%Y>.
699              
700             =item time_format
701              
702             The L<POSIX/strftime> format for representing times. The default is C<%H:%M:%S>.
703              
704             =item datetime_format
705              
706             The L<POSIX/strftime> format for representing date and time together.
707             The default is the concatenation of C<date_format> and C<time_format>, with
708             a space in between.
709              
710             =back
711              
712              
713             =head2 Writing a custom formatter
714              
715             A custom algorithm for date formatting can be specified as a parameter to the constructor
716              
717             my $reader = Excel::ValueReader::XLSX->new(xlsx => $filename,
718             date_formatter => sub {...});
719              
720             If this parameter is C<undef>, date formatting is canceled and therefore date and
721             time values will be presented as plain numbers.
722              
723             If not C<undef>, the date formatting routine will we called as :
724              
725             $date_formater->($excel_date_format, $year, $month, $day, $hour, $minute, $second, $millisecond);
726              
727             where
728              
729             =over
730              
731             =item *
732              
733             C<$excel_date_format> is the Excel numbering format associated to that cell, like for example
734             C<mm-dd-yy> or C<h:mm:ss AM/PM>. See the Excel documentation for the syntax description.
735             This is useful to decide if the value should be presented as a date, a time, or both.
736             The present module uses a simple heuristic : if the format contains C<d> or C<y>, it should
737             be presented as a date; if the format contains C<h> or C<s>, it should be presented
738             as a time. The letter C<m> is not taken into consideration because it is ambiguous :
739             depending on the position in the format string, it may represent either a "month" or a "minute".
740              
741             =item *
742              
743             C<year> is the full year, such as 1993 or 2021. The date system of the Excel file (either 1900 or 1904,
744             see L<https://support.microsoft.com/en-us/office/date-systems-in-excel-e7fe7167-48a9-4b96-bb53-5612a800b487>)
745             is properly taken into account. Excel has no support for dates prior to 1900 or 1904, so the
746             C<year> component will always be above this value.
747              
748             =item *
749              
750             C<month> is the numeric value of the month, starting at 1
751              
752             =item *
753              
754             C<day> is the numeric value of the day in month, starting at 1
755              
756             =item *
757              
758             C<$hour>, C<$minute>, C<$second>, C<$millisecond> obviously contain the corresponding
759             numeric values.
760              
761             =back
762              
763              
764             =head1 CAVEATS
765              
766             =over
767              
768             =item *
769              
770             This module was optimized for speed, not for completeness of
771             OOXML-SpreadsheetML support; so there may be some edge cases where the
772             output is incorrect with respect to the original Excel data.
773              
774             =item *
775              
776             Embedded newline characters in strings are stored in Excel as C<< \r\n >>,
777             following the old Windows convention. When retrieved through the C<Regex>
778             backend, the result contains the original C<< \r >> and C<< \n >> characters;
779             but when retrieved through the C<LibXML> backend, C<< \r >> are silently removed by the
780             C<XML::LibXML> package.
781              
782             =back
783              
784             =head1 SEE ALSO
785              
786             The official reference for OOXML-SpreadsheetML format is in
787             L<https://www.ecma-international.org/publications/standards/Ecma-376.htm>.
788              
789             Introductory material on XLSX file structure can be found at
790             L<http://officeopenxml.com/anatomyofOOXML-xlsx.php>.
791              
792             Concurrent modules L<Spreadsheet::ParseXLSX> or L<Data::XLSX::Parser>.
793              
794             Another unpublished but working module for parsing Excel files in Perl
795             can be found at L<https://github.com/jmcnamara/excel-reader-xlsx>.
796             Some test cases were borrowed from that distribution.
797              
798             Conversions from and to Excel internal date format can also be performed
799             through the L<DateTime::Format::Excel> module.
800              
801             =head1 BENCHMARKS
802              
803             Below are some comparative figures. The task
804             computed here was to read a large Excel file with 800131 rows of 7 columns,
805             and report the total number of rows. Reported figures are in seconds.
806              
807             Spreadsheet::ParseXLSX 1272 elapsed, 870 cpu, 4 system
808             Data::XLSX::Parser 125 elapsed, 107 cpu, 1 system
809             Excel::ValueReader::XLSX::Regex 40 elapsed, 32 cpu, 0 system
810             Excel::ValueReader::XLSX::Regex, iterator 34 elapsed, 30 cpu, 0 system
811             Excel::ValueReader::XLSX::LibXML 101 elapsed, 83 cpu, 0 system
812             Excel::ValueReader::XLSX::LibXML, iterator 91 elapsed, 80 cpu, 0 system
813              
814             =head1 ACKNOWLEDGMENTS
815              
816             =over
817              
818             =item *
819              
820             David Flink signaled (and fixed) a bug about strings with embedded newline characters, and
821             signaled that the 'r' attribute in cells is optional.
822              
823             =item *
824              
825             Ulibuck signaled bugs several minor bugs on the LibXML backend.
826              
827             =item *
828              
829             H.Merijn Brand suggested additions to the API and several improvements to the code source.
830              
831             =item *
832              
833             Ephraim Stevens signaled a bug in the table() method with 'ref' param.
834              
835             =back
836              
837              
838             =head1 AUTHOR
839              
840             Laurent Dami, E<lt>dami at cpan.orgE<gt>
841              
842             =head1 COPYRIGHT AND LICENSE
843              
844             Copyright 2020-2025 by Laurent Dami.
845              
846             This library is free software; you can redistribute it and/or modify
847             it under the same terms as Perl itself.
848              
849             =cut
850