File Coverage

blib/lib/Excel/ValueReader/XLSX/Backend/Regex.pm
Criterion Covered Total %
statement 105 112 93.7
branch 35 42 83.3
condition 34 41 82.9
subroutine 15 15 100.0
pod n/a
total 189 210 90.0


line stmt bran cond sub pod time code
1             package Excel::ValueReader::XLSX::Backend::Regex;
2 2     2   1698 use utf8;
  2         3  
  2         17  
3 2     2   95 use 5.12.1;
  2         7  
4 2     2   10 use Moose;
  2         3  
  2         17  
5 2     2   15629 use Scalar::Util qw/looks_like_number/;
  2         3  
  2         171  
6 2     2   12 use Iterator::Simple qw/iter/;
  2         21  
  2         4977  
7              
8             extends 'Excel::ValueReader::XLSX::Backend';
9              
10              
11             #======================================================================
12             # LAZY ATTRIBUTE CONSTRUCTORS
13             #======================================================================
14              
15             sub _strings {
16 8     8   15 my $self = shift;
17 8         16 my @strings;
18              
19             # read from the sharedStrings zip member
20 8         25 my $contents = $self->_zip_member_contents('xl/sharedStrings.xml');
21              
22             # iterate on <si> nodes
23 8         546 while ($contents =~ m[<si>(.*?)</si>]sg) {
24 530         1064 my $innerXML = $1;
25              
26             # concatenate contents from all <t> nodes (usually there is only 1) and decode XML entities
27 530         1819 my $string = join "", ($innerXML =~ m[<t[^>]*>(.+?)</t>]sg);
28 530         1174 _decode_xml_entities($string);
29              
30 530         6411 push @strings, $string;
31             }
32              
33 8         410 return \@strings;
34             }
35              
36              
37             sub _workbook_data {
38 15     15   33 my $self = shift;
39              
40 15         33 my %workbook_data;
41              
42             # read from the workbook.xml zip member
43 15         66 my $workbook = $self->_zip_member_contents('xl/workbook.xml');
44              
45             # extract sheet names
46 15         1379 my @sheet_names = ($workbook =~ m[<sheet name="(.+?)"]g);
47 15         79 $workbook_data{sheets} = {map {$sheet_names[$_] => $_+1} 0 .. $#sheet_names};
  61         257  
48              
49             # does this workbook use the 1904 calendar ?
50 15         118 my ($date1904) = $workbook =~ m[date1904="(.+?)"];
51 15 100 100     134 $workbook_data{base_year} = $date1904 && $date1904 =~ /^(1|true)$/ ? 1904 : 1900;
52              
53             # active sheet
54 15         129 my ($active_tab) = $workbook =~ m[<workbookView[^>]+activeTab="(\d+)"];
55 15 100       63 $workbook_data{active_sheet} = $active_tab + 1 if defined $active_tab;
56              
57 15         806 return \%workbook_data;
58             }
59              
60              
61              
62             sub _date_styles {
63 10     10   24 my $self = shift;
64              
65 10         79 state $date_style_regex = qr{[dy]|\bmm\b};
66              
67             # read from the styles.xml zip member
68 10         45 my $styles = $self->_zip_member_contents('xl/styles.xml');
69              
70             # start with Excel builtin number formats for dates and times
71 10         591 my @numFmt = $self->Excel_builtin_date_formats;
72              
73             # add other date formats explicitly specified in this workbook
74 10         130 while ($styles =~ m[<numFmt numFmtId="(\d+)" formatCode="([^"]+)"/>]g) {
75 76         186 my ($id, $code) = ($1, $2);
76 76 100       9824 $numFmt[$id] = $code if $code =~ $date_style_regex;
77             }
78              
79             # read all cell formats, just rembember those that involve a date number format
80 10         424 my ($cellXfs) = ($styles =~ m[<cellXfs count="\d+">(.+?)</cellXfs>]);
81 10         51 my @cell_formats = $self->_extract_xf($cellXfs);
82 10         31 my @date_styles = map {$numFmt[$_->{numFmtId}]} @cell_formats;
  180         353  
83              
84 10         672 return \@date_styles; # array of shape (xf_index => numFmt_code)
85             }
86              
87              
88             sub _extract_xf {
89 10     10   29 my ($self, $xml) = @_;
90              
91 10         23 state $xf_node_regex = qr{
92             <xf # initial format tag
93             \s
94             ([^>/]*+) # attributes (captured in $1)
95             (?: # non-capturing group for an alternation :
96             /> # .. either an xml closing without content
97             | # or
98             > # .. closing for the xf tag
99             .*? # .. then some formatting content
100             </xf> # .. then the ending tag for the xf node
101             )
102             }x;
103              
104 10         21 my @xf_nodes;
105 10         142 while ($xml =~ /$xf_node_regex/g) {
106 180         1562 push @xf_nodes, _xml_attrs($1);
107             }
108 10         47 return @xf_nodes;
109             }
110              
111              
112             #======================================================================
113             # METHODS
114             #======================================================================
115              
116             sub _values {
117 44     44   172 my ($self, $sheet, $want_iterator) = @_;
118              
119             # regex for the initial preamble
120 44         88 state $preamble_regex = qr(
121             <dimension\s+ref="([A-Z]+\d+(?::[A-Z]+\d+)?)"/> # node specifying the range of defined cells
122             .*?
123             <sheetData> # start container node for actual rows and cells content
124             )xs;
125              
126             # regex for extracting information from cell nodes
127 44         66 state $row_or_cell_regex = qr(
128             <(row) # row tag ($1)
129             (?:\s+r="(\d+)")? # optional row number ($2)
130             [^>/]*? # unused attrs
131             > # end of tag
132              
133             | # .. or ..
134              
135             <(c) # cell tag ($3)
136             (?: \s+ | (?=>) ) # either a space before attrs, or end of tag
137             (?:r="([A-Z]+)(\d+)")? # capture col ($4) and row ($5)
138             [^>/]*? # unused attrs
139             (?:s="(\d+)"\s*)? # style attribute ($6)
140             (?:t="(\w+)"\s*)? # type attribute ($7)
141             (?: # non-capturing group for an alternation :
142             /> # .. either an xml closing without content
143             | # or
144             > # .. closing xml tag, followed by ..
145             (?:
146             <v>(.+?)</v> # .. a value ($8)
147             | # or
148             (.+?) # .. some node content ($9)
149             )
150             </c> # followed by a closing cell tag
151             )
152             )xs;
153             # NOTE : this regex uses positional capturing groups; it would be more readable with named
154             # captures instead, but this would double the execution time on big Excel files, so I
155             # stick to plain old capturing groups.
156              
157             # does this instance want date formatting ?
158 44         3204 my $has_date_formatter = $self->frontend->date_formatter;
159              
160             # get worksheet XML
161 44         276 my $contents = $self->_zip_member_contents($self->_zip_member_name_for_sheet($sheet));
162              
163             # parse the preamble
164 44         4307 my ($ref) = $contents =~ /$preamble_regex/g; # /g to leave the pos() cursor before the 1st cell
165              
166             # variables for the closure below
167 44         167 my ($row_num, $col_num, @rows) = (0, 0);
168              
169             # dual closure : may be used as an iterator or as a regular sub, depending on $want_iterator. Of course
170             # it would have been simpler to just write an iterator, and call it in a loop if the client wants all rows
171             # at once ... but thousands of additional sub calls would slow down the process. So this more complex implementation
172             # is for the sake of processing speed.
173             my $get_values = sub {
174              
175             # in iterator mode, if we have a row ready, just return it
176 2100066 100 100 2100066   8387600 return shift @rows if $want_iterator and @rows > 1;
177              
178             # otherwise loop on matching nodes
179 856         6035 while ($contents =~ /$row_or_cell_regex/cg) { # /g allows the iterator to remember where the last cell left off
180 5626 100       13578 if ($1) { # this is a 'row' tag
    50          
181 1746         2748 my $prev_row = $row_num;
182 1746   66     4339 $row_num = $2 // $row_num+1; # if present, capture group $2 is the row number
183 1746         2285 $col_num = 0;
184 1746         1972411 push @rows, [] for 1 .. $row_num-$prev_row;
185              
186             # in iterator mode, if we have a closed empty row, just return it
187 1746 100 100     11476 return shift @rows if $want_iterator and @rows > 1;
188             }
189             elsif ($3) { # this is a 'c' tag
190 3880         10766 my ($col_A1, $given_row, $style, $cell_type, $val, $inner) = ($4, $5, $6, $7, $8, $9);
191              
192             # deal with the row number given in the 'r' attribute, if present
193 3880   66     6677 $given_row //= $row_num;
194 3880 50       8034 if ($given_row < $row_num) {die "cell claims to be in row $given_row while current row is $row_num"}
  0 50       0  
195 0         0 elsif ($given_row > $row_num) {push @rows, [] for 1 .. $given_row-$row_num;
196 0         0 $col_num = 0;
197 0         0 $row_num = $given_row;}
198              
199              
200             # deal with the col number given in the 'r' attribute, if present
201 3880 100 66     5415 if ($col_A1) {$col_num = $Excel::ValueReader::XLSX::A1_to_num_memoized{$col_A1}
  3856         7510  
202             //= Excel::ValueReader::XLSX->A1_to_num($col_A1)}
203 24         35 else {$col_num++}
204              
205             # handle the cell value according to cell type
206 3880   100     8420 $cell_type //= '';
207 3880 50       7079 if ($cell_type eq 'inlineStr') {
    100          
208             # this is an inline string; gather all <t> nodes within the cell node
209 0         0 $val = join "", ($inner =~ m[<t>(.+?)</t>]g);
210 0 0       0 _decode_xml_entities($val) if $val;
211             }
212             elsif ($cell_type eq 's') {
213             # this is a string cell; $val is a pointer into the global array of shared strings
214 2433         83353 $val = $self->strings->[$val];
215             }
216             else {
217             # this is a plain value
218 1447 100 100     3521 ($val) = ($inner =~ m[<v>(.*?)</v>]) if !defined $val && $inner;
219 1447 100 100     3826 _decode_xml_entities($val) if $val && $cell_type eq 'str';
220              
221             # if necessary, transform the numeric value into a formatted date
222 1447 100 100     5904 if ($has_date_formatter && $style && looks_like_number($val) && $val >= 0) {
      100        
      66        
223 486         17863 my $date_style = $self->date_styles->[$style];
224 486 100       1313 $val = $self->formatted_date($val, $date_style) if $date_style;
225             }
226             }
227              
228             # insert this value into the last row
229 3880         30641 $rows[-1][$col_num-1] = $val;
230             }
231 0         0 else {die "found a node which is neither a <row> nor a <c> (cell)"}
232             }
233              
234             # end of regex matches. In iterator mode, return a row if we have one
235 48 100       202 return @rows ? shift @rows : undef if $want_iterator;
    100          
236 44         474 };
237              
238             # decide what to return depending on the dual mode
239             my $retval = $want_iterator ? iter($get_values)
240 44 100       255 : do {$get_values->(); \@rows}; # run the closure and return the rows
  22         62  
  22         58  
241              
242 44         1283 return ($ref, $retval);
243             }
244              
245              
246             sub _table_targets {
247 10     10   23 my ($self, $rel_xml) = @_;
248              
249 10         120 my @table_targets = $rel_xml =~ m[<Relationship .*? Target="../tables/table(\d+)\.xml"]g;
250 10         35 return @table_targets; # a list of positive integers corresponding to table ids
251             }
252              
253              
254             sub _parse_table_xml {
255 12     12   35 my ($self, $xml) = @_;
256              
257 12 50 33     215 $xml =~ m[<table (.*?)>]g and my $table_attrs = _xml_attrs($1)
258             or die "invalid table XML: $xml";
259              
260             # extract relevant attributes
261             my %table_info = (
262             name => $table_attrs->{displayName},
263             ref => $table_attrs->{ref},
264             no_headers => exists $table_attrs->{headerRowCount} && !$table_attrs->{headerRowCount},
265             has_totals => $table_attrs->{totalsRowCount},
266 12   66     272 columns => [$xml =~ m{<tableColumn [^>]+? name="([^"]+)"}gx],
267             );
268              
269              
270             # decode entites for all string values
271 12         32 _decode_xml_entities($_) for $table_info{name}, @{$table_info{columns}};
  12         60  
272              
273 12         76 return \%table_info;
274             }
275              
276              
277             #======================================================================
278             # AUXILIARY FUNCTIONS
279             #======================================================================
280              
281              
282             sub _decode_xml_entities {
283 584     584   772 state $xml_entities = { amp => '&',
284             lt => '<',
285             gt => '>',
286             quot => '"',
287             apos => "'",
288             };
289 584         806 state $entity_names = join '|', keys %$xml_entities;
290 584         11428 state $regex_entities = qr/&($entity_names);/;
291              
292             # substitute in-place
293 584         2033 $_[0] =~ s/$regex_entities/$xml_entities->{$1}/eg;
  28         117  
294             }
295              
296              
297             sub _xml_attrs {
298 192     192   375 my $attrs_list = shift;
299 192         3028 my %attr = $attrs_list =~ m[(\w+)="(.+?)"]g;
300 192         1539 return \%attr;
301             }
302              
303              
304              
305              
306             1;
307              
308             __END__
309              
310             =head1 NAME
311              
312             Excel::ValueReader::XLSX::Backend::Regex - using regexes for extracting values from Excel workbooks
313              
314             =head1 DESCRIPTION
315              
316             This is one of two backend modules for L<Excel::ValueReader::XLSX>; the other
317             possible backend is L<Excel::ValueReader::XLSX::Backend::LibXML>.
318              
319             This backend parses OOXML structures using regular expressions.
320              
321             =head1 AUTHOR
322              
323             Laurent Dami, E<lt>dami at cpan.orgE<gt>
324              
325             =head1 COPYRIGHT AND LICENSE
326              
327             Copyright 2020-2023 by Laurent Dami.
328              
329             This library is free software; you can redistribute it and/or modify
330             it under the same terms as Perl itself.
331              
332             =cut