File Coverage

blib/lib/Excel/ValueReader/XLSX/Backend/Regex.pm

Criterion	Covered	Total	%
statement	105	112	93.7
branch	35	42	83.3
condition	34	41	82.9
subroutine	15	15	100.0
pod			n/a
total	189	210	90.0

line	stmt	bran	cond	sub	time	code
1						package Excel::ValueReader::XLSX::Backend::Regex;
2	2			2	1698	use utf8;
	2				3
	2				17
3	2			2	95	use 5.12.1;
	2				7
4	2			2	10	use Moose;
	2				3
	2				17
5	2			2	15629	use Scalar::Util qw/looks_like_number/;
	2				3
	2				171
6	2			2	12	use Iterator::Simple qw/iter/;
	2				21
	2				4977
7
8						extends 'Excel::ValueReader::XLSX::Backend';
9
10
11						#======================================================================
12						# LAZY ATTRIBUTE CONSTRUCTORS
13						#======================================================================
14
15						sub _strings {
16	8			8	15	my $self = shift;
17	8				16	my @strings;
18
19						# read from the sharedStrings zip member
20	8				25	my $contents = $self->_zip_member_contents('xl/sharedStrings.xml');
21
22						# iterate on <si> nodes
23	8				546	while ($contents =~ m[<si>(.*?)</si>]sg) {
24	530				1064	my $innerXML = $1;
25
26						# concatenate contents from all <t> nodes (usually there is only 1) and decode XML entities
27	530				1819	my $string = join "", ($innerXML =~ m[<t[^>]*>(.+?)</t>]sg);
28	530				1174	_decode_xml_entities($string);
29
30	530				6411	push @strings, $string;
31						}
32
33	8				410	return \@strings;
34						}
35
36
37						sub _workbook_data {
38	15			15	33	my $self = shift;
39
40	15				33	my %workbook_data;
41
42						# read from the workbook.xml zip member
43	15				66	my $workbook = $self->_zip_member_contents('xl/workbook.xml');
44
45						# extract sheet names
46	15				1379	my @sheet_names = ($workbook =~ m[<sheet name="(.+?)"]g);
47	15				79	$workbook_data{sheets} = {map {$sheet_names[$_] => $_+1} 0 .. $#sheet_names};
	61				257
48
49						# does this workbook use the 1904 calendar ?
50	15				118	my ($date1904) = $workbook =~ m[date1904="(.+?)"];
51	15	100	100		134	$workbook_data{base_year} = $date1904 && $date1904 =~ /^(1\|true)$/ ? 1904 : 1900;
52
53						# active sheet
54	15				129	my ($active_tab) = $workbook =~ m[<workbookView[^>]+activeTab="(\d+)"];
55	15	100			63	$workbook_data{active_sheet} = $active_tab + 1 if defined $active_tab;
56
57	15				806	return \%workbook_data;
58						}
59
60
61
62						sub _date_styles {
63	10			10	24	my $self = shift;
64
65	10				79	state $date_style_regex = qr{[dy]\|\bmm\b};
66
67						# read from the styles.xml zip member
68	10				45	my $styles = $self->_zip_member_contents('xl/styles.xml');
69
70						# start with Excel builtin number formats for dates and times
71	10				591	my @numFmt = $self->Excel_builtin_date_formats;
72
73						# add other date formats explicitly specified in this workbook
74	10				130	while ($styles =~ m[<numFmt numFmtId="(\d+)" formatCode="([^"]+)"/>]g) {
75	76				186	my ($id, $code) = ($1, $2);
76	76	100			9824	$numFmt[$id] = $code if $code =~ $date_style_regex;
77						}
78
79						# read all cell formats, just rembember those that involve a date number format
80	10				424	my ($cellXfs) = ($styles =~ m[<cellXfs count="\d+">(.+?)</cellXfs>]);
81	10				51	my @cell_formats = $self->_extract_xf($cellXfs);
82	10				31	my @date_styles = map {$numFmt[$_->{numFmtId}]} @cell_formats;
	180				353
83
84	10				672	return \@date_styles; # array of shape (xf_index => numFmt_code)
85						}
86
87
88						sub _extract_xf {
89	10			10	29	my ($self, $xml) = @_;
90
91	10				23	state $xf_node_regex = qr{
92						<xf # initial format tag
93						\s
94						([^>/]*+) # attributes (captured in $1)
95						(?: # non-capturing group for an alternation :
96						/> # .. either an xml closing without content
97						\| # or
98						> # .. closing for the xf tag
99						.*? # .. then some formatting content
100						</xf> # .. then the ending tag for the xf node
101						)
102						}x;
103
104	10				21	my @xf_nodes;
105	10				142	while ($xml =~ /$xf_node_regex/g) {
106	180				1562	push @xf_nodes, _xml_attrs($1);
107						}
108	10				47	return @xf_nodes;
109						}
110
111
112						#======================================================================
113						# METHODS
114						#======================================================================
115
116						sub _values {
117	44			44	172	my ($self, $sheet, $want_iterator) = @_;
118
119						# regex for the initial preamble
120	44				88	state $preamble_regex = qr(
121						<dimension\s+ref="([A-Z]+\d+(?::[A-Z]+\d+)?)"/> # node specifying the range of defined cells
122						.*?
123						<sheetData> # start container node for actual rows and cells content
124						)xs;
125
126						# regex for extracting information from cell nodes
127	44				66	state $row_or_cell_regex = qr(
128						<(row) # row tag ($1)
129						(?:\s+r="(\d+)")? # optional row number ($2)
130						[^>/]*? # unused attrs
131						> # end of tag
132
133						\| # .. or ..
134
135						<(c) # cell tag ($3)
136						(?: \s+ \| (?=>) ) # either a space before attrs, or end of tag
137						(?:r="([A-Z]+)(\d+)")? # capture col ($4) and row ($5)
138						[^>/]*? # unused attrs
139						(?:s="(\d+)"\s*)? # style attribute ($6)
140						(?:t="(\w+)"\s*)? # type attribute ($7)
141						(?: # non-capturing group for an alternation :
142						/> # .. either an xml closing without content
143						\| # or
144						> # .. closing xml tag, followed by ..
145						(?:
146						<v>(.+?)</v> # .. a value ($8)
147						\| # or
148						(.+?) # .. some node content ($9)
149						)
150						</c> # followed by a closing cell tag
151						)
152						)xs;
153						# NOTE : this regex uses positional capturing groups; it would be more readable with named
154						# captures instead, but this would double the execution time on big Excel files, so I
155						# stick to plain old capturing groups.
156
157						# does this instance want date formatting ?
158	44				3204	my $has_date_formatter = $self->frontend->date_formatter;
159
160						# get worksheet XML
161	44				276	my $contents = $self->_zip_member_contents($self->_zip_member_name_for_sheet($sheet));
162
163						# parse the preamble
164	44				4307	my ($ref) = $contents =~ /$preamble_regex/g; # /g to leave the pos() cursor before the 1st cell
165
166						# variables for the closure below
167	44				167	my ($row_num, $col_num, @rows) = (0, 0);
168
169						# dual closure : may be used as an iterator or as a regular sub, depending on $want_iterator. Of course
170						# it would have been simpler to just write an iterator, and call it in a loop if the client wants all rows
171						# at once ... but thousands of additional sub calls would slow down the process. So this more complex implementation
172						# is for the sake of processing speed.
173						my $get_values = sub {
174
175						# in iterator mode, if we have a row ready, just return it
176	2100066	100	100	2100066	8387600	return shift @rows if $want_iterator and @rows > 1;
177
178						# otherwise loop on matching nodes
179	856				6035	while ($contents =~ /$row_or_cell_regex/cg) { # /g allows the iterator to remember where the last cell left off
180	5626	100			13578	if ($1) { # this is a 'row' tag
		50
181	1746				2748	my $prev_row = $row_num;
182	1746		66		4339	$row_num = $2 // $row_num+1; # if present, capture group $2 is the row number
183	1746				2285	$col_num = 0;
184	1746				1972411	push @rows, [] for 1 .. $row_num-$prev_row;
185
186						# in iterator mode, if we have a closed empty row, just return it
187	1746	100	100		11476	return shift @rows if $want_iterator and @rows > 1;
188						}
189						elsif ($3) { # this is a 'c' tag
190	3880				10766	my ($col_A1, $given_row, $style, $cell_type, $val, $inner) = ($4, $5, $6, $7, $8, $9);
191
192						# deal with the row number given in the 'r' attribute, if present
193	3880		66		6677	$given_row //= $row_num;
194	3880	50			8034	if ($given_row < $row_num) {die "cell claims to be in row $given_row while current row is $row_num"}
	0	50			0
195	0				0	elsif ($given_row > $row_num) {push @rows, [] for 1 .. $given_row-$row_num;
196	0				0	$col_num = 0;
197	0				0	$row_num = $given_row;}
198
199
200						# deal with the col number given in the 'r' attribute, if present
201	3880	100	66		5415	if ($col_A1) {$col_num = $Excel::ValueReader::XLSX::A1_to_num_memoized{$col_A1}
	3856				7510
202						//= Excel::ValueReader::XLSX->A1_to_num($col_A1)}
203	24				35	else {$col_num++}
204
205						# handle the cell value according to cell type
206	3880		100		8420	$cell_type //= '';
207	3880	50			7079	if ($cell_type eq 'inlineStr') {
		100
208						# this is an inline string; gather all <t> nodes within the cell node
209	0				0	$val = join "", ($inner =~ m[<t>(.+?)</t>]g);
210	0	0			0	_decode_xml_entities($val) if $val;
211						}
212						elsif ($cell_type eq 's') {
213						# this is a string cell; $val is a pointer into the global array of shared strings
214	2433				83353	$val = $self->strings->[$val];
215						}
216						else {
217						# this is a plain value
218	1447	100	100		3521	($val) = ($inner =~ m[<v>(.*?)</v>]) if !defined $val && $inner;
219	1447	100	100		3826	_decode_xml_entities($val) if $val && $cell_type eq 'str';
220
221						# if necessary, transform the numeric value into a formatted date
222	1447	100	100		5904	if ($has_date_formatter && $style && looks_like_number($val) && $val >= 0) {
			100
			66
223	486				17863	my $date_style = $self->date_styles->[$style];
224	486	100			1313	$val = $self->formatted_date($val, $date_style) if $date_style;
225						}
226						}
227
228						# insert this value into the last row
229	3880				30641	$rows[-1][$col_num-1] = $val;
230						}
231	0				0	else {die "found a node which is neither a <row> nor a <c> (cell)"}
232						}
233
234						# end of regex matches. In iterator mode, return a row if we have one
235	48	100			202	return @rows ? shift @rows : undef if $want_iterator;
		100
236	44				474	};
237
238						# decide what to return depending on the dual mode
239						my $retval = $want_iterator ? iter($get_values)
240	44	100			255	: do {$get_values->(); \@rows}; # run the closure and return the rows
	22				62
	22				58
241
242	44				1283	return ($ref, $retval);
243						}
244
245
246						sub _table_targets {
247	10			10	23	my ($self, $rel_xml) = @_;
248
249	10				120	my @table_targets = $rel_xml =~ m[<Relationship .*? Target="../tables/table(\d+)\.xml"]g;
250	10				35	return @table_targets; # a list of positive integers corresponding to table ids
251						}
252
253
254						sub _parse_table_xml {
255	12			12	35	my ($self, $xml) = @_;
256
257	12	50	33		215	$xml =~ m[<table (.*?)>]g and my $table_attrs = _xml_attrs($1)
258						or die "invalid table XML: $xml";
259
260						# extract relevant attributes
261						my %table_info = (
262						name => $table_attrs->{displayName},
263						ref => $table_attrs->{ref},
264						no_headers => exists $table_attrs->{headerRowCount} && !$table_attrs->{headerRowCount},
265						has_totals => $table_attrs->{totalsRowCount},
266	12		66		272	columns => [$xml =~ m{<tableColumn [^>]+? name="([^"]+)"}gx],
267						);
268
269
270						# decode entites for all string values
271	12				32	_decode_xml_entities($_) for $table_info{name}, @{$table_info{columns}};
	12				60
272
273	12				76	return \%table_info;
274						}
275
276
277						#======================================================================
278						# AUXILIARY FUNCTIONS
279						#======================================================================
280
281
282						sub _decode_xml_entities {
283	584			584	772	state $xml_entities = { amp => '&',
284						lt => '<',
285						gt => '>',
286						quot => '"',
287						apos => "'",
288						};
289	584				806	state $entity_names = join '\|', keys %$xml_entities;
290	584				11428	state $regex_entities = qr/&($entity_names);/;
291
292						# substitute in-place
293	584				2033	$_[0] =~ s/$regex_entities/$xml_entities->{$1}/eg;
	28				117
294						}
295
296
297						sub _xml_attrs {
298	192			192	375	my $attrs_list = shift;
299	192				3028	my %attr = $attrs_list =~ m[(\w+)="(.+?)"]g;
300	192				1539	return \%attr;
301						}
302
303
304
305
306						1;
307
308						__END__
309
310						=head1 NAME
311
312						Excel::ValueReader::XLSX::Backend::Regex - using regexes for extracting values from Excel workbooks
313
314						=head1 DESCRIPTION
315
316						This is one of two backend modules for L<Excel::ValueReader::XLSX>; the other
317						possible backend is L<Excel::ValueReader::XLSX::Backend::LibXML>.
318
319						This backend parses OOXML structures using regular expressions.
320
321						=head1 AUTHOR
322
323						Laurent Dami, E<lt>dami at cpan.orgE<gt>
324
325						=head1 COPYRIGHT AND LICENSE
326
327						Copyright 2020-2023 by Laurent Dami.
328
329						This library is free software; you can redistribute it and/or modify
330						it under the same terms as Perl itself.
331
332						=cut