File Coverage

blib/lib/Excel/ValueReader/XLSX/Backend/Regex.pm

Criterion	Covered	Total	%
statement	86	88	97.7
branch	16	20	80.0
condition	21	22	95.4
subroutine	13	13	100.0
pod	1	1	100.0
total	137	144	95.1

line	stmt	bran	cond	sub	pod	time	code
1							package Excel::ValueReader::XLSX::Backend::Regex;
2	1			1		692	use utf8;
	1					3
	1					6
3	1			1		47	use 5.10.1;
	1					3
4	1			1		53	use Moose;
	1					1
	1					8
5	1			1		6508	use Scalar::Util qw/looks_like_number/;
	1					2
	1					64
6	1			1		6	use Carp qw/croak/;
	1					2
	1					1594
7
8							extends 'Excel::ValueReader::XLSX::Backend';
9
10							our $VERSION = '1.09';
11
12							#======================================================================
13							# LAZY ATTRIBUTE CONSTRUCTORS
14							#======================================================================
15
16							sub _strings {
17	3			3		7	my $self = shift;
18	3					5	my @strings;
19
20							# read from the sharedStrings zip member
21	3					10	my $contents = $self->_zip_member_contents('xl/sharedStrings.xml');
22
23							# iterate on <si> nodes
24	3					30	while ($contents =~ m[<si>(.*?)</si>]sg) {
25	250					438	my $innerXML = $1;
26
27							# concatenate contents from all <t> nodes (usually there is only 1) and decode XML entities
28	250					947	my $string = join "", ($innerXML =~ m[<t[^>]*>(.+?)</t>]sg);
29	250					596	_decode_xml_entities($string);
30
31	250					892	push @strings, $string;
32							}
33
34	3					101	return \@strings;
35							}
36
37
38							sub _workbook_data {
39	6			6		14	my $self = shift;
40
41							# read from the workbook.xml zip member
42	6					22	my $workbook = $self->_zip_member_contents('xl/workbook.xml');
43
44							# extract sheet names
45	6					239	my @sheet_names = ($workbook =~ m[<sheet name="(.+?)"]g);
46	6					26	my %sheets = map {$sheet_names[$_] => $_+1} 0 .. $#sheet_names;
	23					64
47
48							# does this workbook use the 1904 calendar ?
49	6					38	my ($date1904) = $workbook =~ m[date1904="(.+?)"];
50	6	100	100			36	my $base_year = $date1904 && $date1904 =~ /^(1\|true)$/ ? 1904 : 1900;
51
52	6					238	return {sheets => \%sheets, base_year => $base_year};
53							}
54
55
56
57							sub _date_styles {
58	5			5		10	my $self = shift;
59
60	5					12	state $date_style_regex = qr{[dy]\|\bmm\b};
61
62							# read from the styles.xml zip member
63	5					17	my $styles = $self->_zip_member_contents('xl/styles.xml');
64
65							# start with Excel builtin number formats for dates and times
66	5					32	my @numFmt = $self->Excel_builtin_date_formats;
67
68							# add other date formats explicitly specified in this workbook
69	5					46	while ($styles =~ m[<numFmt numFmtId="(\d+)" formatCode="([^"]+)"/>]g) {
70	38					90	my ($id, $code) = ($1, $2);
71	38	100				235	$numFmt[$id] = $code if $code =~ $date_style_regex;
72							}
73
74							# read all cell formats, just rembember those that involve a date number format
75	5					94	my ($cellXfs) = ($styles =~ m[<cellXfs count="\d+">(.+?)</cellXfs>]);
76	5					19	my @cell_formats = $self->_extract_xf($cellXfs);
77	5					13	my @date_styles = map {$numFmt[$_->{numFmtId}]} @cell_formats;
	90					139
78
79	5					216	return \@date_styles; # array of shape (xf_index => numFmt_code)
80							}
81
82
83							sub _extract_xf {
84	5			5		16	my ($self, $xml) = @_;
85
86	5					14	state $xf_node_regex = qr{
87							<xf # initial format tag
88							\s
89							([^>/]*+) # attributes (captured in $1)
90							(?: # non-capturing group for an alternation :
91							/> # .. either an xml closing without content
92							\| # or
93							> # .. closing for the xf tag
94							.*? # .. then some formatting content
95							</xf> # .. then the ending tag for the xf node
96							)
97							}x;
98
99	5					9	my @xf_nodes;
100	5					43	while ($xml =~ /$xf_node_regex/g) {
101	90					262	my $all_attrs = $1;
102	90					108	my %attr;
103	90					262	while ($all_attrs =~ m[(\w+)="(.+?)"]g) {
104	608					2298	$attr{$1} = $2;
105							}
106	90					673	push @xf_nodes, \%attr;
107							}
108	5					19	return @xf_nodes;
109							}
110
111
112							#======================================================================
113							# METHODS
114							#======================================================================
115
116							sub values {
117	15			15	1	243	my ($self, $sheet) = @_;
118	15					32	my @data;
119	15					35	my ($row, $col, $cell_type, $seen_node);
120
121							# regex for extracting information from cell nodes
122	15					30	state $cell_regex = qr(
123							<c\ # initial cell tag
124							r="([A-Z]+)(\d+)" # capture col and row ($1 and $2)
125							[^>/]*? # unused attrs
126							(?:s="(\d+)"\s*)? # style attribute ($3)
127							(?:t="(\w+)"\s*)? # type attribute ($4)
128							(?: # non-capturing group for an alternation :
129							/> # .. either an xml closing without content
130							\| # or
131							> # .. closing xml tag, followed by
132							(?:
133
134							<v>(.+?)</v> # .. a value ($5)
135							\| # or
136							(.+?) # .. some node content ($6)
137							)
138							</c> # followed by a closing cell tag
139							)
140							)x;
141							# NOTE : this regex uses capturing groups; it would be more readable with named
142							# captures instead, but this doubles the execution time on big Excel files, so I
143							# came back to plain old capturing groups.
144
145							# does this instance want date formatting ?
146	15					700	my $has_date_formatter = $self->frontend->date_formatter;
147
148							# parse worksheet XML, gathering all cells
149	15					74	my $contents = $self->_zip_member_contents($self->_zip_member_name_for_sheet($sheet));
150	15					232	while ($contents =~ /$cell_regex/g) {
151	1192					6058	my ($col, $row, $style, $cell_type, $val, $inner) = ($self->A1_to_num($1), $2, $3, $4, $5, $6);
152
153							# handle cell value according to cell type
154	1192		100			7690	$cell_type //= '';
155	1192	50				2979	if ($cell_type eq 'inlineStr') {
		100
156							# this is an inline string; gather all <t> nodes within the cell node
157	0					0	$val = join "", ($inner =~ m[<t>(.+?)</t>]g);
158	0	0				0	_decode_xml_entities($val) if $val;
159							}
160							elsif ($cell_type eq 's') {
161							# this is a string cell; $val is a pointer into the global array of shared strings
162	644					20592	$val = $self->strings->[$val];
163							}
164							else {
165							# this is a plain value
166	548	100	100			1626	($val) = ($inner =~ m[<v>(.*?)</v>]) if !defined $val && $inner;
167	548	100	100			1937	_decode_xml_entities($val) if $val && $cell_type eq 'str';
168
169							# if necessary, transform the numeric value into a formatted date
170	548	100	100			2929	if ($has_date_formatter && $style && looks_like_number($val) && $val >= 0) {
			100
			66
171	243					10068	my $date_style = $self->date_styles->[$style];
172	243	100				722	$val = $self->formatted_date($val, $date_style) if $date_style;
173							}
174							}
175
176							# insert this value into the global data array
177	1192					13657	$data[$row-1][$col-1] = $val;
178							}
179
180							# insert arrayrefs for empty rows
181	15		100			184	$_ //= [] foreach @data;
182
183	15					126	return \@data;
184							}
185
186
187							sub _table_targets {
188	5			5		10	my ($self, $rel_xml) = @_;
189
190	5					39	my @table_targets = $rel_xml =~ m[<Relationship .*? Target="../tables/table(\d+)\.xml"]g;
191	5					60	return @table_targets; # a list of positive integers corresponding to table ids
192							}
193
194
195							sub _parse_table_xml {
196	5			5		12	my ($self, $xml) = @_;
197
198	5					13	state $table_regex = qr{
199							<table .+? displayName="(\w+)"
200							.+? ref="([:A-Z0-9]+)"
201							.+? (headerRowCount="0")?
202							.+?>
203							}x;
204
205							# extract relevant attributes from the <table> node
206	5	50				64	my ($name, $ref, $no_headers) = $xml =~ /$table_regex/g
207							or croak "invalid table XML";
208
209							# column names. Other attributes from <tableColumn> nodes are ignored.
210	5					44	my @columns = ($xml =~ m{<tableColumn [^>]+? name="([^"]+)"}gx);
211
212							# decode entites for all string values
213	5					19	_decode_xml_entities($_) for $name, @columns;
214
215	5					21	return ($name, $ref, \@columns, $no_headers);
216							}
217
218
219							#======================================================================
220							# AUXILIARY FUNCTIONS
221							#======================================================================
222
223
224							sub _decode_xml_entities {
225	273			273		318	state $xml_entities = { amp => '&',
226							lt => '<',
227							gt => '>',
228							quot => '"',
229							apos => "'",
230							};
231	273					294	state $entity_names = join '\|', keys %$xml_entities;
232	273					371	state $regex_entities = qr/&($entity_names);/;
233
234							# substitute in-place
235	273					735	$_[0] =~ s/$regex_entities/$xml_entities->{$1}/eg;
	14					45
236							}
237
238
239							1;
240
241							__END__
242
243							=head1 NAME
244
245							Excel::ValueReader::XLSX::Backend::Regex - using regexes for extracting values from Excel workbooks
246
247							=head1 DESCRIPTION
248
249							This is one of two backend modules for L<Excel::ValueReader::XLSX>; the other
250							possible backend is L<Excel::ValueReader::XLSX::Backend::LibXML>.
251
252							This backend parses OOXML structures using regular expressions.
253
254							=head1 AUTHOR
255
256							Laurent Dami, E<lt>dami at cpan.orgE<gt>
257
258							=head1 COPYRIGHT AND LICENSE
259
260							Copyright 2020,2021 by Laurent Dami.
261
262							This library is free software; you can redistribute it and/or modify
263							it under the same terms as Perl itself.