File Coverage

blib/lib/Zoidberg/StringParser.pm

Criterion	Covered	Total	%
statement	126	142	88.7
branch	76	90	84.4
condition	32	41	78.0
subroutine	6	6	100.0
pod	2	2	100.0
total	242	281	86.1

line	stmt	bran	cond	sub	pod	time	code
1							package Zoidberg::StringParser;
2
3							# Hic sunt leones.
4
5							our $VERSION = '0.981';
6
7	20			20		32756	use strict;
	20					28
	20					700
8	20			20		121	no warnings; # can't stand the nagging
	20					24
	20					924
9	20			20		669	use Zoidberg::Utils qw/debug error bug/;
	20					78
	20					129
10
11							our $ERROR_CALLER = 1;
12
13							# TODO :
14							# esc per type ?
15
16							# how bout more general state machine approach,
17							# making QUOTE and NEST operations like CUT, POP and RECURS
18
19							# grammar can be big hash (sort keys on length) .. how to deal with regexes than ?
20							# ... optimise for normal string tokens, regexes are the exception
21							# need seperate hashes for overloading
22
23							# how bout ->for(gram, string, int, sub) ? exec sub on token with most parser vars in scope
24							# %state ?
25
26							sub new {
27	23			23	1	82754	my $class = shift;
28	23		50			1043	my $self = {
			100
			100
29							base_gram => shift \|\| {},
30							collection => shift \|\| {},
31							settings => shift \|\| {},
32							};
33	23					71	bless $self, $class;
34	23					89	return $self;
35							}
36
37							sub split {
38	1551			1551	1	36500	my ($self, $gram, $input, $int) = @_;
39	1551					2305	$int--; # 1 based => 0 based
40
41	1551					3516	$$self{broken} = undef; # reset error
42
43	1551					8375	debug "splitting with $gram";
44	1551	100				4949	unless (ref $gram) {
		50
45	1548	50				6023	error "No such grammar: $gram" unless $$self{collection}{$gram};
46	1548					5514	$gram = [$$self{collection}{$gram}]
47							}
48							elsif (ref($gram) eq 'ARRAY') {
49	0					0	my $error;
50	0	0	0			0	$gram = [ map {
51	0					0	ref($_) ? $_ : ($$self{collection}{$_} \|\| $error++)
52							} @$gram ];
53	0	0				0	error "No such grammar: $_" if $error;
54							}
55	3					7	else { $gram = [$gram] } # hash or regex
56	1551					3843	unshift @$gram, $$self{base_gram};
57
58	1551					2050	my ($expr, $types);
59	1551					4142	($gram, $expr, $types) = $self->_prepare_gram($gram);
60							# use Data::Dumper; print STDERR Dumper $gram, $expr, $types;
61
62	1551					4588	my $string;
63	1551	100				3507	if (ref($input) eq 'ARRAY') { $string = shift @$input }
	10					18
64	1541					4710	else { ($string, $input) = ("$input", []) } # quotes in case of overload
65
66	1551	100	66			6431	return unless length $string or @$input;
67
68	1550					1708	my ($block, @parts, @open, $i, $s_i); # $i counts splitted parts, $s_i the stack size
69
70	2039					16370	PARSE_TOKEN:
71							debug 'splitting string: '.$string;
72
73	2039					4067	my ($token, $type, $sign);
74	2039		66			90811	while ( !$token && $string =~ s{\A(.*?)($expr\z)}{}s ) {
75	2639	100				16779	$block .= $1 if length $1;
76	2639					6011	$sign = $2;
77
78	2639					3924	my $i = 0;
79	2639	100				15792	($_ eq $2) ? last : $i++ for ($3, $4, $5);
80	2639					4588	$type = $$types[$i];
81
82	2639	100	66			11895	last unless length $sign or length $string; # catch the \z
83
84	1312	100				2958	if ($type eq 'd_esc') {
85	9					36	debug "block: ==>$block<== token: ==>$sign<== type: $type";
86	9					14	$block .= $sign;
87	9					74	next;
88							}
89
90							# fetch token
91	1303					1415	my $item;
92	1303					6029	my ($slice) = grep exists($$_{$type}), reverse @$gram;
93	1303	100				4031	if (ref($$slice{$type}[1]) eq 'ARRAY') { # for loop probably faster
94	771	100				6180	($item) = map $$_[1],
95	488					1258	grep {ref($$_[0]) ? ($sign =~ $$_[0]) : ($sign eq $$_[0])}
96	488					860	@{$$slice{$type}[1]}
97							}
98	815					2252	else { $item = $$slice{$type}[1]{$sign} }
99	1303					8159	debug "block: ==>$block<== token: ==>$sign<== type: $type item: $item";
100	1303	100				6979	$item = $sign if $item eq '_SELF';
101
102	1303	100	66			14124	if (exists $$slice{s_esc} and $1 =~ /$$slice{s_esc}$/) {
103	12					83	debug 'escaped token s_esc: '.$$slice{s_esc};
104	12	100	100			84	$block =~ s/$$slice{s_esc}$//
105							if $type eq 'tokens' and ! $$self{settings}{no_esc_rm};
106	12					26	$block .= $sign;
107	12					136	next;
108							}
109
110	1291	100				2809	if ($type eq 'tokens') {
111	891	100				1718	unless ($s_i) {
112	493	50				1046	if (ref $item) { # for $() matching tactics
113	0					0	debug 'push stack (tokens)';
114	0					0	push @$gram, $item;
115	0					0	$s_i++;
116	0					0	($gram, $expr, $types) = $self->_prepare_gram($gram);
117	0					0	@open = ($sign, $type);
118	0					0	$token = $$gram[-1]{token};
119							}
120	493					819	else { $token = $item }
121							}
122							else {
123	398	50				1558	if ($item eq '_POP') {
		0
124	398					816	$block .= $sign;
125	398					1716	debug "pop stack ($item)";
126	398					874	pop @$gram;
127	398					621	$s_i--;
128							}
129							elsif ($item eq '_CUT') { # for $() matching
130	0					0	$token = $item;
131	0					0	debug "cut stack ($item)";
132	0					0	splice @$gram, -$s_i;
133	0					0	$s_i = 0;
134							}
135	0					0	else { bug "what to do with $item !?" }
136	398					1058	($gram, $expr, $types) = $self->_prepare_gram($gram);
137							}
138							}
139							else { # open nest or quote
140	400					545	$block .= $sign;
141	400	50				839	unless (ref $item) {
142	400	100				866	if ($item eq '_REC') { $item = {} } # recurs UGLY
	48					173
143							else { # generate a grammar on the fly
144	352	100				2990	$item = ($type eq 'nests')
145							? {
146							tokens => {$item => '_POP'},
147							nests => {$sign => '_REC'},
148							} : {
149							tokens => {$item => '_POP'},
150							quotes => {$sign => '_REC'},
151							nests => {},
152							} ;
153							}
154							}
155							# else if item is ref => item is grammar
156	400					1742	debug "push stack ($type)";
157	400					1090	push @$gram, $item;
158	400					580	$s_i++;
159	400					1691	($gram, $expr, $types) = $self->_prepare_gram($gram);
160	400					1509	@open = ($sign, $type);
161							}
162	1291	100				41642	last unless length $string;
163							}
164
165	2039	100				4845	if (length $block) {
166	1979					3782	my $part = $block; # force copy
167	1979					4836	push @parts, \$part;
168							}
169	2039	100	100			6595	if ($token and $token ne '_CUT') { push @parts, $token }
	84					150
170	2039					2854	$block = $token = undef;
171
172	2039	100	100			16649	if (($s_i or ++$i != $int) and length($string) \|\| scalar(@$input)) {
		100	100
			66
173	489	100				1136	$string = shift @$input unless length $string;
174	489					2545	goto PARSE_TOKEN;
175							}
176							elsif ($i == $int) {
177	1					4	my $part = join '', $string, @$input;
178	1					3	push @parts, \$part;
179							}
180
181	1550	100				3247	if ($s_i) { # broken
182	2					7	debug 'stack not empty';
183	2					8	$open[1] =~ s/s$// ;
184	2					6	$$self{broken} = "Unmatched $open[1] at end of input: $open[0]";
185	2	100				11	error $$self{broken} unless $$self{settings}{allow_broken};
186	1					7	pop @$gram for 1 .. $s_i;
187							}
188
189	1549	50	100			6360	return grep defined($_), map {ref($_) ? $$_ : $_} @parts
	527	100				8481
190							if $$gram[-1]{was_regexp} && ! $$self{settings}{no_split_intel};
191	1365					11631	return grep defined($_), @parts;
192							}
193
194							sub _prepare_gram { # index immediatly here
195	2349			2349		3412	my ($self, $gram) = @_;
196	2349					4008	my %index;
197	2349					4118	for my $ref (@$gram) { # prepare grammars for usage
198	5424	100				21454	if (ref($ref) eq 'Regexp') {
		50
199	185					1456	$ref = {tokens => [[$ref, '_CUT']], was_regexp => 1};
200							}
201							elsif (ref($ref) ne 'HASH') {
202	0					0	error 'Grammar has wrong data type: '.ref($ref)."\n";
203							}
204
205	5424	100				15353	unless ($$ref{prepared}) {
206	677	100	100			5579	if (exists $$ref{esc}) {
		100
207	21	50				276	$$ref{s_esc} = ref($$ref{esc}) ? $$ref{esc}
208							: quotemeta $$ref{esc}; # single esc regexp
209	21					169	$$ref{d_esc} = '('.($$ref{s_esc}x2).')\|'; # double esc regexp
210							}
211							elsif (! exists $$ref{s_esc} and exists $index{s_esc}) {
212	636					2036	$$ref{s_esc} = $index{s_esc};
213							}
214
215	677					1468	for (qw/tokens nests quotes/) {
216	2031	100				5187	next unless exists $$ref{$_};
217	527	100				2110	my $expr = (ref($$ref{$_}) eq 'ARRAY')
218							? join( '\|', map {
219	237					778	ref($$_[0]) ? $$_[0] : quotemeta($$_[0])
220	853					2958	} @{$$ref{$_}} )
221	1252	100				4463	: join( '\|', map { quotemeta($_) } keys %{$$ref{$_}} ) ;
	1015					3915
222	1252	100				3667	$expr = $expr ? '('.$expr.')\|' : '';
223	1252					4465	$$ref{$_} = [$expr, $$ref{$_}];
224							}
225	677					8067	$$ref{prepared}++;
226							}
227
228	5424					37318	$index{$_} = $$ref{$_}[0] for grep exists($$ref{$_}), qw/tokens nests quotes/;
229	5424					32735	$index{$_} = $$ref{$_} for grep exists($$ref{$_}), qw/s_esc d_esc/;
230							}
231
232	2349					13555	my ($expr, @types) = ('');
233	2349					3920	for (qw/d_esc tokens nests quotes/) {
234	9396	100				22438	next unless length $index{$_};
235	9108					17980	push @types, $_;
236	9108					19403	$expr .= $index{$_};
237							}
238	2349					12524	return $gram, $expr, \@types;
239							}
240
241							1;
242
243							__END__