File Coverage

blib/lib/Business/UDC/Tokenizer.pm

Criterion	Covered	Total	%
statement	66	69	95.6
branch	25	26	96.1
condition			n/a
subroutine	8	8	100.0
pod	0	1	0.0
total	99	104	95.1

line	stmt	bran	sub	pod	time	code
1						package Business::UDC::Tokenizer;
2
3	12		12		118585	use base qw(Exporter);
	12				25
	12				1375
4	12		12		71	use strict;
	12				30
	12				264
5	12		12		36	use warnings;
	12				26
	12				636
6
7	12		12		4820	use Error::Pure qw(err);
	12				84035
	12				220
8	12		12		650	use Readonly;
	12				46
	12				10871
9
10						Readonly::Array our @EXPORT_OK => qw(tokenize);
11
12						our $VERSION = 0.04;
13
14						sub tokenize {
15	66		66	0	218463	my ($input) = @_;
16	66				79	my @tokens;
17
18	66				169	pos($input) = 0;
19
20	66				160	while (pos($input) < length($input)) {
21	180				190	my $start = pos($input);
22
23	180	100			385	if ($input =~ /\G(\s)/gc) {
24	3				11	err "Whitespace is not allowed in UDC string.",
25						'position' => $start,
26						'character' => $1,
27						;
28						}
29
30	177	100			446	if ($input =~ /\G(\d+(?:\.\d+)*)/gc) {
31	66				153	_push_token(\@tokens, 'NUMBER', $1, $start);
32	66				164	next;
33						}
34
35	111	100			254	if ($input =~ /\G(\.\d+(?:\.\d+)*)/gc) {
36	9				17	_push_token(\@tokens, 'AUX_DOT', $1, $start);
37	9				19	next;
38						}
39
40	102	100			167	if ($input =~ /\G(\[)/gc) {
41	4				10	_push_token(\@tokens, 'LBRACK', $1, $start);
42	4				7	next;
43						}
44
45	98	100			157	if ($input =~ /\G(\])/gc) {
46	5				8	_push_token(\@tokens, 'RBRACK', $1, $start);
47	5				10	next;
48						}
49
50	93	100			170	if ($input =~ /\G([:+\/])/gc) {
51	28				56	_push_token(\@tokens, 'OP', $1, $start);
52	28				44	next;
53						}
54
55	65	100			132	if ($input =~ /\G(-\d+(?:\.\d+)*)/gc) {
56	13				27	_push_token(\@tokens, 'FORM', $1, $start);
57	13				24	next;
58						}
59
60	52	100			121	if ($input =~ /\G($[^)]+$)/gc) {
61	22				43	_push_token(\@tokens, 'AUX_GROUP', $1, $start);
62	22				43	next;
63						}
64
65	30	100			65	if ($input =~ /\G("[^"]*")/gc) {
66	2				6	_push_token(\@tokens, 'AUX_TIME', $1, $start);
67	2				5	next;
68						}
69
70	28	100			77	if ($input =~ /\G(=+(?:[A-Za-z]+\|\d+(?:\.\d+)*))/gc) {
71	2				6	_push_token(\@tokens, 'AUX_LANG', $1, $start);
72	2				5	next;
73						}
74
75	26	100			62	if ($input =~ /\G(\p{L}[\p{L}\p{N}._-]*)/gcu) {
76	10				27	_push_token(\@tokens, 'ALPHA_SPEC', $1, $start);
77	10				18	next;
78						}
79
80	16	100			44	if ($input =~ /\G(\'\d+(?:\.\d+)*)/gc) {
81	13				28	_push_token(\@tokens, 'APOS_AUX', $1, $start);
82	13				25	next;
83						}
84
85	3				6	my $bad = substr($input, $start, 20);
86	3				9	err "Unrecognized input near '$bad'.",
87						'position' => $start,
88						;
89						}
90
91	60				163	return \@tokens;
92						}
93
94						sub _check_whitespace {
95	174		174		256	my ($value, $start) = @_;
96
97	174	50			397	if ($value =~ /^(.*?)\s/s) {
98	0				0	my $ws_pos = length($1);
99	0				0	my $char = substr($value, $ws_pos, 1);
100	0				0	err "Whitespace is not allowed in UDC string.",
101						'position' => $start + $ws_pos,
102						'character' => $char,
103						;
104						}
105
106	174				272	return;
107						}
108
109						sub _push_token {
110	174		174		458	my ($tokens_ar, $type, $value, $start) = @_;
111
112	174				319	_check_whitespace($value, $start);
113
114	174				206	push @{$tokens_ar}, {
	174				476
115						type => $type,
116						value => $value,
117						pos => $start,
118						};
119
120	174				222	return;
121						}
122
123
124						1;
125
126						__END__