File Coverage

blib/lib/RDF/Trine/Parser/Turtle/Lexer.pm

Criterion	Covered	Total	%
statement	310	363	85.4
branch	178	236	75.4
condition	16	21	76.1
subroutine	29	29	100.0
pod	5	5	100.0
total	538	654	82.2

line	stmt	bran	cond	sub	pod	time	code
1							# RDF::Trine::Parser::Turtle::Lexer
2							# -----------------------------------------------------------------------------
3
4							=head1 NAME
5
6							RDF::Trine::Parser::Turtle::Lexer - Tokenizer for parsing Turtle, TriG, and N-Triples
7
8							=head1 VERSION
9
10							This document describes RDF::Trine::Parser::Turtle::Lexer version 1.017
11
12							=head1 SYNOPSIS
13
14							use RDF::Trine::Parser::Lexer;
15							my $l = RDF::Trine::Parser::Lexer->new( file => $fh );
16							while (my $t = $l->get_token) {
17							...
18							}
19
20							=head1 METHODS
21
22							=over 4
23
24							=cut
25
26							package RDF::Trine::Parser::Turtle::Lexer;
27
28	68			68		431	use RDF::Trine::Parser::Turtle::Constants;
	68					163
	68					6578
29	68			68		998	use 5.010;
	68					236
30	68			68		335	use strict;
	68					157
	68					1223
31	68			68		325	use warnings;
	68					173
	68					13401
32	68			68		32665	use Moose;
	68					28488346
	68					569
33	68			68		467668	use Data::Dumper;
	68					398
	68					4063
34	68			68		451	use RDF::Trine::Error;
	68					411
	68					673
35
36							our $VERSION;
37							BEGIN {
38	68			68		117789	$VERSION = '1.017';
39							}
40
41							my $r_nameChar_extra = qr'[-0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]'o;
42							my $r_nameStartChar_minus_underscore = qr'[A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{00010000}-\x{000EFFFF}]'o;
43							my $r_nameStartChar = qr/[A-Za-z_\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/;
44							my $r_nameChar = qr/${r_nameStartChar}\|[-0-9\x{b7}\x{0300}-\x{036f}\x{203F}-\x{2040}]/;
45							my $r_prefixName = qr/(?:(?!_)${r_nameStartChar})(?:$r_nameChar)*/;
46							my $r_nameChar_test = qr"(?:$r_nameStartChar\|$r_nameChar_extra)";
47							my $r_double = qr'[+-]?([0-9]+\.[0-9]*[eE][+-]?[0-9]+\|\.[0-9]+[eE][+-]?[0-9]+\|[0-9]+[eE][+-]?[0-9]+)';
48							my $r_decimal = qr'[+-]?(([0-9]+\.[0-9]+)\|\.([0-9])+)';
49							my $r_integer = qr'[+-]?[0-9]+';
50							my $r_PN_CHARS_U = qr/[_A-Za-z_\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/;
51							my $r_PN_CHARS = qr"${r_PN_CHARS_U}\|[-0-9\x{00B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]";
52							my $r_bnode_id = qr"(?:${r_PN_CHARS_U}\|[0-9])((${r_PN_CHARS}\|[.])*${r_PN_CHARS})?";
53
54							my $r_PN_CHARS_BASE = qr/([A-Z]\|[a-z]\|[\x{00C0}-\x{00D6}]\|[\x{00D8}-\x{00F6}]\|[\x{00F8}-\x{02FF}]\|[\x{0370}-\x{037D}]\|[\x{037F}-\x{1FFF}]\|[\x{200C}-\x{200D}]\|[\x{2070}-\x{218F}]\|[\x{2C00}-\x{2FEF}]\|[\x{3001}-\x{D7FF}]\|[\x{F900}-\x{FDCF}]\|[\x{FDF0}-\x{FFFD}]\|[\x{10000}-\x{EFFFF}])/;
55							# my $r_PN_CHARS_U = qr/([_]\|${r_PN_CHARS_BASE})/;
56							# my $r_PN_CHARS = qr/${r_PN_CHARS_U}\|-\|[0-9]\|\x{00B7}\|[\x{0300}-\x{036F}]\|[\x{203F}-\x{2040}]/;
57							my $r_PN_PREFIX = qr/(${r_PN_CHARS_BASE}((${r_PN_CHARS}\|[.])*${r_PN_CHARS})?)/;
58							my $r_PN_LOCAL_ESCAPED = qr{(\\([-~.!&'()*+,;=/?#@%_\$]))\|%[0-9A-Fa-f]{2}};
59							my $r_PN_LOCAL = qr/((${r_PN_CHARS_U}\|[:0-9]\|${r_PN_LOCAL_ESCAPED})((${r_PN_CHARS}\|${r_PN_LOCAL_ESCAPED}\|[:.])*(${r_PN_CHARS}\|[:]\|${r_PN_LOCAL_ESCAPED}))?)/;
60							my $r_PN_LOCAL_BNODE = qr/((${r_PN_CHARS_U}\|[0-9])((${r_PN_CHARS}\|[.])*${r_PN_CHARS})?)/;
61							my $r_PNAME_NS = qr/((${r_PN_PREFIX})?:)/;
62							my $r_PNAME_LN = qr/(${r_PNAME_NS}${r_PN_LOCAL})/;
63
64							has file => (
65							is => 'ro',
66							isa => 'FileHandle',
67							required => 1,
68							);
69
70							has linebuffer => (
71							is => 'rw',
72							isa => 'Str',
73							default => '',
74							);
75
76							has line => (
77							is => 'rw',
78							isa => 'Int',
79							default => 1,
80							);
81
82							has column => (
83							is => 'rw',
84							isa => 'Int',
85							default => 1,
86							);
87
88							has buffer => (
89							is => 'rw',
90							isa => 'Str',
91							default => '',
92							);
93
94							has start_column => (
95							is => 'rw',
96							isa => 'Int',
97							default => -1,
98							);
99
100							has start_line => (
101							is => 'rw',
102							isa => 'Int',
103							default => -1,
104							);
105
106							sub BUILDARGS {
107	412			412	1	996	my $class = shift;
108	412	50				1449	if (scalar(@_) == 1) {
109	412					11982	return { file => shift };
110							} else {
111	0					0	return $class->SUPER::BUILDARGS(@_);
112							}
113							}
114
115							=item C<< new_token ( $type, @values ) >>
116
117							Returns a new token with the given type and optional values, capturing the
118							current line and column of the input data.
119
120							=cut
121
122							sub new_token {
123	9482			9482	1	15150	my $self = shift;
124	9482					17858	my $type = shift;
125	9482					271432	my $start_line = $self->start_line;
126	9482					246419	my $start_col = $self->start_column;
127	9482					234351	my $line = $self->line;
128	9482					237480	my $col = $self->column;
129	9482					46280	return RDF::Trine::Parser::Turtle::Token->fast_constructor(
130							$type,
131							$start_line,
132							$start_col,
133							$line,
134							$col,
135							\@_,
136							);
137							}
138
139							my %CHAR_TOKEN = (
140							'.' => DOT,
141							';' => SEMICOLON,
142							'[' => LBRACKET,
143							']' => RBRACKET,
144							'(' => LPAREN,
145							')' => RPAREN,
146							'{' => LBRACE,
147							'}' => RBRACE,
148							',' => COMMA,
149							'=' => EQUALS,
150							);
151
152							my %METHOD_TOKEN = (
153							# q[#] => '_get_comment',
154							q[@] => '_get_keyword',
155							q[<] => '_get_iriref',
156							q[_] => '_get_bnode',
157							q['] => '_get_single_literal',
158							q["] => '_get_double_literal',
159							q[:] => '_get_pname',
160							(map {$_ => '_get_number'} (0 .. 9, '-', '+'))
161							);
162
163							=item C<< get_token >>
164
165							Returns the next token present in the input.
166
167							=cut
168
169							sub get_token {
170	9620			9620	1	15911	my $self = shift;
171	9620					15713	while (1) {
172	18827	100				51485	unless (length($self->{buffer})) {
173	516					1460	$self->fill_buffer;
174							}
175							# warn "getting token with buffer: " . Dumper($self->{buffer});
176	18827					40562	my $c = $self->_peek_char();
177	18827	100	66			80924	return unless (defined($c) and length($c));
178
179	18526					533803	$self->start_column( $self->column );
180	18526					479620	$self->start_line( $self->line );
181
182	18526	100	100			59306	if ($c eq '.' and $self->{buffer} =~ $r_decimal) {
183	1					5	return $self->_get_number();
184							}
185
186	18525	100				88016	if (defined(my $name = $CHAR_TOKEN{$c})) { $self->_get_char; return $self->new_token($name); }
	2953	100				9198
	2953	100				8572
		100
		100
		100
187	3672					14566	elsif (defined(my $method = $METHOD_TOKEN{$c})) { return $self->$method() }
188							elsif ($c eq '#') {
189							# we're ignoring comment tokens, but we could return them here instead of falling through to the 'next':
190	196					864	$self->_get_comment();
191	196					5770	next;
192							}
193							elsif ($c =~ /[ \r\n\t]/) {
194	9011		66			59466	while (defined($c) and length($c) and $c =~ /[\t\r\n ]/) {
			100
195	18407					49848	$self->_get_char;
196	18407					35849	$c = $self->_peek_char;
197							}
198
199							# we're ignoring whitespace tokens, but we could return them here instead of falling through to the 'next':
200							# return $self->new_token(WS);
201	9011					18664	next;
202							}
203							elsif ($c =~ /[A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/) {
204	2649	100				16007	if ($self->{buffer} =~ /^a(?!:)\s/) {
		100
		100
		100
205	60					224	$self->_get_char;
206	60					197	return $self->new_token(A);
207							} elsif ($self->{buffer} =~ /^(?:true\|false)(?!:)\b/) {
208	10					49	my $bool = $self->_read_length($+[0]);
209	10					41	return $self->new_token(BOOLEAN, $bool);
210							} elsif ($self->{buffer} =~ /^BASE(?!:)\b/i) {
211	4					17	$self->_read_length(4);
212	4					16	return $self->new_token(SPARQLBASE);
213							} elsif ($self->{buffer} =~ /^PREFIX(?!:)\b/i) {
214	3					14	$self->_read_length(6);
215	3					14	return $self->new_token(SPARQLPREFIX);
216							} else {
217	2572					7643	return $self->_get_pname;
218							}
219							}
220	38					211	elsif ($c eq '^') { $self->_read_word('^^'); return $self->new_token(HATHAT); }
	37					149
221							else {
222							# Carp::cluck sprintf("Unexpected byte '$c' (0x%02x)", ord($c));
223	6					61	return $self->_throw_error(sprintf("Unexpected byte '%s' (0x%02x)", $c, ord($c)));
224							}
225	0					0	warn 'byte: ' . Dumper($c);
226							}
227							}
228
229							=begin private
230
231							=cut
232
233							=item C<< fill_buffer >>
234
235							Fills the internal parse buffer with a new line from the input source.
236
237							=cut
238
239							sub fill_buffer {
240	4590			4590	1	8094	my $self = shift;
241	4590	50				129159	unless (length($self->buffer)) {
242	4590					117765	my $line = $self->file->getline;
243	4590	100				121110	if (defined($line)) {
244	3698					10706	$self->{buffer} .= $line;
245							}
246							}
247							}
248
249							=item C<< check_for_bom >>
250
251							Checks the input buffer for a Unicode BOM, and consumes it if it is present.
252
253							=cut
254
255							sub check_for_bom {
256	387			387	1	851	my $self = shift;
257	387					1453	my $c = $self->_peek_char();
258	387	100	100			2853	if (defined($c) and $c eq "\x{FEFF}") {
259	1					4	$self->_get_char;
260							}
261							}
262
263							sub _get_char_safe {
264	5888			5888		9511	my $self = shift;
265	5888					9554	my $char = shift;
266	5888					12265	my $c = $self->_get_char;
267	5888	50				15376	if ($c ne $char) {
268	0					0	$self->_throw_error("Expected '$char' but got '$c'");
269							}
270	5888					9847	return $c;
271							}
272
273							sub _get_char_fill_buffer {
274	26			26		46	my $self = shift;
275	26	50				86	if (length($self->{buffer}) == 0) {
276	0					0	$self->fill_buffer;
277	0	0				0	if (length($self->{buffer}) == 0) {
278	0					0	return;
279							}
280							}
281	26					78	my $c = substr($self->{buffer}, 0, 1, '');
282	26	50				105	if ($c eq "\n") {
283							# $self->{linebuffer} = '';
284	0					0	$self->{line} = 1+$self->{line};
285	0					0	$self->{column} = 1;
286							} else {
287							# $self->{linebuffer} .= $c;
288	26					53	$self->{column} = 1+$self->{column};
289							}
290	26					66	return $c;
291							}
292
293							sub _get_char {
294	33052			33052		52690	my $self = shift;
295	33052					89344	my $c = substr($self->{buffer}, 0, 1, '');
296	33052	100				70087	if ($c eq "\n") {
297							# $self->{linebuffer} = '';
298	3555					7189	$self->{line} = 1+$self->{line};
299	3555					6056	$self->{column} = 1;
300							} else {
301							# $self->{linebuffer} .= $c;
302	29497					51083	$self->{column} = 1+$self->{column};
303							}
304	33052					59134	return $c;
305							}
306
307							sub _peek_char {
308	47760			47760		72028	my $self = shift;
309	47760	100				115843	if (length($self->{buffer}) == 0) {
310	4047					11983	$self->fill_buffer;
311	4047	100				14334	if (length($self->{buffer}) == 0) {
312	587					1785	return;
313							}
314							}
315	47173					88969	my $c = substr($self->{buffer}, 0, 1);
316	47173					191577	return $c;
317							}
318
319							sub _read_word {
320	507			507		987	my $self = shift;
321	507					1053	my $word = shift;
322	507					1987	while (length($self->{buffer}) < length($word)) {
323	0					0	$self->fill_buffer;
324							}
325
326	507	100				1956	if (substr($self->{buffer}, 0, length($word)) ne $word) {
327	1					7	$self->_throw_error("Expected '$word'");
328							}
329
330	506					1427	my $lines = ($word =~ tr/\n//);
331	506					1312	my $lastnl = rindex($word, "\n");
332	506					1217	my $cols = length($word) - $lastnl - 1;
333	506					1401	$self->{lines} += $lines;
334	506	50				1531	if ($lines) {
335	0					0	$self->{column} = $cols;
336							} else {
337	506					1082	$self->{column} += $cols;
338							}
339	506					1741	substr($self->{buffer}, 0, length($word), '');
340							}
341
342							sub _read_length {
343	6029			6029		12272	my $self = shift;
344	6029					13372	my $len = shift;
345	6029					18350	while (length($self->{buffer}) < $len) {
346	0					0	$self->fill_buffer;
347							}
348
349	6029					20514	my $word = substr($self->{buffer}, 0, $len, '');
350	6029					13247	my $lines = ($word =~ tr/\n//);
351	6029					12650	my $lastnl = rindex($word, "\n");
352	6029					11610	my $cols = length($word) - $lastnl - 1;
353	6029					10632	$self->{lines} += $lines;
354	6029	100				12666	if ($lines) {
355	31					60	$self->{column} = $cols;
356							} else {
357	5998					9780	$self->{column} += $cols;
358							}
359	6029					17602	return $word;
360							}
361
362							sub _get_pname {
363	3119			3119		5337	my $self = shift;
364	3119					5443	my $prefix = '';
365
366	3119	100				41678	if ($self->{buffer} =~ /^$r_PNAME_LN/) {
367	2839					11895	my $ln = $self->_read_length($+[0]);
368	2839					15418	my ($ns,$local) = ($ln =~ /^([^:]:)(.)$/);
369	68			68		726	no warnings 'uninitialized';
	68					210
	68					151132
370	2839					7154	$local =~ s{\\([-~.!&'()*+,;=:/?#@%_\$])}{$1}g;
371	2839					7548	return $self->new_token(PREFIXNAME, $ns, $local);
372							} else {
373	280					2635	$self->{buffer} =~ $r_PNAME_NS;
374	280					1407	my $ns = $self->_read_length($+[0]);
375	280					1009	return $self->new_token(PREFIXNAME, $ns);
376							}
377							}
378
379							sub _get_iriref {
380	1876			1876		3785	my $self = shift;
381	1876					5864	$self->_get_char_safe(q[<]);
382	1876					3882	my $iri = '';
383	1876					3830	while (1) {
384	3755					8170	my $c = $self->_peek_char;
385	3755	50				9351	last unless defined($c);
386	3755	100				19068	if (substr($self->{buffer}, 0, 1) eq '\\') {
		100
		100
387	11					39	$self->_get_char_safe('\\');
388	11					32	my $esc = $self->_get_char;
389	11	50				72	if ($esc eq '\\') {
		100
		100
390	0					0	$iri .= "\\";
391							} elsif ($esc eq 'U') {
392	3					12	my $codepoint = $self->_read_length(8);
393	3	100				20	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
394	1					8	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
395							}
396	2					11	$iri .= chr(hex($codepoint));
397							} elsif ($esc eq 'u') {
398	6					20	my $codepoint = $self->_read_length(4);
399	6	100				46	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
400	1					6	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
401							}
402	5					29	my $char = chr(hex($codepoint));
403	5	100				26	if ($char =~ /[<>" {}\|\\^`]/) {
404	3					41	$self->_throw_error(sprintf("Bad IRI character: '%s' (0x%x)", $char, ord($char)));
405							}
406	2					6	$iri .= $char;
407							} else {
408	2					13	$self->_throw_error("Unrecognized iri escape '$esc'");
409							}
410							} elsif ($self->{buffer} =~ /^[^<>\x00-\x20\\"{}\|^`]+/) {
411	1875					7242	$iri .= $self->_read_length($+[0]);
412							} elsif (substr($self->{buffer}, 0, 1) eq '>') {
413	1865					4075	last;
414							} else {
415	4					25	$self->_throw_error("Got '$c' while expecting IRI character");
416							}
417							}
418	1865					5566	$self->_get_char_safe(q[>]);
419	1865					4976	return $self->new_token(IRI, $iri);
420							}
421
422							sub _get_bnode {
423	83			83		175	my $self = shift;
424	83					306	$self->_read_word('_:');
425	83	50				1260	unless ($self->{buffer} =~ /^${r_bnode_id}/o) {
426	0					0	$self->_throw_error("Expected: name");
427							}
428	83					378	my $name = substr($self->{buffer}, 0, $+[0]);
429	83					284	$self->_read_word($name);
430	83					257	return $self->new_token(BNODE, $name);
431							}
432
433							sub _get_number {
434	81			81		193	my $self = shift;
435	81	100				1622	if ($self->{buffer} =~ /^${r_double}/) {
		100
		100
436	9					51	return $self->new_token(DOUBLE, $self->_read_length($+[0]));
437							} elsif ($self->{buffer} =~ /^${r_decimal}/) {
438	11					57	return $self->new_token(DECIMAL, $self->_read_length($+[0]));
439							} elsif ($self->{buffer} =~ /^${r_integer}/) {
440	59					274	return $self->new_token(INTEGER, $self->_read_length($+[0]));
441							} else {
442	2					12	$self->_throw_error("Expected number");
443							}
444							}
445
446							sub _get_comment {
447	196			196		414	my $self = shift;
448	196					732	$self->_get_char_safe('#');
449	196					498	my $comment = '';
450	196					529	my $c = $self->_peek_char;
451	196		66			1331	while (length($c) and $c !~ /[\r\n]/) {
452	5287					10860	$comment .= $self->_get_char;
453	5287					10815	$c = $self->_peek_char;
454							}
455	196	50	33			1239	if (length($c) and $c =~ /[\r\n]/) {
456	196					507	$self->_get_char;
457							}
458	196					669	return $self->new_token(COMMENT, $comment);
459							}
460
461							sub _get_double_literal {
462	765			765		1640	my $self = shift;
463	765					1849	my $c = $self->_peek_char();
464	765					2506	$self->_get_char_safe(q["]);
465	765	100				2817	if (substr($self->{buffer}, 0, 2) eq q[""]) {
466							# #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
467	29					136	$self->_read_word(q[""]);
468
469	29					66	my $quote_count = 0;
470	29					69	my $string = '';
471	29					67	while (1) {
472	184	100				550	if (length($self->{buffer}) == 0) {
473	23					70	$self->fill_buffer;
474	23	100				94	if (length($self->{buffer}) == 0) {
475	3					15	$self->_throw_error("Found EOF in string literal");
476							}
477							}
478	181	100				467	if (substr($self->{buffer}, 0, 1) eq '"') {
479	88					210	my $c = $self->_get_char;
480	88					152	$quote_count++;
481	88	100				249	if ($quote_count == 3) {
482	26					71	last;
483							}
484							} else {
485	93	100				235	if ($quote_count) {
486	8					39	$string .= '"' foreach (1..$quote_count);
487	8					21	$quote_count = 0;
488							}
489	93	100				254	if (substr($self->{buffer}, 0, 1) eq '\\') {
490	26					66	my $c = $self->_get_char;
491							# $self->_get_char_safe('\\');
492	26					77	my $esc = $self->_get_char_fill_buffer;
493	26	100				164	if ($esc eq '\\'){ $string .= "\\" }
	1	100				2
		50
		100
		100
		100
		50
		50
		50
		100
		50
494	5					19	elsif ($esc eq '"'){ $string .= '"' }
495	0					0	elsif ($esc eq "'"){ $string .= "'" }
496	4					11	elsif ($esc eq 'r'){ $string .= "\r" }
497	4					13	elsif ($esc eq 't'){ $string .= "\t" }
498	4					11	elsif ($esc eq 'n'){ $string .= "\n" }
499	0					0	elsif ($esc eq 'b'){ $string .= "\b" }
500	0					0	elsif ($esc eq 'f'){ $string .= "\f" }
501	0					0	elsif ($esc eq '>'){ $string .= ">" }
502							elsif ($esc eq 'U'){
503	4					14	my $codepoint = $self->_read_length(8);
504	4	50				20	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
505	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
506							}
507	4					21	$string .= chr(hex($codepoint));
508							}
509							elsif ($esc eq 'u'){
510	4					14	my $codepoint = $self->_read_length(4);
511	4	50				27	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
512	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
513							}
514	4					23	$string .= chr(hex($codepoint));
515							}
516							else {
517	0					0	$self->_throw_error("Unrecognized string escape '$esc'");
518							}
519							} else {
520	67					221	$self->{buffer} =~ /^[^"\\]+/;
521	67					234	$string .= $self->_read_length($+[0]);
522							}
523							}
524							}
525	26					87	return $self->new_token(STRING3D, $string);
526							} else {
527							### #x22 scharacter* #x22
528	736					1640	my $string = '';
529	736					1300	while (1) {
530	1623	100				8404	if (substr($self->{buffer}, 0, 1) eq '\\') {
		100
		100
531	96					207	my $c = $self->_peek_char;
532	96					239	$self->_get_char_safe('\\');
533	96					189	my $esc = $self->_get_char;
534	96	100				339	if ($esc eq '\\'){ $string .= "\\" }
	45	100				86
		50
		50
		50
		100
		50
		50
		50
		100
		100
535	44					95	elsif ($esc eq '"'){ $string .= '"' }
536	0					0	elsif ($esc eq "'"){ $string .= "'" }
537	0					0	elsif ($esc eq 'r'){ $string .= "\r" }
538	0					0	elsif ($esc eq 't'){ $string .= "\t" }
539	1					3	elsif ($esc eq 'n'){ $string .= "\n" }
540	0					0	elsif ($esc eq 'b'){ $string .= "\b" }
541	0					0	elsif ($esc eq 'f'){ $string .= "\f" }
542	0					0	elsif ($esc eq '>'){ $string .= ">" }
543							elsif ($esc eq 'U'){
544	3					12	my $codepoint = $self->_read_length(8);
545	3	100				17	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
546	2					9	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
547							}
548	1					7	$string .= chr(hex($codepoint));
549							}
550							elsif ($esc eq 'u'){
551	2					8	my $codepoint = $self->_read_length(4);
552	2	100				12	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
553	1					5	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
554							}
555	1					7	$string .= chr(hex($codepoint));
556							}
557							else {
558	1					5	$self->_throw_error("Unrecognized string escape '$esc'");
559							}
560							} elsif ($self->{buffer} =~ /^[^"\\]+/) {
561	795					2879	$string .= $self->_read_length($+[0]);
562							} elsif (substr($self->{buffer}, 0, 1) eq '"') {
563	730					1639	last;
564							} else {
565	2					10	$self->_throw_error("Got '$c' while expecting string character");
566							}
567							}
568	730					2444	$self->_get_char_safe(q["]);
569	730					2080	return $self->new_token(STRING1D, $string);
570							}
571							}
572
573							sub _get_single_literal {
574	32			32		79	my $self = shift;
575	32					93	my $c = $self->_peek_char();
576	32					114	$self->_get_char_safe("'");
577	32	100				160	if (substr($self->{buffer}, 0, 2) eq q['']) {
578							# #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
579	10					35	$self->_read_word(q['']);
580
581	10					19	my $quote_count = 0;
582	10					25	my $string = '';
583	10					24	while (1) {
584	47	100				124	if (length($self->{buffer}) == 0) {
585	4					16	$self->fill_buffer;
586	4	100				18	if (length($self->{buffer}) == 0) {
587	1					5	$self->_throw_error("Found EOF in string literal");
588							}
589							}
590	46	100				115	if (substr($self->{buffer}, 0, 1) eq "'") {
591	31					66	my $c = $self->_get_char;
592	31					51	$quote_count++;
593	31	100				79	if ($quote_count == 3) {
594	9					23	last;
595							}
596							} else {
597	15	100				40	if ($quote_count) {
598	3					14	$string .= "'" foreach (1..$quote_count);
599	3					6	$quote_count = 0;
600							}
601	15	50				48	if (substr($self->{buffer}, 0, 1) eq '\\') {
602	0					0	my $c = $self->_get_char;
603							# $self->_get_char_safe('\\');
604	0					0	my $esc = $self->_get_char_fill_buffer;
605	0	0				0	if ($esc eq '\\'){ $string .= "\\" }
	0	0				0
		0
		0
		0
		0
		0
		0
		0
		0
		0
606	0					0	elsif ($esc eq '"'){ $string .= '"' }
607	0					0	elsif ($esc eq "'"){ $string .= "'" }
608	0					0	elsif ($esc eq 'r'){ $string .= "\r" }
609	0					0	elsif ($esc eq 't'){ $string .= "\t" }
610	0					0	elsif ($esc eq 'n'){ $string .= "\n" }
611	0					0	elsif ($esc eq 'b'){ $string .= "\b" }
612	0					0	elsif ($esc eq 'f'){ $string .= "\f" }
613	0					0	elsif ($esc eq '>'){ $string .= ">" }
614							elsif ($esc eq 'U'){
615	0					0	my $codepoint = $self->_read_length(8);
616	0	0				0	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
617	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
618							}
619	0					0	$string .= chr(hex($codepoint));
620							}
621							elsif ($esc eq 'u'){
622	0					0	my $codepoint = $self->_read_length(4);
623	0	0				0	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
624	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
625							}
626	0					0	$string .= chr(hex($codepoint));
627							}
628							else {
629	0					0	$self->_throw_error("Unrecognized string escape '$esc'");
630							}
631							} else {
632	15					51	$self->{buffer} =~ /^[^'\\]+/;
633	15					51	$string .= $self->_read_length($+[0]);
634							}
635							}
636							}
637	9					27	return $self->new_token(STRING3S, $string);
638							} else {
639							### #x22 scharacter* #x22
640	22					57	my $string = '';
641	22					45	while (1) {
642	44	100				265	if (substr($self->{buffer}, 0, 1) eq '\\') {
		100
		100
643	8					27	my $c = $self->_peek_char;
644	8					26	$self->_get_char_safe('\\');
645	8					21	my $esc = $self->_get_char;
646	8	100				91	if ($esc eq '\\'){ $string .= "\\" }
	1	50				3
		50
		100
		100
		100
		100
		100
		50
		100
		50
647	0					0	elsif ($esc eq '"'){ $string .= '"' }
648	0					0	elsif ($esc eq "'"){ $string .= "'" }
649	1					5	elsif ($esc eq 'r'){ $string .= "\r" }
650	1					4	elsif ($esc eq 't'){ $string .= "\t" }
651	1					4	elsif ($esc eq 'n'){ $string .= "\n" }
652	1					4	elsif ($esc eq 'b'){ $string .= "\b" }
653	1					3	elsif ($esc eq 'f'){ $string .= "\f" }
654	0					0	elsif ($esc eq '>'){ $string .= ">" }
655							elsif ($esc eq 'U'){
656	1					4	my $codepoint = $self->_read_length(8);
657	1	50				12	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
658	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
659							}
660	1					9	$string .= chr(hex($codepoint));
661							}
662							elsif ($esc eq 'u'){
663	1					5	my $codepoint = $self->_read_length(4);
664	1	50				7	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
665	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
666							}
667	1					6	$string .= chr(hex($codepoint));
668							}
669							else {
670	0					0	$self->_throw_error("Unrecognized string escape '$esc'");
671							}
672							} elsif ($self->{buffer} =~ /^[^'\\]+/) {
673	14					63	$string .= $self->_read_length($+[0]);
674							} elsif (substr($self->{buffer}, 0, 1) eq "'") {
675	20					50	last;
676							} else {
677	2					11	$self->_throw_error("Got '$c' while expecting string character");
678							}
679							}
680	20					65	$self->_get_char_safe(q[']);
681	20					65	return $self->new_token(STRING1S, $string);
682							}
683							}
684
685							sub _get_keyword {
686	289			289		651	my $self = shift;
687	289					1020	$self->_get_char_safe('@');
688	289	100				1870	if ($self->{buffer} =~ /^base/) {
		100
689	7					33	$self->_read_word('base');
690	7					32	return $self->new_token(BASE);
691							} elsif ($self->{buffer} =~ /^prefix/) {
692	257					1033	$self->_read_word('prefix');
693	257					909	return $self->new_token(PREFIX);
694							} else {
695	25	100				143	if ($self->{buffer} =~ /^[a-zA-Z]+(-[a-zA-Z0-9]+)*\b/) {
696	24					108	my $lang = $self->_read_length($+[0]);
697	24					92	return $self->new_token(LANG, $lang);
698							} else {
699	1					5	$self->_throw_error("Expected keyword or language tag");
700							}
701							}
702							}
703
704							sub _throw_error {
705	37			37		90	my $self = shift;
706	37					70	my $error = shift;
707	37					1049	my $line = $self->line;
708	37					972	my $col = $self->column;
709							# Carp::cluck "$line:$col: $error: " . Dumper($self->{buffer});
710	37					541	RDF::Trine::Error::ParserError::Positioned->throw(
711							-text => "$error at $line:$col",
712							-value => [$line, $col],
713							);
714							}
715
716							__PACKAGE__->meta->make_immutable;
717
718							1;
719
720							__END__
721
722							=end private
723
724							=back
725
726							=head1 BUGS
727
728							Please report any bugs or feature requests to through the GitHub web interface
729							at L<https://github.com/kasei/perlrdf/issues>.
730
731							=head1 AUTHOR
732
733							Gregory Todd Williams C<< <gwilliams@cpan.org> >>
734
735							=head1 COPYRIGHT
736
737							Copyright (c) 2006-2012 Gregory Todd Williams. This
738							program is free software; you can redistribute it and/or modify it under
739							the same terms as Perl itself.
740
741							=cut