File Coverage

blib/lib/RDF/Trine/Parser/Turtle/Lexer.pm

Criterion	Covered	Total	%
statement	310	363	85.4
branch	178	236	75.4
condition	16	21	76.1
subroutine	29	29	100.0
pod	5	5	100.0
total	538	654	82.2

line	stmt	bran	cond	sub	pod	time	code
1							# RDF::Trine::Parser::Turtle::Lexer
2							# -----------------------------------------------------------------------------
3
4							=head1 NAME
5
6							RDF::Trine::Parser::Turtle::Lexer - Tokenizer for parsing Turtle, TriG, and N-Triples
7
8							=head1 VERSION
9
10							This document describes RDF::Trine::Parser::Turtle::Lexer version 1.018
11
12							=head1 SYNOPSIS
13
14							use RDF::Trine::Parser::Lexer;
15							my $l = RDF::Trine::Parser::Lexer->new( file => $fh );
16							while (my $t = $l->get_token) {
17							...
18							}
19
20							=head1 METHODS
21
22							=over 4
23
24							=cut
25
26							package RDF::Trine::Parser::Turtle::Lexer;
27
28	68			68		440	use RDF::Trine::Parser::Turtle::Constants;
	68					156
	68					6333
29	68			68		973	use 5.010;
	68					226
30	68			68		327	use strict;
	68					155
	68					1217
31	68			68		319	use warnings;
	68					168
	68					12477
32	68			68		32218	use Moose;
	68					28702906
	68					482
33	68			68		470263	use Data::Dumper;
	68					405
	68					3816
34	68			68		428	use RDF::Trine::Error;
	68					385
	68					630
35
36							our $VERSION;
37							BEGIN {
38	68			68		120393	$VERSION = '1.018';
39							}
40
41							my $r_nameChar_extra = qr'[-0-9\x{B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]'o;
42							my $r_nameStartChar_minus_underscore = qr'[A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{00010000}-\x{000EFFFF}]'o;
43							my $r_nameStartChar = qr/[A-Za-z_\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/;
44							my $r_nameChar = qr/${r_nameStartChar}\|[-0-9\x{b7}\x{0300}-\x{036f}\x{203F}-\x{2040}]/;
45							my $r_prefixName = qr/(?:(?!_)${r_nameStartChar})(?:$r_nameChar)*/;
46							my $r_nameChar_test = qr"(?:$r_nameStartChar\|$r_nameChar_extra)";
47							my $r_double = qr'[+-]?([0-9]+\.[0-9]*[eE][+-]?[0-9]+\|\.[0-9]+[eE][+-]?[0-9]+\|[0-9]+[eE][+-]?[0-9]+)';
48							my $r_decimal = qr'[+-]?(([0-9]+\.[0-9]+)\|\.([0-9])+)';
49							my $r_integer = qr'[+-]?[0-9]+';
50							my $r_PN_CHARS_U = qr/[_A-Za-z_\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/;
51							my $r_PN_CHARS = qr"${r_PN_CHARS_U}\|[-0-9\x{00B7}\x{0300}-\x{036F}\x{203F}-\x{2040}]";
52							my $r_bnode_id = qr"(?:${r_PN_CHARS_U}\|[0-9])((${r_PN_CHARS}\|[.])*${r_PN_CHARS})?";
53
54							my $r_PN_CHARS_BASE = qr/([A-Z]\|[a-z]\|[\x{00C0}-\x{00D6}]\|[\x{00D8}-\x{00F6}]\|[\x{00F8}-\x{02FF}]\|[\x{0370}-\x{037D}]\|[\x{037F}-\x{1FFF}]\|[\x{200C}-\x{200D}]\|[\x{2070}-\x{218F}]\|[\x{2C00}-\x{2FEF}]\|[\x{3001}-\x{D7FF}]\|[\x{F900}-\x{FDCF}]\|[\x{FDF0}-\x{FFFD}]\|[\x{10000}-\x{EFFFF}])/;
55							# my $r_PN_CHARS_U = qr/([_]\|${r_PN_CHARS_BASE})/;
56							# my $r_PN_CHARS = qr/${r_PN_CHARS_U}\|-\|[0-9]\|\x{00B7}\|[\x{0300}-\x{036F}]\|[\x{203F}-\x{2040}]/;
57							my $r_PN_PREFIX = qr/(${r_PN_CHARS_BASE}((${r_PN_CHARS}\|[.])*${r_PN_CHARS})?)/;
58							my $r_PN_LOCAL_ESCAPED = qr{(\\([-~.!&'()*+,;=/?#@%_\$]))\|%[0-9A-Fa-f]{2}};
59							my $r_PN_LOCAL = qr/((${r_PN_CHARS_U}\|[:0-9]\|${r_PN_LOCAL_ESCAPED})((${r_PN_CHARS}\|${r_PN_LOCAL_ESCAPED}\|[:.])*(${r_PN_CHARS}\|[:]\|${r_PN_LOCAL_ESCAPED}))?)/;
60							my $r_PN_LOCAL_BNODE = qr/((${r_PN_CHARS_U}\|[0-9])((${r_PN_CHARS}\|[.])*${r_PN_CHARS})?)/;
61							my $r_PNAME_NS = qr/((${r_PN_PREFIX})?:)/;
62							my $r_PNAME_LN = qr/(${r_PNAME_NS}${r_PN_LOCAL})/;
63
64							has file => (
65							is => 'ro',
66							isa => 'FileHandle',
67							required => 1,
68							);
69
70							has linebuffer => (
71							is => 'rw',
72							isa => 'Str',
73							default => '',
74							);
75
76							has line => (
77							is => 'rw',
78							isa => 'Int',
79							default => 1,
80							);
81
82							has column => (
83							is => 'rw',
84							isa => 'Int',
85							default => 1,
86							);
87
88							has buffer => (
89							is => 'rw',
90							isa => 'Str',
91							default => '',
92							);
93
94							has start_column => (
95							is => 'rw',
96							isa => 'Int',
97							default => -1,
98							);
99
100							has start_line => (
101							is => 'rw',
102							isa => 'Int',
103							default => -1,
104							);
105
106							sub BUILDARGS {
107	412			412	1	869	my $class = shift;
108	412	50				1323	if (scalar(@_) == 1) {
109	412					11468	return { file => shift };
110							} else {
111	0					0	return $class->SUPER::BUILDARGS(@_);
112							}
113							}
114
115							=item C<< new_token ( $type, @values ) >>
116
117							Returns a new token with the given type and optional values, capturing the
118							current line and column of the input data.
119
120							=cut
121
122							sub new_token {
123	9482			9482	1	16164	my $self = shift;
124	9482					16215	my $type = shift;
125	9482					257420	my $start_line = $self->start_line;
126	9482					236045	my $start_col = $self->start_column;
127	9482					221919	my $line = $self->line;
128	9482					224318	my $col = $self->column;
129	9482					41069	return RDF::Trine::Parser::Turtle::Token->fast_constructor(
130							$type,
131							$start_line,
132							$start_col,
133							$line,
134							$col,
135							\@_,
136							);
137							}
138
139							my %CHAR_TOKEN = (
140							'.' => DOT,
141							';' => SEMICOLON,
142							'[' => LBRACKET,
143							']' => RBRACKET,
144							'(' => LPAREN,
145							')' => RPAREN,
146							'{' => LBRACE,
147							'}' => RBRACE,
148							',' => COMMA,
149							'=' => EQUALS,
150							);
151
152							my %METHOD_TOKEN = (
153							# q[#] => '_get_comment',
154							q[@] => '_get_keyword',
155							q[<] => '_get_iriref',
156							q[_] => '_get_bnode',
157							q['] => '_get_single_literal',
158							q["] => '_get_double_literal',
159							q[:] => '_get_pname',
160							(map {$_ => '_get_number'} (0 .. 9, '-', '+'))
161							);
162
163							=item C<< get_token >>
164
165							Returns the next token present in the input.
166
167							=cut
168
169							sub get_token {
170	9620			9620	1	15357	my $self = shift;
171	9620					14766	while (1) {
172	18827	100				49559	unless (length($self->{buffer})) {
173	516					1392	$self->fill_buffer;
174							}
175							# warn "getting token with buffer: " . Dumper($self->{buffer});
176	18827					40430	my $c = $self->_peek_char();
177	18827	100	66			78803	return unless (defined($c) and length($c));
178
179	18526					500320	$self->start_column( $self->column );
180	18526					447738	$self->start_line( $self->line );
181
182	18526	100	100			57434	if ($c eq '.' and $self->{buffer} =~ $r_decimal) {
183	1					5	return $self->_get_number();
184							}
185
186	18525	100				87213	if (defined(my $name = $CHAR_TOKEN{$c})) { $self->_get_char; return $self->new_token($name); }
	2953	100				8499
	2953	100				7613
		100
		100
		100
187	3672					13827	elsif (defined(my $method = $METHOD_TOKEN{$c})) { return $self->$method() }
188							elsif ($c eq '#') {
189							# we're ignoring comment tokens, but we could return them here instead of falling through to the 'next':
190	196					697	$self->_get_comment();
191	196					5323	next;
192							}
193							elsif ($c =~ /[ \r\n\t]/) {
194	9011		66			57176	while (defined($c) and length($c) and $c =~ /[\t\r\n ]/) {
			100
195	18407					46829	$self->_get_char;
196	18407					33858	$c = $self->_peek_char;
197							}
198
199							# we're ignoring whitespace tokens, but we could return them here instead of falling through to the 'next':
200							# return $self->new_token(WS);
201	9011					18283	next;
202							}
203							elsif ($c =~ /[A-Za-z\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}\x{00F8}-\x{02FF}\x{0370}-\x{037D}\x{037F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/) {
204	2649	100				16937	if ($self->{buffer} =~ /^a(?!:)\s/) {
		100
		100
		100
205	60					203	$self->_get_char;
206	60					187	return $self->new_token(A);
207							} elsif ($self->{buffer} =~ /^(?:true\|false)(?!:)\b/) {
208	10					49	my $bool = $self->_read_length($+[0]);
209	10					43	return $self->new_token(BOOLEAN, $bool);
210							} elsif ($self->{buffer} =~ /^BASE(?!:)\b/i) {
211	4					22	$self->_read_length(4);
212	4					16	return $self->new_token(SPARQLBASE);
213							} elsif ($self->{buffer} =~ /^PREFIX(?!:)\b/i) {
214	3					16	$self->_read_length(6);
215	3					15	return $self->new_token(SPARQLPREFIX);
216							} else {
217	2572					7579	return $self->_get_pname;
218							}
219							}
220	38					146	elsif ($c eq '^') { $self->_read_word('^^'); return $self->new_token(HATHAT); }
	37					115
221							else {
222							# Carp::cluck sprintf("Unexpected byte '$c' (0x%02x)", ord($c));
223	6					41	return $self->_throw_error(sprintf("Unexpected byte '%s' (0x%02x)", $c, ord($c)));
224							}
225	0					0	warn 'byte: ' . Dumper($c);
226							}
227							}
228
229							=begin private
230
231							=cut
232
233							=item C<< fill_buffer >>
234
235							Fills the internal parse buffer with a new line from the input source.
236
237							=cut
238
239							sub fill_buffer {
240	4590			4590	1	7026	my $self = shift;
241	4590	50				122495	unless (length($self->buffer)) {
242	4590					111133	my $line = $self->file->getline;
243	4590	100				114401	if (defined($line)) {
244	3698					10073	$self->{buffer} .= $line;
245							}
246							}
247							}
248
249							=item C<< check_for_bom >>
250
251							Checks the input buffer for a Unicode BOM, and consumes it if it is present.
252
253							=cut
254
255							sub check_for_bom {
256	387			387	1	746	my $self = shift;
257	387					1207	my $c = $self->_peek_char();
258	387	100	100			2668	if (defined($c) and $c eq "\x{FEFF}") {
259	1					4	$self->_get_char;
260							}
261							}
262
263							sub _get_char_safe {
264	5888			5888		9434	my $self = shift;
265	5888					9209	my $char = shift;
266	5888					11781	my $c = $self->_get_char;
267	5888	50				14091	if ($c ne $char) {
268	0					0	$self->_throw_error("Expected '$char' but got '$c'");
269							}
270	5888					9712	return $c;
271							}
272
273							sub _get_char_fill_buffer {
274	26			26		44	my $self = shift;
275	26	50				74	if (length($self->{buffer}) == 0) {
276	0					0	$self->fill_buffer;
277	0	0				0	if (length($self->{buffer}) == 0) {
278	0					0	return;
279							}
280							}
281	26					72	my $c = substr($self->{buffer}, 0, 1, '');
282	26	50				62	if ($c eq "\n") {
283							# $self->{linebuffer} = '';
284	0					0	$self->{line} = 1+$self->{line};
285	0					0	$self->{column} = 1;
286							} else {
287							# $self->{linebuffer} .= $c;
288	26					49	$self->{column} = 1+$self->{column};
289							}
290	26					49	return $c;
291							}
292
293							sub _get_char {
294	33052			33052		48907	my $self = shift;
295	33052					83739	my $c = substr($self->{buffer}, 0, 1, '');
296	33052	100				67754	if ($c eq "\n") {
297							# $self->{linebuffer} = '';
298	3555					6856	$self->{line} = 1+$self->{line};
299	3555					5589	$self->{column} = 1;
300							} else {
301							# $self->{linebuffer} .= $c;
302	29497					49954	$self->{column} = 1+$self->{column};
303							}
304	33052					56848	return $c;
305							}
306
307							sub _peek_char {
308	47760			47760		70117	my $self = shift;
309	47760	100				110593	if (length($self->{buffer}) == 0) {
310	4047					10886	$self->fill_buffer;
311	4047	100				14168	if (length($self->{buffer}) == 0) {
312	587					1839	return;
313							}
314							}
315	47173					84402	my $c = substr($self->{buffer}, 0, 1);
316	47173					185426	return $c;
317							}
318
319							sub _read_word {
320	507			507		855	my $self = shift;
321	507					898	my $word = shift;
322	507					1678	while (length($self->{buffer}) < length($word)) {
323	0					0	$self->fill_buffer;
324							}
325
326	507	100				1649	if (substr($self->{buffer}, 0, length($word)) ne $word) {
327	1					6	$self->_throw_error("Expected '$word'");
328							}
329
330	506					1186	my $lines = ($word =~ tr/\n//);
331	506					1373	my $lastnl = rindex($word, "\n");
332	506					992	my $cols = length($word) - $lastnl - 1;
333	506					1230	$self->{lines} += $lines;
334	506	50				1175	if ($lines) {
335	0					0	$self->{column} = $cols;
336							} else {
337	506					917	$self->{column} += $cols;
338							}
339	506					1519	substr($self->{buffer}, 0, length($word), '');
340							}
341
342							sub _read_length {
343	6029			6029		10249	my $self = shift;
344	6029					11821	my $len = shift;
345	6029					17341	while (length($self->{buffer}) < $len) {
346	0					0	$self->fill_buffer;
347							}
348
349	6029					18268	my $word = substr($self->{buffer}, 0, $len, '');
350	6029					13075	my $lines = ($word =~ tr/\n//);
351	6029					11980	my $lastnl = rindex($word, "\n");
352	6029					11149	my $cols = length($word) - $lastnl - 1;
353	6029					11498	$self->{lines} += $lines;
354	6029	100				12455	if ($lines) {
355	31					58	$self->{column} = $cols;
356							} else {
357	5998					10064	$self->{column} += $cols;
358							}
359	6029					17251	return $word;
360							}
361
362							sub _get_pname {
363	3119			3119		4968	my $self = shift;
364	3119					5215	my $prefix = '';
365
366	3119	100				39991	if ($self->{buffer} =~ /^$r_PNAME_LN/) {
367	2839					10908	my $ln = $self->_read_length($+[0]);
368	2839					13920	my ($ns,$local) = ($ln =~ /^([^:]:)(.)$/);
369	68			68		640	no warnings 'uninitialized';
	68					185
	68					154519
370	2839					7221	$local =~ s{\\([-~.!&'()*+,;=:/?#@%_\$])}{$1}g;
371	2839					7357	return $self->new_token(PREFIXNAME, $ns, $local);
372							} else {
373	280					2100	$self->{buffer} =~ $r_PNAME_NS;
374	280					1150	my $ns = $self->_read_length($+[0]);
375	280					952	return $self->new_token(PREFIXNAME, $ns);
376							}
377							}
378
379							sub _get_iriref {
380	1876			1876		3694	my $self = shift;
381	1876					5482	$self->_get_char_safe(q[<]);
382	1876					3300	my $iri = '';
383	1876					3066	while (1) {
384	3755					7876	my $c = $self->_peek_char;
385	3755	50				9391	last unless defined($c);
386	3755	100				18247	if (substr($self->{buffer}, 0, 1) eq '\\') {
		100
		100
387	11					36	$self->_get_char_safe('\\');
388	11					25	my $esc = $self->_get_char;
389	11	50				56	if ($esc eq '\\') {
		100
		100
390	0					0	$iri .= "\\";
391							} elsif ($esc eq 'U') {
392	3					9	my $codepoint = $self->_read_length(8);
393	3	100				20	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
394	1					4	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
395							}
396	2					12	$iri .= chr(hex($codepoint));
397							} elsif ($esc eq 'u') {
398	6					22	my $codepoint = $self->_read_length(4);
399	6	100				40	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
400	1					6	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
401							}
402	5					27	my $char = chr(hex($codepoint));
403	5	100				24	if ($char =~ /[<>" {}\|\\^`]/) {
404	3					30	$self->_throw_error(sprintf("Bad IRI character: '%s' (0x%x)", $char, ord($char)));
405							}
406	2					5	$iri .= $char;
407							} else {
408	2					12	$self->_throw_error("Unrecognized iri escape '$esc'");
409							}
410							} elsif ($self->{buffer} =~ /^[^<>\x00-\x20\\"{}\|^`]+/) {
411	1875					6702	$iri .= $self->_read_length($+[0]);
412							} elsif (substr($self->{buffer}, 0, 1) eq '>') {
413	1865					3806	last;
414							} else {
415	4					25	$self->_throw_error("Got '$c' while expecting IRI character");
416							}
417							}
418	1865					4918	$self->_get_char_safe(q[>]);
419	1865					4793	return $self->new_token(IRI, $iri);
420							}
421
422							sub _get_bnode {
423	83			83		183	my $self = shift;
424	83					275	$self->_read_word('_:');
425	83	50				1156	unless ($self->{buffer} =~ /^${r_bnode_id}/o) {
426	0					0	$self->_throw_error("Expected: name");
427							}
428	83					379	my $name = substr($self->{buffer}, 0, $+[0]);
429	83					292	$self->_read_word($name);
430	83					244	return $self->new_token(BNODE, $name);
431							}
432
433							sub _get_number {
434	81			81		158	my $self = shift;
435	81	100				1429	if ($self->{buffer} =~ /^${r_double}/) {
		100
		100
436	9					46	return $self->new_token(DOUBLE, $self->_read_length($+[0]));
437							} elsif ($self->{buffer} =~ /^${r_decimal}/) {
438	11					56	return $self->new_token(DECIMAL, $self->_read_length($+[0]));
439							} elsif ($self->{buffer} =~ /^${r_integer}/) {
440	59					287	return $self->new_token(INTEGER, $self->_read_length($+[0]));
441							} else {
442	2					9	$self->_throw_error("Expected number");
443							}
444							}
445
446							sub _get_comment {
447	196			196		383	my $self = shift;
448	196					558	$self->_get_char_safe('#');
449	196					335	my $comment = '';
450	196					460	my $c = $self->_peek_char;
451	196		66			1044	while (length($c) and $c !~ /[\r\n]/) {
452	5287					9683	$comment .= $self->_get_char;
453	5287					9875	$c = $self->_peek_char;
454							}
455	196	50	33			999	if (length($c) and $c =~ /[\r\n]/) {
456	196					460	$self->_get_char;
457							}
458	196					574	return $self->new_token(COMMENT, $comment);
459							}
460
461							sub _get_double_literal {
462	765			765		1447	my $self = shift;
463	765					1918	my $c = $self->_peek_char();
464	765					2463	$self->_get_char_safe(q["]);
465	765	100				2776	if (substr($self->{buffer}, 0, 2) eq q[""]) {
466							# #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
467	29					91	$self->_read_word(q[""]);
468
469	29					68	my $quote_count = 0;
470	29					57	my $string = '';
471	29					52	while (1) {
472	184	100				462	if (length($self->{buffer}) == 0) {
473	23					58	$self->fill_buffer;
474	23	100				78	if (length($self->{buffer}) == 0) {
475	3					12	$self->_throw_error("Found EOF in string literal");
476							}
477							}
478	181	100				394	if (substr($self->{buffer}, 0, 1) eq '"') {
479	88					166	my $c = $self->_get_char;
480	88					133	$quote_count++;
481	88	100				202	if ($quote_count == 3) {
482	26					53	last;
483							}
484							} else {
485	93	100				204	if ($quote_count) {
486	8					31	$string .= '"' foreach (1..$quote_count);
487	8					16	$quote_count = 0;
488							}
489	93	100				232	if (substr($self->{buffer}, 0, 1) eq '\\') {
490	26					54	my $c = $self->_get_char;
491							# $self->_get_char_safe('\\');
492	26					54	my $esc = $self->_get_char_fill_buffer;
493	26	100				147	if ($esc eq '\\'){ $string .= "\\" }
	1	100				4
		50
		100
		100
		100
		50
		50
		50
		100
		50
494	5					12	elsif ($esc eq '"'){ $string .= '"' }
495	0					0	elsif ($esc eq "'"){ $string .= "'" }
496	4					10	elsif ($esc eq 'r'){ $string .= "\r" }
497	4					10	elsif ($esc eq 't'){ $string .= "\t" }
498	4					11	elsif ($esc eq 'n'){ $string .= "\n" }
499	0					0	elsif ($esc eq 'b'){ $string .= "\b" }
500	0					0	elsif ($esc eq 'f'){ $string .= "\f" }
501	0					0	elsif ($esc eq '>'){ $string .= ">" }
502							elsif ($esc eq 'U'){
503	4					11	my $codepoint = $self->_read_length(8);
504	4	50				20	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
505	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
506							}
507	4					15	$string .= chr(hex($codepoint));
508							}
509							elsif ($esc eq 'u'){
510	4					11	my $codepoint = $self->_read_length(4);
511	4	50				20	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
512	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
513							}
514	4					13	$string .= chr(hex($codepoint));
515							}
516							else {
517	0					0	$self->_throw_error("Unrecognized string escape '$esc'");
518							}
519							} else {
520	67					195	$self->{buffer} =~ /^[^"\\]+/;
521	67					191	$string .= $self->_read_length($+[0]);
522							}
523							}
524							}
525	26					79	return $self->new_token(STRING3D, $string);
526							} else {
527							### #x22 scharacter* #x22
528	736					1666	my $string = '';
529	736					1188	while (1) {
530	1623	100				7992	if (substr($self->{buffer}, 0, 1) eq '\\') {
		100
		100
531	96					205	my $c = $self->_peek_char;
532	96					258	$self->_get_char_safe('\\');
533	96					196	my $esc = $self->_get_char;
534	96	100				291	if ($esc eq '\\'){ $string .= "\\" }
	45	100				87
		50
		50
		50
		100
		50
		50
		50
		100
		100
535	44					81	elsif ($esc eq '"'){ $string .= '"' }
536	0					0	elsif ($esc eq "'"){ $string .= "'" }
537	0					0	elsif ($esc eq 'r'){ $string .= "\r" }
538	0					0	elsif ($esc eq 't'){ $string .= "\t" }
539	1					3	elsif ($esc eq 'n'){ $string .= "\n" }
540	0					0	elsif ($esc eq 'b'){ $string .= "\b" }
541	0					0	elsif ($esc eq 'f'){ $string .= "\f" }
542	0					0	elsif ($esc eq '>'){ $string .= ">" }
543							elsif ($esc eq 'U'){
544	3					9	my $codepoint = $self->_read_length(8);
545	3	100				18	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
546	2					10	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
547							}
548	1					7	$string .= chr(hex($codepoint));
549							}
550							elsif ($esc eq 'u'){
551	2					8	my $codepoint = $self->_read_length(4);
552	2	100				11	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
553	1					5	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
554							}
555	1					7	$string .= chr(hex($codepoint));
556							}
557							else {
558	1					6	$self->_throw_error("Unrecognized string escape '$esc'");
559							}
560							} elsif ($self->{buffer} =~ /^[^"\\]+/) {
561	795					3150	$string .= $self->_read_length($+[0]);
562							} elsif (substr($self->{buffer}, 0, 1) eq '"') {
563	730					1570	last;
564							} else {
565	2					10	$self->_throw_error("Got '$c' while expecting string character");
566							}
567							}
568	730					2305	$self->_get_char_safe(q["]);
569	730					1839	return $self->new_token(STRING1D, $string);
570							}
571							}
572
573							sub _get_single_literal {
574	32			32		79	my $self = shift;
575	32					84	my $c = $self->_peek_char();
576	32					104	$self->_get_char_safe("'");
577	32	100				120	if (substr($self->{buffer}, 0, 2) eq q['']) {
578							# #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
579	10					37	$self->_read_word(q['']);
580
581	10					24	my $quote_count = 0;
582	10					25	my $string = '';
583	10					21	while (1) {
584	47	100				127	if (length($self->{buffer}) == 0) {
585	4					15	$self->fill_buffer;
586	4	100				16	if (length($self->{buffer}) == 0) {
587	1					4	$self->_throw_error("Found EOF in string literal");
588							}
589							}
590	46	100				121	if (substr($self->{buffer}, 0, 1) eq "'") {
591	31					69	my $c = $self->_get_char;
592	31					51	$quote_count++;
593	31	100				84	if ($quote_count == 3) {
594	9					20	last;
595							}
596							} else {
597	15	100				46	if ($quote_count) {
598	3					15	$string .= "'" foreach (1..$quote_count);
599	3					8	$quote_count = 0;
600							}
601	15	50				51	if (substr($self->{buffer}, 0, 1) eq '\\') {
602	0					0	my $c = $self->_get_char;
603							# $self->_get_char_safe('\\');
604	0					0	my $esc = $self->_get_char_fill_buffer;
605	0	0				0	if ($esc eq '\\'){ $string .= "\\" }
	0	0				0
		0
		0
		0
		0
		0
		0
		0
		0
		0
606	0					0	elsif ($esc eq '"'){ $string .= '"' }
607	0					0	elsif ($esc eq "'"){ $string .= "'" }
608	0					0	elsif ($esc eq 'r'){ $string .= "\r" }
609	0					0	elsif ($esc eq 't'){ $string .= "\t" }
610	0					0	elsif ($esc eq 'n'){ $string .= "\n" }
611	0					0	elsif ($esc eq 'b'){ $string .= "\b" }
612	0					0	elsif ($esc eq 'f'){ $string .= "\f" }
613	0					0	elsif ($esc eq '>'){ $string .= ">" }
614							elsif ($esc eq 'U'){
615	0					0	my $codepoint = $self->_read_length(8);
616	0	0				0	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
617	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
618							}
619	0					0	$string .= chr(hex($codepoint));
620							}
621							elsif ($esc eq 'u'){
622	0					0	my $codepoint = $self->_read_length(4);
623	0	0				0	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
624	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
625							}
626	0					0	$string .= chr(hex($codepoint));
627							}
628							else {
629	0					0	$self->_throw_error("Unrecognized string escape '$esc'");
630							}
631							} else {
632	15					58	$self->{buffer} =~ /^[^'\\]+/;
633	15					57	$string .= $self->_read_length($+[0]);
634							}
635							}
636							}
637	9					28	return $self->new_token(STRING3S, $string);
638							} else {
639							### #x22 scharacter* #x22
640	22					52	my $string = '';
641	22					46	while (1) {
642	44	100				262	if (substr($self->{buffer}, 0, 1) eq '\\') {
		100
		100
643	8					19	my $c = $self->_peek_char;
644	8					21	$self->_get_char_safe('\\');
645	8					18	my $esc = $self->_get_char;
646	8	100				62	if ($esc eq '\\'){ $string .= "\\" }
	1	50				3
		50
		100
		100
		100
		100
		100
		50
		100
		50
647	0					0	elsif ($esc eq '"'){ $string .= '"' }
648	0					0	elsif ($esc eq "'"){ $string .= "'" }
649	1					3	elsif ($esc eq 'r'){ $string .= "\r" }
650	1					3	elsif ($esc eq 't'){ $string .= "\t" }
651	1					3	elsif ($esc eq 'n'){ $string .= "\n" }
652	1					4	elsif ($esc eq 'b'){ $string .= "\b" }
653	1					3	elsif ($esc eq 'f'){ $string .= "\f" }
654	0					0	elsif ($esc eq '>'){ $string .= ">" }
655							elsif ($esc eq 'U'){
656	1					3	my $codepoint = $self->_read_length(8);
657	1	50				6	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
658	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
659							}
660	1					5	$string .= chr(hex($codepoint));
661							}
662							elsif ($esc eq 'u'){
663	1					5	my $codepoint = $self->_read_length(4);
664	1	50				6	unless ($codepoint =~ /^[0-9A-Fa-f]+$/) {
665	0					0	$self->_throw_error("Bad unicode escape codepoint '$codepoint'");
666							}
667	1					5	$string .= chr(hex($codepoint));
668							}
669							else {
670	0					0	$self->_throw_error("Unrecognized string escape '$esc'");
671							}
672							} elsif ($self->{buffer} =~ /^[^'\\]+/) {
673	14					66	$string .= $self->_read_length($+[0]);
674							} elsif (substr($self->{buffer}, 0, 1) eq "'") {
675	20					41	last;
676							} else {
677	2					10	$self->_throw_error("Got '$c' while expecting string character");
678							}
679							}
680	20					66	$self->_get_char_safe(q[']);
681	20					60	return $self->new_token(STRING1S, $string);
682							}
683							}
684
685							sub _get_keyword {
686	289			289		547	my $self = shift;
687	289					868	$self->_get_char_safe('@');
688	289	100				1393	if ($self->{buffer} =~ /^base/) {
		100
689	7					27	$self->_read_word('base');
690	7					26	return $self->new_token(BASE);
691							} elsif ($self->{buffer} =~ /^prefix/) {
692	257					905	$self->_read_word('prefix');
693	257					849	return $self->new_token(PREFIX);
694							} else {
695	25	100				139	if ($self->{buffer} =~ /^[a-zA-Z]+(-[a-zA-Z0-9]+)*\b/) {
696	24					98	my $lang = $self->_read_length($+[0]);
697	24					83	return $self->new_token(LANG, $lang);
698							} else {
699	1					5	$self->_throw_error("Expected keyword or language tag");
700							}
701							}
702							}
703
704							sub _throw_error {
705	37			37		78	my $self = shift;
706	37					68	my $error = shift;
707	37					1068	my $line = $self->line;
708	37					925	my $col = $self->column;
709							# Carp::cluck "$line:$col: $error: " . Dumper($self->{buffer});
710	37					452	RDF::Trine::Error::ParserError::Positioned->throw(
711							-text => "$error at $line:$col",
712							-value => [$line, $col],
713							);
714							}
715
716							__PACKAGE__->meta->make_immutable;
717
718							1;
719
720							__END__
721
722							=end private
723
724							=back
725
726							=head1 BUGS
727
728							Please report any bugs or feature requests to through the GitHub web interface
729							at L<https://github.com/kasei/perlrdf/issues>.
730
731							=head1 AUTHOR
732
733							Gregory Todd Williams C<< <gwilliams@cpan.org> >>
734
735							=head1 COPYRIGHT
736
737							Copyright (c) 2006-2012 Gregory Todd Williams. This
738							program is free software; you can redistribute it and/or modify it under
739							the same terms as Perl itself.
740
741							=cut