File Coverage

blib/lib/Markdown/Compiler/Lexer.pm

Criterion	Covered	Total	%
statement	236	257	91.8
branch	24	32	75.0
condition			n/a
subroutine	71	77	92.2
pod	0	37	0.0
total	331	403	82.1

line	stmt	bran	sub	pod	time	code
1						package Markdown::Compiler::Lexer;
2						BEGIN {
3						{
4						package Markdown::Compiler::Lexer::Token;
5	18		18		128	use Moo;
	18				34
	18				112
6
7	18				107	has source => (
8						is => 'ro',
9						required => 1,
10						);
11
12	18				14187	has start => (
13						is => 'ro',
14						required => 1,
15						);
16
17	18				3895	has end => (
18						is => 'ro',
19						required => 1,
20						);
21
22						has line => (
23						is => 'ro',
24						lazy => 1,
25						builder => sub {
26	0		0		0	my $self = shift;
27
28	0				0	my $lines = grep { $_ eq "\n" } (split(//, substr(${$self->source}, 0, $self->start)));
	0				0
	0				0
29	0				0	return $lines;
30						},
31	18				4388	);
32
33						has content => (
34						is => 'ro',
35						lazy => 1,
36						builder => sub {
37	398		398		2732	my $self = shift;
38	398				470	return substr( ${$self->source}, $self->start, ( $self->end - $self->start ) );
	398				2667
39						},
40	18				16028	);
41
42						# Allow subclasses to override this, for example to return multiple tokens.
43						sub tokens {
44	398		398	0	141288	return shift;
45						}
46
47	18				12231	1;
48						}
49						{
50	18		18		6122	package Markdown::Compiler::Lexer::Token::EscapedChar;
51	18		18		9476	use Moo;
	18				45
	18				85
52	18				75	extends 'Markdown::Compiler::Lexer::Token';
53
54	294		294	0	526	sub type { 'EscapedChar' }
55	424		424	0	1268	sub match { [ qr/\G(\\\\\|\\\`\|\\\*\|\\\_\|\\\{\|\\\}\|\\\[\|\\\]\|\\$\|\\$\|\\\#\|\\\+\|\\\-\|\\\.\|\\\!)/ ] }
56
57						# Delete the first \
58						around content => sub {
59	14				242	my $orig = shift;
60	14				170	my $value = $orig->(@_);
61
62	14				56	return substr($value,1);
63	18				2978	};
64
65	18				45366	1;
66						}
67						{
68	18				47	package Markdown::Compiler::Lexer::Token::CodeBlock;
69	18		18		7603	use Moo;
	18				82
	18				73
70	18				164	extends 'Markdown::Compiler::Lexer::Token';
71
72	30		30	0	90	sub type { 'CodeBlock' }
73						sub match {[
74	410		410	0	1173	qr\|\G\`\`\`(?:\n\\|$)\|,
75						qr\|\G\`\`\`[ ]\S+[ ]\n\|,
76						]}
77
78
79
80						has language => (
81						is => 'ro',
82						lazy => 1,
83						builder => sub {
84	2		2		54	my $content = shift->content;
85
86	2	50			9	if ( $content =~ m\|\`\`\`[ ](\S+)[ ]\n\| ) {
87	0				0	return $1;
88						}
89	2				18	return undef;
90						}
91	18				2795	);
92
93	18				23534	1;
94						}
95
96						{
97	18				44	package Markdown::Compiler::Lexer::Token::HR;
98	18		18		8264	use Moo;
	18				38
	18				70
99	18				77	extends 'Markdown::Compiler::Lexer::Token';
100
101	0		0	0	0	sub type { 'HR' }
102	406		406	0	924	sub match { [ qr/\G(?:(?<=^)\|(?<=\n))((?:(\\s\\s\)\|(-\s-\s-)\|(_\s_\s_))[-_\s]*)\n/ ] }
103	18				2270	1;
104						}
105
106						{
107	18				42	package Markdown::Compiler::Lexer::Token::Image;
108	18		18		6712	use Moo;
	18				38
	18				80
109	18				75	extends 'Markdown::Compiler::Lexer::Token';
110	18		18		15258	use Regexp::Common qw( URI );
	18				42753
	18				71
111
112						# Regexp::Common::URI doesn't support fragments; I should make a patch for it.
113	18				2027	my $url_match = qr/$RE{URI}{HTTP}{ -scheme => 'https?' }(?:\#[A-z0-9-_]+)?/;
114
115	62		62	0	127	sub type { 'Image' }
116						# sub match {[
117						# qr/\G\!\[(.)\]$($url_match)\s+"([^"]+)"\s$/,
118						# qr/\G\!\[(.)\]$($url_match\s)$/,
119						# qr/\G\!($url_match)/,
120						# ]}
121						sub match {[
122	406		406	0	4941	qr/\G\!\[(.)\]$([^ ]+)\s+"([^"]+)"\s$/,
123						qr/\G\!\[(.)\]$([^ ]+\s)$/,
124						qr/\G\!($url_match)/,
125						]}
126
127						has text => (
128						is => 'ro',
129						lazy => 1,
130	4		4		90	builder => sub { shift->data->{text} },
131	18				7396	);
132
133						has href => (
134						is => 'ro',
135						lazy => 1,
136	4		4		144	builder => sub { shift->data->{href} },
137	18				21344	);
138
139						has title => (
140						is => 'ro',
141						lazy => 1,
142	4		4		78	builder => sub { shift->data->{title} },
143	18				12659	);
144
145						has data => (
146						is => 'ro',
147						lazy => 1,
148						builder => sub {
149	4		4		96	my $content = shift->content;
150
151	4	100			232	if ( $content =~ /!\[(.)\]$([^ ]+)\s+"([^"]+)"\s$/ ) {
		100
		50
152						return {
153	1				20	text => $1,
154						href => $2,
155						title => $3,
156						}
157						} elsif ( $content =~ /!\[(.)\]$([^ ]+\s)$/ ) {
158						return {
159	1				22	text => $1,
160						href => $2,
161						title => undef,
162						}
163						} elsif ( $content =~ /!($url_match)/ ) {
164						return {
165	2				52	text => undef,
166						href => $1,
167						title => undef,
168						};
169						}
170						}
171	18				12531	);
172
173	18				12276	1;
174						}
175						{
176	18				38	package Markdown::Compiler::Lexer::Token::Link;
177	18		18		397587	use Moo;
	18				79
	18				169
178	18				77	extends 'Markdown::Compiler::Lexer::Token';
179	18		18		5830	use Regexp::Common qw( URI );
	18				56
	18				420
180
181						# Regexp::Common::URI doesn't support fragments; I should make a patch for it.
182	18				2657	my $url_match = qr/$RE{URI}{HTTP}{ -scheme => 'https?' }(?:#[A-z0-9-_]+)?(?=[ )])/;
183
184	132		132	0	277	sub type { 'Link' }
185						# qr/\G\[.\]$$url_match\s+"([^"]+)"\s$/,
186						# qr/\G\[.*\]$$url_match$/,
187						# qr/\G$url_match/,
188						sub match {[
189	402		402	0	10058	qr/\G\[.?\]$$url_match\s+"([^"]+)"\s$/,
190						qr/\G\[.*?\]$$url_match$/,
191						qr/\G$url_match/,
192						qr/\G$RE{URI}{HTTP}{ -scheme => 'https?' }/,
193						]}
194
195						has text => (
196						is => 'ro',
197						lazy => 1,
198	10		10		266	builder => sub { shift->data->{text} },
199	18				5868	);
200
201						has title => (
202						is => 'ro',
203						lazy => 1,
204	10		10		231	builder => sub { shift->data->{title} },
205	18				21420	);
206
207						has href => (
208						is => 'ro',
209						lazy => 1,
210	10		10		391	builder => sub { shift->data->{href} },
211	18				12199	);
212
213						has data => (
214						is => 'ro',
215						lazy => 1,
216						builder => sub {
217	10		10		222	my $content = shift->content;
218
219	10	100			3720	if ( $content =~ /\[(.)\]$($url_match)\s+"([^"]+)"\s$/ ) {
		100
		50
		50
220						return {
221	1				27	text => $1,
222						href => $2,
223						title => $3,
224						};
225						} elsif ( $content =~ /\[(.)\]$($url_match\s)$/ ) {
226						return {
227	7				209	text => $1,
228						href => $2,
229						title => undef,
230						};
231						} elsif ( $content =~ /($url_match)/ ) {
232						return {
233	0				0	text => undef,
234						href => $1,
235						title => undef,
236						};
237						} elsif ( $content =~ /($RE{URI}{HTTP}{ -scheme => 'https?' })/ ) {
238						return {
239	2				558	text => undef,
240						href => $1,
241						title => undef,
242						};
243						}
244						},
245	18				12188	);
246
247	18				12202	1;
248						}
249
250						{
251	18				40	package Markdown::Compiler::Lexer::Token::Item;
252	18		18		8554	use Moo;
	18				302
	18				93
253	18				85	extends 'Markdown::Compiler::Lexer::Token';
254
255	126		126	0	416	sub type { 'Item' }
256						sub match { [
257						# Unordered / Beginning of line, then * + or -
258	392		392	0	1158	qr/\G(?:(?<=^)\|(?<=\n))(?:\*\|\+\|\-) /,
259
260						# Numbered / Beginning of line, [number].[space]
261						qr/\G(?:(?<=^)\|(?<=\n))\d+\.\s+/,
262						]}
263
264						# Note: I also have the following version of this; I should work out why I wrote it:
265						# $str =~ /\G(?:(?=^)\|(?=\n))(?:\*\|\+\|\-) /gc or ( exists $tokens[-1] and $tokens[-1]->{type} eq 'line_break' and $str =~ /\G(?:\*\|\+\|\-) /gc
266
267	18				2882	1;
268						}
269
270						{
271	18				44	package Markdown::Compiler::Lexer::Token::TableStart;
272	18		18		7165	use Moo;
	18				42
	18				79
273	18				83	extends 'Markdown::Compiler::Lexer::Token';
274
275	57		57	0	497	sub type { 'TableStart' }
276	380		380	0	849	sub match { [ qr/\G(?:(?<=^)\|(?<=\n))\\| / ] }
277
278	18				1974	1;
279						}
280
281						{
282	18				43	package Markdown::Compiler::Lexer::Token::TableHeaderSep;
283	18		18		6133	use Moo;
	18				48
	18				69
284	18				78	extends 'Markdown::Compiler::Lexer::Token';
285
286	0		0	0	0	sub type { 'TableHeaderSep' }
287						# sub match { [ qr/\G(?:(?<=^)\|(?<=\n))\\| / ] }
288
289						sub match { return [
290	374		374	0	1243	qr/\G:---:/,
291						qr/\G:--/,
292						qr/\G--:/,
293						];
294						}
295
296	18				1983	1;
297						}
298
299						{
300	18				59	package Markdown::Compiler::Lexer::Token::BlockQuote;
301	18		18		7096	use Moo;
	18				50
	18				69
302	18				76	extends 'Markdown::Compiler::Lexer::Token';
303
304	18		18	0	61	sub type { 'BlockQuote' }
305	370		370	0	835	sub match { [ qr/\G(?:(?=^)\|(?=\n)\|(?=>\s))> / ] }
306
307	18				1992	1;
308						}
309
310						{
311	18				35	package Markdown::Compiler::Lexer::Token::Header;
312	18		18		6301	use Moo;
	18				35
	18				74
313	18				112	extends 'Markdown::Compiler::Lexer::Token';
314
315	2		2	0	85	sub type { 'Header' }
316	367		367	0	801	sub match { [ qr/\G([\#]+) (.+?)(?=\n\|$)/ ] }
317
318						has size => (
319						is => 'ro',
320						lazy => 1,
321	1				27	default => sub { length(shift->data->{header}) },
322	18				2339	);
323
324						has title => (
325						is => 'ro',
326						lazy => 1,
327	1				39	default => sub { shift->data->{title} },
328	18				22167	);
329
330						has data => (
331						is => 'ro',
332						lazy => 1,
333						builder => sub {
334	1		1		27	my $content = shift->content;
335
336	1	50			13	if ( $content =~ /^([\#]+)\s+(.+?)$/ ) {
337						return {
338	1				24	header => $1,
339						title => $2,
340						};
341						}
342						},
343	18				12289	);
344
345
346	18				12337	1;
347						}
348
349						{
350	18				61	package Markdown::Compiler::Lexer::Token::InlineCode;
351	18		18		9025	use Moo;
	18				59
	18				81
352	18				75	extends 'Markdown::Compiler::Lexer::Token';
353
354	40		40	0	102	sub type { 'InlineCode' }
355	374		374	0	844	sub match { [ qr/\G`/ ] }
356
357	18				2515	1;
358						}
359
360						{
361	18				43	package Markdown::Compiler::Lexer::Token::Bold;
362	18		18		5918	use Moo;
	18				44
	18				67
363	18				105	extends 'Markdown::Compiler::Lexer::Token';
364
365	132		132	0	248	sub type { 'Bold' }
366
367	18				1926	1;
368						}
369
370						{
371	18				37	package Markdown::Compiler::Lexer::Token::Italic;
372	18		18		5570	use Moo;
	18				39
	18				74
373	18				86	extends 'Markdown::Compiler::Lexer::Token';
374
375	156		156	0	334	sub type { 'Italic' }
376
377	18				1879	1;
378						}
379
380						{
381	18				37	package Markdown::Compiler::Lexer::Token::BoldItalic;
382	18		18		5638	use Moo;
	18				70
	18				89
383	18				79	extends 'Markdown::Compiler::Lexer::Token';
384
385	0		0	0	0	sub type { 'BoldItalic' }
386
387	18				1879	1;
388						}
389
390						{
391	18				36	package Markdown::Compiler::Lexer::Token::BoldItalicMaker;
392	18		18		5202	use Moo;
	18				35
	18				68
393	18				120	extends 'Markdown::Compiler::Lexer::Token';
394
395	0		0	0	0	sub type { 'ShortAttribute' }
396						sub match {
397						return [
398	365		365	0	1804	qr/\G\\\*/,
399						qr/\G___/,
400						qr/\G\\/,
401						qr/\G(?:(?<=^)\|(?<=[\s]))\*(?=\S\|$)/,
402						qr/\G(?<=[\S])\*/,
403						qr/\G__/,
404						qr/\G_/,
405						]
406						}
407
408						sub tokens {
409	26		26	0	5937	my ( $self ) = @_;
410	26				364	my $content = $self->content;
411
412	26	50			148	if ( $content =~ /^___/ ) {
		50
		100
		100
		100
		50
413	0				0	return Markdown::Compiler::Lexer::Token::BoldItalic->new(
414						start => $self->start,
415						end => $self->end,
416						source => $self->source
417						);
418						} elsif ( $content =~ /^\\\*/ ) {
419	0				0	return Markdown::Compiler::Lexer::Token::BoldItalic->new(
420						start => $self->start,
421						end => $self->end,
422						source => $self->source
423						);
424						} elsif ( $content =~ /^\\/ ) {
425	8				114	return Markdown::Compiler::Lexer::Token::Bold->new(
426						start => $self->start,
427						end => $self->end,
428						source => $self->source
429						);
430						} elsif ( $content =~ /^__/ ) {
431	4				62	return Markdown::Compiler::Lexer::Token::Bold->new(
432						start => $self->start,
433						end => $self->end,
434						source => $self->source
435						);
436						} elsif ( $content =~ /^_/ ) {
437	6				127	return Markdown::Compiler::Lexer::Token::Italic->new(
438						start => $self->start,
439						end => $self->end,
440						source => $self->source
441						);
442						} elsif ( $content =~ /^\*/ ) {
443	8				115	return Markdown::Compiler::Lexer::Token::Italic->new(
444						start => $self->start,
445						end => $self->end,
446						source => $self->source
447						);
448
449						}
450						};
451
452
453
454	18				1981	1;
455						}
456
457						{
458	18				44	package Markdown::Compiler::Lexer::Token::LineBreak;
459	18		18		11905	use Moo;
	18				33
	18				79
460	18				190	extends 'Markdown::Compiler::Lexer::Token';
461
462	195		195	0	593	sub type { 'LineBreak' }
463	339		339	0	743	sub match { [ qr/\G\n/ ] }
464
465	18				2010	1;
466						}
467
468						{
469	18				63	package Markdown::Compiler::Lexer::Token::Space;
470	18		18		6252	use Moo;
	18				33
	18				75
471	18				112	extends 'Markdown::Compiler::Lexer::Token';
472
473	1175		1175	0	2224	sub type { 'Space' }
474	301		301	0	674	sub match { [ qr/\G\s+/ ] }
475
476						has length => (
477						is => 'ro',
478						lazy => 1,
479	1		1		32	builder => sub { length(shift->content) },
480	18				2200	);
481
482	18				21925	1;
483						}
484
485						{
486	18				53	package Markdown::Compiler::Lexer::Token::Word;
487	18		18		6661	use Moo;
	18				44
	18				104
488	18				87	extends 'Markdown::Compiler::Lexer::Token';
489
490						# We'll match whole words to avoid making too many objects; otherwise
491						# "Hello World" would become 11 objects.
492	1700		1700	0	3297	sub type { 'Word' }
493	196		196	0	661	sub match { [ qr\|\G[a-zA-Z]+\|, qr\|\G\d+\.\d+\|, qr\|\G\d+\| ] }
494
495	18				2350	1;
496						}
497
498						{
499	18				45	package Markdown::Compiler::Lexer::Token::Char;
	18				45
500	18		18		6880	use Moo;
	18				50
	18				102
501	18				78	extends 'Markdown::Compiler::Lexer::Token';
502
503	490		490	0	1966	sub type { 'Char' }
504	55		55	0	142	sub match { [ qr/\G./s ] }
505
506	18				2507	1;
507						}
508						}
509	18		18		141	use Moo;
	18				36
	18				73
510	18		18		4748	use v5.10;
	18				59
511
512						has source => (
513						is => 'ro',
514						required => 1,
515
516						);
517
518						has tokens => (
519						is => 'ro',
520						builder => '_build_tokens',
521						init_arg => undef,
522
523						);
524
525						has token_table => (
526						is => 'ro',
527						lazy => 1,
528						builder => sub {
529	0		0		0	my ( $self ) = @_;
530
531	0				0	my $str;
532
533	0				0	foreach my $token ( @{$self->tokens} ) {
	0				0
534	0				0	( my $content = $token->content ) =~ s/\n//g;
535	0				0	$str .= sprintf( "%20s \| %s\n", $content, $token->type );
536						}
537
538	0				0	return $str;
539						}
540						);
541
542						has hooks => (
543						is => 'ro',
544						default => sub { [] },
545						);
546
547						has lexer_tokens => (
548						is => 'ro',
549						default => sub {
550						return [qw(
551						Markdown::Compiler::Lexer::Token::EscapedChar
552						Markdown::Compiler::Lexer::Token::CodeBlock
553						Markdown::Compiler::Lexer::Token::HR
554						Markdown::Compiler::Lexer::Token::Image
555						Markdown::Compiler::Lexer::Token::Link
556						Markdown::Compiler::Lexer::Token::Item
557						Markdown::Compiler::Lexer::Token::TableStart
558						Markdown::Compiler::Lexer::Token::TableHeaderSep
559						Markdown::Compiler::Lexer::Token::InlineCode
560						Markdown::Compiler::Lexer::Token::BlockQuote
561						Markdown::Compiler::Lexer::Token::Header
562						Markdown::Compiler::Lexer::Token::BoldItalicMaker
563						Markdown::Compiler::Lexer::Token::LineBreak
564						Markdown::Compiler::Lexer::Token::Space
565						Markdown::Compiler::Lexer::Token::Word
566						Markdown::Compiler::Lexer::Token::Char
567						)];
568						# Removed from between Space and Char; might have been
569						# more trouble than it's worth.
570						# Markdown::Compiler::Lexer::Token::Word
571						}
572						);
573
574						sub _build_tokens {
575	65		65		322	my ( $self ) = @_;
576
577	65				191	my $str = $self->source;
578
579	65				209	pos($str) = 0;
580	65				133	my @tokens;
581
582	65				224	PARSE: while ( length($str) != pos($str) ) {
583	424				673	my $start_pos = pos($str);
584
585	424				523	TOKEN: foreach my $token_class ( @{$self->lexer_tokens} ) {
	424				1089
586	5561				12406	my $matches = $token_class->match;
587
588	5561				58247	foreach my $match ( @{$matches} ) {
	5561				7389
589	11368	100			39857	if ( $str =~ m\|$match\|gc ) {
590	424				7957	push @tokens, $token_class->new(
591						source => \$self->source,
592						start => $start_pos,
593						end => pos($str),
594						)->tokens;
595	424				7522	next PARSE;
596						}
597						}
598						}
599						# We were not able to match the content, so we're blowing up now.
600	0				0	die "Error at offset $start_pos of document: next 10 chars" . substr($self->source, $start_pos, 10 );
601						}
602
603	65				1453	return [ @tokens ];
604						}
605
606						1;