File Coverage

lib/MKDoc/XML/Tokenizer.pm

Criterion	Covered	Total	%
statement	31	31	100.0
branch	6	6	100.0
condition	4	6	66.6
subroutine	7	7	100.0
pod	2	2	100.0
total	50	52	96.1

line	stmt	bran	cond	sub	pod	time	code
1							# -------------------------------------------------------------------------------------
2							# MKDoc::XML::Tokenizer
3							# -------------------------------------------------------------------------------------
4							# Author : Jean-Michel Hiver.
5							# Copyright : (c) MKDoc Holdings Ltd, 2003
6							#
7							# This module turns an XML string into a list of tokens and returns this list.
8							# It is using Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions"
9							#
10							# This module is distributed under the same license as Perl itself.
11							# -------------------------------------------------------------------------------------
12							package MKDoc::XML::Tokenizer;
13	20			20		173587	use MKDoc::XML::Token;
	20					55
	20					1203
14	20			20		161	use strict;
	20					33
	20					708
15	20			20		102	use warnings;
	20					38
	20					23595
16
17							our $prev_token;
18
19							# REX/Perl 1.0
20							# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
21							# Technical Report TR 1998-17, School of Computing Science, Simon Fraser
22							# University, November, 1998.
23							# Copyright (c) 1998, Robert D. Cameron.
24							# The following code may be freely used and distributed provided that
25							# this copyright and citation notice remains intact and that modifications
26							# or additions are clearly identified.
27							#
28							# Additions:
29							# ----------
30							# added 'my' and 'our' keywords in front of variables
31							# I like strict mode :)
32							my $TextSE = "[^<]+";
33							my $UntilHyphen = "[^-]*-";
34							my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
35							my $CommentCE = "$Until2Hyphens>?";
36							my $UntilRSBs = "[^\\]]](?:[^\\]]+])]+";
37							my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
38							my $S = "[ \\n\\t\\r]+";
39							my $NameStrt = "[A-Za-z_:]\|[^\\x00-\\x7F]";
40							my $NameChar = "[A-Za-z0-9_:.-]\|[^\\x00-\\x7F]";
41							my $Name = "(?:$NameStrt)(?:$NameChar)*";
42							my $QuoteSE = "\"[^\"]\"\|'[^']'";
43							my $DT_IdentSE = "$S$Name(?:$S(?:$Name\|$QuoteSE))*";
44							my $MarkupDeclCE = "(?:[^\\]\"'><]+\|$QuoteSE)*>";
45							my $S1 = "[\\n\\r\\t ]";
46							my $UntilQMs = "[^?]*\\?+";
47							my $PI_Tail = "\\?>\|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
48							my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>\|[^-]$MarkupDeclCE)\|\\?$Name(?:$PI_Tail))\|%$Name;\|$S";
49							my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
50							my $DeclCE = "--(?:$CommentCE)?\|\\[CDATA\\[(?:$CDATA_CE)?\|DOCTYPE(?:$DocTypeCE)?";
51							my $PI_CE = "$Name(?:$PI_Tail)?";
52							my $EndTagCE = "$Name(?:$S)?>?";
53							my $AttValSE = "\"[^<\"]\"\|'[^<']'";
54							my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
55							my $MarkupSPE = "<(?:!(?:$DeclCE)?\|\\?(?:$PI_CE)?\|/(?:$EndTagCE)?\|(?:$ElemTagCE)?)";
56							our $XML_SPE = "$TextSE\|$MarkupSPE";
57
58
59							# Rather than have this:
60							# sub ShallowParse {
61							# my($XML_document) = @_;
62							# return $XML_document =~ /$XML_SPE/g;
63							# }
64							sub process_data
65							{
66	50			50	1	5356	my $class = shift;
67	50					105	my $xml = shift;
68
69							# remove trailing whitespace
70	50					429	$xml =~ s/^(?:\s\|\r\|\n)*\
71	50					1500	$xml =~ s/\>(?:\s\|\r\|\n)*$/\>/s;
72
73	50					133	local ($prev_token) = '';
74	985					1789	my @res = map {
75	50					11209	_check_001();
76	984					1523	_check_002();
77	979					1218	$prev_token = $_;
78	979					2887	bless \$_, 'MKDoc::XML::Token';
79							} $xml =~ /$XML_SPE/go;
80
81	44					345	return \@res;
82							}
83
84
85							#
86							sub _check_002
87							{
88	984	100		984		2527	$prev_token =~ /^
89	476	100				1536	$prev_token =~ />$/ or
90							die "cannot tokenize: $prev_token$_";
91							}
92
93
94							#
95							sub _check_001
96							{
97	985	100	100	985		2581	/^