File Coverage

lib/MKDoc/XML/Tokenizer.pm
Criterion Covered Total %
statement 31 31 100.0
branch 6 6 100.0
condition 4 6 66.6
subroutine 7 7 100.0
pod 2 2 100.0
total 50 52 96.1


line stmt bran cond sub pod time code
1             # -------------------------------------------------------------------------------------
2             # MKDoc::XML::Tokenizer
3             # -------------------------------------------------------------------------------------
4             # Author : Jean-Michel Hiver.
5             # Copyright : (c) MKDoc Holdings Ltd, 2003
6             #
7             # This module turns an XML string into a list of tokens and returns this list.
8             # It is using Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions"
9             #
10             # This module is distributed under the same license as Perl itself.
11             # -------------------------------------------------------------------------------------
12             package MKDoc::XML::Tokenizer;
13 20     20   173587 use MKDoc::XML::Token;
  20         55  
  20         1203  
14 20     20   161 use strict;
  20         33  
  20         708  
15 20     20   102 use warnings;
  20         38  
  20         23595  
16              
17             our $prev_token;
18              
19             # REX/Perl 1.0
20             # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
21             # Technical Report TR 1998-17, School of Computing Science, Simon Fraser
22             # University, November, 1998.
23             # Copyright (c) 1998, Robert D. Cameron.
24             # The following code may be freely used and distributed provided that
25             # this copyright and citation notice remains intact and that modifications
26             # or additions are clearly identified.
27             #
28             # Additions:
29             # ----------
30             # added 'my' and 'our' keywords in front of variables
31             # I like strict mode :)
32             my $TextSE = "[^<]+";
33             my $UntilHyphen = "[^-]*-";
34             my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
35             my $CommentCE = "$Until2Hyphens>?";
36             my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
37             my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
38             my $S = "[ \\n\\t\\r]+";
39             my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
40             my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
41             my $Name = "(?:$NameStrt)(?:$NameChar)*";
42             my $QuoteSE = "\"[^\"]*\"|'[^']*'";
43             my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
44             my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
45             my $S1 = "[\\n\\r\\t ]";
46             my $UntilQMs = "[^?]*\\?+";
47             my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
48             my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
49             my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
50             my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
51             my $PI_CE = "$Name(?:$PI_Tail)?";
52             my $EndTagCE = "$Name(?:$S)?>?";
53             my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
54             my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
55             my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
56             our $XML_SPE = "$TextSE|$MarkupSPE";
57              
58              
59             # Rather than have this:
60             # sub ShallowParse {
61             # my($XML_document) = @_;
62             # return $XML_document =~ /$XML_SPE/g;
63             # }
64             sub process_data
65             {
66 50     50 1 5356 my $class = shift;
67 50         105 my $xml = shift;
68            
69             # remove trailing whitespace
70 50         429 $xml =~ s/^(?:\s|\r|\n)*\
71 50         1500 $xml =~ s/\>(?:\s|\r|\n)*$/\>/s;
72              
73 50         133 local ($prev_token) = '';
74 985         1789 my @res = map {
75 50         11209 _check_001();
76 984         1523 _check_002();
77 979         1218 $prev_token = $_;
78 979         2887 bless \$_, 'MKDoc::XML::Token';
79             } $xml =~ /$XML_SPE/go;
80              
81 44         345 return \@res;
82             }
83              
84              
85             #

86             sub _check_002
87             {
88 984 100   984   2527 $prev_token =~ /^
89 476 100       1536 $prev_token =~ />$/ or
90             die "cannot tokenize: $prev_token$_";
91             }
92              
93              
94             #
95             sub _check_001
96             {
97 985 100 100 985   2581 /^