File Coverage

blib/lib/HTML/HTML5/Parser/Tokenizer.pm
Criterion Covered Total %
statement 1011 3734 27.0
branch 540 1870 28.8
condition 169 453 37.3
subroutine 8 8 100.0
pod n/a
total 1728 6065 28.4


line stmt bran cond sub pod time code
1             package HTML::HTML5::Parser::Tokenizer; # -*- Perl -*-
2             ## skip Test::Tabs
3 11     11   86 use strict;
  11         23  
  11         356  
4 11     11   71 use warnings;
  11         22  
  11         530  
5             our $VERSION='0.991';
6              
7             ## This module implements the tokenization phase of both HTML5 and
8             ## XML5. Notes like this are usually based on the latest HTML
9             ## specification. Since XML is different from HTML, and since the XML5
10             ## specification is no longer maintained, there are a few
11             ## differences from HTML's tokenization. Such differences are marked
12             ## by the prefix "XML5:".
13              
14             ## Warnings that depend on the HTML/XML input stream, such as ones
15             ## related to surrogate code positions, are not useful.
16 11     11   79 no warnings 'utf8';
  11         25  
  11         1441  
17              
18             ## ------ Token types ------
19              
20             BEGIN {
21 11     11   101 require Exporter;
22 11         112 push our @ISA, 'Exporter';
23              
24 11         48 our @EXPORT_OK = qw(
25             DOCTYPE_TOKEN
26             COMMENT_TOKEN
27             START_TAG_TOKEN
28             END_TAG_TOKEN
29             END_OF_FILE_TOKEN
30             CHARACTER_TOKEN
31             PI_TOKEN
32             ABORT_TOKEN
33             END_OF_DOCTYPE_TOKEN
34             ATTLIST_TOKEN
35             ELEMENT_TOKEN
36             GENERAL_ENTITY_TOKEN
37             PARAMETER_ENTITY_TOKEN
38             NOTATION_TOKEN
39             );
40            
41 11         2662 our %EXPORT_TAGS = (
42             token => [qw(
43             DOCTYPE_TOKEN
44             COMMENT_TOKEN
45             START_TAG_TOKEN
46             END_TAG_TOKEN
47             END_OF_FILE_TOKEN
48             CHARACTER_TOKEN
49             PI_TOKEN
50             ABORT_TOKEN
51             END_OF_DOCTYPE_TOKEN
52             ATTLIST_TOKEN
53             ELEMENT_TOKEN
54             GENERAL_ENTITY_TOKEN
55             PARAMETER_ENTITY_TOKEN
56             NOTATION_TOKEN
57             )],
58             );
59             }
60              
61             sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
62             sub COMMENT_TOKEN () { 2 }
63             sub START_TAG_TOKEN () { 3 }
64             sub END_TAG_TOKEN () { 4 }
65             sub END_OF_FILE_TOKEN () { 5 }
66             sub CHARACTER_TOKEN () { 6 }
67             sub PI_TOKEN () { 7 } ## NOTE: XML only.
68             sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
69             sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
70             sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
71             sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
72             sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
73             sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
74             sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
75              
76             ## XML5: XML5 has "empty tag token". In this implementation, it is
77             ## represented as a start tag token with $self->{self_closing} flag
78             ## set to true.
79              
80             ## XML5: XML5 has a "short end tag token". In this implementation, it
81             ## is represented as an end tag token with $token->{tag_name} set
82             ## to an empty string.
83              
84             package HTML::HTML5::Parser::TagSoupParser;
85              
86 11     11   2654 BEGIN { HTML::HTML5::Parser::Tokenizer->import (':token') }
87              
88 11     11   8997 use HTML::HTML5::Entities qw[%entity2char];
  11         198008  
  11         624521  
89              
90             ## ------ Tokenizer states ------
91              
92             sub DATA_STATE () { 0 }
93             sub RCDATA_STATE () { 107 }
94             sub RAWTEXT_STATE () { 108 }
95             sub SCRIPT_DATA_STATE () { 109 }
96             sub PLAINTEXT_STATE () { 110 }
97             sub TAG_OPEN_STATE () { 2 }
98             sub RCDATA_LT_STATE () { 111 }
99             sub RAWTEXT_LT_STATE () { 112 }
100             sub SCRIPT_DATA_LT_STATE () { 113 }
101             sub CLOSE_TAG_OPEN_STATE () { 3 }
102             sub RCDATA_END_TAG_OPEN_STATE () { 114 }
103             sub RAWTEXT_END_TAG_OPEN_STATE () { 115 }
104             sub SCRIPT_DATA_END_TAG_OPEN_STATE () { 116 }
105             sub SCRIPT_DATA_ESCAPE_START_STATE () { 1 }
106             sub SCRIPT_DATA_ESCAPE_START_DASH_STATE () { 12 }
107             sub SCRIPT_DATA_ESCAPED_STATE () { 117 }
108             sub SCRIPT_DATA_ESCAPED_DASH_STATE () { 118 }
109             sub SCRIPT_DATA_ESCAPED_DASH_DASH_STATE () { 119 }
110             sub SCRIPT_DATA_ESCAPED_LT_STATE () { 120 }
111             sub SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE () { 121 }
112             sub SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE () { 122 }
113             sub SCRIPT_DATA_DOUBLE_ESCAPED_STATE () { 123 }
114             sub SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE () { 124 }
115             sub SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE () { 125 }
116             sub SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE () { 126 }
117             sub SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE () { 127 }
118             sub TAG_NAME_STATE () { 4 }
119             sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
120             sub ATTRIBUTE_NAME_STATE () { 6 }
121             sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
122             sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
123             sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
124             sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
125             sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
126             sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
127             sub COMMENT_START_STATE () { 14 }
128             sub COMMENT_START_DASH_STATE () { 15 }
129             sub COMMENT_STATE () { 16 }
130             sub COMMENT_END_STATE () { 17 }
131             sub COMMENT_END_BANG_STATE () { 102 }
132             #sub COMMENT_END_SPACE_STATE () { 103 } ## REMOVED
133             sub COMMENT_END_DASH_STATE () { 18 }
134             sub BOGUS_COMMENT_STATE () { 19 }
135             sub DOCTYPE_STATE () { 20 }
136             sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
137             sub DOCTYPE_NAME_STATE () { 22 }
138             sub AFTER_DOCTYPE_NAME_STATE () { 23 }
139             sub AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE () { 104 }
140             sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
141             sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
142             sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
143             sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
144             sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
145             sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
146             sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
147             sub BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE () { 105 }
148             sub AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE () { 106 }
149             sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
150             sub BOGUS_DOCTYPE_STATE () { 32 }
151             sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
152             sub SELF_CLOSING_START_TAG_STATE () { 34 }
153             sub CDATA_SECTION_STATE () { 35 }
154             sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
155             sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
156             sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
157             #sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
158             sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
159             sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
160             sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
161             sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
162             ##
163             ## NOTE: "Entity data state", "entity in attribute value state", and
164             ## the "consume a character reference" algorithm, are jointly
165             ## implemented as the following six states:
166             sub ENTITY_STATE () { 44 }
167             sub ENTITY_HASH_STATE () { 45 }
168             sub NCR_NUM_STATE () { 46 }
169             sub HEXREF_X_STATE () { 47 }
170             sub HEXREF_HEX_STATE () { 48 }
171             sub ENTITY_NAME_STATE () { 49 }
172             ##
173             ## XML-only states
174             sub DATA_MSE1_STATE () { 50 }
175             sub DATA_MSE2_STATE () { 128 } # last
176             sub PI_STATE () { 51 }
177             sub PI_TARGET_STATE () { 52 }
178             sub PI_TARGET_AFTER_STATE () { 53 }
179             sub PI_DATA_STATE () { 54 }
180             sub PI_AFTER_STATE () { 55 }
181             sub PI_DATA_AFTER_STATE () { 56 }
182             sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
183             sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
184             sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
185             sub DOCTYPE_TAG_STATE () { 60 }
186             sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
187             sub MD_ATTLIST_STATE () { 62 }
188             sub MD_E_STATE () { 63 }
189             sub MD_ELEMENT_STATE () { 64 }
190             sub MD_ENTITY_STATE () { 65 }
191             sub MD_NOTATION_STATE () { 66 }
192             sub DOCTYPE_MD_STATE () { 67 }
193             sub BEFORE_MD_NAME_STATE () { 68 }
194             sub MD_NAME_STATE () { 69 }
195             sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
196             sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
197             sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
198             sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
199             sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
200             sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
201             sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
202             sub ALLOWED_TOKEN_STATE () { 77 }
203             sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
204             sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
205             sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
206             sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
207             sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
208             sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
209             sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
210             sub BEFORE_NDATA_STATE () { 85 }
211             sub NDATA_STATE () { 86 }
212             sub AFTER_NDATA_STATE () { 87 }
213             sub BEFORE_NOTATION_NAME_STATE () { 88 }
214             sub NOTATION_NAME_STATE () { 89 }
215             sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
216             sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
217             sub ENTITY_VALUE_ENTITY_STATE () { 92 }
218             sub AFTER_ELEMENT_NAME_STATE () { 93 }
219             sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
220             sub CONTENT_KEYWORD_STATE () { 95 }
221             sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
222             sub CM_ELEMENT_NAME_STATE () { 97 }
223             sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
224             sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
225             sub AFTER_MD_DEF_STATE () { 100 }
226             sub BOGUS_MD_STATE () { 101 }
227              
228             ## ------ Tree constructor state constants ------
229              
230             ## Whether the parsed string is in the foreign island or not affects
231             ## how tokenization is done, unfortunately. These are a copy of some
232             ## of tokenization state constants. See Whatpm::HTML for the full
233             ## list and the descriptions for constants.
234              
235             sub FOREIGN_EL () { 0b1_00000000000 }
236              
237             ## ------ Character reference mappings ------
238              
239             my $charref_map = {
240             0x00 => 0xFFFD, # REPLACEMENT CHARACTER
241             0x0D => 0x000D, # CARRIAGE RETURN
242             0x80 => 0x20AC,
243             0x81 => 0x0081,
244             0x82 => 0x201A,
245             0x83 => 0x0192,
246             0x84 => 0x201E,
247             0x85 => 0x2026,
248             0x86 => 0x2020,
249             0x87 => 0x2021,
250             0x88 => 0x02C6,
251             0x89 => 0x2030,
252             0x8A => 0x0160,
253             0x8B => 0x2039,
254             0x8C => 0x0152,
255             0x8D => 0x008D,
256             0x8E => 0x017D,
257             0x8F => 0x008F,
258             0x90 => 0x0090,
259             0x91 => 0x2018,
260             0x92 => 0x2019,
261             0x93 => 0x201C,
262             0x94 => 0x201D,
263             0x95 => 0x2022,
264             0x96 => 0x2013,
265             0x97 => 0x2014,
266             0x98 => 0x02DC,
267             0x99 => 0x2122,
268             0x9A => 0x0161,
269             0x9B => 0x203A,
270             0x9C => 0x0153,
271             0x9D => 0x009D,
272             0x9E => 0x017E,
273             0x9F => 0x0178,
274             }; # $charref_map
275             $charref_map->{$_} = 0xFFFD # REPLACEMENT CHARACTER
276             for 0xD800..0xDFFF;
277             $charref_map->{$_} = $_
278             for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
279             0xFDD0..0xFDEF,
280             0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
281             0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
282             0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
283             0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
284             0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
285              
286             ## ------ Special character-like constants ------
287              
288             ## The "EOF" pseudo-character in the HTML parsing algorithm.
289             sub EOF_CHAR () { -1 }
290              
291             ## A pseudo-character code that can never appear in the input stream.
292             sub NEVER_CHAR () { -2 }
293              
294             ## ------ The tokenizer ------
295              
296             ## Implementations MUST act as if they used the state machine in the spec.
297              
298             sub _initialize_tokenizer ($) {
299 711     711   1127 my $self = shift;
300              
301             ## NOTE: Fields set by |new| constructor:
302             #$self->{level}
303             #$self->{set_nc}
304             #$self->{parse_error}
305             #$self->{is_xml} (if XML)
306              
307 711         1459 $self->{state} = DATA_STATE; # MUST
308             #$self->{kwd} = ''; # State-dependent keyword; initialized when used
309             #$self->{entity__value}; # initialized when used
310             #$self->{entity__match}; # initialized when used
311 711         2664 undef $self->{ct}; # current token
312 711         1440 undef $self->{ca}; # current attribute
313 711         1451 undef $self->{last_stag_name}; # last emitted start tag name
314             #$self->{prev_state}; # initialized when used
315 711         1329 delete $self->{self_closing};
316 711         1419 $self->{char_buffer} = '';
317 711         1153 $self->{char_buffer_pos} = 0;
318 711         1170 $self->{nc} = -1; # next input character
319             #$self->{next_nc}
320            
321 711 50       1640 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322 0         0 $self->{line_prev} = $self->{line};
323 0         0 $self->{column_prev} = $self->{column};
324 0         0 $self->{column}++;
325             $self->{nc}
326 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327             } else {
328 711         1777 $self->{set_nc}->($self);
329             }
330            
331 711         2235 $self->{token} = [];
332             # $self->{escape}
333             } # _initialize_tokenizer
334              
335             ## A token has:
336             ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
337             ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
338             ## ->{name} (DOCTYPE_TOKEN)
339             ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
340             ## ->{target} (PI_TOKEN)
341             ## ->{pubid} (DOCTYPE_TOKEN)
342             ## ->{sysid} (DOCTYPE_TOKEN)
343             ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
344             ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
345             ## ->{name}
346             ## ->{value}
347             ## ->{has_reference} == 1 or 0
348             ## ->{index}: Index of the attribute in a tag.
349             ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
350             ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
351             ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
352             ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
353              
354             ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
355             ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
356             ## while the token is pushed back to the stack.
357              
358             ## Emitted token MUST immediately be handled by the tree construction state.
359              
360             ## Before each step, UA MAY check to see if either one of the scripts in
361             ## "list of scripts that will execute as soon as possible" or the first
362             ## script in the "list of scripts that will execute asynchronously",
363             ## has completed loading. If one has, then it MUST be executed
364             ## and removed from the list.
365              
366             ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
367             ## (This requirement was dropped from HTML5 spec, unfortunately.)
368              
369             my $is_space = {
370             0x0009 => 1, # CHARACTER TABULATION (HT)
371             0x000A => 1, # LINE FEED (LF)
372             #0x000B => 0, # LINE TABULATION (VT)
373             0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
374             0x000D => 1, # CARRIAGE RETURN (CR)
375             0x0020 => 1, # SPACE (SP)
376             };
377              
378             sub KEY_ELSE_CHAR () { 255 }
379             sub KEY_ULATIN_CHAR () { 254 }
380             sub KEY_LLATIN_CHAR () { 253 }
381             sub KEY_EOF_CHAR () { 252 }
382             sub KEY_SPACE_CHAR () { 251 }
383              
384             my $Action;
385             my $XMLAction;
386             $Action->[DATA_STATE]->[0x0026] = {
387             name => 'data &',
388             state => ENTITY_STATE, # "entity data state" + "consume a character reference"
389             state_set => {entity_add => -1, prev_state => DATA_STATE},
390             };
391             $Action->[DATA_STATE]->[0x003C] = {
392             name => 'data <',
393             state => TAG_OPEN_STATE,
394             };
395             $Action->[DATA_STATE]->[KEY_EOF_CHAR] = {
396             name => 'data eof',
397             emit => END_OF_FILE_TOKEN,
398             reconsume => 1,
399             };
400             $Action->[DATA_STATE]->[0x0000] = {
401             name => 'data null',
402             emit => CHARACTER_TOKEN,
403             error => 'NULL',
404             };
405             $Action->[DATA_STATE]->[KEY_ELSE_CHAR] = {
406             name => 'data else',
407             emit => CHARACTER_TOKEN,
408             emit_data_read_until => qq{\x00<&},
409             };
410             $XMLAction->[DATA_STATE]->[0x005D] = { # ]
411             name => 'data ]',
412             state => DATA_MSE1_STATE,
413             emit => CHARACTER_TOKEN,
414             };
415             $XMLAction->[DATA_STATE]->[KEY_ELSE_CHAR] = {
416             name => 'data else xml',
417             emit => CHARACTER_TOKEN,
418             emit_data_read_until => qq{\x00<&\]},
419             };
420             $Action->[RCDATA_STATE]->[0x0026] = {
421             name => 'rcdata &',
422             state => ENTITY_STATE, # "entity data state" + "consume a character reference"
423             state_set => {entity_add => -1, prev_state => RCDATA_STATE},
424             };
425             $Action->[RCDATA_STATE]->[0x003C] = {
426             name => 'rcdata <',
427             state => RCDATA_LT_STATE,
428             };
429             $Action->[RCDATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
430             $Action->[RCDATA_STATE]->[0x0000] = {
431             name => 'rcdata null',
432             emit => CHARACTER_TOKEN,
433             emit_data => "\x{FFFD}",
434             error => 'NULL',
435             };
436             $Action->[RCDATA_STATE]->[KEY_ELSE_CHAR] = {
437             name => 'rcdata else',
438             emit => CHARACTER_TOKEN,
439             emit_data_read_until => qq{\x00<&},
440             };
441             $Action->[RAWTEXT_STATE]->[0x003C] = {
442             name => 'rawtext <',
443             state => RAWTEXT_LT_STATE,
444             };
445             $Action->[RAWTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
446             $Action->[RAWTEXT_STATE]->[0x0000] = $Action->[RCDATA_STATE]->[0x0000];
447             $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR] = {
448             name => 'rawtext else',
449             emit => CHARACTER_TOKEN,
450             emit_data_read_until => qq{\x00<},
451             };
452             $Action->[SCRIPT_DATA_STATE]->[0x003C] = {
453             name => 'script data <',
454             state => SCRIPT_DATA_LT_STATE,
455             };
456             $Action->[SCRIPT_DATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
457             $Action->[SCRIPT_DATA_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
458             $Action->[SCRIPT_DATA_STATE]->[KEY_ELSE_CHAR] = $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR];
459             $Action->[PLAINTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
460             $Action->[PLAINTEXT_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
461             $Action->[PLAINTEXT_STATE]->[KEY_ELSE_CHAR] = {
462             name => 'plaintext else',
463             emit => CHARACTER_TOKEN,
464             emit_data_read_until => qq{\x00},
465             };
466             # "Tag open state" is known as "tag state" in XML5.
467             $Action->[TAG_OPEN_STATE]->[0x0021] = {
468             name => 'tag open !',
469             state => MARKUP_DECLARATION_OPEN_STATE,
470             };
471             $Action->[TAG_OPEN_STATE]->[0x002F] = {
472             name => 'tag open /',
473             state => CLOSE_TAG_OPEN_STATE,
474             };
475             $Action->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
476             name => 'tag open uc',
477             ct => {
478             type => START_TAG_TOKEN,
479             delta => 1,
480             append_tag_name => 0x0020, # UC -> lc
481             },
482             state => TAG_NAME_STATE,
483             };
484             $XMLAction->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
485             name => 'tag open uc xml',
486             ct => {
487             type => START_TAG_TOKEN,
488             delta => 1,
489             append_tag_name => 0x0000,
490             },
491             state => TAG_NAME_STATE,
492             };
493             $Action->[TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
494             name => 'tag open lc',
495             ct => {
496             type => START_TAG_TOKEN,
497             delta => 1,
498             append_tag_name => 0x0000,
499             },
500             state => TAG_NAME_STATE,
501             };
502             $Action->[TAG_OPEN_STATE]->[0x003F] = {
503             name => 'tag open ?',
504             state => BOGUS_COMMENT_STATE,
505             error => 'pio',
506             error_delta => 1,
507             ct => {
508             type => COMMENT_TOKEN,
509             },
510             reconsume => 1, ## $self->{nc} is intentionally left as is
511             };
512             $XMLAction->[TAG_OPEN_STATE]->[0x003F] = { # ?
513             name => 'tag open ? xml',
514             state => PI_STATE,
515             };
516             $Action->[TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
517             $Action->[TAG_OPEN_STATE]->[0x003E] = { # >
518             name => 'tag open else',
519             error => 'bare stago',
520             error_delta => 1,
521             state => DATA_STATE,
522             reconsume => 1,
523             emit => CHARACTER_TOKEN,
524             emit_data => '<',
525             emit_delta => 1,
526             };
527             $Action->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = $Action->[TAG_OPEN_STATE]->[0x003E];
528             $XMLAction->[TAG_OPEN_STATE]->[0x0000] = {
529             name => 'tag open null xml',
530             ct => {
531             type => START_TAG_TOKEN,
532             delta => 1,
533             append_tag_name => 0xFFFD,
534             },
535             error => 'NULL',
536             state => TAG_NAME_STATE,
537             };
538             ## XML5: "<:" has a parse error.
539             $XMLAction->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
540             name => 'tag open else xml',
541             ct => {
542             type => START_TAG_TOKEN,
543             delta => 1,
544             append_tag_name => 0x0000,
545             },
546             state => TAG_NAME_STATE,
547             };
548             $Action->[RCDATA_LT_STATE]->[0x002F] = {
549             name => 'rcdata lt /',
550             state => RCDATA_END_TAG_OPEN_STATE,
551             buffer => {clear => 1},
552             };
553             $Action->[RAWTEXT_LT_STATE]->[0x002F] = {
554             name => 'rawtext lt /',
555             state => RAWTEXT_END_TAG_OPEN_STATE,
556             buffer => {clear => 1},
557             };
558             $Action->[SCRIPT_DATA_LT_STATE]->[0x002F] = {
559             name => 'script data lt /',
560             state => SCRIPT_DATA_END_TAG_OPEN_STATE,
561             buffer => {clear => 1},
562             };
563             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[0x002F] = {
564             name => 'script data escaped lt /',
565             state => SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
566             buffer => {clear => 1},
567             };
568             $Action->[SCRIPT_DATA_LT_STATE]->[0x0021] = {
569             name => 'script data lt !',
570             state => SCRIPT_DATA_ESCAPE_START_STATE,
571             emit => CHARACTER_TOKEN,
572             emit_data => '<!',
573             };
574             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ULATIN_CHAR] = {
575             name => 'script data escaped lt uc',
576             emit => CHARACTER_TOKEN,
577             emit_data => '<',
578             emit_data_append => 1,
579             buffer => {clear => 1, append => 0x0020}, # UC -> lc
580             state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
581             };
582             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_LLATIN_CHAR] = {
583             name => 'script data escaped lt lc',
584             emit => CHARACTER_TOKEN,
585             emit_data => '<',
586             emit_data_append => 1,
587             buffer => {clear => 1, append => 0x0000},
588             state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
589             };
590             $Action->[RCDATA_LT_STATE]->[KEY_ELSE_CHAR] = {
591             name => 'rcdata lt else',
592             state => RCDATA_STATE,
593             reconsume => 1,
594             emit => CHARACTER_TOKEN,
595             emit_data => '<',
596             };
597             $Action->[RAWTEXT_LT_STATE]->[KEY_ELSE_CHAR] = {
598             name => 'rawtext lt else',
599             state => RAWTEXT_STATE,
600             reconsume => 1,
601             emit => CHARACTER_TOKEN,
602             emit_data => '<',
603             };
604             $Action->[SCRIPT_DATA_LT_STATE]->[KEY_ELSE_CHAR] = {
605             name => 'script data lt else',
606             state => SCRIPT_DATA_STATE,
607             reconsume => 1,
608             emit => CHARACTER_TOKEN,
609             emit_data => '<',
610             };
611             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
612             name => 'script data escaped lt else',
613             state => SCRIPT_DATA_ESCAPED_STATE,
614             reconsume => 1,
615             emit => CHARACTER_TOKEN,
616             emit_data => '<',
617             };
618             ## XXX "End tag token" in latest HTML5 and in XML5.
619             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
620             name => 'end tag open uc',
621             ct => {
622             type => END_TAG_TOKEN,
623             delta => 2,
624             append_tag_name => 0x0020, # UC -> lc
625             },
626             state => TAG_NAME_STATE,
627             };
628             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
629             name => 'end tag open uc xml',
630             ct => {
631             type => END_TAG_TOKEN,
632             delta => 2,
633             append_tag_name => 0x0000,
634             },
635             state => TAG_NAME_STATE,
636             };
637             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
638             name => 'end tag open lc',
639             ct => {
640             type => END_TAG_TOKEN,
641             delta => 2,
642             append_tag_name => 0x0000,
643             },
644             state => TAG_NAME_STATE,
645             };
646             $Action->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
647             name => 'end tag open >',
648             error => 'empty end tag',
649             error_delta => 2, # "<" in "</>"
650             state => DATA_STATE,
651             };
652             ## XML5: No parse error.
653            
654             ## NOTE: This parser raises a parse error, since it supports XML1,
655             ## not XML5.
656            
657             ## NOTE: A short end tag token.
658              
659             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
660             name => 'end tag open > xml',
661             error => 'empty end tag',
662             error_delta => 2, # "<" in "</>"
663             state => DATA_STATE,
664             ct => {
665             type => END_TAG_TOKEN,
666             delta => 2,
667             },
668             emit => '',
669             };
670             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_EOF_CHAR] = {
671             name => 'end tag open eof',
672             error => 'bare etago',
673             state => DATA_STATE,
674             reconsume => 1,
675             emit => CHARACTER_TOKEN,
676             emit_data => '</',
677             emit_delta => 2,
678             };
679             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
680             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
681             name => 'end tag open else',
682             error => 'bogus end tag',
683             error_delta => 2, # "<" of "</"
684             state => BOGUS_COMMENT_STATE,
685             ct => {
686             type => COMMENT_TOKEN,
687             delta => 2, # "<" of "
688             },
689             reconsume => 1,
690             ## NOTE: $self->{nc} is intentionally left as is. Although the
691             ## "anything else" case of the spec not explicitly states that the
692             ## next input character is to be reconsumed, it will be included to
693             ## the |data| of the comment token generated from the bogus end tag,
694             ## as defined in the "bogus comment state" entry.
695             };
696             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x0000] = {
697             name => 'end tag open null xml',
698             ct => {
699             type => END_TAG_TOKEN,
700             delta => 2,
701             append_tag_name => 0xFFFD,
702             },
703             error => 'NULL',
704             state => TAG_NAME_STATE, ## XML5: "end tag name state".
705             };
706             ## XML5: "
707             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
708             name => 'end tag open else xml',
709             ct => {
710             type => END_TAG_TOKEN,
711             delta => 2,
712             append_tag_name => 0x0000,
713             },
714             state => TAG_NAME_STATE, ## XML5: "end tag name state".
715             };
716             ## This switch-case implements "tag name state", "RCDATA end tag
717             ## name state", "RAWTEXT end tag name state", and "script data
718             ## end tag name state" jointly with the implementation of
719             ## "RCDATA end tag open state" and so on.
720             $Action->[TAG_NAME_STATE]->[KEY_SPACE_CHAR] = { # whitespace ends the tag name
721             name => 'tag name sp',
722             state => BEFORE_ATTRIBUTE_NAME_STATE,
723             };
724             $Action->[TAG_NAME_STATE]->[0x003E] = { # ">"
725             name => 'tag name >',
726             state => DATA_STATE,
727             emit => '', # NOTE(review): |emit| handling is outside this excerpt; presumably emits the current token -- confirm
728             };
729             $Action->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (HTML): folded to lowercase
730             name => 'tag name uc',
731             ct => {
732             append_tag_name => 0x0020, # UC -> lc
733             },
734             };
735             $XMLAction->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (XML): case is preserved
736             name => 'tag name uc xml',
737             ct => {
738             append_tag_name => 0x0000,
739             },
740             };
741             $Action->[TAG_NAME_STATE]->[KEY_EOF_CHAR] = {
742             name => 'tag name eof',
743             error => 'unclosed tag',
744             state => DATA_STATE,
745             reconsume => 1,
746             };
747             $Action->[TAG_NAME_STATE]->[0x002F] = { # "/"
748             name => 'tag name /',
749             state => SELF_CLOSING_START_TAG_STATE,
750             };
751             $Action->[TAG_NAME_STATE]->[0x0000] = { # U+0000
752             name => 'tag name null',
753             ct => {
754             append_tag_name => 0xFFFD, # nc is 0 here, so chr (0 + 0xFFFD) appends U+FFFD
755             },
756             error => 'NULL',
757             };
758             $Action->[TAG_NAME_STATE]->[KEY_ELSE_CHAR] = { # fallback: append the character unchanged, stay in this state
759             name => 'tag name else',
760             ct => {
761             append_tag_name => 0x0000,
762             },
763             };
764             $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[0x002D] = { # "-" entries; NOTE(review): emit/emit_data handling is outside this excerpt
765             name => 'script data escape start -',
766             state => SCRIPT_DATA_ESCAPE_START_DASH_STATE,
767             emit => CHARACTER_TOKEN,
768             emit_data => '-',
769             };
770             $Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[0x002D] = {
771             name => 'script data escape start dash -',
772             state => SCRIPT_DATA_ESCAPED_STATE,
773             emit => CHARACTER_TOKEN,
774             emit_data => '-',
775             };
776             $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = { # anything else: back to script data, character is reconsumed
777             name => 'script data escape start else',
778             state => SCRIPT_DATA_STATE,
779             reconsume => 1,
780             };
781             $Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[KEY_ELSE_CHAR] = $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR]; # same hashref, so same name
782             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x002D] = {
783             name => 'script data escaped -',
784             state => SCRIPT_DATA_ESCAPED_DASH_STATE,
785             emit => CHARACTER_TOKEN,
786             emit_data => '-',
787             };
788             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x002D] = {
789             name => 'script data escaped dash -',
790             state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
791             emit => CHARACTER_TOKEN,
792             emit_data => '-',
793             };
794             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x002D] = { # no |state| key: the state is left unchanged
795             name => 'script data escaped dash dash -',
796             emit => CHARACTER_TOKEN,
797             emit_data => '-',
798             };
799             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x002D] = {
800             name => 'script data double escaped -',
801             state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
802             emit => CHARACTER_TOKEN,
803             emit_data => '-',
804             };
805             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x002D] = { # second "-" while double escaped
806             name => 'script data double escaped dash -', # label now distinct from the DOUBLE_ESCAPED_STATE "-" entry
807             state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
808             emit => CHARACTER_TOKEN,
809             emit_data => '-',
810             };
811             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x002D] = { # no |state| key: the state is left unchanged
812             name => 'script data double escaped dash dash -',
813             emit => CHARACTER_TOKEN,
814             emit_data => '-',
815             };
816             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x003C] = { # "<" in the escaped states: state change only, nothing emitted by these entries
817             name => 'script data escaped <',
818             state => SCRIPT_DATA_ESCAPED_LT_STATE,
819             };
820             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x003C] = {
821             name => 'script data escaped dash <',
822             state => SCRIPT_DATA_ESCAPED_LT_STATE,
823             };
824             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003C] = {
825             name => 'script data escaped dash dash <',
826             state => SCRIPT_DATA_ESCAPED_LT_STATE,
827             };
828             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x003C] = { # "<" in the double escaped states: these entries also carry emit_data '<'
829             name => 'script data double escaped <',
830             state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
831             emit => CHARACTER_TOKEN,
832             emit_data => '<',
833             };
834             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x003C] = {
835             name => 'script data double escaped dash <',
836             state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
837             emit => CHARACTER_TOKEN,
838             emit_data => '<',
839             };
840             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003C] = {
841             name => 'script data double escaped dash dash <',
842             state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
843             emit => CHARACTER_TOKEN,
844             emit_data => '<',
845             };
846             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E] = { # ">" after "--": back to plain script data
847             name => 'script data escaped dash dash >',
848             state => SCRIPT_DATA_STATE,
849             emit => CHARACTER_TOKEN,
850             emit_data => '>',
851             };
852             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003E] = $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E]; # same hashref
853             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_EOF_CHAR] = # all six escaped/double escaped states share one EOF action
854             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
855             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] =
856             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_EOF_CHAR] =
857             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
858             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] = {
859             name => 'script data escaped eof',
860             error => 'eof in escaped script data', # XXXdocumentation
861             state => DATA_STATE,
862             reconsume => 1,
863             };
864             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x0000] = # U+0000 in the (single) escaped states
865             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x0000] =
866             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
                name => 'script data escaped null',
                emit => CHARACTER_TOKEN,
                emit_data => "\x{FFFD}",
                error => 'NULL',
                state => SCRIPT_DATA_ESCAPED_STATE,
                };
                ## U+0000 in the double escaped states must not drop back to the
                ## single escaped state: the HTML tokenizer keeps (or returns to)
                ## the "script data double escaped state" here, so these three
                ## states get their own action instead of sharing the one above.
867             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x0000] =
868             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x0000] =
869             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
870             name => 'script data double escaped null',
871             emit => CHARACTER_TOKEN,
872             emit_data => "\x{FFFD}", # replacement character in place of the NULL
873             error => 'NULL',
874             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
875             };
876             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_ELSE_CHAR] = { # fallbacks: no emit_data key in these "else" entries
877             name => 'script data escaped else',
878             emit => CHARACTER_TOKEN,
879             state => SCRIPT_DATA_ESCAPED_STATE,
880             };
881             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_ELSE_CHAR] = {
882             name => 'script data escaped dash else',
883             emit => CHARACTER_TOKEN,
884             state => SCRIPT_DATA_ESCAPED_STATE,
885             };
886             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_ELSE_CHAR] = {
887             name => 'script data escaped dash dash else',
888             emit => CHARACTER_TOKEN,
889             state => SCRIPT_DATA_ESCAPED_STATE,
890             };
891             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_ELSE_CHAR] = {
892             name => 'script data double escaped else',
893             emit => CHARACTER_TOKEN,
894             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
895             };
896             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_ELSE_CHAR] = {
897             name => 'script data double escaped dash else',
898             emit => CHARACTER_TOKEN,
899             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
900             };
901             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_ELSE_CHAR] = {
902             name => 'script data double escaped dash dash else',
903             emit => CHARACTER_TOKEN,
904             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
905             };
906             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_SPACE_CHAR] = # space, ">" and "/" in both states share one entry
907             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_SPACE_CHAR] =
908             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[0x003E] =
909             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[0x003E] =
910             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[0x002F] =
911             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[0x002F] = {
912             name => 'script data double escape start sp>/',
913             skip => 1, # _get_next_token bypasses its table-driven branch when |skip| is set
914             };
915             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_ULATIN_CHAR] =
916             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_ULATIN_CHAR] = {
917             name => 'script data double escape start uc',
918             emit => CHARACTER_TOKEN,
919             buffer => {append => 0x0020}, # UC -> lc
920             };
921             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_LLATIN_CHAR] =
922             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_LLATIN_CHAR] = {
923             name => 'script data double escape start lc',
924             emit => CHARACTER_TOKEN,
925             buffer => {append => 0x0000}, # NOTE(review): |buffer| handling is outside this excerpt; presumably a code point offset like append_tag_name
926             };
927             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = { # start sequence abandoned: back to the escaped state
928             name => 'script data double escape start else',
929             state => SCRIPT_DATA_ESCAPED_STATE,
930             reconsume => 1,
931             };
932             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_ELSE_CHAR] = { # end sequence abandoned: back to the double escaped state
933             name => 'script data double escape end else',
934             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
935             reconsume => 1,
936             };
937             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE]->[0x002F] = { # "/" after "<" while double escaped
938             name => 'script data double escaped lt /',
939             buffer => {clear => 1},
940             state => SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE,
941             emit => CHARACTER_TOKEN,
942             emit_data => '/',
943             };
944             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
945             name => 'script data double escaped lt else',
946             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
947             reconsume => 1,
948             };
949             ## XML5: Part of the "data state".
950             $Action->[DATA_MSE1_STATE]->[0x005D] = { # second "]"
951             name => 'data mse1 ]',
952             state => DATA_MSE2_STATE,
953             emit => CHARACTER_TOKEN,
954             emit_data => ']',
955             };
956             $Action->[DATA_MSE1_STATE]->[KEY_ELSE_CHAR] = { # not a "]]" run: reprocess the character in the data state
957             name => 'data mse1 else',
958             state => DATA_STATE,
959             reconsume => 1,
960             };
961             $Action->[DATA_MSE2_STATE]->[0x003E] = { # ">" in DATA_MSE2_STATE: reported as 'unmatched mse'
962             name => 'data mse2 >',
963             error => 'unmatched mse', # XML5: Not a parse error. # XXXdocumentation
964             error_delta => 2, # error column = column_prev - 2 + 1
965             state => DATA_STATE,
966             emit => CHARACTER_TOKEN,
967             emit_data => '>',
968             };
969             $Action->[DATA_MSE2_STATE]->[0x005D] = { # further "]": no |state| key, so the state is unchanged
970             name => 'data mse2 ]',
971             emit => CHARACTER_TOKEN,
972             emit_data => ']',
973             };
974             $Action->[DATA_MSE2_STATE]->[KEY_ELSE_CHAR] = {
975             name => 'data mse2 else',
976             state => DATA_STATE,
977             reconsume => 1,
978             };
979             ## XML5: "Tag attribute name before state".
980             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = { # whitespace: only |name| is set, so nothing changes
981             name => 'before attr name sp',
982             };
983             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003E] = { # ">"
984             name => 'before attr name >',
985             emit => '',
986             state => DATA_STATE,
987             };
988             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (HTML)
989             name => 'before attr name uc',
990             ca => { # set_name replaces $self->{ca} with a new attribute named chr (nc + offset)
991             set_name => 0x0020, # UC -> lc
992             },
993             state => ATTRIBUTE_NAME_STATE,
994             };
995             $XMLAction->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (XML): case is preserved
996             name => 'before attr name uc xml',
997             ca => {
998             set_name => 0x0000,
999             },
1000             state => ATTRIBUTE_NAME_STATE,
1001             };
1002             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x002F] = { # "/"
1003             name => 'before attr name /',
1004             state => SELF_CLOSING_START_TAG_STATE,
1005             };
1006             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = { # NOTE(review): unlike the other 'unclosed tag' EOF entries, no reconsume => 1 -- confirm intent
1007             name => 'before attr name eof',
1008             error => 'unclosed tag',
1009             state => DATA_STATE,
1010             };
1011             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0022] = # '"', "'", "<" and "=" share one action hashref
1012             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0027] =
1013             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003C] =
1014             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003D] = {
1015             name => q[before attr name "'<=],
1016             error => 'bad attribute name', ## XML5: Not a parse error.
1017             ca => {set_name => 0x0000}, # the character still becomes the first character of the name
1018             state => ATTRIBUTE_NAME_STATE,
1019             };
1020             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0000] = { # U+0000
1021             name => 'before attr name null',
1022             ca => {set_name => 0xFFFD}, # nc is 0 here, so the name starts with U+FFFD
1023             error => 'NULL',
1024             state => ATTRIBUTE_NAME_STATE,
1025             };
1026             ## XML5: ":" raises a parse error and is ignored.
1027             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = { # fallback: start an attribute with this character unchanged
1028             name => 'before attr name else',
1029             ca => {set_name => 0x0000},
1030             state => ATTRIBUTE_NAME_STATE,
1031             };
1032              
1033             ## XML5: "Tag attribute name state".
1034             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = { # whitespace ends the attribute name
1035             name => 'attr name sp',
1036             ca => {leave => 1}, # leave: store $self->{ca} on the current token, or report 'duplicate attribute'
1037             state => AFTER_ATTRIBUTE_NAME_STATE,
1038             };
1039             $Action->[ATTRIBUTE_NAME_STATE]->[0x003D] = { # "="
1040             name => 'attr name =',
1041             ca => {leave => 1},
1042             state => BEFORE_ATTRIBUTE_VALUE_STATE,
1043             };
1044             $Action->[ATTRIBUTE_NAME_STATE]->[0x003E] = { # ">" (HTML): attribute without a value
1045             name => 'attr name >',
1046             ca => {leave => 1},
1047             emit => '',
1048             state => DATA_STATE,
1049             };
1050             $XMLAction->[ATTRIBUTE_NAME_STATE]->[0x003E] = { # ">" (XML): same, plus a 'no attr value' error
1051             name => 'attr name > xml',
1052             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1053             ca => {leave => 1},
1054             emit => '',
1055             state => DATA_STATE,
1056             };
1057             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (HTML)
1058             name => 'attr name uc',
1059             ca => {name => 0x0020}, # UC -> lc
1060             };
1061             $XMLAction->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (XML): appended to the attribute name unchanged
1062             name => 'attr name uc xml', # label now distinct from the HTML 'attr name uc' entry
1063             ca => {name => 0x0000},
1064             };
1065             $Action->[ATTRIBUTE_NAME_STATE]->[0x002F] = { # "/" (HTML)
1066             name => 'attr name /',
1067             ca => {leave => 1}, # store the attribute before leaving the name
1068             state => SELF_CLOSING_START_TAG_STATE,
1069             };
1070             $XMLAction->[ATTRIBUTE_NAME_STATE]->[0x002F] = { # "/" (XML): same, plus a 'no attr value' error
1071             name => 'attr name / xml',
1072             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1073             ca => {leave => 1},
1074             state => SELF_CLOSING_START_TAG_STATE,
1075             };
1076             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
1077             name => 'attr name eof',
1078             error => 'unclosed tag',
1079             ca => {leave => 1},
1080             state => DATA_STATE,
1081             reconsume => 1,
1082             };
1083             $Action->[ATTRIBUTE_NAME_STATE]->[0x0022] = # '"', "'" and "<" share one action hashref
1084             $Action->[ATTRIBUTE_NAME_STATE]->[0x0027] =
1085             $Action->[ATTRIBUTE_NAME_STATE]->[0x003C] = {
1086             name => q[attr name "'<],
1087             error => 'bad attribute name', ## XML5: Not a parse error.
1088             ca => {name => 0x0000}, # the character is still appended to the name
1089             };
1090             $Action->[ATTRIBUTE_NAME_STATE]->[0x0000] = { # U+0000
1091             name => 'attr name null',
1092             ca => {name => 0xFFFD}, # nc is 0 here, so chr (0 + 0xFFFD) appends U+FFFD
1093             error => 'NULL',
1094             };
1095             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = { # fallback: append the character unchanged
1096             name => 'attr name else',
1097             ca => {name => 0x0000},
1098             };
1099             ## XML5: "Tag attribute name after state".
1100             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = { # whitespace: only |name| is set, so nothing changes
1101             name => 'after attr name sp',
1102             };
1103             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003D] = { # "="
1104             name => 'after attr name =',
1105             state => BEFORE_ATTRIBUTE_VALUE_STATE,
1106             };
1107             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = { # ">" (HTML)
1108             name => 'after attr name >',
1109             emit => '',
1110             state => DATA_STATE,
1111             };
1112             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = { # ">" (XML): same, plus a 'no attr value' error
1113             name => 'after attr name > xml',
1114             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1115             emit => '',
1116             state => DATA_STATE,
1117             };
1118             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (HTML): starts the next attribute
1119             name => 'after attr name uc',
1120             ca => {set_name => 0x0020}, # UC -> lc
1121             state => ATTRIBUTE_NAME_STATE,
1122             };
1123             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { # A-Z (XML): case is preserved
1124             name => 'after attr name uc xml',
1125             ca => {set_name => 0x0000},
1126             state => ATTRIBUTE_NAME_STATE,
1127             };
1128             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = { # "/" (HTML)
1129             name => 'after attr name /',
1130             state => SELF_CLOSING_START_TAG_STATE,
1131             };
1132             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = { # "/" (XML): same, plus a 'no attr value' error
1133             name => 'after attr name / xml',
1134             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1135             state => SELF_CLOSING_START_TAG_STATE,
1136             };
1137             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
1138             name => 'after attr name eof',
1139             error => 'unclosed tag',
1140             state => DATA_STATE,
1141             reconsume => 1,
1142             };
1143             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0022] = # '"', "'" and "<" share one action hashref
1144             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0027] =
1145             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003C] = {
1146             name => q[after attr name "'<],
1147             error => 'bad attribute name', ## XML5: Not a parse error.
1148             #error2(xml) => 'no attr value', ## XML5: Not a parse error.
1149             ca => {set_name => 0x0000}, # the character still starts a new attribute name
1150             state => ATTRIBUTE_NAME_STATE,
1151             };
1152             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0000] = { # U+0000 starts a new attribute whose name begins with U+FFFD
1153             name => q[after attr name null], # was mislabelled 'else', colliding with the fallback entry below
1154             ca => {set_name => 0xFFFD}, # nc is 0 here, so chr (0 + 0xFFFD) is U+FFFD
1155             error => 'NULL',
1156             #error2(xml) => 'no attr value', ## XML5: Not a parse error.
1157             state => ATTRIBUTE_NAME_STATE,
1158             };
1159             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = { # fallback (HTML): start a new attribute with this character
1160             name => q[after attr name else],
1161             ca => {set_name => 0x0000},
1162             state => ATTRIBUTE_NAME_STATE,
1163             };
1164             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = { # fallback (XML): same, plus a 'no attr value' error
1165             name => q[after attr name else xml], # label now distinct from the HTML fallback
1166             error => 'no attr value', ## XML5: Not a parse error.
1167             ca => {set_name => 0x0000},
1168             state => ATTRIBUTE_NAME_STATE,
1169             };
1170             ## XML5: "Tag attribute value before state".
1171             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_SPACE_CHAR] = { # whitespace: only |name| is set, so nothing changes
1172             name => 'before attr value sp',
1173             };
1174             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0022] = { # '"' opens a double-quoted value
1175             name => 'before attr value "',
1176             state => ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
1177             };
1178             $XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = { # "&" (XML): unquoted value, reported as an error
1179             name => 'before attr value & xml', # label now distinct from the HTML "&" entry
1180             error => 'unquoted attr value', ## XML5: Not a parse error.
1181             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1182             reconsume => 1,
1183             };
1184             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = { # "&" (HTML): reprocessed in the unquoted value state
1185             name => 'before attr value &',
1186             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1187             reconsume => 1,
1188             };
1189             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0027] = { # "'" opens a single-quoted value
1190             name => "before attr value '",
1191             state => ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
1192             };
1193             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003E] = { # ">" right after "="
1194             name => 'before attr value >',
1195             error => 'empty unquoted attribute value',
1196             emit => '',
1197             state => DATA_STATE,
1198             };
1199             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_EOF_CHAR] = { # NOTE(review): no reconsume => 1 here, unlike most EOF entries -- confirm intent
1200             name => 'before attr value eof',
1201             error => 'unclosed tag',
1202             state => DATA_STATE,
1203             };
1204             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003C] = # "<", "=" and "`" share one action hashref
1205             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003D] =
1206             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0060] = {
1207             name => 'before attr value <=`',
1208             error => 'bad attribute value', ## XML5: Not a parse error.
1209             #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
1210             ca => {value => 1}, # value 1 means: append the current character (chr nc) to the attribute value
1211             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1212             };
1213             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0000] = { # U+0000
1214             name => 'before attr value null',
1215             ca => {value => "\x{FFFD}"}, # any value other than '1' is appended literally
1216             error => 'NULL',
1217             #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
1218             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1219             };
1220             $XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = { # fallback (XML): unquoted value is an error
1221             name => 'before attr value else xml',
1222             error => 'unquoted attr value', ## XML5: Not a parse error. # XXXdocumentation
1223             ca => {value => 1},
1224             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1225             };
1226             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = { # fallback (HTML): first character of an unquoted value
1227             name => 'before attr value else',
1228             ca => {value => 1},
1229             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1230             };
1231              
1232             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_SPACE_CHAR] = { # whitespace after a quoted value
1233             name => 'after attr value quoted sp',
1234             state => BEFORE_ATTRIBUTE_NAME_STATE,
1235             };
1236             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x003E] = { # ">"
1237             name => 'after attr value quoted >',
1238             emit => '',
1239             state => DATA_STATE,
1240             };
1241             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x002F] = { # "/"
1242             name => 'after attr value quoted /',
1243             state => SELF_CLOSING_START_TAG_STATE,
1244             };
1245             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_EOF_CHAR] = {
1246             name => 'after attr value quoted eof',
1247             error => 'unclosed tag',
1248             state => DATA_STATE,
1249             reconsume => 1,
1250             };
1251             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_ELSE_CHAR] = { # anything else: error, then reprocess as the start of an attribute
1252             name => 'after attr value quoted else',
1253             error => 'no space between attributes',
1254             state => BEFORE_ATTRIBUTE_NAME_STATE,
1255             reconsume => 1,
1256             };
1257             $Action->[SELF_CLOSING_START_TAG_STATE]->[0x003E] = { # ">" after "/"
1258             name => 'self closing start tag >',
1259             skip => 1, # _get_next_token bypasses its table-driven branch when |skip| is set
1260             };
1261             $Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_EOF_CHAR] = {
1262             name => 'self closing start tag eof',
1263             error => 'unclosed tag',
1264             state => DATA_STATE, ## XML5: "Tag attribute name before state".
1265             reconsume => 1,
1266             };
1267             $Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_ELSE_CHAR] = { # "/" not followed by ">"
1268             name => 'self closing start tag else',
1269             error => 'nestc', # XXX This error type is wrong.
1270             state => BEFORE_ATTRIBUTE_NAME_STATE,
1271             reconsume => 1,
1272             };
1273             $Action->[MD_HYPHEN_STATE]->[0x002D] = { # second "-": a comment token with empty data is started
1274             name => 'md hyphen -',
1275             ct => {type => COMMENT_TOKEN, data => '', delta => 3}, # column = column_prev - 3 + 1
1276             state => COMMENT_START_STATE, ## XML5: "comment state".
1277             };
1278             $Action->[MD_HYPHEN_STATE]->[KEY_ELSE_CHAR] = { # anything else: bogus comment whose data starts with "-"
1279             name => 'md hyphen else',
1280             error => 'bogus comment',
1281             error_delta => 3,
1282             state => BOGUS_COMMENT_STATE,
1283             reconsume => 1,
1284             ct => {type => COMMENT_TOKEN, data => '-', delta => 3},
1285             };
1286              
1287             my $c_to_key = []; # maps a code point <= 0x7F to its column key in $Action / $XMLAction (see _get_next_token)
1288             $c_to_key->[255] = KEY_EOF_CHAR; # EOF_CHAR; NOTE(review): presumably EOF_CHAR is -1, which as an index addresses this last slot -- confirm
1289             $c_to_key->[$_] = $_ for 0x0000..0x007F; # default: each ASCII code point is its own key
1290             $c_to_key->[$_] = KEY_SPACE_CHAR for keys %$is_space; # the overrides below must run after the identity fill above
1291             $c_to_key->[$_] = KEY_ULATIN_CHAR for 0x0041..0x005A; # A-Z
1292             $c_to_key->[$_] = KEY_LLATIN_CHAR for 0x0061..0x007A; # a-z
1293              
1294             sub _get_next_token ($) {
1295 4945     4945   9863 my $self = shift;
1296              
1297 4945 100       10979 if ($self->{self_closing}) {
1298             ## NOTE: The |$self->{self_closing}| flag can never be set to
1299             ## tokens except for start tag tokens. A start tag token is
1300             ## always set to |$self->{ct}| before it is emitted.
1301 2         12 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1302 2         5 delete $self->{self_closing};
1303             }
1304              
1305 4945 100       7308 if (@{$self->{token}}) {
  4945         11694  
1306 142         430 $self->{self_closing} = $self->{token}->[0]->{self_closing};
1307 142         245 return shift @{$self->{token}};
  142         542  
1308             }
1309              
1310             A: {
1311 4803         7329 my $nc = $self->{nc};
  25433         38148  
1312 25433         38306 my $state = $self->{state};
1313              
1314            
1315              
1316 25433 100       46705 my $c = $nc > 0x007F ? KEY_ELSE_CHAR : $c_to_key->[$nc];
1317 25433   100     68064 my $action = $Action->[$state]->[$c] || $Action->[$state]->[KEY_ELSE_CHAR];
1318 25433 50       50115 if ($self->{is_xml}) {
1319 0   0     0 $action = $XMLAction->[$state]->[$c]
1320             || $Action->[$state]->[$c]
1321             || $XMLAction->[$state]->[KEY_ELSE_CHAR]
1322             || $Action->[$state]->[KEY_ELSE_CHAR];
1323             }
1324              
1325 25433 100 100     67550 if ($action and not $action->{skip}) {
1326            
1327              
1328 18098 100       36306 if (defined $action->{error}) {
1329 37 100       154 if ($action->{error_delta}) {
1330             $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error},
1331             line => $self->{line_prev},
1332 9         66 column => $self->{column_prev} - $action->{error_delta} + 1);
1333             } else {
1334 28         201 $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error});
1335             }
1336             }
1337              
1338 18098 100       32230 if (defined $action->{state}) {
1339 9463         14257 $self->{state} = $action->{state};
1340            
1341 9463 100       18222 if ($action->{state_set}) {
1342 96         163 for (keys %{$action->{state_set}}) {
  96         369  
1343 192         451 $self->{$_} = $action->{state_set}->{$_};
1344             }
1345             }
1346             }
1347              
1348 18098 100       35279 if (my $act = $action->{ct}) {
1349 8713 100       16108 if (defined $act->{type}) {
1350             $self->{ct} = {type => $act->{type},
1351 2311         9077 tag_name => '', data => $act->{data}};
1352 2311 100       5066 if ($act->{delta}) {
1353 2305         4277 $self->{ct}->{line} = $self->{line_prev};
1354 2305         5326 $self->{ct}->{column} = $self->{column_prev} - $act->{delta} + 1;
1355             } else {
1356 6         17 $self->{ct}->{line} = $self->{line};
1357 6         15 $self->{ct}->{column} = $self->{column};
1358             }
1359             }
1360            
1361 8713 100       15450 if (defined $act->{append_tag_name}) {
1362 8659         19537 $self->{ct}->{tag_name} .= chr ($nc + $act->{append_tag_name});
1363             }
1364             }
1365            
1366 18098 100       35176 if (my $aca = $action->{ca}) {
1367 851 100       2123 if ($aca->{value}) {
    100          
    100          
    50          
1368 40 50       159 $self->{ca}->{value} .= $aca->{value} ne '1' ? $aca->{value} : chr $nc;
1369             } elsif (defined $aca->{name}) {
1370 519         1099 $self->{ca}->{name} .= chr ($nc + $aca->{name});
1371             } elsif (defined $aca->{set_name}) {
1372             $self->{ca} = {
1373             name => chr ($nc + $aca->{set_name}),
1374             value => '',
1375             line => $self->{line}, column => $self->{column},
1376 146         747 };
1377             } elsif ($aca->{leave}) {
1378 146 50       484 if (exists $self->{ct}->{attributes}->{$self->{ca}->{name}}) {
1379            
1380 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1381             ## Discard $self->{ca}.
1382             } else {
1383            
1384 146         516 $self->{ct}->{attributes}->{$self->{ca}->{name}} = $self->{ca};
1385 146         462 $self->{ca}->{index} = ++$self->{ct}->{last_index};
1386             }
1387             }
1388             }
1389              
1390 18098 100       33716 if (defined $action->{buffer}) {
1391 216 100       664 $self->{kwd} = '' if $action->{buffer}->{clear};
1392             $self->{kwd} .= chr ($nc + $action->{buffer}->{append})
1393 216 100       609 if defined $action->{buffer}->{append};
1394              
1395            
1396             }
1397              
1398 18098 100       32010 if (defined $action->{emit}) {
1399 4179 100       8850 if ($action->{emit} eq '') {
1400 2289 100       5150 if ($self->{ct}->{type} == START_TAG_TOKEN) {
    50          
1401            
1402 1608         3462 $self->{last_stag_name} = $self->{ct}->{tag_name};
1403             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1404 681 100       1525 if ($self->{ct}->{attributes}) {
1405            
1406 5         22 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1407             } else {
1408            
1409             }
1410             } else {
1411 0         0 die "$0: $self->{ct}->{type}: Unknown token type";
1412             }
1413            
1414 2289 50       4431 if ($action->{reconsume}) {
1415             #
1416             } else {
1417            
1418 2289 100       4786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1419 1848         3103 $self->{line_prev} = $self->{line};
1420 1848         2742 $self->{column_prev} = $self->{column};
1421 1848         2811 $self->{column}++;
1422             $self->{nc}
1423 1848         3667 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1424             } else {
1425 441         1348 $self->{set_nc}->($self);
1426             }
1427            
1428             }
1429 2289         9675 return ($self->{ct});
1430             } else {
1431 1890         6167 my $token = {type => $action->{emit}};
1432 1890 100       5694 if (defined $action->{emit_data}) {
    100          
1433 108         260 $token->{data} = $action->{emit_data};
1434 108 100       259 if ($action->{emit_data_append}) {
1435 11         26 $token->{data} .= chr $nc;
1436             }
1437             } elsif ($action->{emit} == CHARACTER_TOKEN) {
1438 1074         3108 $token->{data} .= chr $nc;
1439             }
1440 1890 100       3574 if ($action->{emit_delta}) {
1441 2         5 $token->{line} = $self->{line_prev};
1442 2         7 $token->{column} = $self->{column_prev} - $action->{emit_delta} + 1;
1443             } else {
1444 1888         3824 $token->{line} = $self->{line};
1445 1888         3589 $token->{column} = $self->{column};
1446             }
1447 1890 100       3861 if (defined $action->{emit_data_read_until}) {
1448             $self->{read_until}->($token->{data},
1449             $action->{emit_data_read_until},
1450 899         3217 length $token->{data});
1451             }
1452            
1453 1890 100       4422 if ($action->{reconsume}) {
1454             #
1455             } else {
1456            
1457 1163 100       2528 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1458 751         1413 $self->{line_prev} = $self->{line};
1459 751         1243 $self->{column_prev} = $self->{column};
1460 751         1087 $self->{column}++;
1461             $self->{nc}
1462 751         1763 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1463             } else {
1464 412         1159 $self->{set_nc}->($self);
1465             }
1466            
1467             }
1468 1890         6275 return ($token);
1469             }
1470             } else {
1471 13919 100       22840 if ($action->{reconsume}) {
1472             #
1473             } else {
1474            
1475 13888 100       26590 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1476 13884         21130 $self->{line_prev} = $self->{line};
1477 13884         20361 $self->{column_prev} = $self->{column};
1478 13884         19526 $self->{column}++;
1479             $self->{nc}
1480 13884         26593 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1481             } else {
1482 4         18 $self->{set_nc}->($self);
1483             }
1484            
1485             }
1486             }
1487              
1488 13919         24397 redo A;
1489             }
1490              
1491 7335 100 100     68992 if ({
    100 100        
    100 100        
    100 100        
    100 100        
    100          
    100          
    100          
    100          
    50          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    50          
    100          
    50          
    50          
    50          
    100          
    100          
    100          
    100          
    100          
    50          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
1492             (RCDATA_END_TAG_OPEN_STATE) => 1,
1493             (RAWTEXT_END_TAG_OPEN_STATE) => 1,
1494             (SCRIPT_DATA_END_TAG_OPEN_STATE) => 1,
1495             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => 1,
1496             }->{$state}) {
1497             ## This switch-case implements "RCDATA end tag open state",
1498             ## "RAWTEXT end tag open state", "script data end tag open
1499             ## state", "script data escaped end tag open state", and the
1500             ## corresponding "... end tag name" states jointly with the
1501             ## implementation of the "tag name" state.
1502              
1503 560         1219 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # position of the "<" of "</"
1504              
1505 560 50       1136 if (defined $self->{last_stag_name}) {
1506             #
1507             } else {
1508             ## No start tag token has ever been emitted
1509             ## NOTE: See .
1510            
1511             $self->{state} = {
1512             (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1513             (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1514             (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1515             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1516             => SCRIPT_DATA_ESCAPED_STATE,
1517 0 0       0 }->{$state} or die "${state}'s next state not found";
1518             ## Reconsume.
1519 0         0 return ({type => CHARACTER_TOKEN, data => '
1520             line => $l, column => $c});
1521 0         0 redo A;
1522             }
1523              
1524 560         1103 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
1525 560 100       1008 if (length $ch) {
1526 481         691 my $CH = $ch;
1527 481         811 $ch =~ tr/a-z/A-Z/;
1528 481         859 my $nch = chr $nc;
1529 481 100 100     1551 if ($nch eq $ch or $nch eq $CH) {
1530            
1531             ## Stay in the state.
1532 466         830 $self->{kwd} .= $nch;
1533            
1534 466 100       908 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1535 465         753 $self->{line_prev} = $self->{line};
1536 465         708 $self->{column_prev} = $self->{column};
1537 465         723 $self->{column}++;
1538             $self->{nc}
1539 465         901 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1540             } else {
1541 1         5 $self->{set_nc}->($self);
1542             }
1543            
1544 466         1202 redo A;
1545             } else {
1546            
1547             $self->{state} = {
1548             (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1549             (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1550             (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1551             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1552             => SCRIPT_DATA_ESCAPED_STATE,
1553 15 50       126 }->{$state} or die "${state}'s next state not found";
1554             ## Reconsume.
1555             return ({type => CHARACTER_TOKEN,
1556             data => '{kwd},
1557             line => $self->{line_prev},
1558             column => $self->{column_prev} - 1 - length $self->{kwd},
1559 15         130 });
1560 0         0 redo A;
1561             }
1562             } else { # after "</" followed by the complete tag name
1563 79 100 100     619 unless ($is_space->{$nc} or
1564             {
1565             0x003E => 1, # >
1566             0x002F => 1, # /
1567             }->{$nc}) {
1568            
1569             ## Reconsume.
1570             $self->{state} = {
1571             (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1572             (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1573             (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1574             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1575             => SCRIPT_DATA_ESCAPED_STATE,
1576 3 50       21 }->{$self->{state}} or die "${state}'s next state not found";
1577             return ({type => CHARACTER_TOKEN,
1578             data => '{kwd},
1579             line => $self->{line_prev},
1580             column => $self->{column_prev} - 1 - length $self->{kwd},
1581 3         30 });
1582 0         0 redo A;
1583             } else {
1584            
1585             $self->{ct}
1586             = {type => END_TAG_TOKEN,
1587             tag_name => $self->{last_stag_name},
1588             line => $self->{line_prev},
1589 76         537 column => $self->{column_prev} - 1 - length $self->{kwd}};
1590 76         175 $self->{state} = TAG_NAME_STATE;
1591             ## Reconsume.
1592 76         218 redo A;
1593             }
1594             }
1595             } elsif ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE or
1596             $state == SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
1597 17 50 100     89 if ($is_space->{$nc} or
      66        
1598             $nc == 0x002F or # /
1599             $nc == 0x003E) { # >
1600             my $token = {type => CHARACTER_TOKEN,
1601             data => chr $nc,
1602 17         95 line => $self->{line}, column => $self->{column}};
1603 17 100       50 if ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE) {
1604 9 50       24 $self->{state} = $self->{kwd} eq 'script' # "temporary buffer"
1605             ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE
1606             : SCRIPT_DATA_ESCAPED_STATE;
1607             } else {
1608 8 50       25 $self->{state} = $self->{kwd} eq 'script' # "temporary buffer"
1609             ? SCRIPT_DATA_ESCAPED_STATE
1610             : SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1611             }
1612            
1613 17 50       37 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1614 17         28 $self->{line_prev} = $self->{line};
1615 17         26 $self->{column_prev} = $self->{column};
1616 17         27 $self->{column}++;
1617             $self->{nc}
1618 17         38 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1619             } else {
1620 0         0 $self->{set_nc}->($self);
1621             }
1622            
1623 17         73 return ($token);
1624 0         0 redo A;
1625             } else {
1626 0         0 die "$state/$nc is implemented";
1627             }
1628             } elsif ($state == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1629             ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1630             ## ATTLIST attribute value double quoted state".
1631            
1632 145 100 33     564 if ($nc == 0x0022) { # "
    50          
    50          
    100          
    50          
1633 68 50       194 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1634            
1635             ## XML5: "DOCTYPE ATTLIST name after state".
1636 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
1637 0         0 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1638             } else {
1639            
1640             ## XML5: "Tag attribute name before state".
1641 68         111 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1642             }
1643            
1644 68 50       171 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1645 68         131 $self->{line_prev} = $self->{line};
1646 68         112 $self->{column_prev} = $self->{column};
1647 68         126 $self->{column}++;
1648             $self->{nc}
1649 68         173 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1650             } else {
1651 0         0 $self->{set_nc}->($self);
1652             }
1653            
1654 68         180 redo A;
1655             } elsif ($nc == 0x0026) { # &
1656            
1657             ## XML5: Not defined yet.
1658              
1659             ## NOTE: In the spec, the tokenizer is switched to the
1660             ## "entity in attribute value state". In this implementation, the
1661             ## tokenizer is switched to the |ENTITY_STATE|, which is an
1662             ## implementation of the "consume a character reference" algorithm.
1663 0         0 $self->{prev_state} = $state;
1664 0         0 $self->{entity_add} = 0x0022; # "
1665 0         0 $self->{state} = ENTITY_STATE;
1666            
1667 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1668 0         0 $self->{line_prev} = $self->{line};
1669 0         0 $self->{column_prev} = $self->{column};
1670 0         0 $self->{column}++;
1671             $self->{nc}
1672 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1673             } else {
1674 0         0 $self->{set_nc}->($self);
1675             }
1676            
1677 0         0 redo A;
1678             } elsif ($self->{is_xml} and
1679             $is_space->{$nc}) {
1680            
1681 0         0 $self->{ca}->{value} .= ' ';
1682             ## Stay in the state.
1683            
1684 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1685 0         0 $self->{line_prev} = $self->{line};
1686 0         0 $self->{column_prev} = $self->{column};
1687 0         0 $self->{column}++;
1688             $self->{nc}
1689 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1690             } else {
1691 0         0 $self->{set_nc}->($self);
1692             }
1693            
1694 0         0 redo A;
1695             } elsif ($nc == -1) {
1696 1         10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1697 1 50       8 if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
1698            
1699 1         15 $self->{last_stag_name} = $self->{ct}->{tag_name};
1700              
1701 1         3 $self->{state} = DATA_STATE;
1702             ## reconsume
1703 1         6 return ($self->{ct}); # start tag
1704 0         0 redo A;
1705             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1706 0 0       0 if ($self->{ct}->{attributes}) {
1707            
1708 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1709             } else {
1710             ## NOTE: This state should never be reached.
1711            
1712             }
1713              
1714 0         0 $self->{state} = DATA_STATE;
1715             ## reconsume
1716              
1717             ## Discard the token.
1718             #return ($self->{ct}); # end tag
1719              
1720 0         0 redo A;
1721             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1722             ## XML5: No parse error above; not defined yet.
1723 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
1724 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1725             ## Reconsume.
1726              
1727             ## Discard the token.
1728             #return ($self->{ct}); # ATTLIST
1729              
1730 0         0 redo A;
1731             } else {
1732 0         0 die "$0: $self->{ct}->{type}: Unknown token type";
1733             }
1734             } elsif ($nc == 0x0000) {
1735 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
1736 0         0 $self->{ca}->{value} .= "\x{FFFD}";
1737             ## Stay in the state
1738            
1739 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1740 0         0 $self->{line_prev} = $self->{line};
1741 0         0 $self->{column_prev} = $self->{column};
1742 0         0 $self->{column}++;
1743             $self->{nc}
1744 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1745             } else {
1746 0         0 $self->{set_nc}->($self);
1747             }
1748            
1749 0         0 redo A;
1750             } else {
1751             ## XML5 [ATTLIST]: Not defined yet.
1752 76 50 33     221 if ($self->{is_xml} and $nc == 0x003C) { # <
1753            
1754             ## XML5: Not a parse error.
1755 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1756             } else {
1757            
1758             }
1759 76         211 $self->{ca}->{value} .= chr ($nc);
1760             $self->{read_until}->($self->{ca}->{value},
1761             qq[\x00"&<\x09\x0C\x20],
1762 76         354 length $self->{ca}->{value});
1763              
1764             ## Stay in the state
1765            
1766 76 100       268 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1767 75         164 $self->{line_prev} = $self->{line};
1768 75         132 $self->{column_prev} = $self->{column};
1769 75         117 $self->{column}++;
1770             $self->{nc}
1771 75         188 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1772             } else {
1773 1         9 $self->{set_nc}->($self);
1774             }
1775            
1776 76         297 redo A;
1777             }
1778             } elsif ($state == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1779             ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1780             ## ATTLIST attribute value single quoted state".
1781              
1782 46 100 33     350 if ($nc == 0x0027) { # '
    50          
    50          
    50          
    50          
1783 24 50       115 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1784            
1785             ## XML5: "DOCTYPE ATTLIST name after state".
1786 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
1787 0         0 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1788             } else {
1789            
1790             ## XML5: "Before attribute name state" (sic).
1791 24         58 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1792             }
1793            
1794 24 50       97 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795 24         56 $self->{line_prev} = $self->{line};
1796 24         57 $self->{column_prev} = $self->{column};
1797 24         51 $self->{column}++;
1798             $self->{nc}
1799 24         70 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800             } else {
1801 0         0 $self->{set_nc}->($self);
1802             }
1803            
1804 24         63 redo A;
1805             } elsif ($nc == 0x0026) { # &
1806            
1807             ## XML5: Not defined yet.
1808              
1809             ## NOTE: In the spec, the tokenizer is switched to the
1810             ## "entity in attribute value state". In this implementation, the
1811             ## tokenizer is switched to the |ENTITY_STATE|, which is an
1812             ## implementation of the "consume a character reference" algorithm.
1813 0         0 $self->{entity_add} = 0x0027; # '
1814 0         0 $self->{prev_state} = $state;
1815 0         0 $self->{state} = ENTITY_STATE;
1816            
1817 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1818 0         0 $self->{line_prev} = $self->{line};
1819 0         0 $self->{column_prev} = $self->{column};
1820 0         0 $self->{column}++;
1821             $self->{nc}
1822 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1823             } else {
1824 0         0 $self->{set_nc}->($self);
1825             }
1826            
1827 0         0 redo A;
1828             } elsif ($self->{is_xml} and
1829             $is_space->{$nc}) {
1830            
1831 0         0 $self->{ca}->{value} .= ' ';
1832             ## Stay in the state.
1833            
1834 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1835 0         0 $self->{line_prev} = $self->{line};
1836 0         0 $self->{column_prev} = $self->{column};
1837 0         0 $self->{column}++;
1838             $self->{nc}
1839 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1840             } else {
1841 0         0 $self->{set_nc}->($self);
1842             }
1843            
1844 0         0 redo A;
1845             } elsif ($nc == -1) {
1846 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1847 0 0       0 if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
1848            
1849 0         0 $self->{last_stag_name} = $self->{ct}->{tag_name};
1850              
1851 0         0 $self->{state} = DATA_STATE;
1852             ## reconsume
1853              
1854             ## Discard the token.
1855             #return ($self->{ct}); # start tag
1856              
1857 0         0 redo A;
1858             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1859 0 0       0 if ($self->{ct}->{attributes}) {
1860            
1861 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1862             } else {
1863             ## NOTE: This state should never be reached.
1864            
1865             }
1866              
1867 0         0 $self->{state} = DATA_STATE;
1868             ## reconsume
1869              
1870             ## Discard the token.
1871             #return ($self->{ct}); # end tag
1872              
1873 0         0 redo A;
1874             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1875             ## XML5: No parse error above; not defined yet.
1876 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
1877 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1878             ## Reconsume.
1879              
1880             ## Discard the token.
1881             #return ($self->{ct}); # ATTLIST
1882              
1883 0         0 redo A;
1884             } else {
1885 0         0 die "$0: $self->{ct}->{type}: Unknown token type";
1886             }
1887             } elsif ($nc == 0x0000) {
1888 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
1889 0         0 $self->{ca}->{value} .= "\x{FFFD}";
1890             ## Stay in the state
1891            
1892 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1893 0         0 $self->{line_prev} = $self->{line};
1894 0         0 $self->{column_prev} = $self->{column};
1895 0         0 $self->{column}++;
1896             $self->{nc}
1897 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1898             } else {
1899 0         0 $self->{set_nc}->($self);
1900             }
1901            
1902 0         0 redo A;
1903             } else {
1904             ## XML5 [ATTLIST]: Not defined yet.
1905 22 50 33     106 if ($self->{is_xml} and $nc == 0x003C) { # <
1906            
1907             ## XML5: Not a parse error.
1908 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1909             } else {
1910            
1911             }
1912 22         73 $self->{ca}->{value} .= chr ($nc);
1913             $self->{read_until}->($self->{ca}->{value},
1914             qq[\x00'&<\x09\x0C\x20],
1915 22         127 length $self->{ca}->{value});
1916              
1917             ## Stay in the state
1918            
1919 22 50       140 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1920 22         74 $self->{line_prev} = $self->{line};
1921 22         54 $self->{column_prev} = $self->{column};
1922 22         41 $self->{column}++;
1923             $self->{nc}
1924 22         73 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1925             } else {
1926 0         0 $self->{set_nc}->($self);
1927             }
1928            
1929 22         77 redo A;
1930             }
1931             } elsif ($state == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1932             ## XML5: "Tag attribute value unquoted state".
1933              
1934 68 100       338 if ($is_space->{$nc}) {
    100          
    100          
    50          
    50          
1935 10 50       52 if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1936            
1937 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
1938 0         0 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1939             } else {
1940            
1941             ## XML5: "Tag attribute name before state".
1942 10         17 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1943             }
1944            
1945 10 50       30 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1946 10         20 $self->{line_prev} = $self->{line};
1947 10         21 $self->{column_prev} = $self->{column};
1948 10         14 $self->{column}++;
1949             $self->{nc}
1950 10         24 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1951             } else {
1952 0         0 $self->{set_nc}->($self);
1953             }
1954            
1955 10         24 redo A;
1956             } elsif ($nc == 0x0026) { # &
1957            
1958              
1959             ## XML5: Not defined yet.
1960              
1961             ## NOTE: In the spec, the tokenizer is switched to the
1962             ## "character reference in attribute value state". In this
1963             ## implementation, the tokenizer is switched to the
1964             ## |ENTITY_STATE|, which is an implementation of the "consume
1965             ## a character reference" algorithm.
1966 1         4 $self->{entity_add} = 0x003E; # >
1967 1         6 $self->{prev_state} = $state;
1968 1         2 $self->{state} = ENTITY_STATE;
1969            
1970 1 50       5 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1971 1         3 $self->{line_prev} = $self->{line};
1972 1         4 $self->{column_prev} = $self->{column};
1973 1         3 $self->{column}++;
1974             $self->{nc}
1975 1         3 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1976             } else {
1977 0         0 $self->{set_nc}->($self);
1978             }
1979            
1980 1         4 redo A;
1981             } elsif ($nc == 0x003E) { # >
1982 31 100       98 if ($self->{ct}->{type} == START_TAG_TOKEN) {
    50          
    0          
1983            
1984 30         93 $self->{last_stag_name} = $self->{ct}->{tag_name};
1985              
1986 30         65 $self->{state} = DATA_STATE;
1987            
1988 30 100       85 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1989 21         41 $self->{line_prev} = $self->{line};
1990 21         39 $self->{column_prev} = $self->{column};
1991 21         41 $self->{column}++;
1992             $self->{nc}
1993 21         52 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1994             } else {
1995 9         29 $self->{set_nc}->($self);
1996             }
1997            
1998 30         149 return ($self->{ct}); # start tag
1999 0         0 redo A;
2000             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2001 1 50       5 if ($self->{ct}->{attributes}) {
2002            
2003 1         5 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2004             } else {
2005             ## NOTE: This state should never be reached.
2006            
2007             }
2008              
2009 1         4 $self->{state} = DATA_STATE;
2010            
2011 1 50       7 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012 1         3 $self->{line_prev} = $self->{line};
2013 1         3 $self->{column_prev} = $self->{column};
2014 1         3 $self->{column}++;
2015             $self->{nc}
2016 1         5 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017             } else {
2018 0         0 $self->{set_nc}->($self);
2019             }
2020            
2021 1         5 return ($self->{ct}); # end tag
2022 0         0 redo A;
2023             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2024 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
2025 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2026            
2027 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2028 0         0 $self->{line_prev} = $self->{line};
2029 0         0 $self->{column_prev} = $self->{column};
2030 0         0 $self->{column}++;
2031             $self->{nc}
2032 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2033             } else {
2034 0         0 $self->{set_nc}->($self);
2035             }
2036            
2037 0         0 return ($self->{ct}); # ATTLIST
2038 0         0 redo A;
2039             } else {
2040 0         0 die "$0: $self->{ct}->{type}: Unknown token type";
2041             }
2042             } elsif ($nc == -1) {
2043 0 0       0 if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
2044            
2045 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2046 0         0 $self->{last_stag_name} = $self->{ct}->{tag_name};
2047              
2048 0         0 $self->{state} = DATA_STATE;
2049             ## reconsume
2050              
2051             ## Discard the token.
2052             #return ($self->{ct}); # start tag
2053            
2054 0         0 redo A;
2055             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2056 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2057 0 0       0 if ($self->{ct}->{attributes}) {
2058            
2059 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2060             } else {
2061             ## NOTE: This state should never be reached.
2062            
2063             }
2064              
2065 0         0 $self->{state} = DATA_STATE;
2066             ## reconsume
2067              
2068             ## Discard the token.
2069             #return ($self->{ct}); # end tag
2070              
2071 0         0 redo A;
2072             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2073 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2074 0         0 push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0         0  
2075 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2076             ## Reconsume.
2077              
2078             ## Discard the token.
2079             #return ($self->{ct}); # ATTLIST
2080              
2081 0         0 redo A;
2082             } else {
2083 0         0 die "$0: $self->{ct}->{type}: Unknown token type";
2084             }
2085             } elsif ($nc == 0x0000) {
2086 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2087 0         0 $self->{ca}->{value} .= "\x{FFFD}";
2088             ## Stay in the state
2089            
2090 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2091 0         0 $self->{line_prev} = $self->{line};
2092 0         0 $self->{column_prev} = $self->{column};
2093 0         0 $self->{column}++;
2094             $self->{nc}
2095 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2096             } else {
2097 0         0 $self->{set_nc}->($self);
2098             }
2099            
2100 0         0 redo A;
2101             } else {
2102 26 100       192 if ({
2103             0x0022 => 1, # "
2104             0x0027 => 1, # '
2105             0x003D => 1, # =
2106             0x003C => 1, # <
2107             0x0060 => 1, # `
2108             }->{$nc}) {
2109            
2110             ## XML5: Not a parse error.
2111 1         6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2112             } else {
2113            
2114             }
2115 26         99 $self->{ca}->{value} .= chr ($nc);
2116             $self->{read_until}->($self->{ca}->{value},
2117             qq[\x00"'=&` \x09\x0C<>],
2118 26         130 length $self->{ca}->{value});
2119              
2120             ## Stay in the state
2121            
2122 26 50       113 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2123 26         53 $self->{line_prev} = $self->{line};
2124 26         48 $self->{column_prev} = $self->{column};
2125 26         47 $self->{column}++;
2126             $self->{nc}
2127 26         65 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2128             } else {
2129 0         0 $self->{set_nc}->($self);
2130             }
2131            
2132 26         79 redo A;
2133             }
2134             } elsif ($state == SELF_CLOSING_START_TAG_STATE) {
2135             ## XML5: "Empty tag state".
2136              
2137 10 50       38 if ($nc == 0x003E) { # >
2138 10 100       36 if ($self->{ct}->{type} == END_TAG_TOKEN) {
2139            
2140 1         9 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2141             ## XXX: Different type than slash in start tag
2142 1 50       7 if ($self->{ct}->{attributes}) {
2143            
2144 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2145             } else {
2146            
2147             }
2148             ## XXX: Test ||
2149             } else {
2150            
2151 9         40 $self->{self_closing} = 1;
2152             }
2153              
2154 10         26 $self->{state} = DATA_STATE;
2155            
2156 10 100       49 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2157 7         19 $self->{line_prev} = $self->{line};
2158 7         25 $self->{column_prev} = $self->{column};
2159 7         24 $self->{column}++;
2160             $self->{nc}
2161 7         27 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2162             } else {
2163 3         13 $self->{set_nc}->($self);
2164             }
2165            
2166              
2167 10         63 return ($self->{ct}); # start tag or end tag
2168              
2169 0         0 redo A;
2170             } else {
2171 0         0 die "$state/$nc is implemented";
2172             }
2173             } elsif ($state == BOGUS_COMMENT_STATE) {
2174             ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2175              
2176             ## NOTE: Unlike spec's "bogus comment state", this implementation
2177             ## consumes characters one-by-one basis.
2178            
2179 20 100       91 if ($nc == 0x003E) { # >
    100          
    50          
2180 9 50       58 if ($self->{in_subset}) {
2181            
2182 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2183             } else {
2184            
2185 9         25 $self->{state} = DATA_STATE;
2186             }
2187            
2188 9 100       39 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2189 2         5 $self->{line_prev} = $self->{line};
2190 2         4 $self->{column_prev} = $self->{column};
2191 2         4 $self->{column}++;
2192             $self->{nc}
2193 2         7 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2194             } else {
2195 7         68 $self->{set_nc}->($self);
2196             }
2197            
2198              
2199 9         66 return ($self->{ct}); # comment
2200 0         0 redo A;
2201             } elsif ($nc == -1) {
2202 1 50       5 if ($self->{in_subset}) {
2203            
2204 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2205             } else {
2206            
2207 1         3 $self->{state} = DATA_STATE;
2208             }
2209             ## reconsume
2210              
2211 1         4 return ($self->{ct}); # comment
2212 0         0 redo A;
2213             } elsif ($nc == 0x0000) {
2214 0         0 $self->{ct}->{data} .= "\x{FFFD}"; # comment
2215             ## Stay in the state.
2216            
2217 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218 0         0 $self->{line_prev} = $self->{line};
2219 0         0 $self->{column_prev} = $self->{column};
2220 0         0 $self->{column}++;
2221             $self->{nc}
2222 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223             } else {
2224 0         0 $self->{set_nc}->($self);
2225             }
2226            
2227 0         0 redo A;
2228             } else {
2229            
2230 10         59 $self->{ct}->{data} .= chr ($nc); # comment
2231             $self->{read_until}->($self->{ct}->{data},
2232             qq[\x00>],
2233 10         52 length $self->{ct}->{data});
2234              
2235             ## Stay in the state.
2236            
2237 10 100       66 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2238 9         25 $self->{line_prev} = $self->{line};
2239 9         20 $self->{column_prev} = $self->{column};
2240 9         19 $self->{column}++;
2241             $self->{nc}
2242 9         34 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2243             } else {
2244 1         5 $self->{set_nc}->($self);
2245             }
2246            
2247 10         38 redo A;
2248             }
2249             } elsif ($state == MARKUP_DECLARATION_OPEN_STATE) {
2250             ## XML5: "Markup declaration state".
2251            
2252 444 100 100     1892 if ($nc == 0x002D) { # -
    100          
2253            
2254 47         98 $self->{state} = MD_HYPHEN_STATE;
2255            
2256 47 50       137 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2257 47         94 $self->{line_prev} = $self->{line};
2258 47         88 $self->{column_prev} = $self->{column};
2259 47         74 $self->{column}++;
2260             $self->{nc}
2261 47         107 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2262             } else {
2263 0         0 $self->{set_nc}->($self);
2264             }
2265            
2266 47         138 redo A;
2267             } elsif ($nc == 0x0044 or # D
2268             $nc == 0x0064) { # d
2269             ## ASCII case-insensitive.
2270            
2271 394         714 $self->{state} = MD_DOCTYPE_STATE;
2272 394         1357 $self->{kwd} = chr $nc;
2273            
2274 394 50       969 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275 394         794 $self->{line_prev} = $self->{line};
2276 394         629 $self->{column_prev} = $self->{column};
2277 394         592 $self->{column}++;
2278             $self->{nc}
2279 394         904 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280             } else {
2281 0         0 $self->{set_nc}->($self);
2282             }
2283            
2284 394         1354 redo A;
2285             # $nc == 0x005B) { # [
2286            
2287 0         0 $self->{state} = MD_CDATA_STATE;
2288 0         0 $self->{kwd} = '[';
2289            
2290 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2291 0         0 $self->{line_prev} = $self->{line};
2292 0         0 $self->{column_prev} = $self->{column};
2293 0         0 $self->{column}++;
2294             $self->{nc}
2295 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2296             } else {
2297 0         0 $self->{set_nc}->($self);
2298             }
2299            
2300 0         0 redo A;
2301             } else {
2302            
2303             }
2304              
2305             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2306             line => $self->{line_prev},
2307 3         56 column => $self->{column_prev} - 1);
2308             ## Reconsume.
2309 3         24 $self->{state} = BOGUS_COMMENT_STATE;
2310             $self->{ct} = {type => COMMENT_TOKEN, data => '',
2311             line => $self->{line_prev},
2312 3         32 column => $self->{column_prev} - 1,
2313             };
2314 3         23 redo A;
2315             } elsif ($state == MD_DOCTYPE_STATE) {
2316             ## ASCII case-insensitive.
2317 2364 100 100     11374 if ($nc == [
    50 66        
      33        
2318             undef,
2319             0x004F, # O
2320             0x0043, # C
2321             0x0054, # T
2322             0x0059, # Y
2323             0x0050, # P
2324             NEVER_CHAR, # (E)
2325             ]->[length $self->{kwd}] or
2326             $nc == [
2327             undef,
2328             0x006F, # o
2329             0x0063, # c
2330             0x0074, # t
2331             0x0079, # y
2332             0x0070, # p
2333             NEVER_CHAR, # (e)
2334             ]->[length $self->{kwd}]) {
2335            
2336             ## Stay in the state.
2337 1970         3721 $self->{kwd} .= chr $nc;
2338            
2339 1970 50       3731 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2340 1970         3132 $self->{line_prev} = $self->{line};
2341 1970         2961 $self->{column_prev} = $self->{column};
2342 1970         2728 $self->{column}++;
2343             $self->{nc}
2344 1970         3595 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2345             } else {
2346 0         0 $self->{set_nc}->($self);
2347             }
2348            
2349 1970         5024 redo A;
2350             } elsif ((length $self->{kwd}) == 6 and
2351             ($nc == 0x0045 or # E
2352             $nc == 0x0065)) { # e
2353 394 50 0     1480 if ($self->{is_xml} and
      33        
2354             ($self->{kwd} ne 'DOCTYP' or $nc == 0x0065)) {
2355            
2356             ## XML5: case-sensitive.
2357             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2358             text => 'DOCTYPE',
2359             line => $self->{line_prev},
2360 0         0 column => $self->{column_prev} - 5);
2361             } else {
2362            
2363             }
2364 394         674 $self->{state} = DOCTYPE_STATE;
2365             $self->{ct} = {type => DOCTYPE_TOKEN,
2366             quirks => 1,
2367             line => $self->{line_prev},
2368 394         1804 column => $self->{column_prev} - 7,
2369             };
2370            
2371 394 50       1164 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2372 394         805 $self->{line_prev} = $self->{line};
2373 394         645 $self->{column_prev} = $self->{column};
2374 394         581 $self->{column}++;
2375             $self->{nc}
2376 394         873 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2377             } else {
2378 0         0 $self->{set_nc}->($self);
2379             }
2380            
2381 394         1151 redo A;
2382             } else {
2383            
2384             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2385             line => $self->{line_prev},
2386 0         0 column => $self->{column_prev} - 1 - length $self->{kwd});
2387 0         0 $self->{state} = BOGUS_COMMENT_STATE;
2388             ## Reconsume.
2389             $self->{ct} = {type => COMMENT_TOKEN,
2390             data => $self->{kwd},
2391             line => $self->{line_prev},
2392             column => $self->{column_prev} - 1 - length $self->{kwd},
2393 0         0 };
2394 0         0 redo A;
2395             }
2396             } elsif ($state == MD_CDATA_STATE) {
2397 0 0 0     0 if ($nc == {
    0          
2398             '[' => 0x0043, # C
2399             '[C' => 0x0044, # D
2400             '[CD' => 0x0041, # A
2401             '[CDA' => 0x0054, # T
2402             '[CDAT' => 0x0041, # A
2403             '[CDATA' => NEVER_CHAR, # ([)
2404             }->{$self->{kwd}}) {
2405            
2406             ## Stay in the state.
2407 0         0 $self->{kwd} .= chr $nc;
2408            
2409 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410 0         0 $self->{line_prev} = $self->{line};
2411 0         0 $self->{column_prev} = $self->{column};
2412 0         0 $self->{column}++;
2413             $self->{nc}
2414 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415             } else {
2416 0         0 $self->{set_nc}->($self);
2417             }
2418            
2419 0         0 redo A;
2420             } elsif ($self->{kwd} eq '[CDATA' and
2421             $nc == 0x005B) { # [
2422 0 0 0     0 if ($self->{is_xml} and
      0        
2423             not $self->{tainted} and
2424 0 0       0 @{$self->{open_elements} or []} == 0) {
2425            
2426             $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2427             line => $self->{line_prev},
2428 0         0 column => $self->{column_prev} - 7);
2429 0         0 $self->{tainted} = 1;
2430             } else {
2431            
2432             }
2433              
2434             $self->{ct} = {type => CHARACTER_TOKEN,
2435             data => '',
2436             line => $self->{line_prev},
2437 0         0 column => $self->{column_prev} - 7};
2438 0         0 $self->{state} = CDATA_SECTION_STATE;
2439            
2440 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2441 0         0 $self->{line_prev} = $self->{line};
2442 0         0 $self->{column_prev} = $self->{column};
2443 0         0 $self->{column}++;
2444             $self->{nc}
2445 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2446             } else {
2447 0         0 $self->{set_nc}->($self);
2448             }
2449            
2450 0         0 redo A;
2451             } else {
2452            
2453             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2454             line => $self->{line_prev},
2455 0         0 column => $self->{column_prev} - 1 - length $self->{kwd});
2456 0         0 $self->{state} = BOGUS_COMMENT_STATE;
2457             ## Reconsume.
2458             $self->{ct} = {type => COMMENT_TOKEN,
2459             data => $self->{kwd},
2460             line => $self->{line_prev},
2461             column => $self->{column_prev} - 1 - length $self->{kwd},
2462 0         0 };
2463 0         0 redo A;
2464             }
2465             } elsif ($state == COMMENT_START_STATE) {
2466 47 100       240 if ($nc == 0x002D) { # -
    100          
    50          
    50          
2467            
2468 3         6 $self->{state} = COMMENT_START_DASH_STATE;
2469            
2470 3 50       9 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2471 3         6 $self->{line_prev} = $self->{line};
2472 3         7 $self->{column_prev} = $self->{column};
2473 3         4 $self->{column}++;
2474             $self->{nc}
2475 3         8 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2476             } else {
2477 0         0 $self->{set_nc}->($self);
2478             }
2479            
2480 3         8 redo A;
2481             } elsif ($nc == 0x003E) { # >
2482 1         16 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2483 1 50       4 if ($self->{in_subset}) {
2484            
2485 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2486             } else {
2487            
2488 1         3 $self->{state} = DATA_STATE;
2489             }
2490            
2491 1 50       4 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2492 1         3 $self->{line_prev} = $self->{line};
2493 1         3 $self->{column_prev} = $self->{column};
2494 1         2 $self->{column}++;
2495             $self->{nc}
2496 1         5 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2497             } else {
2498 0         0 $self->{set_nc}->($self);
2499             }
2500            
2501              
2502 1         5 return ($self->{ct}); # comment
2503              
2504 0         0 redo A;
2505             } elsif ($nc == -1) {
2506 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2507 0 0       0 if ($self->{in_subset}) {
2508            
2509 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2510             } else {
2511            
2512 0         0 $self->{state} = DATA_STATE;
2513             }
2514             ## reconsume
2515              
2516 0         0 return ($self->{ct}); # comment
2517              
2518 0         0 redo A;
2519             } elsif ($nc == 0x0000) {
2520 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2521 0         0 $self->{ct}->{data} .= "\x{FFFD}"; # comment
2522 0         0 $self->{state} = COMMENT_STATE;
2523            
2524 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525 0         0 $self->{line_prev} = $self->{line};
2526 0         0 $self->{column_prev} = $self->{column};
2527 0         0 $self->{column}++;
2528             $self->{nc}
2529 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530             } else {
2531 0         0 $self->{set_nc}->($self);
2532             }
2533            
2534 0         0 redo A;
2535             } else {
2536            
2537             $self->{ct}->{data} # comment
2538 43         142 .= chr ($nc);
2539 43         92 $self->{state} = COMMENT_STATE;
2540            
2541 43 100       107 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2542 42         82 $self->{line_prev} = $self->{line};
2543 42         72 $self->{column_prev} = $self->{column};
2544 42         67 $self->{column}++;
2545             $self->{nc}
2546 42         102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2547             } else {
2548 1         5 $self->{set_nc}->($self);
2549             }
2550            
2551 43         114 redo A;
2552             }
2553             } elsif ($state == COMMENT_START_DASH_STATE) {
2554 3 100       13 if ($nc == 0x002D) { # -
    50          
    0          
    0          
2555            
2556 2         4 $self->{state} = COMMENT_END_STATE;
2557            
2558 2 50       6 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559 2         5 $self->{line_prev} = $self->{line};
2560 2         5 $self->{column_prev} = $self->{column};
2561 2         4 $self->{column}++;
2562             $self->{nc}
2563 2         5 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564             } else {
2565 0         0 $self->{set_nc}->($self);
2566             }
2567            
2568 2         7 redo A;
2569             } elsif ($nc == 0x003E) { # >
2570 1         6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2571 1 50       4 if ($self->{in_subset}) {
2572            
2573 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2574             } else {
2575            
2576 1         3 $self->{state} = DATA_STATE;
2577             }
2578            
2579 1 50       15 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580 1         4 $self->{line_prev} = $self->{line};
2581 1         3 $self->{column_prev} = $self->{column};
2582 1         2 $self->{column}++;
2583             $self->{nc}
2584 1         5 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2585             } else {
2586 0         0 $self->{set_nc}->($self);
2587             }
2588            
2589              
2590 1         5 return ($self->{ct}); # comment
2591              
2592 0         0 redo A;
2593             } elsif ($nc == -1) {
2594 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2595 0 0       0 if ($self->{in_subset}) {
2596            
2597 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2598             } else {
2599            
2600 0         0 $self->{state} = DATA_STATE;
2601             }
2602             ## reconsume
2603              
2604 0         0 return ($self->{ct}); # comment
2605              
2606 0         0 redo A;
2607             } elsif ($nc == 0x0000) {
2608 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2609 0         0 $self->{ct}->{data} .= "-\x{FFFD}"; # comment
2610 0         0 $self->{state} = COMMENT_STATE;
2611            
2612 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2613 0         0 $self->{line_prev} = $self->{line};
2614 0         0 $self->{column_prev} = $self->{column};
2615 0         0 $self->{column}++;
2616             $self->{nc}
2617 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2618             } else {
2619 0         0 $self->{set_nc}->($self);
2620             }
2621            
2622 0         0 redo A;
2623             } else {
2624            
2625             $self->{ct}->{data} # comment
2626 0         0 .= '-' . chr ($nc);
2627 0         0 $self->{state} = COMMENT_STATE;
2628            
2629 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2630 0         0 $self->{line_prev} = $self->{line};
2631 0         0 $self->{column_prev} = $self->{column};
2632 0         0 $self->{column}++;
2633             $self->{nc}
2634 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2635             } else {
2636 0         0 $self->{set_nc}->($self);
2637             }
2638            
2639 0         0 redo A;
2640             }
2641             } elsif ($state == COMMENT_STATE) {
2642             ## XML5: "Comment state" and "DOCTYPE comment state".
2643              
2644 110 100       340 if ($nc == 0x002D) { # -
    100          
    50          
2645            
2646 54         116 $self->{state} = COMMENT_END_DASH_STATE;
2647            
2648 54 50       149 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2649 54         102 $self->{line_prev} = $self->{line};
2650 54         90 $self->{column_prev} = $self->{column};
2651 54         104 $self->{column}++;
2652             $self->{nc}
2653 54         122 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2654             } else {
2655 0         0 $self->{set_nc}->($self);
2656             }
2657            
2658 54         126 redo A;
2659             } elsif ($nc == -1) {
2660 4         19 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2661 4 50       12 if ($self->{in_subset}) {
2662            
2663 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2664             } else {
2665            
2666 4         9 $self->{state} = DATA_STATE;
2667             }
2668             ## reconsume
2669              
2670 4         16 return ($self->{ct}); # comment
2671              
2672 0         0 redo A;
2673             } elsif ($nc == 0x0000) {
2674 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2675 0         0 $self->{ct}->{data} .= "\x{FFFD}"; # comment
2676            
2677 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2678 0         0 $self->{line_prev} = $self->{line};
2679 0         0 $self->{column_prev} = $self->{column};
2680 0         0 $self->{column}++;
2681             $self->{nc}
2682 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2683             } else {
2684 0         0 $self->{set_nc}->($self);
2685             }
2686            
2687 0         0 redo A;
2688             } else {
2689            
2690 52         145 $self->{ct}->{data} .= chr ($nc); # comment
2691             $self->{read_until}->($self->{ct}->{data},
2692             qq[-\x00],
2693 52         231 length $self->{ct}->{data});
2694              
2695             ## Stay in the state
2696            
2697 52 100       196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2698 41         87 $self->{line_prev} = $self->{line};
2699 41         110 $self->{column_prev} = $self->{column};
2700 41         85 $self->{column}++;
2701             $self->{nc}
2702 41         102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2703             } else {
2704 11         34 $self->{set_nc}->($self);
2705             }
2706            
2707 52         160 redo A;
2708             }
2709             } elsif ($state == COMMENT_END_DASH_STATE) {
2710             ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2711              
2712 54 100       159 if ($nc == 0x002D) { # -
    50          
    50          
2713            
2714 49         103 $self->{state} = COMMENT_END_STATE;
2715            
2716 49 100       121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2717 48         132 $self->{line_prev} = $self->{line};
2718 48         86 $self->{column_prev} = $self->{column};
2719 48         72 $self->{column}++;
2720             $self->{nc}
2721 48         102 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2722             } else {
2723 1         5 $self->{set_nc}->($self);
2724             }
2725            
2726 49         117 redo A;
2727             } elsif ($nc == -1) {
2728 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729 0 0       0 if ($self->{in_subset}) {
2730            
2731 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732             } else {
2733            
2734 0         0 $self->{state} = DATA_STATE;
2735             }
2736             ## reconsume
2737              
2738 0         0 return ($self->{ct}); # comment
2739              
2740 0         0 redo A;
2741             } elsif ($nc == 0x0000) {
2742 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2743 0         0 $self->{ct}->{data} .= "-\x{FFFD}"; # comment
2744 0         0 $self->{state} = COMMENT_STATE;
2745            
2746 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2747 0         0 $self->{line_prev} = $self->{line};
2748 0         0 $self->{column_prev} = $self->{column};
2749 0         0 $self->{column}++;
2750             $self->{nc}
2751 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2752             } else {
2753 0         0 $self->{set_nc}->($self);
2754             }
2755            
2756 0         0 redo A;
2757             } else {
2758            
2759 5         19 $self->{ct}->{data} .= '-' . chr ($nc); # comment
2760 5         10 $self->{state} = COMMENT_STATE;
2761            
2762 5 50       15 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2763 5         13 $self->{line_prev} = $self->{line};
2764 5         8 $self->{column_prev} = $self->{column};
2765 5         10 $self->{column}++;
2766             $self->{nc}
2767 5         13 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2768             } else {
2769 0         0 $self->{set_nc}->($self);
2770             }
2771            
2772 5         13 redo A;
2773             }
2774             } elsif ($state == COMMENT_END_STATE or
2775             $state == COMMENT_END_BANG_STATE) {
2776             ## XML5: "Comment end state" and "DOCTYPE comment end state".
2777             ## (No comment end bang state.)
2778              
2779 55 100 66     203 if ($nc == 0x003E) { # >
    100          
    100          
    100          
    50          
2780 40 50       100 if ($self->{in_subset}) {
2781            
2782 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2783             } else {
2784            
2785 40         86 $self->{state} = DATA_STATE;
2786             }
2787            
2788 40 100       131 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2789 21         44 $self->{line_prev} = $self->{line};
2790 21         40 $self->{column_prev} = $self->{column};
2791 21         30 $self->{column}++;
2792             $self->{nc}
2793 21         48 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2794             } else {
2795 19         67 $self->{set_nc}->($self);
2796             }
2797            
2798              
2799 40         183 return ($self->{ct}); # comment
2800              
2801 0         0 redo A;
2802             } elsif ($nc == 0x002D) { # -
2803 1 50       4 if ($state == COMMENT_END_BANG_STATE) {
2804            
2805 0         0 $self->{ct}->{data} .= '--!'; # comment
2806 0         0 $self->{state} = COMMENT_END_DASH_STATE;
2807             } else {
2808            
2809             ## XML5: Not a parse error.
2810             $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2811             line => $self->{line_prev},
2812 1         7 column => $self->{column_prev});
2813 1         14 $self->{ct}->{data} .= '-'; # comment
2814             ## Stay in the state
2815             }
2816            
2817 1 50       9 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2818 1         3 $self->{line_prev} = $self->{line};
2819 1         3 $self->{column_prev} = $self->{column};
2820 1         3 $self->{column}++;
2821             $self->{nc}
2822 1         4 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2823             } else {
2824 0         0 $self->{set_nc}->($self);
2825             }
2826            
2827 1         4 redo A;
2828             } elsif ($state != COMMENT_END_BANG_STATE and
2829             $nc == 0x0021) { # !
2830            
2831 3         14 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
2832 3         8 $self->{state} = COMMENT_END_BANG_STATE;
2833            
2834 3 50       20 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2835 3         11 $self->{line_prev} = $self->{line};
2836 3         6 $self->{column_prev} = $self->{column};
2837 3         7 $self->{column}++;
2838             $self->{nc}
2839 3         9 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2840             } else {
2841 0         0 $self->{set_nc}->($self);
2842             }
2843            
2844 3         11 redo A;
2845             } elsif ($nc == -1) {
2846 1         8 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2847 1 50       5 if ($self->{in_subset}) {
2848            
2849 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850             } else {
2851            
2852 1         3 $self->{state} = DATA_STATE;
2853             }
2854             ## Reconsume.
2855              
2856 1         5 return ($self->{ct}); # comment
2857              
2858 0         0 redo A;
2859             } elsif ($nc == 0x0000) {
2860 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2861 0 0       0 if ($state == COMMENT_END_BANG_STATE) {
2862 0         0 $self->{ct}->{data} .= "--!\x{FFFD}"; # comment
2863             } else {
2864 0         0 $self->{ct}->{data} .= "--\x{FFFD}"; # comment
2865             }
2866 0         0 $self->{state} = COMMENT_STATE;
2867            
2868 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2869 0         0 $self->{line_prev} = $self->{line};
2870 0         0 $self->{column_prev} = $self->{column};
2871 0         0 $self->{column}++;
2872             $self->{nc}
2873 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2874             } else {
2875 0         0 $self->{set_nc}->($self);
2876             }
2877            
2878 0         0 redo A;
2879             } else {
2880            
2881 10 50       22 if ($state == COMMENT_END_BANG_STATE) {
2882 0         0 $self->{ct}->{data} .= '--!' . chr ($nc); # comment
2883             } else {
2884 10         50 $self->{ct}->{data} .= '--' . chr ($nc); # comment
2885             }
2886 10         29 $self->{state} = COMMENT_STATE;
2887            
2888 10 50       27 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2889 10         19 $self->{line_prev} = $self->{line};
2890 10         19 $self->{column_prev} = $self->{column};
2891 10         15 $self->{column}++;
2892             $self->{nc}
2893 10         22 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2894             } else {
2895 0         0 $self->{set_nc}->($self);
2896             }
2897            
2898 10         24 redo A;
2899             }
2900             } elsif ($state == DOCTYPE_STATE) {
2901 394 100       1138 if ($is_space->{$nc}) {
    50          
2902            
2903 387         765 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2904            
2905 387 50       926 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2906 387         780 $self->{line_prev} = $self->{line};
2907 387         674 $self->{column_prev} = $self->{column};
2908 387         684 $self->{column}++;
2909             $self->{nc}
2910 387         934 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2911             } else {
2912 0         0 $self->{set_nc}->($self);
2913             }
2914            
2915 387         886 redo A;
2916             } elsif ($nc == -1) {
2917            
2918 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2919 0         0 $self->{ct}->{quirks} = 1;
2920              
2921 0         0 $self->{state} = DATA_STATE;
2922             ## Reconsume.
2923 0         0 return ($self->{ct}); # DOCTYPE (quirks)
2924              
2925 0         0 redo A;
2926             } else {
2927            
2928             ## XML5: Switch to the bogus comment state.
2929 7         32 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2930 7         12 $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2931             ## reconsume
2932 7         19 redo A;
2933             }
2934             } elsif ($state == BEFORE_DOCTYPE_NAME_STATE) {
2935             ## XML5: "DOCTYPE root name before state".
2936              
2937 396 100 100     3258 if ($is_space->{$nc}) {
    100 33        
    100          
    50          
    50          
    50          
2938            
2939             ## Stay in the state
2940            
2941 2 50       6 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2942 2         5 $self->{line_prev} = $self->{line};
2943 2         4 $self->{column_prev} = $self->{column};
2944 2         3 $self->{column}++;
2945             $self->{nc}
2946 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2947             } else {
2948 0         0 $self->{set_nc}->($self);
2949             }
2950            
2951 2         15 redo A;
2952             } elsif ($nc == 0x003E) { # >
2953            
2954             ## XML5: No parse error.
2955 2         9 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2956 2         4 $self->{state} = DATA_STATE;
2957            
2958 2 50       7 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959 2         5 $self->{line_prev} = $self->{line};
2960 2         4 $self->{column_prev} = $self->{column};
2961 2         4 $self->{column}++;
2962             $self->{nc}
2963 2         5 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2964             } else {
2965 0         0 $self->{set_nc}->($self);
2966             }
2967            
2968              
2969 2         9 return ($self->{ct}); # DOCTYPE (quirks)
2970              
2971 0         0 redo A;
2972             } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z
2973            
2974             $self->{ct}->{name} # DOCTYPE
2975 8 50       47 = chr ($nc + ($self->{is_xml} ? 0 : 0x0020));
2976 8         18 delete $self->{ct}->{quirks};
2977 8         16 $self->{state} = DOCTYPE_NAME_STATE;
2978            
2979 8 50       25 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2980 8         17 $self->{line_prev} = $self->{line};
2981 8         15 $self->{column_prev} = $self->{column};
2982 8         15 $self->{column}++;
2983             $self->{nc}
2984 8         18 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2985             } else {
2986 0         0 $self->{set_nc}->($self);
2987             }
2988            
2989 8         21 redo A;
2990             } elsif ($nc == -1) {
2991            
2992 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2993 0         0 $self->{state} = DATA_STATE;
2994             ## reconsume
2995              
2996 0         0 return ($self->{ct}); # DOCTYPE (quirks)
2997              
2998 0         0 redo A;
2999             } elsif ($self->{is_xml} and $nc == 0x005B) { # [
3000            
3001 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3002 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3003 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3004 0         0 $self->{in_subset} = 1;
3005            
3006 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007 0         0 $self->{line_prev} = $self->{line};
3008 0         0 $self->{column_prev} = $self->{column};
3009 0         0 $self->{column}++;
3010             $self->{nc}
3011 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3012             } else {
3013 0         0 $self->{set_nc}->($self);
3014             }
3015            
3016 0         0 return ($self->{ct}); # DOCTYPE
3017 0         0 redo A;
3018             } elsif ($nc == 0x0000) {
3019 0         0 $self->{ct}->{name} = "\x{FFFD}";
3020 0         0 delete $self->{ct}->{quirks};
3021 0         0 $self->{state} = DOCTYPE_NAME_STATE;
3022            
3023 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3024 0         0 $self->{line_prev} = $self->{line};
3025 0         0 $self->{column_prev} = $self->{column};
3026 0         0 $self->{column}++;
3027             $self->{nc}
3028 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3029             } else {
3030 0         0 $self->{set_nc}->($self);
3031             }
3032            
3033 0         0 redo A;
3034             } else {
3035            
3036 384         1085 $self->{ct}->{name} = chr $nc;
3037 384         836 delete $self->{ct}->{quirks};
3038 384         676 $self->{state} = DOCTYPE_NAME_STATE;
3039            
3040 384 50       2606 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3041 384         674 $self->{line_prev} = $self->{line};
3042 384         645 $self->{column_prev} = $self->{column};
3043 384         541 $self->{column}++;
3044             $self->{nc}
3045 384         839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3046             } else {
3047 0         0 $self->{set_nc}->($self);
3048             }
3049            
3050 384         933 redo A;
3051             }
3052             } elsif ($state == DOCTYPE_NAME_STATE) {
3053             ## XML5: "DOCTYPE root name state".
3054              
3055 1622 100 100     7876 if ($is_space->{$nc}) {
    100 33        
    100          
    50          
    50          
    50          
3056            
3057 33         62 $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3058            
3059 33 50       75 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3060 33         57 $self->{line_prev} = $self->{line};
3061 33         55 $self->{column_prev} = $self->{column};
3062 33         52 $self->{column}++;
3063             $self->{nc}
3064 33         70 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3065             } else {
3066 0         0 $self->{set_nc}->($self);
3067             }
3068            
3069 33         92 redo A;
3070             } elsif ($nc == 0x003E) { # >
3071            
3072 359         688 $self->{state} = DATA_STATE;
3073            
3074 359 100       757 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3075 341         587 $self->{line_prev} = $self->{line};
3076 341         523 $self->{column_prev} = $self->{column};
3077 341         597 $self->{column}++;
3078             $self->{nc}
3079 341         652 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3080             } else {
3081 18         89 $self->{set_nc}->($self);
3082             }
3083            
3084              
3085 359         1745 return ($self->{ct}); # DOCTYPE
3086              
3087 0         0 redo A;
3088             } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z
3089            
3090             $self->{ct}->{name} # DOCTYPE
3091 31 50       78 .= chr ($nc + ($self->{is_xml} ? 0 : 0x0020));
3092 31         48 delete $self->{ct}->{quirks};
3093             ## Stay in the state.
3094            
3095 31 50       78 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3096 31         54 $self->{line_prev} = $self->{line};
3097 31         55 $self->{column_prev} = $self->{column};
3098 31         42 $self->{column}++;
3099             $self->{nc}
3100 31         61 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3101             } else {
3102 0         0 $self->{set_nc}->($self);
3103             }
3104            
3105 31         108 redo A;
3106             } elsif ($nc == -1) {
3107            
3108 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3109 0         0 $self->{state} = DATA_STATE;
3110             ## reconsume
3111              
3112 0         0 $self->{ct}->{quirks} = 1;
3113 0         0 return ($self->{ct}); # DOCTYPE
3114              
3115 0         0 redo A;
3116             } elsif ($self->{is_xml} and $nc == 0x005B) { # [
3117            
3118 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3119 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3120 0         0 $self->{in_subset} = 1;
3121            
3122 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123 0         0 $self->{line_prev} = $self->{line};
3124 0         0 $self->{column_prev} = $self->{column};
3125 0         0 $self->{column}++;
3126             $self->{nc}
3127 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128             } else {
3129 0         0 $self->{set_nc}->($self);
3130             }
3131            
3132 0         0 return ($self->{ct}); # DOCTYPE
3133 0         0 redo A;
3134             } elsif ($nc == 0x0000) {
3135 0         0 $self->{ct}->{name} .= "\x{FFFD}"; # DOCTYPE
3136             ## Stay in the state.
3137            
3138 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3139 0         0 $self->{line_prev} = $self->{line};
3140 0         0 $self->{column_prev} = $self->{column};
3141 0         0 $self->{column}++;
3142             $self->{nc}
3143 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3144             } else {
3145 0         0 $self->{set_nc}->($self);
3146             }
3147            
3148 0         0 redo A;
3149             } else {
3150            
3151 1199         2298 $self->{ct}->{name} .= chr ($nc); # DOCTYPE
3152             ## Stay in the state.
3153            
3154 1199 50       2490 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3155 1199         2023 $self->{line_prev} = $self->{line};
3156 1199         1819 $self->{column_prev} = $self->{column};
3157 1199         1773 $self->{column}++;
3158             $self->{nc}
3159 1199         2176 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3160             } else {
3161 0         0 $self->{set_nc}->($self);
3162             }
3163            
3164 1199         2610 redo A;
3165             }
3166             } elsif ($state == AFTER_DOCTYPE_NAME_STATE) {
3167             ## XML5: Corresponding to XML5's "DOCTYPE root name after
3168             ## state", but implemented differently.
3169              
3170 39 100 100     237 if ($is_space->{$nc}) {
    100 100        
    50 0        
    100 33        
    100 0        
    50 33        
    50 33        
    50 33        
3171            
3172             ## Stay in the state
3173            
3174 6 50       13 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3175 6         11 $self->{line_prev} = $self->{line};
3176 6         10 $self->{column_prev} = $self->{column};
3177 6         8 $self->{column}++;
3178             $self->{nc}
3179 6         15 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3180             } else {
3181 0         0 $self->{set_nc}->($self);
3182             }
3183            
3184 6         13 redo A;
3185             } elsif ($nc == 0x003E) { # >
3186 1 50       4 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3187            
3188 1         3 $self->{state} = DATA_STATE;
3189             } else {
3190            
3191 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3192 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3193             }
3194            
3195            
3196 1 50       4 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3197 1         2 $self->{line_prev} = $self->{line};
3198 1         3 $self->{column_prev} = $self->{column};
3199 1         2 $self->{column}++;
3200             $self->{nc}
3201 1         3 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3202             } else {
3203 0         0 $self->{set_nc}->($self);
3204             }
3205            
3206 1         6 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3207 0         0 redo A;
3208             } elsif ($nc == -1) {
3209 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3210            
3211 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3212 0         0 $self->{state} = DATA_STATE;
3213 0         0 $self->{ct}->{quirks} = 1;
3214             } else {
3215            
3216 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3217 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3218             }
3219            
3220             ## Reconsume.
3221 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3222 0         0 redo A;
3223             } elsif ($nc == 0x0050 or # P
3224             $nc == 0x0070) { # p
3225            
3226 17         33 $self->{state} = PUBLIC_STATE;
3227 17         36 $self->{kwd} = chr $nc;
3228            
3229 17 50       51 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3230 17         31 $self->{line_prev} = $self->{line};
3231 17         28 $self->{column_prev} = $self->{column};
3232 17         24 $self->{column}++;
3233             $self->{nc}
3234 17         34 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3235             } else {
3236 0         0 $self->{set_nc}->($self);
3237             }
3238            
3239 17         39 redo A;
3240             } elsif ($nc == 0x0053 or # S
3241             $nc == 0x0073) { # s
3242            
3243 10         22 $self->{state} = SYSTEM_STATE;
3244 10         26 $self->{kwd} = chr $nc;
3245            
3246 10 50       25 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3247 10         17 $self->{line_prev} = $self->{line};
3248 10         26 $self->{column_prev} = $self->{column};
3249 10         17 $self->{column}++;
3250             $self->{nc}
3251 10         24 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3252             } else {
3253 0         0 $self->{set_nc}->($self);
3254             }
3255            
3256 10         27 redo A;
3257             } elsif ($nc == 0x0022 and # "
3258             ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3259             $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3260            
3261 0         0 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3262 0         0 $self->{ct}->{value} = ''; # ENTITY
3263            
3264 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265 0         0 $self->{line_prev} = $self->{line};
3266 0         0 $self->{column_prev} = $self->{column};
3267 0         0 $self->{column}++;
3268             $self->{nc}
3269 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270             } else {
3271 0         0 $self->{set_nc}->($self);
3272             }
3273            
3274 0         0 redo A;
3275             } elsif ($nc == 0x0027 and # '
3276             ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277             $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278            
3279 0         0 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3280 0         0 $self->{ct}->{value} = ''; # ENTITY
3281            
3282 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283 0         0 $self->{line_prev} = $self->{line};
3284 0         0 $self->{column_prev} = $self->{column};
3285 0         0 $self->{column}++;
3286             $self->{nc}
3287 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288             } else {
3289 0         0 $self->{set_nc}->($self);
3290             }
3291            
3292 0         0 redo A;
3293             } elsif ($self->{is_xml} and
3294             $self->{ct}->{type} == DOCTYPE_TOKEN and
3295             $nc == 0x005B) { # [
3296            
3297 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3298 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3299 0         0 $self->{in_subset} = 1;
3300            
3301 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3302 0         0 $self->{line_prev} = $self->{line};
3303 0         0 $self->{column_prev} = $self->{column};
3304 0         0 $self->{column}++;
3305             $self->{nc}
3306 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3307             } else {
3308 0         0 $self->{set_nc}->($self);
3309             }
3310            
3311 0         0 return ($self->{ct}); # DOCTYPE
3312 0         0 redo A;
3313             } else {
3314 5         27 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3315              
3316 5 50       22 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3317            
3318 5         15 $self->{ct}->{quirks} = 1;
3319 5         11 $self->{state} = BOGUS_DOCTYPE_STATE;
3320             } else {
3321            
3322 0         0 $self->{state} = BOGUS_MD_STATE;
3323             }
3324              
3325            
3326 5 50       14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3327 5         12 $self->{line_prev} = $self->{line};
3328 5         9 $self->{column_prev} = $self->{column};
3329 5         9 $self->{column}++;
3330             $self->{nc}
3331 5         13 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3332             } else {
3333 0         0 $self->{set_nc}->($self);
3334             }
3335            
3336 5         17 redo A;
3337             }
3338             } elsif ($state == PUBLIC_STATE) {
3339             ## ASCII case-insensitive
3340 85 100 100     385 if ($nc == [
    50 66        
      33        
3341             undef,
3342             0x0055, # U
3343             0x0042, # B
3344             0x004C, # L
3345             0x0049, # I
3346             NEVER_CHAR, # (C)
3347             ]->[length $self->{kwd}] or
3348             $nc == [
3349             undef,
3350             0x0075, # u
3351             0x0062, # b
3352             0x006C, # l
3353             0x0069, # i
3354             NEVER_CHAR, # (c)
3355             ]->[length $self->{kwd}]) {
3356            
3357             ## Stay in the state.
3358 68         455 $self->{kwd} .= chr $nc;
3359            
3360 68 50       307 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3361 68         109 $self->{line_prev} = $self->{line};
3362 68         224 $self->{column_prev} = $self->{column};
3363 68         103 $self->{column}++;
3364             $self->{nc}
3365 68         119 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3366             } else {
3367 0         0 $self->{set_nc}->($self);
3368             }
3369            
3370 68         160 redo A;
3371             } elsif ((length $self->{kwd}) == 5 and
3372             ($nc == 0x0043 or # C
3373             $nc == 0x0063)) { # c
3374 17 50 0     52 if ($self->{is_xml} and
      33        
3375             ($self->{kwd} ne 'PUBLI' or $nc == 0x0063)) { # c
3376            
3377             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3378             text => 'PUBLIC',
3379             line => $self->{line_prev},
3380 0         0 column => $self->{column_prev} - 4);
3381             } else {
3382            
3383             }
3384 17         27 $self->{state} = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
3385            
3386 17 100       41 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3387 16         25 $self->{line_prev} = $self->{line};
3388 16         30 $self->{column_prev} = $self->{column};
3389 16         22 $self->{column}++;
3390             $self->{nc}
3391 16         35 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3392             } else {
3393 1         4 $self->{set_nc}->($self);
3394             }
3395            
3396 17         48 redo A;
3397             } else {
3398             $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3399             line => $self->{line_prev},
3400 0         0 column => $self->{column_prev} + 1 - length $self->{kwd});
3401 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3402            
3403 0         0 $self->{ct}->{quirks} = 1;
3404 0         0 $self->{state} = BOGUS_DOCTYPE_STATE;
3405             } else {
3406            
3407 0         0 $self->{state} = BOGUS_MD_STATE;
3408             }
3409             ## Reconsume.
3410 0         0 redo A;
3411             }
3412             } elsif ($state == SYSTEM_STATE) {
3413             ## ASCII case-insensitive
3414 50 100 100     224 if ($nc == [
    50 66        
      33        
3415             undef,
3416             0x0059, # Y
3417             0x0053, # S
3418             0x0054, # T
3419             0x0045, # E
3420             NEVER_CHAR, # (M)
3421             ]->[length $self->{kwd}] or
3422             $nc == [
3423             undef,
3424             0x0079, # y
3425             0x0073, # s
3426             0x0074, # t
3427             0x0065, # e
3428             NEVER_CHAR, # (m)
3429             ]->[length $self->{kwd}]) {
3430            
3431             ## Stay in the state.
3432 40         101 $self->{kwd} .= chr $nc;
3433            
3434 40 50       87 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3435 40         69 $self->{line_prev} = $self->{line};
3436 40         92 $self->{column_prev} = $self->{column};
3437 40         55 $self->{column}++;
3438             $self->{nc}
3439 40         83 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3440             } else {
3441 0         0 $self->{set_nc}->($self);
3442             }
3443            
3444 40         94 redo A;
3445             } elsif ((length $self->{kwd}) == 5 and
3446             ($nc == 0x004D or # M
3447             $nc == 0x006D)) { # m
3448 10 50 0     29 if ($self->{is_xml} and
      33        
3449             ($self->{kwd} ne 'SYSTE' or $nc == 0x006D)) { # m
3450            
3451             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3452             text => 'SYSTEM',
3453             line => $self->{line_prev},
3454 0         0 column => $self->{column_prev} - 4);
3455             } else {
3456            
3457             }
3458 10         21 $self->{state} = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
3459            
3460 10 50       26 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3461 10         21 $self->{line_prev} = $self->{line};
3462 10         21 $self->{column_prev} = $self->{column};
3463 10         23 $self->{column}++;
3464             $self->{nc}
3465 10         30 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3466             } else {
3467 0         0 $self->{set_nc}->($self);
3468             }
3469            
3470 10         26 redo A;
3471             } else {
3472             $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3473             line => $self->{line_prev},
3474 0         0 column => $self->{column_prev} + 1 - length $self->{kwd});
3475 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3476            
3477 0         0 $self->{ct}->{quirks} = 1;
3478 0         0 $self->{state} = BOGUS_DOCTYPE_STATE;
3479             } else {
3480            
3481 0         0 $self->{state} = BOGUS_MD_STATE;
3482             }
3483             ## Reconsume.
3484 0         0 redo A;
3485             }
3486             } elsif ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE or
3487             $state == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3488 32 100 33     115 if ($is_space->{$nc}) {
    100 33        
    100          
    100          
    50          
    50          
3489            
3490             ## Stay in or switch to the state.
3491 15         29 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3492            
3493 15 100       34 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3494 14         23 $self->{line_prev} = $self->{line};
3495 14         24 $self->{column_prev} = $self->{column};
3496 14         21 $self->{column}++;
3497             $self->{nc}
3498 14         29 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3499             } else {
3500 1         3 $self->{set_nc}->($self);
3501             }
3502            
3503 15         36 redo A;
3504             } elsif ($nc == 0x0022) { # "
3505 10 100       34 if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
3506            
3507 1         6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation
3508             } else {
3509            
3510             }
3511 10         30 $self->{ct}->{pubid} = ''; # DOCTYPE
3512 10         18 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3513            
3514 10 50       45 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3515 10         19 $self->{line_prev} = $self->{line};
3516 10         17 $self->{column_prev} = $self->{column};
3517 10         16 $self->{column}++;
3518             $self->{nc}
3519 10         23 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3520             } else {
3521 0         0 $self->{set_nc}->($self);
3522             }
3523            
3524 10         27 redo A;
3525             } elsif ($nc == 0x0027) { # '
3526 3 100       9 if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
3527            
3528 1         5 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation
3529             } else {
3530            
3531             }
3532 3         7 $self->{ct}->{pubid} = ''; # DOCTYPE
3533 3         7 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3534            
3535 3 50       9 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3536 3         17 $self->{line_prev} = $self->{line};
3537 3         6 $self->{column_prev} = $self->{column};
3538 3         5 $self->{column}++;
3539             $self->{nc}
3540 3         9 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3541             } else {
3542 0         0 $self->{set_nc}->($self);
3543             }
3544            
3545 3         10 redo A;
3546             } elsif ($nc == 0x003E) { # >
3547 2         11 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3548            
3549 2 50       19 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3550            
3551 2         5 $self->{state} = DATA_STATE;
3552 2         18 $self->{ct}->{quirks} = 1;
3553             } else {
3554            
3555 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3556             }
3557            
3558            
3559 2 50       7 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3560 2         5 $self->{line_prev} = $self->{line};
3561 2         2 $self->{column_prev} = $self->{column};
3562 2         4 $self->{column}++;
3563             $self->{nc}
3564 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3565             } else {
3566 0         0 $self->{set_nc}->($self);
3567             }
3568            
3569 2         12 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3570 0         0 redo A;
3571             } elsif ($nc == EOF_CHAR) {
3572 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3573            
3574 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3575 0         0 $self->{state} = DATA_STATE;
3576 0         0 $self->{ct}->{quirks} = 1;
3577             } else {
3578            
3579 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3580 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3581             }
3582            
3583             ## Reconsume.
3584 0         0 return ($self->{ct}); # DOCTYPE
3585 0         0 redo A;
3586             } elsif ($self->{is_xml} and
3587             $self->{ct}->{type} == DOCTYPE_TOKEN and
3588             $nc == 0x005B) { # [
3589            
3590 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3591 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3592 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3593 0         0 $self->{in_subset} = 1;
3594            
3595 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3596 0         0 $self->{line_prev} = $self->{line};
3597 0         0 $self->{column_prev} = $self->{column};
3598 0         0 $self->{column}++;
3599             $self->{nc}
3600 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3601             } else {
3602 0         0 $self->{set_nc}->($self);
3603             }
3604            
3605 0         0 return ($self->{ct}); # DOCTYPE
3606 0         0 redo A;
3607             } else {
3608 2         12 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3609              
3610 2 50       7 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611            
3612 2         6 $self->{ct}->{quirks} = 1;
3613 2         4 $self->{state} = BOGUS_DOCTYPE_STATE;
3614             } else {
3615            
3616 0         0 $self->{state} = BOGUS_MD_STATE;
3617             }
3618              
3619            
3620 2 50       7 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3621 2         4 $self->{line_prev} = $self->{line};
3622 2         5 $self->{column_prev} = $self->{column};
3623 2         3 $self->{column}++;
3624             $self->{nc}
3625 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3626             } else {
3627 0         0 $self->{set_nc}->($self);
3628             }
3629            
3630 2         6 redo A;
3631             }
3632             } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3633 20 100       74 if ($nc == 0x0022) { # "
    50          
    50          
    50          
3634            
3635 10         20 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3636            
3637 10 100       35 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3638 6         18 $self->{line_prev} = $self->{line};
3639 6         11 $self->{column_prev} = $self->{column};
3640 6         9 $self->{column}++;
3641             $self->{nc}
3642 6         27 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3643             } else {
3644 4         11 $self->{set_nc}->($self);
3645             }
3646            
3647 10         29 redo A;
3648             } elsif ($nc == 0x003E) { # >
3649 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3650              
3651 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3652            
3653 0         0 $self->{state} = DATA_STATE;
3654 0         0 $self->{ct}->{quirks} = 1;
3655             } else {
3656            
3657 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3658             }
3659              
3660            
3661 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3662 0         0 $self->{line_prev} = $self->{line};
3663 0         0 $self->{column_prev} = $self->{column};
3664 0         0 $self->{column}++;
3665             $self->{nc}
3666 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3667             } else {
3668 0         0 $self->{set_nc}->($self);
3669             }
3670            
3671 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3672 0         0 redo A;
3673             } elsif ($nc == -1) {
3674 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3675              
3676 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3677            
3678 0         0 $self->{state} = DATA_STATE;
3679 0         0 $self->{ct}->{quirks} = 1;
3680             } else {
3681            
3682 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3683             }
3684            
3685             ## Reconsume.
3686 0         0 return ($self->{ct}); # DOCTYPE
3687 0         0 redo A;
3688             } elsif ($nc == 0x0000) {
3689 0         0 $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
3690             ## Stay in the state.
3691            
3692 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3693 0         0 $self->{line_prev} = $self->{line};
3694 0         0 $self->{column_prev} = $self->{column};
3695 0         0 $self->{column}++;
3696             $self->{nc}
3697 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3698             } else {
3699 0         0 $self->{set_nc}->($self);
3700             }
3701            
3702 0         0 redo A;
3703             } else {
3704            
3705 10         30 $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
3706             $self->{read_until}->($self->{ct}->{pubid}, qq[\x00">],
3707 10         49 length $self->{ct}->{pubid});
3708              
3709             ## Stay in the state.
3710            
3711 10 50       37 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3712 10         22 $self->{line_prev} = $self->{line};
3713 10         18 $self->{column_prev} = $self->{column};
3714 10         17 $self->{column}++;
3715             $self->{nc}
3716 10         26 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3717             } else {
3718 0         0 $self->{set_nc}->($self);
3719             }
3720            
3721 10         30 redo A;
3722             }
3723             } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3724 6 100       31 if ($nc == 0x0027) { # '
    50          
    50          
    50          
3725            
3726 3         8 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3727            
3728 3 50       18 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3729 3         7 $self->{line_prev} = $self->{line};
3730 3         6 $self->{column_prev} = $self->{column};
3731 3         5 $self->{column}++;
3732             $self->{nc}
3733 3         8 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3734             } else {
3735 0         0 $self->{set_nc}->($self);
3736             }
3737            
3738 3         8 redo A;
3739             } elsif ($nc == 0x003E) { # >
3740 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3741              
3742 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3743            
3744 0         0 $self->{state} = DATA_STATE;
3745 0         0 $self->{ct}->{quirks} = 1;
3746             } else {
3747            
3748 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3749             }
3750              
3751            
3752 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3753 0         0 $self->{line_prev} = $self->{line};
3754 0         0 $self->{column_prev} = $self->{column};
3755 0         0 $self->{column}++;
3756             $self->{nc}
3757 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3758             } else {
3759 0         0 $self->{set_nc}->($self);
3760             }
3761            
3762 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3763 0         0 redo A;
3764             } elsif ($nc == -1) {
3765 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3766              
3767 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3768            
3769 0         0 $self->{state} = DATA_STATE;
3770 0         0 $self->{ct}->{quirks} = 1;
3771             } else {
3772            
3773 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3774             }
3775            
3776             ## reconsume
3777 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3778 0         0 redo A;
3779             } elsif ($nc == 0x0000) {
3780 0         0 $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
3781             ## Stay in the state.
3782            
3783 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3784 0         0 $self->{line_prev} = $self->{line};
3785 0         0 $self->{column_prev} = $self->{column};
3786 0         0 $self->{column}++;
3787             $self->{nc}
3788 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3789             } else {
3790 0         0 $self->{set_nc}->($self);
3791             }
3792            
3793 0         0 redo A;
3794             } else {
3795            
3796 3         11 $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
3797             $self->{read_until}->($self->{ct}->{pubid}, qq[\x00'>],
3798 3         15 length $self->{ct}->{pubid});
3799              
3800             ## Stay in the state
3801            
3802 3 50       12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3803 3         7 $self->{line_prev} = $self->{line};
3804 3         5 $self->{column_prev} = $self->{column};
3805 3         7 $self->{column}++;
3806             $self->{nc}
3807 3         8 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3808             } else {
3809 0         0 $self->{set_nc}->($self);
3810             }
3811            
3812 3         11 redo A;
3813             }
3814             } elsif ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE or
3815             $state == BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE) {
3816 26 100 33     131 if ($is_space->{$nc}) {
    100 33        
    100          
    100          
    50          
    50          
3817            
3818             ## Stay in or switch to the state.
3819 13         22 $self->{state} = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE;
3820            
3821 13 100       30 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3822 9         15 $self->{line_prev} = $self->{line};
3823 9         16 $self->{column_prev} = $self->{column};
3824 9         12 $self->{column}++;
3825             $self->{nc}
3826 9         19 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3827             } else {
3828 4         12 $self->{set_nc}->($self);
3829             }
3830            
3831 13         43 redo A;
3832             } elsif ($nc == 0x0022) { # "
3833 5 100       12 if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3834            
3835 1         7 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3836             } else {
3837            
3838             }
3839 5         12 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3840 5         10 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3841            
3842 5 50       12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3843 5         11 $self->{line_prev} = $self->{line};
3844 5         7 $self->{column_prev} = $self->{column};
3845 5         10 $self->{column}++;
3846             $self->{nc}
3847 5         14 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3848             } else {
3849 0         0 $self->{set_nc}->($self);
3850             }
3851            
3852 5         12 redo A;
3853             } elsif ($nc == 0x0027) { # '
3854 3 50       7 if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3855            
3856 3         11 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3857             } else {
3858            
3859             }
3860 3         7 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3861 3         6 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3862            
3863 3 50       10 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3864 3         6 $self->{line_prev} = $self->{line};
3865 3         7 $self->{column_prev} = $self->{column};
3866 3         7 $self->{column}++;
3867             $self->{nc}
3868 3         10 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3869             } else {
3870 0         0 $self->{set_nc}->($self);
3871             }
3872            
3873 3         9 redo A;
3874             } elsif ($nc == 0x003E) { # >
3875 3 50       23 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3876 3 50       12 if ($self->{is_xml}) {
3877            
3878 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3879             } else {
3880            
3881             }
3882 3         8 $self->{state} = DATA_STATE;
3883             } else {
3884 0 0       0 if ($self->{ct}->{type} == NOTATION_TOKEN) {
3885            
3886             } else {
3887            
3888 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3889             }
3890 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3891             }
3892            
3893            
3894 3 100       44 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3895 2         5 $self->{line_prev} = $self->{line};
3896 2         5 $self->{column_prev} = $self->{column};
3897 2         4 $self->{column}++;
3898             $self->{nc}
3899 2         8 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3900             } else {
3901 1         4 $self->{set_nc}->($self);
3902             }
3903            
3904 3         19 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3905 0         0 redo A;
3906             } elsif ($nc == EOF_CHAR) {
3907 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3908            
3909 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3910            
3911 0         0 $self->{state} = DATA_STATE;
3912 0         0 $self->{ct}->{quirks} = 1;
3913             } else {
3914 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3915 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3916             }
3917              
3918             ## Reconsume.
3919 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3920 0         0 redo A;
3921             } elsif ($self->{is_xml} and
3922             $self->{ct}->{type} == DOCTYPE_TOKEN and
3923             $nc == 0x005B) { # [
3924            
3925 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3926 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3927 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3928 0         0 $self->{in_subset} = 1;
3929            
3930 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3931 0         0 $self->{line_prev} = $self->{line};
3932 0         0 $self->{column_prev} = $self->{column};
3933 0         0 $self->{column}++;
3934             $self->{nc}
3935 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3936             } else {
3937 0         0 $self->{set_nc}->($self);
3938             }
3939            
3940 0         0 return ($self->{ct}); # DOCTYPE
3941 0         0 redo A;
3942             } else {
3943 2         11 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3944              
3945 2 50       6 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3946            
3947 2         6 $self->{ct}->{quirks} = 1;
3948 2         4 $self->{state} = BOGUS_DOCTYPE_STATE;
3949             } else {
3950            
3951 0         0 $self->{state} = BOGUS_MD_STATE;
3952             }
3953              
3954            
3955 2 50       7 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3956 2         5 $self->{line_prev} = $self->{line};
3957 2         4 $self->{column_prev} = $self->{column};
3958 2         4 $self->{column}++;
3959             $self->{nc}
3960 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3961             } else {
3962 0         0 $self->{set_nc}->($self);
3963             }
3964            
3965 2         6 redo A;
3966             }
3967             } elsif ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE or
3968             $state == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3969 22 100 33     78 if ($is_space->{$nc}) {
    100 33        
    100          
    100          
    50          
    50          
3970            
3971             ## Stay in or switch to the state.
3972 12         24 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3973            
3974 12 50       32 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3975 12         26 $self->{line_prev} = $self->{line};
3976 12         19 $self->{column_prev} = $self->{column};
3977 12         31 $self->{column}++;
3978             $self->{nc}
3979 12         32 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3980             } else {
3981 0         0 $self->{set_nc}->($self);
3982             }
3983            
3984 12         29 redo A;
3985             } elsif ($nc == 0x0022) { # "
3986 4 50       13 if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
3987            
3988 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3989             } else {
3990            
3991             }
3992 4         13 $self->{ct}->{sysid} = ''; # DOCTYPE
3993 4         11 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3994            
3995 4 50       13 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3996 4         18 $self->{line_prev} = $self->{line};
3997 4         11 $self->{column_prev} = $self->{column};
3998 4         18 $self->{column}++;
3999             $self->{nc}
4000 4         24 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4001             } else {
4002 0         0 $self->{set_nc}->($self);
4003             }
4004            
4005 4         15 redo A;
4006             } elsif ($nc == 0x0027) { # '
4007 1 50       5 if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
4008            
4009 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
4010             } else {
4011            
4012             }
4013 1         4 $self->{ct}->{sysid} = ''; # DOCTYPE
4014 1         3 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4015            
4016 1 50       14 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4017 1         5 $self->{line_prev} = $self->{line};
4018 1         4 $self->{column_prev} = $self->{column};
4019 1         2 $self->{column}++;
4020             $self->{nc}
4021 1         4 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4022             } else {
4023 0         0 $self->{set_nc}->($self);
4024             }
4025            
4026 1         4 redo A;
4027             } elsif ($nc == 0x003E) { # >
4028 2         12 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4029            
4030 2 50       15 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4031 2         7 $self->{line_prev} = $self->{line};
4032 2         3 $self->{column_prev} = $self->{column};
4033 2         5 $self->{column}++;
4034             $self->{nc}
4035 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4036             } else {
4037 0         0 $self->{set_nc}->($self);
4038             }
4039            
4040              
4041 2 50       7 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4042            
4043 2         4 $self->{state} = DATA_STATE;
4044 2         5 $self->{ct}->{quirks} = 1;
4045             } else {
4046            
4047 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4048             }
4049              
4050 2         11 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4051 0         0 redo A;
4052             } elsif ($nc == EOF_CHAR) {
4053 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4054            
4055 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4056 0         0 $self->{state} = DATA_STATE;
4057 0         0 $self->{ct}->{quirks} = 1;
4058             } else {
4059            
4060 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4061 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4062             }
4063            
4064             ## Reconsume.
4065 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4066 0         0 redo A;
4067             } elsif ($self->{is_xml} and
4068             $self->{ct}->{type} == DOCTYPE_TOKEN and
4069             $nc == 0x005B) { # [
4070            
4071 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4072              
4073 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4074 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4075 0         0 $self->{in_subset} = 1;
4076            
4077 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078 0         0 $self->{line_prev} = $self->{line};
4079 0         0 $self->{column_prev} = $self->{column};
4080 0         0 $self->{column}++;
4081             $self->{nc}
4082 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083             } else {
4084 0         0 $self->{set_nc}->($self);
4085             }
4086            
4087 0         0 return ($self->{ct}); # DOCTYPE
4088 0         0 redo A;
4089             } else {
4090 3         15 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4091              
4092 3 50       9 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4093            
4094 3         7 $self->{ct}->{quirks} = 1;
4095 3         6 $self->{state} = BOGUS_DOCTYPE_STATE;
4096             } else {
4097            
4098 0         0 $self->{state} = BOGUS_MD_STATE;
4099             }
4100              
4101            
4102 3 50       8 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103 3         4 $self->{line_prev} = $self->{line};
4104 3         7 $self->{column_prev} = $self->{column};
4105 3         4 $self->{column}++;
4106             $self->{nc}
4107 3         7 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108             } else {
4109 0         0 $self->{set_nc}->($self);
4110             }
4111            
4112 3         8 redo A;
4113             }
4114             } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4115 18 100 33     78 if ($nc == 0x0022) { # "
    50          
    50          
    50          
4116            
4117 9         20 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4118            
4119 9 50       37 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4120 9         23 $self->{line_prev} = $self->{line};
4121 9         15 $self->{column_prev} = $self->{column};
4122 9         17 $self->{column}++;
4123             $self->{nc}
4124 9         20 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4125             } else {
4126 0         0 $self->{set_nc}->($self);
4127             }
4128            
4129 9         22 redo A;
4130             } elsif (not $self->{is_xml} and $nc == 0x003E) { # >
4131 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4132              
4133 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4134            
4135 0         0 $self->{state} = DATA_STATE;
4136 0         0 $self->{ct}->{quirks} = 1;
4137             } else {
4138            
4139 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4140             }
4141            
4142            
4143 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4144 0         0 $self->{line_prev} = $self->{line};
4145 0         0 $self->{column_prev} = $self->{column};
4146 0         0 $self->{column}++;
4147             $self->{nc}
4148 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4149             } else {
4150 0         0 $self->{set_nc}->($self);
4151             }
4152            
4153 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4154 0         0 redo A;
4155             } elsif ($nc == -1) {
4156 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4157              
4158 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4159            
4160 0         0 $self->{state} = DATA_STATE;
4161 0         0 $self->{ct}->{quirks} = 1;
4162             } else {
4163            
4164 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4165             }
4166            
4167             ## reconsume
4168 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4169 0         0 redo A;
4170             } elsif ($nc == 0x0000) {
4171 0         0 $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
4172             ## Stay in the state.
4173            
4174 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4175 0         0 $self->{line_prev} = $self->{line};
4176 0         0 $self->{column_prev} = $self->{column};
4177 0         0 $self->{column}++;
4178             $self->{nc}
4179 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4180             } else {
4181 0         0 $self->{set_nc}->($self);
4182             }
4183            
4184 0         0 redo A;
4185             } else {
4186            
4187 9         28 $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
4188             $self->{read_until}->($self->{ct}->{sysid}, qq[\x00">],
4189 9         55 length $self->{ct}->{sysid});
4190              
4191             ## Stay in the state
4192            
4193 9 50       29 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4194 9         18 $self->{line_prev} = $self->{line};
4195 9         19 $self->{column_prev} = $self->{column};
4196 9         17 $self->{column}++;
4197             $self->{nc}
4198 9         23 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4199             } else {
4200 0         0 $self->{set_nc}->($self);
4201             }
4202            
4203 9         26 redo A;
4204             }
4205             } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4206 8 100 33     40 if ($nc == 0x0027) { # '
    50          
    50          
    50          
4207            
4208 4         10 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4209            
4210 4 50       12 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4211 4         11 $self->{line_prev} = $self->{line};
4212 4         7 $self->{column_prev} = $self->{column};
4213 4         7 $self->{column}++;
4214             $self->{nc}
4215 4         10 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4216             } else {
4217 0         0 $self->{set_nc}->($self);
4218             }
4219            
4220 4         20 redo A;
4221             } elsif (not $self->{is_xml} and $nc == 0x003E) { # >
4222            
4223 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4224              
4225 0         0 $self->{state} = DATA_STATE;
4226            
4227 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4228 0         0 $self->{line_prev} = $self->{line};
4229 0         0 $self->{column_prev} = $self->{column};
4230 0         0 $self->{column}++;
4231             $self->{nc}
4232 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4233             } else {
4234 0         0 $self->{set_nc}->($self);
4235             }
4236            
4237              
4238 0         0 $self->{ct}->{quirks} = 1;
4239 0         0 return ($self->{ct}); # DOCTYPE
4240              
4241 0         0 redo A;
4242             } elsif ($nc == -1) {
4243 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4244              
4245 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4246            
4247 0         0 $self->{state} = DATA_STATE;
4248 0         0 $self->{ct}->{quirks} = 1;
4249             } else {
4250            
4251 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4252             }
4253              
4254             ## reconsume
4255 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4256 0         0 redo A;
4257             } elsif ($nc == 0x0000) {
4258 0         0 $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
4259             ## Stay in the state.
4260            
4261 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4262 0         0 $self->{line_prev} = $self->{line};
4263 0         0 $self->{column_prev} = $self->{column};
4264 0         0 $self->{column}++;
4265             $self->{nc}
4266 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4267             } else {
4268 0         0 $self->{set_nc}->($self);
4269             }
4270            
4271 0         0 redo A;
4272             } else {
4273            
4274 4         16 $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
4275             $self->{read_until}->($self->{ct}->{sysid}, qq[\x00'>],
4276 4         20 length $self->{ct}->{sysid});
4277              
4278             ## Stay in the state
4279            
4280 4 50       16 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4281 4         10 $self->{line_prev} = $self->{line};
4282 4         8 $self->{column_prev} = $self->{column};
4283 4         5 $self->{column}++;
4284             $self->{nc}
4285 4         11 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4286             } else {
4287 0         0 $self->{set_nc}->($self);
4288             }
4289            
4290 4         14 redo A;
4291             }
4292             } elsif ($state == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4293 13 50 0     44 if ($is_space->{$nc}) {
    50 0        
    0 0        
    0 0        
    0          
4294 0 0       0 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4295            
4296 0         0 $self->{state} = BEFORE_NDATA_STATE;
4297             } else {
4298            
4299             ## Stay in the state
4300             }
4301            
4302 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4303 0         0 $self->{line_prev} = $self->{line};
4304 0         0 $self->{column_prev} = $self->{column};
4305 0         0 $self->{column}++;
4306             $self->{nc}
4307 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4308             } else {
4309 0         0 $self->{set_nc}->($self);
4310             }
4311            
4312 0         0 redo A;
4313             } elsif ($nc == 0x003E) { # >
4314 13 50       43 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4315            
4316 13         26 $self->{state} = DATA_STATE;
4317             } else {
4318            
4319 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4320             }
4321              
4322            
4323 13 100       32 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324 6         25 $self->{line_prev} = $self->{line};
4325 6         11 $self->{column_prev} = $self->{column};
4326 6         10 $self->{column}++;
4327             $self->{nc}
4328 6         16 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329             } else {
4330 7         18 $self->{set_nc}->($self);
4331             }
4332            
4333 13         67 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4334 0         0 redo A;
4335             } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4336             ($nc == 0x004E or # N
4337             $nc == 0x006E)) { # n
4338            
4339 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4340 0         0 $self->{state} = NDATA_STATE;
4341 0         0 $self->{kwd} = chr $nc;
4342            
4343 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4344 0         0 $self->{line_prev} = $self->{line};
4345 0         0 $self->{column_prev} = $self->{column};
4346 0         0 $self->{column}++;
4347             $self->{nc}
4348 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4349             } else {
4350 0         0 $self->{set_nc}->($self);
4351             }
4352            
4353 0         0 redo A;
4354             } elsif ($nc == -1) {
4355 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4356            
4357 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4358 0         0 $self->{state} = DATA_STATE;
4359 0         0 $self->{ct}->{quirks} = 1;
4360             } else {
4361            
4362 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4363 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364             }
4365              
4366             ## reconsume
4367 0         0 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4368 0         0 redo A;
4369             } elsif ($self->{is_xml} and
4370             $self->{ct}->{type} == DOCTYPE_TOKEN and
4371             $nc == 0x005B) { # [
4372            
4373 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4374 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4375 0         0 $self->{in_subset} = 1;
4376            
4377 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4378 0         0 $self->{line_prev} = $self->{line};
4379 0         0 $self->{column_prev} = $self->{column};
4380 0         0 $self->{column}++;
4381             $self->{nc}
4382 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4383             } else {
4384 0         0 $self->{set_nc}->($self);
4385             }
4386            
4387 0         0 return ($self->{ct}); # DOCTYPE
4388 0         0 redo A;
4389             } else {
4390 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4391              
4392 0 0       0 if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4393            
4394             #$self->{ct}->{quirks} = 1;
4395 0         0 $self->{state} = BOGUS_DOCTYPE_STATE;
4396             } else {
4397            
4398 0         0 $self->{state} = BOGUS_MD_STATE;
4399             }
4400              
4401            
4402 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403 0         0 $self->{line_prev} = $self->{line};
4404 0         0 $self->{column_prev} = $self->{column};
4405 0         0 $self->{column}++;
4406             $self->{nc}
4407 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408             } else {
4409 0         0 $self->{set_nc}->($self);
4410             }
4411            
4412 0         0 redo A;
4413             }
4414             } elsif ($state == BEFORE_NDATA_STATE) {
4415 0 0 0     0 if ($is_space->{$nc}) {
    0          
    0          
    0          
4416            
4417             ## Stay in the state.
4418            
4419 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4420 0         0 $self->{line_prev} = $self->{line};
4421 0         0 $self->{column_prev} = $self->{column};
4422 0         0 $self->{column}++;
4423             $self->{nc}
4424 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4425             } else {
4426 0         0 $self->{set_nc}->($self);
4427             }
4428            
4429 0         0 redo A;
4430             } elsif ($nc == 0x003E) { # >
4431            
4432 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4433            
4434 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4435 0         0 $self->{line_prev} = $self->{line};
4436 0         0 $self->{column_prev} = $self->{column};
4437 0         0 $self->{column}++;
4438             $self->{nc}
4439 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4440             } else {
4441 0         0 $self->{set_nc}->($self);
4442             }
4443            
4444 0         0 return ($self->{ct}); # ENTITY
4445 0         0 redo A;
4446             } elsif ($nc == 0x004E or # N
4447             $nc == 0x006E) { # n
4448            
4449 0         0 $self->{state} = NDATA_STATE;
4450 0         0 $self->{kwd} = chr $nc;
4451            
4452 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4453 0         0 $self->{line_prev} = $self->{line};
4454 0         0 $self->{column_prev} = $self->{column};
4455 0         0 $self->{column}++;
4456             $self->{nc}
4457 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4458             } else {
4459 0         0 $self->{set_nc}->($self);
4460             }
4461            
4462 0         0 redo A;
4463             } elsif ($nc == -1) {
4464            
4465 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4466 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4467             ## reconsume
4468 0         0 return ($self->{ct}); # ENTITY
4469 0         0 redo A;
4470             } else {
4471            
4472 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4473 0         0 $self->{state} = BOGUS_MD_STATE;
4474            
4475 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476 0         0 $self->{line_prev} = $self->{line};
4477 0         0 $self->{column_prev} = $self->{column};
4478 0         0 $self->{column}++;
4479             $self->{nc}
4480 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481             } else {
4482 0         0 $self->{set_nc}->($self);
4483             }
4484            
4485 0         0 redo A;
4486             }
4487             } elsif ($state == BOGUS_DOCTYPE_STATE) {
4488 26 100 33     81 if ($nc == 0x003E) { # >
    50          
    50          
4489            
4490 12         21 $self->{state} = DATA_STATE;
4491            
4492 12 100       28 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493 11         20 $self->{line_prev} = $self->{line};
4494 11         19 $self->{column_prev} = $self->{column};
4495 11         17 $self->{column}++;
4496             $self->{nc}
4497 11         23 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498             } else {
4499 1         5 $self->{set_nc}->($self);
4500             }
4501            
4502              
4503 12         62 return ($self->{ct}); # DOCTYPE
4504              
4505 0         0 redo A;
4506             } elsif ($self->{is_xml} and $nc == 0x005B) { # [
4507            
4508 0         0 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509 0         0 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4510 0         0 $self->{in_subset} = 1;
4511            
4512 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4513 0         0 $self->{line_prev} = $self->{line};
4514 0         0 $self->{column_prev} = $self->{column};
4515 0         0 $self->{column}++;
4516             $self->{nc}
4517 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4518             } else {
4519 0         0 $self->{set_nc}->($self);
4520             }
4521            
4522 0         0 return ($self->{ct}); # DOCTYPE
4523 0         0 redo A;
4524             } elsif ($nc == -1) {
4525            
4526 0         0 $self->{state} = DATA_STATE;
4527             ## reconsume
4528              
4529 0         0 return ($self->{ct}); # DOCTYPE
4530              
4531 0         0 redo A;
4532             } else {
4533            
4534 14         27 my $s = '';
4535 14         49 $self->{read_until}->($s, q{>[}, 0);
4536              
4537             ## Stay in the state
4538            
4539 14 100       46 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540 12         24 $self->{line_prev} = $self->{line};
4541 12         19 $self->{column_prev} = $self->{column};
4542 12         23 $self->{column}++;
4543             $self->{nc}
4544 12         29 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545             } else {
4546 2         8 $self->{set_nc}->($self);
4547             }
4548            
4549 14         45 redo A;
4550             }
4551             } elsif ($state == CDATA_SECTION_STATE) {
4552             ## NOTE: "CDATA section state" in the spec is jointly implemented
4553             ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4554             ## and |CDATA_SECTION_MSE2_STATE|.
4555              
4556             ## XML5: "CDATA state".
4557            
4558 0 0       0 if ($nc == 0x005D) { # ]
    0          
4559            
4560 0         0 $self->{state} = CDATA_SECTION_MSE1_STATE;
4561            
4562 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4563 0         0 $self->{line_prev} = $self->{line};
4564 0         0 $self->{column_prev} = $self->{column};
4565 0         0 $self->{column}++;
4566             $self->{nc}
4567 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4568             } else {
4569 0         0 $self->{set_nc}->($self);
4570             }
4571            
4572 0         0 redo A;
4573             } elsif ($nc == -1) {
4574 0 0       0 if ($self->{is_xml}) {
4575            
4576 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4577             } else {
4578            
4579             }
4580              
4581 0         0 $self->{state} = DATA_STATE;
4582             ## Reconsume.
4583 0 0       0 if (length $self->{ct}->{data}) { # character
4584            
4585 0         0 return ($self->{ct}); # character
4586             } else {
4587            
4588             ## No token to emit. $self->{ct} is discarded.
4589             }
4590 0         0 redo A;
4591             } else {
4592            
4593 0         0 $self->{ct}->{data} .= chr $nc;
4594             $self->{read_until}->($self->{ct}->{data},
4595             qq<\x00]>,
4596 0         0 length $self->{ct}->{data});
4597             ## NOTE: NULLs are left as is (see spec's comment). However,
4598             ## a token cannot contain more than one U+0000 NULL character
4599             ## for the ease of processing in the tree constructor.
4600              
4601             ## Stay in the state.
4602            
4603 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4604 0         0 $self->{line_prev} = $self->{line};
4605 0         0 $self->{column_prev} = $self->{column};
4606 0         0 $self->{column}++;
4607             $self->{nc}
4608 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4609             } else {
4610 0         0 $self->{set_nc}->($self);
4611             }
4612            
4613 0         0 redo A;
4614             }
4615              
4616             ## ISSUE: "text tokens" in spec.
4617             } elsif ($state == CDATA_SECTION_MSE1_STATE) {
4618             ## XML5: "CDATA bracket state".
4619              
4620 0 0       0 if ($nc == 0x005D) { # ]
4621            
4622 0         0 $self->{state} = CDATA_SECTION_MSE2_STATE;
4623            
4624 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4625 0         0 $self->{line_prev} = $self->{line};
4626 0         0 $self->{column_prev} = $self->{column};
4627 0         0 $self->{column}++;
4628             $self->{nc}
4629 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4630             } else {
4631 0         0 $self->{set_nc}->($self);
4632             }
4633            
4634 0         0 redo A;
4635             } else {
4636            
4637             ## XML5: If EOF, "]" is not appended and changed to the data state.
4638 0         0 $self->{ct}->{data} .= ']';
4639 0         0 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4640             ## Reconsume.
4641 0         0 redo A;
4642             }
4643             } elsif ($state == CDATA_SECTION_MSE2_STATE) {
4644             ## XML5: "CDATA end state".
4645              
4646 0 0       0 if ($nc == 0x003E) { # >
    0          
4647 0         0 $self->{state} = DATA_STATE;
4648            
4649 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650 0         0 $self->{line_prev} = $self->{line};
4651 0         0 $self->{column_prev} = $self->{column};
4652 0         0 $self->{column}++;
4653             $self->{nc}
4654 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655             } else {
4656 0         0 $self->{set_nc}->($self);
4657             }
4658            
4659 0 0       0 if (length $self->{ct}->{data}) { # character
4660            
4661 0         0 return ($self->{ct}); # character
4662             } else {
4663            
4664             ## No token to emit. $self->{ct} is discarded.
4665             }
4666 0         0 redo A;
4667             } elsif ($nc == 0x005D) { # ]
4668             # character
4669 0         0 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4670             ## Stay in the state.
4671            
4672 0 0       0 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673 0         0 $self->{line_prev} = $self->{line};
4674 0         0 $self->{column_prev} = $self->{column};
4675 0         0 $self->{column}++;
4676             $self->{nc}
4677 0         0 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678             } else {
4679 0         0 $self->{set_nc}->($self);
4680             }
4681            
4682 0         0 redo A;
4683             } else {
4684            
4685 0         0 $self->{ct}->{data} .= ']]'; # character
4686 0         0 $self->{state} = CDATA_SECTION_STATE;
4687             ## Reconsume. ## XML5: Emit.
4688 0         0 redo A;
4689             }
4690             } elsif ($state == ENTITY_STATE) {
4691 97 100 100     1241 if ($is_space->{$nc} or
    100 66        
    50 66        
      33        
      66        
4692             {
4693             0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4694              
4695             ## Following characters are added here to detect parse
4696             ## error for "=" of "&=" in an unquoted attribute value.
4697             ## Though this disagrees with the Web Applications 1.0
4698             ## spec, the resulting token sequences of both algorithms
4699             ## should be the same, as these characters cannot form a part
4700             ## of character references.
4701             0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', `
4702             0x003D => 1, # =
4703              
4704             ## As a result of the addition above, the following clause
4705             ## has no effect in fact.
4706             $self->{entity_add} => 1,
4707             }->{$nc}) {
4708 6 50       19 if ($self->{is_xml}) {
4709            
4710             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4711             line => $self->{line_prev},
4712             column => $self->{column_prev}
4713 0 0       0 + ($nc == -1 ? 1 : 0));
4714             } else {
4715            
4716             ## No error
4717             }
4718             ## Don't consume
4719             ## Return nothing.
4720             #
4721             } elsif ($nc == 0x0023) { # #
4722            
4723 64         143 $self->{state} = ENTITY_HASH_STATE;
4724 64         132 $self->{kwd} = '#';
4725            
4726 64 100       161 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4727 63         113 $self->{line_prev} = $self->{line};
4728 63         107 $self->{column_prev} = $self->{column};
4729 63         133 $self->{column}++;
4730             $self->{nc}
4731 63         224 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4732             } else {
4733 1         4 $self->{set_nc}->($self);
4734             }
4735            
4736 64         238 redo A;
4737             } elsif ($self->{is_xml} or
4738             (0x0041 <= $nc and
4739             $nc <= 0x005A) or # A..Z
4740             (0x0061 <= $nc and
4741             $nc <= 0x007A)) { # a..z
4742            
4743             #require HTML::HTML5::Parser::NamedEntityList;
4744 27         60 $self->{state} = ENTITY_NAME_STATE;
4745 27         80 $self->{kwd} = chr $nc;
4746 27         81 $self->{entity__value} = $self->{kwd};
4747 27         60 $self->{entity__match} = 0;
4748            
4749 27 50       72 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4750 27         53 $self->{line_prev} = $self->{line};
4751 27         50 $self->{column_prev} = $self->{column};
4752 27         47 $self->{column}++;
4753             $self->{nc}
4754 27         69 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4755             } else {
4756 0         0 $self->{set_nc}->($self);
4757             }
4758            
4759 27         107 redo A;
4760             } else {
4761            
4762             ## Return nothing.
4763             #
4764             }
4765              
4766             ## We implement the "consume a character reference" in a
4767             ## slightly different way from the spec's algorithm, though the
4768             ## end result should be exactly same.
4769              
4770             ## NOTE: No character is consumed by the "consume a character
4771             ## reference" algorithm. In other words, there is an "&" character
4772             ## that does not introduce a character reference, which would be
4773             ## appended to the parent element or the attribute value in later
4774             ## process of the tokenizer.
4775              
4776 6 50 33     42 if ($self->{prev_state} == DATA_STATE or
4777             $self->{prev_state} == RCDATA_STATE) {
4778            
4779 6         12 $self->{state} = $self->{prev_state};
4780             ## Reconsume.
4781             return ({type => CHARACTER_TOKEN, data => '&',
4782             line => $self->{line_prev},
4783             column => $self->{column_prev},
4784 6         41 });
4785 0         0 redo A;
4786             } else {
4787            
4788 0         0 $self->{ca}->{value} .= '&';
4789 0         0 $self->{state} = $self->{prev_state};
4790             ## Reconsume.
4791 0         0 redo A;
4792             }
4793             } elsif ($state == ENTITY_HASH_STATE) {
4794 64 100 100     169 if ($nc == 0x0078) { # x
    100          
    100          
4795            
4796 55         94 $self->{state} = HEXREF_X_STATE;
4797 55         144 $self->{kwd} .= chr $nc;
4798            
4799 55 100       129 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4800 54         86 $self->{line_prev} = $self->{line};
4801 54         92 $self->{column_prev} = $self->{column};
4802 54         79 $self->{column}++;
4803             $self->{nc}
4804 54         136 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4805             } else {
4806 1         5 $self->{set_nc}->($self);
4807             }
4808            
4809 55         139 redo A;
4810             } elsif ($nc == 0x0058) { # X
4811            
4812 3 50       17 if ($self->{is_xml}) {
4813 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4814             }
4815 3         7 $self->{state} = HEXREF_X_STATE;
4816 3         11 $self->{kwd} .= chr $nc;
4817            
4818 3 100       11 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4819 2         4 $self->{line_prev} = $self->{line};
4820 2         5 $self->{column_prev} = $self->{column};
4821 2         3 $self->{column}++;
4822             $self->{nc}
4823 2         8 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4824             } else {
4825 1         6 $self->{set_nc}->($self);
4826             }
4827            
4828 3         12 redo A;
4829             } elsif (0x0030 <= $nc and
4830             $nc <= 0x0039) { # 0..9
4831            
4832 3         8 $self->{state} = NCR_NUM_STATE;
4833 3         10 $self->{kwd} = $nc - 0x0030;
4834            
4835 3 50       10 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4836 3         7 $self->{line_prev} = $self->{line};
4837 3         6 $self->{column_prev} = $self->{column};
4838 3         6 $self->{column}++;
4839             $self->{nc}
4840 3         8 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4841             } else {
4842 0         0 $self->{set_nc}->($self);
4843             }
4844            
4845 3         8 redo A;
4846             } else {
4847             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4848             line => $self->{line_prev},
4849 3         42 column => $self->{column_prev} - 1);
4850              
4851             ## NOTE: According to the spec algorithm, nothing is returned,
4852             ## and then "&#" is appended to the parent element or the attribute
4853             ## value in the later processing.
4854              
4855 3 50 33     20 if ($self->{prev_state} == DATA_STATE or
4856             $self->{prev_state} == RCDATA_STATE) {
4857            
4858 3         7 $self->{state} = $self->{prev_state};
4859             ## Reconsume.
4860             return ({type => CHARACTER_TOKEN,
4861             data => '&#',
4862             line => $self->{line_prev},
4863 3         22 column => $self->{column_prev} - 1,
4864             });
4865 0         0 redo A;
4866             } else {
4867            
4868 0         0 $self->{ca}->{value} .= '&#';
4869 0         0 $self->{state} = $self->{prev_state};
4870             ## Reconsume.
4871 0         0 redo A;
4872             }
4873             }
4874             } elsif ($state == NCR_NUM_STATE) {
4875 6 100 100     73 if (0x0030 <= $nc and
    100          
4876             $nc <= 0x0039) { # 0..9
4877            
4878 3         11 $self->{kwd} *= 10;
4879 3         9 $self->{kwd} += $nc - 0x0030;
4880            
4881             ## Stay in the state.
4882            
4883 3 100       11 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4884 2         5 $self->{line_prev} = $self->{line};
4885 2         4 $self->{column_prev} = $self->{column};
4886 2         6 $self->{column}++;
4887             $self->{nc}
4888 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4889             } else {
4890 1         5 $self->{set_nc}->($self);
4891             }
4892            
4893 3         12 redo A;
4894             } elsif ($nc == 0x003B) { # ;
4895            
4896            
4897 1 50       91 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4898 1         6 $self->{line_prev} = $self->{line};
4899 1         3 $self->{column_prev} = $self->{column};
4900 1         2 $self->{column}++;
4901             $self->{nc}
4902 1         4 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4903             } else {
4904 0         0 $self->{set_nc}->($self);
4905             }
4906            
4907             #
4908             } else {
4909            
4910 2         11 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4911             ## Reconsume.
4912             #
4913             }
4914              
4915 3         11 my $code = $self->{kwd};
4916 3         7 my $l = $self->{line_prev};
4917 3         7 my $c = $self->{column_prev};
4918 3 50 33     40 if ((not $self->{is_xml} and $charref_map->{$code}) or
    50 33        
      33        
      33        
      33        
      33        
4919             ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4920             ($self->{is_xml} and $code == 0x0000)) {
4921            
4922 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4923             text => (sprintf 'U+%04X', $code),
4924             line => $l, column => $c);
4925 0         0 $code = $charref_map->{$code};
4926             } elsif ($code > 0x10FFFF) {
4927            
4928 0         0 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4929             text => (sprintf 'U-%08X', $code),
4930             line => $l, column => $c);
4931 0         0 $code = 0xFFFD;
4932             }
4933              
4934 3 50 33     16 if ($self->{prev_state} == DATA_STATE or
4935             $self->{prev_state} == RCDATA_STATE) {
4936            
4937 3         7 $self->{state} = $self->{prev_state};
4938             ## Reconsume.
4939 3         26 return ({type => CHARACTER_TOKEN, data => chr $code,
4940             has_reference => 1,
4941             line => $l, column => $c,
4942             });
4943 0         0 redo A;
4944             } else {
4945            
4946 0         0 $self->{ca}->{value} .= chr $code;
4947 0         0 $self->{ca}->{has_reference} = 1;
4948 0         0 $self->{state} = $self->{prev_state};
4949             ## Reconsume.
4950 0         0 redo A;
4951             }
4952             } elsif ($state == HEXREF_X_STATE) {
4953 58 100 100     299 if ((0x0030 <= $nc and $nc <= 0x0039) or
      100        
      100        
      33        
      66        
4954             (0x0041 <= $nc and $nc <= 0x0046) or
4955             (0x0061 <= $nc and $nc <= 0x0066)) {
4956             # 0..9, A..F, a..f
4957            
4958 54         101 $self->{state} = HEXREF_HEX_STATE;
4959 54         91 $self->{kwd} = 0;
4960             ## Reconsume.
4961 54         117 redo A;
4962             } else {
4963             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4964             line => $self->{line_prev},
4965 4         27 column => $self->{column_prev} - 2);
4966              
4967             ## NOTE: According to the spec algorithm, nothing is returned,
4968             ## and then "&#" followed by "X" or "x" is appended to the parent
4969             ## element or the attribute value in the later processing.
4970              
4971 4 50 33     16 if ($self->{prev_state} == DATA_STATE or
4972             $self->{prev_state} == RCDATA_STATE) {
4973            
4974 4         11 $self->{state} = $self->{prev_state};
4975             ## Reconsume.
4976             return ({type => CHARACTER_TOKEN,
4977             data => '&' . $self->{kwd},
4978             line => $self->{line_prev},
4979             column => $self->{column_prev} - length $self->{kwd},
4980 4         32 });
4981 0         0 redo A;
4982             } else {
4983            
4984 0         0 $self->{ca}->{value} .= '&' . $self->{kwd};
4985 0         0 $self->{state} = $self->{prev_state};
4986             ## Reconsume.
4987 0         0 redo A;
4988             }
4989             }
4990             } elsif ($state == HEXREF_HEX_STATE) {
4991 268 100 66     1053 if (0x0030 <= $nc and $nc <= 0x0039) {
    100 66        
    100 100        
    100          
4992             # 0..9
4993            
4994 166         270 $self->{kwd} *= 0x10;
4995 166         303 $self->{kwd} += $nc - 0x0030;
4996             ## Stay in the state.
4997            
4998 166 50       303 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4999 166         260 $self->{line_prev} = $self->{line};
5000 166         234 $self->{column_prev} = $self->{column};
5001 166         241 $self->{column}++;
5002             $self->{nc}
5003 166         310 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5004             } else {
5005 0         0 $self->{set_nc}->($self);
5006             }
5007            
5008 166         374 redo A;
5009             } elsif (0x0061 <= $nc and
5010             $nc <= 0x0066) { # a..f
5011            
5012 2         6 $self->{kwd} *= 0x10;
5013 2         5 $self->{kwd} += $nc - 0x0060 + 9;
5014             ## Stay in the state.
5015            
5016 2 50       7 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5017 2         4 $self->{line_prev} = $self->{line};
5018 2         5 $self->{column_prev} = $self->{column};
5019 2         3 $self->{column}++;
5020             $self->{nc}
5021 2         6 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5022             } else {
5023 0         0 $self->{set_nc}->($self);
5024             }
5025            
5026 2         5 redo A;
5027             } elsif (0x0041 <= $nc and
5028             $nc <= 0x0046) { # A..F
5029            
5030 46         79 $self->{kwd} *= 0x10;
5031 46         89 $self->{kwd} += $nc - 0x0040 + 9;
5032             ## Stay in the state.
5033            
5034 46 50       110 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5035 46         77 $self->{line_prev} = $self->{line};
5036 46         455 $self->{column_prev} = $self->{column};
5037 46         64 $self->{column}++;
5038             $self->{nc}
5039 46         93 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5040             } else {
5041 0         0 $self->{set_nc}->($self);
5042             }
5043            
5044 46         104 redo A;
5045             } elsif ($nc == 0x003B) { # ;
5046            
5047            
5048 51 50       116 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5049 51         82 $self->{line_prev} = $self->{line};
5050 51         85 $self->{column_prev} = $self->{column};
5051 51         75 $self->{column}++;
5052             $self->{nc}
5053 51         104 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5054             } else {
5055 0         0 $self->{set_nc}->($self);
5056             }
5057            
5058             #
5059             } else {
5060            
5061             $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5062             line => $self->{line},
5063 3         18 column => $self->{column});
5064             ## Reconsume.
5065             #
5066             }
5067              
5068 54         94 my $code = $self->{kwd};
5069 54         94 my $l = $self->{line_prev};
5070 54         100 my $c = $self->{column_prev};
5071 54 100 66     399 if ((not $self->{is_xml} and $charref_map->{$code}) or
    100 33        
      33        
      66        
      33        
      33        
5072             ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5073             ($self->{is_xml} and $code == 0x0000)) {
5074            
5075 39         375 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5076             text => (sprintf 'U+%04X', $code),
5077             line => $l, column => $c);
5078 39         103 $code = $charref_map->{$code};
5079             } elsif ($code > 0x10FFFF) {
5080            
5081 2         20 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5082             text => (sprintf 'U-%08X', $code),
5083             line => $l, column => $c);
5084 2         3 $code = 0xFFFD;
5085             }
5086              
5087 54 50 33     159 if ($self->{prev_state} == DATA_STATE or
5088             $self->{prev_state} == RCDATA_STATE) {
5089            
5090 54         130 $self->{state} = $self->{prev_state};
5091             ## Reconsume.
5092 54         402 return ({type => CHARACTER_TOKEN, data => chr $code,
5093             has_reference => 1,
5094             line => $l, column => $c,
5095             });
5096 0         0 redo A;
5097             } else {
5098            
5099 0         0 $self->{ca}->{value} .= chr $code;
5100 0         0 $self->{ca}->{has_reference} = 1;
5101 0         0 $self->{state} = $self->{prev_state};
5102             ## Reconsume.
5103 0         0 redo A;
5104             }
5105             } elsif ($state == ENTITY_NAME_STATE) {
5106 142 100 100     904 if ((0x0041 <= $nc and # A
    50 66        
      100        
      66        
      66        
      66        
      0        
      33        
      66        
5107             $nc <= 0x005A) or # Z
5108             (0x0061 <= $nc and # a
5109             $nc <= 0x007A) or # z
5110             (0x0030 <= $nc and # 0
5111             $nc <= 0x0039) or # 9
5112             $nc == 0x003B or # ;
5113             ($self->{is_xml} and
5114             not ($is_space->{$nc} or
5115             {
5116             0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5117              
5118             ## See comment in the |ENTITY_STATE|'s |if|
5119             ## statement for the rationale of addition of these
5120             ## characters.
5121             0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', `
5122             0x003D => 1, # =
5123              
5124             ## This is redundant for the same reason.
5125             $self->{entity_add} => 1,
5126             }->{$nc}))) {
5127             #local %entity2char;
5128 137         280 $self->{kwd} .= chr $nc; ## Bare entity name.
5129 137 100 66     642 if (defined $entity2char{$self->{kwd}} or ## HTML charrefs.
5130             $self->{ge}->{$self->{kwd}}) { ## XML general entities.
5131 32 100       81 if ($nc == 0x003B) { # ;
5132 20 50       60 if (defined $self->{ge}->{$self->{kwd}}) {
5133             ## A declared XML entity.
5134 0 0       0 if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5135            
5136 0         0 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5137             } else {
5138 0 0       0 if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5139            
5140             $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5141 0         0 value => $self->{kwd});
5142             } else {
5143            
5144             }
5145 0         0 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5146             }
5147             } else {
5148             ## An HTML character reference.
5149 20 50       64 if ($self->{is_xml}) {
5150             ## Not a declared XML entity.
5151            
5152             $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5153             value => $self->{kwd},
5154             level => {
5155             'amp;' => $self->{level}->{warn},
5156             'quot;' => $self->{level}->{warn},
5157             'lt;' => $self->{level}->{warn},
5158             'gt;' => $self->{level}->{warn},
5159             'apos;' => $self->{level}->{warn},
5160             }->{$self->{kwd}} ||
5161             $self->{level}->{must},
5162             line => $self->{line_prev},
5163 0   0     0 column => $self->{column} - length $self->{kwd});
5164             } else {
5165            
5166             }
5167 20         57 $self->{entity__value} = $entity2char{$self->{kwd}};
5168             }
5169 20         36 $self->{entity__match} = 1; ## Matched exactly with ";" entity.
5170            
5171 20 100       55 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5172 10         22 $self->{line_prev} = $self->{line};
5173 10         21 $self->{column_prev} = $self->{column};
5174 10         15 $self->{column}++;
5175             $self->{nc}
5176 10         28 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5177             } else {
5178 10         40 $self->{set_nc}->($self);
5179             }
5180            
5181             #
5182             } else {
5183            
5184 12         35 $self->{entity__value} = $entity2char{$self->{kwd}};
5185 12         42 $self->{entity__match} = -1; ## Exactly matched to non-";" entity.
5186             ## Stay in the state.
5187            
5188 12 100       38 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5189 11         22 $self->{line_prev} = $self->{line};
5190 11         25 $self->{column_prev} = $self->{column};
5191 11         31 $self->{column}++;
5192             $self->{nc}
5193 11         34 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5194             } else {
5195 1         6 $self->{set_nc}->($self);
5196             }
5197            
5198 12         41 redo A;
5199             }
5200             } else {
5201 105 100       201 if ($nc == 0x003B) { # ;
5202             ## A reserved HTML character reference or an undeclared
5203             ## XML entity reference.
5204            
5205             $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## XXXtype
5206             value => $self->{kwd},
5207             level => $self->{level}->{must},
5208             line => $self->{line_prev},
5209 2         22 column => $self->{column} - length $self->{kwd});
5210 2         11 $self->{entity__value} .= chr $nc;
5211 2         5 $self->{entity__match} *= 2; ## Matched (positive) or not (zero)
5212            
5213 2 100       9 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5214 1         3 $self->{line_prev} = $self->{line};
5215 1         3 $self->{column_prev} = $self->{column};
5216 1         3 $self->{column}++;
5217             $self->{nc}
5218 1         4 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5219             } else {
5220 1         4 $self->{set_nc}->($self);
5221             }
5222            
5223             #
5224             } else {
5225            
5226 103         162 $self->{entity__value} .= chr $nc;
5227 103         165 $self->{entity__match} *= 2; ## Matched (positive) or not (zero)
5228             ## Stay in the state.
5229            
5230 103 100       201 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5231 102         163 $self->{line_prev} = $self->{line};
5232 102         149 $self->{column_prev} = $self->{column};
5233 102         148 $self->{column}++;
5234             $self->{nc}
5235 102         191 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5236             } else {
5237 1         5 $self->{set_nc}->($self);
5238             }
5239            
5240 103         252 redo A;
5241             }
5242             }
5243             } elsif ($nc == 0x003D) { # =
5244 0 0 0     0 if ($self->{entity__match} < 0 and
      0        
5245             $self->{prev_state} != DATA_STATE and # in attribute
5246             $self->{prev_state} != RCDATA_STATE) {
5247 0         0 $self->{entity__match} = 0;
5248             }
5249             }
5250              
5251 27         61 my $data;
5252             my $has_ref;
5253 27 100       88 if ($self->{entity__match} > 0) { ## A ";" entity.
    100          
5254            
5255 20         47 $data = $self->{entity__value};
5256             ## Strictly speaking the $has_ref flag should not be set if
5257             ## there is no matched entity. However, this flag is used
5258             ## only in contexts where use of an
5259             ## unexpanded-entity-reference-like string is in no way
5260             ## allowed, so it should not make any difference in theory.
5261 20         39 $has_ref = 1;
5262             #
5263             } elsif ($self->{entity__match} < 0) { ## Matched to non-";" entity.
5264 5 50 66     27 if ($self->{prev_state} != DATA_STATE and # in attribute
      66        
5265             $self->{prev_state} != RCDATA_STATE and
5266             $self->{entity__match} < -1) {
5267             ## In attribute-value contexts, a matched non-";" string is
5268             ## left as is if there are trailing alphabetical letters.
5269            
5270 0         0 $data = '&' . $self->{kwd};
5271             #
5272             } else {
5273             ## In attribute-value contexts, an exactly matched non-";"
5274             ## string is replaced by its character reference. In the
5275             ## other (data/RCDATA) contexts, a matched non-";" string,
5276             ## with or without trailing alphabetical letters, is replaced
5277             ## by its character reference (trailing letters are kept).
5278             ## Note that use of a no-";" character reference is always non-conforming.
5279            
5280 5         23 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5281 5         11 $data = $self->{entity__value};
5282 5         20 $has_ref = 1;
5283             #
5284             }
5285             } else { ## Unmatched string.
5286 2 50 33     11 if ($self->{is_xml} and not $self->{kwd} =~ /;$/) {
5287            
5288             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5289             line => $self->{line_prev},
5290 0         0 column => $self->{column_prev} - length $self->{kwd});
5291             } else {
5292            
5293             }
5294 2         5 $data = '&' . $self->{kwd};
5295             #
5296             }
5297            
5298             ## NOTE: In these cases, when a character reference is found,
5299             ## it is consumed and a character token is returned, or, otherwise,
5300             ## nothing is consumed and returned, according to the spec algorithm.
5301             ## In this implementation, anything that has been examined by the
5302             ## tokenizer is appended to the parent element or the attribute value
5303             ## as string, either literal string when no character reference or
5304             ## entity-replaced string otherwise, in this stage, since any characters
5305             ## that would not be consumed are appended in the data state or in an
5306             ## appropriate attribute value state anyway.
5307            
5308 27 100 66     110 if ($self->{prev_state} == DATA_STATE or
5309             $self->{prev_state} == RCDATA_STATE) {
5310            
5311 26         57 $self->{state} = $self->{prev_state};
5312             ## Reconsume.
5313             return ({type => CHARACTER_TOKEN,
5314             data => $data,
5315             has_reference => $has_ref,
5316             line => $self->{line_prev},
5317             column => $self->{column_prev} + 1 - length $self->{kwd},
5318 26         217 });
5319 0         0 redo A;
5320             } else {
5321            
5322 1         4 $self->{ca}->{value} .= $data;
5323 1 50       6 $self->{ca}->{has_reference} = 1 if $has_ref;
5324 1         3 $self->{state} = $self->{prev_state};
5325             ## Reconsume.
5326 1         5 redo A;
5327             }
5328              
5329             ## ========== XML-only states ==========
5330              
5331             } elsif ($state == PI_STATE) {
5332             ## XML5: "Pi state" and "DOCTYPE pi state".
5333              
5334 0 0 0       if ($is_space->{$nc} or
      0        
5335             $nc == 0x003F or # ?
5336             $nc == -1) {
5337             ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5338             ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5339             ## "DOCTYPE pi state": Parse error, switch to the "data
5340             ## state".
5341             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5342             line => $self->{line_prev},
5343             column => $self->{column_prev}
5344 0           - 1 * ($nc != -1));
5345 0           $self->{state} = BOGUS_COMMENT_STATE;
5346             ## Reconsume.
5347             $self->{ct} = {type => COMMENT_TOKEN,
5348             data => '?',
5349             line => $self->{line_prev},
5350             column => $self->{column_prev}
5351 0           - 1 * ($nc != -1),
5352             };
5353 0           redo A;
5354             } else {
5355             ## XML5: "DOCTYPE pi state": Stay in the state.
5356 0 0         if ($nc == 0x0000) {
5357 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
5358             }
5359             $self->{ct} = {type => PI_TOKEN,
5360             target => $nc == 0x0000 ? "\x{FFFD}" : chr $nc,
5361             data => '',
5362             line => $self->{line_prev},
5363 0 0         column => $self->{column_prev} - 1,
5364             };
5365 0           $self->{state} = PI_TARGET_STATE;
5366            
5367 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5368 0           $self->{line_prev} = $self->{line};
5369 0           $self->{column_prev} = $self->{column};
5370 0           $self->{column}++;
5371             $self->{nc}
5372 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5373             } else {
5374 0           $self->{set_nc}->($self);
5375             }
5376            
5377 0           redo A;
5378             }
5379             } elsif ($state == PI_TARGET_STATE) {
5380 0 0         if ($is_space->{$nc}) {
    0          
    0          
5381 0           $self->{state} = PI_TARGET_AFTER_STATE;
5382 0           $self->{kwd} = chr $nc; # "temporary buffer"
5383            
5384 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5385 0           $self->{line_prev} = $self->{line};
5386 0           $self->{column_prev} = $self->{column};
5387 0           $self->{column}++;
5388             $self->{nc}
5389 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5390             } else {
5391 0           $self->{set_nc}->($self);
5392             }
5393            
5394 0           redo A;
5395             } elsif ($nc == EOF_CHAR) {
5396 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5397 0 0         if ($self->{in_subset}) {
5398 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5399             } else {
5400 0           $self->{state} = DATA_STATE;
5401             }
5402             ## Reconsume.
5403             return ({type => COMMENT_TOKEN,
5404             data => '?' . $self->{ct}->{target},
5405             line => $self->{ct}->{line},
5406 0           column => $self->{ct}->{column}});
5407 0           redo A;
5408             } elsif ($nc == 0x003F) { # ?
5409 0           $self->{state} = PI_AFTER_STATE;
5410 0           $self->{kwd} = ''; # "temporary buffer"
5411            
5412 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5413 0           $self->{line_prev} = $self->{line};
5414 0           $self->{column_prev} = $self->{column};
5415 0           $self->{column}++;
5416             $self->{nc}
5417 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5418             } else {
5419 0           $self->{set_nc}->($self);
5420             }
5421            
5422 0           redo A;
5423             } else {
5424             ## XML5: typo ("tag name" -> "target")
5425 0 0         if ($nc == 0x0000) {
5426 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
5427             }
5428 0 0         $self->{ct}->{target} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi
5429            
5430 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5431 0           $self->{line_prev} = $self->{line};
5432 0           $self->{column_prev} = $self->{column};
5433 0           $self->{column}++;
5434             $self->{nc}
5435 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5436             } else {
5437 0           $self->{set_nc}->($self);
5438             }
5439            
5440 0           redo A;
5441             }
5442             } elsif ($state == PI_TARGET_AFTER_STATE) {
5443 0 0         if ($is_space->{$nc}) {
5444 0           $self->{kwd} .= chr $nc; # "temporary buffer"
5445             ## Stay in the state.
5446            
5447 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5448 0           $self->{line_prev} = $self->{line};
5449 0           $self->{column_prev} = $self->{column};
5450 0           $self->{column}++;
5451             $self->{nc}
5452 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5453             } else {
5454 0           $self->{set_nc}->($self);
5455             }
5456            
5457 0           redo A;
5458             } else {
5459 0           $self->{state} = PI_DATA_STATE;
5460             ## Reprocess.
5461 0           redo A;
5462             }
5463             } elsif ($state == PI_DATA_STATE) {
5464 0 0         if ($nc == 0x003F) { # ?
    0          
5465 0           $self->{state} = PI_DATA_AFTER_STATE;
5466            
5467 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5468 0           $self->{line_prev} = $self->{line};
5469 0           $self->{column_prev} = $self->{column};
5470 0           $self->{column}++;
5471             $self->{nc}
5472 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5473             } else {
5474 0           $self->{set_nc}->($self);
5475             }
5476            
5477 0           redo A;
5478             } elsif ($nc == EOF_CHAR) {
5479 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5480 0 0         if ($self->{in_subset}) {
5481 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5482             } else {
5483 0           $self->{state} = DATA_STATE;
5484             }
5485             ## Reprocess.
5486             return ({type => COMMENT_TOKEN,
5487             data => '?' . $self->{ct}->{target} .
5488             $self->{kwd} . # "temporary buffer"
5489             $self->{ct}->{data},
5490             line => $self->{ct}->{line},
5491 0           column => $self->{ct}->{column}});
5492 0           redo A;
5493             } else {
5494 0 0         if ($nc == 0x0000) {
5495 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
5496             }
5497 0 0         $self->{ct}->{data} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi
5498             $self->{read_until}->($self->{ct}->{data}, qq[\x00?],
5499 0           length $self->{ct}->{data});
5500             ## Stay in the state.
5501            
5502 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503 0           $self->{line_prev} = $self->{line};
5504 0           $self->{column_prev} = $self->{column};
5505 0           $self->{column}++;
5506             $self->{nc}
5507 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508             } else {
5509 0           $self->{set_nc}->($self);
5510             }
5511            
5512             ## Reprocess.
5513 0           redo A;
5514             }
5515             } elsif ($state == PI_AFTER_STATE) {
5516             ## XML5: Part of "Pi after state".
5517              
5518 0 0         if ($nc == 0x003E) { # >
    0          
5519 0 0         if ($self->{in_subset}) {
5520 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5521             } else {
5522 0           $self->{state} = DATA_STATE;
5523             }
5524            
5525 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5526 0           $self->{line_prev} = $self->{line};
5527 0           $self->{column_prev} = $self->{column};
5528 0           $self->{column}++;
5529             $self->{nc}
5530 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5531             } else {
5532 0           $self->{set_nc}->($self);
5533             }
5534            
5535 0           return ($self->{ct}); # pi
5536 0           redo A;
5537             } elsif ($nc == 0x003F) { # ?
5538             $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5539             line => $self->{line_prev},
5540 0           column => $self->{column_prev}); ## XML5: no error
5541 0           $self->{ct}->{data} .= '?';
5542 0           $self->{state} = PI_DATA_AFTER_STATE;
5543            
5544 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5545 0           $self->{line_prev} = $self->{line};
5546 0           $self->{column_prev} = $self->{column};
5547 0           $self->{column}++;
5548             $self->{nc}
5549 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5550             } else {
5551 0           $self->{set_nc}->($self);
5552             }
5553            
5554 0           redo A;
5555             } else {
5556             $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5557             line => $self->{line_prev},
5558             column => $self->{column_prev}
5559 0           + 1 * ($nc == -1)); ## XML5: no error
5560 0           $self->{ct}->{data} .= '?'; ## XML5: not appended
5561 0           $self->{state} = PI_DATA_STATE;
5562             ## Reprocess.
5563 0           redo A;
5564             }
5565             } elsif ($state == PI_DATA_AFTER_STATE) {
5566             ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5567              
5568 0 0         if ($nc == 0x003E) { # >
    0          
5569 0 0         if ($self->{in_subset}) {
5570 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5571             } else {
5572 0           $self->{state} = DATA_STATE;
5573             }
5574            
5575 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5576 0           $self->{line_prev} = $self->{line};
5577 0           $self->{column_prev} = $self->{column};
5578 0           $self->{column}++;
5579             $self->{nc}
5580 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5581             } else {
5582 0           $self->{set_nc}->($self);
5583             }
5584            
5585 0           return ($self->{ct}); # pi
5586 0           redo A;
5587             } elsif ($nc == 0x003F) { # ?
5588 0           $self->{ct}->{data} .= '?';
5589             ## Stay in the state.
5590            
5591 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5592 0           $self->{line_prev} = $self->{line};
5593 0           $self->{column_prev} = $self->{column};
5594 0           $self->{column}++;
5595             $self->{nc}
5596 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5597             } else {
5598 0           $self->{set_nc}->($self);
5599             }
5600            
5601 0           redo A;
5602             } else {
5603 0           $self->{ct}->{data} .= '?'; ## XML5: not appended
5604 0           $self->{state} = PI_DATA_STATE;
5605             ## Reprocess.
5606 0           redo A;
5607             }
5608              
5609             } elsif ($state == DOCTYPE_INTERNAL_SUBSET_STATE) {
5610 0 0         if ($nc == 0x003C) { # <
    0          
    0          
    0          
    0          
5611 0           $self->{state} = DOCTYPE_TAG_STATE;
5612            
5613 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5614 0           $self->{line_prev} = $self->{line};
5615 0           $self->{column_prev} = $self->{column};
5616 0           $self->{column}++;
5617             $self->{nc}
5618 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5619             } else {
5620 0           $self->{set_nc}->($self);
5621             }
5622            
5623 0           redo A;
5624             } elsif ($nc == 0x0025) { # %
5625             ## XML5: Not defined yet.
5626              
5627             ## TODO: parameter entity expansion
5628              
5629 0 0 0       if (not $self->{stop_processing} and
5630             not $self->{document}->xml_standalone) {
5631             $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5632 0           level => $self->{level}->{info});
5633 0           $self->{stop_processing} = 1;
5634             }
5635              
5636            
5637 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5638 0           $self->{line_prev} = $self->{line};
5639 0           $self->{column_prev} = $self->{column};
5640 0           $self->{column}++;
5641             $self->{nc}
5642 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5643             } else {
5644 0           $self->{set_nc}->($self);
5645             }
5646            
5647 0           redo A;
5648             } elsif ($nc == 0x005D) { # ]
5649 0           delete $self->{in_subset};
5650 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5651            
5652 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5653 0           $self->{line_prev} = $self->{line};
5654 0           $self->{column_prev} = $self->{column};
5655 0           $self->{column}++;
5656             $self->{nc}
5657 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5658             } else {
5659 0           $self->{set_nc}->($self);
5660             }
5661            
5662 0           redo A;
5663             } elsif ($is_space->{$nc}) {
5664             ## Stay in the state.
5665            
5666 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667 0           $self->{line_prev} = $self->{line};
5668 0           $self->{column_prev} = $self->{column};
5669 0           $self->{column}++;
5670             $self->{nc}
5671 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672             } else {
5673 0           $self->{set_nc}->($self);
5674             }
5675            
5676 0           redo A;
5677             } elsif ($nc == EOF_CHAR) {
5678 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5679 0           delete $self->{in_subset};
5680 0           $self->{state} = DATA_STATE;
5681             ## Reconsume.
5682 0           return ({type => END_OF_DOCTYPE_TOKEN});
5683 0           redo A;
5684             } else {
5685 0 0         unless ($self->{internal_subset_tainted}) {
5686             ## XML5: No parse error.
5687 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5688 0           $self->{internal_subset_tainted} = 1;
5689             }
5690             ## Stay in the state.
5691            
5692 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5693 0           $self->{line_prev} = $self->{line};
5694 0           $self->{column_prev} = $self->{column};
5695 0           $self->{column}++;
5696             $self->{nc}
5697 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5698             } else {
5699 0           $self->{set_nc}->($self);
5700             }
5701            
5702 0           redo A;
5703             }
5704             } elsif ($state == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5705 0 0         if ($nc == 0x003E) { # >
    0          
5706 0           $self->{state} = DATA_STATE;
5707            
5708 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5709 0           $self->{line_prev} = $self->{line};
5710 0           $self->{column_prev} = $self->{column};
5711 0           $self->{column}++;
5712             $self->{nc}
5713 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5714             } else {
5715 0           $self->{set_nc}->($self);
5716             }
5717            
5718 0           return ({type => END_OF_DOCTYPE_TOKEN});
5719 0           redo A;
5720             } elsif ($nc == EOF_CHAR) {
5721 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5722 0           $self->{state} = DATA_STATE;
5723             ## Reconsume.
5724 0           return ({type => END_OF_DOCTYPE_TOKEN});
5725 0           redo A;
5726             } else {
5727             ## XML5: No parse error and stay in the state.
5728 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5729              
5730 0           $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5731            
5732 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5733 0           $self->{line_prev} = $self->{line};
5734 0           $self->{column_prev} = $self->{column};
5735 0           $self->{column}++;
5736             $self->{nc}
5737 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5738             } else {
5739 0           $self->{set_nc}->($self);
5740             }
5741            
5742 0           redo A;
5743             }
5744             } elsif ($state == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5745 0 0         if ($nc == 0x003E) { # >
    0          
5746 0           $self->{state} = DATA_STATE;
5747            
5748 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749 0           $self->{line_prev} = $self->{line};
5750 0           $self->{column_prev} = $self->{column};
5751 0           $self->{column}++;
5752             $self->{nc}
5753 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754             } else {
5755 0           $self->{set_nc}->($self);
5756             }
5757            
5758 0           return ({type => END_OF_DOCTYPE_TOKEN});
5759 0           redo A;
5760             } elsif ($nc == EOF_CHAR) {
5761 0           $self->{state} = DATA_STATE;
5762             ## Reconsume.
5763 0           return ({type => END_OF_DOCTYPE_TOKEN});
5764 0           redo A;
5765             } else {
5766             ## Stay in the state.
5767            
5768 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769 0           $self->{line_prev} = $self->{line};
5770 0           $self->{column_prev} = $self->{column};
5771 0           $self->{column}++;
5772             $self->{nc}
5773 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774             } else {
5775 0           $self->{set_nc}->($self);
5776             }
5777            
5778 0           redo A;
5779             }
5780             } elsif ($state == DOCTYPE_TAG_STATE) {
5781 0 0         if ($nc == 0x0021) { # !
    0          
    0          
5782 0           $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5783            
5784 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5785 0           $self->{line_prev} = $self->{line};
5786 0           $self->{column_prev} = $self->{column};
5787 0           $self->{column}++;
5788             $self->{nc}
5789 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5790             } else {
5791 0           $self->{set_nc}->($self);
5792             }
5793            
5794 0           redo A;
5795             } elsif ($nc == 0x003F) { # ?
5796 0           $self->{state} = PI_STATE;
5797            
5798 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5799 0           $self->{line_prev} = $self->{line};
5800 0           $self->{column_prev} = $self->{column};
5801 0           $self->{column}++;
5802             $self->{nc}
5803 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5804             } else {
5805 0           $self->{set_nc}->($self);
5806             }
5807            
5808 0           redo A;
5809             } elsif ($nc == EOF_CHAR) {
5810 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5811 0           $self->{state} = DATA_STATE;
5812             ## Reconsume.
5813 0           redo A;
5814             } else {
5815             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5816             line => $self->{line_prev},
5817 0           column => $self->{column_prev});
5818 0           $self->{state} = BOGUS_COMMENT_STATE;
5819 0           $self->{ct} = {type => COMMENT_TOKEN,
5820             data => '',
5821             }; ## NOTE: Will be discarded.
5822            
5823 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824 0           $self->{line_prev} = $self->{line};
5825 0           $self->{column_prev} = $self->{column};
5826 0           $self->{column}++;
5827             $self->{nc}
5828 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829             } else {
5830 0           $self->{set_nc}->($self);
5831             }
5832            
5833 0           redo A;
5834             }
5835             } elsif ($state == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5836             ## XML5: "DOCTYPE markup declaration state".
5837            
5838 0 0 0       if ($nc == 0x002D) { # -
    0 0        
    0 0        
    0          
5839 0           $self->{state} = MD_HYPHEN_STATE;
5840            
5841 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5842 0           $self->{line_prev} = $self->{line};
5843 0           $self->{column_prev} = $self->{column};
5844 0           $self->{column}++;
5845             $self->{nc}
5846 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5847             } else {
5848 0           $self->{set_nc}->($self);
5849             }
5850            
5851 0           redo A;
5852             } elsif ($nc == 0x0045 or # E
5853             $nc == 0x0065) { # e
5854 0           $self->{state} = MD_E_STATE;
5855 0           $self->{kwd} = chr $nc;
5856            
5857 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5858 0           $self->{line_prev} = $self->{line};
5859 0           $self->{column_prev} = $self->{column};
5860 0           $self->{column}++;
5861             $self->{nc}
5862 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5863             } else {
5864 0           $self->{set_nc}->($self);
5865             }
5866            
5867 0           redo A;
5868             } elsif ($nc == 0x0041 or # A
5869             $nc == 0x0061) { # a
5870 0           $self->{state} = MD_ATTLIST_STATE;
5871 0           $self->{kwd} = chr $nc;
5872            
5873 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5874 0           $self->{line_prev} = $self->{line};
5875 0           $self->{column_prev} = $self->{column};
5876 0           $self->{column}++;
5877             $self->{nc}
5878 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5879             } else {
5880 0           $self->{set_nc}->($self);
5881             }
5882            
5883 0           redo A;
5884             } elsif ($nc == 0x004E or # N
5885             $nc == 0x006E) { # n
5886 0           $self->{state} = MD_NOTATION_STATE;
5887 0           $self->{kwd} = chr $nc;
5888            
5889 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5890 0           $self->{line_prev} = $self->{line};
5891 0           $self->{column_prev} = $self->{column};
5892 0           $self->{column}++;
5893             $self->{nc}
5894 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5895             } else {
5896 0           $self->{set_nc}->($self);
5897             }
5898            
5899 0           redo A;
5900             } else {
5901             #
5902             }
5903            
5904             ## XML5: No parse error.
5905             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5906             line => $self->{line_prev},
5907 0           column => $self->{column_prev} - 1);
5908             ## Reconsume.
5909 0           $self->{state} = BOGUS_COMMENT_STATE;
5910 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5911 0           redo A;
5912             } elsif ($state == MD_E_STATE) {
5913 0 0 0       if ($nc == 0x004E or # N
    0 0        
5914             $nc == 0x006E) { # n
5915 0           $self->{state} = MD_ENTITY_STATE;
5916 0           $self->{kwd} .= chr $nc;
5917            
5918 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5919 0           $self->{line_prev} = $self->{line};
5920 0           $self->{column_prev} = $self->{column};
5921 0           $self->{column}++;
5922             $self->{nc}
5923 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5924             } else {
5925 0           $self->{set_nc}->($self);
5926             }
5927            
5928 0           redo A;
5929             } elsif ($nc == 0x004C or # L
5930             $nc == 0x006C) { # l
5931             ## XML5: not supported.
5932 0           $self->{state} = MD_ELEMENT_STATE;
5933 0           $self->{kwd} .= chr $nc;
5934            
5935 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5936 0           $self->{line_prev} = $self->{line};
5937 0           $self->{column_prev} = $self->{column};
5938 0           $self->{column}++;
5939             $self->{nc}
5940 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5941             } else {
5942 0           $self->{set_nc}->($self);
5943             }
5944            
5945 0           redo A;
5946             } else {
5947             ## XML5: No parse error.
5948             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5949             line => $self->{line_prev},
5950 0           column => $self->{column_prev} - 2
5951             + 1 * ($nc == EOF_CHAR));
5952             ## Reconsume.
5953 0           $self->{state} = BOGUS_COMMENT_STATE;
5954 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5955 0           redo A;
5956             }
5957             } elsif ($state == MD_ENTITY_STATE) {
5958 0 0 0       if ($nc == [
    0 0        
      0        
5959             undef,
5960             undef,
5961             0x0054, # T
5962             0x0049, # I
5963             0x0054, # T
5964             NEVER_CHAR, # (Y)
5965             ]->[length $self->{kwd}] or
5966             $nc == [
5967             undef,
5968             undef,
5969             0x0074, # t
5970             0x0069, # i
5971             0x0074, # t
5972             NEVER_CHAR, # (y)
5973             ]->[length $self->{kwd}]) {
5974             ## Stay in the state.
5975 0           $self->{kwd} .= chr $nc;
5976            
5977 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5978 0           $self->{line_prev} = $self->{line};
5979 0           $self->{column_prev} = $self->{column};
5980 0           $self->{column}++;
5981             $self->{nc}
5982 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5983             } else {
5984 0           $self->{set_nc}->($self);
5985             }
5986            
5987 0           redo A;
5988             } elsif ((length $self->{kwd}) == 5 and
5989             ($nc == 0x0059 or # Y
5990             $nc == 0x0079)) { # y
5991 0 0 0       if ($self->{kwd} ne 'ENTIT' or $nc == 0x0079) {
5992             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5993             text => 'ENTITY',
5994             line => $self->{line_prev},
5995 0           column => $self->{column_prev} - 4);
5996             }
5997             $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5998             line => $self->{line_prev},
5999 0           column => $self->{column_prev} - 6};
6000 0           $self->{state} = DOCTYPE_MD_STATE;
6001            
6002 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6003 0           $self->{line_prev} = $self->{line};
6004 0           $self->{column_prev} = $self->{column};
6005 0           $self->{column}++;
6006             $self->{nc}
6007 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6008             } else {
6009 0           $self->{set_nc}->($self);
6010             }
6011            
6012 0           redo A;
6013             } else {
6014             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6015             line => $self->{line_prev},
6016             column => $self->{column_prev} - 1
6017             - (length $self->{kwd})
6018 0           + 1 * ($nc == EOF_CHAR));
6019 0           $self->{state} = BOGUS_COMMENT_STATE;
6020             ## Reconsume.
6021 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6022 0           redo A;
6023             }
6024             } elsif ($state == MD_ELEMENT_STATE) {
6025 0 0 0       if ($nc == [
    0 0        
      0        
6026             undef,
6027             undef,
6028             0x0045, # E
6029             0x004D, # M
6030             0x0045, # E
6031             0x004E, # N
6032             NEVER_CHAR, # (T)
6033             ]->[length $self->{kwd}] or
6034             $nc == [
6035             undef,
6036             undef,
6037             0x0065, # e
6038             0x006D, # m
6039             0x0065, # e
6040             0x006E, # n
6041             NEVER_CHAR, # (t)
6042             ]->[length $self->{kwd}]) {
6043             ## Stay in the state.
6044 0           $self->{kwd} .= chr $nc;
6045            
6046 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047 0           $self->{line_prev} = $self->{line};
6048 0           $self->{column_prev} = $self->{column};
6049 0           $self->{column}++;
6050             $self->{nc}
6051 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052             } else {
6053 0           $self->{set_nc}->($self);
6054             }
6055            
6056 0           redo A;
6057             } elsif ((length $self->{kwd}) == 6 and
6058             ($nc == 0x0054 or # T
6059             $nc == 0x0074)) { # t
6060 0 0 0       if ($self->{kwd} ne 'ELEMEN' or $nc == 0x0074) {
6061             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6062             text => 'ELEMENT',
6063             line => $self->{line_prev},
6064 0           column => $self->{column_prev} - 5);
6065             }
6066             $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6067             line => $self->{line_prev},
6068 0           column => $self->{column_prev} - 7};
6069 0           $self->{state} = DOCTYPE_MD_STATE;
6070            
6071 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6072 0           $self->{line_prev} = $self->{line};
6073 0           $self->{column_prev} = $self->{column};
6074 0           $self->{column}++;
6075             $self->{nc}
6076 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6077             } else {
6078 0           $self->{set_nc}->($self);
6079             }
6080            
6081 0           redo A;
6082             } else {
6083             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6084             line => $self->{line_prev},
6085             column => $self->{column_prev} - 1
6086             - (length $self->{kwd})
6087 0           + 1 * ($nc == EOF_CHAR));
6088 0           $self->{state} = BOGUS_COMMENT_STATE;
6089             ## Reconsume.
6090 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6091 0           redo A;
6092             }
6093             } elsif ($state == MD_ATTLIST_STATE) {
6094 0 0 0       if ($nc == [
    0 0        
      0        
6095             undef,
6096             0x0054, # T
6097             0x0054, # T
6098             0x004C, # L
6099             0x0049, # I
6100             0x0053, # S
6101             NEVER_CHAR, # (T)
6102             ]->[length $self->{kwd}] or
6103             $nc == [
6104             undef,
6105             0x0074, # t
6106             0x0074, # t
6107             0x006C, # l
6108             0x0069, # i
6109             0x0073, # s
6110             NEVER_CHAR, # (t)
6111             ]->[length $self->{kwd}]) {
6112             ## Stay in the state.
6113 0           $self->{kwd} .= chr $nc;
6114            
6115 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6116 0           $self->{line_prev} = $self->{line};
6117 0           $self->{column_prev} = $self->{column};
6118 0           $self->{column}++;
6119             $self->{nc}
6120 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6121             } else {
6122 0           $self->{set_nc}->($self);
6123             }
6124            
6125 0           redo A;
6126             } elsif ((length $self->{kwd}) == 6 and
6127             ($nc == 0x0054 or # T
6128             $nc == 0x0074)) { # t
6129 0 0 0       if ($self->{kwd} ne 'ATTLIS' or $nc == 0x0074) {
6130             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6131             text => 'ATTLIST',
6132             line => $self->{line_prev},
6133 0           column => $self->{column_prev} - 5);
6134             }
6135             $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6136             attrdefs => [],
6137             line => $self->{line_prev},
6138 0           column => $self->{column_prev} - 7};
6139 0           $self->{state} = DOCTYPE_MD_STATE;
6140            
6141 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6142 0           $self->{line_prev} = $self->{line};
6143 0           $self->{column_prev} = $self->{column};
6144 0           $self->{column}++;
6145             $self->{nc}
6146 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6147             } else {
6148 0           $self->{set_nc}->($self);
6149             }
6150            
6151 0           redo A;
6152             } else {
6153             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6154             line => $self->{line_prev},
6155             column => $self->{column_prev} - 1
6156             - (length $self->{kwd})
6157 0           + 1 * ($nc == EOF_CHAR));
6158 0           $self->{state} = BOGUS_COMMENT_STATE;
6159             ## Reconsume.
6160 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6161 0           redo A;
6162             }
6163             } elsif ($state == MD_NOTATION_STATE) {
6164 0 0 0       if ($nc == [
    0 0        
      0        
6165             undef,
6166             0x004F, # O
6167             0x0054, # T
6168             0x0041, # A
6169             0x0054, # T
6170             0x0049, # I
6171             0x004F, # O
6172             NEVER_CHAR, # (N)
6173             ]->[length $self->{kwd}] or
6174             $nc == [
6175             undef,
6176             0x006F, # o
6177             0x0074, # t
6178             0x0061, # a
6179             0x0074, # t
6180             0x0069, # i
6181             0x006F, # o
6182             NEVER_CHAR, # (n)
6183             ]->[length $self->{kwd}]) {
6184             ## Stay in the state.
6185 0           $self->{kwd} .= chr $nc;
6186            
6187 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6188 0           $self->{line_prev} = $self->{line};
6189 0           $self->{column_prev} = $self->{column};
6190 0           $self->{column}++;
6191             $self->{nc}
6192 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6193             } else {
6194 0           $self->{set_nc}->($self);
6195             }
6196            
6197 0           redo A;
6198             } elsif ((length $self->{kwd}) == 7 and
6199             ($nc == 0x004E or # N
6200             $nc == 0x006E)) { # n
6201 0 0 0       if ($self->{kwd} ne 'NOTATIO' or $nc == 0x006E) {
6202             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6203             text => 'NOTATION',
6204             line => $self->{line_prev},
6205 0           column => $self->{column_prev} - 6);
6206             }
6207             $self->{ct} = {type => NOTATION_TOKEN, name => '',
6208             line => $self->{line_prev},
6209 0           column => $self->{column_prev} - 8};
6210 0           $self->{state} = DOCTYPE_MD_STATE;
6211            
6212 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6213 0           $self->{line_prev} = $self->{line};
6214 0           $self->{column_prev} = $self->{column};
6215 0           $self->{column}++;
6216             $self->{nc}
6217 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6218             } else {
6219 0           $self->{set_nc}->($self);
6220             }
6221            
6222 0           redo A;
6223             } else {
6224             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6225             line => $self->{line_prev},
6226             column => $self->{column_prev} - 1
6227             - (length $self->{kwd})
6228 0           + 1 * ($nc == EOF_CHAR));
6229 0           $self->{state} = BOGUS_COMMENT_STATE;
6230             ## Reconsume.
6231 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6232 0           redo A;
6233             }
6234             } elsif ($state == DOCTYPE_MD_STATE) {
6235             ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6236             ## "DOCTYPE NOTATION state".
6237              
6238 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
6239             ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6240 0           $self->{state} = BEFORE_MD_NAME_STATE;
6241            
6242 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6243 0           $self->{line_prev} = $self->{line};
6244 0           $self->{column_prev} = $self->{column};
6245 0           $self->{column}++;
6246             $self->{nc}
6247 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6248             } else {
6249 0           $self->{set_nc}->($self);
6250             }
6251            
6252 0           redo A;
6253             } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6254             $nc == 0x0025) { # %
6255             ## XML5: Switch to the "DOCTYPE bogus comment state".
6256 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6257 0           $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6258            
6259 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6260 0           $self->{line_prev} = $self->{line};
6261 0           $self->{column_prev} = $self->{column};
6262 0           $self->{column}++;
6263             $self->{nc}
6264 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6265             } else {
6266 0           $self->{set_nc}->($self);
6267             }
6268            
6269 0           redo A;
6270             } elsif ($nc == EOF_CHAR) {
6271 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6272 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6273             ## Reconsume.
6274 0           redo A;
6275             } elsif ($nc == 0x003E) { # >
6276             ## XML5: Switch to the "DOCTYPE bogus comment state".
6277 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6278 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6279            
6280 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6281 0           $self->{line_prev} = $self->{line};
6282 0           $self->{column_prev} = $self->{column};
6283 0           $self->{column}++;
6284             $self->{nc}
6285 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6286             } else {
6287 0           $self->{set_nc}->($self);
6288             }
6289            
6290 0           redo A;
6291             } else {
6292             ## XML5: Switch to the "DOCTYPE bogus comment state".
6293 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6294 0           $self->{state} = BEFORE_MD_NAME_STATE;
6295 0           redo A;
6296             }
6297             } elsif ($state == BEFORE_MD_NAME_STATE) {
6298             ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6299             ## before state", "DOCTYPE ATTLIST name before state".
6300              
6301 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
6302             ## Stay in the state.
6303            
6304 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6305 0           $self->{line_prev} = $self->{line};
6306 0           $self->{column_prev} = $self->{column};
6307 0           $self->{column}++;
6308             $self->{nc}
6309 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6310             } else {
6311 0           $self->{set_nc}->($self);
6312             }
6313            
6314 0           redo A;
6315             } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6316             $nc == 0x0025) { # %
6317 0           $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6318            
6319 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320 0           $self->{line_prev} = $self->{line};
6321 0           $self->{column_prev} = $self->{column};
6322 0           $self->{column}++;
6323             $self->{nc}
6324 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325             } else {
6326 0           $self->{set_nc}->($self);
6327             }
6328            
6329 0           redo A;
6330             } elsif ($nc == 0x003E) { # >
6331             ## XML5: Same as "Anything else".
6332 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6333 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6334            
6335 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6336 0           $self->{line_prev} = $self->{line};
6337 0           $self->{column_prev} = $self->{column};
6338 0           $self->{column}++;
6339             $self->{nc}
6340 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6341             } else {
6342 0           $self->{set_nc}->($self);
6343             }
6344            
6345 0           redo A;
6346             } elsif ($nc == EOF_CHAR) {
6347 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6348 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6349             ## Reconsume.
6350 0           redo A;
6351             } else {
6352             ## XML5: [ATTLIST] Not defined yet.
6353 0 0         if ($nc == 0x0000) {
6354 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6355             }
6356 0 0         $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6357 0           $self->{state} = MD_NAME_STATE;
6358            
6359 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6360 0           $self->{line_prev} = $self->{line};
6361 0           $self->{column_prev} = $self->{column};
6362 0           $self->{column}++;
6363             $self->{nc}
6364 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6365             } else {
6366 0           $self->{set_nc}->($self);
6367             }
6368            
6369 0           redo A;
6370             }
6371             } elsif ($state == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6372 0 0         if ($is_space->{$nc}) {
    0          
    0          
6373             ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6374 0           $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6375 0           $self->{state} = BEFORE_MD_NAME_STATE;
6376            
6377 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6378 0           $self->{line_prev} = $self->{line};
6379 0           $self->{column_prev} = $self->{column};
6380 0           $self->{column}++;
6381             $self->{nc}
6382 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6383             } else {
6384 0           $self->{set_nc}->($self);
6385             }
6386            
6387 0           redo A;
6388             } elsif ($nc == 0x003E) { # >
6389             ## XML5: Same as "Anything else".
6390 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6391 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6392            
6393 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6394 0           $self->{line_prev} = $self->{line};
6395 0           $self->{column_prev} = $self->{column};
6396 0           $self->{column}++;
6397             $self->{nc}
6398 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6399             } else {
6400 0           $self->{set_nc}->($self);
6401             }
6402            
6403 0           redo A;
6404             } elsif ($nc == EOF_CHAR) {
6405 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6406 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6407             ## Reconsume.
6408 0           redo A;
6409             } else {
6410             ## XML5: No parse error.
6411 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6412 0           $self->{state} = BOGUS_COMMENT_STATE;
6413 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6414             ## Reconsume.
6415 0           redo A;
6416             }
6417             } elsif ($state == MD_NAME_STATE) {
6418             ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6419            
6420 0 0         if ($is_space->{$nc}) {
    0          
    0          
6421 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    0          
6422 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6423             } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6424 0           $self->{state} = AFTER_ELEMENT_NAME_STATE;
6425             } else { # ENTITY/NOTATION
6426 0           $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6427             }
6428            
6429 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6430 0           $self->{line_prev} = $self->{line};
6431 0           $self->{column_prev} = $self->{column};
6432 0           $self->{column}++;
6433             $self->{nc}
6434 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6435             } else {
6436 0           $self->{set_nc}->($self);
6437             }
6438            
6439 0           redo A;
6440             } elsif ($nc == 0x003E) { # >
6441 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6442             #
6443             } else {
6444 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6445             }
6446 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6447            
6448 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6449 0           $self->{line_prev} = $self->{line};
6450 0           $self->{column_prev} = $self->{column};
6451 0           $self->{column}++;
6452             $self->{nc}
6453 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6454             } else {
6455 0           $self->{set_nc}->($self);
6456             }
6457            
6458 0           return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6459 0           redo A;
6460             } elsif ($nc == EOF_CHAR) {
6461             ## XML5: [ATTLIST] No parse error.
6462 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6463 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6464             ## Reconsume.
6465 0           redo A;
6466             } else {
6467             ## XML5: [ATTLIST] Not defined yet.
6468 0 0         if ($nc == 0x0000) {
6469 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6470             }
6471 0 0         $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6472             ## Stay in the state.
6473            
6474 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6475 0           $self->{line_prev} = $self->{line};
6476 0           $self->{column_prev} = $self->{column};
6477 0           $self->{column}++;
6478             $self->{nc}
6479 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6480             } else {
6481 0           $self->{set_nc}->($self);
6482             }
6483            
6484 0           redo A;
6485             }
6486             } elsif ($state == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6487 0 0         if ($is_space->{$nc}) {
    0          
    0          
6488             ## Stay in the state.
6489            
6490 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6491 0           $self->{line_prev} = $self->{line};
6492 0           $self->{column_prev} = $self->{column};
6493 0           $self->{column}++;
6494             $self->{nc}
6495 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6496             } else {
6497 0           $self->{set_nc}->($self);
6498             }
6499            
6500 0           redo A;
6501             } elsif ($nc == 0x003E) { # >
6502 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6503            
6504 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6505 0           $self->{line_prev} = $self->{line};
6506 0           $self->{column_prev} = $self->{column};
6507 0           $self->{column}++;
6508             $self->{nc}
6509 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6510             } else {
6511 0           $self->{set_nc}->($self);
6512             }
6513            
6514 0           return ($self->{ct}); # ATTLIST
6515 0           redo A;
6516             } elsif ($nc == EOF_CHAR) {
6517             ## XML5: No parse error.
6518 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6519 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6520             ## Discard the current token.
6521 0           redo A;
6522             } else {
6523             ## XML5: Not defined yet.
6524 0 0         if ($nc == 0x0000) {
6525 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6526             }
6527             $self->{ca} = {name => $nc == 0x0000 ? "\x{FFFD}" : chr $nc, # attrdef
6528             tokens => [],
6529 0 0         line => $self->{line}, column => $self->{column}};
6530 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6531            
6532 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6533 0           $self->{line_prev} = $self->{line};
6534 0           $self->{column_prev} = $self->{column};
6535 0           $self->{column}++;
6536             $self->{nc}
6537 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6538             } else {
6539 0           $self->{set_nc}->($self);
6540             }
6541            
6542 0           redo A;
6543             }
6544             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6545 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
6546 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6547            
6548 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6549 0           $self->{line_prev} = $self->{line};
6550 0           $self->{column_prev} = $self->{column};
6551 0           $self->{column}++;
6552             $self->{nc}
6553 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6554             } else {
6555 0           $self->{set_nc}->($self);
6556             }
6557            
6558 0           redo A;
6559             } elsif ($nc == 0x003E) { # >
6560             ## XML5: Same as "anything else".
6561 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6562 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6563            
6564 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6565 0           $self->{line_prev} = $self->{line};
6566 0           $self->{column_prev} = $self->{column};
6567 0           $self->{column}++;
6568             $self->{nc}
6569 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6570             } else {
6571 0           $self->{set_nc}->($self);
6572             }
6573            
6574 0           return ($self->{ct}); # ATTLIST
6575 0           redo A;
6576             } elsif ($nc == 0x0028) { # (
6577             ## XML5: Same as "anything else".
6578 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6579 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6580            
6581 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6582 0           $self->{line_prev} = $self->{line};
6583 0           $self->{column_prev} = $self->{column};
6584 0           $self->{column}++;
6585             $self->{nc}
6586 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6587             } else {
6588 0           $self->{set_nc}->($self);
6589             }
6590            
6591 0           redo A;
6592             } elsif ($nc == EOF_CHAR) {
6593             ## XML5: No parse error.
6594 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6595 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6596            
6597 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6598 0           $self->{line_prev} = $self->{line};
6599 0           $self->{column_prev} = $self->{column};
6600 0           $self->{column}++;
6601             $self->{nc}
6602 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6603             } else {
6604 0           $self->{set_nc}->($self);
6605             }
6606            
6607             ## Discard the current token.
6608 0           redo A;
6609             } else {
6610             ## XML5: Not defined yet.
6611 0 0         if ($nc == 0x0000) {
6612 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6613             }
6614 0 0         $self->{ca}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6615             ## Stay in the state.
6616            
6617 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6618 0           $self->{line_prev} = $self->{line};
6619 0           $self->{column_prev} = $self->{column};
6620 0           $self->{column}++;
6621             $self->{nc}
6622 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6623             } else {
6624 0           $self->{set_nc}->($self);
6625             }
6626            
6627 0           redo A;
6628             }
6629             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6630 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
6631             ## Stay in the state.
6632            
6633 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6634 0           $self->{line_prev} = $self->{line};
6635 0           $self->{column_prev} = $self->{column};
6636 0           $self->{column}++;
6637             $self->{nc}
6638 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6639             } else {
6640 0           $self->{set_nc}->($self);
6641             }
6642            
6643 0           redo A;
6644             } elsif ($nc == 0x003E) { # >
6645             ## XML5: Same as "anything else".
6646 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6647 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6648            
6649 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6650 0           $self->{line_prev} = $self->{line};
6651 0           $self->{column_prev} = $self->{column};
6652 0           $self->{column}++;
6653             $self->{nc}
6654 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6655             } else {
6656 0           $self->{set_nc}->($self);
6657             }
6658            
6659 0           return ($self->{ct}); # ATTLIST
6660 0           redo A;
6661             } elsif ($nc == 0x0028) { # (
6662             ## XML5: Same as "anything else".
6663 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6664            
6665 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6666 0           $self->{line_prev} = $self->{line};
6667 0           $self->{column_prev} = $self->{column};
6668 0           $self->{column}++;
6669             $self->{nc}
6670 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6671             } else {
6672 0           $self->{set_nc}->($self);
6673             }
6674            
6675 0           redo A;
6676             } elsif ($nc == EOF_CHAR) {
6677             ## XML5: No parse error.
6678 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6679 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6680            
6681 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6682 0           $self->{line_prev} = $self->{line};
6683 0           $self->{column_prev} = $self->{column};
6684 0           $self->{column}++;
6685             $self->{nc}
6686 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6687             } else {
6688 0           $self->{set_nc}->($self);
6689             }
6690            
6691             ## Discard the token.
6692 0           redo A;
6693             } else {
6694             ## XML5: Not defined yet.
6695 0           $self->{ca}->{type} = chr $nc;
6696 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6697            
6698 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699 0           $self->{line_prev} = $self->{line};
6700 0           $self->{column_prev} = $self->{column};
6701 0           $self->{column}++;
6702             $self->{nc}
6703 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704             } else {
6705 0           $self->{set_nc}->($self);
6706             }
6707            
6708 0           redo A;
6709             }
6710             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6711 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
    0          
6712 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6713            
6714 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6715 0           $self->{line_prev} = $self->{line};
6716 0           $self->{column_prev} = $self->{column};
6717 0           $self->{column}++;
6718             $self->{nc}
6719 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6720             } else {
6721 0           $self->{set_nc}->($self);
6722             }
6723            
6724 0           redo A;
6725             } elsif ($nc == 0x0023) { # #
6726             ## XML5: Same as "anything else".
6727 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6728 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6729            
6730 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731 0           $self->{line_prev} = $self->{line};
6732 0           $self->{column_prev} = $self->{column};
6733 0           $self->{column}++;
6734             $self->{nc}
6735 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736             } else {
6737 0           $self->{set_nc}->($self);
6738             }
6739            
6740 0           redo A;
6741             } elsif ($nc == 0x0022) { # "
6742             ## XML5: Same as "anything else".
6743 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6744 0           $self->{ca}->{value} = '';
6745 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6746            
6747 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6748 0           $self->{line_prev} = $self->{line};
6749 0           $self->{column_prev} = $self->{column};
6750 0           $self->{column}++;
6751             $self->{nc}
6752 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6753             } else {
6754 0           $self->{set_nc}->($self);
6755             }
6756            
6757 0           redo A;
6758             } elsif ($nc == 0x0027) { # '
6759             ## XML5: Same as "anything else".
6760 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6761 0           $self->{ca}->{value} = '';
6762 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6763            
6764 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765 0           $self->{line_prev} = $self->{line};
6766 0           $self->{column_prev} = $self->{column};
6767 0           $self->{column}++;
6768             $self->{nc}
6769 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770             } else {
6771 0           $self->{set_nc}->($self);
6772             }
6773            
6774 0           redo A;
6775             } elsif ($nc == 0x003E) { # >
6776             ## XML5: Same as "anything else".
6777 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6778 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6779            
6780 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6781 0           $self->{line_prev} = $self->{line};
6782 0           $self->{column_prev} = $self->{column};
6783 0           $self->{column}++;
6784             $self->{nc}
6785 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6786             } else {
6787 0           $self->{set_nc}->($self);
6788             }
6789            
6790 0           return ($self->{ct}); # ATTLIST
6791 0           redo A;
6792             } elsif ($nc == 0x0028) { # (
6793             ## XML5: Same as "anything else".
6794 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6795 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6796            
6797 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6798 0           $self->{line_prev} = $self->{line};
6799 0           $self->{column_prev} = $self->{column};
6800 0           $self->{column}++;
6801             $self->{nc}
6802 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6803             } else {
6804 0           $self->{set_nc}->($self);
6805             }
6806            
6807 0           redo A;
6808             } elsif ($nc == EOF_CHAR) {
6809             ## XML5: No parse error.
6810 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6811 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6812            
6813 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6814 0           $self->{line_prev} = $self->{line};
6815 0           $self->{column_prev} = $self->{column};
6816 0           $self->{column}++;
6817             $self->{nc}
6818 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6819             } else {
6820 0           $self->{set_nc}->($self);
6821             }
6822            
6823             ## Discard the token.
6824 0           redo A;
6825             } else {
6826             ## XML5: Not defined yet.
6827 0           $self->{ca}->{type} .= chr $nc;
6828             ## Stay in the state.
6829            
6830 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6831 0           $self->{line_prev} = $self->{line};
6832 0           $self->{column_prev} = $self->{column};
6833 0           $self->{column}++;
6834             $self->{nc}
6835 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6836             } else {
6837 0           $self->{set_nc}->($self);
6838             }
6839            
6840 0           redo A;
6841             }
6842             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6843 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
    0          
6844             ## Stay in the state.
6845            
6846 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6847 0           $self->{line_prev} = $self->{line};
6848 0           $self->{column_prev} = $self->{column};
6849 0           $self->{column}++;
6850             $self->{nc}
6851 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6852             } else {
6853 0           $self->{set_nc}->($self);
6854             }
6855            
6856 0           redo A;
6857             } elsif ($nc == 0x0028) { # (
6858             ## XML5: Same as "anything else".
6859 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6860            
6861 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6862 0           $self->{line_prev} = $self->{line};
6863 0           $self->{column_prev} = $self->{column};
6864 0           $self->{column}++;
6865             $self->{nc}
6866 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6867             } else {
6868 0           $self->{set_nc}->($self);
6869             }
6870            
6871 0           redo A;
6872             } elsif ($nc == 0x0023) { # #
6873 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6874            
6875 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876 0           $self->{line_prev} = $self->{line};
6877 0           $self->{column_prev} = $self->{column};
6878 0           $self->{column}++;
6879             $self->{nc}
6880 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881             } else {
6882 0           $self->{set_nc}->($self);
6883             }
6884            
6885 0           redo A;
6886             } elsif ($nc == 0x0022) { # "
6887             ## XML5: Same as "anything else".
6888 0           $self->{ca}->{value} = '';
6889 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6890            
6891 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892 0           $self->{line_prev} = $self->{line};
6893 0           $self->{column_prev} = $self->{column};
6894 0           $self->{column}++;
6895             $self->{nc}
6896 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897             } else {
6898 0           $self->{set_nc}->($self);
6899             }
6900            
6901 0           redo A;
6902             } elsif ($nc == 0x0027) { # '
6903             ## XML5: Same as "anything else".
6904 0           $self->{ca}->{value} = '';
6905 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6906            
6907 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908 0           $self->{line_prev} = $self->{line};
6909 0           $self->{column_prev} = $self->{column};
6910 0           $self->{column}++;
6911             $self->{nc}
6912 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913             } else {
6914 0           $self->{set_nc}->($self);
6915             }
6916            
6917 0           redo A;
6918             } elsif ($nc == 0x003E) { # >
6919             ## XML5: Same as "anything else".
6920 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6921 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6922            
6923 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6924 0           $self->{line_prev} = $self->{line};
6925 0           $self->{column_prev} = $self->{column};
6926 0           $self->{column}++;
6927             $self->{nc}
6928 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6929             } else {
6930 0           $self->{set_nc}->($self);
6931             }
6932            
6933 0           return ($self->{ct}); # ATTLIST
6934 0           redo A;
6935             } elsif ($nc == EOF_CHAR) {
6936             ## XML5: No parse error.
6937 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6938 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6939            
6940 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6941 0           $self->{line_prev} = $self->{line};
6942 0           $self->{column_prev} = $self->{column};
6943 0           $self->{column}++;
6944             $self->{nc}
6945 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6946             } else {
6947 0           $self->{set_nc}->($self);
6948             }
6949            
6950             ## Discard the current token.
6951 0           redo A;
6952             } else {
6953             ## XML5: Switch to the "DOCTYPE bogus comment state".
6954 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6955 0           $self->{ca}->{value} = '';
6956 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6957             ## Reconsume.
6958 0           redo A;
6959             }
6960             } elsif ($state == BEFORE_ALLOWED_TOKEN_STATE) {
6961 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
6962             ## Stay in the state.
6963            
6964 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6965 0           $self->{line_prev} = $self->{line};
6966 0           $self->{column_prev} = $self->{column};
6967 0           $self->{column}++;
6968             $self->{nc}
6969 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6970             } else {
6971 0           $self->{set_nc}->($self);
6972             }
6973            
6974 0           redo A;
6975             } elsif ($nc == 0x007C) { # |
6976 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6977             ## Stay in the state.
6978            
6979 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6980 0           $self->{line_prev} = $self->{line};
6981 0           $self->{column_prev} = $self->{column};
6982 0           $self->{column}++;
6983             $self->{nc}
6984 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6985             } else {
6986 0           $self->{set_nc}->($self);
6987             }
6988            
6989 0           redo A;
6990             } elsif ($nc == 0x0029) { # )
6991 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6992 0           $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6993            
6994 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6995 0           $self->{line_prev} = $self->{line};
6996 0           $self->{column_prev} = $self->{column};
6997 0           $self->{column}++;
6998             $self->{nc}
6999 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7000             } else {
7001 0           $self->{set_nc}->($self);
7002             }
7003            
7004 0           redo A;
7005             } elsif ($nc == 0x003E) { # >
7006 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7007 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7008            
7009 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010 0           $self->{line_prev} = $self->{line};
7011 0           $self->{column_prev} = $self->{column};
7012 0           $self->{column}++;
7013             $self->{nc}
7014 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015             } else {
7016 0           $self->{set_nc}->($self);
7017             }
7018            
7019 0           return ($self->{ct}); # ATTLIST
7020 0           redo A;
7021             } elsif ($nc == EOF_CHAR) {
7022             ## XML5: No parse error.
7023 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7024 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7025            
7026 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7027 0           $self->{line_prev} = $self->{line};
7028 0           $self->{column_prev} = $self->{column};
7029 0           $self->{column}++;
7030             $self->{nc}
7031 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7032             } else {
7033 0           $self->{set_nc}->($self);
7034             }
7035            
7036             ## Discard the current token.
7037 0           redo A;
7038             } else {
7039 0 0         if ($nc == 0x000) {
7040 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7041             }
7042 0 0         push @{$self->{ca}->{tokens}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
  0            
7043 0           $self->{state} = ALLOWED_TOKEN_STATE;
7044            
7045 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7046 0           $self->{line_prev} = $self->{line};
7047 0           $self->{column_prev} = $self->{column};
7048 0           $self->{column}++;
7049             $self->{nc}
7050 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7051             } else {
7052 0           $self->{set_nc}->($self);
7053             }
7054            
7055 0           redo A;
7056             }
7057             } elsif ($state == ALLOWED_TOKEN_STATE) {
7058 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7059 0           $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7060            
7061 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7062 0           $self->{line_prev} = $self->{line};
7063 0           $self->{column_prev} = $self->{column};
7064 0           $self->{column}++;
7065             $self->{nc}
7066 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7067             } else {
7068 0           $self->{set_nc}->($self);
7069             }
7070            
7071 0           redo A;
7072             } elsif ($nc == 0x007C) { # |
7073 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7074            
7075 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7076 0           $self->{line_prev} = $self->{line};
7077 0           $self->{column_prev} = $self->{column};
7078 0           $self->{column}++;
7079             $self->{nc}
7080 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7081             } else {
7082 0           $self->{set_nc}->($self);
7083             }
7084            
7085 0           redo A;
7086             } elsif ($nc == 0x0029) { # )
7087 0           $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7088            
7089 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7090 0           $self->{line_prev} = $self->{line};
7091 0           $self->{column_prev} = $self->{column};
7092 0           $self->{column}++;
7093             $self->{nc}
7094 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7095             } else {
7096 0           $self->{set_nc}->($self);
7097             }
7098            
7099 0           redo A;
7100             } elsif ($nc == 0x003E) { # >
7101 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7102 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7103            
7104 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7105 0           $self->{line_prev} = $self->{line};
7106 0           $self->{column_prev} = $self->{column};
7107 0           $self->{column}++;
7108             $self->{nc}
7109 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7110             } else {
7111 0           $self->{set_nc}->($self);
7112             }
7113            
7114 0           return ($self->{ct}); # ATTLIST
7115 0           redo A;
7116             } elsif ($nc == EOF_CHAR) {
7117             ## XML5: No parse error.
7118 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7119 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7120            
7121 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122 0           $self->{line_prev} = $self->{line};
7123 0           $self->{column_prev} = $self->{column};
7124 0           $self->{column}++;
7125             $self->{nc}
7126 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127             } else {
7128 0           $self->{set_nc}->($self);
7129             }
7130            
7131             ## Discard the current token.
7132 0           redo A;
7133             } else {
7134 0 0         if ($nc == 0x0000) {
7135 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7136             }
7137 0 0         $self->{ca}->{tokens}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
7138             ## Stay in the state.
7139            
7140 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7141 0           $self->{line_prev} = $self->{line};
7142 0           $self->{column_prev} = $self->{column};
7143 0           $self->{column}++;
7144             $self->{nc}
7145 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7146             } else {
7147 0           $self->{set_nc}->($self);
7148             }
7149            
7150 0           redo A;
7151             }
7152             } elsif ($state == AFTER_ALLOWED_TOKEN_STATE) {
7153 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7154             ## Stay in the state.
7155            
7156 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7157 0           $self->{line_prev} = $self->{line};
7158 0           $self->{column_prev} = $self->{column};
7159 0           $self->{column}++;
7160             $self->{nc}
7161 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7162             } else {
7163 0           $self->{set_nc}->($self);
7164             }
7165            
7166 0           redo A;
7167             } elsif ($nc == 0x007C) { # |
7168 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7169            
7170 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7171 0           $self->{line_prev} = $self->{line};
7172 0           $self->{column_prev} = $self->{column};
7173 0           $self->{column}++;
7174             $self->{nc}
7175 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7176             } else {
7177 0           $self->{set_nc}->($self);
7178             }
7179            
7180 0           redo A;
7181             } elsif ($nc == 0x0029) { # )
7182 0           $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7183            
7184 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7185 0           $self->{line_prev} = $self->{line};
7186 0           $self->{column_prev} = $self->{column};
7187 0           $self->{column}++;
7188             $self->{nc}
7189 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7190             } else {
7191 0           $self->{set_nc}->($self);
7192             }
7193            
7194 0           redo A;
7195             } elsif ($nc == 0x003E) { # >
7196 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7197 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7198            
7199 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7200 0           $self->{line_prev} = $self->{line};
7201 0           $self->{column_prev} = $self->{column};
7202 0           $self->{column}++;
7203             $self->{nc}
7204 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7205             } else {
7206 0           $self->{set_nc}->($self);
7207             }
7208            
7209 0           return ($self->{ct}); # ATTLIST
7210 0           redo A;
7211             } elsif ($nc == EOF_CHAR) {
7212             ## XML5: No parse error.
7213 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7214 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7215            
7216 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7217 0           $self->{line_prev} = $self->{line};
7218 0           $self->{column_prev} = $self->{column};
7219 0           $self->{column}++;
7220             $self->{nc}
7221 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7222             } else {
7223 0           $self->{set_nc}->($self);
7224             }
7225            
7226             ## Discard the current token.
7227 0           redo A;
7228             } else {
7229             $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7230             line => $self->{line_prev},
7231 0           column => $self->{column_prev});
7232 0 0         if ($nc == 0x0000) {
7233 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7234             }
7235 0 0         $self->{ca}->{tokens}->[-1] .= ' ' . ($nc == 0x0000 ? "\x{FFFD}" : chr $nc);
7236 0           $self->{state} = ALLOWED_TOKEN_STATE;
7237            
7238 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7239 0           $self->{line_prev} = $self->{line};
7240 0           $self->{column_prev} = $self->{column};
7241 0           $self->{column}++;
7242             $self->{nc}
7243 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7244             } else {
7245 0           $self->{set_nc}->($self);
7246             }
7247            
7248 0           redo A;
7249             }
7250             } elsif ($state == AFTER_ALLOWED_TOKENS_STATE) {
7251 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
7252 0           $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7253            
7254 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7255 0           $self->{line_prev} = $self->{line};
7256 0           $self->{column_prev} = $self->{column};
7257 0           $self->{column}++;
7258             $self->{nc}
7259 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7260             } else {
7261 0           $self->{set_nc}->($self);
7262             }
7263            
7264 0           redo A;
7265             } elsif ($nc == 0x0023) { # #
7266 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7267 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7268            
7269 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7270 0           $self->{line_prev} = $self->{line};
7271 0           $self->{column_prev} = $self->{column};
7272 0           $self->{column}++;
7273             $self->{nc}
7274 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7275             } else {
7276 0           $self->{set_nc}->($self);
7277             }
7278            
7279 0           redo A;
7280             } elsif ($nc == 0x0022) { # "
7281 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7282 0           $self->{ca}->{value} = '';
7283 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7284            
7285 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7286 0           $self->{line_prev} = $self->{line};
7287 0           $self->{column_prev} = $self->{column};
7288 0           $self->{column}++;
7289             $self->{nc}
7290 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7291             } else {
7292 0           $self->{set_nc}->($self);
7293             }
7294            
7295 0           redo A;
7296             } elsif ($nc == 0x0027) { # '
7297 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7298 0           $self->{ca}->{value} = '';
7299 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7300            
7301 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7302 0           $self->{line_prev} = $self->{line};
7303 0           $self->{column_prev} = $self->{column};
7304 0           $self->{column}++;
7305             $self->{nc}
7306 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7307             } else {
7308 0           $self->{set_nc}->($self);
7309             }
7310            
7311 0           redo A;
7312             } elsif ($nc == 0x003E) { # >
7313 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7314 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7315            
7316 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7317 0           $self->{line_prev} = $self->{line};
7318 0           $self->{column_prev} = $self->{column};
7319 0           $self->{column}++;
7320             $self->{nc}
7321 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7322             } else {
7323 0           $self->{set_nc}->($self);
7324             }
7325            
7326 0           return ($self->{ct}); # ATTLIST
7327 0           redo A;
7328             } elsif ($nc == EOF_CHAR) {
7329 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7330 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7331            
7332 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7333 0           $self->{line_prev} = $self->{line};
7334 0           $self->{column_prev} = $self->{column};
7335 0           $self->{column}++;
7336             $self->{nc}
7337 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7338             } else {
7339 0           $self->{set_nc}->($self);
7340             }
7341            
7342             ## Discard the current token.
7343 0           redo A;
7344             } else {
7345 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7346 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7347             ## Reconsume.
7348 0           redo A;
7349             }
7350             } elsif ($state == BEFORE_ATTR_DEFAULT_STATE) {
7351 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
7352             ## Stay in the state.
7353            
7354 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355 0           $self->{line_prev} = $self->{line};
7356 0           $self->{column_prev} = $self->{column};
7357 0           $self->{column}++;
7358             $self->{nc}
7359 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360             } else {
7361 0           $self->{set_nc}->($self);
7362             }
7363            
7364 0           redo A;
7365             } elsif ($nc == 0x0023) { # #
7366 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7367            
7368 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369 0           $self->{line_prev} = $self->{line};
7370 0           $self->{column_prev} = $self->{column};
7371 0           $self->{column}++;
7372             $self->{nc}
7373 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374             } else {
7375 0           $self->{set_nc}->($self);
7376             }
7377            
7378 0           redo A;
7379             } elsif ($nc == 0x0022) { # "
7380 0           $self->{ca}->{value} = '';
7381 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7382            
7383 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7384 0           $self->{line_prev} = $self->{line};
7385 0           $self->{column_prev} = $self->{column};
7386 0           $self->{column}++;
7387             $self->{nc}
7388 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7389             } else {
7390 0           $self->{set_nc}->($self);
7391             }
7392            
7393 0           redo A;
7394             } elsif ($nc == 0x0027) { # '
7395 0           $self->{ca}->{value} = '';
7396 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7397            
7398 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7399 0           $self->{line_prev} = $self->{line};
7400 0           $self->{column_prev} = $self->{column};
7401 0           $self->{column}++;
7402             $self->{nc}
7403 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7404             } else {
7405 0           $self->{set_nc}->($self);
7406             }
7407            
7408 0           redo A;
7409             } elsif ($nc == 0x003E) { # >
7410 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7411 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7412            
7413 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7414 0           $self->{line_prev} = $self->{line};
7415 0           $self->{column_prev} = $self->{column};
7416 0           $self->{column}++;
7417             $self->{nc}
7418 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7419             } else {
7420 0           $self->{set_nc}->($self);
7421             }
7422            
7423 0           return ($self->{ct}); # ATTLIST
7424 0           redo A;
7425             } elsif ($nc == EOF_CHAR) {
7426 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7427 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7428            
7429 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7430 0           $self->{line_prev} = $self->{line};
7431 0           $self->{column_prev} = $self->{column};
7432 0           $self->{column}++;
7433             $self->{nc}
7434 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7435             } else {
7436 0           $self->{set_nc}->($self);
7437             }
7438            
7439             ## Discard the current token.
7440 0           redo A;
7441             } else {
7442 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7443 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7444             ## Reconsume.
7445 0           redo A;
7446             }
7447             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7448 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7449             ## XML5: No parse error.
7450 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7451 0           $self->{state} = BOGUS_MD_STATE;
7452             ## Reconsume.
7453 0           redo A;
7454             } elsif ($nc == 0x0022) { # "
7455             # XXX parse error?
7456             ## XML5: Same as "anything else".
7457 0           $self->{ca}->{value} = '';
7458 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7459            
7460 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7461 0           $self->{line_prev} = $self->{line};
7462 0           $self->{column_prev} = $self->{column};
7463 0           $self->{column}++;
7464             $self->{nc}
7465 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7466             } else {
7467 0           $self->{set_nc}->($self);
7468             }
7469            
7470 0           redo A;
7471             } elsif ($nc == 0x0027) { # '
7472             # XXX parse error?
7473             ## XML5: Same as "anything else".
7474 0           $self->{ca}->{value} = '';
7475 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7476            
7477 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7478 0           $self->{line_prev} = $self->{line};
7479 0           $self->{column_prev} = $self->{column};
7480 0           $self->{column}++;
7481             $self->{nc}
7482 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7483             } else {
7484 0           $self->{set_nc}->($self);
7485             }
7486            
7487 0           redo A;
7488             } elsif ($nc == 0x003E) { # >
7489             ## XML5: Same as "anything else".
7490 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7491 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7492            
7493 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7494 0           $self->{line_prev} = $self->{line};
7495 0           $self->{column_prev} = $self->{column};
7496 0           $self->{column}++;
7497             $self->{nc}
7498 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7499             } else {
7500 0           $self->{set_nc}->($self);
7501             }
7502            
7503 0           return ($self->{ct}); # ATTLIST
7504 0           redo A;
7505             } elsif ($nc == EOF_CHAR) {
7506             ## XML5: No parse error.
7507 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7508 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7509            
7510 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7511 0           $self->{line_prev} = $self->{line};
7512 0           $self->{column_prev} = $self->{column};
7513 0           $self->{column}++;
7514             $self->{nc}
7515 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7516             } else {
7517 0           $self->{set_nc}->($self);
7518             }
7519            
7520             ## Discard the current token.
7521 0           redo A;
7522             } else {
7523 0           $self->{ca}->{default} = chr $nc;
7524 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7525            
7526 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7527 0           $self->{line_prev} = $self->{line};
7528 0           $self->{column_prev} = $self->{column};
7529 0           $self->{column}++;
7530             $self->{nc}
7531 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7532             } else {
7533 0           $self->{set_nc}->($self);
7534             }
7535            
7536 0           redo A;
7537             }
7538             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7539 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7540 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7541            
7542 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7543 0           $self->{line_prev} = $self->{line};
7544 0           $self->{column_prev} = $self->{column};
7545 0           $self->{column}++;
7546             $self->{nc}
7547 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7548             } else {
7549 0           $self->{set_nc}->($self);
7550             }
7551            
7552 0           redo A;
7553             } elsif ($nc == 0x0022) { # "
7554             ## XML5: Same as "anything else".
7555 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7556 0           $self->{ca}->{value} = '';
7557 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7558            
7559 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560 0           $self->{line_prev} = $self->{line};
7561 0           $self->{column_prev} = $self->{column};
7562 0           $self->{column}++;
7563             $self->{nc}
7564 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565             } else {
7566 0           $self->{set_nc}->($self);
7567             }
7568            
7569 0           redo A;
7570             } elsif ($nc == 0x0027) { # '
7571             ## XML5: Same as "anything else".
7572 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7573 0           $self->{ca}->{value} = '';
7574 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7575            
7576 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7577 0           $self->{line_prev} = $self->{line};
7578 0           $self->{column_prev} = $self->{column};
7579 0           $self->{column}++;
7580             $self->{nc}
7581 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7582             } else {
7583 0           $self->{set_nc}->($self);
7584             }
7585            
7586 0           redo A;
7587             } elsif ($nc == 0x003E) { # >
7588             ## XML5: Same as "anything else".
7589 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7590 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7591            
7592 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7593 0           $self->{line_prev} = $self->{line};
7594 0           $self->{column_prev} = $self->{column};
7595 0           $self->{column}++;
7596             $self->{nc}
7597 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7598             } else {
7599 0           $self->{set_nc}->($self);
7600             }
7601            
7602 0           return ($self->{ct}); # ATTLIST
7603 0           redo A;
7604             } elsif ($nc == EOF_CHAR) {
7605             ## XML5: No parse error.
7606 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7607 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7608 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7609            
7610 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7611 0           $self->{line_prev} = $self->{line};
7612 0           $self->{column_prev} = $self->{column};
7613 0           $self->{column}++;
7614             $self->{nc}
7615 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7616             } else {
7617 0           $self->{set_nc}->($self);
7618             }
7619            
7620             ## Discard the current token.
7621 0           redo A;
7622             } else {
7623 0           $self->{ca}->{default} .= chr $nc;
7624             ## Stay in the state.
7625            
7626 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7627 0           $self->{line_prev} = $self->{line};
7628 0           $self->{column_prev} = $self->{column};
7629 0           $self->{column}++;
7630             $self->{nc}
7631 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7632             } else {
7633 0           $self->{set_nc}->($self);
7634             }
7635            
7636 0           redo A;
7637             }
7638             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7639 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7640             ## Stay in the state.
7641            
7642 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643 0           $self->{line_prev} = $self->{line};
7644 0           $self->{column_prev} = $self->{column};
7645 0           $self->{column}++;
7646             $self->{nc}
7647 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648             } else {
7649 0           $self->{set_nc}->($self);
7650             }
7651            
7652 0           redo A;
7653             } elsif ($nc == 0x0022) { # "
7654 0           $self->{ca}->{value} = '';
7655 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7656            
7657 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7658 0           $self->{line_prev} = $self->{line};
7659 0           $self->{column_prev} = $self->{column};
7660 0           $self->{column}++;
7661             $self->{nc}
7662 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7663             } else {
7664 0           $self->{set_nc}->($self);
7665             }
7666            
7667 0           redo A;
7668             } elsif ($nc == 0x0027) { # '
7669 0           $self->{ca}->{value} = '';
7670 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7671            
7672 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7673 0           $self->{line_prev} = $self->{line};
7674 0           $self->{column_prev} = $self->{column};
7675 0           $self->{column}++;
7676             $self->{nc}
7677 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7678             } else {
7679 0           $self->{set_nc}->($self);
7680             }
7681            
7682 0           redo A;
7683             } elsif ($nc == 0x003E) { # >
7684 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7685 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7686            
7687 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7688 0           $self->{line_prev} = $self->{line};
7689 0           $self->{column_prev} = $self->{column};
7690 0           $self->{column}++;
7691             $self->{nc}
7692 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7693             } else {
7694 0           $self->{set_nc}->($self);
7695             }
7696            
7697 0           return ($self->{ct}); # ATTLIST
7698 0           redo A;
7699             } elsif ($nc == EOF_CHAR) {
7700             ## XML5: No parse error.
7701 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7702 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7703 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7704            
7705 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7706 0           $self->{line_prev} = $self->{line};
7707 0           $self->{column_prev} = $self->{column};
7708 0           $self->{column}++;
7709             $self->{nc}
7710 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7711             } else {
7712 0           $self->{set_nc}->($self);
7713             }
7714            
7715             ## Discard the current token.
7716 0           redo A;
7717             } else {
7718             ## XML5: Not defined yet.
7719 0 0         if ($self->{ca}->{default} eq 'FIXED') {
7720 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7721             } else {
7722 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7723 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7724             }
7725             ## Reconsume.
7726 0           redo A;
7727             }
7728             } elsif ($state == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7729 0 0 0       if ($is_space->{$nc} or
      0        
7730             $nc == EOF_CHAR or
7731             $nc == 0x003E) { # >
7732 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7733             ## Reconsume.
7734 0           redo A;
7735             } else {
7736 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7737 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7738             ## Reconsume.
7739 0           redo A;
7740             }
7741             } elsif ($state == NDATA_STATE) {
7742             ## ASCII case-insensitive
7743 0 0 0       if ($nc == [
    0 0        
      0        
7744             undef,
7745             0x0044, # D
7746             0x0041, # A
7747             0x0054, # T
7748             NEVER_CHAR, # (A)
7749             ]->[length $self->{kwd}] or
7750             $nc == [
7751             undef,
7752             0x0064, # d
7753             0x0061, # a
7754             0x0074, # t
7755             NEVER_CHAR, # (a)
7756             ]->[length $self->{kwd}]) {
7757            
7758             ## Stay in the state.
7759 0           $self->{kwd} .= chr $nc;
7760            
7761 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7762 0           $self->{line_prev} = $self->{line};
7763 0           $self->{column_prev} = $self->{column};
7764 0           $self->{column}++;
7765             $self->{nc}
7766 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7767             } else {
7768 0           $self->{set_nc}->($self);
7769             }
7770            
7771 0           redo A;
7772             } elsif ((length $self->{kwd}) == 4 and
7773             ($nc == 0x0041 or # A
7774             $nc == 0x0061)) { # a
7775 0 0 0       if ($self->{kwd} ne 'NDAT' or $nc == 0x0061) { # a
7776            
7777             $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7778             text => 'NDATA',
7779             line => $self->{line_prev},
7780 0           column => $self->{column_prev} - 4);
7781             } else {
7782            
7783             }
7784 0           $self->{state} = AFTER_NDATA_STATE;
7785            
7786 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787 0           $self->{line_prev} = $self->{line};
7788 0           $self->{column_prev} = $self->{column};
7789 0           $self->{column}++;
7790             $self->{nc}
7791 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792             } else {
7793 0           $self->{set_nc}->($self);
7794             }
7795            
7796 0           redo A;
7797             } else {
7798             $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7799             line => $self->{line_prev},
7800             column => $self->{column_prev} + 1
7801 0           - length $self->{kwd});
7802            
7803 0           $self->{state} = BOGUS_MD_STATE;
7804             ## Reconsume.
7805 0           redo A;
7806             }
7807             } elsif ($state == AFTER_NDATA_STATE) {
7808 0 0         if ($is_space->{$nc}) {
    0          
    0          
7809 0           $self->{state} = BEFORE_NOTATION_NAME_STATE;
7810            
7811 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7812 0           $self->{line_prev} = $self->{line};
7813 0           $self->{column_prev} = $self->{column};
7814 0           $self->{column}++;
7815             $self->{nc}
7816 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7817             } else {
7818 0           $self->{set_nc}->($self);
7819             }
7820            
7821 0           redo A;
7822             } elsif ($nc == 0x003E) { # >
7823 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7824 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7825            
7826 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827 0           $self->{line_prev} = $self->{line};
7828 0           $self->{column_prev} = $self->{column};
7829 0           $self->{column}++;
7830             $self->{nc}
7831 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832             } else {
7833 0           $self->{set_nc}->($self);
7834             }
7835            
7836 0           return ($self->{ct}); # ENTITY
7837 0           redo A;
7838             } elsif ($nc == EOF_CHAR) {
7839 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7840 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7841            
7842 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7843 0           $self->{line_prev} = $self->{line};
7844 0           $self->{column_prev} = $self->{column};
7845 0           $self->{column}++;
7846             $self->{nc}
7847 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7848             } else {
7849 0           $self->{set_nc}->($self);
7850             }
7851            
7852             ## Discard the current token.
7853 0           redo A;
7854             } else {
7855             $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7856             line => $self->{line_prev},
7857             column => $self->{column_prev} + 1
7858 0           - length $self->{kwd});
7859 0           $self->{state} = BOGUS_MD_STATE;
7860             ## Reconsume.
7861 0           redo A;
7862             }
7863             } elsif ($state == BEFORE_NOTATION_NAME_STATE) {
7864 0 0         if ($is_space->{$nc}) {
    0          
    0          
7865             ## Stay in the state.
7866            
7867 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7868 0           $self->{line_prev} = $self->{line};
7869 0           $self->{column_prev} = $self->{column};
7870 0           $self->{column}++;
7871             $self->{nc}
7872 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7873             } else {
7874 0           $self->{set_nc}->($self);
7875             }
7876            
7877 0           redo A;
7878             } elsif ($nc == 0x003E) { # >
7879 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7880 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7881            
7882 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7883 0           $self->{line_prev} = $self->{line};
7884 0           $self->{column_prev} = $self->{column};
7885 0           $self->{column}++;
7886             $self->{nc}
7887 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7888             } else {
7889 0           $self->{set_nc}->($self);
7890             }
7891            
7892 0           return ($self->{ct}); # ENTITY
7893 0           redo A;
7894             } elsif ($nc == EOF_CHAR) {
7895 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7896 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7897            
7898 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7899 0           $self->{line_prev} = $self->{line};
7900 0           $self->{column_prev} = $self->{column};
7901 0           $self->{column}++;
7902             $self->{nc}
7903 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7904             } else {
7905 0           $self->{set_nc}->($self);
7906             }
7907            
7908             ## Discard the current token.
7909 0           redo A;
7910             } else {
7911 0 0         if ($nc == 0x0000) {
7912 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7913             }
7914 0 0         $self->{ct}->{notation} = $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
7915 0           $self->{state} = NOTATION_NAME_STATE;
7916            
7917 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7918 0           $self->{line_prev} = $self->{line};
7919 0           $self->{column_prev} = $self->{column};
7920 0           $self->{column}++;
7921             $self->{nc}
7922 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7923             } else {
7924 0           $self->{set_nc}->($self);
7925             }
7926            
7927 0           redo A;
7928             }
7929             } elsif ($state == NOTATION_NAME_STATE) {
7930 0 0         if ($is_space->{$nc}) {
    0          
    0          
7931 0           $self->{state} = AFTER_MD_DEF_STATE;
7932            
7933 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7934 0           $self->{line_prev} = $self->{line};
7935 0           $self->{column_prev} = $self->{column};
7936 0           $self->{column}++;
7937             $self->{nc}
7938 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7939             } else {
7940 0           $self->{set_nc}->($self);
7941             }
7942            
7943 0           redo A;
7944             } elsif ($nc == 0x003E) { # >
7945 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7946            
7947 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7948 0           $self->{line_prev} = $self->{line};
7949 0           $self->{column_prev} = $self->{column};
7950 0           $self->{column}++;
7951             $self->{nc}
7952 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7953             } else {
7954 0           $self->{set_nc}->($self);
7955             }
7956            
7957 0           return ($self->{ct}); # ENTITY
7958 0           redo A;
7959             } elsif ($nc == EOF_CHAR) {
7960 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7961 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7962            
7963 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964 0           $self->{line_prev} = $self->{line};
7965 0           $self->{column_prev} = $self->{column};
7966 0           $self->{column}++;
7967             $self->{nc}
7968 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969             } else {
7970 0           $self->{set_nc}->($self);
7971             }
7972            
7973             ## Discard the current token.
7974 0           redo A;
7975             } else {
7976 0 0         if ($nc == 0x0000) {
7977 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7978             }
7979 0 0         $self->{ct}->{notation} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
7980             ## Stay in the state.
7981            
7982 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7983 0           $self->{line_prev} = $self->{line};
7984 0           $self->{column_prev} = $self->{column};
7985 0           $self->{column}++;
7986             $self->{nc}
7987 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7988             } else {
7989 0           $self->{set_nc}->($self);
7990             }
7991            
7992 0           redo A;
7993             }
7994             } elsif ($state == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7995 0 0         if ($nc == 0x0022) { # "
    0          
    0          
7996 0           $self->{state} = AFTER_MD_DEF_STATE;
7997            
7998 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7999 0           $self->{line_prev} = $self->{line};
8000 0           $self->{column_prev} = $self->{column};
8001 0           $self->{column}++;
8002             $self->{nc}
8003 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8004             } else {
8005 0           $self->{set_nc}->($self);
8006             }
8007            
8008 0           redo A;
8009             } elsif ($nc == 0x0026) { # &
8010 0           $self->{prev_state} = $state;
8011 0           $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8012 0           $self->{entity_add} = 0x0022; # "
8013            
8014 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8015 0           $self->{line_prev} = $self->{line};
8016 0           $self->{column_prev} = $self->{column};
8017 0           $self->{column}++;
8018             $self->{nc}
8019 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8020             } else {
8021 0           $self->{set_nc}->($self);
8022             }
8023            
8024 0           redo A;
8025             ## TODO: %
8026             } elsif ($nc == EOF_CHAR) {
8027 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8028 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8029             ## Reconsume.
8030             ## Discard the current token.
8031 0           redo A;
8032             } else {
8033 0 0         if ($nc == 0x0000) {
8034 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8035             }
8036 0 0         $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
8037            
8038 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039 0           $self->{line_prev} = $self->{line};
8040 0           $self->{column_prev} = $self->{column};
8041 0           $self->{column}++;
8042             $self->{nc}
8043 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044             } else {
8045 0           $self->{set_nc}->($self);
8046             }
8047            
8048 0           redo A;
8049             }
8050             } elsif ($state == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8051 0 0         if ($nc == 0x0027) { # '
    0          
    0          
8052 0           $self->{state} = AFTER_MD_DEF_STATE;
8053            
8054 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055 0           $self->{line_prev} = $self->{line};
8056 0           $self->{column_prev} = $self->{column};
8057 0           $self->{column}++;
8058             $self->{nc}
8059 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060             } else {
8061 0           $self->{set_nc}->($self);
8062             }
8063            
8064 0           redo A;
8065             } elsif ($nc == 0x0026) { # &
8066 0           $self->{prev_state} = $state;
8067 0           $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8068 0           $self->{entity_add} = 0x0027; # '
8069            
8070 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071 0           $self->{line_prev} = $self->{line};
8072 0           $self->{column_prev} = $self->{column};
8073 0           $self->{column}++;
8074             $self->{nc}
8075 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076             } else {
8077 0           $self->{set_nc}->($self);
8078             }
8079            
8080 0           redo A;
8081             ## TODO: %
8082             } elsif ($nc == EOF_CHAR) {
8083 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8084 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8085             ## Reconsume.
8086             ## Discard the current token.
8087 0           redo A;
8088             } else {
8089 0 0         if ($nc == 0x0000) {
8090 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8091             }
8092 0 0         $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
8093            
8094 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095 0           $self->{line_prev} = $self->{line};
8096 0           $self->{column_prev} = $self->{column};
8097 0           $self->{column}++;
8098             $self->{nc}
8099 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100             } else {
8101 0           $self->{set_nc}->($self);
8102             }
8103            
8104 0           redo A;
8105             }
8106             } elsif ($state == ENTITY_VALUE_ENTITY_STATE) {
8107 0 0 0       if ($is_space->{$nc} or
    0          
8108             {
8109             0x003C => 1, 0x0026 => 1, (EOF_CHAR) => 1, # <, &
8110             $self->{entity_add} => 1,
8111             }->{$nc}) {
8112             $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8113             line => $self->{line_prev},
8114             column => $self->{column_prev}
8115 0 0         + ($nc == EOF_CHAR ? 1 : 0));
8116             ## Don't consume
8117             ## Return nothing.
8118             #
8119             } elsif ($nc == 0x0023) { # #
8120 0           $self->{ca} = $self->{ct};
8121 0           $self->{state} = ENTITY_HASH_STATE;
8122 0           $self->{kwd} = '#';
8123            
8124 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8125 0           $self->{line_prev} = $self->{line};
8126 0           $self->{column_prev} = $self->{column};
8127 0           $self->{column}++;
8128             $self->{nc}
8129 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8130             } else {
8131 0           $self->{set_nc}->($self);
8132             }
8133            
8134 0           redo A;
8135             } else {
8136             #
8137             }
8138              
8139 0           $self->{ct}->{value} .= '&';
8140 0           $self->{state} = $self->{prev_state};
8141             ## Reconsume.
8142 0           redo A;
8143             } elsif ($state == AFTER_ELEMENT_NAME_STATE) {
8144 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
8145 0           $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8146            
8147 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8148 0           $self->{line_prev} = $self->{line};
8149 0           $self->{column_prev} = $self->{column};
8150 0           $self->{column}++;
8151             $self->{nc}
8152 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8153             } else {
8154 0           $self->{set_nc}->($self);
8155             }
8156            
8157 0           redo A;
8158             } elsif ($nc == 0x0028) { # (
8159 0           $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8160 0           $self->{ct}->{content} = ['('];
8161 0           $self->{group_depth} = 1;
8162            
8163 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8164 0           $self->{line_prev} = $self->{line};
8165 0           $self->{column_prev} = $self->{column};
8166 0           $self->{column}++;
8167             $self->{nc}
8168 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8169             } else {
8170 0           $self->{set_nc}->($self);
8171             }
8172            
8173 0           redo A;
8174             } elsif ($nc == 0x003E) { # >
8175 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8176 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8177            
8178 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8179 0           $self->{line_prev} = $self->{line};
8180 0           $self->{column_prev} = $self->{column};
8181 0           $self->{column}++;
8182             $self->{nc}
8183 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8184             } else {
8185 0           $self->{set_nc}->($self);
8186             }
8187            
8188 0           return ($self->{ct}); # ELEMENT
8189 0           redo A;
8190             } elsif ($nc == EOF_CHAR) {
8191 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8192 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8193            
8194 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8195 0           $self->{line_prev} = $self->{line};
8196 0           $self->{column_prev} = $self->{column};
8197 0           $self->{column}++;
8198             $self->{nc}
8199 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8200             } else {
8201 0           $self->{set_nc}->($self);
8202             }
8203            
8204             ## Discard the current token.
8205 0           redo A;
8206             } else {
8207 0 0         if ($nc == 0x0000) {
8208 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8209             }
8210 0 0         $self->{ct}->{content} = [$nc == 0x0000 ? "\x{FFFD}" : chr $nc];
8211 0           $self->{state} = CONTENT_KEYWORD_STATE;
8212            
8213 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8214 0           $self->{line_prev} = $self->{line};
8215 0           $self->{column_prev} = $self->{column};
8216 0           $self->{column}++;
8217             $self->{nc}
8218 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8219             } else {
8220 0           $self->{set_nc}->($self);
8221             }
8222            
8223 0           redo A;
8224             }
8225             } elsif ($state == CONTENT_KEYWORD_STATE) {
8226 0 0         if ($is_space->{$nc}) {
    0          
    0          
8227 0           $self->{state} = AFTER_MD_DEF_STATE;
8228            
8229 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8230 0           $self->{line_prev} = $self->{line};
8231 0           $self->{column_prev} = $self->{column};
8232 0           $self->{column}++;
8233             $self->{nc}
8234 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8235             } else {
8236 0           $self->{set_nc}->($self);
8237             }
8238            
8239 0           redo A;
8240             } elsif ($nc == 0x003E) { # >
8241 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8242            
8243 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8244 0           $self->{line_prev} = $self->{line};
8245 0           $self->{column_prev} = $self->{column};
8246 0           $self->{column}++;
8247             $self->{nc}
8248 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8249             } else {
8250 0           $self->{set_nc}->($self);
8251             }
8252            
8253 0           return ($self->{ct}); # ELEMENT
8254 0           redo A;
8255             } elsif ($nc == EOF_CHAR) {
8256 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8257 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8258            
8259 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8260 0           $self->{line_prev} = $self->{line};
8261 0           $self->{column_prev} = $self->{column};
8262 0           $self->{column}++;
8263             $self->{nc}
8264 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8265             } else {
8266 0           $self->{set_nc}->($self);
8267             }
8268            
8269             ## Discard the current token.
8270 0           redo A;
8271             } else {
8272 0 0         if ($nc == 0x0000) {
8273 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8274             }
8275 0 0         $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ELEMENT
8276             ## Stay in the state.
8277            
8278 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8279 0           $self->{line_prev} = $self->{line};
8280 0           $self->{column_prev} = $self->{column};
8281 0           $self->{column}++;
8282             $self->{nc}
8283 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8284             } else {
8285 0           $self->{set_nc}->($self);
8286             }
8287            
8288 0           redo A;
8289             }
8290             } elsif ($state == AFTER_CM_GROUP_OPEN_STATE) {
8291 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
8292             ## Stay in the state.
8293            
8294 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8295 0           $self->{line_prev} = $self->{line};
8296 0           $self->{column_prev} = $self->{column};
8297 0           $self->{column}++;
8298             $self->{nc}
8299 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8300             } else {
8301 0           $self->{set_nc}->($self);
8302             }
8303            
8304 0           redo A;
8305             } elsif ($nc == 0x0028) { # (
8306 0           $self->{group_depth}++;
8307 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8308             ## Stay in the state.
8309            
8310 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8311 0           $self->{line_prev} = $self->{line};
8312 0           $self->{column_prev} = $self->{column};
8313 0           $self->{column}++;
8314             $self->{nc}
8315 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8316             } else {
8317 0           $self->{set_nc}->($self);
8318             }
8319            
8320 0           redo A;
8321             } elsif ($nc == 0x007C or # |
8322             $nc == 0x002C) { # ,
8323 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8324             ## Stay in the state.
8325            
8326 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8327 0           $self->{line_prev} = $self->{line};
8328 0           $self->{column_prev} = $self->{column};
8329 0           $self->{column}++;
8330             $self->{nc}
8331 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8332             } else {
8333 0           $self->{set_nc}->($self);
8334             }
8335            
8336 0           redo A;
8337             } elsif ($nc == 0x0029) { # )
8338 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8339 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8340 0           $self->{group_depth}--;
8341 0           $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8342            
8343 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8344 0           $self->{line_prev} = $self->{line};
8345 0           $self->{column_prev} = $self->{column};
8346 0           $self->{column}++;
8347             $self->{nc}
8348 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8349             } else {
8350 0           $self->{set_nc}->($self);
8351             }
8352            
8353 0           redo A;
8354             } elsif ($nc == 0x003E) { # >
8355 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8356 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8357 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8358            
8359 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8360 0           $self->{line_prev} = $self->{line};
8361 0           $self->{column_prev} = $self->{column};
8362 0           $self->{column}++;
8363             $self->{nc}
8364 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8365             } else {
8366 0           $self->{set_nc}->($self);
8367             }
8368            
8369 0           return ($self->{ct}); # ELEMENT
8370 0           redo A;
8371             } elsif ($nc == EOF_CHAR) {
8372 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8373             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8374 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8375            
8376 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8377 0           $self->{line_prev} = $self->{line};
8378 0           $self->{column_prev} = $self->{column};
8379 0           $self->{column}++;
8380             $self->{nc}
8381 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8382             } else {
8383 0           $self->{set_nc}->($self);
8384             }
8385            
8386             ## Discard the current token.
8387 0           redo A;
8388             } else {
8389 0 0         if ($nc == 0x0000) {
8390 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8391             }
8392 0 0         push @{$self->{ct}->{content}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
  0            
8393 0           $self->{state} = CM_ELEMENT_NAME_STATE;
8394            
8395 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8396 0           $self->{line_prev} = $self->{line};
8397 0           $self->{column_prev} = $self->{column};
8398 0           $self->{column}++;
8399             $self->{nc}
8400 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8401             } else {
8402 0           $self->{set_nc}->($self);
8403             }
8404            
8405 0           redo A;
8406             }
8407             } elsif ($state == CM_ELEMENT_NAME_STATE) {
8408 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0 0        
    0          
    0          
    0          
8409 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8410            
8411 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8412 0           $self->{line_prev} = $self->{line};
8413 0           $self->{column_prev} = $self->{column};
8414 0           $self->{column}++;
8415             $self->{nc}
8416 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8417             } else {
8418 0           $self->{set_nc}->($self);
8419             }
8420            
8421 0           redo A;
8422             } elsif ($nc == 0x002A or # *
8423             $nc == 0x002B or # +
8424             $nc == 0x003F) { # ?
8425 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8426 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8427            
8428 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8429 0           $self->{line_prev} = $self->{line};
8430 0           $self->{column_prev} = $self->{column};
8431 0           $self->{column}++;
8432             $self->{nc}
8433 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8434             } else {
8435 0           $self->{set_nc}->($self);
8436             }
8437            
8438 0           redo A;
8439             } elsif ($nc == 0x007C or # |
8440             $nc == 0x002C) { # ,
8441 0 0         push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', ';
  0            
8442 0           $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8443            
8444 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8445 0           $self->{line_prev} = $self->{line};
8446 0           $self->{column_prev} = $self->{column};
8447 0           $self->{column}++;
8448             $self->{nc}
8449 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8450             } else {
8451 0           $self->{set_nc}->($self);
8452             }
8453            
8454 0           redo A;
8455             } elsif ($nc == 0x0029) { # )
8456 0           $self->{group_depth}--;
8457 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8458 0           $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8459            
8460 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8461 0           $self->{line_prev} = $self->{line};
8462 0           $self->{column_prev} = $self->{column};
8463 0           $self->{column}++;
8464             $self->{nc}
8465 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8466             } else {
8467 0           $self->{set_nc}->($self);
8468             }
8469            
8470 0           redo A;
8471             } elsif ($nc == 0x003E) { # >
8472 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8473 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8474 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8475            
8476 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8477 0           $self->{line_prev} = $self->{line};
8478 0           $self->{column_prev} = $self->{column};
8479 0           $self->{column}++;
8480             $self->{nc}
8481 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8482             } else {
8483 0           $self->{set_nc}->($self);
8484             }
8485            
8486 0           return ($self->{ct}); # ELEMENT
8487 0           redo A;
8488             } elsif ($nc == EOF_CHAR) {
8489 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8490             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8491 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8492            
8493 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8494 0           $self->{line_prev} = $self->{line};
8495 0           $self->{column_prev} = $self->{column};
8496 0           $self->{column}++;
8497             $self->{nc}
8498 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8499             } else {
8500 0           $self->{set_nc}->($self);
8501             }
8502            
8503             ## Discard the current token.
8504 0           redo A;
8505             } else {
8506 0 0         if ($nc == 0x0000) {
8507 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8508             }
8509 0 0         $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
8510             ## Stay in the state.
8511            
8512 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8513 0           $self->{line_prev} = $self->{line};
8514 0           $self->{column_prev} = $self->{column};
8515 0           $self->{column}++;
8516             $self->{nc}
8517 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8518             } else {
8519 0           $self->{set_nc}->($self);
8520             }
8521            
8522 0           redo A;
8523             }
8524             } elsif ($state == AFTER_CM_ELEMENT_NAME_STATE) {
8525 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
8526             ## Stay in the state.
8527            
8528 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8529 0           $self->{line_prev} = $self->{line};
8530 0           $self->{column_prev} = $self->{column};
8531 0           $self->{column}++;
8532             $self->{nc}
8533 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8534             } else {
8535 0           $self->{set_nc}->($self);
8536             }
8537            
8538 0           redo A;
8539             } elsif ($nc == 0x007C or # |
8540             $nc == 0x002C) { # ,
8541 0 0         push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', ';
  0            
8542 0           $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8543            
8544 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8545 0           $self->{line_prev} = $self->{line};
8546 0           $self->{column_prev} = $self->{column};
8547 0           $self->{column}++;
8548             $self->{nc}
8549 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8550             } else {
8551 0           $self->{set_nc}->($self);
8552             }
8553            
8554 0           redo A;
8555             } elsif ($nc == 0x0029) { # )
8556 0           $self->{group_depth}--;
8557 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8558 0           $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8559            
8560 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8561 0           $self->{line_prev} = $self->{line};
8562 0           $self->{column_prev} = $self->{column};
8563 0           $self->{column}++;
8564             $self->{nc}
8565 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8566             } else {
8567 0           $self->{set_nc}->($self);
8568             }
8569            
8570 0           redo A;
8571             } elsif ($nc == 0x003E) { # >
8572 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8573 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8574 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8575            
8576 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8577 0           $self->{line_prev} = $self->{line};
8578 0           $self->{column_prev} = $self->{column};
8579 0           $self->{column}++;
8580             $self->{nc}
8581 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8582             } else {
8583 0           $self->{set_nc}->($self);
8584             }
8585            
8586 0           return ($self->{ct}); # ELEMENT
8587 0           redo A;
8588             } elsif ($nc == EOF_CHAR) {
8589 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8590             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8591 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8592            
8593 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8594 0           $self->{line_prev} = $self->{line};
8595 0           $self->{column_prev} = $self->{column};
8596 0           $self->{column}++;
8597             $self->{nc}
8598 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8599             } else {
8600 0           $self->{set_nc}->($self);
8601             }
8602            
8603             ## Discard the current token.
8604 0           redo A;
8605             } else {
8606 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8607 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8608 0           $self->{state} = BOGUS_MD_STATE;
8609            
8610 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8611 0           $self->{line_prev} = $self->{line};
8612 0           $self->{column_prev} = $self->{column};
8613 0           $self->{column}++;
8614             $self->{nc}
8615 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8616             } else {
8617 0           $self->{set_nc}->($self);
8618             }
8619            
8620 0           redo A;
8621             }
8622             } elsif ($state == AFTER_CM_GROUP_CLOSE_STATE) {
8623 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
8624 0 0         if ($self->{group_depth}) {
8625 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8626             } else {
8627 0           $self->{state} = AFTER_MD_DEF_STATE;
8628             }
8629            
8630 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8631 0           $self->{line_prev} = $self->{line};
8632 0           $self->{column_prev} = $self->{column};
8633 0           $self->{column}++;
8634             $self->{nc}
8635 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8636             } else {
8637 0           $self->{set_nc}->($self);
8638             }
8639            
8640 0           redo A;
8641             } elsif ($nc == 0x002A or # *
8642             $nc == 0x002B or # +
8643             $nc == 0x003F) { # ?
8644 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8645 0 0         if ($self->{group_depth}) {
8646 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8647             } else {
8648 0           $self->{state} = AFTER_MD_DEF_STATE;
8649             }
8650            
8651 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8652 0           $self->{line_prev} = $self->{line};
8653 0           $self->{column_prev} = $self->{column};
8654 0           $self->{column}++;
8655             $self->{nc}
8656 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8657             } else {
8658 0           $self->{set_nc}->($self);
8659             }
8660            
8661 0           redo A;
8662             } elsif ($nc == 0x0029) { # )
8663 0 0         if ($self->{group_depth}) {
8664 0           $self->{group_depth}--;
8665 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8666             ## Stay in the state.
8667            
8668 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8669 0           $self->{line_prev} = $self->{line};
8670 0           $self->{column_prev} = $self->{column};
8671 0           $self->{column}++;
8672             $self->{nc}
8673 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8674             } else {
8675 0           $self->{set_nc}->($self);
8676             }
8677            
8678 0           redo A;
8679             } else {
8680 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8681 0           $self->{state} = BOGUS_MD_STATE;
8682             ## Reconsume.
8683 0           redo A;
8684             }
8685             } elsif ($nc == 0x003E) { # >
8686 0 0         if ($self->{group_depth}) {
8687 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8688 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8689             }
8690 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8691            
8692 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8693 0           $self->{line_prev} = $self->{line};
8694 0           $self->{column_prev} = $self->{column};
8695 0           $self->{column}++;
8696             $self->{nc}
8697 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8698             } else {
8699 0           $self->{set_nc}->($self);
8700             }
8701            
8702 0           return ($self->{ct}); # ELEMENT
8703 0           redo A;
8704             } elsif ($nc == EOF_CHAR) {
8705 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8706             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8707 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8708            
8709 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8710 0           $self->{line_prev} = $self->{line};
8711 0           $self->{column_prev} = $self->{column};
8712 0           $self->{column}++;
8713             $self->{nc}
8714 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8715             } else {
8716 0           $self->{set_nc}->($self);
8717             }
8718            
8719             ## Discard the current token.
8720 0           redo A;
8721             } else {
8722 0 0         if ($self->{group_depth}) {
8723 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8724             } else {
8725 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8726 0           $self->{state} = BOGUS_MD_STATE;
8727             }
8728             ## Reconsume.
8729 0           redo A;
8730             }
8731             } elsif ($state == AFTER_MD_DEF_STATE) {
8732 0 0         if ($is_space->{$nc}) {
    0          
    0          
8733             ## Stay in the state.
8734            
8735 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8736 0           $self->{line_prev} = $self->{line};
8737 0           $self->{column_prev} = $self->{column};
8738 0           $self->{column}++;
8739             $self->{nc}
8740 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8741             } else {
8742 0           $self->{set_nc}->($self);
8743             }
8744            
8745 0           redo A;
8746             } elsif ($nc == 0x003E) { # >
8747 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8748            
8749 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8750 0           $self->{line_prev} = $self->{line};
8751 0           $self->{column_prev} = $self->{column};
8752 0           $self->{column}++;
8753             $self->{nc}
8754 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8755             } else {
8756 0           $self->{set_nc}->($self);
8757             }
8758            
8759 0           return ($self->{ct}); # ENTITY/ELEMENT
8760 0           redo A;
8761             } elsif ($nc == EOF_CHAR) {
8762 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8763 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8764            
8765 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8766 0           $self->{line_prev} = $self->{line};
8767 0           $self->{column_prev} = $self->{column};
8768 0           $self->{column}++;
8769             $self->{nc}
8770 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8771             } else {
8772 0           $self->{set_nc}->($self);
8773             }
8774            
8775             ## Discard the current token.
8776 0           redo A;
8777             } else {
8778 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8779 0           $self->{state} = BOGUS_MD_STATE;
8780             ## Reconsume.
8781 0           redo A;
8782             }
8783             } elsif ($state == BOGUS_MD_STATE) {
8784 0 0         if ($nc == 0x003E) { # >
    0          
8785 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8786            
8787 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8788 0           $self->{line_prev} = $self->{line};
8789 0           $self->{column_prev} = $self->{column};
8790 0           $self->{column}++;
8791             $self->{nc}
8792 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8793             } else {
8794 0           $self->{set_nc}->($self);
8795             }
8796            
8797 0           return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8798 0           redo A;
8799             } elsif ($nc == EOF_CHAR) {
8800 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8801             ## Reconsume.
8802             ## Discard the current token.
8803 0           redo A;
8804             } else {
8805             ## Stay in the state.
8806            
8807 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8808 0           $self->{line_prev} = $self->{line};
8809 0           $self->{column_prev} = $self->{column};
8810 0           $self->{column}++;
8811             $self->{nc}
8812 0           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8813             } else {
8814 0           $self->{set_nc}->($self);
8815             }
8816            
8817 0           redo A;
8818             }
8819             } else {
8820 0           die "$0: $state: Unknown state";
8821             }
8822             } # A
8823              
8824 0           die "$0: _get_next_token: unexpected case";
8825             } # _get_next_token
8826              
8827             1;
8828              
8829             # Copyright 2007-2011 Wakaba.
8830             #
8831             # This library is free software; you can redistribute it and/or modify
8832             # it under the same terms as Perl itself.
8833