File Coverage

blib/lib/HTML/HTML5/Parser/Tokenizer.pm
Criterion Covered Total %
statement 14 3731 0.3
branch 0 1870 0.0
condition 0 453 0.0
subroutine 5 7 71.4
pod n/a
total 19 6061 0.3


line stmt bran cond sub pod time code
1             package HTML::HTML5::Parser::Tokenizer; # -*- Perl -*-
2             ## skip Test::Tabs
3 10     10   72 use strict;
  10         23  
  10         912  
4             our $VERSION='0.301';
5              
6             ## This module implements the tokenization phase of both HTML5 and
7             ## XML5. Notes like this are usually based on the latest HTML
8             ## specification. Since XML is different from HTML, and since the XML5
9             ## specification is no longer maintained, there are a few
10             ## differences from HTML's tokenization. Such differences are marked
11             ## by prefix "XML5:".
12              
13             ## Warnings that depend on the HTML/XML input stream, such as ones
14             ## related to surrogate code positions, are not useful.
15 10     10   60 no warnings 'utf8';
  10         25  
  10         1575  
16              
17             ## ------ Token types ------
18              
19             BEGIN { # export setup; runs at compile time
20 10     10   60 require Exporter;
21 10         99 push our @ISA, 'Exporter'; # inherit import() from Exporter
22             ## Token type constants that callers may import by name.
23 10         53 our @EXPORT_OK = qw(
24             DOCTYPE_TOKEN
25             COMMENT_TOKEN
26             START_TAG_TOKEN
27             END_TAG_TOKEN
28             END_OF_FILE_TOKEN
29             CHARACTER_TOKEN
30             PI_TOKEN
31             ABORT_TOKEN
32             END_OF_DOCTYPE_TOKEN
33             ATTLIST_TOKEN
34             ELEMENT_TOKEN
35             GENERAL_ENTITY_TOKEN
36             PARAMETER_ENTITY_TOKEN
37             NOTATION_TOKEN
38             );
39             ## The ':token' tag names the same fourteen constants as @EXPORT_OK.
40 10         3137 our %EXPORT_TAGS = (
41             token => [qw(
42             DOCTYPE_TOKEN
43             COMMENT_TOKEN
44             START_TAG_TOKEN
45             END_TAG_TOKEN
46             END_OF_FILE_TOKEN
47             CHARACTER_TOKEN
48             PI_TOKEN
49             ABORT_TOKEN
50             END_OF_DOCTYPE_TOKEN
51             ATTLIST_TOKEN
52             ELEMENT_TOKEN
53             GENERAL_ENTITY_TOKEN
54             PARAMETER_ENTITY_TOKEN
55             NOTATION_TOKEN
56             )],
57             );
58             } # BEGIN
59              
60             sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
61             sub COMMENT_TOKEN () { 2 }
62             sub START_TAG_TOKEN () { 3 }
63             sub END_TAG_TOKEN () { 4 }
64             sub END_OF_FILE_TOKEN () { 5 }
65             sub CHARACTER_TOKEN () { 6 }
66             sub PI_TOKEN () { 7 } ## NOTE: XML only.
67             sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
68             sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
69             sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
70             sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
71             sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
72             sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
73             sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
74              
75             ## XML5: XML5 has "empty tag token". In this implementation, it is
76             ## represented as a start tag token with $self->{self_closing} flag
77             ## set to true.
78              
79             ## XML5: XML5 has "short end tag token". In this implementation, it
80             ## is represented as an end tag token with $token->{tag_name} set
81             ## to an empty string.
82              
83             package HTML::HTML5::Parser::TagSoupParser;
84              
85 10     10   2853 BEGIN { HTML::HTML5::Parser::Tokenizer->import (':token') }
86              
87 10     10   27702 use HTML::HTML5::Entities qw[%entity2char];
  10         330215  
  10         1064977  
88              
89             ## ------ Tokenizer states ------
90              
91             sub DATA_STATE () { 0 }
92             sub RCDATA_STATE () { 107 }
93             sub RAWTEXT_STATE () { 108 }
94             sub SCRIPT_DATA_STATE () { 109 }
95             sub PLAINTEXT_STATE () { 110 }
96             sub TAG_OPEN_STATE () { 2 }
97             sub RCDATA_LT_STATE () { 111 }
98             sub RAWTEXT_LT_STATE () { 112 }
99             sub SCRIPT_DATA_LT_STATE () { 113 }
100             sub CLOSE_TAG_OPEN_STATE () { 3 }
101             sub RCDATA_END_TAG_OPEN_STATE () { 114 }
102             sub RAWTEXT_END_TAG_OPEN_STATE () { 115 }
103             sub SCRIPT_DATA_END_TAG_OPEN_STATE () { 116 }
104             sub SCRIPT_DATA_ESCAPE_START_STATE () { 1 }
105             sub SCRIPT_DATA_ESCAPE_START_DASH_STATE () { 12 }
106             sub SCRIPT_DATA_ESCAPED_STATE () { 117 }
107             sub SCRIPT_DATA_ESCAPED_DASH_STATE () { 118 }
108             sub SCRIPT_DATA_ESCAPED_DASH_DASH_STATE () { 119 }
109             sub SCRIPT_DATA_ESCAPED_LT_STATE () { 120 }
110             sub SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE () { 121 }
111             sub SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE () { 122 }
112             sub SCRIPT_DATA_DOUBLE_ESCAPED_STATE () { 123 }
113             sub SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE () { 124 }
114             sub SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE () { 125 }
115             sub SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE () { 126 }
116             sub SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE () { 127 }
117             sub TAG_NAME_STATE () { 4 }
118             sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
119             sub ATTRIBUTE_NAME_STATE () { 6 }
120             sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
121             sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
122             sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
123             sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
124             sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
125             sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
126             sub COMMENT_START_STATE () { 14 }
127             sub COMMENT_START_DASH_STATE () { 15 }
128             sub COMMENT_STATE () { 16 }
129             sub COMMENT_END_STATE () { 17 }
130             sub COMMENT_END_BANG_STATE () { 102 }
131             #sub COMMENT_END_SPACE_STATE () { 103 } ## REMOVED
132             sub COMMENT_END_DASH_STATE () { 18 }
133             sub BOGUS_COMMENT_STATE () { 19 }
134             sub DOCTYPE_STATE () { 20 }
135             sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
136             sub DOCTYPE_NAME_STATE () { 22 }
137             sub AFTER_DOCTYPE_NAME_STATE () { 23 }
138             sub AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE () { 104 }
139             sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
140             sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
141             sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
142             sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
143             sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
144             sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
145             sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
146             sub BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE () { 105 }
147             sub AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE () { 106 }
148             sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
149             sub BOGUS_DOCTYPE_STATE () { 32 }
150             sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
151             sub SELF_CLOSING_START_TAG_STATE () { 34 }
152             sub CDATA_SECTION_STATE () { 35 }
153             sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
154             sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
155             sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
156             #sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
157             sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
158             sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
159             sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
160             sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
161             ##
162             ## NOTE: "Entity data state", "entity in attribute value state", and
163             ## the "consume a character reference" algorithm, are jointly
164             ## implemented as the following six states:
165             sub ENTITY_STATE () { 44 }
166             sub ENTITY_HASH_STATE () { 45 }
167             sub NCR_NUM_STATE () { 46 }
168             sub HEXREF_X_STATE () { 47 }
169             sub HEXREF_HEX_STATE () { 48 }
170             sub ENTITY_NAME_STATE () { 49 }
171             ##
172             ## XML-only states
173             sub DATA_MSE1_STATE () { 50 }
174             sub DATA_MSE2_STATE () { 128 } # last
175             sub PI_STATE () { 51 }
176             sub PI_TARGET_STATE () { 52 }
177             sub PI_TARGET_AFTER_STATE () { 53 }
178             sub PI_DATA_STATE () { 54 }
179             sub PI_AFTER_STATE () { 55 }
180             sub PI_DATA_AFTER_STATE () { 56 }
181             sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
182             sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
183             sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
184             sub DOCTYPE_TAG_STATE () { 60 }
185             sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
186             sub MD_ATTLIST_STATE () { 62 }
187             sub MD_E_STATE () { 63 }
188             sub MD_ELEMENT_STATE () { 64 }
189             sub MD_ENTITY_STATE () { 65 }
190             sub MD_NOTATION_STATE () { 66 }
191             sub DOCTYPE_MD_STATE () { 67 }
192             sub BEFORE_MD_NAME_STATE () { 68 }
193             sub MD_NAME_STATE () { 69 }
194             sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
195             sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
196             sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
197             sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
198             sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
199             sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
200             sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
201             sub ALLOWED_TOKEN_STATE () { 77 }
202             sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
203             sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
204             sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
205             sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
206             sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
207             sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
208             sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
209             sub BEFORE_NDATA_STATE () { 85 }
210             sub NDATA_STATE () { 86 }
211             sub AFTER_NDATA_STATE () { 87 }
212             sub BEFORE_NOTATION_NAME_STATE () { 88 }
213             sub NOTATION_NAME_STATE () { 89 }
214             sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
215             sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
216             sub ENTITY_VALUE_ENTITY_STATE () { 92 }
217             sub AFTER_ELEMENT_NAME_STATE () { 93 }
218             sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
219             sub CONTENT_KEYWORD_STATE () { 95 }
220             sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
221             sub CM_ELEMENT_NAME_STATE () { 97 }
222             sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
223             sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
224             sub AFTER_MD_DEF_STATE () { 100 }
225             sub BOGUS_MD_STATE () { 101 }
226              
227             ## ------ Tree constructor state constants ------
228              
229             ## Whether the parsed string is in the foreign island or not affects
230             ## how tokenization is done, unfortunately. These are a copy of some
231             ## of the tokenization state constants. See Whatpm::HTML for the full
232             ## list and the descriptions for constants.
233              
234             sub FOREIGN_EL () { 0b1_00000000000 }
235              
236             ## ------ Character reference mappings ------
237              
238             my $charref_map = {
239             0x00 => 0xFFFD, # REPLACEMENT CHARACTER
240             0x0D => 0x000D, # CARRIAGE RETURN
241             0x80 => 0x20AC,
242             0x81 => 0x0081,
243             0x82 => 0x201A,
244             0x83 => 0x0192,
245             0x84 => 0x201E,
246             0x85 => 0x2026,
247             0x86 => 0x2020,
248             0x87 => 0x2021,
249             0x88 => 0x02C6,
250             0x89 => 0x2030,
251             0x8A => 0x0160,
252             0x8B => 0x2039,
253             0x8C => 0x0152,
254             0x8D => 0x008D,
255             0x8E => 0x017D,
256             0x8F => 0x008F,
257             0x90 => 0x0090,
258             0x91 => 0x2018,
259             0x92 => 0x2019,
260             0x93 => 0x201C,
261             0x94 => 0x201D,
262             0x95 => 0x2022,
263             0x96 => 0x2013,
264             0x97 => 0x2014,
265             0x98 => 0x02DC,
266             0x99 => 0x2122,
267             0x9A => 0x0161,
268             0x9B => 0x203A,
269             0x9C => 0x0153,
270             0x9D => 0x009D,
271             0x9E => 0x017E,
272             0x9F => 0x0178,
273             }; # $charref_map
274             $charref_map->{$_} = 0xFFFD # REPLACEMENT CHARACTER
275             for 0xD800..0xDFFF;
276             $charref_map->{$_} = $_
277             for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
278             0xFDD0..0xFDEF,
279             0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
280             0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
281             0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
282             0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
283             0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
284              
285             ## ------ Special character-like constants ------
286              
287             ## The "EOF" pseudo-character in the HTML parsing algorithm.
288             sub EOF_CHAR () { -1 }
289              
290             ## A pseudo-character code that can never appear in the input stream.
291             sub NEVER_CHAR () { -2 }
292              
293             ## ------ The tokenizer ------
294              
295             ## Implementations MUST act as if they used the state machine in the spec.
296              
297             sub _initialize_tokenizer ($) {
298 0     0     my $self = shift;
299             ## Resets the tokenizer fields on $self to their start-of-input values.
300             ## NOTE: Fields set by |new| constructor:
301             #$self->{level}
302             #$self->{set_nc}
303             #$self->{parse_error}
304             #$self->{is_xml} (if XML)
305              
306 0           $self->{state} = DATA_STATE; # MUST: tokenization starts in the data state
307             #$self->{kwd} = ''; # State-dependent keyword; initialized when used
308             #$self->{entity__value}; # initialized when used
309             #$self->{entity__match}; # initialized when used
310 0           undef $self->{ct}; # current token
311 0           undef $self->{ca}; # current attribute
312 0           undef $self->{last_stag_name}; # last emitted start tag name
313             #$self->{prev_state}; # initialized when used
314 0           delete $self->{self_closing}; # drop any leftover self-closing flag
315 0           $self->{char_buffer} = '';
316 0           $self->{char_buffer_pos} = 0;
317 0           $self->{nc} = -1; # next input character; -1 is the same value as EOF_CHAR
318             #$self->{next_nc}
319             ## The buffer was emptied just above (length 0, position 0), so the
320 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) { # test is false on this path
321 0           $self->{line_prev} = $self->{line};
322 0           $self->{column_prev} = $self->{column};
323 0           $self->{column}++;
324 0           $self->{nc}
325             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
326             } else {
327 0           $self->{set_nc}->($self); # callback installed by |new|; presumably loads $self->{nc} -- confirm
328             }
329            
330 0           $self->{token} = []; # fresh empty array ref; NOTE(review): looks like the pending-token queue -- confirm
331             # $self->{escape}
332             } # _initialize_tokenizer
333              
334             ## A token has:
335             ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
336             ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
337             ## ->{name} (DOCTYPE_TOKEN)
338             ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
339             ## ->{target} (PI_TOKEN)
340             ## ->{pubid} (DOCTYPE_TOKEN)
341             ## ->{sysid} (DOCTYPE_TOKEN)
342             ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
343             ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
344             ## ->{name}
345             ## ->{value}
346             ## ->{has_reference} == 1 or 0
347             ## ->{index}: Index of the attribute in a tag.
348             ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
349             ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
350             ## ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
351             ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
352              
353             ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
354             ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
355             ## while the token is pushed back to the stack.
356              
357             ## Emitted token MUST immediately be handled by the tree construction state.
358              
359             ## Before each step, UA MAY check to see if either one of the scripts in
360             ## "list of scripts that will execute as soon as possible" or the first
361             ## script in the "list of scripts that will execute asynchronously",
362             ## has completed loading. If one has, then it MUST be executed
363             ## and removed from the list.
364              
365             ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
366             ## (This requirement was dropped from HTML5 spec, unfortunately.)
367              
368             my $is_space = { # code point => 1 for each character counted as a space character
369             0x0009 => 1, # CHARACTER TABULATION (HT)
370             0x000A => 1, # LINE FEED (LF)
371             #0x000B => 0, # LINE TABULATION (VT): left out, so a lookup yields undef (false)
372             0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
373             0x000D => 1, # CARRIAGE RETURN (CR)
374             0x0020 => 1, # SPACE (SP)
375             }; # $is_space
376              
377             sub KEY_ELSE_CHAR () { 255 } # action-table slot for the fallback ('... else') entries
378             sub KEY_ULATIN_CHAR () { 254 } # slot for the '... uc' entries (presumably A-Z -- confirm)
379             sub KEY_LLATIN_CHAR () { 253 } # slot for the '... lc' entries (presumably a-z -- confirm)
380             sub KEY_EOF_CHAR () { 252 } # slot for the '... eof' entries
381             sub KEY_SPACE_CHAR () { 251 } # slot for the '... sp' entries (presumably the $is_space characters -- confirm)
382              
383             my $Action;
384             my $XMLAction;
385             $Action->[DATA_STATE]->[0x0026] = {
386             name => 'data &',
387             state => ENTITY_STATE, # "entity data state" + "consume a character reference"
388             state_set => {entity_add => -1, prev_state => DATA_STATE},
389             };
390             $Action->[DATA_STATE]->[0x003C] = {
391             name => 'data <',
392             state => TAG_OPEN_STATE,
393             };
394             $Action->[DATA_STATE]->[KEY_EOF_CHAR] = {
395             name => 'data eof',
396             emit => END_OF_FILE_TOKEN,
397             reconsume => 1,
398             };
399             $Action->[DATA_STATE]->[0x0000] = {
400             name => 'data null',
401             emit => CHARACTER_TOKEN,
402             error => 'NULL',
403             };
404             $Action->[DATA_STATE]->[KEY_ELSE_CHAR] = {
405             name => 'data else',
406             emit => CHARACTER_TOKEN,
407             emit_data_read_until => qq{\x00<&},
408             };
409             $XMLAction->[DATA_STATE]->[0x005D] = { # ]
410             name => 'data ]',
411             state => DATA_MSE1_STATE,
412             emit => CHARACTER_TOKEN,
413             };
414             $XMLAction->[DATA_STATE]->[KEY_ELSE_CHAR] = {
415             name => 'data else xml',
416             emit => CHARACTER_TOKEN,
417             emit_data_read_until => qq{\x00<&\]},
418             };
419             $Action->[RCDATA_STATE]->[0x0026] = {
420             name => 'rcdata &',
421             state => ENTITY_STATE, # "entity data state" + "consume a character reference"
422             state_set => {entity_add => -1, prev_state => RCDATA_STATE},
423             };
424             $Action->[RCDATA_STATE]->[0x003C] = {
425             name => 'rcdata <',
426             state => RCDATA_LT_STATE,
427             };
428             $Action->[RCDATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
429             $Action->[RCDATA_STATE]->[0x0000] = {
430             name => 'rcdata null',
431             emit => CHARACTER_TOKEN,
432             emit_data => "\x{FFFD}",
433             error => 'NULL',
434             };
435             $Action->[RCDATA_STATE]->[KEY_ELSE_CHAR] = {
436             name => 'rcdata else',
437             emit => CHARACTER_TOKEN,
438             emit_data_read_until => qq{\x00<&},
439             };
440             $Action->[RAWTEXT_STATE]->[0x003C] = {
441             name => 'rawtext <',
442             state => RAWTEXT_LT_STATE,
443             };
444             $Action->[RAWTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
445             $Action->[RAWTEXT_STATE]->[0x0000] = $Action->[RCDATA_STATE]->[0x0000];
446             $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR] = {
447             name => 'rawtext else',
448             emit => CHARACTER_TOKEN,
449             emit_data_read_until => qq{\x00<},
450             };
451             $Action->[SCRIPT_DATA_STATE]->[0x003C] = {
452             name => 'script data <',
453             state => SCRIPT_DATA_LT_STATE,
454             };
455             $Action->[SCRIPT_DATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
456             $Action->[SCRIPT_DATA_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
457             $Action->[SCRIPT_DATA_STATE]->[KEY_ELSE_CHAR] = $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR];
458             $Action->[PLAINTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
459             $Action->[PLAINTEXT_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
460             $Action->[PLAINTEXT_STATE]->[KEY_ELSE_CHAR] = {
461             name => 'plaintext else',
462             emit => CHARACTER_TOKEN,
463             emit_data_read_until => qq{\x00},
464             };
465             # "Tag open state" is known as "tag state" in XML5.
466             $Action->[TAG_OPEN_STATE]->[0x0021] = {
467             name => 'tag open !',
468             state => MARKUP_DECLARATION_OPEN_STATE,
469             };
470             $Action->[TAG_OPEN_STATE]->[0x002F] = {
471             name => 'tag open /',
472             state => CLOSE_TAG_OPEN_STATE,
473             };
474             $Action->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
475             name => 'tag open uc',
476             ct => {
477             type => START_TAG_TOKEN,
478             delta => 1,
479             append_tag_name => 0x0020, # UC -> lc
480             },
481             state => TAG_NAME_STATE,
482             };
483             $XMLAction->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
484             name => 'tag open uc xml',
485             ct => {
486             type => START_TAG_TOKEN,
487             delta => 1,
488             append_tag_name => 0x0000,
489             },
490             state => TAG_NAME_STATE,
491             };
492             $Action->[TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
493             name => 'tag open lc',
494             ct => {
495             type => START_TAG_TOKEN,
496             delta => 1,
497             append_tag_name => 0x0000,
498             },
499             state => TAG_NAME_STATE,
500             };
501             $Action->[TAG_OPEN_STATE]->[0x003F] = {
502             name => 'tag open ?',
503             state => BOGUS_COMMENT_STATE,
504             error => 'pio',
505             error_delta => 1,
506             ct => {
507             type => COMMENT_TOKEN,
508             },
509             reconsume => 1, ## $self->{nc} is intentionally left as is
510             };
511             $XMLAction->[TAG_OPEN_STATE]->[0x003F] = { # ?
512             name => 'tag open ? xml',
513             state => PI_STATE,
514             };
515             $Action->[TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
516             $Action->[TAG_OPEN_STATE]->[0x003E] = { # >
517             name => 'tag open else',
518             error => 'bare stago',
519             error_delta => 1,
520             state => DATA_STATE,
521             reconsume => 1,
522             emit => CHARACTER_TOKEN,
523             emit_data => '<',
524             emit_delta => 1,
525             };
526             $Action->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = $Action->[TAG_OPEN_STATE]->[0x003E];
527             $XMLAction->[TAG_OPEN_STATE]->[0x0000] = {
528             name => 'tag open null xml',
529             ct => {
530             type => START_TAG_TOKEN,
531             delta => 1,
532             append_tag_name => 0xFFFD,
533             },
534             error => 'NULL',
535             state => TAG_NAME_STATE,
536             };
537             ## XML5: "<:" has a parse error.
538             $XMLAction->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
539             name => 'tag open else xml',
540             ct => {
541             type => START_TAG_TOKEN,
542             delta => 1,
543             append_tag_name => 0x0000,
544             },
545             state => TAG_NAME_STATE,
546             };
547             $Action->[RCDATA_LT_STATE]->[0x002F] = {
548             name => 'rcdata lt /',
549             state => RCDATA_END_TAG_OPEN_STATE,
550             buffer => {clear => 1},
551             };
552             $Action->[RAWTEXT_LT_STATE]->[0x002F] = {
553             name => 'rawtext lt /',
554             state => RAWTEXT_END_TAG_OPEN_STATE,
555             buffer => {clear => 1},
556             };
557             $Action->[SCRIPT_DATA_LT_STATE]->[0x002F] = {
558             name => 'script data lt /',
559             state => SCRIPT_DATA_END_TAG_OPEN_STATE,
560             buffer => {clear => 1},
561             };
562             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[0x002F] = {
563             name => 'script data escaped lt /',
564             state => SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
565             buffer => {clear => 1},
566             };
567             $Action->[SCRIPT_DATA_LT_STATE]->[0x0021] = {
568             name => 'script data lt !',
569             state => SCRIPT_DATA_ESCAPE_START_STATE,
570             emit => CHARACTER_TOKEN,
571             emit_data => '<!',
572             };
573             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ULATIN_CHAR] = {
574             name => 'script data escaped lt uc',
575             emit => CHARACTER_TOKEN,
576             emit_data => '<',
577             emit_data_append => 1,
578             buffer => {clear => 1, append => 0x0020}, # UC -> lc
579             state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
580             };
581             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_LLATIN_CHAR] = {
582             name => 'script data escaped lt lc',
583             emit => CHARACTER_TOKEN,
584             emit_data => '<',
585             emit_data_append => 1,
586             buffer => {clear => 1, append => 0x0000},
587             state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
588             };
589             $Action->[RCDATA_LT_STATE]->[KEY_ELSE_CHAR] = {
590             name => 'rcdata lt else',
591             state => RCDATA_STATE,
592             reconsume => 1,
593             emit => CHARACTER_TOKEN,
594             emit_data => '<',
595             };
596             $Action->[RAWTEXT_LT_STATE]->[KEY_ELSE_CHAR] = {
597             name => 'rawtext lt else',
598             state => RAWTEXT_STATE,
599             reconsume => 1,
600             emit => CHARACTER_TOKEN,
601             emit_data => '<',
602             };
603             $Action->[SCRIPT_DATA_LT_STATE]->[KEY_ELSE_CHAR] = {
604             name => 'script data lt else',
605             state => SCRIPT_DATA_STATE,
606             reconsume => 1,
607             emit => CHARACTER_TOKEN,
608             emit_data => '<',
609             };
610             $Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
611             name => 'script data escaped lt else',
612             state => SCRIPT_DATA_ESCAPED_STATE,
613             reconsume => 1,
614             emit => CHARACTER_TOKEN,
615             emit_data => '<',
616             };
617             ## XXX "End tag token" in latest HTML5 and in XML5.
618             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
619             name => 'end tag open uc',
620             ct => {
621             type => END_TAG_TOKEN,
622             delta => 2,
623             append_tag_name => 0x0020, # UC -> lc
624             },
625             state => TAG_NAME_STATE,
626             };
627             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
628             name => 'end tag open uc xml',
629             ct => {
630             type => END_TAG_TOKEN,
631             delta => 2,
632             append_tag_name => 0x0000,
633             },
634             state => TAG_NAME_STATE,
635             };
636             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
637             name => 'end tag open lc',
638             ct => {
639             type => END_TAG_TOKEN,
640             delta => 2,
641             append_tag_name => 0x0000,
642             },
643             state => TAG_NAME_STATE,
644             };
645             $Action->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
646             name => 'end tag open >',
647             error => 'empty end tag',
648             error_delta => 2, # "<" in "</>"
649             state => DATA_STATE,
650             };
651             ## XML5: No parse error.
652            
653             ## NOTE: This parser raises a parse error, since it supports XML1,
654             ## not XML5.
655            
656             ## NOTE: A short end tag token.
657              
658             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
                ## XML mode override for ">": the same 'empty end tag' error is
                ## reported, but an END_TAG_TOKEN with an empty tag name is also
                ## created (|ct| without |append_tag_name|).
                ## NOTE(review): |emit => ''| presumably emits that current token;
                ## |emit| is consumed later in _get_next_token -- confirm there.
659             name => 'end tag open > xml',
660             error => 'empty end tag',
661             error_delta => 2, # "<" in "</>"
662             state => DATA_STATE,
663             ct => {
664             type => END_TAG_TOKEN,
665             delta => 2,
666             },
667             emit => '',
668             };
                ## End of input in the close tag open state: report 'bare etago'
                ## and return to the data state.
                ## NOTE(review): presumably the pending "</" is emitted as a
                ## character token located 2 columns back and the EOF is
                ## reconsumed in the data state; the |emit*| and |reconsume| keys
                ## are handled later in _get_next_token -- confirm there.
669             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_EOF_CHAR] = {
670             name => 'end tag open eof',
671             error => 'bare etago',
672             state => DATA_STATE,
673             reconsume => 1,
674             emit => CHARACTER_TOKEN,
675             emit_data => '</',
676             emit_delta => 2,
677             };
                ## White space and "anything else" share one action hash (chained
                ## assignment): report 'bogus end tag', create a COMMENT_TOKEN
                ## (its |data| is left undefined because |ct| has no |data| key)
                ## and switch to the bogus comment state.
                ## In XML mode _get_next_token consults $Action->[state]->[key]
                ## before $XMLAction->[state]->[KEY_ELSE_CHAR], so white space
                ## still selects this entry rather than the XML "else" entry.
678             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
679             $Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
680             name => 'end tag open else',
681             error => 'bogus end tag',
682             error_delta => 2, # "<" of "</"
683             state => BOGUS_COMMENT_STATE,
684             ct => {
685             type => COMMENT_TOKEN,
686             delta => 2, # "<" of "</"
687             },
688             reconsume => 1,
689             ## NOTE: $self->{nc} is intentionally left as is. Although the
690             ## "anything else" case of the spec does not explicitly state that
691             ## the next input character is to be reconsumed, it will be included
692             ## in the |data| of the comment token generated from the bogus end
693             ## tag, as defined in the "bogus comment state" entry.
694             };
                ## XML mode, U+0000 after "</": start an end tag token whose tag
                ## name begins with U+FFFD (chr (0x0000 + 0xFFFD)) and report a
                ## 'NULL' error.
695             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x0000] = {
696             name => 'end tag open null xml',
697             ct => {
698             type => END_TAG_TOKEN,
699             delta => 2,
700             append_tag_name => 0xFFFD,
701             },
702             error => 'NULL',
703             state => TAG_NAME_STATE, ## XML5: "end tag name state".
704             };
705             ## XML5: "</:" is a parse error.
                ## XML mode fallback: any character without a more specific entry
                ## starts an end tag name as is.  This entry has no |error| key, so
                ## no parse error is reported here (including for ":").
706             $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
707             name => 'end tag open else xml',
708             ct => {
709             type => END_TAG_TOKEN,
710             delta => 2,
711             append_tag_name => 0x0000,
712             },
713             state => TAG_NAME_STATE, ## XML5: "end tag name state".
714             };
715             ## This switch-case implements "tag name state", "RCDATA end tag
716             ## name state", "RAWTEXT end tag name state", and "script data
717             ## end tag name state" jointly with the implementation of
718             ## "RCDATA end tag open state" and so on.
                ## Entries below that have no |state| key leave $self->{state}
                ## unchanged, i.e. the tokenizer stays in the tag name state.
                ## |append_tag_name => N| appends chr ($nc + N) to the current
                ## token's tag name.
719             $Action->[TAG_NAME_STATE]->[KEY_SPACE_CHAR] = {
720             name => 'tag name sp',
721             state => BEFORE_ATTRIBUTE_NAME_STATE,
722             };
                ## ">" ends the tag.
                ## NOTE(review): |emit => ''| presumably emits the current tag
                ## token; |emit| is consumed later in _get_next_token -- confirm.
723             $Action->[TAG_NAME_STATE]->[0x003E] = {
724             name => 'tag name >',
725             state => DATA_STATE,
726             emit => '',
727             };
                ## HTML: uppercase ASCII letters are folded to lowercase by adding
                ## 0x0020 to the character code.
728             $Action->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = {
729             name => 'tag name uc',
730             ct => {
731             append_tag_name => 0x0020, # UC -> lc
732             },
733             };
                ## XML: uppercase letters are appended unchanged (offset 0x0000).
734             $XMLAction->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = {
735             name => 'tag name uc xml',
736             ct => {
737             append_tag_name => 0x0000,
738             },
739             };
740             $Action->[TAG_NAME_STATE]->[KEY_EOF_CHAR] = {
741             name => 'tag name eof',
742             error => 'unclosed tag',
743             state => DATA_STATE,
744             reconsume => 1,
745             };
746             $Action->[TAG_NAME_STATE]->[0x002F] = {
747             name => 'tag name /',
748             state => SELF_CLOSING_START_TAG_STATE,
749             };
                ## U+0000 is replaced by U+FFFD (chr (0x0000 + 0xFFFD)) in the tag
                ## name and reported as a 'NULL' error.
750             $Action->[TAG_NAME_STATE]->[0x0000] = {
751             name => 'tag name null',
752             ct => {
753             append_tag_name => 0xFFFD,
754             },
755             error => 'NULL',
756             };
                ## Everything else (including lowercase letters, which have no
                ## entry of their own in this state) is appended unchanged.
757             $Action->[TAG_NAME_STATE]->[KEY_ELSE_CHAR] = {
758             name => 'tag name else',
759             ct => {
760             append_tag_name => 0x0000,
761             },
762             };
763             $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[0x002D] = {
                ## "-" (0x002D) transitions of the script data escape family.
                ## Entries without a |state| key keep the current state.
                ## NOTE(review): |emit => CHARACTER_TOKEN| with |emit_data|
                ## presumably emits the given string as a character token; the
                ## |emit*| keys are consumed later in _get_next_token -- confirm.
764             name => 'script data escape start -',
765             state => SCRIPT_DATA_ESCAPE_START_DASH_STATE,
766             emit => CHARACTER_TOKEN,
767             emit_data => '-',
768             };
769             $Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[0x002D] = {
770             name => 'script data escape start dash -',
771             state => SCRIPT_DATA_ESCAPED_STATE,
772             emit => CHARACTER_TOKEN,
773             emit_data => '-',
774             };
                ## Anything other than "-" while the escape start sequence is being
                ## matched: go back to the script data state.
775             $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = {
776             name => 'script data escape start else',
777             state => SCRIPT_DATA_STATE,
778             reconsume => 1,
779             };
                ## The escape start dash state reuses the very same action hash
                ## (and therefore the same |name| label) for its fallback.
780             $Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[KEY_ELSE_CHAR] = $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR];
781             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x002D] = {
782             name => 'script data escaped -',
783             state => SCRIPT_DATA_ESCAPED_DASH_STATE,
784             emit => CHARACTER_TOKEN,
785             emit_data => '-',
786             };
787             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x002D] = {
788             name => 'script data escaped dash -',
789             state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
790             emit => CHARACTER_TOKEN,
791             emit_data => '-',
792             };
                ## Further "-" characters keep the tokenizer in the dash dash state.
793             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x002D] = {
794             name => 'script data escaped dash dash -',
795             emit => CHARACTER_TOKEN,
796             emit_data => '-',
797             };
798             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x002D] = {
799             name => 'script data double escaped -',
800             state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
801             emit => CHARACTER_TOKEN,
802             emit_data => '-',
803             };
                ## NOTE(review): the |name| label below is identical to the one of
                ## the SCRIPT_DATA_DOUBLE_ESCAPED_STATE entry above; going by the
                ## sibling entries it was probably meant to read
                ## 'script data double escaped dash -'.
804             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x002D] = {
805             name => 'script data double escaped -',
806             state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
807             emit => CHARACTER_TOKEN,
808             emit_data => '-',
809             };
810             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x002D] = {
811             name => 'script data double escaped dash dash -',
812             emit => CHARACTER_TOKEN,
813             emit_data => '-',
814             };
815             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x003C] = {
                ## "<" (0x003C) in the three escaped states: switch to the escaped
                ## less-than state.  These entries carry no |emit| key, unlike
                ## the double escaped variants further down.
816             name => 'script data escaped <',
817             state => SCRIPT_DATA_ESCAPED_LT_STATE,
818             };
819             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x003C] = {
820             name => 'script data escaped dash <',
821             state => SCRIPT_DATA_ESCAPED_LT_STATE,
822             };
823             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003C] = {
824             name => 'script data escaped dash dash <',
825             state => SCRIPT_DATA_ESCAPED_LT_STATE,
826             };
                ## "<" in the three double escaped states: switch to the double
                ## escaped less-than state.
                ## NOTE(review): the |emit*| keys presumably emit "<" as a
                ## character token; they are consumed later in _get_next_token.
827             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x003C] = {
828             name => 'script data double escaped <',
829             state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
830             emit => CHARACTER_TOKEN,
831             emit_data => '<',
832             };
833             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x003C] = {
834             name => 'script data double escaped dash <',
835             state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
836             emit => CHARACTER_TOKEN,
837             emit_data => '<',
838             };
839             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003C] = {
840             name => 'script data double escaped dash dash <',
841             state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
842             emit => CHARACTER_TOKEN,
843             emit_data => '<',
844             };
                ## ">" (0x003E) in a dash dash state: return to the plain script
                ## data state.
845             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E] = {
846             name => 'script data escaped dash dash >',
847             state => SCRIPT_DATA_STATE,
848             emit => CHARACTER_TOKEN,
849             emit_data => '>',
850             };
                ## The double escaped dash dash state shares the same action hash,
                ## so it also reports the label 'script data escaped dash dash >'.
851             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003E] = $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E];
852             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_EOF_CHAR] =
853             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
854             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] =
855             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_EOF_CHAR] =
856             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
857             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] = {
                ## One action hash shared by all six escaped / double escaped
                ## states (chained assignment): end of input reports an error and
                ## returns to the data state.
858             name => 'script data escaped eof',
859             error => 'eof in escaped script data', # XXXdocumentation
860             state => DATA_STATE,
861             reconsume => 1,
862             };
                ## One action hash shared by the same six states for U+0000: a
                ## 'NULL' error is reported and |emit_data| carries U+FFFD.
                ## NOTE(review): because the hash is shared, the three double
                ## escaped states are also sent to SCRIPT_DATA_ESCAPED_STATE.  The
                ## HTML tokenization rules appear to keep U+0000 inside the double
                ## escaped family ("script data double escaped state") -- confirm
                ## whether leaving double escaping here is intended.
863             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x0000] =
864             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x0000] =
865             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x0000] =
866             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x0000] =
867             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x0000] =
868             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
869             name => 'script data escaped null',
870             emit => CHARACTER_TOKEN,
871             emit_data => "\x{FFFD}",
872             error => 'NULL',
873             state => SCRIPT_DATA_ESCAPED_STATE,
874             };
                ## Fallbacks: any other character returns the escaped states to
                ## SCRIPT_DATA_ESCAPED_STATE and the double escaped states to
                ## SCRIPT_DATA_DOUBLE_ESCAPED_STATE.
                ## NOTE(review): |emit => CHARACTER_TOKEN| without |emit_data|
                ## presumably emits the current input character itself; |emit| is
                ## consumed later in _get_next_token -- confirm there.
875             $Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_ELSE_CHAR] = {
876             name => 'script data escaped else',
877             emit => CHARACTER_TOKEN,
878             state => SCRIPT_DATA_ESCAPED_STATE,
879             };
880             $Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_ELSE_CHAR] = {
881             name => 'script data escaped dash else',
882             emit => CHARACTER_TOKEN,
883             state => SCRIPT_DATA_ESCAPED_STATE,
884             };
885             $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_ELSE_CHAR] = {
886             name => 'script data escaped dash dash else',
887             emit => CHARACTER_TOKEN,
888             state => SCRIPT_DATA_ESCAPED_STATE,
889             };
890             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_ELSE_CHAR] = {
891             name => 'script data double escaped else',
892             emit => CHARACTER_TOKEN,
893             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
894             };
895             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_ELSE_CHAR] = {
896             name => 'script data double escaped dash else',
897             emit => CHARACTER_TOKEN,
898             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
899             };
900             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_ELSE_CHAR] = {
901             name => 'script data double escaped dash dash else',
902             emit => CHARACTER_TOKEN,
903             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
904             };
905             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_SPACE_CHAR] =
906             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_SPACE_CHAR] =
907             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[0x003E] =
908             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[0x003E] =
909             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[0x002F] =
910             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[0x002F] = {
                ## White space, ">" and "/" in both the double escape start and
                ## end states share this hash.  |skip => 1| makes the table-driven
                ## branch of _get_next_token ignore the entry (it tests
                ## |not $action->{skip}|), so these cases are handled by other
                ## code.  The label says "start" although the end state uses it
                ## as well.
911             name => 'script data double escape start sp>/',
912             skip => 1,
913             };
                ## Letters are collected in $self->{kwd}: |buffer => {append => N}|
                ## appends chr ($nc + N), so uppercase letters are stored in
                ## lowercase (N = 0x0020) and lowercase letters as is (N = 0).
                ## Both hashes are shared by the start and end states and have no
                ## |state| key, so the current state is kept.
914             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_ULATIN_CHAR] =
915             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_ULATIN_CHAR] = {
916             name => 'script data double escape start uc',
917             emit => CHARACTER_TOKEN,
918             buffer => {append => 0x0020}, # UC -> lc
919             };
920             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_LLATIN_CHAR] =
921             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_LLATIN_CHAR] = {
922             name => 'script data double escape start lc',
923             emit => CHARACTER_TOKEN,
924             buffer => {append => 0x0000},
925             };
                ## Any other character aborts the keyword match: the start state
                ## falls back to the escaped state, the end state to the double
                ## escaped state.
926             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = {
927             name => 'script data double escape start else',
928             state => SCRIPT_DATA_ESCAPED_STATE,
929             reconsume => 1,
930             };
931             $Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_ELSE_CHAR] = {
932             name => 'script data double escape end else',
933             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
934             reconsume => 1,
935             };
                ## "/" after "<" in double escaped script data: reset the keyword
                ## buffer ($self->{kwd} = '') and enter the double escape end
                ## state.
936             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE]->[0x002F] = {
937             name => 'script data double escaped lt /',
938             buffer => {clear => 1},
939             state => SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE,
940             emit => CHARACTER_TOKEN,
941             emit_data => '/',
942             };
943             $Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
944             name => 'script data double escaped lt else',
945             state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
946             reconsume => 1,
947             };
948             ## XML5: Part of the "data state".
                ## DATA_MSE1_STATE / DATA_MSE2_STATE track "]" characters so that
                ## a "]]>" sequence (marked section end) in character data can be
                ## reported.  The characters themselves are passed on through
                ## |emit| / |emit_data|.
                ## NOTE(review): |emit*| and |reconsume| are consumed later in
                ## _get_next_token; their exact effect should be confirmed there.
949             $Action->[DATA_MSE1_STATE]->[0x005D] = {
950             name => 'data mse1 ]',
951             state => DATA_MSE2_STATE,
952             emit => CHARACTER_TOKEN,
953             emit_data => ']',
954             };
955             $Action->[DATA_MSE1_STATE]->[KEY_ELSE_CHAR] = {
956             name => 'data mse1 else',
957             state => DATA_STATE,
958             reconsume => 1,
959             };
                ## ">" in the MSE2 state: report 'unmatched mse' at
                ## column_prev - 2 + 1 and return to the data state.
960             $Action->[DATA_MSE2_STATE]->[0x003E] = {
961             name => 'data mse2 >',
962             error => 'unmatched mse', # XML5: Not a parse error. # XXXdocumentation
963             error_delta => 2,
964             state => DATA_STATE,
965             emit => CHARACTER_TOKEN,
966             emit_data => '>',
967             };
                ## A further "]" keeps the tokenizer in the MSE2 state (no |state|
                ## key).
968             $Action->[DATA_MSE2_STATE]->[0x005D] = {
969             name => 'data mse2 ]',
970             emit => CHARACTER_TOKEN,
971             emit_data => ']',
972             };
973             $Action->[DATA_MSE2_STATE]->[KEY_ELSE_CHAR] = {
974             name => 'data mse2 else',
975             state => DATA_STATE,
976             reconsume => 1,
977             };
978             ## XML5: "Tag attribute name before state".
                ## |ca => {set_name => N}| starts a new current attribute whose
                ## name is chr ($nc + N), whose value is '' and whose line/column
                ## are the current input position.
                ## White space is ignored here: the entry has neither |state| nor
                ## any other action key.
979             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
980             name => 'before attr name sp',
981             };
982             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003E] = {
983             name => 'before attr name >',
984             emit => '',
985             state => DATA_STATE,
986             };
                ## HTML: an uppercase first letter is stored in lowercase (+0x0020).
987             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
988             name => 'before attr name uc',
989             ca => {
990             set_name => 0x0020, # UC -> lc
991             },
992             state => ATTRIBUTE_NAME_STATE,
993             };
                ## XML: the letter is stored unchanged.
994             $XMLAction->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
995             name => 'before attr name uc xml',
996             ca => {
997             set_name => 0x0000,
998             },
999             state => ATTRIBUTE_NAME_STATE,
1000             };
1001             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x002F] = {
1002             name => 'before attr name /',
1003             state => SELF_CLOSING_START_TAG_STATE,
1004             };
                 ## NOTE(review): unlike the EOF entries of TAG_NAME_STATE,
                 ## ATTRIBUTE_NAME_STATE and AFTER_ATTRIBUTE_NAME_STATE, this one has
                 ## no |reconsume => 1| -- confirm whether that is intentional.
1005             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
1006             name => 'before attr name eof',
1007             error => 'unclosed tag',
1008             state => DATA_STATE,
1009             };
                 ## '"', "'", "<" and "=" share one hash: a 'bad attribute name'
                 ## error is reported, but the character still becomes the first
                 ## character of a new attribute name.
1010             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0022] =
1011             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0027] =
1012             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003C] =
1013             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003D] = {
1014             name => q[before attr name "'<=],
1015             error => 'bad attribute name', ## XML5: Not a parse error.
1016             ca => {set_name => 0x0000},
1017             state => ATTRIBUTE_NAME_STATE,
1018             };
                 ## U+0000 starts an attribute named U+FFFD (chr (0 + 0xFFFD)).
1019             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0000] = {
1020             name => 'before attr name null',
1021             ca => {set_name => 0xFFFD},
1022             error => 'NULL',
1023             state => ATTRIBUTE_NAME_STATE,
1024             };
1025             ## XML5: ":" raises a parse error and is ignored.
1026             $Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
1027             name => 'before attr name else',
1028             ca => {set_name => 0x0000},
1029             state => ATTRIBUTE_NAME_STATE,
1030             };
1031              
1032             ## XML5: "Tag attribute name state".
                 ## |ca => {leave => 1}| finishes the current attribute name: if the
                 ## current token already has an attribute of that name,
                 ## _get_next_token reports 'duplicate attribute' and the new one
                 ## is not stored; otherwise it is stored in
                 ## $self->{ct}->{attributes} and given the next |index|.
                 ## |ca => {name => N}| appends chr ($nc + N) to the current
                 ## attribute name.
1033             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
1034             name => 'attr name sp',
1035             ca => {leave => 1},
1036             state => AFTER_ATTRIBUTE_NAME_STATE,
1037             };
1038             $Action->[ATTRIBUTE_NAME_STATE]->[0x003D] = {
1039             name => 'attr name =',
1040             ca => {leave => 1},
1041             state => BEFORE_ATTRIBUTE_VALUE_STATE,
1042             };
1043             $Action->[ATTRIBUTE_NAME_STATE]->[0x003E] = {
1044             name => 'attr name >',
1045             ca => {leave => 1},
1046             emit => '',
1047             state => DATA_STATE,
1048             };
                 ## XML: an attribute without a value additionally reports
                 ## 'no attr value'.
1049             $XMLAction->[ATTRIBUTE_NAME_STATE]->[0x003E] = {
1050             name => 'attr name > xml',
1051             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1052             ca => {leave => 1},
1053             emit => '',
1054             state => DATA_STATE,
1055             };
1056             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
1057             name => 'attr name uc',
1058             ca => {name => 0x0020}, # UC -> lc
1059             };
                 ## NOTE(review): this XML entry reuses the HTML label
                 ## 'attr name uc'; sibling XML entries carry an " xml" suffix.
1060             $XMLAction->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
1061             name => 'attr name uc',
1062             ca => {name => 0x0000},
1063             };
1064             $Action->[ATTRIBUTE_NAME_STATE]->[0x002F] = {
1065             name => 'attr name /',
1066             ca => {leave => 1},
1067             state => SELF_CLOSING_START_TAG_STATE,
1068             };
1069             $XMLAction->[ATTRIBUTE_NAME_STATE]->[0x002F] = {
1070             name => 'attr name / xml',
1071             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1072             ca => {leave => 1},
1073             state => SELF_CLOSING_START_TAG_STATE,
1074             };
1075             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
1076             name => 'attr name eof',
1077             error => 'unclosed tag',
1078             ca => {leave => 1},
1079             state => DATA_STATE,
1080             reconsume => 1,
1081             };
                 ## '"', "'" and "<" share one hash: the character is reported as a
                 ## 'bad attribute name' but is still appended to the name.
1082             $Action->[ATTRIBUTE_NAME_STATE]->[0x0022] =
1083             $Action->[ATTRIBUTE_NAME_STATE]->[0x0027] =
1084             $Action->[ATTRIBUTE_NAME_STATE]->[0x003C] = {
1085             name => q[attr name "'<],
1086             error => 'bad attribute name', ## XML5: Not a parse error.
1087             ca => {name => 0x0000},
1088             };
                 ## U+0000 is appended as U+FFFD (chr (0 + 0xFFFD)).
1089             $Action->[ATTRIBUTE_NAME_STATE]->[0x0000] = {
1090             name => 'attr name null',
1091             ca => {name => 0xFFFD},
1092             error => 'NULL',
1093             };
1094             $Action->[ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
1095             name => 'attr name else',
1096             ca => {name => 0x0000},
1097             };
1098             ## XML5: "Tag attribute name after state".
                 ## Reached from ATTRIBUTE_NAME_STATE on white space.  Any character
                 ## that can begin a name starts a NEW attribute here through
                 ## |ca => {set_name => N}| (name = chr ($nc + N), empty value).
1099             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
1100             name => 'after attr name sp',
1101             };
1102             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003D] = {
1103             name => 'after attr name =',
1104             state => BEFORE_ATTRIBUTE_VALUE_STATE,
1105             };
1106             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = {
1107             name => 'after attr name >',
1108             emit => '',
1109             state => DATA_STATE,
1110             };
                 ## XML: a value-less attribute additionally reports 'no attr value'.
1111             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = {
1112             name => 'after attr name > xml',
1113             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1114             emit => '',
1115             state => DATA_STATE,
1116             };
1117             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
1118             name => 'after attr name uc',
1119             ca => {set_name => 0x0020}, # UC -> lc
1120             state => ATTRIBUTE_NAME_STATE,
1121             };
1122             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
1123             name => 'after attr name uc xml',
1124             ca => {set_name => 0x0000},
1125             state => ATTRIBUTE_NAME_STATE,
1126             };
1127             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = {
1128             name => 'after attr name /',
1129             state => SELF_CLOSING_START_TAG_STATE,
1130             };
1131             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = {
1132             name => 'after attr name / xml',
1133             error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
1134             state => SELF_CLOSING_START_TAG_STATE,
1135             };
1136             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
1137             name => 'after attr name eof',
1138             error => 'unclosed tag',
1139             state => DATA_STATE,
1140             reconsume => 1,
1141             };
                 ## '"', "'" and "<" share one hash: 'bad attribute name' is
                 ## reported and the character starts a new attribute name.
1142             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0022] =
1143             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0027] =
1144             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003C] = {
1145             name => q[after attr name "'<],
1146             error => 'bad attribute name', ## XML5: Not a parse error.
1147             #error2(xml) => 'no attr value', ## XML5: Not a parse error.
1148             ca => {set_name => 0x0000},
1149             state => ATTRIBUTE_NAME_STATE,
1150             };
                 ## U+0000 starts a new attribute named U+FFFD.
                 ## NOTE(review): the label below reads 'after attr name else'
                 ## although this is the U+0000 entry; it is probably a copy of the
                 ## fallback label and was meant to be 'after attr name null'.
1151             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0000] = {
1152             name => q[after attr name else],
1153             ca => {set_name => 0xFFFD},
1154             error => 'NULL',
1155             #error2(xml) => 'no attr value', ## XML5: Not a parse error.
1156             state => ATTRIBUTE_NAME_STATE,
1157             };
1158             $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
1159             name => q[after attr name else],
1160             ca => {set_name => 0x0000},
1161             state => ATTRIBUTE_NAME_STATE,
1162             };
                 ## XML fallback: same action plus a 'no attr value' error.  It
                 ## shares its label with the HTML fallback above.
1163             $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
1164             name => q[after attr name else],
1165             error => 'no attr value', ## XML5: Not a parse error.
1166             ca => {set_name => 0x0000},
1167             state => ATTRIBUTE_NAME_STATE,
1168             };
1169             ## XML5: "Tag attribute value before state".
                 ## |ca => {value => 1}| appends the current input character
                 ## (chr $nc) to the current attribute value; any other true
                 ## |value| is appended literally.
1170             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_SPACE_CHAR] = {
1171             name => 'before attr value sp',
1172             };
1173             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0022] = {
1174             name => 'before attr value "',
1175             state => ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
1176             };
                 ## "&" begins an unquoted value; XML mode additionally reports
                 ## 'unquoted attr value'.  Both entries use the same label.
1177             $XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = {
1178             name => 'before attr value &',
1179             error => 'unquoted attr value', ## XML5: Not a parse error.
1180             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1181             reconsume => 1,
1182             };
1183             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = {
1184             name => 'before attr value &',
1185             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1186             reconsume => 1,
1187             };
1188             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0027] = {
1189             name => "before attr value '",
1190             state => ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
1191             };
1192             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003E] = {
1193             name => 'before attr value >',
1194             error => 'empty unquoted attribute value',
1195             emit => '',
1196             state => DATA_STATE,
1197             };
                 ## NOTE(review): no |reconsume => 1| here, unlike most other EOF
                 ## entries in these tables -- confirm whether that is intentional.
1198             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_EOF_CHAR] = {
1199             name => 'before attr value eof',
1200             error => 'unclosed tag',
1201             state => DATA_STATE,
1202             };
                 ## "<", "=" and "`" share one hash: 'bad attribute value' is
                 ## reported and the character becomes the first character of an
                 ## unquoted value.
1203             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003C] =
1204             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003D] =
1205             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0060] = {
1206             name => 'before attr value <=`',
1207             error => 'bad attribute value', ## XML5: Not a parse error.
1208             #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
1209             ca => {value => 1},
1210             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1211             };
                 ## U+0000: the literal U+FFFD string is appended to the value.
1212             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0000] = {
1213             name => 'before attr value null',
1214             ca => {value => "\x{FFFD}"},
1215             error => 'NULL',
1216             #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
1217             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1218             };
                 ## Fallbacks: any other character starts an unquoted value; XML
                 ## mode reports 'unquoted attr value' for it.
1219             $XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = {
1220             name => 'before attr value else xml',
1221             error => 'unquoted attr value', ## XML5: Not a parse error. # XXXdocumentation
1222             ca => {value => 1},
1223             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1224             };
1225             $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = {
1226             name => 'before attr value else',
1227             ca => {value => 1},
1228             state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
1229             };
1230              
1231             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_SPACE_CHAR] = {
                 ## Transitions taken right after a quoted attribute value.  None of
                 ## these entries has a |ca| key, so the current attribute is not
                 ## touched here.
1232             name => 'after attr value quoted sp',
1233             state => BEFORE_ATTRIBUTE_NAME_STATE,
1234             };
1235             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x003E] = {
1236             name => 'after attr value quoted >',
1237             emit => '',
1238             state => DATA_STATE,
1239             };
1240             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x002F] = {
1241             name => 'after attr value quoted /',
1242             state => SELF_CLOSING_START_TAG_STATE,
1243             };
1244             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_EOF_CHAR] = {
1245             name => 'after attr value quoted eof',
1246             error => 'unclosed tag',
1247             state => DATA_STATE,
1248             reconsume => 1,
1249             };
                 ## Any other character directly after the closing quote: report
                 ## 'no space between attributes' and continue in the before
                 ## attribute name state.
                 ## NOTE(review): |reconsume => 1| presumably makes that state see
                 ## the same character again; it is consumed later in
                 ## _get_next_token -- confirm there.
1250             $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_ELSE_CHAR] = {
1251             name => 'after attr value quoted else',
1252             error => 'no space between attributes',
1253             state => BEFORE_ATTRIBUTE_NAME_STATE,
1254             reconsume => 1,
1255             };
1256             $Action->[SELF_CLOSING_START_TAG_STATE]->[0x003E] = {
                 ## ">" after "/": |skip => 1| makes the table-driven branch of
                 ## _get_next_token ignore this entry (it tests
                 ## |not $action->{skip}|), so the "/>" case is handled by other
                 ## code.
1257             name => 'self closing start tag >',
1258             skip => 1,
1259             };
1260             $Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_EOF_CHAR] = {
1261             name => 'self closing start tag eof',
1262             error => 'unclosed tag',
1263             state => DATA_STATE, ## XML5: "Tag attribute name before state".
1264             reconsume => 1,
1265             };
                 ## "/" not followed by ">": report an error and continue in the
                 ## before attribute name state.
1266             $Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_ELSE_CHAR] = {
1267             name => 'self closing start tag else',
1268             error => 'nestc', # XXX This error type is wrong.
1269             state => BEFORE_ATTRIBUTE_NAME_STATE,
1270             reconsume => 1,
1271             };
1272             $Action->[MD_HYPHEN_STATE]->[0x002D] = {
                 ## A "-" (0x002D) in the markup declaration hyphen state: create a
                 ## COMMENT_TOKEN with empty |data|, positioned at
                 ## column_prev - 3 + 1, and enter the comment start state.
1273             name => 'md hyphen -',
1274             ct => {type => COMMENT_TOKEN, data => '', delta => 3},
1275             state => COMMENT_START_STATE, ## XML5: "comment state".
1276             };
                 ## Anything else: report 'bogus comment' at the same position and
                 ## create a COMMENT_TOKEN whose |data| already contains "-"; the
                 ## tokenizer continues in the bogus comment state.
1277             $Action->[MD_HYPHEN_STATE]->[KEY_ELSE_CHAR] = {
1278             name => 'md hyphen else',
1279             error => 'bogus comment',
1280             error_delta => 3,
1281             state => BOGUS_COMMENT_STATE,
1282             reconsume => 1,
1283             ct => {type => COMMENT_TOKEN, data => '-', delta => 3},
1284             };
1285              
1286             my $c_to_key = [];
                 ## Maps an input character code to the column key used to index
                 ## $Action->[$state] / $XMLAction->[$state].  _get_next_token only
                 ## consults this table for codes that are not above 0x007F; larger
                 ## codes are mapped to KEY_ELSE_CHAR directly.
                 ## The assignments are order dependent: ASCII codes first map to
                 ## themselves, then white space and the ASCII letters are replaced
                 ## by their class keys.
                 ## NOTE(review): index 255 is the last element of the array, so a
                 ## lookup with index -1 also reaches it; presumably EOF_CHAR is -1
                 ## -- confirm against its definition.
1287             $c_to_key->[255] = KEY_EOF_CHAR; # EOF_CHAR
1288             $c_to_key->[$_] = $_ for 0x0000..0x007F;
1289             $c_to_key->[$_] = KEY_SPACE_CHAR for keys %$is_space;
1290             $c_to_key->[$_] = KEY_ULATIN_CHAR for 0x0041..0x005A;
1291             $c_to_key->[$_] = KEY_LLATIN_CHAR for 0x0061..0x007A;
1292              
1293             sub _get_next_token ($) {
1294 0     0     my $self = shift;
1295              
1296 0 0         if ($self->{self_closing}) {
1297             ## NOTE: The |$self->{self_closing}| flag can never be set to
1298             ## tokens except for start tag tokens. A start tag token is
1299             ## always set to |$self->{ct}| before it is emitted.
1300 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1301 0           delete $self->{self_closing};
1302             }
1303              
1304 0 0         if (@{$self->{token}}) {
  0            
1305 0           $self->{self_closing} = $self->{token}->[0]->{self_closing};
1306 0           return shift @{$self->{token}};
  0            
1307             }
1308              
1309             A: {
1310 0           my $nc = $self->{nc};
  0            
1311 0           my $state = $self->{state};
1312              
1313            
1314              
1315 0 0         my $c = $nc > 0x007F ? KEY_ELSE_CHAR : $c_to_key->[$nc];
1316 0   0       my $action = $Action->[$state]->[$c] || $Action->[$state]->[KEY_ELSE_CHAR];
1317 0 0         if ($self->{is_xml}) {
1318 0   0       $action = $XMLAction->[$state]->[$c]
1319             || $Action->[$state]->[$c]
1320             || $XMLAction->[$state]->[KEY_ELSE_CHAR]
1321             || $Action->[$state]->[KEY_ELSE_CHAR];
1322             }
1323              
1324 0 0 0       if ($action and not $action->{skip}) {
1325            
1326              
1327 0 0         if (defined $action->{error}) {
1328 0 0         if ($action->{error_delta}) {
1329 0           $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error},
1330             line => $self->{line_prev},
1331             column => $self->{column_prev} - $action->{error_delta} + 1);
1332             } else {
1333 0           $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error});
1334             }
1335             }
1336              
1337 0 0         if (defined $action->{state}) {
1338 0           $self->{state} = $action->{state};
1339            
1340 0 0         if ($action->{state_set}) {
1341 0           for (keys %{$action->{state_set}}) {
  0            
1342 0           $self->{$_} = $action->{state_set}->{$_};
1343             }
1344             }
1345             }
1346              
1347 0 0         if (my $act = $action->{ct}) {
1348 0 0         if (defined $act->{type}) {
1349 0           $self->{ct} = {type => $act->{type},
1350             tag_name => '', data => $act->{data}};
1351 0 0         if ($act->{delta}) {
1352 0           $self->{ct}->{line} = $self->{line_prev};
1353 0           $self->{ct}->{column} = $self->{column_prev} - $act->{delta} + 1;
1354             } else {
1355 0           $self->{ct}->{line} = $self->{line};
1356 0           $self->{ct}->{column} = $self->{column};
1357             }
1358             }
1359            
1360 0 0         if (defined $act->{append_tag_name}) {
1361 0           $self->{ct}->{tag_name} .= chr ($nc + $act->{append_tag_name});
1362             }
1363             }
1364            
1365 0 0         if (my $aca = $action->{ca}) {
1366 0 0         if ($aca->{value}) {
    0          
    0          
    0          
1367 0 0         $self->{ca}->{value} .= $aca->{value} ne '1' ? $aca->{value} : chr $nc;
1368             } elsif (defined $aca->{name}) {
1369 0           $self->{ca}->{name} .= chr ($nc + $aca->{name});
1370             } elsif (defined $aca->{set_name}) {
1371 0           $self->{ca} = {
1372             name => chr ($nc + $aca->{set_name}),
1373             value => '',
1374             line => $self->{line}, column => $self->{column},
1375             };
1376             } elsif ($aca->{leave}) {
1377 0 0         if (exists $self->{ct}->{attributes}->{$self->{ca}->{name}}) {
1378            
1379 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1380             ## Discard $self->{ca}.
1381             } else {
1382            
1383 0           $self->{ct}->{attributes}->{$self->{ca}->{name}} = $self->{ca};
1384 0           $self->{ca}->{index} = ++$self->{ct}->{last_index};
1385             }
1386             }
1387             }
1388              
1389 0 0         if (defined $action->{buffer}) {
1390 0 0         $self->{kwd} = '' if $action->{buffer}->{clear};
1391 0 0         $self->{kwd} .= chr ($nc + $action->{buffer}->{append})
1392             if defined $action->{buffer}->{append};
1393              
1394            
1395             }
1396              
1397 0 0         if (defined $action->{emit}) {
1398 0 0         if ($action->{emit} eq '') {
1399 0 0         if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
1400            
1401 0           $self->{last_stag_name} = $self->{ct}->{tag_name};
1402             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1403 0 0         if ($self->{ct}->{attributes}) {
1404            
1405 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1406             } else {
1407            
1408             }
1409             } else {
1410 0           die "$0: $self->{ct}->{type}: Unknown token type";
1411             }
1412            
1413 0 0         if ($action->{reconsume}) {
1414             #
1415             } else {
1416            
1417 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1418 0           $self->{line_prev} = $self->{line};
1419 0           $self->{column_prev} = $self->{column};
1420 0           $self->{column}++;
1421 0           $self->{nc}
1422             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1423             } else {
1424 0           $self->{set_nc}->($self);
1425             }
1426            
1427             }
1428 0           return ($self->{ct});
1429             } else {
1430 0           my $token = {type => $action->{emit}};
1431 0 0         if (defined $action->{emit_data}) {
    0          
1432 0           $token->{data} = $action->{emit_data};
1433 0 0         if ($action->{emit_data_append}) {
1434 0           $token->{data} .= chr $nc;
1435             }
1436             } elsif ($action->{emit} == CHARACTER_TOKEN) {
1437 0           $token->{data} .= chr $nc;
1438             }
1439 0 0         if ($action->{emit_delta}) {
1440 0           $token->{line} = $self->{line_prev};
1441 0           $token->{column} = $self->{column_prev} - $action->{emit_delta} + 1;
1442             } else {
1443 0           $token->{line} = $self->{line};
1444 0           $token->{column} = $self->{column};
1445             }
1446 0 0         if (defined $action->{emit_data_read_until}) {
1447 0           $self->{read_until}->($token->{data},
1448             $action->{emit_data_read_until},
1449             length $token->{data});
1450             }
1451            
1452 0 0         if ($action->{reconsume}) {
1453             #
1454             } else {
1455            
1456 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1457 0           $self->{line_prev} = $self->{line};
1458 0           $self->{column_prev} = $self->{column};
1459 0           $self->{column}++;
1460 0           $self->{nc}
1461             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1462             } else {
1463 0           $self->{set_nc}->($self);
1464             }
1465            
1466             }
1467 0           return ($token);
1468             }
1469             } else {
1470 0 0         if ($action->{reconsume}) {
1471             #
1472             } else {
1473            
1474 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1475 0           $self->{line_prev} = $self->{line};
1476 0           $self->{column_prev} = $self->{column};
1477 0           $self->{column}++;
1478 0           $self->{nc}
1479             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1480             } else {
1481 0           $self->{set_nc}->($self);
1482             }
1483            
1484             }
1485             }
1486              
1487 0           redo A;
1488             }
1489              
1490 0 0 0       if ({
    0 0        
    0 0        
    0 0        
    0 0        
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
1491             (RCDATA_END_TAG_OPEN_STATE) => 1,
1492             (RAWTEXT_END_TAG_OPEN_STATE) => 1,
1493             (SCRIPT_DATA_END_TAG_OPEN_STATE) => 1,
1494             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => 1,
1495             }->{$state}) {
1496             ## This switch-case implements "RCDATA end tag open state",
1497             ## "RAWTEXT end tag open state", "script data end tag open
1498             ## state", "RCDATA end tag name state", "RAWTEXT end tag name
1499             ## state", and "script data end tag name state" jointly with the
1500             ## implementation of the "tag name" state.
1501              
1502 0           my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1503              
1504 0 0         if (defined $self->{last_stag_name}) {
1505             #
1506             } else {
1507             ## No start tag token has ever been emitted
1508             ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1509            
1510 0 0         $self->{state} = {
1511             (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1512             (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1513             (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1514             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1515             => SCRIPT_DATA_ESCAPED_STATE,
1516             }->{$state} or die "${state}'s next state not found";
1517             ## Reconsume.
1518 0           return ({type => CHARACTER_TOKEN, data => '</',
1519             line => $l, column => $c});
1520 0           redo A;
1521             }
1522              
1523 0           my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
1524 0 0         if (length $ch) {
1525 0           my $CH = $ch;
1526 0           $ch =~ tr/a-z/A-Z/;
1527 0           my $nch = chr $nc;
1528 0 0 0       if ($nch eq $ch or $nch eq $CH) {
1529            
1530             ## Stay in the state.
1531 0           $self->{kwd} .= $nch;
1532            
1533 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1534 0           $self->{line_prev} = $self->{line};
1535 0           $self->{column_prev} = $self->{column};
1536 0           $self->{column}++;
1537 0           $self->{nc}
1538             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1539             } else {
1540 0           $self->{set_nc}->($self);
1541             }
1542            
1543 0           redo A;
1544             } else {
1545            
1546 0 0         $self->{state} = {
1547             (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1548             (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1549             (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1550             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1551             => SCRIPT_DATA_ESCAPED_STATE,
1552             }->{$state} or die "${state}'s next state not found";
1553             ## Reconsume.
1554 0           return ({type => CHARACTER_TOKEN,
1555             data => '</' . $self->{kwd},
1556             line => $self->{line_prev},
1557             column => $self->{column_prev} - 1 - length $self->{kwd},
1558             });
1559 0           redo A;
1560             }
1561             } else { # after "</{tag-name}"
1562 0 0 0       unless ($is_space->{$nc} or
1563             {
1564             0x003E => 1, # >
1565             0x002F => 1, # /
1566             }->{$nc}) {
1567            
1568             ## Reconsume.
1569 0 0         $self->{state} = {
1570             (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1571             (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1572             (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1573             (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1574             => SCRIPT_DATA_ESCAPED_STATE,
1575             }->{$self->{state}} or die "${state}'s next state not found";
1576 0           return ({type => CHARACTER_TOKEN,
1577             data => '</' . $self->{kwd},
1578             line => $self->{line_prev},
1579             column => $self->{column_prev} - 1 - length $self->{kwd},
1580             });
1581 0           redo A;
1582             } else {
1583            
1584 0           $self->{ct}
1585             = {type => END_TAG_TOKEN,
1586             tag_name => $self->{last_stag_name},
1587             line => $self->{line_prev},
1588             column => $self->{column_prev} - 1 - length $self->{kwd}};
1589 0           $self->{state} = TAG_NAME_STATE;
1590             ## Reconsume.
1591 0           redo A;
1592             }
1593             }
1594             } elsif ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE or
1595             $state == SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
1596 0 0 0       if ($is_space->{$nc} or
      0        
1597             $nc == 0x002F or # /
1598             $nc == 0x003E) { # >
1599 0           my $token = {type => CHARACTER_TOKEN,
1600             data => chr $nc,
1601             line => $self->{line}, column => $self->{column}};
1602 0 0         if ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE) {
1603 0 0         $self->{state} = $self->{kwd} eq 'script' # "temporary buffer"
1604             ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE
1605             : SCRIPT_DATA_ESCAPED_STATE;
1606             } else {
1607 0 0         $self->{state} = $self->{kwd} eq 'script' # "temporary buffer"
1608             ? SCRIPT_DATA_ESCAPED_STATE
1609             : SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1610             }
1611            
1612 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1613 0           $self->{line_prev} = $self->{line};
1614 0           $self->{column_prev} = $self->{column};
1615 0           $self->{column}++;
1616 0           $self->{nc}
1617             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1618             } else {
1619 0           $self->{set_nc}->($self);
1620             }
1621            
1622 0           return ($token);
1623 0           redo A;
1624             } else {
1625 0           die "$state/$nc is implemented";
1626             }
1627             } elsif ($state == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1628             ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1629             ## ATTLIST attribute value double quoted state".
1630            
1631 0 0 0       if ($nc == 0x0022) { # "
    0          
    0          
    0          
    0          
1632 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1633            
1634             ## XML5: "DOCTYPE ATTLIST name after state".
1635 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
1636 0           $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1637             } else {
1638            
1639             ## XML5: "Tag attribute name before state".
1640 0           $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1641             }
1642            
1643 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644 0           $self->{line_prev} = $self->{line};
1645 0           $self->{column_prev} = $self->{column};
1646 0           $self->{column}++;
1647 0           $self->{nc}
1648             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1649             } else {
1650 0           $self->{set_nc}->($self);
1651             }
1652            
1653 0           redo A;
1654             } elsif ($nc == 0x0026) { # &
1655            
1656             ## XML5: Not defined yet.
1657              
1658             ## NOTE: In the spec, the tokenizer is switched to the
1659             ## "entity in attribute value state". In this implementation, the
1660             ## tokenizer is switched to the |ENTITY_STATE|, which is an
1661             ## implementation of the "consume a character reference" algorithm.
1662 0           $self->{prev_state} = $state;
1663 0           $self->{entity_add} = 0x0022; # "
1664 0           $self->{state} = ENTITY_STATE;
1665            
1666 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1667 0           $self->{line_prev} = $self->{line};
1668 0           $self->{column_prev} = $self->{column};
1669 0           $self->{column}++;
1670 0           $self->{nc}
1671             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1672             } else {
1673 0           $self->{set_nc}->($self);
1674             }
1675            
1676 0           redo A;
1677             } elsif ($self->{is_xml} and
1678             $is_space->{$nc}) {
1679            
1680 0           $self->{ca}->{value} .= ' ';
1681             ## Stay in the state.
1682            
1683 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1684 0           $self->{line_prev} = $self->{line};
1685 0           $self->{column_prev} = $self->{column};
1686 0           $self->{column}++;
1687 0           $self->{nc}
1688             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1689             } else {
1690 0           $self->{set_nc}->($self);
1691             }
1692            
1693 0           redo A;
1694             } elsif ($nc == -1) {
1695 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1696 0 0         if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
1697            
1698 0           $self->{last_stag_name} = $self->{ct}->{tag_name};
1699              
1700 0           $self->{state} = DATA_STATE;
1701             ## reconsume
1702 0           return ($self->{ct}); # start tag
1703 0           redo A;
1704             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1705 0 0         if ($self->{ct}->{attributes}) {
1706            
1707 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1708             } else {
1709             ## NOTE: This state should never be reached.
1710            
1711             }
1712              
1713 0           $self->{state} = DATA_STATE;
1714             ## reconsume
1715              
1716             ## Discard the token.
1717             #return ($self->{ct}); # end tag
1718              
1719 0           redo A;
1720             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1721             ## XML5: No parse error above; not defined yet.
1722 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
1723 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1724             ## Reconsume.
1725              
1726             ## Discard the token.
1727             #return ($self->{ct}); # ATTLIST
1728              
1729 0           redo A;
1730             } else {
1731 0           die "$0: $self->{ct}->{type}: Unknown token type";
1732             }
1733             } elsif ($nc == 0x0000) {
1734 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
1735 0           $self->{ca}->{value} .= "\x{FFFD}";
1736             ## Stay in the state
1737            
1738 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1739 0           $self->{line_prev} = $self->{line};
1740 0           $self->{column_prev} = $self->{column};
1741 0           $self->{column}++;
1742 0           $self->{nc}
1743             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1744             } else {
1745 0           $self->{set_nc}->($self);
1746             }
1747            
1748 0           redo A;
1749             } else {
1750             ## XML5 [ATTLIST]: Not defined yet.
1751 0 0 0       if ($self->{is_xml} and $nc == 0x003C) { # <
1752            
1753             ## XML5: Not a parse error.
1754 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1755             } else {
1756            
1757             }
1758 0           $self->{ca}->{value} .= chr ($nc);
1759 0           $self->{read_until}->($self->{ca}->{value},
1760             qq[\x00"&<\x09\x0C\x20],
1761             length $self->{ca}->{value});
1762              
1763             ## Stay in the state
1764            
1765 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1766 0           $self->{line_prev} = $self->{line};
1767 0           $self->{column_prev} = $self->{column};
1768 0           $self->{column}++;
1769 0           $self->{nc}
1770             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1771             } else {
1772 0           $self->{set_nc}->($self);
1773             }
1774            
1775 0           redo A;
1776             }
1777             } elsif ($state == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1778             ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1779             ## ATTLIST attribute value single quoted state".
1780              
1781 0 0 0       if ($nc == 0x0027) { # '
    0          
    0          
    0          
    0          
1782 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1783            
1784             ## XML5: "DOCTYPE ATTLIST name after state".
1785 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
1786 0           $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1787             } else {
1788            
1789             ## XML5: "Before attribute name state" (sic).
1790 0           $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1791             }
1792            
1793 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1794 0           $self->{line_prev} = $self->{line};
1795 0           $self->{column_prev} = $self->{column};
1796 0           $self->{column}++;
1797 0           $self->{nc}
1798             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1799             } else {
1800 0           $self->{set_nc}->($self);
1801             }
1802            
1803 0           redo A;
1804             } elsif ($nc == 0x0026) { # &
1805            
1806             ## XML5: Not defined yet.
1807              
1808             ## NOTE: In the spec, the tokenizer is switched to the
1809             ## "entity in attribute value state". In this implementation, the
1810             ## tokenizer is switched to the |ENTITY_STATE|, which is an
1811             ## implementation of the "consume a character reference" algorithm.
1812 0           $self->{entity_add} = 0x0027; # '
1813 0           $self->{prev_state} = $state;
1814 0           $self->{state} = ENTITY_STATE;
1815            
1816 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1817 0           $self->{line_prev} = $self->{line};
1818 0           $self->{column_prev} = $self->{column};
1819 0           $self->{column}++;
1820 0           $self->{nc}
1821             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1822             } else {
1823 0           $self->{set_nc}->($self);
1824             }
1825            
1826 0           redo A;
1827             } elsif ($self->{is_xml} and
1828             $is_space->{$nc}) {
1829            
1830 0           $self->{ca}->{value} .= ' ';
1831             ## Stay in the state.
1832            
1833 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1834 0           $self->{line_prev} = $self->{line};
1835 0           $self->{column_prev} = $self->{column};
1836 0           $self->{column}++;
1837 0           $self->{nc}
1838             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1839             } else {
1840 0           $self->{set_nc}->($self);
1841             }
1842            
1843 0           redo A;
1844             } elsif ($nc == -1) {
1845 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1846 0 0         if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
1847            
1848 0           $self->{last_stag_name} = $self->{ct}->{tag_name};
1849              
1850 0           $self->{state} = DATA_STATE;
1851             ## reconsume
1852              
1853             ## Discard the token.
1854             #return ($self->{ct}); # start tag
1855              
1856 0           redo A;
1857             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1858 0 0         if ($self->{ct}->{attributes}) {
1859            
1860 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1861             } else {
1862             ## NOTE: This state should never be reached.
1863            
1864             }
1865              
1866 0           $self->{state} = DATA_STATE;
1867             ## reconsume
1868              
1869             ## Discard the token.
1870             #return ($self->{ct}); # end tag
1871              
1872 0           redo A;
1873             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1874             ## XML5: No parse error above; not defined yet.
1875 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
1876 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1877             ## Reconsume.
1878              
1879             ## Discard the token.
1880             #return ($self->{ct}); # ATTLIST
1881              
1882 0           redo A;
1883             } else {
1884 0           die "$0: $self->{ct}->{type}: Unknown token type";
1885             }
1886             } elsif ($nc == 0x0000) {
1887 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
1888 0           $self->{ca}->{value} .= "\x{FFFD}";
1889             ## Stay in the state
1890            
1891 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1892 0           $self->{line_prev} = $self->{line};
1893 0           $self->{column_prev} = $self->{column};
1894 0           $self->{column}++;
1895 0           $self->{nc}
1896             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1897             } else {
1898 0           $self->{set_nc}->($self);
1899             }
1900            
1901 0           redo A;
1902             } else {
1903             ## XML5 [ATTLIST]: Not defined yet.
1904 0 0 0       if ($self->{is_xml} and $nc == 0x003C) { # <
1905            
1906             ## XML5: Not a parse error.
1907 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1908             } else {
1909            
1910             }
1911 0           $self->{ca}->{value} .= chr ($nc);
1912 0           $self->{read_until}->($self->{ca}->{value},
1913             qq[\x00'&<\x09\x0C\x20],
1914             length $self->{ca}->{value});
1915              
1916             ## Stay in the state
1917            
1918 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1919 0           $self->{line_prev} = $self->{line};
1920 0           $self->{column_prev} = $self->{column};
1921 0           $self->{column}++;
1922 0           $self->{nc}
1923             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1924             } else {
1925 0           $self->{set_nc}->($self);
1926             }
1927            
1928 0           redo A;
1929             }
1930             } elsif ($state == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1931             ## XML5: "Tag attribute value unquoted state".
1932              
1933 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
1934 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1935            
1936 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
1937 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1938             } else {
1939            
1940             ## XML5: "Tag attribute name before state".
1941 0           $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1942             }
1943            
1944 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1945 0           $self->{line_prev} = $self->{line};
1946 0           $self->{column_prev} = $self->{column};
1947 0           $self->{column}++;
1948 0           $self->{nc}
1949             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1950             } else {
1951 0           $self->{set_nc}->($self);
1952             }
1953            
1954 0           redo A;
1955             } elsif ($nc == 0x0026) { # &
1956            
1957              
1958             ## XML5: Not defined yet.
1959              
1960             ## NOTE: In the spec, the tokenizer is switched to the
1961             ## "character reference in attribute value state". In this
1962             ## implementation, the tokenizer is switched to the
1963             ## |ENTITY_STATE|, which is an implementation of the "consume
1964             ## a character reference" algorithm.
1965 0           $self->{entity_add} = 0x003E; # >
1966 0           $self->{prev_state} = $state;
1967 0           $self->{state} = ENTITY_STATE;
1968            
1969 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1970 0           $self->{line_prev} = $self->{line};
1971 0           $self->{column_prev} = $self->{column};
1972 0           $self->{column}++;
1973 0           $self->{nc}
1974             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1975             } else {
1976 0           $self->{set_nc}->($self);
1977             }
1978            
1979 0           redo A;
1980             } elsif ($nc == 0x003E) { # >
1981 0 0         if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
1982            
1983 0           $self->{last_stag_name} = $self->{ct}->{tag_name};
1984              
1985 0           $self->{state} = DATA_STATE;
1986            
1987 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1988 0           $self->{line_prev} = $self->{line};
1989 0           $self->{column_prev} = $self->{column};
1990 0           $self->{column}++;
1991 0           $self->{nc}
1992             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1993             } else {
1994 0           $self->{set_nc}->($self);
1995             }
1996            
1997 0           return ($self->{ct}); # start tag
1998 0           redo A;
1999             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2000 0 0         if ($self->{ct}->{attributes}) {
2001            
2002 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2003             } else {
2004             ## NOTE: This state should never be reached.
2005            
2006             }
2007              
2008 0           $self->{state} = DATA_STATE;
2009            
2010 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2011 0           $self->{line_prev} = $self->{line};
2012 0           $self->{column_prev} = $self->{column};
2013 0           $self->{column}++;
2014 0           $self->{nc}
2015             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2016             } else {
2017 0           $self->{set_nc}->($self);
2018             }
2019            
2020 0           return ($self->{ct}); # end tag
2021 0           redo A;
2022             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2023 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
2024 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2025            
2026 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2027 0           $self->{line_prev} = $self->{line};
2028 0           $self->{column_prev} = $self->{column};
2029 0           $self->{column}++;
2030 0           $self->{nc}
2031             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2032             } else {
2033 0           $self->{set_nc}->($self);
2034             }
2035            
2036 0           return ($self->{ct}); # ATTLIST
2037 0           redo A;
2038             } else {
2039 0           die "$0: $self->{ct}->{type}: Unknown token type";
2040             }
2041             } elsif ($nc == -1) {
2042 0 0         if ($self->{ct}->{type} == START_TAG_TOKEN) {
    0          
    0          
2043            
2044 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2045 0           $self->{last_stag_name} = $self->{ct}->{tag_name};
2046              
2047 0           $self->{state} = DATA_STATE;
2048             ## reconsume
2049              
2050             ## Discard the token.
2051             #return ($self->{ct}); # start tag
2052            
2053 0           redo A;
2054             } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2055 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2056 0 0         if ($self->{ct}->{attributes}) {
2057            
2058 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2059             } else {
2060             ## NOTE: This state should never be reached.
2061            
2062             }
2063              
2064 0           $self->{state} = DATA_STATE;
2065             ## reconsume
2066              
2067             ## Discard the token.
2068             #return ($self->{ct}); # end tag
2069              
2070 0           redo A;
2071             } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2072 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2073 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
2074 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2075             ## Reconsume.
2076              
2077             ## Discard the token.
2078             #return ($self->{ct}); # ATTLIST
2079              
2080 0           redo A;
2081             } else {
2082 0           die "$0: $self->{ct}->{type}: Unknown token type";
2083             }
2084             } elsif ($nc == 0x0000) {
2085 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2086 0           $self->{ca}->{value} .= "\x{FFFD}";
2087             ## Stay in the state
2088            
2089 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2090 0           $self->{line_prev} = $self->{line};
2091 0           $self->{column_prev} = $self->{column};
2092 0           $self->{column}++;
2093 0           $self->{nc}
2094             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2095             } else {
2096 0           $self->{set_nc}->($self);
2097             }
2098            
2099 0           redo A;
2100             } else {
2101 0 0         if ({
2102             0x0022 => 1, # "
2103             0x0027 => 1, # '
2104             0x003D => 1, # =
2105             0x003C => 1, # <
2106             0x0060 => 1, # `
2107             }->{$nc}) {
2108            
2109             ## XML5: Not a parse error.
2110 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2111             } else {
2112            
2113             }
2114 0           $self->{ca}->{value} .= chr ($nc);
2115 0           $self->{read_until}->($self->{ca}->{value},
2116             qq[\x00"'=&` \x09\x0C<>],
2117             length $self->{ca}->{value});
2118              
2119             ## Stay in the state
2120            
2121 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2122 0           $self->{line_prev} = $self->{line};
2123 0           $self->{column_prev} = $self->{column};
2124 0           $self->{column}++;
2125 0           $self->{nc}
2126             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2127             } else {
2128 0           $self->{set_nc}->($self);
2129             }
2130            
2131 0           redo A;
2132             }
2133             } elsif ($state == SELF_CLOSING_START_TAG_STATE) {
2134             ## XML5: "Empty tag state".
2135              
2136 0 0         if ($nc == 0x003E) { # >
2137 0 0         if ($self->{ct}->{type} == END_TAG_TOKEN) {
2138            
2139 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2140             ## XXX: Different type than slash in start tag
2141 0 0         if ($self->{ct}->{attributes}) {
2142            
2143 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2144             } else {
2145            
2146             }
2147             ## XXX: Test |<title></title/>|
2148             } else {
2149            
2150 0           $self->{self_closing} = 1;
2151             }
2152              
2153 0           $self->{state} = DATA_STATE;
2154            
2155 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2156 0           $self->{line_prev} = $self->{line};
2157 0           $self->{column_prev} = $self->{column};
2158 0           $self->{column}++;
2159 0           $self->{nc}
2160             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2161             } else {
2162 0           $self->{set_nc}->($self);
2163             }
2164            
2165              
2166 0           return ($self->{ct}); # start tag or end tag
2167              
2168 0           redo A;
2169             } else {
2170 0           die "$state/$nc is implemented";
2171             }
2172             } elsif ($state == BOGUS_COMMENT_STATE) {
2173             ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2174              
2175             ## NOTE: Unlike spec's "bogus comment state", this implementation
2176             ## consumes characters one-by-one basis.
2177            
2178 0 0         if ($nc == 0x003E) { # >
    0          
    0          
2179 0 0         if ($self->{in_subset}) {
2180            
2181 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2182             } else {
2183            
2184 0           $self->{state} = DATA_STATE;
2185             }
2186            
2187 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2188 0           $self->{line_prev} = $self->{line};
2189 0           $self->{column_prev} = $self->{column};
2190 0           $self->{column}++;
2191 0           $self->{nc}
2192             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2193             } else {
2194 0           $self->{set_nc}->($self);
2195             }
2196            
2197              
2198 0           return ($self->{ct}); # comment
2199 0           redo A;
2200             } elsif ($nc == -1) {
2201 0 0         if ($self->{in_subset}) {
2202            
2203 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2204             } else {
2205            
2206 0           $self->{state} = DATA_STATE;
2207             }
2208             ## reconsume
2209              
2210 0           return ($self->{ct}); # comment
2211 0           redo A;
2212             } elsif ($nc == 0x0000) {
2213 0           $self->{ct}->{data} .= "\x{FFFD}"; # comment
2214             ## Stay in the state.
2215            
2216 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2217 0           $self->{line_prev} = $self->{line};
2218 0           $self->{column_prev} = $self->{column};
2219 0           $self->{column}++;
2220 0           $self->{nc}
2221             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2222             } else {
2223 0           $self->{set_nc}->($self);
2224             }
2225            
2226 0           redo A;
2227             } else {
2228            
2229 0           $self->{ct}->{data} .= chr ($nc); # comment
2230 0           $self->{read_until}->($self->{ct}->{data},
2231             qq[\x00>],
2232             length $self->{ct}->{data});
2233              
2234             ## Stay in the state.
2235            
2236 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2237 0           $self->{line_prev} = $self->{line};
2238 0           $self->{column_prev} = $self->{column};
2239 0           $self->{column}++;
2240 0           $self->{nc}
2241             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2242             } else {
2243 0           $self->{set_nc}->($self);
2244             }
2245            
2246 0           redo A;
2247             }
2248             } elsif ($state == MARKUP_DECLARATION_OPEN_STATE) {
2249             ## XML5: "Markup declaration state".
2250            
2251 0 0 0       if ($nc == 0x002D) { # -
    0          
2252            
2253 0           $self->{state} = MD_HYPHEN_STATE;
2254            
2255 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2256 0           $self->{line_prev} = $self->{line};
2257 0           $self->{column_prev} = $self->{column};
2258 0           $self->{column}++;
2259 0           $self->{nc}
2260             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2261             } else {
2262 0           $self->{set_nc}->($self);
2263             }
2264            
2265 0           redo A;
2266             } elsif ($nc == 0x0044 or # D
2267             $nc == 0x0064) { # d
2268             ## ASCII case-insensitive.
2269            
2270 0           $self->{state} = MD_DOCTYPE_STATE;
2271 0           $self->{kwd} = chr $nc;
2272            
2273 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2274 0           $self->{line_prev} = $self->{line};
2275 0           $self->{column_prev} = $self->{column};
2276 0           $self->{column}++;
2277 0           $self->{nc}
2278             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2279             } else {
2280 0           $self->{set_nc}->($self);
2281             }
2282            
2283 0           redo A;
2284             # $nc == 0x005B) { # [
2285            
2286 0           $self->{state} = MD_CDATA_STATE;
2287 0           $self->{kwd} = '[';
2288            
2289 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2290 0           $self->{line_prev} = $self->{line};
2291 0           $self->{column_prev} = $self->{column};
2292 0           $self->{column}++;
2293 0           $self->{nc}
2294             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2295             } else {
2296 0           $self->{set_nc}->($self);
2297             }
2298            
2299 0           redo A;
2300             } else {
2301            
2302             }
2303              
2304 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2305             line => $self->{line_prev},
2306             column => $self->{column_prev} - 1);
2307             ## Reconsume.
2308 0           $self->{state} = BOGUS_COMMENT_STATE;
2309 0           $self->{ct} = {type => COMMENT_TOKEN, data => '',
2310             line => $self->{line_prev},
2311             column => $self->{column_prev} - 1,
2312             };
2313 0           redo A;
2314             } elsif ($state == MD_DOCTYPE_STATE) {
2315             ## ASCII case-insensitive.
2316 0 0 0       if ($nc == [
    0 0        
      0        
2317             undef,
2318             0x004F, # O
2319             0x0043, # C
2320             0x0054, # T
2321             0x0059, # Y
2322             0x0050, # P
2323             NEVER_CHAR, # (E)
2324             ]->[length $self->{kwd}] or
2325             $nc == [
2326             undef,
2327             0x006F, # o
2328             0x0063, # c
2329             0x0074, # t
2330             0x0079, # y
2331             0x0070, # p
2332             NEVER_CHAR, # (e)
2333             ]->[length $self->{kwd}]) {
2334            
2335             ## Stay in the state.
2336 0           $self->{kwd} .= chr $nc;
2337            
2338 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339 0           $self->{line_prev} = $self->{line};
2340 0           $self->{column_prev} = $self->{column};
2341 0           $self->{column}++;
2342 0           $self->{nc}
2343             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2344             } else {
2345 0           $self->{set_nc}->($self);
2346             }
2347            
2348 0           redo A;
2349             } elsif ((length $self->{kwd}) == 6 and
2350             ($nc == 0x0045 or # E
2351             $nc == 0x0065)) { # e
2352 0 0 0       if ($self->{is_xml} and
      0        
2353             ($self->{kwd} ne 'DOCTYP' or $nc == 0x0065)) {
2354            
2355             ## XML5: case-sensitive.
2356 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2357             text => 'DOCTYPE',
2358             line => $self->{line_prev},
2359             column => $self->{column_prev} - 5);
2360             } else {
2361            
2362             }
2363 0           $self->{state} = DOCTYPE_STATE;
2364 0           $self->{ct} = {type => DOCTYPE_TOKEN,
2365             quirks => 1,
2366             line => $self->{line_prev},
2367             column => $self->{column_prev} - 7,
2368             };
2369            
2370 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2371 0           $self->{line_prev} = $self->{line};
2372 0           $self->{column_prev} = $self->{column};
2373 0           $self->{column}++;
2374 0           $self->{nc}
2375             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2376             } else {
2377 0           $self->{set_nc}->($self);
2378             }
2379            
2380 0           redo A;
2381             } else {
2382            
2383 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2384             line => $self->{line_prev},
2385             column => $self->{column_prev} - 1 - length $self->{kwd});
2386 0           $self->{state} = BOGUS_COMMENT_STATE;
2387             ## Reconsume.
2388 0           $self->{ct} = {type => COMMENT_TOKEN,
2389             data => $self->{kwd},
2390             line => $self->{line_prev},
2391             column => $self->{column_prev} - 1 - length $self->{kwd},
2392             };
2393 0           redo A;
2394             }
2395             } elsif ($state == MD_CDATA_STATE) {
2396 0 0 0       if ($nc == {
    0          
2397             '[' => 0x0043, # C
2398             '[C' => 0x0044, # D
2399             '[CD' => 0x0041, # A
2400             '[CDA' => 0x0054, # T
2401             '[CDAT' => 0x0041, # A
2402             '[CDATA' => NEVER_CHAR, # ([)
2403             }->{$self->{kwd}}) {
2404            
2405             ## Stay in the state.
2406 0           $self->{kwd} .= chr $nc;
2407            
2408 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2409 0           $self->{line_prev} = $self->{line};
2410 0           $self->{column_prev} = $self->{column};
2411 0           $self->{column}++;
2412 0           $self->{nc}
2413             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2414             } else {
2415 0           $self->{set_nc}->($self);
2416             }
2417            
2418 0           redo A;
2419             } elsif ($self->{kwd} eq '[CDATA' and
2420             $nc == 0x005B) { # [
2421 0 0 0       if ($self->{is_xml} and
  0 0 0        
2422             not $self->{tainted} and
2423             @{$self->{open_elements} or []} == 0) {
2424            
2425 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2426             line => $self->{line_prev},
2427             column => $self->{column_prev} - 7);
2428 0           $self->{tainted} = 1;
2429             } else {
2430            
2431             }
2432              
2433 0           $self->{ct} = {type => CHARACTER_TOKEN,
2434             data => '',
2435             line => $self->{line_prev},
2436             column => $self->{column_prev} - 7};
2437 0           $self->{state} = CDATA_SECTION_STATE;
2438            
2439 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2440 0           $self->{line_prev} = $self->{line};
2441 0           $self->{column_prev} = $self->{column};
2442 0           $self->{column}++;
2443 0           $self->{nc}
2444             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2445             } else {
2446 0           $self->{set_nc}->($self);
2447             }
2448            
2449 0           redo A;
2450             } else {
2451            
2452 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2453             line => $self->{line_prev},
2454             column => $self->{column_prev} - 1 - length $self->{kwd});
2455 0           $self->{state} = BOGUS_COMMENT_STATE;
2456             ## Reconsume.
2457 0           $self->{ct} = {type => COMMENT_TOKEN,
2458             data => $self->{kwd},
2459             line => $self->{line_prev},
2460             column => $self->{column_prev} - 1 - length $self->{kwd},
2461             };
2462 0           redo A;
2463             }
2464             } elsif ($state == COMMENT_START_STATE) {
2465 0 0         if ($nc == 0x002D) { # -
    0          
    0          
    0          
2466            
2467 0           $self->{state} = COMMENT_START_DASH_STATE;
2468            
2469 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470 0           $self->{line_prev} = $self->{line};
2471 0           $self->{column_prev} = $self->{column};
2472 0           $self->{column}++;
2473 0           $self->{nc}
2474             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2475             } else {
2476 0           $self->{set_nc}->($self);
2477             }
2478            
2479 0           redo A;
2480             } elsif ($nc == 0x003E) { # >
2481 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2482 0 0         if ($self->{in_subset}) {
2483            
2484 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2485             } else {
2486            
2487 0           $self->{state} = DATA_STATE;
2488             }
2489            
2490 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2491 0           $self->{line_prev} = $self->{line};
2492 0           $self->{column_prev} = $self->{column};
2493 0           $self->{column}++;
2494 0           $self->{nc}
2495             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2496             } else {
2497 0           $self->{set_nc}->($self);
2498             }
2499            
2500              
2501 0           return ($self->{ct}); # comment
2502              
2503 0           redo A;
2504             } elsif ($nc == -1) {
2505 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2506 0 0         if ($self->{in_subset}) {
2507            
2508 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2509             } else {
2510            
2511 0           $self->{state} = DATA_STATE;
2512             }
2513             ## reconsume
2514              
2515 0           return ($self->{ct}); # comment
2516              
2517 0           redo A;
2518             } elsif ($nc == 0x0000) {
2519 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2520 0           $self->{ct}->{data} .= "\x{FFFD}"; # comment
2521 0           $self->{state} = COMMENT_STATE;
2522            
2523 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2524 0           $self->{line_prev} = $self->{line};
2525 0           $self->{column_prev} = $self->{column};
2526 0           $self->{column}++;
2527 0           $self->{nc}
2528             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2529             } else {
2530 0           $self->{set_nc}->($self);
2531             }
2532            
2533 0           redo A;
2534             } else {
2535            
2536 0           $self->{ct}->{data} # comment
2537             .= chr ($nc);
2538 0           $self->{state} = COMMENT_STATE;
2539            
2540 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2541 0           $self->{line_prev} = $self->{line};
2542 0           $self->{column_prev} = $self->{column};
2543 0           $self->{column}++;
2544 0           $self->{nc}
2545             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2546             } else {
2547 0           $self->{set_nc}->($self);
2548             }
2549            
2550 0           redo A;
2551             }
2552             } elsif ($state == COMMENT_START_DASH_STATE) {
2553 0 0         if ($nc == 0x002D) { # -
    0          
    0          
    0          
2554            
2555 0           $self->{state} = COMMENT_END_STATE;
2556            
2557 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2558 0           $self->{line_prev} = $self->{line};
2559 0           $self->{column_prev} = $self->{column};
2560 0           $self->{column}++;
2561 0           $self->{nc}
2562             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2563             } else {
2564 0           $self->{set_nc}->($self);
2565             }
2566            
2567 0           redo A;
2568             } elsif ($nc == 0x003E) { # >
2569 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2570 0 0         if ($self->{in_subset}) {
2571            
2572 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2573             } else {
2574            
2575 0           $self->{state} = DATA_STATE;
2576             }
2577            
2578 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2579 0           $self->{line_prev} = $self->{line};
2580 0           $self->{column_prev} = $self->{column};
2581 0           $self->{column}++;
2582 0           $self->{nc}
2583             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2584             } else {
2585 0           $self->{set_nc}->($self);
2586             }
2587            
2588              
2589 0           return ($self->{ct}); # comment
2590              
2591 0           redo A;
2592             } elsif ($nc == -1) {
2593 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2594 0 0         if ($self->{in_subset}) {
2595            
2596 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2597             } else {
2598            
2599 0           $self->{state} = DATA_STATE;
2600             }
2601             ## reconsume
2602              
2603 0           return ($self->{ct}); # comment
2604              
2605 0           redo A;
2606             } elsif ($nc == 0x0000) {
2607 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2608 0           $self->{ct}->{data} .= "-\x{FFFD}"; # comment
2609 0           $self->{state} = COMMENT_STATE;
2610            
2611 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2612 0           $self->{line_prev} = $self->{line};
2613 0           $self->{column_prev} = $self->{column};
2614 0           $self->{column}++;
2615 0           $self->{nc}
2616             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2617             } else {
2618 0           $self->{set_nc}->($self);
2619             }
2620            
2621 0           redo A;
2622             } else {
2623            
2624 0           $self->{ct}->{data} # comment
2625             .= '-' . chr ($nc);
2626 0           $self->{state} = COMMENT_STATE;
2627            
2628 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2629 0           $self->{line_prev} = $self->{line};
2630 0           $self->{column_prev} = $self->{column};
2631 0           $self->{column}++;
2632 0           $self->{nc}
2633             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2634             } else {
2635 0           $self->{set_nc}->($self);
2636             }
2637            
2638 0           redo A;
2639             }
2640             } elsif ($state == COMMENT_STATE) {
2641             ## XML5: "Comment state" and "DOCTYPE comment state".
2642              
2643 0 0         if ($nc == 0x002D) { # -
    0          
    0          
2644            
2645 0           $self->{state} = COMMENT_END_DASH_STATE;
2646            
2647 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2648 0           $self->{line_prev} = $self->{line};
2649 0           $self->{column_prev} = $self->{column};
2650 0           $self->{column}++;
2651 0           $self->{nc}
2652             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2653             } else {
2654 0           $self->{set_nc}->($self);
2655             }
2656            
2657 0           redo A;
2658             } elsif ($nc == -1) {
2659 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2660 0 0         if ($self->{in_subset}) {
2661            
2662 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2663             } else {
2664            
2665 0           $self->{state} = DATA_STATE;
2666             }
2667             ## reconsume
2668              
2669 0           return ($self->{ct}); # comment
2670              
2671 0           redo A;
2672             } elsif ($nc == 0x0000) {
2673 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2674 0           $self->{ct}->{data} .= "\x{FFFD}"; # comment
2675            
2676 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2677 0           $self->{line_prev} = $self->{line};
2678 0           $self->{column_prev} = $self->{column};
2679 0           $self->{column}++;
2680 0           $self->{nc}
2681             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2682             } else {
2683 0           $self->{set_nc}->($self);
2684             }
2685            
2686 0           redo A;
2687             } else {
2688            
2689 0           $self->{ct}->{data} .= chr ($nc); # comment
2690 0           $self->{read_until}->($self->{ct}->{data},
2691             qq[-\x00],
2692             length $self->{ct}->{data});
2693              
2694             ## Stay in the state
2695            
2696 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2697 0           $self->{line_prev} = $self->{line};
2698 0           $self->{column_prev} = $self->{column};
2699 0           $self->{column}++;
2700 0           $self->{nc}
2701             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2702             } else {
2703 0           $self->{set_nc}->($self);
2704             }
2705            
2706 0           redo A;
2707             }
2708             } elsif ($state == COMMENT_END_DASH_STATE) {
2709             ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2710              
2711 0 0         if ($nc == 0x002D) { # -
    0          
    0          
2712            
2713 0           $self->{state} = COMMENT_END_STATE;
2714            
2715 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2716 0           $self->{line_prev} = $self->{line};
2717 0           $self->{column_prev} = $self->{column};
2718 0           $self->{column}++;
2719 0           $self->{nc}
2720             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2721             } else {
2722 0           $self->{set_nc}->($self);
2723             }
2724            
2725 0           redo A;
2726             } elsif ($nc == -1) {
2727 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2728 0 0         if ($self->{in_subset}) {
2729            
2730 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2731             } else {
2732            
2733 0           $self->{state} = DATA_STATE;
2734             }
2735             ## reconsume
2736              
2737 0           return ($self->{ct}); # comment
2738              
2739 0           redo A;
2740             } elsif ($nc == 0x0000) {
2741 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2742 0           $self->{ct}->{data} .= "-\x{FFFD}"; # comment
2743 0           $self->{state} = COMMENT_STATE;
2744            
2745 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2746 0           $self->{line_prev} = $self->{line};
2747 0           $self->{column_prev} = $self->{column};
2748 0           $self->{column}++;
2749 0           $self->{nc}
2750             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2751             } else {
2752 0           $self->{set_nc}->($self);
2753             }
2754            
2755 0           redo A;
2756             } else {
2757            
2758 0           $self->{ct}->{data} .= '-' . chr ($nc); # comment
2759 0           $self->{state} = COMMENT_STATE;
2760            
2761 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2762 0           $self->{line_prev} = $self->{line};
2763 0           $self->{column_prev} = $self->{column};
2764 0           $self->{column}++;
2765 0           $self->{nc}
2766             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2767             } else {
2768 0           $self->{set_nc}->($self);
2769             }
2770            
2771 0           redo A;
2772             }
2773             } elsif ($state == COMMENT_END_STATE or
2774             $state == COMMENT_END_BANG_STATE) {
2775             ## XML5: "Comment end state" and "DOCTYPE comment end state".
2776             ## (No comment end bang state.)
2777              
2778 0 0 0       if ($nc == 0x003E) { # >
    0          
    0          
    0          
    0          
2779 0 0         if ($self->{in_subset}) {
2780            
2781 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2782             } else {
2783            
2784 0           $self->{state} = DATA_STATE;
2785             }
2786            
2787 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788 0           $self->{line_prev} = $self->{line};
2789 0           $self->{column_prev} = $self->{column};
2790 0           $self->{column}++;
2791 0           $self->{nc}
2792             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2793             } else {
2794 0           $self->{set_nc}->($self);
2795             }
2796            
2797              
2798 0           return ($self->{ct}); # comment
2799              
2800 0           redo A;
2801             } elsif ($nc == 0x002D) { # -
2802 0 0         if ($state == COMMENT_END_BANG_STATE) {
2803            
2804 0           $self->{ct}->{data} .= '--!'; # comment
2805 0           $self->{state} = COMMENT_END_DASH_STATE;
2806             } else {
2807            
2808             ## XML5: Not a parse error.
2809 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2810             line => $self->{line_prev},
2811             column => $self->{column_prev});
2812 0           $self->{ct}->{data} .= '-'; # comment
2813             ## Stay in the state
2814             }
2815            
2816 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2817 0           $self->{line_prev} = $self->{line};
2818 0           $self->{column_prev} = $self->{column};
2819 0           $self->{column}++;
2820 0           $self->{nc}
2821             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2822             } else {
2823 0           $self->{set_nc}->($self);
2824             }
2825            
2826 0           redo A;
2827             } elsif ($state != COMMENT_END_BANG_STATE and
2828             $nc == 0x0021) { # !
2829            
2830 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
2831 0           $self->{state} = COMMENT_END_BANG_STATE;
2832            
2833 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2834 0           $self->{line_prev} = $self->{line};
2835 0           $self->{column_prev} = $self->{column};
2836 0           $self->{column}++;
2837 0           $self->{nc}
2838             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2839             } else {
2840 0           $self->{set_nc}->($self);
2841             }
2842            
2843 0           redo A;
2844             } elsif ($nc == -1) {
2845 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2846 0 0         if ($self->{in_subset}) {
2847            
2848 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2849             } else {
2850            
2851 0           $self->{state} = DATA_STATE;
2852             }
2853             ## Reconsume.
2854              
2855 0           return ($self->{ct}); # comment
2856              
2857 0           redo A;
2858             } elsif ($nc == 0x0000) {
2859 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2860 0 0         if ($state == COMMENT_END_BANG_STATE) {
2861 0           $self->{ct}->{data} .= "--!\x{FFFD}"; # comment
2862             } else {
2863 0           $self->{ct}->{data} .= "--\x{FFFD}"; # comment
2864             }
2865 0           $self->{state} = COMMENT_STATE;
2866            
2867 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2868 0           $self->{line_prev} = $self->{line};
2869 0           $self->{column_prev} = $self->{column};
2870 0           $self->{column}++;
2871 0           $self->{nc}
2872             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2873             } else {
2874 0           $self->{set_nc}->($self);
2875             }
2876            
2877 0           redo A;
2878             } else {
2879            
2880 0 0         if ($state == COMMENT_END_BANG_STATE) {
2881 0           $self->{ct}->{data} .= '--!' . chr ($nc); # comment
2882             } else {
2883 0           $self->{ct}->{data} .= '--' . chr ($nc); # comment
2884             }
2885 0           $self->{state} = COMMENT_STATE;
2886            
2887 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2888 0           $self->{line_prev} = $self->{line};
2889 0           $self->{column_prev} = $self->{column};
2890 0           $self->{column}++;
2891 0           $self->{nc}
2892             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2893             } else {
2894 0           $self->{set_nc}->($self);
2895             }
2896            
2897 0           redo A;
2898             }
2899             } elsif ($state == DOCTYPE_STATE) {
2900 0 0         if ($is_space->{$nc}) {
    0          
2901            
2902 0           $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2903            
2904 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2905 0           $self->{line_prev} = $self->{line};
2906 0           $self->{column_prev} = $self->{column};
2907 0           $self->{column}++;
2908 0           $self->{nc}
2909             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2910             } else {
2911 0           $self->{set_nc}->($self);
2912             }
2913            
2914 0           redo A;
2915             } elsif ($nc == -1) {
2916            
2917 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2918 0           $self->{ct}->{quirks} = 1;
2919              
2920 0           $self->{state} = DATA_STATE;
2921             ## Reconsume.
2922 0           return ($self->{ct}); # DOCTYPE (quirks)
2923              
2924 0           redo A;
2925             } else {
2926            
2927             ## XML5: Switch to the bogus comment state.
2928 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2929 0           $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2930             ## reconsume
2931 0           redo A;
2932             }
2933             } elsif ($state == BEFORE_DOCTYPE_NAME_STATE) {
2934             ## XML5: "DOCTYPE root name before state".
2935              
2936 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
    0          
2937            
2938             ## Stay in the state
2939            
2940 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2941 0           $self->{line_prev} = $self->{line};
2942 0           $self->{column_prev} = $self->{column};
2943 0           $self->{column}++;
2944 0           $self->{nc}
2945             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2946             } else {
2947 0           $self->{set_nc}->($self);
2948             }
2949            
2950 0           redo A;
2951             } elsif ($nc == 0x003E) { # >
2952            
2953             ## XML5: No parse error.
2954 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2955 0           $self->{state} = DATA_STATE;
2956            
2957 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2958 0           $self->{line_prev} = $self->{line};
2959 0           $self->{column_prev} = $self->{column};
2960 0           $self->{column}++;
2961 0           $self->{nc}
2962             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2963             } else {
2964 0           $self->{set_nc}->($self);
2965             }
2966            
2967              
2968 0           return ($self->{ct}); # DOCTYPE (quirks)
2969              
2970 0           redo A;
2971             } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z
2972            
2973 0 0         $self->{ct}->{name} # DOCTYPE
2974             = chr ($nc + ($self->{is_xml} ? 0 : 0x0020));
2975 0           delete $self->{ct}->{quirks};
2976 0           $self->{state} = DOCTYPE_NAME_STATE;
2977            
2978 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2979 0           $self->{line_prev} = $self->{line};
2980 0           $self->{column_prev} = $self->{column};
2981 0           $self->{column}++;
2982 0           $self->{nc}
2983             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2984             } else {
2985 0           $self->{set_nc}->($self);
2986             }
2987            
2988 0           redo A;
2989             } elsif ($nc == -1) {
2990            
2991 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2992 0           $self->{state} = DATA_STATE;
2993             ## reconsume
2994              
2995 0           return ($self->{ct}); # DOCTYPE (quirks)
2996              
2997 0           redo A;
2998             } elsif ($self->{is_xml} and $nc == 0x005B) { # [
2999            
3000 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3001 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3002 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3003 0           $self->{in_subset} = 1;
3004            
3005 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3006 0           $self->{line_prev} = $self->{line};
3007 0           $self->{column_prev} = $self->{column};
3008 0           $self->{column}++;
3009 0           $self->{nc}
3010             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3011             } else {
3012 0           $self->{set_nc}->($self);
3013             }
3014            
3015 0           return ($self->{ct}); # DOCTYPE
3016 0           redo A;
3017             } elsif ($nc == 0x0000) {
3018 0           $self->{ct}->{name} = "\x{FFFD}";
3019 0           delete $self->{ct}->{quirks};
3020 0           $self->{state} = DOCTYPE_NAME_STATE;
3021            
3022 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3023 0           $self->{line_prev} = $self->{line};
3024 0           $self->{column_prev} = $self->{column};
3025 0           $self->{column}++;
3026 0           $self->{nc}
3027             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3028             } else {
3029 0           $self->{set_nc}->($self);
3030             }
3031            
3032 0           redo A;
3033             } else {
3034            
3035 0           $self->{ct}->{name} = chr $nc;
3036 0           delete $self->{ct}->{quirks};
3037 0           $self->{state} = DOCTYPE_NAME_STATE;
3038            
3039 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3040 0           $self->{line_prev} = $self->{line};
3041 0           $self->{column_prev} = $self->{column};
3042 0           $self->{column}++;
3043 0           $self->{nc}
3044             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3045             } else {
3046 0           $self->{set_nc}->($self);
3047             }
3048            
3049 0           redo A;
3050             }
3051             } elsif ($state == DOCTYPE_NAME_STATE) {
3052             ## XML5: "DOCTYPE root name state".
3053              
3054 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
    0          
3055            
3056 0           $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3057            
3058 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3059 0           $self->{line_prev} = $self->{line};
3060 0           $self->{column_prev} = $self->{column};
3061 0           $self->{column}++;
3062 0           $self->{nc}
3063             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3064             } else {
3065 0           $self->{set_nc}->($self);
3066             }
3067            
3068 0           redo A;
3069             } elsif ($nc == 0x003E) { # >
3070            
3071 0           $self->{state} = DATA_STATE;
3072            
3073 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3074 0           $self->{line_prev} = $self->{line};
3075 0           $self->{column_prev} = $self->{column};
3076 0           $self->{column}++;
3077 0           $self->{nc}
3078             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3079             } else {
3080 0           $self->{set_nc}->($self);
3081             }
3082            
3083              
3084 0           return ($self->{ct}); # DOCTYPE
3085              
3086 0           redo A;
3087             } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z
3088            
3089 0 0         $self->{ct}->{name} # DOCTYPE
3090             .= chr ($nc + ($self->{is_xml} ? 0 : 0x0020));
3091 0           delete $self->{ct}->{quirks};
3092             ## Stay in the state.
3093            
3094 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3095 0           $self->{line_prev} = $self->{line};
3096 0           $self->{column_prev} = $self->{column};
3097 0           $self->{column}++;
3098 0           $self->{nc}
3099             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3100             } else {
3101 0           $self->{set_nc}->($self);
3102             }
3103            
3104 0           redo A;
3105             } elsif ($nc == -1) {
3106            
3107 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3108 0           $self->{state} = DATA_STATE;
3109             ## reconsume
3110              
3111 0           $self->{ct}->{quirks} = 1;
3112 0           return ($self->{ct}); # DOCTYPE
3113              
3114 0           redo A;
3115             } elsif ($self->{is_xml} and $nc == 0x005B) { # [
3116            
3117 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3118 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3119 0           $self->{in_subset} = 1;
3120            
3121 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122 0           $self->{line_prev} = $self->{line};
3123 0           $self->{column_prev} = $self->{column};
3124 0           $self->{column}++;
3125 0           $self->{nc}
3126             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127             } else {
3128 0           $self->{set_nc}->($self);
3129             }
3130            
3131 0           return ($self->{ct}); # DOCTYPE
3132 0           redo A;
3133             } elsif ($nc == 0x0000) {
3134 0           $self->{ct}->{name} .= "\x{FFFD}"; # DOCTYPE
3135             ## Stay in the state.
3136            
3137 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3138 0           $self->{line_prev} = $self->{line};
3139 0           $self->{column_prev} = $self->{column};
3140 0           $self->{column}++;
3141 0           $self->{nc}
3142             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3143             } else {
3144 0           $self->{set_nc}->($self);
3145             }
3146            
3147 0           redo A;
3148             } else {
3149            
3150 0           $self->{ct}->{name} .= chr ($nc); # DOCTYPE
3151             ## Stay in the state.
3152            
3153 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3154 0           $self->{line_prev} = $self->{line};
3155 0           $self->{column_prev} = $self->{column};
3156 0           $self->{column}++;
3157 0           $self->{nc}
3158             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3159             } else {
3160 0           $self->{set_nc}->($self);
3161             }
3162            
3163 0           redo A;
3164             }
3165             } elsif ($state == AFTER_DOCTYPE_NAME_STATE) {
3166             ## XML5: Corresponding to XML5's "DOCTYPE root name after
3167             ## state", but implemented differently.
3168              
3169 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0 0        
    0 0        
    0 0        
    0 0        
    0 0        
    0 0        
3170            
3171             ## Stay in the state
3172            
3173 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3174 0           $self->{line_prev} = $self->{line};
3175 0           $self->{column_prev} = $self->{column};
3176 0           $self->{column}++;
3177 0           $self->{nc}
3178             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3179             } else {
3180 0           $self->{set_nc}->($self);
3181             }
3182            
3183 0           redo A;
3184             } elsif ($nc == 0x003E) { # >
3185 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3186            
3187 0           $self->{state} = DATA_STATE;
3188             } else {
3189            
3190 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3191 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3192             }
3193            
3194            
3195 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3196 0           $self->{line_prev} = $self->{line};
3197 0           $self->{column_prev} = $self->{column};
3198 0           $self->{column}++;
3199 0           $self->{nc}
3200             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3201             } else {
3202 0           $self->{set_nc}->($self);
3203             }
3204            
3205 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3206 0           redo A;
3207             } elsif ($nc == -1) {
3208 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3209            
3210 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3211 0           $self->{state} = DATA_STATE;
3212 0           $self->{ct}->{quirks} = 1;
3213             } else {
3214            
3215 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3216 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3217             }
3218            
3219             ## Reconsume.
3220 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3221 0           redo A;
3222             } elsif ($nc == 0x0050 or # P
3223             $nc == 0x0070) { # p
3224            
3225 0           $self->{state} = PUBLIC_STATE;
3226 0           $self->{kwd} = chr $nc;
3227            
3228 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3229 0           $self->{line_prev} = $self->{line};
3230 0           $self->{column_prev} = $self->{column};
3231 0           $self->{column}++;
3232 0           $self->{nc}
3233             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3234             } else {
3235 0           $self->{set_nc}->($self);
3236             }
3237            
3238 0           redo A;
3239             } elsif ($nc == 0x0053 or # S
3240             $nc == 0x0073) { # s
3241            
3242 0           $self->{state} = SYSTEM_STATE;
3243 0           $self->{kwd} = chr $nc;
3244            
3245 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3246 0           $self->{line_prev} = $self->{line};
3247 0           $self->{column_prev} = $self->{column};
3248 0           $self->{column}++;
3249 0           $self->{nc}
3250             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3251             } else {
3252 0           $self->{set_nc}->($self);
3253             }
3254            
3255 0           redo A;
3256             } elsif ($nc == 0x0022 and # "
3257             ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3258             $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3259            
3260 0           $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3261 0           $self->{ct}->{value} = ''; # ENTITY
3262            
3263 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264 0           $self->{line_prev} = $self->{line};
3265 0           $self->{column_prev} = $self->{column};
3266 0           $self->{column}++;
3267 0           $self->{nc}
3268             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269             } else {
3270 0           $self->{set_nc}->($self);
3271             }
3272            
3273 0           redo A;
3274             } elsif ($nc == 0x0027 and # '
3275             ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3276             $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3277            
3278 0           $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3279 0           $self->{ct}->{value} = ''; # ENTITY
3280            
3281 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3282 0           $self->{line_prev} = $self->{line};
3283 0           $self->{column_prev} = $self->{column};
3284 0           $self->{column}++;
3285 0           $self->{nc}
3286             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3287             } else {
3288 0           $self->{set_nc}->($self);
3289             }
3290            
3291 0           redo A;
3292             } elsif ($self->{is_xml} and
3293             $self->{ct}->{type} == DOCTYPE_TOKEN and
3294             $nc == 0x005B) { # [
3295            
3296 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3297 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3298 0           $self->{in_subset} = 1;
3299            
3300 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301 0           $self->{line_prev} = $self->{line};
3302 0           $self->{column_prev} = $self->{column};
3303 0           $self->{column}++;
3304 0           $self->{nc}
3305             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306             } else {
3307 0           $self->{set_nc}->($self);
3308             }
3309            
3310 0           return ($self->{ct}); # DOCTYPE
3311 0           redo A;
3312             } else {
3313 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3314              
3315 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3316            
3317 0           $self->{ct}->{quirks} = 1;
3318 0           $self->{state} = BOGUS_DOCTYPE_STATE;
3319             } else {
3320            
3321 0           $self->{state} = BOGUS_MD_STATE;
3322             }
3323              
3324            
3325 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3326 0           $self->{line_prev} = $self->{line};
3327 0           $self->{column_prev} = $self->{column};
3328 0           $self->{column}++;
3329 0           $self->{nc}
3330             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3331             } else {
3332 0           $self->{set_nc}->($self);
3333             }
3334            
3335 0           redo A;
3336             }
3337             } elsif ($state == PUBLIC_STATE) {
3338             ## ASCII case-insensitive
3339 0 0 0       if ($nc == [
    0 0        
      0        
3340             undef,
3341             0x0055, # U
3342             0x0042, # B
3343             0x004C, # L
3344             0x0049, # I
3345             NEVER_CHAR, # (C)
3346             ]->[length $self->{kwd}] or
3347             $nc == [
3348             undef,
3349             0x0075, # u
3350             0x0062, # b
3351             0x006C, # l
3352             0x0069, # i
3353             NEVER_CHAR, # (c)
3354             ]->[length $self->{kwd}]) {
3355            
3356             ## Stay in the state.
3357 0           $self->{kwd} .= chr $nc;
3358            
3359 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3360 0           $self->{line_prev} = $self->{line};
3361 0           $self->{column_prev} = $self->{column};
3362 0           $self->{column}++;
3363 0           $self->{nc}
3364             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3365             } else {
3366 0           $self->{set_nc}->($self);
3367             }
3368            
3369 0           redo A;
3370             } elsif ((length $self->{kwd}) == 5 and
3371             ($nc == 0x0043 or # C
3372             $nc == 0x0063)) { # c
3373 0 0 0       if ($self->{is_xml} and
      0        
3374             ($self->{kwd} ne 'PUBLI' or $nc == 0x0063)) { # c
3375            
3376 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3377             text => 'PUBLIC',
3378             line => $self->{line_prev},
3379             column => $self->{column_prev} - 4);
3380             } else {
3381            
3382             }
3383 0           $self->{state} = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
3384            
3385 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3386 0           $self->{line_prev} = $self->{line};
3387 0           $self->{column_prev} = $self->{column};
3388 0           $self->{column}++;
3389 0           $self->{nc}
3390             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3391             } else {
3392 0           $self->{set_nc}->($self);
3393             }
3394            
3395 0           redo A;
3396             } else {
3397 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3398             line => $self->{line_prev},
3399             column => $self->{column_prev} + 1 - length $self->{kwd});
3400 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3401            
3402 0           $self->{ct}->{quirks} = 1;
3403 0           $self->{state} = BOGUS_DOCTYPE_STATE;
3404             } else {
3405            
3406 0           $self->{state} = BOGUS_MD_STATE;
3407             }
3408             ## Reconsume.
3409 0           redo A;
3410             }
3411             } elsif ($state == SYSTEM_STATE) {
3412             ## ASCII case-insensitive
3413 0 0 0       if ($nc == [
    0 0        
      0        
3414             undef,
3415             0x0059, # Y
3416             0x0053, # S
3417             0x0054, # T
3418             0x0045, # E
3419             NEVER_CHAR, # (M)
3420             ]->[length $self->{kwd}] or
3421             $nc == [
3422             undef,
3423             0x0079, # y
3424             0x0073, # s
3425             0x0074, # t
3426             0x0065, # e
3427             NEVER_CHAR, # (m)
3428             ]->[length $self->{kwd}]) {
3429            
3430             ## Stay in the state.
3431 0           $self->{kwd} .= chr $nc;
3432            
3433 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3434 0           $self->{line_prev} = $self->{line};
3435 0           $self->{column_prev} = $self->{column};
3436 0           $self->{column}++;
3437 0           $self->{nc}
3438             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3439             } else {
3440 0           $self->{set_nc}->($self);
3441             }
3442            
3443 0           redo A;
3444             } elsif ((length $self->{kwd}) == 5 and
3445             ($nc == 0x004D or # M
3446             $nc == 0x006D)) { # m
3447 0 0 0       if ($self->{is_xml} and
      0        
3448             ($self->{kwd} ne 'SYSTE' or $nc == 0x006D)) { # m
3449            
3450 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3451             text => 'SYSTEM',
3452             line => $self->{line_prev},
3453             column => $self->{column_prev} - 4);
3454             } else {
3455            
3456             }
3457 0           $self->{state} = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
3458            
3459 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3460 0           $self->{line_prev} = $self->{line};
3461 0           $self->{column_prev} = $self->{column};
3462 0           $self->{column}++;
3463 0           $self->{nc}
3464             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3465             } else {
3466 0           $self->{set_nc}->($self);
3467             }
3468            
3469 0           redo A;
3470             } else {
3471 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3472             line => $self->{line_prev},
3473             column => $self->{column_prev} + 1 - length $self->{kwd});
3474 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3475            
3476 0           $self->{ct}->{quirks} = 1;
3477 0           $self->{state} = BOGUS_DOCTYPE_STATE;
3478             } else {
3479            
3480 0           $self->{state} = BOGUS_MD_STATE;
3481             }
3482             ## Reconsume.
3483 0           redo A;
3484             }
3485             } elsif ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE or
3486             $state == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3487 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
    0          
3488            
3489             ## Stay in or switch to the state.
3490 0           $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3491            
3492 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3493 0           $self->{line_prev} = $self->{line};
3494 0           $self->{column_prev} = $self->{column};
3495 0           $self->{column}++;
3496 0           $self->{nc}
3497             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3498             } else {
3499 0           $self->{set_nc}->($self);
3500             }
3501            
3502 0           redo A;
3503             } elsif ($nc == 0x0022) { # "
3504 0 0         if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
3505            
3506 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation
3507             } else {
3508            
3509             }
3510 0           $self->{ct}->{pubid} = ''; # DOCTYPE
3511 0           $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3512            
3513 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3514 0           $self->{line_prev} = $self->{line};
3515 0           $self->{column_prev} = $self->{column};
3516 0           $self->{column}++;
3517 0           $self->{nc}
3518             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3519             } else {
3520 0           $self->{set_nc}->($self);
3521             }
3522            
3523 0           redo A;
3524             } elsif ($nc == 0x0027) { # '
3525 0 0         if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
3526            
3527 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation
3528             } else {
3529            
3530             }
3531 0           $self->{ct}->{pubid} = ''; # DOCTYPE
3532 0           $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3533            
3534 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3535 0           $self->{line_prev} = $self->{line};
3536 0           $self->{column_prev} = $self->{column};
3537 0           $self->{column}++;
3538 0           $self->{nc}
3539             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3540             } else {
3541 0           $self->{set_nc}->($self);
3542             }
3543            
3544 0           redo A;
3545             } elsif ($nc == 0x003E) { # >
3546 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3547            
3548 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3549            
3550 0           $self->{state} = DATA_STATE;
3551 0           $self->{ct}->{quirks} = 1;
3552             } else {
3553            
3554 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3555             }
3556            
3557            
3558 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3559 0           $self->{line_prev} = $self->{line};
3560 0           $self->{column_prev} = $self->{column};
3561 0           $self->{column}++;
3562 0           $self->{nc}
3563             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3564             } else {
3565 0           $self->{set_nc}->($self);
3566             }
3567            
3568 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3569 0           redo A;
3570             } elsif ($nc == EOF_CHAR) {
3571 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3572            
3573 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3574 0           $self->{state} = DATA_STATE;
3575 0           $self->{ct}->{quirks} = 1;
3576             } else {
3577            
3578 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3579 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3580             }
3581            
3582             ## Reconsume.
3583 0           return ($self->{ct}); # DOCTYPE
3584 0           redo A;
3585             } elsif ($self->{is_xml} and
3586             $self->{ct}->{type} == DOCTYPE_TOKEN and
3587             $nc == 0x005B) { # [
3588            
3589 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3590 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3591 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3592 0           $self->{in_subset} = 1;
3593            
3594 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3595 0           $self->{line_prev} = $self->{line};
3596 0           $self->{column_prev} = $self->{column};
3597 0           $self->{column}++;
3598 0           $self->{nc}
3599             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3600             } else {
3601 0           $self->{set_nc}->($self);
3602             }
3603            
3604 0           return ($self->{ct}); # DOCTYPE
3605 0           redo A;
3606             } else {
3607 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3608              
3609 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3610            
3611 0           $self->{ct}->{quirks} = 1;
3612 0           $self->{state} = BOGUS_DOCTYPE_STATE;
3613             } else {
3614            
3615 0           $self->{state} = BOGUS_MD_STATE;
3616             }
3617              
3618            
3619 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3620 0           $self->{line_prev} = $self->{line};
3621 0           $self->{column_prev} = $self->{column};
3622 0           $self->{column}++;
3623 0           $self->{nc}
3624             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3625             } else {
3626 0           $self->{set_nc}->($self);
3627             }
3628            
3629 0           redo A;
3630             }
3631             } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3632 0 0         if ($nc == 0x0022) { # "
    0          
    0          
    0          
3633            
3634 0           $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3635            
3636 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3637 0           $self->{line_prev} = $self->{line};
3638 0           $self->{column_prev} = $self->{column};
3639 0           $self->{column}++;
3640 0           $self->{nc}
3641             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3642             } else {
3643 0           $self->{set_nc}->($self);
3644             }
3645            
3646 0           redo A;
3647             } elsif ($nc == 0x003E) { # >
3648 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3649              
3650 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3651            
3652 0           $self->{state} = DATA_STATE;
3653 0           $self->{ct}->{quirks} = 1;
3654             } else {
3655            
3656 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3657             }
3658              
3659            
3660 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3661 0           $self->{line_prev} = $self->{line};
3662 0           $self->{column_prev} = $self->{column};
3663 0           $self->{column}++;
3664 0           $self->{nc}
3665             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3666             } else {
3667 0           $self->{set_nc}->($self);
3668             }
3669            
3670 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3671 0           redo A;
3672             } elsif ($nc == -1) {
3673 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3674              
3675 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3676            
3677 0           $self->{state} = DATA_STATE;
3678 0           $self->{ct}->{quirks} = 1;
3679             } else {
3680            
3681 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3682             }
3683            
3684             ## Reconsume.
3685 0           return ($self->{ct}); # DOCTYPE
3686 0           redo A;
3687             } elsif ($nc == 0x0000) {
3688 0           $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
3689             ## Stay in the state.
3690            
3691 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692 0           $self->{line_prev} = $self->{line};
3693 0           $self->{column_prev} = $self->{column};
3694 0           $self->{column}++;
3695 0           $self->{nc}
3696             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3697             } else {
3698 0           $self->{set_nc}->($self);
3699             }
3700            
3701 0           redo A;
3702             } else {
3703            
3704 0           $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
3705 0           $self->{read_until}->($self->{ct}->{pubid}, qq[\x00">],
3706             length $self->{ct}->{pubid});
3707              
3708             ## Stay in the state.
3709            
3710 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3711 0           $self->{line_prev} = $self->{line};
3712 0           $self->{column_prev} = $self->{column};
3713 0           $self->{column}++;
3714 0           $self->{nc}
3715             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3716             } else {
3717 0           $self->{set_nc}->($self);
3718             }
3719            
3720 0           redo A;
3721             }
3722             } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3723 0 0         if ($nc == 0x0027) { # '
    0          
    0          
    0          
3724            
3725 0           $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3726            
3727 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3728 0           $self->{line_prev} = $self->{line};
3729 0           $self->{column_prev} = $self->{column};
3730 0           $self->{column}++;
3731 0           $self->{nc}
3732             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3733             } else {
3734 0           $self->{set_nc}->($self);
3735             }
3736            
3737 0           redo A;
3738             } elsif ($nc == 0x003E) { # >
3739 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3740              
3741 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3742            
3743 0           $self->{state} = DATA_STATE;
3744 0           $self->{ct}->{quirks} = 1;
3745             } else {
3746            
3747 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3748             }
3749              
3750            
3751 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3752 0           $self->{line_prev} = $self->{line};
3753 0           $self->{column_prev} = $self->{column};
3754 0           $self->{column}++;
3755 0           $self->{nc}
3756             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3757             } else {
3758 0           $self->{set_nc}->($self);
3759             }
3760            
3761 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3762 0           redo A;
3763             } elsif ($nc == -1) {
3764 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3765              
3766 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3767            
3768 0           $self->{state} = DATA_STATE;
3769 0           $self->{ct}->{quirks} = 1;
3770             } else {
3771            
3772 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3773             }
3774            
3775             ## reconsume
3776 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3777 0           redo A;
3778             } elsif ($nc == 0x0000) {
3779 0           $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
3780             ## Stay in the state.
3781            
3782 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3783 0           $self->{line_prev} = $self->{line};
3784 0           $self->{column_prev} = $self->{column};
3785 0           $self->{column}++;
3786 0           $self->{nc}
3787             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3788             } else {
3789 0           $self->{set_nc}->($self);
3790             }
3791            
3792 0           redo A;
3793             } else {
3794            
3795 0           $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
3796 0           $self->{read_until}->($self->{ct}->{pubid}, qq[\x00'>],
3797             length $self->{ct}->{pubid});
3798              
3799             ## Stay in the state
3800            
3801 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3802 0           $self->{line_prev} = $self->{line};
3803 0           $self->{column_prev} = $self->{column};
3804 0           $self->{column}++;
3805 0           $self->{nc}
3806             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3807             } else {
3808 0           $self->{set_nc}->($self);
3809             }
3810            
3811 0           redo A;
3812             }
3813             } elsif ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE or
3814             $state == BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE) {
3815 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
    0          
3816            
3817             ## Stay in or switch to the state.
3818 0           $self->{state} = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE;
3819            
3820 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3821 0           $self->{line_prev} = $self->{line};
3822 0           $self->{column_prev} = $self->{column};
3823 0           $self->{column}++;
3824 0           $self->{nc}
3825             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3826             } else {
3827 0           $self->{set_nc}->($self);
3828             }
3829            
3830 0           redo A;
3831             } elsif ($nc == 0x0022) { # "
3832 0 0         if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3833            
3834 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3835             } else {
3836            
3837             }
3838 0           $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3839 0           $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3840            
3841 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3842 0           $self->{line_prev} = $self->{line};
3843 0           $self->{column_prev} = $self->{column};
3844 0           $self->{column}++;
3845 0           $self->{nc}
3846             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3847             } else {
3848 0           $self->{set_nc}->($self);
3849             }
3850            
3851 0           redo A;
3852             } elsif ($nc == 0x0027) { # '
3853 0 0         if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3854            
3855 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3856             } else {
3857            
3858             }
3859 0           $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3860 0           $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3861            
3862 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3863 0           $self->{line_prev} = $self->{line};
3864 0           $self->{column_prev} = $self->{column};
3865 0           $self->{column}++;
3866 0           $self->{nc}
3867             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3868             } else {
3869 0           $self->{set_nc}->($self);
3870             }
3871            
3872 0           redo A;
3873             } elsif ($nc == 0x003E) { # >
3874 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875 0 0         if ($self->{is_xml}) {
3876            
3877 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3878             } else {
3879            
3880             }
3881 0           $self->{state} = DATA_STATE;
3882             } else {
3883 0 0         if ($self->{ct}->{type} == NOTATION_TOKEN) {
3884            
3885             } else {
3886            
3887 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3888             }
3889 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3890             }
3891            
3892            
3893 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3894 0           $self->{line_prev} = $self->{line};
3895 0           $self->{column_prev} = $self->{column};
3896 0           $self->{column}++;
3897 0           $self->{nc}
3898             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3899             } else {
3900 0           $self->{set_nc}->($self);
3901             }
3902            
3903 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3904 0           redo A;
3905             } elsif ($nc == EOF_CHAR) {
3906 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3907            
3908 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3909            
3910 0           $self->{state} = DATA_STATE;
3911 0           $self->{ct}->{quirks} = 1;
3912             } else {
3913 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3914 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3915             }
3916              
3917             ## Reconsume.
3918 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3919 0           redo A;
3920             } elsif ($self->{is_xml} and
3921             $self->{ct}->{type} == DOCTYPE_TOKEN and
3922             $nc == 0x005B) { # [
3923            
3924 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3925 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3926 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3927 0           $self->{in_subset} = 1;
3928            
3929 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3930 0           $self->{line_prev} = $self->{line};
3931 0           $self->{column_prev} = $self->{column};
3932 0           $self->{column}++;
3933 0           $self->{nc}
3934             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3935             } else {
3936 0           $self->{set_nc}->($self);
3937             }
3938            
3939 0           return ($self->{ct}); # DOCTYPE
3940 0           redo A;
3941             } else {
3942 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3943              
3944 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3945            
3946 0           $self->{ct}->{quirks} = 1;
3947 0           $self->{state} = BOGUS_DOCTYPE_STATE;
3948             } else {
3949            
3950 0           $self->{state} = BOGUS_MD_STATE;
3951             }
3952              
3953            
3954 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3955 0           $self->{line_prev} = $self->{line};
3956 0           $self->{column_prev} = $self->{column};
3957 0           $self->{column}++;
3958 0           $self->{nc}
3959             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3960             } else {
3961 0           $self->{set_nc}->($self);
3962             }
3963            
3964 0           redo A;
3965             }
3966             } elsif ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE or
3967             $state == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3968 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
    0          
3969            
3970             ## Stay in or switch to the state.
3971 0           $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3972            
3973 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3974 0           $self->{line_prev} = $self->{line};
3975 0           $self->{column_prev} = $self->{column};
3976 0           $self->{column}++;
3977 0           $self->{nc}
3978             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3979             } else {
3980 0           $self->{set_nc}->($self);
3981             }
3982            
3983 0           redo A;
3984             } elsif ($nc == 0x0022) { # "
3985 0 0         if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
3986            
3987 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3988             } else {
3989            
3990             }
3991 0           $self->{ct}->{sysid} = ''; # DOCTYPE
3992 0           $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3993            
3994 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3995 0           $self->{line_prev} = $self->{line};
3996 0           $self->{column_prev} = $self->{column};
3997 0           $self->{column}++;
3998 0           $self->{nc}
3999             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4000             } else {
4001 0           $self->{set_nc}->($self);
4002             }
4003            
4004 0           redo A;
4005             } elsif ($nc == 0x0027) { # '
4006 0 0         if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
4007            
4008 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
4009             } else {
4010            
4011             }
4012 0           $self->{ct}->{sysid} = ''; # DOCTYPE
4013 0           $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4014            
4015 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4016 0           $self->{line_prev} = $self->{line};
4017 0           $self->{column_prev} = $self->{column};
4018 0           $self->{column}++;
4019 0           $self->{nc}
4020             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4021             } else {
4022 0           $self->{set_nc}->($self);
4023             }
4024            
4025 0           redo A;
4026             } elsif ($nc == 0x003E) { # >
4027 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4028            
4029 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4030 0           $self->{line_prev} = $self->{line};
4031 0           $self->{column_prev} = $self->{column};
4032 0           $self->{column}++;
4033 0           $self->{nc}
4034             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4035             } else {
4036 0           $self->{set_nc}->($self);
4037             }
4038            
4039              
4040 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4041            
4042 0           $self->{state} = DATA_STATE;
4043 0           $self->{ct}->{quirks} = 1;
4044             } else {
4045            
4046 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4047             }
4048              
4049 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4050 0           redo A;
4051             } elsif ($nc == EOF_CHAR) {
4052 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4053            
4054 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4055 0           $self->{state} = DATA_STATE;
4056 0           $self->{ct}->{quirks} = 1;
4057             } else {
4058            
4059 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4060 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4061             }
4062            
4063             ## Reconsume.
4064 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4065 0           redo A;
4066             } elsif ($self->{is_xml} and
4067             $self->{ct}->{type} == DOCTYPE_TOKEN and
4068             $nc == 0x005B) { # [
4069            
4070 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4071              
4072 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4073 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4074 0           $self->{in_subset} = 1;
4075            
4076 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4077 0           $self->{line_prev} = $self->{line};
4078 0           $self->{column_prev} = $self->{column};
4079 0           $self->{column}++;
4080 0           $self->{nc}
4081             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4082             } else {
4083 0           $self->{set_nc}->($self);
4084             }
4085            
4086 0           return ($self->{ct}); # DOCTYPE
4087 0           redo A;
4088             } else {
4089 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4090              
4091 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092            
4093 0           $self->{ct}->{quirks} = 1;
4094 0           $self->{state} = BOGUS_DOCTYPE_STATE;
4095             } else {
4096            
4097 0           $self->{state} = BOGUS_MD_STATE;
4098             }
4099              
4100            
4101 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4102 0           $self->{line_prev} = $self->{line};
4103 0           $self->{column_prev} = $self->{column};
4104 0           $self->{column}++;
4105 0           $self->{nc}
4106             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4107             } else {
4108 0           $self->{set_nc}->($self);
4109             }
4110            
4111 0           redo A;
4112             }
4113             } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4114 0 0 0       if ($nc == 0x0022) { # "
    0          
    0          
    0          
4115            
4116 0           $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4117            
4118 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119 0           $self->{line_prev} = $self->{line};
4120 0           $self->{column_prev} = $self->{column};
4121 0           $self->{column}++;
4122 0           $self->{nc}
4123             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124             } else {
4125 0           $self->{set_nc}->($self);
4126             }
4127            
4128 0           redo A;
4129             } elsif (not $self->{is_xml} and $nc == 0x003E) { # >
4130 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4131              
4132 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4133            
4134 0           $self->{state} = DATA_STATE;
4135 0           $self->{ct}->{quirks} = 1;
4136             } else {
4137            
4138 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4139             }
4140            
4141            
4142 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4143 0           $self->{line_prev} = $self->{line};
4144 0           $self->{column_prev} = $self->{column};
4145 0           $self->{column}++;
4146 0           $self->{nc}
4147             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4148             } else {
4149 0           $self->{set_nc}->($self);
4150             }
4151            
4152 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4153 0           redo A;
4154             } elsif ($nc == -1) {
4155 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4156              
4157 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4158            
4159 0           $self->{state} = DATA_STATE;
4160 0           $self->{ct}->{quirks} = 1;
4161             } else {
4162            
4163 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4164             }
4165            
4166             ## reconsume
4167 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4168 0           redo A;
4169             } elsif ($nc == 0x0000) {
4170 0           $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
4171             ## Stay in the state.
4172            
4173 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174 0           $self->{line_prev} = $self->{line};
4175 0           $self->{column_prev} = $self->{column};
4176 0           $self->{column}++;
4177 0           $self->{nc}
4178             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4179             } else {
4180 0           $self->{set_nc}->($self);
4181             }
4182            
4183 0           redo A;
4184             } else {
4185            
4186 0           $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
4187 0           $self->{read_until}->($self->{ct}->{sysid}, qq[\x00">],
4188             length $self->{ct}->{sysid});
4189              
4190             ## Stay in the state
4191            
4192 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4193 0           $self->{line_prev} = $self->{line};
4194 0           $self->{column_prev} = $self->{column};
4195 0           $self->{column}++;
4196 0           $self->{nc}
4197             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4198             } else {
4199 0           $self->{set_nc}->($self);
4200             }
4201            
4202 0           redo A;
4203             }
4204             } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4205 0 0 0       if ($nc == 0x0027) { # '
    0          
    0          
    0          
4206            
4207 0           $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4208            
4209 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4210 0           $self->{line_prev} = $self->{line};
4211 0           $self->{column_prev} = $self->{column};
4212 0           $self->{column}++;
4213 0           $self->{nc}
4214             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4215             } else {
4216 0           $self->{set_nc}->($self);
4217             }
4218            
4219 0           redo A;
4220             } elsif (not $self->{is_xml} and $nc == 0x003E) { # >
4221            
4222 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4223              
4224 0           $self->{state} = DATA_STATE;
4225            
4226 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4227 0           $self->{line_prev} = $self->{line};
4228 0           $self->{column_prev} = $self->{column};
4229 0           $self->{column}++;
4230 0           $self->{nc}
4231             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4232             } else {
4233 0           $self->{set_nc}->($self);
4234             }
4235            
4236              
4237 0           $self->{ct}->{quirks} = 1;
4238 0           return ($self->{ct}); # DOCTYPE
4239              
4240 0           redo A;
4241             } elsif ($nc == -1) {
4242 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4243              
4244 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4245            
4246 0           $self->{state} = DATA_STATE;
4247 0           $self->{ct}->{quirks} = 1;
4248             } else {
4249            
4250 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4251             }
4252              
4253             ## reconsume
4254 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4255 0           redo A;
4256             } elsif ($nc == 0x0000) {
4257 0           $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
4258             ## Stay in the state.
4259            
4260 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4261 0           $self->{line_prev} = $self->{line};
4262 0           $self->{column_prev} = $self->{column};
4263 0           $self->{column}++;
4264 0           $self->{nc}
4265             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4266             } else {
4267 0           $self->{set_nc}->($self);
4268             }
4269            
4270 0           redo A;
4271             } else {
4272            
4273 0           $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
4274 0           $self->{read_until}->($self->{ct}->{sysid}, qq[\x00'>],
4275             length $self->{ct}->{sysid});
4276              
4277             ## Stay in the state
4278            
4279 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4280 0           $self->{line_prev} = $self->{line};
4281 0           $self->{column_prev} = $self->{column};
4282 0           $self->{column}++;
4283 0           $self->{nc}
4284             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4285             } else {
4286 0           $self->{set_nc}->($self);
4287             }
4288            
4289 0           redo A;
4290             }
4291             } elsif ($state == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4292 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0 0        
    0 0        
    0          
4293 0 0         if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4294            
4295 0           $self->{state} = BEFORE_NDATA_STATE;
4296             } else {
4297            
4298             ## Stay in the state
4299             }
4300            
4301 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4302 0           $self->{line_prev} = $self->{line};
4303 0           $self->{column_prev} = $self->{column};
4304 0           $self->{column}++;
4305 0           $self->{nc}
4306             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4307             } else {
4308 0           $self->{set_nc}->($self);
4309             }
4310            
4311 0           redo A;
4312             } elsif ($nc == 0x003E) { # >
4313 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4314            
4315 0           $self->{state} = DATA_STATE;
4316             } else {
4317            
4318 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4319             }
4320              
4321            
4322 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4323 0           $self->{line_prev} = $self->{line};
4324 0           $self->{column_prev} = $self->{column};
4325 0           $self->{column}++;
4326 0           $self->{nc}
4327             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4328             } else {
4329 0           $self->{set_nc}->($self);
4330             }
4331            
4332 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4333 0           redo A;
4334             } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4335             ($nc == 0x004E or # N
4336             $nc == 0x006E)) { # n
4337            
4338 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4339 0           $self->{state} = NDATA_STATE;
4340 0           $self->{kwd} = chr $nc;
4341            
4342 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4343 0           $self->{line_prev} = $self->{line};
4344 0           $self->{column_prev} = $self->{column};
4345 0           $self->{column}++;
4346 0           $self->{nc}
4347             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4348             } else {
4349 0           $self->{set_nc}->($self);
4350             }
4351            
4352 0           redo A;
4353             } elsif ($nc == -1) {
4354 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4355            
4356 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4357 0           $self->{state} = DATA_STATE;
4358 0           $self->{ct}->{quirks} = 1;
4359             } else {
4360            
4361 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4362 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4363             }
4364              
4365             ## reconsume
4366 0           return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4367 0           redo A;
4368             } elsif ($self->{is_xml} and
4369             $self->{ct}->{type} == DOCTYPE_TOKEN and
4370             $nc == 0x005B) { # [
4371            
4372 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4373 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4374 0           $self->{in_subset} = 1;
4375            
4376 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4377 0           $self->{line_prev} = $self->{line};
4378 0           $self->{column_prev} = $self->{column};
4379 0           $self->{column}++;
4380 0           $self->{nc}
4381             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4382             } else {
4383 0           $self->{set_nc}->($self);
4384             }
4385            
4386 0           return ($self->{ct}); # DOCTYPE
4387 0           redo A;
4388             } else {
4389 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4390              
4391 0 0         if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4392            
4393             #$self->{ct}->{quirks} = 1;
4394 0           $self->{state} = BOGUS_DOCTYPE_STATE;
4395             } else {
4396            
4397 0           $self->{state} = BOGUS_MD_STATE;
4398             }
4399              
4400            
4401 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4402 0           $self->{line_prev} = $self->{line};
4403 0           $self->{column_prev} = $self->{column};
4404 0           $self->{column}++;
4405 0           $self->{nc}
4406             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4407             } else {
4408 0           $self->{set_nc}->($self);
4409             }
4410            
4411 0           redo A;
4412             }
4413             } elsif ($state == BEFORE_NDATA_STATE) {
4414 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
4415            
4416             ## Stay in the state.
4417            
4418 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4419 0           $self->{line_prev} = $self->{line};
4420 0           $self->{column_prev} = $self->{column};
4421 0           $self->{column}++;
4422 0           $self->{nc}
4423             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4424             } else {
4425 0           $self->{set_nc}->($self);
4426             }
4427            
4428 0           redo A;
4429             } elsif ($nc == 0x003E) { # >
4430            
4431 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4432            
4433 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4434 0           $self->{line_prev} = $self->{line};
4435 0           $self->{column_prev} = $self->{column};
4436 0           $self->{column}++;
4437 0           $self->{nc}
4438             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4439             } else {
4440 0           $self->{set_nc}->($self);
4441             }
4442            
4443 0           return ($self->{ct}); # ENTITY
4444 0           redo A;
4445             } elsif ($nc == 0x004E or # N
4446             $nc == 0x006E) { # n
4447            
4448 0           $self->{state} = NDATA_STATE;
4449 0           $self->{kwd} = chr $nc;
4450            
4451 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452 0           $self->{line_prev} = $self->{line};
4453 0           $self->{column_prev} = $self->{column};
4454 0           $self->{column}++;
4455 0           $self->{nc}
4456             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457             } else {
4458 0           $self->{set_nc}->($self);
4459             }
4460            
4461 0           redo A;
4462             } elsif ($nc == -1) {
4463            
4464 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4465 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4466             ## reconsume
4467 0           return ($self->{ct}); # ENTITY
4468 0           redo A;
4469             } else {
4470            
4471 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4472 0           $self->{state} = BOGUS_MD_STATE;
4473            
4474 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4475 0           $self->{line_prev} = $self->{line};
4476 0           $self->{column_prev} = $self->{column};
4477 0           $self->{column}++;
4478 0           $self->{nc}
4479             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4480             } else {
4481 0           $self->{set_nc}->($self);
4482             }
4483            
4484 0           redo A;
4485             }
4486             } elsif ($state == BOGUS_DOCTYPE_STATE) {
4487 0 0 0       if ($nc == 0x003E) { # >
    0          
    0          
4488            
4489 0           $self->{state} = DATA_STATE;
4490            
4491 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4492 0           $self->{line_prev} = $self->{line};
4493 0           $self->{column_prev} = $self->{column};
4494 0           $self->{column}++;
4495 0           $self->{nc}
4496             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4497             } else {
4498 0           $self->{set_nc}->($self);
4499             }
4500            
4501              
4502 0           return ($self->{ct}); # DOCTYPE
4503              
4504 0           redo A;
4505             } elsif ($self->{is_xml} and $nc == 0x005B) { # [
4506            
4507 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4508 0           $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4509 0           $self->{in_subset} = 1;
4510            
4511 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4512 0           $self->{line_prev} = $self->{line};
4513 0           $self->{column_prev} = $self->{column};
4514 0           $self->{column}++;
4515 0           $self->{nc}
4516             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4517             } else {
4518 0           $self->{set_nc}->($self);
4519             }
4520            
4521 0           return ($self->{ct}); # DOCTYPE
4522 0           redo A;
4523             } elsif ($nc == -1) {
4524            
4525 0           $self->{state} = DATA_STATE;
4526             ## reconsume
4527              
4528 0           return ($self->{ct}); # DOCTYPE
4529              
4530 0           redo A;
4531             } else {
4532            
4533 0           my $s = '';
4534 0           $self->{read_until}->($s, q{>[}, 0);
4535              
4536             ## Stay in the state
4537            
4538 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4539 0           $self->{line_prev} = $self->{line};
4540 0           $self->{column_prev} = $self->{column};
4541 0           $self->{column}++;
4542 0           $self->{nc}
4543             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4544             } else {
4545 0           $self->{set_nc}->($self);
4546             }
4547            
4548 0           redo A;
4549             }
4550             } elsif ($state == CDATA_SECTION_STATE) {
4551             ## NOTE: "CDATA section state" in the spec is jointly implemented
4552             ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4553             ## and |CDATA_SECTION_MSE2_STATE|.
4554              
4555             ## XML5: "CDATA state".
4556            
4557 0 0         if ($nc == 0x005D) { # ]
    0          
4558            
4559 0           $self->{state} = CDATA_SECTION_MSE1_STATE;
4560            
4561 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4562 0           $self->{line_prev} = $self->{line};
4563 0           $self->{column_prev} = $self->{column};
4564 0           $self->{column}++;
4565 0           $self->{nc}
4566             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4567             } else {
4568 0           $self->{set_nc}->($self);
4569             }
4570            
4571 0           redo A;
4572             } elsif ($nc == -1) {
4573 0 0         if ($self->{is_xml}) {
4574            
4575 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4576             } else {
4577            
4578             }
4579              
4580 0           $self->{state} = DATA_STATE;
4581             ## Reconsume.
4582 0 0         if (length $self->{ct}->{data}) { # character
4583            
4584 0           return ($self->{ct}); # character
4585             } else {
4586            
4587             ## No token to emit. $self->{ct} is discarded.
4588             }
4589 0           redo A;
4590             } else {
4591            
4592 0           $self->{ct}->{data} .= chr $nc;
4593 0           $self->{read_until}->($self->{ct}->{data},
4594             qq<\x00]>,
4595             length $self->{ct}->{data});
4596             ## NOTE: NULLs are left as is (see spec's comment). However,
4597             ## a token cannot contain more than one U+0000 NULL character
4598             ## for the ease of processing in the tree constructor.
4599              
4600             ## Stay in the state.
4601            
4602 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4603 0           $self->{line_prev} = $self->{line};
4604 0           $self->{column_prev} = $self->{column};
4605 0           $self->{column}++;
4606 0           $self->{nc}
4607             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4608             } else {
4609 0           $self->{set_nc}->($self);
4610             }
4611            
4612 0           redo A;
4613             }
4614              
4615             ## ISSUE: "text tokens" in spec.
4616             } elsif ($state == CDATA_SECTION_MSE1_STATE) {
4617             ## XML5: "CDATA bracket state".
4618              
4619 0 0         if ($nc == 0x005D) { # ]
4620            
4621 0           $self->{state} = CDATA_SECTION_MSE2_STATE;
4622            
4623 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4624 0           $self->{line_prev} = $self->{line};
4625 0           $self->{column_prev} = $self->{column};
4626 0           $self->{column}++;
4627 0           $self->{nc}
4628             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4629             } else {
4630 0           $self->{set_nc}->($self);
4631             }
4632            
4633 0           redo A;
4634             } else {
4635            
4636             ## XML5: If EOF, "]" is not appended and changed to the data state.
4637 0           $self->{ct}->{data} .= ']';
4638 0           $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4639             ## Reconsume.
4640 0           redo A;
4641             }
4642             } elsif ($state == CDATA_SECTION_MSE2_STATE) {
4643             ## XML5: "CDATA end state".
4644              
4645 0 0         if ($nc == 0x003E) { # >
    0          
4646 0           $self->{state} = DATA_STATE;
4647            
4648 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4649 0           $self->{line_prev} = $self->{line};
4650 0           $self->{column_prev} = $self->{column};
4651 0           $self->{column}++;
4652 0           $self->{nc}
4653             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4654             } else {
4655 0           $self->{set_nc}->($self);
4656             }
4657            
4658 0 0         if (length $self->{ct}->{data}) { # character
4659            
4660 0           return ($self->{ct}); # character
4661             } else {
4662            
4663             ## No token to emit. $self->{ct} is discarded.
4664             }
4665 0           redo A;
4666             } elsif ($nc == 0x005D) { # ]
4667             # character
4668 0           $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4669             ## Stay in the state.
4670            
4671 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4672 0           $self->{line_prev} = $self->{line};
4673 0           $self->{column_prev} = $self->{column};
4674 0           $self->{column}++;
4675 0           $self->{nc}
4676             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4677             } else {
4678 0           $self->{set_nc}->($self);
4679             }
4680            
4681 0           redo A;
4682             } else {
4683            
4684 0           $self->{ct}->{data} .= ']]'; # character
4685 0           $self->{state} = CDATA_SECTION_STATE;
4686             ## Reconsume. ## XML5: Emit.
4687 0           redo A;
4688             }
4689             } elsif ($state == ENTITY_STATE) {
4690 0 0 0       if ($is_space->{$nc} or
    0 0        
    0 0        
      0        
      0        
4691             {
4692             0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4693              
4694             ## Following characters are added here to detect parse
4695             ## error for "=" of "&=" in an unquoted attribute value.
4696             ## Though this disagrees with the Web Applications 1.0
4697             ## spec, the result token sequences of both algorithms
4698             ## should be the same, as these characters cannot form a part
4699             ## of character references.
4700             0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', `
4701             0x003D => 1, # =
4702              
4703             ## As a result of the addition above, the following clause
4704             ## has no effect in fact.
4705             $self->{entity_add} => 1,
4706             }->{$nc}) {
4707 0 0         if ($self->{is_xml}) {
4708            
4709 0 0         $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4710             line => $self->{line_prev},
4711             column => $self->{column_prev}
4712             + ($nc == -1 ? 1 : 0));
4713             } else {
4714            
4715             ## No error
4716             }
4717             ## Don't consume
4718             ## Return nothing.
4719             #
4720             } elsif ($nc == 0x0023) { # #
4721            
4722 0           $self->{state} = ENTITY_HASH_STATE;
4723 0           $self->{kwd} = '#';
4724            
4725 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4726 0           $self->{line_prev} = $self->{line};
4727 0           $self->{column_prev} = $self->{column};
4728 0           $self->{column}++;
4729 0           $self->{nc}
4730             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4731             } else {
4732 0           $self->{set_nc}->($self);
4733             }
4734            
4735 0           redo A;
4736             } elsif ($self->{is_xml} or
4737             (0x0041 <= $nc and
4738             $nc <= 0x005A) or # A..Z
4739             (0x0061 <= $nc and
4740             $nc <= 0x007A)) { # a..z
4741            
4742             #require HTML::HTML5::Parser::NamedEntityList;
4743 0           $self->{state} = ENTITY_NAME_STATE;
4744 0           $self->{kwd} = chr $nc;
4745 0           $self->{entity__value} = $self->{kwd};
4746 0           $self->{entity__match} = 0;
4747            
4748 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4749 0           $self->{line_prev} = $self->{line};
4750 0           $self->{column_prev} = $self->{column};
4751 0           $self->{column}++;
4752 0           $self->{nc}
4753             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4754             } else {
4755 0           $self->{set_nc}->($self);
4756             }
4757            
4758 0           redo A;
4759             } else {
4760            
4761             ## Return nothing.
4762             #
4763             }
4764              
4765             ## We implement the "consume a character reference" in a
4766             ## slightly different way from the spec's algorithm, though the
4767             ## end result should be exactly the same.
4768              
4769             ## NOTE: No character is consumed by the "consume a character
4770             ## reference" algorithm. In other words, there is an "&" character
4771             ## that does not introduce a character reference, which would be
4772             ## appended to the parent element or the attribute value in later
4773             ## process of the tokenizer.
4774              
4775 0 0 0       if ($self->{prev_state} == DATA_STATE or
4776             $self->{prev_state} == RCDATA_STATE) {
4777            
4778 0           $self->{state} = $self->{prev_state};
4779             ## Reconsume.
4780 0           return ({type => CHARACTER_TOKEN, data => '&',
4781             line => $self->{line_prev},
4782             column => $self->{column_prev},
4783             });
4784 0           redo A;
4785             } else {
4786            
4787 0           $self->{ca}->{value} .= '&';
4788 0           $self->{state} = $self->{prev_state};
4789             ## Reconsume.
4790 0           redo A;
4791             }
4792             } elsif ($state == ENTITY_HASH_STATE) {
4793 0 0 0       if ($nc == 0x0078) { # x
    0          
    0          
4794            
4795 0           $self->{state} = HEXREF_X_STATE;
4796 0           $self->{kwd} .= chr $nc;
4797            
4798 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4799 0           $self->{line_prev} = $self->{line};
4800 0           $self->{column_prev} = $self->{column};
4801 0           $self->{column}++;
4802 0           $self->{nc}
4803             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4804             } else {
4805 0           $self->{set_nc}->($self);
4806             }
4807            
4808 0           redo A;
4809             } elsif ($nc == 0x0058) { # X
4810            
4811 0 0         if ($self->{is_xml}) {
4812 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4813             }
4814 0           $self->{state} = HEXREF_X_STATE;
4815 0           $self->{kwd} .= chr $nc;
4816            
4817 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4818 0           $self->{line_prev} = $self->{line};
4819 0           $self->{column_prev} = $self->{column};
4820 0           $self->{column}++;
4821 0           $self->{nc}
4822             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4823             } else {
4824 0           $self->{set_nc}->($self);
4825             }
4826            
4827 0           redo A;
4828             } elsif (0x0030 <= $nc and
4829             $nc <= 0x0039) { # 0..9
4830            
4831 0           $self->{state} = NCR_NUM_STATE;
4832 0           $self->{kwd} = $nc - 0x0030;
4833            
4834 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4835 0           $self->{line_prev} = $self->{line};
4836 0           $self->{column_prev} = $self->{column};
4837 0           $self->{column}++;
4838 0           $self->{nc}
4839             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4840             } else {
4841 0           $self->{set_nc}->($self);
4842             }
4843            
4844 0           redo A;
4845             } else {
4846 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4847             line => $self->{line_prev},
4848             column => $self->{column_prev} - 1);
4849              
4850             ## NOTE: According to the spec algorithm, nothing is returned,
4851             ## and then "&#" is appended to the parent element or the attribute
4852             ## value in the later processing.
4853              
4854 0 0 0       if ($self->{prev_state} == DATA_STATE or
4855             $self->{prev_state} == RCDATA_STATE) {
4856            
4857 0           $self->{state} = $self->{prev_state};
4858             ## Reconsume.
4859 0           return ({type => CHARACTER_TOKEN,
4860             data => '&#',
4861             line => $self->{line_prev},
4862             column => $self->{column_prev} - 1,
4863             });
4864 0           redo A;
4865             } else {
4866            
4867 0           $self->{ca}->{value} .= '&#';
4868 0           $self->{state} = $self->{prev_state};
4869             ## Reconsume.
4870 0           redo A;
4871             }
4872             }
4873             } elsif ($state == NCR_NUM_STATE) {
4874 0 0 0       if (0x0030 <= $nc and
    0          
4875             $nc <= 0x0039) { # 0..9
4876            
4877 0           $self->{kwd} *= 10;
4878 0           $self->{kwd} += $nc - 0x0030;
4879            
4880             ## Stay in the state.
4881            
4882 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4883 0           $self->{line_prev} = $self->{line};
4884 0           $self->{column_prev} = $self->{column};
4885 0           $self->{column}++;
4886 0           $self->{nc}
4887             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4888             } else {
4889 0           $self->{set_nc}->($self);
4890             }
4891            
4892 0           redo A;
4893             } elsif ($nc == 0x003B) { # ;
4894            
4895            
4896 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4897 0           $self->{line_prev} = $self->{line};
4898 0           $self->{column_prev} = $self->{column};
4899 0           $self->{column}++;
4900 0           $self->{nc}
4901             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4902             } else {
4903 0           $self->{set_nc}->($self);
4904             }
4905            
4906             #
4907             } else {
4908            
4909 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4910             ## Reconsume.
4911             #
4912             }
4913              
4914 0           my $code = $self->{kwd};
4915 0           my $l = $self->{line_prev};
4916 0           my $c = $self->{column_prev};
4917 0 0 0       if ((not $self->{is_xml} and $charref_map->{$code}) or
    0 0        
      0        
      0        
      0        
      0        
4918             ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4919             ($self->{is_xml} and $code == 0x0000)) {
4920            
4921 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4922             text => (sprintf 'U+%04X', $code),
4923             line => $l, column => $c);
4924 0           $code = $charref_map->{$code};
4925             } elsif ($code > 0x10FFFF) {
4926            
4927 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4928             text => (sprintf 'U-%08X', $code),
4929             line => $l, column => $c);
4930 0           $code = 0xFFFD;
4931             }
4932              
4933 0 0 0       if ($self->{prev_state} == DATA_STATE or
4934             $self->{prev_state} == RCDATA_STATE) {
4935            
4936 0           $self->{state} = $self->{prev_state};
4937             ## Reconsume.
4938 0           return ({type => CHARACTER_TOKEN, data => chr $code,
4939             has_reference => 1,
4940             line => $l, column => $c,
4941             });
4942 0           redo A;
4943             } else {
4944            
4945 0           $self->{ca}->{value} .= chr $code;
4946 0           $self->{ca}->{has_reference} = 1;
4947 0           $self->{state} = $self->{prev_state};
4948             ## Reconsume.
4949 0           redo A;
4950             }
4951             } elsif ($state == HEXREF_X_STATE) {
4952 0 0 0       if ((0x0030 <= $nc and $nc <= 0x0039) or
      0        
      0        
      0        
      0        
4953             (0x0041 <= $nc and $nc <= 0x0046) or
4954             (0x0061 <= $nc and $nc <= 0x0066)) {
4955             # 0..9, A..F, a..f
4956            
4957 0           $self->{state} = HEXREF_HEX_STATE;
4958 0           $self->{kwd} = 0;
4959             ## Reconsume.
4960 0           redo A;
4961             } else {
4962 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4963             line => $self->{line_prev},
4964             column => $self->{column_prev} - 2);
4965              
4966             ## NOTE: According to the spec algorithm, nothing is returned,
4967             ## and then "&#" followed by "X" or "x" is appended to the parent
4968             ## element or the attribute value in the later processing.
4969              
4970 0 0 0       if ($self->{prev_state} == DATA_STATE or
4971             $self->{prev_state} == RCDATA_STATE) {
4972            
4973 0           $self->{state} = $self->{prev_state};
4974             ## Reconsume.
4975 0           return ({type => CHARACTER_TOKEN,
4976             data => '&' . $self->{kwd},
4977             line => $self->{line_prev},
4978             column => $self->{column_prev} - length $self->{kwd},
4979             });
4980 0           redo A;
4981             } else {
4982            
4983 0           $self->{ca}->{value} .= '&' . $self->{kwd};
4984 0           $self->{state} = $self->{prev_state};
4985             ## Reconsume.
4986 0           redo A;
4987             }
4988             }
4989             } elsif ($state == HEXREF_HEX_STATE) {
4990 0 0 0       if (0x0030 <= $nc and $nc <= 0x0039) {
    0 0        
    0 0        
    0          
4991             # 0..9
4992            
4993 0           $self->{kwd} *= 0x10;
4994 0           $self->{kwd} += $nc - 0x0030;
4995             ## Stay in the state.
4996            
4997 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4998 0           $self->{line_prev} = $self->{line};
4999 0           $self->{column_prev} = $self->{column};
5000 0           $self->{column}++;
5001 0           $self->{nc}
5002             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5003             } else {
5004 0           $self->{set_nc}->($self);
5005             }
5006            
5007 0           redo A;
5008             } elsif (0x0061 <= $nc and
5009             $nc <= 0x0066) { # a..f
5010            
5011 0           $self->{kwd} *= 0x10;
5012 0           $self->{kwd} += $nc - 0x0060 + 9;
5013             ## Stay in the state.
5014            
5015 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5016 0           $self->{line_prev} = $self->{line};
5017 0           $self->{column_prev} = $self->{column};
5018 0           $self->{column}++;
5019 0           $self->{nc}
5020             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5021             } else {
5022 0           $self->{set_nc}->($self);
5023             }
5024            
5025 0           redo A;
5026             } elsif (0x0041 <= $nc and
5027             $nc <= 0x0046) { # A..F
5028            
5029 0           $self->{kwd} *= 0x10;
5030 0           $self->{kwd} += $nc - 0x0040 + 9;
5031             ## Stay in the state.
5032            
5033 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5034 0           $self->{line_prev} = $self->{line};
5035 0           $self->{column_prev} = $self->{column};
5036 0           $self->{column}++;
5037 0           $self->{nc}
5038             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5039             } else {
5040 0           $self->{set_nc}->($self);
5041             }
5042            
5043 0           redo A;
5044             } elsif ($nc == 0x003B) { # ;
5045            
5046            
5047 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5048 0           $self->{line_prev} = $self->{line};
5049 0           $self->{column_prev} = $self->{column};
5050 0           $self->{column}++;
5051 0           $self->{nc}
5052             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5053             } else {
5054 0           $self->{set_nc}->($self);
5055             }
5056            
5057             #
5058             } else {
5059            
5060 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5061             line => $self->{line},
5062             column => $self->{column});
5063             ## Reconsume.
5064             #
5065             }
5066              
5067 0           my $code = $self->{kwd};
5068 0           my $l = $self->{line_prev};
5069 0           my $c = $self->{column_prev};
5070 0 0 0       if ((not $self->{is_xml} and $charref_map->{$code}) or
    0 0        
      0        
      0        
      0        
      0        
5071             ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5072             ($self->{is_xml} and $code == 0x0000)) {
5073            
5074 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5075             text => (sprintf 'U+%04X', $code),
5076             line => $l, column => $c);
5077 0           $code = $charref_map->{$code};
5078             } elsif ($code > 0x10FFFF) {
5079            
5080 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5081             text => (sprintf 'U-%08X', $code),
5082             line => $l, column => $c);
5083 0           $code = 0xFFFD;
5084             }
5085              
5086 0 0 0       if ($self->{prev_state} == DATA_STATE or
5087             $self->{prev_state} == RCDATA_STATE) {
5088            
5089 0           $self->{state} = $self->{prev_state};
5090             ## Reconsume.
5091 0           return ({type => CHARACTER_TOKEN, data => chr $code,
5092             has_reference => 1,
5093             line => $l, column => $c,
5094             });
5095 0           redo A;
5096             } else {
5097            
5098 0           $self->{ca}->{value} .= chr $code;
5099 0           $self->{ca}->{has_reference} = 1;
5100 0           $self->{state} = $self->{prev_state};
5101             ## Reconsume.
5102 0           redo A;
5103             }
5104             } elsif ($state == ENTITY_NAME_STATE) {
5105 0 0 0       if ((0x0041 <= $nc and # A
    0 0        
      0        
      0        
      0        
      0        
      0        
      0        
      0        
5106             $nc <= 0x005A) or # Z
5107             (0x0061 <= $nc and # a
5108             $nc <= 0x007A) or # z
5109             (0x0030 <= $nc and # 0
5110             $nc <= 0x0039) or # 9
5111             $nc == 0x003B or # ;
5112             ($self->{is_xml} and
5113             not ($is_space->{$nc} or
5114             {
5115             0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5116              
5117             ## See comment in the |ENTITY_STATE|'s |if|
5118             ## statement for the rationale of addition of these
5119             ## characters.
5120             0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', `
5121             0x003D => 1, # =
5122              
5123             ## This is redundant for the same reason.
5124             $self->{entity_add} => 1,
5125             }->{$nc}))) {
5126             #local %entity2char;
5127 0           $self->{kwd} .= chr $nc; ## Bare entity name.
5128 0 0 0       if (defined $entity2char{$self->{kwd}} or ## HTML charrefs.
5129             $self->{ge}->{$self->{kwd}}) { ## XML general entities.
5130 0 0         if ($nc == 0x003B) { # ;
5131 0 0         if (defined $self->{ge}->{$self->{kwd}}) {
5132             ## A declared XML entity.
5133 0 0         if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5134            
5135 0           $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5136             } else {
5137 0 0         if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5138            
5139 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5140             value => $self->{kwd});
5141             } else {
5142            
5143             }
5144 0           $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5145             }
5146             } else {
5147             ## An HTML character reference.
5148 0 0         if ($self->{is_xml}) {
5149             ## Not a declared XML entity.
5150            
5151 0   0       $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5152             value => $self->{kwd},
5153             level => {
5154             'amp;' => $self->{level}->{warn},
5155             'quot;' => $self->{level}->{warn},
5156             'lt;' => $self->{level}->{warn},
5157             'gt;' => $self->{level}->{warn},
5158             'apos;' => $self->{level}->{warn},
5159             }->{$self->{kwd}} ||
5160             $self->{level}->{must},
5161             line => $self->{line_prev},
5162             column => $self->{column} - length $self->{kwd});
5163             } else {
5164            
5165             }
5166 0           $self->{entity__value} = $entity2char{$self->{kwd}};
5167             }
5168 0           $self->{entity__match} = 1; ## Matched exactly with ";" entity.
5169            
5170 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5171 0           $self->{line_prev} = $self->{line};
5172 0           $self->{column_prev} = $self->{column};
5173 0           $self->{column}++;
5174 0           $self->{nc}
5175             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5176             } else {
5177 0           $self->{set_nc}->($self);
5178             }
5179            
5180             #
5181             } else {
5182            
5183 0           $self->{entity__value} = $entity2char{$self->{kwd}};
5184 0           $self->{entity__match} = -1; ## Exactly matched to non-";" entity.
5185             ## Stay in the state.
5186            
5187 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5188 0           $self->{line_prev} = $self->{line};
5189 0           $self->{column_prev} = $self->{column};
5190 0           $self->{column}++;
5191 0           $self->{nc}
5192             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5193             } else {
5194 0           $self->{set_nc}->($self);
5195             }
5196            
5197 0           redo A;
5198             }
5199             } else {
5200 0 0         if ($nc == 0x003B) { # ;
5201             ## A reserved HTML character reference or an undeclared
5202             ## XML entity reference.
5203            
5204 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## XXXtype
5205             value => $self->{kwd},
5206             level => $self->{level}->{must},
5207             line => $self->{line_prev},
5208             column => $self->{column} - length $self->{kwd});
5209 0           $self->{entity__value} .= chr $nc;
5210 0           $self->{entity__match} *= 2; ## Matched (positive) or not (zero)
5211            
5212 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5213 0           $self->{line_prev} = $self->{line};
5214 0           $self->{column_prev} = $self->{column};
5215 0           $self->{column}++;
5216 0           $self->{nc}
5217             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5218             } else {
5219 0           $self->{set_nc}->($self);
5220             }
5221            
5222             #
5223             } else {
5224            
5225 0           $self->{entity__value} .= chr $nc;
5226 0           $self->{entity__match} *= 2; ## Matched (positive) or not (zero)
5227             ## Stay in the state.
5228            
5229 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5230 0           $self->{line_prev} = $self->{line};
5231 0           $self->{column_prev} = $self->{column};
5232 0           $self->{column}++;
5233 0           $self->{nc}
5234             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5235             } else {
5236 0           $self->{set_nc}->($self);
5237             }
5238            
5239 0           redo A;
5240             }
5241             }
5242             } elsif ($nc == 0x003D) { # =
5243 0 0 0       if ($self->{entity__match} < 0 and
      0        
5244             $self->{prev_state} != DATA_STATE and # in attribute
5245             $self->{prev_state} != RCDATA_STATE) {
5246 0           $self->{entity__match} = 0;
5247             }
5248             }
5249              
5250 0           my $data;
5251             my $has_ref;
5252 0 0         if ($self->{entity__match} > 0) { ## A ";" entity.
    0          
5253            
5254 0           $data = $self->{entity__value};
5255             ## Strictly speaking the $has_ref flag should not be set if
5256             ## there is no matched entity. However, this flag is used
5257             ## only in contexts where use of an
5258             ## unexpanded-entity-reference-like string is in no way
5259             ## allowed, so it should not make any difference in theory.
5260 0           $has_ref = 1;
5261             #
5262             } elsif ($self->{entity__match} < 0) { ## Matched to non-";" entity.
5263 0 0 0       if ($self->{prev_state} != DATA_STATE and # in attribute
      0        
5264             $self->{prev_state} != RCDATA_STATE and
5265             $self->{entity__match} < -1) {
5266             ## In attribute-value contexts, a matched non-";" string is
5267             ## left as is if there are trailing alphabetic letters.
5268            
5269 0           $data = '&' . $self->{kwd};
5270             #
5271             } else {
5272             ## In attribute-value contexts, an exactly matched non-";"
5273             ## string is replaced as a character reference. In any other
5274             ## context, a matched non-";" string with or without trailing
5275             ## alphabetic letters is replaced as a character reference
5276             ## (with trailing letters). Note that use of a no-";"
5277             ## character reference is always non-conforming.
5278            
5279 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5280 0           $data = $self->{entity__value};
5281 0           $has_ref = 1;
5282             #
5283             }
5284             } else { ## Unmatched string.
5285 0 0 0       if ($self->{is_xml} and not $self->{kwd} =~ /;$/) {
5286            
5287 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5288             line => $self->{line_prev},
5289             column => $self->{column_prev} - length $self->{kwd});
5290             } else {
5291            
5292             }
5293 0           $data = '&' . $self->{kwd};
5294             #
5295             }
5296            
5297             ## NOTE: In these cases, when a character reference is found,
5298             ## it is consumed and a character token is returned, or, otherwise,
5299             ## nothing is consumed and returned, according to the spec algorithm.
5300             ## In this implementation, anything that has been examined by the
5301             ## tokenizer is appended to the parent element or the attribute value
5302             ## as string, either literal string when no character reference or
5303             ## entity-replaced string otherwise, in this stage, since any characters
5304             ## that would not be consumed are appended in the data state or in an
5305             ## appropriate attribute value state anyway.
5306            
5307 0 0 0       if ($self->{prev_state} == DATA_STATE or
5308             $self->{prev_state} == RCDATA_STATE) {
5309            
5310 0           $self->{state} = $self->{prev_state};
5311             ## Reconsume.
5312 0           return ({type => CHARACTER_TOKEN,
5313             data => $data,
5314             has_reference => $has_ref,
5315             line => $self->{line_prev},
5316             column => $self->{column_prev} + 1 - length $self->{kwd},
5317             });
5318 0           redo A;
5319             } else {
5320            
5321 0           $self->{ca}->{value} .= $data;
5322 0 0         $self->{ca}->{has_reference} = 1 if $has_ref;
5323 0           $self->{state} = $self->{prev_state};
5324             ## Reconsume.
5325 0           redo A;
5326             }
5327              
5328             ## ========== XML-only states ==========
5329              
5330             } elsif ($state == PI_STATE) {
5331             ## XML5: "Pi state" and "DOCTYPE pi state".
5332              
5333 0 0 0       if ($is_space->{$nc} or
      0        
5334             $nc == 0x003F or # ?
5335             $nc == -1) {
5336             ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5337             ## pi state": Switch to the "DOCTYPE pi after state". EOF:
5338             ## "DOCTYPE pi state": Parse error, switch to the "data
5339             ## state".
5340 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5341             line => $self->{line_prev},
5342             column => $self->{column_prev}
5343             - 1 * ($nc != -1));
5344 0           $self->{state} = BOGUS_COMMENT_STATE;
5345             ## Reconsume.
5346 0           $self->{ct} = {type => COMMENT_TOKEN,
5347             data => '?',
5348             line => $self->{line_prev},
5349             column => $self->{column_prev}
5350             - 1 * ($nc != -1),
5351             };
5352 0           redo A;
5353             } else {
5354             ## XML5: "DOCTYPE pi state": Stay in the state.
5355 0 0         if ($nc == 0x0000) {
5356 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
5357             }
5358 0 0         $self->{ct} = {type => PI_TOKEN,
5359             target => $nc == 0x0000 ? "\x{FFFD}" : chr $nc,
5360             data => '',
5361             line => $self->{line_prev},
5362             column => $self->{column_prev} - 1,
5363             };
5364 0           $self->{state} = PI_TARGET_STATE;
5365            
5366 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5367 0           $self->{line_prev} = $self->{line};
5368 0           $self->{column_prev} = $self->{column};
5369 0           $self->{column}++;
5370 0           $self->{nc}
5371             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5372             } else {
5373 0           $self->{set_nc}->($self);
5374             }
5375            
5376 0           redo A;
5377             }
5378             } elsif ($state == PI_TARGET_STATE) {
5379 0 0         if ($is_space->{$nc}) {
    0          
    0          
5380 0           $self->{state} = PI_TARGET_AFTER_STATE;
5381 0           $self->{kwd} = chr $nc; # "temporary buffer"
5382            
5383 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384 0           $self->{line_prev} = $self->{line};
5385 0           $self->{column_prev} = $self->{column};
5386 0           $self->{column}++;
5387 0           $self->{nc}
5388             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389             } else {
5390 0           $self->{set_nc}->($self);
5391             }
5392            
5393 0           redo A;
5394             } elsif ($nc == EOF_CHAR) {
5395 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5396 0 0         if ($self->{in_subset}) {
5397 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5398             } else {
5399 0           $self->{state} = DATA_STATE;
5400             }
5401             ## Reconsume.
5402 0           return ({type => COMMENT_TOKEN,
5403             data => '?' . $self->{ct}->{target},
5404             line => $self->{ct}->{line},
5405             column => $self->{ct}->{column}});
5406 0           redo A;
5407             } elsif ($nc == 0x003F) { # ?
5408 0           $self->{state} = PI_AFTER_STATE;
5409 0           $self->{kwd} = ''; # "temporary buffer"
5410            
5411 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5412 0           $self->{line_prev} = $self->{line};
5413 0           $self->{column_prev} = $self->{column};
5414 0           $self->{column}++;
5415 0           $self->{nc}
5416             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5417             } else {
5418 0           $self->{set_nc}->($self);
5419             }
5420            
5421 0           redo A;
5422             } else {
5423             ## XML5: typo ("tag name" -> "target")
5424 0 0         if ($nc == 0x0000) {
5425 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
5426             }
5427 0 0         $self->{ct}->{target} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi
5428            
5429 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5430 0           $self->{line_prev} = $self->{line};
5431 0           $self->{column_prev} = $self->{column};
5432 0           $self->{column}++;
5433 0           $self->{nc}
5434             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5435             } else {
5436 0           $self->{set_nc}->($self);
5437             }
5438            
5439 0           redo A;
5440             }
5441             } elsif ($state == PI_TARGET_AFTER_STATE) {
5442 0 0         if ($is_space->{$nc}) {
5443 0           $self->{kwd} .= chr $nc; # "temporary buffer"
5444             ## Stay in the state.
5445            
5446 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5447 0           $self->{line_prev} = $self->{line};
5448 0           $self->{column_prev} = $self->{column};
5449 0           $self->{column}++;
5450 0           $self->{nc}
5451             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5452             } else {
5453 0           $self->{set_nc}->($self);
5454             }
5455            
5456 0           redo A;
5457             } else {
5458 0           $self->{state} = PI_DATA_STATE;
5459             ## Reprocess.
5460 0           redo A;
5461             }
5462             } elsif ($state == PI_DATA_STATE) {
5463 0 0         if ($nc == 0x003F) { # ?
    0          
5464 0           $self->{state} = PI_DATA_AFTER_STATE;
5465            
5466 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5467 0           $self->{line_prev} = $self->{line};
5468 0           $self->{column_prev} = $self->{column};
5469 0           $self->{column}++;
5470 0           $self->{nc}
5471             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5472             } else {
5473 0           $self->{set_nc}->($self);
5474             }
5475            
5476 0           redo A;
5477             } elsif ($nc == EOF_CHAR) {
5478 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5479 0 0         if ($self->{in_subset}) {
5480 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5481             } else {
5482 0           $self->{state} = DATA_STATE;
5483             }
5484             ## Reprocess.
5485 0           return ({type => COMMENT_TOKEN,
5486             data => '?' . $self->{ct}->{target} .
5487             $self->{kwd} . # "temporary buffer"
5488             $self->{ct}->{data},
5489             line => $self->{ct}->{line},
5490             column => $self->{ct}->{column}});
5491 0           redo A;
5492             } else {
5493 0 0         if ($nc == 0x0000) {
5494 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
5495             }
5496 0 0         $self->{ct}->{data} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi
5497 0           $self->{read_until}->($self->{ct}->{data}, qq[\x00?],
5498             length $self->{ct}->{data});
5499             ## Stay in the state.
5500            
5501 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5502 0           $self->{line_prev} = $self->{line};
5503 0           $self->{column_prev} = $self->{column};
5504 0           $self->{column}++;
5505 0           $self->{nc}
5506             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5507             } else {
5508 0           $self->{set_nc}->($self);
5509             }
5510            
5511             ## Reprocess.
5512 0           redo A;
5513             }
5514             } elsif ($state == PI_AFTER_STATE) {
5515             ## XML5: Part of "Pi after state".
5516              
5517 0 0         if ($nc == 0x003E) { # >
    0          
5518 0 0         if ($self->{in_subset}) {
5519 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5520             } else {
5521 0           $self->{state} = DATA_STATE;
5522             }
5523            
5524 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5525 0           $self->{line_prev} = $self->{line};
5526 0           $self->{column_prev} = $self->{column};
5527 0           $self->{column}++;
5528 0           $self->{nc}
5529             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5530             } else {
5531 0           $self->{set_nc}->($self);
5532             }
5533            
5534 0           return ($self->{ct}); # pi
5535 0           redo A;
5536             } elsif ($nc == 0x003F) { # ?
5537 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5538             line => $self->{line_prev},
5539             column => $self->{column_prev}); ## XML5: no error
5540 0           $self->{ct}->{data} .= '?';
5541 0           $self->{state} = PI_DATA_AFTER_STATE;
5542            
5543 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5544 0           $self->{line_prev} = $self->{line};
5545 0           $self->{column_prev} = $self->{column};
5546 0           $self->{column}++;
5547 0           $self->{nc}
5548             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5549             } else {
5550 0           $self->{set_nc}->($self);
5551             }
5552            
5553 0           redo A;
5554             } else {
5555 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5556             line => $self->{line_prev},
5557             column => $self->{column_prev}
5558             + 1 * ($nc == -1)); ## XML5: no error
5559 0           $self->{ct}->{data} .= '?'; ## XML5: not appended
5560 0           $self->{state} = PI_DATA_STATE;
5561             ## Reprocess.
5562 0           redo A;
5563             }
5564             } elsif ($state == PI_DATA_AFTER_STATE) {
5565             ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5566              
5567 0 0         if ($nc == 0x003E) { # >
    0          
5568 0 0         if ($self->{in_subset}) {
5569 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5570             } else {
5571 0           $self->{state} = DATA_STATE;
5572             }
5573            
5574 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5575 0           $self->{line_prev} = $self->{line};
5576 0           $self->{column_prev} = $self->{column};
5577 0           $self->{column}++;
5578 0           $self->{nc}
5579             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5580             } else {
5581 0           $self->{set_nc}->($self);
5582             }
5583            
5584 0           return ($self->{ct}); # pi
5585 0           redo A;
5586             } elsif ($nc == 0x003F) { # ?
5587 0           $self->{ct}->{data} .= '?';
5588             ## Stay in the state.
5589            
5590 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5591 0           $self->{line_prev} = $self->{line};
5592 0           $self->{column_prev} = $self->{column};
5593 0           $self->{column}++;
5594 0           $self->{nc}
5595             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5596             } else {
5597 0           $self->{set_nc}->($self);
5598             }
5599            
5600 0           redo A;
5601             } else {
5602 0           $self->{ct}->{data} .= '?'; ## XML5: not appended
5603 0           $self->{state} = PI_DATA_STATE;
5604             ## Reprocess.
5605 0           redo A;
5606             }
5607              
5608             } elsif ($state == DOCTYPE_INTERNAL_SUBSET_STATE) {
5609 0 0         if ($nc == 0x003C) { # <
    0          
    0          
    0          
    0          
5610 0           $self->{state} = DOCTYPE_TAG_STATE;
5611            
5612 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5613 0           $self->{line_prev} = $self->{line};
5614 0           $self->{column_prev} = $self->{column};
5615 0           $self->{column}++;
5616 0           $self->{nc}
5617             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5618             } else {
5619 0           $self->{set_nc}->($self);
5620             }
5621            
5622 0           redo A;
5623             } elsif ($nc == 0x0025) { # %
5624             ## XML5: Not defined yet.
5625              
5626             ## TODO: parameter entity expansion
5627              
5628 0 0 0       if (not $self->{stop_processing} and
5629             not $self->{document}->xml_standalone) {
5630 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5631             level => $self->{level}->{info});
5632 0           $self->{stop_processing} = 1;
5633             }
5634              
5635            
5636 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5637 0           $self->{line_prev} = $self->{line};
5638 0           $self->{column_prev} = $self->{column};
5639 0           $self->{column}++;
5640 0           $self->{nc}
5641             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5642             } else {
5643 0           $self->{set_nc}->($self);
5644             }
5645            
5646 0           redo A;
5647             } elsif ($nc == 0x005D) { # ]
5648 0           delete $self->{in_subset};
5649 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5650            
5651 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5652 0           $self->{line_prev} = $self->{line};
5653 0           $self->{column_prev} = $self->{column};
5654 0           $self->{column}++;
5655 0           $self->{nc}
5656             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5657             } else {
5658 0           $self->{set_nc}->($self);
5659             }
5660            
5661 0           redo A;
5662             } elsif ($is_space->{$nc}) {
5663             ## Stay in the state.
5664            
5665 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5666 0           $self->{line_prev} = $self->{line};
5667 0           $self->{column_prev} = $self->{column};
5668 0           $self->{column}++;
5669 0           $self->{nc}
5670             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5671             } else {
5672 0           $self->{set_nc}->($self);
5673             }
5674            
5675 0           redo A;
5676             } elsif ($nc == EOF_CHAR) {
5677 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5678 0           delete $self->{in_subset};
5679 0           $self->{state} = DATA_STATE;
5680             ## Reconsume.
5681 0           return ({type => END_OF_DOCTYPE_TOKEN});
5682 0           redo A;
5683             } else {
5684 0 0         unless ($self->{internal_subset_tainted}) {
5685             ## XML5: No parse error.
5686 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5687 0           $self->{internal_subset_tainted} = 1;
5688             }
5689             ## Stay in the state.
5690            
5691 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5692 0           $self->{line_prev} = $self->{line};
5693 0           $self->{column_prev} = $self->{column};
5694 0           $self->{column}++;
5695 0           $self->{nc}
5696             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5697             } else {
5698 0           $self->{set_nc}->($self);
5699             }
5700            
5701 0           redo A;
5702             }
5703             } elsif ($state == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5704 0 0         if ($nc == 0x003E) { # >
    0          
5705 0           $self->{state} = DATA_STATE;
5706            
5707 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5708 0           $self->{line_prev} = $self->{line};
5709 0           $self->{column_prev} = $self->{column};
5710 0           $self->{column}++;
5711 0           $self->{nc}
5712             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5713             } else {
5714 0           $self->{set_nc}->($self);
5715             }
5716            
5717 0           return ({type => END_OF_DOCTYPE_TOKEN});
5718 0           redo A;
5719             } elsif ($nc == EOF_CHAR) {
5720 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5721 0           $self->{state} = DATA_STATE;
5722             ## Reconsume.
5723 0           return ({type => END_OF_DOCTYPE_TOKEN});
5724 0           redo A;
5725             } else {
5726             ## XML5: No parse error and stay in the state.
5727 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5728              
5729 0           $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5730            
5731 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5732 0           $self->{line_prev} = $self->{line};
5733 0           $self->{column_prev} = $self->{column};
5734 0           $self->{column}++;
5735 0           $self->{nc}
5736             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5737             } else {
5738 0           $self->{set_nc}->($self);
5739             }
5740            
5741 0           redo A;
5742             }
5743             } elsif ($state == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5744 0 0         if ($nc == 0x003E) { # >
    0          
5745 0           $self->{state} = DATA_STATE;
5746            
5747 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5748 0           $self->{line_prev} = $self->{line};
5749 0           $self->{column_prev} = $self->{column};
5750 0           $self->{column}++;
5751 0           $self->{nc}
5752             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5753             } else {
5754 0           $self->{set_nc}->($self);
5755             }
5756            
5757 0           return ({type => END_OF_DOCTYPE_TOKEN});
5758 0           redo A;
5759             } elsif ($nc == EOF_CHAR) {
5760 0           $self->{state} = DATA_STATE;
5761             ## Reconsume.
5762 0           return ({type => END_OF_DOCTYPE_TOKEN});
5763 0           redo A;
5764             } else {
5765             ## Stay in the state.
5766            
5767 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5768 0           $self->{line_prev} = $self->{line};
5769 0           $self->{column_prev} = $self->{column};
5770 0           $self->{column}++;
5771 0           $self->{nc}
5772             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5773             } else {
5774 0           $self->{set_nc}->($self);
5775             }
5776            
5777 0           redo A;
5778             }
5779             } elsif ($state == DOCTYPE_TAG_STATE) {
5780 0 0         if ($nc == 0x0021) { # !
    0          
    0          
5781 0           $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5782            
5783 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5784 0           $self->{line_prev} = $self->{line};
5785 0           $self->{column_prev} = $self->{column};
5786 0           $self->{column}++;
5787 0           $self->{nc}
5788             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5789             } else {
5790 0           $self->{set_nc}->($self);
5791             }
5792            
5793 0           redo A;
5794             } elsif ($nc == 0x003F) { # ?
5795 0           $self->{state} = PI_STATE;
5796            
5797 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5798 0           $self->{line_prev} = $self->{line};
5799 0           $self->{column_prev} = $self->{column};
5800 0           $self->{column}++;
5801 0           $self->{nc}
5802             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5803             } else {
5804 0           $self->{set_nc}->($self);
5805             }
5806            
5807 0           redo A;
5808             } elsif ($nc == EOF_CHAR) {
5809 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5810 0           $self->{state} = DATA_STATE;
5811             ## Reconsume.
5812 0           redo A;
5813             } else {
5814 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5815             line => $self->{line_prev},
5816             column => $self->{column_prev});
5817 0           $self->{state} = BOGUS_COMMENT_STATE;
5818 0           $self->{ct} = {type => COMMENT_TOKEN,
5819             data => '',
5820             }; ## NOTE: Will be discarded.
5821            
5822 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5823 0           $self->{line_prev} = $self->{line};
5824 0           $self->{column_prev} = $self->{column};
5825 0           $self->{column}++;
5826 0           $self->{nc}
5827             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5828             } else {
5829 0           $self->{set_nc}->($self);
5830             }
5831            
5832 0           redo A;
5833             }
5834             } elsif ($state == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5835             ## XML5: "DOCTYPE markup declaration state".
5836            
5837 0 0 0       if ($nc == 0x002D) { # -
    0 0        
    0 0        
    0          
5838 0           $self->{state} = MD_HYPHEN_STATE;
5839            
5840 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5841 0           $self->{line_prev} = $self->{line};
5842 0           $self->{column_prev} = $self->{column};
5843 0           $self->{column}++;
5844 0           $self->{nc}
5845             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5846             } else {
5847 0           $self->{set_nc}->($self);
5848             }
5849            
5850 0           redo A;
5851             } elsif ($nc == 0x0045 or # E
5852             $nc == 0x0065) { # e
5853 0           $self->{state} = MD_E_STATE;
5854 0           $self->{kwd} = chr $nc;
5855            
5856 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5857 0           $self->{line_prev} = $self->{line};
5858 0           $self->{column_prev} = $self->{column};
5859 0           $self->{column}++;
5860 0           $self->{nc}
5861             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5862             } else {
5863 0           $self->{set_nc}->($self);
5864             }
5865            
5866 0           redo A;
5867             } elsif ($nc == 0x0041 or # A
5868             $nc == 0x0061) { # a
5869 0           $self->{state} = MD_ATTLIST_STATE;
5870 0           $self->{kwd} = chr $nc;
5871            
5872 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5873 0           $self->{line_prev} = $self->{line};
5874 0           $self->{column_prev} = $self->{column};
5875 0           $self->{column}++;
5876 0           $self->{nc}
5877             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5878             } else {
5879 0           $self->{set_nc}->($self);
5880             }
5881            
5882 0           redo A;
5883             } elsif ($nc == 0x004E or # N
5884             $nc == 0x006E) { # n
5885 0           $self->{state} = MD_NOTATION_STATE;
5886 0           $self->{kwd} = chr $nc;
5887            
5888 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5889 0           $self->{line_prev} = $self->{line};
5890 0           $self->{column_prev} = $self->{column};
5891 0           $self->{column}++;
5892 0           $self->{nc}
5893             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5894             } else {
5895 0           $self->{set_nc}->($self);
5896             }
5897            
5898 0           redo A;
5899             } else {
5900             #
5901             }
5902            
5903             ## XML5: No parse error.
5904 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5905             line => $self->{line_prev},
5906             column => $self->{column_prev} - 1);
5907             ## Reconsume.
5908 0           $self->{state} = BOGUS_COMMENT_STATE;
5909 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5910 0           redo A;
5911             } elsif ($state == MD_E_STATE) {
5912 0 0 0       if ($nc == 0x004E or # N
    0 0        
5913             $nc == 0x006E) { # n
5914 0           $self->{state} = MD_ENTITY_STATE;
5915 0           $self->{kwd} .= chr $nc;
5916            
5917 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5918 0           $self->{line_prev} = $self->{line};
5919 0           $self->{column_prev} = $self->{column};
5920 0           $self->{column}++;
5921 0           $self->{nc}
5922             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5923             } else {
5924 0           $self->{set_nc}->($self);
5925             }
5926            
5927 0           redo A;
5928             } elsif ($nc == 0x004C or # L
5929             $nc == 0x006C) { # l
5930             ## XML5: <!ELEMENT> not supported.
5931 0           $self->{state} = MD_ELEMENT_STATE;
5932 0           $self->{kwd} .= chr $nc;
5933            
5934 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5935 0           $self->{line_prev} = $self->{line};
5936 0           $self->{column_prev} = $self->{column};
5937 0           $self->{column}++;
5938 0           $self->{nc}
5939             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5940             } else {
5941 0           $self->{set_nc}->($self);
5942             }
5943            
5944 0           redo A;
5945             } else {
5946             ## XML5: No parse error.
5947 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5948             line => $self->{line_prev},
5949             column => $self->{column_prev} - 2
5950             + 1 * ($nc == EOF_CHAR));
5951             ## Reconsume.
5952 0           $self->{state} = BOGUS_COMMENT_STATE;
5953 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5954 0           redo A;
5955             }
5956             } elsif ($state == MD_ENTITY_STATE) {
5957 0 0 0       if ($nc == [
    0 0        
      0        
5958             undef,
5959             undef,
5960             0x0054, # T
5961             0x0049, # I
5962             0x0054, # T
5963             NEVER_CHAR, # (Y)
5964             ]->[length $self->{kwd}] or
5965             $nc == [
5966             undef,
5967             undef,
5968             0x0074, # t
5969             0x0069, # i
5970             0x0074, # t
5971             NEVER_CHAR, # (y)
5972             ]->[length $self->{kwd}]) {
5973             ## Stay in the state.
5974 0           $self->{kwd} .= chr $nc;
5975            
5976 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5977 0           $self->{line_prev} = $self->{line};
5978 0           $self->{column_prev} = $self->{column};
5979 0           $self->{column}++;
5980 0           $self->{nc}
5981             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5982             } else {
5983 0           $self->{set_nc}->($self);
5984             }
5985            
5986 0           redo A;
5987             } elsif ((length $self->{kwd}) == 5 and
5988             ($nc == 0x0059 or # Y
5989             $nc == 0x0079)) { # y
5990 0 0 0       if ($self->{kwd} ne 'ENTIT' or $nc == 0x0079) {
5991 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5992             text => 'ENTITY',
5993             line => $self->{line_prev},
5994             column => $self->{column_prev} - 4);
5995             }
5996 0           $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5997             line => $self->{line_prev},
5998             column => $self->{column_prev} - 6};
5999 0           $self->{state} = DOCTYPE_MD_STATE;
6000            
6001 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6002 0           $self->{line_prev} = $self->{line};
6003 0           $self->{column_prev} = $self->{column};
6004 0           $self->{column}++;
6005 0           $self->{nc}
6006             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6007             } else {
6008 0           $self->{set_nc}->($self);
6009             }
6010            
6011 0           redo A;
6012             } else {
6013 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6014             line => $self->{line_prev},
6015             column => $self->{column_prev} - 1
6016             - (length $self->{kwd})
6017             + 1 * ($nc == EOF_CHAR));
6018 0           $self->{state} = BOGUS_COMMENT_STATE;
6019             ## Reconsume.
6020 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6021 0           redo A;
6022             }
6023             } elsif ($state == MD_ELEMENT_STATE) {
6024 0 0 0       if ($nc == [
    0 0        
      0        
6025             undef,
6026             undef,
6027             0x0045, # E
6028             0x004D, # M
6029             0x0045, # E
6030             0x004E, # N
6031             NEVER_CHAR, # (T)
6032             ]->[length $self->{kwd}] or
6033             $nc == [
6034             undef,
6035             undef,
6036             0x0065, # e
6037             0x006D, # m
6038             0x0065, # e
6039             0x006E, # n
6040             NEVER_CHAR, # (t)
6041             ]->[length $self->{kwd}]) {
6042             ## Stay in the state.
6043 0           $self->{kwd} .= chr $nc;
6044            
6045 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6046 0           $self->{line_prev} = $self->{line};
6047 0           $self->{column_prev} = $self->{column};
6048 0           $self->{column}++;
6049 0           $self->{nc}
6050             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6051             } else {
6052 0           $self->{set_nc}->($self);
6053             }
6054            
6055 0           redo A;
6056             } elsif ((length $self->{kwd}) == 6 and
6057             ($nc == 0x0054 or # T
6058             $nc == 0x0074)) { # t
6059 0 0 0       if ($self->{kwd} ne 'ELEMEN' or $nc == 0x0074) {
6060 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6061             text => 'ELEMENT',
6062             line => $self->{line_prev},
6063             column => $self->{column_prev} - 5);
6064             }
6065 0           $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6066             line => $self->{line_prev},
6067             column => $self->{column_prev} - 7};
6068 0           $self->{state} = DOCTYPE_MD_STATE;
6069            
6070 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6071 0           $self->{line_prev} = $self->{line};
6072 0           $self->{column_prev} = $self->{column};
6073 0           $self->{column}++;
6074 0           $self->{nc}
6075             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6076             } else {
6077 0           $self->{set_nc}->($self);
6078             }
6079            
6080 0           redo A;
6081             } else {
6082 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6083             line => $self->{line_prev},
6084             column => $self->{column_prev} - 1
6085             - (length $self->{kwd})
6086             + 1 * ($nc == EOF_CHAR));
6087 0           $self->{state} = BOGUS_COMMENT_STATE;
6088             ## Reconsume.
6089 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6090 0           redo A;
6091             }
6092             } elsif ($state == MD_ATTLIST_STATE) {
6093 0 0 0       if ($nc == [
    0 0        
      0        
6094             undef,
6095             0x0054, # T
6096             0x0054, # T
6097             0x004C, # L
6098             0x0049, # I
6099             0x0053, # S
6100             NEVER_CHAR, # (T)
6101             ]->[length $self->{kwd}] or
6102             $nc == [
6103             undef,
6104             0x0074, # t
6105             0x0074, # t
6106             0x006C, # l
6107             0x0069, # i
6108             0x0073, # s
6109             NEVER_CHAR, # (t)
6110             ]->[length $self->{kwd}]) {
6111             ## Stay in the state.
6112 0           $self->{kwd} .= chr $nc;
6113            
6114 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6115 0           $self->{line_prev} = $self->{line};
6116 0           $self->{column_prev} = $self->{column};
6117 0           $self->{column}++;
6118 0           $self->{nc}
6119             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6120             } else {
6121 0           $self->{set_nc}->($self);
6122             }
6123            
6124 0           redo A;
6125             } elsif ((length $self->{kwd}) == 6 and
6126             ($nc == 0x0054 or # T
6127             $nc == 0x0074)) { # t
6128 0 0 0       if ($self->{kwd} ne 'ATTLIS' or $nc == 0x0074) {
6129 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6130             text => 'ATTLIST',
6131             line => $self->{line_prev},
6132             column => $self->{column_prev} - 5);
6133             }
6134 0           $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6135             attrdefs => [],
6136             line => $self->{line_prev},
6137             column => $self->{column_prev} - 7};
6138 0           $self->{state} = DOCTYPE_MD_STATE;
6139            
6140 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6141 0           $self->{line_prev} = $self->{line};
6142 0           $self->{column_prev} = $self->{column};
6143 0           $self->{column}++;
6144 0           $self->{nc}
6145             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6146             } else {
6147 0           $self->{set_nc}->($self);
6148             }
6149            
6150 0           redo A;
6151             } else {
6152 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6153             line => $self->{line_prev},
6154             column => $self->{column_prev} - 1
6155             - (length $self->{kwd})
6156             + 1 * ($nc == EOF_CHAR));
6157 0           $self->{state} = BOGUS_COMMENT_STATE;
6158             ## Reconsume.
6159 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6160 0           redo A;
6161             }
6162             } elsif ($state == MD_NOTATION_STATE) {
6163 0 0 0       if ($nc == [
    0 0        
      0        
6164             undef,
6165             0x004F, # O
6166             0x0054, # T
6167             0x0041, # A
6168             0x0054, # T
6169             0x0049, # I
6170             0x004F, # O
6171             NEVER_CHAR, # (N)
6172             ]->[length $self->{kwd}] or
6173             $nc == [
6174             undef,
6175             0x006F, # o
6176             0x0074, # t
6177             0x0061, # a
6178             0x0074, # t
6179             0x0069, # i
6180             0x006F, # o
6181             NEVER_CHAR, # (n)
6182             ]->[length $self->{kwd}]) {
6183             ## Stay in the state.
6184 0           $self->{kwd} .= chr $nc;
6185            
6186 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6187 0           $self->{line_prev} = $self->{line};
6188 0           $self->{column_prev} = $self->{column};
6189 0           $self->{column}++;
6190 0           $self->{nc}
6191             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6192             } else {
6193 0           $self->{set_nc}->($self);
6194             }
6195            
6196 0           redo A;
6197             } elsif ((length $self->{kwd}) == 7 and
6198             ($nc == 0x004E or # N
6199             $nc == 0x006E)) { # n
6200 0 0 0       if ($self->{kwd} ne 'NOTATIO' or $nc == 0x006E) {
6201 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6202             text => 'NOTATION',
6203             line => $self->{line_prev},
6204             column => $self->{column_prev} - 6);
6205             }
6206 0           $self->{ct} = {type => NOTATION_TOKEN, name => '',
6207             line => $self->{line_prev},
6208             column => $self->{column_prev} - 8};
6209 0           $self->{state} = DOCTYPE_MD_STATE;
6210            
6211 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6212 0           $self->{line_prev} = $self->{line};
6213 0           $self->{column_prev} = $self->{column};
6214 0           $self->{column}++;
6215 0           $self->{nc}
6216             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6217             } else {
6218 0           $self->{set_nc}->($self);
6219             }
6220            
6221 0           redo A;
6222             } else {
6223 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6224             line => $self->{line_prev},
6225             column => $self->{column_prev} - 1
6226             - (length $self->{kwd})
6227             + 1 * ($nc == EOF_CHAR));
6228 0           $self->{state} = BOGUS_COMMENT_STATE;
6229             ## Reconsume.
6230 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6231 0           redo A;
6232             }
6233             } elsif ($state == DOCTYPE_MD_STATE) {
6234             ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6235             ## "DOCTYPE NOTATION state".
6236              
6237 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
6238             ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6239 0           $self->{state} = BEFORE_MD_NAME_STATE;
6240            
6241 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6242 0           $self->{line_prev} = $self->{line};
6243 0           $self->{column_prev} = $self->{column};
6244 0           $self->{column}++;
6245 0           $self->{nc}
6246             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6247             } else {
6248 0           $self->{set_nc}->($self);
6249             }
6250            
6251 0           redo A;
6252             } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6253             $nc == 0x0025) { # %
6254             ## XML5: Switch to the "DOCTYPE bogus comment state".
6255 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6256 0           $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6257            
6258 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6259 0           $self->{line_prev} = $self->{line};
6260 0           $self->{column_prev} = $self->{column};
6261 0           $self->{column}++;
6262 0           $self->{nc}
6263             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6264             } else {
6265 0           $self->{set_nc}->($self);
6266             }
6267            
6268 0           redo A;
6269             } elsif ($nc == EOF_CHAR) {
6270 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6271 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6272             ## Reconsume.
6273 0           redo A;
6274             } elsif ($nc == 0x003E) { # >
6275             ## XML5: Switch to the "DOCTYPE bogus comment state".
6276 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6277 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6278            
6279 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6280 0           $self->{line_prev} = $self->{line};
6281 0           $self->{column_prev} = $self->{column};
6282 0           $self->{column}++;
6283 0           $self->{nc}
6284             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6285             } else {
6286 0           $self->{set_nc}->($self);
6287             }
6288            
6289 0           redo A;
6290             } else {
6291             ## XML5: Switch to the "DOCTYPE bogus comment state".
6292 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6293 0           $self->{state} = BEFORE_MD_NAME_STATE;
6294 0           redo A;
6295             }
6296             } elsif ($state == BEFORE_MD_NAME_STATE) {
6297             ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6298             ## before state", "DOCTYPE ATTLIST name before state".
6299              
6300 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
6301             ## Stay in the state.
6302            
6303 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304 0           $self->{line_prev} = $self->{line};
6305 0           $self->{column_prev} = $self->{column};
6306 0           $self->{column}++;
6307 0           $self->{nc}
6308             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309             } else {
6310 0           $self->{set_nc}->($self);
6311             }
6312            
6313 0           redo A;
6314             } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6315             $nc == 0x0025) { # %
6316 0           $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6317            
6318 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6319 0           $self->{line_prev} = $self->{line};
6320 0           $self->{column_prev} = $self->{column};
6321 0           $self->{column}++;
6322 0           $self->{nc}
6323             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6324             } else {
6325 0           $self->{set_nc}->($self);
6326             }
6327            
6328 0           redo A;
6329             } elsif ($nc == 0x003E) { # >
6330             ## XML5: Same as "Anything else".
6331 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6332 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6333            
6334 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6335 0           $self->{line_prev} = $self->{line};
6336 0           $self->{column_prev} = $self->{column};
6337 0           $self->{column}++;
6338 0           $self->{nc}
6339             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6340             } else {
6341 0           $self->{set_nc}->($self);
6342             }
6343            
6344 0           redo A;
6345             } elsif ($nc == EOF_CHAR) {
6346 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6347 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6348             ## Reconsume.
6349 0           redo A;
6350             } else {
6351             ## XML5: [ATTLIST] Not defined yet.
6352 0 0         if ($nc == 0x0000) {
6353 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6354             }
6355 0 0         $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6356 0           $self->{state} = MD_NAME_STATE;
6357            
6358 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6359 0           $self->{line_prev} = $self->{line};
6360 0           $self->{column_prev} = $self->{column};
6361 0           $self->{column}++;
6362 0           $self->{nc}
6363             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6364             } else {
6365 0           $self->{set_nc}->($self);
6366             }
6367            
6368 0           redo A;
6369             }
6370             } elsif ($state == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6371 0 0         if ($is_space->{$nc}) {
    0          
    0          
6372             ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6373 0           $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6374 0           $self->{state} = BEFORE_MD_NAME_STATE;
6375            
6376 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6377 0           $self->{line_prev} = $self->{line};
6378 0           $self->{column_prev} = $self->{column};
6379 0           $self->{column}++;
6380 0           $self->{nc}
6381             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6382             } else {
6383 0           $self->{set_nc}->($self);
6384             }
6385            
6386 0           redo A;
6387             } elsif ($nc == 0x003E) { # >
6388             ## XML5: Same as "Anything else".
6389 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6390 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6391            
6392 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6393 0           $self->{line_prev} = $self->{line};
6394 0           $self->{column_prev} = $self->{column};
6395 0           $self->{column}++;
6396 0           $self->{nc}
6397             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6398             } else {
6399 0           $self->{set_nc}->($self);
6400             }
6401            
6402 0           redo A;
6403             } elsif ($nc == EOF_CHAR) {
6404 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6405 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6406             ## Reconsume.
6407 0           redo A;
6408             } else {
6409             ## XML5: No parse error.
6410 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6411 0           $self->{state} = BOGUS_COMMENT_STATE;
6412 0           $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6413             ## Reconsume.
6414 0           redo A;
6415             }
6416             } elsif ($state == MD_NAME_STATE) {
6417             ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6418            
6419 0 0         if ($is_space->{$nc}) {
    0          
    0          
6420 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
    0          
6421 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6422             } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6423 0           $self->{state} = AFTER_ELEMENT_NAME_STATE;
6424             } else { # ENTITY/NOTATION
6425 0           $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6426             }
6427            
6428 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6429 0           $self->{line_prev} = $self->{line};
6430 0           $self->{column_prev} = $self->{column};
6431 0           $self->{column}++;
6432 0           $self->{nc}
6433             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6434             } else {
6435 0           $self->{set_nc}->($self);
6436             }
6437            
6438 0           redo A;
6439             } elsif ($nc == 0x003E) { # >
6440 0 0         if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6441             #
6442             } else {
6443 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6444             }
6445 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6446            
6447 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6448 0           $self->{line_prev} = $self->{line};
6449 0           $self->{column_prev} = $self->{column};
6450 0           $self->{column}++;
6451 0           $self->{nc}
6452             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6453             } else {
6454 0           $self->{set_nc}->($self);
6455             }
6456            
6457 0           return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6458 0           redo A;
6459             } elsif ($nc == EOF_CHAR) {
6460             ## XML5: [ATTLIST] No parse error.
6461 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6462 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6463             ## Reconsume.
6464 0           redo A;
6465             } else {
6466             ## XML5: [ATTLIST] Not defined yet.
6467 0 0         if ($nc == 0x0000) {
6468 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6469             }
6470 0 0         $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6471             ## Stay in the state.
6472            
6473 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6474 0           $self->{line_prev} = $self->{line};
6475 0           $self->{column_prev} = $self->{column};
6476 0           $self->{column}++;
6477 0           $self->{nc}
6478             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6479             } else {
6480 0           $self->{set_nc}->($self);
6481             }
6482            
6483 0           redo A;
6484             }
6485             } elsif ($state == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6486 0 0         if ($is_space->{$nc}) {
    0          
    0          
6487             ## Stay in the state.
6488            
6489 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6490 0           $self->{line_prev} = $self->{line};
6491 0           $self->{column_prev} = $self->{column};
6492 0           $self->{column}++;
6493 0           $self->{nc}
6494             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6495             } else {
6496 0           $self->{set_nc}->($self);
6497             }
6498            
6499 0           redo A;
6500             } elsif ($nc == 0x003E) { # >
6501 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6502            
6503 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6504 0           $self->{line_prev} = $self->{line};
6505 0           $self->{column_prev} = $self->{column};
6506 0           $self->{column}++;
6507 0           $self->{nc}
6508             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6509             } else {
6510 0           $self->{set_nc}->($self);
6511             }
6512            
6513 0           return ($self->{ct}); # ATTLIST
6514 0           redo A;
6515             } elsif ($nc == EOF_CHAR) {
6516             ## XML5: No parse error.
6517 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6518 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6519             ## Discard the current token.
6520 0           redo A;
6521             } else {
6522             ## XML5: Not defined yet.
6523 0 0         if ($nc == 0x0000) {
6524 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6525             }
6526 0 0         $self->{ca} = {name => $nc == 0x0000 ? "\x{FFFD}" : chr $nc, # attrdef
6527             tokens => [],
6528             line => $self->{line}, column => $self->{column}};
6529 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6530            
6531 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6532 0           $self->{line_prev} = $self->{line};
6533 0           $self->{column_prev} = $self->{column};
6534 0           $self->{column}++;
6535 0           $self->{nc}
6536             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6537             } else {
6538 0           $self->{set_nc}->($self);
6539             }
6540            
6541 0           redo A;
6542             }
6543             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6544 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
6545 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6546            
6547 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548 0           $self->{line_prev} = $self->{line};
6549 0           $self->{column_prev} = $self->{column};
6550 0           $self->{column}++;
6551 0           $self->{nc}
6552             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553             } else {
6554 0           $self->{set_nc}->($self);
6555             }
6556            
6557 0           redo A;
6558             } elsif ($nc == 0x003E) { # >
6559             ## XML5: Same as "anything else".
6560 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6561 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6562            
6563 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6564 0           $self->{line_prev} = $self->{line};
6565 0           $self->{column_prev} = $self->{column};
6566 0           $self->{column}++;
6567 0           $self->{nc}
6568             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6569             } else {
6570 0           $self->{set_nc}->($self);
6571             }
6572            
6573 0           return ($self->{ct}); # ATTLIST
6574 0           redo A;
6575             } elsif ($nc == 0x0028) { # (
6576             ## XML5: Same as "anything else".
6577 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6578 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6579            
6580 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6581 0           $self->{line_prev} = $self->{line};
6582 0           $self->{column_prev} = $self->{column};
6583 0           $self->{column}++;
6584 0           $self->{nc}
6585             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6586             } else {
6587 0           $self->{set_nc}->($self);
6588             }
6589            
6590 0           redo A;
6591             } elsif ($nc == EOF_CHAR) {
6592             ## XML5: No parse error.
6593 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6594 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6595            
6596 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6597 0           $self->{line_prev} = $self->{line};
6598 0           $self->{column_prev} = $self->{column};
6599 0           $self->{column}++;
6600 0           $self->{nc}
6601             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6602             } else {
6603 0           $self->{set_nc}->($self);
6604             }
6605            
6606             ## Discard the current token.
6607 0           redo A;
6608             } else {
6609             ## XML5: Not defined yet.
6610 0 0         if ($nc == 0x0000) {
6611 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6612             }
6613 0 0         $self->{ca}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6614             ## Stay in the state.
6615            
6616 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617 0           $self->{line_prev} = $self->{line};
6618 0           $self->{column_prev} = $self->{column};
6619 0           $self->{column}++;
6620 0           $self->{nc}
6621             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622             } else {
6623 0           $self->{set_nc}->($self);
6624             }
6625            
6626 0           redo A;
6627             }
6628             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6629 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
6630             ## Stay in the state.
6631            
6632 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6633 0           $self->{line_prev} = $self->{line};
6634 0           $self->{column_prev} = $self->{column};
6635 0           $self->{column}++;
6636 0           $self->{nc}
6637             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6638             } else {
6639 0           $self->{set_nc}->($self);
6640             }
6641            
6642 0           redo A;
6643             } elsif ($nc == 0x003E) { # >
6644             ## XML5: Same as "anything else".
6645 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6646 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6647            
6648 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6649 0           $self->{line_prev} = $self->{line};
6650 0           $self->{column_prev} = $self->{column};
6651 0           $self->{column}++;
6652 0           $self->{nc}
6653             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6654             } else {
6655 0           $self->{set_nc}->($self);
6656             }
6657            
6658 0           return ($self->{ct}); # ATTLIST
6659 0           redo A;
6660             } elsif ($nc == 0x0028) { # (
6661             ## XML5: Same as "anything else".
6662 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6663            
6664 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6665 0           $self->{line_prev} = $self->{line};
6666 0           $self->{column_prev} = $self->{column};
6667 0           $self->{column}++;
6668 0           $self->{nc}
6669             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6670             } else {
6671 0           $self->{set_nc}->($self);
6672             }
6673            
6674 0           redo A;
6675             } elsif ($nc == EOF_CHAR) {
6676             ## XML5: No parse error.
6677 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6678 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6679            
6680 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6681 0           $self->{line_prev} = $self->{line};
6682 0           $self->{column_prev} = $self->{column};
6683 0           $self->{column}++;
6684 0           $self->{nc}
6685             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6686             } else {
6687 0           $self->{set_nc}->($self);
6688             }
6689            
6690             ## Discard the token.
6691 0           redo A;
6692             } else {
6693             ## XML5: Not defined yet.
6694 0           $self->{ca}->{type} = chr $nc;
6695 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6696            
6697 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6698 0           $self->{line_prev} = $self->{line};
6699 0           $self->{column_prev} = $self->{column};
6700 0           $self->{column}++;
6701 0           $self->{nc}
6702             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6703             } else {
6704 0           $self->{set_nc}->($self);
6705             }
6706            
6707 0           redo A;
6708             }
6709             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6710 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
    0          
6711 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6712            
6713 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6714 0           $self->{line_prev} = $self->{line};
6715 0           $self->{column_prev} = $self->{column};
6716 0           $self->{column}++;
6717 0           $self->{nc}
6718             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6719             } else {
6720 0           $self->{set_nc}->($self);
6721             }
6722            
6723 0           redo A;
6724             } elsif ($nc == 0x0023) { # #
6725             ## XML5: Same as "anything else".
6726 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6727 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6728            
6729 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6730 0           $self->{line_prev} = $self->{line};
6731 0           $self->{column_prev} = $self->{column};
6732 0           $self->{column}++;
6733 0           $self->{nc}
6734             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6735             } else {
6736 0           $self->{set_nc}->($self);
6737             }
6738            
6739 0           redo A;
6740             } elsif ($nc == 0x0022) { # "
6741             ## XML5: Same as "anything else".
6742 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6743 0           $self->{ca}->{value} = '';
6744 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6745            
6746 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6747 0           $self->{line_prev} = $self->{line};
6748 0           $self->{column_prev} = $self->{column};
6749 0           $self->{column}++;
6750 0           $self->{nc}
6751             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6752             } else {
6753 0           $self->{set_nc}->($self);
6754             }
6755            
6756 0           redo A;
6757             } elsif ($nc == 0x0027) { # '
6758             ## XML5: Same as "anything else".
6759 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6760 0           $self->{ca}->{value} = '';
6761 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6762            
6763 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6764 0           $self->{line_prev} = $self->{line};
6765 0           $self->{column_prev} = $self->{column};
6766 0           $self->{column}++;
6767 0           $self->{nc}
6768             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6769             } else {
6770 0           $self->{set_nc}->($self);
6771             }
6772            
6773 0           redo A;
6774             } elsif ($nc == 0x003E) { # >
6775             ## XML5: Same as "anything else".
6776 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6777 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6778            
6779 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6780 0           $self->{line_prev} = $self->{line};
6781 0           $self->{column_prev} = $self->{column};
6782 0           $self->{column}++;
6783 0           $self->{nc}
6784             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6785             } else {
6786 0           $self->{set_nc}->($self);
6787             }
6788            
6789 0           return ($self->{ct}); # ATTLIST
6790 0           redo A;
6791             } elsif ($nc == 0x0028) { # (
6792             ## XML5: Same as "anything else".
6793 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6794 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6795            
6796 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6797 0           $self->{line_prev} = $self->{line};
6798 0           $self->{column_prev} = $self->{column};
6799 0           $self->{column}++;
6800 0           $self->{nc}
6801             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6802             } else {
6803 0           $self->{set_nc}->($self);
6804             }
6805            
6806 0           redo A;
6807             } elsif ($nc == EOF_CHAR) {
6808             ## XML5: No parse error.
6809 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6810 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6811            
6812 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6813 0           $self->{line_prev} = $self->{line};
6814 0           $self->{column_prev} = $self->{column};
6815 0           $self->{column}++;
6816 0           $self->{nc}
6817             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6818             } else {
6819 0           $self->{set_nc}->($self);
6820             }
6821            
6822             ## Discard the token.
6823 0           redo A;
6824             } else {
6825             ## XML5: Not defined yet.
6826 0           $self->{ca}->{type} .= chr $nc;
6827             ## Stay in the state.
6828            
6829 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6830 0           $self->{line_prev} = $self->{line};
6831 0           $self->{column_prev} = $self->{column};
6832 0           $self->{column}++;
6833 0           $self->{nc}
6834             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6835             } else {
6836 0           $self->{set_nc}->($self);
6837             }
6838            
6839 0           redo A;
6840             }
6841             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6842 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
    0          
6843             ## Stay in the state.
6844            
6845 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6846 0           $self->{line_prev} = $self->{line};
6847 0           $self->{column_prev} = $self->{column};
6848 0           $self->{column}++;
6849 0           $self->{nc}
6850             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6851             } else {
6852 0           $self->{set_nc}->($self);
6853             }
6854            
6855 0           redo A;
6856             } elsif ($nc == 0x0028) { # (
6857             ## XML5: Same as "anything else".
6858 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6859            
6860 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6861 0           $self->{line_prev} = $self->{line};
6862 0           $self->{column_prev} = $self->{column};
6863 0           $self->{column}++;
6864 0           $self->{nc}
6865             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6866             } else {
6867 0           $self->{set_nc}->($self);
6868             }
6869            
6870 0           redo A;
6871             } elsif ($nc == 0x0023) { # #
6872 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6873            
6874 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6875 0           $self->{line_prev} = $self->{line};
6876 0           $self->{column_prev} = $self->{column};
6877 0           $self->{column}++;
6878 0           $self->{nc}
6879             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6880             } else {
6881 0           $self->{set_nc}->($self);
6882             }
6883            
6884 0           redo A;
6885             } elsif ($nc == 0x0022) { # "
6886             ## XML5: Same as "anything else".
6887 0           $self->{ca}->{value} = '';
6888 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6889            
6890 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6891 0           $self->{line_prev} = $self->{line};
6892 0           $self->{column_prev} = $self->{column};
6893 0           $self->{column}++;
6894 0           $self->{nc}
6895             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6896             } else {
6897 0           $self->{set_nc}->($self);
6898             }
6899            
6900 0           redo A;
6901             } elsif ($nc == 0x0027) { # '
6902             ## XML5: Same as "anything else".
6903 0           $self->{ca}->{value} = '';
6904 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6905            
6906 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6907 0           $self->{line_prev} = $self->{line};
6908 0           $self->{column_prev} = $self->{column};
6909 0           $self->{column}++;
6910 0           $self->{nc}
6911             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6912             } else {
6913 0           $self->{set_nc}->($self);
6914             }
6915            
6916 0           redo A;
6917             } elsif ($nc == 0x003E) { # >
6918             ## XML5: Same as "anything else".
6919 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6920 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6921            
6922 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6923 0           $self->{line_prev} = $self->{line};
6924 0           $self->{column_prev} = $self->{column};
6925 0           $self->{column}++;
6926 0           $self->{nc}
6927             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6928             } else {
6929 0           $self->{set_nc}->($self);
6930             }
6931            
6932 0           return ($self->{ct}); # ATTLIST
6933 0           redo A;
6934             } elsif ($nc == EOF_CHAR) {
6935             ## XML5: No parse error.
6936 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6937 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6938            
6939 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6940 0           $self->{line_prev} = $self->{line};
6941 0           $self->{column_prev} = $self->{column};
6942 0           $self->{column}++;
6943 0           $self->{nc}
6944             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6945             } else {
6946 0           $self->{set_nc}->($self);
6947             }
6948            
6949             ## Discard the current token.
6950 0           redo A;
6951             } else {
6952             ## XML5: Switch to the "DOCTYPE bogus comment state".
6953 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6954 0           $self->{ca}->{value} = '';
6955 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6956             ## Reconsume.
6957 0           redo A;
6958             }
6959             } elsif ($state == BEFORE_ALLOWED_TOKEN_STATE) {
6960 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
6961             ## Stay in the state.
6962            
6963 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6964 0           $self->{line_prev} = $self->{line};
6965 0           $self->{column_prev} = $self->{column};
6966 0           $self->{column}++;
6967 0           $self->{nc}
6968             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6969             } else {
6970 0           $self->{set_nc}->($self);
6971             }
6972            
6973 0           redo A;
6974             } elsif ($nc == 0x007C) { # |
6975 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6976             ## Stay in the state.
6977            
6978 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6979 0           $self->{line_prev} = $self->{line};
6980 0           $self->{column_prev} = $self->{column};
6981 0           $self->{column}++;
6982 0           $self->{nc}
6983             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6984             } else {
6985 0           $self->{set_nc}->($self);
6986             }
6987            
6988 0           redo A;
6989             } elsif ($nc == 0x0029) { # )
6990 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6991 0           $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6992            
6993 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6994 0           $self->{line_prev} = $self->{line};
6995 0           $self->{column_prev} = $self->{column};
6996 0           $self->{column}++;
6997 0           $self->{nc}
6998             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6999             } else {
7000 0           $self->{set_nc}->($self);
7001             }
7002            
7003 0           redo A;
7004             } elsif ($nc == 0x003E) { # >
7005 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7006 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7007            
7008 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7009 0           $self->{line_prev} = $self->{line};
7010 0           $self->{column_prev} = $self->{column};
7011 0           $self->{column}++;
7012 0           $self->{nc}
7013             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7014             } else {
7015 0           $self->{set_nc}->($self);
7016             }
7017            
7018 0           return ($self->{ct}); # ATTLIST
7019 0           redo A;
7020             } elsif ($nc == EOF_CHAR) {
7021             ## XML5: No parse error.
7022 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7023 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7024            
7025 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7026 0           $self->{line_prev} = $self->{line};
7027 0           $self->{column_prev} = $self->{column};
7028 0           $self->{column}++;
7029 0           $self->{nc}
7030             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7031             } else {
7032 0           $self->{set_nc}->($self);
7033             }
7034            
7035             ## Discard the current token.
7036 0           redo A;
7037             } else {
7038 0 0         if ($nc == 0x000) {
7039 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7040             }
7041 0 0         push @{$self->{ca}->{tokens}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
  0            
7042 0           $self->{state} = ALLOWED_TOKEN_STATE;
7043            
7044 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7045 0           $self->{line_prev} = $self->{line};
7046 0           $self->{column_prev} = $self->{column};
7047 0           $self->{column}++;
7048 0           $self->{nc}
7049             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7050             } else {
7051 0           $self->{set_nc}->($self);
7052             }
7053            
7054 0           redo A;
7055             }
7056             } elsif ($state == ALLOWED_TOKEN_STATE) {
7057 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7058 0           $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7059            
7060 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7061 0           $self->{line_prev} = $self->{line};
7062 0           $self->{column_prev} = $self->{column};
7063 0           $self->{column}++;
7064 0           $self->{nc}
7065             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7066             } else {
7067 0           $self->{set_nc}->($self);
7068             }
7069            
7070 0           redo A;
7071             } elsif ($nc == 0x007C) { # |
7072 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7073            
7074 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7075 0           $self->{line_prev} = $self->{line};
7076 0           $self->{column_prev} = $self->{column};
7077 0           $self->{column}++;
7078 0           $self->{nc}
7079             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7080             } else {
7081 0           $self->{set_nc}->($self);
7082             }
7083            
7084 0           redo A;
7085             } elsif ($nc == 0x0029) { # )
7086 0           $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7087            
7088 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7089 0           $self->{line_prev} = $self->{line};
7090 0           $self->{column_prev} = $self->{column};
7091 0           $self->{column}++;
7092 0           $self->{nc}
7093             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7094             } else {
7095 0           $self->{set_nc}->($self);
7096             }
7097            
7098 0           redo A;
7099             } elsif ($nc == 0x003E) { # >
7100 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7101 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7102            
7103 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7104 0           $self->{line_prev} = $self->{line};
7105 0           $self->{column_prev} = $self->{column};
7106 0           $self->{column}++;
7107 0           $self->{nc}
7108             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7109             } else {
7110 0           $self->{set_nc}->($self);
7111             }
7112            
7113 0           return ($self->{ct}); # ATTLIST
7114 0           redo A;
7115             } elsif ($nc == EOF_CHAR) {
7116             ## XML5: No parse error.
7117 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7118 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7119            
7120 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7121 0           $self->{line_prev} = $self->{line};
7122 0           $self->{column_prev} = $self->{column};
7123 0           $self->{column}++;
7124 0           $self->{nc}
7125             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7126             } else {
7127 0           $self->{set_nc}->($self);
7128             }
7129            
7130             ## Discard the current token.
7131 0           redo A;
7132             } else {
7133 0 0         if ($nc == 0x0000) {
7134 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7135             }
7136 0 0         $self->{ca}->{tokens}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
7137             ## Stay in the state.
7138            
7139 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7140 0           $self->{line_prev} = $self->{line};
7141 0           $self->{column_prev} = $self->{column};
7142 0           $self->{column}++;
7143 0           $self->{nc}
7144             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7145             } else {
7146 0           $self->{set_nc}->($self);
7147             }
7148            
7149 0           redo A;
7150             }
7151             } elsif ($state == AFTER_ALLOWED_TOKEN_STATE) {
7152 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7153             ## Stay in the state.
7154            
7155 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7156 0           $self->{line_prev} = $self->{line};
7157 0           $self->{column_prev} = $self->{column};
7158 0           $self->{column}++;
7159 0           $self->{nc}
7160             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7161             } else {
7162 0           $self->{set_nc}->($self);
7163             }
7164            
7165 0           redo A;
7166             } elsif ($nc == 0x007C) { # |
7167 0           $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7168            
7169 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7170 0           $self->{line_prev} = $self->{line};
7171 0           $self->{column_prev} = $self->{column};
7172 0           $self->{column}++;
7173 0           $self->{nc}
7174             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7175             } else {
7176 0           $self->{set_nc}->($self);
7177             }
7178            
7179 0           redo A;
7180             } elsif ($nc == 0x0029) { # )
7181 0           $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7182            
7183 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7184 0           $self->{line_prev} = $self->{line};
7185 0           $self->{column_prev} = $self->{column};
7186 0           $self->{column}++;
7187 0           $self->{nc}
7188             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7189             } else {
7190 0           $self->{set_nc}->($self);
7191             }
7192            
7193 0           redo A;
7194             } elsif ($nc == 0x003E) { # >
7195 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7196 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7197            
7198 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7199 0           $self->{line_prev} = $self->{line};
7200 0           $self->{column_prev} = $self->{column};
7201 0           $self->{column}++;
7202 0           $self->{nc}
7203             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7204             } else {
7205 0           $self->{set_nc}->($self);
7206             }
7207            
7208 0           return ($self->{ct}); # ATTLIST
7209 0           redo A;
7210             } elsif ($nc == EOF_CHAR) {
7211             ## XML5: No parse error.
7212 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7213 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7214            
7215 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7216 0           $self->{line_prev} = $self->{line};
7217 0           $self->{column_prev} = $self->{column};
7218 0           $self->{column}++;
7219 0           $self->{nc}
7220             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7221             } else {
7222 0           $self->{set_nc}->($self);
7223             }
7224            
7225             ## Discard the current token.
7226 0           redo A;
7227             } else {
7228 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7229             line => $self->{line_prev},
7230             column => $self->{column_prev});
7231 0 0         if ($nc == 0x0000) {
7232 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7233             }
7234 0 0         $self->{ca}->{tokens}->[-1] .= ' ' . ($nc == 0x0000 ? "\x{FFFD}" : chr $nc);
7235 0           $self->{state} = ALLOWED_TOKEN_STATE;
7236            
7237 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7238 0           $self->{line_prev} = $self->{line};
7239 0           $self->{column_prev} = $self->{column};
7240 0           $self->{column}++;
7241 0           $self->{nc}
7242             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7243             } else {
7244 0           $self->{set_nc}->($self);
7245             }
7246            
7247 0           redo A;
7248             }
7249             } elsif ($state == AFTER_ALLOWED_TOKENS_STATE) {
7250 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
7251 0           $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7252            
7253 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7254 0           $self->{line_prev} = $self->{line};
7255 0           $self->{column_prev} = $self->{column};
7256 0           $self->{column}++;
7257 0           $self->{nc}
7258             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7259             } else {
7260 0           $self->{set_nc}->($self);
7261             }
7262            
7263 0           redo A;
7264             } elsif ($nc == 0x0023) { # #
7265 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7266 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7267            
7268 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7269 0           $self->{line_prev} = $self->{line};
7270 0           $self->{column_prev} = $self->{column};
7271 0           $self->{column}++;
7272 0           $self->{nc}
7273             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7274             } else {
7275 0           $self->{set_nc}->($self);
7276             }
7277            
7278 0           redo A;
7279             } elsif ($nc == 0x0022) { # "
7280 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7281 0           $self->{ca}->{value} = '';
7282 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7283            
7284 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7285 0           $self->{line_prev} = $self->{line};
7286 0           $self->{column_prev} = $self->{column};
7287 0           $self->{column}++;
7288 0           $self->{nc}
7289             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7290             } else {
7291 0           $self->{set_nc}->($self);
7292             }
7293            
7294 0           redo A;
7295             } elsif ($nc == 0x0027) { # '
7296 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7297 0           $self->{ca}->{value} = '';
7298 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7299            
7300 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7301 0           $self->{line_prev} = $self->{line};
7302 0           $self->{column_prev} = $self->{column};
7303 0           $self->{column}++;
7304 0           $self->{nc}
7305             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7306             } else {
7307 0           $self->{set_nc}->($self);
7308             }
7309            
7310 0           redo A;
7311             } elsif ($nc == 0x003E) { # >
7312 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7313 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7314            
7315 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7316 0           $self->{line_prev} = $self->{line};
7317 0           $self->{column_prev} = $self->{column};
7318 0           $self->{column}++;
7319 0           $self->{nc}
7320             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7321             } else {
7322 0           $self->{set_nc}->($self);
7323             }
7324            
7325 0           return ($self->{ct}); # ATTLIST
7326 0           redo A;
7327             } elsif ($nc == EOF_CHAR) {
7328 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7329 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7330            
7331 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7332 0           $self->{line_prev} = $self->{line};
7333 0           $self->{column_prev} = $self->{column};
7334 0           $self->{column}++;
7335 0           $self->{nc}
7336             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7337             } else {
7338 0           $self->{set_nc}->($self);
7339             }
7340            
7341             ## Discard the current token.
7342 0           redo A;
7343             } else {
7344 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7345 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7346             ## Reconsume.
7347 0           redo A;
7348             }
7349             } elsif ($state == BEFORE_ATTR_DEFAULT_STATE) {
7350 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
7351             ## Stay in the state.
7352            
7353 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7354 0           $self->{line_prev} = $self->{line};
7355 0           $self->{column_prev} = $self->{column};
7356 0           $self->{column}++;
7357 0           $self->{nc}
7358             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7359             } else {
7360 0           $self->{set_nc}->($self);
7361             }
7362            
7363 0           redo A;
7364             } elsif ($nc == 0x0023) { # #
7365 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7366            
7367 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7368 0           $self->{line_prev} = $self->{line};
7369 0           $self->{column_prev} = $self->{column};
7370 0           $self->{column}++;
7371 0           $self->{nc}
7372             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7373             } else {
7374 0           $self->{set_nc}->($self);
7375             }
7376            
7377 0           redo A;
7378             } elsif ($nc == 0x0022) { # "
7379 0           $self->{ca}->{value} = '';
7380 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7381            
7382 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7383 0           $self->{line_prev} = $self->{line};
7384 0           $self->{column_prev} = $self->{column};
7385 0           $self->{column}++;
7386 0           $self->{nc}
7387             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7388             } else {
7389 0           $self->{set_nc}->($self);
7390             }
7391            
7392 0           redo A;
7393             } elsif ($nc == 0x0027) { # '
7394 0           $self->{ca}->{value} = '';
7395 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7396            
7397 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7398 0           $self->{line_prev} = $self->{line};
7399 0           $self->{column_prev} = $self->{column};
7400 0           $self->{column}++;
7401 0           $self->{nc}
7402             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7403             } else {
7404 0           $self->{set_nc}->($self);
7405             }
7406            
7407 0           redo A;
7408             } elsif ($nc == 0x003E) { # >
7409 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7410 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7411            
7412 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7413 0           $self->{line_prev} = $self->{line};
7414 0           $self->{column_prev} = $self->{column};
7415 0           $self->{column}++;
7416 0           $self->{nc}
7417             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7418             } else {
7419 0           $self->{set_nc}->($self);
7420             }
7421            
7422 0           return ($self->{ct}); # ATTLIST
7423 0           redo A;
7424             } elsif ($nc == EOF_CHAR) {
7425 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7426 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7427            
7428 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7429 0           $self->{line_prev} = $self->{line};
7430 0           $self->{column_prev} = $self->{column};
7431 0           $self->{column}++;
7432 0           $self->{nc}
7433             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7434             } else {
7435 0           $self->{set_nc}->($self);
7436             }
7437            
7438             ## Discard the current token.
7439 0           redo A;
7440             } else {
7441 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7442 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7443             ## Reconsume.
7444 0           redo A;
7445             }
7446             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7447 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7448             ## XML5: No parse error.
7449 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7450 0           $self->{state} = BOGUS_MD_STATE;
7451             ## Reconsume.
7452 0           redo A;
7453             } elsif ($nc == 0x0022) { # "
7454             # XXX parse error?
7455             ## XML5: Same as "anything else".
7456 0           $self->{ca}->{value} = '';
7457 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7458            
7459 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7460 0           $self->{line_prev} = $self->{line};
7461 0           $self->{column_prev} = $self->{column};
7462 0           $self->{column}++;
7463 0           $self->{nc}
7464             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7465             } else {
7466 0           $self->{set_nc}->($self);
7467             }
7468            
7469 0           redo A;
7470             } elsif ($nc == 0x0027) { # '
7471             # XXX parse error?
7472             ## XML5: Same as "anything else".
7473 0           $self->{ca}->{value} = '';
7474 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7475            
7476 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7477 0           $self->{line_prev} = $self->{line};
7478 0           $self->{column_prev} = $self->{column};
7479 0           $self->{column}++;
7480 0           $self->{nc}
7481             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7482             } else {
7483 0           $self->{set_nc}->($self);
7484             }
7485            
7486 0           redo A;
7487             } elsif ($nc == 0x003E) { # >
7488             ## XML5: Same as "anything else".
7489 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7490 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7491            
7492 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493 0           $self->{line_prev} = $self->{line};
7494 0           $self->{column_prev} = $self->{column};
7495 0           $self->{column}++;
7496 0           $self->{nc}
7497             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498             } else {
7499 0           $self->{set_nc}->($self);
7500             }
7501            
7502 0           return ($self->{ct}); # ATTLIST
7503 0           redo A;
7504             } elsif ($nc == EOF_CHAR) {
7505             ## XML5: No parse error.
7506 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7507 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7508            
7509 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7510 0           $self->{line_prev} = $self->{line};
7511 0           $self->{column_prev} = $self->{column};
7512 0           $self->{column}++;
7513 0           $self->{nc}
7514             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7515             } else {
7516 0           $self->{set_nc}->($self);
7517             }
7518            
7519             ## Discard the current token.
7520 0           redo A;
7521             } else {
7522 0           $self->{ca}->{default} = chr $nc;
7523 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7524            
7525 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7526 0           $self->{line_prev} = $self->{line};
7527 0           $self->{column_prev} = $self->{column};
7528 0           $self->{column}++;
7529 0           $self->{nc}
7530             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7531             } else {
7532 0           $self->{set_nc}->($self);
7533             }
7534            
7535 0           redo A;
7536             }
7537             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7538 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7539 0           $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7540            
7541 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7542 0           $self->{line_prev} = $self->{line};
7543 0           $self->{column_prev} = $self->{column};
7544 0           $self->{column}++;
7545 0           $self->{nc}
7546             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7547             } else {
7548 0           $self->{set_nc}->($self);
7549             }
7550            
7551 0           redo A;
7552             } elsif ($nc == 0x0022) { # "
7553             ## XML5: Same as "anything else".
7554 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7555 0           $self->{ca}->{value} = '';
7556 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7557            
7558 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7559 0           $self->{line_prev} = $self->{line};
7560 0           $self->{column_prev} = $self->{column};
7561 0           $self->{column}++;
7562 0           $self->{nc}
7563             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7564             } else {
7565 0           $self->{set_nc}->($self);
7566             }
7567            
7568 0           redo A;
7569             } elsif ($nc == 0x0027) { # '
7570             ## XML5: Same as "anything else".
7571 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7572 0           $self->{ca}->{value} = '';
7573 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7574            
7575 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7576 0           $self->{line_prev} = $self->{line};
7577 0           $self->{column_prev} = $self->{column};
7578 0           $self->{column}++;
7579 0           $self->{nc}
7580             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7581             } else {
7582 0           $self->{set_nc}->($self);
7583             }
7584            
7585 0           redo A;
7586             } elsif ($nc == 0x003E) { # >
7587             ## XML5: Same as "anything else".
7588 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7589 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7590            
7591 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7592 0           $self->{line_prev} = $self->{line};
7593 0           $self->{column_prev} = $self->{column};
7594 0           $self->{column}++;
7595 0           $self->{nc}
7596             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7597             } else {
7598 0           $self->{set_nc}->($self);
7599             }
7600            
7601 0           return ($self->{ct}); # ATTLIST
7602 0           redo A;
7603             } elsif ($nc == EOF_CHAR) {
7604             ## XML5: No parse error.
7605 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7606 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7607 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7608            
7609 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7610 0           $self->{line_prev} = $self->{line};
7611 0           $self->{column_prev} = $self->{column};
7612 0           $self->{column}++;
7613 0           $self->{nc}
7614             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7615             } else {
7616 0           $self->{set_nc}->($self);
7617             }
7618            
7619             ## Discard the current token.
7620 0           redo A;
7621             } else {
7622 0           $self->{ca}->{default} .= chr $nc;
7623             ## Stay in the state.
7624            
7625 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7626 0           $self->{line_prev} = $self->{line};
7627 0           $self->{column_prev} = $self->{column};
7628 0           $self->{column}++;
7629 0           $self->{nc}
7630             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7631             } else {
7632 0           $self->{set_nc}->($self);
7633             }
7634            
7635 0           redo A;
7636             }
7637             } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7638 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
7639             ## Stay in the state.
7640            
7641 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7642 0           $self->{line_prev} = $self->{line};
7643 0           $self->{column_prev} = $self->{column};
7644 0           $self->{column}++;
7645 0           $self->{nc}
7646             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7647             } else {
7648 0           $self->{set_nc}->($self);
7649             }
7650            
7651 0           redo A;
7652             } elsif ($nc == 0x0022) { # "
7653 0           $self->{ca}->{value} = '';
7654 0           $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7655            
7656 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7657 0           $self->{line_prev} = $self->{line};
7658 0           $self->{column_prev} = $self->{column};
7659 0           $self->{column}++;
7660 0           $self->{nc}
7661             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7662             } else {
7663 0           $self->{set_nc}->($self);
7664             }
7665            
7666 0           redo A;
7667             } elsif ($nc == 0x0027) { # '
7668 0           $self->{ca}->{value} = '';
7669 0           $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7670            
7671 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7672 0           $self->{line_prev} = $self->{line};
7673 0           $self->{column_prev} = $self->{column};
7674 0           $self->{column}++;
7675 0           $self->{nc}
7676             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7677             } else {
7678 0           $self->{set_nc}->($self);
7679             }
7680            
7681 0           redo A;
7682             } elsif ($nc == 0x003E) { # >
7683 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7684 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7685            
7686 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7687 0           $self->{line_prev} = $self->{line};
7688 0           $self->{column_prev} = $self->{column};
7689 0           $self->{column}++;
7690 0           $self->{nc}
7691             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7692             } else {
7693 0           $self->{set_nc}->($self);
7694             }
7695            
7696 0           return ($self->{ct}); # ATTLIST
7697 0           redo A;
7698             } elsif ($nc == EOF_CHAR) {
7699             ## XML5: No parse error.
7700 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7701 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7702 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7703            
7704 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7705 0           $self->{line_prev} = $self->{line};
7706 0           $self->{column_prev} = $self->{column};
7707 0           $self->{column}++;
7708 0           $self->{nc}
7709             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7710             } else {
7711 0           $self->{set_nc}->($self);
7712             }
7713            
7714             ## Discard the current token.
7715 0           redo A;
7716             } else {
7717             ## XML5: Not defined yet.
7718 0 0         if ($self->{ca}->{default} eq 'FIXED') {
7719 0           $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7720             } else {
7721 0           push @{$self->{ct}->{attrdefs}}, $self->{ca};
  0            
7722 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7723             }
7724             ## Reconsume.
7725 0           redo A;
7726             }
7727             } elsif ($state == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7728 0 0 0       if ($is_space->{$nc} or
      0        
7729             $nc == EOF_CHAR or
7730             $nc == 0x003E) { # >
7731 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7732             ## Reconsume.
7733 0           redo A;
7734             } else {
7735 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7736 0           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7737             ## Reconsume.
7738 0           redo A;
7739             }
7740             } elsif ($state == NDATA_STATE) {
7741             ## ASCII case-insensitive
7742 0 0 0       if ($nc == [
    0 0        
      0        
7743             undef,
7744             0x0044, # D
7745             0x0041, # A
7746             0x0054, # T
7747             NEVER_CHAR, # (A)
7748             ]->[length $self->{kwd}] or
7749             $nc == [
7750             undef,
7751             0x0064, # d
7752             0x0061, # a
7753             0x0074, # t
7754             NEVER_CHAR, # (a)
7755             ]->[length $self->{kwd}]) {
7756            
7757             ## Stay in the state.
7758 0           $self->{kwd} .= chr $nc;
7759            
7760 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7761 0           $self->{line_prev} = $self->{line};
7762 0           $self->{column_prev} = $self->{column};
7763 0           $self->{column}++;
7764 0           $self->{nc}
7765             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7766             } else {
7767 0           $self->{set_nc}->($self);
7768             }
7769            
7770 0           redo A;
7771             } elsif ((length $self->{kwd}) == 4 and
7772             ($nc == 0x0041 or # A
7773             $nc == 0x0061)) { # a
7774 0 0 0       if ($self->{kwd} ne 'NDAT' or $nc == 0x0061) { # a
7775            
7776 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7777             text => 'NDATA',
7778             line => $self->{line_prev},
7779             column => $self->{column_prev} - 4);
7780             } else {
7781            
7782             }
7783 0           $self->{state} = AFTER_NDATA_STATE;
7784            
7785 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7786 0           $self->{line_prev} = $self->{line};
7787 0           $self->{column_prev} = $self->{column};
7788 0           $self->{column}++;
7789 0           $self->{nc}
7790             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7791             } else {
7792 0           $self->{set_nc}->($self);
7793             }
7794            
7795 0           redo A;
7796             } else {
7797 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7798             line => $self->{line_prev},
7799             column => $self->{column_prev} + 1
7800             - length $self->{kwd});
7801            
7802 0           $self->{state} = BOGUS_MD_STATE;
7803             ## Reconsume.
7804 0           redo A;
7805             }
7806             } elsif ($state == AFTER_NDATA_STATE) {
7807 0 0         if ($is_space->{$nc}) {
    0          
    0          
7808 0           $self->{state} = BEFORE_NOTATION_NAME_STATE;
7809            
7810 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7811 0           $self->{line_prev} = $self->{line};
7812 0           $self->{column_prev} = $self->{column};
7813 0           $self->{column}++;
7814 0           $self->{nc}
7815             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7816             } else {
7817 0           $self->{set_nc}->($self);
7818             }
7819            
7820 0           redo A;
7821             } elsif ($nc == 0x003E) { # >
7822 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7823 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7824            
7825 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7826 0           $self->{line_prev} = $self->{line};
7827 0           $self->{column_prev} = $self->{column};
7828 0           $self->{column}++;
7829 0           $self->{nc}
7830             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7831             } else {
7832 0           $self->{set_nc}->($self);
7833             }
7834            
7835 0           return ($self->{ct}); # ENTITY
7836 0           redo A;
7837             } elsif ($nc == EOF_CHAR) {
7838 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7839 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7840            
7841 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7842 0           $self->{line_prev} = $self->{line};
7843 0           $self->{column_prev} = $self->{column};
7844 0           $self->{column}++;
7845 0           $self->{nc}
7846             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7847             } else {
7848 0           $self->{set_nc}->($self);
7849             }
7850            
7851             ## Discard the current token.
7852 0           redo A;
7853             } else {
7854 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7855             line => $self->{line_prev},
7856             column => $self->{column_prev} + 1
7857             - length $self->{kwd});
7858 0           $self->{state} = BOGUS_MD_STATE;
7859             ## Reconsume.
7860 0           redo A;
7861             }
7862             } elsif ($state == BEFORE_NOTATION_NAME_STATE) {
7863 0 0         if ($is_space->{$nc}) {
    0          
    0          
7864             ## Stay in the state.
7865            
7866 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7867 0           $self->{line_prev} = $self->{line};
7868 0           $self->{column_prev} = $self->{column};
7869 0           $self->{column}++;
7870 0           $self->{nc}
7871             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7872             } else {
7873 0           $self->{set_nc}->($self);
7874             }
7875            
7876 0           redo A;
7877             } elsif ($nc == 0x003E) { # >
7878 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7879 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7880            
7881 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7882 0           $self->{line_prev} = $self->{line};
7883 0           $self->{column_prev} = $self->{column};
7884 0           $self->{column}++;
7885 0           $self->{nc}
7886             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7887             } else {
7888 0           $self->{set_nc}->($self);
7889             }
7890            
7891 0           return ($self->{ct}); # ENTITY
7892 0           redo A;
7893             } elsif ($nc == EOF_CHAR) {
7894 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7895 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7896            
7897 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7898 0           $self->{line_prev} = $self->{line};
7899 0           $self->{column_prev} = $self->{column};
7900 0           $self->{column}++;
7901 0           $self->{nc}
7902             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7903             } else {
7904 0           $self->{set_nc}->($self);
7905             }
7906            
7907             ## Discard the current token.
7908 0           redo A;
7909             } else {
7910 0 0         if ($nc == 0x0000) {
7911 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7912             }
7913 0 0         $self->{ct}->{notation} = $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
7914 0           $self->{state} = NOTATION_NAME_STATE;
7915            
7916 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7917 0           $self->{line_prev} = $self->{line};
7918 0           $self->{column_prev} = $self->{column};
7919 0           $self->{column}++;
7920 0           $self->{nc}
7921             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7922             } else {
7923 0           $self->{set_nc}->($self);
7924             }
7925            
7926 0           redo A;
7927             }
7928             } elsif ($state == NOTATION_NAME_STATE) {
7929 0 0         if ($is_space->{$nc}) {
    0          
    0          
7930 0           $self->{state} = AFTER_MD_DEF_STATE;
7931            
7932 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7933 0           $self->{line_prev} = $self->{line};
7934 0           $self->{column_prev} = $self->{column};
7935 0           $self->{column}++;
7936 0           $self->{nc}
7937             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7938             } else {
7939 0           $self->{set_nc}->($self);
7940             }
7941            
7942 0           redo A;
7943             } elsif ($nc == 0x003E) { # >
7944 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7945            
7946 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7947 0           $self->{line_prev} = $self->{line};
7948 0           $self->{column_prev} = $self->{column};
7949 0           $self->{column}++;
7950 0           $self->{nc}
7951             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7952             } else {
7953 0           $self->{set_nc}->($self);
7954             }
7955            
7956 0           return ($self->{ct}); # ENTITY
7957 0           redo A;
7958             } elsif ($nc == EOF_CHAR) {
7959 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7960 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7961            
7962 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7963 0           $self->{line_prev} = $self->{line};
7964 0           $self->{column_prev} = $self->{column};
7965 0           $self->{column}++;
7966 0           $self->{nc}
7967             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7968             } else {
7969 0           $self->{set_nc}->($self);
7970             }
7971            
7972             ## Discard the current token.
7973 0           redo A;
7974             } else {
7975 0 0         if ($nc == 0x0000) {
7976 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7977             }
7978 0 0         $self->{ct}->{notation} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
7979             ## Stay in the state.
7980            
7981 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7982 0           $self->{line_prev} = $self->{line};
7983 0           $self->{column_prev} = $self->{column};
7984 0           $self->{column}++;
7985 0           $self->{nc}
7986             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7987             } else {
7988 0           $self->{set_nc}->($self);
7989             }
7990            
7991 0           redo A;
7992             }
7993             } elsif ($state == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7994 0 0         if ($nc == 0x0022) { # "
    0          
    0          
7995 0           $self->{state} = AFTER_MD_DEF_STATE;
7996            
7997 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7998 0           $self->{line_prev} = $self->{line};
7999 0           $self->{column_prev} = $self->{column};
8000 0           $self->{column}++;
8001 0           $self->{nc}
8002             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8003             } else {
8004 0           $self->{set_nc}->($self);
8005             }
8006            
8007 0           redo A;
8008             } elsif ($nc == 0x0026) { # &
8009 0           $self->{prev_state} = $state;
8010 0           $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8011 0           $self->{entity_add} = 0x0022; # "
8012            
8013 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8014 0           $self->{line_prev} = $self->{line};
8015 0           $self->{column_prev} = $self->{column};
8016 0           $self->{column}++;
8017 0           $self->{nc}
8018             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8019             } else {
8020 0           $self->{set_nc}->($self);
8021             }
8022            
8023 0           redo A;
8024             ## TODO: %
8025             } elsif ($nc == EOF_CHAR) {
8026 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8027 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8028             ## Reconsume.
8029             ## Discard the current token.
8030 0           redo A;
8031             } else {
8032 0 0         if ($nc == 0x0000) {
8033 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8034             }
8035 0 0         $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
8036            
8037 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8038 0           $self->{line_prev} = $self->{line};
8039 0           $self->{column_prev} = $self->{column};
8040 0           $self->{column}++;
8041 0           $self->{nc}
8042             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8043             } else {
8044 0           $self->{set_nc}->($self);
8045             }
8046            
8047 0           redo A;
8048             }
8049             } elsif ($state == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8050 0 0         if ($nc == 0x0027) { # '
    0          
    0          
8051 0           $self->{state} = AFTER_MD_DEF_STATE;
8052            
8053 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8054 0           $self->{line_prev} = $self->{line};
8055 0           $self->{column_prev} = $self->{column};
8056 0           $self->{column}++;
8057 0           $self->{nc}
8058             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8059             } else {
8060 0           $self->{set_nc}->($self);
8061             }
8062            
8063 0           redo A;
8064             } elsif ($nc == 0x0026) { # &
8065 0           $self->{prev_state} = $state;
8066 0           $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8067 0           $self->{entity_add} = 0x0027; # '
8068            
8069 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8070 0           $self->{line_prev} = $self->{line};
8071 0           $self->{column_prev} = $self->{column};
8072 0           $self->{column}++;
8073 0           $self->{nc}
8074             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8075             } else {
8076 0           $self->{set_nc}->($self);
8077             }
8078            
8079 0           redo A;
8080             ## TODO: %
8081             } elsif ($nc == EOF_CHAR) {
8082 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8083 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8084             ## Reconsume.
8085             ## Discard the current token.
8086 0           redo A;
8087             } else {
8088 0 0         if ($nc == 0x0000) {
8089 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8090             }
8091 0 0         $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
8092            
8093 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094 0           $self->{line_prev} = $self->{line};
8095 0           $self->{column_prev} = $self->{column};
8096 0           $self->{column}++;
8097 0           $self->{nc}
8098             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099             } else {
8100 0           $self->{set_nc}->($self);
8101             }
8102            
8103 0           redo A;
8104             }
8105             } elsif ($state == ENTITY_VALUE_ENTITY_STATE) {
8106 0 0 0       if ($is_space->{$nc} or
    0          
8107             {
8108             0x003C => 1, 0x0026 => 1, (EOF_CHAR) => 1, # <, &
8109             $self->{entity_add} => 1,
8110             }->{$nc}) {
8111 0 0         $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8112             line => $self->{line_prev},
8113             column => $self->{column_prev}
8114             + ($nc == EOF_CHAR ? 1 : 0));
8115             ## Don't consume
8116             ## Return nothing.
8117             #
8118             } elsif ($nc == 0x0023) { # #
8119 0           $self->{ca} = $self->{ct};
8120 0           $self->{state} = ENTITY_HASH_STATE;
8121 0           $self->{kwd} = '#';
8122            
8123 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8124 0           $self->{line_prev} = $self->{line};
8125 0           $self->{column_prev} = $self->{column};
8126 0           $self->{column}++;
8127 0           $self->{nc}
8128             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8129             } else {
8130 0           $self->{set_nc}->($self);
8131             }
8132            
8133 0           redo A;
8134             } else {
8135             #
8136             }
8137              
8138 0           $self->{ct}->{value} .= '&';
8139 0           $self->{state} = $self->{prev_state};
8140             ## Reconsume.
8141 0           redo A;
8142             } elsif ($state == AFTER_ELEMENT_NAME_STATE) {
8143 0 0         if ($is_space->{$nc}) {
    0          
    0          
    0          
8144 0           $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8145            
8146 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8147 0           $self->{line_prev} = $self->{line};
8148 0           $self->{column_prev} = $self->{column};
8149 0           $self->{column}++;
8150 0           $self->{nc}
8151             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8152             } else {
8153 0           $self->{set_nc}->($self);
8154             }
8155            
8156 0           redo A;
8157             } elsif ($nc == 0x0028) { # (
8158 0           $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8159 0           $self->{ct}->{content} = ['('];
8160 0           $self->{group_depth} = 1;
8161            
8162 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8163 0           $self->{line_prev} = $self->{line};
8164 0           $self->{column_prev} = $self->{column};
8165 0           $self->{column}++;
8166 0           $self->{nc}
8167             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8168             } else {
8169 0           $self->{set_nc}->($self);
8170             }
8171            
8172 0           redo A;
8173             } elsif ($nc == 0x003E) { # >
8174 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8175 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8176            
8177 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178 0           $self->{line_prev} = $self->{line};
8179 0           $self->{column_prev} = $self->{column};
8180 0           $self->{column}++;
8181 0           $self->{nc}
8182             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183             } else {
8184 0           $self->{set_nc}->($self);
8185             }
8186            
8187 0           return ($self->{ct}); # ELEMENT
8188 0           redo A;
8189             } elsif ($nc == EOF_CHAR) {
8190 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8191 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8192            
8193 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8194 0           $self->{line_prev} = $self->{line};
8195 0           $self->{column_prev} = $self->{column};
8196 0           $self->{column}++;
8197 0           $self->{nc}
8198             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8199             } else {
8200 0           $self->{set_nc}->($self);
8201             }
8202            
8203             ## Discard the current token.
8204 0           redo A;
8205             } else {
8206 0 0         if ($nc == 0x0000) {
8207 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8208             }
8209 0 0         $self->{ct}->{content} = [$nc == 0x0000 ? "\x{FFFD}" : chr $nc];
8210 0           $self->{state} = CONTENT_KEYWORD_STATE;
8211            
8212 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8213 0           $self->{line_prev} = $self->{line};
8214 0           $self->{column_prev} = $self->{column};
8215 0           $self->{column}++;
8216 0           $self->{nc}
8217             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8218             } else {
8219 0           $self->{set_nc}->($self);
8220             }
8221            
8222 0           redo A;
8223             }
8224             } elsif ($state == CONTENT_KEYWORD_STATE) {
8225 0 0         if ($is_space->{$nc}) {
    0          
    0          
8226 0           $self->{state} = AFTER_MD_DEF_STATE;
8227            
8228 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8229 0           $self->{line_prev} = $self->{line};
8230 0           $self->{column_prev} = $self->{column};
8231 0           $self->{column}++;
8232 0           $self->{nc}
8233             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8234             } else {
8235 0           $self->{set_nc}->($self);
8236             }
8237            
8238 0           redo A;
8239             } elsif ($nc == 0x003E) { # >
8240 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8241            
8242 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8243 0           $self->{line_prev} = $self->{line};
8244 0           $self->{column_prev} = $self->{column};
8245 0           $self->{column}++;
8246 0           $self->{nc}
8247             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8248             } else {
8249 0           $self->{set_nc}->($self);
8250             }
8251            
8252 0           return ($self->{ct}); # ELEMENT
8253 0           redo A;
8254             } elsif ($nc == EOF_CHAR) {
8255 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8256 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8257            
8258 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8259 0           $self->{line_prev} = $self->{line};
8260 0           $self->{column_prev} = $self->{column};
8261 0           $self->{column}++;
8262 0           $self->{nc}
8263             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8264             } else {
8265 0           $self->{set_nc}->($self);
8266             }
8267            
8268             ## Discard the current token.
8269 0           redo A;
8270             } else {
8271 0 0         if ($nc == 0x0000) {
8272 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8273             }
8274 0 0         $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ELEMENT
8275             ## Stay in the state.
8276            
8277 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8278 0           $self->{line_prev} = $self->{line};
8279 0           $self->{column_prev} = $self->{column};
8280 0           $self->{column}++;
8281 0           $self->{nc}
8282             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8283             } else {
8284 0           $self->{set_nc}->($self);
8285             }
8286            
8287 0           redo A;
8288             }
8289             } elsif ($state == AFTER_CM_GROUP_OPEN_STATE) {
8290 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
    0          
8291             ## Stay in the state.
8292            
8293 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8294 0           $self->{line_prev} = $self->{line};
8295 0           $self->{column_prev} = $self->{column};
8296 0           $self->{column}++;
8297 0           $self->{nc}
8298             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8299             } else {
8300 0           $self->{set_nc}->($self);
8301             }
8302            
8303 0           redo A;
8304             } elsif ($nc == 0x0028) { # (
8305 0           $self->{group_depth}++;
8306 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8307             ## Stay in the state.
8308            
8309 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8310 0           $self->{line_prev} = $self->{line};
8311 0           $self->{column_prev} = $self->{column};
8312 0           $self->{column}++;
8313 0           $self->{nc}
8314             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8315             } else {
8316 0           $self->{set_nc}->($self);
8317             }
8318            
8319 0           redo A;
8320             } elsif ($nc == 0x007C or # |
8321             $nc == 0x002C) { # ,
8322 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8323             ## Stay in the state.
8324            
8325 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8326 0           $self->{line_prev} = $self->{line};
8327 0           $self->{column_prev} = $self->{column};
8328 0           $self->{column}++;
8329 0           $self->{nc}
8330             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8331             } else {
8332 0           $self->{set_nc}->($self);
8333             }
8334            
8335 0           redo A;
8336             } elsif ($nc == 0x0029) { # )
8337 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8338 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8339 0           $self->{group_depth}--;
8340 0           $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8341            
8342 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8343 0           $self->{line_prev} = $self->{line};
8344 0           $self->{column_prev} = $self->{column};
8345 0           $self->{column}++;
8346 0           $self->{nc}
8347             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8348             } else {
8349 0           $self->{set_nc}->($self);
8350             }
8351            
8352 0           redo A;
8353             } elsif ($nc == 0x003E) { # >
8354 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8355 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8356 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8357            
8358 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8359 0           $self->{line_prev} = $self->{line};
8360 0           $self->{column_prev} = $self->{column};
8361 0           $self->{column}++;
8362 0           $self->{nc}
8363             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8364             } else {
8365 0           $self->{set_nc}->($self);
8366             }
8367            
8368 0           return ($self->{ct}); # ELEMENT
8369 0           redo A;
8370             } elsif ($nc == EOF_CHAR) {
8371 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8372             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8373 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8374            
8375 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8376 0           $self->{line_prev} = $self->{line};
8377 0           $self->{column_prev} = $self->{column};
8378 0           $self->{column}++;
8379 0           $self->{nc}
8380             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8381             } else {
8382 0           $self->{set_nc}->($self);
8383             }
8384            
8385             ## Discard the current token.
8386 0           redo A;
8387             } else {
8388 0 0         if ($nc == 0x0000) {
8389 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8390             }
8391 0 0         push @{$self->{ct}->{content}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
  0            
8392 0           $self->{state} = CM_ELEMENT_NAME_STATE;
8393            
8394 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8395 0           $self->{line_prev} = $self->{line};
8396 0           $self->{column_prev} = $self->{column};
8397 0           $self->{column}++;
8398 0           $self->{nc}
8399             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8400             } else {
8401 0           $self->{set_nc}->($self);
8402             }
8403            
8404 0           redo A;
8405             }
8406             } elsif ($state == CM_ELEMENT_NAME_STATE) {
8407 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0 0        
    0          
    0          
    0          
8408 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8409            
8410 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8411 0           $self->{line_prev} = $self->{line};
8412 0           $self->{column_prev} = $self->{column};
8413 0           $self->{column}++;
8414 0           $self->{nc}
8415             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8416             } else {
8417 0           $self->{set_nc}->($self);
8418             }
8419            
8420 0           redo A;
8421             } elsif ($nc == 0x002A or # *
8422             $nc == 0x002B or # +
8423             $nc == 0x003F) { # ?
8424 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8425 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8426            
8427 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8428 0           $self->{line_prev} = $self->{line};
8429 0           $self->{column_prev} = $self->{column};
8430 0           $self->{column}++;
8431 0           $self->{nc}
8432             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8433             } else {
8434 0           $self->{set_nc}->($self);
8435             }
8436            
8437 0           redo A;
8438             } elsif ($nc == 0x007C or # |
8439             $nc == 0x002C) { # ,
8440 0 0         push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', ';
  0            
8441 0           $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8442            
8443 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8444 0           $self->{line_prev} = $self->{line};
8445 0           $self->{column_prev} = $self->{column};
8446 0           $self->{column}++;
8447 0           $self->{nc}
8448             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8449             } else {
8450 0           $self->{set_nc}->($self);
8451             }
8452            
8453 0           redo A;
8454             } elsif ($nc == 0x0029) { # )
8455 0           $self->{group_depth}--;
8456 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8457 0           $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8458            
8459 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8460 0           $self->{line_prev} = $self->{line};
8461 0           $self->{column_prev} = $self->{column};
8462 0           $self->{column}++;
8463 0           $self->{nc}
8464             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8465             } else {
8466 0           $self->{set_nc}->($self);
8467             }
8468            
8469 0           redo A;
8470             } elsif ($nc == 0x003E) { # >
8471 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8472 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8473 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8474            
8475 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8476 0           $self->{line_prev} = $self->{line};
8477 0           $self->{column_prev} = $self->{column};
8478 0           $self->{column}++;
8479 0           $self->{nc}
8480             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8481             } else {
8482 0           $self->{set_nc}->($self);
8483             }
8484            
8485 0           return ($self->{ct}); # ELEMENT
8486 0           redo A;
8487             } elsif ($nc == EOF_CHAR) {
8488 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8489             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8490 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8491            
8492 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8493 0           $self->{line_prev} = $self->{line};
8494 0           $self->{column_prev} = $self->{column};
8495 0           $self->{column}++;
8496 0           $self->{nc}
8497             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8498             } else {
8499 0           $self->{set_nc}->($self);
8500             }
8501            
8502             ## Discard the token.
8503 0           redo A;
8504             } else {
8505 0 0         if ($nc == 0x0000) {
8506 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8507             }
8508 0 0         $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
8509             ## Stay in the state.
8510            
8511 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8512 0           $self->{line_prev} = $self->{line};
8513 0           $self->{column_prev} = $self->{column};
8514 0           $self->{column}++;
8515 0           $self->{nc}
8516             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8517             } else {
8518 0           $self->{set_nc}->($self);
8519             }
8520            
8521 0           redo A;
8522             }
8523             } elsif ($state == AFTER_CM_ELEMENT_NAME_STATE) {
8524 0 0 0       if ($is_space->{$nc}) {
    0          
    0          
    0          
    0          
8525             ## Stay in the state.
8526            
8527 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8528 0           $self->{line_prev} = $self->{line};
8529 0           $self->{column_prev} = $self->{column};
8530 0           $self->{column}++;
8531 0           $self->{nc}
8532             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8533             } else {
8534 0           $self->{set_nc}->($self);
8535             }
8536            
8537 0           redo A;
8538             } elsif ($nc == 0x007C or # |
8539             $nc == 0x002C) { # ,
8540 0 0         push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', ';
  0            
8541 0           $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8542            
8543 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8544 0           $self->{line_prev} = $self->{line};
8545 0           $self->{column_prev} = $self->{column};
8546 0           $self->{column}++;
8547 0           $self->{nc}
8548             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8549             } else {
8550 0           $self->{set_nc}->($self);
8551             }
8552            
8553 0           redo A;
8554             } elsif ($nc == 0x0029) { # )
8555 0           $self->{group_depth}--;
8556 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8557 0           $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8558            
8559 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8560 0           $self->{line_prev} = $self->{line};
8561 0           $self->{column_prev} = $self->{column};
8562 0           $self->{column}++;
8563 0           $self->{nc}
8564             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8565             } else {
8566 0           $self->{set_nc}->($self);
8567             }
8568            
8569 0           redo A;
8570             } elsif ($nc == 0x003E) { # >
8571 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8572 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8573 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8574            
8575 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8576 0           $self->{line_prev} = $self->{line};
8577 0           $self->{column_prev} = $self->{column};
8578 0           $self->{column}++;
8579 0           $self->{nc}
8580             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8581             } else {
8582 0           $self->{set_nc}->($self);
8583             }
8584            
8585 0           return ($self->{ct}); # ELEMENT
8586 0           redo A;
8587             } elsif ($nc == EOF_CHAR) {
8588 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8589             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8590 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8591            
8592 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8593 0           $self->{line_prev} = $self->{line};
8594 0           $self->{column_prev} = $self->{column};
8595 0           $self->{column}++;
8596 0           $self->{nc}
8597             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8598             } else {
8599 0           $self->{set_nc}->($self);
8600             }
8601            
8602             ## Discard the current token.
8603 0           redo A;
8604             } else {
8605 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8606 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8607 0           $self->{state} = BOGUS_MD_STATE;
8608            
8609 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8610 0           $self->{line_prev} = $self->{line};
8611 0           $self->{column_prev} = $self->{column};
8612 0           $self->{column}++;
8613 0           $self->{nc}
8614             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8615             } else {
8616 0           $self->{set_nc}->($self);
8617             }
8618            
8619 0           redo A;
8620             }
8621             } elsif ($state == AFTER_CM_GROUP_CLOSE_STATE) {
8622 0 0 0       if ($is_space->{$nc}) {
    0 0        
    0          
    0          
    0          
8623 0 0         if ($self->{group_depth}) {
8624 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8625             } else {
8626 0           $self->{state} = AFTER_MD_DEF_STATE;
8627             }
8628            
8629 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8630 0           $self->{line_prev} = $self->{line};
8631 0           $self->{column_prev} = $self->{column};
8632 0           $self->{column}++;
8633 0           $self->{nc}
8634             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8635             } else {
8636 0           $self->{set_nc}->($self);
8637             }
8638            
8639 0           redo A;
8640             } elsif ($nc == 0x002A or # *
8641             $nc == 0x002B or # +
8642             $nc == 0x003F) { # ?
8643 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8644 0 0         if ($self->{group_depth}) {
8645 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8646             } else {
8647 0           $self->{state} = AFTER_MD_DEF_STATE;
8648             }
8649            
8650 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8651 0           $self->{line_prev} = $self->{line};
8652 0           $self->{column_prev} = $self->{column};
8653 0           $self->{column}++;
8654 0           $self->{nc}
8655             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8656             } else {
8657 0           $self->{set_nc}->($self);
8658             }
8659            
8660 0           redo A;
8661             } elsif ($nc == 0x0029) { # )
8662 0 0         if ($self->{group_depth}) {
8663 0           $self->{group_depth}--;
8664 0           push @{$self->{ct}->{content}}, chr $nc;
  0            
8665             ## Stay in the state.
8666            
8667 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8668 0           $self->{line_prev} = $self->{line};
8669 0           $self->{column_prev} = $self->{column};
8670 0           $self->{column}++;
8671 0           $self->{nc}
8672             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8673             } else {
8674 0           $self->{set_nc}->($self);
8675             }
8676            
8677 0           redo A;
8678             } else {
8679 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8680 0           $self->{state} = BOGUS_MD_STATE;
8681             ## Reconsume.
8682 0           redo A;
8683             }
8684             } elsif ($nc == 0x003E) { # >
8685 0 0         if ($self->{group_depth}) {
8686 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8687 0           push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
  0            
8688             }
8689 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8690            
8691 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8692 0           $self->{line_prev} = $self->{line};
8693 0           $self->{column_prev} = $self->{column};
8694 0           $self->{column}++;
8695 0           $self->{nc}
8696             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8697             } else {
8698 0           $self->{set_nc}->($self);
8699             }
8700            
8701 0           return ($self->{ct}); # ELEMENT
8702 0           redo A;
8703             } elsif ($nc == EOF_CHAR) {
8704 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8705             #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8706 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8707            
8708 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8709 0           $self->{line_prev} = $self->{line};
8710 0           $self->{column_prev} = $self->{column};
8711 0           $self->{column}++;
8712 0           $self->{nc}
8713             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8714             } else {
8715 0           $self->{set_nc}->($self);
8716             }
8717            
8718             ## Discard the current token.
8719 0           redo A;
8720             } else {
8721 0 0         if ($self->{group_depth}) {
8722 0           $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8723             } else {
8724 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8725 0           $self->{state} = BOGUS_MD_STATE;
8726             }
8727             ## Reconsume.
8728 0           redo A;
8729             }
8730             } elsif ($state == AFTER_MD_DEF_STATE) {
8731 0 0         if ($is_space->{$nc}) {
    0          
    0          
8732             ## Stay in the state.
8733            
8734 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8735 0           $self->{line_prev} = $self->{line};
8736 0           $self->{column_prev} = $self->{column};
8737 0           $self->{column}++;
8738 0           $self->{nc}
8739             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8740             } else {
8741 0           $self->{set_nc}->($self);
8742             }
8743            
8744 0           redo A;
8745             } elsif ($nc == 0x003E) { # >
8746 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8747            
8748 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8749 0           $self->{line_prev} = $self->{line};
8750 0           $self->{column_prev} = $self->{column};
8751 0           $self->{column}++;
8752 0           $self->{nc}
8753             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8754             } else {
8755 0           $self->{set_nc}->($self);
8756             }
8757            
8758 0           return ($self->{ct}); # ENTITY/ELEMENT
8759 0           redo A;
8760             } elsif ($nc == EOF_CHAR) {
8761 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8762 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8763            
8764 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8765 0           $self->{line_prev} = $self->{line};
8766 0           $self->{column_prev} = $self->{column};
8767 0           $self->{column}++;
8768 0           $self->{nc}
8769             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8770             } else {
8771 0           $self->{set_nc}->($self);
8772             }
8773            
8774             ## Discard the current token.
8775 0           redo A;
8776             } else {
8777 0           $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8778 0           $self->{state} = BOGUS_MD_STATE;
8779             ## Reconsume.
8780 0           redo A;
8781             }
8782             } elsif ($state == BOGUS_MD_STATE) {
8783 0 0         if ($nc == 0x003E) { # >
    0          
8784 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8785            
8786 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8787 0           $self->{line_prev} = $self->{line};
8788 0           $self->{column_prev} = $self->{column};
8789 0           $self->{column}++;
8790 0           $self->{nc}
8791             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8792             } else {
8793 0           $self->{set_nc}->($self);
8794             }
8795            
8796 0           return ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8797 0           redo A;
8798             } elsif ($nc == EOF_CHAR) {
8799 0           $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8800             ## Reconsume.
8801             ## Discard the current token.
8802 0           redo A;
8803             } else {
8804             ## Stay in the state.
8805            
8806 0 0         if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8807 0           $self->{line_prev} = $self->{line};
8808 0           $self->{column_prev} = $self->{column};
8809 0           $self->{column}++;
8810 0           $self->{nc}
8811             = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8812             } else {
8813 0           $self->{set_nc}->($self);
8814             }
8815            
8816 0           redo A;
8817             }
8818             } else {
8819 0           die "$0: $state: Unknown state";
8820             }
8821             } # A
8822              
8823 0           die "$0: _get_next_token: unexpected case";
8824             } # _get_next_token
8825              
8826             1;
8827              
8828             # Copyright 2007-2011 Wakaba <w@suika.fam.cx>.
8829             #
8830             # This library is free software; you can redistribute it and/or modify
8831             # it under the same terms as Perl itself.
8832