File Coverage

blib/lib/Pod/Simple/BlackBox.pm
Criterion Covered Total %
statement 891 1101 80.9
branch 398 568 70.0
condition 136 196 69.3
subroutine 72 75 96.0
pod 0 8 0.0
total 1497 1948 76.8


line stmt bran cond sub pod time code
1             package Pod::Simple::BlackBox;
2             #
3             # "What's in the box?" "Pain."
4             #
5             ###########################################################################
6             #
7             # This is where all the scary things happen: parsing lines into
8             # paragraphs; and then into directives, verbatims, and then also
9             # turning formatting sequences into treelets.
10             #
11             # Are you really sure you want to read this code?
12             #
13             #-----------------------------------------------------------------------------
14             #
15             # The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16             # of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17             # to call the proper callbacks on the treelets.
18             #
19             # Every node in a treelet is a ['name', {attrhash}, ...children...]
20              
21 70     70   297885 use integer; # vroom!
  70         164  
  70         463  
22 70     70   2870 use strict;
  70         132  
  70         1479  
23 70     70   384 use warnings;
  70         166  
  70         3166  
24 70     70   399 use Carp ();
  70         132  
  70         6764  
25             our $VERSION = '3.47';
26             #use constant DEBUG => 7;
27              
28             sub my_qr ($$) {
29              
30             # $1 is a pattern to compile and return. Older perls compile any
31             # syntactically valid property, even if it isn't legal. To cope with
32             # this, return an empty string unless the compiled pattern also
33             # successfully matches $2, which the caller furnishes.
34              
35 642     642 0 2105 my ($input_re, $should_match) = @_;
36             # XXX could have a third parameter $shouldnt_match for extra safety
37              
38 70 50   70   490 my $use_utf8 = do { no integer; $] <= 5.006002 } ? 'use utf8;' : "";
  70         176  
  70         453  
  642         1528  
  642         2328  
39              
40 70     70   717 my $re = eval "no warnings; $use_utf8 qr/$input_re/";
  70     70   150  
  70     70   5591  
  70     70   501  
  70     70   189  
  70     70   4185  
  70     70   501  
  70     70   192  
  70     70   3942  
  70     3   508  
  70     3   141  
  70     3   4043  
  70     3   429  
  70         163  
  70         4304  
  70         697  
  70         376  
  70         4996  
  70         466  
  70         137  
  70         6602  
  70         509  
  70         141  
  70         3806  
  70         554  
  70         168  
  70         6956  
  3         34  
  3         5  
  3         623  
  3         14  
  3         4  
  3         147  
  3         13  
  3         5  
  3         127  
  3         11  
  3         4  
  3         118  
  642         52074  
41             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
42 642 50       3001 return "" if $@;
43              
44 70     70   499 my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
  70     70   146  
  70     70   3801  
  70     70   492  
  70     70   197  
  70     70   4374  
  70     70   518  
  70     70   164  
  70     70   4008  
  70     3   420  
  70     3   157  
  70     3   3885  
  70     3   455  
  70         153  
  70         4541  
  70         503  
  70         141  
  70         4708  
  70         483  
  70         195  
  70         6233  
  70         426  
  70         163  
  70         3906  
  70         555  
  70         169  
  70         5374  
  3         16  
  3         5  
  3         347  
  3         12  
  3         6  
  3         189  
  3         12  
  3         4  
  3         127  
  3         18  
  3         5  
  3         199  
  642         51593  
45             #print STDERR __LINE__, ": $input_re: $@\n" if $@;
46 642 50       3089 return "" if $@;
47              
48             #print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49 642 50       2747 return $re if $matches;
50              
51             #print STDERR __LINE__, ": $re: didn't match\n";
52 0         0 return "";
53             }
54              
55             BEGIN {
56 70     70   15588 require Pod::Simple;
57 70 50       7893 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58             }
59              
60             # Matches a character iff the character will have a different meaning
61             # if we choose CP1252 vs UTF-8 if there is no =encoding line.
62             # This is broken for early Perls on non-ASCII platforms.
63             my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64             $non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65              
66             # Use patterns understandable by Perl 5.6, if possible
67 70     70   536 my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
  70         181  
  70         22063  
68             my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69             # to get assigned
70             my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71             "\x{250}");
72             $rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73              
74 70     70   476 my $script_run_re = eval 'no warnings "experimental::script_run";
  70         170  
  70         3916  
75             qr/(*script_run: ^ .* $ )/x';
76             my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77             unless ($latin_re) {
78             # This was machine generated to be the ranges of the union of the above
79             # three properties, with things that were undefined by Unicode 4.1 filling
80             # gaps. That is the version in use when Perl advanced enough to
81             # successfully compile and execute the above pattern.
82             $latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83             }
84              
85             my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86              
87             # Latin script code points not in the first release of Unicode
88             my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89              
90             # If this perl doesn't have the Deprecated property, there's only one code
91             # point in it that we need be concerned with.
92             my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93             $deprecated_re = qr/\x{149}/ unless $deprecated_re;
94              
95             my $utf8_bom;
96 70     70   546 if ( do { no integer; "$]" >= 5.007_003 }) {
  70         199  
  70         390  
97             $utf8_bom = "\x{FEFF}";
98             utf8::encode($utf8_bom);
99             } else {
100             $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101             }
102              
103             # This is used so that the 'content_seen' method doesn't return true on a
104             # file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105             # there is a valid =foo line will we return that content was seen.
106             my $seen_legal_directive = 0;
107              
108             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109              
110 0     0 0 0 sub parse_line { shift->parse_lines(@_) } # alias
111              
112             # - - - Turn back now! Run away! - - -
113              
114             sub parse_lines { # Usage: $parser->parse_lines(@lines)
115             # an undef means end-of-stream
116 9584     9584 0 15440 my $self = shift;
117              
118 9584         17391 my $code_handler = $self->{'code_handler'};
119 9584         15602 my $cut_handler = $self->{'cut_handler'};
120 9584         16743 my $wl_handler = $self->{'whiteline_handler'};
121 9584   100     23802 $self->{'line_count'} ||= 0;
122              
123 9584         13412 my $scratch;
124              
125             DEBUG > 4 and
126 9584         12922 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127              
128 9584         12862 DEBUG > 5 and
129             print STDERR "# About to parse lines: ",
130             join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131              
132 9584   100     23609 my $paras = ($self->{'paras'} ||= []);
133             # paragraph buffer. Because we need to defer processing of =over
134             # directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135             # to process this.
136              
137 9584   100     23846 $self->{'pod_para_count'} ||= 0;
138              
139             # An attempt to match the pod portions of a line. This is not fool proof,
140             # but is good enough to serve as part of the heuristic for guessing the pod
141             # encoding if not specified.
142 9584         15798 my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
  101347         218681  
  9584         81781  
143 9584         75087 my $pod_chars_re = qr/ ^ = [A-Za-z]+ | [\Q$codes\E] < /x;
144              
145 9584         15960 my $line;
146 9584         20576 foreach my $source_line (@_) {
147 19136 50       42091 if( $self->{'source_dead'} ) {
148 0         0 DEBUG > 4 and print STDERR "# Source is dead.\n";
149 0         0 last;
150             }
151              
152 19136 100       42455 unless( defined $source_line ) {
153 903         1327 DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154              
155 903         3398 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156 903         2464 push @$paras, $paras->[-1], $paras->[-1];
157             # So that it definitely fills the buffer.
158 903         2024 $self->{'source_dead'} = 1;
159 903         2667 $self->_ponder_paragraph_buffer;
160 903         1951 next;
161             }
162              
163              
164 18233 100       37778 if( $self->{'line_count'}++ ) {
165 17334         40606 ($line = $source_line) =~ tr/\n\r//d;
166             # If we don't have two vars, we'll end up with that there
167             # tr/// modding the (potentially read-only) original source line!
168              
169             } else {
170 899         1303 DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171              
172 899 50       7391 if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
    50          
    50          
173 0         0 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174 0         0 $self->_handle_encoding_line( "=encoding utf8" );
175 0         0 delete $self->{'_processed_encoding'};
176 0         0 $line =~ tr/\n\r//d;
177              
178             } elsif( $line =~ s/^\xFE\xFF//s ) {
179 0         0 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180             $self->scream(
181 0         0 $self->{'line_count'},
182             "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183             );
184 0         0 splice @_;
185 0         0 push @_, undef;
186 0         0 next;
187              
188             # TODO: implement somehow?
189              
190             } elsif( $line =~ s/^\xFF\xFE//s ) {
191 0         0 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192             $self->scream(
193 0         0 $self->{'line_count'},
194             "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195             );
196 0         0 splice @_;
197 0         0 push @_, undef;
198 0         0 next;
199              
200             # TODO: implement somehow?
201              
202             } else {
203 899         1430 DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204 899         2431 ($line = $source_line) =~ tr/\n\r//d;
205             }
206             }
207              
208 18233 100 100     158609 if(!$self->{'parse_characters'} && !$self->{'encoding'}
      100        
      100        
      100        
209             && ($self->{'in_pod'} || $line =~ /^=/s)
210             && $line =~ /$non_ascii_re/
211             ) {
212              
213 21         45 my $encoding;
214              
215             # No =encoding line, and we are at the first pod line in the input that
216             # contains a non-ascii byte, that is, one whose meaning varies depending
217             # on whether the file is encoded in UTF-8 or CP1252, which are the two
218             # possibilities permitted by the pod spec. (ASCII is assumed if the
219             # file only contains ASCII bytes.) In order to process this line, we
220             # need to figure out what encoding we will use for the file.
221             #
222             # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223             # 160-255, but it is used here, as it often colloquially is, to refer to
224             # the complete set of code points 0-255, including ASCII (0-127), the C1
225             # controls (128-159), and strict Latin 1 (160-255).
226             #
227             # CP1252 is effectively a superset of Latin 1, because it differs only
228             # from colloquial 8859-1 in the C1 controls, which are very unlikely to
229             # actually be present in 8859-1 files, so can be used for other purposes
230             # without conflict. CP 1252 uses most of them for graphic characters.
231             #
232             # Note that all ASCII-range bytes represent their corresponding code
233             # points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234             # code points require multiple (non-ASCII) bytes to represent. (A
235             # separate paragraph for EBCDIC is below.) The multi-byte
236             # representation is quite structured. If we find an isolated byte that
237             # would require multiple bytes to represent in UTF-8, we know that the
238             # encoding is not UTF-8. If we find a sequence of bytes that violates
239             # the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240             # hence must be 1252.
241             #
242             # But there are ambiguous cases where we could guess wrong. If so, the
243             # user will end up having to supply an =encoding line. We use all
244             # readily available information to improve our chances of guessing
245             # right. The odds of something not being UTF-8, but still passing a
246             # UTF-8 validity test go down very rapidly with increasing length of the
247             # sequence. Therefore we look at all non-ascii sequences on the line.
248             # If any of the sequences can't be UTF-8, we quit there and choose
249             # CP1252. If all could be UTF-8, we see if any of the code points
250             # represented are unlikely to be in pod. If so, we guess CP1252. If
251             # not, we check if the line is all in the same script; if not guess
252             # CP1252; otherwise UTF-8. For perls that don't have convenient script
253             # run testing, see if there is both Latin and non-Latin. If so, CP1252,
254             # otherwise UTF-8.
255             #
256             # On EBCDIC platforms, the situation is somewhat different. In
257             # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258             # but so do the bytes that are for the C1 controls. Recall that these
259             # correspond to the unused portion of 8859-1 that 1252 mostly takes
260             # over. That means that there are fewer code points that are
261             # represented by multi-bytes. But, note that the these controls are
262             # very unlikely to be in pod text. So if we encounter one of them, it
263             # means that it is quite likely CP1252 and not UTF-8. The net result is
264             # the same code below is used for both platforms.
265             #
266             # XXX probably if the line has E that evaluates to illegal CP1252,
267             # then it is UTF-8. But we haven't processed E<> yet.
268              
269 70 50   70   68261 goto set_1252 if do { no integer; "$]" < 5.006_000 }; # No UTF-8 on very early perls
  70         150  
  70         344  
  21         32  
  21         184  
270              
271 21         42 my $copy;
272              
273 70     70   4530 no warnings 'utf8';
  70         140  
  70         4042  
274              
275 70 50   70   414 if ( do { no integer; "$]" >= 5.007_003 } ) {
  70         185  
  70         254  
  21         46  
  21         100  
276 21         42 $copy = $line;
277              
278             # On perls that have this function, we can use it to easily see if the
279             # sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280             # needed below for script run detection
281 21 100       451 goto set_1252 if ! utf8::decode($copy);
282             }
283             elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284             # code page doing here anyway?
285             goto set_utf8;
286             }
287             else { # ASCII, no decode(): do it ourselves using the fundamental
288             # characteristics of UTF-8
289 70     70   6832 use if do { no integer; "$]" <= 5.006002 }, 'utf8';
  70     70   173  
  70         339  
  70         3964  
  70         161  
  70         142  
  70         803689  
290              
291 0         0 my $char_ord;
292             my $needed; # How many continuation bytes to gobble up
293              
294             # Initialize the translated line with a dummy character that will be
295             # deleted after everything else is done. This dummy makes sure that
296             # $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297             # with upgrading in the middle
298 0         0 $copy = chr(0x100);
299              
300             # Parse through the line
301 0         0 for (my $i = 0; $i < length $line; $i++) {
302 0         0 my $byte = substr($line, $i, 1);
303              
304             # ASCII bytes are trivially dealt with
305 0 0       0 if ($byte !~ $non_ascii_re) {
306 0         0 $copy .= $byte;
307 0         0 next;
308             }
309              
310 0         0 my $b_ord = ord $byte;
311              
312             # Now figure out what this code point would be if the input is
313             # actually in UTF-8. If, in the process, we discover that it isn't
314             # well-formed UTF-8, we guess CP1252.
315             #
316             # Start the process. If it is UTF-8, we are at the first, start
317             # byte, of a multi-byte sequence. We look at this byte to figure
318             # out how many continuation bytes are needed, and to initialize the
319             # code point accumulator with the data from this byte.
320             #
321             # Normally the minimum continuation byte is 0x80, but in certain
322             # instances the minimum is a higher number. So the code below
323             # overrides this for those instances.
324 0         0 my $min_cont = 0x80;
325              
326 0 0       0 if ($b_ord < 0xC2) { # A start byte < C2 is malformed
    0          
    0          
    0          
327 0         0 goto set_1252;
328             }
329             elsif ($b_ord <= 0xDF) {
330 0         0 $needed = 1;
331 0         0 $char_ord = $b_ord & 0x1F;
332             }
333             elsif ($b_ord <= 0xEF) {
334 0 0       0 $min_cont = 0xA0 if $b_ord == 0xE0;
335 0         0 $needed = 2;
336 0         0 $char_ord = $b_ord & (0x1F >> 1);
337             }
338             elsif ($b_ord <= 0xF4) {
339 0 0       0 $min_cont = 0x90 if $b_ord == 0xF0;
340 0         0 $needed = 3;
341 0         0 $char_ord = $b_ord & (0x1F >> 2);
342             }
343             else { # F4 is the highest start byte for legal Unicode; higher is
344             # unlikely to be in pod.
345 0         0 goto set_1252;
346             }
347              
348             # ? not enough continuation bytes available
349 0 0       0 goto set_1252 if $i + $needed >= length $line;
350              
351             # Accumulate the ordinal of the character from the remaining
352             # (continuation) bytes.
353 0         0 while ($needed-- > 0) {
354 0         0 my $cont = substr($line, ++$i, 1);
355 0         0 $b_ord = ord $cont;
356 0 0 0     0 goto set_1252 if $b_ord < $min_cont || $b_ord > 0xBF;
357              
358             # In all cases, any next continuation bytes all have the same
359             # minimum legal value
360 0         0 $min_cont = 0x80;
361              
362             # Accumulate this byte's contribution to the code point
363 0         0 $char_ord <<= 6;
364 0         0 $char_ord |= ($b_ord & 0x3F);
365             }
366              
367             # Here, the sequence that formed this code point was valid UTF-8,
368             # so add the completed character to the output
369 0         0 $copy .= chr $char_ord;
370             } # End of loop through line
371              
372             # Delete the dummy first character
373 0         0 $copy = substr($copy, 1);
374             }
375              
376             # Here, $copy is legal UTF-8.
377              
378             # If it can't be legal CP1252, no need to look further. (These bytes
379             # aren't valid in CP1252.) This test could have been placed higher in
380             # the code, but it seemed wrong to set the encoding to UTF-8 without
381             # making sure that the very first instance is well-formed. But what if
382             # it isn't legal CP1252 either? We have to choose one or the other, and
383             # It seems safer to favor the single-byte encoding over the multi-byte.
384 6 50       22 goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385              
386             # The C1 controls are not likely to appear in pod
387 6 50       18 goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388              
389             # Nor are surrogates nor unassigned, nor deprecated.
390 6 50       70 DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391 6 50 33     77 goto set_1252 if $cs_re && $copy =~ $cs_re;
392 6 50 33     75 DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393 6 50 33     43 goto set_1252 if $cn_re && $copy =~ $cn_re;
394 6 50       43 DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395 6 50       36 goto set_1252 if $copy =~ $deprecated_re;
396              
397             # Nor are rare code points. But this is hard to determine. khw
398             # believes that IPA characters and the modifier letters are unlikely to
399             # be in pod (and certainly very unlikely to be the in the first line in
400             # the pod containing non-ASCII)
401 6 100       46 DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402 6 100 66     40 goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403              
404             # The first Unicode version included essentially every Latin character
405             # in modern usage. So, a Latin character not in the first release will
406             # unlikely be in pod.
407 5 50 33     87 DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408 5 50 33     26 goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409              
410             # On perls that handle script runs, if the UTF-8 interpretation yields
411             # a single script, we guess UTF-8, otherwise just having a mixture of
412             # scripts is suspicious, so guess CP1252. We first strip off, as best
413             # we can, the ASCII characters that look like they are pod directives,
414             # as these would always show as mixed with non-Latin text.
415 5         47 $copy =~ s/$pod_chars_re//g;
416              
417 5 50       12 if ($script_run_re) {
418 5 100       87 goto set_utf8 if $copy =~ $script_run_re;
419 1         3 DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420 1         8 goto set_1252;
421             }
422              
423             # Even without script runs, but on recent enough perls and Unicodes, we
424             # can check if there is a mixture of both Latin and non-Latin. Again,
425             # having a mixture of scripts is suspicious, so assume CP1252
426              
427             # If it's all non-Latin, there is no CP1252, as that is Latin
428             # characters and punct, etc.
429 0 0       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430 0 0       0 goto set_utf8 if $copy !~ $latin_re;
431              
432 0 0       0 DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433 0 0       0 goto set_utf8 if $copy =~ $every_char_is_latin_re;
434              
435 0         0 DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436              
437 17         37 set_1252:
438             DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439 17         47 $encoding = 'CP1252';
440 17         78 goto done_set;
441              
442 4         5 set_utf8:
443             DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444 4         5 $encoding = 'UTF-8';
445              
446 21         170 done_set:
447             $self->_handle_encoding_line( "=encoding $encoding" );
448 21         55 delete $self->{'_processed_encoding'};
449 21 50       102 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450              
451 21         612 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;
452              
453             $self->whine(
454 21         173 $self->{'line_count'},
455             "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456             );
457             }
458              
459 18233         26886 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460              
461 18233 100       37359 if(!$self->{'in_pod'}) {
462 1713 100       6122 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
463 954 100       2730 if($1 eq 'cut') {
464             $self->scream(
465 4         59 $self->{'line_count'},
466             "=cut found outside a pod block. Skipping to next block."
467             );
468              
469             ## Before there were errata sections in the world, it was
470             ## least-pessimal to abort processing the file. But now we can
471             ## just barrel on thru (but still not start a pod block).
472             #splice @_;
473             #push @_, undef;
474              
475 4         13 next;
476             } else {
477             $self->{'in_pod'} = $self->{'start_of_pod_block'}
478 950         3631 = $self->{'last_was_blank'} = 1;
479             # And fall thru to the pod-mode block further down
480             }
481             } else {
482 759         1132 DEBUG > 5 and print STDERR "# It's a code-line.\n";
483 759 100       1735 $code_handler->(map $_, $line, $self->{'line_count'}, $self)
484             if $code_handler;
485             # Note: this may cause code to be processed out of order relative
486             # to pods, but in order relative to cuts.
487              
488             # Note also that we haven't yet applied the transcoding to $line
489             # by time we call $code_handler!
490              
491 759 50       1698 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
492             # That RE is from perlsyn, section "Plain Old Comments (Not!)",
493             #$fname = $2 if defined $2;
494             #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495 0         0 DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496 0         0 $self->{'line_count'} = $1 - 1;
497             }
498              
499 759         1725 next;
500             }
501             }
502              
503             # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504             # Else we're in pod mode:
505              
506             # Apply any necessary transcoding:
507 17470 100       37643 $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508              
509             # HERE WE CATCH =encoding EARLY!
510 17470 100       35535 if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511 39 100       193 next if $self->parse_characters; # Ignore this line
512 38         191 $line = $self->_handle_encoding_line( $line );
513             }
514              
515 17469 100       73930 if($line =~ m/^=cut/s) {
    100          
    100          
516             # here ends the pod block, and therefore the previous pod para
517 168         307 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518 168         389 $self->{'in_pod'} = 0;
519             # ++$self->{'pod_para_count'};
520 168         633 $self->_ponder_paragraph_buffer();
521             # by now it's safe to consider the previous paragraph as done.
522 168         265 DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523 168 100       559 $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524             if $cut_handler;
525              
526             # TODO: add to docs: Note: this may cause cuts to be processed out
527             # of order relative to pods, but in order relative to code.
528              
529             } elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530 5776 100 66     25368 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531 22 100       66 $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532             if $wl_handler;
533             }
534              
535 5776 100 66     30515 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      100        
536 632         1005 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537 632         1016 push @{$paras->[-1]}, $line;
  632         1826  
538             } # otherwise it's not interesting
539              
540 5776 100 100     21340 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541 5681         7846 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542             }
543              
544 5776         12067 $self->{'last_was_blank'} = 1;
545              
546             } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547              
548 6388 100       23035 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(\s+|$)(.*)/s) {
    100          
549             # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550 2885         15831 my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551 2885 100 100     13628 $new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552             # Note that in "=head1 foo", the WS is lost.
553             # Example: ['=head1', {'start_line' => 123}, ' foo']
554              
555 2885         5266 ++$self->{'pod_para_count'};
556              
557 2885         10250 $self->_ponder_paragraph_buffer();
558             # by now it's safe to consider the previous paragraph as done.
559              
560 2885         6181 push @$paras, $new; # the new incipient paragraph
561 2885         4404 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562              
563             } elsif($line =~ m/^\s/s) {
564              
565 717 100 33     4321 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
      66        
566 173         393 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567 173         269 push @{$paras->[-1]}, $line;
  173         468  
568             } else {
569 544         1057 ++$self->{'pod_para_count'};
570 544         1883 $self->_ponder_paragraph_buffer();
571             # by now it's safe to consider the previous paragraph as done.
572 544         882 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573 544         2989 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574             }
575             } else {
576 2786         4983 ++$self->{'pod_para_count'};
577 2786         8699 $self->_ponder_paragraph_buffer();
578             # by now it's safe to consider the previous paragraph as done.
579 2786         12830 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580 2786         4632 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581             }
582 6388         18146 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583              
584             } else {
585             # It's a non-blank line /continuing/ the current para
586 5137 50       9664 if(@$paras) {
587 5137         6258 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588 5137         6649 push @{$paras->[-1]}, $line;
  5137         13316  
589             } else {
590             # Unexpected case!
591 0         0 die "Continuing a paragraph but \@\$paras is empty?";
592             }
593 5137         11322 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594             }
595              
596             } # ends the big while loop
597              
598 9584         13586 DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599 9584         63479 return $self;
600             }
601              
602             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603              
604             sub _maybe_handle_element_start {
605 8355     8355   12528 my $self = shift;
606             return $self->_handle_element_start(@_)
607 8355 100       32335 if !$self->{heading_filter};
608              
609 20         37 my ($element_name, $attr) = @_;
610              
611 20 100       53 if ($element_name =~ /\Ahead(\d)\z/) {
612             $self->{_in_head} = {
613 10         78 element => $element_name,
614             level => $1 + 0,
615             events => [],
616             text => '',
617             };
618             }
619              
620 20 100       47 if (my $head = $self->{_in_head}) {
621 10         12 push @{ $head->{events} }, [ '_handle_element_start', @_ ];
  10         30  
622 10         19 return;
623             }
624              
625             return
626 10 100       26 if !$self->{_filter_allowed};
627              
628 2         9 $self->_handle_element_start(@_);
629             }
630              
631             sub _maybe_handle_element_end {
632 8355     8355   11854 my $self = shift;
633             return $self->_handle_element_end(@_)
634 8355 100       27554 if !$self->{heading_filter};
635              
636 20         39 my ($element_name, $attr) = @_;
637              
638 20 100       47 if (my $head = $self->{_in_head}) {
639 10 50       24 if ($element_name ne $head->{element}) {
640 0         0 push @{ $head->{events} }, [ '_handle_element_end', @_ ];
  0         0  
641 0         0 return;
642             }
643              
644 10         17 delete $self->{_in_head};
645              
646 10   100     27 my $headings = $self->{_current_headings} ||= [];
647 10         24 @$headings = (@{$headings}[0 .. $head->{level} - 2], $head->{text});
  10         31  
648              
649 10         32 my $allowed = $self->{_filter_allowed} = $self->_filter_allows(@$headings);
650              
651 10 100       29 if ($allowed) {
652 2         4 for my $event (@{ $head->{events} }) {
  2         5  
653 4         10 my ($method, @args) = @$event;
654 4         23 $self->$method(@args);
655             }
656             }
657             }
658              
659             return
660 20 100       59 if !$self->{_filter_allowed};
661              
662 4         18 $self->_handle_element_end(@_);
663             }
664              
665             sub _maybe_handle_text {
666 10006     10006   14432 my $self = shift;
667             return $self->_handle_text(@_)
668 10006 100       31355 if !$self->{heading_filter};
669              
670 20         69 my ($text) = @_;
671              
672 20 100       46 if (my $head = $self->{_in_head}) {
673 10         15 push @{ $head->{events} }, [ '_handle_text', @_ ];
  10         24  
674 10         51 $head->{text} .= $text;
675 10         31 return;
676             }
677              
678             return
679 10 100       44 if !$self->{_filter_allowed};
680              
681 2         6 $self->_handle_text(@_);
682             }
683              
684             sub _filter_allows {
685 10     10   16 my $self = shift;
686 10         29 my @headings = @_;
687              
688             my $filter = $self->{heading_filter}
689 10 50       24 or return 1;
690              
691 10         19 SPEC: for my $spec ( @$filter ) {
692 10         27 for my $i (0 .. $#$spec) {
693 14         26 my $regex = $spec->[$i];
694 14         23 my $heading = $headings[$i];
695 14 100       33 $heading = ''
696             if !defined $heading;
697             next SPEC
698 14 100       105 if $heading !~ $regex;
699             }
700 2         7 return 1;
701             }
702              
703 8         20 return 0;
704             }
705              
706             sub set_heading_select {
707 2     2 0 22 my $self = shift;
708 2         9 my (@selections) = @_;
709              
710 2   50     36 my $filter = $self->{heading_filter} ||= [];
711 2 50 33     13 if (@selections && $selections[0] eq '+') {
712 0         0 shift @selections;
713             }
714             else {
715 2         6 @$filter = ();
716             }
717              
718 2         5 for my $spec (@selections) {
719             eval {
720 2         15 push @$filter, $self->_compile_heading_spec($spec);
721 2         9 1;
722 2 50       3 } or do {
723 0         0 warn $@;
724 0         0 warn qq{Ignoring section spec "$spec"!\n};
725             };
726             }
727             }
728              
729             sub _compile_heading_spec {
730 2     2   3 my $self = shift;
731 2         5 my ($spec) = @_;
732              
733 2         4 my @bad;
734 2         61 my @parts = $spec =~ m{(?:\A|\G/)((?:[^/\\]|\\.)*)}g;
735 2         7 for my $part (@parts) {
736 4         9 $part =~ s{\\(.)}{$1}g;
737 4         10 my $negate = $part =~ s{\A!}{};
738 4 50       27 $part = '.*'
739             if !length $part;
740              
741             eval {
742 4 50       81 $part = $negate ? qr{^(?!$part$)} : qr{^$part$};
743 4         14 1;
744 4 50       7 } or do {
745 0         0 push @bad, qq{Bad regular expression /$part/ in "$spec": $@\n};
746             };
747             }
748              
749 2 50       7 Carp::croak(join '', @bad)
750             if @bad;
751              
752 2         13 return \@parts;
753             }
754              
755              
756             sub _handle_encoding_line {
757 59     59   342 my($self, $line) = @_;
758              
759 59 50       221 return if $self->parse_characters;
760              
761             # The point of this routine is to set $self->{'_transcoder'} as indicated.
762              
763 59 50       370 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
764 59         86 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
765              
766 59         230 my $e = $1;
767 59         111 my $orig = $e;
768 59         93 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
  59         262  
769              
770 59         102 my $enc_error;
771              
772             # Cf. perldoc Encode and perldoc Encode::Supported
773              
774 59         10410 require Pod::Simple::Transcode;
775              
776 59 100 33     224 if( $self->{'encoding'} ) {
    50          
    100          
777 5         18 my $norm_current = $self->{'encoding'};
778 5         9 my $norm_e = $e;
779 5         12 foreach my $that ($norm_current, $norm_e) {
780 10         23 $that = lc($that);
781 10         50 $that =~ s/[-_]//g;
782             }
783 5 100       46 if($norm_current eq $norm_e) {
784 3         6 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
785             "redundant. ($norm_current eq $norm_e). Ignoring.\n";
786 3         8 $enc_error = '';
787             # But that doesn't necessarily mean that the earlier one went okay
788             } else {
789 2         6 $enc_error = "Encoding is already set to " . $self->{'encoding'};
790 2         3 DEBUG > 1 and print STDERR $enc_error;
791             }
792             } elsif (
793             # OK, let's turn on the encoding
794             do {
795 54         81 DEBUG > 1 and print STDERR " Setting encoding to $e\n";
796 54         200 $self->{'encoding'} = $e;
797 54         685 1;
798             }
799             and $e eq 'HACKRAW'
800             ) {
801 0         0 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
802              
803             } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
804              
805             die($enc_error = "WHAT? _transcoder is already set?!")
806 52 50       48742 if $self->{'_transcoder'}; # should never happen
807 52         255 require Pod::Simple::Transcode;
808 52         252 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
809 52         122 eval {
810 52         372 my @x = ('', "abc", "123");
811 52         222 $self->{'_transcoder'}->(@x);
812             };
813 52 50       145 $@ && die( $enc_error =
814             "Really unexpected error setting up encoding $e: $@\nAborting"
815             );
816 52         159 $self->{'detected_encoding'} = $e;
817              
818             } else {
819 2         1451 my @supported = Pod::Simple::Transcode::->all_encodings;
820              
821             # Note unsupported, and complain
822 2         1535 DEBUG and print STDERR " Encoding [$e] is unsupported.",
823             "\nSupporteds: @supported\n";
824 2         4 my $suggestion = '';
825              
826             # Look for a near match:
827 2         5 my $norm = lc($e);
828 2         4 $norm =~ tr[-_][]d;
829 2         4 my $n;
830 2         6 foreach my $enc (@supported) {
831 248         243 $n = lc($enc);
832 248         223 $n =~ tr[-_][]d;
833 248 50       296 next unless $n eq $norm;
834 0         0 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
835 0         0 last;
836             }
837 2         14 my $encmodver = Pod::Simple::Transcode::->encmodver;
838 2         30 $enc_error = join '' =>
839             "This document probably does not appear as it should, because its ",
840             "\"=encoding $e\" line calls for an unsupported encoding.",
841             $suggestion, " [$encmodver\'s supported encodings are: @supported]"
842             ;
843              
844 2         22 $self->scream( $self->{'line_count'}, $enc_error );
845             }
846 59         135 push @{ $self->{'encoding_command_statuses'} }, $enc_error;
  59         177  
847 59 100       210 if (defined($self->{'_processed_encoding'})) {
848             # Double declaration.
849 1         11 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
850             }
851 59         148 $self->{'_processed_encoding'} = $orig;
852              
853 59         200 return $line;
854             }
855              
856             # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
857              
858             sub _handle_encoding_second_level {
859             # By time this is called, the encoding (if well formed) will already
860             # have been acted on.
861 38     38   103 my($self, $para) = @_;
862 38         120 my @x = @$para;
863 38         137 my $content = join ' ', splice @x, 2;
864 38         140 $content =~ s/^\s+//s;
865 38         108 $content =~ s/\s+$//s;
866              
867 38         95 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
868              
869 38 100       108 if (defined($self->{'_processed_encoding'})) {
870             #if($content ne $self->{'_processed_encoding'}) {
871             # Could it happen?
872             #}
873 37         125 delete $self->{'_processed_encoding'};
874             # It's already been handled. Check for errors.
875 37 50       186 if(! $self->{'encoding_command_statuses'} ) {
    100          
876 0         0 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
877             } elsif( $self->{'encoding_command_statuses'}[-1] ) {
878             $self->whine( $para->[1]{'start_line'},
879             sprintf "Couldn't do %s: %s",
880             $self->{'encoding_command_reqs' }[-1],
881 4         34 $self->{'encoding_command_statuses'}[-1],
882             );
883             } else {
884 33         61 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
885             }
886              
887             } else {
888             # Otherwise it's a syntax error
889 1         13 $self->whine( $para->[1]{'start_line'},
890             "Invalid =encoding syntax: $content"
891             );
892             }
893              
894 38         91 return;
895             }
896              
897             #~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
898              
899             {
900             my $m = -321; # magic line number
901              
902             sub _gen_errata {
903 903     903   1483 my $self = $_[0];
904             # Return 0 or more fake-o paragraphs explaining the accumulated
905             # errors on this document.
906              
907 903 100 66     3600 return() unless $self->{'errata'} and keys %{$self->{'errata'}};
  40         185  
908              
909 40         73 my @out;
910              
911 40         224 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
  26         95  
  40         183  
912             push @out,
913             ['=item', {'start_line' => $m}, "Around line $line:"],
914             map( ['~Para', {'start_line' => $m, '~cooked' => 1},
915             #['~Top', {'start_line' => $m},
916             $_
917             #]
918             ],
919 59         263 @{$self->{'errata'}{$line}}
  59         395  
920             )
921             ;
922             }
923              
924             # TODO: report of unknown entities? unrenderable characters?
925              
926 40         495 unshift @out,
927             ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
928             ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
929             "Hey! ",
930             ['B', {},
931             'The above document had some coding errors, which are explained below:'
932             ]
933             ],
934             ['=over', {'start_line' => $m, 'errata' => 1}, ''],
935             ;
936              
937 40         219 push @out,
938             ['=back', {'start_line' => $m, 'errata' => 1}, ''],
939             ;
940              
941 40         69 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
942              
943 40         203 return @out;
944             }
945              
946             }
947              
948             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
949              
950             ##############################################################################
951             ##
952             ## stop reading now stop reading now stop reading now stop reading now stop
953             ##
954             ## HERE IT BECOMES REALLY SCARY
955             ##
956             ## stop reading now stop reading now stop reading now stop reading now stop
957             ##
958             ##############################################################################
959              
960             sub _ponder_paragraph_buffer {
961              
962             # Para-token types as found in the buffer.
963             # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
964             # =over, =back, =item
965             # and the null =pod (to be complained about if over one line)
966             #
967             # "~data" paragraphs are something we generate at this level, depending on
968             # a currently open =over region
969              
970             # Events fired: Begin and end for:
971             # directivename (like head1 .. head4), item, extend,
972             # for (from =begin...=end, =for),
973             # over-bullet, over-number, over-text, over-block,
974             # item-bullet, item-number, item-text,
975             # Document,
976             # Data, Para, Verbatim
977             # B, C, longdirname (TODO -- wha?), etc. for all directives
978             #
979              
980 7286     7286   11289 my $self = $_[0];
981 7286         10092 my $paras;
982 7286 100       9714 return unless @{$paras = $self->{'paras'}};
  7286         31939  
983 6339   100     17339 my $curr_open = ($self->{'curr_open'} ||= []);
984              
985 6339         8753 my $scratch;
986              
987 6339         8007 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
988              
989             # We have something in our buffer. So apparently the document has started.
990 6339 100       13944 unless($self->{'doc_has_started'}) {
991 917         1958 $self->{'doc_has_started'} = 1;
992              
993 917         1527 my $starting_contentless;
994 917   66     6181 $starting_contentless =
995             (
996             !@$curr_open
997             and @$paras and ! grep $_->[0] ne '~end', @$paras
998             # i.e., if the paras is all ~ends
999             )
1000             ;
1001 917         1552 DEBUG and print STDERR "# Starting ",
1002             $starting_contentless ? 'contentless' : 'contentful',
1003             " document\n"
1004             ;
1005              
1006             $self->_handle_element_start(
1007             ($scratch = 'Document'),
1008             {
1009 917 100       6242 'start_line' => $paras->[0][1]{'start_line'},
1010             $starting_contentless ? ( 'contentless' => 1 ) : (),
1011             },
1012             );
1013             }
1014              
1015 6339         12243 my($para, $para_type);
1016 6339         13885 while(@$paras) {
1017              
1018             # If a directive, assume it's legal; subtract below if found not to be
1019 9050 100       28040 $seen_legal_directive++ if $paras->[0][0] =~ /^=/;
1020              
1021             last if @$paras == 1
1022             and ( $paras->[0][0] eq '=over'
1023             or $paras->[0][0] eq '=item'
1024 9050 100 100     45104 or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
      100        
1025             # Those're the three kinds of paragraphs that require lookahead.
1026             # Actually, an "=item Foo" inside an region
1027             # and any =item inside an region (rare)
1028             # don't require any lookahead, but all others (bullets
1029             # and numbers) do.
1030             # The verbatim is different from the other two, because those might be
1031             # like:
1032             #
1033             # =item
1034             # ...
1035             # =cut
1036             # ...
1037             # =item
1038             #
1039             # The =cut here finishes the paragraph but doesn't terminate the =over
1040             # they should be in. (khw apologizes that he didn't comment at the time
1041             # why the 'in_pod' works, and no longer remembers why, and doesn't think
1042             # it is currently worth the effort to re-figure it out.)
1043              
1044             # TODO: whinge about many kinds of directives in non-resolving =for regions?
1045             # TODO: many? like what? =head1 etc?
1046              
1047 7526         17910 $para = shift @$paras;
1048 7526         13992 $para_type = $para->[0];
1049              
1050 7526         10937 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
1051             $self->_dump_curr_open(), ")\n";
1052              
1053 7526 100       26067 if($para_type eq '=for') {
    100          
    100          
    100          
1054 50 50       150 next if $self->_ponder_for($para,$curr_open,$paras);
1055              
1056             } elsif($para_type eq '=begin') {
1057 127 50       380 next if $self->_ponder_begin($para,$curr_open,$paras);
1058              
1059             } elsif($para_type eq '=end') {
1060 130 50       410 next if $self->_ponder_end($para,$curr_open,$paras);
1061              
1062             } elsif($para_type eq '~end') { # The virtual end-document signal
1063 953 50       3469 next if $self->_ponder_doc_end($para,$curr_open,$paras);
1064             }
1065              
1066              
1067             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1068             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1069 6266 100       18759 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1070 42         51 DEBUG > 1 and
1071             print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
1072 42         80 next;
1073             }
1074             #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1075             # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1076              
1077 6224 100       16480 if($para_type eq '=pod') {
    100          
    100          
1078 580         2083 $self->_ponder_pod($para,$curr_open,$paras);
1079              
1080             } elsif($para_type eq '=over') {
1081 218 100       960 next if $self->_ponder_over($para,$curr_open,$paras);
1082              
1083             } elsif($para_type eq '=back') {
1084 214 100       1111 next if $self->_ponder_back($para,$curr_open,$paras);
1085              
1086             } else {
1087              
1088             # All non-magical codes!!!
1089              
1090             # Here we start using $para_type for our own twisted purposes, to
1091             # mean how it should get treated, not as what the element name
1092             # should be.
1093              
1094 5212         7342 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
1095              
1096 5212         7291 my $i;
1097              
1098             # Enforce some =headN discipline
1099 5212 100 66     16035 if($para_type =~ m/^=head\d$/s
      100        
      66        
1100             and ! $self->{'accept_heads_anywhere'}
1101             and @$curr_open
1102             and $curr_open->[-1][0] eq '=over'
1103             ) {
1104 6         7 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
1105             $self->whine(
1106 6         32 $para->[1]{'start_line'},
1107             "You forgot a '=back' before '$para_type'"
1108             );
1109 6         17 unshift @$paras, ['=back', {}, ''], $para; # close the =over
1110 6         15 next;
1111             }
1112              
1113              
1114 5206 100 66     21816 if($para_type eq '=item') {
    100          
    100          
    100          
    100          
    100          
    100          
1115              
1116 1026         1491 my $over;
1117 1026 50 33     2947 unless(@$curr_open and
1118 1134         4813 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1119             $self->whine(
1120 0         0 $para->[1]{'start_line'},
1121             "'=item' outside of any '=over'"
1122             );
1123             unshift @$paras,
1124 0         0 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1125             $para
1126             ;
1127 0         0 next;
1128             }
1129              
1130              
1131 1026         2125 my $over_type = $over->[1]{'~type'};
1132              
1133 1026 50       3553 if(!$over_type) {
    50          
    100          
    100          
    50          
1134             # Shouldn't happen1
1135             die "Typeless over in stack, starting at line "
1136 0         0 . $over->[1]{'start_line'};
1137              
1138             } elsif($over_type eq 'block') {
1139 0 0       0 unless($curr_open->[-1][1]{'~bitched_about'}) {
1140 0         0 $curr_open->[-1][1]{'~bitched_about'} = 1;
1141             $self->whine(
1142             $curr_open->[-1][1]{'start_line'},
1143             "You can't have =items (as at line "
1144 0         0 . $para->[1]{'start_line'}
1145             . ") unless the first thing after the =over is an =item"
1146             );
1147             }
1148             # Just turn it into a paragraph and reconsider it
1149 0         0 $para->[0] = '~Para';
1150 0         0 unshift @$paras, $para;
1151 0         0 next;
1152              
1153             } elsif($over_type eq 'text') {
1154 839         3076 my $item_type = $self->_get_item_type($para);
1155             # That kills the content of the item if it's a number or bullet.
1156 839         1431 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1157              
1158 839 100 66     2000 if($item_type eq 'text') {
    50          
1159             # Nothing special needs doing for 'text'
1160             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1161             $self->whine(
1162 4         30 $para->[1]{'start_line'},
1163             "Expected text after =item, not a $item_type"
1164             );
1165             # Undo our clobbering:
1166 4         9 push @$para, $para->[1]{'~orig_content'};
1167 4         9 delete $para->[1]{'number'};
1168             # Only a PROPER item-number element is allowed
1169             # to have a number attribute.
1170             } else {
1171 0         0 die "Unhandled item type $item_type"; # should never happen
1172             }
1173              
1174             # =item-text thingies don't need any assimilation, it seems.
1175              
1176             } elsif($over_type eq 'number') {
1177 28         91 my $item_type = $self->_get_item_type($para);
1178             # That kills the content of the item if it's a number or bullet.
1179 28         45 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1180              
1181 28         66 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1182              
1183 28 50       200 if($item_type eq 'bullet') {
    50          
    50          
    50          
1184             # Hm, it's not numeric. Correct for this.
1185 0         0 $para->[1]{'number'} = $expected_value;
1186             $self->whine(
1187 0         0 $para->[1]{'start_line'},
1188             "Expected '=item $expected_value'"
1189             );
1190 0         0 push @$para, $para->[1]{'~orig_content'};
1191             # restore the bullet, blocking the assimilation of next para
1192              
1193             } elsif($item_type eq 'text') {
1194             # Hm, it's not numeric. Correct for this.
1195 0         0 $para->[1]{'number'} = $expected_value;
1196             $self->whine(
1197 0         0 $para->[1]{'start_line'},
1198             "Expected '=item $expected_value'"
1199             );
1200             # Text content will still be there and will block next ~Para
1201              
1202             } elsif($item_type ne 'number') {
1203 0         0 die "Unknown item type $item_type"; # should never happen
1204              
1205             } elsif($expected_value == $para->[1]{'number'}) {
1206 28         40 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1207              
1208             } else {
1209 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1210             " instead of the expected value of $expected_value\n";
1211             $self->whine(
1212             $para->[1]{'start_line'},
1213 0         0 "You have '=item " . $para->[1]{'number'} .
1214             "' instead of the expected '=item $expected_value'"
1215             );
1216 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1217             }
1218              
1219 28 50       63 if(@$para == 2) {
1220             # For the cases where we /didn't/ push to @$para
1221 28 100       64 if($paras->[0][0] eq '~Para') {
1222 25         45 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1223 25         40 push @$para, splice @{shift @$paras},2;
  25         68  
1224             } else {
1225 3         5 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1226 3         6 push @$para, ''; # Just so it's not contentless
1227             }
1228             }
1229              
1230              
1231             } elsif($over_type eq 'bullet') {
1232 159         619 my $item_type = $self->_get_item_type($para);
1233             # That kills the content of the item if it's a number or bullet.
1234 159         257 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1235              
1236 159 50       337 if($item_type eq 'bullet') {
    0          
    0          
1237             # as expected!
1238              
1239 159 100       427 if( $para->[1]{'~_freaky_para_hack'} ) {
1240 101         143 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1241 101         246 push @$para, $para->[1]{'~_freaky_para_hack'};
1242             }
1243              
1244             } elsif($item_type eq 'number') {
1245             $self->whine(
1246 0         0 $para->[1]{'start_line'},
1247             "Expected '=item *'"
1248             );
1249 0         0 push @$para, $para->[1]{'~orig_content'};
1250             # and block assimilation of the next paragraph
1251 0         0 delete $para->[1]{'number'};
1252             # Only a PROPER item-number element is allowed
1253             # to have a number attribute.
1254             } elsif($item_type eq 'text') {
1255             $self->whine(
1256 0         0 $para->[1]{'start_line'},
1257             "Expected '=item *'"
1258             );
1259             # But doesn't need processing. But it'll block assimilation
1260             # of the next para.
1261             } else {
1262 0         0 die "Unhandled item type $item_type"; # should never happen
1263             }
1264              
1265 159 100       417 if(@$para == 2) {
1266             # For the cases where we /didn't/ push to @$para
1267 58 50       160 if($paras->[0][0] eq '~Para') {
1268 58         92 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1269 58         151 push @$para, splice @{shift @$paras},2;
  58         306  
1270             } else {
1271 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1272 0         0 push @$para, ''; # Just so it's not contentless
1273             }
1274             }
1275              
1276             } else {
1277 0         0 die "Unhandled =over type \"$over_type\"?";
1278             # Shouldn't happen!
1279             }
1280              
1281 1026         2013 $para_type = 'Plain';
1282 1026         2781 $para->[0] .= '-' . $over_type;
1283             # Whew. Now fall thru and process it.
1284              
1285              
1286             } elsif($para_type eq '=extend') {
1287             # Well, might as well implement it here.
1288 21         139 $self->_ponder_extend($para);
1289 21         68 next; # and skip
1290             } elsif($para_type eq '=encoding') {
1291             # Not actually acted on here, but we catch errors here.
1292 38         221 $self->_handle_encoding_second_level($para);
1293 38 100       155 next unless $self->keep_encoding_directive;
1294 35         74 $para_type = 'Plain';
1295             } elsif($para_type eq '~Verbatim') {
1296 538         1125 $para->[0] = 'Verbatim';
1297 538         901 $para_type = '?Verbatim';
1298             } elsif($para_type eq '~Para') {
1299 2789         5410 $para->[0] = 'Para';
1300 2789         4545 $para_type = '?Plain';
1301             } elsif($para_type eq 'Data') {
1302 30         56 $para->[0] = 'Data';
1303 30         46 $para_type = '?Data';
1304             } elsif( $para_type =~ s/^=//s
1305             and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1306             ) {
1307 752         1185 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1308             } else {
1309             # An unknown directive!
1310 12         27 $seen_legal_directive--;
1311             DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1312 12         23 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1313             ;
1314             $self->whine(
1315 12         143 $para->[1]{'start_line'},
1316             "Unknown directive: $para->[0]"
1317             );
1318              
1319             # And maybe treat it as text instead of just letting it go?
1320 12         50 next;
1321             }
1322              
1323 5170 100       18780 if($para_type =~ s/^\?//s) {
1324 3357 100       6790 if(! @$curr_open) { # usual case
1325 2082         3003 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1326             } else {
1327 1275         3772 my @fors = grep $_->[0] eq '=for', @$curr_open;
1328             DEBUG > 1 and print STDERR "Containing fors: ",
1329 1275         1788 join(',', map $_->[1]{'target'}, @fors), "\n";
1330              
1331 1275 100       2613 if(! @fors) {
    100          
1332 1151         1772 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1333              
1334             #} elsif(grep $_->[1]{'~resolve'}, @fors) {
1335             #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1336             } elsif( $fors[-1][1]{'~resolve'} ) {
1337             # Look to the immediately containing for
1338              
1339 73 100       158 if($para_type eq 'Data') {
1340 18         23 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1341 18         31 $para->[0] = 'Para';
1342 18         37 $para_type = 'Plain';
1343             } else {
1344 55         121 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1345             }
1346             } else {
1347 51         83 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1348 51         160 $para->[0] = $para_type = 'Data';
1349             }
1350             }
1351             }
1352              
1353             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1354 5170 100       10790 if($para_type eq 'Plain') {
    100          
    50          
1355 4582         12618 $self->_ponder_Plain($para);
1356             } elsif($para_type eq 'Verbatim') {
1357 534         2092 $self->_ponder_Verbatim($para);
1358             } elsif($para_type eq 'Data') {
1359 54         254 $self->_ponder_Data($para);
1360             } else {
1361 0         0 die "\$para type is $para_type -- how did that happen?";
1362             # Shouldn't happen.
1363             }
1364              
1365             #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1366 5170         18537 $para->[0] =~ s/^[~=]//s;
1367              
1368 5170         7507 DEBUG and print STDERR "\n", pretty($para), "\n";
1369              
1370             # traverse the treelet (which might well be just one string scalar)
1371             $self->{'content_seen'} ||= 1 if $seen_legal_directive
1372 5170 100 100     27045 && ! $self->{'~tried_gen_errata'};
      100        
1373 5170         13925 $self->_traverse_treelet_bit(@$para);
1374             }
1375             }
1376              
1377 6339         22839 return;
1378             }
1379              
1380             ###########################################################################
1381             # The sub-ponderers...
1382              
1383              
1384              
1385             sub _ponder_for {
1386 50     50   109 my ($self,$para,$curr_open,$paras) = @_;
1387              
1388             # Fake it out as a begin/end
1389 50         79 my $target;
1390              
1391 50 50       115 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1392 0         0 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1393 0         0 return 1;
1394             }
1395              
1396 50         151 for(my $i = 2; $i < @$para; ++$i) {
1397 50 50       224 if($para->[$i] =~ s/^\s*(\S+)\s*//s) {
1398 50         107 $target = $1;
1399 50         95 last;
1400             }
1401             }
1402 50 50       109 unless(defined $target) {
1403             $self->whine(
1404 0         0 $para->[1]{'start_line'},
1405             "=for without a target?"
1406             );
1407 0         0 return 1;
1408             }
1409 50         61 DEBUG > 1 and
1410             print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1411              
1412 50         91 $para->[0] = 'Data';
1413              
1414             unshift @$paras,
1415             ['=begin',
1416             {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1417             $target,
1418             ],
1419             $para,
1420             ['=end',
1421 50         340 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1422             $target,
1423             ],
1424             ;
1425              
1426 50         198 return 1;
1427             }
1428              
1429             sub _ponder_begin {
1430 127     127   420 my ($self,$para,$curr_open,$paras) = @_;
1431 127         462 my $content = join ' ', splice @$para, 2;
1432 127         470 $content =~ s/^\s+//s;
1433 127         357 $content =~ s/\s+$//s;
1434 127 50       286 unless(length($content)) {
1435             $self->whine(
1436 0         0 $para->[1]{'start_line'},
1437             "=begin without a target?"
1438             );
1439 0         0 DEBUG and print STDERR "Ignoring targetless =begin\n";
1440 0         0 return 1;
1441             }
1442              
1443 127         652 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/;
1444 127 100       292 $para->[1]{'title'} = $title if ($title);
1445 127         415 $para->[1]{'target'} = $target; # without any ':'
1446 127         200 $content = $target; # strip off the title
1447              
1448 127         254 $content =~ s/^:!/!:/s;
1449 127         170 my $neg; # whether this is a negation-match
1450 127 100       369 $neg = 1 if $content =~ s/^!//s;
1451 127         173 my $to_resolve; # whether to process formatting codes
1452 127 100       419 $to_resolve = 1 if $content =~ s/^://s;
1453              
1454 127         195 my $dont_ignore; # whether this target matches us
1455              
1456 127 100       588 foreach my $target_name (
1457             split(',', $content, -1),
1458             $neg ? () : '*'
1459             ) {
1460 249         358 DEBUG > 2 and
1461             print STDERR " Considering whether =begin $content matches $target_name\n";
1462 249 100       624 next unless $self->{'accept_targets'}{$target_name};
1463              
1464 66         106 DEBUG > 2 and
1465             print STDERR " It DOES match the acceptable target $target_name!\n";
1466             $to_resolve = 1
1467 66 100       221 if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1468 66         129 $dont_ignore = 1;
1469 66         192 $para->[1]{'target_matching'} = $target_name;
1470 66         123 last; # stop looking at other target names
1471             }
1472              
1473 127 100       291 if($neg) {
1474 33 100       60 if( $dont_ignore ) {
1475 6         7 $dont_ignore = '';
1476 6         10 delete $para->[1]{'target_matching'};
1477 6         5 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1478             } else {
1479 27         42 $dont_ignore = 1;
1480 27         54 $para->[1]{'target_matching'} = '!';
1481 27         38 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1482             }
1483             }
1484              
1485 127         267 $para->[0] = '=for'; # Just what we happen to call these, internally
1486 127   100     519 $para->[1]{'~really'} ||= '=begin';
1487 127   100     477 $para->[1]{'~ignore'} = (! $dont_ignore) || 0;
1488 127   100     470 $para->[1]{'~resolve'} = $to_resolve || 0;
1489              
1490 127         165 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1491             "ignore contents of this region\n";
1492 127         165 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1493             ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1494 127         164 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1495              
1496 127         268 push @$curr_open, $para;
1497 127 100 100     538 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1498 40         54 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1499             } else {
1500 87 50 100     349 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1501 87         279 $self->_maybe_handle_element_start((my $scratch='for'), $para->[1]);
1502             }
1503              
1504 127         639 return 1;
1505             }
1506              
1507             sub _ponder_end {
1508 130     130   304 my ($self,$para,$curr_open,$paras) = @_;
1509 130         453 my $content = join ' ', splice @$para, 2;
1510 130         425 $content =~ s/^\s+//s;
1511 130         369 $content =~ s/\s+$//s;
1512 130         222 DEBUG and print STDERR "Ogling '=end $content' directive\n";
1513              
1514 130 50       308 unless(length($content)) {
1515             $self->whine(
1516             $para->[1]{'start_line'},
1517             "'=end' without a target?" . (
1518             ( @$curr_open and $curr_open->[-1][0] eq '=for' )
1519 0 0 0     0 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1520             : ''
1521             )
1522             );
1523 0         0 DEBUG and print STDERR "Ignoring targetless =end\n";
1524 0         0 return 1;
1525             }
1526              
1527 130 50       603 unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1528             $self->whine(
1529 0         0 $para->[1]{'start_line'},
1530             "'=end $content' is invalid. (Stack: "
1531             . $self->_dump_curr_open() . ')'
1532             );
1533 0         0 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1534 0         0 return 1;
1535             }
1536              
1537 130 50 33     701 unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1538             $self->whine(
1539 0         0 $para->[1]{'start_line'},
1540             "=end $content without matching =begin. (Stack: "
1541             . $self->_dump_curr_open() . ')'
1542             );
1543 0         0 DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1544 0         0 return 1;
1545             }
1546              
1547 130 100       376 unless($content eq $curr_open->[-1][1]{'target'}) {
1548             $self->whine(
1549             $para->[1]{'start_line'},
1550             "=end $content doesn't match =begin "
1551 3         16 . $curr_open->[-1][1]{'target'}
1552             . ". (Stack: "
1553             . $self->_dump_curr_open() . ')'
1554             );
1555 3         6 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1556 3         13 return 1;
1557             }
1558              
1559             # Else it's okay to close...
1560 127 100       547 if(grep $_->[1]{'~ignore'}, @$curr_open) {
1561 40         224 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1562             # And that may be because of this to-be-closed =for region, or some
1563             # other one, but it doesn't matter.
1564             } else {
1565 87         205 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1566             # what's that for?
1567              
1568 87 50 50     319 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1569 87         276 $self->_maybe_handle_element_end( my $scratch = 'for', $para->[1]);
1570             }
1571 127         214 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1572 127         240 pop @$curr_open;
1573              
1574 127         761 return 1;
1575             }
1576              
1577             sub _ponder_doc_end {
1578 953     953   2377 my ($self,$para,$curr_open,$paras) = @_;
1579 953 100       2001 if(@$curr_open) { # Deal with things left open
1580 10         14 DEBUG and print STDERR "Stack is nonempty at end-document: (",
1581             $self->_dump_curr_open(), ")\n";
1582              
1583 10         13 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1584 10         58 unshift @$paras, $self->_closers_for_all_curr_open;
1585             # Make sure there is exactly one ~end in the parastack, at the end:
1586 10         39 @$paras = grep $_->[0] ne '~end', @$paras;
1587 10         13 push @$paras, $para, $para;
1588             # We need two -- once for the next cycle where we
1589             # generate errata, and then another to be at the end
1590             # when that loop back around to process the errata.
1591 10         42 return 1;
1592              
1593             } else {
1594 943         1520 DEBUG and print STDERR "Okay, stack is empty now.\n";
1595             }
1596              
1597             # Try generating errata section, if applicable
1598 943 100       2546 unless($self->{'~tried_gen_errata'}) {
1599 903         2087 $self->{'~tried_gen_errata'} = 1;
1600 903         2784 my @extras = $self->_gen_errata();
1601 903 100       2363 if(@extras) {
1602 40         123 unshift @$paras, @extras;
1603 40         241 DEBUG and print STDERR "Generated errata... relooping...\n";
1604 40         218 return 1; # I.e., loop around again to process these fake-o paragraphs
1605             }
1606             }
1607              
1608 903         2121 splice @$paras; # Well, that's that for this paragraph buffer.
1609 903         1363 DEBUG and print STDERR "Throwing end-document event.\n";
1610              
1611 903         3196 $self->_handle_element_end( my $scratch = 'Document' );
1612 903         3860 return 1; # Hasta la byebye
1613             }
1614              
1615             sub _ponder_pod {
1616 580     580   1387 my ($self,$para,$curr_open,$paras) = @_;
1617             $self->whine(
1618 580 50       1483 $para->[1]{'start_line'},
1619             "=pod directives shouldn't be over one line long! Ignoring all "
1620             . (@$para - 2) . " lines of content"
1621             ) if @$para > 3;
1622              
1623             # Content ignored unless 'pod_handler' is set
1624 580 100       1569 if (my $pod_handler = $self->{'pod_handler'}) {
1625 6         21 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1626 6 100       13 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1627 6         13 $pod_handler->($line, $line_num, $self);
1628             }
1629              
1630             # The surrounding methods set content_seen, so let us remain consistent.
1631             # I do not know why it was not here before -- should it not be here?
1632             # $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1633              
1634 580         1701 return;
1635             }
1636              
1637             sub _ponder_over {
1638 218     218   577 my ($self,$para,$curr_open,$paras) = @_;
1639 218 50       544 return 1 unless @$paras;
1640 218         470 my $list_type;
1641              
1642 218 100       699 if($paras->[0][0] eq '=item') { # most common case
    100          
    50          
1643 189         1094 $list_type = $self->_get_initial_item_type($paras->[0]);
1644              
1645             } elsif($paras->[0][0] eq '=back') {
1646             # Ignore empty lists by default
1647 6 100       27 if ($self->{'parse_empty_lists'}) {
1648 2         4 $list_type = 'empty';
1649             } else {
1650 4         9 shift @$paras;
1651 4         23 return 1;
1652             }
1653             } elsif($paras->[0][0] eq '~end') {
1654             $self->whine(
1655 0         0 $para->[1]{'start_line'},
1656             "=over is the last thing in the document?!"
1657             );
1658 0         0 return 1; # But feh, ignore it.
1659             } else {
1660 23         40 $list_type = 'block';
1661             }
1662 214         534 $para->[1]{'~type'} = $list_type;
1663 214         483 push @$curr_open, $para;
1664             # yes, we reuse the paragraph as a stack item
1665              
1666 214         699 my $content = join ' ', splice @$para, 2;
1667 214         567 $para->[1]{'~orig_content'} = $content;
1668 214         393 my $overness;
1669 214 100       1328 if($content =~ m/^\s*$/s) {
    50          
1670 120         8552 $para->[1]{'indent'} = 4;
1671             } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
1672 70     70   871 no integer;
  70         157  
  70         550  
1673 94         343 $para->[1]{'indent'} = $1;
1674 94 50       362 if($1 == 0) {
1675             $self->whine(
1676 0         0 $para->[1]{'start_line'},
1677             "Can't have a 0 in =over $content"
1678             );
1679 0         0 $para->[1]{'indent'} = 4;
1680             }
1681             } else {
1682             $self->whine(
1683 0         0 $para->[1]{'start_line'},
1684             "=over should be: '=over' or '=over positive_number'"
1685             );
1686 0         0 $para->[1]{'indent'} = 4;
1687             }
1688 214         329 DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1689              
1690 214 100 100     1165 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1691 214         1027 $self->_maybe_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1692              
1693 214         968 return;
1694             }
1695              
1696             sub _ponder_back {
1697 214     214   669 my ($self,$para,$curr_open,$paras) = @_;
1698             # TODO: fire off or or ??
1699              
1700 214         745 my $content = join ' ', splice @$para, 2;
1701 214 50       743 if($content =~ m/\S/) {
1702             $self->whine(
1703 0         0 $para->[1]{'start_line'},
1704             "=back doesn't take any parameters, but you said =back $content"
1705             );
1706             }
1707              
1708 214 50 33     1166 if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1709 214         300 DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1710             # Expected case: we're closing the most recently opened thing
1711             #my $over = pop @$curr_open;
1712 214 100 50     789 $self->{'content_seen'} ||= 1 unless $self->{'~tried_gen_errata'};
1713             $self->_maybe_handle_element_end( my $scratch =
1714 214         852 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1715             );
1716             } else {
1717 0         0 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1718             join(', ', map $_->[0], @$curr_open), ").\n";
1719             $self->whine(
1720 0         0 $para->[1]{'start_line'},
1721             '=back without =over'
1722             );
1723 0         0 return 1; # and ignore it
1724             }
1725             }
1726              
1727             sub _ponder_item {
1728 0     0   0 my ($self,$para,$curr_open,$paras) = @_;
1729 0         0 my $over;
1730 0 0 0     0 unless(@$curr_open and
1731 0         0 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1732             $self->whine(
1733 0         0 $para->[1]{'start_line'},
1734             "'=item' outside of any '=over'"
1735             );
1736             unshift @$paras,
1737 0         0 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1738             $para
1739             ;
1740 0         0 return 1;
1741             }
1742              
1743              
1744 0         0 my $over_type = $over->[1]{'~type'};
1745              
1746 0 0       0 if(!$over_type) {
    0          
    0          
    0          
    0          
1747             # Shouldn't happen1
1748             die "Typeless over in stack, starting at line "
1749 0         0 . $over->[1]{'start_line'};
1750              
1751             } elsif($over_type eq 'block') {
1752 0 0       0 unless($curr_open->[-1][1]{'~bitched_about'}) {
1753 0         0 $curr_open->[-1][1]{'~bitched_about'} = 1;
1754             $self->whine(
1755             $curr_open->[-1][1]{'start_line'},
1756             "You can't have =items (as at line "
1757 0         0 . $para->[1]{'start_line'}
1758             . ") unless the first thing after the =over is an =item"
1759             );
1760             }
1761             # Just turn it into a paragraph and reconsider it
1762 0         0 $para->[0] = '~Para';
1763 0         0 unshift @$paras, $para;
1764 0         0 return 1;
1765              
1766             } elsif($over_type eq 'text') {
1767 0         0 my $item_type = $self->_get_item_type($para);
1768             # That kills the content of the item if it's a number or bullet.
1769 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1770              
1771 0 0 0     0 if($item_type eq 'text') {
    0          
1772             # Nothing special needs doing for 'text'
1773             } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1774             $self->whine(
1775 0         0 $para->[1]{'start_line'},
1776             "Expected text after =item, not a $item_type"
1777             );
1778             # Undo our clobbering:
1779 0         0 push @$para, $para->[1]{'~orig_content'};
1780 0         0 delete $para->[1]{'number'};
1781             # Only a PROPER item-number element is allowed
1782             # to have a number attribute.
1783             } else {
1784 0         0 die "Unhandled item type $item_type"; # should never happen
1785             }
1786              
1787             # =item-text thingies don't need any assimilation, it seems.
1788              
1789             } elsif($over_type eq 'number') {
1790 0         0 my $item_type = $self->_get_item_type($para);
1791             # That kills the content of the item if it's a number or bullet.
1792 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1793              
1794 0         0 my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1795              
1796 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
    0          
1797             # Hm, it's not numeric. Correct for this.
1798 0         0 $para->[1]{'number'} = $expected_value;
1799             $self->whine(
1800 0         0 $para->[1]{'start_line'},
1801             "Expected '=item $expected_value'"
1802             );
1803 0         0 push @$para, $para->[1]{'~orig_content'};
1804             # restore the bullet, blocking the assimilation of next para
1805              
1806             } elsif($item_type eq 'text') {
1807             # Hm, it's not numeric. Correct for this.
1808 0         0 $para->[1]{'number'} = $expected_value;
1809             $self->whine(
1810 0         0 $para->[1]{'start_line'},
1811             "Expected '=item $expected_value'"
1812             );
1813             # Text content will still be there and will block next ~Para
1814              
1815             } elsif($item_type ne 'number') {
1816 0         0 die "Unknown item type $item_type"; # should never happen
1817              
1818             } elsif($expected_value == $para->[1]{'number'}) {
1819 0         0 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1820              
1821             } else {
1822 0         0 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1823             " instead of the expected value of $expected_value\n";
1824             $self->whine(
1825             $para->[1]{'start_line'},
1826 0         0 "You have '=item " . $para->[1]{'number'} .
1827             "' instead of the expected '=item $expected_value'"
1828             );
1829 0         0 $para->[1]{'number'} = $expected_value; # correcting!!
1830             }
1831              
1832 0 0       0 if(@$para == 2) {
1833             # For the cases where we /didn't/ push to @$para
1834 0 0       0 if($paras->[0][0] eq '~Para') {
1835 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1836 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1837             } else {
1838 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1839 0         0 push @$para, ''; # Just so it's not contentless
1840             }
1841             }
1842              
1843              
1844             } elsif($over_type eq 'bullet') {
1845 0         0 my $item_type = $self->_get_item_type($para);
1846             # That kills the content of the item if it's a number or bullet.
1847 0         0 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1848              
1849 0 0       0 if($item_type eq 'bullet') {
    0          
    0          
1850             # as expected!
1851              
1852 0 0       0 if( $para->[1]{'~_freaky_para_hack'} ) {
1853 0         0 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1854 0         0 push @$para, $para->[1]{'~_freaky_para_hack'};
1855             }
1856              
1857             } elsif($item_type eq 'number') {
1858             $self->whine(
1859 0         0 $para->[1]{'start_line'},
1860             "Expected '=item *'"
1861             );
1862 0         0 push @$para, $para->[1]{'~orig_content'};
1863             # and block assimilation of the next paragraph
1864 0         0 delete $para->[1]{'number'};
1865             # Only a PROPER item-number element is allowed
1866             # to have a number attribute.
1867             } elsif($item_type eq 'text') {
1868             $self->whine(
1869 0         0 $para->[1]{'start_line'},
1870             "Expected '=item *'"
1871             );
1872             # But doesn't need processing. But it'll block assimilation
1873             # of the next para.
1874             } else {
1875 0         0 die "Unhandled item type $item_type"; # should never happen
1876             }
1877              
1878 0 0       0 if(@$para == 2) {
1879             # For the cases where we /didn't/ push to @$para
1880 0 0       0 if($paras->[0][0] eq '~Para') {
1881 0         0 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1882 0         0 push @$para, splice @{shift @$paras},2;
  0         0  
1883             } else {
1884 0         0 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1885 0         0 push @$para, ''; # Just so it's not contentless
1886             }
1887             }
1888              
1889             } else {
1890 0         0 die "Unhandled =over type \"$over_type\"?";
1891             # Shouldn't happen!
1892             }
1893 0         0 $para->[0] .= '-' . $over_type;
1894              
1895 0         0 return;
1896             }
1897              
1898             sub _ponder_Plain {
1899 4582     4582   9571 my ($self,$para) = @_;
1900 4582         6511 DEBUG and print STDERR " giving plain treatment...\n";
1901 4582 100 100     30353 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
      66        
      100        
1902             or $para->[1]{'~cooked'}
1903             ) {
1904             push @$para,
1905 4477         7132 @{$self->_make_treelet(
1906             join("\n", splice(@$para, 2)),
1907 4477         28045 $para->[1]{'start_line'}
1908             )};
1909             }
1910             # Empty paragraphs don't need a treelet for any reason I can see.
1911             # And precooked paragraphs already have a treelet.
1912 4582         11723 return;
1913             }
1914              
1915             sub _ponder_Verbatim {
1916 534     534   1302 my ($self,$para) = @_;
1917 534         730 DEBUG and print STDERR " giving verbatim treatment...\n";
1918              
1919 534         1510 $para->[1]{'xml:space'} = 'preserve';
1920              
1921 534 100       1613 unless ($self->{'_output_is_for_JustPod'}) {
1922             # Fix illegal settings for expand_verbatim_tabs()
1923             # This is because this module doesn't do input error checking, but khw
1924             # doesn't want to add yet another instance of that.
1925 453         1814 my $tab_width = $self->expand_verbatim_tabs;
1926 453 100 100     2546 $tab_width = $self->expand_verbatim_tabs(8)
1927             if ! defined $tab_width
1928             || $tab_width =~ /\D/;
1929              
1930 453         1502 my $indent = $self->strip_verbatim_indent;
1931 453 100 100     1454 if ($indent && ref $indent eq 'CODE') {
1932 10         19 my @shifted = (shift @{$para}, shift @{$para});
  10         23  
  10         23  
1933 10         44 $indent = $indent->($para);
1934 10         106 unshift @{$para}, @shifted;
  10         32  
1935             }
1936              
1937 453         1342 for(my $i = 2; $i < @$para; $i++) {
1938 2077         3404 foreach my $line ($para->[$i]) { # just for aliasing
1939             # Strip indentation.
1940 2077 100       3924 $line =~ s/^\Q$indent// if $indent;
1941 2077 100       3606 next unless $tab_width;
1942              
1943             # This is commented out because of github issue #85, and the
1944             # current maintainers don't know why it was there in the first
1945             # place.
1946             #&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1947 2076         6614 while( $line =~
1948             # Sort of adapted from Text::Tabs.
1949 189         1266 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1950             * $tab_width)
1951             -(length($1) % $tab_width)))/e
1952             ) {}
1953              
1954             # TODO: whinge about (or otherwise treat) unindented or overlong lines
1955              
1956             }
1957             }
1958             }
1959              
1960             # Now the VerbatimFormatted hoodoo...
1961 534 100 66     2916 if( $self->{'accept_codes'} and
    100          
1962             $self->{'accept_codes'}{'VerbatimFormatted'}
1963             ) {
1964 103   100     529 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
  76         296  
1965             # Kill any number of terminal newlines
1966 103         450 $self->_verbatim_format($para);
1967             } elsif ($self->{'codes_in_verbatim'}) {
1968             push @$para,
1969 2         4 @{$self->_make_treelet(
1970             join("\n", splice(@$para, 2)),
1971 2         11 $para->[1]{'start_line'}, $para->[1]{'xml:space'}
1972             )};
1973 2         6 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1974             } else {
1975 429 100       2565 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1976 429         3368 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1977             }
1978 534         1338 return;
1979             }
1980              
1981             sub _ponder_Data {
1982 54     54   141 my ($self,$para) = @_;
1983 54         83 DEBUG and print STDERR " giving data treatment...\n";
1984 54         133 $para->[1]{'xml:space'} = 'preserve';
1985 54 100       299 push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1986 54         169 return;
1987             }
1988              
1989              
1990              
1991              
1992             ###########################################################################
1993              
1994             sub _traverse_treelet_bit { # for use only by the routine above
1995 8054     8054   20661 my($self, $name) = splice @_,0,2;
1996              
1997 8054         11516 my $scratch;
1998 8054         25462 $self->_maybe_handle_element_start(($scratch=$name), shift @_);
1999              
2000 8054         17951 while (@_) {
2001 12896         20769 my $x = shift;
2002 12896 100       21926 if (ref($x)) {
2003 2890         6298 &_traverse_treelet_bit($self, @$x);
2004             } else {
2005 10006   100     27965 $x .= shift while @_ && !ref($_[0]);
2006 10006         22256 $self->_maybe_handle_text($x);
2007             }
2008             }
2009              
2010 8054         23569 $self->_maybe_handle_element_end($scratch=$name);
2011 8054         26986 return;
2012             }
2013              
2014             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2015              
2016             sub _closers_for_all_curr_open {
2017 10     10   14 my $self = $_[0];
2018 10         21 my @closers;
2019 10 50       16 foreach my $still_open (@{ $self->{'curr_open'} || return }) {
  10         35  
2020 15         35 my @copy = @$still_open;
2021 15         19 $copy[1] = {%{ $copy[1] }};
  15         58  
2022             #$copy[1]{'start_line'} = -1;
2023 15 100       42 if($copy[0] eq '=for') {
    50          
2024 9         15 $copy[0] = '=end';
2025             } elsif($copy[0] eq '=over') {
2026             $self->whine(
2027             $still_open->[1]{start_line} ,
2028 6         36 "=over without closing =back"
2029             );
2030              
2031 6         9 $copy[0] = '=back';
2032             } else {
2033 0         0 die "I don't know how to auto-close an open $copy[0] region";
2034             }
2035              
2036 15 50       78 unless( @copy > 2 ) {
2037 15         28 push @copy, $copy[1]{'target'};
2038 15 100       33 $copy[-1] = '' unless defined $copy[-1];
2039             # since =over's don't have targets
2040             }
2041              
2042 15         29 $copy[1]{'fake-closer'} = 1;
2043              
2044 15         16 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
2045 15         35 unshift @closers, \@copy;
2046             }
2047 10         22 return @closers;
2048             }
2049              
2050             #--------------------------------------------------------------------------
2051              
2052             sub _verbatim_format {
2053 103     103   246 my($it, $p) = @_;
2054              
2055 103         159 my $formatting;
2056              
2057 103         276 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
2058 559         718 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
2059 559         1282 $p->[$i] .= "\n";
2060             # Unlike with simple Verbatim blocks, we don't end up just doing
2061             # a join("\n", ...) on the contents, so we have to append a
2062             # newline to every line, and then nix the last one later.
2063             }
2064              
2065 103         229 if( DEBUG > 4 ) {
2066             print STDERR "<<\n";
2067             for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
2068             print STDERR "_verbatim_format $i: $p->[$i]";
2069             }
2070             print STDERR ">>\n";
2071             }
2072              
2073 103         352 for(my $i = $#$p; $i > 2; $i--) {
2074             # work backwards over the lines, except the first (#2)
2075              
2076             #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
2077             # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
2078             # look at a formatty line preceding a nonformatty one
2079 455         549 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
2080 455 100       848 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
2081 11         17 DEBUG > 5 and print STDERR " It's a formatty line. ",
2082             "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
2083              
2084 11 100       31 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
2085 1         3 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
2086 1         4 next;
2087             } else {
2088 10         17 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
2089             }
2090             } else {
2091 444         518 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
2092 444         847 next;
2093             }
2094              
2095             # A formatty line has to have #: in the first two columns, and uses
2096             # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
2097             # Example:
2098             # What do you want? i like pie. [or whatever]
2099             # #:^^^^^^^^^^^^^^^^^ /////////////
2100              
2101              
2102 10         13 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
2103              
2104 10         40 $formatting = ' ' . $1;
2105 10         45 $formatting =~ s/\s+$//s; # nix trailing whitespace
2106 10 50 33     59 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
2107 0         0 splice @$p,$i,1; # remove this line
2108 0         0 $i--; # don't consider next line
2109 0         0 next;
2110             }
2111              
2112 10 100       26 if( length($formatting) >= length($p->[$i-1]) ) {
2113 3         11 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
2114             } else {
2115 7         24 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
2116             }
2117             # Make $formatting and the previous line be exactly the same length,
2118             # with $formatting having a " " as the last character.
2119              
2120 10         15 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
2121              
2122              
2123 10         17 my @new_line;
2124 10         47 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
2125             #print STDERR "Format matches $1\n";
2126              
2127 54 100       129 if($2) {
2128             #print STDERR "SKIPPING <$2>\n";
2129 32         165 push @new_line,
2130             substr($p->[$i-1], pos($formatting)-length($1), length($1));
2131             } else {
2132             #print STDERR "SNARING $+\n";
2133 22 50       183 push @new_line, [
    100          
    100          
2134             (
2135             $3 ? 'VerbatimB' :
2136             $4 ? 'VerbatimI' :
2137             $5 ? 'VerbatimBI' : die("Should never get called")
2138             ), {},
2139             substr($p->[$i-1], pos($formatting)-length($1), length($1))
2140             ];
2141             #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
2142             }
2143             }
2144 10         54 my @nixed =
2145             splice @$p, $i-1, 2, @new_line; # replace myself and the next line
2146 10         17 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
2147              
2148 10         14 DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
2149             scalar(@new_line), "):",
2150             map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
2151 10         43 $i--; # So the next line we scrutinize is the line before the one
2152             # that we just went and formatted
2153             }
2154              
2155 103         259 $p->[0] = 'VerbatimFormatted';
2156              
2157             # Collapse adjacent text nodes, just for kicks.
2158 103         332 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2159 0 0 0     0 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2160 0         0 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2161 0         0 $p->[$i] .= splice @$p, $i+1, 1; # merge
2162 0         0 --$i; # and back up
2163             }
2164             }
2165              
2166             # Now look for the last text token, and remove the terminal newline
2167 103         325 for( my $i = $#$p; $i >= 2; $i-- ) {
2168             # work backwards over the tokens, even the first
2169 103 50       353 if( !ref($p->[$i]) ) {
2170 103 50       720 if($p->[$i] =~ s/\n$//s) {
2171 103         309 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2172             } else {
2173 0         0 DEBUG > 5 and print STDERR
2174             "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2175             }
2176 103         229 last; # we only want the next one
2177             }
2178             }
2179              
2180 103         256 return;
2181             }
2182              
2183              
2184             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2185              
2186              
2187             sub _treelet_from_formatting_codes {
2188             # Given a paragraph, returns a treelet. Full of scary tokenizing code.
2189             # Like [ '~Top', {'start_line' => $start_line},
2190             # "I like ",
2191             # [ 'B', {}, "pie" ],
2192             # "!"
2193             # ]
2194             # This illustrates the general format of a treelet. It is an array:
2195             # [0] is a scalar indicating its type. In the example above, the
2196             # types are '~Top' and 'B'
2197             # [1] is a hash of various flags about it, possibly empty
2198             # [2] - [N] are an ordered list of the subcomponents of the treelet.
2199             # Scalars are literal text, refs are sub-treelets, to
2200             # arbitrary levels. Stringifying a treelet will recursively
2201             # stringify the sub-treelets, concatentating everything
2202             # together to form the exact text of the treelet.
2203              
2204 4479     4479   11436 my($self, $para, $start_line, $preserve_space) = @_;
2205              
2206 4479         16433 my $treelet = ['~Top', {'start_line' => $start_line},];
2207              
2208 4479 100 100     18529 unless ($preserve_space || $self->{'preserve_whitespace'}) {
2209 3602         37624 $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2210 3602         7250 $para =~ s/ $//;
2211 3602         5964 $para =~ s/^ //;
2212             }
2213              
2214             # Only apparent problem the above code is that N<< >> turns into
2215             # N<< >>. But then, word wrapping does that too! So don't do that!
2216              
2217              
2218             # As a Start-code is encountered, the number of opening bracket '<'
2219             # characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2220             # etc). When closing brackets are found in the text, at least this number
2221             # (plus the 1) will be required to mean the Start-code is terminated. When
2222             # those are found, @stack is popped.
2223 4479         7015 my @stack;
2224              
2225 4479         8591 my @lineage = ($treelet);
2226 4479         7449 my $raw = ''; # raw content of L<> fcode before splitting/processing
2227             # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2228             # into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2229             # the 'collapse and trim all whitespace first' lines just above.
2230 4479         6296 my $inL = 0;
2231              
2232 4479         5895 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2233              
2234             # Here begins our frightening tokenizer RE. The following regex matches
2235             # text in four main parts:
2236             #
2237             # * Start-codes. The first alternative matches C< or C<<, the latter
2238             # followed by some whitespace. $1 will hold the entire start code
2239             # (including any space following a multiple-angle-bracket delimiter),
2240             # and $2 will hold only the additional brackets past the first in a
2241             # multiple-bracket delimiter. length($2) + 1 will be the number of
2242             # closing brackets we have to find.
2243             #
2244             # * Closing brackets. Match some amount of whitespace followed by
2245             # multiple close brackets. The logic to see if this closes anything
2246             # is down below. Note that in order to parse C<< >> correctly, we
2247             # have to use look-behind (?<=\s\s), since the match of the starting
2248             # code will have consumed the whitespace.
2249             #
2250             # * A single closing bracket, to close a simple code like C<>.
2251             #
2252             # * Something that isn't a start or end code. We have to be careful
2253             # about accepting whitespace, since perlpodspec says that any whitespace
2254             # before a multiple-bracket closing delimiter should be ignored.
2255             #
2256 4479         60166 while($para =~
2257             m/\G
2258             (?:
2259             # Match starting codes, including the whitespace following a
2260             # multiple-delimiter start code. $1 gets the whole start code and
2261             # $2 gets all but one of the
2262             ([A-Z]<(?:(<+)\s+)?)
2263             |
2264             # Match multiple-bracket end codes. $3 gets the whitespace that
2265             # should be discarded before an end bracket but kept in other cases
2266             # and $4 gets the end brackets themselves. ($3 can be empty if the
2267             # construct is empty, like C<< >>, and all the white-space has been
2268             # gobbled up already, considered to be space after the opening
2269             # bracket. In this case we use look-behind to verify that there are
2270             # at least 2 spaces in a row before the ">".)
2271             (\s+|(?<=\s\s))(>{2,})
2272             |
2273             (\s?>) # $5: simple end-codes
2274             |
2275             ( # $6: stuff containing no start-codes or end-codes
2276             (?:
2277             [^A-Z\s>]
2278             |
2279             (?:
2280             [A-Z](?!<)
2281             )
2282             |
2283             # whitespace is ok, but we don't want to eat the whitespace before
2284             # a multiple-bracket end code.
2285             # NOTE: we may still have problems with e.g. S<< >>
2286             (?:
2287             \s(?!\s*>{2,})
2288             )
2289             )+
2290             )
2291             )
2292             /xgo
2293             ) {
2294 15947         21562 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2295 15947 100       53269 if(defined $1) {
    100          
    100          
    50          
2296 3087         4340 my $bracket_count; # How many '<<<' in a row this has. Needed for
2297             # Pod::Simple::JustPod
2298 3087 100       5819 if(defined $2) {
2299 119         197 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2300 119         227 $bracket_count = length($2) + 1;
2301 119         220 push @stack, $bracket_count; # length of the necessary complex
2302             # end-code string
2303             } else {
2304 2968         3884 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2305 2968         4655 push @stack, 0; # signal that we're looking for simple
2306 2968         4706 $bracket_count = 1;
2307             }
2308 3087         7045 my $code = substr($1,0,1);
2309 3087 100       6288 if ('L' eq $code) {
2310 437 100       910 if ($inL) {
2311 1         3 $raw .= $1;
2312 1         17 $self->scream( $start_line,
2313             'Nested L<> are illegal. Pretending inner one is '
2314             . 'X<...> so can continue looking for other errors.');
2315 1         3 $code = "X";
2316             }
2317             else {
2318 436         740 $raw = ""; # reset raw content accumulator
2319 436         778 $inL = @stack;
2320             }
2321             } else {
2322 2650 100       5228 $raw .= $1 if $inL;
2323             }
2324 3087         8291 push @lineage, [ $code, {}, ]; # new node object
2325              
2326             # Tell Pod::Simple::JustPod how many brackets there were, but to save
2327             # space, not in the most usual case of there was just 1. It can be
2328             # inferred by the absence of this element. Similarly, if there is more
2329             # than one bracket, extract the white space between the final bracket
2330             # and the real beginning of the interior. Save that if it isn't just a
2331             # single space
2332 3087 100 100     8534 if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2333 18         44 $lineage[-1][1]{'~bracket_count'} = $bracket_count;
2334 18         47 my $lspacer = substr($1, 1 + $bracket_count);
2335 18 100       54 $lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2336             }
2337 3087         4166 push @{ $lineage[-2] }, $lineage[-1];
  3087         20445  
2338             } elsif(defined $4) {
2339 130         197 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2340             # This is where it gets messy...
2341 130 100       536 if(! @stack) {
    100          
    100          
    50          
2342             # We saw " >>>>" but needed nothing. This is ALL just stuff then.
2343 1         3 DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2344 1         2 push @{ $lineage[-1] }, $3, $4;
  1         5  
2345 1         16 next;
2346             } elsif(!$stack[-1]) {
2347             # We saw " >>>>" but needed only ">". Back pos up.
2348 3         5 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2349 3         6 push @{ $lineage[-1] }, $3; # That was a for-real space, too.
  3         11  
2350 3         17 pos($para) = pos($para) - length($4) + 1;
2351             } elsif($stack[-1] == length($4)) {
2352             # We found " >>>>", and it was exactly what we needed. Commonest case.
2353 119         199 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2354             } elsif($stack[-1] < length($4)) {
2355             # We saw " >>>>" but needed only " >>". Back pos up.
2356 0         0 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2357 0         0 pos($para) = pos($para) - length($4) + $stack[-1];
2358             } else {
2359             # We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2360 7         15 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2361 7         12 push @{ $lineage[-1] }, $3, $4;
  7         26  
2362 7         45 next;
2363             }
2364             #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2365              
2366 122 50 66     418 if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2367 3 100       14 if ($3 ne "") {
    100          
2368 1         4 $lineage[-1][1]{'~rspacer'} = $3;
2369             }
2370             elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2371              
2372             # Here we had something like C<< >> which was a false positive
2373 1         4 delete $lineage[-1][1]{'~lspacer'};
2374             }
2375             else {
2376             $lineage[-1][1]{'~rspacer'}
2377 1         4 = substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2378 1         4 chop $lineage[-1][1]{'~lspacer'};
2379             }
2380             }
2381              
2382 122 100       211 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  2         5  
  122         391  
2383             # Keep the element from being childless
2384              
2385 122 100       325 if ($inL == @stack) {
2386 26         119 $lineage[-1][1]{'raw'} = $raw;
2387 26         42 $inL = 0;
2388             }
2389              
2390 122         224 pop @stack;
2391 122         204 pop @lineage;
2392              
2393 122 100       859 $raw .= $3.$4 if $inL;
2394              
2395             } elsif(defined $5) {
2396 3064         4036 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2397              
2398 3064 100 100     9878 if(@stack and ! $stack[-1]) {
2399             # We're indeed expecting a simple end-code
2400 2957         3786 DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2401              
2402 2957 50       6065 if(length($5) == 2) { # There was a space there: " >"
    100          
2403 0         0 push @{ $lineage[-1] }, ' ';
  0         0  
2404 2957         6494 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2405 41         56 push @{ $lineage[-1] }, ''; # keep it from being really childless
  41         90  
2406             }
2407              
2408 2957 100       6058 if ($inL == @stack) {
2409 410         1233 $lineage[-1][1]{'raw'} = $raw;
2410 410         599 $inL = 0;
2411             }
2412              
2413 2957         4285 pop @stack;
2414 2957         4139 pop @lineage;
2415             } else {
2416 107         315 DEBUG > 4 and print STDERR " It's just stuff.\n";
2417 107         161 push @{ $lineage[-1] }, $5;
  107         305  
2418             }
2419              
2420 3064 100       26902 $raw .= $5 if $inL;
2421              
2422             } elsif(defined $6) {
2423 9666         12978 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2424 9666         13032 push @{ $lineage[-1] }, $6;
  9666         27417  
2425 9666 100       37716 $raw .= $6 if $inL;
2426             # XXX does not capture multiplace whitespaces -- 'raw' ends up with
2427             # at most 1 leading/trailing whitespace, why not all of it?
2428             # Answer, because we deliberately trimmed it above
2429              
2430             } else {
2431             # should never ever ever ever happen
2432 0         0 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2433 0         0 die "SPORK 512512!";
2434             }
2435             }
2436              
2437 4479 100       9493 if(@stack) { # Uhoh, some sequences weren't closed.
2438 8         22 my $x= "...";
2439 8         21 while(@stack) {
2440 8 50       14 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
  0         0  
  8         26  
2441             # Hmmmmm!
2442              
2443 8         22 my $code = (pop @lineage)->[0];
2444 8         15 my $ender_length = pop @stack;
2445 8 50       35 if($ender_length) {
2446 0         0 --$ender_length;
2447 0         0 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2448             } else {
2449 8         34 $x = $code . "<$x>";
2450             }
2451             }
2452 8         12 DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2453 8         39 $self->whine($start_line,
2454             "Unterminated $x sequence",
2455             );
2456             }
2457              
2458 4479         17092 return $treelet;
2459             }
2460              
2461             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2462              
2463             sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2464 0     0 0 0 return stringify_lol($_[1]);
2465             }
2466              
2467             sub stringify_lol { # function: stringify_lol($lol)
2468 2641     2641 0 4342 my $string_form = '';
2469 2641         6361 _stringify_lol( $_[0] => \$string_form );
2470 2641         8377 return $string_form;
2471             }
2472              
2473             sub _stringify_lol { # the real recursor
2474 3029     3029   5148 my($lol, $to) = @_;
2475 3029         6574 for(my $i = 2; $i < @$lol; ++$i) {
2476 4049 100 100     11234 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
      66        
2477 388         810 _stringify_lol( $lol->[$i], $to); # recurse!
2478             } else {
2479 3661         9018 $$to .= $lol->[$i];
2480             }
2481             }
2482 3029         5341 return;
2483             }
2484              
2485             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2486              
2487             sub _dump_curr_open { # return a string representation of the stack
2488 3     3   3 my $curr_open = $_[0]{'curr_open'};
2489              
2490 3 50       5 return '[empty]' unless @$curr_open;
2491             return join '; ',
2492 3         6 map {;
2493             ($_->[0] eq '=for')
2494             ? ( ($_->[1]{'~really'} || '=over')
2495 4 50 50     35 . ' ' . $_->[1]{'target'})
2496             : $_->[0]
2497             }
2498             @$curr_open
2499             ;
2500             }
2501              
2502             ###########################################################################
2503             my %pretty_form = (
2504             "\a" => '\a', # ding!
2505             "\b" => '\b', # BS
2506             "\e" => '\e', # ESC
2507             "\f" => '\f', # FF
2508             "\t" => '\t', # tab
2509             "\cm" => '\cm',
2510             "\cj" => '\cj',
2511             "\n" => '\n', # probably overrides one of either \cm or \cj
2512             '"' => '\"',
2513             '\\' => '\\\\',
2514             '$' => '\\$',
2515             '@' => '\\@',
2516             '%' => '\\%',
2517             '#' => '\\#',
2518             );
2519              
2520             sub pretty { # adopted from Class::Classless
2521             # Not the most brilliant routine, but passable.
2522             # Don't give it a cyclic data structure!
2523 502     502 0 249038 my @stuff = @_; # copy
2524 502         682 my $x;
2525             my $out =
2526             # join ",\n" .
2527             join ", ",
2528 502         813 map {;
2529 592 50 100     4314 if(!defined($_)) {
    100 66        
    50 33        
    100          
    100          
    100          
2530 0         0 "undef";
2531             } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2532 58         122 $x = "[ " . pretty(@$_) . " ]" ;
2533 58         138 $x;
2534             } elsif(ref($_) eq 'SCALAR') {
2535 0         0 $x = "\\" . pretty($$_) ;
2536 0         0 $x;
2537             } elsif(ref($_) eq 'HASH') {
2538 48         85 my $hr = $_;
2539             $x = "{" . join(", ",
2540 48         416 map(pretty($_) . '=>' . pretty($hr->{$_}),
2541             sort keys %$hr ) ) . "}" ;
2542 48         236 $x;
2543 2         5 } elsif(!length($_)) { q{''} # empty string
2544             } elsif(
2545             $_ eq '0' # very common case
2546             or(
2547             m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
2548             and $_ ne '-0' # the strange case that RE lets thru
2549             )
2550 26         77 ) { $_;
2551             } else {
2552             # Yes, explicitly name every character desired. There are shorcuts one
2553             # could make, but I (Karl Williamson) was afraid that some Perl
2554             # releases would have bugs in some of them. For example [A-Z] works
2555             # even on EBCDIC platforms to match exactly the 26 uppercase English
2556             # letters, but I don't know if it has always worked without bugs. It
2557             # seemed safest just to list the characters.
2558             # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2559 458         1037 s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
2560 0 0       0 <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2561             #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg;
2562 458         1305 qq{"$_"};
2563             }
2564             } @stuff;
2565             # $out =~ s/\n */ /g if length($out) < 75;
2566 502         5133 return $out;
2567             }
2568              
2569             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2570              
2571             # A rather unsubtle method of blowing away all the state information
2572             # from a parser object so it can be reused. Provided as a utility for
2573             # backward compatibility in Pod::Man, etc. but not recommended for
2574             # general use.
2575              
2576             sub reinit {
2577 5     5 0 15054 my $self = shift;
2578 5         20 foreach (qw(source_dead source_filename doc_has_started
2579             start_of_pod_block content_seen last_was_blank paras curr_open
2580             line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2581             Title _current_headings _in_head _filter_allowed)) {
2582              
2583 95         186 delete $self->{$_};
2584             }
2585             }
2586              
2587             #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2588             1;
2589