File Coverage

blib/lib/Pod/Simple/BlackBox.pm

Criterion	Covered	Total	%
statement	891	1101	80.9
branch	398	568	70.0
condition	136	196	69.3
subroutine	72	75	96.0
pod	0	8	0.0
total	1497	1948	76.8

line	stmt	bran	cond	sub	pod	time	code
1							package Pod::Simple::BlackBox;
2							#
3							# "What's in the box?" "Pain."
4							#
5							###########################################################################
6							#
7							# This is where all the scary things happen: parsing lines into
8							# paragraphs; and then into directives, verbatims, and then also
9							# turning formatting sequences into treelets.
10							#
11							# Are you really sure you want to read this code?
12							#
13							#-----------------------------------------------------------------------------
14							#
15							# The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16							# of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17							# to call the proper callbacks on the treelets.
18							#
19							# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21	70			70		297885	use integer; # vroom!
	70					164
	70					463
22	70			70		2870	use strict;
	70					132
	70					1479
23	70			70		384	use warnings;
	70					166
	70					3166
24	70			70		399	use Carp ();
	70					132
	70					6764
25							our $VERSION = '3.47';
26							#use constant DEBUG => 7;
27
28							sub my_qr ($$) {
29
30							# $1 is a pattern to compile and return. Older perls compile any
31							# syntactically valid property, even if it isn't legal. To cope with
32							# this, return an empty string unless the compiled pattern also
33							# successfully matches $2, which the caller furnishes.
34
35	642			642	0	2105	my ($input_re, $should_match) = @_;
36							# XXX could have a third parameter $shouldnt_match for extra safety
37
38	70	50		70		490	my $use_utf8 = do { no integer; $] <= 5.006002 } ? 'use utf8;' : "";
	70					176
	70					453
	642					1528
	642					2328
39
40	70			70		717	my $re = eval "no warnings; $use_utf8 qr/$input_re/";
	70			70		150
	70			70		5591
	70			70		501
	70			70		189
	70			70		4185
	70			70		501
	70			70		192
	70			70		3942
	70			3		508
	70			3		141
	70			3		4043
	70			3		429
	70					163
	70					4304
	70					697
	70					376
	70					4996
	70					466
	70					137
	70					6602
	70					509
	70					141
	70					3806
	70					554
	70					168
	70					6956
	3					34
	3					5
	3					623
	3					14
	3					4
	3					147
	3					13
	3					5
	3					127
	3					11
	3					4
	3					118
	642					52074
41							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
42	642	50				3001	return "" if $@;
43
44	70			70		499	my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
	70			70		146
	70			70		3801
	70			70		492
	70			70		197
	70			70		4374
	70			70		518
	70			70		164
	70			70		4008
	70			3		420
	70			3		157
	70			3		3885
	70			3		455
	70					153
	70					4541
	70					503
	70					141
	70					4708
	70					483
	70					195
	70					6233
	70					426
	70					163
	70					3906
	70					555
	70					169
	70					5374
	3					16
	3					5
	3					347
	3					12
	3					6
	3					189
	3					12
	3					4
	3					127
	3					18
	3					5
	3					199
	642					51593
45							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
46	642	50				3089	return "" if $@;
47
48							#print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49	642	50				2747	return $re if $matches;
50
51							#print STDERR __LINE__, ": $re: didn't match\n";
52	0					0	return "";
53							}
54
55							BEGIN {
56	70			70		15588	require Pod::Simple;
57	70	50				7893	*DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58							}
59
60							# Matches a character iff the character will have a different meaning
61							# if we choose CP1252 vs UTF-8 if there is no =encoding line.
62							# This is broken for early Perls on non-ASCII platforms.
63							my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64							$non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65
66							# Use patterns understandable by Perl 5.6, if possible
67	70			70		536	my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
	70					181
	70					22063
68							my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69							# to get assigned
70							my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71							"\x{250}");
72							$rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73
74	70			70		476	my $script_run_re = eval 'no warnings "experimental::script_run";
	70					170
	70					3916
75							qr/(script_run: ^ . $ )/x';
76							my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77							unless ($latin_re) {
78							# This was machine generated to be the ranges of the union of the above
79							# three properties, with things that were undefined by Unicode 4.1 filling
80							# gaps. That is the version in use when Perl advanced enough to
81							# successfully compile and execute the above pattern.
82							$latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83							}
84
85							my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86
87							# Latin script code points not in the first release of Unicode
88							my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89
90							# If this perl doesn't have the Deprecated property, there's only one code
91							# point in it that we need be concerned with.
92							my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93							$deprecated_re = qr/\x{149}/ unless $deprecated_re;
94
95							my $utf8_bom;
96	70			70		546	if ( do { no integer; "$]" >= 5.007_003 }) {
	70					199
	70					390
97							$utf8_bom = "\x{FEFF}";
98							utf8::encode($utf8_bom);
99							} else {
100							$utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101							}
102
103							# This is used so that the 'content_seen' method doesn't return true on a
104							# file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105							# there is a valid =foo line will we return that content was seen.
106							my $seen_legal_directive = 0;
107
108							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109
110	0			0	0	0	sub parse_line { shift->parse_lines(@_) } # alias
111
112							# - - - Turn back now! Run away! - - -
113
114							sub parse_lines { # Usage: $parser->parse_lines(@lines)
115							# an undef means end-of-stream
116	9584			9584	0	15440	my $self = shift;
117
118	9584					17391	my $code_handler = $self->{'code_handler'};
119	9584					15602	my $cut_handler = $self->{'cut_handler'};
120	9584					16743	my $wl_handler = $self->{'whiteline_handler'};
121	9584		100			23802	$self->{'line_count'} \|\|= 0;
122
123	9584					13412	my $scratch;
124
125							DEBUG > 4 and
126	9584					12922	print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127
128	9584					12862	DEBUG > 5 and
129							print STDERR "# About to parse lines: ",
130							join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131
132	9584		100			23609	my $paras = ($self->{'paras'} \|\|= []);
133							# paragraph buffer. Because we need to defer processing of =over
134							# directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135							# to process this.
136
137	9584		100			23846	$self->{'pod_para_count'} \|\|= 0;
138
139							# An attempt to match the pod portions of a line. This is not fool proof,
140							# but is good enough to serve as part of the heuristic for guessing the pod
141							# encoding if not specified.
142	9584					15798	my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
	101347					218681
	9584					81781
143	9584					75087	my $pod_chars_re = qr/ ^ = [A-Za-z]+ \| [\Q$codes\E] < /x;
144
145	9584					15960	my $line;
146	9584					20576	foreach my $source_line (@_) {
147	19136	50				42091	if( $self->{'source_dead'} ) {
148	0					0	DEBUG > 4 and print STDERR "# Source is dead.\n";
149	0					0	last;
150							}
151
152	19136	100				42455	unless( defined $source_line ) {
153	903					1327	DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154
155	903					3398	push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156	903					2464	push @$paras, $paras->[-1], $paras->[-1];
157							# So that it definitely fills the buffer.
158	903					2024	$self->{'source_dead'} = 1;
159	903					2667	$self->_ponder_paragraph_buffer;
160	903					1951	next;
161							}
162
163
164	18233	100				37778	if( $self->{'line_count'}++ ) {
165	17334					40606	($line = $source_line) =~ tr/\n\r//d;
166							# If we don't have two vars, we'll end up with that there
167							# tr/// modding the (potentially read-only) original source line!
168
169							} else {
170	899					1303	DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171
172	899	50				7391	if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
		50
		50
173	0					0	DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174	0					0	$self->_handle_encoding_line( "=encoding utf8" );
175	0					0	delete $self->{'_processed_encoding'};
176	0					0	$line =~ tr/\n\r//d;
177
178							} elsif( $line =~ s/^\xFE\xFF//s ) {
179	0					0	DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180							$self->scream(
181	0					0	$self->{'line_count'},
182							"UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183							);
184	0					0	splice @_;
185	0					0	push @_, undef;
186	0					0	next;
187
188							# TODO: implement somehow?
189
190							} elsif( $line =~ s/^\xFF\xFE//s ) {
191	0					0	DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192							$self->scream(
193	0					0	$self->{'line_count'},
194							"UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195							);
196	0					0	splice @_;
197	0					0	push @_, undef;
198	0					0	next;
199
200							# TODO: implement somehow?
201
202							} else {
203	899					1430	DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204	899					2431	($line = $source_line) =~ tr/\n\r//d;
205							}
206							}
207
208	18233	100	100			158609	if(!$self->{'parse_characters'} && !$self->{'encoding'}
			100
			100
			100
209							&& ($self->{'in_pod'} \|\| $line =~ /^=/s)
210							&& $line =~ /$non_ascii_re/
211							) {
212
213	21					45	my $encoding;
214
215							# No =encoding line, and we are at the first pod line in the input that
216							# contains a non-ascii byte, that is, one whose meaning varies depending
217							# on whether the file is encoded in UTF-8 or CP1252, which are the two
218							# possibilities permitted by the pod spec. (ASCII is assumed if the
219							# file only contains ASCII bytes.) In order to process this line, we
220							# need to figure out what encoding we will use for the file.
221							#
222							# Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223							# 160-255, but it is used here, as it often colloquially is, to refer to
224							# the complete set of code points 0-255, including ASCII (0-127), the C1
225							# controls (128-159), and strict Latin 1 (160-255).
226							#
227							# CP1252 is effectively a superset of Latin 1, because it differs only
228							# from colloquial 8859-1 in the C1 controls, which are very unlikely to
229							# actually be present in 8859-1 files, so can be used for other purposes
230							# without conflict. CP 1252 uses most of them for graphic characters.
231							#
232							# Note that all ASCII-range bytes represent their corresponding code
233							# points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234							# code points require multiple (non-ASCII) bytes to represent. (A
235							# separate paragraph for EBCDIC is below.) The multi-byte
236							# representation is quite structured. If we find an isolated byte that
237							# would require multiple bytes to represent in UTF-8, we know that the
238							# encoding is not UTF-8. If we find a sequence of bytes that violates
239							# the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240							# hence must be 1252.
241							#
242							# But there are ambiguous cases where we could guess wrong. If so, the
243							# user will end up having to supply an =encoding line. We use all
244							# readily available information to improve our chances of guessing
245							# right. The odds of something not being UTF-8, but still passing a
246							# UTF-8 validity test go down very rapidly with increasing length of the
247							# sequence. Therefore we look at all non-ascii sequences on the line.
248							# If any of the sequences can't be UTF-8, we quit there and choose
249							# CP1252. If all could be UTF-8, we see if any of the code points
250							# represented are unlikely to be in pod. If so, we guess CP1252. If
251							# not, we check if the line is all in the same script; if not guess
252							# CP1252; otherwise UTF-8. For perls that don't have convenient script
253							# run testing, see if there is both Latin and non-Latin. If so, CP1252,
254							# otherwise UTF-8.
255							#
256							# On EBCDIC platforms, the situation is somewhat different. In
257							# UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258							# but so do the bytes that are for the C1 controls. Recall that these
259							# correspond to the unused portion of 8859-1 that 1252 mostly takes
260							# over. That means that there are fewer code points that are
261							# represented by multi-bytes. But, note that the these controls are
262							# very unlikely to be in pod text. So if we encounter one of them, it
263							# means that it is quite likely CP1252 and not UTF-8. The net result is
264							# the same code below is used for both platforms.
265							#
266							# XXX probably if the line has E that evaluates to illegal CP1252,
267							# then it is UTF-8. But we haven't processed E<> yet.
268
269	70	50		70		68261	goto set_1252 if do { no integer; "$]" < 5.006_000 }; # No UTF-8 on very early perls
	70					150
	70					344
	21					32
	21					184
270
271	21					42	my $copy;
272
273	70			70		4530	no warnings 'utf8';
	70					140
	70					4042
274
275	70	50		70		414	if ( do { no integer; "$]" >= 5.007_003 } ) {
	70					185
	70					254
	21					46
	21					100
276	21					42	$copy = $line;
277
278							# On perls that have this function, we can use it to easily see if the
279							# sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280							# needed below for script run detection
281	21	100				451	goto set_1252 if ! utf8::decode($copy);
282							}
283							elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284							# code page doing here anyway?
285							goto set_utf8;
286							}
287							else { # ASCII, no decode(): do it ourselves using the fundamental
288							# characteristics of UTF-8
289	70			70		6832	use if do { no integer; "$]" <= 5.006002 }, 'utf8';
	70			70		173
	70					339
	70					3964
	70					161
	70					142
	70					803689
290
291	0					0	my $char_ord;
292							my $needed; # How many continuation bytes to gobble up
293
294							# Initialize the translated line with a dummy character that will be
295							# deleted after everything else is done. This dummy makes sure that
296							# $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297							# with upgrading in the middle
298	0					0	$copy = chr(0x100);
299
300							# Parse through the line
301	0					0	for (my $i = 0; $i < length $line; $i++) {
302	0					0	my $byte = substr($line, $i, 1);
303
304							# ASCII bytes are trivially dealt with
305	0	0				0	if ($byte !~ $non_ascii_re) {
306	0					0	$copy .= $byte;
307	0					0	next;
308							}
309
310	0					0	my $b_ord = ord $byte;
311
312							# Now figure out what this code point would be if the input is
313							# actually in UTF-8. If, in the process, we discover that it isn't
314							# well-formed UTF-8, we guess CP1252.
315							#
316							# Start the process. If it is UTF-8, we are at the first, start
317							# byte, of a multi-byte sequence. We look at this byte to figure
318							# out how many continuation bytes are needed, and to initialize the
319							# code point accumulator with the data from this byte.
320							#
321							# Normally the minimum continuation byte is 0x80, but in certain
322							# instances the minimum is a higher number. So the code below
323							# overrides this for those instances.
324	0					0	my $min_cont = 0x80;
325
326	0	0				0	if ($b_ord < 0xC2) { # A start byte < C2 is malformed
		0
		0
		0
327	0					0	goto set_1252;
328							}
329							elsif ($b_ord <= 0xDF) {
330	0					0	$needed = 1;
331	0					0	$char_ord = $b_ord & 0x1F;
332							}
333							elsif ($b_ord <= 0xEF) {
334	0	0				0	$min_cont = 0xA0 if $b_ord == 0xE0;
335	0					0	$needed = 2;
336	0					0	$char_ord = $b_ord & (0x1F >> 1);
337							}
338							elsif ($b_ord <= 0xF4) {
339	0	0				0	$min_cont = 0x90 if $b_ord == 0xF0;
340	0					0	$needed = 3;
341	0					0	$char_ord = $b_ord & (0x1F >> 2);
342							}
343							else { # F4 is the highest start byte for legal Unicode; higher is
344							# unlikely to be in pod.
345	0					0	goto set_1252;
346							}
347
348							# ? not enough continuation bytes available
349	0	0				0	goto set_1252 if $i + $needed >= length $line;
350
351							# Accumulate the ordinal of the character from the remaining
352							# (continuation) bytes.
353	0					0	while ($needed-- > 0) {
354	0					0	my $cont = substr($line, ++$i, 1);
355	0					0	$b_ord = ord $cont;
356	0	0	0			0	goto set_1252 if $b_ord < $min_cont \|\| $b_ord > 0xBF;
357
358							# In all cases, any next continuation bytes all have the same
359							# minimum legal value
360	0					0	$min_cont = 0x80;
361
362							# Accumulate this byte's contribution to the code point
363	0					0	$char_ord <<= 6;
364	0					0	$char_ord \|= ($b_ord & 0x3F);
365							}
366
367							# Here, the sequence that formed this code point was valid UTF-8,
368							# so add the completed character to the output
369	0					0	$copy .= chr $char_ord;
370							} # End of loop through line
371
372							# Delete the dummy first character
373	0					0	$copy = substr($copy, 1);
374							}
375
376							# Here, $copy is legal UTF-8.
377
378							# If it can't be legal CP1252, no need to look further. (These bytes
379							# aren't valid in CP1252.) This test could have been placed higher in
380							# the code, but it seemed wrong to set the encoding to UTF-8 without
381							# making sure that the very first instance is well-formed. But what if
382							# it isn't legal CP1252 either? We have to choose one or the other, and
383							# It seems safer to favor the single-byte encoding over the multi-byte.
384	6	50				22	goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385
386							# The C1 controls are not likely to appear in pod
387	6	50				18	goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388
389							# Nor are surrogates nor unassigned, nor deprecated.
390	6	50				70	DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391	6	50	33			77	goto set_1252 if $cs_re && $copy =~ $cs_re;
392	6	50	33			75	DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393	6	50	33			43	goto set_1252 if $cn_re && $copy =~ $cn_re;
394	6	50				43	DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395	6	50				36	goto set_1252 if $copy =~ $deprecated_re;
396
397							# Nor are rare code points. But this is hard to determine. khw
398							# believes that IPA characters and the modifier letters are unlikely to
399							# be in pod (and certainly very unlikely to be the in the first line in
400							# the pod containing non-ASCII)
401	6	100				46	DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402	6	100	66			40	goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403
404							# The first Unicode version included essentially every Latin character
405							# in modern usage. So, a Latin character not in the first release will
406							# unlikely be in pod.
407	5	50	33			87	DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408	5	50	33			26	goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409
410							# On perls that handle script runs, if the UTF-8 interpretation yields
411							# a single script, we guess UTF-8, otherwise just having a mixture of
412							# scripts is suspicious, so guess CP1252. We first strip off, as best
413							# we can, the ASCII characters that look like they are pod directives,
414							# as these would always show as mixed with non-Latin text.
415	5					47	$copy =~ s/$pod_chars_re//g;
416
417	5	50				12	if ($script_run_re) {
418	5	100				87	goto set_utf8 if $copy =~ $script_run_re;
419	1					3	DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420	1					8	goto set_1252;
421							}
422
423							# Even without script runs, but on recent enough perls and Unicodes, we
424							# can check if there is a mixture of both Latin and non-Latin. Again,
425							# having a mixture of scripts is suspicious, so assume CP1252
426
427							# If it's all non-Latin, there is no CP1252, as that is Latin
428							# characters and punct, etc.
429	0	0				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430	0	0				0	goto set_utf8 if $copy !~ $latin_re;
431
432	0	0				0	DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433	0	0				0	goto set_utf8 if $copy =~ $every_char_is_latin_re;
434
435	0					0	DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436
437	17					37	set_1252:
438							DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439	17					47	$encoding = 'CP1252';
440	17					78	goto done_set;
441
442	4					5	set_utf8:
443							DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444	4					5	$encoding = 'UTF-8';
445
446	21					170	done_set:
447							$self->_handle_encoding_line( "=encoding $encoding" );
448	21					55	delete $self->{'_processed_encoding'};
449	21	50				102	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450
451	21					612	my ($word) = $line =~ /(\S$non_ascii_re\S)/;
452
453							$self->whine(
454	21					173	$self->{'line_count'},
455							"Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456							);
457							}
458
459	18233					26886	DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460
461	18233	100				37359	if(!$self->{'in_pod'}) {
462	1713	100				6122	if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s\|$)/s) {
463	954	100				2730	if($1 eq 'cut') {
464							$self->scream(
465	4					59	$self->{'line_count'},
466							"=cut found outside a pod block. Skipping to next block."
467							);
468
469							## Before there were errata sections in the world, it was
470							## least-pessimal to abort processing the file. But now we can
471							## just barrel on thru (but still not start a pod block).
472							#splice @_;
473							#push @_, undef;
474
475	4					13	next;
476							} else {
477							$self->{'in_pod'} = $self->{'start_of_pod_block'}
478	950					3631	= $self->{'last_was_blank'} = 1;
479							# And fall thru to the pod-mode block further down
480							}
481							} else {
482	759					1132	DEBUG > 5 and print STDERR "# It's a code-line.\n";
483	759	100				1735	$code_handler->(map $_, $line, $self->{'line_count'}, $self)
484							if $code_handler;
485							# Note: this may cause code to be processed out of order relative
486							# to pods, but in order relative to cuts.
487
488							# Note also that we haven't yet applied the transcoding to $line
489							# by time we call $code_handler!
490
491	759	50				1698	if( $line =~ m/^#\sline\s+(\d+)\s(?:\s"([^"]+)")?\s*$/ ) {
492							# That RE is from perlsyn, section "Plain Old Comments (Not!)",
493							#$fname = $2 if defined $2;
494							#DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495	0					0	DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496	0					0	$self->{'line_count'} = $1 - 1;
497							}
498
499	759					1725	next;
500							}
501							}
502
503							# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504							# Else we're in pod mode:
505
506							# Apply any necessary transcoding:
507	17470	100				37643	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508
509							# HERE WE CATCH =encoding EARLY!
510	17470	100				35535	if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511	39	100				193	next if $self->parse_characters; # Ignore this line
512	38					191	$line = $self->_handle_encoding_line( $line );
513							}
514
515	17469	100				73930	if($line =~ m/^=cut/s) {
		100
		100
516							# here ends the pod block, and therefore the previous pod para
517	168					307	DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518	168					389	$self->{'in_pod'} = 0;
519							# ++$self->{'pod_para_count'};
520	168					633	$self->_ponder_paragraph_buffer();
521							# by now it's safe to consider the previous paragraph as done.
522	168					265	DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523	168	100				559	$cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524							if $cut_handler;
525
526							# TODO: add to docs: Note: this may cause cuts to be processed out
527							# of order relative to pods, but in order relative to code.
528
529							} elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530	5776	100	66			25368	if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531	22	100				66	$wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532							if $wl_handler;
533							}
534
535	5776	100	66			30515	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			100
536	632					1005	DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537	632					1016	push @{$paras->[-1]}, $line;
	632					1826
538							} # otherwise it's not interesting
539
540	5776	100	100			21340	if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541	5681					7846	DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542							}
543
544	5776					12067	$self->{'last_was_blank'} = 1;
545
546							} elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547
548	6388	100				23035	if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9])(\s+\|$)(.)/s) {
		100
549							# THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550	2885					15831	my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551	2885	100	100			13628	$new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552							# Note that in "=head1 foo", the WS is lost.
553							# Example: ['=head1', {'start_line' => 123}, ' foo']
554
555	2885					5266	++$self->{'pod_para_count'};
556
557	2885					10250	$self->_ponder_paragraph_buffer();
558							# by now it's safe to consider the previous paragraph as done.
559
560	2885					6181	push @$paras, $new; # the new incipient paragraph
561	2885					4404	DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562
563							} elsif($line =~ m/^\s/s) {
564
565	717	100	33			4321	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			66
566	173					393	DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567	173					269	push @{$paras->[-1]}, $line;
	173					468
568							} else {
569	544					1057	++$self->{'pod_para_count'};
570	544					1883	$self->_ponder_paragraph_buffer();
571							# by now it's safe to consider the previous paragraph as done.
572	544					882	DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573	544					2989	push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574							}
575							} else {
576	2786					4983	++$self->{'pod_para_count'};
577	2786					8699	$self->_ponder_paragraph_buffer();
578							# by now it's safe to consider the previous paragraph as done.
579	2786					12830	push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580	2786					4632	DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581							}
582	6388					18146	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583
584							} else {
585							# It's a non-blank line /continuing/ the current para
586	5137	50				9664	if(@$paras) {
587	5137					6258	DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588	5137					6649	push @{$paras->[-1]}, $line;
	5137					13316
589							} else {
590							# Unexpected case!
591	0					0	die "Continuing a paragraph but \@\$paras is empty?";
592							}
593	5137					11322	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594							}
595
596							} # ends the big while loop
597
598	9584					13586	DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599	9584					63479	return $self;
600							}
601
602							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603
604							sub _maybe_handle_element_start {
605	8355			8355		12528	my $self = shift;
606							return $self->_handle_element_start(@_)
607	8355	100				32335	if !$self->{heading_filter};
608
609	20					37	my ($element_name, $attr) = @_;
610
611	20	100				53	if ($element_name =~ /\Ahead(\d)\z/) {
612							$self->{_in_head} = {
613	10					78	element => $element_name,
614							level => $1 + 0,
615							events => [],
616							text => '',
617							};
618							}
619
620	20	100				47	if (my $head = $self->{_in_head}) {
621	10					12	push @{ $head->{events} }, [ '_handle_element_start', @_ ];
	10					30
622	10					19	return;
623							}
624
625							return
626	10	100				26	if !$self->{_filter_allowed};
627
628	2					9	$self->_handle_element_start(@_);
629							}
630
631							sub _maybe_handle_element_end {
632	8355			8355		11854	my $self = shift;
633							return $self->_handle_element_end(@_)
634	8355	100				27554	if !$self->{heading_filter};
635
636	20					39	my ($element_name, $attr) = @_;
637
638	20	100				47	if (my $head = $self->{_in_head}) {
639	10	50				24	if ($element_name ne $head->{element}) {
640	0					0	push @{ $head->{events} }, [ '_handle_element_end', @_ ];
	0					0
641	0					0	return;
642							}
643
644	10					17	delete $self->{_in_head};
645
646	10		100			27	my $headings = $self->{_current_headings} \|\|= [];
647	10					24	@$headings = (@{$headings}[0 .. $head->{level} - 2], $head->{text});
	10					31
648
649	10					32	my $allowed = $self->{_filter_allowed} = $self->_filter_allows(@$headings);
650
651	10	100				29	if ($allowed) {
652	2					4	for my $event (@{ $head->{events} }) {
	2					5
653	4					10	my ($method, @args) = @$event;
654	4					23	$self->$method(@args);
655							}
656							}
657							}
658
659							return
660	20	100				59	if !$self->{_filter_allowed};
661
662	4					18	$self->_handle_element_end(@_);
663							}
664
665							sub _maybe_handle_text {
666	10006			10006		14432	my $self = shift;
667							return $self->_handle_text(@_)
668	10006	100				31355	if !$self->{heading_filter};
669
670	20					69	my ($text) = @_;
671
672	20	100				46	if (my $head = $self->{_in_head}) {
673	10					15	push @{ $head->{events} }, [ '_handle_text', @_ ];
	10					24
674	10					51	$head->{text} .= $text;
675	10					31	return;
676							}
677
678							return
679	10	100				44	if !$self->{_filter_allowed};
680
681	2					6	$self->_handle_text(@_);
682							}
683
684							sub _filter_allows {
685	10			10		16	my $self = shift;
686	10					29	my @headings = @_;
687
688							my $filter = $self->{heading_filter}
689	10	50				24	or return 1;
690
691	10					19	SPEC: for my $spec ( @$filter ) {
692	10					27	for my $i (0 .. $#$spec) {
693	14					26	my $regex = $spec->[$i];
694	14					23	my $heading = $headings[$i];
695	14	100				33	$heading = ''
696							if !defined $heading;
697							next SPEC
698	14	100				105	if $heading !~ $regex;
699							}
700	2					7	return 1;
701							}
702
703	8					20	return 0;
704							}
705
706							sub set_heading_select {
707	2			2	0	22	my $self = shift;
708	2					9	my (@selections) = @_;
709
710	2		50			36	my $filter = $self->{heading_filter} \|\|= [];
711	2	50	33			13	if (@selections && $selections[0] eq '+') {
712	0					0	shift @selections;
713							}
714							else {
715	2					6	@$filter = ();
716							}
717
718	2					5	for my $spec (@selections) {
719							eval {
720	2					15	push @$filter, $self->_compile_heading_spec($spec);
721	2					9	1;
722	2	50				3	} or do {
723	0					0	warn $@;
724	0					0	warn qq{Ignoring section spec "$spec"!\n};
725							};
726							}
727							}
728
729							sub _compile_heading_spec {
730	2			2		3	my $self = shift;
731	2					5	my ($spec) = @_;
732
733	2					4	my @bad;
734	2					61	my @parts = $spec =~ m{(?:\A\|\G/)((?:[^/\\]\|\\.)*)}g;
735	2					7	for my $part (@parts) {
736	4					9	$part =~ s{\\(.)}{$1}g;
737	4					10	my $negate = $part =~ s{\A!}{};
738	4	50				27	$part = '.*'
739							if !length $part;
740
741							eval {
742	4	50				81	$part = $negate ? qr{^(?!$part$)} : qr{^$part$};
743	4					14	1;
744	4	50				7	} or do {
745	0					0	push @bad, qq{Bad regular expression /$part/ in "$spec": $@\n};
746							};
747							}
748
749	2	50				7	Carp::croak(join '', @bad)
750							if @bad;
751
752	2					13	return \@parts;
753							}
754
755
756							sub _handle_encoding_line {
757	59			59		342	my($self, $line) = @_;
758
759	59	50				221	return if $self->parse_characters;
760
761							# The point of this routine is to set $self->{'_transcoder'} as indicated.
762
763	59	50				370	return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
764	59					86	DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
765
766	59					230	my $e = $1;
767	59					111	my $orig = $e;
768	59					93	push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
	59					262
769
770	59					102	my $enc_error;
771
772							# Cf. perldoc Encode and perldoc Encode::Supported
773
774	59					10410	require Pod::Simple::Transcode;
775
776	59	100	33			224	if( $self->{'encoding'} ) {
		50
		100
777	5					18	my $norm_current = $self->{'encoding'};
778	5					9	my $norm_e = $e;
779	5					12	foreach my $that ($norm_current, $norm_e) {
780	10					23	$that = lc($that);
781	10					50	$that =~ s/[-_]//g;
782							}
783	5	100				46	if($norm_current eq $norm_e) {
784	3					6	DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
785							"redundant. ($norm_current eq $norm_e). Ignoring.\n";
786	3					8	$enc_error = '';
787							# But that doesn't necessarily mean that the earlier one went okay
788							} else {
789	2					6	$enc_error = "Encoding is already set to " . $self->{'encoding'};
790	2					3	DEBUG > 1 and print STDERR $enc_error;
791							}
792							} elsif (
793							# OK, let's turn on the encoding
794							do {
795	54					81	DEBUG > 1 and print STDERR " Setting encoding to $e\n";
796	54					200	$self->{'encoding'} = $e;
797	54					685	1;
798							}
799							and $e eq 'HACKRAW'
800							) {
801	0					0	DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
802
803							} elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
804
805							die($enc_error = "WHAT? _transcoder is already set?!")
806	52	50				48742	if $self->{'_transcoder'}; # should never happen
807	52					255	require Pod::Simple::Transcode;
808	52					252	$self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
809	52					122	eval {
810	52					372	my @x = ('', "abc", "123");
811	52					222	$self->{'_transcoder'}->(@x);
812							};
813	52	50				145	$@ && die( $enc_error =
814							"Really unexpected error setting up encoding $e: $@\nAborting"
815							);
816	52					159	$self->{'detected_encoding'} = $e;
817
818							} else {
819	2					1451	my @supported = Pod::Simple::Transcode::->all_encodings;
820
821							# Note unsupported, and complain
822	2					1535	DEBUG and print STDERR " Encoding [$e] is unsupported.",
823							"\nSupporteds: @supported\n";
824	2					4	my $suggestion = '';
825
826							# Look for a near match:
827	2					5	my $norm = lc($e);
828	2					4	$norm =~ tr[-_][]d;
829	2					4	my $n;
830	2					6	foreach my $enc (@supported) {
831	248					243	$n = lc($enc);
832	248					223	$n =~ tr[-_][]d;
833	248	50				296	next unless $n eq $norm;
834	0					0	$suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
835	0					0	last;
836							}
837	2					14	my $encmodver = Pod::Simple::Transcode::->encmodver;
838	2					30	$enc_error = join '' =>
839							"This document probably does not appear as it should, because its ",
840							"\"=encoding $e\" line calls for an unsupported encoding.",
841							$suggestion, " [$encmodver\'s supported encodings are: @supported]"
842							;
843
844	2					22	$self->scream( $self->{'line_count'}, $enc_error );
845							}
846	59					135	push @{ $self->{'encoding_command_statuses'} }, $enc_error;
	59					177
847	59	100				210	if (defined($self->{'_processed_encoding'})) {
848							# Double declaration.
849	1					11	$self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
850							}
851	59					148	$self->{'_processed_encoding'} = $orig;
852
853	59					200	return $line;
854							}
855
856							# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
857
858							sub _handle_encoding_second_level {
859							# By time this is called, the encoding (if well formed) will already
860							# have been acted on.
861	38			38		103	my($self, $para) = @_;
862	38					120	my @x = @$para;
863	38					137	my $content = join ' ', splice @x, 2;
864	38					140	$content =~ s/^\s+//s;
865	38					108	$content =~ s/\s+$//s;
866
867	38					95	DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
868
869	38	100				108	if (defined($self->{'_processed_encoding'})) {
870							#if($content ne $self->{'_processed_encoding'}) {
871							# Could it happen?
872							#}
873	37					125	delete $self->{'_processed_encoding'};
874							# It's already been handled. Check for errors.
875	37	50				186	if(! $self->{'encoding_command_statuses'} ) {
		100
876	0					0	DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
877							} elsif( $self->{'encoding_command_statuses'}[-1] ) {
878							$self->whine( $para->[1]{'start_line'},
879							sprintf "Couldn't do %s: %s",
880							$self->{'encoding_command_reqs' }[-1],
881	4					34	$self->{'encoding_command_statuses'}[-1],
882							);
883							} else {
884	33					61	DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
885							}
886
887							} else {
888							# Otherwise it's a syntax error
889	1					13	$self->whine( $para->[1]{'start_line'},
890							"Invalid =encoding syntax: $content"
891							);
892							}
893
894	38					91	return;
895							}
896
897							#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
898
899							{
900							my $m = -321; # magic line number
901
902							sub _gen_errata {
903	903			903		1483	my $self = $_[0];
904							# Return 0 or more fake-o paragraphs explaining the accumulated
905							# errors on this document.
906
907	903	100	66			3600	return() unless $self->{'errata'} and keys %{$self->{'errata'}};
	40					185
908
909	40					73	my @out;
910
911	40					224	foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
	26					95
	40					183
912							push @out,
913							['=item', {'start_line' => $m}, "Around line $line:"],
914							map( ['~Para', {'start_line' => $m, '~cooked' => 1},
915							#['~Top', {'start_line' => $m},
916							$_
917							#]
918							],
919	59					263	@{$self->{'errata'}{$line}}
	59					395
920							)
921							;
922							}
923
924							# TODO: report of unknown entities? unrenderable characters?
925
926	40					495	unshift @out,
927							['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
928							['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
929							"Hey! ",
930							['B', {},
931							'The above document had some coding errors, which are explained below:'
932							]
933							],
934							['=over', {'start_line' => $m, 'errata' => 1}, ''],
935							;
936
937	40					219	push @out,
938							['=back', {'start_line' => $m, 'errata' => 1}, ''],
939							;
940
941	40					69	DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
942
943	40					203	return @out;
944							}
945
946							}
947
948							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
949
950							##############################################################################
951							##
952							## stop reading now stop reading now stop reading now stop reading now stop
953							##
954							## HERE IT BECOMES REALLY SCARY
955							##
956							## stop reading now stop reading now stop reading now stop reading now stop
957							##
958							##############################################################################
959
960							sub _ponder_paragraph_buffer {
961
962							# Para-token types as found in the buffer.
963							# ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
964							# =over, =back, =item
965							# and the null =pod (to be complained about if over one line)
966							#
967							# "~data" paragraphs are something we generate at this level, depending on
968							# a currently open =over region
969
970							# Events fired: Begin and end for:
971							# directivename (like head1 .. head4), item, extend,
972							# for (from =begin...=end, =for),
973							# over-bullet, over-number, over-text, over-block,
974							# item-bullet, item-number, item-text,
975							# Document,
976							# Data, Para, Verbatim
977							# B, C, longdirname (TODO -- wha?), etc. for all directives
978							#
979
980	7286			7286		11289	my $self = $_[0];
981	7286					10092	my $paras;
982	7286	100				9714	return unless @{$paras = $self->{'paras'}};
	7286					31939
983	6339		100			17339	my $curr_open = ($self->{'curr_open'} \|\|= []);
984
985	6339					8753	my $scratch;
986
987	6339					8007	DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
988
989							# We have something in our buffer. So apparently the document has started.
990	6339	100				13944	unless($self->{'doc_has_started'}) {
991	917					1958	$self->{'doc_has_started'} = 1;
992
993	917					1527	my $starting_contentless;
994	917		66			6181	$starting_contentless =
995							(
996							!@$curr_open
997							and @$paras and ! grep $_->[0] ne '~end', @$paras
998							# i.e., if the paras is all ~ends
999							)
1000							;
1001	917					1552	DEBUG and print STDERR "# Starting ",
1002							$starting_contentless ? 'contentless' : 'contentful',
1003							" document\n"
1004							;
1005
1006							$self->_handle_element_start(
1007							($scratch = 'Document'),
1008							{
1009	917	100				6242	'start_line' => $paras->[0][1]{'start_line'},
1010							$starting_contentless ? ( 'contentless' => 1 ) : (),
1011							},
1012							);
1013							}
1014
1015	6339					12243	my($para, $para_type);
1016	6339					13885	while(@$paras) {
1017
1018							# If a directive, assume it's legal; subtract below if found not to be
1019	9050	100				28040	$seen_legal_directive++ if $paras->[0][0] =~ /^=/;
1020
1021							last if @$paras == 1
1022							and ( $paras->[0][0] eq '=over'
1023							or $paras->[0][0] eq '=item'
1024	9050	100	100			45104	or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
			100
1025							# Those're the three kinds of paragraphs that require lookahead.
1026							# Actually, an "=item Foo" inside an region
1027							# and any =item inside an region (rare)
1028							# don't require any lookahead, but all others (bullets
1029							# and numbers) do.
1030							# The verbatim is different from the other two, because those might be
1031							# like:
1032							#
1033							# =item
1034							# ...
1035							# =cut
1036							# ...
1037							# =item
1038							#
1039							# The =cut here finishes the paragraph but doesn't terminate the =over
1040							# they should be in. (khw apologizes that he didn't comment at the time
1041							# why the 'in_pod' works, and no longer remembers why, and doesn't think
1042							# it is currently worth the effort to re-figure it out.)
1043
1044							# TODO: whinge about many kinds of directives in non-resolving =for regions?
1045							# TODO: many? like what? =head1 etc?
1046
1047	7526					17910	$para = shift @$paras;
1048	7526					13992	$para_type = $para->[0];
1049
1050	7526					10937	DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
1051							$self->_dump_curr_open(), ")\n";
1052
1053	7526	100				26067	if($para_type eq '=for') {
		100
		100
		100
1054	50	50				150	next if $self->_ponder_for($para,$curr_open,$paras);
1055
1056							} elsif($para_type eq '=begin') {
1057	127	50				380	next if $self->_ponder_begin($para,$curr_open,$paras);
1058
1059							} elsif($para_type eq '=end') {
1060	130	50				410	next if $self->_ponder_end($para,$curr_open,$paras);
1061
1062							} elsif($para_type eq '~end') { # The virtual end-document signal
1063	953	50				3469	next if $self->_ponder_doc_end($para,$curr_open,$paras);
1064							}
1065
1066
1067							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1068							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1069	6266	100				18759	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1070	42					51	DEBUG > 1 and
1071							print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
1072	42					80	next;
1073							}
1074							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1075							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
1076
1077	6224	100				16480	if($para_type eq '=pod') {
		100
		100
1078	580					2083	$self->_ponder_pod($para,$curr_open,$paras);
1079
1080							} elsif($para_type eq '=over') {
1081	218	100				960	next if $self->_ponder_over($para,$curr_open,$paras);
1082
1083							} elsif($para_type eq '=back') {
1084	214	100				1111	next if $self->_ponder_back($para,$curr_open,$paras);
1085
1086							} else {
1087
1088							# All non-magical codes!!!
1089
1090							# Here we start using $para_type for our own twisted purposes, to
1091							# mean how it should get treated, not as what the element name
1092							# should be.
1093
1094	5212					7342	DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
1095
1096	5212					7291	my $i;
1097
1098							# Enforce some =headN discipline
1099	5212	100	66			16035	if($para_type =~ m/^=head\d$/s
			100
			66
1100							and ! $self->{'accept_heads_anywhere'}
1101							and @$curr_open
1102							and $curr_open->[-1][0] eq '=over'
1103							) {
1104	6					7	DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
1105							$self->whine(
1106	6					32	$para->[1]{'start_line'},
1107							"You forgot a '=back' before '$para_type'"
1108							);
1109	6					17	unshift @$paras, ['=back', {}, ''], $para; # close the =over
1110	6					15	next;
1111							}
1112
1113
1114	5206	100	66			21816	if($para_type eq '=item') {
		100
		100
		100
		100
		100
		100
1115
1116	1026					1491	my $over;
1117	1026	50	33			2947	unless(@$curr_open and
1118	1134					4813	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1119							$self->whine(
1120	0					0	$para->[1]{'start_line'},
1121							"'=item' outside of any '=over'"
1122							);
1123							unshift @$paras,
1124	0					0	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1125							$para
1126							;
1127	0					0	next;
1128							}
1129
1130
1131	1026					2125	my $over_type = $over->[1]{'~type'};
1132
1133	1026	50				3553	if(!$over_type) {
		50
		100
		100
		50
1134							# Shouldn't happen1
1135							die "Typeless over in stack, starting at line "
1136	0					0	. $over->[1]{'start_line'};
1137
1138							} elsif($over_type eq 'block') {
1139	0	0				0	unless($curr_open->[-1][1]{'~bitched_about'}) {
1140	0					0	$curr_open->[-1][1]{'~bitched_about'} = 1;
1141							$self->whine(
1142							$curr_open->[-1][1]{'start_line'},
1143							"You can't have =items (as at line "
1144	0					0	. $para->[1]{'start_line'}
1145							. ") unless the first thing after the =over is an =item"
1146							);
1147							}
1148							# Just turn it into a paragraph and reconsider it
1149	0					0	$para->[0] = '~Para';
1150	0					0	unshift @$paras, $para;
1151	0					0	next;
1152
1153							} elsif($over_type eq 'text') {
1154	839					3076	my $item_type = $self->_get_item_type($para);
1155							# That kills the content of the item if it's a number or bullet.
1156	839					1431	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1157
1158	839	100	66			2000	if($item_type eq 'text') {
		50
1159							# Nothing special needs doing for 'text'
1160							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1161							$self->whine(
1162	4					30	$para->[1]{'start_line'},
1163							"Expected text after =item, not a $item_type"
1164							);
1165							# Undo our clobbering:
1166	4					9	push @$para, $para->[1]{'~orig_content'};
1167	4					9	delete $para->[1]{'number'};
1168							# Only a PROPER item-number element is allowed
1169							# to have a number attribute.
1170							} else {
1171	0					0	die "Unhandled item type $item_type"; # should never happen
1172							}
1173
1174							# =item-text thingies don't need any assimilation, it seems.
1175
1176							} elsif($over_type eq 'number') {
1177	28					91	my $item_type = $self->_get_item_type($para);
1178							# That kills the content of the item if it's a number or bullet.
1179	28					45	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1180
1181	28					66	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1182
1183	28	50				200	if($item_type eq 'bullet') {
		50
		50
		50
1184							# Hm, it's not numeric. Correct for this.
1185	0					0	$para->[1]{'number'} = $expected_value;
1186							$self->whine(
1187	0					0	$para->[1]{'start_line'},
1188							"Expected '=item $expected_value'"
1189							);
1190	0					0	push @$para, $para->[1]{'~orig_content'};
1191							# restore the bullet, blocking the assimilation of next para
1192
1193							} elsif($item_type eq 'text') {
1194							# Hm, it's not numeric. Correct for this.
1195	0					0	$para->[1]{'number'} = $expected_value;
1196							$self->whine(
1197	0					0	$para->[1]{'start_line'},
1198							"Expected '=item $expected_value'"
1199							);
1200							# Text content will still be there and will block next ~Para
1201
1202							} elsif($item_type ne 'number') {
1203	0					0	die "Unknown item type $item_type"; # should never happen
1204
1205							} elsif($expected_value == $para->[1]{'number'}) {
1206	28					40	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1207
1208							} else {
1209	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1210							" instead of the expected value of $expected_value\n";
1211							$self->whine(
1212							$para->[1]{'start_line'},
1213	0					0	"You have '=item " . $para->[1]{'number'} .
1214							"' instead of the expected '=item $expected_value'"
1215							);
1216	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1217							}
1218
1219	28	50				63	if(@$para == 2) {
1220							# For the cases where we /didn't/ push to @$para
1221	28	100				64	if($paras->[0][0] eq '~Para') {
1222	25					45	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1223	25					40	push @$para, splice @{shift @$paras},2;
	25					68
1224							} else {
1225	3					5	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1226	3					6	push @$para, ''; # Just so it's not contentless
1227							}
1228							}
1229
1230
1231							} elsif($over_type eq 'bullet') {
1232	159					619	my $item_type = $self->_get_item_type($para);
1233							# That kills the content of the item if it's a number or bullet.
1234	159					257	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1235
1236	159	50				337	if($item_type eq 'bullet') {
		0
		0
1237							# as expected!
1238
1239	159	100				427	if( $para->[1]{'~_freaky_para_hack'} ) {
1240	101					143	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1241	101					246	push @$para, $para->[1]{'~_freaky_para_hack'};
1242							}
1243
1244							} elsif($item_type eq 'number') {
1245							$self->whine(
1246	0					0	$para->[1]{'start_line'},
1247							"Expected '=item *'"
1248							);
1249	0					0	push @$para, $para->[1]{'~orig_content'};
1250							# and block assimilation of the next paragraph
1251	0					0	delete $para->[1]{'number'};
1252							# Only a PROPER item-number element is allowed
1253							# to have a number attribute.
1254							} elsif($item_type eq 'text') {
1255							$self->whine(
1256	0					0	$para->[1]{'start_line'},
1257							"Expected '=item *'"
1258							);
1259							# But doesn't need processing. But it'll block assimilation
1260							# of the next para.
1261							} else {
1262	0					0	die "Unhandled item type $item_type"; # should never happen
1263							}
1264
1265	159	100				417	if(@$para == 2) {
1266							# For the cases where we /didn't/ push to @$para
1267	58	50				160	if($paras->[0][0] eq '~Para') {
1268	58					92	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1269	58					151	push @$para, splice @{shift @$paras},2;
	58					306
1270							} else {
1271	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1272	0					0	push @$para, ''; # Just so it's not contentless
1273							}
1274							}
1275
1276							} else {
1277	0					0	die "Unhandled =over type \"$over_type\"?";
1278							# Shouldn't happen!
1279							}
1280
1281	1026					2013	$para_type = 'Plain';
1282	1026					2781	$para->[0] .= '-' . $over_type;
1283							# Whew. Now fall thru and process it.
1284
1285
1286							} elsif($para_type eq '=extend') {
1287							# Well, might as well implement it here.
1288	21					139	$self->_ponder_extend($para);
1289	21					68	next; # and skip
1290							} elsif($para_type eq '=encoding') {
1291							# Not actually acted on here, but we catch errors here.
1292	38					221	$self->_handle_encoding_second_level($para);
1293	38	100				155	next unless $self->keep_encoding_directive;
1294	35					74	$para_type = 'Plain';
1295							} elsif($para_type eq '~Verbatim') {
1296	538					1125	$para->[0] = 'Verbatim';
1297	538					901	$para_type = '?Verbatim';
1298							} elsif($para_type eq '~Para') {
1299	2789					5410	$para->[0] = 'Para';
1300	2789					4545	$para_type = '?Plain';
1301							} elsif($para_type eq 'Data') {
1302	30					56	$para->[0] = 'Data';
1303	30					46	$para_type = '?Data';
1304							} elsif( $para_type =~ s/^=//s
1305							and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1306							) {
1307	752					1185	DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1308							} else {
1309							# An unknown directive!
1310	12					27	$seen_legal_directive--;
1311							DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1312	12					23	$para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1313							;
1314							$self->whine(
1315	12					143	$para->[1]{'start_line'},
1316							"Unknown directive: $para->[0]"
1317							);
1318
1319							# And maybe treat it as text instead of just letting it go?
1320	12					50	next;
1321							}
1322
1323	5170	100				18780	if($para_type =~ s/^\?//s) {
1324	3357	100				6790	if(! @$curr_open) { # usual case
1325	2082					3003	DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1326							} else {
1327	1275					3772	my @fors = grep $_->[0] eq '=for', @$curr_open;
1328							DEBUG > 1 and print STDERR "Containing fors: ",
1329	1275					1788	join(',', map $_->[1]{'target'}, @fors), "\n";
1330
1331	1275	100				2613	if(! @fors) {
		100
1332	1151					1772	DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1333
1334							#} elsif(grep $_->[1]{'~resolve'}, @fors) {
1335							#} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1336							} elsif( $fors[-1][1]{'~resolve'} ) {
1337							# Look to the immediately containing for
1338
1339	73	100				158	if($para_type eq 'Data') {
1340	18					23	DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1341	18					31	$para->[0] = 'Para';
1342	18					37	$para_type = 'Plain';
1343							} else {
1344	55					121	DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1345							}
1346							} else {
1347	51					83	DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1348	51					160	$para->[0] = $para_type = 'Data';
1349							}
1350							}
1351							}
1352
1353							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1354	5170	100				10790	if($para_type eq 'Plain') {
		100
		50
1355	4582					12618	$self->_ponder_Plain($para);
1356							} elsif($para_type eq 'Verbatim') {
1357	534					2092	$self->_ponder_Verbatim($para);
1358							} elsif($para_type eq 'Data') {
1359	54					254	$self->_ponder_Data($para);
1360							} else {
1361	0					0	die "\$para type is $para_type -- how did that happen?";
1362							# Shouldn't happen.
1363							}
1364
1365							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1366	5170					18537	$para->[0] =~ s/^[~=]//s;
1367
1368	5170					7507	DEBUG and print STDERR "\n", pretty($para), "\n";
1369
1370							# traverse the treelet (which might well be just one string scalar)
1371							$self->{'content_seen'} \|\|= 1 if $seen_legal_directive
1372	5170	100	100			27045	&& ! $self->{'~tried_gen_errata'};
			100
1373	5170					13925	$self->_traverse_treelet_bit(@$para);
1374							}
1375							}
1376
1377	6339					22839	return;
1378							}
1379
1380							###########################################################################
1381							# The sub-ponderers...
1382
1383
1384
1385							sub _ponder_for {
1386	50			50		109	my ($self,$para,$curr_open,$paras) = @_;
1387
1388							# Fake it out as a begin/end
1389	50					79	my $target;
1390
1391	50	50				115	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1392	0					0	DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1393	0					0	return 1;
1394							}
1395
1396	50					151	for(my $i = 2; $i < @$para; ++$i) {
1397	50	50				224	if($para->[$i] =~ s/^\s(\S+)\s//s) {
1398	50					107	$target = $1;
1399	50					95	last;
1400							}
1401							}
1402	50	50				109	unless(defined $target) {
1403							$self->whine(
1404	0					0	$para->[1]{'start_line'},
1405							"=for without a target?"
1406							);
1407	0					0	return 1;
1408							}
1409	50					61	DEBUG > 1 and
1410							print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1411
1412	50					91	$para->[0] = 'Data';
1413
1414							unshift @$paras,
1415							['=begin',
1416							{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1417							$target,
1418							],
1419							$para,
1420							['=end',
1421	50					340	{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1422							$target,
1423							],
1424							;
1425
1426	50					198	return 1;
1427							}
1428
1429							sub _ponder_begin {
1430	127			127		420	my ($self,$para,$curr_open,$paras) = @_;
1431	127					462	my $content = join ' ', splice @$para, 2;
1432	127					470	$content =~ s/^\s+//s;
1433	127					357	$content =~ s/\s+$//s;
1434	127	50				286	unless(length($content)) {
1435							$self->whine(
1436	0					0	$para->[1]{'start_line'},
1437							"=begin without a target?"
1438							);
1439	0					0	DEBUG and print STDERR "Ignoring targetless =begin\n";
1440	0					0	return 1;
1441							}
1442
1443	127					652	my ($target, $title) = $content =~ m/^(\S+)\s(.)$/;
1444	127	100				292	$para->[1]{'title'} = $title if ($title);
1445	127					415	$para->[1]{'target'} = $target; # without any ':'
1446	127					200	$content = $target; # strip off the title
1447
1448	127					254	$content =~ s/^:!/!:/s;
1449	127					170	my $neg; # whether this is a negation-match
1450	127	100				369	$neg = 1 if $content =~ s/^!//s;
1451	127					173	my $to_resolve; # whether to process formatting codes
1452	127	100				419	$to_resolve = 1 if $content =~ s/^://s;
1453
1454	127					195	my $dont_ignore; # whether this target matches us
1455
1456	127	100				588	foreach my $target_name (
1457							split(',', $content, -1),
1458							$neg ? () : '*'
1459							) {
1460	249					358	DEBUG > 2 and
1461							print STDERR " Considering whether =begin $content matches $target_name\n";
1462	249	100				624	next unless $self->{'accept_targets'}{$target_name};
1463
1464	66					106	DEBUG > 2 and
1465							print STDERR " It DOES match the acceptable target $target_name!\n";
1466							$to_resolve = 1
1467	66	100				221	if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1468	66					129	$dont_ignore = 1;
1469	66					192	$para->[1]{'target_matching'} = $target_name;
1470	66					123	last; # stop looking at other target names
1471							}
1472
1473	127	100				291	if($neg) {
1474	33	100				60	if( $dont_ignore ) {
1475	6					7	$dont_ignore = '';
1476	6					10	delete $para->[1]{'target_matching'};
1477	6					5	DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1478							} else {
1479	27					42	$dont_ignore = 1;
1480	27					54	$para->[1]{'target_matching'} = '!';
1481	27					38	DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1482							}
1483							}
1484
1485	127					267	$para->[0] = '=for'; # Just what we happen to call these, internally
1486	127		100			519	$para->[1]{'~really'} \|\|= '=begin';
1487	127		100			477	$para->[1]{'~ignore'} = (! $dont_ignore) \|\| 0;
1488	127		100			470	$para->[1]{'~resolve'} = $to_resolve \|\| 0;
1489
1490	127					165	DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1491							"ignore contents of this region\n";
1492	127					165	DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1493							($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1494	127					164	DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1495
1496	127					268	push @$curr_open, $para;
1497	127	100	100			538	if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1498	40					54	DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1499							} else {
1500	87	50	100			349	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1501	87					279	$self->_maybe_handle_element_start((my $scratch='for'), $para->[1]);
1502							}
1503
1504	127					639	return 1;
1505							}
1506
1507							sub _ponder_end {
1508	130			130		304	my ($self,$para,$curr_open,$paras) = @_;
1509	130					453	my $content = join ' ', splice @$para, 2;
1510	130					425	$content =~ s/^\s+//s;
1511	130					369	$content =~ s/\s+$//s;
1512	130					222	DEBUG and print STDERR "Ogling '=end $content' directive\n";
1513
1514	130	50				308	unless(length($content)) {
1515							$self->whine(
1516							$para->[1]{'start_line'},
1517							"'=end' without a target?" . (
1518							( @$curr_open and $curr_open->[-1][0] eq '=for' )
1519	0	0	0			0	? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1520							: ''
1521							)
1522							);
1523	0					0	DEBUG and print STDERR "Ignoring targetless =end\n";
1524	0					0	return 1;
1525							}
1526
1527	130	50				603	unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1528							$self->whine(
1529	0					0	$para->[1]{'start_line'},
1530							"'=end $content' is invalid. (Stack: "
1531							. $self->_dump_curr_open() . ')'
1532							);
1533	0					0	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1534	0					0	return 1;
1535							}
1536
1537	130	50	33			701	unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1538							$self->whine(
1539	0					0	$para->[1]{'start_line'},
1540							"=end $content without matching =begin. (Stack: "
1541							. $self->_dump_curr_open() . ')'
1542							);
1543	0					0	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1544	0					0	return 1;
1545							}
1546
1547	130	100				376	unless($content eq $curr_open->[-1][1]{'target'}) {
1548							$self->whine(
1549							$para->[1]{'start_line'},
1550							"=end $content doesn't match =begin "
1551	3					16	. $curr_open->[-1][1]{'target'}
1552							. ". (Stack: "
1553							. $self->_dump_curr_open() . ')'
1554							);
1555	3					6	DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1556	3					13	return 1;
1557							}
1558
1559							# Else it's okay to close...
1560	127	100				547	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1561	40					224	DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1562							# And that may be because of this to-be-closed =for region, or some
1563							# other one, but it doesn't matter.
1564							} else {
1565	87					205	$curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1566							# what's that for?
1567
1568	87	50	50			319	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1569	87					276	$self->_maybe_handle_element_end( my $scratch = 'for', $para->[1]);
1570							}
1571	127					214	DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1572	127					240	pop @$curr_open;
1573
1574	127					761	return 1;
1575							}
1576
1577							sub _ponder_doc_end {
1578	953			953		2377	my ($self,$para,$curr_open,$paras) = @_;
1579	953	100				2001	if(@$curr_open) { # Deal with things left open
1580	10					14	DEBUG and print STDERR "Stack is nonempty at end-document: (",
1581							$self->_dump_curr_open(), ")\n";
1582
1583	10					13	DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1584	10					58	unshift @$paras, $self->_closers_for_all_curr_open;
1585							# Make sure there is exactly one ~end in the parastack, at the end:
1586	10					39	@$paras = grep $_->[0] ne '~end', @$paras;
1587	10					13	push @$paras, $para, $para;
1588							# We need two -- once for the next cycle where we
1589							# generate errata, and then another to be at the end
1590							# when that loop back around to process the errata.
1591	10					42	return 1;
1592
1593							} else {
1594	943					1520	DEBUG and print STDERR "Okay, stack is empty now.\n";
1595							}
1596
1597							# Try generating errata section, if applicable
1598	943	100				2546	unless($self->{'~tried_gen_errata'}) {
1599	903					2087	$self->{'~tried_gen_errata'} = 1;
1600	903					2784	my @extras = $self->_gen_errata();
1601	903	100				2363	if(@extras) {
1602	40					123	unshift @$paras, @extras;
1603	40					241	DEBUG and print STDERR "Generated errata... relooping...\n";
1604	40					218	return 1; # I.e., loop around again to process these fake-o paragraphs
1605							}
1606							}
1607
1608	903					2121	splice @$paras; # Well, that's that for this paragraph buffer.
1609	903					1363	DEBUG and print STDERR "Throwing end-document event.\n";
1610
1611	903					3196	$self->_handle_element_end( my $scratch = 'Document' );
1612	903					3860	return 1; # Hasta la byebye
1613							}
1614
1615							sub _ponder_pod {
1616	580			580		1387	my ($self,$para,$curr_open,$paras) = @_;
1617							$self->whine(
1618	580	50				1483	$para->[1]{'start_line'},
1619							"=pod directives shouldn't be over one line long! Ignoring all "
1620							. (@$para - 2) . " lines of content"
1621							) if @$para > 3;
1622
1623							# Content ignored unless 'pod_handler' is set
1624	580	100				1569	if (my $pod_handler = $self->{'pod_handler'}) {
1625	6					21	my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1626	6	100				13	$line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1627	6					13	$pod_handler->($line, $line_num, $self);
1628							}
1629
1630							# The surrounding methods set content_seen, so let us remain consistent.
1631							# I do not know why it was not here before -- should it not be here?
1632							# $self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1633
1634	580					1701	return;
1635							}
1636
1637							sub _ponder_over {
1638	218			218		577	my ($self,$para,$curr_open,$paras) = @_;
1639	218	50				544	return 1 unless @$paras;
1640	218					470	my $list_type;
1641
1642	218	100				699	if($paras->[0][0] eq '=item') { # most common case
		100
		50
1643	189					1094	$list_type = $self->_get_initial_item_type($paras->[0]);
1644
1645							} elsif($paras->[0][0] eq '=back') {
1646							# Ignore empty lists by default
1647	6	100				27	if ($self->{'parse_empty_lists'}) {
1648	2					4	$list_type = 'empty';
1649							} else {
1650	4					9	shift @$paras;
1651	4					23	return 1;
1652							}
1653							} elsif($paras->[0][0] eq '~end') {
1654							$self->whine(
1655	0					0	$para->[1]{'start_line'},
1656							"=over is the last thing in the document?!"
1657							);
1658	0					0	return 1; # But feh, ignore it.
1659							} else {
1660	23					40	$list_type = 'block';
1661							}
1662	214					534	$para->[1]{'~type'} = $list_type;
1663	214					483	push @$curr_open, $para;
1664							# yes, we reuse the paragraph as a stack item
1665
1666	214					699	my $content = join ' ', splice @$para, 2;
1667	214					567	$para->[1]{'~orig_content'} = $content;
1668	214					393	my $overness;
1669	214	100				1328	if($content =~ m/^\s*$/s) {
		50
1670	120					8552	$para->[1]{'indent'} = 4;
1671							} elsif($content =~ m/^\s((?:\d\.)?\d+)\s*$/s) {
1672	70			70		871	no integer;
	70					157
	70					550
1673	94					343	$para->[1]{'indent'} = $1;
1674	94	50				362	if($1 == 0) {
1675							$self->whine(
1676	0					0	$para->[1]{'start_line'},
1677							"Can't have a 0 in =over $content"
1678							);
1679	0					0	$para->[1]{'indent'} = 4;
1680							}
1681							} else {
1682							$self->whine(
1683	0					0	$para->[1]{'start_line'},
1684							"=over should be: '=over' or '=over positive_number'"
1685							);
1686	0					0	$para->[1]{'indent'} = 4;
1687							}
1688	214					329	DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1689
1690	214	100	100			1165	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1691	214					1027	$self->_maybe_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1692
1693	214					968	return;
1694							}
1695
1696							sub _ponder_back {
1697	214			214		669	my ($self,$para,$curr_open,$paras) = @_;
1698							# TODO: fire off or or ??
1699
1700	214					745	my $content = join ' ', splice @$para, 2;
1701	214	50				743	if($content =~ m/\S/) {
1702							$self->whine(
1703	0					0	$para->[1]{'start_line'},
1704							"=back doesn't take any parameters, but you said =back $content"
1705							);
1706							}
1707
1708	214	50	33			1166	if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1709	214					300	DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1710							# Expected case: we're closing the most recently opened thing
1711							#my $over = pop @$curr_open;
1712	214	100	50			789	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1713							$self->_maybe_handle_element_end( my $scratch =
1714	214					852	'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1715							);
1716							} else {
1717	0					0	DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1718							join(', ', map $_->[0], @$curr_open), ").\n";
1719							$self->whine(
1720	0					0	$para->[1]{'start_line'},
1721							'=back without =over'
1722							);
1723	0					0	return 1; # and ignore it
1724							}
1725							}
1726
1727							sub _ponder_item {
1728	0			0		0	my ($self,$para,$curr_open,$paras) = @_;
1729	0					0	my $over;
1730	0	0	0			0	unless(@$curr_open and
1731	0					0	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1732							$self->whine(
1733	0					0	$para->[1]{'start_line'},
1734							"'=item' outside of any '=over'"
1735							);
1736							unshift @$paras,
1737	0					0	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1738							$para
1739							;
1740	0					0	return 1;
1741							}
1742
1743
1744	0					0	my $over_type = $over->[1]{'~type'};
1745
1746	0	0				0	if(!$over_type) {
		0
		0
		0
		0
1747							# Shouldn't happen1
1748							die "Typeless over in stack, starting at line "
1749	0					0	. $over->[1]{'start_line'};
1750
1751							} elsif($over_type eq 'block') {
1752	0	0				0	unless($curr_open->[-1][1]{'~bitched_about'}) {
1753	0					0	$curr_open->[-1][1]{'~bitched_about'} = 1;
1754							$self->whine(
1755							$curr_open->[-1][1]{'start_line'},
1756							"You can't have =items (as at line "
1757	0					0	. $para->[1]{'start_line'}
1758							. ") unless the first thing after the =over is an =item"
1759							);
1760							}
1761							# Just turn it into a paragraph and reconsider it
1762	0					0	$para->[0] = '~Para';
1763	0					0	unshift @$paras, $para;
1764	0					0	return 1;
1765
1766							} elsif($over_type eq 'text') {
1767	0					0	my $item_type = $self->_get_item_type($para);
1768							# That kills the content of the item if it's a number or bullet.
1769	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1770
1771	0	0	0			0	if($item_type eq 'text') {
		0
1772							# Nothing special needs doing for 'text'
1773							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1774							$self->whine(
1775	0					0	$para->[1]{'start_line'},
1776							"Expected text after =item, not a $item_type"
1777							);
1778							# Undo our clobbering:
1779	0					0	push @$para, $para->[1]{'~orig_content'};
1780	0					0	delete $para->[1]{'number'};
1781							# Only a PROPER item-number element is allowed
1782							# to have a number attribute.
1783							} else {
1784	0					0	die "Unhandled item type $item_type"; # should never happen
1785							}
1786
1787							# =item-text thingies don't need any assimilation, it seems.
1788
1789							} elsif($over_type eq 'number') {
1790	0					0	my $item_type = $self->_get_item_type($para);
1791							# That kills the content of the item if it's a number or bullet.
1792	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1793
1794	0					0	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1795
1796	0	0				0	if($item_type eq 'bullet') {
		0
		0
		0
1797							# Hm, it's not numeric. Correct for this.
1798	0					0	$para->[1]{'number'} = $expected_value;
1799							$self->whine(
1800	0					0	$para->[1]{'start_line'},
1801							"Expected '=item $expected_value'"
1802							);
1803	0					0	push @$para, $para->[1]{'~orig_content'};
1804							# restore the bullet, blocking the assimilation of next para
1805
1806							} elsif($item_type eq 'text') {
1807							# Hm, it's not numeric. Correct for this.
1808	0					0	$para->[1]{'number'} = $expected_value;
1809							$self->whine(
1810	0					0	$para->[1]{'start_line'},
1811							"Expected '=item $expected_value'"
1812							);
1813							# Text content will still be there and will block next ~Para
1814
1815							} elsif($item_type ne 'number') {
1816	0					0	die "Unknown item type $item_type"; # should never happen
1817
1818							} elsif($expected_value == $para->[1]{'number'}) {
1819	0					0	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1820
1821							} else {
1822	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1823							" instead of the expected value of $expected_value\n";
1824							$self->whine(
1825							$para->[1]{'start_line'},
1826	0					0	"You have '=item " . $para->[1]{'number'} .
1827							"' instead of the expected '=item $expected_value'"
1828							);
1829	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1830							}
1831
1832	0	0				0	if(@$para == 2) {
1833							# For the cases where we /didn't/ push to @$para
1834	0	0				0	if($paras->[0][0] eq '~Para') {
1835	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1836	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1837							} else {
1838	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1839	0					0	push @$para, ''; # Just so it's not contentless
1840							}
1841							}
1842
1843
1844							} elsif($over_type eq 'bullet') {
1845	0					0	my $item_type = $self->_get_item_type($para);
1846							# That kills the content of the item if it's a number or bullet.
1847	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1848
1849	0	0				0	if($item_type eq 'bullet') {
		0
		0
1850							# as expected!
1851
1852	0	0				0	if( $para->[1]{'~_freaky_para_hack'} ) {
1853	0					0	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1854	0					0	push @$para, $para->[1]{'~_freaky_para_hack'};
1855							}
1856
1857							} elsif($item_type eq 'number') {
1858							$self->whine(
1859	0					0	$para->[1]{'start_line'},
1860							"Expected '=item *'"
1861							);
1862	0					0	push @$para, $para->[1]{'~orig_content'};
1863							# and block assimilation of the next paragraph
1864	0					0	delete $para->[1]{'number'};
1865							# Only a PROPER item-number element is allowed
1866							# to have a number attribute.
1867							} elsif($item_type eq 'text') {
1868							$self->whine(
1869	0					0	$para->[1]{'start_line'},
1870							"Expected '=item *'"
1871							);
1872							# But doesn't need processing. But it'll block assimilation
1873							# of the next para.
1874							} else {
1875	0					0	die "Unhandled item type $item_type"; # should never happen
1876							}
1877
1878	0	0				0	if(@$para == 2) {
1879							# For the cases where we /didn't/ push to @$para
1880	0	0				0	if($paras->[0][0] eq '~Para') {
1881	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1882	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1883							} else {
1884	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1885	0					0	push @$para, ''; # Just so it's not contentless
1886							}
1887							}
1888
1889							} else {
1890	0					0	die "Unhandled =over type \"$over_type\"?";
1891							# Shouldn't happen!
1892							}
1893	0					0	$para->[0] .= '-' . $over_type;
1894
1895	0					0	return;
1896							}
1897
1898							sub _ponder_Plain {
1899	4582			4582		9571	my ($self,$para) = @_;
1900	4582					6511	DEBUG and print STDERR " giving plain treatment...\n";
1901	4582	100	100			30353	unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
			66
			100
1902							or $para->[1]{'~cooked'}
1903							) {
1904							push @$para,
1905	4477					7132	@{$self->_make_treelet(
1906							join("\n", splice(@$para, 2)),
1907	4477					28045	$para->[1]{'start_line'}
1908							)};
1909							}
1910							# Empty paragraphs don't need a treelet for any reason I can see.
1911							# And precooked paragraphs already have a treelet.
1912	4582					11723	return;
1913							}
1914
1915							sub _ponder_Verbatim {
1916	534			534		1302	my ($self,$para) = @_;
1917	534					730	DEBUG and print STDERR " giving verbatim treatment...\n";
1918
1919	534					1510	$para->[1]{'xml:space'} = 'preserve';
1920
1921	534	100				1613	unless ($self->{'_output_is_for_JustPod'}) {
1922							# Fix illegal settings for expand_verbatim_tabs()
1923							# This is because this module doesn't do input error checking, but khw
1924							# doesn't want to add yet another instance of that.
1925	453					1814	my $tab_width = $self->expand_verbatim_tabs;
1926	453	100	100			2546	$tab_width = $self->expand_verbatim_tabs(8)
1927							if ! defined $tab_width
1928							\|\| $tab_width =~ /\D/;
1929
1930	453					1502	my $indent = $self->strip_verbatim_indent;
1931	453	100	100			1454	if ($indent && ref $indent eq 'CODE') {
1932	10					19	my @shifted = (shift @{$para}, shift @{$para});
	10					23
	10					23
1933	10					44	$indent = $indent->($para);
1934	10					106	unshift @{$para}, @shifted;
	10					32
1935							}
1936
1937	453					1342	for(my $i = 2; $i < @$para; $i++) {
1938	2077					3404	foreach my $line ($para->[$i]) { # just for aliasing
1939							# Strip indentation.
1940	2077	100				3924	$line =~ s/^\Q$indent// if $indent;
1941	2077	100				3606	next unless $tab_width;
1942
1943							# This is commented out because of github issue #85, and the
1944							# current maintainers don't know why it was there in the first
1945							# place.
1946							#&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1947	2076					6614	while( $line =~
1948							# Sort of adapted from Text::Tabs.
1949	189					1266	s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1950							* $tab_width)
1951							-(length($1) % $tab_width)))/e
1952							) {}
1953
1954							# TODO: whinge about (or otherwise treat) unindented or overlong lines
1955
1956							}
1957							}
1958							}
1959
1960							# Now the VerbatimFormatted hoodoo...
1961	534	100	66			2916	if( $self->{'accept_codes'} and
		100
1962							$self->{'accept_codes'}{'VerbatimFormatted'}
1963							) {
1964	103		100			529	while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
	76					296
1965							# Kill any number of terminal newlines
1966	103					450	$self->_verbatim_format($para);
1967							} elsif ($self->{'codes_in_verbatim'}) {
1968							push @$para,
1969	2					4	@{$self->_make_treelet(
1970							join("\n", splice(@$para, 2)),
1971	2					11	$para->[1]{'start_line'}, $para->[1]{'xml:space'}
1972							)};
1973	2					6	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1974							} else {
1975	429	100				2565	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1976	429					3368	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1977							}
1978	534					1338	return;
1979							}
1980
1981							sub _ponder_Data {
1982	54			54		141	my ($self,$para) = @_;
1983	54					83	DEBUG and print STDERR " giving data treatment...\n";
1984	54					133	$para->[1]{'xml:space'} = 'preserve';
1985	54	100				299	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1986	54					169	return;
1987							}
1988
1989
1990
1991
1992							###########################################################################
1993
1994							sub _traverse_treelet_bit { # for use only by the routine above
1995	8054			8054		20661	my($self, $name) = splice @_,0,2;
1996
1997	8054					11516	my $scratch;
1998	8054					25462	$self->_maybe_handle_element_start(($scratch=$name), shift @_);
1999
2000	8054					17951	while (@_) {
2001	12896					20769	my $x = shift;
2002	12896	100				21926	if (ref($x)) {
2003	2890					6298	&_traverse_treelet_bit($self, @$x);
2004							} else {
2005	10006		100			27965	$x .= shift while @_ && !ref($_[0]);
2006	10006					22256	$self->_maybe_handle_text($x);
2007							}
2008							}
2009
2010	8054					23569	$self->_maybe_handle_element_end($scratch=$name);
2011	8054					26986	return;
2012							}
2013
2014							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2015
2016							sub _closers_for_all_curr_open {
2017	10			10		14	my $self = $_[0];
2018	10					21	my @closers;
2019	10	50				16	foreach my $still_open (@{ $self->{'curr_open'} \|\| return }) {
	10					35
2020	15					35	my @copy = @$still_open;
2021	15					19	$copy[1] = {%{ $copy[1] }};
	15					58
2022							#$copy[1]{'start_line'} = -1;
2023	15	100				42	if($copy[0] eq '=for') {
		50
2024	9					15	$copy[0] = '=end';
2025							} elsif($copy[0] eq '=over') {
2026							$self->whine(
2027							$still_open->[1]{start_line} ,
2028	6					36	"=over without closing =back"
2029							);
2030
2031	6					9	$copy[0] = '=back';
2032							} else {
2033	0					0	die "I don't know how to auto-close an open $copy[0] region";
2034							}
2035
2036	15	50				78	unless( @copy > 2 ) {
2037	15					28	push @copy, $copy[1]{'target'};
2038	15	100				33	$copy[-1] = '' unless defined $copy[-1];
2039							# since =over's don't have targets
2040							}
2041
2042	15					29	$copy[1]{'fake-closer'} = 1;
2043
2044	15					16	DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
2045	15					35	unshift @closers, \@copy;
2046							}
2047	10					22	return @closers;
2048							}
2049
2050							#--------------------------------------------------------------------------
2051
2052							sub _verbatim_format {
2053	103			103		246	my($it, $p) = @_;
2054
2055	103					159	my $formatting;
2056
2057	103					276	for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
2058	559					718	DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
2059	559					1282	$p->[$i] .= "\n";
2060							# Unlike with simple Verbatim blocks, we don't end up just doing
2061							# a join("\n", ...) on the contents, so we have to append a
2062							# newline to every line, and then nix the last one later.
2063							}
2064
2065	103					229	if( DEBUG > 4 ) {
2066							print STDERR "<<\n";
2067							for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
2068							print STDERR "_verbatim_format $i: $p->[$i]";
2069							}
2070							print STDERR ">>\n";
2071							}
2072
2073	103					352	for(my $i = $#$p; $i > 2; $i--) {
2074							# work backwards over the lines, except the first (#2)
2075
2076							#next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
2077							# and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
2078							# look at a formatty line preceding a nonformatty one
2079	455					549	DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
2080	455	100				848	if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
2081	11					17	DEBUG > 5 and print STDERR " It's a formatty line. ",
2082							"Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
2083
2084	11	100				31	if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
2085	1					3	DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
2086	1					4	next;
2087							} else {
2088	10					17	DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
2089							}
2090							} else {
2091	444					518	DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
2092	444					847	next;
2093							}
2094
2095							# A formatty line has to have #: in the first two columns, and uses
2096							# "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
2097							# Example:
2098							# What do you want? i like pie. [or whatever]
2099							# #:^^^^^^^^^^^^^^^^^ /////////////
2100
2101
2102	10					13	DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
2103
2104	10					40	$formatting = ' ' . $1;
2105	10					45	$formatting =~ s/\s+$//s; # nix trailing whitespace
2106	10	50	33			59	unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
2107	0					0	splice @$p,$i,1; # remove this line
2108	0					0	$i--; # don't consider next line
2109	0					0	next;
2110							}
2111
2112	10	100				26	if( length($formatting) >= length($p->[$i-1]) ) {
2113	3					11	$formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
2114							} else {
2115	7					24	$formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
2116							}
2117							# Make $formatting and the previous line be exactly the same length,
2118							# with $formatting having a " " as the last character.
2119
2120	10					15	DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
2121
2122
2123	10					17	my @new_line;
2124	10					47	while( $formatting =~ m{\G(( +)\|(\^+)\|(\/+)\|(\%+))}g ) {
2125							#print STDERR "Format matches $1\n";
2126
2127	54	100				129	if($2) {
2128							#print STDERR "SKIPPING <$2>\n";
2129	32					165	push @new_line,
2130							substr($p->[$i-1], pos($formatting)-length($1), length($1));
2131							} else {
2132							#print STDERR "SNARING $+\n";
2133	22	50				183	push @new_line, [
		100
		100
2134							(
2135							$3 ? 'VerbatimB' :
2136							$4 ? 'VerbatimI' :
2137							$5 ? 'VerbatimBI' : die("Should never get called")
2138							), {},
2139							substr($p->[$i-1], pos($formatting)-length($1), length($1))
2140							];
2141							#print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
2142							}
2143							}
2144	10					54	my @nixed =
2145							splice @$p, $i-1, 2, @new_line; # replace myself and the next line
2146	10					17	DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
2147
2148	10					14	DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
2149							scalar(@new_line), "):",
2150							map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
2151	10					43	$i--; # So the next line we scrutinize is the line before the one
2152							# that we just went and formatted
2153							}
2154
2155	103					259	$p->[0] = 'VerbatimFormatted';
2156
2157							# Collapse adjacent text nodes, just for kicks.
2158	103					332	for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2159	0	0	0			0	if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2160	0					0	DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2161	0					0	$p->[$i] .= splice @$p, $i+1, 1; # merge
2162	0					0	--$i; # and back up
2163							}
2164							}
2165
2166							# Now look for the last text token, and remove the terminal newline
2167	103					325	for( my $i = $#$p; $i >= 2; $i-- ) {
2168							# work backwards over the tokens, even the first
2169	103	50				353	if( !ref($p->[$i]) ) {
2170	103	50				720	if($p->[$i] =~ s/\n$//s) {
2171	103					309	DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2172							} else {
2173	0					0	DEBUG > 5 and print STDERR
2174							"No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2175							}
2176	103					229	last; # we only want the next one
2177							}
2178							}
2179
2180	103					256	return;
2181							}
2182
2183
2184							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2185
2186
2187							sub _treelet_from_formatting_codes {
2188							# Given a paragraph, returns a treelet. Full of scary tokenizing code.
2189							# Like [ '~Top', {'start_line' => $start_line},
2190							# "I like ",
2191							# [ 'B', {}, "pie" ],
2192							# "!"
2193							# ]
2194							# This illustrates the general format of a treelet. It is an array:
2195							# [0] is a scalar indicating its type. In the example above, the
2196							# types are '~Top' and 'B'
2197							# [1] is a hash of various flags about it, possibly empty
2198							# [2] - [N] are an ordered list of the subcomponents of the treelet.
2199							# Scalars are literal text, refs are sub-treelets, to
2200							# arbitrary levels. Stringifying a treelet will recursively
2201							# stringify the sub-treelets, concatentating everything
2202							# together to form the exact text of the treelet.
2203
2204	4479			4479		11436	my($self, $para, $start_line, $preserve_space) = @_;
2205
2206	4479					16433	my $treelet = ['~Top', {'start_line' => $start_line},];
2207
2208	4479	100	100			18529	unless ($preserve_space \|\| $self->{'preserve_whitespace'}) {
2209	3602					37624	$para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2210	3602					7250	$para =~ s/ $//;
2211	3602					5964	$para =~ s/^ //;
2212							}
2213
2214							# Only apparent problem the above code is that N<< >> turns into
2215							# N<< >>. But then, word wrapping does that too! So don't do that!
2216
2217
2218							# As a Start-code is encountered, the number of opening bracket '<'
2219							# characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2220							# etc). When closing brackets are found in the text, at least this number
2221							# (plus the 1) will be required to mean the Start-code is terminated. When
2222							# those are found, @stack is popped.
2223	4479					7015	my @stack;
2224
2225	4479					8591	my @lineage = ($treelet);
2226	4479					7449	my $raw = ''; # raw content of L<> fcode before splitting/processing
2227							# XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2228							# into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2229							# the 'collapse and trim all whitespace first' lines just above.
2230	4479					6296	my $inL = 0;
2231
2232	4479					5895	DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2233
2234							# Here begins our frightening tokenizer RE. The following regex matches
2235							# text in four main parts:
2236							#
2237							# * Start-codes. The first alternative matches C< or C<<, the latter
2238							# followed by some whitespace. $1 will hold the entire start code
2239							# (including any space following a multiple-angle-bracket delimiter),
2240							# and $2 will hold only the additional brackets past the first in a
2241							# multiple-bracket delimiter. length($2) + 1 will be the number of
2242							# closing brackets we have to find.
2243							#
2244							# * Closing brackets. Match some amount of whitespace followed by
2245							# multiple close brackets. The logic to see if this closes anything
2246							# is down below. Note that in order to parse C<< >> correctly, we
2247							# have to use look-behind (?<=\s\s), since the match of the starting
2248							# code will have consumed the whitespace.
2249							#
2250							# * A single closing bracket, to close a simple code like C<>.
2251							#
2252							# * Something that isn't a start or end code. We have to be careful
2253							# about accepting whitespace, since perlpodspec says that any whitespace
2254							# before a multiple-bracket closing delimiter should be ignored.
2255							#
2256	4479					60166	while($para =~
2257							m/\G
2258							(?:
2259							# Match starting codes, including the whitespace following a
2260							# multiple-delimiter start code. $1 gets the whole start code and
2261							# $2 gets all but one of the
2262							([A-Z]<(?:(<+)\s+)?)
2263							\|
2264							# Match multiple-bracket end codes. $3 gets the whitespace that
2265							# should be discarded before an end bracket but kept in other cases
2266							# and $4 gets the end brackets themselves. ($3 can be empty if the
2267							# construct is empty, like C<< >>, and all the white-space has been
2268							# gobbled up already, considered to be space after the opening
2269							# bracket. In this case we use look-behind to verify that there are
2270							# at least 2 spaces in a row before the ">".)
2271							(\s+\|(?<=\s\s))(>{2,})
2272							\|
2273							(\s?>) # $5: simple end-codes
2274							\|
2275							( # $6: stuff containing no start-codes or end-codes
2276							(?:
2277							[^A-Z\s>]
2278							\|
2279							(?:
2280							[A-Z](?!<)
2281							)
2282							\|
2283							# whitespace is ok, but we don't want to eat the whitespace before
2284							# a multiple-bracket end code.
2285							# NOTE: we may still have problems with e.g. S<< >>
2286							(?:
2287							\s(?!\s*>{2,})
2288							)
2289							)+
2290							)
2291							)
2292							/xgo
2293							) {
2294	15947					21562	DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2295	15947	100				53269	if(defined $1) {
		100
		100
		50
2296	3087					4340	my $bracket_count; # How many '<<<' in a row this has. Needed for
2297							# Pod::Simple::JustPod
2298	3087	100				5819	if(defined $2) {
2299	119					197	DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2300	119					227	$bracket_count = length($2) + 1;
2301	119					220	push @stack, $bracket_count; # length of the necessary complex
2302							# end-code string
2303							} else {
2304	2968					3884	DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2305	2968					4655	push @stack, 0; # signal that we're looking for simple
2306	2968					4706	$bracket_count = 1;
2307							}
2308	3087					7045	my $code = substr($1,0,1);
2309	3087	100				6288	if ('L' eq $code) {
2310	437	100				910	if ($inL) {
2311	1					3	$raw .= $1;
2312	1					17	$self->scream( $start_line,
2313							'Nested L<> are illegal. Pretending inner one is '
2314							. 'X<...> so can continue looking for other errors.');
2315	1					3	$code = "X";
2316							}
2317							else {
2318	436					740	$raw = ""; # reset raw content accumulator
2319	436					778	$inL = @stack;
2320							}
2321							} else {
2322	2650	100				5228	$raw .= $1 if $inL;
2323							}
2324	3087					8291	push @lineage, [ $code, {}, ]; # new node object
2325
2326							# Tell Pod::Simple::JustPod how many brackets there were, but to save
2327							# space, not in the most usual case of there was just 1. It can be
2328							# inferred by the absence of this element. Similarly, if there is more
2329							# than one bracket, extract the white space between the final bracket
2330							# and the real beginning of the interior. Save that if it isn't just a
2331							# single space
2332	3087	100	100			8534	if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2333	18					44	$lineage[-1][1]{'~bracket_count'} = $bracket_count;
2334	18					47	my $lspacer = substr($1, 1 + $bracket_count);
2335	18	100				54	$lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2336							}
2337	3087					4166	push @{ $lineage[-2] }, $lineage[-1];
	3087					20445
2338							} elsif(defined $4) {
2339	130					197	DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2340							# This is where it gets messy...
2341	130	100				536	if(! @stack) {
		100
		100
		50
2342							# We saw " >>>>" but needed nothing. This is ALL just stuff then.
2343	1					3	DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2344	1					2	push @{ $lineage[-1] }, $3, $4;
	1					5
2345	1					16	next;
2346							} elsif(!$stack[-1]) {
2347							# We saw " >>>>" but needed only ">". Back pos up.
2348	3					5	DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2349	3					6	push @{ $lineage[-1] }, $3; # That was a for-real space, too.
	3					11
2350	3					17	pos($para) = pos($para) - length($4) + 1;
2351							} elsif($stack[-1] == length($4)) {
2352							# We found " >>>>", and it was exactly what we needed. Commonest case.
2353	119					199	DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2354							} elsif($stack[-1] < length($4)) {
2355							# We saw " >>>>" but needed only " >>". Back pos up.
2356	0					0	DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2357	0					0	pos($para) = pos($para) - length($4) + $stack[-1];
2358							} else {
2359							# We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2360	7					15	DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2361	7					12	push @{ $lineage[-1] }, $3, $4;
	7					26
2362	7					45	next;
2363							}
2364							#print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2365
2366	122	50	66			418	if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2367	3	100				14	if ($3 ne "") {
		100
2368	1					4	$lineage[-1][1]{'~rspacer'} = $3;
2369							}
2370							elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2371
2372							# Here we had something like C<< >> which was a false positive
2373	1					4	delete $lineage[-1][1]{'~lspacer'};
2374							}
2375							else {
2376							$lineage[-1][1]{'~rspacer'}
2377	1					4	= substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2378	1					4	chop $lineage[-1][1]{'~lspacer'};
2379							}
2380							}
2381
2382	122	100				211	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	2					5
	122					391
2383							# Keep the element from being childless
2384
2385	122	100				325	if ($inL == @stack) {
2386	26					119	$lineage[-1][1]{'raw'} = $raw;
2387	26					42	$inL = 0;
2388							}
2389
2390	122					224	pop @stack;
2391	122					204	pop @lineage;
2392
2393	122	100				859	$raw .= $3.$4 if $inL;
2394
2395							} elsif(defined $5) {
2396	3064					4036	DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2397
2398	3064	100	100			9878	if(@stack and ! $stack[-1]) {
2399							# We're indeed expecting a simple end-code
2400	2957					3786	DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2401
2402	2957	50				6065	if(length($5) == 2) { # There was a space there: " >"
		100
2403	0					0	push @{ $lineage[-1] }, ' ';
	0					0
2404	2957					6494	} elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2405	41					56	push @{ $lineage[-1] }, ''; # keep it from being really childless
	41					90
2406							}
2407
2408	2957	100				6058	if ($inL == @stack) {
2409	410					1233	$lineage[-1][1]{'raw'} = $raw;
2410	410					599	$inL = 0;
2411							}
2412
2413	2957					4285	pop @stack;
2414	2957					4139	pop @lineage;
2415							} else {
2416	107					315	DEBUG > 4 and print STDERR " It's just stuff.\n";
2417	107					161	push @{ $lineage[-1] }, $5;
	107					305
2418							}
2419
2420	3064	100				26902	$raw .= $5 if $inL;
2421
2422							} elsif(defined $6) {
2423	9666					12978	DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2424	9666					13032	push @{ $lineage[-1] }, $6;
	9666					27417
2425	9666	100				37716	$raw .= $6 if $inL;
2426							# XXX does not capture multiplace whitespaces -- 'raw' ends up with
2427							# at most 1 leading/trailing whitespace, why not all of it?
2428							# Answer, because we deliberately trimmed it above
2429
2430							} else {
2431							# should never ever ever ever happen
2432	0					0	DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2433	0					0	die "SPORK 512512!";
2434							}
2435							}
2436
2437	4479	100				9493	if(@stack) { # Uhoh, some sequences weren't closed.
2438	8					22	my $x= "...";
2439	8					21	while(@stack) {
2440	8	50				14	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	0					0
	8					26
2441							# Hmmmmm!
2442
2443	8					22	my $code = (pop @lineage)->[0];
2444	8					15	my $ender_length = pop @stack;
2445	8	50				35	if($ender_length) {
2446	0					0	--$ender_length;
2447	0					0	$x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2448							} else {
2449	8					34	$x = $code . "<$x>";
2450							}
2451							}
2452	8					12	DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2453	8					39	$self->whine($start_line,
2454							"Unterminated $x sequence",
2455							);
2456							}
2457
2458	4479					17092	return $treelet;
2459							}
2460
2461							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2462
2463							sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2464	0			0	0	0	return stringify_lol($_[1]);
2465							}
2466
2467							sub stringify_lol { # function: stringify_lol($lol)
2468	2641			2641	0	4342	my $string_form = '';
2469	2641					6361	_stringify_lol( $_[0] => \$string_form );
2470	2641					8377	return $string_form;
2471							}
2472
2473							sub _stringify_lol { # the real recursor
2474	3029			3029		5148	my($lol, $to) = @_;
2475	3029					6574	for(my $i = 2; $i < @$lol; ++$i) {
2476	4049	100	100			11234	if( ref($lol->[$i] \|\| '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
			66
2477	388					810	_stringify_lol( $lol->[$i], $to); # recurse!
2478							} else {
2479	3661					9018	$$to .= $lol->[$i];
2480							}
2481							}
2482	3029					5341	return;
2483							}
2484
2485							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2486
2487							sub _dump_curr_open { # return a string representation of the stack
2488	3			3		3	my $curr_open = $_[0]{'curr_open'};
2489
2490	3	50				5	return '[empty]' unless @$curr_open;
2491							return join '; ',
2492	3					6	map {;
2493							($_->[0] eq '=for')
2494							? ( ($_->[1]{'~really'} \|\| '=over')
2495	4	50	50			35	. ' ' . $_->[1]{'target'})
2496							: $_->[0]
2497							}
2498							@$curr_open
2499							;
2500							}
2501
2502							###########################################################################
2503							my %pretty_form = (
2504							"\a" => '\a', # ding!
2505							"\b" => '\b', # BS
2506							"\e" => '\e', # ESC
2507							"\f" => '\f', # FF
2508							"\t" => '\t', # tab
2509							"\cm" => '\cm',
2510							"\cj" => '\cj',
2511							"\n" => '\n', # probably overrides one of either \cm or \cj
2512							'"' => '\"',
2513							'\\' => '\\\\',
2514							'$' => '\\$',
2515							'@' => '\\@',
2516							'%' => '\\%',
2517							'#' => '\\#',
2518							);
2519
2520							sub pretty { # adopted from Class::Classless
2521							# Not the most brilliant routine, but passable.
2522							# Don't give it a cyclic data structure!
2523	502			502	0	249038	my @stuff = @_; # copy
2524	502					682	my $x;
2525							my $out =
2526							# join ",\n" .
2527							join ", ",
2528	502					813	map {;
2529	592	50	100			4314	if(!defined($_)) {
		100	66
		50	33
		100
		100
		100
2530	0					0	"undef";
2531							} elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2532	58					122	$x = "[ " . pretty(@$_) . " ]" ;
2533	58					138	$x;
2534							} elsif(ref($_) eq 'SCALAR') {
2535	0					0	$x = "\\" . pretty($$_) ;
2536	0					0	$x;
2537							} elsif(ref($_) eq 'HASH') {
2538	48					85	my $hr = $_;
2539							$x = "{" . join(", ",
2540	48					416	map(pretty($_) . '=>' . pretty($hr->{$_}),
2541							sort keys %$hr ) ) . "}" ;
2542	48					236	$x;
2543	2					5	} elsif(!length($_)) { q{''} # empty string
2544							} elsif(
2545							$_ eq '0' # very common case
2546							or(
2547							m/^-?(?:[123456789]\d*\|0)(?:\.\d+)?$/s
2548							and $_ ne '-0' # the strange case that RE lets thru
2549							)
2550	26					77	) { $_;
2551							} else {
2552							# Yes, explicitly name every character desired. There are shorcuts one
2553							# could make, but I (Karl Williamson) was afraid that some Perl
2554							# releases would have bugs in some of them. For example [A-Z] works
2555							# even on EBCDIC platforms to match exactly the 26 uppercase English
2556							# letters, but I don't know if it has always worked without bugs. It
2557							# seemed safest just to list the characters.
2558							# s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2559	458					1037	s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{\|}~])>
2560	0	0				0	<$pretty_form{$1} \|\| '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2561							#<$pretty_form{$1} \|\| '\\x'.(unpack("H2",$1))>eg;
2562	458					1305	qq{"$_"};
2563							}
2564							} @stuff;
2565							# $out =~ s/\n */ /g if length($out) < 75;
2566	502					5133	return $out;
2567							}
2568
2569							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2570
2571							# A rather unsubtle method of blowing away all the state information
2572							# from a parser object so it can be reused. Provided as a utility for
2573							# backward compatibility in Pod::Man, etc. but not recommended for
2574							# general use.
2575
2576							sub reinit {
2577	5			5	0	15054	my $self = shift;
2578	5					20	foreach (qw(source_dead source_filename doc_has_started
2579							start_of_pod_block content_seen last_was_blank paras curr_open
2580							line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2581							Title _current_headings _in_head _filter_allowed)) {
2582
2583	95					186	delete $self->{$_};
2584							}
2585							}
2586
2587							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2588							1;
2589