File Coverage

blib/lib/Pod/Simple/BlackBox.pm

Criterion	Covered	Total	%
statement	806	1007	80.0
branch	368	526	69.9
condition	132	189	69.8
subroutine	62	65	95.3
pod	0	7	0.0
total	1368	1794	76.2

line	stmt	bran	cond	sub	pod	time	code
1							package Pod::Simple::BlackBox;
2							#
3							# "What's in the box?" "Pain."
4							#
5							###########################################################################
6							#
7							# This is where all the scary things happen: parsing lines into
8							# paragraphs; and then into directives, verbatims, and then also
9							# turning formatting sequences into treelets.
10							#
11							# Are you really sure you want to read this code?
12							#
13							#-----------------------------------------------------------------------------
14							#
15							# The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16							# of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17							# to call the proper callbacks on the treelets.
18							#
19							# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21	68			68		2997	use integer; # vroom!
	68					278
	68					429
22	68			68		2028	use strict;
	68					136
	68					1243
23	68			68		300	use warnings;
	68					122
	68					1444
24	68			68		331	use Carp ();
	68					138
	68					12931
25							our $VERSION = '3.45';
26							#use constant DEBUG => 7;
27
28							sub my_qr ($$) {
29
30							# $1 is a pattern to compile and return. Older perls compile any
31							# syntactically valid property, even if it isn't legal. To cope with
32							# this, return an empty string unless the compiled pattern also
33							# successfully matches $2, which the caller furnishes.
34
35	624			624	0	1733	my ($input_re, $should_match) = @_;
36							# XXX could have a third parameter $shouldnt_match for extra safety
37
38	624	50				4013	my $use_utf8 = ($] le 5.006002) ? 'use utf8;' : "";
39
40	68			68		569	my $re = eval "no warnings; $use_utf8 qr/$input_re/";
	68			68		158
	68			68		3001
	68			68		447
	68			68		149
	68			68		4596
	68			68		45390
	68			68		2485
	68			68		1186
	68			68		1732
	68			3		202
	68			3		2400
	68			3		418
	68			3		140
	68					2319
	68					445
	68					139
	68					2413
	68					417
	68					138
	68					2548
	68					414
	68					136
	68					2315
	68					401
	68					134
	68					2357
	68					468
	68					139
	68					3826
	3					40
	3					8
	3					147
	3					17
	3					7
	3					126
	3					17
	3					6
	3					106
	3					15
	3					6
	3					100
	624					36824
41							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
42	624	50				2444	return "" if $@;
43
44	68			68		905	my $matches = eval "no warnings; $use_utf8 '$should_match' =~ /$re/";
	68			68		151
	68			68		2934
	68			68		545
	68			68		146
	68			68		3363
	68			68		464
	68			68		157
	68			68		2599
	68			3		458
	68			3		134
	68			3		2313
	68			3		435
	68					133
	68					2547
	68					425
	68					150
	68					2383
	68					422
	68					129
	68					2574
	68					419
	68					132
	68					2399
	68					414
	68					138
	68					2988
	3					19
	3					5
	3					124
	3					22
	3					6
	3					127
	3					25
	3					6
	3					104
	3					19
	3					10
	3					110
	624					34876
45							#print STDERR __LINE__, ": $input_re: $@\n" if $@;
46	624	50				2417	return "" if $@;
47
48							#print STDERR __LINE__, ": SUCCESS: $re\n" if $matches;
49	624	50				2180	return $re if $matches;
50
51							#print STDERR __LINE__, ": $re: didn't match\n";
52	0					0	return "";
53							}
54
55							BEGIN {
56	68			68		1933	require Pod::Simple;
57	68	50				5946	*DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
58							}
59
60							# Matches a character iff the character will have a different meaning
61							# if we choose CP1252 vs UTF-8 if there is no =encoding line.
62							# This is broken for early Perls on non-ASCII platforms.
63							my $non_ascii_re = my_qr('[[:^ascii:]]', "\xB6");
64							$non_ascii_re = qr/[\x80-\xFF]/ unless $non_ascii_re;
65
66							# Use patterns understandable by Perl 5.6, if possible
67	68			68		508	my $cs_re = do { no warnings; my_qr('\p{IsCs}', "\x{D800}") };
	68					190
	68					68600
68							my $cn_re = my_qr('\p{IsCn}', "\x{09E4}"); # code point unlikely
69							# to get assigned
70							my $rare_blocks_re = my_qr('[\p{InIPAExtensions}\p{InSpacingModifierLetters}]',
71							"\x{250}");
72							$rare_blocks_re = my_qr('[\x{0250}-\x{02FF}]', "\x{250}") unless $rare_blocks_re;
73
74	68			68		428	my $script_run_re = eval 'no warnings "experimental::script_run";
	68					129
	68					16734
75							qr/(script_run: ^ . $ )/x';
76							my $latin_re = my_qr('[\p{IsLatin}\p{IsInherited}\p{IsCommon}]', "\x{100}");
77							unless ($latin_re) {
78							# This was machine generated to be the ranges of the union of the above
79							# three properties, with things that were undefined by Unicode 4.1 filling
80							# gaps. That is the version in use when Perl advanced enough to
81							# successfully compile and execute the above pattern.
82							$latin_re = my_qr('[\x00-\x{02E9}\x{02EC}-\x{0374}\x{037E}\x{0385}\x{0387}\x{0485}\x{0486}\x{0589}\x{060C}\x{061B}\x{061F}\x{0640}\x{064B}-\x{0655}\x{0670}\x{06DD}\x{0951}-\x{0954}\x{0964}\x{0965}\x{0E3F}\x{10FB}\x{16EB}-\x{16ED}\x{1735}\x{1736}\x{1802}\x{1803}\x{1805}\x{1D00}-\x{1D25}\x{1D2C}-\x{1D5C}\x{1D62}-\x{1D65}\x{1D6B}-\x{1D77}\x{1D79}-\x{1DBE}\x{1DC0}-\x{1EF9}\x{2000}-\x{2125}\x{2127}-\x{27FF}\x{2900}-\x{2B13}\x{2E00}-\x{2E1D}\x{2FF0}-\x{3004}\x{3006}\x{3008}-\x{3020}\x{302A}-\x{302D}\x{3030}-\x{3037}\x{303C}-\x{303F}\x{3099}-\x{309C}\x{30A0}\x{30FB}\x{30FC}\x{3190}-\x{319F}\x{31C0}-\x{31CF}\x{3220}-\x{325F}\x{327F}-\x{32CF}\x{3358}-\x{33FF}\x{4DC0}-\x{4DFF}\x{A700}-\x{A716}\x{FB00}-\x{FB06}\x{FD3E}\x{FD3F}\x{FE00}-\x{FE6B}\x{FEFF}-\x{FF65}\x{FF70}\x{FF9E}\x{FF9F}\x{FFE0}-\x{FFFD}\x{10100}-\x{1013F}\x{1D000}-\x{1D1DD}\x{1D300}-\x{1D7FF}]', "\x{100}");
83							}
84
85							my $every_char_is_latin_re = my_qr("^(?:$latin_re)*\\z", "A");
86
87							# Latin script code points not in the first release of Unicode
88							my $later_latin_re = my_qr('[^\P{IsLatin}\p{IsAge=1.1}]', "\x{1F6}");
89
90							# If this perl doesn't have the Deprecated property, there's only one code
91							# point in it that we need be concerned with.
92							my $deprecated_re = my_qr('\p{IsDeprecated}', "\x{149}");
93							$deprecated_re = qr/\x{149}/ unless $deprecated_re;
94
95							my $utf8_bom;
96							if (($] ge 5.007_003)) {
97							$utf8_bom = "\x{FEFF}";
98							utf8::encode($utf8_bom);
99							} else {
100							$utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls.
101							}
102
103							# This is used so that the 'content_seen' method doesn't return true on a
104							# file that just happens to have a line that matches /^=[a-zA-z]/. Only if
105							# there is a valid =foo line will we return that content was seen.
106							my $seen_legal_directive = 0;
107
108							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
109
110	0			0	0	0	sub parse_line { shift->parse_lines(@_) } # alias
111
112							# - - - Turn back now! Run away! - - -
113
114							sub parse_lines { # Usage: $parser->parse_lines(@lines)
115							# an undef means end-of-stream
116	9477			9477	0	14576	my $self = shift;
117
118	9477					14416	my $code_handler = $self->{'code_handler'};
119	9477					13969	my $cut_handler = $self->{'cut_handler'};
120	9477					12926	my $wl_handler = $self->{'whiteline_handler'};
121	9477		100			19384	$self->{'line_count'} \|\|= 0;
122
123	9477					12029	my $scratch;
124
125							DEBUG > 4 and
126	9477					11683	print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
127
128	9477					11440	DEBUG > 5 and
129							print STDERR "# About to parse lines: ",
130							join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
131
132	9477		100			20197	my $paras = ($self->{'paras'} \|\|= []);
133							# paragraph buffer. Because we need to defer processing of =over
134							# directives and verbatim paragraphs. We call _ponder_paragraph_buffer
135							# to process this.
136
137	9477		100			19034	$self->{'pod_para_count'} \|\|= 0;
138
139							# An attempt to match the pod portions of a line. This is not fool proof,
140							# but is good enough to serve as part of the heuristic for guessing the pod
141							# encoding if not specified.
142	9477					12786	my $codes = join '', grep { / ^ [A-Za-z] $/x } sort keys %{$self->{accept_codes}};
	90736					187603
	9477					47050
143	9477					46599	my $pod_chars_re = qr/ ^ = [A-Za-z]+ \| [\Q$codes\E] < /x;
144
145	9477					15128	my $line;
146	9477					16761	foreach my $source_line (@_) {
147	19029	50				39996	if( $self->{'source_dead'} ) {
148	0					0	DEBUG > 4 and print STDERR "# Source is dead.\n";
149	0					0	last;
150							}
151
152	19029	100				37396	unless( defined $source_line ) {
153	890					1300	DEBUG > 4 and print STDERR "# Undef-line seen.\n";
154
155	890					2896	push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
156	890					2511	push @$paras, $paras->[-1], $paras->[-1];
157							# So that it definitely fills the buffer.
158	890					1697	$self->{'source_dead'} = 1;
159	890					2376	$self->_ponder_paragraph_buffer;
160	890					1717	next;
161							}
162
163
164	18139	100				33846	if( $self->{'line_count'}++ ) {
165	17253					37011	($line = $source_line) =~ tr/\n\r//d;
166							# If we don't have two vars, we'll end up with that there
167							# tr/// modding the (potentially read-only) original source line!
168
169							} else {
170	886					1220	DEBUG > 2 and print STDERR "First line: [$source_line]\n";
171
172	886	50				5694	if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
		50
		50
173	0					0	DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n";
174	0					0	$self->_handle_encoding_line( "=encoding utf8" );
175	0					0	delete $self->{'_processed_encoding'};
176	0					0	$line =~ tr/\n\r//d;
177
178							} elsif( $line =~ s/^\xFE\xFF//s ) {
179	0					0	DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n";
180							$self->scream(
181	0					0	$self->{'line_count'},
182							"UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
183							);
184	0					0	splice @_;
185	0					0	push @_, undef;
186	0					0	next;
187
188							# TODO: implement somehow?
189
190							} elsif( $line =~ s/^\xFF\xFE//s ) {
191	0					0	DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n";
192							$self->scream(
193	0					0	$self->{'line_count'},
194							"UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
195							);
196	0					0	splice @_;
197	0					0	push @_, undef;
198	0					0	next;
199
200							# TODO: implement somehow?
201
202							} else {
203	886					1723	DEBUG > 2 and print STDERR "First line is BOM-less.\n";
204	886					2465	($line = $source_line) =~ tr/\n\r//d;
205							}
206							}
207
208	18139	100	100			124359	if(!$self->{'parse_characters'} && !$self->{'encoding'}
			100
			100
			100
209							&& ($self->{'in_pod'} \|\| $line =~ /^=/s)
210							&& $line =~ /$non_ascii_re/
211							) {
212
213	21					40	my $encoding;
214
215							# No =encoding line, and we are at the first pod line in the input that
216							# contains a non-ascii byte, that is, one whose meaning varies depending
217							# on whether the file is encoded in UTF-8 or CP1252, which are the two
218							# possibilities permitted by the pod spec. (ASCII is assumed if the
219							# file only contains ASCII bytes.) In order to process this line, we
220							# need to figure out what encoding we will use for the file.
221							#
222							# Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
223							# 160-255, but it is used here, as it often colloquially is, to refer to
224							# the complete set of code points 0-255, including ASCII (0-127), the C1
225							# controls (128-159), and strict Latin 1 (160-255).
226							#
227							# CP1252 is effectively a superset of Latin 1, because it differs only
228							# from colloquial 8859-1 in the C1 controls, which are very unlikely to
229							# actually be present in 8859-1 files, so can be used for other purposes
230							# without conflict. CP 1252 uses most of them for graphic characters.
231							#
232							# Note that all ASCII-range bytes represent their corresponding code
233							# points in both CP1252 and UTF-8. In ASCII platform UTF-8, all other
234							# code points require multiple (non-ASCII) bytes to represent. (A
235							# separate paragraph for EBCDIC is below.) The multi-byte
236							# representation is quite structured. If we find an isolated byte that
237							# would require multiple bytes to represent in UTF-8, we know that the
238							# encoding is not UTF-8. If we find a sequence of bytes that violates
239							# the UTF-8 structure, we also can presume the encoding isn't UTF-8, and
240							# hence must be 1252.
241							#
242							# But there are ambiguous cases where we could guess wrong. If so, the
243							# user will end up having to supply an =encoding line. We use all
244							# readily available information to improve our chances of guessing
245							# right. The odds of something not being UTF-8, but still passing a
246							# UTF-8 validity test go down very rapidly with increasing length of the
247							# sequence. Therefore we look at all non-ascii sequences on the line.
248							# If any of the sequences can't be UTF-8, we quit there and choose
249							# CP1252. If all could be UTF-8, we see if any of the code points
250							# represented are unlikely to be in pod. If so, we guess CP1252. If
251							# not, we check if the line is all in the same script; if not guess
252							# CP1252; otherwise UTF-8. For perls that don't have convenient script
253							# run testing, see if there is both Latin and non-Latin. If so, CP1252,
254							# otherwise UTF-8.
255							#
256							# On EBCDIC platforms, the situation is somewhat different. In
257							# UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
258							# but so do the bytes that are for the C1 controls. Recall that these
259							# correspond to the unused portion of 8859-1 that 1252 mostly takes
260							# over. That means that there are fewer code points that are
261							# represented by multi-bytes. But, note that the these controls are
262							# very unlikely to be in pod text. So if we encounter one of them, it
263							# means that it is quite likely CP1252 and not UTF-8. The net result is
264							# the same code below is used for both platforms.
265							#
266							# XXX probably if the line has E that evaluates to illegal CP1252,
267							# then it is UTF-8. But we haven't processed E<> yet.
268
269	21	50				151	goto set_1252 if $] lt 5.006_000; # No UTF-8 on very early perls
270
271	21					45	my $copy;
272
273	68			68		534	no warnings 'utf8';
	68					156
	68					6545
274
275	21	50				81	if ($] ge 5.007_003) {
276	21					50	$copy = $line;
277
278							# On perls that have this function, we can use it to easily see if the
279							# sequence is valid UTF-8 or not; if valid it turns on the UTF-8 flag
280							# needed below for script run detection
281	21	100				344	goto set_1252 if ! utf8::decode($copy);
282							}
283							elsif (ord("A") != 65) { # Early EBCDIC, assume UTF-8. What's a windows
284							# code page doing here anyway?
285							goto set_utf8;
286							}
287							else { # ASCII, no decode(): do it ourselves using the fundamental
288							# characteristics of UTF-8
289	68			68		45073	use if $] le 5.006002, 'utf8';
	68					943
	68					1130
290
291	0					0	my $char_ord;
292							my $needed; # How many continuation bytes to gobble up
293
294							# Initialize the translated line with a dummy character that will be
295							# deleted after everything else is done. This dummy makes sure that
296							# $copy will be in UTF-8. Doing it now avoids the bugs in early perls
297							# with upgrading in the middle
298	0					0	$copy = chr(0x100);
299
300							# Parse through the line
301	0					0	for (my $i = 0; $i < length $line; $i++) {
302	0					0	my $byte = substr($line, $i, 1);
303
304							# ASCII bytes are trivially dealt with
305	0	0				0	if ($byte !~ $non_ascii_re) {
306	0					0	$copy .= $byte;
307	0					0	next;
308							}
309
310	0					0	my $b_ord = ord $byte;
311
312							# Now figure out what this code point would be if the input is
313							# actually in UTF-8. If, in the process, we discover that it isn't
314							# well-formed UTF-8, we guess CP1252.
315							#
316							# Start the process. If it is UTF-8, we are at the first, start
317							# byte, of a multi-byte sequence. We look at this byte to figure
318							# out how many continuation bytes are needed, and to initialize the
319							# code point accumulator with the data from this byte.
320							#
321							# Normally the minimum continuation byte is 0x80, but in certain
322							# instances the minimum is a higher number. So the code below
323							# overrides this for those instances.
324	0					0	my $min_cont = 0x80;
325
326	0	0				0	if ($b_ord < 0xC2) { # A start byte < C2 is malformed
		0
		0
		0
327	0					0	goto set_1252;
328							}
329							elsif ($b_ord <= 0xDF) {
330	0					0	$needed = 1;
331	0					0	$char_ord = $b_ord & 0x1F;
332							}
333							elsif ($b_ord <= 0xEF) {
334	0	0				0	$min_cont = 0xA0 if $b_ord == 0xE0;
335	0					0	$needed = 2;
336	0					0	$char_ord = $b_ord & (0x1F >> 1);
337							}
338							elsif ($b_ord <= 0xF4) {
339	0	0				0	$min_cont = 0x90 if $b_ord == 0xF0;
340	0					0	$needed = 3;
341	0					0	$char_ord = $b_ord & (0x1F >> 2);
342							}
343							else { # F4 is the highest start byte for legal Unicode; higher is
344							# unlikely to be in pod.
345	0					0	goto set_1252;
346							}
347
348							# ? not enough continuation bytes available
349	0	0				0	goto set_1252 if $i + $needed >= length $line;
350
351							# Accumulate the ordinal of the character from the remaining
352							# (continuation) bytes.
353	0					0	while ($needed-- > 0) {
354	0					0	my $cont = substr($line, ++$i, 1);
355	0					0	$b_ord = ord $cont;
356	0	0	0			0	goto set_1252 if $b_ord < $min_cont \|\| $b_ord > 0xBF;
357
358							# In all cases, any next continuation bytes all have the same
359							# minimum legal value
360	0					0	$min_cont = 0x80;
361
362							# Accumulate this byte's contribution to the code point
363	0					0	$char_ord <<= 6;
364	0					0	$char_ord \|= ($b_ord & 0x3F);
365							}
366
367							# Here, the sequence that formed this code point was valid UTF-8,
368							# so add the completed character to the output
369	0					0	$copy .= chr $char_ord;
370							} # End of loop through line
371
372							# Delete the dummy first character
373	0					0	$copy = substr($copy, 1);
374							}
375
376							# Here, $copy is legal UTF-8.
377
378							# If it can't be legal CP1252, no need to look further. (These bytes
379							# aren't valid in CP1252.) This test could have been placed higher in
380							# the code, but it seemed wrong to set the encoding to UTF-8 without
381							# making sure that the very first instance is well-formed. But what if
382							# it isn't legal CP1252 either? We have to choose one or the other, and
383							# It seems safer to favor the single-byte encoding over the multi-byte.
384	6	50				19	goto set_utf8 if ord("A") == 65 && $line =~ /[\x81\x8D\x8F\x90\x9D]/;
385
386							# The C1 controls are not likely to appear in pod
387	6	50				27	goto set_1252 if ord("A") == 65 && $copy =~ /[\x80-\x9F]/;
388
389							# Nor are surrogates nor unassigned, nor deprecated.
390	6	50				38	DEBUG > 8 and print STDERR __LINE__, ": $copy: surrogate\n" if $copy =~ $cs_re;
391	6	50	33			51	goto set_1252 if $cs_re && $copy =~ $cs_re;
392	6	50	33			56	DEBUG > 8 and print STDERR __LINE__, ": $copy: unassigned\n" if $cn_re && $copy =~ $cn_re;
393	6	50	33			37	goto set_1252 if $cn_re && $copy =~ $cn_re;
394	6	50				34	DEBUG > 8 and print STDERR __LINE__, ": $copy: deprecated\n" if $copy =~ $deprecated_re;
395	6	50				24	goto set_1252 if $copy =~ $deprecated_re;
396
397							# Nor are rare code points. But this is hard to determine. khw
398							# believes that IPA characters and the modifier letters are unlikely to
399							# be in pod (and certainly very unlikely to be the in the first line in
400							# the pod containing non-ASCII)
401	6	100				44	DEBUG > 8 and print STDERR __LINE__, ": $copy: rare\n" if $copy =~ $rare_blocks_re;
402	6	100	66			37	goto set_1252 if $rare_blocks_re && $copy =~ $rare_blocks_re;
403
404							# The first Unicode version included essentially every Latin character
405							# in modern usage. So, a Latin character not in the first release will
406							# unlikely be in pod.
407	5	50	33			43	DEBUG > 8 and print STDERR __LINE__, ": $copy: later_latin\n" if $later_latin_re && $copy =~ $later_latin_re;
408	5	50	33			30	goto set_1252 if $later_latin_re && $copy =~ $later_latin_re;
409
410							# On perls that handle script runs, if the UTF-8 interpretation yields
411							# a single script, we guess UTF-8, otherwise just having a mixture of
412							# scripts is suspicious, so guess CP1252. We first strip off, as best
413							# we can, the ASCII characters that look like they are pod directives,
414							# as these would always show as mixed with non-Latin text.
415	5					75	$copy =~ s/$pod_chars_re//g;
416
417	5	50				19	if ($script_run_re) {
418	0	0				0	goto set_utf8 if $copy =~ $script_run_re;
419	0					0	DEBUG > 8 and print STDERR __LINE__, ": not script run\n";
420	0					0	goto set_1252;
421							}
422
423							# Even without script runs, but on recent enough perls and Unicodes, we
424							# can check if there is a mixture of both Latin and non-Latin. Again,
425							# having a mixture of scripts is suspicious, so assume CP1252
426
427							# If it's all non-Latin, there is no CP1252, as that is Latin
428							# characters and punct, etc.
429	5	50				38	DEBUG > 8 and print STDERR __LINE__, ": $copy: not latin\n" if $copy !~ $latin_re;
430	5	50				33	goto set_utf8 if $copy !~ $latin_re;
431
432	5	100				70	DEBUG > 8 and print STDERR __LINE__, ": $copy: all latin\n" if $copy =~ $every_char_is_latin_re;
433	5	100				44	goto set_utf8 if $copy =~ $every_char_is_latin_re;
434
435	1					2	DEBUG > 8 and print STDERR __LINE__, ": $copy: mixed\n";
436
437	17					24	set_1252:
438							DEBUG > 9 and print STDERR __LINE__, ": $copy: is 1252\n";
439	17					35	$encoding = 'CP1252';
440	17					47	goto done_set;
441
442	4					5	set_utf8:
443							DEBUG > 9 and print STDERR __LINE__, ": $copy: is UTF-8\n";
444	4					9	$encoding = 'UTF-8';
445
446	21					128	done_set:
447							$self->_handle_encoding_line( "=encoding $encoding" );
448	21					46	delete $self->{'_processed_encoding'};
449	21	50				101	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
450
451	21					400	my ($word) = $line =~ /(\S$non_ascii_re\S)/;
452
453							$self->whine(
454	21					171	$self->{'line_count'},
455							"Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
456							);
457							}
458
459	18139					26444	DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
460
461	18139	100				33122	if(!$self->{'in_pod'}) {
462	1700	100				4884	if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s\|$)/s) {
463	941	100				2659	if($1 eq 'cut') {
464							$self->scream(
465	4					40	$self->{'line_count'},
466							"=cut found outside a pod block. Skipping to next block."
467							);
468
469							## Before there were errata sections in the world, it was
470							## least-pessimal to abort processing the file. But now we can
471							## just barrel on thru (but still not start a pod block).
472							#splice @_;
473							#push @_, undef;
474
475	4					10	next;
476							} else {
477							$self->{'in_pod'} = $self->{'start_of_pod_block'}
478	937					2586	= $self->{'last_was_blank'} = 1;
479							# And fall thru to the pod-mode block further down
480							}
481							} else {
482	759					972	DEBUG > 5 and print STDERR "# It's a code-line.\n";
483	759	100				1481	$code_handler->(map $_, $line, $self->{'line_count'}, $self)
484							if $code_handler;
485							# Note: this may cause code to be processed out of order relative
486							# to pods, but in order relative to cuts.
487
488							# Note also that we haven't yet applied the transcoding to $line
489							# by time we call $code_handler!
490
491	759	50				1572	if( $line =~ m/^#\sline\s+(\d+)\s(?:\s"([^"]+)")?\s*$/ ) {
492							# That RE is from perlsyn, section "Plain Old Comments (Not!)",
493							#$fname = $2 if defined $2;
494							#DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
495	0					0	DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
496	0					0	$self->{'line_count'} = $1 - 1;
497							}
498
499	759					1540	next;
500							}
501							}
502
503							# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
504							# Else we're in pod mode:
505
506							# Apply any necessary transcoding:
507	17376	100				32515	$self->{'_transcoder'} && $self->{'_transcoder'}->($line);
508
509							# HERE WE CATCH =encoding EARLY!
510	17376	100				32270	if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
511	39	100				189	next if $self->parse_characters; # Ignore this line
512	38					142	$line = $self->_handle_encoding_line( $line );
513							}
514
515	17375	100				61699	if($line =~ m/^=cut/s) {
		100
		100
516							# here ends the pod block, and therefore the previous pod para
517	166					282	DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
518	166					384	$self->{'in_pod'} = 0;
519							# ++$self->{'pod_para_count'};
520	166					489	$self->_ponder_paragraph_buffer();
521							# by now it's safe to consider the previous paragraph as done.
522	166					244	DEBUG > 6 and print STDERR "Processing any cut handler, line ${$self}{'line_count'}\n";
523	166	100				566	$cut_handler->(map $_, $line, $self->{'line_count'}, $self)
524							if $cut_handler;
525
526							# TODO: add to docs: Note: this may cause cuts to be processed out
527							# of order relative to pods, but in order relative to code.
528
529							} elsif($line =~ m/^(\s*)$/s) { # it's a blank line
530	5731	100	66			21489	if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
531	22	100				82	$wl_handler->(map $_, $line, $self->{'line_count'}, $self)
532							if $wl_handler;
533							}
534
535	5731	100	66			26387	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			100
536	632					941	DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
537	632					819	push @{$paras->[-1]}, $line;
	632					1482
538							} # otherwise it's not interesting
539
540	5731	100	100			18434	if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
541	5636					7208	DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
542							}
543
544	5731					10028	$self->{'last_was_blank'} = 1;
545
546							} elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para...
547
548	6341	100				19133	if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9])(\s+\|$)(.)/s) {
		100
549							# THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
550	2855					11865	my $new = [$1, {'start_line' => $self->{'line_count'}}, $3];
551	2855	100	100			11617	$new->[1]{'~orig_spacer'} = $2 if $2 && $2 ne " ";
552							# Note that in "=head1 foo", the WS is lost.
553							# Example: ['=head1', {'start_line' => 123}, ' foo']
554
555	2855					4530	++$self->{'pod_para_count'};
556
557	2855					8597	$self->_ponder_paragraph_buffer();
558							# by now it's safe to consider the previous paragraph as done.
559
560	2855					5375	push @$paras, $new; # the new incipient paragraph
561	2855					4335	DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
562
563							} elsif($line =~ m/^\s/s) {
564
565	717	100	33			3904	if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
			66
566	173					244	DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
567	173					254	push @{$paras->[-1]}, $line;
	173					476
568							} else {
569	544					855	++$self->{'pod_para_count'};
570	544					1437	$self->_ponder_paragraph_buffer();
571							# by now it's safe to consider the previous paragraph as done.
572	544					757	DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
573	544					2110	push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
574							}
575							} else {
576	2769					4226	++$self->{'pod_para_count'};
577	2769					7229	$self->_ponder_paragraph_buffer();
578							# by now it's safe to consider the previous paragraph as done.
579	2769					9864	push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line];
580	2769					4415	DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
581							}
582	6341					13755	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
583
584							} else {
585							# It's a non-blank line /continuing/ the current para
586	5137	50				9434	if(@$paras) {
587	5137					6209	DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
588	5137					6284	push @{$paras->[-1]}, $line;
	5137					11370
589							} else {
590							# Unexpected case!
591	0					0	die "Continuing a paragraph but \@\$paras is empty?";
592							}
593	5137					9883	$self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
594							}
595
596							} # ends the big while loop
597
598	9477					12486	DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
599	9477					46774	return $self;
600							}
601
602							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
603
604							sub _handle_encoding_line {
605	59			59		194	my($self, $line) = @_;
606
607	59	50				183	return if $self->parse_characters;
608
609							# The point of this routine is to set $self->{'_transcoder'} as indicated.
610
611	59	50				330	return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
612	59					104	DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
613
614	59					170	my $e = $1;
615	59					104	my $orig = $e;
616	59					115	push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
	59					267
617
618	59					106	my $enc_error;
619
620							# Cf. perldoc Encode and perldoc Encode::Supported
621
622	59					7537	require Pod::Simple::Transcode;
623
624	59	100	33			226	if( $self->{'encoding'} ) {
		50
		100
625	5					24	my $norm_current = $self->{'encoding'};
626	5					12	my $norm_e = $e;
627	5					15	foreach my $that ($norm_current, $norm_e) {
628	10					23	$that = lc($that);
629	10					49	$that =~ s/[-_]//g;
630							}
631	5	100				21	if($norm_current eq $norm_e) {
632	3					4	DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
633							"redundant. ($norm_current eq $norm_e). Ignoring.\n";
634	3					9	$enc_error = '';
635							# But that doesn't necessarily mean that the earlier one went okay
636							} else {
637	2					27	$enc_error = "Encoding is already set to " . $self->{'encoding'};
638	2					4	DEBUG > 1 and print STDERR $enc_error;
639							}
640							} elsif (
641							# OK, let's turn on the encoding
642							do {
643	54					92	DEBUG > 1 and print STDERR " Setting encoding to $e\n";
644	54					147	$self->{'encoding'} = $e;
645	54					631	1;
646							}
647							and $e eq 'HACKRAW'
648							) {
649	0					0	DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
650
651							} elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
652
653							die($enc_error = "WHAT? _transcoder is already set?!")
654	52	50				42757	if $self->{'_transcoder'}; # should never happen
655	52					248	require Pod::Simple::Transcode;
656	52					279	$self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
657	52					120	eval {
658	52					160	my @x = ('', "abc", "123");
659	52					239	$self->{'_transcoder'}->(@x);
660							};
661	52	50				185	$@ && die( $enc_error =
662							"Really unexpected error setting up encoding $e: $@\nAborting"
663							);
664	52					143	$self->{'detected_encoding'} = $e;
665
666							} else {
667	2					1875	my @supported = Pod::Simple::Transcode::->all_encodings;
668
669							# Note unsupported, and complain
670	2					2104	DEBUG and print STDERR " Encoding [$e] is unsupported.",
671							"\nSupporteds: @supported\n";
672	2					13	my $suggestion = '';
673
674							# Look for a near match:
675	2					10	my $norm = lc($e);
676	2					7	$norm =~ tr[-_][]d;
677	2					5	my $n;
678	2					7	foreach my $enc (@supported) {
679	248					323	$n = lc($enc);
680	248					310	$n =~ tr[-_][]d;
681	248	50				419	next unless $n eq $norm;
682	0					0	$suggestion = " (Maybe \"$e\" should be \"$enc\"?)";
683	0					0	last;
684							}
685	2					25	my $encmodver = Pod::Simple::Transcode::->encmodver;
686	2					37	$enc_error = join '' =>
687							"This document probably does not appear as it should, because its ",
688							"\"=encoding $e\" line calls for an unsupported encoding.",
689							$suggestion, " [$encmodver\'s supported encodings are: @supported]"
690							;
691
692	2					28	$self->scream( $self->{'line_count'}, $enc_error );
693							}
694	59					115	push @{ $self->{'encoding_command_statuses'} }, $enc_error;
	59					181
695	59	100				191	if (defined($self->{'_processed_encoding'})) {
696							# Double declaration.
697	1					8	$self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
698							}
699	59					122	$self->{'_processed_encoding'} = $orig;
700
701	59					127	return $line;
702							}
703
704							# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
705
706							sub _handle_encoding_second_level {
707							# By time this is called, the encoding (if well formed) will already
708							# have been acted on.
709	38			38		123	my($self, $para) = @_;
710	38					117	my @x = @$para;
711	38					138	my $content = join ' ', splice @x, 2;
712	38					112	$content =~ s/^\s+//s;
713	38					107	$content =~ s/\s+$//s;
714
715	38					66	DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
716
717	38	100				103	if (defined($self->{'_processed_encoding'})) {
718							#if($content ne $self->{'_processed_encoding'}) {
719							# Could it happen?
720							#}
721	37					98	delete $self->{'_processed_encoding'};
722							# It's already been handled. Check for errors.
723	37	50				184	if(! $self->{'encoding_command_statuses'} ) {
		100
724	0					0	DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
725							} elsif( $self->{'encoding_command_statuses'}[-1] ) {
726							$self->whine( $para->[1]{'start_line'},
727							sprintf "Couldn't do %s: %s",
728							$self->{'encoding_command_reqs' }[-1],
729	4					73	$self->{'encoding_command_statuses'}[-1],
730							);
731							} else {
732	33					55	DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
733							}
734
735							} else {
736							# Otherwise it's a syntax error
737	1					20	$self->whine( $para->[1]{'start_line'},
738							"Invalid =encoding syntax: $content"
739							);
740							}
741
742	38					83	return;
743							}
744
745							#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
746
747							{
748							my $m = -321; # magic line number
749
750							sub _gen_errata {
751	890			890		1416	my $self = $_[0];
752							# Return 0 or more fake-o paragraphs explaining the accumulated
753							# errors on this document.
754
755	890	100	66			3006	return() unless $self->{'errata'} and keys %{$self->{'errata'}};
	40					235
756
757	40					95	my @out;
758
759	40					79	foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
	26					74
	40					204
760							push @out,
761							['=item', {'start_line' => $m}, "Around line $line:"],
762							map( ['~Para', {'start_line' => $m, '~cooked' => 1},
763							#['~Top', {'start_line' => $m},
764							$_
765							#]
766							],
767	59					291	@{$self->{'errata'}{$line}}
	59					372
768							)
769							;
770							}
771
772							# TODO: report of unknown entities? unrenderable characters?
773
774	40					469	unshift @out,
775							['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
776							['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
777							"Hey! ",
778							['B', {},
779							'The above document had some coding errors, which are explained below:'
780							]
781							],
782							['=over', {'start_line' => $m, 'errata' => 1}, ''],
783							;
784
785	40					189	push @out,
786							['=back', {'start_line' => $m, 'errata' => 1}, ''],
787							;
788
789	40					86	DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
790
791	40					159	return @out;
792							}
793
794							}
795
796							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
797
798							##############################################################################
799							##
800							## stop reading now stop reading now stop reading now stop reading now stop
801							##
802							## HERE IT BECOMES REALLY SCARY
803							##
804							## stop reading now stop reading now stop reading now stop reading now stop
805							##
806							##############################################################################
807
808							sub _ponder_paragraph_buffer {
809
810							# Para-token types as found in the buffer.
811							# ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
812							# =over, =back, =item
813							# and the null =pod (to be complained about if over one line)
814							#
815							# "~data" paragraphs are something we generate at this level, depending on
816							# a currently open =over region
817
818							# Events fired: Begin and end for:
819							# directivename (like head1 .. head4), item, extend,
820							# for (from =begin...=end, =for),
821							# over-bullet, over-number, over-text, over-block,
822							# item-bullet, item-number, item-text,
823							# Document,
824							# Data, Para, Verbatim
825							# B, C, longdirname (TODO -- wha?), etc. for all directives
826							#
827
828	7224			7224		10817	my $self = $_[0];
829	7224					9573	my $paras;
830	7224	100				9119	return unless @{$paras = $self->{'paras'}};
	7224					16797
831	6290		100			16328	my $curr_open = ($self->{'curr_open'} \|\|= []);
832
833	6290					8542	my $scratch;
834
835	6290					7845	DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
836
837							# We have something in our buffer. So apparently the document has started.
838	6290	100				11846	unless($self->{'doc_has_started'}) {
839	904					2302	$self->{'doc_has_started'} = 1;
840
841	904					1270	my $starting_contentless;
842	904		66			5347	$starting_contentless =
843							(
844							!@$curr_open
845							and @$paras and ! grep $_->[0] ne '~end', @$paras
846							# i.e., if the paras is all ~ends
847							)
848							;
849	904					1314	DEBUG and print STDERR "# Starting ",
850							$starting_contentless ? 'contentless' : 'contentful',
851							" document\n"
852							;
853
854							$self->_handle_element_start(
855							($scratch = 'Document'),
856							{
857	904	100				4710	'start_line' => $paras->[0][1]{'start_line'},
858							$starting_contentless ? ( 'contentless' => 1 ) : (),
859							},
860							);
861							}
862
863	6290					10917	my($para, $para_type);
864	6290					12276	while(@$paras) {
865
866							# If a directive, assume it's legal; subtract below if found not to be
867	8981	100				25179	$seen_legal_directive++ if $paras->[0][0] =~ /^=/;
868
869							last if @$paras == 1
870							and ( $paras->[0][0] eq '=over'
871							or $paras->[0][0] eq '=item'
872	8981	100	100			37861	or ($paras->[0][0] eq '~Verbatim' and $self->{'in_pod'}));
			100
873							# Those're the three kinds of paragraphs that require lookahead.
874							# Actually, an "=item Foo" inside an region
875							# and any =item inside an region (rare)
876							# don't require any lookahead, but all others (bullets
877							# and numbers) do.
878							# The verbatim is different from the other two, because those might be
879							# like:
880							#
881							# =item
882							# ...
883							# =cut
884							# ...
885							# =item
886							#
887							# The =cut here finishes the paragraph but doesn't terminate the =over
888							# they should be in. (khw apologizes that he didn't comment at the time
889							# why the 'in_pod' works, and no longer remembers why, and doesn't think
890							# it is currently worth the effort to re-figure it out.)
891
892							# TODO: whinge about many kinds of directives in non-resolving =for regions?
893							# TODO: many? like what? =head1 etc?
894
895	7466					14801	$para = shift @$paras;
896	7466					12976	$para_type = $para->[0];
897
898	7466					9519	DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
899							$self->_dump_curr_open(), ")\n";
900
901	7466	100				23502	if($para_type eq '=for') {
		100
		100
		100
902	50	50				140	next if $self->_ponder_for($para,$curr_open,$paras);
903
904							} elsif($para_type eq '=begin') {
905	127	50				321	next if $self->_ponder_begin($para,$curr_open,$paras);
906
907							} elsif($para_type eq '=end') {
908	130	50				335	next if $self->_ponder_end($para,$curr_open,$paras);
909
910							} elsif($para_type eq '~end') { # The virtual end-document signal
911	940	50				2806	next if $self->_ponder_doc_end($para,$curr_open,$paras);
912							}
913
914
915							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
916							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
917	6219	100				15507	if(grep $_->[1]{'~ignore'}, @$curr_open) {
918	42					58	DEBUG > 1 and
919							print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
920	42					83	next;
921							}
922							#~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
923							# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
924
925	6177	100				15371	if($para_type eq '=pod') {
		100
		100
926	578					1669	$self->_ponder_pod($para,$curr_open,$paras);
927
928							} elsif($para_type eq '=over') {
929	213	100				840	next if $self->_ponder_over($para,$curr_open,$paras);
930
931							} elsif($para_type eq '=back') {
932	209	100				808	next if $self->_ponder_back($para,$curr_open,$paras);
933
934							} else {
935
936							# All non-magical codes!!!
937
938							# Here we start using $para_type for our own twisted purposes, to
939							# mean how it should get treated, not as what the element name
940							# should be.
941
942	5177					6722	DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
943
944	5177					6683	my $i;
945
946							# Enforce some =headN discipline
947	5177	100	66			14525	if($para_type =~ m/^=head\d$/s
			100
			66
948							and ! $self->{'accept_heads_anywhere'}
949							and @$curr_open
950							and $curr_open->[-1][0] eq '=over'
951							) {
952	6					10	DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
953							$self->whine(
954	6					32	$para->[1]{'start_line'},
955							"You forgot a '=back' before '$para_type'"
956							);
957	6					31	unshift @$paras, ['=back', {}, ''], $para; # close the =over
958	6					16	next;
959							}
960
961
962	5171	100	66			19466	if($para_type eq '=item') {
		100
		100
		100
		100
		100
		100
963
964	1022					1444	my $over;
965	1022	50	33			2715	unless(@$curr_open and
966	1130					4301	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
967							$self->whine(
968	0					0	$para->[1]{'start_line'},
969							"'=item' outside of any '=over'"
970							);
971							unshift @$paras,
972	0					0	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
973							$para
974							;
975	0					0	next;
976							}
977
978
979	1022					2087	my $over_type = $over->[1]{'~type'};
980
981	1022	50				3583	if(!$over_type) {
		50
		100
		100
		50
982							# Shouldn't happen1
983							die "Typeless over in stack, starting at line "
984	0					0	. $over->[1]{'start_line'};
985
986							} elsif($over_type eq 'block') {
987	0	0				0	unless($curr_open->[-1][1]{'~bitched_about'}) {
988	0					0	$curr_open->[-1][1]{'~bitched_about'} = 1;
989							$self->whine(
990							$curr_open->[-1][1]{'start_line'},
991							"You can't have =items (as at line "
992	0					0	. $para->[1]{'start_line'}
993							. ") unless the first thing after the =over is an =item"
994							);
995							}
996							# Just turn it into a paragraph and reconsider it
997	0					0	$para->[0] = '~Para';
998	0					0	unshift @$paras, $para;
999	0					0	next;
1000
1001							} elsif($over_type eq 'text') {
1002	835					2428	my $item_type = $self->_get_item_type($para);
1003							# That kills the content of the item if it's a number or bullet.
1004	835					1307	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1005
1006	835	100	66			1878	if($item_type eq 'text') {
		50
1007							# Nothing special needs doing for 'text'
1008							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1009							$self->whine(
1010	4					41	$para->[1]{'start_line'},
1011							"Expected text after =item, not a $item_type"
1012							);
1013							# Undo our clobbering:
1014	4					14	push @$para, $para->[1]{'~orig_content'};
1015	4					8	delete $para->[1]{'number'};
1016							# Only a PROPER item-number element is allowed
1017							# to have a number attribute.
1018							} else {
1019	0					0	die "Unhandled item type $item_type"; # should never happen
1020							}
1021
1022							# =item-text thingies don't need any assimilation, it seems.
1023
1024							} elsif($over_type eq 'number') {
1025	28					109	my $item_type = $self->_get_item_type($para);
1026							# That kills the content of the item if it's a number or bullet.
1027	28					46	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1028
1029	28					74	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1030
1031	28	50				179	if($item_type eq 'bullet') {
		50
		50
		50
1032							# Hm, it's not numeric. Correct for this.
1033	0					0	$para->[1]{'number'} = $expected_value;
1034							$self->whine(
1035	0					0	$para->[1]{'start_line'},
1036							"Expected '=item $expected_value'"
1037							);
1038	0					0	push @$para, $para->[1]{'~orig_content'};
1039							# restore the bullet, blocking the assimilation of next para
1040
1041							} elsif($item_type eq 'text') {
1042							# Hm, it's not numeric. Correct for this.
1043	0					0	$para->[1]{'number'} = $expected_value;
1044							$self->whine(
1045	0					0	$para->[1]{'start_line'},
1046							"Expected '=item $expected_value'"
1047							);
1048							# Text content will still be there and will block next ~Para
1049
1050							} elsif($item_type ne 'number') {
1051	0					0	die "Unknown item type $item_type"; # should never happen
1052
1053							} elsif($expected_value == $para->[1]{'number'}) {
1054	28					46	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1055
1056							} else {
1057	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1058							" instead of the expected value of $expected_value\n";
1059							$self->whine(
1060							$para->[1]{'start_line'},
1061	0					0	"You have '=item " . $para->[1]{'number'} .
1062							"' instead of the expected '=item $expected_value'"
1063							);
1064	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1065							}
1066
1067	28	50				86	if(@$para == 2) {
1068							# For the cases where we /didn't/ push to @$para
1069	28	100				64	if($paras->[0][0] eq '~Para') {
1070	25					37	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1071	25					39	push @$para, splice @{shift @$paras},2;
	25					70
1072							} else {
1073	3					6	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1074	3					5	push @$para, ''; # Just so it's not contentless
1075							}
1076							}
1077
1078
1079							} elsif($over_type eq 'bullet') {
1080	159					475	my $item_type = $self->_get_item_type($para);
1081							# That kills the content of the item if it's a number or bullet.
1082	159					263	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1083
1084	159	50				350	if($item_type eq 'bullet') {
		0
		0
1085							# as expected!
1086
1087	159	100				367	if( $para->[1]{'~_freaky_para_hack'} ) {
1088	101					131	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1089	101					197	push @$para, $para->[1]{'~_freaky_para_hack'};
1090							}
1091
1092							} elsif($item_type eq 'number') {
1093							$self->whine(
1094	0					0	$para->[1]{'start_line'},
1095							"Expected '=item *'"
1096							);
1097	0					0	push @$para, $para->[1]{'~orig_content'};
1098							# and block assimilation of the next paragraph
1099	0					0	delete $para->[1]{'number'};
1100							# Only a PROPER item-number element is allowed
1101							# to have a number attribute.
1102							} elsif($item_type eq 'text') {
1103							$self->whine(
1104	0					0	$para->[1]{'start_line'},
1105							"Expected '=item *'"
1106							);
1107							# But doesn't need processing. But it'll block assimilation
1108							# of the next para.
1109							} else {
1110	0					0	die "Unhandled item type $item_type"; # should never happen
1111							}
1112
1113	159	100				398	if(@$para == 2) {
1114							# For the cases where we /didn't/ push to @$para
1115	58	50				152	if($paras->[0][0] eq '~Para') {
1116	58					88	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1117	58					92	push @$para, splice @{shift @$paras},2;
	58					160
1118							} else {
1119	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1120	0					0	push @$para, ''; # Just so it's not contentless
1121							}
1122							}
1123
1124							} else {
1125	0					0	die "Unhandled =over type \"$over_type\"?";
1126							# Shouldn't happen!
1127							}
1128
1129	1022					1701	$para_type = 'Plain';
1130	1022					2771	$para->[0] .= '-' . $over_type;
1131							# Whew. Now fall thru and process it.
1132
1133
1134							} elsif($para_type eq '=extend') {
1135							# Well, might as well implement it here.
1136	21					76	$self->_ponder_extend($para);
1137	21					50	next; # and skip
1138							} elsif($para_type eq '=encoding') {
1139							# Not actually acted on here, but we catch errors here.
1140	38					205	$self->_handle_encoding_second_level($para);
1141	38	100				130	next unless $self->keep_encoding_directive;
1142	35					77	$para_type = 'Plain';
1143							} elsif($para_type eq '~Verbatim') {
1144	538					972	$para->[0] = 'Verbatim';
1145	538					810	$para_type = '?Verbatim';
1146							} elsif($para_type eq '~Para') {
1147	2772					4592	$para->[0] = 'Para';
1148	2772					4243	$para_type = '?Plain';
1149							} elsif($para_type eq 'Data') {
1150	30					56	$para->[0] = 'Data';
1151	30					45	$para_type = '?Data';
1152							} elsif( $para_type =~ s/^=//s
1153							and defined( $para_type = $self->{'accept_directives'}{$para_type} )
1154							) {
1155	738					1323	DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
1156							} else {
1157							# An unknown directive!
1158	12					37	$seen_legal_directive--;
1159							DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
1160	12					22	$para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
1161							;
1162							$self->whine(
1163	12					129	$para->[1]{'start_line'},
1164							"Unknown directive: $para->[0]"
1165							);
1166
1167							# And maybe treat it as text instead of just letting it go?
1168	12					51	next;
1169							}
1170
1171	5135	100				17627	if($para_type =~ s/^\?//s) {
1172	3340	100				6775	if(! @$curr_open) { # usual case
1173	2066					2841	DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
1174							} else {
1175	1274					3373	my @fors = grep $_->[0] eq '=for', @$curr_open;
1176							DEBUG > 1 and print STDERR "Containing fors: ",
1177	1274					1783	join(',', map $_->[1]{'target'}, @fors), "\n";
1178
1179	1274	100				2590	if(! @fors) {
		100
1180	1150					1718	DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
1181
1182							#} elsif(grep $_->[1]{'~resolve'}, @fors) {
1183							#} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
1184							} elsif( $fors[-1][1]{'~resolve'} ) {
1185							# Look to the immediately containing for
1186
1187	73	100				154	if($para_type eq 'Data') {
1188	18					24	DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1189	18					29	$para->[0] = 'Para';
1190	18					40	$para_type = 'Plain';
1191							} else {
1192	55					85	DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
1193							}
1194							} else {
1195	51					88	DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
1196	51					136	$para->[0] = $para_type = 'Data';
1197							}
1198							}
1199							}
1200
1201							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1202	5135	100				10340	if($para_type eq 'Plain') {
		100
		50
1203	4547					10516	$self->_ponder_Plain($para);
1204							} elsif($para_type eq 'Verbatim') {
1205	534					1429	$self->_ponder_Verbatim($para);
1206							} elsif($para_type eq 'Data') {
1207	54					174	$self->_ponder_Data($para);
1208							} else {
1209	0					0	die "\$para type is $para_type -- how did that happen?";
1210							# Shouldn't happen.
1211							}
1212
1213							#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1214	5135					13931	$para->[0] =~ s/^[~=]//s;
1215
1216	5135					7329	DEBUG and print STDERR "\n", pretty($para), "\n";
1217
1218							# traverse the treelet (which might well be just one string scalar)
1219							$self->{'content_seen'} \|\|= 1 if $seen_legal_directive
1220	5135	100	100			22584	&& ! $self->{'~tried_gen_errata'};
			100
1221	5135					12610	$self->_traverse_treelet_bit(@$para);
1222							}
1223							}
1224
1225	6290					17108	return;
1226							}
1227
1228							###########################################################################
1229							# The sub-ponderers...
1230
1231
1232
1233							sub _ponder_for {
1234	50			50		124	my ($self,$para,$curr_open,$paras) = @_;
1235
1236							# Fake it out as a begin/end
1237	50					70	my $target;
1238
1239	50	50				128	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1240	0					0	DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1241	0					0	return 1;
1242							}
1243
1244	50					151	for(my $i = 2; $i < @$para; ++$i) {
1245	50	50				234	if($para->[$i] =~ s/^\s(\S+)\s//s) {
1246	50					107	$target = $1;
1247	50					83	last;
1248							}
1249							}
1250	50	50				115	unless(defined $target) {
1251							$self->whine(
1252	0					0	$para->[1]{'start_line'},
1253							"=for without a target?"
1254							);
1255	0					0	return 1;
1256							}
1257	50					70	DEBUG > 1 and
1258							print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1259
1260	50					115	$para->[0] = 'Data';
1261
1262							unshift @$paras,
1263							['=begin',
1264							{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1265							$target,
1266							],
1267							$para,
1268							['=end',
1269	50					320	{'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1270							$target,
1271							],
1272							;
1273
1274	50					185	return 1;
1275							}
1276
1277							sub _ponder_begin {
1278	127			127		253	my ($self,$para,$curr_open,$paras) = @_;
1279	127					333	my $content = join ' ', splice @$para, 2;
1280	127					310	$content =~ s/^\s+//s;
1281	127					283	$content =~ s/\s+$//s;
1282	127	50				279	unless(length($content)) {
1283							$self->whine(
1284	0					0	$para->[1]{'start_line'},
1285							"=begin without a target?"
1286							);
1287	0					0	DEBUG and print STDERR "Ignoring targetless =begin\n";
1288	0					0	return 1;
1289							}
1290
1291	127					562	my ($target, $title) = $content =~ m/^(\S+)\s(.)$/;
1292	127	100				330	$para->[1]{'title'} = $title if ($title);
1293	127					330	$para->[1]{'target'} = $target; # without any ':'
1294	127					183	$content = $target; # strip off the title
1295
1296	127					208	$content =~ s/^:!/!:/s;
1297	127					173	my $neg; # whether this is a negation-match
1298	127	100				395	$neg = 1 if $content =~ s/^!//s;
1299	127					229	my $to_resolve; # whether to process formatting codes
1300	127	100				322	$to_resolve = 1 if $content =~ s/^://s;
1301
1302	127					189	my $dont_ignore; # whether this target matches us
1303
1304	127	100				447	foreach my $target_name (
1305							split(',', $content, -1),
1306							$neg ? () : '*'
1307							) {
1308	249					316	DEBUG > 2 and
1309							print STDERR " Considering whether =begin $content matches $target_name\n";
1310	249	100				570	next unless $self->{'accept_targets'}{$target_name};
1311
1312	66					91	DEBUG > 2 and
1313							print STDERR " It DOES match the acceptable target $target_name!\n";
1314							$to_resolve = 1
1315	66	100				179	if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1316	66					105	$dont_ignore = 1;
1317	66					155	$para->[1]{'target_matching'} = $target_name;
1318	66					103	last; # stop looking at other target names
1319							}
1320
1321	127	100				304	if($neg) {
1322	33	100				67	if( $dont_ignore ) {
1323	6					12	$dont_ignore = '';
1324	6					11	delete $para->[1]{'target_matching'};
1325	6					9	DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1326							} else {
1327	27					44	$dont_ignore = 1;
1328	27					55	$para->[1]{'target_matching'} = '!';
1329	27					41	DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1330							}
1331							}
1332
1333	127					198	$para->[0] = '=for'; # Just what we happen to call these, internally
1334	127		100			448	$para->[1]{'~really'} \|\|= '=begin';
1335	127		100			397	$para->[1]{'~ignore'} = (! $dont_ignore) \|\| 0;
1336	127		100			326	$para->[1]{'~resolve'} = $to_resolve \|\| 0;
1337
1338	127					152	DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1339							"ignore contents of this region\n";
1340	127					170	DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1341							($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1342	127					176	DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1343
1344	127					244	push @$curr_open, $para;
1345	127	100	100			505	if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1346	40					54	DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1347							} else {
1348	87	50	100			281	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1349	87					291	$self->_handle_element_start((my $scratch='for'), $para->[1]);
1350							}
1351
1352	127					449	return 1;
1353							}
1354
1355							sub _ponder_end {
1356	130			130		293	my ($self,$para,$curr_open,$paras) = @_;
1357	130					373	my $content = join ' ', splice @$para, 2;
1358	130					331	$content =~ s/^\s+//s;
1359	130					292	$content =~ s/\s+$//s;
1360	130					182	DEBUG and print STDERR "Ogling '=end $content' directive\n";
1361
1362	130	50				281	unless(length($content)) {
1363							$self->whine(
1364							$para->[1]{'start_line'},
1365							"'=end' without a target?" . (
1366							( @$curr_open and $curr_open->[-1][0] eq '=for' )
1367	0	0	0			0	? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1368							: ''
1369							)
1370							);
1371	0					0	DEBUG and print STDERR "Ignoring targetless =end\n";
1372	0					0	return 1;
1373							}
1374
1375	130	50				404	unless($content =~ m/^\S+$/) { # i.e., unless it's one word
1376							$self->whine(
1377	0					0	$para->[1]{'start_line'},
1378							"'=end $content' is invalid. (Stack: "
1379							. $self->_dump_curr_open() . ')'
1380							);
1381	0					0	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1382	0					0	return 1;
1383							}
1384
1385	130	50	33			496	unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1386							$self->whine(
1387	0					0	$para->[1]{'start_line'},
1388							"=end $content without matching =begin. (Stack: "
1389							. $self->_dump_curr_open() . ')'
1390							);
1391	0					0	DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1392	0					0	return 1;
1393							}
1394
1395	130	100				324	unless($content eq $curr_open->[-1][1]{'target'}) {
1396							$self->whine(
1397							$para->[1]{'start_line'},
1398							"=end $content doesn't match =begin "
1399	3					20	. $curr_open->[-1][1]{'target'}
1400							. ". (Stack: "
1401							. $self->_dump_curr_open() . ')'
1402							);
1403	3					4	DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1404	3					13	return 1;
1405							}
1406
1407							# Else it's okay to close...
1408	127	100				376	if(grep $_->[1]{'~ignore'}, @$curr_open) {
1409	40					52	DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1410							# And that may be because of this to-be-closed =for region, or some
1411							# other one, but it doesn't matter.
1412							} else {
1413	87					165	$curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1414							# what's that for?
1415
1416	87	50	50			266	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1417	87					274	$self->_handle_element_end( my $scratch = 'for', $para->[1]);
1418							}
1419	127					208	DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1420	127					203	pop @$curr_open;
1421
1422	127					602	return 1;
1423							}
1424
1425							sub _ponder_doc_end {
1426	940			940		2277	my ($self,$para,$curr_open,$paras) = @_;
1427	940	100				1966	if(@$curr_open) { # Deal with things left open
1428	10					16	DEBUG and print STDERR "Stack is nonempty at end-document: (",
1429							$self->_dump_curr_open(), ")\n";
1430
1431	10					19	DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1432	10					56	unshift @$paras, $self->_closers_for_all_curr_open;
1433							# Make sure there is exactly one ~end in the parastack, at the end:
1434	10					56	@$paras = grep $_->[0] ne '~end', @$paras;
1435	10					27	push @$paras, $para, $para;
1436							# We need two -- once for the next cycle where we
1437							# generate errata, and then another to be at the end
1438							# when that loop back around to process the errata.
1439	10					44	return 1;
1440
1441							} else {
1442	930					1318	DEBUG and print STDERR "Okay, stack is empty now.\n";
1443							}
1444
1445							# Try generating errata section, if applicable
1446	930	100				2177	unless($self->{'~tried_gen_errata'}) {
1447	890					1728	$self->{'~tried_gen_errata'} = 1;
1448	890					2292	my @extras = $self->_gen_errata();
1449	890	100				2119	if(@extras) {
1450	40					118	unshift @$paras, @extras;
1451	40					82	DEBUG and print STDERR "Generated errata... relooping...\n";
1452	40					199	return 1; # I.e., loop around again to process these fake-o paragraphs
1453							}
1454							}
1455
1456	890					1716	splice @$paras; # Well, that's that for this paragraph buffer.
1457	890					1258	DEBUG and print STDERR "Throwing end-document event.\n";
1458
1459	890					2790	$self->_handle_element_end( my $scratch = 'Document' );
1460	890					3175	return 1; # Hasta la byebye
1461							}
1462
1463							sub _ponder_pod {
1464	578			578		1287	my ($self,$para,$curr_open,$paras) = @_;
1465							$self->whine(
1466	578	50				1309	$para->[1]{'start_line'},
1467							"=pod directives shouldn't be over one line long! Ignoring all "
1468							. (@$para - 2) . " lines of content"
1469							) if @$para > 3;
1470
1471							# Content ignored unless 'pod_handler' is set
1472	578	100				1304	if (my $pod_handler = $self->{'pod_handler'}) {
1473	6					22	my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1474	6	100				18	$line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1475	6					15	$pod_handler->($line, $line_num, $self);
1476							}
1477
1478							# The surrounding methods set content_seen, so let us remain consistent.
1479							# I do not know why it was not here before -- should it not be here?
1480							# $self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1481
1482	578					1614	return;
1483							}
1484
1485							sub _ponder_over {
1486	213			213		569	my ($self,$para,$curr_open,$paras) = @_;
1487	213	50				497	return 1 unless @$paras;
1488	213					345	my $list_type;
1489
1490	213	100				666	if($paras->[0][0] eq '=item') { # most common case
		100
		50
1491	185					717	$list_type = $self->_get_initial_item_type($paras->[0]);
1492
1493							} elsif($paras->[0][0] eq '=back') {
1494							# Ignore empty lists by default
1495	6	100				19	if ($self->{'parse_empty_lists'}) {
1496	2					4	$list_type = 'empty';
1497							} else {
1498	4					8	shift @$paras;
1499	4					19	return 1;
1500							}
1501							} elsif($paras->[0][0] eq '~end') {
1502							$self->whine(
1503	0					0	$para->[1]{'start_line'},
1504							"=over is the last thing in the document?!"
1505							);
1506	0					0	return 1; # But feh, ignore it.
1507							} else {
1508	22					36	$list_type = 'block';
1509							}
1510	209					484	$para->[1]{'~type'} = $list_type;
1511	209					473	push @$curr_open, $para;
1512							# yes, we reuse the paragraph as a stack item
1513
1514	209					719	my $content = join ' ', splice @$para, 2;
1515	209					527	$para->[1]{'~orig_content'} = $content;
1516	209					307	my $overness;
1517	209	100				1081	if($content =~ m/^\s*$/s) {
		50
1518	119					352	$para->[1]{'indent'} = 4;
1519							} elsif($content =~ m/^\s((?:\d\.)?\d+)\s*$/s) {
1520	68			68		496223	no integer;
	68					216
	68					472
1521	90					310	$para->[1]{'indent'} = $1;
1522	90	50				279	if($1 == 0) {
1523							$self->whine(
1524	0					0	$para->[1]{'start_line'},
1525							"Can't have a 0 in =over $content"
1526							);
1527	0					0	$para->[1]{'indent'} = 4;
1528							}
1529							} else {
1530							$self->whine(
1531	0					0	$para->[1]{'start_line'},
1532							"=over should be: '=over' or '=over positive_number'"
1533							);
1534	0					0	$para->[1]{'indent'} = 4;
1535							}
1536	209					336	DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1537
1538	209	100	100			812	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1539	209					944	$self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1540
1541	209					776	return;
1542							}
1543
1544							sub _ponder_back {
1545	209			209		557	my ($self,$para,$curr_open,$paras) = @_;
1546							# TODO: fire off or or ??
1547
1548	209					582	my $content = join ' ', splice @$para, 2;
1549	209	50				561	if($content =~ m/\S/) {
1550							$self->whine(
1551	0					0	$para->[1]{'start_line'},
1552							"=back doesn't take any parameters, but you said =back $content"
1553							);
1554							}
1555
1556	209	50	33			939	if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1557	209					322	DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1558							# Expected case: we're closing the most recently opened thing
1559							#my $over = pop @$curr_open;
1560	209	100	50			702	$self->{'content_seen'} \|\|= 1 unless $self->{'~tried_gen_errata'};
1561							$self->_handle_element_end( my $scratch =
1562	209					933	'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1563							);
1564							} else {
1565	0					0	DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (",
1566							join(', ', map $_->[0], @$curr_open), ").\n";
1567							$self->whine(
1568	0					0	$para->[1]{'start_line'},
1569							'=back without =over'
1570							);
1571	0					0	return 1; # and ignore it
1572							}
1573							}
1574
1575							sub _ponder_item {
1576	0			0		0	my ($self,$para,$curr_open,$paras) = @_;
1577	0					0	my $over;
1578	0	0	0			0	unless(@$curr_open and
1579	0					0	$over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1580							$self->whine(
1581	0					0	$para->[1]{'start_line'},
1582							"'=item' outside of any '=over'"
1583							);
1584							unshift @$paras,
1585	0					0	['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1586							$para
1587							;
1588	0					0	return 1;
1589							}
1590
1591
1592	0					0	my $over_type = $over->[1]{'~type'};
1593
1594	0	0				0	if(!$over_type) {
		0
		0
		0
		0
1595							# Shouldn't happen1
1596							die "Typeless over in stack, starting at line "
1597	0					0	. $over->[1]{'start_line'};
1598
1599							} elsif($over_type eq 'block') {
1600	0	0				0	unless($curr_open->[-1][1]{'~bitched_about'}) {
1601	0					0	$curr_open->[-1][1]{'~bitched_about'} = 1;
1602							$self->whine(
1603							$curr_open->[-1][1]{'start_line'},
1604							"You can't have =items (as at line "
1605	0					0	. $para->[1]{'start_line'}
1606							. ") unless the first thing after the =over is an =item"
1607							);
1608							}
1609							# Just turn it into a paragraph and reconsider it
1610	0					0	$para->[0] = '~Para';
1611	0					0	unshift @$paras, $para;
1612	0					0	return 1;
1613
1614							} elsif($over_type eq 'text') {
1615	0					0	my $item_type = $self->_get_item_type($para);
1616							# That kills the content of the item if it's a number or bullet.
1617	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1618
1619	0	0	0			0	if($item_type eq 'text') {
		0
1620							# Nothing special needs doing for 'text'
1621							} elsif($item_type eq 'number' or $item_type eq 'bullet') {
1622							$self->whine(
1623	0					0	$para->[1]{'start_line'},
1624							"Expected text after =item, not a $item_type"
1625							);
1626							# Undo our clobbering:
1627	0					0	push @$para, $para->[1]{'~orig_content'};
1628	0					0	delete $para->[1]{'number'};
1629							# Only a PROPER item-number element is allowed
1630							# to have a number attribute.
1631							} else {
1632	0					0	die "Unhandled item type $item_type"; # should never happen
1633							}
1634
1635							# =item-text thingies don't need any assimilation, it seems.
1636
1637							} elsif($over_type eq 'number') {
1638	0					0	my $item_type = $self->_get_item_type($para);
1639							# That kills the content of the item if it's a number or bullet.
1640	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1641
1642	0					0	my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1643
1644	0	0				0	if($item_type eq 'bullet') {
		0
		0
		0
1645							# Hm, it's not numeric. Correct for this.
1646	0					0	$para->[1]{'number'} = $expected_value;
1647							$self->whine(
1648	0					0	$para->[1]{'start_line'},
1649							"Expected '=item $expected_value'"
1650							);
1651	0					0	push @$para, $para->[1]{'~orig_content'};
1652							# restore the bullet, blocking the assimilation of next para
1653
1654							} elsif($item_type eq 'text') {
1655							# Hm, it's not numeric. Correct for this.
1656	0					0	$para->[1]{'number'} = $expected_value;
1657							$self->whine(
1658	0					0	$para->[1]{'start_line'},
1659							"Expected '=item $expected_value'"
1660							);
1661							# Text content will still be there and will block next ~Para
1662
1663							} elsif($item_type ne 'number') {
1664	0					0	die "Unknown item type $item_type"; # should never happen
1665
1666							} elsif($expected_value == $para->[1]{'number'}) {
1667	0					0	DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1668
1669							} else {
1670	0					0	DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1671							" instead of the expected value of $expected_value\n";
1672							$self->whine(
1673							$para->[1]{'start_line'},
1674	0					0	"You have '=item " . $para->[1]{'number'} .
1675							"' instead of the expected '=item $expected_value'"
1676							);
1677	0					0	$para->[1]{'number'} = $expected_value; # correcting!!
1678							}
1679
1680	0	0				0	if(@$para == 2) {
1681							# For the cases where we /didn't/ push to @$para
1682	0	0				0	if($paras->[0][0] eq '~Para') {
1683	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1684	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1685							} else {
1686	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1687	0					0	push @$para, ''; # Just so it's not contentless
1688							}
1689							}
1690
1691
1692							} elsif($over_type eq 'bullet') {
1693	0					0	my $item_type = $self->_get_item_type($para);
1694							# That kills the content of the item if it's a number or bullet.
1695	0					0	DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1696
1697	0	0				0	if($item_type eq 'bullet') {
		0
		0
1698							# as expected!
1699
1700	0	0				0	if( $para->[1]{'~_freaky_para_hack'} ) {
1701	0					0	DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1702	0					0	push @$para, $para->[1]{'~_freaky_para_hack'};
1703							}
1704
1705							} elsif($item_type eq 'number') {
1706							$self->whine(
1707	0					0	$para->[1]{'start_line'},
1708							"Expected '=item *'"
1709							);
1710	0					0	push @$para, $para->[1]{'~orig_content'};
1711							# and block assimilation of the next paragraph
1712	0					0	delete $para->[1]{'number'};
1713							# Only a PROPER item-number element is allowed
1714							# to have a number attribute.
1715							} elsif($item_type eq 'text') {
1716							$self->whine(
1717	0					0	$para->[1]{'start_line'},
1718							"Expected '=item *'"
1719							);
1720							# But doesn't need processing. But it'll block assimilation
1721							# of the next para.
1722							} else {
1723	0					0	die "Unhandled item type $item_type"; # should never happen
1724							}
1725
1726	0	0				0	if(@$para == 2) {
1727							# For the cases where we /didn't/ push to @$para
1728	0	0				0	if($paras->[0][0] eq '~Para') {
1729	0					0	DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1730	0					0	push @$para, splice @{shift @$paras},2;
	0					0
1731							} else {
1732	0					0	DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1733	0					0	push @$para, ''; # Just so it's not contentless
1734							}
1735							}
1736
1737							} else {
1738	0					0	die "Unhandled =over type \"$over_type\"?";
1739							# Shouldn't happen!
1740							}
1741	0					0	$para->[0] .= '-' . $over_type;
1742
1743	0					0	return;
1744							}
1745
1746							sub _ponder_Plain {
1747	4547			4547		8598	my ($self,$para) = @_;
1748	4547					6540	DEBUG and print STDERR " giving plain treatment...\n";
1749	4547	100	100			25638	unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
			66
			100
1750							or $para->[1]{'~cooked'}
1751							) {
1752							push @$para,
1753	4442					6911	@{$self->_make_treelet(
1754							join("\n", splice(@$para, 2)),
1755	4442					20743	$para->[1]{'start_line'}
1756							)};
1757							}
1758							# Empty paragraphs don't need a treelet for any reason I can see.
1759							# And precooked paragraphs already have a treelet.
1760	4547					9488	return;
1761							}
1762
1763							sub _ponder_Verbatim {
1764	534			534		1088	my ($self,$para) = @_;
1765	534					734	DEBUG and print STDERR " giving verbatim treatment...\n";
1766
1767	534					1148	$para->[1]{'xml:space'} = 'preserve';
1768
1769	534	100				1232	unless ($self->{'_output_is_for_JustPod'}) {
1770							# Fix illegal settings for expand_verbatim_tabs()
1771							# This is because this module doesn't do input error checking, but khw
1772							# doesn't want to add yet another instance of that.
1773	453					1312	my $tab_width = $self->expand_verbatim_tabs;
1774	453	100	100			2014	$tab_width = $self->expand_verbatim_tabs(8)
1775							if ! defined $tab_width
1776							\|\| $tab_width =~ /\D/;
1777
1778	453					1136	my $indent = $self->strip_verbatim_indent;
1779	453	100	100			1091	if ($indent && ref $indent eq 'CODE') {
1780	10					14	my @shifted = (shift @{$para}, shift @{$para});
	10					18
	10					31
1781	10					32	$indent = $indent->($para);
1782	10					106	unshift @{$para}, @shifted;
	10					30
1783							}
1784
1785	453					1144	for(my $i = 2; $i < @$para; $i++) {
1786	2077					3227	foreach my $line ($para->[$i]) { # just for aliasing
1787							# Strip indentation.
1788	2077	100				3491	$line =~ s/^\Q$indent// if $indent;
1789	2077	100				3315	next unless $tab_width;
1790
1791							# This is commented out because of github issue #85, and the
1792							# current maintainers don't know why it was there in the first
1793							# place.
1794							#&& !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1795	2076					5882	while( $line =~
1796							# Sort of adapted from Text::Tabs.
1797	189					1253	s/^([^\t]*)(\t+)/$1.(" " x ((length($2)
1798							* $tab_width)
1799							-(length($1) % $tab_width)))/e
1800							) {}
1801
1802							# TODO: whinge about (or otherwise treat) unindented or overlong lines
1803
1804							}
1805							}
1806							}
1807
1808							# Now the VerbatimFormatted hoodoo...
1809	534	100	66			2251	if( $self->{'accept_codes'} and
		100
1810							$self->{'accept_codes'}{'VerbatimFormatted'}
1811							) {
1812	103		100			443	while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
	76					285
1813							# Kill any number of terminal newlines
1814	103					293	$self->_verbatim_format($para);
1815							} elsif ($self->{'codes_in_verbatim'}) {
1816							push @$para,
1817	2					10	@{$self->_make_treelet(
1818							join("\n", splice(@$para, 2)),
1819	2					22	$para->[1]{'start_line'}, $para->[1]{'xml:space'}
1820							)};
1821	2					8	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1822							} else {
1823	429	100				1939	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1824	429					2691	$para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1825							}
1826	534					1134	return;
1827							}
1828
1829							sub _ponder_Data {
1830	54			54		113	my ($self,$para) = @_;
1831	54					76	DEBUG and print STDERR " giving data treatment...\n";
1832	54					107	$para->[1]{'xml:space'} = 'preserve';
1833	54	100				164	push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1834	54					102	return;
1835							}
1836
1837
1838
1839
1840							###########################################################################
1841
1842							sub _traverse_treelet_bit { # for use only by the routine above
1843	8010			8010		17045	my($self, $name) = splice @_,0,2;
1844
1845	8010					10339	my $scratch;
1846	8010					24844	$self->_handle_element_start(($scratch=$name), shift @_);
1847
1848	8010					17130	while (@_) {
1849	12846					19857	my $x = shift;
1850	12846	100				21384	if (ref($x)) {
1851	2881					5926	&_traverse_treelet_bit($self, @$x);
1852							} else {
1853	9965		100			25237	$x .= shift while @_ && !ref($_[0]);
1854	9965					20973	$self->_handle_text($x);
1855							}
1856							}
1857
1858	8010					23415	$self->_handle_element_end($scratch=$name);
1859	8010					21314	return;
1860							}
1861
1862							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1863
1864							sub _closers_for_all_curr_open {
1865	10			10		23	my $self = $_[0];
1866	10					18	my @closers;
1867	10	50				16	foreach my $still_open (@{ $self->{'curr_open'} \|\| return }) {
	10					73
1868	15					43	my @copy = @$still_open;
1869	15					26	$copy[1] = {%{ $copy[1] }};
	15					79
1870							#$copy[1]{'start_line'} = -1;
1871	15	100				63	if($copy[0] eq '=for') {
		50
1872	9					18	$copy[0] = '=end';
1873							} elsif($copy[0] eq '=over') {
1874							$self->whine(
1875							$still_open->[1]{start_line} ,
1876	6					43	"=over without closing =back"
1877							);
1878
1879	6					12	$copy[0] = '=back';
1880							} else {
1881	0					0	die "I don't know how to auto-close an open $copy[0] region";
1882							}
1883
1884	15	50				55	unless( @copy > 2 ) {
1885	15					37	push @copy, $copy[1]{'target'};
1886	15	100				46	$copy[-1] = '' unless defined $copy[-1];
1887							# since =over's don't have targets
1888							}
1889
1890	15					29	$copy[1]{'fake-closer'} = 1;
1891
1892	15					18	DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1893	15					60	unshift @closers, \@copy;
1894							}
1895	10					37	return @closers;
1896							}
1897
1898							#--------------------------------------------------------------------------
1899
1900							sub _verbatim_format {
1901	103			103		212	my($it, $p) = @_;
1902
1903	103					121	my $formatting;
1904
1905	103					241	for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1906	559					664	DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1907	559					1283	$p->[$i] .= "\n";
1908							# Unlike with simple Verbatim blocks, we don't end up just doing
1909							# a join("\n", ...) on the contents, so we have to append a
1910							# newline to every line, and then nix the last one later.
1911							}
1912
1913	103					153	if( DEBUG > 4 ) {
1914							print STDERR "<<\n";
1915							for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1916							print STDERR "_verbatim_format $i: $p->[$i]";
1917							}
1918							print STDERR ">>\n";
1919							}
1920
1921	103					301	for(my $i = $#$p; $i > 2; $i--) {
1922							# work backwards over the lines, except the first (#2)
1923
1924							#next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s
1925							# and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1926							# look at a formatty line preceding a nonformatty one
1927	455					532	DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1928	455	100				747	if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) {
1929	11					14	DEBUG > 5 and print STDERR " It's a formatty line. ",
1930							"Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1931
1932	11	100				24	if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1933	1					2	DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n";
1934	1					3	next;
1935							} else {
1936	10					13	DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n";
1937							}
1938							} else {
1939	444					553	DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n";
1940	444					782	next;
1941							}
1942
1943							# A formatty line has to have #: in the first two columns, and uses
1944							# "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1945							# Example:
1946							# What do you want? i like pie. [or whatever]
1947							# #:^^^^^^^^^^^^^^^^^ /////////////
1948
1949
1950	10					14	DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1951
1952	10					27	$formatting = ' ' . $1;
1953	10					33	$formatting =~ s/\s+$//s; # nix trailing whitespace
1954	10	50	33			47	unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1955	0					0	splice @$p,$i,1; # remove this line
1956	0					0	$i--; # don't consider next line
1957	0					0	next;
1958							}
1959
1960	10	100				24	if( length($formatting) >= length($p->[$i-1]) ) {
1961	3					8	$formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1962							} else {
1963	7					25	$formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1964							}
1965							# Make $formatting and the previous line be exactly the same length,
1966							# with $formatting having a " " as the last character.
1967
1968	10					14	DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n";
1969
1970
1971	10					16	my @new_line;
1972	10					44	while( $formatting =~ m{\G(( +)\|(\^+)\|(\/+)\|(\%+))}g ) {
1973							#print STDERR "Format matches $1\n";
1974
1975	54	100				120	if($2) {
1976							#print STDERR "SKIPPING <$2>\n";
1977	32					139	push @new_line,
1978							substr($p->[$i-1], pos($formatting)-length($1), length($1));
1979							} else {
1980							#print STDERR "SNARING $+\n";
1981	22	50				157	push @new_line, [
		100
		100
1982							(
1983							$3 ? 'VerbatimB' :
1984							$4 ? 'VerbatimI' :
1985							$5 ? 'VerbatimBI' : die("Should never get called")
1986							), {},
1987							substr($p->[$i-1], pos($formatting)-length($1), length($1))
1988							];
1989							#print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1990							}
1991							}
1992	10					40	my @nixed =
1993							splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1994	10					15	DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1995
1996	10					14	DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1997							scalar(@new_line), "):",
1998							map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1999	10					31	$i--; # So the next line we scrutinize is the line before the one
2000							# that we just went and formatted
2001							}
2002
2003	103					195	$p->[0] = 'VerbatimFormatted';
2004
2005							# Collapse adjacent text nodes, just for kicks.
2006	103					271	for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
2007	0	0	0			0	if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
2008	0					0	DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
2009	0					0	$p->[$i] .= splice @$p, $i+1, 1; # merge
2010	0					0	--$i; # and back up
2011							}
2012							}
2013
2014							# Now look for the last text token, and remove the terminal newline
2015	103					286	for( my $i = $#$p; $i >= 2; $i-- ) {
2016							# work backwards over the tokens, even the first
2017	103	50				236	if( !ref($p->[$i]) ) {
2018	103	50				537	if($p->[$i] =~ s/\n$//s) {
2019	103					223	DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
2020							} else {
2021	0					0	DEBUG > 5 and print STDERR
2022							"No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
2023							}
2024	103					209	last; # we only want the next one
2025							}
2026							}
2027
2028	103					202	return;
2029							}
2030
2031
2032							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2033
2034
2035							sub _treelet_from_formatting_codes {
2036							# Given a paragraph, returns a treelet. Full of scary tokenizing code.
2037							# Like [ '~Top', {'start_line' => $start_line},
2038							# "I like ",
2039							# [ 'B', {}, "pie" ],
2040							# "!"
2041							# ]
2042							# This illustrates the general format of a treelet. It is an array:
2043							# [0] is a scalar indicating its type. In the example above, the
2044							# types are '~Top' and 'B'
2045							# [1] is a hash of various flags about it, possibly empty
2046							# [2] - [N] are an ordered list of the subcomponents of the treelet.
2047							# Scalars are literal text, refs are sub-treelets, to
2048							# arbitrary levels. Stringifying a treelet will recursively
2049							# stringify the sub-treelets, concatentating everything
2050							# together to form the exact text of the treelet.
2051
2052	4444			4444		9790	my($self, $para, $start_line, $preserve_space) = @_;
2053
2054	4444					13114	my $treelet = ['~Top', {'start_line' => $start_line},];
2055
2056	4444	100	100			15852	unless ($preserve_space \|\| $self->{'preserve_whitespace'}) {
2057	3577					28119	$para =~ s/\s+/ /g; # collapse and trim all whitespace first.
2058	3577					6766	$para =~ s/ $//;
2059	3577					5257	$para =~ s/^ //;
2060							}
2061
2062							# Only apparent problem the above code is that N<< >> turns into
2063							# N<< >>. But then, word wrapping does that too! So don't do that!
2064
2065
2066							# As a Start-code is encountered, the number of opening bracket '<'
2067							# characters minus 1 is pushed onto @stack (so 0 means a single bracket,
2068							# etc). When closing brackets are found in the text, at least this number
2069							# (plus the 1) will be required to mean the Start-code is terminated. When
2070							# those are found, @stack is popped.
2071	4444					6308	my @stack;
2072
2073	4444					8098	my @lineage = ($treelet);
2074	4444					6615	my $raw = ''; # raw content of L<> fcode before splitting/processing
2075							# XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
2076							# into just 1 ' '. Is this the regex's doing or 'raw's? Answer is it's
2077							# the 'collapse and trim all whitespace first' lines just above.
2078	4444					6063	my $inL = 0;
2079
2080	4444					5748	DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
2081
2082							# Here begins our frightening tokenizer RE. The following regex matches
2083							# text in four main parts:
2084							#
2085							# * Start-codes. The first alternative matches C< or C<<, the latter
2086							# followed by some whitespace. $1 will hold the entire start code
2087							# (including any space following a multiple-angle-bracket delimiter),
2088							# and $2 will hold only the additional brackets past the first in a
2089							# multiple-bracket delimiter. length($2) + 1 will be the number of
2090							# closing brackets we have to find.
2091							#
2092							# * Closing brackets. Match some amount of whitespace followed by
2093							# multiple close brackets. The logic to see if this closes anything
2094							# is down below. Note that in order to parse C<< >> correctly, we
2095							# have to use look-behind (?<=\s\s), since the match of the starting
2096							# code will have consumed the whitespace.
2097							#
2098							# * A single closing bracket, to close a simple code like C<>.
2099							#
2100							# * Something that isn't a start or end code. We have to be careful
2101							# about accepting whitespace, since perlpodspec says that any whitespace
2102							# before a multiple-bracket closing delimiter should be ignored.
2103							#
2104	4444					43781	while($para =~
2105							m/\G
2106							(?:
2107							# Match starting codes, including the whitespace following a
2108							# multiple-delimiter start code. $1 gets the whole start code and
2109							# $2 gets all but one of the
2110							([A-Z]<(?:(<+)\s+)?)
2111							\|
2112							# Match multiple-bracket end codes. $3 gets the whitespace that
2113							# should be discarded before an end bracket but kept in other cases
2114							# and $4 gets the end brackets themselves. ($3 can be empty if the
2115							# construct is empty, like C<< >>, and all the white-space has been
2116							# gobbled up already, considered to be space after the opening
2117							# bracket. In this case we use look-behind to verify that there are
2118							# at least 2 spaces in a row before the ">".)
2119							(\s+\|(?<=\s\s))(>{2,})
2120							\|
2121							(\s?>) # $5: simple end-codes
2122							\|
2123							( # $6: stuff containing no start-codes or end-codes
2124							(?:
2125							[^A-Z\s>]
2126							\|
2127							(?:
2128							[A-Z](?!<)
2129							)
2130							\|
2131							# whitespace is ok, but we don't want to eat the whitespace before
2132							# a multiple-bracket end code.
2133							# NOTE: we may still have problems with e.g. S<< >>
2134							(?:
2135							\s(?!\s*>{2,})
2136							)
2137							)+
2138							)
2139							)
2140							/xgo
2141							) {
2142	15883					21863	DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
2143	15883	100				46732	if(defined $1) {
		100
		100
		50
2144	3078					4179	my $bracket_count; # How many '<<<' in a row this has. Needed for
2145							# Pod::Simple::JustPod
2146	3078	100				5301	if(defined $2) {
2147	115					171	DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
2148	115					254	$bracket_count = length($2) + 1;
2149	115					212	push @stack, $bracket_count; # length of the necessary complex
2150							# end-code string
2151							} else {
2152	2963					3547	DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
2153	2963					4307	push @stack, 0; # signal that we're looking for simple
2154	2963					3935	$bracket_count = 1;
2155							}
2156	3078					6282	my $code = substr($1,0,1);
2157	3078	100				5357	if ('L' eq $code) {
2158	433	100				781	if ($inL) {
2159	1					2	$raw .= $1;
2160	1					44	$self->scream( $start_line,
2161							'Nested L<> are illegal. Pretending inner one is '
2162							. 'X<...> so can continue looking for other errors.');
2163	1					2	$code = "X";
2164							}
2165							else {
2166	432					631	$raw = ""; # reset raw content accumulator
2167	432					667	$inL = @stack;
2168							}
2169							} else {
2170	2645	100				4659	$raw .= $1 if $inL;
2171							}
2172	3078					6776	push @lineage, [ $code, {}, ]; # new node object
2173
2174							# Tell Pod::Simple::JustPod how many brackets there were, but to save
2175							# space, not in the most usual case of there was just 1. It can be
2176							# inferred by the absence of this element. Similarly, if there is more
2177							# than one bracket, extract the white space between the final bracket
2178							# and the real beginning of the interior. Save that if it isn't just a
2179							# single space
2180	3078	100	100			7794	if ($self->{'_output_is_for_JustPod'} && $bracket_count > 1) {
2181	18					48	$lineage[-1][1]{'~bracket_count'} = $bracket_count;
2182	18					43	my $lspacer = substr($1, 1 + $bracket_count);
2183	18	100				50	$lineage[-1][1]{'~lspacer'} = $lspacer if $lspacer ne " ";
2184							}
2185	3078					3938	push @{ $lineage[-2] }, $lineage[-1];
	3078					16802
2186							} elsif(defined $4) {
2187	126					187	DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
2188							# This is where it gets messy...
2189	126	100				564	if(! @stack) {
		100
		100
		50
2190							# We saw " >>>>" but needed nothing. This is ALL just stuff then.
2191	1					4	DEBUG > 4 and print STDERR " But it's really just stuff.\n";
2192	1					8	push @{ $lineage[-1] }, $3, $4;
	1					3
2193	1					11	next;
2194							} elsif(!$stack[-1]) {
2195							# We saw " >>>>" but needed only ">". Back pos up.
2196	3					5	DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
2197	3					5	push @{ $lineage[-1] }, $3; # That was a for-real space, too.
	3					7
2198	3					12	pos($para) = pos($para) - length($4) + 1;
2199							} elsif($stack[-1] == length($4)) {
2200							# We found " >>>>", and it was exactly what we needed. Commonest case.
2201	115					174	DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
2202							} elsif($stack[-1] < length($4)) {
2203							# We saw " >>>>" but needed only " >>". Back pos up.
2204	0					0	DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
2205	0					0	pos($para) = pos($para) - length($4) + $stack[-1];
2206							} else {
2207							# We saw " >>>>" but needed " >>>>>>". So this is all just stuff!
2208	7					12	DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
2209	7					14	push @{ $lineage[-1] }, $3, $4;
	7					26
2210	7					37	next;
2211							}
2212							#print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
2213
2214	118	50	66			306	if ($3 ne " " && $self->{'_output_is_for_JustPod'}) {
2215	3	100				29	if ($3 ne "") {
		100
2216	1					3	$lineage[-1][1]{'~rspacer'} = $3;
2217							}
2218							elsif ($lineage[-1][1]{'~lspacer'} eq " ") {
2219
2220							# Here we had something like C<< >> which was a false positive
2221	1					3	delete $lineage[-1][1]{'~lspacer'};
2222							}
2223							else {
2224							$lineage[-1][1]{'~rspacer'}
2225	1					15	= substr($lineage[-1][1]{'~lspacer'}, -1, 1);
2226	1					3	chop $lineage[-1][1]{'~lspacer'};
2227							}
2228							}
2229
2230	118	100				235	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	2					5
	118					256
2231							# Keep the element from being childless
2232
2233	118	100				244	if ($inL == @stack) {
2234	22					54	$lineage[-1][1]{'raw'} = $raw;
2235	22					32	$inL = 0;
2236							}
2237
2238	118					181	pop @stack;
2239	118					152	pop @lineage;
2240
2241	118	100				646	$raw .= $3.$4 if $inL;
2242
2243							} elsif(defined $5) {
2244	3056					3821	DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
2245
2246	3056	100	100			9836	if(@stack and ! $stack[-1]) {
2247							# We're indeed expecting a simple end-code
2248	2952					3758	DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
2249
2250	2952	50				5726	if(length($5) == 2) { # There was a space there: " >"
		100
2251	0					0	push @{ $lineage[-1] }, ' ';
	0					0
2252	2952					5879	} elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
2253	41					64	push @{ $lineage[-1] }, ''; # keep it from being really childless
	41					109
2254							}
2255
2256	2952	100				6001	if ($inL == @stack) {
2257	410					995	$lineage[-1][1]{'raw'} = $raw;
2258	410					554	$inL = 0;
2259							}
2260
2261	2952					4074	pop @stack;
2262	2952					3983	pop @lineage;
2263							} else {
2264	104					170	DEBUG > 4 and print STDERR " It's just stuff.\n";
2265	104					197	push @{ $lineage[-1] }, $5;
	104					273
2266							}
2267
2268	3056	100				21772	$raw .= $5 if $inL;
2269
2270							} elsif(defined $6) {
2271	9623					11775	DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
2272	9623					12129	push @{ $lineage[-1] }, $6;
	9623					24960
2273	9623	100				35449	$raw .= $6 if $inL;
2274							# XXX does not capture multiplace whitespaces -- 'raw' ends up with
2275							# at most 1 leading/trailing whitespace, why not all of it?
2276							# Answer, because we deliberately trimmed it above
2277
2278							} else {
2279							# should never ever ever ever happen
2280	0					0	DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
2281	0					0	die "SPORK 512512!";
2282							}
2283							}
2284
2285	4444	100				8701	if(@stack) { # Uhoh, some sequences weren't closed.
2286	8					38	my $x= "...";
2287	8					20	while(@stack) {
2288	8	50				14	push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
	0					0
	8					22
2289							# Hmmmmm!
2290
2291	8					20	my $code = (pop @lineage)->[0];
2292	8					42	my $ender_length = pop @stack;
2293	8	50				18	if($ender_length) {
2294	0					0	--$ender_length;
2295	0					0	$x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
2296							} else {
2297	8					29	$x = $code . "<$x>";
2298							}
2299							}
2300	8					12	DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
2301	8					39	$self->whine($start_line,
2302							"Unterminated $x sequence",
2303							);
2304							}
2305
2306	4444					13291	return $treelet;
2307							}
2308
2309							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2310
2311							sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol)
2312	0			0	0	0	return stringify_lol($_[1]);
2313							}
2314
2315							sub stringify_lol { # function: stringify_lol($lol)
2316	2587			2587	0	3865	my $string_form = '';
2317	2587					5470	_stringify_lol( $_[0] => \$string_form );
2318	2587					7461	return $string_form;
2319							}
2320
2321							sub _stringify_lol { # the real recursor
2322	2967			2967		4646	my($lol, $to) = @_;
2323	2967					5765	for(my $i = 2; $i < @$lol; ++$i) {
2324	3963	100	100			9457	if( ref($lol->[$i] \|\| '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
			66
2325	380					667	_stringify_lol( $lol->[$i], $to); # recurse!
2326							} else {
2327	3583					7879	$$to .= $lol->[$i];
2328							}
2329							}
2330	2967					4596	return;
2331							}
2332
2333							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2334
2335							sub _dump_curr_open { # return a string representation of the stack
2336	3			3		7	my $curr_open = $_[0]{'curr_open'};
2337
2338	3	50				6	return '[empty]' unless @$curr_open;
2339							return join '; ',
2340	3					6	map {;
2341							($_->[0] eq '=for')
2342							? ( ($_->[1]{'~really'} \|\| '=over')
2343	4	50	50			45	. ' ' . $_->[1]{'target'})
2344							: $_->[0]
2345							}
2346							@$curr_open
2347							;
2348							}
2349
2350							###########################################################################
2351							my %pretty_form = (
2352							"\a" => '\a', # ding!
2353							"\b" => '\b', # BS
2354							"\e" => '\e', # ESC
2355							"\f" => '\f', # FF
2356							"\t" => '\t', # tab
2357							"\cm" => '\cm',
2358							"\cj" => '\cj',
2359							"\n" => '\n', # probably overrides one of either \cm or \cj
2360							'"' => '\"',
2361							'\\' => '\\\\',
2362							'$' => '\\$',
2363							'@' => '\\@',
2364							'%' => '\\%',
2365							'#' => '\\#',
2366							);
2367
2368							sub pretty { # adopted from Class::Classless
2369							# Not the most brilliant routine, but passable.
2370							# Don't give it a cyclic data structure!
2371	502			502	0	5726	my @stuff = @_; # copy
2372	502					667	my $x;
2373							my $out =
2374							# join ",\n" .
2375							join ", ",
2376	502					778	map {;
2377	592	50	100			4392	if(!defined($_)) {
		100	66
		50	33
		100
		100
		100
2378	0					0	"undef";
2379							} elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2380	58					146	$x = "[ " . pretty(@$_) . " ]" ;
2381	58					148	$x;
2382							} elsif(ref($_) eq 'SCALAR') {
2383	0					0	$x = "\\" . pretty($$_) ;
2384	0					0	$x;
2385							} elsif(ref($_) eq 'HASH') {
2386	48					86	my $hr = $_;
2387							$x = "{" . join(", ",
2388	48					470	map(pretty($_) . '=>' . pretty($hr->{$_}),
2389							sort keys %$hr ) ) . "}" ;
2390	48					176	$x;
2391	2					5	} elsif(!length($_)) { q{''} # empty string
2392							} elsif(
2393							$_ eq '0' # very common case
2394							or(
2395							m/^-?(?:[123456789]\d*\|0)(?:\.\d+)?$/s
2396							and $_ ne '-0' # the strange case that RE lets thru
2397							)
2398	26					82	) { $_;
2399							} else {
2400							# Yes, explicitly name every character desired. There are shorcuts one
2401							# could make, but I (Karl Williamson) was afraid that some Perl
2402							# releases would have bugs in some of them. For example [A-Z] works
2403							# even on EBCDIC platforms to match exactly the 26 uppercase English
2404							# letters, but I don't know if it has always worked without bugs. It
2405							# seemed safest just to list the characters.
2406							# s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2407	458	0				1058	s<([^ !"#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{\|}~])>
	0					0
2408							<$pretty_form{$1} \|\| '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2409	458					1379	#<$pretty_form{$1} \|\| '\\x'.(unpack("H2",$1))>eg;
2410							qq{"$_"};
2411							}
2412							} @stuff;
2413	502					2401	# $out =~ s/\n */ /g if length($out) < 75;
2414							return $out;
2415							}
2416
2417							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2418
2419							# A rather unsubtle method of blowing away all the state information
2420							# from a parser object so it can be reused. Provided as a utility for
2421							# backward compatibility in Pod::Man, etc. but not recommended for
2422							# general use.
2423
2424	5			5	0	10348	sub reinit {
2425	5					20	my $self = shift;
2426							foreach (qw(source_dead source_filename doc_has_started
2427							start_of_pod_block content_seen last_was_blank paras curr_open
2428							line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2429							Title)) {
2430	80					142
2431							delete $self->{$_};
2432							}
2433							}
2434
2435							#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2436							1;
2437