File Coverage

blib/lib/Text/MediawikiFormat.pm

Criterion	Covered	Total	%
statement	253	367	68.9
branch	106	194	54.6
condition	20	69	28.9
subroutine	37	42	88.1
pod	2	2	100.0
total	418	674	62.0

line stmt bran cond sub pod time code

1 package Text::MediawikiFormat;

3 14 14 208599 use strict;

14 29

14 602

4 14 14 65 use warnings::register;

14 21

14 2488

6 =head1 NAME

8 Text::MediawikiFormat - Translate Wiki markup into other text formats

10 =head1 VERSION

12 Version 1.04

14 =cut

16 our $VERSION = '1.04';

18 =head1 SYNOPSIS

20 use Text::MediawikiFormat 'wikiformat';

21 my $html = wikiformat ($raw);

22 my $text = wikiformat ($raw, {}, {implicit_links => 1});

24 =head1 DESCRIPTION

26 L and its sister projects use the PHP Mediawiki to format

27 their pages. This module attempts to duplicate the Mediawiki formatting rules.

28 Those formatting rules can be simple and easy to use, while providing more

29 advanced options for the power user. They are also easy to translate into

30 other, more complicated markup languages with this module. It creates HTML by

31 default, but could produce valid POD, DocBook, XML, or any other format

32 imaginable.

34 The most important function is C. It is

35 not exported by default, but will be exported as C if any

36 options at all are passed to the exporter, unless the name is overridden

37 explicitly. See L<"EXPORT"> for more information.

39 It should be noted that this module is written as a drop in replacement for

40 L that expands on that modules functionality and provides

41 a default rule set that may be used to format text like the PHP Mediawiki. It

42 is also well to note early that if you just want a Mediawiki clone (you don't

43 need to customize it heavily and you want integration with a back end

44 database), you should look at L.

46 =cut

48 14 14 75 use Carp qw(carp confess croak);

14 21

14 1016

49 14 14 27225 use CGI qw(:standard);

14 196326

14 109

50 14 14 40323 use Scalar::Util qw(blessed);

14 28

14 1514

51 14 14 6749 use Text::MediawikiFormat::Blocks;

14 29

14 78

52 14 14 8654 use URI;

14 56265

14 562

53 14 14 122 use URI::Escape qw(uri_escape uri_escape_utf8);

14 19

14 1162

55 14 1860 use vars qw($missing_html_packages %tags %opts %merge_matrix

56 14 14 70 $uric $uricCheat $uriCruft);

14 19

58 BEGIN {

59 # Try to load optional HTML packages, recording any errors.

60 14 14 30 eval { require HTML::Parser };

14 12637

61 14 75984 $missing_html_packages = $@;

62 14 35 eval { require HTML::Tagset };

14 8202

63 14 34545 $missing_html_packages .= $@;

64 }

66 ###

67 ### Defaults

68 ###

69 %tags = (

70 indent => qr/^(?:[:*#;]*)(?=[:*#;])/,

71 link => \&_make_html_link,

72 strong => sub {"$_[0]"},

73 emphasized => sub {"$_[0]"},

74 strong_tag => qr/'''(.+?)'''/,

75 emphasized_tag => qr/''(.+?)''/,

code => [ '

',  "

\n", '', "\n" ],

line => [ '', '', '

', "\n" ],

paragraph => [ "

", "

\n", '', "\n", 1 ],

80 paragraph_break => [ '', '', '', "\n" ],

unordered => [ "

\n", "\n", '

', "

\n" ],

ordered => [ "

\n", "\n", '

', "

\n" ],

definition => [ "

\n", "\n", \&_dl ],

84 header => [ '', "\n", \&_make_header ],

86 blocks => {

87 code => qr/^ /,

88 header => qr/^(=+)\s*(.+?)\s*\1$/,

89 line => qr/^-{4,}$/,

90 ordered => qr/^#\s*/,

91 unordered => qr/^\*\s*/,

92 definition => qr/^([;:])\s*/,

93 paragraph => qr/^/,

94 paragraph_break => qr/^\s*$/,

95 },

97 indented => { map { $_ => 1 } qw(ordered unordered definition) },

98 nests => { map { $_ => 1 } qw(ordered unordered definition) },

99 nests_anywhere => { map { $_ => 1 } qw(nowiki) },

100

101 blockorder => [

102 qw(code header line ordered unordered definition

103 paragraph_break paragraph)

104 ],

105 implicit_link_delimiters => qr!\b(?:[A-Z][a-z0-9]\w*){2,}!,

106 extended_link_delimiters => qr!\[(?:\[[^][]*\]|[^][]*)\]!,

107

108 schemas => [qw(http https ftp mailto gopher)],

109

110 unformatted_blocks => [qw(header nowiki pre)],

111

112 allowed_tags => [ #HTML

113 qw(b big blockquote br caption center cite code dd

114 div dl dt em font h1 h2 h3 h4 h5 h6 hr i li ol p

115 pre rb rp rt ruby s samp small strike strong sub

116 sup table td th tr tt u ul var),

117

118 # Mediawiki Specific

119 qw(nowiki),

120 ],

121 allowed_attrs => [

122 qw(title align lang dir width height bgcolor),

123 qw(clear), # BR

124 qw(noshade), # HR

125 qw(cite), # BLOCKQUOTE, Q

126 qw(size face color), # FONT

127 # For various lists, mostly deprecated but safe

128 qw(type start value compact),

129

130 # Tables

131 qw(summary width border frame rules cellspacing

132 cellpadding valign char charoff colgroup col

133 span abbr axis headers scope rowspan colspan),

134 qw(id class name style), # For CSS

135 ],

136

137 _toc => [],

138 );

139

140 %opts = (

141 extended => 1,

142 implicit_links => 0,

143 absolute_links => 1,

144 prefix => '',

145 process_html => 1,

146 charset => 'utf-8',

147 );

148

149 # Make sure import's argument hash contains an `as' entry. `as' defaults to

150 # `wikiformat' when none is given.

151 sub _process_args {

152 15 15 23 shift; # Class

153 15 100 57 return as => shift if @_ == 1;

154 14 92 return as => 'wikiformat', @_;

155 }

156

157 # Delete the options (prefix, extended, implicit_links, ...) from a hash,

158 # returning a new hash with the deleted options.

159 sub _extract_opts {

160 15 15 29 my %newopts;

161

162 15 47 for my $key (

163 qw{prefix extended implicit_links absolute_links

164 process_html debug}

165 )

166 {

167 90 100 253 if ( defined( my $val = delete $_[0]->{$key} ) ) {

168 19 53 $newopts{$key} = $val;

169 }

170 }

171

172 15 69 return \%newopts;

173 }

174

175 # Shamelessly ripped from Hash::Merge, which doesn't work in a threaded

176 # environment with two threads trying to use different merge matrices.

177 %merge_matrix = (

178 SCALAR => {

179 SCALAR => sub { return $_[0] },

180 ARRAY => sub { # Need to be able to replace scalar with array

181 # for extended_link_delimiters (could be array

182 # or regex).

183 return $_[0];

184 },

185 HASH => sub {

186 confess "Attempt to replace hash with scalar"

187 if defined $_[0];

188 return _clone( $_[1] );

189 }

190 },

191

192 ARRAY => {

193 SCALAR => sub { # Need to be able to replace array with scalar

194 # for extended_link_delimiters (could be array

195 # or regex).

196 return _clone( $_[0] );

197 },

198 ARRAY => sub { return _clone( $_[0] ); },

199 HASH => sub { confess "Attempt to replace hash with array" }

200 },

201

202 HASH => {

203 SCALAR => sub { confess "Attempt to replace scalar with hash" },

204 ARRAY => sub { confess "Attempt to replace array with hash" },

205 HASH => sub { _merge_hash_elements( $_[0], $_[1] ) }

206 }

207 );

208

209 # Return arrays and a deep copy of hashes.

210 sub _clone {

211 2890 2890 2487 my ($obj) = @_;

212 2890 2030 my $type;

213 2890 50 6495 if ( !defined $obj ) { # Perl 5.005 compatibility

100

214 0 0 $type = 'SCALAR';

215 }

216 elsif ( ref $obj eq 'HASH' ) {

217 231 226 $type = 'HASH';

218 }

219 elsif ( ref $obj eq 'ARRAY' ) {

220 913 838 $type = 'ARRAY';

221 }

222 else {

223 1746 1475 $type = 'SCALAR';

224 }

225

226 2890 100 6314 return $obj if $type eq 'SCALAR';

227 1144 100 2596 return $obj if $type eq 'ARRAY';

228

229 231 201 my %copy;

230 231 492 foreach my $key ( keys %$obj ) {

231 800 1105 $copy{$key} = _clone( $obj->{$key} );

232 }

233 231 615 return \%copy;

234 }

235

236 # This does a straight merge of hashes, delegating the merge-specific

237 # work to '_merge_hashes'.

238 sub _merge_hash_elements {

239 169 169 177 my ( $left, $right ) = @_;

240 169 50 33 987 die "Arguments for _merge_hash_elements must be hash references"

241 unless UNIVERSAL::isa( $left, 'HASH' ) && UNIVERSAL::isa( $right, 'HASH' );

242

243 169 158 my %newhash;

244 169 512 foreach my $leftkey ( keys %$left ) {

245 243 100 378 if ( exists $right->{$leftkey} ) {

246 235 435 $newhash{$leftkey} = _merge_hashes( $left->{$leftkey}, $right->{$leftkey} );

247 }

248 else {

249 8 15 $newhash{$leftkey} = _clone( $left->{$leftkey} );

250 }

251 }

252 169 694 foreach my $rightkey ( keys %$right ) {

253 2266 100 4650 $newhash{$rightkey} = _clone( $right->{$rightkey} )

254 if !exists $left->{$rightkey};

255 }

256 169 508 return \%newhash;

257 }

258

259 sub _merge_hashes {

260 371 371 3762 my ( $left, $right ) = @_;

261

262 # if one argument or the other is undefined or empty, don't worry about

263 # copying, just return the original.

264 371 50 707 return $right unless defined $left;

265 371 50 549 return $left unless defined $right;

266

267 # For the general use of this function, we want to create duplicates

268 # of all data that is merged.

269

270 371 285 my ( $lefttype, $righttype );

271 371 100 829 if ( ref $left eq 'HASH' ) {

100

272 169 204 $lefttype = 'HASH';

273 }

274 elsif ( ref $left eq 'ARRAY' ) {

275 51 54 $lefttype = 'ARRAY';

276 }

277 else {

278 151 157 $lefttype = 'SCALAR';

279 }

280

281 371 100 664 if ( ref $right eq 'HASH' ) {

100

282 169 176 $righttype = 'HASH';

283 }

284 elsif ( ref $right eq 'ARRAY' ) {

285 49 43 $righttype = 'ARRAY';

286 }

287 else {

288 153 155 $righttype = 'SCALAR';

289 }

290

291 371 847 return $merge_matrix{$lefttype}->{$righttype}( $left, $right );

292 }

293

294 sub _require_html_packages {

295 1 50 1 4 croak "$missing_html_packages\n" . "HTML::Parser & HTML::Tagset is required for process_html\n"

296 if $missing_html_packages;

297 }

298

299 sub import {

300 18 100 18 4025 return unless @_ > 1;

301

302 15 42 my $class = shift;

303 15 71 my %args = $class->_process_args(@_);

304 15 56 my $name = delete $args{as};

305

306 15 69 my $caller = caller();

307 15 229 my $iopts = _merge_hashes _extract_opts( \%args ), \%opts;

308 15 67 my $itags = _merge_hashes \%args, \%tags;

309

310 15 100 82 _require_html_packages

311 if $iopts->{process_html};

312

313 # Could verify ITAGS here via _check_blocks, but what if a user

314 # wants to add a block to block_order that they intend to override

315 # the implementation of with every call to format()?

316

317 14 14 121 no strict 'refs';

14 20

14 50271

318 15 17904 *{ $caller . "::" . $name } = sub {

319 45 45 30730 Text::MediawikiFormat::_format( $itags, $iopts, @_ );

320 }

321 15 80 }

322

323 =head1 FUNCTIONS

324

325 =head2 format

326

327 C takes one required argument, the text to convert, and returns the

328 converted text. It allows two optional arguments. The first is a reference to

329 a hash of tags used to override the function's default behavior. Anything

330 passed in here will override the default tags. The second argument is a hash

331 reference of options. The options are currently:

332

333 =over 4

334

335 =item prefix

336

337 The prefix of any links to wiki pages. In HTML mode, this is the path to the

338 Wiki. The actual linked item itself will be appended to the prefix. This is

339 useful to create full URIs:

340

341 {prefix => 'http://example.com/wiki.pl?page='}

342

343 =item extended

344

345 A boolean flag, true by default, to let square brackets mark links.

346 An optional title may occur after the Wiki targets, preceded by an open pipe.

347 URI titles are separated from their title with a space. These are valid

348 extended links:

349

350 [[A wiki page|and the title to display]]

351 [http://ximbiot.com URI title]

352

353 Where the linking semantics of the destination format allow it, the result will

354 display the title instead of the URI. In HTML terms, the title is the content

355 of an C element (not the content of its C attribute).

356

357 You can use delimiters other than single square brackets for marking extended

358 links by passing a value for C in the C<%tags> hash

359 when calling C.

360

361 Note that if you disable this flag, you should probably enable

362 C or there will be no automated way to link to other pages in

363 your wiki.

364

365 =item implicit_links

366

367 A boolean flag, false by default, to create links from StudlyCapsStrings.

368

369 =item absolute_links

370

371 A boolean flag, true by default, which treats any links that are absolute URIs

372 (such as C) specially. Any prefix will not apply.

373 This should maybe be called implicit_absolute_links since the C

374 option enables absolute links inside square brackets by default.

375

376 A link is any text that starts with a known schema followed by a colon and one

377 or more non-whitespace characters. This is a distinct subset of what L

378 recognizes as a URI, but is a good first-order approximation. If you need to

379 recognize more complex URIs, use the standard wiki formatting explained

380 earlier.

381

382 The recognized schemas are those defined in the C value in the C<%tags>

383 hash. C defaults to C, C, C, C, and

384 C.

385

386 =item process_html

387

388 This flag, true by default, causes the formatter to ignore block level wiki

389 markup (code, ordered, unordered, etc...) when they occur on lines which also

390

contain allowed block-level HTML tags (

, , ,

, etc...).

391 Phrase level wiki markup (emphasis, strong, & links) is unaffected by this

392 flag.

393

394 =back

395

396 =cut

397

398 sub format {

399 7 7 1 22763 _format( \%tags, \%opts, @_ );

400 }

401

402 # Turn the contents after a ; or : into a dictionary list.

403 # Using : without ; just looks like an indent.

404 sub _dl {

405

406 #my ($line, $indent, $lead) = @_;

407 23 23 17 my ( $term, $def );

408

409 23 100 31 if ( $_[2] eq ';' ) {

410 11 100 35 if ( $_[0] =~ /^(.*?)\s+:\s+(.*)$/ ) {

411 6 10 $term = $1;

412 6 9 $def = $2;

413 }

414 else {

415 5 7 $term = $_[0];

416 }

417 }

418 else {

419 12 12 $def = $_[0];

420 }

421

422 23 18 my @retval;

423 23 100 47 push @retval, "
", $term, "
\n" if defined $term;

424 23 100 43 push @retval, "
", $def, "
\n" if defined $def;

425 23 63 return @retval;

426 }

427

428 # Makes a regex out of the allowed schema array.

429 sub _make_schema_regex {

430 47 47 85 my $re = join "|", map {qr/\Q$_\E/} @_;

231 1998

431 47 871 return qr/(?:$re)/;

432 }

433

434 $uric = $URI::uric;

435 $uricCheat = $uric;

436

437 # We need to avoid picking up 'HTTP::Request::Common' so we have a

438 # subset of uric without a colon.

439 $uricCheat =~ tr/://d;

440

441 # Identifying characters often accidentally picked up trailing a URI.

442 $uriCruft = q/]),.!'";}/;

443

444 # escape a URI based on our charset.

445 sub _escape_uri {

446 28 28 43 my ( $opts, $uri ) = @_;

447 28 50 76 confess "charset not initialized" unless $opts->{charset};

448 28 50 247 return uri_escape_utf8 $uri if $opts->{charset} =~ /^utf-?8$/i;

449 0 0 return uri_escape $uri;

450 }

451

452 # Turn [[Wiki Link|Title]], [URI Title], scheme:url, or StudlyCaps into links.

453 sub _make_html_link {

454 32 32 77 my ( $tag, $opts, $tags ) = @_;

455

456 32 48 my ( $class, $trailing ) = ( '', '' );

457 32 34 my ( $href, $title );

458 32 100 147 if ( $tag =~ /^\[\[([^|#]*)(?:(#)([^|]*))?(?:(\|)(.*))?\]\]$/ ) {

100

459

460 # Wiki link

461 7 50 55 $href = $opts->{prefix} . _escape_uri $opts, $1 if $1;

462 7 50 194 $href .= $2 . _escape_uri $opts, $3 if $2;

463

464 7 100 21 if ($4) {

465

466 # Title specified explicitly.

467 3 50 10 if ( length $5 ) {

468 3 8 $title = $5;

469 }

470 else {

471 # An empty title asks Mediawiki to strip any parens off the end

472 # of the node name.

473 0 0 $1 =~ /^([^(]*)(?:\s*\()?/;

474 0 0 $title = $1;

475 }

476 }

477 else {

478 # Title defaults to the node name.

479 4 7 $title = $1;

480 }

481 }

482 elsif ( $tag =~ /^\[(\S*)(?:(\s+)(.*))?\]$/ ) {

483

484 # URI

485 5 10 $href = $1;

486 5 50 13 if ($2) {

487 5 8 $title = $3;

488 }

489 else {

490 0 0 $title = ++$opts->{_uri_refs};

491 }

492 5 10 $href =~ s/'/%27/g;

493 }

494 else {

495 # Shouldn't be able to get here without either $opts->{absolute_links}

496 # or $opts->{implicit_links};

497 20 33 50 $tags->{_schema_regex} ||= _make_schema_regex @{ $tags->{schemas} };

0 0

498 20 25 my $s = $tags->{_schema_regex};

499

500 20 100 525 if ( $tag =~ /^$s:[$uricCheat][$uric]*$/ ) {

501

502 # absolute link

503 8 17 $href = $&;

504 8 100 67 $trailing = $& if $href =~ s/[$uriCruft]$//;

505 8 16 $title = $href;

506 }

507 else {

508 # StudlyCaps

509 12 38 $href = $opts->{prefix} . _escape_uri $opts, $tag;

510 12 290 $title = $tag;

511 }

512 }

513

514 32 223 return "$title$trailing";

515 }

516

517 # Store a TOC line for later.

518 #

519 # ASSUMPTIONS

520 # $level >= 1

521 sub _store_toc_line {

522 12 12 20 my ( $toc, $level, $title, $name ) = @_;

523

524 # TODO: Strip formatting from $title.

525

526 12 100 100 63 if ( @$toc && $level > $toc->[-1]->{level} ) {

527

528 # Nest a sublevel.

529 3 100 19 $toc->[-1]->{sublevel} = []

530 unless exists $toc->[-1]->{sublevel};

531 3 13 _store_toc_line( $toc->[-1]->{sublevel}, $level, $title, $name );

532 }

533 else {

534 9 54 push @$toc, { level => $level, title => $title, name => $name };

535 }

536

537 12 23 return $level;

538 }

539

540 # Make header text, storing the line for the TOC.

541 #

542 # ASSUMPTIONS

543 # $tags->{_toc} has been initialized to an array ref.

544 sub _make_header {

545 9 9 16 my $level = length $_[2];

546 9 29 my $n = _escape_uri $_[-1], $_[3];

547

548 9 249 _store_toc_line( $_[-2]->{_toc}, $level, $_[3], $n );

549

550 9 50 return "", Text::MediawikiFormat::format_line( $_[3], @_[ -2, -1 ] ), "\n";

551 }

552

553 sub _format {

554 50 50 117 my ( $itags, $iopts, $text, $tags, $opts ) = @_;

555

556 # Overwriting the caller's hashes locally after merging its contents

557 # is okay.

558 50 100 366 $tags = _merge_hashes( $tags || {}, $itags );

559 50 100 253 $opts = _merge_hashes( $opts || {}, $iopts );

560

561 50 50 157 _require_html_packages

562 if $opts->{process_html};

563

564 # Always verify the blocks since the user may have slagged the

565 # default hash on import.

566 50 114 _check_blocks($tags);

567

568 50 157 my @blocks = _find_blocks( $text, $tags, $opts );

569 50 144 @blocks = _nest_blocks( \@blocks );

570 50 200 return _process_blocks( \@blocks, $tags, $opts );

571 }

572

573 sub _check_blocks {

574 52 52 2467 my $tags = shift;

575 52 64 my %blocks = %{ $tags->{blocks} };

52 305

576 52 80 delete @blocks{ @{ $tags->{blockorder} } };

52 212

577

578 52 100 666 carp "No order specified for blocks: " . join( ', ', keys %blocks ) . ".\n"

579 if keys %blocks;

580 }

581

582 # This sub recognizes three states:

583 #

584 # 1. undef

585 # Normal wiki processing will be done on this line.

586 #

587 # 2. html

588 # Links and phrasal processing will be done, but formatting should be

589 # ignored.

590 #

591 # 3. nowiki

592 # No further wiki processing should be done.

593 #

594 # Each state may override the lower ones if already set on a given line.

595 #

596 sub _append_processed_line {

597 0 0 0 my ( $parser, $text, $state ) = @_;

598 0 0 my $lines = $parser->{processed_lines};

599

600 0 0 0 $state ||= '';

601

602 0 0 my @newlines = split /(?<=\n)/, $text;

603 0 0 0 0 if (

604 @$lines

605 && $lines->[-1]->[1] !~ /\n$/

606 && # State not changing from or to 'nowiki'

607 !( $state ne $lines->[-1]->[0] && grep /^nowiki$/, $state, $lines->[-1]->[0] )

608 )

609 {

610 0 0 $lines->[-1]->[1] .= shift @newlines;

611 0 0 0 $lines->[-1]->[0] = $state if $state eq 'html';

612 }

613

614 0 0 foreach my $line (@newlines) {

615 0 0 0 $lines->[-1]->[2] = '1' if @$lines;

616 0 0 push @$lines, [ $state, $line ];

617 }

618 0 0 0 0 $lines->[-1]->[2] = '1'

619 if @$lines && $lines->[-1]->[1] =~ /\n$/;

620 }

621

622 sub _html_tag {

623 0 0 0 my ( $parser, $type, $tagname, $orig, $attr ) = @_;

624 0 0 my $tags = $parser->{tags};

625

626 # $tagname may have been generated by an empty tag. If so, HTML::Parser

627 # will sometimes include the trailing / in the tag name.

628 0 0 my $isEmptyTag = $orig =~ m#/>$#;

629 0 0 0 $tagname =~ s#/$## if $isEmptyTag;

630

631 0 0 0 unless ( grep /^\Q$tagname\E$/, @{ $tags->{allowed_tags} } ) {

0 0

632 0 0 _append_processed_line $parser, CGI::escapeHTML $orig;

633 0 0 return;

634 }

635

636 # Any $tagname must now be in the allowed list, including .

637

638 0 0 my $tagstack = $parser->{tag_stack};

639 0 0 0 my $stacktop = @$tagstack ? $tagstack->[-1] : '';

640

641 # First, process end tags, since they can change our state.

642 0 0 0 0 if ( $type eq 'E' && $stacktop eq $tagname ) {

643

644 # The closing tag is at the top of the stack, like it should be.

645 # Pop it and append the close tag to the output.

646 0 0 pop @$tagstack;

647 0 0 my $newtag;

648

649 0 0 0 if ( $tagname eq 'nowiki' ) {

650

651 # The browser doesn't need to see the tag.

652 0 0 $newtag = '';

653 }

654 else {

655 0 0 $newtag = "";

656 }

657

658

# Can't close a state into

or

659 0 0 _append_processed_line $parser, $newtag, 'html';

660 0 0 return;

661 }

662

663 0 0 0 0 if ( @$tagstack && grep /^\Q$stacktop\E$/, qw{nowiki pre} ) {

664

665

# Ignore all markup within

 or  tags.

666 0 0 _append_processed_line $parser, CGI::escapeHTML($orig), 'nowiki';

667 0 0 return;

668 }

669

670 0 0 0 0 if ( $type eq 'E' && $HTML::Tagset::isPhraseMarkup{$tagname} )

671

672 # If we ask for artificial end element events for self-closed elements,

673 # then we need to check $HTML::Tagset::emptyElement($tagname) here too.

674 {

675 # We didn't record phrase markup on the stack, so it's okay to just

676 # let it close.

677 0 0 _append_processed_line $parser, "";

678 0 0 return;

679 }

680

681 0 0 0 if ( $type eq 'E' ) {

682

683 # We got a non-phrase end tag that wasn't on the stack. Escape it.

684 0 0 _append_processed_line $parser, CGI::escapeHTML($orig);

685 0 0 return;

686 }

687

688 ###

689 ### $type must now eq 'S'.

690 ###

691

692 # The browser doesn't need to see the tag.

693 0 0 0 if ( $tagname eq 'nowiki' ) {

694 0 0 0 push @$tagstack, $tagname

695 unless $isEmptyTag;

696 0 0 return;

697 }

698

699 # Strip disallowed attributes.

700 0 0 my $newtag = "<$tagname";

701 0 0 foreach ( @{ $tags->{allowed_attrs} } ) {

0 0

702 0 0 0 if ( defined $attr->{$_} ) {

703 0 0 $newtag .= " $_";

704 0 0 0 unless ( $attr->{$_} eq '__TEXT_MEDIAWIKIFORMAT_BOOL__' ) {

705

706 # CGI::escapeHTML escapes single quotes.

707 0 0 $attr->{$_} = CGI::escapeHTML $attr->{$_};

708 0 0 $newtag .= "='" . $attr->{$_} . "'";

709 }

710 }

711 }

712 0 0 0 0 $newtag .= " /" if $HTML::Tagset::emptyElement{$tagname} || $isEmptyTag;

713 0 0 $newtag .= ">";

714

715 # If this isn't a block level element, there's no need to track nesting.

716 0 0 0 0 if ( $HTML::Tagset::isPhraseMarkup{$tagname}

717 || $HTML::Tagset::emptyElement{$tagname} )

718 {

719 0 0 _append_processed_line $parser, $newtag;

720 0 0 return;

721 }

722

723 # Some elements can close implicitly

724 0 0 0 if (@$tagstack) {

725 0 0 0 0 if ( $tagname eq $stacktop

726 && $HTML::Tagset::optionalEndTag{$tagname} )

727 {

728 0 0 pop @$tagstack;

729 }

730 elsif ( !$HTML::Tagset::is_Possible_Strict_P_Content{$tagname} ) {

731

732 # Need to check more than the last item for paragraphs.

733 0 0 for ( my $i = $#{$tagstack}; $i >= 0; $i-- ) {

0 0

734 0 0 my $checking = $tagstack->[$i];

735 0 0 0 last if grep /^\Q$checking\E$/, @HTML::Tagset::p_closure_barriers;

736

737 0 0 0 if ( $checking eq 'p' ) {

738

739 # pop 'em all.

740 0 0 splice @$tagstack, $i;

741 0 0 last;

742 }

743 }

744 }

745 }

746

747

# Could verify here that

and sub-elements only appear where

748							# they belong.
749
750							# Push the new tag onto the stack.
751	0	0				0	push @$tagstack, $tagname
752							unless $isEmptyTag;
753
754	0	0				0	_append_processed_line $parser, $newtag, $tagname eq 'pre' ? 'nowiki' : 'html';
755	0					0	return;
756							}
757
758							sub _html_comment {
759	0			0		0	my ( $parser, $text ) = @_;
760
761	0					0	_append_processed_line $parser, $text, 'nowiki';
762							}
763
764							sub _html_text {
765	0			0		0	my ( $parser, $dtext, $skipped_text, $is_cdata ) = @_;
766	0					0	my $tagstack = $parser->{tag_stack};
767	0					0	my ( $newtext, $newstate );
768
769	0	0				0	warnings::warnif("Got skipped_text: `$skipped_text'")
770							if $skipped_text;
771
772	0	0				0	if (@$tagstack) {
773	0	0	0			0	if ( grep /\Q$tagstack->[-1]\E/, qw{nowiki pre} ) {
		0
774	0					0	$newstate = 'nowiki';
775							}
776							elsif ( $is_cdata && $HTML::Tagset::isCDATA_Parent{ $tagstack->[-1] } ) {
777
778							# If the user hadn't specifically allowed a tag which contains
779							# CDATA, then it won't be on the tag stack.
780	0					0	$newtext = $dtext;
781							}
782							}
783
784	0	0				0	unless ( defined $newtext ) {
785	0	0				0	$newtext = CGI::escapeHTML $dtext unless defined $newtext;
786
787							# CGI::escapeHTML escapes single quotes so the text may be included
788							# in attribute values, but we know we aren't processing an attribute
789							# value here.
790	0					0	$newtext =~ s/'/'/g;
791							}
792
793	0					0	_append_processed_line $parser, $newtext, $newstate;
794							}
795
796							sub _find_blocks_in_html {
797	0			0		0	my ( $text, $tags, $opts ) = @_;
798
799	0					0	my $parser = HTML::Parser->new(
800							start_h => [ \&_html_tag, 'self, "S", tagname, text, attr' ],
801							end_h => [ \&_html_tag, 'self, "E", tagname, text' ],
802							comment_h => [ \&_html_comment, 'self, text' ],
803							text_h => [ \&_html_text, 'self, dtext, skipped_text, is_cdata' ],
804							marked_sections => 1,
805							boolean_attribute_value => '__TEXT_MEDIAWIKIFORMAT_BOOL__',
806							);
807	0					0	$parser->{opts} = $opts;
808	0					0	$parser->{tags} = $tags;
809	0					0	$parser->{processed_lines} = [];
810	0					0	$parser->{tag_stack} = [];
811
812	0					0	my @blocks;
813	0					0	my @lines = split /\r?\n/, $text;
814	0					0	for ( my $i = 0; $i < @lines; $i++ ) {
815	0					0	$parser->parse( $lines[$i] );
816	0					0	$parser->parse("\n");
817	0	0				0	$parser->eof if $i == $#lines;
818
819							# @{$parser->{processed_lines}} may be empty when tags are
820							# still open.
821	0		0			0	while ( @{ $parser->{processed_lines} }
	0					0
822							&& $parser->{processed_lines}->[0]->[2] )
823							{
824	0					0	my ( $type, $dtext )
825	0					0	= @{ shift @{ $parser->{processed_lines} } };
	0					0
826
827	0					0	my $block;
828	0	0				0	if ($type) {
829	0					0	$block = _start_block( $dtext, $tags, $opts, $type );
830							}
831							else {
832	0					0	chomp $dtext;
833	0					0	$block = _start_block( $dtext, $tags, $opts );
834							}
835	0	0				0	push @blocks, $block if $block;
836							}
837							}
838
839	0					0	return @blocks;
840							}
841
842							sub _find_blocks {
843	50			50		74	my ( $text, $tags, $opts ) = @_;
844	50					64	my @blocks;
845
846	50	50				143	if ( $opts->{process_html} ) {
847	0					0	@blocks = _find_blocks_in_html $text, $tags, $opts;
848							}
849							else {
850							# The original behavior.
851	50					599	for my $line ( split /\r?\n/, $text ) {
852	300					501	my $block = _start_block( $line, $tags, $opts );
853	300	100				926	push @blocks, $block if $block;
854							}
855							}
856
857	50					265	return @blocks;
858							}
859
860							sub _start_block {
861	305			305		1458	my ( $text, $tags, $opts, $type ) = @_;
862
863	305	100				723	return new_block( 'end', level => 0 ) unless $text;
864	218	50				381	return new_block(
865							$type,
866							level => 0,
867							opts => $opts,
868							text => $text,
869							tags => $tags,
870							) if $type;
871
872	218					194	for my $block ( @{ $tags->{blockorder} } ) {
	218					421
873	1216					1277	my ( $line, $level, $indentation ) = ( $text, 0, '' );
874
875	1216	100				2385	( $level, $line, $indentation ) = _get_indentation( $tags, $line )
876							if $tags->{indented}{$block};
877
878	1216					3785	my $marker_removed = length( $line =~ s/$tags->{blocks}{$block}// );
879
880	1216	100				2175	next unless $marker_removed;
881
882	1944					3448	return new_block(
883							$block,
884	216		100			502	args => [ grep {defined} $1, $2, $3, $4, $5, $6, $7, $8, $9 ],
885							level => $level \|\| 0,
886							opts => $opts,
887							text => $line,
888							tags => $tags,
889							);
890							}
891							}
892
893							sub _nest_blocks {
894	54			54		1442	my $blocks = shift;
895	54	100				141	return unless @$blocks;
896
897	53					93	my @processed = shift @$blocks;
898
899	53					108	for my $block (@$blocks) {
900	251					925	push @processed, $processed[-1]->nest($block);
901							}
902
903	53					337	return @processed;
904							}
905
906							sub _process_blocks {
907	51			51		93	my ( $blocks, $tags, $opts ) = @_;
908
909	51					55	my @open;
910	51					86	for my $block (@$blocks) {
911	205	100				461	push @open, _process_block( $block, $tags, $opts )
912							unless $block->type() eq 'end';
913							}
914
915	51					280	return join '', @open;
916							}
917
918							sub _process_block {
919	135			135		172	my ( $block, $tags, $opts ) = @_;
920	135					262	my $type = $block->type();
921
922	135					144	my ( $start, $end, $start_line, $end_line, $between );
923	135	50				284	if ( $tags->{$type} ) {
924	135					136	( $start, $end, $start_line, $end_line, $between ) = @{ $tags->{$type} };
	135					385
925							}
926							else {
927	0					0	( $start, $end, $start_line, $end_line ) = ( '', '', '', '' );
928							}
929
930	135					185	my @text = ();
931	135	100				135	for my $line (
	135					2638
932							grep ( /^\Q$type\E$/, @{ $tags->{unformatted_blocks} } )
933							? $block->text()
934							: $block->formatted_text()
935							)
936							{
937	240	100				538	if ( blessed $line) {
938	18		33			44	my $prev_end = pop @text \|\| ();
939	18					47	push @text, _process_block( $line, $tags, $opts ), $prev_end;
940	18					31	next;
941							}
942
943	222					201	my @triplets;
944	222	100	100			785	if ( ( ref($start_line) \|\| '' ) eq 'CODE' ) {
945	32					78	@triplets = $start_line->( $line, $block->level(), $block->shift_args(), $tags, $opts );
946							}
947							else {
948	190					380	@triplets = ( $start_line, $line, $end_line );
949							}
950	222					469	push @text, @triplets;
951							}
952
953	135	100				285	pop @text if $between;
954	135					514	return join '', $start, @text, $end;
955							}
956
957							sub _get_indentation {
958	443			443		442	my ( $tags, $text ) = @_;
959
960	443	100				2882	return 1, $text unless $text =~ s/($tags->{indent})//;
961	238					797	return length($1) + 1, $text, $1;
962							}
963
964							=head2 format_line
965
966							$formatted = format_line ($raw, $tags, $opts);
967
968							This function is never exported. It formats the phrase elements of a single
969							line of text (emphasised, strong, and links).
970
971							This is only meant to be called from L and so
972							requires $tags and $opts to have all elements filled in. If you find a use for
973							it, please let me know and maybe I will have it default the missing elements as
974							C does.
975
976							=cut
977
978							sub format_line {
979	227			227	1	3702	my ( $text, $tags, $opts ) = @_;
980
981	227					712	$text =~ s!$tags->{strong_tag}!$tags->{strong}->($1, $opts)!eg;
	5					28
982	227					508	$text =~ s!$tags->{emphasized_tag}!$tags->{emphasized}->($1, $opts)!eg;
	6					22
983
984	227	50	66			717	$text = _find_links( $text, $tags, $opts )
			33
985							if $opts->{extended}
986							\|\| $opts->{absolute_links}
987							\|\| $opts->{implicit_links};
988
989	227					676	return $text;
990							}
991
992							sub _find_innermost_balanced_pair {
993	11			11		12	my ( $text, $open, $close ) = @_;
994
995	11					18	my $start_pos = rindex $text, $open;
996	11	100				29	return if $start_pos == -1;
997
998	7					8	my $end_pos = index $text, $close, $start_pos;
999	7	50				20	return if $end_pos == -1;
1000
1001	7					8	my $open_length = length $open;
1002	7					8	my $close_length = length $close;
1003	7					9	my $close_pos = $end_pos + $close_length;
1004	7					7	my $enclosed_length = $close_pos - $start_pos;
1005
1006	7					12	my $enclosed_atom = substr $text, $start_pos, $enclosed_length;
1007	7					28	return substr( $enclosed_atom, $open_length, 0 - $close_length ),
1008							substr( $text, 0, $start_pos ),
1009							substr( $text, $close_pos );
1010							}
1011
1012							sub _find_links {
1013	227			227		253	my ( $text, $tags, $opts ) = @_;
1014
1015							# Build Regexp
1016	227					190	my @res;
1017
1018	227	100				460	if ( $opts->{absolute_links} ) {
1019
1020							# URI
1021	225					182	my $s;
1022	225		66			493	$tags->{_schema_regex} \|\|= _make_schema_regex @{ $tags->{schemas} };
	47					163
1023	225					243	$s = $tags->{_schema_regex};
1024	225					1700	push @res, qr/\b$s:[$uricCheat][$uric]*/;
1025							}
1026
1027	227	100				454	if ( $opts->{implicit_links} ) {
1028
1029							# StudlyCaps
1030	17	50				36	if ( $tags->{implicit_link_delimiters} ) {
1031	17					47	push @res, qr/$tags->{implicit_link_delimiters}/;
1032							}
1033							else {
1034	0					0	warnings::warnif("Ignoring implicit_links option since implicit_link_delimiters is empty");
1035							}
1036							}
1037
1038	227	100				400	if ( $opts->{extended} ) {
1039
1040							# [[Wiki Page]]
1041	226	100				658	if ( !$tags->{extended_link_delimiters} ) {
		100
1042	3					70	warnings::warnif("Ignoring extended option since extended_link_delimiters is empty");
1043							}
1044							elsif ( ref $tags->{extended_link_delimiters} eq "ARRAY" ) {
1045
1046							# Backwards compatibility for extended links.
1047							# Bypasses the regex substitution used by absolute and implicit
1048							# links.
1049	4					6	my ( $start, $end ) = @{ $tags->{extended_link_delimiters} };
	4					8
1050	4					10	while ( my @pieces = _find_innermost_balanced_pair( $text, $start, $end ) ) {
1051	7	50				21	my ( $tag, $before, $after ) = map { defined $_ ? $_ : '' } @pieces;
	21					36
1052	7		50			20	my $extended = $tags->{link}->( $tag, $opts, $tags ) \|\| '';
1053	7					50	$text = $before . $extended . $after;
1054							}
1055							}
1056							else {
1057	219					677	push @res, qr/$tags->{extended_link_delimiters}/;
1058							}
1059							}
1060
1061	227	50				633	if (@res) {
1062	227					395	my $re = join "\|", @res;
1063	227					2783	$text =~ s/$re/$tags->{link}->($&, $opts, $tags)/ge;
	30					104
1064							}
1065
1066	227					577	return $text;
1067							}
1068
1069							=head1 Wiki Format
1070
1071							Refer to L for
1072							description of the default wiki format, as interpreted by this module. Any
1073							discrepencies will be considered bugs in this module, with a few exceptions.
1074
1075							=head2 Unimplemented Wiki Markup
1076
1077							=over 4
1078
1079							=item Templates, Magic Words, and Wanted Links
1080
1081							Templates, magic words, and the colorization of wanted links all require a back
1082							end data store that can be consulted on the existance and content of named
1083							pages. C has deliberately been constructed such that it
1084							operates independantly from such a back end. For an interface to
1085							C which implements these features, see
1086							L.
1087
1088							=item Tables
1089
1090							This is on the TODO list.
1091
1092							=back
1093
1094							=head1 EXPORT
1095
1096							If you'd like to make your life more convenient, you can optionally import a
1097							subroutine that already has default tags and options set up. This is
1098							especially handy if you use a prefix:
1099
1100							use Text::MediawikiFormat prefix => 'http://www.example.com/';
1101							wikiformat ('some text');
1102
1103							Tags are interpreted as default members of the $tags hash normally passed to
1104							C, except for the five options (see above) and the C key, who's
1105							value is interpreted as an alternate name for the imported function.
1106
1107							To use the C flag to control the name by which your code calls the imported
1108							function, for example,
1109
1110							use Text::MediawikiFormat as => 'formatTextWithWikiStyle';
1111							formatTextWithWikiStyle ('some text');
1112
1113							You might choose a better name, though.
1114
1115							The calling semantics are effectively the same as those of the C
1116							function. Any additional tags or options to the imported function will
1117							override the defaults. This code:
1118
1119							use Text::MediawikiFormat as => 'wf', extended => 0;
1120							wf ('some text', {}, {extended => 1});
1121
1122							enables extended links, after specifying that the default behavior should be
1123							to disable them.
1124
1125							=head1 GORY DETAILS
1126
1127							=head2 Tags
1128
1129							There are two types of Wiki markup: phrase markup and blocks. Blocks include
1130							lists, which are made up of lines and can also contain other lists.
1131
1132							=head3 Phrase Markup
1133
1134							The are currently three types of wiki phrase markup. These are the
1135							strong and emphasized markup and links. Links may additionally be of three
1136							subtypes, extended, implicit, or absolute.
1137
1138							You can change the regular expressions used to find strong and emphasized tags:
1139
1140							%tags = (
1141							strong_tag => qr/\([^]+?)\*/,
1142							emphasized_tag => qr\|/([^/]+?)/\|,
1143							);
1144
1145							$wikitext = 'this is strong, /emphasized/, and /em+strong/';
1146							$htmltext = wikiformat ($wikitext, \%tags, {});
1147
1148							You can also change the regular expressions used to find links. The following
1149							just sets them to their default states (but enables parsing of implicit links,
1150							which is I the default):
1151
1152							my $html = wikiformat
1153							(
1154							$raw,
1155							{implicit_link_delimiters => qr!\b(?:[A-Z][a-z0-9]\w*){2,}!,
1156							extended_link_delimiters => qr!\[(?:\[[^][]\]\|[^][])\]!,
1157							},
1158							{implicit_links => 1}
1159							);
1160
1161							In addition, you may set the function references that format strong and
1162							emphasized text and links. The strong and emphasized functions receive only
1163							the text to be formatted as an argument and are expected to return the
1164							formatted text. The link formatter also recieves references to the C<$tags>
1165							and C<$opts> arrays. For example, the following sets the strong and
1166							emphasized formatters to their default state while replacing the link formatter
1167							with one which strips href information and returns only the title text:
1168
1169							my $html = wikiformat
1170							(
1171							$raw,
1172							{strong => sub {"$_[0]"},
1173							emphasized => sub {"$_[0]"},
1174							link => sub
1175							{
1176							my ($tag, $opts, $tags) = @_;
1177							if ($tag =~ s/^\[\[([^][]+)\]\]$/$1/)
1178							{
1179							my ($page, $title) = split qr/\\|/, $tag, 2;
1180							return $title if $title;
1181							return $page;
1182							}
1183							elsif ($tag =~ s/^\[([^][]+)\]$/$1/)
1184							{
1185							my ($href, $title) = split qr/ /, $tag, 2;
1186							return $title if $title;
1187							return $href;
1188							}
1189							else
1190							{
1191							return $tag;
1192							}
1193							},
1194							},
1195							);
1196
1197							=head3 Blocks
1198
1199							The default block types are C`, C, C, C,`
1200							C, C, C, and C .
1201
1202							Block entries in the tag hashes must contain array references. The first two
1203							items are the tags used at the start and end of the block. The third and
1204							fourth contain the tags used at the start and end of each line. Where there
1205							needs to be more processing of individual lines, use a subref as the third
1206							item. This is how the module processes ordered lines in HTML lists and
1207							headers:
1208
1209							my $html = wikiformat
1210							(
1211							$raw,
1212							{ordered => [' ', " \n", ' ', " \n"],
1213							header => ['', "\n", \&_make_header],
1214							},
1215							);
1216
1217							The first argument to these subrefs is the post-processed text of the line
1218							itself. (Processing removes the indentation and tokens used to mark this as a
1219							list and checks the rest of the line for other line formattings.) The second
1220							argument is the indentation level (see below). The subsequent arguments are
1221							captured variables in the regular expression used to find this list type. The
1222							regexp for headers is:
1223
1224							$html = wikiformat
1225							(
1226							$raw,
1227							{blocks => {header => qr/^(=+)\s(.+?)\s\1$/}}
1228							);
1229
1230							The module processes indentation first, if applicable, and stores the
1231							indentation level (the length of the indentation removed).
1232
1233							Lists automatically start and end as necessary.
1234
1235							Because regular expressions could conceivably match more than one line, block
1236							level markup is processed in a specific order. The C tag governs
1237							this order. It contains a reference to an array of the names of the
1238							appropriate blocks to process. If you add a block type, be sure to add an
1239							entry for it in C:
1240
1241							my $html = wikiformat
1242							(
1243							$raw,
1244							{invisible => ['', '', '', ''],
1245							blocks => {invisible => qr!^--(.*?)--$!},
1246							blockorder => [qw(code header line ordered
1247							unordered definition invisible
1248							paragraph_break paragraph)]
1249							},
1250							},
1251							);
1252
1253							=head3 Finding blocks
1254
1255							As has already been mentioned in passing, C uses regular
1256							expressions to find blocks. These are in the C<%tags> hash under the C
1257							key. For example, to change the regular expression to find code block items,
1258							use:
1259
1260							my $html = wikiformat ($raw, {blocks => {code => qr/^:\s+/}});
1261
1262							This will require a leading colon to mark code lines (note that as writted
1263							here, this would interfere with the default processing of definition lists).
1264
1265							=head3 Finding Blocks in the Correct Order
1266
1267							As intrepid bug reporter Tom Hukins pointed out in CPAN RT bug #671, the order
1268							in which C searches for blocks varies by platform and
1269							version of Perl. Because some block-finding regular expressions are more
1270							specific than others, what you intend to be one type of block may turn into a
1271							different list type.
1272
1273							If you're adding new block types, be aware of this. The C entry in
1274							C<%tags> exists to force C to apply its regexes from
1275							most specific to least specific. It contains an array reference. By default,
1276							it looks for ordered lists first, unordered lists second, and code references
1277							at the end.
1278
1279							=head1 SEE ALSO
1280
1281							L
1282
1283							=head1 SUPPORT
1284
1285							You can find documentation for this module with the perldoc command.
1286
1287							perldoc Text::MediawikiFormat
1288
1289							You can also look for information at:
1290
1291							=over 4
1292
1293							=item * AnnoCPAN: Annotated CPAN documentation
1294
1295							L
1296
1297							=item * CPAN Ratings
1298
1299							L
1300
1301							=item * RT: CPAN's request tracker
1302
1303							L
1304
1305							=item * Search CPAN
1306
1307							L
1308
1309							=back
1310
1311							=head1 AUTHOR
1312
1313							Derek Price C is the author.
1314
1315							=head1 ACKNOWLEDGEMENTS
1316
1317							This module is derived from L, written by chromatic.
1318							chromatic's original credits are below:
1319
1320							chromatic, C, with much input from the Jellybean team
1321							(including Jonathan Paulett). Kate L Pugh has also provided several patches,
1322							many failing tests, and is usually the driving force behind new features and
1323							releases. If you think this module is worth buying me a beer, she deserves at
1324							least half of it.
1325
1326							Alex Vandiver added a nice patch and tests for extended links.
1327
1328							Tony Bowden, Tom Hukins, and Andy H. all suggested useful features that are now
1329							implemented.
1330
1331							Sam Vilain, Chris Winters, Paul Schmidt, and Art Henry have all found and
1332							reported silly bugs.
1333
1334							Blame me for the implementation.
1335
1336							=head1 BUGS
1337
1338							The link checker in C may fail to detect existing links that do
1339							not follow HTML, XML, or SGML style. They may die with some SGML styles too.
1340							I.
1341
1342							=head1 TODO
1343
1344							=over 4
1345
1346							=item * Optimize C to work on a list of lines
1347
1348							=back
1349
1350							=head1 COPYRIGHT & LICENSE
1351
1352							Copyright (c) 2006-2008 Derek R. Price, all rights reserved.
1353							Copyright (c) 2002 - 2006, chromatic, all rights reserved.
1354
1355							This program is free software; you can redistribute it and/or modify it
1356							under the same terms as Perl itself.
1357
1358							=cut
1359
1360							1; # End of Text::MediaiwkiFormat