File Coverage

blib/lib/Text/MediawikiFormat.pm

Criterion	Covered	Total	%
statement	253	367	68.9
branch	106	194	54.6
condition	20	69	28.9
subroutine	37	42	88.1
pod	2	2	100.0
total	418	674	62.0

line stmt bran cond sub pod time code

1 package Text::MediawikiFormat;

3 14 14 190398 use strict;

14 30

14 632

4 14 14 97 use warnings::register;

14 22

14 2453

6 =head1 NAME

8 Text::MediawikiFormat - Translate Wiki markup into other text formats

10 =head1 VERSION

12 Version 1.02

14 =cut

16 our $VERSION = '1.02';

18 =head1 SYNOPSIS

20 use Text::MediawikiFormat 'wikiformat';

21 my $html = wikiformat ($raw);

22 my $text = wikiformat ($raw, {}, {implicit_links => 1});

24 =head1 DESCRIPTION

26 L and its sister projects use the PHP Mediawiki to format

27 their pages. This module attempts to duplicate the Mediawiki formatting rules.

28 Those formatting rules can be simple and easy to use, while providing more

29 advanced options for the power user. They are also easy to translate into

30 other, more complicated markup languages with this module. It creates HTML by

31 default, but could produce valid POD, DocBook, XML, or any other format

32 imaginable.

34 The most important function is C. It is

35 not exported by default, but will be exported as C if any

36 options at all are passed to the exporter, unless the name is overridden

37 explicitly. See L<"EXPORT"> for more information.

39 It should be noted that this module is written as a drop in replacement for

40 L that expands on that modules functionality and provides

41 a default rule set that may be used to format text like the PHP Mediawiki. It

42 is also well to note early that if you just want a Mediawiki clone (you don't

43 need to customize it heavily and you want integration with a back end

44 database), you should look at L.

46 =cut

48 14 14 75 use Carp qw(carp confess croak);

14 22

14 990

49 14 14 18971 use CGI qw(:standard);

14 192132

14 101

50 14 14 41488 use Scalar::Util qw(blessed);

14 26

14 1430

51 14 14 6409 use Text::MediawikiFormat::Blocks;

14 30

14 84

52 14 14 22461 use URI;

14 55564

14 496

53 14 14 105 use URI::Escape qw(uri_escape uri_escape_utf8);

14 21

14 973

55 14 1593 use vars qw($missing_html_packages %tags %opts %merge_matrix

56 14 14 66 $uric $uricCheat $uriCruft);

14 20

58 BEGIN

59 {

60 # Try to load optional HTML packages, recording any errors.

61 14 14 28 eval {require HTML::Parser};

14 10287

62 14 72543 $missing_html_packages = $@;

63 14 30 eval {require HTML::Tagset};

14 8192

64 14 32582 $missing_html_packages .= $@;

65 }

69 ###

70 ### Defaults

71 ###

72 %tags =

73 (

74 indent => qr/^(?:[:*#;]*)(?=[:*#;])/,

75 link => \&_make_html_link,

76 strong => sub {"$_[0]"},

77 emphasized => sub {"$_[0]"},

78 strong_tag => qr/'''(.+?)'''/,

79 emphasized_tag => qr/''(.+?)''/,

code => ['

', "

\n", '', "\n"],

line => ['', '', '

', "\n"],

paragraph => ["

", "

\n", '', "\n", 1],

84 paragraph_break => ['', '', '', "\n"],

unordered => ["

\n", "\n", '

', "

\n"],

ordered => ["

\n", "\n", '

', "

\n"],

definition => ["

\n", "\n", \&_dl],

88 header => ['', "\n", \&_make_header],

90 blocks =>

91 {

92 code => qr/^ /,

93 header => qr/^(=+)\s*(.+?)\s*\1$/,

94 line => qr/^-{4,}$/,

95 ordered => qr/^#\s*/,

96 unordered => qr/^\*\s*/,

97 definition => qr/^([;:])\s*/,

98 paragraph => qr/^/,

99 paragraph_break => qr/^\s*$/,

100 },

101

102 indented => {map {$_ => 1} qw(ordered unordered definition)},

103 nests => {map {$_ => 1} qw(ordered unordered definition)},

104 nests_anywhere => {map {$_ => 1} qw(nowiki)},

105

106 blockorder => [qw(code header line ordered unordered definition

107 paragraph_break paragraph)],

108 implicit_link_delimiters

109 => qr!\b(?:[A-Z][a-z0-9]\w*){2,}!,

110 extended_link_delimiters

111 => qr!\[(?:\[[^][]*\]|[^][]*)\]!,

112

113 schemas => [qw(http https ftp mailto gopher)],

114

115 unformatted_blocks => [qw(header nowiki pre)],

116

117 allowed_tags => [#HTML

118 qw(b big blockquote br caption center cite code dd

119 div dl dt em font h1 h2 h3 h4 h5 h6 hr i li ol p

120 pre rb rp rt ruby s samp small strike strong sub

121 sup table td th tr tt u ul var),

122 # Mediawiki Specific

123 qw(nowiki),],

124 allowed_attrs => [qw(title align lang dir width height bgcolor),

125 qw(clear), # BR

126 qw(noshade), # HR

127 qw(cite), # BLOCKQUOTE, Q

128 qw(size face color), # FONT

129 # For various lists, mostly deprecated but safe

130 qw(type start value compact),

131 # Tables

132 qw(summary width border frame rules cellspacing

133 cellpadding valign char charoff colgroup col

134 span abbr axis headers scope rowspan colspan),

135 qw(id class name style), # For CSS

136 ],

137

138 _toc => [],

139 );

140

141 %opts =

142 (

143 extended => 1,

144 implicit_links => 0,

145 absolute_links => 1,

146 prefix => '',

147 process_html => 1,

148 charset => 'utf-8',

149 );

150

151 # Make sure import's argument hash contains an `as' entry. `as' defaults to

152 # `wikiformat' when none is given.

153 sub _process_args

154 {

155 15 15 25 shift; # Class

156 15 100 53 return as => shift if @_ == 1;

157 14 97 return as => 'wikiformat', @_;

158 }

159

160 # Delete the options (prefix, extended, implicit_links, ...) from a hash,

161 # returning a new hash with the deleted options.

162 sub _extract_opts

163 {

164 15 15 23 my %newopts;

165

166 15 40 for my $key (qw{prefix extended implicit_links absolute_links

167 process_html debug})

168 {

169 90 100 260 if (defined (my $val = delete $_[0]->{$key}))

170 {

171 19 39 $newopts{$key} = $val;

172 }

173 }

174

175 15 58 return \%newopts;

176 }

177

178 # Shamelessly ripped from Hash::Merge, which doesn't work in a threaded

179 # environment with two threads trying to use different merge matrices.

180 %merge_matrix =

181 (

182 SCALAR =>

183 {

184 SCALAR => sub {return $_[0]},

185 ARRAY => sub {# Need to be able to replace scalar with array

186 # for extended_link_delimiters (could be array

187 # or regex).

188 return $_[0];},

189 HASH => sub {confess "Attempt to replace hash with scalar"

190 if defined $_[0];

191 return _clone ($_[1]);}

192 },

193

194 ARRAY =>

195 {

196 SCALAR => sub {# Need to be able to replace array with scalar

197 # for extended_link_delimiters (could be array

198 # or regex).

199 return _clone ($_[0]);},

200 ARRAY => sub {return _clone ($_[0]);},

201 HASH => sub {confess "Attempt to replace hash with array"}

202 },

203

204 HASH =>

205 {

206 SCALAR => sub {confess "Attempt to replace scalar with hash"},

207 ARRAY => sub {confess "Attempt to replace array with hash"},

208 HASH => sub {_merge_hash_elements ($_[0], $_[1])}

209 }

210 );

211 # Return arrays and a deep copy of hashes.

212 sub _clone

213 {

214 2890 2890 2298 my ($obj) = @_;

215 2890 1807 my $type;

216 2890 50 5772 if (!defined $obj) { # Perl 5.005 compatibility

100

217 0 0 $type = 'SCALAR';

218 } elsif (ref $obj eq 'HASH') {

219 231 205 $type = 'HASH';

220 } elsif (ref $obj eq 'ARRAY') {

221 913 719 $type = 'ARRAY';

222 } else {

223 1746 1398 $type = 'SCALAR';

224 }

225

226 2890 100 5759 return $obj if $type eq 'SCALAR';

227 1144 100 2389 return $obj if $type eq 'ARRAY';

228

229 231 171 my %copy;

230 231 582 foreach my $key (keys %$obj)

231 {

232 800 1199 $copy{$key} = _clone ($obj->{$key});

233 }

234 231 501 return \%copy;

235 }

236 # This does a straight merge of hashes, delegating the merge-specific

237 # work to '_merge_hashes'.

238 sub _merge_hash_elements

239 {

240 169 169 166 my ($left, $right) = @_;

241 169 50 33 927 die "Arguments for _merge_hash_elements must be hash references" unless

242 UNIVERSAL::isa ($left, 'HASH') && UNIVERSAL::isa ($right, 'HASH');

243

244 169 157 my %newhash;

245 169 409 foreach my $leftkey (keys %$left)

246 {

247 243 100 352 if (exists $right->{$leftkey})

248 {

249 235 394 $newhash{$leftkey} =

250 _merge_hashes ($left->{$leftkey}, $right->{$leftkey});

251 }

252 else

253 {

254 8 14 $newhash{$leftkey} = _clone ($left->{$leftkey});

255 }

256 }

257 169 671 foreach my $rightkey (keys %$right)

258 {

259 2266 100 4505 $newhash{$rightkey} = _clone ($right->{$rightkey})

260 if !exists $left->{$rightkey};

261 }

262 169 480 return \%newhash;

263 }

264 sub _merge_hashes

265 {

266 371 371 4481 my ($left, $right) = @_;

267

268 # if one argument or the other is undefined or empty, don't worry about

269 # copying, just return the original.

270 371 50 541 return $right unless defined $left;

271 371 50 517 return $left unless defined $right;

272

273 # For the general use of this function, we want to create duplicates

274 # of all data that is merged.

275

276 371 278 my ($lefttype, $righttype);

277 371 100 720 if (ref $left eq 'HASH') {

100

278 169 307 $lefttype = 'HASH';

279 } elsif (ref $left eq 'ARRAY') {

280 51 54 $lefttype = 'ARRAY';

281 } else {

282 151 151 $lefttype = 'SCALAR';

283 }

284

285 371 100 599 if (ref $right eq 'HASH') {

100

286 169 149 $righttype = 'HASH';

287 } elsif (ref $right eq 'ARRAY') {

288 49 36 $righttype = 'ARRAY';

289 } else {

290 153 153 $righttype = 'SCALAR';

291 }

292

293 371 734 return $merge_matrix{$lefttype}->{$righttype} ($left, $right);

294 }

295

296 sub _require_html_packages

297 {

298 1 50 1 3 croak "$missing_html_packages\n"

299 . "HTML::Parser & HTML::Tagset is required for process_html\n"

300 if $missing_html_packages;

301 }

302

303 sub import

304 {

305 18 100 18 4362 return unless @_ > 1;

306

307 15 28 my $class = shift;

308 15 57 my %args = $class->_process_args (@_);

309 15 42 my $name = delete $args{as};

310

311 15 40 my $caller = caller();

312 15 62 my $iopts = _merge_hashes _extract_opts (\%args), \%opts;

313 15 54 my $itags = _merge_hashes \%args, \%tags;

314

315 15 100 68 _require_html_packages

316 if $iopts->{process_html};

317

318 # Could verify ITAGS here via _check_blocks, but what if a user

319 # wants to add a block to block_order that they intend to override

320 # the implementation of with every call to format()?

321

322 14 14 102 no strict 'refs';

14 118

14 49162

323 15 17715 *{ $caller . "::" . $name } = sub

324 {

325 45 45 33441 Text::MediawikiFormat::_format ($itags, $iopts, @_);

326 }

327 15 102 }

328

329

330

331 =head1 FUNCTIONS

332

333 =head2 format

334

335 C takes one required argument, the text to convert, and returns the

336 converted text. It allows two optional arguments. The first is a reference to

337 a hash of tags used to override the function's default behavior. Anything

338 passed in here will override the default tags. The second argument is a hash

339 reference of options. The options are currently:

340

341 =over 4

342

343 =item prefix

344

345 The prefix of any links to wiki pages. In HTML mode, this is the path to the

346 Wiki. The actual linked item itself will be appended to the prefix. This is

347 useful to create full URIs:

348

349 {prefix => 'http://example.com/wiki.pl?page='}

350

351 =item extended

352

353 A boolean flag, true by default, to let square brackets mark links.

354 An optional title may occur after the Wiki targets, preceded by an open pipe.

355 URI titles are separated from their title with a space. These are valid

356 extended links:

357

358 [[A wiki page|and the title to display]]

359 [http://ximbiot.com URI title]

360

361 Where the linking semantics of the destination format allow it, the result will

362 display the title instead of the URI. In HTML terms, the title is the content

363 of an C element (not the content of its C attribute).

364

365 You can use delimiters other than single square brackets for marking extended

366 links by passing a value for C in the C<%tags> hash

367 when calling C.

368

369 Note that if you disable this flag, you should probably enable

370 C or there will be no automated way to link to other pages in

371 your wiki.

372

373 =item implicit_links

374

375 A boolean flag, false by default, to create links from StudlyCapsStrings.

376

377 =item absolute_links

378

379 A boolean flag, true by default, which treats any links that are absolute URIs

380 (such as C) specially. Any prefix will not apply.

381 This should maybe be called implicit_absolute_links since the C

382 option enables absolute links inside square brackets by default.

383

384 A link is any text that starts with a known schema followed by a colon and one

385 or more non-whitespace characters. This is a distinct subset of what L

386 recognizes as a URI, but is a good first-order approximation. If you need to

387 recognize more complex URIs, use the standard wiki formatting explained

388 earlier.

389

390 The recognized schemas are those defined in the C value in the C<%tags>

391 hash. C defaults to C, C, C, C, and

392 C.

393

394 =item process_html

395

396 This flag, true by default, causes the formatter to ignore block level wiki

397 markup (code, ordered, unordered, etc...) when they occur on lines which also

398

contain allowed block-level HTML tags (

, , ,

, etc...).

399 Phrase level wiki markup (emphasis, strong, & links) is unaffected by this

400 flag.

401

402 =back

403

404 =cut

405

406 sub format

407 {

408 7 7 1 9586 _format (\%tags, \%opts, @_);

409 }

410

411 # Turn the contents after a ; or : into a dictionary list.

412 # Using : without ; just looks like an indent.

413 sub _dl

414 {

415 #my ($line, $indent, $lead) = @_;

416 23 23 16 my ($term, $def);

417

418 23 100 29 if ($_[2] eq ';')

419 {

420 11 100 32 if ($_[0] =~ /^(.*?)\s+:\s+(.*)$/)

421 {

422 6 10 $term = $1;

423 6 8 $def = $2;

424 }

425 else

426 {

427 5 8 $term = $_[0];

428 }

429 }

430 else

431 {

432 12 11 $def = $_[0];

433 }

434

435 23 18 my @retval;

436 23 100 40 push @retval, "
", $term, "
\n" if defined $term;

437 23 100 54 push @retval, "
", $def, "
\n" if defined $def;

438 23 48 return @retval;

439 }

440

441 # Makes a regex out of the allowed schema array.

442 sub _make_schema_regex

443 {

444 47 47 70 my $re = join "|", map {qr/\Q$_\E/} @_;

231 1697

445 47 844 return qr/(?:$re)/;

446 }

447

448 $uric = $URI::uric;

449 $uricCheat = $uric;

450

451 # We need to avoid picking up 'HTTP::Request::Common' so we have a

452 # subset of uric without a colon.

453 $uricCheat =~ tr/://d;

454

455 # Identifying characters often accidentally picked up trailing a URI.

456 $uriCruft = q/]),.!'";}/;

457

458 # escape a URI based on our charset.

459 sub _escape_uri

460 {

461 28 28 40 my ($opts, $uri) = @_;

462 28 50 65 confess "charset not initialized" unless $opts->{charset};

463 28 50 192 return uri_escape_utf8 $uri if $opts->{charset} =~ /^utf-?8$/i;

464 0 0 return uri_escape $uri;

465 }

466

467 # Turn [[Wiki Link|Title]], [URI Title], scheme:url, or StudlyCaps into links.

468 sub _make_html_link

469 {

470 32 32 118 my ($tag, $opts, $tags) = @_;

471

472 32 53 my ($class, $trailing) = ('', '');

473 32 30 my ($href, $title);

474 32 100 157 if ($tag =~ /^\[\[([^|#]*)(?:(#)([^|]*))?(?:(\|)(.*))?\]\]$/)

100

475 {

476 # Wiki link

477 7 50 35 $href = $opts->{prefix} . _escape_uri $opts, $1 if $1;

478 7 50 170 $href .= $2 . _escape_uri $opts, $3 if $2;

479

480 7 100 20 if ($4)

481 {

482 # Title specified explicitly.

483 3 50 7 if (length $5)

484 {

485 3 6 $title = $5;

486 }

487 else

488 {

489 # An empty title asks Mediawiki to strip any parens off the end

490 # of the node name.

491 0 0 $1 =~ /^([^(]*)(?:\s*\()?/;

492 0 0 $title = $1;

493 }

494 }

495 else

496 {

497 # Title defaults to the node name.

498 4 6 $title = $1;

499 }

500 }

501 elsif ($tag =~ /^\[(\S*)(?:(\s+)(.*))?\]$/)

502 {

503 # URI

504 5 12 $href = $1;

505 5 50 18 if ($2)

506 {

507 5 10 $title = $3;

508 }

509 else

510 {

511 0 0 $title = ++$opts->{_uri_refs};

512 }

513 5 8 $href =~ s/'/%27/g;

514 }

515 else

516 {

517 # Shouldn't be able to get here without either $opts->{absolute_links}

518 # or $opts->{implicit_links};

519 20 33 63 $tags->{_schema_regex} ||= _make_schema_regex @{$tags->{schemas}};

0 0

520 20 29 my $s = $tags->{_schema_regex};

521

522 20 100 560 if ($tag =~ /^$s:[$uricCheat][$uric]*$/)

523 {

524 # absolute link

525 8 19 $href = $&;

526 8 100 68 $trailing = $& if $href =~ s/[$uriCruft]$//;

527 8 18 $title = $href;

528 }

529 else

530 {

531 # StudlyCaps

532 12 30 $href = $opts->{prefix} . _escape_uri $opts, $tag;

533 12 273 $title = $tag;

534 }

535 }

536

537 32 213 return "$title$trailing";

538 }

539

540 # Store a TOC line for later.

541 #

542 # ASSUMPTIONS

543 # $level >= 1

544 sub _store_toc_line

545 {

546 12 12 20 my ($toc, $level, $title, $name) = @_;

547

548 # TODO: Strip formatting from $title.

549

550 12 100 100 53 if (@$toc && $level > $toc->[-1]->{level})

551 {

552 # Nest a sublevel.

553 3 100 23 $toc->[-1]->{sublevel} = []

554 unless exists $toc->[-1]->{sublevel};

555 3 10 _store_toc_line ($toc->[-1]->{sublevel}, $level, $title, $name);

556 }

557 else

558 {

559 9 40 push @$toc, {level => $level, title => $title, name => $name};

560 }

561

562 12 17 return $level;

563 }

564

565 # Make header text, storing the line for the TOC.

566 #

567 # ASSUMPTIONS

568 # $tags->{_toc} has been initialized to an array ref.

569 sub _make_header

570 {

571 9 9 13 my $level = length $_[2];

572 9 24 my $n = _escape_uri $_[-1], $_[3];

573

574 9 209 _store_toc_line ($_[-2]->{_toc}, $level, $_[3], $n);

575

576 9 40 return "",

577 Text::MediawikiFormat::format_line ($_[3], @_[-2, -1]),

578 "\n";

579 }

580

581 sub _format

582 {

583 50 50 106 my ($itags, $iopts, $text, $tags, $opts) = @_;

584

585 # Overwriting the caller's hashes locally after merging its contents

586 # is okay.

587 50 100 235 $tags = _merge_hashes ($tags || {}, $itags);

588 50 100 210 $opts = _merge_hashes ($opts || {}, $iopts);

589

590 50 50 152 _require_html_packages

591 if $opts->{process_html};

592

593 # Always verify the blocks since the user may have slagged the

594 # default hash on import.

595 50 147 _check_blocks ($tags);

596

597 50 156 my @blocks = _find_blocks ($text, $tags, $opts);

598 50 128 @blocks = _nest_blocks (\@blocks);

599 50 133 return _process_blocks (\@blocks, $tags, $opts);

600 }

601

602 sub _check_blocks

603 {

604 52 52 1783 my $tags = shift;

605 52 56 my %blocks = %{$tags->{blocks}};

52 277

606 52 83 delete @blocks{@{$tags->{blockorder}}};

52 190

607

608 52 100 578 carp

609 "No order specified for blocks: "

610 . join (', ', keys %blocks)

611 . ".\n"

612 if keys %blocks;

613 }

614

615 # This sub recognizes three states:

616 #

617 # 1. undef

618 # Normal wiki processing will be done on this line.

619 #

620 # 2. html

621 # Links and phrasal processing will be done, but formatting should be

622 # ignored.

623 #

624 # 3. nowiki

625 # No further wiki processing should be done.

626 #

627 # Each state may override the lower ones if already set on a given line.

628 #

629 sub _append_processed_line

630 {

631 0 0 0 my ($parser, $text, $state) = @_;

632 0 0 my $lines = $parser->{processed_lines};

633

634 0 0 0 $state ||= '';

635

636 0 0 my @newlines = split /(?<=\n)/, $text;

637 0 0 0 0 if (@$lines && $lines->[-1]->[1] !~ /\n$/

638 && # State not changing from or to 'nowiki'

639 !($state ne $lines->[-1]->[0]

640 && grep /^nowiki$/, $state, $lines->[-1]->[0]))

641 {

642 0 0 $lines->[-1]->[1] .= shift @newlines;

643 0 0 0 $lines->[-1]->[0] = $state if $state eq 'html';

644 }

645

646 0 0 foreach my $line (@newlines)

647 {

648 0 0 0 $lines->[-1]->[2] = '1' if @$lines;

649 0 0 push @$lines, [$state, $line];

650 }

651 0 0 0 0 $lines->[-1]->[2] = '1'

652 if @$lines && $lines->[-1]->[1] =~ /\n$/;

653 }

654

655 sub _html_tag

656 {

657 0 0 0 my ($parser, $type, $tagname, $orig, $attr) = @_;

658 0 0 my $tags = $parser->{tags};

659

660 # $tagname may have been generated by an empty tag. If so, HTML::Parser

661 # will sometimes include the trailing / in the tag name.

662 0 0 my $isEmptyTag = $orig =~ m#/>$#;

663 0 0 0 $tagname =~ s#/$## if $isEmptyTag;

664

665 0 0 0 unless (grep /^\Q$tagname\E$/, @{$tags->{allowed_tags}})

0 0

666 {

667 0 0 _append_processed_line $parser, CGI::escapeHTML $orig;

668 0 0 return;

669 }

670 # Any $tagname must now be in the allowed list, including .

671

672 0 0 my $tagstack = $parser->{tag_stack};

673 0 0 0 my $stacktop = @$tagstack ? $tagstack->[-1] : '';

674

675 # First, process end tags, since they can change our state.

676 0 0 0 0 if ($type eq 'E' && $stacktop eq $tagname)

677 {

678 # The closing tag is at the top of the stack, like it should be.

679 # Pop it and append the close tag to the output.

680 0 0 pop @$tagstack;

681 0 0 my $newtag;

682

683 0 0 0 if ($tagname eq 'nowiki')

684 {

685 # The browser doesn't need to see the tag.

686 0 0 $newtag = '';

687 }

688 else

689 {

690 0 0 $newtag = "";

691 }

692

693

# Can't close a state into

or

694 0 0 _append_processed_line $parser, $newtag, 'html';

695 0 0 return;

696 }

697

698 0 0 0 0 if (@$tagstack && grep /^\Q$stacktop\E$/, qw{nowiki pre})

699 {

700

# Ignore all markup within

 or  tags.

701 0 0 _append_processed_line $parser, CGI::escapeHTML ($orig), 'nowiki';

702 0 0 return;

703 }

704

705 0 0 0 0 if ($type eq 'E' && $HTML::Tagset::isPhraseMarkup{$tagname})

706 # If we ask for artificial end element events for self-closed elements,

707 # then we need to check $HTML::Tagset::emptyElement($tagname) here too.

708 {

709 # We didn't record phrase markup on the stack, so it's okay to just

710 # let it close.

711 0 0 _append_processed_line $parser, "";

712 0 0 return;

713 }

714

715 0 0 0 if ($type eq 'E')

716 {

717 # We got a non-phrase end tag that wasn't on the stack. Escape it.

718 0 0 _append_processed_line $parser, CGI::escapeHTML ($orig);

719 0 0 return;

720 }

721

722

723 ###

724 ### $type must now eq 'S'.

725 ###

726

727 # The browser doesn't need to see the tag.

728 0 0 0 if ($tagname eq 'nowiki')

729 {

730 0 0 0 push @$tagstack, $tagname

731 unless $isEmptyTag;

732 0 0 return;

733 }

734

735 # Strip disallowed attributes.

736 0 0 my $newtag = "<$tagname";

737 0 0 foreach (@{$tags->{allowed_attrs}})

0 0

738 {

739 0 0 0 if (defined $attr->{$_})

740 {

741 0 0 $newtag .= " $_";

742 0 0 0 unless ($attr->{$_}

743 eq '__TEXT_MEDIAWIKIFORMAT_BOOL__')

744 {

745 # CGI::escapeHTML escapes single quotes.

746 0 0 $attr->{$_} = CGI::escapeHTML $attr->{$_};

747 0 0 $newtag .= "='" . $attr->{$_} . "'";

748 }

749 }

750 }

751 0 0 0 0 $newtag .= " /" if $HTML::Tagset::emptyElement{$tagname} || $isEmptyTag;

752 0 0 $newtag .= ">";

753

754 # If this isn't a block level element, there's no need to track nesting.

755 0 0 0 0 if ($HTML::Tagset::isPhraseMarkup{$tagname}

756 || $HTML::Tagset::emptyElement{$tagname})

757 {

758 0 0 _append_processed_line $parser, $newtag;

759 0 0 return;

760 }

761

762 # Some elements can close implicitly

763 0 0 0 if (@$tagstack)

764 {

765 0 0 0 0 if ($tagname eq $stacktop

766 && $HTML::Tagset::optionalEndTag{$tagname})

767 {

768 0 0 pop @$tagstack;

769 }

770 elsif (!$HTML::Tagset::is_Possible_Strict_P_Content{$tagname})

771 {

772 # Need to check more than the last item for paragraphs.

773 0 0 for (my $i = $#{$tagstack}; $i >= 0; $i--)

0 0

774 {

775 0 0 my $checking = $tagstack->[$i];

776 0 0 0 last if grep /^\Q$checking\E$/,

777 @HTML::Tagset::p_closure_barriers;

778

779 0 0 0 if ($checking eq 'p')

780 {

781 # pop 'em all.

782 0 0 splice @$tagstack, $i;

783 0 0 last;

784 }

785 }

786 }

787 }

788

789

# Could verify here that

and sub-elements only appear where

790							# they belong.
791
792							# Push the new tag onto the stack.
793	0	0				0	push @$tagstack, $tagname
794							unless $isEmptyTag;
795
796	0	0				0	_append_processed_line $parser, $newtag,
797							$tagname eq 'pre' ? 'nowiki' : 'html';
798	0					0	return;
799							}
800
801							sub _html_comment
802							{
803	0			0		0	my ($parser, $text) = @_;
804
805	0					0	_append_processed_line $parser, $text, 'nowiki';
806							}
807
808							sub _html_text
809							{
810	0			0		0	my ($parser, $dtext, $skipped_text, $is_cdata) = @_;
811	0					0	my $tagstack = $parser->{tag_stack};
812	0					0	my ($newtext, $newstate);
813
814	0	0				0	warnings::warnif ("Got skipped_text: `$skipped_text'")
815							if $skipped_text;
816
817	0	0				0	if (@$tagstack)
818							{
819	0	0	0			0	if (grep /\Q$tagstack->[-1]\E/, qw{nowiki pre})
		0
820							{
821	0					0	$newstate = 'nowiki'
822							}
823							elsif ($is_cdata && $HTML::Tagset::isCDATA_Parent{$tagstack->[-1]})
824							{
825							# If the user hadn't specifically allowed a tag which contains
826							# CDATA, then it won't be on the tag stack.
827	0					0	$newtext = $dtext;
828							}
829							}
830
831	0	0				0	unless (defined $newtext)
832							{
833	0	0				0	$newtext = CGI::escapeHTML $dtext unless defined $newtext;
834							# CGI::escapeHTML escapes single quotes so the text may be included
835							# in attribute values, but we know we aren't processing an attribute
836							# value here.
837	0					0	$newtext =~ s/'/'/g;
838							}
839
840	0					0	_append_processed_line $parser, $newtext, $newstate;
841							}
842
843							sub _find_blocks_in_html
844							{
845	0			0		0	my ($text, $tags, $opts) = @_;
846
847	0					0	my $parser = HTML::Parser->new
848							(start_h => [\&_html_tag, 'self, "S", tagname, text, attr'],
849							end_h => [\&_html_tag, 'self, "E", tagname, text'],
850							comment_h => [\&_html_comment, 'self, text'],
851							text_h => [\&_html_text, 'self, dtext, skipped_text, is_cdata'],
852							marked_sections => 1,
853							boolean_attribute_value => '__TEXT_MEDIAWIKIFORMAT_BOOL__',
854							);
855	0					0	$parser->{opts} = $opts;
856	0					0	$parser->{tags} = $tags;
857	0					0	$parser->{processed_lines} = [];
858	0					0	$parser->{tag_stack} = [];
859
860	0					0	my @blocks;
861	0					0	my @lines = split /\r?\n/, $text;
862	0					0	for (my $i = 0; $i < @lines; $i++)
863							{
864	0					0	$parser->parse ($lines[$i]);
865	0					0	$parser->parse ("\n");
866	0	0				0	$parser->eof if $i == $#lines;
867
868							# @{$parser->{processed_lines}} may be empty when tags are
869							# still open.
870	0		0			0	while (@{$parser->{processed_lines}}
	0					0
871							&& $parser->{processed_lines}->[0]->[2])
872							{
873	0					0	my ($type, $dtext)
874	0					0	= @{shift @{$parser->{processed_lines}}};
	0					0
875
876	0					0	my $block;
877	0	0				0	if ($type)
878							{
879	0					0	$block = _start_block ($dtext, $tags, $opts, $type);
880							}
881							else
882							{
883	0					0	chomp $dtext;
884	0					0	$block = _start_block ($dtext, $tags, $opts);
885							}
886	0	0				0	push @blocks, $block if $block;
887							}
888							}
889
890	0					0	return @blocks;
891							}
892
893							sub _find_blocks
894							{
895	50			50		67	my ($text, $tags, $opts) = @_;
896	50					61	my @blocks;
897
898	50	50				110	if ($opts->{process_html})
899							{
900	0					0	@blocks = _find_blocks_in_html $text, $tags, $opts;
901							}
902							else
903							{
904							# The original behavior.
905	50					521	for my $line (split /\r?\n/, $text)
906							{
907	300					438	my $block = _start_block ($line, $tags, $opts);
908	300	100				886	push @blocks, $block if $block;
909							}
910							}
911
912	50					189	return @blocks;
913							}
914
915							sub _start_block
916							{
917	305			305		1458	my ($text, $tags, $opts, $type) = @_;
918
919	305	100				596	return new_block ('end', level => 0) unless $text;
920	218	50				317	return new_block ($type,
921							level => 0,
922							opts => $opts,
923							text => $text,
924							tags => $tags,)
925							if $type;
926
927	218					178	for my $block (@{$tags->{blockorder}})
	218					365
928							{
929	1216					1206	my ($line, $level, $indentation) = ($text, 0, '');
930
931	1216	100				2287	($level, $line, $indentation) = _get_indentation ($tags, $line)
932							if $tags->{indented}{$block};
933
934	1216					3416	my $marker_removed = length ($line =~ s/$tags->{blocks}{$block}//);
935
936	1216	100				2010	next unless $marker_removed;
937
938	1944					3076	return new_block ($block,
939	216		100			337	args => [grep {defined} $1, $2, $3, $4, $5, $6, $7,
940							$8, $9],
941							level => $level \|\| 0,
942							opts => $opts,
943							text => $line,
944							tags => $tags,
945							);
946							}
947							}
948
949							sub _nest_blocks
950							{
951	54			54		1442	my $blocks = shift;
952	54	100				128	return unless @$blocks;
953
954	53					101	my @processed = shift @$blocks;
955
956	53					88	for my $block (@$blocks)
957							{
958	251					618	push @processed, $processed[-1]->nest( $block );
959							}
960
961	53					353	return @processed;
962							}
963
964							sub _process_blocks
965							{
966	51			51		160	my ($blocks, $tags, $opts) = @_;
967
968	51					48	my @open;
969	51					78	for my $block (@$blocks)
970							{
971	205	100				415	push @open, _process_block ($block, $tags, $opts)
972							unless $block->type() eq 'end';
973							}
974
975	51					229	return join '', @open ;
976							}
977
978							sub _process_block
979							{
980	135			135		148	my ($block, $tags, $opts) = @_;
981	135					226	my $type = $block->type();
982
983	135					131	my ($start, $end, $start_line, $end_line, $between);
984	135	50				255	if ($tags->{$type})
985							{
986	135					108	($start, $end, $start_line, $end_line, $between) = @{$tags->{$type}};
	135					349
987							}
988							else
989							{
990	0					0	($start, $end, $start_line, $end_line) = ('', '', '', '');
991							}
992
993	135					167	my @text = ();
994	135	100				117	for my $line (grep (/^\Q$type\E$/, @{$tags->{unformatted_blocks}})
	135					1952
995							? $block->text()
996							: $block->formatted_text())
997							{
998	240	100				479	if (blessed $line)
999							{
1000	18		33			45	my $prev_end = pop @text \|\| ();
1001	18					48	push @text, _process_block ($line, $tags, $opts), $prev_end;
1002	18					30	next;
1003							}
1004
1005	222					170	my @triplets;
1006	222	100	100			680	if ((ref ($start_line) \|\| '') eq 'CODE')
1007							{
1008	32					68	@triplets = $start_line->($line, $block->level(),
1009							$block->shift_args(), $tags, $opts);
1010							}
1011							else
1012							{
1013	190					274	@triplets = ($start_line, $line, $end_line);
1014							}
1015	222					423	push @text, @triplets;
1016							}
1017
1018	135	100				250	pop @text if $between;
1019	135					438	return join '', $start, @text, $end;
1020							}
1021
1022							sub _get_indentation
1023							{
1024	443			443		414	my ($tags, $text) = @_;
1025
1026	443	100				2657	return 1, $text unless $text =~ s/($tags->{indent})//;
1027	238					778	return length ($1) + 1, $text, $1;
1028							}
1029
1030							=head2 format_line
1031
1032							$formatted = format_line ($raw, $tags, $opts);
1033
1034							This function is never exported. It formats the phrase elements of a single
1035							line of text (emphasised, strong, and links).
1036
1037							This is only meant to be called from L and so
1038							requires $tags and $opts to have all elements filled in. If you find a use for
1039							it, please let me know and maybe I will have it default the missing elements as
1040							C does.
1041
1042							=cut
1043
1044							sub format_line
1045							{
1046	227			227	1	4045	my ($text, $tags, $opts) = @_;
1047
1048	227					635	$text =~ s!$tags->{strong_tag}!$tags->{strong}->($1, $opts)!eg;
	5					12
1049	227					436	$text =~ s!$tags->{emphasized_tag}!$tags->{emphasized}->($1, $opts)!eg;
	6					18
1050
1051	227	50	66			646	$text = _find_links ($text, $tags, $opts)
			33
1052							if $opts->{extended}
1053							\|\| $opts->{absolute_links}
1054							\|\| $opts->{implicit_links};
1055
1056	227					621	return $text;
1057							}
1058
1059							sub _find_innermost_balanced_pair
1060							{
1061	11			11		13	my ($text, $open, $close) = @_;
1062
1063	11					16	my $start_pos = rindex $text, $open;
1064	11	100				24	return if $start_pos == -1;
1065
1066	7					9	my $end_pos = index $text, $close, $start_pos;
1067	7	50				24	return if $end_pos == -1;
1068
1069	7					7	my $open_length = length $open;
1070	7					6	my $close_length = length $close;
1071	7					8	my $close_pos = $end_pos + $close_length;
1072	7					8	my $enclosed_length = $close_pos - $start_pos;
1073
1074	7					11	my $enclosed_atom = substr $text, $start_pos, $enclosed_length;
1075	7					27	return substr ($enclosed_atom, $open_length, 0 - $close_length),
1076							substr ($text, 0, $start_pos),
1077							substr ($text, $close_pos);
1078							}
1079
1080							sub _find_links
1081							{
1082	227			227		236	my ($text, $tags, $opts) = @_;
1083
1084							# Build Regexp
1085	227					179	my @res;
1086
1087	227	100				373	if ($opts->{absolute_links})
1088							{
1089							# URI
1090	225					176	my $s;
1091	225		66			451	$tags->{_schema_regex} \|\|= _make_schema_regex @{$tags->{schemas}};
	47					135
1092	225					262	$s = $tags->{_schema_regex};
1093	225					1668	push @res, qr/\b$s:[$uricCheat][$uric]*/
1094							}
1095
1096	227	100				442	if ($opts->{implicit_links})
1097							{
1098							# StudlyCaps
1099	17	50				31	if ($tags->{implicit_link_delimiters})
1100							{
1101	17					49	push @res, qr/$tags->{implicit_link_delimiters}/;
1102							}
1103							else
1104							{
1105	0					0	warnings::warnif ("Ignoring implicit_links option since implicit_link_delimiters is empty");
1106							}
1107							}
1108
1109	227	100				361	if ($opts->{extended})
1110							{
1111							# [[Wiki Page]]
1112	226	100				588	if (!$tags->{extended_link_delimiters})
		100
1113							{
1114	3					82	warnings::warnif ("Ignoring extended option since extended_link_delimiters is empty");
1115							}
1116							elsif (ref $tags->{extended_link_delimiters} eq "ARRAY")
1117							{
1118							# Backwards compatibility for extended links.
1119							# Bypasses the regex substitution used by absolute and implicit
1120							# links.
1121	4					4	my ($start, $end) = @{$tags->{extended_link_delimiters}};
	4					10
1122	4					9	while (my @pieces = _find_innermost_balanced_pair ($text, $start,
1123							$end))
1124							{
1125	7	50				9	my ($tag, $before, $after) = map { defined $_ ? $_ : '' }
	21					33
1126							@pieces;
1127	7		50			14	my $extended = $tags->{link}->($tag, $opts, $tags) \|\| '';
1128	7					44	$text = $before . $extended . $after;
1129							}
1130							}
1131							else
1132							{
1133	219					599	push @res, qr/$tags->{extended_link_delimiters}/;
1134							}
1135							}
1136
1137	227	50				608	if (@res)
1138							{
1139	227					370	my $re = join "\|", @res;
1140	227					2643	$text =~ s/$re/$tags->{link}->($&, $opts, $tags)/ge;
	30					107
1141							}
1142
1143	227					547	return $text;
1144							}
1145
1146							=head1 Wiki Format
1147
1148							Refer to L for
1149							description of the default wiki format, as interpreted by this module. Any
1150							discrepencies will be considered bugs in this module, with a few exceptions.
1151
1152							=head2 Unimplemented Wiki Markup
1153
1154							=over 4
1155
1156							=item Templates, Magic Words, and Wanted Links
1157
1158							Templates, magic words, and the colorization of wanted links all require a back
1159							end data store that can be consulted on the existance and content of named
1160							pages. C has deliberately been constructed such that it
1161							operates independantly from such a back end. For an interface to
1162							C which implements these features, see
1163							L.
1164
1165							=item Tables
1166
1167							This is on the TODO list.
1168
1169							=back
1170
1171							=head1 EXPORT
1172
1173							If you'd like to make your life more convenient, you can optionally import a
1174							subroutine that already has default tags and options set up. This is
1175							especially handy if you use a prefix:
1176
1177							use Text::MediawikiFormat prefix => 'http://www.example.com/';
1178							wikiformat ('some text');
1179
1180							Tags are interpreted as default members of the $tags hash normally passed to
1181							C, except for the five options (see above) and the C key, who's
1182							value is interpreted as an alternate name for the imported function.
1183
1184							To use the C flag to control the name by which your code calls the imported
1185							function, for example,
1186
1187							use Text::MediawikiFormat as => 'formatTextWithWikiStyle';
1188							formatTextWithWikiStyle ('some text');
1189
1190							You might choose a better name, though.
1191
1192							The calling semantics are effectively the same as those of the C
1193							function. Any additional tags or options to the imported function will
1194							override the defaults. This code:
1195
1196							use Text::MediawikiFormat as => 'wf', extended => 0;
1197							wf ('some text', {}, {extended => 1});
1198
1199							enables extended links, after specifying that the default behavior should be
1200							to disable them.
1201
1202							=head1 GORY DETAILS
1203
1204							=head2 Tags
1205
1206							There are two types of Wiki markup: phrase markup and blocks. Blocks include
1207							lists, which are made up of lines and can also contain other lists.
1208
1209							=head3 Phrase Markup
1210
1211							The are currently three types of wiki phrase markup. These are the
1212							strong and emphasized markup and links. Links may additionally be of three
1213							subtypes, extended, implicit, or absolute.
1214
1215							You can change the regular expressions used to find strong and emphasized tags:
1216
1217							%tags = (
1218							strong_tag => qr/\([^]+?)\*/,
1219							emphasized_tag => qr\|/([^/]+?)/\|,
1220							);
1221
1222							$wikitext = 'this is strong, /emphasized/, and /em+strong/';
1223							$htmltext = wikiformat ($wikitext, \%tags, {});
1224
1225							You can also change the regular expressions used to find links. The following
1226							just sets them to their default states (but enables parsing of implicit links,
1227							which is I the default):
1228
1229							my $html = wikiformat
1230							(
1231							$raw,
1232							{implicit_link_delimiters => qr!\b(?:[A-Z][a-z0-9]\w*){2,}!,
1233							extended_link_delimiters => qr!\[(?:\[[^][]\]\|[^][])\]!,
1234							},
1235							{implicit_links => 1}
1236							);
1237
1238							In addition, you may set the function references that format strong and
1239							emphasized text and links. The strong and emphasized functions receive only
1240							the text to be formatted as an argument and are expected to return the
1241							formatted text. The link formatter also recieves references to the C<$tags>
1242							and C<$opts> arrays. For example, the following sets the strong and
1243							emphasized formatters to their default state while replacing the link formatter
1244							with one which strips href information and returns only the title text:
1245
1246							my $html = wikiformat
1247							(
1248							$raw,
1249							{strong => sub {"$_[0]"},
1250							emphasized => sub {"$_[0]"},
1251							link => sub
1252							{
1253							my ($tag, $opts, $tags) = @_;
1254							if ($tag =~ s/^\[\[([^][]+)\]\]$/$1/)
1255							{
1256							my ($page, $title) = split qr/\\|/, $tag, 2;
1257							return $title if $title;
1258							return $page;
1259							}
1260							elsif ($tag =~ s/^\[([^][]+)\]$/$1/)
1261							{
1262							my ($href, $title) = split qr/ /, $tag, 2;
1263							return $title if $title;
1264							return $href;
1265							}
1266							else
1267							{
1268							return $tag;
1269							}
1270							},
1271							},
1272							);
1273
1274							=head3 Blocks
1275
1276							The default block types are C`, C, C, C,`
1277							C, C, C, and C .
1278
1279							Block entries in the tag hashes must contain array references. The first two
1280							items are the tags used at the start and end of the block. The third and
1281							fourth contain the tags used at the start and end of each line. Where there
1282							needs to be more processing of individual lines, use a subref as the third
1283							item. This is how the module processes ordered lines in HTML lists and
1284							headers:
1285
1286							my $html = wikiformat
1287							(
1288							$raw,
1289							{ordered => [' ', " \n", ' ', " \n"],
1290							header => ['', "\n", \&_make_header],
1291							},
1292							);
1293
1294							The first argument to these subrefs is the post-processed text of the line
1295							itself. (Processing removes the indentation and tokens used to mark this as a
1296							list and checks the rest of the line for other line formattings.) The second
1297							argument is the indentation level (see below). The subsequent arguments are
1298							captured variables in the regular expression used to find this list type. The
1299							regexp for headers is:
1300
1301							$html = wikiformat
1302							(
1303							$raw,
1304							{blocks => {header => qr/^(=+)\s(.+?)\s\1$/}}
1305							);
1306
1307							The module processes indentation first, if applicable, and stores the
1308							indentation level (the length of the indentation removed).
1309
1310							Lists automatically start and end as necessary.
1311
1312							Because regular expressions could conceivably match more than one line, block
1313							level markup is processed in a specific order. The C tag governs
1314							this order. It contains a reference to an array of the names of the
1315							appropriate blocks to process. If you add a block type, be sure to add an
1316							entry for it in C:
1317
1318							my $html = wikiformat
1319							(
1320							$raw,
1321							{invisible => ['', '', '', ''],
1322							blocks => {invisible => qr!^--(.*?)--$!},
1323							blockorder => [qw(code header line ordered
1324							unordered definition invisible
1325							paragraph_break paragraph)]
1326							},
1327							},
1328							);
1329
1330							=head3 Finding blocks
1331
1332							As has already been mentioned in passing, C uses regular
1333							expressions to find blocks. These are in the C<%tags> hash under the C
1334							key. For example, to change the regular expression to find code block items,
1335							use:
1336
1337							my $html = wikiformat ($raw, {blocks => {code => qr/^:\s+/}});
1338
1339							This will require a leading colon to mark code lines (note that as writted
1340							here, this would interfere with the default processing of definition lists).
1341
1342							=head3 Finding Blocks in the Correct Order
1343
1344							As intrepid bug reporter Tom Hukins pointed out in CPAN RT bug #671, the order
1345							in which C searches for blocks varies by platform and
1346							version of Perl. Because some block-finding regular expressions are more
1347							specific than others, what you intend to be one type of block may turn into a
1348							different list type.
1349
1350							If you're adding new block types, be aware of this. The C entry in
1351							C<%tags> exists to force C to apply its regexes from
1352							most specific to least specific. It contains an array reference. By default,
1353							it looks for ordered lists first, unordered lists second, and code references
1354							at the end.
1355
1356							=head1 SEE ALSO
1357
1358							L
1359
1360							=head1 SUPPORT
1361
1362							You can find documentation for this module with the perldoc command.
1363
1364							perldoc Text::MediawikiFormat
1365
1366							You can also look for information at:
1367
1368							=over 4
1369
1370							=item * AnnoCPAN: Annotated CPAN documentation
1371
1372							L
1373
1374							=item * CPAN Ratings
1375
1376							L
1377
1378							=item * RT: CPAN's request tracker
1379
1380							L
1381
1382							=item * Search CPAN
1383
1384							L
1385
1386							=back
1387
1388							=head1 AUTHOR
1389
1390							Derek Price C is the author.
1391
1392							=head1 ACKNOWLEDGEMENTS
1393
1394							This module is derived from L, written by chromatic.
1395							chromatic's original credits are below:
1396
1397							chromatic, C, with much input from the Jellybean team
1398							(including Jonathan Paulett). Kate L Pugh has also provided several patches,
1399							many failing tests, and is usually the driving force behind new features and
1400							releases. If you think this module is worth buying me a beer, she deserves at
1401							least half of it.
1402
1403							Alex Vandiver added a nice patch and tests for extended links.
1404
1405							Tony Bowden, Tom Hukins, and Andy H. all suggested useful features that are now
1406							implemented.
1407
1408							Sam Vilain, Chris Winters, Paul Schmidt, and Art Henry have all found and
1409							reported silly bugs.
1410
1411							Blame me for the implementation.
1412
1413							=head1 BUGS
1414
1415							The link checker in C may fail to detect existing links that do
1416							not follow HTML, XML, or SGML style. They may die with some SGML styles too.
1417							I.
1418
1419							=head1 TODO
1420
1421							=over 4
1422
1423							=item * Optimize C to work on a list of lines
1424
1425							=back
1426
1427							=head1 COPYRIGHT & LICENSE
1428
1429							Copyright (c) 2006-2008 Derek R. Price, all rights reserved.
1430							Copyright (c) 2002 - 2006, chromatic, all rights reserved.
1431
1432							This program is free software; you can redistribute it and/or modify it
1433							under the same terms as Perl itself.
1434
1435							=cut
1436
1437							1; # End of Text::MediaiwkiFormat