blib/lib/HTML/ParagraphSplit.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 98 | 103 | 95.1 |
branch | 25 | 28 | 89.2 |
condition | 5 | 8 | 62.5 |
subroutine | 12 | 12 | 100.0 |
pod | 2 | 2 | 100.0 |
total | 142 | 153 | 92.8 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::ParagraphSplit; | ||||||
2 | |||||||
3 | 9 | 9 | 245354 | use strict; | |||
9 | 23 | ||||||
9 | 761 | ||||||
4 | 9 | 9 | 51 | use warnings; | |||
9 | 18 | ||||||
9 | 897 | ||||||
5 | |||||||
6 | our $VERSION = '1.05'; | ||||||
7 | |||||||
8 | require Exporter; | ||||||
9 | |||||||
10 | our @ISA = qw( Exporter ); | ||||||
11 | |||||||
12 | our @EXPORT_OK = qw( split_paragraphs split_paragraphs_to_text ); | ||||||
13 | |||||||
14 | 9 | 9 | 9807 | use HTML::Entities; | |||
9 | 66328 | ||||||
9 | 933 | ||||||
15 | 9 | 9 | 12493 | use HTML::TreeBuilder; | |||
9 | 329774 | ||||||
9 | 158 | ||||||
16 | 9 | 9 | 435 | use HTML::Tagset; | |||
9 | 19 | ||||||
9 | 250 | ||||||
17 | 9 | 9 | 52 | use Scalar::Util qw/ blessed /; | |||
9 | 19 | ||||||
9 | 1163 | ||||||
18 | |||||||
19 | 9 | 9 | 56 | use vars qw( %p_content ); | |||
9 | 18 | ||||||
9 | 24411 | ||||||
20 | *p_content = *HTML::Tagset::is_Possible_Strict_P_Content; | ||||||
21 | |||||||
22 | |||||||
23 | =head1 NAME | ||||||
24 | |||||||
25 | HTML::ParagraphSplit - Change text containing HTML into a formatted HTML fragment | ||||||
26 | |||||||
27 | =head1 SYNOPSIS | ||||||
28 | |||||||
29 | use HTML::ParagraphSplit qw( split_paragraphs_to_text split_paragraphs ); | ||||||
30 | |||||||
31 | # Read in from a file handle, output text | ||||||
32 | print split_paragraphs_to_text(\*ARGV); | ||||||
33 | |||||||
34 | # Convert text to nicely split text | ||||||
35 | print split_paragraphs_to_text(< | ||||||
36 | This is one paragraph. | ||||||
37 | |||||||
38 | This is a another paragraph. | ||||||
39 | END_OF_MARKUP | ||||||
40 | |||||||
41 | # Convert to an HTML::Element object instead | ||||||
42 | my $tree = split_paragraphs($html_input); | ||||||
43 | print $tree->as_HTML; | ||||||
44 | |||||||
45 | # Create your own HTML::Element object and split it | ||||||
46 | my $tree = HTML::TreeBuilder->new; | ||||||
47 | $tree->parse($text); | ||||||
48 | $tree->eof; | ||||||
49 | |||||||
50 | split_paragraphs($tree); | ||||||
51 | |||||||
52 | my $html_fragment = $tree->guts->as_HTML; | ||||||
53 | $tree->delete; | ||||||
54 | |||||||
55 | =head1 DESCRIPTION | ||||||
56 | |||||||
57 | The purpose of this library is to provide methods for converting double line-breaks in text to HTML paragraphs (i.e., wrap in C |
||||||
58 | |||||||
59 | For example, given this input (the initial text was generated by DadaDodo L |
||||||
60 | |||||||
61 | I see over the noise but I don't understand sometimes. | ||||||
62 | |||||||
63 |
|
||||||
64 | |||||||
65 | Fortunately, we've traded the club you can't skimp on the do because This | ||||||
66 | week! Presented by code Lounge: except, for controlling Knox video cameras | ||||||
67 | Linux well that the reason, the runlevel to run some reason number of coming | ||||||
68 | back next server; sees you Control display a steep | ||||||
69 | and I tagged with specifications of six feet, moving to Code, flyer main room | ||||||
70 | motel balcony, and airflow in which define the ability to run a common. We |
||||||
71 | need to current in a manner than six months and that already gotten a |
||||||
72 | webcast is roughly long and bulk: and up the src page: and updates on a: | ||||||
73 | user will probably does this. | ||||||
74 | |||||||
75 | This would be converted into the following: | ||||||
76 | |||||||
77 | I see over the noise but I don't understand sometimes. |
||||||
78 | |||||||
79 |
|
||||||
80 | |||||||
81 | Fortunately, we've traded the club you can't skimp on the do because This |
||||||
82 | week! Presented by code Lounge: except, for controlling Knox video cameras | ||||||
83 | Linux well that the reason, the runlevel to run some reason number of coming | ||||||
84 | back next server; sees you Control display a steep | ||||||
85 | and I tagged with specifications of six feet, moving to Code, flyer main room | ||||||
86 | motel balcony, | ||||||
87 | and airflow in which define the ability to run a common. We need to |
||||||
88 | current in a manner | ||||||
89 | than six months and that already gotten a |
||||||
90 | webcast | ||||||
91 | is roughly long and bulk: and up the src page: and updates on a: user will |
||||||
92 | probably does this. | ||||||
93 | |||||||
94 | This allows authors to use HTML markup some without having to cope with getting their paragraph tags right. | ||||||
95 | |||||||
96 | This library depends upon L |
||||||
97 | |||||||
98 | =head1 METHODS | ||||||
99 | |||||||
100 | The primary method of this library is C |
||||||
101 | |||||||
102 | =head2 split_paragraphs | ||||||
103 | |||||||
104 | =over | ||||||
105 | |||||||
106 | =item $element = split_paragraphs($handle, \%options) | ||||||
107 | |||||||
108 | =item $element = split_paragraphs($text, \%options) | ||||||
109 | |||||||
110 | =item $element = split_paragraphs($element, \%options) | ||||||
111 | |||||||
112 | =back | ||||||
113 | |||||||
114 | This method has three forms, which vary only in the input they receive. If the first argument is a file handle, C<$handle>, then that handle will be read, parsed, and split. If the first argument is a scalar, C<$text>, then that text will parsed and split. If the first argument is a subclass of L |
||||||
115 | |||||||
116 | If you use the third form, your tree will be modified in place and the same tree will be returned. You will want to clone the tree ahead of time if you need to preserve the old tree. | ||||||
117 | |||||||
118 | All forms take an optional second parameter, C<\%options>, which is a reference to a hash of options which modify the default behavior. See below for details. | ||||||
119 | |||||||
120 | The first two forms perform an extra step, but are handled essentially the same after the input is parsed into an L |
||||||
121 | |||||||
122 | This method will search down the element tree and find the first node with non-implicit child ndoes and use that as the root of operations. | ||||||
123 | |||||||
124 | The C |
||||||
125 | |||||||
126 | Any text found within a block-level node may also be paragraphified. Those blocks of text will not be wrapped in paragraphs unless they contain a double-line break (that way we're not inserting C -tags without an explicit need for them). |
||||||
127 | |||||||
128 | Note also that this will insert C -tags conservatively. If more than two line-breaks are present, even if they are mixed with other white space, all of that whitespace will be treated as the same paragraph break. No empty C -tags or C -tags containing only whitespace will be inserted (mostly). The only exception is when the white space is created by white space entities, such as C< >. |
||||||
129 | |||||||
130 | All of that is the default behavior. That behavior may be modified by the second parameter, which is used to specify options that modify that behavior. | ||||||
131 | |||||||
132 | Here's the list of options and what they do: | ||||||
133 | |||||||
134 | =over | ||||||
135 | |||||||
136 | =item p_on_breaks_only =E |
||||||
137 | |||||||
138 | If this option is used, then paragrpahs will not be added to your text unless there is at least one double-line break. This option is used internally to make sure nested elements do not have extra C -tags unnecessarily. |
||||||
139 | |||||||
140 | =item single_line_breaks_to_br =E |
||||||
141 | |||||||
142 | If this option is given, then single line breaks will also be converted to C -tags. |
||||||
143 | |||||||
144 | =item br_only_if_can_tighten =E |
||||||
145 | |||||||
146 | This option modifies the C -tags are not added within blocks that cannot be tightened (i.e., aren't set in C<%canTighten> of L -tags or C |
||||||
147 | |||||||
148 | =item use_br_instead_of_p =E |
||||||
149 | |||||||
150 | As an alternative to using C -tags at all, this can also just place C -tags whenever a double line-break is enountered, two C |
||||||
151 | |||||||
152 | This option is independant of C -tag insertion, it inserts C |
||||||
153 | |||||||
154 | =item add_attrs_to_p =E |
||||||
155 | |||||||
156 | This can be used to insert a static set of attributes to each inserted C -element. For example: |
||||||
157 | |||||||
158 | # Give each newly added paragraph the "generated" class. | ||||||
159 | split_paragraphs($tree, { | ||||||
160 | add_attrs_to_p => { class => 'generated' }, | ||||||
161 | }); | ||||||
162 | |||||||
163 | =item add_attrs_to_br =E |
||||||
164 | |||||||
165 | Same as above, but for the inserted C -tags. |
||||||
166 | |||||||
167 | =item filter_added_nodes =E |
||||||
168 | |||||||
169 | This can be used to run a small subroutine on each added paragraph or line-break tag as it is added. For example: | ||||||
170 | |||||||
171 | # Give each newly added paragraph a unique ID | ||||||
172 | split_paragraphs($tree, { | ||||||
173 | filter_added_nodes => sub { | ||||||
174 | my ($element) = @_; | ||||||
175 | $element->idf(); | ||||||
176 | }, | ||||||
177 | }); | ||||||
178 | |||||||
179 | Many, if not all, of the other options can be simulated using this method, by the way. | ||||||
180 | |||||||
181 | =item use_instead_of_p =E |
||||||
182 | |||||||
183 | Rather than using C -tags to break everything, use a different tag. This example uses C -tags instead of C -tags: |
||||||
184 | |||||||
185 | split_paragraphs($tree, { | ||||||
186 | use_instead_of_p => 'div', | ||||||
187 | }); | ||||||
188 | |||||||
189 | =back | ||||||
190 | |||||||
191 | =cut | ||||||
192 | |||||||
193 | sub _split_paragraphs_phrases { | ||||||
194 | 38 | 38 | 66 | my ($h, $phrase, $options) = @_; | |||
195 | |||||||
196 | # Trim the phrase of any extra whitespace at the edges | ||||||
197 | 38 | 104 | $phrase =~ s/^\s+//; $phrase =~ s/\s+$//; | ||||
38 | 196 | ||||||
198 | |||||||
199 | # Adding a space at the end fixes a problem I have with the parser dropping | ||||||
200 | # the last word in the text. *shrug* | ||||||
201 | #my @paragraphs = map { "$_ " } split /\r?\n\s*(?:\r?\n\s*)+/, $phrase; | ||||||
202 | 38 | 267 | my @paragraphs = split /\r?\n\s*(?:\r?\n\s*)+/, $phrase; | ||||
203 | |||||||
204 | # If there's only one paragraph and we're not allowed to P without a break, | ||||||
205 | # push the text back in and skip paragraphification. | ||||||
206 | 38 | 100 | 100 | 167 | if ($options->{p_on_breaks_only} && @paragraphs == 1) { | ||
207 | 11 | 19 | my $parsed_paragraph = _nice_tree_builder(); | ||||
208 | 11 | 62 | $parsed_paragraph->parse($paragraphs[0]); | ||||
209 | 11 | 688 | $parsed_paragraph->eof; | ||||
210 | |||||||
211 | 11 | 1692 | $h->push_content($parsed_paragraph->guts); | ||||
212 | 11 | 646 | return; | ||||
213 | } | ||||||
214 | |||||||
215 | # Loop over the paragraph splits | ||||||
216 | 27 | 46 | my $iteration = 0; | ||||
217 | 27 | 46 | for my $paragraph (@paragraphs) { | ||||
218 | 47 | 238 | $iteration++; | ||||
219 | |||||||
220 | # Add BR tags within the paragraph, if necessary | ||||||
221 | 47 | 100 | 167 | $paragraph =~ s[\r?\n][ ]g |
|||
222 | if $options->{single_line_breaks_to_br}; | ||||||
223 | |||||||
224 | # Parse the paragraph text to recover the phrase elements that were | ||||||
225 | # converted to text | ||||||
226 | 47 | 86 | my $parsed_paragraph = _nice_tree_builder(); | ||||
227 | 47 | 321 | $parsed_paragraph->parse($paragraph); | ||||
228 | 47 | 5217 | $parsed_paragraph->eof; | ||||
229 | |||||||
230 | # Don't use P-tags, use BR-tags | ||||||
231 | 47 | 100 | 6638 | if ($options->{use_br_instead_of_p}) { | |||
232 | 19 | 49 | $h->push_content($parsed_paragraph->guts); | ||||
233 | 19 | 100 | 937 | if ($iteration < @paragraphs) { | |||
234 | 8 | 31 | my $new_br_1 = | ||||
235 | 8 | 11 | HTML::Element->new('br', %{ $options->{add_attrs_to_br} }); | ||||
236 | 8 | 24 | my $new_br_2 = | ||||
237 | 8 | 130 | HTML::Element->new('br', %{ $options->{add_attrs_to_br} }); | ||||
238 | |||||||
239 | 8 | 132 | $h->push_content($new_br_1); | ||||
240 | 8 | 94 | $h->push_content($new_br_2); | ||||
241 | |||||||
242 | 8 | 50 | 197 | if ($options->{filter_added_nodes}) { | |||
243 | 0 | 0 | $options->{filter_added_nodes}->($new_br_1); | ||||
244 | 0 | 0 | $options->{filter_added_nodes}->($new_br_2); | ||||
245 | } | ||||||
246 | |||||||
247 | } | ||||||
248 | } | ||||||
249 | |||||||
250 | # Use P-tags | ||||||
251 | else { | ||||||
252 | # Create a new P-tag to contain the text | ||||||
253 | 28 | 121 | my $p = HTML::Element->new( | ||||
254 | $options->{use_instead_of_p}, | ||||||
255 | 28 | 51 | %{ $options->{add_attrs_to_p} } | ||||
256 | ); | ||||||
257 | |||||||
258 | # Now add the paragraph back to the parent | ||||||
259 | 28 | 606 | $p->push_content($parsed_paragraph->guts); | ||||
260 | 28 | 1665 | $h->push_content($p); | ||||
261 | |||||||
262 | 28 | 100 | 810 | if ($options->{filter_added_nodes}) { | |||
263 | 2 | 7 | $options->{filter_added_nodes}->($p); | ||||
264 | } | ||||||
265 | } | ||||||
266 | } | ||||||
267 | } | ||||||
268 | |||||||
269 | sub _split_paragraphs_guts { | ||||||
270 | 42 | 42 | 71 | my ($h, $options) = @_; | |||
271 | |||||||
272 | # Rip the content out of the parent so we can add it back in a piece at a | ||||||
273 | # time as we process it | ||||||
274 | 42 | 173 | my @content = $h->detach_content; | ||||
275 | |||||||
276 | # This is a temporary variable used to collect text nodes and phrase-level | ||||||
277 | # elements to be split into paragraphs | ||||||
278 | 42 | 478 | my $unparsed_phrase; | ||||
279 | |||||||
280 | # Loop over the child nodes | ||||||
281 | 42 | 68 | for my $content (@content) { | ||||
282 | |||||||
283 | # Handle nested elements. | ||||||
284 | 81 | 100 | 333 | if (blessed $content) { | |||
285 | |||||||
286 | # Get the tag name | ||||||
287 | 32 | 85 | my $tag = $content->tag; | ||||
288 | |||||||
289 | # It's a phrase level tag, append it as text to the accumulator | ||||||
290 | 32 | 100 | 219 | if ($p_content{$tag}) { | |||
291 | 11 | 42 | $unparsed_phrase .= $content->as_HTML(undef, undef, {}); | ||||
292 | |||||||
293 | # HTML::TreeBuilder adds a new line after tags, kill it. This | ||||||
294 | # new line will mess things up with single_line_breaks_to_br is | ||||||
295 | # used. | ||||||
296 | 11 | 1631 | chomp $unparsed_phrase; | ||||
297 | } | ||||||
298 | |||||||
299 | # It's not a phrase level tag | ||||||
300 | else { | ||||||
301 | |||||||
302 | 21 | 100 | 48 | if (defined $unparsed_phrase) { | |||
303 | # Process the accumulator and empty it | ||||||
304 | 7 | 21 | _split_paragraphs_phrases($h, $unparsed_phrase, $options); | ||||
305 | 7 | 14 | $unparsed_phrase = undef; | ||||
306 | } | ||||||
307 | |||||||
308 | # Recurse downward to find sub-elements to split, if any | ||||||
309 | 21 | 119 | _split_paragraphs_guts($content, { %$options, | ||||
310 | p_on_breaks_only => 1, | ||||||
311 | }); | ||||||
312 | |||||||
313 | # Push this tag into the list | ||||||
314 | 21 | 87 | $h->push_content($content); | ||||
315 | } | ||||||
316 | } | ||||||
317 | |||||||
318 | # Text node, just append to the accumulator | ||||||
319 | else { | ||||||
320 | # Need to make sure entities are escaped first | ||||||
321 | 49 | 150 | $content = encode_entities($content); | ||||
322 | 49 | 596 | $unparsed_phrase .= $content; | ||||
323 | } | ||||||
324 | } | ||||||
325 | |||||||
326 | # If the accumulator still has something in it, split it. | ||||||
327 | 42 | 100 | 308 | if (defined $unparsed_phrase) { | |||
328 | 31 | 85 | _split_paragraphs_phrases($h, $unparsed_phrase, $options); | ||||
329 | } | ||||||
330 | } | ||||||
331 | |||||||
332 | sub _nice_tree_builder { | ||||||
333 | # Create a HTML::TreeBuilder that's good for splitting | ||||||
334 | 79 | 79 | 339 | my $tree = HTML::TreeBuilder->new; | |||
335 | 79 | 15539 | $tree->no_space_compacting(1); | ||||
336 | 79 | 618 | return $tree; | ||||
337 | } | ||||||
338 | |||||||
339 | sub split_paragraphs { | ||||||
340 | 21 | 21 | 1 | 52 | my ($input, $options) = @_; | ||
341 | |||||||
342 | # Translate the first argument into a tree | ||||||
343 | 21 | 34 | my $tree; | ||||
344 | |||||||
345 | # The first argument is a tree; use it. | ||||||
346 | 21 | 50 | 33 | 153 | if (blessed $input && $input->isa('HTML::Element')) { | ||
50 | |||||||
347 | 0 | 0 | $tree = $input; | ||||
348 | } | ||||||
349 | |||||||
350 | # The first argument is a file handle (or bad stuff); parse the file. | ||||||
351 | elsif (ref $input) { | ||||||
352 | 0 | 0 | $tree = _nice_tree_builder(); | ||||
353 | 0 | 0 | $tree->parse_file($input); | ||||
354 | } | ||||||
355 | |||||||
356 | # The first argument is text; parse it. | ||||||
357 | else { | ||||||
358 | 21 | 64 | $tree = _nice_tree_builder(); | ||||
359 | 21 | 275 | $tree->parse($input); | ||||
360 | 21 | 8378 | $tree->eof; | ||||
361 | } | ||||||
362 | |||||||
363 | # Don't use ->guts because they could give us an HTML::Element. | ||||||
364 | # We objectify in case something goes wrong and the only non-implicit node | ||||||
365 | # is text. | ||||||
366 | 21 | 3582 | $tree->objectify_text; | ||||
367 | 21 | 2641 | my $guts = $tree->look_down(_implicit => undef)->parent; | ||||
368 | 21 | 1368 | $tree->deobjectify_text; | ||||
369 | |||||||
370 | # Setup the default options | ||||||
371 | 21 | 100 | 1216 | $options = { | |||
372 | add_attrs_to_p => {}, | ||||||
373 | add_attrs_to_br => {}, | ||||||
374 | use_instead_of_p => 'p', | ||||||
375 | defined $options ? %$options : (), | ||||||
376 | }; | ||||||
377 | |||||||
378 | # Split those guts | ||||||
379 | 21 | 50 | 134 | _split_paragraphs_guts($guts, $options || {}); | |||
380 | |||||||
381 | # $tree->find('body')->delete_content; | ||||||
382 | # $tree->find('body')->push_content($guts->content_list); | ||||||
383 | |||||||
384 | 21 | 100 | return $tree; | ||||
385 | } | ||||||
386 | |||||||
387 | =head2 split_paragraphs_to_text | ||||||
388 | |||||||
389 | =over | ||||||
390 | |||||||
391 | =item $html_text = split_paragraphs_to_text($handle, \%options) | ||||||
392 | |||||||
393 | =item $html_text = split_paragraphs_to_text($text, \%options) | ||||||
394 | |||||||
395 | =item $html_text = split_paragraphs_to_text($element, \%options) | ||||||
396 | |||||||
397 | =back | ||||||
398 | |||||||
399 | This method performs the exact same operation as the C |
||||||
400 | |||||||
401 | I created this method primarily as a way of outputing the tree to make testing easier. If the output isn't want you like, use C |
||||||
402 | |||||||
403 | =cut | ||||||
404 | |||||||
405 | sub split_paragraphs_to_text { | ||||||
406 | # Use split_paragraphs() to do all the work | ||||||
407 | 21 | 21 | 1 | 19904 | my $tree = split_paragraphs(@_); | ||
408 | |||||||
409 | # Get the top non-implicit node | ||||||
410 | 21 | 73 | $tree->objectify_text; | ||||
411 | 21 | 3472 | my $output_element = $tree->look_down(_implicit => undef)->parent; | ||||
412 | 21 | 1151 | $tree->deobjectify_text; | ||||
413 | |||||||
414 | # Append each node as text | ||||||
415 | 21 | 1344 | my $text; | ||||
416 | 21 | 74 | for my $element ($output_element->content_list) { | ||||
417 | |||||||
418 | # Convert each element to HTML text | ||||||
419 | 72 | 100 | 6453 | if (ref $element) { | |||
420 | 52 | 176 | $text .= $element->as_HTML(undef, undef, {}); | ||||
421 | } | ||||||
422 | |||||||
423 | # Add text nodes as-is | ||||||
424 | else { | ||||||
425 | 20 | 36 | $text .= $element; | ||||
426 | } | ||||||
427 | } | ||||||
428 | |||||||
429 | # Deallocate the tree to prevent memory leaks | ||||||
430 | 21 | 3814 | $tree->delete; | ||||
431 | |||||||
432 | 21 | 2175 | return $text; | ||||
433 | } | ||||||
434 | |||||||
435 | =head1 SEE ALSO | ||||||
436 | |||||||
437 | L |
||||||
438 | |||||||
439 | =head1 BUGS AND TODO | ||||||
440 | |||||||
441 | I don't really have any explicit plans for this module, but if you find a bug or would like an additional feature or have another contribution, send me email at E |
||||||
442 | |||||||
443 | =head1 NOTES | ||||||
444 | |||||||
445 | I tried to name this library HTML::Paragraphify first. After typing that a dozen times and looking at it for a few hours, my eyes felt like they were starting to bleed so I changed it to L |
||||||
446 | |||||||
447 | =head1 AUTHOR | ||||||
448 | |||||||
449 | Andrew Sterling Hanenkamp, C<< |
||||||
450 | |||||||
451 | =head1 LICENSE AND COPYRIGHT | ||||||
452 | |||||||
453 | Copyright 2006 Andrew Sterling Hanenkamp C<< |
||||||
454 | Rights Reserved. | ||||||
455 | |||||||
456 | This module is free software; you can redistribute it and/or modify it under | ||||||
457 | the same terms as Perl itself. See L |
||||||
458 | |||||||
459 | This program is distributed in the hope that it will be useful, but WITHOUT | ||||||
460 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||||||
461 | FOR A PARTICULAR PURPOSE. | ||||||
462 | |||||||
463 | =cut | ||||||
464 | |||||||
465 | 1 |