blib/lib/HTML/Truncate.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 151 | 174 | 86.7 |
branch | 59 | 84 | 70.2 |
condition | 6 | 15 | 40.0 |
subroutine | 20 | 24 | 83.3 |
pod | 11 | 11 | 100.0 |
total | 247 | 308 | 80.1 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package HTML::Truncate; | ||||||
2 | |||||||
3 | 9 | 9 | 303047 | use 5.008; | |||
9 | 33 | ||||||
9 | 372 | ||||||
4 | 9 | 9 | 50 | use strict; | |||
9 | 18 | ||||||
9 | 271 | ||||||
5 | 9 | 9 | 64 | use warnings; | |||
9 | 22 | ||||||
9 | 339 | ||||||
6 | 9 | 9 | 52 | no warnings "uninitialized"; | |||
9 | 17 | ||||||
9 | 353 | ||||||
7 | |||||||
8 | 9 | 9 | 70189 | use HTML::TokeParser; | |||
9 | 298846 | ||||||
9 | 370 | ||||||
9 | 9 | 9 | 80 | use HTML::Tagset (); | |||
9 | 81 | ||||||
9 | 167 | ||||||
10 | 9 | 9 | 55 | use HTML::Entities (); | |||
9 | 1295 | ||||||
9 | 159 | ||||||
11 | 9 | 9 | 60 | use Carp; | |||
9 | 18 | ||||||
9 | 3361 | ||||||
12 | 9 | 9 | 94 | use List::Util qw( first ); | |||
9 | 15 | ||||||
9 | 27969 | ||||||
13 | |||||||
14 | =head1 NAME | ||||||
15 | |||||||
16 | HTML::Truncate - (beta software) truncate HTML by percentage or character count while preserving well-formedness. | ||||||
17 | |||||||
18 | =head1 VERSION | ||||||
19 | |||||||
20 | 0.20 | ||||||
21 | |||||||
22 | =cut | ||||||
23 | |||||||
24 | our $VERSION = "0.20"; | ||||||
25 | |||||||
26 | =head1 ABSTRACT | ||||||
27 | |||||||
28 | When working with text it is common to want to truncate strings to make them fit a desired context. E.g., you might have a menu that is only 100px wide and prefer text doesn't wrap so you'd truncate it around 15-30 characters, depending on preference and typeface size. This is trivial with plain text using L |
||||||
29 | |||||||
30 | L |
||||||
31 | |||||||
32 | =head1 SYNOPSIS | ||||||
33 | |||||||
34 | use strict; | ||||||
35 | use HTML::Truncate; | ||||||
36 | |||||||
37 | my $html = ' We have to test something. '; |
||||||
38 | my $readmore = '... [readmore]'; | ||||||
39 | |||||||
40 | my $html_truncate = HTML::Truncate->new(); | ||||||
41 | $html_truncate->chars(20); | ||||||
42 | $html_truncate->ellipsis($readmore); | ||||||
43 | print $html_truncate->truncate($html); | ||||||
44 | |||||||
45 | # or | ||||||
46 | |||||||
47 | use Encode; | ||||||
48 | my $ht = HTML::Truncate->new( utf8_mode => 1, | ||||||
49 | chars => 1_000, | ||||||
50 | ); | ||||||
51 | print Encode::encode_utf8( $ht->truncate($html) ); | ||||||
52 | |||||||
53 | =head1 XHTML | ||||||
54 | |||||||
55 | This module is designed to work with XHTML-style nested tags. More | ||||||
56 | below. | ||||||
57 | |||||||
58 | =head1 WHITESPACE AND ENTITIES | ||||||
59 | |||||||
60 | Repeated natural whitespace (i.e., "\s+" and not " ") in HTML | ||||||
61 | -- with rare exception (pre tags or user defined styles) -- is not | ||||||
62 | meaningful. Therefore it is normalized when truncating. Entities are | ||||||
63 | also normalized. The following is only counted 14 chars long. | ||||||
64 | |||||||
65 | \n \nthis is ‘text’\n\n |
||||||
66 | ^^^^^^^12345----678--9------01234------^^^^^^^^ | ||||||
67 | |||||||
68 | =head1 METHODS | ||||||
69 | |||||||
70 | =over 4 | ||||||
71 | |||||||
72 | =item B |
||||||
73 | |||||||
74 | Can take all the methods as hash style args. "percent" and "chars" are | ||||||
75 | incompatible so don't use them both. Whichever is set most recently | ||||||
76 | will erase the other. | ||||||
77 | |||||||
78 | my $ht = HTML::Truncate->new(utf8_mode => 1, | ||||||
79 | chars => 500, # default is 100 | ||||||
80 | ); | ||||||
81 | |||||||
82 | =cut | ||||||
83 | |||||||
84 | our %skip = ( head => 1, | ||||||
85 | script => 1, | ||||||
86 | form => 1, | ||||||
87 | iframe => 1, | ||||||
88 | object => 1, | ||||||
89 | embed => 1, | ||||||
90 | title => 1, | ||||||
91 | style => 1, | ||||||
92 | base => 1, | ||||||
93 | link => 1, | ||||||
94 | meta => 1, | ||||||
95 | ); | ||||||
96 | |||||||
97 | |||||||
98 | sub new { | ||||||
99 | 9 | 9 | 1 | 12972 | my $class = shift; | ||
100 | |||||||
101 | 9 | 137 | my $self = bless | ||||
102 | { | ||||||
103 | _chars => 100, | ||||||
104 | _percent => undef, | ||||||
105 | _cleanly => qr/[\s[:punct:]]+\z/, | ||||||
106 | _on_space => undef, | ||||||
107 | _utf8_mode => undef, | ||||||
108 | _ellipsis => '…', | ||||||
109 | _raw_html => '', | ||||||
110 | _repair => undef, | ||||||
111 | _skip_tags => \%skip, | ||||||
112 | }, $class; | ||||||
113 | |||||||
114 | 9 | 62 | while ( my ( $k, $v ) = splice(@_, 0, 2) ) | ||||
115 | { | ||||||
116 | 2 | 50 | 7 | croak "No such method or attribute '$k'" unless exists $self->{"_$k"}; | |||
117 | 2 | 6 | $self->$k($v); | ||||
118 | } | ||||||
119 | 9 | 65 | return $self; | ||||
120 | } | ||||||
121 | |||||||
122 | =item B |
||||||
123 | |||||||
124 | Set/get, true/false. If C |
||||||
125 | set in the underlying L |
||||||
126 | with L |
||||||
127 | literal ellipsis and not the default of C<…>. | ||||||
128 | |||||||
129 | =cut | ||||||
130 | |||||||
131 | sub utf8_mode { | ||||||
132 | 193 | 193 | 1 | 620 | my $self = shift; | ||
133 | 193 | 100 | 348 | if ( @_ ) | |||
134 | { | ||||||
135 | 5 | 9 | $self->{_utf8_mode} = shift; | ||||
136 | 5 | 21 | return 1; # say we did it, even if setting untrue value | ||||
137 | } | ||||||
138 | else | ||||||
139 | { | ||||||
140 | 188 | 661 | return $self->{_utf8_mode}; | ||||
141 | } | ||||||
142 | } | ||||||
143 | |||||||
144 | =item B |
||||||
145 | |||||||
146 | Set/get. The number of characters remaining after truncation, | ||||||
147 | B |
||||||
148 | |||||||
149 | Entities are counted as single characters. E.g., C<©> is one | ||||||
150 | character for truncation counts. | ||||||
151 | |||||||
152 | Default is "100." Side-effect: clears any L that has been | ||||||
153 | set. | ||||||
154 | |||||||
155 | =cut | ||||||
156 | |||||||
157 | sub chars { | ||||||
158 | 93 | 93 | 1 | 40751 | my ( $self, $chars ) = @_; | ||
159 | 93 | 100 | 287 | return $self->{_chars} unless defined $chars; | |||
160 | 87 | 50 | 281 | $chars > 0 or croak "You must truncate to at least 1 character"; | |||
161 | 87 | 50 | 417 | $chars =~ /^(?:[1-9][_\d]*|0)$/ | |||
162 | or croak "Specified chars must be a number"; | ||||||
163 | 87 | 146 | $self->{_percent} = undef; # no conflict allowed | ||||
164 | 87 | 244 | $self->{_chars} = $chars; | ||||
165 | } | ||||||
166 | |||||||
167 | =item B |
||||||
168 | |||||||
169 | Set/get. A percentage to keep while truncating the rest. For a | ||||||
170 | document of 1,000 chars, percent('15%') and chars(150) would be | ||||||
171 | equivalent. The actual amount of character that the percent represents | ||||||
172 | cannot be known until the given HTML is parsed. | ||||||
173 | |||||||
174 | Side-effect: clears any L that has been set. | ||||||
175 | |||||||
176 | =cut | ||||||
177 | |||||||
178 | sub percent { | ||||||
179 | 1 | 1 | 1 | 4 | my ( $self, $percent ) = @_; | ||
180 | |||||||
181 | 1 | 50 | 33 | 9 | return unless $self->{_percent} or $percent; | ||
182 | |||||||
183 | 1 | 50 | 4 | return sprintf("%d%%", 100 * $self->{_percent}) | |||
184 | unless $percent; | ||||||
185 | |||||||
186 | 1 | 6 | my ( $temp_percent ) = $percent =~ /^(100|[1-9]?[0-9])\%$/; | ||||
187 | |||||||
188 | 1 | 50 | 33 | 11 | $temp_percent and $temp_percent != 0 | ||
189 | or croak "Specified percent is invalid '$percent' -- 1\% - 100\%"; | ||||||
190 | |||||||
191 | 1 | 3 | $self->{_chars} = undef; # no conflict allowed | ||||
192 | 1 | 8 | $self->{_percent} = $1 / 100; | ||||
193 | } | ||||||
194 | |||||||
195 | =item B |
||||||
196 | |||||||
197 | Set/get. Ellipsis in this case means -- | ||||||
198 | |||||||
199 | The omission of a word or phrase necessary for a complete | ||||||
200 | syntactical construction but not necessary for understanding. | ||||||
201 | http://www.answers.com/topic/ellipsis | ||||||
202 | |||||||
203 | What it will probably mean in most real applications is "read more." | ||||||
204 | The default is C<…> which if the utf8 flag is true will render | ||||||
205 | as a literal ellipsis, C |
||||||
206 | |||||||
207 | The reason the default is C<…> and not "..." is this is meant | ||||||
208 | for use in HTML environments, not plain text, and "..." (dot-dot-dot) | ||||||
209 | is not typographically correct or equivalent to a real horizontal | ||||||
210 | ellipsis character. | ||||||
211 | |||||||
212 | =cut | ||||||
213 | |||||||
214 | sub ellipsis { | ||||||
215 | 94 | 94 | 1 | 144 | my $self = shift; | ||
216 | 94 | 100 | 260 | if ( @_ ) | |||
100 | |||||||
217 | { | ||||||
218 | 5 | 37 | $self->{_ellipsis} = shift; | ||||
219 | } | ||||||
220 | elsif ( $self->utf8_mode() ) | ||||||
221 | { | ||||||
222 | 6 | 76 | return HTML::Entities::decode($self->{_ellipsis}); | ||||
223 | } | ||||||
224 | else | ||||||
225 | { | ||||||
226 | 83 | 178 | return $self->{_ellipsis}; | ||||
227 | } | ||||||
228 | } | ||||||
229 | |||||||
230 | =item B |
||||||
231 | |||||||
232 | It returns the truncated XHTML if asked for a return value. | ||||||
233 | |||||||
234 | my $truncated = $ht->truncate($html); | ||||||
235 | |||||||
236 | It will truncate the string in place if no return value is expected | ||||||
237 | (L |
||||||
238 | |||||||
239 | $ht->truncate($html); | ||||||
240 | print $html; | ||||||
241 | |||||||
242 | Also can be called with inline arguments- | ||||||
243 | |||||||
244 | print $ht->truncate( $html, | ||||||
245 | $chars_or_percent, | ||||||
246 | $ellipsis ); | ||||||
247 | |||||||
248 | No arguments are strictly required. Without HTML to operate upon it | ||||||
249 | returns undef. The two optional arguments may be preset with the | ||||||
250 | methods L (or L) and L. | ||||||
251 | |||||||
252 | Valid nesting of tags is required (alla XHTML). Therefore some old | ||||||
253 | HTML habits like E |
||||||
254 | and may cause a fatal error. See L for help with badly formed | ||||||
255 | HTML. | ||||||
256 | |||||||
257 | Certain tags are omitted by default from the truncated output. | ||||||
258 | |||||||
259 | =over 4 | ||||||
260 | |||||||
261 | =item * Skipped tags | ||||||
262 | |||||||
263 | These will not be included in truncated output by default. | ||||||
264 | |||||||
265 | ... | ||||||
266 | |
||||||
267 | |
||||||
268 | |||||||
269 | =item * Tags allowed to self-close | ||||||
270 | |||||||
271 | See L |
||||||
272 | |||||||
273 | =back | ||||||
274 | |||||||
275 | =cut | ||||||
276 | |||||||
277 | sub _chars_or_percent { | ||||||
278 | 0 | 0 | 0 | my ( $self, $which ) = @_; | |||
279 | 0 | 0 | 0 | if ( $which =~ /\%\z/ ) | |||
280 | { | ||||||
281 | 0 | 0 | $self->percent($which); | ||||
282 | } | ||||||
283 | else | ||||||
284 | { | ||||||
285 | 0 | 0 | $self->chars($which); | ||||
286 | } | ||||||
287 | } | ||||||
288 | |||||||
289 | sub truncate { | ||||||
290 | 97 | 97 | 1 | 4684 | my $self = shift; | ||
291 | 97 | 188 | $self->{_raw_html} = \$_[0]; | ||||
292 | 97 | 50 | 209 | shift || return; | |||
293 | |||||||
294 | 97 | 50 | 200 | $self->_chars_or_percent(+shift) if @_; | |||
295 | 97 | 50 | 236 | $self->ellipsis(+shift) if @_; | |||
296 | |||||||
297 | 97 | 90 | my @tag_q; | ||||
298 | 97 | 122 | my $renew = ""; | ||||
299 | 97 | 154 | my $total = 0; | ||||
300 | 97 | 98 | my $previous_token; | ||||
301 | my $next_token; | ||||||
302 | |||||||
303 | # my $tmp_ellipsis = $self->ellipsis; | ||||||
304 | # $tmp_ellipsis =~ s/<\w[^>]+>//g; # Naive html strip. | ||||||
305 | # HTML::Entities::encode($tmp_ellipsis); | ||||||
306 | 97 | 561 | my $chars = $self->{_chars};# + length $tmp_ellipsis; | ||||
307 | |||||||
308 | 97 | 382 | my $p = HTML::TokeParser->new( $self->{_raw_html} ); | ||||
309 | 97 | 13011 | $p->unbroken_text(1); | ||||
310 | 97 | 225 | $p->utf8_mode( $self->utf8_mode ); | ||||
311 | |||||||
312 | TOKEN: | ||||||
313 | 97 | 343 | while ( my $token = $p->get_token() ) | ||||
314 | { | ||||||
315 | 617 | 7916 | my @nexttoken; | ||||
316 | NEXT_TOKEN: | ||||||
317 | 617 | 1441 | while ( my $next = $p->get_token() ) | ||||
318 | { | ||||||
319 | 1439 | 8921 | push @nexttoken, $next; | ||||
320 | 1439 | 100 | 4485 | if ( $next->[0] eq 'S' ) | |||
321 | { | ||||||
322 | 489 | 504 | $next_token = $next; | ||||
323 | 489 | 684 | last NEXT_TOKEN; | ||||
324 | } | ||||||
325 | } | ||||||
326 | 617 | 2420 | $p->unget_token(@nexttoken); | ||||
327 | 617 | 100 | 3613 | $previous_token = $token if $token->[0] eq 'E'; | |||
328 | |||||||
329 | # print " Queue: ", join ":", @tag_q; print $/; | ||||||
330 | # print "Previous: $previous_token->[1]\n"; | ||||||
331 | # print " IN: $token->[1]\n"; | ||||||
332 | # print " Next: $next_token->[1]\n\n"; | ||||||
333 | |||||||
334 | 617 | 100 | 1519 | if ( $token->[0] eq 'S' ) | |||
100 | |||||||
50 | |||||||
335 | { | ||||||
336 | # _callback_for...? 321 | ||||||
337 | 253 | 451 | ( my $real_tag = $token->[1] ) =~ s,/\z,,; | ||||
338 | 253 | 50 | 598 | next TOKEN if $self->{_skip_tags}{$real_tag}; | |||
339 | 253 | 100 | 812 | push @tag_q, $token->[1] unless $HTML::Tagset::emptyElement{$real_tag}; | |||
340 | 253 | 1239 | $renew .= $token->[-1]; | ||||
341 | } | ||||||
342 | elsif ( $token->[0] eq 'E' ) | ||||||
343 | { | ||||||
344 | 116 | 50 | 293 | next TOKEN if $self->{_skip_tags}{$token->[1]}; | |||
345 | 116 | 167 | my $open = pop @tag_q; | ||||
346 | 116 | 208 | my $close = $token->[1]; | ||||
347 | 116 | 100 | 227 | unless ( $open eq $close ) | |||
348 | { | ||||||
349 | 8 | 50 | 17 | if ( $self->{_repair} ) | |||
350 | { | ||||||
351 | 8 | 8 | my @unmatched; | ||||
352 | 8 | 100 | 21 | push @unmatched, $open if $open; | |||
353 | 8 | 19 | while ( my $temp = pop @tag_q ) | ||||
354 | { | ||||||
355 | 8 | 100 | 15 | if ( $temp eq $close ) | |||
356 | { | ||||||
357 | 5 | 14 | while ( my $add = shift @unmatched ) | ||||
358 | { | ||||||
359 | 8 | 23 | $renew .= ""; | ||||
360 | } | ||||||
361 | 5 | 9 | $renew .= ""; | ||||
362 | 5 | 24 | next TOKEN; | ||||
363 | } | ||||||
364 | else | ||||||
365 | { | ||||||
366 | 3 | 9 | push @unmatched, $temp; | ||||
367 | } | ||||||
368 | } | ||||||
369 | 3 | 3 | push @tag_q, reverse @unmatched; | ||||
370 | 3 | 12 | next TOKEN; # silently drop unmatched close tags | ||||
371 | } | ||||||
372 | else | ||||||
373 | { | ||||||
374 | 0 | 0 | my $nearby = substr($renew, | ||||
375 | length($renew) - 15, | ||||||
376 | 15); | ||||||
377 | 0 | 0 | croak qq|<$open> closed by near "$nearby"|; | ||||
378 | } | ||||||
379 | } | ||||||
380 | 108 | 401 | $renew .= $token->[-1]; | ||||
381 | } | ||||||
382 | elsif ( $token->[0] eq 'T' ) | ||||||
383 | { | ||||||
384 | 248 | 50 | 498 | next TOKEN if $token->[2]; # DATA | |||
385 | # my $txt = HTML::Entities::decode($token->[1]); | ||||||
386 | 248 | 313 | my $txt = $token->[1]; | ||||
387 | 248 | 261 | my $current_length = 0; | ||||
388 | 248 | 100 | 416 | 1171 | unless ( first { $_ eq 'pre' } @tag_q ) # We're not somewhere inside a |
||
416 | 749 | ||||||
389 | { | ||||||
390 | 244 | 837 | $txt =~ s/\s+/ /g; | ||||
391 | |||||||
392 | 244 | 100 | 66 | 853 | if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow | ||
393 | and | ||||||
394 | ! $HTML::Tagset::isPhraseMarkup{$previous_token->[1]} | ||||||
395 | ) | ||||||
396 | { | ||||||
397 | 10 | 36 | $txt =~ s/\A +//; | ||||
398 | } | ||||||
399 | |||||||
400 | 244 | 100 | 66 | 669 | if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow | ||
401 | and | ||||||
402 | ! $HTML::Tagset::isPhraseMarkup{$next_token->[1]} | ||||||
403 | ) | ||||||
404 | { | ||||||
405 | 20 | 52 | $txt =~ s/ +\z//; | ||||
406 | } | ||||||
407 | 244 | 427 | $current_length = _count_visual_chars($txt); | ||||
408 | } | ||||||
409 | else | ||||||
410 | { | ||||||
411 | 4 | 5 | $current_length = length($txt); | ||||
412 | } | ||||||
413 | |||||||
414 | 248 | 603 | $total += $current_length; | ||||
415 | |||||||
416 | 248 | 100 | 410 | if ( $total >= $chars ) | |||
417 | { | ||||||
418 | 85 | 98 | $total -= $current_length; | ||||
419 | |||||||
420 | 85 | 101 | my $chars_to_keep = $chars - $total; | ||||
421 | 85 | 105 | my $keep = ""; | ||||
422 | 85 | 100 | 178 | if ( $self->on_space ) | |||
423 | { | ||||||
424 | 26 | 727 | ( $keep ) = $txt =~ /\A(.{0,$chars_to_keep}\s?)(?=\s|\z)/; | ||||
425 | 26 | 90 | $keep =~ s/\s+\z//; | ||||
426 | } | ||||||
427 | else | ||||||
428 | { | ||||||
429 | 59 | 117 | $keep = substr($txt, 0, $chars_to_keep); | ||||
430 | } | ||||||
431 | |||||||
432 | 85 | 100 | 202 | if ( my $cleaner = $self->cleanly ) | |||
433 | { | ||||||
434 | 56 | 241 | $keep =~ s/$cleaner//; | ||||
435 | } | ||||||
436 | |||||||
437 | 85 | 100 | 194 | if ( $keep ) | |||
438 | { | ||||||
439 | # $renew .= $self->utf8_mode ? | ||||||
440 | # $keep : HTML::Entities::encode($keep); | ||||||
441 | 68 | 108 | $renew .= $keep; | ||||
442 | } | ||||||
443 | |||||||
444 | 85 | 194 | $renew .= $self->ellipsis(); | ||||
445 | 85 | 199 | last TOKEN; | ||||
446 | } | ||||||
447 | else | ||||||
448 | { | ||||||
449 | 163 | 791 | $renew .= $token->[1]; | ||||
450 | } | ||||||
451 | } | ||||||
452 | } # TOKEN block ends | ||||||
453 | |||||||
454 | 97 | 303 | $renew .= join('', map {""} reverse @tag_q); | ||||
125 | 349 | ||||||
455 | |||||||
456 | 97 | 50 | 194 | if ( defined wantarray ) | |||
457 | { | ||||||
458 | 97 | 1323 | return $renew; | ||||
459 | } | ||||||
460 | else | ||||||
461 | { | ||||||
462 | 0 | 0 | ${$self->{_raw_html}} = $renew; | ||||
0 | 0 | ||||||
463 | } | ||||||
464 | } | ||||||
465 | |||||||
466 | =item B |
||||||
467 | |||||||
468 | Put one or more new tags into the list of those to be omitted from | ||||||
469 | truncated output. An example of when you might like to use this is if | ||||||
470 | you're thumb-nailing articles and they start with C<< title>> |
||||||
471 | or such before the article body. The heading level would be absurd | ||||||
472 | with a list of excerpts so you could drop it completely this way-- | ||||||
473 | |||||||
474 | $ht->add_skip_tags( 'h1' ); | ||||||
475 | |||||||
476 | =cut | ||||||
477 | |||||||
478 | sub add_skip_tags { | ||||||
479 | 0 | 0 | 1 | 0 | my $self = shift; | ||
480 | 0 | 0 | for ( @_ ) | ||||
481 | { | ||||||
482 | 0 | 0 | 0 | croak "Args to add_skip_tags must be scalar tag names, not references" | |||
483 | if ref $_; | ||||||
484 | 0 | 0 | $self->{_skip_tags}{$_} = 1; | ||||
485 | } | ||||||
486 | } | ||||||
487 | |||||||
488 | =item B |
||||||
489 | |||||||
490 | Takes tags out of the current list to be omitted from truncated output. | ||||||
491 | |||||||
492 | =cut | ||||||
493 | |||||||
494 | sub dont_skip_tags { | ||||||
495 | 0 | 0 | 1 | 0 | my $self = shift; | ||
496 | 0 | 0 | for ( @_ ) | ||||
497 | { | ||||||
498 | 0 | 0 | 0 | croak "Args to dont_skip_tags must be scalar tag names, not references" | |||
499 | if ref $_; | ||||||
500 | 0 | 0 | 0 | carp "$_ was not set to be skipped" | |||
501 | unless delete $self->{_skip_tags}{$_}; | ||||||
502 | } | ||||||
503 | } | ||||||
504 | |||||||
505 | =item B |
||||||
506 | |||||||
507 | Set/get, true/false. If true, will attempt to repair unclosed HTML | ||||||
508 | tags by adding close-tags as late as possible (eg. C<< | ||||||
509 | foobar >> becomes C<< foobar >>). Unmatched | ||||||
510 | close tags are dropped (C<< foobar >> becomes C<< foobar >>). | ||||||
511 | |||||||
512 | =cut | ||||||
513 | |||||||
514 | sub repair { | ||||||
515 | 8 | 8 | 1 | 15 | my $self = shift; | ||
516 | 8 | 100 | 22 | if ( @_ ) | |||
517 | { | ||||||
518 | 3 | 8 | $self->{_repair} = shift; | ||||
519 | 3 | 7 | return 1; # say we did it, even if untrue value | ||||
520 | } | ||||||
521 | else | ||||||
522 | { | ||||||
523 | 5 | 24 | return $self->{_repair}; | ||||
524 | } | ||||||
525 | } | ||||||
526 | |||||||
527 | sub _load_chars_from_percent { | ||||||
528 | 0 | 0 | 0 | my $self = shift; | |||
529 | 0 | 0 | my $p = HTML::TokeParser->new( $self->{_raw_html} ); | ||||
530 | 0 | 0 | my $txt_length = 0; | ||||
531 | |||||||
532 | CHARS: | ||||||
533 | 0 | 0 | while ( my $token = $p->get_token ) | ||||
534 | { | ||||||
535 | # don't check padding b/c we're going by a document average | ||||||
536 | 0 | 0 | 0 | 0 | next unless $token->[0] eq 'T' and not $token->[2]; # Not data. | ||
537 | 0 | 0 | $txt_length += _count_visual_chars( $token->[1] ); | ||||
538 | } | ||||||
539 | 0 | 0 | $self->chars( int( $txt_length * $self->{_percent} ) ); | ||||
540 | } | ||||||
541 | |||||||
542 | sub _count_visual_chars { # private function | ||||||
543 | 244 | 244 | 1068 | my $to_count = HTML::Entities::decode_entities(+shift); | |||
544 | 244 | 442 | $to_count =~ s/\s\s+/ /g; | ||||
545 | 244 | 314 | $to_count =~ s/[^[:print:]]+//g; | ||||
546 | # my $count = () = | ||||||
547 | # $to_count =~ | ||||||
548 | # /\&\#\d+;|\&[[:alpha:]]{2,5};|\S|\s+/g; | ||||||
549 | # return $count; | ||||||
550 | 244 | 414 | return length($to_count); | ||||
551 | } | ||||||
552 | |||||||
553 | # Need to put hooks for these or not? 321 | ||||||
554 | #sub _default_image_callback { | ||||||
555 | # sub { | ||||||
556 | # '[image]' | ||||||
557 | # } | ||||||
558 | #} | ||||||
559 | |||||||
560 | =item B |
||||||
561 | |||||||
562 | This will make the truncation back up to the first space it finds so | ||||||
563 | it doesn't truncate in the the middle of a word. L runs | ||||||
564 | before L if both are set. | ||||||
565 | |||||||
566 | =cut | ||||||
567 | |||||||
568 | sub on_space { | ||||||
569 | 86 | 86 | 1 | 113 | my $self = shift; | ||
570 | 86 | 100 | 160 | if ( @_ ) | |||
571 | { | ||||||
572 | 1 | 2 | $self->{_on_space} = shift; | ||||
573 | 1 | 7 | return 1; # say we did it, even if setting untrue value | ||||
574 | } | ||||||
575 | else | ||||||
576 | { | ||||||
577 | 85 | 214 | return $self->{_on_space}; | ||||
578 | } | ||||||
579 | } | ||||||
580 | |||||||
581 | |||||||
582 | =item B |
||||||
583 | |||||||
584 | Set/get -- a regular expression. This is on by default and the default | ||||||
585 | cleaning regular expression is C |
||||||
586 | will make the truncation strip any trailing spacing and punctuation so | ||||||
587 | you don't get things like "The End...." or "What? ..." You can cancel | ||||||
588 | it with C<$ht-E |
||||||
589 | expression. | ||||||
590 | |||||||
591 | =cut | ||||||
592 | |||||||
593 | sub cleanly { | ||||||
594 | 87 | 87 | 1 | 167 | my $self = shift; | ||
595 | 87 | 100 | 158 | if ( @_ ) | |||
596 | { | ||||||
597 | 2 | 5 | $self->{_cleanly} = shift; | ||||
598 | 2 | 18 | return 1; # say we did it, even if setting untrue value | ||||
599 | } | ||||||
600 | else | ||||||
601 | { | ||||||
602 | 85 | 298 | return $self->{_cleanly}; | ||||
603 | } | ||||||
604 | } | ||||||
605 | |||||||
606 | =back | ||||||
607 | |||||||
608 | =head1 COOKBOOK (well, a recipe) | ||||||
609 | |||||||
610 | =head2 Template Toolkit filter | ||||||
611 | |||||||
612 | For excerpting HTML in your Templates. Note the L which | ||||||
613 | is set to drop any images from the truncated output. | ||||||
614 | |||||||
615 | use Template; | ||||||
616 | use HTML::Truncate; | ||||||
617 | |||||||
618 | my %config = | ||||||
619 | ( | ||||||
620 | FILTERS => { | ||||||
621 | truncate_html => [ \&truncate_html_filter_factory, 1 ], | ||||||
622 | }, | ||||||
623 | ); | ||||||
624 | |||||||
625 | my $tt = Template->new(\%config) or die $Template::ERROR; | ||||||
626 | |||||||
627 | # ... etc ... | ||||||
628 | |||||||
629 | sub truncate_html_filter_factory { | ||||||
630 | my ( $context, $len, $ellipsis ) = @_; | ||||||
631 | $len = 32 unless $len; | ||||||
632 | $ellipsis = chr(8230) unless defined $ellipsis; | ||||||
633 | my $ht = HTML::Truncate->new(); | ||||||
634 | $ht->add_skip_tags(qw( img )); | ||||||
635 | return sub { | ||||||
636 | my $html = shift || return ''; | ||||||
637 | return $ht->truncate( $html, $len, $ellipsis ); | ||||||
638 | } | ||||||
639 | } | ||||||
640 | |||||||
641 | Then in your templates you can do things like this: | ||||||
642 | |||||||
643 | [% FOR item IN search_results %] | ||||||
644 | |
||||||
645 | [% item.title %] |
||||||
646 | [% item.body | truncate_html(200) %] | ||||||
647 | |||||||
648 | [% END %] | ||||||
649 | |||||||
650 | See also L |
||||||
651 | |||||||
652 | =head1 AUTHOR | ||||||
653 | |||||||
654 | Ashley Pond V, C<< |
||||||
655 | |||||||
656 | =head1 LIMITATIONS | ||||||
657 | |||||||
658 | There may be places where this will break down right now. I'll pad out possible edge cases as I find them or they are sent to me via the CPAN bug ticket system. | ||||||
659 | |||||||
660 | =head2 This is not an HTML filter | ||||||
661 | |||||||
662 | Although this happens to do some crude HTML filtering to achieve its end, it is not a fully featured filter. If you are looking for one, check out L |
||||||
663 | |||||||
664 | =head1 BUGS, FEEDBACK, PATCHES | ||||||
665 | |||||||
666 | Please report any bugs or feature requests to | ||||||
667 | C |
||||||
668 | L |
||||||
669 | will get the ticket, and then you'll automatically be notified of | ||||||
670 | progress as I make changes. | ||||||
671 | |||||||
672 | =head2 TO DO | ||||||
673 | |||||||
674 | Write a couple more tests (percent and skip stuff) then take out beta notice. Try to make the 5.6 stuff work without decode...? Try a C |
||||||
675 | |||||||
676 | Write an L |
||||||
677 | |||||||
678 | =head1 THANKS TO | ||||||
679 | |||||||
680 | Kevin Riggle for the L functionality; patch, Pod, and tests. | ||||||
681 | |||||||
682 | Lorenzo Iannuzzi for the L functionality. | ||||||
683 | |||||||
684 | =head1 SEE ALSO | ||||||
685 | |||||||
686 | L |