| blib/lib/HTML/Truncate.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 151 | 174 | 86.7 |
| branch | 59 | 84 | 70.2 |
| condition | 6 | 15 | 40.0 |
| subroutine | 20 | 24 | 83.3 |
| pod | 11 | 11 | 100.0 |
| total | 247 | 308 | 80.1 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package HTML::Truncate; | ||||||
| 2 | |||||||
| 3 | 9 | 9 | 303047 | use 5.008; | |||
| 9 | 33 | ||||||
| 9 | 372 | ||||||
| 4 | 9 | 9 | 50 | use strict; | |||
| 9 | 18 | ||||||
| 9 | 271 | ||||||
| 5 | 9 | 9 | 64 | use warnings; | |||
| 9 | 22 | ||||||
| 9 | 339 | ||||||
| 6 | 9 | 9 | 52 | no warnings "uninitialized"; | |||
| 9 | 17 | ||||||
| 9 | 353 | ||||||
| 7 | |||||||
| 8 | 9 | 9 | 70189 | use HTML::TokeParser; | |||
| 9 | 298846 | ||||||
| 9 | 370 | ||||||
| 9 | 9 | 9 | 80 | use HTML::Tagset (); | |||
| 9 | 81 | ||||||
| 9 | 167 | ||||||
| 10 | 9 | 9 | 55 | use HTML::Entities (); | |||
| 9 | 1295 | ||||||
| 9 | 159 | ||||||
| 11 | 9 | 9 | 60 | use Carp; | |||
| 9 | 18 | ||||||
| 9 | 3361 | ||||||
| 12 | 9 | 9 | 94 | use List::Util qw( first ); | |||
| 9 | 15 | ||||||
| 9 | 27969 | ||||||
| 13 | |||||||
| 14 | =head1 NAME | ||||||
| 15 | |||||||
| 16 | HTML::Truncate - (beta software) truncate HTML by percentage or character count while preserving well-formedness. | ||||||
| 17 | |||||||
| 18 | =head1 VERSION | ||||||
| 19 | |||||||
| 20 | 0.20 | ||||||
| 21 | |||||||
| 22 | =cut | ||||||
| 23 | |||||||
| 24 | our $VERSION = "0.20"; | ||||||
| 25 | |||||||
| 26 | =head1 ABSTRACT | ||||||
| 27 | |||||||
| 28 | When working with text it is common to want to truncate strings to make them fit a desired context. E.g., you might have a menu that is only 100px wide and prefer text doesn't wrap so you'd truncate it around 15-30 characters, depending on preference and typeface size. This is trivial with plain text using L |
||||||
| 29 | |||||||
| 30 | L |
||||||
| 31 | |||||||
| 32 | =head1 SYNOPSIS | ||||||
| 33 | |||||||
| 34 | use strict; | ||||||
| 35 | use HTML::Truncate; | ||||||
| 36 | |||||||
| 37 | my $html = ' We have to test something. '; |
||||||
| 38 | my $readmore = '... [readmore]'; | ||||||
| 39 | |||||||
| 40 | my $html_truncate = HTML::Truncate->new(); | ||||||
| 41 | $html_truncate->chars(20); | ||||||
| 42 | $html_truncate->ellipsis($readmore); | ||||||
| 43 | print $html_truncate->truncate($html); | ||||||
| 44 | |||||||
| 45 | # or | ||||||
| 46 | |||||||
| 47 | use Encode; | ||||||
| 48 | my $ht = HTML::Truncate->new( utf8_mode => 1, | ||||||
| 49 | chars => 1_000, | ||||||
| 50 | ); | ||||||
| 51 | print Encode::encode_utf8( $ht->truncate($html) ); | ||||||
| 52 | |||||||
| 53 | =head1 XHTML | ||||||
| 54 | |||||||
| 55 | This module is designed to work with XHTML-style nested tags. More | ||||||
| 56 | below. | ||||||
| 57 | |||||||
| 58 | =head1 WHITESPACE AND ENTITIES | ||||||
| 59 | |||||||
| 60 | Repeated natural whitespace (i.e., "\s+" and not " ") in HTML | ||||||
| 61 | -- with rare exception (pre tags or user defined styles) -- is not | ||||||
| 62 | meaningful. Therefore it is normalized when truncating. Entities are | ||||||
| 63 | also normalized. The following is only counted 14 chars long. | ||||||
| 64 | |||||||
| 65 | \n \nthis is ‘text’\n\n |
||||||
| 66 | ^^^^^^^12345----678--9------01234------^^^^^^^^ | ||||||
| 67 | |||||||
| 68 | =head1 METHODS | ||||||
| 69 | |||||||
| 70 | =over 4 | ||||||
| 71 | |||||||
| 72 | =item B |
||||||
| 73 | |||||||
| 74 | Can take all the methods as hash style args. "percent" and "chars" are | ||||||
| 75 | incompatible so don't use them both. Whichever is set most recently | ||||||
| 76 | will erase the other. | ||||||
| 77 | |||||||
| 78 | my $ht = HTML::Truncate->new(utf8_mode => 1, | ||||||
| 79 | chars => 500, # default is 100 | ||||||
| 80 | ); | ||||||
| 81 | |||||||
| 82 | =cut | ||||||
| 83 | |||||||
| 84 | our %skip = ( head => 1, | ||||||
| 85 | script => 1, | ||||||
| 86 | form => 1, | ||||||
| 87 | iframe => 1, | ||||||
| 88 | object => 1, | ||||||
| 89 | embed => 1, | ||||||
| 90 | title => 1, | ||||||
| 91 | style => 1, | ||||||
| 92 | base => 1, | ||||||
| 93 | link => 1, | ||||||
| 94 | meta => 1, | ||||||
| 95 | ); | ||||||
| 96 | |||||||
| 97 | |||||||
| 98 | sub new { | ||||||
| 99 | 9 | 9 | 1 | 12972 | my $class = shift; | ||
| 100 | |||||||
| 101 | 9 | 137 | my $self = bless | ||||
| 102 | { | ||||||
| 103 | _chars => 100, | ||||||
| 104 | _percent => undef, | ||||||
| 105 | _cleanly => qr/[\s[:punct:]]+\z/, | ||||||
| 106 | _on_space => undef, | ||||||
| 107 | _utf8_mode => undef, | ||||||
| 108 | _ellipsis => '…', | ||||||
| 109 | _raw_html => '', | ||||||
| 110 | _repair => undef, | ||||||
| 111 | _skip_tags => \%skip, | ||||||
| 112 | }, $class; | ||||||
| 113 | |||||||
| 114 | 9 | 62 | while ( my ( $k, $v ) = splice(@_, 0, 2) ) | ||||
| 115 | { | ||||||
| 116 | 2 | 50 | 7 | croak "No such method or attribute '$k'" unless exists $self->{"_$k"}; | |||
| 117 | 2 | 6 | $self->$k($v); | ||||
| 118 | } | ||||||
| 119 | 9 | 65 | return $self; | ||||
| 120 | } | ||||||
| 121 | |||||||
| 122 | =item B |
||||||
| 123 | |||||||
| 124 | Set/get, true/false. If C |
||||||
| 125 | set in the underlying L |
||||||
| 126 | with L |
||||||
| 127 | literal ellipsis and not the default of C<…>. | ||||||
| 128 | |||||||
| 129 | =cut | ||||||
| 130 | |||||||
| 131 | sub utf8_mode { | ||||||
| 132 | 193 | 193 | 1 | 620 | my $self = shift; | ||
| 133 | 193 | 100 | 348 | if ( @_ ) | |||
| 134 | { | ||||||
| 135 | 5 | 9 | $self->{_utf8_mode} = shift; | ||||
| 136 | 5 | 21 | return 1; # say we did it, even if setting untrue value | ||||
| 137 | } | ||||||
| 138 | else | ||||||
| 139 | { | ||||||
| 140 | 188 | 661 | return $self->{_utf8_mode}; | ||||
| 141 | } | ||||||
| 142 | } | ||||||
| 143 | |||||||
| 144 | =item B |
||||||
| 145 | |||||||
| 146 | Set/get. The number of characters remaining after truncation, | ||||||
| 147 | B |
||||||
| 148 | |||||||
| 149 | Entities are counted as single characters. E.g., C<©> is one | ||||||
| 150 | character for truncation counts. | ||||||
| 151 | |||||||
| 152 | Default is "100." Side-effect: clears any L that has been | ||||||
| 153 | set. | ||||||
| 154 | |||||||
| 155 | =cut | ||||||
| 156 | |||||||
| 157 | sub chars { | ||||||
| 158 | 93 | 93 | 1 | 40751 | my ( $self, $chars ) = @_; | ||
| 159 | 93 | 100 | 287 | return $self->{_chars} unless defined $chars; | |||
| 160 | 87 | 50 | 281 | $chars > 0 or croak "You must truncate to at least 1 character"; | |||
| 161 | 87 | 50 | 417 | $chars =~ /^(?:[1-9][_\d]*|0)$/ | |||
| 162 | or croak "Specified chars must be a number"; | ||||||
| 163 | 87 | 146 | $self->{_percent} = undef; # no conflict allowed | ||||
| 164 | 87 | 244 | $self->{_chars} = $chars; | ||||
| 165 | } | ||||||
| 166 | |||||||
| 167 | =item B |
||||||
| 168 | |||||||
| 169 | Set/get. A percentage to keep while truncating the rest. For a | ||||||
| 170 | document of 1,000 chars, percent('15%') and chars(150) would be | ||||||
| 171 | equivalent. The actual amount of character that the percent represents | ||||||
| 172 | cannot be known until the given HTML is parsed. | ||||||
| 173 | |||||||
| 174 | Side-effect: clears any L that has been set. | ||||||
| 175 | |||||||
| 176 | =cut | ||||||
| 177 | |||||||
| 178 | sub percent { | ||||||
| 179 | 1 | 1 | 1 | 4 | my ( $self, $percent ) = @_; | ||
| 180 | |||||||
| 181 | 1 | 50 | 33 | 9 | return unless $self->{_percent} or $percent; | ||
| 182 | |||||||
| 183 | 1 | 50 | 4 | return sprintf("%d%%", 100 * $self->{_percent}) | |||
| 184 | unless $percent; | ||||||
| 185 | |||||||
| 186 | 1 | 6 | my ( $temp_percent ) = $percent =~ /^(100|[1-9]?[0-9])\%$/; | ||||
| 187 | |||||||
| 188 | 1 | 50 | 33 | 11 | $temp_percent and $temp_percent != 0 | ||
| 189 | or croak "Specified percent is invalid '$percent' -- 1\% - 100\%"; | ||||||
| 190 | |||||||
| 191 | 1 | 3 | $self->{_chars} = undef; # no conflict allowed | ||||
| 192 | 1 | 8 | $self->{_percent} = $1 / 100; | ||||
| 193 | } | ||||||
| 194 | |||||||
| 195 | =item B |
||||||
| 196 | |||||||
| 197 | Set/get. Ellipsis in this case means -- | ||||||
| 198 | |||||||
| 199 | The omission of a word or phrase necessary for a complete | ||||||
| 200 | syntactical construction but not necessary for understanding. | ||||||
| 201 | http://www.answers.com/topic/ellipsis | ||||||
| 202 | |||||||
| 203 | What it will probably mean in most real applications is "read more." | ||||||
| 204 | The default is C<…> which if the utf8 flag is true will render | ||||||
| 205 | as a literal ellipsis, C |
||||||
| 206 | |||||||
| 207 | The reason the default is C<…> and not "..." is this is meant | ||||||
| 208 | for use in HTML environments, not plain text, and "..." (dot-dot-dot) | ||||||
| 209 | is not typographically correct or equivalent to a real horizontal | ||||||
| 210 | ellipsis character. | ||||||
| 211 | |||||||
| 212 | =cut | ||||||
| 213 | |||||||
| 214 | sub ellipsis { | ||||||
| 215 | 94 | 94 | 1 | 144 | my $self = shift; | ||
| 216 | 94 | 100 | 260 | if ( @_ ) | |||
| 100 | |||||||
| 217 | { | ||||||
| 218 | 5 | 37 | $self->{_ellipsis} = shift; | ||||
| 219 | } | ||||||
| 220 | elsif ( $self->utf8_mode() ) | ||||||
| 221 | { | ||||||
| 222 | 6 | 76 | return HTML::Entities::decode($self->{_ellipsis}); | ||||
| 223 | } | ||||||
| 224 | else | ||||||
| 225 | { | ||||||
| 226 | 83 | 178 | return $self->{_ellipsis}; | ||||
| 227 | } | ||||||
| 228 | } | ||||||
| 229 | |||||||
| 230 | =item B |
||||||
| 231 | |||||||
| 232 | It returns the truncated XHTML if asked for a return value. | ||||||
| 233 | |||||||
| 234 | my $truncated = $ht->truncate($html); | ||||||
| 235 | |||||||
| 236 | It will truncate the string in place if no return value is expected | ||||||
| 237 | (L |
||||||
| 238 | |||||||
| 239 | $ht->truncate($html); | ||||||
| 240 | print $html; | ||||||
| 241 | |||||||
| 242 | Also can be called with inline arguments- | ||||||
| 243 | |||||||
| 244 | print $ht->truncate( $html, | ||||||
| 245 | $chars_or_percent, | ||||||
| 246 | $ellipsis ); | ||||||
| 247 | |||||||
| 248 | No arguments are strictly required. Without HTML to operate upon it | ||||||
| 249 | returns undef. The two optional arguments may be preset with the | ||||||
| 250 | methods L (or L) and L. | ||||||
| 251 | |||||||
| 252 | Valid nesting of tags is required (alla XHTML). Therefore some old | ||||||
| 253 | HTML habits like E |
||||||
| 254 | and may cause a fatal error. See L for help with badly formed | ||||||
| 255 | HTML. | ||||||
| 256 | |||||||
| 257 | Certain tags are omitted by default from the truncated output. | ||||||
| 258 | |||||||
| 259 | =over 4 | ||||||
| 260 | |||||||
| 261 | =item * Skipped tags | ||||||
| 262 | |||||||
| 263 | These will not be included in truncated output by default. | ||||||
| 264 | |||||||
| 265 | ... | ||||||
| 266 | |
||||||
| 267 | |
||||||
| 268 | |||||||
| 269 | =item * Tags allowed to self-close | ||||||
| 270 | |||||||
| 271 | See L |
||||||
| 272 | |||||||
| 273 | =back | ||||||
| 274 | |||||||
| 275 | =cut | ||||||
| 276 | |||||||
| 277 | sub _chars_or_percent { | ||||||
| 278 | 0 | 0 | 0 | my ( $self, $which ) = @_; | |||
| 279 | 0 | 0 | 0 | if ( $which =~ /\%\z/ ) | |||
| 280 | { | ||||||
| 281 | 0 | 0 | $self->percent($which); | ||||
| 282 | } | ||||||
| 283 | else | ||||||
| 284 | { | ||||||
| 285 | 0 | 0 | $self->chars($which); | ||||
| 286 | } | ||||||
| 287 | } | ||||||
| 288 | |||||||
| 289 | sub truncate { | ||||||
| 290 | 97 | 97 | 1 | 4684 | my $self = shift; | ||
| 291 | 97 | 188 | $self->{_raw_html} = \$_[0]; | ||||
| 292 | 97 | 50 | 209 | shift || return; | |||
| 293 | |||||||
| 294 | 97 | 50 | 200 | $self->_chars_or_percent(+shift) if @_; | |||
| 295 | 97 | 50 | 236 | $self->ellipsis(+shift) if @_; | |||
| 296 | |||||||
| 297 | 97 | 90 | my @tag_q; | ||||
| 298 | 97 | 122 | my $renew = ""; | ||||
| 299 | 97 | 154 | my $total = 0; | ||||
| 300 | 97 | 98 | my $previous_token; | ||||
| 301 | my $next_token; | ||||||
| 302 | |||||||
| 303 | # my $tmp_ellipsis = $self->ellipsis; | ||||||
| 304 | # $tmp_ellipsis =~ s/<\w[^>]+>//g; # Naive html strip. | ||||||
| 305 | # HTML::Entities::encode($tmp_ellipsis); | ||||||
| 306 | 97 | 561 | my $chars = $self->{_chars};# + length $tmp_ellipsis; | ||||
| 307 | |||||||
| 308 | 97 | 382 | my $p = HTML::TokeParser->new( $self->{_raw_html} ); | ||||
| 309 | 97 | 13011 | $p->unbroken_text(1); | ||||
| 310 | 97 | 225 | $p->utf8_mode( $self->utf8_mode ); | ||||
| 311 | |||||||
| 312 | TOKEN: | ||||||
| 313 | 97 | 343 | while ( my $token = $p->get_token() ) | ||||
| 314 | { | ||||||
| 315 | 617 | 7916 | my @nexttoken; | ||||
| 316 | NEXT_TOKEN: | ||||||
| 317 | 617 | 1441 | while ( my $next = $p->get_token() ) | ||||
| 318 | { | ||||||
| 319 | 1439 | 8921 | push @nexttoken, $next; | ||||
| 320 | 1439 | 100 | 4485 | if ( $next->[0] eq 'S' ) | |||
| 321 | { | ||||||
| 322 | 489 | 504 | $next_token = $next; | ||||
| 323 | 489 | 684 | last NEXT_TOKEN; | ||||
| 324 | } | ||||||
| 325 | } | ||||||
| 326 | 617 | 2420 | $p->unget_token(@nexttoken); | ||||
| 327 | 617 | 100 | 3613 | $previous_token = $token if $token->[0] eq 'E'; | |||
| 328 | |||||||
| 329 | # print " Queue: ", join ":", @tag_q; print $/; | ||||||
| 330 | # print "Previous: $previous_token->[1]\n"; | ||||||
| 331 | # print " IN: $token->[1]\n"; | ||||||
| 332 | # print " Next: $next_token->[1]\n\n"; | ||||||
| 333 | |||||||
| 334 | 617 | 100 | 1519 | if ( $token->[0] eq 'S' ) | |||
| 100 | |||||||
| 50 | |||||||
| 335 | { | ||||||
| 336 | # _callback_for...? 321 | ||||||
| 337 | 253 | 451 | ( my $real_tag = $token->[1] ) =~ s,/\z,,; | ||||
| 338 | 253 | 50 | 598 | next TOKEN if $self->{_skip_tags}{$real_tag}; | |||
| 339 | 253 | 100 | 812 | push @tag_q, $token->[1] unless $HTML::Tagset::emptyElement{$real_tag}; | |||
| 340 | 253 | 1239 | $renew .= $token->[-1]; | ||||
| 341 | } | ||||||
| 342 | elsif ( $token->[0] eq 'E' ) | ||||||
| 343 | { | ||||||
| 344 | 116 | 50 | 293 | next TOKEN if $self->{_skip_tags}{$token->[1]}; | |||
| 345 | 116 | 167 | my $open = pop @tag_q; | ||||
| 346 | 116 | 208 | my $close = $token->[1]; | ||||
| 347 | 116 | 100 | 227 | unless ( $open eq $close ) | |||
| 348 | { | ||||||
| 349 | 8 | 50 | 17 | if ( $self->{_repair} ) | |||
| 350 | { | ||||||
| 351 | 8 | 8 | my @unmatched; | ||||
| 352 | 8 | 100 | 21 | push @unmatched, $open if $open; | |||
| 353 | 8 | 19 | while ( my $temp = pop @tag_q ) | ||||
| 354 | { | ||||||
| 355 | 8 | 100 | 15 | if ( $temp eq $close ) | |||
| 356 | { | ||||||
| 357 | 5 | 14 | while ( my $add = shift @unmatched ) | ||||
| 358 | { | ||||||
| 359 | 8 | 23 | $renew .= "$add>"; | ||||
| 360 | } | ||||||
| 361 | 5 | 9 | $renew .= "$temp>"; | ||||
| 362 | 5 | 24 | next TOKEN; | ||||
| 363 | } | ||||||
| 364 | else | ||||||
| 365 | { | ||||||
| 366 | 3 | 9 | push @unmatched, $temp; | ||||
| 367 | } | ||||||
| 368 | } | ||||||
| 369 | 3 | 3 | push @tag_q, reverse @unmatched; | ||||
| 370 | 3 | 12 | next TOKEN; # silently drop unmatched close tags | ||||
| 371 | } | ||||||
| 372 | else | ||||||
| 373 | { | ||||||
| 374 | 0 | 0 | my $nearby = substr($renew, | ||||
| 375 | length($renew) - 15, | ||||||
| 376 | 15); | ||||||
| 377 | 0 | 0 | croak qq|<$open> closed by $close> near "$nearby"|; | ||||
| 378 | } | ||||||
| 379 | } | ||||||
| 380 | 108 | 401 | $renew .= $token->[-1]; | ||||
| 381 | } | ||||||
| 382 | elsif ( $token->[0] eq 'T' ) | ||||||
| 383 | { | ||||||
| 384 | 248 | 50 | 498 | next TOKEN if $token->[2]; # DATA | |||
| 385 | # my $txt = HTML::Entities::decode($token->[1]); | ||||||
| 386 | 248 | 313 | my $txt = $token->[1]; | ||||
| 387 | 248 | 261 | my $current_length = 0; | ||||
| 388 | 248 | 100 | 416 | 1171 | unless ( first { $_ eq 'pre' } @tag_q ) # We're not somewhere inside a | ||
| 416 | 749 | ||||||
| 389 | { | ||||||
| 390 | 244 | 837 | $txt =~ s/\s+/ /g; | ||||
| 391 | |||||||
| 392 | 244 | 100 | 66 | 853 | if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow | ||
| 393 | and | ||||||
| 394 | ! $HTML::Tagset::isPhraseMarkup{$previous_token->[1]} | ||||||
| 395 | ) | ||||||
| 396 | { | ||||||
| 397 | 10 | 36 | $txt =~ s/\A +//; | ||||
| 398 | } | ||||||
| 399 | |||||||
| 400 | 244 | 100 | 66 | 669 | if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow | ||
| 401 | and | ||||||
| 402 | ! $HTML::Tagset::isPhraseMarkup{$next_token->[1]} | ||||||
| 403 | ) | ||||||
| 404 | { | ||||||
| 405 | 20 | 52 | $txt =~ s/ +\z//; | ||||
| 406 | } | ||||||
| 407 | 244 | 427 | $current_length = _count_visual_chars($txt); | ||||
| 408 | } | ||||||
| 409 | else | ||||||
| 410 | { | ||||||
| 411 | 4 | 5 | $current_length = length($txt); | ||||
| 412 | } | ||||||
| 413 | |||||||
| 414 | 248 | 603 | $total += $current_length; | ||||
| 415 | |||||||
| 416 | 248 | 100 | 410 | if ( $total >= $chars ) | |||
| 417 | { | ||||||
| 418 | 85 | 98 | $total -= $current_length; | ||||
| 419 | |||||||
| 420 | 85 | 101 | my $chars_to_keep = $chars - $total; | ||||
| 421 | 85 | 105 | my $keep = ""; | ||||
| 422 | 85 | 100 | 178 | if ( $self->on_space ) | |||
| 423 | { | ||||||
| 424 | 26 | 727 | ( $keep ) = $txt =~ /\A(.{0,$chars_to_keep}\s?)(?=\s|\z)/; | ||||
| 425 | 26 | 90 | $keep =~ s/\s+\z//; | ||||
| 426 | } | ||||||
| 427 | else | ||||||
| 428 | { | ||||||
| 429 | 59 | 117 | $keep = substr($txt, 0, $chars_to_keep); | ||||
| 430 | } | ||||||
| 431 | |||||||
| 432 | 85 | 100 | 202 | if ( my $cleaner = $self->cleanly ) | |||
| 433 | { | ||||||
| 434 | 56 | 241 | $keep =~ s/$cleaner//; | ||||
| 435 | } | ||||||
| 436 | |||||||
| 437 | 85 | 100 | 194 | if ( $keep ) | |||
| 438 | { | ||||||
| 439 | # $renew .= $self->utf8_mode ? | ||||||
| 440 | # $keep : HTML::Entities::encode($keep); | ||||||
| 441 | 68 | 108 | $renew .= $keep; | ||||
| 442 | } | ||||||
| 443 | |||||||
| 444 | 85 | 194 | $renew .= $self->ellipsis(); | ||||
| 445 | 85 | 199 | last TOKEN; | ||||
| 446 | } | ||||||
| 447 | else | ||||||
| 448 | { | ||||||
| 449 | 163 | 791 | $renew .= $token->[1]; | ||||
| 450 | } | ||||||
| 451 | } | ||||||
| 452 | } # TOKEN block ends | ||||||
| 453 | |||||||
| 454 | 97 | 303 | $renew .= join('', map {"$_>"} reverse @tag_q); | ||||
| 125 | 349 | ||||||
| 455 | |||||||
| 456 | 97 | 50 | 194 | if ( defined wantarray ) | |||
| 457 | { | ||||||
| 458 | 97 | 1323 | return $renew; | ||||
| 459 | } | ||||||
| 460 | else | ||||||
| 461 | { | ||||||
| 462 | 0 | 0 | ${$self->{_raw_html}} = $renew; | ||||
| 0 | 0 | ||||||
| 463 | } | ||||||
| 464 | } | ||||||
| 465 | |||||||
| 466 | =item B |
||||||
| 467 | |||||||
| 468 | Put one or more new tags into the list of those to be omitted from | ||||||
| 469 | truncated output. An example of when you might like to use this is if | ||||||
| 470 | you're thumb-nailing articles and they start with C<< title>> |
||||||
| 471 | or such before the article body. The heading level would be absurd | ||||||
| 472 | with a list of excerpts so you could drop it completely this way-- | ||||||
| 473 | |||||||
| 474 | $ht->add_skip_tags( 'h1' ); | ||||||
| 475 | |||||||
| 476 | =cut | ||||||
| 477 | |||||||
| 478 | sub add_skip_tags { | ||||||
| 479 | 0 | 0 | 1 | 0 | my $self = shift; | ||
| 480 | 0 | 0 | for ( @_ ) | ||||
| 481 | { | ||||||
| 482 | 0 | 0 | 0 | croak "Args to add_skip_tags must be scalar tag names, not references" | |||
| 483 | if ref $_; | ||||||
| 484 | 0 | 0 | $self->{_skip_tags}{$_} = 1; | ||||
| 485 | } | ||||||
| 486 | } | ||||||
| 487 | |||||||
| 488 | =item B |
||||||
| 489 | |||||||
| 490 | Takes tags out of the current list to be omitted from truncated output. | ||||||
| 491 | |||||||
| 492 | =cut | ||||||
| 493 | |||||||
| 494 | sub dont_skip_tags { | ||||||
| 495 | 0 | 0 | 1 | 0 | my $self = shift; | ||
| 496 | 0 | 0 | for ( @_ ) | ||||
| 497 | { | ||||||
| 498 | 0 | 0 | 0 | croak "Args to dont_skip_tags must be scalar tag names, not references" | |||
| 499 | if ref $_; | ||||||
| 500 | 0 | 0 | 0 | carp "$_ was not set to be skipped" | |||
| 501 | unless delete $self->{_skip_tags}{$_}; | ||||||
| 502 | } | ||||||
| 503 | } | ||||||
| 504 | |||||||
| 505 | =item B |
||||||
| 506 | |||||||
| 507 | Set/get, true/false. If true, will attempt to repair unclosed HTML | ||||||
| 508 | tags by adding close-tags as late as possible (eg. C<< | ||||||
| 509 | foobar >> becomes C<< foobar >>). Unmatched | ||||||
| 510 | close tags are dropped (C<< foobar >> becomes C<< foobar >>). | ||||||
| 511 | |||||||
| 512 | =cut | ||||||
| 513 | |||||||
| 514 | sub repair { | ||||||
| 515 | 8 | 8 | 1 | 15 | my $self = shift; | ||
| 516 | 8 | 100 | 22 | if ( @_ ) | |||
| 517 | { | ||||||
| 518 | 3 | 8 | $self->{_repair} = shift; | ||||
| 519 | 3 | 7 | return 1; # say we did it, even if untrue value | ||||
| 520 | } | ||||||
| 521 | else | ||||||
| 522 | { | ||||||
| 523 | 5 | 24 | return $self->{_repair}; | ||||
| 524 | } | ||||||
| 525 | } | ||||||
| 526 | |||||||
| 527 | sub _load_chars_from_percent { | ||||||
| 528 | 0 | 0 | 0 | my $self = shift; | |||
| 529 | 0 | 0 | my $p = HTML::TokeParser->new( $self->{_raw_html} ); | ||||
| 530 | 0 | 0 | my $txt_length = 0; | ||||
| 531 | |||||||
| 532 | CHARS: | ||||||
| 533 | 0 | 0 | while ( my $token = $p->get_token ) | ||||
| 534 | { | ||||||
| 535 | # don't check padding b/c we're going by a document average | ||||||
| 536 | 0 | 0 | 0 | 0 | next unless $token->[0] eq 'T' and not $token->[2]; # Not data. | ||
| 537 | 0 | 0 | $txt_length += _count_visual_chars( $token->[1] ); | ||||
| 538 | } | ||||||
| 539 | 0 | 0 | $self->chars( int( $txt_length * $self->{_percent} ) ); | ||||
| 540 | } | ||||||
| 541 | |||||||
| 542 | sub _count_visual_chars { # private function | ||||||
| 543 | 244 | 244 | 1068 | my $to_count = HTML::Entities::decode_entities(+shift); | |||
| 544 | 244 | 442 | $to_count =~ s/\s\s+/ /g; | ||||
| 545 | 244 | 314 | $to_count =~ s/[^[:print:]]+//g; | ||||
| 546 | # my $count = () = | ||||||
| 547 | # $to_count =~ | ||||||
| 548 | # /\&\#\d+;|\&[[:alpha:]]{2,5};|\S|\s+/g; | ||||||
| 549 | # return $count; | ||||||
| 550 | 244 | 414 | return length($to_count); | ||||
| 551 | } | ||||||
| 552 | |||||||
| 553 | # Need to put hooks for these or not? 321 | ||||||
| 554 | #sub _default_image_callback { | ||||||
| 555 | # sub { | ||||||
| 556 | # '[image]' | ||||||
| 557 | # } | ||||||
| 558 | #} | ||||||
| 559 | |||||||
| 560 | =item B |
||||||
| 561 | |||||||
| 562 | This will make the truncation back up to the first space it finds so | ||||||
| 563 | it doesn't truncate in the the middle of a word. L runs | ||||||
| 564 | before L if both are set. | ||||||
| 565 | |||||||
| 566 | =cut | ||||||
| 567 | |||||||
| 568 | sub on_space { | ||||||
| 569 | 86 | 86 | 1 | 113 | my $self = shift; | ||
| 570 | 86 | 100 | 160 | if ( @_ ) | |||
| 571 | { | ||||||
| 572 | 1 | 2 | $self->{_on_space} = shift; | ||||
| 573 | 1 | 7 | return 1; # say we did it, even if setting untrue value | ||||
| 574 | } | ||||||
| 575 | else | ||||||
| 576 | { | ||||||
| 577 | 85 | 214 | return $self->{_on_space}; | ||||
| 578 | } | ||||||
| 579 | } | ||||||
| 580 | |||||||
| 581 | |||||||
| 582 | =item B |
||||||
| 583 | |||||||
| 584 | Set/get -- a regular expression. This is on by default and the default | ||||||
| 585 | cleaning regular expression is C |
||||||
| 586 | will make the truncation strip any trailing spacing and punctuation so | ||||||
| 587 | you don't get things like "The End...." or "What? ..." You can cancel | ||||||
| 588 | it with C<$ht-E |
||||||
| 589 | expression. | ||||||
| 590 | |||||||
| 591 | =cut | ||||||
| 592 | |||||||
| 593 | sub cleanly { | ||||||
| 594 | 87 | 87 | 1 | 167 | my $self = shift; | ||
| 595 | 87 | 100 | 158 | if ( @_ ) | |||
| 596 | { | ||||||
| 597 | 2 | 5 | $self->{_cleanly} = shift; | ||||
| 598 | 2 | 18 | return 1; # say we did it, even if setting untrue value | ||||
| 599 | } | ||||||
| 600 | else | ||||||
| 601 | { | ||||||
| 602 | 85 | 298 | return $self->{_cleanly}; | ||||
| 603 | } | ||||||
| 604 | } | ||||||
| 605 | |||||||
| 606 | =back | ||||||
| 607 | |||||||
| 608 | =head1 COOKBOOK (well, a recipe) | ||||||
| 609 | |||||||
| 610 | =head2 Template Toolkit filter | ||||||
| 611 | |||||||
| 612 | For excerpting HTML in your Templates. Note the L which | ||||||
| 613 | is set to drop any images from the truncated output. | ||||||
| 614 | |||||||
| 615 | use Template; | ||||||
| 616 | use HTML::Truncate; | ||||||
| 617 | |||||||
| 618 | my %config = | ||||||
| 619 | ( | ||||||
| 620 | FILTERS => { | ||||||
| 621 | truncate_html => [ \&truncate_html_filter_factory, 1 ], | ||||||
| 622 | }, | ||||||
| 623 | ); | ||||||
| 624 | |||||||
| 625 | my $tt = Template->new(\%config) or die $Template::ERROR; | ||||||
| 626 | |||||||
| 627 | # ... etc ... | ||||||
| 628 | |||||||
| 629 | sub truncate_html_filter_factory { | ||||||
| 630 | my ( $context, $len, $ellipsis ) = @_; | ||||||
| 631 | $len = 32 unless $len; | ||||||
| 632 | $ellipsis = chr(8230) unless defined $ellipsis; | ||||||
| 633 | my $ht = HTML::Truncate->new(); | ||||||
| 634 | $ht->add_skip_tags(qw( img )); | ||||||
| 635 | return sub { | ||||||
| 636 | my $html = shift || return ''; | ||||||
| 637 | return $ht->truncate( $html, $len, $ellipsis ); | ||||||
| 638 | } | ||||||
| 639 | } | ||||||
| 640 | |||||||
| 641 | Then in your templates you can do things like this: | ||||||
| 642 | |||||||
| 643 | [% FOR item IN search_results %] | ||||||
| 644 | |
||||||
| 645 | [% item.title %] |
||||||
| 646 | [% item.body | truncate_html(200) %] | ||||||
| 647 | |||||||
| 648 | [% END %] | ||||||
| 649 | |||||||
| 650 | See also L |
||||||
| 651 | |||||||
| 652 | =head1 AUTHOR | ||||||
| 653 | |||||||
| 654 | Ashley Pond V, C<< |
||||||
| 655 | |||||||
| 656 | =head1 LIMITATIONS | ||||||
| 657 | |||||||
| 658 | There may be places where this will break down right now. I'll pad out possible edge cases as I find them or they are sent to me via the CPAN bug ticket system. | ||||||
| 659 | |||||||
| 660 | =head2 This is not an HTML filter | ||||||
| 661 | |||||||
| 662 | Although this happens to do some crude HTML filtering to achieve its end, it is not a fully featured filter. If you are looking for one, check out L |
||||||
| 663 | |||||||
| 664 | =head1 BUGS, FEEDBACK, PATCHES | ||||||
| 665 | |||||||
| 666 | Please report any bugs or feature requests to | ||||||
| 667 | C |
||||||
| 668 | L |
||||||
| 669 | will get the ticket, and then you'll automatically be notified of | ||||||
| 670 | progress as I make changes. | ||||||
| 671 | |||||||
| 672 | =head2 TO DO | ||||||
| 673 | |||||||
| 674 | Write a couple more tests (percent and skip stuff) then take out beta notice. Try to make the 5.6 stuff work without decode...? Try a C |
||||||
| 675 | |||||||
| 676 | Write an L |
||||||
| 677 | |||||||
| 678 | =head1 THANKS TO | ||||||
| 679 | |||||||
| 680 | Kevin Riggle for the L functionality; patch, Pod, and tests. | ||||||
| 681 | |||||||
| 682 | Lorenzo Iannuzzi for the L functionality. | ||||||
| 683 | |||||||
| 684 | =head1 SEE ALSO | ||||||
| 685 | |||||||
| 686 | L |
||||||
| 687 | |||||||
| 688 | L |
||||||
| 689 | |||||||
| 690 | =head1 COPYRIGHT & LICENSE | ||||||
| 691 | |||||||
| 692 | Copyright (E |
||||||
| 693 | |||||||
| 694 | This program is free software; you can redistribute it or modify it or both under the same terms as Perl itself. | ||||||
| 695 | |||||||
| 696 | =head1 DISCLAIMER OF WARRANTY | ||||||
| 697 | |||||||
| 698 | Because this software is licensed free of charge, there is no warranty | ||||||
| 699 | for the software, to the extent permitted by applicable law. Except | ||||||
| 700 | when otherwise stated in writing the copyright holders or other | ||||||
| 701 | parties provide the software "as is" without warranty of any kind, | ||||||
| 702 | either expressed or implied, including, but not limited to, the | ||||||
| 703 | implied warranties of merchantability and fitness for a particular | ||||||
| 704 | purpose. The entire risk as to the quality and performance of the | ||||||
| 705 | software is with you. Should the software prove defective, you assume | ||||||
| 706 | the cost of all necessary servicing, repair, or correction. | ||||||
| 707 | |||||||
| 708 | In no event unless required by applicable law or agreed to in writing | ||||||
| 709 | will any copyright holder, or any other party who may modify and/or | ||||||
| 710 | redistribute the software as permitted by the above licence, be liable | ||||||
| 711 | to you for damages, including any general, special, incidental, or | ||||||
| 712 | consequential damages arising out of the use or inability to use the | ||||||
| 713 | software (including but not limited to loss of data or data being | ||||||
| 714 | rendered inaccurate or losses sustained by you or third parties or a | ||||||
| 715 | failure of the software to operate with any other software), even if | ||||||
| 716 | such holder or other party has been advised of the possibility of such | ||||||
| 717 | damages. | ||||||
| 718 | |||||||
| 719 | =cut | ||||||
| 720 | |||||||
| 721 | 1; | ||||||
| 722 | |||||||
| 723 |