File Coverage

blib/lib/HTML/Truncate.pm

Criterion	Covered	Total	%
statement	151	174	86.7
branch	59	84	70.2
condition	6	15	40.0
subroutine	20	24	83.3
pod	11	11	100.0
total	247	308	80.1

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Truncate;
2
3	9			9		303047	use 5.008;
	9					33
	9					372
4	9			9		50	use strict;
	9					18
	9					271
5	9			9		64	use warnings;
	9					22
	9					339
6	9			9		52	no warnings "uninitialized";
	9					17
	9					353
7
8	9			9		70189	use HTML::TokeParser;
	9					298846
	9					370
9	9			9		80	use HTML::Tagset ();
	9					81
	9					167
10	9			9		55	use HTML::Entities ();
	9					1295
	9					159
11	9			9		60	use Carp;
	9					18
	9					3361
12	9			9		94	use List::Util qw( first );
	9					15
	9					27969
13
14							=head1 NAME
15
16							HTML::Truncate - (beta software) truncate HTML by percentage or character count while preserving well-formedness.
17
18							=head1 VERSION
19
20							0.20
21
22							=cut
23
24							our $VERSION = "0.20";
25
26							=head1 ABSTRACT
27
28							When working with text it is common to want to truncate strings to make them fit a desired context. E.g., you might have a menu that is only 100px wide and prefer text doesn't wrap so you'd truncate it around 15-30 characters, depending on preference and typeface size. This is trivial with plain text using L but with HTML it is somewhat difficult because whitespace has fluid significance and open tags that are not properly closed destroy well-formedness and can wreck an entire layout.
29
30							L attempts to account for those two problems by padding truncation for spacing and entities and closing any tags that remain open at the point of truncation.
31
32							=head1 SYNOPSIS
33
34							use strict;
35							use HTML::Truncate;
36
37							my $html = ' We have to test something. ';
38							my $readmore = '... [readmore]';
39
40							my $html_truncate = HTML::Truncate->new();
41							$html_truncate->chars(20);
42							$html_truncate->ellipsis($readmore);
43							print $html_truncate->truncate($html);
44
45							# or
46
47							use Encode;
48							my $ht = HTML::Truncate->new( utf8_mode => 1,
49							chars => 1_000,
50							);
51							print Encode::encode_utf8( $ht->truncate($html) );
52
53							=head1 XHTML
54
55							This module is designed to work with XHTML-style nested tags. More
56							below.
57
58							=head1 WHITESPACE AND ENTITIES
59
60							Repeated natural whitespace (i.e., "\s+" and not " ") in HTML
61							-- with rare exception (pre tags or user defined styles) -- is not
62							meaningful. Therefore it is normalized when truncating. Entities are
63							also normalized. The following is only counted 14 chars long.
64
65							\n \nthis is ‘text’\n\n
66							^^^^^^^12345----678--9------01234------^^^^^^^^
67
68							=head1 METHODS
69
70							=over 4
71
72							=item B
73
74							Can take all the methods as hash style args. "percent" and "chars" are
75							incompatible so don't use them both. Whichever is set most recently
76							will erase the other.
77
78							my $ht = HTML::Truncate->new(utf8_mode => 1,
79							chars => 500, # default is 100
80							);
81
82							=cut
83
84							our %skip = ( head => 1,
85							script => 1,
86							form => 1,
87							iframe => 1,
88							object => 1,
89							embed => 1,
90							title => 1,
91							style => 1,
92							base => 1,
93							link => 1,
94							meta => 1,
95							);
96
97
98							sub new {
99	9			9	1	12972	my $class = shift;
100
101	9					137	my $self = bless
102							{
103							_chars => 100,
104							_percent => undef,
105							_cleanly => qr/[\s[:punct:]]+\z/,
106							_on_space => undef,
107							_utf8_mode => undef,
108							_ellipsis => '…',
109							_raw_html => '',
110							_repair => undef,
111							_skip_tags => \%skip,
112							}, $class;
113
114	9					62	while ( my ( $k, $v ) = splice(@_, 0, 2) )
115							{
116	2	50				7	croak "No such method or attribute '$k'" unless exists $self->{"_$k"};
117	2					6	$self->$k($v);
118							}
119	9					65	return $self;
120							}
121
122							=item B
123
124							Set/get, true/false. If C is set, C is also
125							set in the underlying L, entities will be transformed
126							with L and the default ellipsis will be a
127							literal ellipsis and not the default of C<…>.
128
129							=cut
130
131							sub utf8_mode {
132	193			193	1	620	my $self = shift;
133	193	100				348	if ( @_ )
134							{
135	5					9	$self->{_utf8_mode} = shift;
136	5					21	return 1; # say we did it, even if setting untrue value
137							}
138							else
139							{
140	188					661	return $self->{_utf8_mode};
141							}
142							}
143
144							=item B
145
146							Set/get. The number of characters remaining after truncation,
147							B the L.
148
149							Entities are counted as single characters. E.g., C<©> is one
150							character for truncation counts.
151
152							Default is "100." Side-effect: clears any L that has been
153							set.
154
155							=cut
156
157							sub chars {
158	93			93	1	40751	my ( $self, $chars ) = @_;
159	93	100				287	return $self->{_chars} unless defined $chars;
160	87	50				281	$chars > 0 or croak "You must truncate to at least 1 character";
161	87	50				417	$chars =~ /^(?:[1-9][_\d]*\|0)$/
162							or croak "Specified chars must be a number";
163	87					146	$self->{_percent} = undef; # no conflict allowed
164	87					244	$self->{_chars} = $chars;
165							}
166
167							=item B
168
169							Set/get. A percentage to keep while truncating the rest. For a
170							document of 1,000 chars, percent('15%') and chars(150) would be
171							equivalent. The actual amount of character that the percent represents
172							cannot be known until the given HTML is parsed.
173
174							Side-effect: clears any L that has been set.
175
176							=cut
177
178							sub percent {
179	1			1	1	4	my ( $self, $percent ) = @_;
180
181	1	50	33			9	return unless $self->{_percent} or $percent;
182
183	1	50				4	return sprintf("%d%%", 100 * $self->{_percent})
184							unless $percent;
185
186	1					6	my ( $temp_percent ) = $percent =~ /^(100\|[1-9]?[0-9])\%$/;
187
188	1	50	33			11	$temp_percent and $temp_percent != 0
189							or croak "Specified percent is invalid '$percent' -- 1\% - 100\%";
190
191	1					3	$self->{_chars} = undef; # no conflict allowed
192	1					8	$self->{_percent} = $1 / 100;
193							}
194
195							=item B
196
197							Set/get. Ellipsis in this case means --
198
199							The omission of a word or phrase necessary for a complete
200							syntactical construction but not necessary for understanding.
201							http://www.answers.com/topic/ellipsis
202
203							What it will probably mean in most real applications is "read more."
204							The default is C<…> which if the utf8 flag is true will render
205							as a literal ellipsis, C.
206
207							The reason the default is C<…> and not "..." is this is meant
208							for use in HTML environments, not plain text, and "..." (dot-dot-dot)
209							is not typographically correct or equivalent to a real horizontal
210							ellipsis character.
211
212							=cut
213
214							sub ellipsis {
215	94			94	1	144	my $self = shift;
216	94	100				260	if ( @_ )
		100
217							{
218	5					37	$self->{_ellipsis} = shift;
219							}
220							elsif ( $self->utf8_mode() )
221							{
222	6					76	return HTML::Entities::decode($self->{_ellipsis});
223							}
224							else
225							{
226	83					178	return $self->{_ellipsis};
227							}
228							}
229
230							=item B
231
232							It returns the truncated XHTML if asked for a return value.
233
234							my $truncated = $ht->truncate($html);
235
236							It will truncate the string in place if no return value is expected
237							(L is not defined).
238
239							$ht->truncate($html);
240							print $html;
241
242							Also can be called with inline arguments-
243
244							print $ht->truncate( $html,
245							$chars_or_percent,
246							$ellipsis );
247
248							No arguments are strictly required. Without HTML to operate upon it
249							returns undef. The two optional arguments may be preset with the
250							methods L (or L) and L.
251
252							Valid nesting of tags is required (alla XHTML). Therefore some old
253							HTML habits like EpE without a E/pE are not supported
254							and may cause a fatal error. See L for help with badly formed
255							HTML.
256
257							Certain tags are omitted by default from the truncated output.
258
259							=over 4
260
261							=item * Skipped tags
262
263							These will not be included in truncated output by default.
264
265							... ...
266							...
267
268
269							=item * Tags allowed to self-close
270
271							See L in L.
272
273							=back
274
275							=cut
276
277							sub _chars_or_percent {
278	0			0		0	my ( $self, $which ) = @_;
279	0	0				0	if ( $which =~ /\%\z/ )
280							{
281	0					0	$self->percent($which);
282							}
283							else
284							{
285	0					0	$self->chars($which);
286							}
287							}
288
289							sub truncate {
290	97			97	1	4684	my $self = shift;
291	97					188	$self->{_raw_html} = \$_[0];
292	97	50				209	shift \|\| return;
293
294	97	50				200	$self->_chars_or_percent(+shift) if @_;
295	97	50				236	$self->ellipsis(+shift) if @_;
296
297	97					90	my @tag_q;
298	97					122	my $renew = "";
299	97					154	my $total = 0;
300	97					98	my $previous_token;
301							my $next_token;
302
303							# my $tmp_ellipsis = $self->ellipsis;
304							# $tmp_ellipsis =~ s/<\w[^>]+>//g; # Naive html strip.
305							# HTML::Entities::encode($tmp_ellipsis);
306	97					561	my $chars = $self->{_chars};# + length $tmp_ellipsis;
307
308	97					382	my $p = HTML::TokeParser->new( $self->{_raw_html} );
309	97					13011	$p->unbroken_text(1);
310	97					225	$p->utf8_mode( $self->utf8_mode );
311
312							TOKEN:
313	97					343	while ( my $token = $p->get_token() )
314							{
315	617					7916	my @nexttoken;
316							NEXT_TOKEN:
317	617					1441	while ( my $next = $p->get_token() )
318							{
319	1439					8921	push @nexttoken, $next;
320	1439	100				4485	if ( $next->[0] eq 'S' )
321							{
322	489					504	$next_token = $next;
323	489					684	last NEXT_TOKEN;
324							}
325							}
326	617					2420	$p->unget_token(@nexttoken);
327	617	100				3613	$previous_token = $token if $token->[0] eq 'E';
328
329							# print " Queue: ", join ":", @tag_q; print $/;
330							# print "Previous: $previous_token->[1]\n";
331							# print " IN: $token->[1]\n";
332							# print " Next: $next_token->[1]\n\n";
333
334	617	100				1519	if ( $token->[0] eq 'S' )
		100
		50
335							{
336							# _callback_for...? 321
337	253					451	( my $real_tag = $token->[1] ) =~ s,/\z,,;
338	253	50				598	next TOKEN if $self->{_skip_tags}{$real_tag};
339	253	100				812	push @tag_q, $token->[1] unless $HTML::Tagset::emptyElement{$real_tag};
340	253					1239	$renew .= $token->[-1];
341							}
342							elsif ( $token->[0] eq 'E' )
343							{
344	116	50				293	next TOKEN if $self->{_skip_tags}{$token->[1]};
345	116					167	my $open = pop @tag_q;
346	116					208	my $close = $token->[1];
347	116	100				227	unless ( $open eq $close )
348							{
349	8	50				17	if ( $self->{_repair} )
350							{
351	8					8	my @unmatched;
352	8	100				21	push @unmatched, $open if $open;
353	8					19	while ( my $temp = pop @tag_q )
354							{
355	8	100				15	if ( $temp eq $close )
356							{
357	5					14	while ( my $add = shift @unmatched )
358							{
359	8					23	$renew .= "";
360							}
361	5					9	$renew .= "";
362	5					24	next TOKEN;
363							}
364							else
365							{
366	3					9	push @unmatched, $temp;
367							}
368							}
369	3					3	push @tag_q, reverse @unmatched;
370	3					12	next TOKEN; # silently drop unmatched close tags
371							}
372							else
373							{
374	0					0	my $nearby = substr($renew,
375							length($renew) - 15,
376							15);
377	0					0	croak qq\|<$open> closed by near "$nearby"\|;
378							}
379							}
380	108					401	$renew .= $token->[-1];
381							}
382							elsif ( $token->[0] eq 'T' )
383							{
384	248	50				498	next TOKEN if $token->[2]; # DATA
385							# my $txt = HTML::Entities::decode($token->[1]);
386	248					313	my $txt = $token->[1];
387	248					261	my $current_length = 0;
388	248	100		416		1171	unless ( first { $_ eq 'pre' } @tag_q ) # We're not somewhere inside a
	416					749
389							{
390	244					837	$txt =~ s/\s+/ /g;
391
392	244	100	66			853	if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow
393							and
394							! $HTML::Tagset::isPhraseMarkup{$previous_token->[1]}
395							)
396							{
397	10					36	$txt =~ s/\A +//;
398							}
399
400	244	100	66			669	if ( ! $HTML::Tagset::isPhraseMarkup{$tag_q[-1]} # in flow
401							and
402							! $HTML::Tagset::isPhraseMarkup{$next_token->[1]}
403							)
404							{
405	20					52	$txt =~ s/ +\z//;
406							}
407	244					427	$current_length = _count_visual_chars($txt);
408							}
409							else
410							{
411	4					5	$current_length = length($txt);
412							}
413
414	248					603	$total += $current_length;
415
416	248	100				410	if ( $total >= $chars )
417							{
418	85					98	$total -= $current_length;
419
420	85					101	my $chars_to_keep = $chars - $total;
421	85					105	my $keep = "";
422	85	100				178	if ( $self->on_space )
423							{
424	26					727	( $keep ) = $txt =~ /\A(.{0,$chars_to_keep}\s?)(?=\s\|\z)/;
425	26					90	$keep =~ s/\s+\z//;
426							}
427							else
428							{
429	59					117	$keep = substr($txt, 0, $chars_to_keep);
430							}
431
432	85	100				202	if ( my $cleaner = $self->cleanly )
433							{
434	56					241	$keep =~ s/$cleaner//;
435							}
436
437	85	100				194	if ( $keep )
438							{
439							# $renew .= $self->utf8_mode ?
440							# $keep : HTML::Entities::encode($keep);
441	68					108	$renew .= $keep;
442							}
443
444	85					194	$renew .= $self->ellipsis();
445	85					199	last TOKEN;
446							}
447							else
448							{
449	163					791	$renew .= $token->[1];
450							}
451							}
452							} # TOKEN block ends
453
454	97					303	$renew .= join('', map {""} reverse @tag_q);
	125					349
455
456	97	50				194	if ( defined wantarray )
457							{
458	97					1323	return $renew;
459							}
460							else
461							{
462	0					0	${$self->{_raw_html}} = $renew;
	0					0
463							}
464							}
465
466							=item B
467
468							Put one or more new tags into the list of those to be omitted from
469							truncated output. An example of when you might like to use this is if
470							you're thumb-nailing articles and they start with C<< title >>
471							or such before the article body. The heading level would be absurd
472							with a list of excerpts so you could drop it completely this way--
473
474							$ht->add_skip_tags( 'h1' );
475
476							=cut
477
478							sub add_skip_tags {
479	0			0	1	0	my $self = shift;
480	0					0	for ( @_ )
481							{
482	0	0				0	croak "Args to add_skip_tags must be scalar tag names, not references"
483							if ref $_;
484	0					0	$self->{_skip_tags}{$_} = 1;
485							}
486							}
487
488							=item B
489
490							Takes tags out of the current list to be omitted from truncated output.
491
492							=cut
493
494							sub dont_skip_tags {
495	0			0	1	0	my $self = shift;
496	0					0	for ( @_ )
497							{
498	0	0				0	croak "Args to dont_skip_tags must be scalar tag names, not references"
499							if ref $_;
500	0	0				0	carp "$_ was not set to be skipped"
501							unless delete $self->{_skip_tags}{$_};
502							}
503							}
504
505							=item B
506
507							Set/get, true/false. If true, will attempt to repair unclosed HTML
508							tags by adding close-tags as late as possible (eg. C<<
509							foobar >> becomes C<< foobar >>). Unmatched
510							close tags are dropped (C<< foobar >> becomes C<< foobar >>).
511
512							=cut
513
514							sub repair {
515	8			8	1	15	my $self = shift;
516	8	100				22	if ( @_ )
517							{
518	3					8	$self->{_repair} = shift;
519	3					7	return 1; # say we did it, even if untrue value
520							}
521							else
522							{
523	5					24	return $self->{_repair};
524							}
525							}
526
527							sub _load_chars_from_percent {
528	0			0		0	my $self = shift;
529	0					0	my $p = HTML::TokeParser->new( $self->{_raw_html} );
530	0					0	my $txt_length = 0;
531
532							CHARS:
533	0					0	while ( my $token = $p->get_token )
534							{
535							# don't check padding b/c we're going by a document average
536	0	0	0			0	next unless $token->[0] eq 'T' and not $token->[2]; # Not data.
537	0					0	$txt_length += _count_visual_chars( $token->[1] );
538							}
539	0					0	$self->chars( int( $txt_length * $self->{_percent} ) );
540							}
541
542							sub _count_visual_chars { # private function
543	244			244		1068	my $to_count = HTML::Entities::decode_entities(+shift);
544	244					442	$to_count =~ s/\s\s+/ /g;
545	244					314	$to_count =~ s/[^[:print:]]+//g;
546							# my $count = () =
547							# $to_count =~
548							# /\&\#\d+;\|\&[[:alpha:]]{2,5};\|\S\|\s+/g;
549							# return $count;
550	244					414	return length($to_count);
551							}
552
553							# Need to put hooks for these or not? 321
554							#sub _default_image_callback {
555							# sub {
556							# '[image]'
557							# }
558							#}
559
560							=item B
561
562							This will make the truncation back up to the first space it finds so
563							it doesn't truncate in the the middle of a word. L runs
564							before L if both are set.
565
566							=cut
567
568							sub on_space {
569	86			86	1	113	my $self = shift;
570	86	100				160	if ( @_ )
571							{
572	1					2	$self->{_on_space} = shift;
573	1					7	return 1; # say we did it, even if setting untrue value
574							}
575							else
576							{
577	85					214	return $self->{_on_space};
578							}
579							}
580
581
582							=item B
583
584							Set/get -- a regular expression. This is on by default and the default
585							cleaning regular expression is C. It
586							will make the truncation strip any trailing spacing and punctuation so
587							you don't get things like "The End...." or "What? ..." You can cancel
588							it with C<$ht-Ecleanly(undef)> or provide your own regular
589							expression.
590
591							=cut
592
593							sub cleanly {
594	87			87	1	167	my $self = shift;
595	87	100				158	if ( @_ )
596							{
597	2					5	$self->{_cleanly} = shift;
598	2					18	return 1; # say we did it, even if setting untrue value
599							}
600							else
601							{
602	85					298	return $self->{_cleanly};
603							}
604							}
605
606							=back
607
608							=head1 COOKBOOK (well, a recipe)
609
610							=head2 Template Toolkit filter
611
612							For excerpting HTML in your Templates. Note the L which
613							is set to drop any images from the truncated output.
614
615							use Template;
616							use HTML::Truncate;
617
618							my %config =
619							(
620							FILTERS => {
621							truncate_html => [ \&truncate_html_filter_factory, 1 ],
622							},
623							);
624
625							my $tt = Template->new(\%config) or die $Template::ERROR;
626
627							# ... etc ...
628
629							sub truncate_html_filter_factory {
630							my ( $context, $len, $ellipsis ) = @_;
631							$len = 32 unless $len;
632							$ellipsis = chr(8230) unless defined $ellipsis;
633							my $ht = HTML::Truncate->new();
634							$ht->add_skip_tags(qw( img ));
635							return sub {
636							my $html = shift \|\| return '';
637							return $ht->truncate( $html, $len, $ellipsis );
638							}
639							}
640
641							Then in your templates you can do things like this:
642
643							[% FOR item IN search_results %]
644
645							[% item.title %]
646							[% item.body \| truncate_html(200) %]
647
648							[% END %]
649
650							See also L.
651
652							=head1 AUTHOR
653
654							Ashley Pond V, C<< >>.
655
656							=head1 LIMITATIONS
657
658							There may be places where this will break down right now. I'll pad out possible edge cases as I find them or they are sent to me via the CPAN bug ticket system.
659
660							=head2 This is not an HTML filter
661
662							Although this happens to do some crude HTML filtering to achieve its end, it is not a fully featured filter. If you are looking for one, check out L and L.
663
664							=head1 BUGS, FEEDBACK, PATCHES
665
666							Please report any bugs or feature requests to
667							C, or through the web interface at
668							L. I
669							will get the ticket, and then you'll automatically be notified of
670							progress as I make changes.
671
672							=head2 TO DO
673
674							Write a couple more tests (percent and skip stuff) then take out beta notice. Try to make the 5.6 stuff work without decode...? Try a C method?
675
676							Write an L based version to load when possible...? Or make that part of L?
677
678							=head1 THANKS TO
679
680							Kevin Riggle for the L functionality; patch, Pod, and tests.
681
682							Lorenzo Iannuzzi for the L functionality.
683
684							=head1 SEE ALSO
685
686							L, L, the "truncate" filter in L

File Coverage

title