File Coverage

blib/lib/HTML/FormatText/WithLinks/AndTables.pm

Criterion	Covered	Total	%
statement	107	108	99.0
branch	30	36	83.3
condition	8	10	80.0
subroutine	8	8	100.0
pod	2	3	66.6
total	155	165	93.9

line

stmt

bran

cond

sub

pod

time

code

package HTML::FormatText::WithLinks::AndTables;

16182

use strict;

162

use warnings;

250

our $VERSION = '0.07'; # VERSION

use base 'HTML::FormatText::WithLinks';

3186

252582

use HTML::TreeBuilder;

################################################################################

# configuration defaults

################################################################################

my $cellpadding = 1; # number of horizontal spaces to pad interior of <td> cells

my $no_rowspacing = 0; # boolean, suppress space between table rows and rows with empty <td>s

################################################################################

=head1 NAME

HTML::FormatText::WithLinks::AndTables - Converts HTML to Text with tables intact

=head1 VERSION

version 0.07

=cut

=head1 SYNOPSIS

use HTML::FormatText::WithLinks::AndTables;

my $text = HTML::FormatText::WithLinks::AndTables->convert($html);

Or optionally...

my $conf = { # same as HTML::FormatText excepting below

cellpadding => 2, # defaults to 1

no_rowspacing => 1, # bool, suppress vertical space between table rows

};

my $text = HTML::FormatText::WithLinks::AndTables->convert($html, $conf);

=head1 DESCRIPTION

This module was inspired by HTML::FormatText::WithLinks which has proven to be a

useful `lynx -dump` work-alike. However one frustration was that no other HTML

converters I came across had the ability to deal effectively with HTML <table>s.

This module can in a rudimentary sense do so. The aim was to provide facility to take

a simple HTML based email template, and to also convert it to text with the

structure intact for inclusion as "multipart/alternative" content. Further, it will

preserve both the formatting specified by the <td> tag's "align" attribute, and will

also preserve multiline text inside of a <td> element provided it is broken using

<br/> tags.

=head2 EXPORT

None by default.

=head1 METHODS

=head2 convert

=cut

my $parser_indent = 3; # HTML::FormatText::WithLinks adds this indent to data in each <td>

67							my $conf_defaults = {};
68
69							# the one and only public interface
70							sub convert {
71	6	50		6	1	123	shift if $_[0] eq __PACKAGE__; # to make it function friendly
72	6					12	my ($html, $conf) = @_;
73
74							# over-ride our defaults
75	6	50	33			53	if ($conf and ref $conf eq 'HASH') {
76	6	50				24	$no_rowspacing = $$conf{no_rowspacing} if $$conf{no_rowspacing};
77	6					12	delete $$conf{no_rowspacing};
78	6	100				25	$cellpadding = $$conf{cellpadding} if $$conf{cellpadding};
79	6					8	delete $$conf{cellpadding};
80	6					34	%$conf_defaults = (%$conf_defaults, %$conf);
81							}
82
83	6					65	return __PACKAGE__->new->parse($html);
84							}
85
86							# sub-class configure
87							sub configure {
88							# SUPER::configure actually modifies the hash, so we need to pass a copy
89	18			18	0	422	my %configure = %$conf_defaults;
90
91	18					66	shift()->SUPER::configure(\%configure);
92							}
93
94							# sub-class parse
95							sub parse {
96
97	6			6	1	382	my $self = shift;
98	6					10	my $html = shift;
99
100	6	50				19	return unless defined $html;
101	6	50				19	return '' if $html eq '';
102
103	6					47	my $tree = HTML::TreeBuilder->new->parse( $html );
104	6					10968	return $self->_format_tables( $tree ); # we work our magic...
105
106							}
107
108							# a private method
109							sub _format_tables {
110	6			6		17	my $self = shift;
111	6					8	my $tree = shift;
112
113	6					15	my $formatted_tables = []; # a nested stack for our formatted table text
114
115							# the result of an all night programming session...
116							#
117							# essentially we take two passes over each table
118							# and modify the structure of text and html by replacing <td> content with tokens
119							# then replacing the tokens after _parse() has converted it to text
120							#
121							# for each <tr> of each <table>
122					# we grab all its <td> inner text (and/or parsed html), rearrange it into a
123					# single string of formatted text, and put a token into its first <td>
124					# once we have processed the html with _parse(), we replace the tokens with the
125					# corresponding formatted text
126
127	6			74	my @tables = $tree->look_down(_tag=>'table');
128	6			640	my $table_count = 0;
129	6			12	for my $table (@tables) {
130	6			14	$formatted_tables->[$table_count] = [];
131	6			21	my @trs = $table->look_down(_tag=>'tr');
132	6			358	my @max_col_width; # max column widths by index
133					my @max_col_heights; # max column heights (for multi-line text) by index
134	0			0	my @col_lines; # a stack for our redesigned rows of column (	) text
135					FIRST_PASS: {
136	6			10	my $row_count = 0; # obviously a counter...
	6			13
137	6			13	for my $tr (@trs) { # *** 1st pass over rows
138	9			25	$max_col_heights[$row_count] = 0;
139	9			12	$col_lines[$row_count] = [];
140	9			51	my @cols = $tr->look_down(_tag=>qr/^(td\|th)$/); # no support for	. sorry.
141	9			499	for (my $i = 0; $i < scalar @cols; $i++) {
142	12			44	my $td = $cols[$i]->clone;
143	12			453	my $new_tree = HTML::TreeBuilder->new;
144	12			1578	$new_tree->{_content} = [ $td ];
145					# parse the contents of the td into text
146					# this doesn't work well with nested tables...
147	12			50	my $text = __PACKAGE__->new->_parse($new_tree);
148					# we don't want leading or trailing whitespace
149	12			46398	$text =~ s/\xA0+/ /s; # -> space
150	12			47	$text =~ s/^\s+//s;
151	12			35	$text =~ s/\s+\z//s;
152					# now we figure out the maximum widths and heights needed for each column
153	12			84	my $max_line_width = 0;
154	12			38	my @lines = split "\n", $text; # take the parsed text and break it into virtual rows
155	12	100		38	$max_col_heights[$row_count] = scalar @lines if scalar @lines > $max_col_heights[$row_count];
156	12			20	for my $line (@lines) {
157	25			20	my $line_width = length $line;
158	25	100		38	$max_line_width = $line_width if $line_width > $max_line_width;
159					}
160	12			29	$cols[$i]->{_content} = [ $text ];
161	12		100	58	$max_col_width[$i] \|\|= 0;
162	12	100		27	$max_col_width[$i] = $max_line_width if $max_line_width > $max_col_width[$i];
163					# now put the accumulated lines onto our stack
164	12			167	$col_lines[$row_count]->[$i] = \@lines;
165					}
166	9			14	$tr->{_content} = \@cols;
167	9			19	$row_count++;
168					}
169					}
170
171					SECOND_PASS: {
172	6			8	my $row_count = 0; # obviously, another counter...
	6			9
173	6			11	for my $tr (@trs) { # *** 2nd pass over rows
174	9			49	my @cols = $tr->look_down(_tag=>qr/^(td\|th)$/); # no support for	. sorry.
175
176	9			423	my $row_text; # the final string representing each row of reformatted text
177
178					my @col_rows; # a stack for each virtual $new_line spliced together from a group of	's
179
180					# iterate over each column of the maximum rows of parsed multiline text per
181					# for each virtual row of each virtual column, concat the text with alignment spacings
182					# the final concatenated string value will be placed in column 0
183	9			35	for (my $j = 0; $j < $max_col_heights[$row_count]; $j++) {
184	14			12	my $new_line;
185	14			26	for (my $i = 0; $i < scalar @cols; $i++) { # here are the actual	elements we're iterating over...
186	26			26	my $width = $max_col_width[$i] + $cellpadding; # how wide is this column of text
187	26			23	my $line = $col_lines[$row_count]->[$i]->[$j]; # get the text to fit into it
188	26	100		33	$line = defined $line ? $line : '';
189
190					# strip the whitespace from beginning and end of each line
191	26			44	$line =~ s/^\s+//gs;
192	26			31	$line =~ s/\s+\z//gs;
193	26			22	my $n_space = $width - length $line; # the difference between the column and text widths
194
195					# we are creating virtual rows of text within a single <td>
196					# so we need to add an indent to all but the first row to
197					# match the indent added by _parse() for presenting table contents
198	26	100	100	68	$line = ((' ')x$parser_indent). $line if $j != 0 and $i == 0;
199
200					# here we adjust the text alignment by wrapping the text in occulted whitespace
201	26	100	100	49	my $justify = $cols[$i]->tag eq 'td' ? ( $cols[$i]->attr('align') \|\| 'left' ) : 'center';
202	26	100		284	if ($justify eq 'center') {
		100
203	1			4	my $pre = int( ($n_space + $cellpadding) / 2 ); # divide remaining space in half
204	1			1	my $post = $n_space - $pre; # assign any uneven remainder to the end
205	1			6	$new_line .= ((' ')x$pre). $line .((' ')x$post); # wrap the text in spaces
206					} elsif ($justify eq 'left') {
207	15			49	$new_line .= ((' ')x$cellpadding). $line .((' ')x$n_space);
208					} else {
209	10			24	$new_line .= ((' ')x$n_space). $line .((' ')x$cellpadding);
210					}
211					}
212	14	100		32	$new_line .= "\n" if $j != $max_col_heights[$row_count] - 1; # add a newline to all but the last text row
213	14			26	$col_rows[$j] = $new_line; # put the line into the stack for this row
214					}
215	9			26	$row_text .= $_ for @col_rows;
216	9			33	for (my $i = 1; $i < scalar @cols; $i++) {
217	4			7	$cols[$i]->delete; # get rid of unneeded	's
218					}
219					# put the fully formatted text into our accumulator
220	9			98	$formatted_tables->[$table_count]->[$row_count] = $row_text;
221	9	100		20	if (scalar @cols) {
222	8			45	$cols[0]->content->[0] = "__TOKEN__${table_count}__${row_count}__"; # place a token into the row at col 0
223					}
224	9			38	$row_count++;
225					}
226					}
227	6			19	$table_count++;
228					}
229
230					# now replace our tokens
231	6			24	my $text = $self->_parse( $tree );
232	6			15071	for (my $i = 0; $i < scalar @$formatted_tables; $i++) {
233	6			13	for (my $j = 0; $j < scalar @{ $$formatted_tables[$i] }; $j++) {
	15			51
234	9			24	my $token = "__TOKEN__${i}__${j}__";
235	9	50		21	$token .= "\n?" if $no_rowspacing;
236	9			14	my $new_text = $$formatted_tables[$i][$j];
237	9	100		20	if (defined $new_text) {
238	6			80	$text =~ s/$token/$new_text/;
239					}
240					else {
241	3			37	$text =~ s/$token//;
242					}
243					}
244					}
245
246	6			85	return $text;
247					}
248
249					1;
250					__END__