File Coverage

blib/lib/HTML/FormatText/WithLinks/AndTables.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line

stmt

bran

cond

sub

pod

time

code

package HTML::FormatText::WithLinks::AndTables;

39135

use strict;

use warnings;

use base 'HTML::FormatText::WithLinks';

1340

1901

use HTML::TreeBuilder;

################################################################################

# configuration defaults

################################################################################

my $cellpadding = 1; # number of horizontal spaces to pad interior of <TD> cells

my $no_rowspacing = 0; # boolean, suppress space between table rows and rows with empty <TD>s

################################################################################

=head1 NAME

HTML::FormatText::WithLinks::AndTables - Converts HTML to Text with tables intact

=head1 VERSION

Version 0.02

=cut

our $VERSION = '0.02';

=head1 SYNOPSIS

use HTML::FormatText::WithLinks::AndTables;

my $text = HTML::FormatText::WithLinks::AndTables->convert($html);

Or optionally...

my $conf = { # same as HTML::FormatText excepting below

cellpadding => 2, # defaults to 1

no_rowspacing => 1, # bool, suppress vertical space between table rows

};

my $text = HTML::FormatText::WithLinks::AndTables->convert($html, $conf);

=head1 DESCRIPTION

This module was inspired by HTML::FormatText::WithLinks which has proven to be a

useful `lynx -dump` work-alike. However one frustration was that no other HTML

converters I came across had the ability to deal effectively with HTML <table>s.

This module can in a rudimentary sense do so. The aim was to provide facility to take

a simple HTML based email template, and to also convert it to text with the

structure intact for inclusion as "multipart/alternative" content. Further, it will

preserve both the formatting specified by the <td> tag's "align" attribute, and will

also preserve multiline text inside of a <td> element provided it is broken using

<br/> tags.

=head2 EXPORT

None by default.

=head1 METHODS

=head2 convert

=cut

my $parser_indent = 3; # HTML::FormatText::WithLinks adds this indent to data in each <TD>

67	my $conf_defaults = {};
68
69	# the one and only public interface
# The one and only public interface: convert an HTML string to plain text.
#
# May be called as a class method or as a plain function:
#   HTML::FormatText::WithLinks::AndTables->convert($html, \%conf)
#   HTML::FormatText::WithLinks::AndTables::convert($html, \%conf)
#
# Parameters:
#   $html - the HTML source to convert
#   $conf - optional hashref; "cellpadding" and "no_rowspacing" are consumed
#           here, every other key is merged into $conf_defaults and handed to
#           the parent class by configure()
#
# Returns whatever parse() returns for $html.
#
# NOTE: $cellpadding, $no_rowspacing and $conf_defaults are file-level
# lexicals, so settings supplied on one call remain in effect for later
# calls until they are overridden again.
sub convert {
    # to make it function friendly; the defined/ref guards avoid an
    # "uninitialized value" warning when $html itself is undef
    shift if defined($_[0]) && !ref($_[0]) && $_[0] eq __PACKAGE__;
    my ($html, $conf) = @_;

    # over-ride our defaults
    if ($conf and ref $conf eq 'HASH') {
        # work on a copy so the caller's hash is left untouched
        my %opts = %$conf;

        my $rowspacing_opt = delete $opts{no_rowspacing};
        my $padding_opt    = delete $opts{cellpadding};

        # test definedness rather than truth so an explicit 0 is honoured
        # (cellpadding => 0, or switching no_rowspacing back off)
        $no_rowspacing = $rowspacing_opt if defined $rowspacing_opt;
        $cellpadding   = $padding_opt    if defined $padding_opt;

        %$conf_defaults = (%$conf_defaults, %opts);
    }

    return __PACKAGE__->new->parse($html);
}
85
86	# sub-class configure
# Sub-classed configure: any arguments passed in are ignored; the parent
# class is always configured from the file-level $conf_defaults hashref
# that convert() populates.
sub configure {
    my $self = shift;
    return $self->SUPER::configure($conf_defaults);
}
90
91	# sub-class parse
# Sub-classed parse: build an HTML tree from $html and return its text
# rendering with tables laid out by _format_tables().
#
# Parameters:
#   $html - HTML source string
#
# Returns undef when $html is undefined, '' when it is the empty string,
# otherwise the formatted text.
sub parse {
    my $self = shift;
    my $html = shift;

    return undef unless defined $html;
    return '' if $html eq '';

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    # parse() does not finish the document on its own; eof() must be called
    # before the tree is inspected or trailing content may be missing
    $tree->eof;

    my $text = $self->_format_tables($tree);    # we work our magic...

    # release the tree explicitly now that the text has been extracted
    $tree->delete;

    return $text;
}
104
105	# a private method
# Private: render $tree to text, laying out every <table> as aligned columns.
#
# Each table is processed in two passes and the tree is modified in place:
#
#   1st pass - every <td> is rendered to text on its own (via a fresh
#              formatter's _parse()), split into lines, and the widest line
#              per column / tallest cell per row are recorded.
#   2nd pass - for every <tr> the cell lines are stitched together into one
#              padded, aligned block of text; all <td>s but the first are
#              deleted and the first one receives a placeholder token.
#
# Finally the whole tree is rendered with _parse() and each token is replaced
# by the corresponding formatted row text.
#
# Parameters:
#   $tree - an HTML::TreeBuilder tree
#
# Returns the formatted text.
#
# Limitations carried over from the original: <th> cells are not laid out,
# and nested tables are not handled well.
sub _format_tables {
    my $self = shift;
    my $tree = shift;

    my $formatted_tables = [];    # [table index][row index] => formatted row text

    my @tables      = $tree->look_down(_tag => 'table');
    my $table_count = 0;
    for my $table (@tables) {
        $formatted_tables->[$table_count] = [];
        my @trs = $table->look_down(_tag => 'tr');
        my @max_col_width;      # max column widths by column index
        my @max_col_heights;    # max cell heights (for multi-line text) by row index
        my @col_lines;          # [row][col] => arrayref of that cell's text lines

        FIRST_PASS: {
            for my $row_count (0 .. $#trs) {    # *** 1st pass over rows
                my $tr = $trs[$row_count];
                $max_col_heights[$row_count] = 0;
                $col_lines[$row_count]       = [];
                my @cols = $tr->look_down(_tag => 'td');    # no support for <th>. sorry.
                for my $i (0 .. $#cols) {
                    my $td       = $cols[$i]->clone;
                    my $new_tree = HTML::TreeBuilder->new;
                    $new_tree->{_content} = [ $td ];
                    # parse the contents of the td into text
                    # this doesn't work well with nested tables...
                    my $text = __PACKAGE__->new->_parse($new_tree);
                    $text = '' unless defined $text;
                    # we don't want leading or trailing whitespace
                    $text =~ s/^\s+//s;
                    $text =~ s/\s+\z//s;
                    # now we figure out the maximum widths and heights needed for each column
                    my $max_line_width = 0;
                    my @lines = split "\n", $text;    # break the parsed text into virtual rows
                    $max_col_heights[$row_count] = scalar @lines
                        if scalar @lines > $max_col_heights[$row_count];
                    for my $line (@lines) {
                        my $line_width = length $line;
                        $max_line_width = $line_width if $line_width > $max_line_width;
                    }
                    $cols[$i]->{_content} = [ $text ];
                    $max_col_width[$i] ||= 0;
                    $max_col_width[$i] = $max_line_width
                        if $max_line_width > $max_col_width[$i];
                    # now put the accumulated lines onto our stack
                    $col_lines[$row_count]->[$i] = \@lines;
                }
                # rows without any <td> (e.g. a header row made of <th>) are
                # left untouched so the generic formatter can still render them
                $tr->{_content} = \@cols if @cols;
            }
        }

        SECOND_PASS: {
            for my $row_count (0 .. $#trs) {    # *** 2nd pass over rows
                my $tr   = $trs[$row_count];
                my @cols = $tr->look_down(_tag => 'td');    # no support for <th>. sorry.

                # nothing to lay out and no cell to hold a token; the original
                # code died here on $cols[0] being undefined
                next unless @cols;

                my @col_rows;    # one entry per virtual line spliced together from the row's <td>s

                # for each virtual line of the tallest cell in this row, concatenate
                # the matching line of every column with alignment spacing
                for my $j (0 .. $max_col_heights[$row_count] - 1) {
                    my $new_line = '';
                    for my $i (0 .. $#cols) {
                        # how wide is this column of text
                        my $width = ($max_col_width[$i] || 0) + $cellpadding;
                        # get the text to fit into it
                        my $line = $col_lines[$row_count]->[$i]->[$j];
                        $line = '' unless defined $line;

                        # strip the whitespace from beginning and end of each line
                        $line =~ s/^\s+//s;
                        $line =~ s/\s+\z//s;
                        # the difference between the column and text widths
                        my $n_space = $width - length $line;

                        # we are creating virtual rows of text within a single <td>,
                        # so we need to add an indent to all but the first row to
                        # match the indent added by _parse() for presenting table contents
                        $line = (' ' x $parser_indent) . $line if $j != 0 and $i == 0;

                        # adjust the text alignment by wrapping the text in whitespace;
                        # the attribute is lower-cased so ALIGN="CENTER" etc. also work
                        my $justify = $cols[$i]->tag eq 'td'
                            ? lc( $cols[$i]->attr('align') || 'left' )
                            : 'center';
                        if ($justify eq 'center') {
                            my $pre  = int( ($n_space + $cellpadding) / 2 );    # divide remaining space in half
                            my $post = $n_space - $pre;    # assign any uneven remainder to the end
                            $new_line .= (' ' x $pre) . $line . (' ' x $post);
                        }
                        elsif ($justify eq 'left') {
                            $new_line .= (' ' x $cellpadding) . $line . (' ' x $n_space);
                        }
                        else {
                            $new_line .= (' ' x $n_space) . $line . (' ' x $cellpadding);
                        }
                    }
                    $col_rows[$j] = $new_line;
                }

                # a newline between all text rows but not after the last one;
                # this is the empty string (not undef) when every cell was empty
                my $row_text = join "\n", @col_rows;

                # get rid of the now unneeded <td>s
                $_->delete for @cols[ 1 .. $#cols ];

                # put the fully formatted text into our accumulator
                $formatted_tables->[$table_count]->[$row_count] = $row_text;
                # place a token into the row at col 0
                $cols[0]->content->[0] = "__TOKEN__${table_count}__${row_count}__";
            }
        }
        $table_count++;
    }

    # now replace our tokens
    my $text = $self->_parse($tree);
    for my $i (0 .. $#{$formatted_tables}) {
        for my $j (0 .. $#{ $formatted_tables->[$i] }) {
            my $new_text = $formatted_tables->[$i][$j];
            next unless defined $new_text;    # row was skipped, no token was placed
            my $token = "__TOKEN__${i}__${j}__";
            $token .= "\n?" if $no_rowspacing;
            $text =~ s/$token/$new_text/;
        }
    }

    return $text;
}
237
238	1;
239	__END__