File Coverage

blib/lib/CGI/Application/MailPage.pm
Criterion Covered Total %
statement 313 382 81.9
branch 121 194 62.3
condition 33 51 64.7
subroutine 28 30 93.3
pod 1 7 14.2
total 496 664 74.7


line stmt bran cond sub pod time code
1             package CGI::Application::MailPage;
2 1     1   17894 use base 'CGI::Application';
  1         3  
  1         1084  
3 1     1   5752 use strict;
  1         3  
  1         32  
4 1     1   16 use CGI::Application;
  1         2  
  1         18  
5 1     1   5 use File::Spec;
  1         1  
  1         19  
6 1     1   1528 use HTML::Template;
  1         14665  
  1         42  
7 1     1   1281 use HTML::TreeBuilder;
  1         31355  
  1         11  
8 1     1   35 use HTTP::Date;
  1         1  
  1         75  
9 1     1   1186 use MIME::Entity;
  1         91584  
  1         37  
10 1     1   10 use Mail::Header;
  1         2  
  1         21  
11 1     1   7 use Mail::Internet;
  1         2  
  1         22  
12 1     1   1006 use Email::Valid;
  1         101543  
  1         38  
13 1     1   961 use Net::SMTP;
  1         20625  
  1         79  
14 1     1   1535 use Text::Format;
  1         5012  
  1         44  
15 1     1   10 use URI;
  1         2  
  1         25  
16 1     1   1207 use Data::FormValidator;
  1         32144  
  1         63  
17 1     1   13 use Fcntl qw(:flock);
  1         3  
  1         8973  
18              
19             $CGI::Application::MailPage::VERSION = '1.7';
20              
21             sub setup {
22 25     25 1 4849051 my $self = shift;
23 25         90 $self->start_mode('show_form');
24 25         250 $self->mode_param('rm');
25 25         449 $self->run_modes(
26             show_form => \&show_form,
27             send_mail => \&send_mail,
28             );
29              
30             # make sure we have required params
31 25 50 33     606 die "You must set either 'document_root' or 'remote_fetch' in PARAMS!"
32             unless defined $self->param('document_root') || $self->param('remote_fetch');
33              
34 25 50       497 die "You must set 'your.smtp.server' in PARAMS!"
35             unless defined $self->param('smtp_server');
36              
37             # default custom validation_profile is an empty hash
38 25 100       409 $self->param(validation_profile => {} )
39             unless defined($self->param('validation_profile'));
40             }
41              
42             sub show_form {
43 5     5 0 154 my ($self, $err_msgs) = @_;
44 5         16 my $query = $self->query;
45              
46 5         48 my $page = $query->param('page');
47 5 50       105 if (not defined $page) {
48 0 0       0 unless($self->param('use_page_param')) {
49 0         0 $page = $query->referer();
50 0 0       0 return $self->error(
51             "Sorry, I can't tell what page you want to send. " .
52             "You need to be using either Netscape 4 or Internet Explorer 4 (or newer) " .
53             "to use this feature. Please upgrade your browser and try again!"
54             )
55             unless defined $page;
56             } else {
57 0 0       0 return $self->error("no value for page param!")
58             unless defined $page;
59             }
60             }
61              
62 5         8 my $template;
63 5 50       17 if ($self->param('form_template')) {
64 0         0 $template = $self->load_tmpl($self->param('form_template'),
65             die_on_bad_params => 0,
66             cache => 1,
67             associate => $query
68             );
69            
70             } else {
71 5         80 my @path = $self->tmpl_path;
72 5 50       51 @path = @{ $self->tmpl_path} if(ref($path[0]) eq 'ARRAY');
  0         0  
73 5         40 $template = $self->load_tmpl('CGI/Application/MailPage/templates/form.tmpl',
74             die_on_bad_params => 0,
75             path => [@path, @INC],
76             cache => 1,
77             associate => $query,
78             );
79             }
80              
81 5         9155 my %formats = (
82             both_attachment => 'Both Text and Full HTML as Attachments',
83             html => 'Full HTML',
84             html_attachment => 'Full HTML as an Attachment',
85             text => 'Plain Text',
86             text_attachment => 'Plain Text as an Attachment',
87             url => 'Just A Link',
88             );
89             #create the default dropdown menu
90 5         155 $template->param(FORMAT_SELECTOR =>
91             $query->popup_menu(-name => 'format',
92             '-values' => [ sort(keys %formats) ],
93             -labels => \%formats,
94             -default => 'both_attachment',
95             )
96             );
97             #create a loop that the user can use as they wish
98 5         12998 my @format_loop = ();
99 5         31 foreach my $key (sort(keys(%formats))) {
100 30         90 push(@format_loop, { value => $key, label => $formats{$key}});
101             }
102 5         23 $template->param(FORMAT_OPTIONS => \@format_loop);
103            
104             # set the default 'subject' as the email subject
105 5 100       172 $query->param('subject' => $self->param('email_subject'))
106             unless($query->param('subject'));
107             # if we have any alerts or error messages
108 5 100 66     172 if( $err_msgs && ref $err_msgs eq 'HASH' ) {
109 4         28 $template->param(%$err_msgs);
110             }
111 5 100       439 $template->param(%{$self->param('extra_tmpl_params')})
  1         26  
112             if($self->param('extra_tmpl_params'));
113 5         105 return $template->output();
114             }
115              
116             sub send_mail {
117 24     24 0 4169 my $self = shift;
118 24         71 my $query = $self->query;
119              
120             # the default validation profile
121             my %validation_profile = (
122             required => [qw(
123             name from_email to_emails format page subject
124             )],
125             optional => [qw(note)],
126             constraints => {
127             name => qr/^[\w '-\(\),\.]{1,50}$/,
128             from_email => 'email',
129             to_emails => sub {
130 23     23   34714 my @emails = split(/\s*,\s*|\s+/, shift);
131             # make sure there aren't too many
132             # if we have 'max_emails_per_request'
133 23 100 100     113 if(
134             $self->param('max_emails_per_request')
135             && scalar @emails > $self->param('max_emails_per_request')
136             ) {
137 1         35 return;
138             }
139             # check for valid email addresses
140 22         527 foreach my $email (@emails) {
141 42 100       7259 if(! Email::Valid->address($email) ) {
142 2         157 return;
143             }
144             }
145 20         9422 return \@emails;
146             },
147             subject => qr/^[\w '-\(\),\.\?\!]{1,50}$/,
148             note => qr/^[^\0]{1,250}$/,
149             format => sub {
150 22     22   32427 my $val = shift;
151 22         89 my @valid_formats = qw(
152             both_attachment html html_attachment
153             text text_attachment url
154             );
155 22 100       117 if( grep { $val eq $_ } @valid_formats ) {
  132         297  
156 21         83 return $val;
157             }
158 1         4 return;
159             },
160 24         780 page => qr/^[^\n\0]{1,256}$/, # TODO - what should this be?
161             },
162             untaint_all_constraints => 1,
163             );
164              
165             # merge this default with the custom profile
166             # first merge the 'constraints'
167 24 100       93 if( $self->param('validation_profile')->{constraints} ) {
168 1         5 $validation_profile{constraints} = {
169 1         4 %{$validation_profile{constraints}},
170 1         15 %{ $self->param('validation_profile')->{constraints} }
171             };
172             }
173 24         592 delete $self->param('validation_profile')->{constraints};
174             # now merge the rest
175 24         75 %validation_profile = (
176             %validation_profile,
177 24         343 %{ $self->param('validation_profile') },
178             );
179              
180             # now validate the data
181 24         808 my $results = Data::FormValidator->check($query, \%validation_profile);
182              
183             # create any error messages if necessary.
184 24 100 66     12018 if( $results->has_invalid || $results->has_missing ) {
185 4         32 my %err_msgs = ();
186             # look at each invalid
187 4         18 foreach my $invalid ($results->invalid) {
188 13         59 $err_msgs{"error_$invalid"} = 1;
189 13         34 $err_msgs{"invalid_$invalid"} = 1;
190 13         18 $err_msgs{"any_errors"} = 1;
191 13         24 $err_msgs{"any_invalid"} = 1;
192             }
193             # look at each missing
194 4         23 foreach my $missing ($results->missing) {
195 5         26 $err_msgs{"error_$missing"} = 1;
196 5         12 $err_msgs{"missing_$missing"} = 1;
197 5         10 $err_msgs{"any_errors"} = 1;
198 5         11 $err_msgs{"any_missing"} = 1;
199             }
200              
201             # for backwards compatability, add an 'alert' parameter
202             # for older templates that hold's the first error message we encounter
203 4 100       41 if( $err_msgs{error_name} ) {
    50          
    50          
    50          
    0          
    0          
204 3         10 $err_msgs{alert} = "Please fill in your name in the form below.";
205             } elsif( $err_msgs{invalid_from_email} ) {
206 0         0 $err_msgs{alert} = "Your email address is invalid - it should look like name\@host.com.";
207             } elsif( $err_msgs{missing_from_email} ) {
208 0         0 $err_msgs{alert} = "Please fill in your email address in the form below.";
209             } elsif( $err_msgs{invalid_to_emails} ) {
210 1         3 $err_msgs{alert} = "One of your friend's email addresses is invalid - it should look like name\@host.com.";
211             } elsif( $err_msgs{missing_to_emails} ) {
212 0         0 $err_msgs{alert} = "Please fill in your friends' email addresses in the form below.";
213             } elsif( $err_msgs{error_subject} ) {
214 0         0 $err_msgs{alert} = "Please enter a Subject for the email in the form below.";
215             }
216              
217             # show these errors
218 4         16 return $self->show_form(\%err_msgs);
219             }
220              
221             # get the valid data
222 20         412 my $valid_data = $results->valid();
223 140         302 my ($to_emails, $from_email, $name, $subject, $page, $note, $format) =
224 20         209 map { $valid_data->{$_} } qw(to_emails from_email name subject page note format);
225            
226             #make sure this page is either relative or within the acceptable_domains
227 20 50 66     88 if(
      66        
228             $self->param('acceptable_domains') #if we have any domains
229             && (ref($self->param('acceptable_domains')) eq 'ARRAY') #if it's an array ref
230             && $page =~ m#^https?://([^/:]+)# #if the path's not relative
231             )
232             {
233 2         81 my $domain = $1;
234 2         38 return $self->error("The domain for that desired page is not acceptable!")
235 2 100       3 unless( grep { lc($domain) eq lc($_) } @{$self->param('acceptable_domains')});
  2         8  
236             }
237            
238             # make sure we haven't exceeded our hourly limit
239 19 100       337 if( $self->param('max_emails_per_hour') ) {
240 4         63 my $file = $self->param('max_emails_per_hour_file');
241 4 50       59 unless( $file ) {
242 0         0 die "max_emails_per_hour_file ($file) must exist and be writable"
243             . " in order to use max_emails_per_hour!";
244             }
245 4         7 my ($error, $count, $last_time);
246 4         8 my $current_time = time();
247             # if already exists then open it and read the data
248 4 50       109 if( -e $file ) {
249 4 50       201 open(my $IN, $file) or die "Could not open $file for reading! $!";
250 4         105 ($last_time, $count) = split(qr/:/, <$IN>);
251 4 50       54 close($IN) or die "Could not close $file! $!";
252 4   100     18 $last_time ||= 0;
253 4   100     12 $count ||= 0;
254             # find out if we've done this within the hour
255             # if the difference is less than 1 hour, increase the count
256             # and make sure it's less than the hourly total
257 4 100       18 if( $current_time - $last_time < ( 60 * 60 ) ) {
258 2         6 $count += scalar(@$to_emails);
259 2 50       9 if( $count > $self->param('max_emails_per_hour') ) {
260 2         36 $error = "Hourly limit on emails exceeded!";
261             }
262             # keep the last recorded time.
263 2         9 $current_time = $last_time;
264             } else {
265 2         10 $count = scalar(@$to_emails);
266             }
267             # else the file doesn't exist
268             } else {
269 0         0 $count = scalar(@$to_emails);
270             }
271              
272             # now save the time and count
273 4 50       311 open(my $OUT, ">", $file) or die "Could not open $file for writing! $!";
274 4 50       38 flock($OUT, LOCK_EX) or die "Could not obtain lock on $file! $!";
275 4         24 print $OUT "$current_time:$count";
276 4 50       176 close($OUT) or die "Could not close $file! $!";
277              
278             # if we have an error then return it
279 4 100       20 return $self->error($error) if( $error );
280             }
281              
282             # find the HTML file to open (if it's not a remote fetch)
283 17         563 my ($filename, $base_url, $base);
284 17 100 66     59 unless( $self->param('remote_fetch') && ($page =~ m!^https?://!) ) {
285 10         179 $filename = $self->_find_html_file($page);
286 10 50 66     620 return $self->error("Unable to find file $filename for page $page (might be empty or unreadable): $!")
      66        
287             unless -e $filename and -r _ and -s _;
288 7         121 my ($vol, $dir, $file) = File::Spec->splitpath($filename);
289              
290 7         15 $base_url = $page;
291 7         52 $base_url =~ s/\Q$file\E//;
292            
293             # if file is empty, assume index.html
294 7 50 33     45 if (not defined $file or not length $file) {
295 0         0 $file = 'index.html';
296 0         0 $filename .= '/index.html';
297             }
298            
299 7         11 my $ext;
300 7         47 ($base, $ext) = $file =~ /(.*)\.([^\.]+)$/;
301             } else {
302 7         294 $base_url = URI->new($page);
303 7         8050 $base_url = $base_url->scheme . '://' . $base_url->authority . '/' . $base_url->path;
304             }
305              
306             # open the email template
307 14         476 my $template;
308 14 50       49 if ($self->param('email_template')) {
309 0         0 $template = $self->load_tmpl($self->param('email_template'),
310             die_on_bad_params => 0,
311             cache => 1,
312             );
313             } else {
314 14         260 my @path = $self->tmpl_path;
315 14 50       176 @path = @{ $self->tmpl_path} if(ref($path[0]) eq 'ARRAY');
  0         0  
316 14         288 $template = $self->load_tmpl('CGI/Application/MailPage/templates/email.tmpl',
317             die_on_bad_params => 0,
318             path => [@path, @INC],
319             cache => 1,
320             );
321             }
322 14         10833 $template->param(%$valid_data);
323 14 50       1182 $template->param(%{$self->param('extra_tmpl_params')})
  0         0  
324             if($self->param('extra_tmpl_params'));
325              
326             # get the IP address of the original sender
327 14   50     327 my $sender_ip = $ENV{HTTP_X_FORWARDED_FOR} || $ENV{REMOTE_ADDR} || '';
328             # $msg will end up with either a Mail::Internet or MIME::Entity object.
329 14         24 my $msg;
330              
331             # are we doing attachments?
332 14 100       54 if (index($format, '_attachment') != -1) {
333             # open up a MIME::Entity for our msg
334 7         72 $msg = MIME::Entity->build(
335             'Type' => "multipart/mixed",
336             'From' => "$name <$from_email>",
337             'Reply-To' => "$name <$from_email>",
338             'To' => $to_emails,
339             'Subject' => $subject,
340             'Date' => HTTP::Date::time2str(time()),
341             'X-Originating-Ip' => $sender_ip,
342             );
343              
344 7         14455 $msg->attach(Data => $template->output);
345              
346             # attach the straight HTML if requested
347 7 100       8296 if ($format =~ /^(both|html)/) {
348 5         11 my $buffer = "";
349 5 50 66     26 if ($self->param('read_file_callback')) {
    100          
350 0         0 my $callback = $self->param('read_file_callback');
351 0         0 $buffer = $callback->($filename);
352             } elsif( $self->param('remote_fetch') && ($page =~ /^https?:\/\//) ) {
353             #fetch this page with LWP
354 3         131 require LWP::UserAgent;
355 3         17 require HTTP::Request;
356 3         31 my $agent = LWP::UserAgent->new();
357 3         4929 my $response = $agent->request(HTTP::Request->new(GET => $page));
358 3 100       400494 if( $response->is_success ) {
359 2         41 $buffer = $response->content();
360             } else {
361 1         22 return $self->error("Unable to retrieve remote page $page");
362             }
363             } else {
364 2 50       172 open(HTML, $filename) or return $self->error("Can't open $filename : $!");
365 2         47 while(read(HTML, $buffer, 10240, length($buffer))) {}
366 2         27 close(HTML);
367             }
368            
369             # add tag in
370 4 50       261 $buffer =~ s/(<\s*[Hh][Ee][Aa][Dd].*?>)/$1\n\n/
371             if( $base_url );
372            
373 4 100       23 my $attached_filename = $base ? "$base.html" : $page;
374 4         29 $msg->attach(
375             Data => $buffer,
376             Type => 'text/html',
377             Filename => $attached_filename,
378             );
379             }
380              
381             # attach text translation
382 6 100       5350 if ($format =~ /^(both|text)/) {
383 4 100       21 my $new_filename = $base ? "$base.txt" : "$page.txt";
384 4         23 $msg->attach(
385             Data => $self->_html2text($filename, $page),
386             Type => 'text/plain',
387             Filename => $new_filename,
388             );
389             }
390              
391             } else {
392             # non attachment mail
393 7         68 my $header = Mail::Header->new();
394 7         379 $header->add(From => "$name <$from_email>");
395 7         1329 $header->add('Reply-To' => "$name <$from_email>");
396 7         888 $header->add(To => join(', ', @$to_emails));
397 7         791 $header->add(Subject => $subject);
398 7         908 $header->add(Date => HTTP::Date::time2str(time()));
399 7 50       974 $header->add('X-Originating-Ip' => $sender_ip)
400             if( $sender_ip );
401              
402 7         1033 my @lines;
403 7         45 push(@lines, $template->output());
404              
405 7 100       661 if ($format =~ /^(both|text)/) {
406 2         4 push(@lines, "\n---\n\n");
407 2         47 push(@lines, $self->_html2text($filename, $page));
408             }
409            
410 7 100       34 if ($format =~ /^(both|html)/) {
411 2         4 push(@lines, "\n---\n\n");
412 2 50 66     10 if ($self->param('read_file_callback')) {
    100          
413 0         0 my $callback = $self->param('read_file_callback');
414 0         0 my $buffer = $callback->($filename);
415 0         0 push(@lines, split("\n", $buffer));
416             } elsif( $self->param('remote_fetch') && ($page =~ /^https?:\/\//) ) {
417             #fetch this page with LWP
418 1         57 require LWP::UserAgent;
419 1         7 require HTTP::Request;
420 1         15 my $agent = LWP::UserAgent->new();
421 1         286 my $response = $agent->request(HTTP::Request->new(GET => $page));
422 1 50       146842 if( $response->is_success ) {
423 1         18 my $buffer = $response->content();
424 1         124 @lines = split(/\r?\n/, $buffer);
425             } else {
426 0         0 return $self->error("Unable to retrieve remote page $page");
427             }
428             } else {
429 1 50       73 open(HTML, $filename) or return $self->error("Can't open $filename : $!");
430 1         18 push(@lines, );
431 1         12 close(HTML);
432             }
433             }
434              
435 7 100       41 if ($format =~ /url/) {
436 3         10 push(@lines, "\n$page");
437             }
438              
439 7         80 $msg = Mail::Internet->new([], Header => $header, Body => \@lines);
440 7 50       475 return $self->error("Unable to create Mail::Internet object!")
441             unless defined $msg;
442             }
443            
444             # send the message using SMTP - other methods can be added later
445 13 50       4828 unless($self->param('dump_mail')) {
446 0         0 my $smtp = Net::SMTP->new($self->param('smtp_server'));
447 0 0 0     0 return $self->error("Unable to connect to SMTP server ".$self->param('smtp_server')." : $!")
448             unless defined $smtp and UNIVERSAL::isa($smtp,'Net::SMTP');
449 0 0       0 $smtp->debug(1) if $self->param('smtp_debug');
450            
451 0         0 $smtp->mail("$name <$from_email>");
452 0         0 foreach (@$to_emails) {
453 0         0 $smtp->to($_);
454             }
455 0         0 $smtp->data();
456 0         0 $smtp->datasend($msg->as_string());
457 0         0 $smtp->dataend();
458 0         0 $smtp->quit();
459              
460             } else {
461             # debuging hook for test.pl
462 13         263 my $mailref = $self->param('dump_mail');
463 13         223 $$mailref = $msg->as_string();
464 13         22405 return $self->error("Mail Dumped");
465             }
466              
467             # all done
468 0         0 return $self->show_thanks;
469             }
470              
471             sub show_thanks {
472 0     0 0 0 my $self = shift;
473 0         0 my $query = $self->query;
474 0         0 my $page = $query->param('page');
475              
476 0         0 my $template;
477 0 0       0 if ($self->param('thanks_template')) {
478 0         0 $template = $self->load_tmpl($self->param('thanks_template'),
479             die_on_bad_params => 0,
480             cache => 1,
481             );
482             } else {
483 0         0 my @path = $self->tmpl_path;
484 0 0       0 @path = @{ $self->tmpl_path} if(ref($path[0]) eq 'ARRAY');
  0         0  
485 0         0 $template = $self->load_tmpl('CGI/Application/MailPage/templates/thanks.tmpl',
486             die_on_bad_params => 0,
487             path => [@path, @INC],
488             cache => 1,
489             );
490             }
491              
492 0         0 $template->param(PAGE => $page);
493 0 0       0 $template->param(%{$self->param('extra_tmpl_params')})
  0         0  
494             if($self->param('extra_tmpl_params'));
495 0         0 return $template->output();
496             }
497              
498              
499             sub error {
500 20     20 0 52 my ($self, $msg) = @_;
501 20         35 my $template;
502              
503 20 50       97 if($self->param('error_template')) {
504 0         0 $template = $self->load_tmpl( $self->param('error_template'),
505             die_on_bad_params => 0,
506             cache => 1,
507             );
508             } else {
509 20         377 my @path = $self->tmpl_path;
510 20 50       240 @path = @{ $self->tmpl_path} if(ref($path[0]) eq 'ARRAY');
  0         0  
511 20         187 $template = $self->load_tmpl( 'CGI/Application/MailPage/templates/error.tmpl',
512             die_on_bad_params => 0,
513             path => [@path, @INC],
514             cache => 1,
515             );
516             }
517              
518 20         11396 $template->param(error => $msg);
519 20 50       792 $template->param(%{$self->param('extra_tmpl_params')})
  0         0  
520             if($self->param('extra_tmpl_params'));
521 20         370 return $template->output();
522             }
523              
524             sub _find_html_file {
525 10     10   17 my $self = shift;
526 10         17 my $url = shift;
527 10         16 my $path;
528              
529             # if it doesn't start with http, its relative to web root
530 10 100       67 if($url =~ m!^https?://([-\w\.:]+)/(.*)!) {
531 7         20 my $host = $1;
532 7         16 $path = $2;
533             # if the path starts with a ~user thing, remove it
534 7         17 $path =~ s!~[^/]+/!!;
535             } else {
536 3 50       18 $path = ($url =~ /^\//) ? $url : "/$url"; #make sure it has a preceding path
537             }
538              
539             # now make sure we don't allow any '../' sections to try and hack the server
540 10         27 $path =~ s/\.\.\///g;
541             # append it to document_root and return it
542 10         36 return File::Spec->join($self->param('document_root'), $path);
543             }
544            
545             # takes an html file and returns text. This code was taken and
546             # modified from html2text.pl by Ave Wrigley. I don't really
547             # understand most of it, but it seems to work well.
548              
549             #--------------------------------------------------------------------------
550             #
551             # prefixes to convert tags into - some are converted back to Text::Format
552             # formatting later
553             #
554             #--------------------------------------------------------------------------
555              
556             my %prefix = (
557             'li' => '* ',
558             'dt' => '+ ',
559             'dd' => '- ',
560             );
561              
562             my %underline = (
563             'h1' => '=',
564             'h2' => '-',
565             'h3' => '-',
566             'h4' => '-',
567             'h5' => '-',
568             'h6' => '-',
569             );
570              
571             my @heading_number = ( 0, 0, 0, 0, 0, 0 );
572              
573             sub _html2text {
574 6     6   12 my $self = shift;
575 6         14 my $filename = shift;
576 6         12 my $page = shift;
577              
578 6         63 my $html_tree = new HTML::TreeBuilder;
579 6         1988 my $text_formatter = new Text::Format;
580 6         487 $text_formatter->firstIndent( 0 );
581              
582 6         55 my $result = "";
583              
584             #----------------------------------------------------------------------
585             #
586             # get_text - get all the text under a node
587             #
588             #----------------------------------------------------------------------
589              
590             sub get_text
591             {
592 3     3 0 5 my $this = shift;
593 3         7 my $text = '';
594            
595             # iterate though my children ...
596 3 50       13 return unless defined $this->content;
597 3         18 for my $child ( @{ $this->content } )
  3         11  
598             {
599             # if the child is also non-text ...
600 3 50       16 if ( ref( $child ) )
601             {
602             # traverse it ...
603             $child->traverse(
604             # traveral callback
605             sub {
606 0     0   0 my( $node, $startflag, $depth ) = @_;
607             # only visit once
608 0 0       0 return 0 unless $startflag;
609             # if it is non-text ...
610 0 0       0 if ( ref( $node ) )
611             {
612             # recurse get_text
613 0         0 $text .= get_text( $node );
614             }
615             # if it is text
616             else
617             {
618             # add it to $text
619 0 0       0 $text .= $node if $node =~ /\S/;
620             }
621 0         0 return 0;
622             },
623 0         0 0
624             );
625             }
626             # if it is text
627             else
628             {
629             # add it to $text
630 3 50       24 $text .= $child if $child =~ /\S/;
631             }
632             }
633 3         9 return $text;
634             }
635            
636             #--------------------------------------------------------------------------
637             #
638             # get_paragraphs - routine for generating an array of paras from a given node
639             #
640             #--------------------------------------------------------------------------
641            
642             sub get_paragraphs
643             {
644 3     3 0 7 my $this = shift;
645            
646             # array to save paragraphs in
647 3         6 my @paras = ();
648             # avoid -w warning for .= operation on undefined
649 3         11 $paras[ 0 ] = '';
650            
651             # iterate though my children ...
652 3         6 for my $child ( @{ $this->content } )
  3         19  
653             {
654             # if the child is also non-text ...
655 6 100       32 if ( ref( $child ) )
656             {
657             # traverse it ...
658             $child->traverse(
659             # traveral callback
660             sub {
661 9     9   202 my( $node, $startflag, $depth ) = @_;
662             # only visit once
663 9 100       24 return 0 unless $startflag;
664             # if it is non-text ...
665 6 100       18 if ( ref( $node ) )
666             {
667             # if it is a list element ...
668 3 50       12 if ( $node->tag =~ /^(?:li|dd|dt)$/ )
669             {
670             # recurse get_paragraphs
671 0         0 my @new_paras = get_paragraphs( $node );
672             # pre-pend appropriate prefix for list
673 0         0 $new_paras[ 0 ] =
674             $prefix{ $node->tag } . $new_paras[ 0 ]
675             ;
676             # and update the @paras array
677 0         0 @paras = ( @paras, @new_paras );
678             # and traverse no more
679 0         0 return 0;
680             }
681             else
682             {
683             # any other element, just traverse
684 3         28 return 1;
685             }
686             }
687             else
688             {
689             # add text to the current paragraph ...
690 3 50       22 $paras[ $#paras ] =
691             join( ' ', $paras[ $#paras ], $node )
692             if $node =~ /\S/
693             ;
694             # and recurse no more
695 3         10 return 0;
696             }
697             },
698 3         56 0
699             );
700             }
701             else
702             {
703             # add test to current paragraph ...
704 3 50       35 $paras[ $#paras ] = join( ' ', $paras[ $#paras ], $child )
705             if $child =~ /\S/
706             ;
707             }
708             }
709 3         77 return @paras;
710             }
711            
712             #--------------------------------------------------------------------------
713             #
714             # Main
715             #
716             #--------------------------------------------------------------------------
717            
718             # parse the HTML file
719 6 50 66     32 if ($self->param('read_file_callback')) {
    100          
720 0         0 my $callback = $self->param('read_file_callback');
721 0         0 $html_tree->parse( $callback->($filename) );
722             } elsif( $self->param('remote_fetch') && ($page =~ /^https?:\/\//) ) {
723             #fetch this page with LWP
724 3         413 require LWP::UserAgent;
725 3         18 require HTTP::Request;
726 3         34 my $agent = LWP::UserAgent->new();
727 3         974 my $response = $agent->request(HTTP::Request->new(GET => $page));
728 3 50       650021 if( $response->is_success ) {
729 3         50 my $buffer = $response->content();
730 3         172 $html_tree->parse($buffer);
731             } else {
732 0         0 return $self->error("Unable to retrieve remote page $page");
733             }
734             } else {
735 3 50       232 open(HTML, $filename) or return $self->error("Can't open $filename : $!");
736 3         123 $html_tree->parse( join( '', ) );
737 3         1226 close(HTML);
738             }
739              
740             # main tree traversal routine
741            
742             $html_tree->traverse(
743             sub {
744 660     660   11567 my( $node, $startflag, $depth ) = @_;
745             # ignore what's in the
746 660 100 100     3719 return 0 if ref( $node ) and $node->tag eq 'head';
747             # only visit nodes once
748 654 100       5137 return 0 unless $startflag;
749             # if this node is non-text ...
750 426 100       839 if ( ref $node )
751             {
752             # if this is a para ...
753 270 100       632 if ( $node->tag eq 'p' )
    100          
754             {
755             # iterate sub-paragraphs (including lists) ...
756 3         27 for ( get_paragraphs( $node ) )
757             {
758             # if it is a
  • ...
  • 759 3 50       28 if ( /^\* / )
        50          
        50          
    760             {
    761             # indent first line by 4, rest by 6
    762 0         0 $text_formatter->firstIndent( 4 );
    763 0         0 $text_formatter->bodyIndent( 6 );
    764             }
    765             # if it is a
    ...
    766             elsif ( s/^\+ // )
    767             {
    768             # set left margin to 4
    769 0         0 $text_formatter->leftMargin( 4 );
    770             }
    771             # if it is a
    ...
    772             elsif ( s/^- // )
    773             {
    774             # set left margin to 8
    775 0         0 $text_formatter->leftMargin( 8 );
    776             }
    777             # print formatted paragraphs ...
    778 3         17 $result .= $text_formatter->paragraphs( $_ );
    779             # and reset formatter defaults
    780 3         431 $text_formatter->leftMargin( 0 );
    781 3         35 $text_formatter->firstIndent( 0 );
    782 3         28 $text_formatter->bodyIndent( 0 );
    783             }
    784 3         27 $result .= "\n";
    785 3         12 return 0;
    786             }
    787             # if this is a heading ...
    788             elsif ( $node->tag =~ /^h(\d)/ )
    789             {
    790             # get the heading level ...
    791 3         48 my $level = $1;
    792             # increment the number for this level ...
    793 3         13 $heading_number[ $level ]++;
    794             # reset lower level heading numbers ...
    795 3         14 for ( $level+1 .. $#heading_number )
    796             {
    797 12         22 $heading_number[ $_ ] = 0;
    798             }
    799             # create heading number string
    800 3         17 my $heading_number = join(
    801             '.',
    802             @heading_number[ 1 .. $level ]
    803             );
    804             # generate heading from number string and heading text ...
    805             # my $text = "$heading_number " . get_text( $node );
    806 3         12 my $text = get_text( $node );
    807             # underline it with the appropriate underline character ...
    808 3         16 $text =~ s{
    809             (.*)
    810             }
    811             {
    812 6         56 "$1\n" . $underline{ $node->tag } x length( $1 )
    813             }gex
    814             ;
    815 3         32 $result .= $text;
    816 3         11 return 0;
    817             } else {
    818 264         3509 return 1;
    819             }
    820             }
    821             # if it is text ...
    822             else
    823             {
    824 156 100       691 return 0 unless $node =~ /\S/;
    825 96         310 $result .= $text_formatter->format( $node );
    826 96         14243 return 0;
    827             }
    828             },
    829 6         94378 0
    830             );
    831              
    832             # filter out comments
    833 6         202 $result =~ s///gs;
    834              
    835 6         718 return $result;
    836             }
    837            
    838              
    839             1;
    840             __END__