| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #!/usr/bin/perl | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | # html_table_to_db.pm | 
| 5 |  |  |  |  |  |  | # Copyright (C) 2005-2015 by John Heidemann | 
| 6 |  |  |  |  |  |  | # $Id: 025cc75c8e0df7ccdb092d89696480e5dee7dd08 $ | 
| 7 |  |  |  |  |  |  | # | 
| 8 |  |  |  |  |  |  | # This program is distributed under terms of the GNU general | 
| 9 |  |  |  |  |  |  | # public license, version 2.  See the file COPYING | 
| 10 |  |  |  |  |  |  | # in $dblib for details. | 
| 11 |  |  |  |  |  |  | # | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | package Fsdb::Filter::html_table_to_db; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | =head1 NAME | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | html_table_to_db - convert HTML tables into fsdb | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | html_table_to_db dest.fsdb | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | Converts a HTML table to Fsdb format. | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | The input is an HTML table (I fsdb). | 
| 28 |  |  |  |  |  |  | Column names are taken from C | elements, | 
| 29 |  |  |  |  |  |  | or defined as C through C if | 
| 30 |  |  |  |  |  |  | no such elements appear. | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | The output is two-space-separated fsdb. | 
| 33 |  |  |  |  |  |  | (Someday more general field separators should be supported.) | 
| 34 |  |  |  |  |  |  | Fsdb fields are normalized version of the html file: | 
| 35 |  |  |  |  |  |  | multiple spaces are compressed to one. | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =for comment | 
| 38 |  |  |  |  |  |  | begin_standard_fsdb_options | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | This module also supports the standard fsdb options: | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | =over 4 | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | =item B<-d> | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | Enable debugging output. | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | =item B<-i> or B<--input> InputSource | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | Read from InputSource, typically a file name, or C<-> for standard input, | 
| 51 |  |  |  |  |  |  | or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects. | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | =item B<-o> or B<--output> OutputDestination | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | Write to OutputDestination, typically a file name, or C<-> for standard output, | 
| 56 |  |  |  |  |  |  | or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects. | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | =item B<--autorun> or B<--noautorun> | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | By default, programs process automatically, | 
| 61 |  |  |  |  |  |  | but Fsdb::Filter objects in Perl do not run until you invoke | 
| 62 |  |  |  |  |  |  | the run() method. | 
| 63 |  |  |  |  |  |  | The C<--(no)autorun> option controls that behavior within Perl. | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | =item B<--help> | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | Show help. | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | =item B<--man> | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | Show full manual. | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | =back | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | =for comment | 
| 76 |  |  |  |  |  |  | end_standard_fsdb_options | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | =head1 SAMPLE USAGE | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | =head2 Input: | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  |  
| 84 |  |  |  |  |  |  |  |  | account | passwd | uid | gid | fullname | homedir | shell | 
|---|
 
| 85 |  |  |  |  |  |  |  |  | johnh | * | 2274 | 134 | John & Ampersand | /home/johnh | /bin/bash |  
| 86 |  |  |  |  |  |  |  |  | greg | * | 2275 | 134 | Greg < Lessthan | /home/greg | /bin/bash |  
| 87 |  |  |  |  |  |  |  |  | root | * | 0 | 0 | Root ; Semi | /root | /bin/bash |  
| 88 |  |  |  |  |  |  |  |  | four | * | 1 | 1 | Fourth Row | /home/four | /bin/bash |  
| 89 |  |  |  |  |  |  |  |  | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | =head2 Command: | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | html_table_to_db | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | =head2 Output: | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | #fsdb -F S account passwd uid gid fullname homedir shell | 
| 98 |  |  |  |  |  |  | johnh  *  2274  134  John & Ampersand  /home/johnh  /bin/bash | 
| 99 |  |  |  |  |  |  | greg  *  2275  134  Greg < Lessthan  /home/greg  /bin/bash | 
| 100 |  |  |  |  |  |  | root  *  0  0  Root ; Semi  /root  /bin/bash | 
| 101 |  |  |  |  |  |  | four  *  1  1  Fourth Row  /home/four  /bin/bash | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | L. | 
| 107 |  |  |  |  |  |  | L. | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | =head1 CLASS FUNCTIONS | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | =cut | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | @ISA = qw(Fsdb::Filter); | 
| 115 |  |  |  |  |  |  | $VERSION = 2.0; | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 1 |  |  | 1 |  | 3661 | use strict; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 23 |  | 
| 118 | 1 |  |  | 1 |  | 3 | use Pod::Usage; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 64 |  | 
| 119 | 1 |  |  | 1 |  | 4 | use Carp; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 46 |  | 
| 120 |  |  |  |  |  |  |  | 
| 121 | 1 |  |  | 1 |  | 533 | use HTML::Parser; | 
|  | 1 |  |  |  |  | 3776 |  | 
|  | 1 |  |  |  |  | 27 |  | 
| 122 |  |  |  |  |  |  |  | 
| 123 | 1 |  |  | 1 |  | 5 | use Fsdb::Filter; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 20 |  | 
| 124 | 1 |  |  | 1 |  | 5 | use Fsdb::IO::Writer; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 769 |  | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | =head2 new | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | $filter = new Fsdb::Filter::csv_to_db(@arguments); | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | Create a new csv_to_db object, taking command-line arguments. | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | =cut | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | sub new ($@) { | 
| 136 | 0 |  |  | 0 | 1 |  | my $class = shift @_; | 
| 137 | 0 |  |  |  |  |  | my $self = $class->SUPER::new(@_); | 
| 138 | 0 |  |  |  |  |  | bless $self, $class; | 
| 139 | 0 |  |  |  |  |  | $self->set_defaults; | 
| 140 | 0 |  |  |  |  |  | $self->parse_options(@_); | 
| 141 | 0 |  |  |  |  |  | $self->SUPER::post_new(); | 
| 142 | 0 |  |  |  |  |  | return $self; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | =head2 set_defaults | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | $filter->set_defaults(); | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | Internal: set up defaults. | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | =cut | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | sub set_defaults ($) { | 
| 155 | 0 |  |  | 0 | 1 |  | my($self) = @_; | 
| 156 | 0 |  |  |  |  |  | $self->SUPER::set_defaults(); | 
| 157 |  |  |  |  |  |  | } | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | =head2 parse_options | 
| 160 |  |  |  |  |  |  |  | 
| 161 |  |  |  |  |  |  | $filter->parse_options(@ARGV); | 
| 162 |  |  |  |  |  |  |  | 
| 163 |  |  |  |  |  |  | Internal: parse command-line arguments. | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | =cut | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | sub parse_options ($@) { | 
| 168 | 0 |  |  | 0 | 1 |  | my $self = shift @_; | 
| 169 |  |  |  |  |  |  |  | 
| 170 | 0 |  |  |  |  |  | my(@argv) = @_; | 
| 171 |  |  |  |  |  |  | $self->get_options( | 
| 172 |  |  |  |  |  |  | \@argv, | 
| 173 | 0 |  |  | 0 |  |  | 'help|?' => sub { pod2usage(1); }, | 
| 174 | 0 |  |  | 0 |  |  | 'man' => sub { pod2usage(-verbose => 2); }, | 
| 175 |  |  |  |  |  |  | 'autorun!' => \$self->{_autorun}, | 
| 176 |  |  |  |  |  |  | 'd|debug+' => \$self->{_debug}, | 
| 177 | 0 |  |  | 0 |  |  | 'i|input=s' => sub { $self->parse_io_option('input', @_); }, | 
| 178 |  |  |  |  |  |  | 'log!' => \$self->{_logprog}, | 
| 179 | 0 |  |  | 0 |  |  | 'o|output=s' => sub { $self->parse_io_option('output', @_); }, | 
| 180 | 0 | 0 |  |  |  |  | ) or pod2usage(2); | 
| 181 | 0 | 0 |  |  |  |  | pod2usage(2) if ($#argv >= 0); | 
| 182 |  |  |  |  |  |  | } | 
| 183 |  |  |  |  |  |  |  | 
| 184 |  |  |  |  |  |  | =head2 setup | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | $filter->setup(); | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | Internal: setup, parse headers. | 
| 189 |  |  |  |  |  |  |  | 
| 190 |  |  |  |  |  |  | =cut | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | sub setup ($) { | 
| 193 | 0 |  |  | 0 | 1 |  | my($self) = @_; | 
| 194 |  |  |  |  |  |  |  | 
| 195 | 0 |  |  |  |  |  | $self->finish_fh_io_option('input'); | 
| 196 |  |  |  |  |  |  |  | 
| 197 |  |  |  |  |  |  | # Can't open up the source, | 
| 198 |  |  |  |  |  |  | # so can't write the header. | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | =head2 run | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | $filter->run(); | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | Internal: run over each rows. | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | =cut | 
| 208 |  |  |  |  |  |  | sub run ($) { | 
| 209 | 0 |  |  | 0 | 1 |  | my($self) = @_; | 
| 210 |  |  |  |  |  |  |  | 
| 211 | 0 |  |  |  |  |  | my %inside; | 
| 212 | 0 |  |  |  |  |  | my $text = ''; | 
| 213 | 0 |  |  |  |  |  | my $colspan = undef; | 
| 214 | 0 |  |  |  |  |  | my $header_count = 0; | 
| 215 | 0 |  |  |  |  |  | my @row; | 
| 216 |  |  |  |  |  |  | my @cols; | 
| 217 |  |  |  |  |  |  | my $start_sub = sub { | 
| 218 | 0 |  |  | 0 |  |  | my($tag, $attrh) = @_; | 
| 219 |  |  |  |  |  |  | croak $self->{_prog} . ": tr not in table.\n" | 
| 220 | 0 | 0 | 0 |  |  |  | if ($tag eq 'tr' && !$inside{table}); | 
| 221 | 0 | 0 | 0 |  |  |  | if ($tag eq 'th' || $tag eq 'td') { | 
| 222 |  |  |  |  |  |  | croak $self->{_prog} . ": badly nested $tag.\n" | 
| 223 | 0 | 0 | 0 |  |  |  | if ($inside{th} || $inside{td}); | 
| 224 |  |  |  |  |  |  | croak $self->{_prog} . ": th or td outside table or tr.\n" | 
| 225 | 0 | 0 | 0 |  |  |  | if (!$inside{table} || !$inside{tr}); | 
| 226 | 0 |  |  |  |  |  | $text = ''; | 
| 227 | 0 |  |  |  |  |  | $colspan = $attrh->{colspan}; | 
| 228 |  |  |  |  |  |  | }; | 
| 229 | 0 |  |  |  |  |  | $inside{$tag}++; | 
| 230 | 0 |  |  |  |  |  | }; | 
| 231 |  |  |  |  |  |  | my $end_sub = sub { | 
| 232 | 0 |  |  | 0 |  |  | my($tag, $attrh) = @_; | 
| 233 | 0 |  |  |  |  |  | $inside{$tag}--; | 
| 234 | 0 | 0 | 0 |  |  |  | if ($tag eq 'th' || $tag eq 'td') { | 
|  |  | 0 |  |  |  |  |  | 
| 235 | 0 | 0 |  |  |  |  | $header_count++ if ($tag eq 'th'); | 
| 236 | 0 |  |  |  |  |  | $text =~ s/\n/ /g; | 
| 237 | 0 |  |  |  |  |  | $text =~ s/^\s+//; | 
| 238 | 0 |  |  |  |  |  | $text =~ s/\s+$//; | 
| 239 | 0 |  |  |  |  |  | $text =~ s/  +/ /g;  # clean up for fsdb double-space separator | 
| 240 | 0 | 0 |  |  |  |  | $text = $self->{_empty} if ($text =~ /^\s+$/); | 
| 241 | 0 |  |  |  |  |  | push(@row, $text); | 
| 242 | 0 | 0 | 0 |  |  |  | push(@row, ($self->{_empty}) x ($colspan - 1)) if (defined($colspan) && $colspan > 1); | 
| 243 | 0 |  |  |  |  |  | $text = ''; | 
| 244 |  |  |  |  |  |  | } elsif ($tag eq 'tr') { | 
| 245 |  |  |  |  |  |  | # take a row action | 
| 246 | 0 | 0 |  |  |  |  | if (!defined($self->{_out})) { | 
| 247 | 0 |  |  |  |  |  | my $got_header = undef; | 
| 248 | 0 | 0 |  |  |  |  | if ($header_count == $#row+1) { | 
| 249 |  |  |  |  |  |  | # first row and all headers | 
| 250 | 0 |  |  |  |  |  | @cols = Fsdb::IO::clean_potential_columns(@row); | 
| 251 | 0 |  |  |  |  |  | @row = (); | 
| 252 | 0 |  |  |  |  |  | $got_header = 1; | 
| 253 |  |  |  |  |  |  | } else { | 
| 254 |  |  |  |  |  |  | # no headers, make it up | 
| 255 | 0 |  |  |  |  |  | foreach (0..$#row) { | 
| 256 | 0 |  |  |  |  |  | push(@cols, "column$_"); | 
| 257 |  |  |  |  |  |  | }; | 
| 258 |  |  |  |  |  |  | }; | 
| 259 | 0 |  |  |  |  |  | $self->finish_io_option('output', -fscode => 'S', -cols => \@cols); | 
| 260 | 0 | 0 |  |  |  |  | return if ($got_header); | 
| 261 |  |  |  |  |  |  | }; | 
| 262 |  |  |  |  |  |  | # fill in empty rows, if any | 
| 263 | 0 | 0 |  |  |  |  | if ($#row + 1 != $self->{_out}->ncols) { | 
| 264 | 0 |  |  |  |  |  | push(@row, ($self->{_empty} . "x") x ($self->{_out}->ncols - ($#row + 1))); | 
| 265 |  |  |  |  |  |  | }; | 
| 266 |  |  |  |  |  |  | # and rename blank rows to the empty symbol | 
| 267 |  |  |  |  |  |  | # and cleanup newlines | 
| 268 | 0 |  |  |  |  |  | foreach (0..$#row) { | 
| 269 | 0 |  |  |  |  |  | $row[$_] =~ s/\n/ /gm; | 
| 270 |  |  |  |  |  |  | # next line is a bit of a hack, assuming -F S | 
| 271 | 0 |  |  |  |  |  | $row[$_] =~ s/\s\s+/  /gm; | 
| 272 | 0 | 0 |  |  |  |  | $row[$_] = $self->{_empty} if ($row[$_] =~ /^\s*$/); | 
| 273 |  |  |  |  |  |  | }; | 
| 274 | 0 |  |  |  |  |  | $self->{_out}->write_row_from_aref(\@row); | 
| 275 | 0 |  |  |  |  |  | @row = (); | 
| 276 |  |  |  |  |  |  | }; | 
| 277 | 0 |  |  |  |  |  | }; | 
| 278 |  |  |  |  |  |  | my $text_sub = sub { | 
| 279 | 0 | 0 | 0 | 0 |  |  | return if ($inside{script} || $inside{style}); | 
| 280 | 0 | 0 | 0 |  |  |  | return if (!$inside{table} || !$inside{tr}); | 
| 281 | 0 | 0 | 0 |  |  |  | $text .= $_[0] if ($inside{td} || $inside{th}); | 
| 282 | 0 |  |  |  |  |  | }; | 
| 283 |  |  |  |  |  |  |  | 
| 284 | 0 |  |  |  |  |  | my $parser = HTML::Parser->new(api_version => 3, | 
| 285 |  |  |  |  |  |  | start_h => [ $start_sub, "tagname, attr" ], | 
| 286 |  |  |  |  |  |  | end_h => [ $end_sub, "tagname, attr" ], | 
| 287 |  |  |  |  |  |  | text_h => [ $text_sub, "dtext" ], | 
| 288 |  |  |  |  |  |  | marked_sections => 1); | 
| 289 | 0 |  |  |  |  |  | $parser->parse_file($self->{_in}); | 
| 290 |  |  |  |  |  |  |  | 
| 291 | 0 | 0 |  |  |  |  | croak $self->{_prog} . ": could not find table in html input.\n" | 
| 292 |  |  |  |  |  |  | if ($#cols == -1); | 
| 293 |  |  |  |  |  |  | } | 
| 294 |  |  |  |  |  |  |  | 
| 295 |  |  |  |  |  |  |  | 
| 296 |  |  |  |  |  |  | =head1 AUTHOR and COPYRIGHT | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | Copyright (C) 1991-2015 by John Heidemann | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | This program is distributed under terms of the GNU general | 
| 301 |  |  |  |  |  |  | public license, version 2.  See the file COPYING | 
| 302 |  |  |  |  |  |  | with the distribution for details. | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | =cut | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | 1; |