| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WWW::IndexParser; | 
| 2 | 1 |  |  | 1 |  | 31483 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 36 |  | 
| 3 | 1 |  |  | 1 |  | 5 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 43 |  | 
| 4 | 1 |  |  | 1 |  | 5780 | use LWP::UserAgent; | 
|  | 1 |  |  |  |  | 102043 |  | 
|  | 1 |  |  |  |  | 42 |  | 
| 5 | 1 |  |  | 1 |  | 10937 | use HTML::Parser; | 
|  | 1 |  |  |  |  | 8691 |  | 
|  | 1 |  |  |  |  | 45 |  | 
| 6 | 1 |  |  | 1 |  | 9 | use Time::Local; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 79 |  | 
| 7 | 1 |  |  | 1 |  | 936 | use WWW::IndexParser::Entry; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 34 |  | 
| 8 | 1 |  |  | 1 |  | 8 | use URI; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 31 |  | 
| 9 | 1 |  |  | 1 |  | 6 | use Carp; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 85 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | BEGIN { | 
| 12 | 1 |  |  | 1 |  | 7438 | our $VERSION = "0.91"; | 
| 13 |  |  |  |  |  |  | } | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | our $months = { | 
| 16 |  |  |  |  |  |  | Jan => 0, January => 0, | 
| 17 |  |  |  |  |  |  | Feb => 1, February => 1, | 
| 18 |  |  |  |  |  |  | Mar => 2, March => 2, | 
| 19 |  |  |  |  |  |  | Apr => 3, April => 3, | 
| 20 |  |  |  |  |  |  | May => 4, | 
| 21 |  |  |  |  |  |  | Jun => 5, June => 5, | 
| 22 |  |  |  |  |  |  | Jul => 6, July => 6, | 
| 23 |  |  |  |  |  |  | Aug => 7, August => 7, | 
| 24 |  |  |  |  |  |  | Sep => 8, September => 8, | 
| 25 |  |  |  |  |  |  | Oct => 9, October => 9, | 
| 26 |  |  |  |  |  |  | Nov => 10, November => 10, | 
| 27 |  |  |  |  |  |  | Dec => 11, December => 11 }; | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | sub new { | 
| 33 | 1 |  |  | 1 | 1 | 12 | my $proto = shift; | 
| 34 | 1 |  | 33 |  |  | 7 | my $class = ref($proto)||$proto; | 
| 35 | 1 |  |  |  |  | 3 | my $self = {}; | 
| 36 | 1 |  |  |  |  | 2 | bless $self, $class; | 
| 37 | 1 |  |  |  |  | 3 | my %args = @_; | 
| 38 | 1 |  |  |  |  | 11 | $self->{ua} = LWP::UserAgent->new; | 
| 39 | 1 |  |  |  |  | 13974 | $self->{ua}->agent('PerlIndexParser/0.1'); | 
| 40 | 1 | 50 |  |  |  | 75 | if (defined $args{timeout}) { | 
| 41 | 0 | 0 |  |  |  | 0 | if ($args{timeout} =~ /^\d+/) { | 
| 42 | 0 |  |  |  |  | 0 | $self->{ua}->timeout($args{timeout}); | 
| 43 |  |  |  |  |  |  | } else { | 
| 44 | 0 |  |  |  |  | 0 | carp "Invalid timeout: " . $args{timeout}; | 
| 45 | 0 |  |  |  |  | 0 | return; | 
| 46 |  |  |  |  |  |  | } | 
| 47 |  |  |  |  |  |  | } else { | 
| 48 | 1 |  |  |  |  | 7 | $self->{ua}->timeout(10); | 
| 49 |  |  |  |  |  |  | } | 
| 50 | 1 | 50 |  |  |  | 22 | if (defined $args{proxy}) { | 
| 51 | 0 |  |  |  |  | 0 | $self->{ua}->proxy('http', $args{proxy}); | 
| 52 |  |  |  |  |  |  | } | 
| 53 | 1 | 50 |  |  |  | 4 | if (defined $args{debug}) { | 
| 54 | 0 |  |  |  |  | 0 | $self->{debug} = $args{debug}; | 
| 55 |  |  |  |  |  |  | } else { | 
| 56 | 1 |  |  |  |  | 3 | $self->{debug} = 0; | 
| 57 |  |  |  |  |  |  | } | 
| 58 | 1 |  |  |  |  | 8 | $self->{parser} = HTML::Parser->new( api_version => 3); | 
| 59 | 1 |  |  |  |  | 46 | $self->{parser}->{debug} = $self->{debug}; | 
| 60 | 1 | 50 |  |  |  | 6 | if (defined $args{url}) { | 
| 61 | 0 |  |  |  |  | 0 | $self->_url($args{url}); | 
| 62 | 0 | 0 |  |  |  | 0 | return @{$self->{files}} if defined $self->{files}; | 
|  | 0 |  |  |  |  | 0 |  | 
| 63 | 0 |  |  |  |  | 0 | return; | 
| 64 |  |  |  |  |  |  | } | 
| 65 | 1 |  |  |  |  | 5 | return $self; | 
| 66 |  |  |  |  |  |  | } | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | sub _url { | 
| 71 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 72 | 0 | 0 |  |  |  |  | if (@_) { | 
| 73 | 0 |  |  |  |  |  | my $new_url = shift; | 
| 74 | 0 | 0 |  |  |  |  | warn "The URL was $new_url" if $self->{debug}; | 
| 75 |  |  |  |  |  |  |  | 
| 76 | 0 |  |  |  |  |  | $self->{url} = $new_url; | 
| 77 | 0 | 0 |  |  |  |  | if ($new_url =~ /^([^:]+):\/\/([^:\/]+)(:(\d+))?/) { | 
| 78 | 0 |  |  |  |  |  | $self->{server} = $2; | 
| 79 | 0 |  |  |  |  |  | $self->{protocol} = $1; | 
| 80 | 0 | 0 |  |  |  |  | $self->{port} = $4 if defined $4; | 
| 81 | 0 |  |  |  |  |  | $self->{req} = HTTP::Request->new(GET => $new_url); | 
| 82 | 0 |  |  |  |  |  | $self->{res} = $self->{ua}->request($self->{req}); | 
| 83 | 0 | 0 |  |  |  |  | if (not $self->{res}->is_success) { | 
| 84 | 0 | 0 |  |  |  |  | carp "Cannot fetch for $new_url: " . $self->{res}->status_line if $self->{debug}; | 
| 85 | 0 |  |  |  |  |  | return; | 
| 86 |  |  |  |  |  |  | } | 
| 87 |  |  |  |  |  |  | } else { | 
| 88 | 0 |  |  |  |  |  | warn "Invalid URL " . $new_url; | 
| 89 | 0 |  |  |  |  |  | return; | 
| 90 |  |  |  |  |  |  | } | 
| 91 |  |  |  |  |  |  |  | 
| 92 | 0 | 0 |  |  |  |  | if (ref($self->{res}->headers->{'content-type'}) eq "ARRAY") { | 
|  |  | 0 |  |  |  |  |  | 
| 93 | 0 |  |  |  |  |  | my $found_html = 0; | 
| 94 | 0 |  |  |  |  |  | foreach (@{$self->{res}->headers->{'content-type'}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 95 | 0 | 0 |  |  |  |  | $found_html = 1 if /^text\/html/; | 
| 96 |  |  |  |  |  |  | } | 
| 97 | 0 | 0 |  |  |  |  | if (not $found_html) { | 
| 98 | 0 |  |  |  |  |  | warn "Not an HTML page " . $self->{res}->headers->{'content-type'}; | 
| 99 | 0 |  |  |  |  |  | return; | 
| 100 |  |  |  |  |  |  | } | 
| 101 |  |  |  |  |  |  | } elsif ($self->{res}->headers->{'content-type'} !~ /^text\/html/) { | 
| 102 | 0 |  |  |  |  |  | warn "Not an HTML page " . $self->{res}->headers->{'content-type'}; | 
| 103 | 0 |  |  |  |  |  | return; | 
| 104 |  |  |  |  |  |  | } | 
| 105 |  |  |  |  |  |  |  | 
| 106 | 0 | 0 |  |  |  |  | if ($self->{res}->headers->{server} =~ /^Apache-Coyote/) { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 107 | 0 | 0 |  |  |  |  | warn "Server is Tomcat Coyote" if $self->{debug}; | 
| 108 | 0 |  |  |  |  |  | $self->{parser}->handler( start => \&_parse_html_tomcat, "self, tagname, attr, attrseq, text"); | 
| 109 | 0 |  |  |  |  |  | $self->{parser}->handler( text => \&_parse_html_tomcat, "self, tagname, attr, attrseq, text"); | 
| 110 |  |  |  |  |  |  | } elsif ($self->{res}->headers->{server} =~ /^Apache/) { | 
| 111 | 0 | 0 |  |  |  |  | warn "Server is Apache" if $self->{debug}; | 
| 112 | 0 |  |  |  |  |  | $self->{parser}->handler( start => \&_parse_html_apache, "self, tagname, attr, attrseq, text"); | 
| 113 | 0 |  |  |  |  |  | $self->{parser}->handler( text => \&_parse_html_apache, "self, tagname, attr, attrseq, text"); | 
| 114 |  |  |  |  |  |  | } elsif ($self->{res}->headers->{server} =~ /^Microsoft-IIS/) { | 
| 115 | 0 | 0 |  |  |  |  | warn "Server is IIS" if $self->{debug}; | 
| 116 | 0 |  |  |  |  |  | $self->{parser}->handler( start => \&_parse_html_iis, "self, tagname, attr, attrseq, text"); | 
| 117 | 0 |  |  |  |  |  | $self->{parser}->handler( text => \&_parse_html_iis, "self, tagname, attr, attrseq, text"); | 
| 118 |  |  |  |  |  |  | } elsif ($self->{res}->headers->{server} =~ m!^lighttpd/!) { | 
| 119 | 0 | 0 |  |  |  |  | warn "Server is lighttpd" if $self->{debug}; | 
| 120 | 0 |  |  |  |  |  | $self->{parser}->handler( start => \&_parse_html_lighttpd, "self, tagname, attr, attrseq, text"); | 
| 121 | 0 |  |  |  |  |  | $self->{parser}->handler( text => \&_parse_html_lighttpd, "self, tagname, attr, attrseq, text"); | 
| 122 |  |  |  |  |  |  | } else { | 
| 123 | 0 | 0 |  |  |  |  | warn "Unknown web server" if $self->{debug}; | 
| 124 | 0 |  |  |  |  |  | return; | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  |  | 
| 127 | 0 |  |  |  |  |  | $self->{parser}->parse($self->{res}->content); | 
| 128 | 0 |  |  |  |  |  | $self->{parser}->eof(); | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | # Add the URL to each HASH for ease of use | 
| 131 | 0 |  |  |  |  |  | foreach my $entry (@{$self->{parser}->{files}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 132 | 0 | 0 |  |  |  |  | if ($entry->filename =~ /^\//) { | 
| 133 | 0 | 0 |  |  |  |  | $entry->url($self->{protocol} . "://" . $self->{server} . | 
| 134 |  |  |  |  |  |  | (defined $self->{port}?':' . $self->{port}:'') . | 
| 135 |  |  |  |  |  |  | $entry->filename); | 
| 136 |  |  |  |  |  |  | } else { | 
| 137 | 0 |  |  |  |  |  | $entry->url( URI->new_abs($entry->filename, $self->{url}) ); | 
| 138 |  |  |  |  |  |  | } | 
| 139 |  |  |  |  |  |  | } | 
| 140 |  |  |  |  |  |  | # Get this back from the parser object. | 
| 141 | 0 |  |  |  |  |  | $self->{files} = $self->{parser}->{files}; | 
| 142 |  |  |  |  |  |  | } | 
| 143 | 0 |  |  |  |  |  | return $self->{url}; | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | sub _parse_html_tomcat { | 
| 147 | 0 |  |  | 0 |  |  | my ($self, $tagname, $attr, $attrseq, $origtext) = @_; | 
| 148 |  |  |  |  |  |  |  | 
| 149 | 0 | 0 | 0 |  |  |  | if (not defined $tagname) { | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 150 | 0 | 0 |  |  |  |  | return unless $self->{parser_state}; | 
| 151 |  |  |  |  |  |  |  | 
| 152 | 0 | 0 |  |  |  |  | if ($self->{parser_state} == 2) { | 
| 153 | 0 | 0 |  |  |  |  | warn "The title is: $origtext" if $self->{debug}; | 
| 154 | 0 | 0 |  |  |  |  | if ($origtext =~ /^Directory Listing For (.+)$/) { | 
| 155 | 0 |  |  |  |  |  | $self->{directory} = $1; | 
| 156 |  |  |  |  |  |  | } | 
| 157 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 158 | 0 |  |  |  |  |  | return; | 
| 159 |  |  |  |  |  |  | } | 
| 160 | 0 | 0 | 0 |  |  |  | if ($self->{parser_state} == 1 && $origtext =~ /^([\d\.]+)(\s+(\w+))?/) { | 
| 161 |  |  |  |  |  |  |  | 
| 162 | 0 |  |  |  |  |  | $self->{current_file}->{size} = $1; | 
| 163 | 0 | 0 |  |  |  |  | $self->{current_file}->{size_units} = $3 if defined $3; | 
| 164 |  |  |  |  |  |  | } | 
| 165 | 0 | 0 | 0 |  |  |  | if ($self->{parser_state} == 1 && $origtext =~ /^\w+,\s+(\d+)\s+(\w+)\s+(\d+)\s+(\d+):(\d+):(\d+)\s+(\w+)/) { | 
| 166 | 0 |  |  |  |  |  | my $time = timelocal($6, $5, $4, $1, $months->{$2}, $3-1900); | 
| 167 | 0 |  |  |  |  |  | $self->{current_file}->{time} = $time; | 
| 168 |  |  |  |  |  |  | } | 
| 169 |  |  |  |  |  |  | } elsif ($tagname eq 'title') { | 
| 170 | 0 |  |  |  |  |  | $self->{parser_state} = 2; | 
| 171 |  |  |  |  |  |  | } elsif ($tagname eq "hr" && $self->{parser_state} && defined $self->{current_file}) { | 
| 172 | 0 |  |  |  |  |  | my $entry = WWW::IndexParser::Entry->new; | 
| 173 | 0 | 0 |  |  |  |  | $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename}; | 
| 174 | 0 | 0 |  |  |  |  | $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time}; | 
| 175 | 0 | 0 |  |  |  |  | $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size}; | 
| 176 | 0 | 0 |  |  |  |  | $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units}; | 
| 177 | 0 |  |  |  |  |  | push @{$self->{files}}, $entry; | 
|  | 0 |  |  |  |  |  |  | 
| 178 | 0 |  |  |  |  |  | delete $self->{current_file}; | 
| 179 |  |  |  |  |  |  | #$self->{parser_state} = 1; | 
| 180 |  |  |  |  |  |  | } elsif ($tagname eq "tr" && defined $self->{parser_state}) { | 
| 181 | 0 | 0 |  |  |  |  | if (defined $self->{current_file}) { | 
| 182 | 0 |  |  |  |  |  | my $entry = WWW::IndexParser::Entry->new; | 
| 183 | 0 | 0 |  |  |  |  | $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename}; | 
| 184 | 0 | 0 |  |  |  |  | $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time}; | 
| 185 | 0 | 0 |  |  |  |  | $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size}; | 
| 186 | 0 | 0 |  |  |  |  | $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units}; | 
| 187 | 0 |  |  |  |  |  | push @{$self->{files}}, $entry; | 
|  | 0 |  |  |  |  |  |  | 
| 188 | 0 |  |  |  |  |  | delete $self->{current_file}; | 
| 189 |  |  |  |  |  |  | } | 
| 190 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 191 |  |  |  |  |  |  | } elsif ($tagname eq "a" && defined $self->{parser_state}) { | 
| 192 | 0 | 0 |  |  |  |  | warn "  file name = " .  $attr->{href} if $self->{debug}; | 
| 193 | 0 | 0 |  |  |  |  | $self->{current_file}->{filename} = $attr->{href} if $attr->{href}; | 
| 194 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | sub _parse_html_apache { | 
| 200 | 0 |  |  | 0 |  |  | my ($self, $tagname, $attr, $attrseq, $origtext) = @_; | 
| 201 |  |  |  |  |  |  |  | 
| 202 | 0 | 0 | 0 |  |  |  | if (not defined $tagname) { | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 203 | 0 | 0 |  |  |  |  | return unless $self->{parser_state}; | 
| 204 |  |  |  |  |  |  |  | 
| 205 | 0 | 0 |  |  |  |  | if ($self->{parser_state} == 2) { | 
| 206 | 0 | 0 |  |  |  |  | warn "The title is: $origtext" if $self->{debug}; | 
| 207 | 0 | 0 |  |  |  |  | if ($origtext =~ /^Index of (.+)$/) { | 
| 208 | 0 |  |  |  |  |  | $self->{directory} = $1; | 
| 209 |  |  |  |  |  |  | } | 
| 210 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 211 | 0 |  |  |  |  |  | return; | 
| 212 |  |  |  |  |  |  | } | 
| 213 | 0 | 0 |  |  |  |  | if ($origtext =~ /(\d\d)-(\w\w\w)-(\d{4}) (\d\d):(\d\d)\s+([\d\.]+)(\w)?/) { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 214 | 0 |  |  |  |  |  | my $time = timelocal(0, $5, $4, $1, $months->{$2}, $3-1900); | 
| 215 | 0 |  |  |  |  |  | $self->{current_file}->{time} = $time; | 
| 216 | 0 |  |  |  |  |  | $self->{current_file}->{size} = $6; | 
| 217 | 0 | 0 |  |  |  |  | $self->{current_file}->{size_units} = $7 if defined $7; | 
| 218 |  |  |  |  |  |  | } elsif ($origtext =~ /(\d\d)-(\w\w\w)-(\d{4}) (\d\d):(\d\d)/) { | 
| 219 | 0 |  |  |  |  |  | my $time = timelocal(0, $5, $4, $1, $months->{$2}, $3-1900); | 
| 220 | 0 |  |  |  |  |  | $self->{current_file}->{time} = $time; | 
| 221 | 0 | 0 |  |  |  |  | warn " Found time (using Apache 2.2+ check)" if $self->{debug}; | 
| 222 |  |  |  |  |  |  | } elsif ($origtext =~ /^(\d[\d\.]+)(\w)?/) { | 
| 223 | 0 | 0 |  |  |  |  | warn " Found size (using Apache 2.2+ check)" if $self->{debug}; | 
| 224 | 0 |  |  |  |  |  | $self->{current_file}->{size} = $1; | 
| 225 | 0 | 0 |  |  |  |  | $self->{current_file}->{size_units} = $2 if defined $2; | 
| 226 |  |  |  |  |  |  | } | 
| 227 |  |  |  |  |  |  | } elsif ($tagname eq 'title') { | 
| 228 | 0 |  |  |  |  |  | $self->{parser_state} = 2; | 
| 229 |  |  |  |  |  |  | } elsif ($tagname eq "pre") { | 
| 230 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 231 |  |  |  |  |  |  | } elsif (($tagname eq "img" || $tagname eq "hr") && defined $self->{parser_state}) { | 
| 232 | 0 | 0 | 0 |  |  |  | if (defined $self->{current_file} && $self->{current_file}->{filename} !~ /^\?/ && $self->{current_file}->{type} !~ /Icon/) { | 
|  |  |  | 0 |  |  |  |  | 
| 233 | 0 |  |  |  |  |  | my $entry = WWW::IndexParser::Entry->new; | 
| 234 | 0 | 0 |  |  |  |  | $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename}; | 
| 235 | 0 | 0 |  |  |  |  | $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time}; | 
| 236 | 0 | 0 |  |  |  |  | $entry->type($self->{current_file}->{type}) if defined $self->{current_file}->{type}; | 
| 237 | 0 | 0 |  |  |  |  | $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size}; | 
| 238 | 0 | 0 |  |  |  |  | $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units}; | 
| 239 | 0 |  |  |  |  |  | push @{$self->{files}}, $entry; | 
|  | 0 |  |  |  |  |  |  | 
| 240 | 0 | 0 |  |  |  |  | warn "Added " . $self->{current_file}->{filename} if $self->{debug}; | 
| 241 | 0 |  |  |  |  |  | delete $self->{current_file}; | 
| 242 |  |  |  |  |  |  | } | 
| 243 | 0 | 0 |  |  |  |  | if (defined $attr->{alt}) { | 
| 244 | 0 | 0 |  |  |  |  | warn "Possible new file:" . $attr->{alt} if $self->{debug}; | 
| 245 | 0 |  |  |  |  |  | $self->{current_file}->{type} = $attr->{alt}; | 
| 246 |  |  |  |  |  |  | } | 
| 247 |  |  |  |  |  |  | } elsif ($tagname eq "a" && defined $self->{parser_state}) { | 
| 248 | 0 | 0 |  |  |  |  | warn "  file name = " .  $attr->{href} if $self->{debug}; | 
| 249 | 0 | 0 |  |  |  |  | $self->{current_file}->{filename} = $attr->{href} if defined $attr->{href}; | 
| 250 |  |  |  |  |  |  | } else { | 
| 251 | 0 | 0 |  |  |  |  | warn $tagname if $self->{debug}; | 
| 252 |  |  |  |  |  |  | } | 
| 253 |  |  |  |  |  |  | } | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  |  | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | sub _parse_html_iis { | 
| 261 | 0 |  |  | 0 |  |  | my ($self, $tagname, $attr, $attrseq, $origtext) = @_; | 
| 262 |  |  |  |  |  |  |  | 
| 263 | 0 | 0 | 0 |  |  |  | if (not defined $tagname) { | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 264 | 0 | 0 |  |  |  |  | return unless $self->{parser_state}; | 
| 265 |  |  |  |  |  |  |  | 
| 266 | 0 | 0 |  |  |  |  | if ($self->{parser_state} == 2) { | 
| 267 | 0 | 0 |  |  |  |  | if ($origtext =~ /- (.+)$/) { | 
| 268 | 0 |  |  |  |  |  | $self->{directory} = $1; | 
| 269 |  |  |  |  |  |  | } | 
| 270 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 271 | 0 |  |  |  |  |  | return; | 
| 272 |  |  |  |  |  |  | } | 
| 273 | 0 | 0 |  |  |  |  | if ($origtext =~ /\s*(\w+),\s+(\w+)\s+(\d+),\s+(\d{4})\s+(\d{1,2}):(\d\d) (AM|PM)\s+([\d\.]+)/) { | 
| 274 | 0 |  |  |  |  |  | my $hour_of_day = $5; | 
| 275 | 0 | 0 | 0 |  |  |  | $hour_of_day = 0 if ($7 eq 'AM' && $hour_of_day eq 12); | 
| 276 | 0 | 0 | 0 |  |  |  | $hour_of_day += 12 if ($7 eq 'PM' && $hour_of_day ne 12); | 
| 277 | 0 |  |  |  |  |  | my $time = timelocal(0, $6, $hour_of_day, $3, $months->{$2}, $4-1900); | 
| 278 | 0 |  |  |  |  |  | $self->{current_file}->{time} = $time; | 
| 279 | 0 |  |  |  |  |  | $self->{current_file}->{size} = $8; | 
| 280 |  |  |  |  |  |  | } | 
| 281 |  |  |  |  |  |  | } elsif ($tagname eq 'title') { | 
| 282 | 0 |  |  |  |  |  | $self->{parser_state} = 2; | 
| 283 |  |  |  |  |  |  | } elsif ($tagname eq "pre") { | 
| 284 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 285 |  |  |  |  |  |  | } elsif ($tagname eq "br" && defined $self->{parser_state}) { | 
| 286 | 0 | 0 |  |  |  |  | if (defined $self->{current_file}) { | 
| 287 | 0 |  |  |  |  |  | my $entry = WWW::IndexParser::Entry->new; | 
| 288 | 0 | 0 |  |  |  |  | $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename}; | 
| 289 | 0 | 0 |  |  |  |  | $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time}; | 
| 290 | 0 | 0 |  |  |  |  | $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size}; | 
| 291 | 0 | 0 |  |  |  |  | $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units}; | 
| 292 | 0 |  |  |  |  |  | push @{$self->{files}}, $entry; | 
|  | 0 |  |  |  |  |  |  | 
| 293 | 0 |  |  |  |  |  | delete $self->{current_file}; | 
| 294 |  |  |  |  |  |  | } | 
| 295 |  |  |  |  |  |  | } elsif ($tagname eq "a" && defined $self->{parser_state}) { | 
| 296 | 0 | 0 |  |  |  |  | warn "  file name = " .  $attr->{href} if $self->{debug}; | 
| 297 | 0 | 0 |  |  |  |  | $self->{current_file}->{filename} = $attr->{href} if defined $attr->{href}; | 
| 298 |  |  |  |  |  |  | } | 
| 299 |  |  |  |  |  |  | } | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  |  | 
| 302 |  |  |  |  |  |  | sub _parse_html_lighttpd { | 
| 303 | 0 |  |  | 0 |  |  | my ($self, $tagname, $attr, $attrseq, $origtext) = @_; | 
| 304 |  |  |  |  |  |  |  | 
| 305 | 0 | 0 | 0 |  |  |  | if (not defined $tagname) { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 306 | 0 | 0 |  |  |  |  | return unless $self->{parser_state}; | 
| 307 |  |  |  |  |  |  |  | 
| 308 | 0 | 0 |  |  |  |  | if ($self->{parser_state} eq 'title') { | 
| 309 | 0 | 0 |  |  |  |  | warn "The title is: $origtext" if $self->{debug}; | 
| 310 | 0 | 0 |  |  |  |  | if ($origtext =~ m!^Index of (.+)/$!) { | 
| 311 | 0 |  |  |  |  |  | $self->{directory} = $1; | 
| 312 |  |  |  |  |  |  | } | 
| 313 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 314 | 0 |  |  |  |  |  | return; | 
| 315 |  |  |  |  |  |  | } | 
| 316 |  |  |  |  |  |  |  | 
| 317 | 0 | 0 |  |  |  |  | if ($self->{parser_state} eq 'time') { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 318 | 0 | 0 |  |  |  |  | if ($origtext =~ /^(\d{4})-(\w\w\w)-(\d\d) (\d\d):(\d\d):(\d\d)$/) { | 
| 319 | 0 |  |  |  |  |  | my $time = timelocal(0, $5, $4, $3, $months->{$2}, $1-1900); | 
| 320 | 0 |  |  |  |  |  | $self->{current_file}->{time} = $time; | 
| 321 |  |  |  |  |  |  | } | 
| 322 |  |  |  |  |  |  | } elsif ($self->{parser_state} eq 'size') { | 
| 323 | 0 | 0 |  |  |  |  | if ($origtext =~ /^([\d\.]+)(\w)?/) { | 
| 324 | 0 |  |  |  |  |  | $self->{current_file}->{size} = $1; | 
| 325 | 0 | 0 |  |  |  |  | $self->{current_file}->{size_units} = $2 if defined $2; | 
| 326 |  |  |  |  |  |  | } | 
| 327 |  |  |  |  |  |  | } elsif ($self->{parser_state} eq 'type') { | 
| 328 | 0 | 0 |  |  |  |  | if ($origtext =~ /^[\w\-\/]+$/) { | 
| 329 | 0 |  |  |  |  |  | $self->{current_file}->{type} = $origtext; | 
| 330 |  |  |  |  |  |  | } | 
| 331 |  |  |  |  |  |  | } | 
| 332 |  |  |  |  |  |  | } elsif ($tagname eq 'title') { | 
| 333 | 0 |  |  |  |  |  | $self->{parser_state} = 'title'; | 
| 334 |  |  |  |  |  |  | } elsif ($tagname eq "td") { | 
| 335 | 0 |  |  |  |  |  | my %class2state = (m => 'time', s => 'size', t => 'type'); | 
| 336 | 0 |  |  |  |  |  | my $class = $attr->{class}; | 
| 337 | 0 |  |  |  |  |  | my $state = $class2state{$class}; | 
| 338 | 0 | 0 |  |  |  |  | $self->{parser_state} = $state if $state; | 
| 339 |  |  |  |  |  |  | } elsif ($tagname eq 'tr') { | 
| 340 | 0 | 0 |  |  |  |  | if (defined $self->{current_file}) { | 
| 341 | 0 |  |  |  |  |  | my $entry = WWW::IndexParser::Entry->new; | 
| 342 | 0 | 0 |  |  |  |  | $entry->filename($self->{current_file}->{filename}) if defined $self->{current_file}->{filename}; | 
| 343 | 0 | 0 |  |  |  |  | $entry->time($self->{current_file}->{time}) if defined $self->{current_file}->{time}; | 
| 344 | 0 | 0 |  |  |  |  | $entry->type($self->{current_file}->{type}) if defined $self->{current_file}->{type}; | 
| 345 | 0 | 0 |  |  |  |  | $entry->size($self->{current_file}->{size}) if defined $self->{current_file}->{size}; | 
| 346 | 0 | 0 |  |  |  |  | $entry->size_units($self->{current_file}->{size_units}) if defined $self->{current_file}->{size_units}; | 
| 347 | 0 |  |  |  |  |  | push @{$self->{files}}, $entry; | 
|  | 0 |  |  |  |  |  |  | 
| 348 | 0 | 0 |  |  |  |  | warn "Added " . $self->{current_file}->{filename} if $self->{debug}; | 
| 349 | 0 |  |  |  |  |  | delete $self->{current_file}; | 
| 350 |  |  |  |  |  |  | } | 
| 351 | 0 | 0 |  |  |  |  | warn "Possible new file row" if $self->{debug}; | 
| 352 | 0 |  |  |  |  |  | $self->{parser_state} = 1; | 
| 353 |  |  |  |  |  |  | } elsif ($tagname eq "a" && defined $self->{parser_state}) { | 
| 354 | 0 | 0 |  |  |  |  | warn "  file name = " .  $attr->{href} if $self->{debug}; | 
| 355 | 0 | 0 |  |  |  |  | $self->{current_file}->{filename} = $attr->{href} if defined $attr->{href}; | 
| 356 |  |  |  |  |  |  | } else { | 
| 357 | 0 | 0 |  |  |  |  | warn $tagname if $self->{debug}; | 
| 358 |  |  |  |  |  |  | } | 
| 359 |  |  |  |  |  |  | } | 
| 360 |  |  |  |  |  |  |  | 
| 361 |  |  |  |  |  |  |  | 
| 362 |  |  |  |  |  |  |  | 
| 363 |  |  |  |  |  |  | =head1 NAME | 
| 364 |  |  |  |  |  |  |  | 
| 365 |  |  |  |  |  |  | WWW::IndexParser - Fetch and parse the directory index from a web server | 
| 366 |  |  |  |  |  |  |  | 
| 367 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 368 |  |  |  |  |  |  |  | 
| 369 |  |  |  |  |  |  | use WWW::IndexParser; | 
| 370 |  |  |  |  |  |  | my @files = WWW::IndexParser->new(url => 'http://www.example.com/dir/'); | 
| 371 |  |  |  |  |  |  | foreach my $entry (@files) { | 
| 372 |  |  |  |  |  |  | printf "%s %s\n", $entry->filename, | 
| 373 |  |  |  |  |  |  | scalar(localtime($entry->time)||''); | 
| 374 |  |  |  |  |  |  | } | 
| 375 |  |  |  |  |  |  |  | 
| 376 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 377 |  |  |  |  |  |  |  | 
| 378 |  |  |  |  |  |  |  | 
| 379 |  |  |  |  |  |  | B is a module that uses LWP to fetch a URL from a web | 
| 380 |  |  |  |  |  |  | server. It then atempts to parse this page as if it were an auto generated | 
| 381 |  |  |  |  |  |  | index page.  It returns an array of B objects, one | 
| 382 |  |  |  |  |  |  | per entry in the directory index that it has found. Each Entry has a | 
| 383 |  |  |  |  |  |  | set of methods: filename(), time(), size(), and others if supported | 
| 384 |  |  |  |  |  |  | by the autoindex generated: type() and size_units(). | 
| 385 |  |  |  |  |  |  |  | 
| 386 |  |  |  |  |  |  | =head1 CONSTRUCTOR | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | =over 4 | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | =item new ( url => $url, timeout => $seconds, proxy => $proxy_url, debug => 1  ) | 
| 391 |  |  |  |  |  |  |  | 
| 392 |  |  |  |  |  |  | When called with a URL to examine, this method does not return an object, | 
| 393 |  |  |  |  |  |  | but an array of WWW::IndexParser::Entry obects, one per entry in the | 
| 394 |  |  |  |  |  |  | directory listing that was accessed. | 
| 395 |  |  |  |  |  |  |  | 
| 396 |  |  |  |  |  |  | The options to this are: | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | =over 4 | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | =item url | 
| 401 |  |  |  |  |  |  |  | 
| 402 |  |  |  |  |  |  | The complete URL of the index to fetch. | 
| 403 |  |  |  |  |  |  |  | 
| 404 |  |  |  |  |  |  | =item timeout | 
| 405 |  |  |  |  |  |  |  | 
| 406 |  |  |  |  |  |  | The timeout for the request to fetch data, default 10 seconds. | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  | =item proxy | 
| 409 |  |  |  |  |  |  |  | 
| 410 |  |  |  |  |  |  | A proxy server URL, eg, 'http://proxy:3128/'. | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  | =item debug | 
| 413 |  |  |  |  |  |  |  | 
| 414 |  |  |  |  |  |  | Decide if to print parsing debug information. Set to 0 (the default) to | 
| 415 |  |  |  |  |  |  | disable, or anything non-false to print. Recommened you use a digit (ie, 1) | 
| 416 |  |  |  |  |  |  | as this may become a numeric 'level' of debug in the future. | 
| 417 |  |  |  |  |  |  |  | 
| 418 |  |  |  |  |  |  | =back | 
| 419 |  |  |  |  |  |  |  | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | =back | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  |  | 
| 424 |  |  |  |  |  |  | =head1 METHODS | 
| 425 |  |  |  |  |  |  |  | 
| 426 |  |  |  |  |  |  | All methods are private in this module. Pass only a URL to the constructor, | 
| 427 |  |  |  |  |  |  | and it does everything for you itself. | 
| 428 |  |  |  |  |  |  |  | 
| 429 |  |  |  |  |  |  | =head1 PREREQUISUTES | 
| 430 |  |  |  |  |  |  |  | 
| 431 |  |  |  |  |  |  | This modile depends upon C, C, C. | 
| 432 |  |  |  |  |  |  |  | 
| 433 |  |  |  |  |  |  |  | 
| 434 |  |  |  |  |  |  | =head1 OSNAMES | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  | any | 
| 437 |  |  |  |  |  |  |  | 
| 438 |  |  |  |  |  |  | =head1 BUGS | 
| 439 |  |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | Currently only supports Apache, IIS and Tomcat style auto indexes. Send suggestions for new Auto-Indexes to support to the author (along with sample HTML)! | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  | =head1 AUTHOR | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | James Bromberger Ejames@rcpt.toE | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | =head1 COPYRIGHT | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | Copyright (c) 2006 James Bromberger. All rights reserved. All rights | 
| 449 |  |  |  |  |  |  | reserved. This program is free software; you can redistribute it and/or | 
| 450 |  |  |  |  |  |  | modify it under the same terms as Perl itself. | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | =cut | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | 1; |