| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package App::scrape; | 
| 2 | 4 |  |  | 4 |  | 254972 | use strict; | 
|  | 4 |  |  |  |  | 45 |  | 
|  | 4 |  |  |  |  | 123 |  | 
| 3 | 4 |  |  | 4 |  | 2181 | use URI; | 
|  | 4 |  |  |  |  | 26795 |  | 
|  | 4 |  |  |  |  | 138 |  | 
| 4 | 4 |  |  | 4 |  | 2367 | use HTML::TreeBuilder::XPath; | 
|  | 4 |  |  |  |  | 241761 |  | 
|  | 4 |  |  |  |  | 37 |  | 
| 5 | 4 |  |  | 4 |  | 2210 | use HTML::Selector::XPath 'selector_to_xpath'; | 
|  | 4 |  |  |  |  | 9110 |  | 
|  | 4 |  |  |  |  | 264 |  | 
| 6 | 4 |  |  | 4 |  | 31 | use Exporter 'import'; | 
|  | 4 |  |  |  |  | 9 |  | 
|  | 4 |  |  |  |  | 2246 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | our $VERSION = '0.07'; | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | our @EXPORT_OK = qw; | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | =head1 NAME | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | App::scrape - simple HTML scraping | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | =head1 ABSTRACT | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | This is a simple module to extract data from HTML by | 
| 19 |  |  |  |  |  |  | specifying CSS3 or XPath selectors. | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | use App::scrape 'scrape'; | 
| 24 |  |  |  |  |  |  | use LWP::Simple 'get'; | 
| 25 |  |  |  |  |  |  | use Data::Dumper; | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | my $html = get('http://perlmonks.org'); | 
| 28 |  |  |  |  |  |  | my @posts = scrape( | 
| 29 |  |  |  |  |  |  | $html, | 
| 30 |  |  |  |  |  |  | ['a','a@href'], | 
| 31 |  |  |  |  |  |  | { | 
| 32 |  |  |  |  |  |  | absolute => [qw[href src rel]], | 
| 33 |  |  |  |  |  |  | base => 'http://perlmonks.org', | 
| 34 |  |  |  |  |  |  | }, | 
| 35 |  |  |  |  |  |  | ); | 
| 36 |  |  |  |  |  |  | print Dumper \@posts; | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | my @posts = scrape( | 
| 39 |  |  |  |  |  |  | $html, | 
| 40 |  |  |  |  |  |  | { | 
| 41 |  |  |  |  |  |  | title => 'a', | 
| 42 |  |  |  |  |  |  | url   => 'a@href', | 
| 43 |  |  |  |  |  |  | }, | 
| 44 |  |  |  |  |  |  | { | 
| 45 |  |  |  |  |  |  | absolute => [qw[href src rel]], | 
| 46 |  |  |  |  |  |  | base => 'http://perlmonks.org', | 
| 47 |  |  |  |  |  |  | }, | 
| 48 |  |  |  |  |  |  | ); | 
| 49 |  |  |  |  |  |  | print Dumper \@posts; | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | This module implements yet another scraping engine | 
| 54 |  |  |  |  |  |  | to extract data from HTML. | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | This engine does not (yet) support nested data | 
| 57 |  |  |  |  |  |  | structures. For an engine that supports nesting, see | 
| 58 |  |  |  |  |  |  | L. | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | =cut | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  | sub scrape { | 
| 63 | 6 |  |  | 6 | 0 | 8083 | my ($html, $selectors, $options) = @_; | 
| 64 |  |  |  |  |  |  |  | 
| 65 | 6 |  | 50 |  |  | 19 | $options ||= {}; | 
| 66 | 6 |  |  |  |  | 9 | my $delete_tree; | 
| 67 | 6 | 100 |  |  |  | 18 | if (! ref $options->{tree}) { | 
| 68 | 4 |  |  |  |  | 38 | $options->{tree} = HTML::TreeBuilder::XPath->new; | 
| 69 | 4 |  |  |  |  | 1039 | $options->{tree}->parse($html); | 
| 70 | 4 |  |  |  |  | 5768 | $options->{tree}->eof; | 
| 71 | 4 |  |  |  |  | 560 | $delete_tree = 1; | 
| 72 |  |  |  |  |  |  | }; | 
| 73 | 6 |  |  |  |  | 14 | my $tree = $options->{tree}; | 
| 74 |  |  |  |  |  |  |  | 
| 75 | 6 |  | 50 |  |  | 38 | $options->{make_uri} ||= {}; | 
| 76 | 6 |  |  |  |  | 7 | my %make_uri = %{$options->{make_uri}}; | 
|  | 6 |  |  |  |  | 17 |  | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | # now fetch all "rows" from the page. We do this once to avoid | 
| 79 |  |  |  |  |  |  | # fetching a page multiple times | 
| 80 | 6 |  |  |  |  | 10 | my @rows; | 
| 81 |  |  |  |  |  |  |  | 
| 82 | 6 |  |  |  |  | 18 | my %known_uri = ( | 
| 83 |  |  |  |  |  |  | 'href' => 1, # a@href | 
| 84 |  |  |  |  |  |  | 'src'  => 1, # img@src , script@src | 
| 85 |  |  |  |  |  |  | ); | 
| 86 |  |  |  |  |  |  |  | 
| 87 | 6 |  |  |  |  | 8 | my @selectors; | 
| 88 | 6 | 100 |  |  |  | 17 | if (ref $selectors eq 'ARRAY') { | 
| 89 | 4 |  |  |  |  | 10 | @selectors = @$selectors | 
| 90 |  |  |  |  |  |  | } else { | 
| 91 | 2 |  |  |  |  | 8 | @selectors = map { $selectors->{ $_ } } sort keys %$selectors; | 
|  | 2 |  |  |  |  | 4 |  | 
| 92 |  |  |  |  |  |  | }; | 
| 93 |  |  |  |  |  |  |  | 
| 94 | 6 |  |  |  |  | 12 | my $rowidx=0; | 
| 95 | 6 |  |  |  |  | 8 | my $found_max = 0; | 
| 96 | 6 |  |  |  |  | 12 | for my $selector (@selectors) { | 
| 97 | 6 |  |  |  |  | 28 | my ($attr); | 
| 98 | 6 |  |  |  |  | 10 | my $s = $selector; | 
| 99 | 6 | 100 |  |  |  | 30 | if ($selector =~ s!/?\@(\w+)$!!) { | 
| 100 | 3 |  |  |  |  | 9 | $attr = $1; | 
| 101 |  |  |  |  |  |  | }; | 
| 102 | 6 | 100 |  |  |  | 23 | if ($selector !~ m!^\.?/!) { | 
| 103 | 5 |  |  |  |  | 22 | $selector = selector_to_xpath( $selector ); | 
| 104 |  |  |  |  |  |  | }; | 
| 105 |  |  |  |  |  |  | # We always make the selector relative to the current node: | 
| 106 | 6 | 100 |  |  |  | 556 | $selector = ".$selector" unless $selector =~ /^\./; | 
| 107 | 6 |  |  |  |  | 12 | my @nodes; | 
| 108 | 6 | 100 |  |  |  | 17 | if (! defined $attr) { | 
| 109 | 3 |  |  |  |  | 18 | @nodes = map { $_->as_trimmed_text } $tree->findnodes($selector); | 
|  | 10 |  |  |  |  | 5965 |  | 
| 110 |  |  |  |  |  |  | } else { | 
| 111 | 3 |  | 33 |  |  | 44 | $make_uri{ $rowidx } ||= (($known_uri{ lc $attr }) and ! $options->{no_known_uri}); | 
|  |  |  | 33 |  |  |  |  | 
| 112 | 3 |  |  |  |  | 27 | @nodes = $tree->findvalues("$selector/\@$attr"); | 
| 113 |  |  |  |  |  |  | }; | 
| 114 | 6 | 100 |  |  |  | 5651 | if ($make_uri{ $rowidx }) { | 
| 115 | 3 |  |  |  |  | 9 | @nodes = map { URI->new_abs( $_, $options->{base} )->as_string } @nodes; | 
|  | 7 |  |  |  |  | 15357 |  | 
| 116 |  |  |  |  |  |  | }; | 
| 117 | 6 | 100 |  |  |  | 7640 | if( $found_max < @nodes) { | 
| 118 | 5 |  |  |  |  | 9 | $found_max = @nodes | 
| 119 |  |  |  |  |  |  | }; | 
| 120 | 6 |  |  |  |  | 18 | push @rows, \@nodes; | 
| 121 | 6 |  |  |  |  | 17 | $rowidx++; | 
| 122 |  |  |  |  |  |  | }; | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | # Now convert the result from rows to columns | 
| 125 | 6 |  |  |  |  | 31 | my @result; | 
| 126 | 6 |  |  |  |  | 27 | for my $idx (0.. $found_max-1) { | 
| 127 |  |  |  |  |  |  | push @result, [ map { | 
| 128 | 14 |  |  |  |  | 30 | $rows[$_]->[$idx] | 
|  | 18 |  |  |  |  | 44 |  | 
| 129 |  |  |  |  |  |  | } 0..$#rows ]; | 
| 130 |  |  |  |  |  |  | }; | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | # Now check what the user wants, array or hash: | 
| 133 | 6 | 100 |  |  |  | 39 | if( ref $selectors eq 'HASH') { | 
| 134 |  |  |  |  |  |  | @result = map { | 
| 135 | 2 |  |  |  |  | 6 | my $arr = $_; | 
|  | 4 |  |  |  |  | 5 |  | 
| 136 | 4 |  |  |  |  | 6 | my $i = 0; | 
| 137 | 4 |  |  |  |  | 12 | my @keys = sort { $a cmp $b } keys( %$selectors ); | 
|  | 4 |  |  |  |  | 10 |  | 
| 138 |  |  |  |  |  |  | $_ = +{ | 
| 139 | 4 |  |  |  |  | 7 | map { $_ => $arr->[$i++] } @keys | 
|  | 8 |  |  |  |  | 16 |  | 
| 140 |  |  |  |  |  |  | }; | 
| 141 |  |  |  |  |  |  | $_->{ $options->{url_field}} = $options->{base} | 
| 142 | 4 | 50 |  |  |  | 9 | if( $options->{url_field} ); | 
| 143 | 4 |  |  |  |  | 9 | $_ | 
| 144 |  |  |  |  |  |  | } @result | 
| 145 |  |  |  |  |  |  | } else { | 
| 146 | 4 | 50 |  |  |  | 15 | if( $options->{url_field} ) { | 
| 147 | 0 |  |  |  |  | 0 | for my $row (@result) { | 
| 148 | 0 |  |  |  |  | 0 | unshift @$row, $options->{base}; | 
| 149 |  |  |  |  |  |  | }; | 
| 150 |  |  |  |  |  |  | }; | 
| 151 |  |  |  |  |  |  | }; | 
| 152 |  |  |  |  |  |  |  | 
| 153 | 6 | 100 |  |  |  | 34 | $tree->delete | 
| 154 |  |  |  |  |  |  | if $delete_tree; | 
| 155 |  |  |  |  |  |  | @result | 
| 156 | 6 |  |  |  |  | 459 | }; | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 159 |  |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | L - the scraper inspiring this module | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  | =head1 REPOSITORY | 
| 163 |  |  |  |  |  |  |  | 
| 164 |  |  |  |  |  |  | The public repository of this module is | 
| 165 |  |  |  |  |  |  | L. | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | =head1 SUPPORT | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | The public support forum of this program is | 
| 170 |  |  |  |  |  |  | L. | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | =head1 AUTHOR | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | Max Maischein C | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | =head1 COPYRIGHT (c) | 
| 177 |  |  |  |  |  |  |  | 
| 178 |  |  |  |  |  |  | Copyright 2011-2011 by Max Maischein C. | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | =head1 LICENSE | 
| 181 |  |  |  |  |  |  |  | 
| 182 |  |  |  |  |  |  | This module is released under the same terms as Perl itself. | 
| 183 |  |  |  |  |  |  |  | 
| 184 |  |  |  |  |  |  | =cut |