line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: /mirror/perl/Web-Scraper-Config/trunk/lib/Web/Scraper/Config.pm 7145 2007-05-09T16:36:57.901467Z daisuke $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp> |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
package Web::Scraper::Config; |
6
|
1
|
|
|
1
|
|
62368
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
7
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
44
|
|
8
|
1
|
|
|
1
|
|
624
|
use Config::Any; |
|
1
|
|
|
|
|
10514
|
|
|
1
|
|
|
|
|
51
|
|
9
|
1
|
|
|
1
|
|
688
|
use Data::Visitor::Callback; |
|
1
|
|
|
|
|
420502
|
|
|
1
|
|
|
|
|
39
|
|
10
|
1
|
|
|
1
|
|
618
|
use Web::Scraper; |
|
1
|
|
|
|
|
54545
|
|
|
1
|
|
|
|
|
6
|
|
11
|
1
|
|
|
1
|
|
478
|
use URI; |
|
1
|
|
|
|
|
3237
|
|
|
1
|
|
|
|
|
10
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
sub new |
15
|
|
|
|
|
|
|
{ |
16
|
0
|
|
|
0
|
1
|
|
my $class = shift; |
17
|
0
|
|
|
|
|
|
my $self = bless {}, $class; |
18
|
0
|
|
|
|
|
|
my $config = shift; |
19
|
0
|
|
|
|
|
|
my $opts = shift; |
20
|
0
|
|
|
|
|
|
$config = $self->_load_config($config); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
{ # BROKEN YAML::Syck |
23
|
0
|
|
|
|
|
|
my $v = Data::Visitor::Callback->new( |
24
|
0
|
|
|
0
|
|
|
plain_value => sub { $_[1] =~ s/(\w:) /$1/g; $_[1] } |
|
0
|
|
|
|
|
|
|
25
|
0
|
|
|
|
|
|
); |
26
|
0
|
|
|
|
|
|
$config = $v->visit($config); |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
0
|
|
|
|
|
|
$self->{config} = $config; |
30
|
0
|
0
|
0
|
|
|
|
$self->{callbacks} = $opts->{callbacks} if $opts && $opts->{callbacks}; |
31
|
0
|
|
|
|
|
|
return $self; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub _load_config |
35
|
|
|
|
|
|
|
{ |
36
|
0
|
|
|
0
|
|
|
my $self = shift; |
37
|
0
|
|
|
|
|
|
my $file = shift; |
38
|
|
|
|
|
|
|
|
39
|
0
|
0
|
|
|
|
|
if (ref $file eq 'HASH') { |
40
|
0
|
|
|
|
|
|
return $file; |
41
|
|
|
|
|
|
|
} else { |
42
|
|
|
|
|
|
|
# This is a bit hackish, but we're only loading one file, so |
43
|
|
|
|
|
|
|
# we should be okay |
44
|
0
|
|
|
|
|
|
my $list = Config::Any->load_files({ files => [ $file ]}); |
45
|
0
|
0
|
|
|
|
|
if (! @$list ) { |
46
|
0
|
|
|
|
|
|
require Carp; |
47
|
0
|
|
|
|
|
|
Carp::croak("Could not load config file $file: $@"); |
48
|
|
|
|
|
|
|
} |
49
|
0
|
|
|
|
|
|
return (values %{$list->[0]})[0]; |
|
0
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub scrape |
54
|
|
|
|
|
|
|
{ |
55
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
56
|
0
|
|
|
|
|
|
my $config = $self->{config}; |
57
|
0
|
|
|
|
|
|
my $scraper = $self->_recurse($config)->(); |
58
|
0
|
|
|
|
|
|
return $scraper->scrape(@_); |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub _recurse |
62
|
|
|
|
|
|
|
{ |
63
|
0
|
|
|
0
|
|
|
my ($self, $rules) = @_; |
64
|
|
|
|
|
|
|
|
65
|
0
|
|
|
|
|
|
my $ref = ref($rules); |
66
|
0
|
|
|
|
|
|
my $ret; |
67
|
0
|
0
|
|
|
|
|
if (! $ref) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
68
|
0
|
0
|
|
|
|
|
if ($rules =~ /^__callback\(([^\)]+)\)__$/) { |
69
|
0
|
|
|
|
|
|
$rules = $self->{callbacks}{$1}; |
70
|
|
|
|
|
|
|
} |
71
|
0
|
|
|
|
|
|
$ret = $rules; |
72
|
|
|
|
|
|
|
} elsif ($ref eq 'ARRAY') { |
73
|
0
|
|
|
|
|
|
my @elements; |
74
|
0
|
|
|
|
|
|
foreach my $rule (@$rules) { |
75
|
0
|
0
|
|
|
|
|
if ($rule =~ /^__callback\(([^\)]+)\)__$/) { |
76
|
0
|
|
|
|
|
|
$rule = $self->{callbacks}{$1}; |
77
|
0
|
|
|
0
|
|
|
push @elements, sub { sub { $rule->(@_) } }; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
} else { |
79
|
0
|
0
|
|
|
|
|
push @elements, ref $rule ? $self->_recurse($rule) : $rule; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
|
83
|
0
|
0
|
|
|
|
|
if (! grep { my $ref = ref($_); $ref ? $ref ne 'CODE' : 1 } @elements) { |
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
$ret = sub { |
85
|
0
|
|
|
0
|
|
|
foreach my $code (@elements) { |
86
|
0
|
|
|
|
|
|
$code->() |
87
|
|
|
|
|
|
|
} |
88
|
0
|
|
|
|
|
|
}; |
89
|
|
|
|
|
|
|
} else { |
90
|
0
|
|
|
|
|
|
$ret = \@elements; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
} elsif ($ref eq 'HASH'){ |
93
|
0
|
|
|
|
|
|
my($op) = keys %$rules; |
94
|
0
|
|
|
|
|
|
my $h = $self->_recurse($rules->{$op}); |
95
|
0
|
|
|
|
|
|
my $is_func = ($op =~ /^(?:scraper|process(?:_first)?|result)$/); |
96
|
|
|
|
|
|
|
|
97
|
0
|
0
|
|
|
|
|
if ($is_func) { |
98
|
0
|
0
|
|
|
|
|
my @args = (ref $h eq 'ARRAY') ? @$h : ($h); |
99
|
0
|
0
|
|
|
|
|
if ($op eq 'scraper') { |
100
|
|
|
|
|
|
|
$ret = sub { |
101
|
0
|
|
|
|
|
|
scraper(sub { for (@args) { $_->() } }) |
|
0
|
|
|
|
|
|
|
102
|
0
|
|
|
0
|
|
|
}; |
|
0
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
} else { |
104
|
|
|
|
|
|
|
$ret = sub { |
105
|
|
|
|
|
|
|
my $code = sub { |
106
|
0
|
0
|
|
|
|
|
@_ = map { (ref $_ eq 'CODE') ? $_->() : $_ }@args; |
|
0
|
|
|
|
|
|
|
107
|
0
|
|
|
|
|
|
goto &$op; |
108
|
0
|
|
|
0
|
|
|
}; |
109
|
0
|
|
|
|
|
|
$code->() |
110
|
0
|
|
|
|
|
|
}; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
} else { |
113
|
0
|
|
|
|
|
|
$ret = { $op => $h }; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
} else { |
116
|
0
|
|
|
|
|
|
require Data::Dumper; |
117
|
0
|
|
|
|
|
|
die "Web::Scraper::Config does not know how to parse: " . Data::Dumper::Dumper($rules); |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
0
|
|
|
|
|
|
return $ret; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
1; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
__END__ |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head1 NAME |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Web::Scraper::Config - Run Web::Scraper From Config Files |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head1 SYNOPSIS |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
--- |
134
|
|
|
|
|
|
|
scraper: |
135
|
|
|
|
|
|
|
- process: |
136
|
|
|
|
|
|
|
- td>ul>li |
137
|
|
|
|
|
|
|
- trailers[] |
138
|
|
|
|
|
|
|
- scraper: |
139
|
|
|
|
|
|
|
- process_first: |
140
|
|
|
|
|
|
|
- li>b |
141
|
|
|
|
|
|
|
- title |
142
|
|
|
|
|
|
|
- TEXT |
143
|
|
|
|
|
|
|
- process_first: |
144
|
|
|
|
|
|
|
- ul>li>a[href] |
145
|
|
|
|
|
|
|
- url |
146
|
|
|
|
|
|
|
- @href |
147
|
|
|
|
|
|
|
- process: |
148
|
|
|
|
|
|
|
- ul>li>ul>li>a |
149
|
|
|
|
|
|
|
- movies[] |
150
|
|
|
|
|
|
|
- __callback(process_movie)__ |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
my $scraper = Web::Scraper::Config->new( |
154
|
|
|
|
|
|
|
$config, |
155
|
|
|
|
|
|
|
{ |
156
|
|
|
|
|
|
|
callbacks => { |
157
|
|
|
|
|
|
|
process_movie => sub { |
158
|
|
|
|
|
|
|
my $elem = shift; |
159
|
|
|
|
|
|
|
return { |
160
|
|
|
|
|
|
|
text => $elem->as_text, |
161
|
|
|
|
|
|
|
href => $elem->attr('href') |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
); |
167
|
|
|
|
|
|
|
$scraper->scrape($uri); |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head1 DESCRIPTION |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Web::Scraper::Config allows you to harness the power of Web::Scraper from |
172
|
|
|
|
|
|
|
a config file. |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
The config files can be written in any format that Config::Any understands, |
175
|
|
|
|
|
|
|
as long as it conforms to this module's rules. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 METHODS |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head2 new |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Creates a new Web::Scraper::Config instance. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
The first arguments is either a hashref that represents a config, or a |
184
|
|
|
|
|
|
|
filename to the config. The config file can be in any format that Config::Any |
185
|
|
|
|
|
|
|
understands as long as it returns a hash that's conformant to the |
186
|
|
|
|
|
|
|
Web::Scraper::Config rules. |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
The second argument (options) is optional, and is currently only used to |
189
|
|
|
|
|
|
|
provider callbacks to be called from the scraper. When Web::Scraper::Config |
190
|
|
|
|
|
|
|
encounters an element in the form of: |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
__callback(function_name)__ |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
then that is replaced by the corresponding callback specified in the |
195
|
|
|
|
|
|
|
options hash. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 scrape |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Starts scraping. The semantics are exactly the same as Web::Scraper::scrape |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=head1 AUTHOR |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Daisuke Maki E<lt>daisuke@endeworks.jpE<gt> |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head1 LICENSE |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
208
|
|
|
|
|
|
|
under the same terms as Perl itself. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
See http://www.perl.com/perl/misc/Artistic.html |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=cut |