line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Scraper; |
2
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
28220
|
use strict; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
106
|
|
4
|
4
|
|
|
4
|
|
17
|
use Carp; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
9645
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=pod |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 NAME |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Text::Scraper - Structured data from (un)structured text |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 SYNOPSIS |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
use Text::Scraper; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
use LWP::Simple; |
19
|
|
|
|
|
|
|
use Data::Dumper; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# |
22
|
|
|
|
|
|
|
# 1. Get our template and source text |
23
|
|
|
|
|
|
|
# |
24
|
|
|
|
|
|
|
my $tmpl = Text::Scraper->slurp(\*DATA); |
25
|
|
|
|
|
|
|
my $src = get('http://search.cpan.org/recent') || die $!; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
# 2. Extract data from source |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
my $obj = Text::Scraper->new(tmpl => $tmpl); |
31
|
|
|
|
|
|
|
my $data = $obj->scrape($src); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# |
34
|
|
|
|
|
|
|
# 3. Do something really neat...(left as excercise) |
35
|
|
|
|
|
|
|
# |
36
|
|
|
|
|
|
|
print "Newest Submission: ", $data->[0]{submissions}[0]{name}, "\n\n"; |
37
|
|
|
|
|
|
|
print "Scraper model:\n", Dumper($obj), "\n\n"; |
38
|
|
|
|
|
|
|
print "Parsed model:\n", Dumper($data) , "\n\n"; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
__DATA__ |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
-- |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head1 ABSTRACT |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Text::Scraper provides a fully functional base-class to quickly develop |
61
|
|
|
|
|
|
|
I and other text extraction tools. Programmatically |
62
|
|
|
|
|
|
|
generated text such as dynamic webpages are trivially reversed engineered. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Using templates, the programmer is freed from staring at fragile, heavily |
65
|
|
|
|
|
|
|
escaped regular expressions, mapping capture groups to named variables or |
66
|
|
|
|
|
|
|
wrestling with the DOM and badly formed HTML. In addition, extracted data |
67
|
|
|
|
|
|
|
can be hierarchical, which is beyond the capabilities of vanilla regular |
68
|
|
|
|
|
|
|
expressions. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Text::Scraper's functionality overlaps some existing CPAN modules - |
71
|
|
|
|
|
|
|
L and L. |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Text::Scraper is much more lightweight than either and has a |
74
|
|
|
|
|
|
|
more general application domain than the latter. It has no dependencies on |
75
|
|
|
|
|
|
|
other frameworks, modules or design-decisions. On average, Text::Scraper |
76
|
|
|
|
|
|
|
benchmarks around I<250% faster> than Template::Extract - and uses |
77
|
|
|
|
|
|
|
significantly less memory. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Unlike both existing modules, Text::Scraper generalizes its functionality |
80
|
|
|
|
|
|
|
to allow the programmer to refine template capture groups beyond C<(.*?)>, |
81
|
|
|
|
|
|
|
fully redefine the template syntax and introduce new template constructs |
82
|
|
|
|
|
|
|
bound to custom classes. |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head1 BACKGROUND |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Using templates is a popular method of seperating visual presentation from |
87
|
|
|
|
|
|
|
programming logic - particularly popular in programs generating dynamic webpages. |
88
|
|
|
|
|
|
|
Text::Scraper reverses this process, using templates to I the data |
89
|
|
|
|
|
|
|
back out of the surrounding presentation. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
If you are familiar with templating concepts, then the L should be sufficient |
92
|
|
|
|
|
|
|
to get you started. If not, I would recommend reading the documentation for |
93
|
|
|
|
|
|
|
L - a module thats syntax and terminology is very |
94
|
|
|
|
|
|
|
similar to Text::Scraper's. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head1 DESCRIPTION |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Template Tags are classed as I or I. Like XML, Branches must |
99
|
|
|
|
|
|
|
have an associated closing tag, Leaves must not. By default, Leaf nodes return |
100
|
|
|
|
|
|
|
SCALARs and Branch nodes return ARRAYs of HASHes - each array element mapping |
101
|
|
|
|
|
|
|
to a matched sub-sequence. Blessing or filtering this data is left as an |
102
|
|
|
|
|
|
|
exercise for subclasses. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
The default syntax is based on the XML preprocessor syntax: |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
and for Branches: |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
... |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
By default, Tags I be named and any closing tag I include the name of the |
115
|
|
|
|
|
|
|
opening tag it is closing. Attributes have the same syntax as XML attributes - |
116
|
|
|
|
|
|
|
but (similar to Perl regular expressions) can use any non-bracket punctuation character |
117
|
|
|
|
|
|
|
as quotation delimiters: |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
The only attribute acted on by the default tag classes is C - used to refine how |
122
|
|
|
|
|
|
|
the Tag is translated into a regular-expression capture group: |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
This can be used to further filter the parsed data - similar to using grep: |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Each tag should create I capture group - but it is fine to make the outer |
131
|
|
|
|
|
|
|
group non-capturing: |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
I C<02 July 1979>. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head2 Default Tags |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
The default tags provided by Text::Scraper are typical for basic scraping but can be |
140
|
|
|
|
|
|
|
subclassed for additional functionality. All the default tags are demonstrated in the |
141
|
|
|
|
|
|
|
L: |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=over 4 |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item B |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
Vars represent strings of text in a template. They are instances of |
148
|
|
|
|
|
|
|
C. |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item B |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Stuff tags represent spans of text that are of no interest in the |
153
|
|
|
|
|
|
|
extracted data, but can ease parsing in certain situations. They are instances |
154
|
|
|
|
|
|
|
of C - a subclass of C. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item B |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
Loops represent repeated information in a template and are extracted as an |
159
|
|
|
|
|
|
|
array of hashes. They are instances of C. |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=item B |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
A conditional region in the template. If not present, the parent scope |
164
|
|
|
|
|
|
|
will contain a false value under the tags name. Otherwise the value will be true |
165
|
|
|
|
|
|
|
and any tags inside the if's scope will be exported to its parent scope also. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
These are instances of C. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=back |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head1 User API |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
These methods alone are sufficient for a basic scraping session: |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=cut |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
my $null = bless \$0, "NULL"; |
178
|
|
|
|
|
|
|
my %protos = (); |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub TRACE () {0;} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=pod |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head2 C<< my $string = Text::Scraper->slurp( STRING|GLOBREF ) >> |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
Static utility method to return either a filename or filehandle as a string |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=cut |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
sub slurp |
192
|
|
|
|
|
|
|
{ |
193
|
8
|
|
|
8
|
1
|
820
|
my $class = shift; |
194
|
8
|
|
|
|
|
15
|
my $file = shift; |
195
|
8
|
|
|
|
|
12
|
my $data = undef; |
196
|
8
|
|
|
|
|
25
|
local $/ = undef; |
197
|
|
|
|
|
|
|
|
198
|
8
|
100
|
|
|
|
36
|
if(!ref $file){ |
|
|
50
|
|
|
|
|
|
199
|
4
|
50
|
|
|
|
177
|
open my $f, $file or Carp::croak("$class\::slurp: '$file' $!"); |
200
|
4
|
|
|
|
|
500
|
$data = <$f>; |
201
|
4
|
|
|
|
|
44
|
close $f; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
elsif(ref $file eq 'GLOB'){ |
204
|
4
|
|
|
|
|
93
|
$data = <$file>; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
else{ |
207
|
0
|
|
|
|
|
0
|
Carp::croak("$class\::slurp: bad argument '$file'\n"); |
208
|
|
|
|
|
|
|
} |
209
|
8
|
|
|
|
|
36
|
return $data; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=pod |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=head2 C<< my $object = Text::Scraper->new(HASH) >> |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
Returns a new Text::Scraper object. Optional parameters are: |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=over 4 |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item B |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
A template as a string |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=item B |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
A Text::Scraper::Syntax instance. See L. |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=back |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=cut |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub new |
233
|
|
|
|
|
|
|
{ |
234
|
60
|
|
|
60
|
1
|
160
|
my $proto = shift; |
235
|
60
|
|
66
|
|
|
182
|
my $class = ref($proto) || $proto; |
236
|
60
|
50
|
|
|
|
137
|
Carp::croak("Bad key/value arguments to $class::new") if @_ % 2; |
237
|
60
|
|
|
|
|
355
|
my $self = bless {@_}, $class; |
238
|
|
|
|
|
|
|
|
239
|
60
|
100
|
|
|
|
200
|
$protos{$self} = $proto |
240
|
|
|
|
|
|
|
unless $proto eq $class; |
241
|
|
|
|
|
|
|
|
242
|
60
|
100
|
|
|
|
281
|
$self->parse_attr(delete $self->{attributes}) |
243
|
|
|
|
|
|
|
if exists $self->{attributes}; |
244
|
60
|
100
|
|
|
|
188
|
$self->compile(delete $self->{tmpl}) |
245
|
|
|
|
|
|
|
if $self->{tmpl}; |
246
|
|
|
|
|
|
|
|
247
|
60
|
|
|
|
|
152
|
$self->on_create(); |
248
|
60
|
|
|
|
|
154
|
return $self; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
sub DESTROY |
252
|
|
|
|
|
|
|
{ |
253
|
60
|
|
|
60
|
|
2667
|
my $self = shift; |
254
|
60
|
|
|
|
|
135
|
$self->on_destroy(); |
255
|
60
|
|
|
|
|
83
|
delete $protos{$self}; |
256
|
60
|
|
|
|
|
677
|
return; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=pod |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head2 C<< $obj->compile(STRING) >> |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Only required for recompilation or if no B parameter is passed to the constructor. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=cut |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
sub compile |
268
|
|
|
|
|
|
|
{ |
269
|
8
|
|
|
8
|
1
|
14
|
my $self = shift; |
270
|
8
|
|
|
|
|
11
|
my $tmpl = shift; |
271
|
8
|
|
66
|
|
|
44
|
my $syntax = $self->{syntax} || Text::Scraper::Syntax->new(); |
272
|
|
|
|
|
|
|
|
273
|
8
|
50
|
33
|
|
|
52
|
if($tmpl && $syntax) |
274
|
|
|
|
|
|
|
{ |
275
|
8
|
|
|
|
|
15
|
$self->{tmpl} = $tmpl; |
276
|
8
|
|
|
|
|
12
|
$self->{syntax}= $syntax; |
277
|
8
|
|
|
|
|
17
|
$self->{nodes} = []; |
278
|
|
|
|
|
|
|
|
279
|
8
|
|
|
|
|
16
|
my $rex_leaf = $syntax->{regex}{leaf}; |
280
|
8
|
|
|
|
|
16
|
my $rex_open = $syntax->{regex}{open}; |
281
|
8
|
|
|
|
|
13
|
my $rex_close = $syntax->{regex}{close}; |
282
|
8
|
|
|
|
|
13
|
my $rex_escape = $syntax->{regex}{escape}; |
283
|
|
|
|
|
|
|
|
284
|
8
|
|
|
|
|
7187
|
1 while $tmpl =~ s#$rex_open(?!=$rex_open.*?$rex_close)(.*?)$rex_close#$self->de_branch($1,$2,$3,$7)#sge; |
|
4
|
|
|
|
|
28
|
|
285
|
8
|
|
|
|
|
169
|
1 while $tmpl =~ s#$rex_leaf#$self->de_leaf($1,$2,$3)#sge; |
|
38
|
|
|
|
|
111
|
|
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# TODO: Can this third substitution on escape sequences be removed? |
288
|
|
|
|
|
|
|
# May requires double escape on all above regex...slower? |
289
|
|
|
|
|
|
|
|
290
|
8
|
|
|
|
|
28
|
$tmpl = $syntax->quote($tmpl); |
291
|
8
|
|
|
|
|
92
|
$tmpl =~ s/$rex_escape/$self->{nodes}[$1]->to_regex()/esg; |
|
38
|
|
|
|
|
144
|
|
292
|
|
|
|
|
|
|
|
293
|
8
|
|
|
|
|
24
|
$self->{compiled} = $tmpl; |
294
|
8
|
|
|
|
|
11
|
$self->{nodes} = [ grep { $_ != $null } @{$self->{nodes}} ]; |
|
42
|
|
|
|
|
108
|
|
|
8
|
|
|
|
|
19
|
|
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# |
299
|
|
|
|
|
|
|
# Compile scopes and replace with internal leafs |
300
|
|
|
|
|
|
|
# |
301
|
|
|
|
|
|
|
sub de_branch |
302
|
|
|
|
|
|
|
{ |
303
|
4
|
|
|
4
|
0
|
20
|
my($self, $type, $name, $args, $body) = @_; |
304
|
4
|
|
|
|
|
9
|
my $nodes = $self->{nodes}; |
305
|
4
|
|
|
|
|
8
|
my $idx = scalar @$nodes; |
306
|
4
|
|
|
|
|
8
|
my $types = $self->{syntax}{branches}; |
307
|
4
|
50
|
|
|
|
32
|
Carp::croak("Invalid branch-type '$type'") |
308
|
|
|
|
|
|
|
unless $types->{$type}; |
309
|
4
|
|
|
|
|
52
|
my $node = $types->{$type}->new(tmpl => $body, syntax => $self->{syntax}, type => $type, class => $types->{$type}, name => $name, attributes => $args); |
310
|
4
|
|
|
|
|
10
|
push @$nodes, $node; |
311
|
4
|
|
|
|
|
21
|
return $self->{syntax}->create_internal_leaf_string( $node, $idx ); |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
# |
315
|
|
|
|
|
|
|
# Insert leafs and branches in correct order (use $null to maintain indexes) |
316
|
|
|
|
|
|
|
# |
317
|
|
|
|
|
|
|
sub de_leaf |
318
|
|
|
|
|
|
|
{ |
319
|
38
|
|
|
38
|
0
|
99
|
my($self,$type,$name,$args) = @_; |
320
|
38
|
|
|
|
|
54
|
my $nodes = $self->{nodes}; |
321
|
38
|
|
|
|
|
42
|
my $idx = scalar @$nodes; |
322
|
|
|
|
|
|
|
|
323
|
38
|
100
|
|
|
|
102
|
if($type =~ /^\d+$/o){ |
324
|
4
|
|
|
|
|
15
|
push @$nodes, splice(@$nodes, $type, 1, $null); |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
else{ |
327
|
34
|
|
|
|
|
48
|
my $types = $self->{syntax}{leaves}; |
328
|
34
|
50
|
|
|
|
82
|
Carp::croak("Invalid leaf-type '$type'") |
329
|
|
|
|
|
|
|
unless $types->{$type}; |
330
|
34
|
|
|
|
|
168
|
push @$nodes, $types->{$type}->new(syntax => $self->{syntax}, type => $type, class => $types->{$type}, name => $name, attributes => $args); |
331
|
|
|
|
|
|
|
} |
332
|
38
|
|
|
|
|
101
|
return $self->{syntax}->create_escape_string($idx); |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# |
336
|
|
|
|
|
|
|
# NB: Prepends '$' to user attributes to seperate from private |
337
|
|
|
|
|
|
|
# |
338
|
|
|
|
|
|
|
sub parse_attr |
339
|
|
|
|
|
|
|
{ |
340
|
38
|
|
|
38
|
0
|
58
|
my $self = shift; |
341
|
38
|
|
|
|
|
45
|
my $args = shift; |
342
|
38
|
100
|
66
|
|
|
178
|
if(defined $args && length $args){ |
343
|
5
|
|
|
|
|
29
|
while($args =~ /(\w+)\s*=\s*(\W)(.*?)\2/sg){ |
344
|
2
|
|
|
|
|
11
|
$self->{"\$$1"} = $3; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
} |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
=pod |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=head2 C<< my $data = $obj->scrape(STRING) >> |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Extract data from STRING based on compiled template. |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=cut |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
# NB: $parent and $scope arguments are used internally to allow |
358
|
|
|
|
|
|
|
# nodes to modify their parent, such as Text::Scraper::Conditional |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub scrape |
361
|
|
|
|
|
|
|
{ |
362
|
1221
|
|
|
1221
|
1
|
1683
|
my ($self, $text, $parent, $scope) = @_; |
363
|
1221
|
|
|
|
|
1436
|
my $tmpl = $self->{compiled}; |
364
|
1221
|
|
|
|
|
1309
|
my $nodes = $self->{nodes}; |
365
|
|
|
|
|
|
|
|
366
|
1221
|
100
|
|
|
|
4261
|
return $self->on_data($text) |
367
|
|
|
|
|
|
|
if($self->isa('Text::Scraper::Leaf')); |
368
|
|
|
|
|
|
|
|
369
|
221
|
50
|
|
|
|
391
|
Carp::croak("$self->{name}: Cannot scrape without a compiled template!") |
370
|
|
|
|
|
|
|
unless $tmpl; |
371
|
|
|
|
|
|
|
|
372
|
221
|
100
|
|
|
|
7186
|
$text =~ s/\s+/ /sg |
373
|
|
|
|
|
|
|
unless $parent; |
374
|
|
|
|
|
|
|
|
375
|
221
|
|
|
|
|
17901
|
my @matches = ($text =~ /$tmpl/gs); |
376
|
221
|
|
|
|
|
392
|
my $symbols = undef; |
377
|
221
|
|
|
|
|
298
|
my $returns = []; |
378
|
|
|
|
|
|
|
|
379
|
221
|
|
|
|
|
254
|
TRACE && print STDERR "$self matches: ",scalar @matches,"\n"; |
380
|
|
|
|
|
|
|
|
381
|
221
|
|
|
|
|
511
|
for(my $i=0; $i<@matches; $i++) |
382
|
|
|
|
|
|
|
{ |
383
|
1267
|
|
|
|
|
1527
|
my $mod = $i % scalar @$nodes; |
384
|
1267
|
|
|
|
|
1325
|
my $node = $nodes->[$mod]; |
385
|
1267
|
|
|
|
|
1540
|
my $name = $node->{name}; |
386
|
|
|
|
|
|
|
|
387
|
1267
|
100
|
|
|
|
2185
|
if($mod==0) |
388
|
|
|
|
|
|
|
{ |
389
|
434
|
100
|
|
|
|
767
|
push @$returns, $symbols if $symbols; |
390
|
434
|
|
|
|
|
649
|
$symbols = {}; |
391
|
|
|
|
|
|
|
} |
392
|
1267
|
100
|
|
|
|
2110
|
next if $node->ignore(); |
393
|
1217
|
|
|
|
|
2159
|
$symbols->{$name} = $node->scrape($matches[$i], $self, $symbols); |
394
|
|
|
|
|
|
|
} |
395
|
221
|
100
|
|
|
|
541
|
push @$returns, $symbols if $symbols; |
396
|
221
|
|
|
|
|
391
|
return $self->on_data($returns); |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=pod |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=head1 Subclass API |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
Text::Scraper allows its users to define custom tags and bless captured |
404
|
|
|
|
|
|
|
data into custom classes. Because Text::Scraper objects are prototype |
405
|
|
|
|
|
|
|
based, a subclass can both inherit the scraping logic and also encapsulate |
406
|
|
|
|
|
|
|
any particular instance of the scraped data. |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
During template compilation, a single instance of each tag type is created |
409
|
|
|
|
|
|
|
as the I. Its attributes will be related to the tag, any |
410
|
|
|
|
|
|
|
supplied tag attributes, etc. During scraping, each prototype is invoked |
411
|
|
|
|
|
|
|
to scrape the relevent I against its I. |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=head2 C<< $subclass->on_create() >> |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
General construction callback. Text::Scraper objects are prototype based so |
416
|
|
|
|
|
|
|
overriding the constructor is not recommended. Objects are hash based; any |
417
|
|
|
|
|
|
|
constructor arguments become attributes of the new instance before invoking |
418
|
|
|
|
|
|
|
this method. |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 C<< $subclass->on_destroy() >> |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
General destruction callback. Text::Scraper uses the DESTROY hook so any |
423
|
|
|
|
|
|
|
custom functionality is best implemented here. |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head2 C<< $subclass->on_data(SCALAR) >> |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
This is the subclasses opportunity to bless or otherwise process any parsed |
428
|
|
|
|
|
|
|
data. The return value from C is added to the generated output |
429
|
|
|
|
|
|
|
data-structure. By default these values are just returned unblessed. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
The SCALAR argument depends on the class of tag. For C |
432
|
|
|
|
|
|
|
subclasses, SCALAR will be the matched text. For C |
433
|
|
|
|
|
|
|
subclasses, SCALAR will be a reference to an array of hashes. Below is an |
434
|
|
|
|
|
|
|
example of two custom tag classes that bless captured data into the same |
435
|
|
|
|
|
|
|
class: |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
package Myleaf; |
438
|
|
|
|
|
|
|
use base "Text::Scraper::Leaf"; |
439
|
|
|
|
|
|
|
sub on_data |
440
|
|
|
|
|
|
|
{ |
441
|
|
|
|
|
|
|
my ($self, $match) = @_; |
442
|
|
|
|
|
|
|
return $self->new(value => $match); |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
package MyBranch; |
446
|
|
|
|
|
|
|
use base "Text::Scraper::Branch"; |
447
|
|
|
|
|
|
|
sub on_data |
448
|
|
|
|
|
|
|
{ |
449
|
|
|
|
|
|
|
my ($self, $matches) = @_; |
450
|
|
|
|
|
|
|
@$matches = map { $self->new(%$_) } @$matches; |
451
|
|
|
|
|
|
|
return $matches; |
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=head2 C<< my $regex = $subclass->to_regex() >> |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
Returns this nodes representation as a regular expression, to be used |
457
|
|
|
|
|
|
|
in a compiled template. If you find yourself using a particular regex |
458
|
|
|
|
|
|
|
attribute a lot, it might be easier to define a custom tag that overloads |
459
|
|
|
|
|
|
|
this method. |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=head2 C<< my $boolean = $subclass->ignore() >> |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
Returns a boolean value stating whether the parser should ignore the data |
464
|
|
|
|
|
|
|
captured by this object. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=head2 C<< $subclass->proto() $subclass->proto(SCALAR) >> |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
Utility method to allow Tag instances to access (attributes of) their prototype. |
469
|
|
|
|
|
|
|
This can be safely called from a prototype object, which just points to itself. |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=head2 C<< my @children = $subclass->nodes() >> |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
Returns instance data I, including any present conditional data. |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=cut |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
sub on_data |
478
|
|
|
|
|
|
|
{ |
479
|
1218
|
|
|
1218
|
1
|
6066
|
return $_[1]; |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
sub on_create |
483
|
|
|
|
|
|
|
{ |
484
|
60
|
|
|
60
|
1
|
73
|
my $self = shift; |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
sub on_destroy |
488
|
|
|
|
|
|
|
{ |
489
|
60
|
|
|
60
|
1
|
79
|
my $self = shift; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
sub to_regex |
493
|
|
|
|
|
|
|
{ |
494
|
38
|
|
|
38
|
1
|
43
|
my $self = shift; |
495
|
38
|
|
100
|
|
|
229
|
return $self->{"\$regex"} || '(.*?)'; |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub ignore |
499
|
|
|
|
|
|
|
{ |
500
|
1217
|
|
|
1217
|
1
|
2263
|
return 0; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
sub nodes |
504
|
|
|
|
|
|
|
{ |
505
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
506
|
0
|
|
0
|
|
|
0
|
my $for = shift || $self; |
507
|
0
|
|
|
|
|
0
|
my $proto = $self->proto(); |
508
|
|
|
|
|
|
|
|
509
|
0
|
0
|
|
|
|
0
|
return @{$self->{nodes}} |
|
0
|
|
|
|
|
0
|
|
510
|
|
|
|
|
|
|
if($proto == $self); |
511
|
|
|
|
|
|
|
|
512
|
0
|
|
|
|
|
0
|
my @vals = (); |
513
|
0
|
|
|
|
|
0
|
foreach my $n ( @{$proto->{nodes}} ) |
|
0
|
|
|
|
|
0
|
|
514
|
|
|
|
|
|
|
{ |
515
|
0
|
|
|
|
|
0
|
my $val = $for->{$n->{name}}; |
516
|
0
|
0
|
|
|
|
0
|
next unless $val; |
517
|
0
|
|
|
|
|
0
|
push @vals, $val; |
518
|
0
|
0
|
|
|
|
0
|
push @vals, $val->nodes($for) |
519
|
|
|
|
|
|
|
if $val->isa('Text::Scraper::Conditional'); |
520
|
|
|
|
|
|
|
} |
521
|
0
|
|
|
|
|
0
|
return @vals; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
sub proto |
525
|
|
|
|
|
|
|
{ |
526
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
527
|
0
|
|
|
|
|
0
|
my $attr = shift; |
528
|
0
|
|
0
|
|
|
0
|
my $proto = $protos{$self} || $self; |
529
|
0
|
0
|
|
|
|
0
|
return ($attr == undef) ? $proto : |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
530
|
|
|
|
|
|
|
(defined $proto->{"\$$attr"}) ? $proto->{"\$$attr"} : |
531
|
|
|
|
|
|
|
(defined $proto->{$attr}) ? $proto->{$attr} : undef; |
532
|
|
|
|
|
|
|
} |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
# |
535
|
|
|
|
|
|
|
# Inherits all behaviour from Text::Scraper |
536
|
|
|
|
|
|
|
# |
537
|
|
|
|
|
|
|
package Text::Scraper::Branch; |
538
|
|
|
|
|
|
|
our @ISA = ('Text::Scraper'); |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# |
541
|
|
|
|
|
|
|
# |
542
|
|
|
|
|
|
|
# |
543
|
|
|
|
|
|
|
package Text::Scraper::Leaf; |
544
|
|
|
|
|
|
|
our @ISA = ('Text::Scraper'); |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
# |
547
|
|
|
|
|
|
|
# |
548
|
|
|
|
|
|
|
# |
549
|
|
|
|
|
|
|
package Text::Scraper::Conditional; |
550
|
|
|
|
|
|
|
our @ISA = ('Text::Scraper'); |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
sub scrape |
553
|
|
|
|
|
|
|
{ |
554
|
204
|
|
|
204
|
|
286
|
my ($self, $text, $parent, $scope) = @_; |
555
|
204
|
|
|
|
|
392
|
my $data = $self->SUPER::scrape($text, $parent, $scope); |
556
|
204
|
|
|
|
|
213
|
my $tag; |
557
|
|
|
|
|
|
|
|
558
|
204
|
|
|
|
|
262
|
$data = shift @$data; |
559
|
204
|
100
|
|
|
|
378
|
unless($data){ |
560
|
25
|
|
|
|
|
30
|
$tag = 0; |
561
|
|
|
|
|
|
|
} |
562
|
|
|
|
|
|
|
else{ |
563
|
179
|
|
|
|
|
735
|
$scope->{$_} = $data->{$_} foreach keys %$data; |
564
|
179
|
|
|
|
|
264
|
$tag = 1; |
565
|
|
|
|
|
|
|
} |
566
|
204
|
|
|
|
|
811
|
return $tag; |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
sub to_regex |
570
|
|
|
|
|
|
|
{ |
571
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
572
|
1
|
|
|
|
|
5
|
return $self->SUPER::to_regex()."?"; |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
package Text::Scraper::Ignorable; |
576
|
|
|
|
|
|
|
our @ISA = ('Text::Scraper::Leaf'); |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# TODO: Currently ignorables still capture their text... |
579
|
|
|
|
|
|
|
# which makes for a more elegent algorithm over efficiency. |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sub ignore |
582
|
|
|
|
|
|
|
{ |
583
|
50
|
|
|
50
|
|
169
|
1; |
584
|
|
|
|
|
|
|
} |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
=pod |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=head2 Defining a custom syntax |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
The two areas of customization are Tag Syntax and Tag Classes. The defaults are |
591
|
|
|
|
|
|
|
encapsulated in the I class. |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
The interested reader is encouraged to copy the source of the default syntax class |
594
|
|
|
|
|
|
|
and play around with changes. All the over-ridable methods begin with B |
595
|
|
|
|
|
|
|
and are fairly self explanatory or well commented. |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
Any new Tag classes should be subclassed from either I, |
598
|
|
|
|
|
|
|
I, I or I. |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=cut |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
package Text::Scraper::Syntax; |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
# |
605
|
|
|
|
|
|
|
# Map tag types to classes |
606
|
|
|
|
|
|
|
# |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
sub define_class_leaves |
609
|
|
|
|
|
|
|
{ |
610
|
4
|
|
|
4
|
|
22
|
return (var => 'Text::Scraper::Leaf', stuff => 'Text::Scraper::Ignorable'); |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
sub define_class_branches |
614
|
|
|
|
|
|
|
{ |
615
|
4
|
|
|
4
|
|
44
|
return (loop => 'Text::Scraper::Branch', if => 'Text::Scraper::Conditional'); |
616
|
|
|
|
|
|
|
} |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
# |
619
|
|
|
|
|
|
|
# Tag Syntax: |
620
|
|
|
|
|
|
|
# TYPE, NAME, ATTRIBUTES, BACKREF, and ESCAPE are special |
621
|
|
|
|
|
|
|
# markers that are substituted with either regular |
622
|
|
|
|
|
|
|
# expressions or values. |
623
|
|
|
|
|
|
|
# |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
sub define_syntax_leaf |
626
|
|
|
|
|
|
|
{ |
627
|
8
|
|
|
8
|
|
48
|
''; |
628
|
|
|
|
|
|
|
} |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
sub define_syntax_branch_open |
631
|
|
|
|
|
|
|
{ |
632
|
4
|
|
|
4
|
|
20
|
''; |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
sub define_syntax_branch_close |
636
|
|
|
|
|
|
|
{ |
637
|
4
|
|
|
4
|
|
20
|
''; |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
# |
641
|
|
|
|
|
|
|
# Escape sequences must never appear in input text |
642
|
|
|
|
|
|
|
# |
643
|
|
|
|
|
|
|
sub define_syntax_escape |
644
|
|
|
|
|
|
|
{ |
645
|
42
|
|
|
42
|
|
240
|
"$;ESCAPE$;"; |
646
|
|
|
|
|
|
|
} |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# |
649
|
|
|
|
|
|
|
# BACKREF must be able to match 2 unique identifiers |
650
|
|
|
|
|
|
|
# in nested branch nodes, hence \2\5. If you change |
651
|
|
|
|
|
|
|
# the order of TYPE and NAME, this will need updated. |
652
|
|
|
|
|
|
|
# |
653
|
|
|
|
|
|
|
sub define_backref |
654
|
|
|
|
|
|
|
{ |
655
|
4
|
|
|
4
|
|
8
|
'(?:\2|\5)'; |
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
# |
659
|
|
|
|
|
|
|
# The methods below should NOT be overridden in custom Syntax subclasses |
660
|
|
|
|
|
|
|
# |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
sub new |
663
|
|
|
|
|
|
|
{ |
664
|
4
|
|
|
4
|
|
24
|
my $class = shift; |
665
|
4
|
|
|
|
|
14
|
my $self = bless {}, $class; |
666
|
|
|
|
|
|
|
|
667
|
4
|
|
|
|
|
15
|
my $bref = $self->define_backref(); |
668
|
4
|
|
|
|
|
31
|
my %tokens = (NAME => '(\w+)',TYPE => '((?:\w+|\d+))', ATTRIBUTES => '(.*?)?', ESCAPE => '(\d+?)', BACKREF => $bref ); |
669
|
4
|
|
|
|
|
18
|
my $tokes = join('|', keys %tokens); |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# Load valid types: |
672
|
4
|
|
|
|
|
18
|
$self->{branches} = { $self->define_class_branches() }; |
673
|
4
|
|
|
|
|
20
|
$self->{leaves} = { $self->define_class_leaves() }; |
674
|
|
|
|
|
|
|
|
675
|
4
|
|
|
|
|
21
|
my $syn = |
676
|
|
|
|
|
|
|
{ |
677
|
|
|
|
|
|
|
leaf => $self->define_syntax_leaf(), |
678
|
|
|
|
|
|
|
open => $self->define_syntax_branch_open(), |
679
|
|
|
|
|
|
|
close => $self->define_syntax_branch_close(), |
680
|
|
|
|
|
|
|
escape => $self->define_syntax_escape() |
681
|
|
|
|
|
|
|
}; |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
# Create regexen from syntax: |
684
|
|
|
|
|
|
|
# 'escape' is a special case as it is invoked as a regex AFTER |
685
|
|
|
|
|
|
|
# whole tmpl has been escaped - requiring double "escapation" |
686
|
|
|
|
|
|
|
|
687
|
4
|
|
|
|
|
10
|
my $rex = {}; |
688
|
4
|
|
|
|
|
32
|
$rex->{$_} = $self->quote($syn->{$_}) foreach keys %$syn; |
689
|
4
|
|
|
|
|
17
|
$rex->{escape} = $self->quote(quotemeta($syn->{escape})); |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# Insert token regexes into compiled regex |
692
|
|
|
|
|
|
|
$_ =~ s/($tokes)/$tokens{$1}/sg |
693
|
4
|
|
|
|
|
421
|
foreach values %$rex; |
694
|
|
|
|
|
|
|
|
695
|
4
|
|
|
|
|
13
|
$self->{syntax} = $syn; |
696
|
4
|
|
|
|
|
11
|
$self->{regex} = $rex; |
697
|
4
|
|
|
|
|
26
|
return $self; |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# |
701
|
|
|
|
|
|
|
# Compact and escape a template |
702
|
|
|
|
|
|
|
# TODO: needs knowledge of preserver_whitespace options |
703
|
|
|
|
|
|
|
# |
704
|
|
|
|
|
|
|
sub quote |
705
|
|
|
|
|
|
|
{ |
706
|
28
|
|
|
28
|
|
37
|
my $self = shift; |
707
|
28
|
|
|
|
|
35
|
my $tmpl = shift; |
708
|
|
|
|
|
|
|
|
709
|
28
|
|
|
|
|
218
|
$tmpl =~ s/\s+/ /sgo; |
710
|
28
|
|
|
|
|
445
|
$tmpl = qr/\Q$tmpl\E/; |
711
|
28
|
|
|
|
|
473
|
$tmpl =~ s/\\\s/\\s*/sgo; |
712
|
28
|
|
|
|
|
105
|
return $tmpl; |
713
|
|
|
|
|
|
|
} |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
# |
716
|
|
|
|
|
|
|
# Create the syntax specific escaped index (CANNOT clash with template data) |
717
|
|
|
|
|
|
|
# |
718
|
|
|
|
|
|
|
sub create_escape_string |
719
|
|
|
|
|
|
|
{ |
720
|
38
|
|
|
38
|
|
40
|
my $self = shift; |
721
|
38
|
|
|
|
|
38
|
my $num = shift; |
722
|
38
|
|
|
|
|
72
|
my $str = $self->define_syntax_escape(); |
723
|
38
|
|
|
|
|
97
|
$str =~ s/ESCAPE/$num/; |
724
|
38
|
|
|
|
|
251
|
return $str; |
725
|
|
|
|
|
|
|
} |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# |
728
|
|
|
|
|
|
|
# Create syntax for an internal leaf referencing an already parsed branch |
729
|
|
|
|
|
|
|
# |
730
|
|
|
|
|
|
|
sub create_internal_leaf_string |
731
|
|
|
|
|
|
|
{ |
732
|
4
|
|
|
4
|
|
8
|
my $self = shift; |
733
|
4
|
|
|
|
|
5
|
my $node = shift; |
734
|
4
|
|
|
|
|
7
|
my $idx = shift; |
735
|
4
|
|
|
|
|
13
|
my $str = $self->define_syntax_leaf(); |
736
|
4
|
|
|
|
|
15
|
$str =~ s#TYPE#$idx#; |
737
|
4
|
|
|
|
|
19
|
$str =~ s#NAME#$node->{name}#; |
738
|
4
|
|
|
|
|
13
|
$str =~ s#ATTRIBUTES##; |
739
|
4
|
|
|
|
|
1450
|
return $str; |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=pod |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=head1 BUGS & CAVEATS |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
Rather than write a slow parser in pure Perl, Text::Scraper |
747
|
|
|
|
|
|
|
farms a lot of the work out to Perl's optimized regular-expression engine. |
748
|
|
|
|
|
|
|
This works well in general but, unfortunately, doesn't allow for a lot of |
749
|
|
|
|
|
|
|
error feedback during scraping. A fair understanding of the pros and cons |
750
|
|
|
|
|
|
|
of using regular expressions in this manner can be beneficial, but is outside |
751
|
|
|
|
|
|
|
the scope of this documentation. |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
L can be indespensible in following the success of |
754
|
|
|
|
|
|
|
your scraping. It can be safely applied to a Text::Scraper instance to analyze |
755
|
|
|
|
|
|
|
the parser's object model, or to the return value from a C invokation |
756
|
|
|
|
|
|
|
to analyze what was parsed. |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
Bug reports and suggestions welcome. |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=head1 AUTHOR |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
Copyright (C) 2005 Chris McEwan - All rights reserved. |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
Chris McEwan |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=head1 LICENSE |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
769
|
|
|
|
|
|
|
under the same terms as Perl itself. |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
=cut |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
1; |