line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Yahoo::Search::XML; |
2
|
2
|
|
|
2
|
|
14
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
92
|
|
3
|
2
|
|
|
2
|
|
2136
|
use Encode; |
|
2
|
|
|
|
|
14953
|
|
|
2
|
|
|
|
|
3282
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
our $VERSION = "20100614.1"; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
my %enc_cache; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
## |
10
|
|
|
|
|
|
|
## Version history: |
11
|
|
|
|
|
|
|
## |
12
|
|
|
|
|
|
|
## 20060729.004 |
13
|
|
|
|
|
|
|
## * handle tags being added by Yahoo! |
14
|
|
|
|
|
|
|
## * slightly better error messages |
15
|
|
|
|
|
|
|
## |
16
|
|
|
|
|
|
|
## 20060428.003 -- |
17
|
|
|
|
|
|
|
## * ignore type tags |
18
|
|
|
|
|
|
|
## * allow '-' in a tag name |
19
|
|
|
|
|
|
|
## * properly handle self-closing tags with no attributes, e.g. "<foo/>" |
20
|
|
|
|
|
|
|
## * added atomic-parens in one area to increase efficiency |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 NAME |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 VERSION |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
version 1.11.3 |
28
|
|
|
|
|
|
|
Yahoo::Search::XML -- Simple routines for parsing XML from Yahoo! Search. |
29
|
|
|
|
|
|
|
(This package is included in, and automatically loaded by, the |
30
|
|
|
|
|
|
|
Yahoo::Search package.) |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
The XML sent back from Yahoo! is fairly simple, and is guaranteed to be |
35
|
|
|
|
|
|
|
well formed, so we really don't need much more than to make the data easily |
36
|
|
|
|
|
|
|
available. I'd like to use XML::Simple, but it uses XML::Parser, which |
37
|
|
|
|
|
|
|
suffers from crippling memory leaks (in one test, 36k was lost with each |
38
|
|
|
|
|
|
|
parsing of a 7k xml file), so I've rolled my own simple version that might |
39
|
|
|
|
|
|
|
be called, uh, XML::SuperDuperSimple. |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
The end result is identical to what XML::Simple would produce, at least for |
42
|
|
|
|
|
|
|
the XML that Yahoo! sends back. It may well be useful for other things that |
43
|
|
|
|
|
|
|
use a similarly small subset of XML notation. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This package is also much faster than XML::Simple / XML::Parser, producing |
46
|
|
|
|
|
|
|
the same output 41 times faster, in my tests. That's the benefit of not |
47
|
|
|
|
|
|
|
having to handle everything, I guess. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 AUTHOR |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Jeffrey Friedl |
52
|
|
|
|
|
|
|
Kyoto, Japan |
53
|
|
|
|
|
|
|
Feb 2005 |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=cut |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
##
## File-scoped parser state.
##
##   $error -- declared here; nothing in the code below reads or writes it.
##   @stack -- the nodes currently open: Parse() resets it, Start() pushes
##             onto it, Char() appends text to its last entry and End()
##             pops from it.
##
my ($error, @stack);
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
## |
61
|
|
|
|
|
|
|
## Process a start tag. |
62
|
|
|
|
|
|
|
## |
63
|
|
|
|
|
|
|
sub Start
{
    ##
    ## Open a new element: build a node recording the tag name and an empty
    ## text buffer, and push it onto @stack.
    ##
    ##   $tag  -- name of the element being opened
    ##   %attr -- attribute name/value pairs (may be empty)
    ##
    ## When attributes are present they become the node's initial {Data}
    ## hash; otherwise the node starts out with no {Data} key at all.
    ##
    my $tag  = shift;
    my %attr = @_;

    my %node = (
        Char => "",
        Tag  => $tag,
    );
    $node{Data} = \%attr if %attr;

    push @stack, \%node;
}
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
## |
80
|
|
|
|
|
|
|
## Process raw text |
81
|
|
|
|
|
|
|
## |
82
|
|
|
|
|
|
|
sub Char
{
    ##
    ## Append a run of raw text to the {Char} buffer of the node on top of
    ## @stack (the innermost element currently open).
    ##
    my $text = $_[0];
    $stack[$#stack]{Char} .= $text;
}
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub _error($$)
{
    ##
    ## Abort with a module-specific message.
    ##
    ##   $line -- line number to report (callers pass __LINE__)
    ##   $msg  -- description of what went wrong
    ##
    ## The message ends in a newline, so Perl does not append its own
    ## "at FILE line N" suffix.
    ##
    my ($line, $msg) = @_;

    die "Error in Yahoo::Search::XML on line $line: $msg\n";
}
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
## |
98
|
|
|
|
|
|
|
## Process an end tag |
99
|
|
|
|
|
|
|
## |
100
|
|
|
|
|
|
|
sub End
{
    ##
    ## Close the element most recently opened by Start(): pop its node off
    ## @stack, work out its value, and file that value under the node's tag
    ## name in the {Data} hash of the node that is now on top of the stack.
    ##
    ##   $tag -- name taken from the closing tag; used only in error
    ##           messages (the key used for storage is the {Tag} that
    ##           Start() recorded in the node).
    ##
    ## Dies (via _error) if there is no enclosing node left to receive the
    ## value, or if the element holds both nested data and non-blank text.
    ##
    my ($tag) = @_;
    my $node = pop @stack;

    ##
    ## If nothing remains on the stack there is no parent to store into;
    ## say so clearly rather than failing on the $stack[-1] access below.
    ##
    if (not @stack) {
        _error(__LINE__, "unexpected end tag </$tag>");
    }

    my $val;

    ##
    ## There is {Data} if the element had attributes, or if other elements
    ## were closed between this $tag's start and the end we're processing.
    ##
    ## There's {Char} if text was between.
    ##
    ## We never expect both, so we watch out for that here (whitespace-only
    ## text alongside {Data} is tolerated and discarded).
    ##
    if ($node->{Data})
    {
        if ($node->{Char} =~ m/^\s*$/) {
            $node->{Char} = "";
        } else {
            _error(__LINE__, "not expecting both text and structure as content of <$tag>");
        }
        $val = $node->{Data};
    }
    elsif ($node->{Char} ne "")
    {
        $val = $node->{Char};
    }
    else
    {
        $val = "";
    }

    ##
    ## Shove this data ($val) into the previous node, named for this tag.
    ## The first occurrence is stored as-is; a repeat turns the slot into
    ## an array ref holding every occurrence in order.
    ##
    ## The "seen before?" test must be exists(), not truth: a first value
    ## of "" or "0" is a real value, and testing its truth would let the
    ## next sibling of the same name silently overwrite it.
    ##
    my $siblings = ($stack[-1]->{Data} ||= {});
    my $name     = $node->{Tag};

    if (not exists $siblings->{$name}) {
        $siblings->{$name} = $val;
    } elsif (ref($siblings->{$name}) eq "ARRAY") {
        push @{ $siblings->{$name} }, $val;
    } else {
        $siblings->{$name} = [ $siblings->{$name}, $val ];
    }
}
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
##
## Replacement text for the named entities that _entity() understands.
##
my %EntityDecode = (
    quot => '"',
    apos => "'",
    gt   => '>',
    lt   => '<',
    amp  => '&',
);

sub _entity($)
{
    ##
    ## Translate the name found between '&' and ';' into the character it
    ## stands for.  Handles the names listed in %EntityDecode, decimal
    ## references ("#65") and hexadecimal references ("#x41", matched
    ## case-insensitively).  Anything else is reported through _error().
    ##
    my ($name) = @_;

    return $EntityDecode{$name} if $EntityDecode{$name};
    return chr($1)              if $name =~ m/^#(\d+)$/;
    return chr(hex($1))         if $name =~ m/^#x([0-9a-f]+)$/i;

    _error(__LINE__, "unknown entity &$name;");
}
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub de_grok($)
{
    ##
    ## Return a copy of the given text in which every "&...;" sequence has
    ## been replaced by whatever _entity() returns for the part between
    ## the '&' and the ';'.  The caller's string is not modified.
    ##
    (my $decoded = shift) =~ s/&([^;]+);/_entity($1)/gxe;

    return $decoded;
}
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
sub Parse($)
{
    ##
    ## Parse a string of XML and return its contents as a tree.
    ##
    ##   $xml -- the document text.  If it begins with an <?xml ...> header
    ##           it is decoded with the encoding named there (UTF-8 when
    ##           none is named); without a header it is used as given.
    ##
    ## Returns the value stored for the single top-level element: a hash
    ## ref when that element had attributes or child elements, otherwise
    ## its text.  Within a hash, a name that occurs once maps to its
    ## value and a repeated name maps to an array ref of values.
    ##
    ## Dies (via _error) on an unknown encoding, on text it cannot parse,
    ## or when the document does not reduce to exactly one top-level
    ## element.
    ##
    my $xml = shift;

    ## The stack starts with one anonymous node that collects the root.
    @stack = ({});

    ## skip past the leading <?xml ...> tag
    if ($xml =~ m/\A <\?xml(.*?)> /xgcs) {
        my $xml_header = $1;
        # XXX doesn't handle BOM, just assumes UTF-8 if not explicit
        # (some yahoo services don't include an explicit encoding)
        my $encoding = ($xml_header =~ m/encoding \s* = \s* (["']) (.*?) \1/x)
                     ? $2
                     : "UTF-8";

        ## Look each encoding up only once; find_encoding() returns undef
        ## for a name it does not know, which we report ourselves.
        my $enc = ($enc_cache{$encoding} ||= find_encoding($encoding));
        if (not $enc) {
            _error(__LINE__, "unknown encoding \"$encoding\"");
        }

        # decode the bytes into a perl utf8 string
        # taking care to preserve the pos-ition.
        my $pos = pos($xml);
        $xml = $enc->decode($xml);
        pos($xml) = $pos;
    }

    ## pos() is undef until a match has succeeded, i.e. when there was no
    ## header; treat that as offset 0.
    while ((pos($xml) || 0) < length($xml))
    {
        ##
        ## Nab <open>, </close>, and <unary/> tags...
        ##
        if ($xml =~ m{\G
                      <(/?)               # $1 - true if an ending tag
                      ( (?> [-:\w]+ ) )   # $2 - tag name
                      ([^>]*)             # $3 - attributes (and possible final '/')
                      >}xgc)
        {
            my ($IsEnd, $TagName, $Attribs) = ($1, $2, $3);

            ## A trailing '/' marks a self-closing tag; strip it so that
            ## it is not mistaken for part of the attributes.  (Plain
            ## assignment: "my $x = ... if COND" has undefined behaviour.)
            my $IsImmediateEnd = ($Attribs and $Attribs =~ s{/$}{}) ? 1 : 0;

            if ($TagName eq 'wbr')
            {
                ## skip it
            }
            elsif ($IsEnd)
            {
                End($TagName);
            }
            else
            {
                my %A;
                if ($Attribs)
                {
                    while ($Attribs =~ m/([-:\w]+)=(?: "([^\"]*)" | '([^\']*)' )/xg) {
                        ## Copy the captures before calling de_grok(),
                        ## which runs regexes of its own.
                        my ($name, $dquoted, $squoted) = ($1, $2, $3);
                        $A{$name} = de_grok(defined($squoted) ? $squoted : $dquoted);
                    }
                }
                Start($TagName, %A);
                if ($IsImmediateEnd) {
                    End($TagName);
                }
            }
        }
        elsif ($xml =~ m/\G <!-- .*? --> /xgcs)
        {
            ## comment -- ignore
        }
        ##
        ## Nab raw text / entities
        ##
        ## CDATA is tried before the generic "<!...>" case below, which
        ## would otherwise swallow it.  Its content is taken literally,
        ## with no entity decoding.
        ##
        elsif ($xml =~ m/\G <!\[CDATA\[ (.*?) \]\]> /xgcs)
        {
            Char($1);
        }
        elsif ($xml =~ m/\G <[!?] [^>]+ > /xgcs)
        {
            ## <!DOCTYPE ...>, <?...?>, etc. -- ignore
        }
        elsif ($xml =~ m/\G ([^<>]+)/xgc)
        {
            Char(de_grok($1));
        }
        else
        {
            my ($str) = $xml =~ m/\G(.{1,40})/;
            $str .= "..." if length($str) == 40;
            _error(__LINE__, "bad XML parse at \"$str\"");
        }
    }

    #use Data::Dumper; print Data::Dumper::Dumper(\@stack), "\n";
    _error(__LINE__, '@stack != 1') if @stack != 1;
    _error(__LINE__, "not data")    if not $stack[0]->{Data};
    _error(__LINE__, "keys not 1")  if keys(%{ $stack[0]->{Data} }) != 1;

    my ($tree) = values(%{ $stack[0]->{Data} });

    ## Drop the module's own reference so the tree lives only as long as
    ## the caller keeps it.
    @stack = ();

    return $tree;
}
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
1; |