| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Yahoo::Search::XML;
use strict;
use warnings;
use Encode;

our $VERSION = "20100614.1";

## Cache of Encode encoding objects, keyed by the encoding name taken from
## the <?xml ...?> header, so find_encoding() runs once per encoding.
my %enc_cache;

##
## Version history:
##
## 20060729.004
##    * handle <wbr> tags being added by Yahoo!
##    * slightly better error messages
##
## 20060428.003 --
##    * ignore <?...?> type tags
##    * allow '-' in a tag name
##    * properly handle self-closing tags with no attributes, e.g. "<tag/>"
##    * added atomic-parens in one area to increase efficiency

=head1 NAME

Yahoo::Search::XML -- Simple routines for parsing XML from Yahoo! Search.

=head1 VERSION

version 1.11.3

(This package is included in, and automatically loaded by, the
Yahoo::Search package.)

=head1 DESCRIPTION

The XML sent back from Yahoo! is fairly simple, and is guaranteed to be
well formed, so we really don't need much more than to make the data easily
available. I'd like to use XML::Simple, but it uses XML::Parser, which
suffers from crippling memory leaks (in one test, 36k was lost with each
parsing of a 7k xml file), so I've rolled my own simple version that might
be called, uh, XML::SuperDuperSimple.

The end result is identical to what XML::Simple would produce, at least for
the XML that Yahoo! sends back. It may well be useful for other things that
use a similarly small subset of XML notation.

This package is also much faster than XML::Simple / XML::Parser, producing
the same output 41 times faster, in my tests. That's the benefit of not
having to handle everything, I guess.

=head1 AUTHOR

Jeffrey Friedl
Kyoto, Japan
Feb 2005

=cut

## Parse stack. $stack[0] is a synthetic root node; every open tag pushes
## one node of the form { Tag => name, Char => text, Data => {children} }.
my @stack;

##
## Process a start tag.
##
## Arguments: the tag name, followed by attribute name/value pairs.
## Pushes a new node onto @stack; attributes (if any) seed its {Data} hash,
## where child elements are later stored alongside them.
##
sub Start
{
    my ($tag, %attr) = @_;

    my $node = {
        Tag  => $tag,
        Char => "",
    };

    if (%attr) {
        $node->{Data} = \%attr;
    }

    push @stack, $node;
}

##
## Process raw text
##
## Appends the (already entity-decoded) text to the innermost open node.
##
sub Char
{
    my ($str) = @_;
    $stack[-1]->{Char} .= $str;
}

##
## Die with a uniformly formatted message.
##
## Arguments: a line number in this file (callers pass __LINE__) and the
## message text. Never returns.
##
sub _error($$)
{
    my $line = shift;
    my $msg  = shift;

    die "Error in Yahoo::Search::XML on line $line: $msg\n";
}


##
## Process an end tag
##
## Pops the innermost node, works out its value (hash of children, text, or
## the empty string) and files that value in the parent's {Data} under the
## node's tag name. Repeated sibling tags are collected into an array ref.
##
sub End
{
    my ($tag) = @_;

    ## @stack holds the synthetic root plus one node per open tag, so with
    ## fewer than two entries there is nothing this end tag could close.
    _error(__LINE__, "unexpected closing tag </$tag>") if @stack < 2;

    my $node = pop @stack;

    my $val;

    ##
    ## There is {Data} if there were xml tags between this $tag's start and
    ## the end we're processing now.
    ##
    ## There's {Char} if text was between.
    ##
    ## We never expect both, so we watch out for that here...
    ##
    if ($node->{Data})
    {
        if ($node->{Char} =~ m/^\s*$/) {
            ## only inter-tag whitespace -- not real content
            $node->{Char} = "";
        } else {
            _error(__LINE__, "not expecting both text and structure as content of <$tag>");
        }
        $val = $node->{Data};
    }
    elsif ($node->{Char} ne "")
    {
        $val = $node->{Char};
    }
    else
    {
        $val = "";
    }

    ##
    ## Shove this data ($val) into the previous node, named for this $tag
    ##
    ## The slot is tested with exists() rather than for truth: a first
    ## sibling whose value is "0" or "" must still be kept when a second
    ## sibling of the same name arrives.
    ##
    my $siblings = ($stack[-1]->{Data} ||= {});
    my $name     = $node->{Tag};

    if (not exists $siblings->{$name}) {
        $siblings->{$name} = $val;
    } elsif (ref($siblings->{$name}) eq "ARRAY") {
        push @{ $siblings->{$name} }, $val;
    } else {
        $siblings->{$name} = [ $siblings->{$name}, $val ];
    }
}

## The five entities predefined by XML.
my %EntityDecode =
(
  amp  => '&',
  lt   => '<',
  gt   => '>',
  apos => "'",
  quot => '"', #"
);

##
## Decode one entity reference.
##
## Argument: the text between '&' and ';' -- a predefined entity name, a
## decimal reference ("#65") or a hexadecimal one ("#x41").
## Returns the character it stands for; dies via _error() if unknown.
##
sub _entity($)
{
    my $name = shift;

    if (exists $EntityDecode{$name}) {
        return $EntityDecode{$name};
    } elsif ($name =~ m/^#(\d+)$/) {
        return chr($1);
    } elsif ($name =~ m/^#x([0-9a-f]+)$/i) {
        return chr(hex($1));
    } else {
        _error(__LINE__, "unknown entity &$name;");
    }
}

##
## Return a copy of the given text with every &...; reference decoded.
## Dies (via _entity) on an entity that isn't recognized.
##
sub de_grok($)
{
    my $text = shift;
    $text =~ s/&([^;]+);/_entity($1)/gxe;
    return $text;
}

##
## Parse a complete XML document.
##
## Argument: the document, as the raw bytes received. If it starts with an
## <?xml ...?> header, the bytes are decoded to characters using the
## encoding that header declares (UTF-8 when none is declared).
##
## Returns the value of the single top-level element: a hash ref of its
## attributes and children, or a string if it held only text.
## Dies via _error() on anything that can't be parsed.
##
sub Parse($)
{
    my $xml = shift;

    @stack = ({});   # start over with just the synthetic root

    ## skip past the leading <?xml ...?> tag
    if ($xml =~ m/\A <\?xml(.*?)> /xgcs) {
        my $xml_header = $1;
        # XXX doesn't handle BOM, just assumes UTF-8 if not explicit
        # (some yahoo services don't include an explicit encoding)
        my $encoding = ($xml_header =~ m/encoding \s* = \s* (["']) (.*?) \1/x)
                     ? $2 : "UTF-8";

        ## consult the cache before asking Encode again
        my $enc = ($enc_cache{$encoding} ||= find_encoding($encoding))
            or _error(__LINE__, "unknown encoding \"$encoding\"");

        # decode the bytes into a perl utf8 string
        # taking care to preserve the pos-ition.
        # (assigning to $xml resets pos; the header itself is plain
        # ASCII, so the byte offset is also the character offset)
        my $pos = pos($xml);
        $xml = $enc->decode($xml);
        pos($xml) = $pos;
    }

    ## pos() is undef until some /g match has succeeded (e.g. when there
    ## was no <?xml ...?> header), and \G then means "start of string".
    while ((pos($xml) || 0) < length($xml))
    {
        #my $x = substr($xml, pos($xml), 30);
        #$x .= "..." if length($x) == 30;
        #$x =~ s/\n/\\n/g;
        #my $STACK = join ">", map { $_->{Tag} } @stack;
        #print "[$STACK] now at [$x]\n";

        ##
        ## Nab <open>, </close>, and <unary/> tags...
        ##
        if ($xml =~ m{\G
                      <(/?)                # $1 - true if an ending tag
                      ( (?> [-:\w]+ ) )    # $2 - tag name
                      ([^>]*)              # $3 - attributes (and possible final '/')
                      >}xgc)
        {
            my ($IsEnd, $TagName, $Attribs) = ($1, $2, $3);

            ## A trailing '/' marks a self-closing tag; strip it so that it
            ## isn't mistaken for part of the attributes.
            my $IsImmediateEnd = ($Attribs =~ s{/$}{}) ? 1 : 0;

            if ($TagName eq 'wbr')
            {
                ## skip it
            }
            elsif ($IsEnd) {
                End($TagName);
            } else {
                my %A;
                ## attribute names may use the same characters as tag names
                while ($Attribs =~ m/([-:\w]+) \s* = \s* (?: "([^"]*)" | '([^']*)' )/xg) {
                    $A{$1} = de_grok(defined($3) ? $3 : $2);
                }
                Start($TagName, %A);
                if ($IsImmediateEnd) {
                    End($TagName);
                }
            }
        }
        elsif ($xml =~ m/\G<!--.*?-->/xgcs)
        {
            ## comment -- ignore
        }
        elsif ($xml =~ m/\G<\?[^>]+>/xgcs)
        {
            ## <?...?> processing instructions, etc. -- ignore
        }
        ##
        ## Nab raw text / entities
        ##
        elsif ($xml =~ m/\G <!\[CDATA\[ (.*?) \]\]> /xgcs)
        {
            ## CDATA is taken verbatim -- no entity decoding
            Char($1);
        }
        elsif ($xml =~ m/\G ([^<>]+)/xgc)
        {
            Char(de_grok($1));
        }
        else
        {
            my ($str) = $xml =~ m/\G(.{1,40})/;
            $str .= "..." if length($str) == 40;
            _error(__LINE__, "bad XML parse at \"$str\"");
        }
    }

    #use Data::Dumper; print Data::Dumper::Dumper(\@stack), "\n";

    ## Only the synthetic root may remain, holding exactly one element.
    _error(__LINE__, '@stack != 1')   if @stack != 1;
    _error(__LINE__, "not data")      if not $stack[0]->{Data};
    _error(__LINE__, "keys not 1")    if keys(%{ $stack[0]->{Data} }) != 1;

    my ($tree) = values(%{ $stack[0]->{Data} });
    return $tree;
}

1;