File Coverage

blib/lib/Yahoo/Search/XML.pm

Criterion	Covered	Total	%
statement	6	75	8.0
branch	0	52	0.0
condition	0	3	0.0
subroutine	2	9	22.2
pod	0	5	0.0
total	8	144	5.5

line	stmt	bran	cond	sub	pod	time	code
1							package Yahoo::Search::XML;
2	2			2		14	use strict;
	2					3
	2					92
3	2			2		2136	use Encode;
	2					14953
	2					3282
4
5							our $VERSION = "20100614.1";
6
7							my %enc_cache;
8
9							##
10							## Version history:
11							##
12							## 20060729.004
13							## * handle tags being added by Yahoo!
14							## * slightly better error messages
15							##
16							## 20060428.003 --
17							## * ignore type tags
18							## * allow '-' in a tag name
19							## * properly handle self-closing tags with no attributes, e.g. ""
20							## * added atomic-parens in one area to increase efficiency
21
22							=head1 NAME
23
24
25							=head1 VERSION
26
27							version 1.11.3
28							Yahoo::Search::XML -- Simple routines for parsing XML from Yahoo! Search.
29							(This package is included in, and automatically loaded by, the
30							Yahoo::Search package.)
31
32							=head1 DESCRIPTION
33
34							The XML sent back from Yahoo! is fairly simple, and is guaranteed to be
35							well formed, so we really don't need much more than to make the data easily
36							available. I'd like to use XML::Simple, but it uses XML::Parser, which
37							suffers from crippling memory leaks (in one test, 36k was lost with each
38							parsing of a 7k xml file), so I've rolled my own simple version that might
39							be called, uh, XML::SuperDuperSimple.
40
41							The end result is identical to what XML::Simple would produce, at least for
42							the XML the Yahoo! sends back. It may well be useful for other things that
43							use a similarly small subset of XML notation.
44
45							This package is also much faster than XML::Simple / XML::Parser, producing
46							the same output 41 times faster, in my tests. That's the benefit of not
47							having to handle everything, I guess.
48
49							=head1 AUTHOR
50
51							Jeffrey Friedl
52							Kyoto, Japan
53							Feb 2005
54
55							=cut
56
57							my $error;
58							my @stack;
59
60							##
61							## Process a start tag.
62							##
63							sub Start
64							{
65	0			0	0		my ($tag, %attr) = @_;
66
67	0						my $node = {
68							Tag => $tag,
69							Char => "",
70							};
71
72	0	0					if (%attr) {
73	0						$node->{Data} = \%attr;
74							}
75
76	0						push @stack, $node;
77							}
78
79							##
80							## Process raw text
81							##
82							sub Char
83							{
84	0			0	0		my ($str) = @_;
85	0						$stack[-1]->{Char} .= $str;
86							}
87
88							sub _error($$)
89							{
90	0			0			my $line = shift;
91	0						my $msg = shift;
92
93	0						die "Error in Yahoo::Search::XML on line $line: $msg\n";
94							}
95
96
97							##
98							## Process an end tag
99							##
100							sub End
101							{
102	0			0	0		my ($tag) = @_;
103	0						my $node = pop @stack;
104
105	0						my $val;
106
107							##
108							## There is {Data} if there were xml tags between this $tag's start and
109							## the end we're processing now.
110							##
111							## There's {Char} if text was between.
112							##
113							## We never expect both, so we watch out for that here...
114							##
115	0	0					if ($node->{Data})
		0
116							{
117	0	0					if ($node->{Char} =~ m/^\s*$/) {
118	0						$node->{Char} = "";
119							} else {
120	0						_error(__LINE__, "not expecting both text and structure as content of <$tag>");
121							}
122	0						$val = $node->{Data};
123							}
124							elsif ($node->{Char} ne "")
125							{
126	0						$val = $node->{Char};
127							}
128							else
129							{
130	0						$val = "";
131							}
132
133							##
134							## Shove this data ($val) into the previous node, named for this $tag
135							##
136	0	0					if (not $stack[-1]->{Data}->{$node->{Tag}}) {
		0
137	0						$stack[-1]->{Data}->{$node->{Tag}} = $val;
138							} elsif (ref($stack[-1]->{Data}->{$node->{Tag}}) eq "ARRAY") {
139	0						push @{ $stack[-1]->{Data}->{$node->{Tag}} }, $val;
	0
140							} else {
141	0						$stack[-1]->{Data}->{$node->{Tag}} = [ $stack[-1]->{Data}->{$node->{Tag}}, $val ];
142							}
143							}
144
145							my %EntityDecode =
146							(
147							amp => '&',
148							lt => '<',
149							gt => '>',
150							apos => "'",
151							quot => '"', #"
152							);
153
154							sub _entity($)
155							{
156	0			0			my $name = shift;
157	0	0					if (my $val = $EntityDecode{$name}) {
		0
		0
158	0						return $val;
159							} elsif ($name =~ m/^#(\d+)$/) {
160	0						return chr($1);
161							} elsif ($name =~ m/^#x([0-9a-f]+)$/i) {
162	0						return chr(hex($1));
163							} else {
164	0						_error(__LINE__, "unknown entity &$name;");
165							}
166							}
167
168							sub de_grok($)
169							{
170	0			0	0		my $text = shift;
171	0						$text =~ s/&([^;]+);/_entity($1)/gxe;
	0
172	0						return $text;
173							}
174
175							sub Parse($)
176							{
177	0			0	0		my $xml = shift;
178
179	0						@stack = {};
180
181							## skip past the leading tag
182	0	0					if ($xml =~ m/\A <\?xml(.*?)> /xgcs) {
183	0						my $xml_header = $1;
184							# XXX doesn't handle BOM, just assumes UTF-8 if not explicit
185							# (some yahoo services don't include an explicit encoding)
186	0	0					my $encoding = ($xml_header =~ /encoding="(.*?)"/) ? $1 : "UTF-8";
187	0						my $enc = $enc_cache{$encoding} = find_encoding($encoding);
188							# decode the bytes into a perl utf8 string
189							# taking care to preserve the pos-ition.
190	0						my $pos = pos($xml);
191	0						$xml = $enc->decode($xml);
192	0						pos($xml) = $pos;
193							}
194
195	0						while (pos($xml) < length($xml))
196							{
197							#my $x = substr($xml, pos($xml), 30);
198							#$x .= "..." if length($x) == 30;
199							#$x =~ s/\n/\\n/g;
200							#my $STACK = join ">", map { $_->{Tag} } @stack;
201							#print "[$STACK] now at [$x]\n";
202
203							##
204							## Nab , , and tags...
205							##
206	0	0					if ($xml =~ m{\G
		0
		0
		0
		0
207							<(/?) # $1 - true if an ending tag
208							( (?> [-:\w]+ ) ) # $2 - tag name
209							([^>]*) # $3 - attributes (and possible final '/')
210							>}xgc)
211							{
212	0						my ($IsEnd, $TagName, $Attribs) = ($1, $2, $3);
213
214	0	0	0				my $IsImmediateEnd = 1 if ($Attribs and $Attribs =~ s{/$}{});
215
216	0	0					if ($TagName eq 'wbr')
		0
217							{
218							## skip it
219							}
220							elsif ($IsEnd) {
221	0						End($TagName);
222							} else {
223	0						my %A;
224	0	0					if ($Attribs)
225							{
226	0						while ($Attribs =~ m/([:\w]+)=(?: "([^\"])" \| '([^\'])' )/xg) {
227	0	0					$A{$1} = de_grok(defined($3) ? $3 : $2);
228							}
229							}
230	0						Start($TagName, %A);
231	0	0					if ($IsImmediateEnd) {
232	0						End($TagName);
233							}
234							}
235							}
236							elsif ($xml =~ m/\G/xgcs)
237							{
238							## comment -- ignore
239							}
240							elsif ($xml =~ m/\G]+>/xgcs)
241							{
242							## , etc. -- ignore
243							}
244							##
245							## Nab raw text / entities
246							##
247							elsif ($xml =~ m/\G /xgcs)
248							{
249	0						Char($1);
250							}
251							elsif ($xml =~ m/\G ([^<>]+)/xgc)
252							{
253	0						Char(de_grok($1));
254							}
255							else
256							{
257	0						my ($str) = $xml =~ m/\G(.{1,40})/;
258	0	0					$str .= "..." if length($str) == 40;
259	0						_error(__LINE__, "bad XML parse at \"$str\"");
260							}
261							}
262
263							#use Data::Dumper; print Data::Dumper::Dumper(\@stack), "\n";
264	0	0					_error(__LINE__, '@stack != 1') if @stack != 1;
265	0	0					_error(__LINE__, "not data") if not $stack[0]->{Data};
266	0	0					_error(__LINE__, "keys not 1") if keys(%{ $stack[0]->{Data}} ) != 1;
	0
267	0						my ($tree) = values(%{$stack[0]->{Data}});
	0
268	0						return $tree;
269							}
270
271							1;