File Coverage

blib/lib/XML/RSSLite.pm

Criterion	Covered	Total	%
statement	91	160	56.8
branch	35	90	38.8
condition	7	23	30.4
subroutine	11	13	84.6
pod	2	4	50.0
total	146	290	50.3

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl
2							package XML::RSSLite;
3	2			2		1855	use strict;
	2					3
	2					85
4	2			2		11	use vars qw($VERSION);
	2					4
	2					156
5
6							$VERSION = 0.15;
7
8							sub import{
9	2			2		12	no strict 'refs';
	2					7
	2					4726
10	2			2		15	shift;
11	2					5	my $pkg = scalar caller();
12	2					4	*{"${pkg}::parseRSS"} = \&parseRSS;
	2					13
13	2	50				4030	*{"${pkg}::parseXML"} = \&parseXML if grep($_ eq 'parseXML', @_);
	0					0
14							}
15
16
17							sub parseRSS {
18	2			2	1	264	my ($rr, $cref) = @_;
19
20	2	50				9	die "$rr is not a hash reference" unless ref($rr) eq 'HASH';
21	2	50				10	die "$cref is not a scalar reference" unless ref($cref) eq 'SCALAR';
22
23							# Gotta have some content to parse
24	2	50				7	return unless $$cref;
25
26	2					7	preprocess($cref);
27							{
28	2	100				4	_parseRSS($rr, $cref), last if index(${$cref}, '
	2					3
	2					18
29	1	50				2	_parseRDF($rr, $cref), last if index(${$cref}, '
	1					10
30	0	0				0	_parseSN( $rr, $cref), last if index(${$cref}, '
	0					0
31	0	0				0	_parseWL( $rr, $cref), last if index(${$cref}, '
	0					0
32	0					0	die "Content must be RSS\|RDF\|ScriptingNews\|Weblog\|reasonably close";
33							}
34	2					9	postprocess($rr);
35							}
36
37							sub preprocess {
38	2			2	0	4	my $cref = shift;
39	2					39	$$cref =~ y/\r\n/\n/s;
40	2					34	$$cref =~ y{\n\t ~0-9\-+!@#$%^&*()_=a-zA-Z[]\\;':",./<>?}{ }cs;
41							#XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+\|#\d+);/amp/gs;
42							#XXX Do we wish to (re)allow escaped HTML?!
43	2					1283	$$cref =~ s{(?:<\|<)/?(?:b\|i\|h\d\|p\|center\|quote\|strong)(?:>\|>)}{}gsi;
44							}
45
46							sub _parseRSS {
47	1			1		7	parseXML($_[0], $_[1], 'channel', 0);
48	1					5	$_[0]->{'items'} = $_[0]->{'item'};
49							}
50
51							sub _parseRDF {
52	1			1		2	my ($rr, $cref) = @_;
53
54	1					4	$rr->{'items'} = [];
55	1					2	my $item;
56
57	1					5	parseXML($_[0], $_[1], 'rdf:RDF', 0);
58
59							# Alias RDF to RSS
60	1	50				4	if( exists($rr->{'item'}) ){
61	1					6	$rr->{'items'} = $rr->{'item'};
62							}
63							else{
64	0		0			0	my $li = $_[0]->{'rdf:li'} \|\| $_[0]->{'rdf:Seq'}->{'rdf:li'};
65	0					0	foreach $item ( @{$li} ){
	0					0
66	0					0	my %ia;
67	0	0				0	if (exists $item->{'dc:description'}) {
68	0					0	$ia{'description'} = $item->{'dc:description'};
69							}
70	0	0				0	if (exists $item->{'dc:title'}) {
71	0					0	$ia{'title'} = $item->{'dc:title'};
72							}
73	0	0				0	if (exists $item->{'dc:identifier'}) {
74	0					0	$ia{'link'} = delete($item->{'dc:identifier'});
75							}
76
77	0					0	push(@{$rr->{'items'}}, \%ia);
	0					0
78							}
79							}
80							}
81
82							sub _parseSN {
83	0			0		0	my ($rr, $cref) = @_;
84
85	0					0	$rr->{'items'} = ();
86	0					0	my $item;
87
88	0					0	parseXML($rr, $cref, 'channel', 0);
89
90							# Alias SN to RSS terms
91	0					0	foreach $item ( @{$_[0]->{'rdf:li'}} ){
	0					0
92	0					0	my %ia;
93	0	0				0	if (exists $item->{'text'}) {
94	0					0	$ia{'description'} = $item->{'text'};
95							}
96	0	0				0	if (exists $item->{'linetext'}) {
97	0					0	$ia{'title'} = $item->{'linetext'};
98							}
99	0	0				0	if (exists $item->{'url'}) {
100	0					0	$ia{'link'} = $item->{'url'};
101							}
102
103	0					0	push(@{$rr->{'items'}}, \%ia);
	0					0
104							}
105							}
106
107
108							sub _parseWL {
109	0			0		0	my ($rr, $cref) = @_;
110
111	0					0	$rr->{'items'} = ();
112	0					0	my $item;
113
114							#XXX is this the right tag to parse for?
115	0					0	parseXML($rr, $cref, 'channel', 0);
116
117							# Alias WL to RSS
118	0					0	foreach $item ( @{$_[0]->{'rdf:li'}} ){
	0					0
119	0					0	my %ia;
120	0	0				0	if (exists $item->{'url'}) {
121	0					0	$ia{'link'} = delete($item->{'url'});
122							}
123
124	0					0	push(@{$rr->{'items'}}, \%ia);
	0					0
125							}
126							}
127
128
129							sub postprocess {
130	2			2	0	5	my $rr = shift;
131
132							#XXX Not much to do, what about un-munging URL's in source, etc.?!
133	2	50				9	return unless defined($rr->{'items'});
134	2	50				9	$rr->{'items'} = [$rr->{'items'}] unless ref($rr->{'items'}) eq 'ARRAY';
135
136	2					6	foreach my $i (@{$rr->{'items'}}) {
	2					8
137	26	50				52	$i->{description} = $i->{description}->{'<>'} if ref($i->{description});
138
139							# Put stuff into the right name if necessary
140	26	50				54	if( not $i->{'link'} ){
141	0	0				0	if( defined($i->{'url'}) ){
		0
142	0					0	$i->{'link'} = delete($i->{'url'}); }
143							# See if you can use misplaced url in title for empty links
144							elsif( exists($i->{'title'}) ){
145							# The next case would trap this, but try to short-circuit the gathering
146	0	0				0	if ($i->{'title'} =~ /^(?:https?\|ftp):/) {
		0
147	0					0	$i->{'link'} = $i->{'title'};
148							}
149							elsif ($i->{'title'} =~ /"((?:https?\|ftp).*?)"/) {
150	0					0	$i->{'link'} = $1;
151	0					0	$i->{'title'} =~ s/<.*?>//;
152							}
153							else {
154	0					0	next;
155							}
156							}
157							}
158
159							# Clean bogus whitespace
160	26					206	$i->{'link'} =~ s/^\s+\|\s+$//;
161
162							# Make sure you've got an http/ftp link
163	26	50	33			183	if( exists( $i->{'link'}) && $i->{'link'} !~ m{^(https?\|ftp)://}i) {
164							## Rip link out of anchor tag
165	0	0	0			0	if( ref($i->{'link'}) && $i->{'link'}->{a}->{href} ){
		0	0
166	0					0	$i->{'link'} = $i->{'link'}->{a}->{href} }
167							## Smells like a relative url
168							elsif( $i->{'link'} =~ m{^[#/]} and $rr->{'link'} =~ m{^https?://} ){
169	0	0				0	if (substr($i->{'link'}, 0, 1) ne '/') {
170	0					0	$i->{'link'} = '/' . $i->{'link'};
171							}
172	0					0	$i->{'link'} = $rr->{'link'} . $i->{'link'};
173							}
174							else {
175	0					0	next;
176							}
177							}
178
179							#If we don't have a title, use the link
180	26	50				55	unless( defined($i->{'title'}) ){
181	0					0	$i->{'title'} = $i->{'link'};
182							}
183
184	26	50				50	if( exists($i->{'link'}) ){
185							#XXX # Fix pre-process munging
186							# $i->{'link'} =~ s/&/&/gi;
187	26					56	$i->{'link'} =~ s/ /%20/g;
188							}
189							}
190							}
191
192							sub parseXML{
193	2			2	1	5	my($hash, $xml, $tag, $comments) = @_;
194	2					3	my($begin, $end, @comments);
195	2					5	local $_;
196
197							#Kill comments
198	2		33			4	while( ($begin = index(${$xml}, '') for @comments;
	0					0
215
216							#Expose comments if requested
217	0	0				0	do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY';
	0					0
218							}
219							}
220
221							sub _parseXML{
222	34			34		94	my($hash, $xml, $tag, $index) = @_;
223	34					33	my($begin, $end);
224
225							#Find topTag and set pos to start matching from there
226	34					37	${$xml} =~ /<$tag(?:>\|\s)/g;
	34					285
227	34		50			44	($begin, $end) = (0, pos(${$xml})\|\|0);
228
229							#Match either or , optional attributes, stash tag name
230	34					45	while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]?)?(?:/\|>.?%sg ){
	179					1460
231
232							#Save the tag name, we'll need it
233	145		33			451	$tag = $1 \|\| $2;
234
235							#Save the new beginning and end
236	145					241	($begin, $end) = ($end, pos(${$xml}));
	145					256
237
238							#Get the bit we just matched.
239	145					182	my $str = substr(${$xml}, $begin, $end-$begin);
	145					370
240
241							#Extract the actual attributes and contents of the tag
242	145	100				3668	$str =~ m%<\Q$tag\E\s([^>]?)?>(.*?)%s \|\|
243							#XXX pointed out by hv
244							# $str =~ s%^.?<$tag\s([^>]?)?>(.?)%<$tag>$2%s \|\|
245							$str =~ m%<\Q$tag\E\s([^>]?)?\s*/>%;
246	145					452	my($attr, $content) = ($1, $2);
247
248							#Did we get attributes? clean them up and chuck them in a hash.
249	145	100				271	if( $attr ){
250	43					96	($_, $attr) = ($attr, {});
251	43					600	$attr->{$1} = $3 while m/([^\s=]+)\s=\s(['"]?)([^\2>]*?)(?:\2\|$)/g;
252							}
253
254	145					159	my $inhash;
255							#Recurse if contents has more tags, replace contents with reference we get
256	145	100	100			614	if( $content && index($content, '<') > -1 ){
257	32					93	_parseXML($inhash={}, \$str, $tag);
258							#Was there any data in the contents? We should extract that...
259	32	50				143	if( $str =~ />[^><]+
260							#The odd RE above shortcircuits unnecessary entry
261
262							#Clean whitespace between tags
263							#$str =~ s%(?<=>)?\s*(?=<)%%g; #XXX ~same speed, wacko warning
264							#$str =~ s%(>?)\s*<%$1<%g;
265							#XXX #$str =~ s%(?:^\|(?<=>))\s*(?:(?=<)\|\z)%%g
266
267	32					55	my $qr = qr{@{[join('\|', keys %{$inhash})]}};
	32					37
	32					839
268	32					1413	$content =~ s%<($qr)\s(?:[^>]?)?(?:/\|>.*?%%sg;
269
270	32	50				149	$inhash->{'<>'} = $content if $content =~ /\S/;
271							}
272							}
273
274	145	100				328	if( ref($inhash) ){
		100
275							#We have attributes? Then we should merge them.
276	32	100				64	if( ref($attr) ){
277	22					23	for( keys %{$attr} ){
	22					65
278	0					0	$inhash->{$_} = exists($inhash->{$_}) ?
279							(ref($inhash->{$_}) eq 'ARRAY' ?
280	22	0				87	[@{$inhash->{$_}}, $attr->{$_}] :
		50
281							[ $inhash->{$_}, $attr->{$_}] ) : $attr->{$_};
282							}
283							}
284							}
285							elsif( ref($attr) ){
286	21					26	$inhash = $attr;
287							}
288							else{
289							#Otherwise save our content
290	92					136	$inhash = $content;
291							}
292
293	40					240	$hash->{$tag} = exists($hash->{$tag}) ?
294							(ref($hash->{$tag}) eq 'ARRAY' ?
295	145	100				591	[@{$hash->{$tag}}, $inhash] :
		100
296							[ $hash->{$tag}, $inhash] ) : $inhash;
297							}
298							}
299
300							1;
301							__END__