line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
2
|
|
|
|
|
|
|
package XML::RSSLite; |
3
|
2
|
|
|
2
|
|
1855
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
85
|
|
4
|
2
|
|
|
2
|
|
11
|
use vars qw($VERSION); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
156
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
$VERSION = 0.15; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
sub import{ |
9
|
2
|
|
|
2
|
|
12
|
no strict 'refs'; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
4726
|
|
10
|
2
|
|
|
2
|
|
15
|
shift; |
11
|
2
|
|
|
|
|
5
|
my $pkg = scalar caller(); |
12
|
2
|
|
|
|
|
4
|
*{"${pkg}::parseRSS"} = \&parseRSS; |
|
2
|
|
|
|
|
13
|
|
13
|
2
|
50
|
|
|
|
4030
|
*{"${pkg}::parseXML"} = \&parseXML if grep($_ eq 'parseXML', @_); |
|
0
|
|
|
|
|
0
|
|
14
|
|
|
|
|
|
|
} |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
sub parseRSS { |
18
|
2
|
|
|
2
|
1
|
264
|
my ($rr, $cref) = @_; |
19
|
|
|
|
|
|
|
|
20
|
2
|
50
|
|
|
|
9
|
die "$rr is not a hash reference" unless ref($rr) eq 'HASH'; |
21
|
2
|
50
|
|
|
|
10
|
die "$cref is not a scalar reference" unless ref($cref) eq 'SCALAR'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Gotta have some content to parse |
24
|
2
|
50
|
|
|
|
7
|
return unless $$cref; |
25
|
|
|
|
|
|
|
|
26
|
2
|
|
|
|
|
7
|
preprocess($cref); |
27
|
|
|
|
|
|
|
{ |
28
|
2
|
100
|
|
|
|
4
|
_parseRSS($rr, $cref), last if index(${$cref}, '
|
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
18
|
|
29
|
1
|
50
|
|
|
|
2
|
_parseRDF($rr, $cref), last if index(${$cref}, '
|
|
1
|
|
|
|
|
10
|
|
30
|
0
|
0
|
|
|
|
0
|
_parseSN( $rr, $cref), last if index(${$cref}, '
|
|
0
|
|
|
|
|
0
|
|
31
|
0
|
0
|
|
|
|
0
|
_parseWL( $rr, $cref), last if index(${$cref}, '
|
|
0
|
|
|
|
|
0
|
|
32
|
0
|
|
|
|
|
0
|
die "Content must be RSS|RDF|ScriptingNews|Weblog|reasonably close"; |
33
|
|
|
|
|
|
|
} |
34
|
2
|
|
|
|
|
9
|
postprocess($rr); |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
sub preprocess { |
38
|
2
|
|
|
2
|
0
|
4
|
my $cref = shift; |
39
|
2
|
|
|
|
|
39
|
$$cref =~ y/\r\n/\n/s; |
40
|
2
|
|
|
|
|
34
|
$$cref =~ y{\n\t ~0-9\-+!@#$%^&*()_=a-zA-Z[]\\;':",./<>?}{ }cs; |
41
|
|
|
|
|
|
|
#XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+|#\d+);/amp/gs; |
42
|
|
|
|
|
|
|
#XXX Do we wish to (re)allow escaped HTML?! |
43
|
2
|
|
|
|
|
1283
|
$$cref =~ s{(?:<|<)/?(?:b|i|h\d|p|center|quote|strong)(?:>|>)}{}gsi; |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
sub _parseRSS { |
47
|
1
|
|
|
1
|
|
7
|
parseXML($_[0], $_[1], 'channel', 0); |
48
|
1
|
|
|
|
|
5
|
$_[0]->{'items'} = $_[0]->{'item'}; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub _parseRDF { |
52
|
1
|
|
|
1
|
|
2
|
my ($rr, $cref) = @_; |
53
|
|
|
|
|
|
|
|
54
|
1
|
|
|
|
|
4
|
$rr->{'items'} = []; |
55
|
1
|
|
|
|
|
2
|
my $item; |
56
|
|
|
|
|
|
|
|
57
|
1
|
|
|
|
|
5
|
parseXML($_[0], $_[1], 'rdf:RDF', 0); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Alias RDF to RSS |
60
|
1
|
50
|
|
|
|
4
|
if( exists($rr->{'item'}) ){ |
61
|
1
|
|
|
|
|
6
|
$rr->{'items'} = $rr->{'item'}; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
else{ |
64
|
0
|
|
0
|
|
|
0
|
my $li = $_[0]->{'rdf:li'} || $_[0]->{'rdf:Seq'}->{'rdf:li'}; |
65
|
0
|
|
|
|
|
0
|
foreach $item ( @{$li} ){ |
|
0
|
|
|
|
|
0
|
|
66
|
0
|
|
|
|
|
0
|
my %ia; |
67
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'dc:description'}) { |
68
|
0
|
|
|
|
|
0
|
$ia{'description'} = $item->{'dc:description'}; |
69
|
|
|
|
|
|
|
} |
70
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'dc:title'}) { |
71
|
0
|
|
|
|
|
0
|
$ia{'title'} = $item->{'dc:title'}; |
72
|
|
|
|
|
|
|
} |
73
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'dc:identifier'}) { |
74
|
0
|
|
|
|
|
0
|
$ia{'link'} = delete($item->{'dc:identifier'}); |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
0
|
push(@{$rr->{'items'}}, \%ia); |
|
0
|
|
|
|
|
0
|
|
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub _parseSN { |
83
|
0
|
|
|
0
|
|
0
|
my ($rr, $cref) = @_; |
84
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
0
|
$rr->{'items'} = (); |
86
|
0
|
|
|
|
|
0
|
my $item; |
87
|
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
0
|
parseXML($rr, $cref, 'channel', 0); |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# Alias SN to RSS terms |
91
|
0
|
|
|
|
|
0
|
foreach $item ( @{$_[0]->{'rdf:li'}} ){ |
|
0
|
|
|
|
|
0
|
|
92
|
0
|
|
|
|
|
0
|
my %ia; |
93
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'text'}) { |
94
|
0
|
|
|
|
|
0
|
$ia{'description'} = $item->{'text'}; |
95
|
|
|
|
|
|
|
} |
96
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'linetext'}) { |
97
|
0
|
|
|
|
|
0
|
$ia{'title'} = $item->{'linetext'}; |
98
|
|
|
|
|
|
|
} |
99
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'url'}) { |
100
|
0
|
|
|
|
|
0
|
$ia{'link'} = $item->{'url'}; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
0
|
push(@{$rr->{'items'}}, \%ia); |
|
0
|
|
|
|
|
0
|
|
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
sub _parseWL { |
109
|
0
|
|
|
0
|
|
0
|
my ($rr, $cref) = @_; |
110
|
|
|
|
|
|
|
|
111
|
0
|
|
|
|
|
0
|
$rr->{'items'} = (); |
112
|
0
|
|
|
|
|
0
|
my $item; |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
#XXX is this the right tag to parse for? |
115
|
0
|
|
|
|
|
0
|
parseXML($rr, $cref, 'channel', 0); |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Alias WL to RSS |
118
|
0
|
|
|
|
|
0
|
foreach $item ( @{$_[0]->{'rdf:li'}} ){ |
|
0
|
|
|
|
|
0
|
|
119
|
0
|
|
|
|
|
0
|
my %ia; |
120
|
0
|
0
|
|
|
|
0
|
if (exists $item->{'url'}) { |
121
|
0
|
|
|
|
|
0
|
$ia{'link'} = delete($item->{'url'}); |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
0
|
push(@{$rr->{'items'}}, \%ia); |
|
0
|
|
|
|
|
0
|
|
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub postprocess { |
130
|
2
|
|
|
2
|
0
|
5
|
my $rr = shift; |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
#XXX Not much to do, what about un-munging URL's in source, etc.?! |
133
|
2
|
50
|
|
|
|
9
|
return unless defined($rr->{'items'}); |
134
|
2
|
50
|
|
|
|
9
|
$rr->{'items'} = [$rr->{'items'}] unless ref($rr->{'items'}) eq 'ARRAY'; |
135
|
|
|
|
|
|
|
|
136
|
2
|
|
|
|
|
6
|
foreach my $i (@{$rr->{'items'}}) { |
|
2
|
|
|
|
|
8
|
|
137
|
26
|
50
|
|
|
|
52
|
$i->{description} = $i->{description}->{'<>'} if ref($i->{description}); |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Put stuff into the right name if necessary |
140
|
26
|
50
|
|
|
|
54
|
if( not $i->{'link'} ){ |
141
|
0
|
0
|
|
|
|
0
|
if( defined($i->{'url'}) ){ |
|
|
0
|
|
|
|
|
|
142
|
0
|
|
|
|
|
0
|
$i->{'link'} = delete($i->{'url'}); } |
143
|
|
|
|
|
|
|
# See if you can use misplaced url in title for empty links |
144
|
|
|
|
|
|
|
elsif( exists($i->{'title'}) ){ |
145
|
|
|
|
|
|
|
# The next case would trap this, but try to short-circuit the gathering |
146
|
0
|
0
|
|
|
|
0
|
if ($i->{'title'} =~ /^(?:https?|ftp):/) { |
|
|
0
|
|
|
|
|
|
147
|
0
|
|
|
|
|
0
|
$i->{'link'} = $i->{'title'}; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
elsif ($i->{'title'} =~ /"((?:https?|ftp).*?)"/) { |
150
|
0
|
|
|
|
|
0
|
$i->{'link'} = $1; |
151
|
0
|
|
|
|
|
0
|
$i->{'title'} =~ s/<.*?>//; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
else { |
154
|
0
|
|
|
|
|
0
|
next; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Clean bogus whitespace |
160
|
26
|
|
|
|
|
206
|
$i->{'link'} =~ s/^\s+|\s+$//; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Make sure you've got an http/ftp link |
163
|
26
|
50
|
33
|
|
|
183
|
if( exists( $i->{'link'}) && $i->{'link'} !~ m{^(https?|ftp)://}i) { |
164
|
|
|
|
|
|
|
## Rip link out of anchor tag |
165
|
0
|
0
|
0
|
|
|
0
|
if( ref($i->{'link'}) && $i->{'link'}->{a}->{href} ){ |
|
|
0
|
0
|
|
|
|
|
166
|
0
|
|
|
|
|
0
|
$i->{'link'} = $i->{'link'}->{a}->{href} } |
167
|
|
|
|
|
|
|
## Smells like a relative url |
168
|
|
|
|
|
|
|
elsif( $i->{'link'} =~ m{^[#/]} and $rr->{'link'} =~ m{^https?://} ){ |
169
|
0
|
0
|
|
|
|
0
|
if (substr($i->{'link'}, 0, 1) ne '/') { |
170
|
0
|
|
|
|
|
0
|
$i->{'link'} = '/' . $i->{'link'}; |
171
|
|
|
|
|
|
|
} |
172
|
0
|
|
|
|
|
0
|
$i->{'link'} = $rr->{'link'} . $i->{'link'}; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
else { |
175
|
0
|
|
|
|
|
0
|
next; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
#If we don't have a title, use the link |
180
|
26
|
50
|
|
|
|
55
|
unless( defined($i->{'title'}) ){ |
181
|
0
|
|
|
|
|
0
|
$i->{'title'} = $i->{'link'}; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
26
|
50
|
|
|
|
50
|
if( exists($i->{'link'}) ){ |
185
|
|
|
|
|
|
|
#XXX # Fix pre-process munging |
186
|
|
|
|
|
|
|
# $i->{'link'} =~ s/&/&/gi; |
187
|
26
|
|
|
|
|
56
|
$i->{'link'} =~ s/ /%20/g; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub parseXML{ |
193
|
2
|
|
|
2
|
1
|
5
|
my($hash, $xml, $tag, $comments) = @_; |
194
|
2
|
|
|
|
|
3
|
my($begin, $end, @comments); |
195
|
2
|
|
|
|
|
5
|
local $_; |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
#Kill comments |
198
|
2
|
|
33
|
|
|
4
|
while( ($begin = index(${$xml}, '') for @comments; |
|
0
|
|
|
|
|
0
|
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
#Expose comments if requested |
217
|
0
|
0
|
|
|
|
0
|
do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY'; |
|
0
|
|
|
|
|
0
|
|
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub _parseXML{ |
222
|
34
|
|
|
34
|
|
94
|
my($hash, $xml, $tag, $index) = @_; |
223
|
34
|
|
|
|
|
33
|
my($begin, $end); |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
#Find topTag and set pos to start matching from there |
226
|
34
|
|
|
|
|
37
|
${$xml} =~ /<$tag(?:>|\s)/g; |
|
34
|
|
|
|
|
285
|
|
227
|
34
|
|
50
|
|
|
44
|
($begin, $end) = (0, pos(${$xml})||0); |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
#Match either or , optional attributes, stash tag name |
230
|
34
|
|
|
|
|
45
|
while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]*?)?(?:/|>.*?\1)>%sg ){ |
|
179
|
|
|
|
|
1460
|
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
#Save the tag name, we'll need it |
233
|
145
|
|
33
|
|
|
451
|
$tag = $1 || $2; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
#Save the new beginning and end |
236
|
145
|
|
|
|
|
241
|
($begin, $end) = ($end, pos(${$xml})); |
|
145
|
|
|
|
|
256
|
|
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
#Get the bit we just matched. |
239
|
145
|
|
|
|
|
182
|
my $str = substr(${$xml}, $begin, $end-$begin); |
|
145
|
|
|
|
|
370
|
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
#Extract the actual attributes and contents of the tag |
242
|
145
|
100
|
|
|
|
3668
|
$str =~ m%<\Q$tag\E\s*([^>]*?)?>(.*?)\Q$tag\E>%s || |
243
|
|
|
|
|
|
|
#XXX pointed out by hv |
244
|
|
|
|
|
|
|
# $str =~ s%^.*?<$tag\s*([^>]*?)?>(.*?)$tag>%<$tag>$2$tag>%s || |
245
|
|
|
|
|
|
|
$str =~ m%<\Q$tag\E\s*([^>]*?)?\s*/>%; |
246
|
145
|
|
|
|
|
452
|
my($attr, $content) = ($1, $2); |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
#Did we get attributes? clean them up and chuck them in a hash. |
249
|
145
|
100
|
|
|
|
271
|
if( $attr ){ |
250
|
43
|
|
|
|
|
96
|
($_, $attr) = ($attr, {}); |
251
|
43
|
|
|
|
|
600
|
$attr->{$1} = $3 while m/([^\s=]+)\s*=\s*(['"]?)([^\2>]*?)(?:\2|$)/g; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
145
|
|
|
|
|
159
|
my $inhash; |
255
|
|
|
|
|
|
|
#Recurse if contents has more tags, replace contents with reference we get |
256
|
145
|
100
|
100
|
|
|
614
|
if( $content && index($content, '<') > -1 ){ |
257
|
32
|
|
|
|
|
93
|
_parseXML($inhash={}, \$str, $tag); |
258
|
|
|
|
|
|
|
#Was there any data in the contents? We should extract that... |
259
|
32
|
50
|
|
|
|
143
|
if( $str =~ />[^><]+ ){ |
260
|
|
|
|
|
|
|
#The odd RE above shortcircuits unnecessary entry |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
#Clean whitespace between tags |
263
|
|
|
|
|
|
|
#$str =~ s%(?<=>)?\s*(?=<)%%g; #XXX ~same speed, wacko warning |
264
|
|
|
|
|
|
|
#$str =~ s%(>?)\s*<%$1<%g; |
265
|
|
|
|
|
|
|
#XXX #$str =~ s%(?:^|(?<=>))\s*(?:(?=<)|\z)%%g |
266
|
|
|
|
|
|
|
|
267
|
32
|
|
|
|
|
55
|
my $qr = qr{@{[join('|', keys %{$inhash})]}}; |
|
32
|
|
|
|
|
37
|
|
|
32
|
|
|
|
|
839
|
|
268
|
32
|
|
|
|
|
1413
|
$content =~ s%<($qr)\s*(?:[^>]*?)?(?:/|>.*?\1)>%%sg; |
269
|
|
|
|
|
|
|
|
270
|
32
|
50
|
|
|
|
149
|
$inhash->{'<>'} = $content if $content =~ /\S/; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
|
274
|
145
|
100
|
|
|
|
328
|
if( ref($inhash) ){ |
|
|
100
|
|
|
|
|
|
275
|
|
|
|
|
|
|
#We have attributes? Then we should merge them. |
276
|
32
|
100
|
|
|
|
64
|
if( ref($attr) ){ |
277
|
22
|
|
|
|
|
23
|
for( keys %{$attr} ){ |
|
22
|
|
|
|
|
65
|
|
278
|
0
|
|
|
|
|
0
|
$inhash->{$_} = exists($inhash->{$_}) ? |
279
|
|
|
|
|
|
|
(ref($inhash->{$_}) eq 'ARRAY' ? |
280
|
22
|
0
|
|
|
|
87
|
[@{$inhash->{$_}}, $attr->{$_}] : |
|
|
50
|
|
|
|
|
|
281
|
|
|
|
|
|
|
[ $inhash->{$_}, $attr->{$_}] ) : $attr->{$_}; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
elsif( ref($attr) ){ |
286
|
21
|
|
|
|
|
26
|
$inhash = $attr; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
else{ |
289
|
|
|
|
|
|
|
#Otherwise save our content |
290
|
92
|
|
|
|
|
136
|
$inhash = $content; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
40
|
|
|
|
|
240
|
$hash->{$tag} = exists($hash->{$tag}) ? |
294
|
|
|
|
|
|
|
(ref($hash->{$tag}) eq 'ARRAY' ? |
295
|
145
|
100
|
|
|
|
591
|
[@{$hash->{$tag}}, $inhash] : |
|
|
100
|
|
|
|
|
|
296
|
|
|
|
|
|
|
[ $hash->{$tag}, $inhash] ) : $inhash; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
1; |
301
|
|
|
|
|
|
|
__END__ |