line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package dirtyRSS; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
6948
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
46
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8547
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require Exporter; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
@dirtyRSS::ISA = qw[Exporter]; |
9
|
|
|
|
|
|
|
@dirtyRSS::EXPORT = qw[&parse &disptree]; |
10
|
|
|
|
|
|
|
$dirtyRSS::VERSION = '0.3'; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our %htmlescapes = ( |
13
|
|
|
|
|
|
|
'quot' => 34, |
14
|
|
|
|
|
|
|
'amp' => 38, |
15
|
|
|
|
|
|
|
'apos' => 39, |
16
|
|
|
|
|
|
|
'lt' => 60, |
17
|
|
|
|
|
|
|
'gt' => 62, |
18
|
|
|
|
|
|
|
'nbsp' => 32, # Was 160, but we make it a normal space |
19
|
|
|
|
|
|
|
'iexcl' => 161, |
20
|
|
|
|
|
|
|
'cent' => 162, |
21
|
|
|
|
|
|
|
'pound' => 163, |
22
|
|
|
|
|
|
|
'curren' => 164, |
23
|
|
|
|
|
|
|
'yen' => 165, |
24
|
|
|
|
|
|
|
'brvbar' => 166, |
25
|
|
|
|
|
|
|
'sect' => 167, |
26
|
|
|
|
|
|
|
'uml' => 168, |
27
|
|
|
|
|
|
|
'copy' => 169, |
28
|
|
|
|
|
|
|
'ordf' => 170, |
29
|
|
|
|
|
|
|
'laquo' => 171, |
30
|
|
|
|
|
|
|
'not' => 172, |
31
|
|
|
|
|
|
|
'shy' => 173, |
32
|
|
|
|
|
|
|
'reg' => 174, |
33
|
|
|
|
|
|
|
'macr' => 175, |
34
|
|
|
|
|
|
|
'deg' => 176, |
35
|
|
|
|
|
|
|
'plusmn' => 177, |
36
|
|
|
|
|
|
|
'sup2' => 178, |
37
|
|
|
|
|
|
|
'sup3' => 179, |
38
|
|
|
|
|
|
|
'acute' => 180, |
39
|
|
|
|
|
|
|
'micro' => 181, |
40
|
|
|
|
|
|
|
'para' => 182, |
41
|
|
|
|
|
|
|
'middot' => 183, |
42
|
|
|
|
|
|
|
'cedil' => 184, |
43
|
|
|
|
|
|
|
'sup1' => 185, |
44
|
|
|
|
|
|
|
'ordm' => 186, |
45
|
|
|
|
|
|
|
'raquo' => 187, |
46
|
|
|
|
|
|
|
'frac14' => 188, |
47
|
|
|
|
|
|
|
'frac12' => 189, |
48
|
|
|
|
|
|
|
'frac34' => 190, |
49
|
|
|
|
|
|
|
'iquest' => 191, |
50
|
|
|
|
|
|
|
'agrave' => 192, |
51
|
|
|
|
|
|
|
'aacute' => 193, |
52
|
|
|
|
|
|
|
'acirc' => 194, |
53
|
|
|
|
|
|
|
'atilde' => 195, |
54
|
|
|
|
|
|
|
'auml' => 196, |
55
|
|
|
|
|
|
|
'aring' => 197, |
56
|
|
|
|
|
|
|
'aelig' => 198, |
57
|
|
|
|
|
|
|
'ccedil' => 199, |
58
|
|
|
|
|
|
|
'egrave' => 200, |
59
|
|
|
|
|
|
|
'eacute' => 201, |
60
|
|
|
|
|
|
|
'ecirc' => 202, |
61
|
|
|
|
|
|
|
'euml' => 203, |
62
|
|
|
|
|
|
|
'igrave' => 204, |
63
|
|
|
|
|
|
|
'iacute' => 205, |
64
|
|
|
|
|
|
|
'icirc' => 206, |
65
|
|
|
|
|
|
|
'iuml' => 207, |
66
|
|
|
|
|
|
|
'eth' => 208, |
67
|
|
|
|
|
|
|
'ntilde' => 209, |
68
|
|
|
|
|
|
|
'ograve' => 210, |
69
|
|
|
|
|
|
|
'oacute' => 211, |
70
|
|
|
|
|
|
|
'ocirc' => 212, |
71
|
|
|
|
|
|
|
'otilde' => 213, |
72
|
|
|
|
|
|
|
'ouml' => 214, |
73
|
|
|
|
|
|
|
'times' => 215, |
74
|
|
|
|
|
|
|
'oslash' => 216, |
75
|
|
|
|
|
|
|
'ugrave' => 217, |
76
|
|
|
|
|
|
|
'uacute' => 218, |
77
|
|
|
|
|
|
|
'ucirc' => 219, |
78
|
|
|
|
|
|
|
'uuml' => 220, |
79
|
|
|
|
|
|
|
'yacute' => 221, |
80
|
|
|
|
|
|
|
'thorn' => 222, |
81
|
|
|
|
|
|
|
'szlig' => 223, |
82
|
|
|
|
|
|
|
'agrave' => 224, |
83
|
|
|
|
|
|
|
'aacute' => 225, |
84
|
|
|
|
|
|
|
'acirc' => 226, |
85
|
|
|
|
|
|
|
'atilde' => 227, |
86
|
|
|
|
|
|
|
'auml' => 228, |
87
|
|
|
|
|
|
|
'aring' => 229, |
88
|
|
|
|
|
|
|
'aelig' => 230, |
89
|
|
|
|
|
|
|
'ccedil' => 231, |
90
|
|
|
|
|
|
|
'egrave' => 232, |
91
|
|
|
|
|
|
|
'eacute' => 233, |
92
|
|
|
|
|
|
|
'ecirc' => 234, |
93
|
|
|
|
|
|
|
'euml' => 235, |
94
|
|
|
|
|
|
|
'igrave' => 236, |
95
|
|
|
|
|
|
|
'iacute' => 237, |
96
|
|
|
|
|
|
|
'icirc' => 238, |
97
|
|
|
|
|
|
|
'iuml' => 239, |
98
|
|
|
|
|
|
|
'eth' => 240, |
99
|
|
|
|
|
|
|
'ntilde' => 241, |
100
|
|
|
|
|
|
|
'ograve' => 242, |
101
|
|
|
|
|
|
|
'oacute' => 243, |
102
|
|
|
|
|
|
|
'ocirc' => 244, |
103
|
|
|
|
|
|
|
'otilde' => 245, |
104
|
|
|
|
|
|
|
'ouml' => 246, |
105
|
|
|
|
|
|
|
'divide' => 247, |
106
|
|
|
|
|
|
|
'oslash' => 248, |
107
|
|
|
|
|
|
|
'ugrave' => 249, |
108
|
|
|
|
|
|
|
'uacute' => 250, |
109
|
|
|
|
|
|
|
'ucirc' => 251, |
110
|
|
|
|
|
|
|
'uuml' => 252, |
111
|
|
|
|
|
|
|
'yacute' => 253, |
112
|
|
|
|
|
|
|
'thorn' => 254, |
113
|
|
|
|
|
|
|
'yuml' => 255 |
114
|
|
|
|
|
|
|
); |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# These are typical HTML tags, which should be omitted. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
our %ignore_tags = ( |
119
|
|
|
|
|
|
|
'img' => 1, |
120
|
|
|
|
|
|
|
'a' => 1, |
121
|
|
|
|
|
|
|
'p' => 1, |
122
|
|
|
|
|
|
|
'br' => 1, |
123
|
|
|
|
|
|
|
'div' => 1, |
124
|
|
|
|
|
|
|
'span' => 1, |
125
|
|
|
|
|
|
|
'b' => 1, |
126
|
|
|
|
|
|
|
'i' => 1, |
127
|
|
|
|
|
|
|
'u' => 1, |
128
|
|
|
|
|
|
|
'body' => 1, |
129
|
|
|
|
|
|
|
'center' => 1, |
130
|
|
|
|
|
|
|
'code' => 1, |
131
|
|
|
|
|
|
|
'font' => 1, |
132
|
|
|
|
|
|
|
'form' => 1, |
133
|
|
|
|
|
|
|
'h1' => 1, |
134
|
|
|
|
|
|
|
'h2' => 1, |
135
|
|
|
|
|
|
|
'h3' => 1, |
136
|
|
|
|
|
|
|
'h4' => 1, |
137
|
|
|
|
|
|
|
'head' => 1, |
138
|
|
|
|
|
|
|
'hr' => 1, |
139
|
|
|
|
|
|
|
'html' => 1, |
140
|
|
|
|
|
|
|
'li' => 1, |
141
|
|
|
|
|
|
|
'ul' => 1, |
142
|
|
|
|
|
|
|
'ol' => 1, |
143
|
|
|
|
|
|
|
'pre' => 1, |
144
|
|
|
|
|
|
|
'style' => 1, |
145
|
|
|
|
|
|
|
'sub' => 1, |
146
|
|
|
|
|
|
|
'sup' => 1, |
147
|
|
|
|
|
|
|
'script' => 1, |
148
|
|
|
|
|
|
|
'small' => 1, |
149
|
|
|
|
|
|
|
'big' => 1, |
150
|
|
|
|
|
|
|
'table' => 1, |
151
|
|
|
|
|
|
|
'td' => 1, |
152
|
|
|
|
|
|
|
'tr' => 1, |
153
|
|
|
|
|
|
|
'th' => 1, |
154
|
|
|
|
|
|
|
'textarea'=> 1, |
155
|
|
|
|
|
|
|
'strong' => 1, |
156
|
|
|
|
|
|
|
'strike' => 1, |
157
|
|
|
|
|
|
|
'blockquote' => 1, |
158
|
|
|
|
|
|
|
); |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
our %ns = ( |
161
|
|
|
|
|
|
|
# RSS 2.0 tags |
162
|
|
|
|
|
|
|
'xml' => 'xml', |
163
|
|
|
|
|
|
|
'rss' => 'rss', |
164
|
|
|
|
|
|
|
'rdf' => 'rdf', |
165
|
|
|
|
|
|
|
'item' => 'item', |
166
|
|
|
|
|
|
|
'channel' => 'channel', |
167
|
|
|
|
|
|
|
'image' => 'image', |
168
|
|
|
|
|
|
|
'title' => 'title', |
169
|
|
|
|
|
|
|
'link' => 'link', |
170
|
|
|
|
|
|
|
'description' => 'description', |
171
|
|
|
|
|
|
|
'language' => 'language', |
172
|
|
|
|
|
|
|
'copyright' => 'copyright', |
173
|
|
|
|
|
|
|
'pubdate' => 'pubdate', |
174
|
|
|
|
|
|
|
'lastbuilddate'=> 'lastbuilddate', |
175
|
|
|
|
|
|
|
'category' => 'category', |
176
|
|
|
|
|
|
|
'generator' => 'generator', |
177
|
|
|
|
|
|
|
'ttl' => 'ttl', |
178
|
|
|
|
|
|
|
'url' => 'url', |
179
|
|
|
|
|
|
|
'width' => 'width', |
180
|
|
|
|
|
|
|
'height' => 'height', |
181
|
|
|
|
|
|
|
'version' => 'version', |
182
|
|
|
|
|
|
|
'encoding' => 'encoding', |
183
|
|
|
|
|
|
|
'guid' => 'guid', |
184
|
|
|
|
|
|
|
'enclosure' => 'enclosure', |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# RSS 1.0 tags translated to RSS 2.0 |
187
|
|
|
|
|
|
|
'subject' => 'category', |
188
|
|
|
|
|
|
|
'rights' => 'copyright', |
189
|
|
|
|
|
|
|
'modified' => 'lastbuilddate', |
190
|
|
|
|
|
|
|
'date' => 'pubdate', |
191
|
|
|
|
|
|
|
'resource' => 'resource', # 1.0 specific! |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Atom 1.0 tags translated to RSS 2.0 |
194
|
|
|
|
|
|
|
'feed' => 'channel', |
195
|
|
|
|
|
|
|
'summary' => 'description', |
196
|
|
|
|
|
|
|
'content' => 'description', |
197
|
|
|
|
|
|
|
'subtitle' => 'description', |
198
|
|
|
|
|
|
|
'lang' => 'language', |
199
|
|
|
|
|
|
|
'published' => 'pubdate', |
200
|
|
|
|
|
|
|
'updated' => 'lastbuilddate', |
201
|
|
|
|
|
|
|
'logo' => 'image', |
202
|
|
|
|
|
|
|
'entry' => 'item', |
203
|
|
|
|
|
|
|
'href' => 'link', |
204
|
|
|
|
|
|
|
); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# Note that %specials refer to the *right* side of %ns, so only one |
207
|
|
|
|
|
|
|
# entry is needed for each functional tag or its alias |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# TRUE means array type |
210
|
|
|
|
|
|
|
our %specials = ( |
211
|
|
|
|
|
|
|
'item' => 1, |
212
|
|
|
|
|
|
|
'channel' => 1, |
213
|
|
|
|
|
|
|
'image' => 1, |
214
|
|
|
|
|
|
|
'xml' => 0, |
215
|
|
|
|
|
|
|
'rss' => 0, |
216
|
|
|
|
|
|
|
'rdf' => 0, |
217
|
|
|
|
|
|
|
); |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
sub parse { |
220
|
1
|
|
|
1
|
0
|
102
|
my ($in, $debug) = @_; |
221
|
|
|
|
|
|
|
|
222
|
1
|
|
|
|
|
5
|
$in =~ s///gs; # Remove comments |
223
|
|
|
|
|
|
|
|
224
|
1
|
|
|
|
|
90
|
my @segs = map { /^[ \n\r\t]*(.*?)[ \n\r\t]*$/s } ($in =~ /(|<[^>]+?>|[^<]+)/gs); |
|
90
|
|
|
|
|
309
|
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# Strip off CDATAs. Added a prefix space to avoid accidental tag hits |
227
|
1
|
50
|
|
|
|
13
|
@segs = map { /^$/s ? " $1" : $_ } @segs; |
|
90
|
|
|
|
|
172
|
|
228
|
|
|
|
|
|
|
|
229
|
1
|
|
|
|
|
11
|
@segs = grep { length > 0 } @segs; |
|
90
|
|
|
|
|
136
|
|
230
|
|
|
|
|
|
|
|
231
|
1
|
|
|
|
|
5
|
my @stack = (); |
232
|
1
|
|
|
|
|
2
|
my @valstack = (); |
233
|
1
|
|
|
|
|
3
|
my %tree = (); |
234
|
1
|
|
|
|
|
2
|
my $here = \%tree; |
235
|
1
|
|
|
|
|
2
|
my @parent = (); |
236
|
1
|
|
|
|
|
2
|
my $lastval = ""; |
237
|
|
|
|
|
|
|
|
238
|
1
|
|
|
|
|
3
|
foreach my $elem (@segs) { |
239
|
63
|
|
|
|
|
328
|
my ($modifier, $tag, $attr, $empty) = ($elem =~ /^<([!?\#]{0,1})[ \n\r\t]*([^ \n\r\t]*[^ \/\n\r\t])[ \n\r\t]*(.*?)[ \n\r\t]*(\/{0,1})>$/s); |
240
|
|
|
|
|
|
|
|
241
|
63
|
100
|
|
|
|
126
|
$empty = 1 if ($modifier); |
242
|
|
|
|
|
|
|
|
243
|
63
|
100
|
|
|
|
99
|
if (defined $tag) { |
244
|
45
|
|
|
|
|
61
|
$tag = lc $tag; # We're case-insensitive |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# Note that the regex below removes "dc:"-like namespace prefices |
247
|
45
|
|
|
|
|
43
|
my $closing; |
248
|
45
|
|
|
|
|
185
|
($closing, $tag) = ($tag =~ /^(\/{0,1}).*?:{0,1}([^:]*)$/); |
249
|
|
|
|
|
|
|
|
250
|
45
|
50
|
|
|
|
125
|
if ($ignore_tags{$tag}) { |
251
|
0
|
|
|
|
|
0
|
htmltags($here, unescape($elem)); |
252
|
0
|
|
|
|
|
0
|
next; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
45
|
100
|
|
|
|
76
|
unless ($closing) { # Opening tags... |
256
|
23
|
|
|
|
|
35
|
push @stack, $tag; |
257
|
|
|
|
|
|
|
|
258
|
23
|
|
|
|
|
34
|
my $alias = $ns{$tag}; |
259
|
|
|
|
|
|
|
|
260
|
23
|
50
|
|
|
|
35
|
if (defined $alias) { |
261
|
23
|
|
|
|
|
37
|
push @valstack, $lastval; |
262
|
23
|
|
|
|
|
57
|
$lastval = ""; |
263
|
|
|
|
|
|
|
|
264
|
23
|
100
|
|
|
|
55
|
if (defined $specials{$alias}) { |
265
|
5
|
|
|
|
|
7
|
push @parent, $here; |
266
|
5
|
|
|
|
|
7
|
$here = {}; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Note that attributes may pollute the parent hash. This is |
270
|
|
|
|
|
|
|
# necessary to support Atom 1.0 |
271
|
|
|
|
|
|
|
|
272
|
23
|
|
|
|
|
56
|
my @pairs = ($attr =~ /([^ \n\r\t]+?=\'[^\']*?\'|[^ \n\r\t]+?=\"[^\"]*?\"|[^ \n\r\t]+?=[^ \n\r\t]*)/g); |
273
|
|
|
|
|
|
|
|
274
|
23
|
|
|
|
|
117
|
foreach my $p (@pairs) { |
275
|
4
|
|
|
|
|
19
|
my ($k, $v) = ($p =~ /(.+?)=(.*)/); |
276
|
|
|
|
|
|
|
|
277
|
4
|
|
|
|
|
8
|
$k = lc $k; |
278
|
|
|
|
|
|
|
|
279
|
4
|
50
|
33
|
|
|
35
|
$v = $1 |
280
|
|
|
|
|
|
|
if (($v =~ /^\'(.*)\'$/s) || ($v =~ /^\"(.*)\"$/s)); |
281
|
|
|
|
|
|
|
|
282
|
4
|
|
|
|
|
16
|
($k) = ($k =~ /([^:]*)$/); # Remove namespace prefix if present |
283
|
|
|
|
|
|
|
|
284
|
4
|
|
|
|
|
9
|
my $alias = $ns{$k}; |
285
|
|
|
|
|
|
|
|
286
|
4
|
100
|
|
|
|
8
|
if (defined $alias) { |
287
|
3
|
|
|
|
|
8
|
$here->{$alias} = unescape($v); |
288
|
|
|
|
|
|
|
} else { |
289
|
1
|
50
|
|
|
|
7
|
warn "Ignored attribute $k=$v\n" |
290
|
|
|
|
|
|
|
if $debug; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
} else { |
294
|
0
|
0
|
|
|
|
0
|
warn "Ignored tag $tag\n" |
295
|
|
|
|
|
|
|
if $debug; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
45
|
100
|
100
|
|
|
154
|
if ($closing || $empty) { # Closing tags, or close an empty opening tag |
300
|
23
|
|
|
|
|
33
|
my $p = pop @stack; |
301
|
|
|
|
|
|
|
|
302
|
23
|
50
|
|
|
|
50
|
return "Bad XML tag nesting. Expected end tag for '$p', got '/$tag'" |
303
|
|
|
|
|
|
|
unless ($p eq $tag); |
304
|
|
|
|
|
|
|
|
305
|
23
|
|
|
|
|
40
|
my $alias = $ns{$tag}; |
306
|
|
|
|
|
|
|
|
307
|
23
|
50
|
|
|
|
42
|
if (defined $alias) { |
308
|
23
|
|
|
|
|
27
|
my $thislastval = $lastval; |
309
|
23
|
|
|
|
|
29
|
$lastval = pop @valstack; |
310
|
|
|
|
|
|
|
|
311
|
23
|
100
|
|
|
|
62
|
if (defined $specials{$alias}) { |
312
|
5
|
|
|
|
|
6
|
my $parent = pop @parent; |
313
|
|
|
|
|
|
|
|
314
|
5
|
100
|
|
|
|
34
|
if ($specials{$alias}) { # Array type |
315
|
3
|
50
|
33
|
|
|
27
|
$parent->{$alias} = [] |
316
|
|
|
|
|
|
|
unless ((ref $parent->{$alias}) && |
317
|
|
|
|
|
|
|
(ref $parent->{$alias}) eq 'ARRAY'); |
318
|
3
|
|
|
|
|
7
|
push @{$parent->{$alias}}, $here; |
|
3
|
|
|
|
|
8
|
|
319
|
|
|
|
|
|
|
} else { |
320
|
2
|
|
|
|
|
6
|
$parent->{$alias} = $here; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# UGLY HACK ALERT: |
324
|
|
|
|
|
|
|
# Just before leaving a tree node, we clean the 'description' |
325
|
|
|
|
|
|
|
# from possible HTML tags, and harvest the relevant values, |
326
|
|
|
|
|
|
|
# if applicable. This is because some feeds think that the |
327
|
|
|
|
|
|
|
# description should be rendered on a browser as is (cross |
328
|
|
|
|
|
|
|
# scripting, anybody?) |
329
|
|
|
|
|
|
|
|
330
|
5
|
100
|
|
|
|
20
|
$here->{'description'} =~ s/(<.*?>)/htmltags($here, $1)/ges |
|
0
|
|
|
|
|
0
|
|
331
|
|
|
|
|
|
|
if (defined $here->{'description'}); |
332
|
|
|
|
|
|
|
|
333
|
5
|
|
|
|
|
14
|
$here = $parent; |
334
|
|
|
|
|
|
|
} else { |
335
|
18
|
0
|
33
|
|
|
61
|
$here->{$alias} = unescape($thislastval) |
|
|
|
33
|
|
|
|
|
336
|
|
|
|
|
|
|
unless ((length($thislastval) == 0) && |
337
|
|
|
|
|
|
|
(defined $here->{$alias}) && |
338
|
|
|
|
|
|
|
(length $here->{$alias})); |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
} else { |
343
|
18
|
50
|
|
|
|
42
|
$lastval = (length $lastval) ? "$lastval $elem" : $elem; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
} |
347
|
1
|
50
|
|
|
|
5
|
return("Bad XML nesting: There were unclosed tags at EOF") |
348
|
|
|
|
|
|
|
if (@stack); |
349
|
|
|
|
|
|
|
|
350
|
1
|
|
|
|
|
13
|
return \%tree; |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
sub htmltags { |
355
|
0
|
|
|
0
|
0
|
0
|
my ($here, $seg) = @_; |
356
|
|
|
|
|
|
|
|
357
|
0
|
|
|
|
|
0
|
my ($tag, $attr) = ($seg =~ /^<[ \n\r\t]*([^ \n\r\t]+)[ \n\r\t]*(.*?)[ \n\r\t]*>$/s); |
358
|
|
|
|
|
|
|
|
359
|
0
|
0
|
|
|
|
0
|
return "" unless (defined $tag); |
360
|
|
|
|
|
|
|
|
361
|
0
|
|
|
|
|
0
|
$tag = lc $tag; |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# Respect HTML line breaks, even though the renderer won't |
364
|
0
|
0
|
0
|
|
|
0
|
return "\n" |
365
|
|
|
|
|
|
|
if (($tag eq 'p') || ($tag eq 'br')); |
366
|
|
|
|
|
|
|
|
367
|
0
|
0
|
0
|
|
|
0
|
if (($tag eq 'img') && !(defined $here->{'altimage'})) { |
|
|
0
|
0
|
|
|
|
|
368
|
0
|
|
|
|
|
0
|
my $new = {}; |
369
|
0
|
|
|
|
|
0
|
$here->{'altimage'} = $new; |
370
|
0
|
|
|
|
|
0
|
$here = $new; |
371
|
|
|
|
|
|
|
} elsif (($tag eq 'a') && !(defined $here->{'altlink'})) { |
372
|
0
|
|
|
|
|
0
|
my $new = {}; |
373
|
0
|
|
|
|
|
0
|
$here->{'altlink'} = $new; |
374
|
0
|
|
|
|
|
0
|
$here = $new; |
375
|
0
|
|
|
|
|
0
|
} else { return ""; } |
376
|
|
|
|
|
|
|
|
377
|
0
|
|
|
|
|
0
|
my @pairs = ($attr =~ /([^ \n\r\t]+?=\'[^\']*?\'|[^ \n\r\t]+?=\"[^\"]*?\"|[^ \n\r\t]+?=[^ \n\r\t]*)/g); |
378
|
|
|
|
|
|
|
|
379
|
0
|
|
|
|
|
0
|
foreach my $p (@pairs) { |
380
|
0
|
|
|
|
|
0
|
my ($k, $v) = ($p =~ /(.+?)=(.*)/); |
381
|
|
|
|
|
|
|
|
382
|
0
|
|
|
|
|
0
|
$k = lc $k; |
383
|
|
|
|
|
|
|
|
384
|
0
|
0
|
0
|
|
|
0
|
$v = $1 |
385
|
|
|
|
|
|
|
if (($v =~ /^\'(.*)\'$/s) || ($v =~ /^\"(.*)\"$/s)); |
386
|
|
|
|
|
|
|
|
387
|
0
|
|
|
|
|
0
|
$here->{$k} = $v; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
|
390
|
0
|
|
|
|
|
0
|
return ""; # This makes the function useful in substitutions |
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
sub single_unescape { |
394
|
0
|
|
|
0
|
0
|
0
|
my ($ent) = @_; |
395
|
|
|
|
|
|
|
|
396
|
0
|
|
|
|
|
0
|
my $ord = $htmlescapes{lc($ent)}; |
397
|
0
|
0
|
|
|
|
0
|
return chr($ord) if defined $ord; |
398
|
0
|
|
|
|
|
0
|
return ""; # Conversion failed, return nothing |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
sub unescape { |
402
|
|
|
|
|
|
|
# Note! Unicode characters are escaped to space! |
403
|
21
|
|
|
21
|
0
|
24
|
my ($x) = @_; |
404
|
|
|
|
|
|
|
# For now, we go wild, and convert all escape markers |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
# Run twice, because of double-nested markups :-O |
407
|
21
|
|
|
|
|
50
|
for (my $i=0; $i<2; $i++) { |
408
|
42
|
|
|
|
|
48
|
$x =~ s/&(\w+);/single_unescape($1)/ge; |
|
0
|
|
|
|
|
0
|
|
409
|
42
|
0
|
|
|
|
48
|
$x =~ s/&\#(\d+);/chr($1 < 256 ? $1 : 32)/ge; |
|
0
|
|
|
|
|
0
|
|
410
|
42
|
0
|
|
|
|
87
|
$x =~ s/&\#x([0-9a-fA-F]+);/chr(hex($1) < 256 ? hex($1) : 32)/ige; |
|
0
|
|
|
|
|
0
|
|
411
|
|
|
|
|
|
|
} |
412
|
21
|
|
|
|
|
105
|
return $x; |
413
|
|
|
|
|
|
|
} |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
sub disptree { |
416
|
0
|
|
|
0
|
0
|
|
my ($what, $s) = @_; |
417
|
|
|
|
|
|
|
|
418
|
0
|
|
|
|
|
|
foreach my $k (sort keys %{$what}) { |
|
0
|
|
|
|
|
|
|
419
|
0
|
|
|
|
|
|
my $v = $what->{$k}; |
420
|
|
|
|
|
|
|
|
421
|
0
|
0
|
|
|
|
|
if ((ref $v) eq 'HASH') { |
422
|
0
|
|
|
|
|
|
print " "x$s."$k\n"; |
423
|
0
|
|
|
|
|
|
disptree($v, $s+2); |
424
|
0
|
|
|
|
|
|
next; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
0
|
0
|
|
|
|
|
if ((ref $v) eq 'ARRAY') { |
428
|
0
|
|
|
|
|
|
my $count; |
429
|
0
|
|
|
|
|
|
for ($count=0; $count<=$#{$v}; $count++) { |
|
0
|
|
|
|
|
|
|
430
|
0
|
|
|
|
|
|
print " "x$s.$k."[$count]\n"; |
431
|
0
|
|
|
|
|
|
disptree($v->[$count], $s+2); |
432
|
|
|
|
|
|
|
} |
433
|
0
|
|
|
|
|
|
next; |
434
|
|
|
|
|
|
|
} |
435
|
|
|
|
|
|
|
|
436
|
0
|
|
|
|
|
|
print " "x$s."$k => $v\n"; |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
1; |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
__END__ |