blib/lib/Test/C2FIT/Parse.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 141 | 177 | 79.6 |
branch | 33 | 48 | 68.7 |
condition | 9 | 18 | 50.0 |
subroutine | 27 | 34 | 79.4 |
pod | 10 | 28 | 35.7 |
total | 220 | 305 | 72.1 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # $Id: Parse.pm,v 1.18 2006/06/16 15:20:56 tonyb Exp $ | ||||||
2 | # | ||||||
3 | # Copyright (c) 2002-2005 Cunningham & Cunningham, Inc. | ||||||
4 | # Released under the terms of the GNU General Public License version 2 or later. | ||||||
5 | # | ||||||
6 | # Perl translation by Dave W. Smith |
||||||
7 | # Modified by Tony Byrne |
||||||
8 | |||||||
9 | package Test::C2FIT::Parse; | ||||||
10 | |||||||
11 | 4 | 4 | 91244 | use strict; | |||
4 | 9 | ||||||
4 | 158 | ||||||
12 | |||||||
13 | 4 | 4 | 20 | use vars qw(@tags); | |||
4 | 7 | ||||||
4 | 218 | ||||||
14 | |||||||
15 | 4 | 4 | 2093 | use Test::C2FIT::ParseException; | |||
4 | 15 | ||||||
4 | 47 | ||||||
16 | |||||||
17 | our @tags = qw(table tr td); | ||||||
18 | our $MAX_VALUE = 99999999999999999999999999; | ||||||
19 | |||||||
20 | sub new { | ||||||
21 | 37 | 37 | 0 | 88 | my $pkg = shift; | ||
22 | 37 | 66 | 106 | my $class = ref $pkg || $pkg; | |||
23 | 37 | 102 | my $self = bless {}, $class; | ||||
24 | 37 | 201 | $self->_parse(@_); | ||||
25 | 37 | 70 | return $self; | ||||
26 | } | ||||||
27 | |||||||
28 | sub from { | ||||||
29 | 3 | 3 | 0 | 9 | my $pkg = shift; | ||
30 | 3 | 7 | my ( $tag, $body, $parts, $more ) = @_; | ||||
31 | 3 | 36 | bless { | ||||
32 | leader => "\n", | ||||||
33 | tag => "<$tag>", | ||||||
34 | body => $body, | ||||||
35 | parts => $parts, | ||||||
36 | end => "$tag>", | ||||||
37 | more => $more, | ||||||
38 | trailer => "" | ||||||
39 | }, $pkg; | ||||||
40 | } | ||||||
41 | |||||||
42 | sub _parse { | ||||||
43 | 37 | 37 | 45 | my $self = shift; | |||
44 | 37 | 57 | my ( $text, $tags, $level, $offset ) = @_; | ||||
45 | 37 | 50 | 72 | $tags = \@tags unless $tags; | |||
46 | 37 | 100 | 67 | $level = 0 unless $level; | |||
47 | 37 | 100 | 61 | $offset = 0 unless $offset; | |||
48 | |||||||
49 | 37 | 74 | my $lc = lc($text); | ||||
50 | |||||||
51 | 37 | 82 | my $startTag = index( $lc, "<" . $tags->[$level] ); | ||||
52 | 37 | 48 | my $endTag = index( $lc, ">", $startTag ) + 1; | ||||
53 | 37 | 40 | my $startEnd; | ||||
54 | my $endEnd; | ||||||
55 | 0 | 0 | my $startMore; | ||||
56 | 37 | 38 | my $isEmpty = 0; | ||||
57 | |||||||
58 | 37 | 50 | 91 | if ( substr( $lc, $endTag - 2, 1 ) eq "/" ) { # empty tag | |||
59 | 0 | 0 | $startEnd = $endTag; | ||||
60 | 0 | 0 | $endEnd = $endTag; | ||||
61 | 0 | 0 | $isEmpty = 1; | ||||
62 | } | ||||||
63 | else { | ||||||
64 | 37 | 127 | $startEnd = | ||||
65 | $self->findMatchingEndTag( $lc, $endTag, $tags->[$level], $offset ); | ||||||
66 | 37 | 54 | $endEnd = index( $lc, ">", $startEnd ) + 1; | ||||
67 | } | ||||||
68 | |||||||
69 | 37 | 85 | $startMore = index( $lc, "<" . $tags->[$level], $endEnd ); | ||||
70 | |||||||
71 | 37 | 50 | 33 | 323 | if ( $startTag < 0 or $endTag < 0 or $startEnd < 0 or $endEnd < 0 ) { | ||
33 | |||||||
33 | |||||||
72 | |||||||
73 | # warn "PARSE: $startTag $endTag $startEnd $endEnd\n"; | ||||||
74 | 0 | 0 | throw Test::C2FIT::ParseException( | ||||
75 | "Can't find tag: " . $tags->[$level] . "\n", $offset ); | ||||||
76 | } | ||||||
77 | |||||||
78 | 37 | 50 | 76 | if ($isEmpty) { | |||
79 | 0 | 0 | $self->{'tag'} = | ||||
80 | substr( $text, $startTag, $endTag - $startTag - 2 ) . ">"; | ||||||
81 | 0 | 0 | $self->{'body'} = ""; | ||||
82 | 0 | 0 | $self->{'end'} = "" . $tags->[$level] . ">"; | ||||
83 | } | ||||||
84 | else { | ||||||
85 | 37 | 122 | $self->{'tag'} = substr( $text, $startTag, $endTag - $startTag ); | ||||
86 | 37 | 79 | $self->{'body'} = substr( $text, $endTag, $startEnd - $endTag ); | ||||
87 | 37 | 71 | $self->{'end'} = substr( $text, $startEnd, $endEnd - $startEnd ); | ||||
88 | } | ||||||
89 | 37 | 72 | $self->{'leader'} = substr( $text, 0, $startTag ); | ||||
90 | 37 | 98 | $self->{'trailer'} = substr( $text, $endEnd ); | ||||
91 | |||||||
92 | 37 | 100 | 43 | if ( $level + 1 < scalar @{$tags} ) { | |||
37 | 78 | ||||||
93 | 13 | 63 | $self->{'parts'} = | ||||
94 | $self->new( $self->{'body'}, $tags, $level + 1, $offset + $endTag ); | ||||||
95 | 13 | 22 | $self->{'body'} = undef; | ||||
96 | } | ||||||
97 | else { | ||||||
98 | |||||||
99 | #Check for nested table | ||||||
100 | 24 | 57 | my $index = index( $self->{'body'}, "<" . $tags->[0] ); | ||||
101 | 24 | 50 | 59 | if ( $index >= 0 ) { | |||
102 | 0 | 0 | $self->{'parts'} = | ||||
103 | $self->new( $self->{'body'}, $tags, 0, $offset + $endTag ); | ||||||
104 | 0 | 0 | $self->{'body'} = ''; | ||||
105 | } | ||||||
106 | } | ||||||
107 | |||||||
108 | 37 | 100 | 92 | if ( $startMore >= 0 ) { | |||
109 | 21 | 80 | $self->{'more'} = | ||||
110 | $self->new( $self->{'trailer'}, $tags, $level, $offset + $endEnd ); | ||||||
111 | 21 | 42 | $self->{'trailer'} = undef; | ||||
112 | } | ||||||
113 | } | ||||||
114 | |||||||
115 | sub findMatchingEndTag { | ||||||
116 | 37 | 37 | 0 | 45 | my $self = shift; | ||
117 | 37 | 55 | my ( $lc, $matchFromHere, $tag, $offset ) = @_; | ||||
118 | |||||||
119 | 37 | 39 | my $fromHere = $matchFromHere; | ||||
120 | 37 | 36 | my $count = 1; | ||||
121 | 37 | 37 | my $startEnd = 0; | ||||
122 | |||||||
123 | 37 | 75 | while ( $count > 0 ) { | ||||
124 | 37 | 65 | my $embeddedTag = index( $lc, "<$tag", $fromHere ); | ||||
125 | 37 | 60 | my $embeddedTagEnd = index( $lc, "$tag", $fromHere ); | ||||
126 | |||||||
127 | # Which one is closer? | ||||||
128 | 37 | 50 | 66 | 115 | throw Test::C2FIT::ParseException( "Can't find tag: $tag\n", $offset ) | ||
129 | if ( $embeddedTag < 0 and $embeddedTagEnd < 0 ); | ||||||
130 | |||||||
131 | 37 | 100 | 71 | $embeddedTag = $MAX_VALUE if ( $embeddedTag < 0 ); | |||
132 | 37 | 50 | 64 | $embeddedTagEnd = $MAX_VALUE if ( $embeddedTagEnd < 0 ); | |||
133 | |||||||
134 | 37 | 50 | 112 | if ( $embeddedTag < $embeddedTagEnd ) { | |||
50 | |||||||
135 | 0 | 0 | $count++; | ||||
136 | 0 | 0 | $startEnd = $embeddedTag; | ||||
137 | 0 | 0 | $fromHere = index( $lc, ">", $embeddedTag ) + 1; | ||||
138 | } | ||||||
139 | elsif ( $embeddedTagEnd < $embeddedTag ) { | ||||||
140 | 37 | 36 | $count--; | ||||
141 | 37 | 38 | $startEnd = $embeddedTagEnd; | ||||
142 | 37 | 98 | $fromHere = index( $lc, ">", $embeddedTagEnd ) + 1; | ||||
143 | } | ||||||
144 | } | ||||||
145 | 37 | 77 | return $startEnd; | ||||
146 | } | ||||||
147 | |||||||
148 | sub size { | ||||||
149 | 0 | 0 | 0 | 0 | my $self = shift; | ||
150 | 0 | 0 | 0 | $self->more() ? $self->more()->size() + 1 : 1; | |||
151 | } | ||||||
152 | |||||||
153 | sub last { | ||||||
154 | 5 | 5 | 1 | 11 | my $self = shift; | ||
155 | 5 | 100 | 9 | $self->more() ? $self->more()->last() : $self; | |||
156 | } | ||||||
157 | |||||||
158 | sub leaf { | ||||||
159 | 3 | 3 | 1 | 4 | my $self = shift; | ||
160 | 3 | 100 | 5 | $self->parts() ? $self->parts()->leaf() : $self; | |||
161 | } | ||||||
162 | |||||||
163 | sub at { | ||||||
164 | 34 | 34 | 0 | 68 | my $self = shift; | ||
165 | |||||||
166 | 34 | 100 | 90 | return $self->_at3(@_) if 3 == @_; | |||
167 | 30 | 100 | 57 | return $self->_at2(@_) if 2 == @_; | |||
168 | 27 | 100 | 66 | 166 | return ( $_[0] == 0 || not defined( $self->more() ) ) | ||
169 | ? $self | ||||||
170 | : $self->more()->at( $_[0] - 1 ); | ||||||
171 | } | ||||||
172 | |||||||
173 | sub _at2 { | ||||||
174 | 7 | 7 | 10 | my $self = shift; | |||
175 | 7 | 34 | return $self->at( $_[0] )->parts()->at( $_[1] ); | ||||
176 | } | ||||||
177 | |||||||
178 | sub _at3 { | ||||||
179 | 4 | 4 | 7 | my $self = shift; | |||
180 | 4 | 19 | return $self->_at2( $_[0], $_[1] )->parts()->at( $_[2] ); | ||||
181 | } | ||||||
182 | |||||||
183 | sub text { | ||||||
184 | 32 | 32 | 1 | 44 | my $self = shift; | ||
185 | 32 | 64 | return $self->htmlToText( $self->body() ); | ||||
186 | } | ||||||
187 | |||||||
188 | sub htmlToText { | ||||||
189 | 32 | 32 | 0 | 38 | my $self = shift; | ||
190 | 32 | 36 | my $s = shift; | ||||
191 | 32 | 100 | 77 | return $s unless $s; | |||
192 | 30 | 61 | $s = $self->normalizeLineBreaks($s); | ||||
193 | 30 | 67 | $s = $self->removeNonBreakTags($s); | ||||
194 | 30 | 63 | $s = $self->condenseWhitespace($s); | ||||
195 | 30 | 68 | $s = $self->unescape($s); | ||||
196 | 30 | 104 | return $s; | ||||
197 | } | ||||||
198 | |||||||
199 | sub removeNonBreakTags { | ||||||
200 | 30 | 30 | 0 | 35 | my $self = shift; | ||
201 | 30 | 41 | my $s = shift; | ||||
202 | 30 | 38 | $s =~ s/(<(?!br)[^>]+>)//g; | ||||
203 | 30 | 53 | return $s; | ||||
204 | } | ||||||
205 | |||||||
206 | sub unescape { | ||||||
207 | 30 | 30 | 0 | 34 | my $self = shift; | ||
208 | 30 | 34 | my $s = shift; | ||||
209 | |||||||
210 | 30 | 46 | $s =~ s| |\n|g; |
||||
211 | 30 | 95 | $s = $self->unescapeEntities($s); | ||||
212 | 30 | 59 | $s = $self->unescapeSmartQuotes($s); | ||||
213 | |||||||
214 | 30 | 50 | return $s; | ||||
215 | } | ||||||
216 | |||||||
217 | sub unescapeSmartQuotes { | ||||||
218 | 30 | 30 | 0 | 40 | my $self = shift; | ||
219 | 30 | 33 | my $s = shift; | ||||
220 | |||||||
221 | 30 | 41 | $s =~ s/\x{91}/\'/g; | ||||
222 | 30 | 33 | $s =~ s/\x{92}/\'/g; | ||||
223 | 30 | 31 | $s =~ s/\x{93}/\"/g; | ||||
224 | 30 | 33 | $s =~ s/\x{94}/\"/g; | ||||
225 | |||||||
226 | 30 | 71 | $s =~ s/\x{201c}/\"/g; | ||||
227 | 30 | 48 | $s =~ s/\x{201d}/\"/g; | ||||
228 | 30 | 54 | $s =~ s/\x{2018}/\'/g; | ||||
229 | 30 | 39 | $s =~ s/\x{2019}/\'/g; | ||||
230 | |||||||
231 | 30 | 58 | return $s; | ||||
232 | } | ||||||
233 | |||||||
234 | sub unescapeEntities { | ||||||
235 | 30 | 30 | 0 | 36 | my $self = shift; | ||
236 | 30 | 31 | my $s = shift; | ||||
237 | 30 | 39 | $s =~ s/\</ | ||||
238 | 30 | 63 | $s =~ s/\>/>/g; | ||||
239 | 30 | 35 | $s =~ s/\ / /g; | ||||
240 | 30 | 33 | $s =~ s/\&/&/g; | ||||
241 | 30 | 32 | $s =~ s/\"/\"/g; | ||||
242 | 30 | 54 | return $s; | ||||
243 | } | ||||||
244 | |||||||
245 | sub normalizeLineBreaks { | ||||||
246 | 30 | 30 | 0 | 32 | my $self = shift; | ||
247 | 30 | 35 | my $s = shift; | ||||
248 | 30 | 41 | $s =~ s|<\s*br\s*/?\s*>| |g; |
||||
249 | 30 | 37 | $s =~ s|<\s*/\s*p\s*>\s*<\s*p( .*?)?>| |g; |
||||
250 | 30 | 57 | return $s; | ||||
251 | } | ||||||
252 | |||||||
253 | sub unformat { | ||||||
254 | 0 | 0 | 0 | 0 | my $self = shift; | ||
255 | 0 | 0 | my $s = shift; | ||||
256 | 0 | 0 | $s =~ s/<[^>]+>//g; | ||||
257 | 0 | 0 | return $s; | ||||
258 | } | ||||||
259 | |||||||
260 | sub addToTag { | ||||||
261 | 6 | 6 | 0 | 72 | my $self = shift; | ||
262 | 6 | 10 | my ($string) = @_; | ||||
263 | 6 | 51 | $self->{'tag'} =~ s/>$/$string>/; | ||||
264 | } | ||||||
265 | |||||||
266 | sub addToBody { | ||||||
267 | 0 | 0 | 0 | 0 | my $self = shift; | ||
268 | 0 | 0 | my ($string) = @_; | ||||
269 | 0 | 0 | $self->{'body'} .= $string; | ||||
270 | } | ||||||
271 | |||||||
272 | sub asString { | ||||||
273 | 0 | 0 | 0 | 0 | my $self = shift; | ||
274 | |||||||
275 | 0 | 0 | my $s = $self->leader() . $self->tag(); | ||||
276 | 0 | 0 | 0 | if ( $self->parts() ) { | |||
277 | 0 | 0 | $s .= $self->parts()->asString(); | ||||
278 | } | ||||||
279 | else { | ||||||
280 | 0 | 0 | $s .= $self->body(); | ||||
281 | } | ||||||
282 | 0 | 0 | $s .= $self->end(); | ||||
283 | 0 | 0 | 0 | if ( $self->more() ) { | |||
284 | 0 | 0 | $s .= $self->more()->asString(); | ||||
285 | } | ||||||
286 | else { | ||||||
287 | 0 | 0 | $s .= $self->trailer(); | ||||
288 | } | ||||||
289 | 0 | 0 | return $s; | ||||
290 | } | ||||||
291 | |||||||
292 | sub leader { | ||||||
293 | 1 | 1 | 1 | 13 | $_[0]->{'leader'}; | ||
294 | } | ||||||
295 | |||||||
296 | sub tag { | ||||||
297 | 1 | 1 | 1 | 5 | $_[0]->{'tag'}; | ||
298 | } | ||||||
299 | |||||||
300 | sub body { | ||||||
301 | 33 | 33 | 1 | 706 | $_[0]->{'body'}; | ||
302 | } | ||||||
303 | |||||||
304 | sub parts { | ||||||
305 | 30 | 30 | 1 | 138 | $_[0]->{'parts'}; | ||
306 | } | ||||||
307 | |||||||
308 | sub end { | ||||||
309 | 0 | 0 | 1 | 0 | $_[0]->{'end'}; | ||
310 | } | ||||||
311 | |||||||
312 | sub trailer { | ||||||
313 | 1 | 1 | 1 | 8 | $_[0]->{'trailer'}; | ||
314 | } | ||||||
315 | |||||||
316 | sub more { | ||||||
317 | 51 | 51 | 1 | 67 | my $self = shift; | ||
318 | 51 | 100 | 102 | $self->{'more'} = $_[0] if @_; | |||
319 | 51 | 227 | return $self->{'more'}; | ||||
320 | } | ||||||
321 | |||||||
322 | # TBD print() is required by the tests. TJB | ||||||
323 | sub print { | ||||||
324 | 0 | 0 | 0 | 0 | my $self = shift; | ||
325 | 0 | 0 | return $self->asString(); | ||||
326 | } | ||||||
327 | |||||||
328 | sub condenseWhitespace { | ||||||
329 | 30 | 30 | 0 | 35 | my $self = shift; | ||
330 | 30 | 34 | my $s = shift; | ||||
331 | |||||||
332 | 30 | 119 | $s =~ s/\s+/ /g; | ||||
333 | |||||||
334 | # | ||||||
335 | # if a non-breaking-space character was inserted by a perl logic, | ||||||
336 | # it might be represended either as a byte-sequence or as a single character. | ||||||
337 | # (depending on the perl version) | ||||||
338 | # | ||||||
339 | # the input document is exepected to be in a single-byte encoding, therefore | ||||||
340 | # checks to both variants are done. | ||||||
341 | |||||||
342 | 30 | 43 | my $NON_BREAKING_SPACE = | ||||
343 | "\x{00a0}"; # internal representation: utf8 byte sequence | ||||||
344 | 30 | 102 | $s =~ s/$NON_BREAKING_SPACE/ /g; | ||||
345 | |||||||
346 | 30 | 37 | $NON_BREAKING_SPACE = chr(160) | ||||
347 | ; # internal representation: single byte with numerical value of 160 | ||||||
348 | 30 | 52 | $s =~ s/$NON_BREAKING_SPACE/ /g; | ||||
349 | |||||||
350 | 30 | 41 | $s =~ s/ / /g; | ||||
351 | 30 | 55 | $s =~ s/^\s+//g; | ||||
352 | 30 | 55 | $s =~ s/\s+$//g; | ||||
353 | |||||||
354 | 30 | 61 | return $s; | ||||
355 | } | ||||||
356 | |||||||
357 | # TBD - not implemented yet. May be discarded in future releases | ||||||
358 | sub footnote { | ||||||
359 | 0 | 0 | 0 | return "[!]"; | |||
360 | } | ||||||
361 | 1; | ||||||
362 | |||||||
363 | =pod | ||||||
364 | |||||||
365 | =head1 NAME | ||||||
366 | |||||||
367 | Test::C2FIT::Parse - Parsing of html source, filtering out contents of arbitrary tags. | ||||||
368 | |||||||
369 | =head1 SYNOPSIS | ||||||
370 | |||||||
371 | Normally, you do not use Parse directly. | ||||||
372 | |||||||
373 | $parse = new Test::C2FIT::Parse($string,["table","tr","td"]); | ||||||
374 | |||||||
375 | $parse = new Test::C2FIT::Parse($string,["a"]); | ||||||
376 | |||||||
377 | =head1 DESCRIPTION | ||||||
378 | |||||||
379 | Parse creates a linked list of Parse-Objects, so upon parsing, the original content can be restored | ||||||
380 | (or modified, what the fit framework is actually doing). | ||||||
381 | |||||||
382 | |||||||
383 | =head1 METHODS | ||||||
384 | |||||||
385 | =over 4 | ||||||
386 | |||||||
387 | =item B |
||||||
388 | |||||||
389 | Returns the last parse object in the same hierarchy level (table -E |
||||||
390 | or self, if self is the last one. | ||||||
391 | |||||||
392 | =item B |
||||||
393 | |||||||
394 | Returns the first leaf node (=lower hierarchy) or self, if self has no parts. | ||||||
395 | |||||||
396 | =item B |
||||||
397 | |||||||
398 | Returns the text (html markup removed) of the parse object. | ||||||
399 | |||||||
400 | =item B |
||||||
401 | |||||||
402 | Return the part of the input, which came before this parse object. | ||||||
403 | |||||||
404 | =item B |
||||||
405 | |||||||
406 | Returns the tag, including any attributes. | ||||||
407 | |||||||
408 | =item B | ||||||
409 | |||||||
410 | Returns the tag body. | ||||||
411 | |||||||
412 | =item B |
||||||
413 | |||||||
414 | Returns the first Parse object of the next lower hierarchy (e.g. table -E |
||||||
415 | |||||||
416 | =item B |
||||||
417 | |||||||
418 | Returns the closing tag. | ||||||
419 | |||||||
420 | =item B |
||||||
421 | |||||||
422 | Returns the portion of the input, which came after this parse object. | ||||||
423 | |||||||
424 | =item B |
||||||
425 | |||||||
426 | Returns the next Parse object on the same hierarchy level. | ||||||
427 | |||||||
428 | |||||||
429 | |||||||
430 | =back | ||||||
431 | |||||||
432 | =head1 SEE ALSO | ||||||
433 | |||||||
434 | Extensive and up-to-date documentation on FIT can be found at: | ||||||
435 | http://fit.c2.com/ | ||||||
436 | |||||||
437 | |||||||
438 | =cut | ||||||
439 | |||||||
440 | __END__ |