File Coverage

blib/lib/HTML/TableParser/Table.pm
Criterion Covered Total %
statement 206 209 98.5
branch 87 98 88.7
condition 19 26 73.0
subroutine 22 23 95.6
pod 0 19 0.0
total 334 375 89.0


at line $line\n" );
line stmt bran cond sub pod time code
1             package HTML::TableParser::Table;
2              
3 8     8   29 use strict;
  8         8  
  8         183  
4 8     8   25 use warnings;
  8         8  
  8         161  
5              
6 8     8   22 use HTML::Entities;
  8         11  
  8         14754  
7              
8             our $VERSION = '0.38';
9              
10             ## no critic ( ProhibitAccessOfPrivateData )
11              
12             sub new
13             {
14 215     215 0 279 my $this = shift;
15              
16 215   33     709 my $class = ref($this) || $this;
17              
18 215         1508 my $self = {
19             data => [[]], # row data (for overlapping rows)
20             row => undef, # row info
21             col => undef, # column info
22             hdr => undef, # accumulated header info
23             hdr_row => 0, # index of header row
24             hdr_line => undef, # line in file of header row
25             in_hdr => 0, # are we in a header row?
26             prev_hdr => 0, # was the previous row a header row?
27             line => undef, # line in file of current row
28             start_line => undef, # line in file of table start
29             req => undef, # the matching table request
30             exclreqs => {}, # the requests which exlude this table
31             };
32              
33 215         317 bless $self, $class;
34              
35 215         298 my ( $parser, $ids, $reqs, $line ) = @_;
36              
37 215         323 $self->{parser} = $parser;
38 215         234 $self->{start_line} = $line;
39              
40             # if called with no args, create an empty, placeholder object
41 215 100       309 unless ( defined $ids )
42             {
43 93         171 $self->{ids} = [ 0 ];
44 93         139 $self->{process} = 0;
45 93         223 $self->{id} = 'sentinel';
46             }
47              
48             else
49             {
50 122         146 $ids->[-1]++;
51 122         246 $self->{oids} = [ @$ids ];
52 122         257 $self->{ids} = [ @$ids, 0 ];
53 122         155 $self->{id} = join( '.', grep { $_ != 0 } @{$ids} );
  152         573  
  122         184  
54              
55 122         155 $self->{reqs} = $reqs;
56              
57             # are we interested in this table?
58 122         260 $self->match_id();
59              
60             # inform user of table start. note that if we're looking for
61             # for column name matches, we don't want to do the callback;
62             # in that case $self->{req} isn't set and callback() won't
63             # actually make the call.
64             $self->callback( 'start', $self->{start_line} )
65 122 100       416 if $self->{process};
66             }
67              
68 215         4008 $self;
69             }
70              
71              
72             sub match_id
73             {
74 122     122 0 148 my $self = shift;
75              
76 122         150 $self->{process} = 0;
77 122         129 $self->{req} = undef;
78              
79             # 1. look for explicit id matches
80             # 2. if no explicit id match, use header matches
81             # 3. if no header matches, use DEFAULT
82             # 4. if no DEFAULT, no match
83              
84             # 1. explicit id.
85              
86 122         127 my ( $skip, $req );
87              
88             ( $skip, $req ) =
89             req_match_id( $self->{reqs}, $self->{id}, $self->{oids},
90 122         336 $self->{exclreqs} );
91              
92             # did we match a skip table request?
93 122 100       252 return if $skip;
94              
95 121 100       222 if ( $req )
96             {
97 28         94 $self->match_req( $req );
98 28         29 return;
99             }
100              
101              
102             # 2. header match.
103             # don't set {req}, as that'll trigger callbacks and we're not sure
104             # this is a match yet
105              
106 93 100       118 if ( grep { @{$_->{cols}} } @{$self->{reqs}})
  95         113  
  95         268  
  93         141  
107             {
108 62         79 $self->{process} = 1;
109 62         57 $self->{req} = undef;
110 62         84 return;
111             }
112              
113             # 3. DEFAULT match
114             ( $skip, $req ) =
115 31         67 req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );
116              
117             # did we match a skip table request? Does this make sense for DEFAULT?
118 31 50       62 return if $skip;
119              
120 31 100       61 if ( $req )
121             {
122 14         35 $self->match_req( $req );
123 14         12 return;
124             }
125              
126             # 4. out of luck. no match.
127             }
128              
129             # determine if a request matches an id. requests should
130             # be real objects, but until then...
131             sub req_match_id
132             {
133 182     182 0 230 my ( $reqs, $id, $oids, $excluded ) = @_;
134              
135 182         275 for my $req ( @$reqs )
136             {
137             # if we've already excluded this request, don't bother again.
138             # this is needed for id = DEFAULT passes where we've previously
139             # excluded based on actual table id and should again.
140 186 100       424 next if exists $excluded->{$req};
141              
142             # bail if this request has already matched and we're not
143             # multi-matching
144 185 100 66     591 next if $req->{match} && ! $req->{MultiMatch};
145              
146 155         170 for my $cmp ( @{$req->{id}} )
  155         372  
147             {
148             # is this a subroutine to call?
149 83 100       218 if ( 'CODE' eq ref $cmp->{match} )
    100          
150             {
151 6 100       11 next unless $cmp->{match}->($id, $oids );
152             }
153              
154             # regular expression
155             elsif( 'Regexp' eq ref $cmp->{match} )
156             {
157 3 100       14 next unless $id =~ /$cmp->{match}/;
158             }
159              
160             # a direct match?
161             else
162             {
163 74 100       227 next unless $id eq $cmp->{match};
164             }
165              
166             # we get here only if there was a match.
167              
168             # move on to next request if this was an explicit exclude
169             # request.
170 45 100       101 if ( $cmp->{exclude} )
171             {
172 1         2 $excluded->{$req}++;
173 1         1 next;
174             }
175              
176             # return match, plus whether this is a global skip request
177 44         149 return ( $cmp->{skip}, $req );
178             }
179             }
180              
181 138         261 ( 0, undef );
182             }
183              
184             # determine if a request matches a column. requests should
185             # be real objects, but until then...
186             sub req_match_cols
187             {
188 62     62 0 95 my ( $reqs, $cols, $id, $oids ) = @_;
189              
190 62         110 for my $req ( @$reqs )
191             {
192             # bail if this request has already matched and we're not
193             # multi-matching
194 63 100 66     196 next if $req->{match} && ! $req->{MultiMatch};
195              
196 55         188 my @fix_cols = @$cols;
197 55         126 fix_texts($req, \@fix_cols);
198              
199 55         67 for my $cmp ( @{$req->{cols}} )
  55         118  
200             {
201             # is this a subroutine to call?
202 54 100       208 if ( 'CODE' eq ref $cmp->{match} )
    100          
203             {
204 1 50       4 next unless $cmp->{match}->( $id, $oids, \@fix_cols );
205             }
206              
207             # regular expression
208             elsif( 'Regexp' eq ref $cmp->{match} )
209             {
210 17 100       33 next unless grep { /$cmp->{match}/ } @fix_cols;
  235         467  
211             }
212              
213             # a direct match?
214             else
215             {
216 36 100       57 next unless grep { $_ eq $cmp->{match} } @fix_cols;
  504         566  
217             }
218              
219             # we get here only if there was a match
220              
221             # move on to next request if this was an explicit exclude
222             # request.
223 33 50       86 next if $cmp->{exclude};
224              
225             # return match, plus whether this is a global skip request
226 33         153 return ( $cmp->{skip}, $req );
227             }
228              
229             }
230              
231 29         107 (0, undef);
232             }
233              
234             # we've pulled in a header; does it match against one of the requests?
235             sub match_hdr
236             {
237 62     62 0 204 my ( $self, @cols ) = @_;
238              
239              
240             # 1. check header matches
241             # 2. if no header matches, use DEFAULT id
242             # 3. if no DEFAULT, no match
243              
244             # 1. check header matches
245             my ( $skip, $req ) = req_match_cols( $self->{reqs}, \@cols, $self->{id},
246 62         250 $self->{oids} );
247             # did we match a skip table request?
248 62 50       125 return 0 if $skip;
249              
250 62 100       150 if ( $req )
251             {
252 33         93 $self->match_req( $req );
253 33         100 return 1;
254             }
255              
256              
257             # 2. DEFAULT match
258             ( $skip, $req ) =
259 29         71 req_match_id( $self->{reqs}, 'DEFAULT', $self->{oids}, $self->{exclreqs} );
260              
261             # did we match a skip table request? Does this make sense for DEFAULT?
262 29 50       66 return 0 if $skip;
263              
264 29 100       62 if ( $req )
265             {
266 1         3 $self->match_req( $req );
267 1         3 return 1;
268             }
269              
270             # 3. if no DEFAULT, no match
271              
272 28         85 0;
273             }
274              
275             sub match_req
276             {
277 76     76 0 97 my ( $self, $req ) = @_;
278              
279 76 100       219 if ( $req->{class} )
    100          
280             {
281             # no strict 'refs';
282 7         8 my $new = $req->{new};
283 7         33 $self->{obj} = $req->{class}->$new( $req->{id}, $req->{udata} );
284             }
285             elsif ( $req->{obj} )
286             {
287 6         9 $self->{obj} = $req->{obj};
288             }
289              
290 76         131 $self->{process} = 1;
291 76         94 $self->{req} = $req;
292 76         99 $self->{req}{match}++;
293             }
294              
295              
296             # generic call back interface. handle method calls as well as
297             # subroutine calls.
298             sub callback
299             {
300 3533     3533 0 2755 my $self = shift;
301 3533         2750 my $method = shift;
302              
303             return unless
304 3533 100 66     9421 defined $self->{req} && exists $self->{req}->{$method};
305              
306 2799         2258 my $req = $self->{req};
307 2799         2340 my $call = $req->{$method};
308              
309 2799 100       4563 if ( 'CODE' eq ref $call )
310             {
311 2128         4531 $call->( $self->{id}, @_, $req->{udata} );
312             }
313             else
314             {
315             # if the object was destroyed before we get here (if it
316             # was created by us and thus was destroyed before us if
317             # there was an error), we can't call a method
318             $self->{obj}->$call( $self->{id}, @_, $req->{udata} )
319 671 50       1985 if defined $self->{obj};
320             }
321             }
322              
323              
324             # handle
325             sub start_header
326             {
327 1630     1630 0 1188 my $self = shift;
328 1630         1451 my ( undef, $line ) = @_;
329              
330 1630         1194 $self->{in_hdr}++;
331 1630         1037 $self->{prev_hdr}++;
332 1630         1187 $self->{hdr_line} = $line;
333 1630         1806 $self->start_column( @_ );
334             }
335              
336              
337             # handle
338             sub end_header
339             {
340 1390     1390 0 959 my $self = shift;
341 1390         1433 $self->end_column();
342             }
343              
344             # handle
345             sub start_column
346             {
347 49757     49757 0 34956 my $self = shift;
348 49757         37937 my ( $attr, $line ) = @_;
349              
350             # end last column if not explicitly ended. perform check here
351             # to avoid extra method call
352 49757 100       84264 $self->end_column() if defined $self->{col};
353              
354             # we really shouldn't be here if a row hasn't been started
355 49757 50       64107 unless ( defined $self->{row} )
356             {
357 0         0 $self->callback( 'warn', $self->{id}, $line,
358             " or without
359 0         0 $self->start_row( {}, $line );
360             }
361              
362             # even weirder. if the last row was a header we have to process it now,
363             # rather than waiting until the end of this row, as there might be
364             # a table in one of the cells in this row and if the enclosing table
365             # was using a column match/re, we won't match it's header until after
366             # the enclosed table is completely parsed. this is bad, as it may
367             # grab a match (if there's no multimatch) meant for the enclosing table.
368              
369             # if we're one row past the header, we're done with the header
370             $self->finish_header()
371 49757 100 66     97570 if ! $self->{in_hdr} && $self->{prev_hdr};
372              
373 49757         106442 $self->{col} = { attr => { %$attr} };
374 49757   100     121470 $self->{col}{attr}{colspan} ||= 1;
375 49757   100     290101 $self->{col}{attr}{rowspan} ||= 1;
376             }
377              
378             # handle
379             sub end_column
380             {
381 50371     50371 0 32796 my $self = shift;
382              
383 50371 100       66443 return unless defined $self->{col};
384              
385 49757 50       69701 $self->{col}{text} = defined $self->{text} ? $self->{text} : '' ;
386              
387 49757         32076 push @{$self->{row}}, $self->{col};
  49757         63790  
388              
389 49757         39930 $self->{col} = undef;
390 49757         64679 $self->{text} = undef;
391             }
392              
393             sub start_row
394             {
395 3382     3382 0 2648 my $self = shift;
396 3382         2745 my ( $attr, $line ) = @_;
397              
398             # end last row if not explicitly ended
399 3382         3843 $self->end_row();
400              
401 3382         21071 $self->{row} = [];
402 3382         19182 $self->{line} = $line;
403             }
404              
405              
406             sub end_row
407             {
408 4100     4100 0 2813 my $self = shift;
409              
410 4100 100       6047 return unless defined $self->{row};
411              
412             # perhaps an unfinished row. first finish column
413 3382         3730 $self->end_column();
414              
415             # if we're in a header, deal with overlapping cells differently
416             # then if we're in the data section
417 3382 100       4178 if ( $self->{in_hdr} )
418             {
419              
420 167         159 my $cn = 0;
421 167         155 my $rn = 0;
422 167         129 foreach my $col ( @{$self->{row}} )
  167         338  
423             {
424             # do this just in case there are newlines and we're concatenating
425             # column names later on. causes strange problems. besides,
426             # column names should be regular
427 1630         2195 $col->{text} =~ s/^\s+//;
428 1630         2036 $col->{text} =~ s/\s+$//;
429              
430             # need to find the first undefined column
431 1630         3088 $cn++ while defined $self->{hdr}[$cn][$self->{hdr_row}];
432              
433             # note that header is stored as one array per column, not row!
434 1630         2445 for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
435             {
436 1911   100     2801 $self->{hdr}[$cn] ||= [];
437 1911         2314 $self->{hdr}[$cn][$self->{hdr_row}] = $col->{text};
438            
439             # put empty placeholders in the rest of the rows
440 1911         4303 for ( my $rnn = 1 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
441             {
442 235         709 $self->{hdr}[$cn][$rnn + $self->{hdr_row}] = '';
443             }
444             }
445             }
446              
447 167         198 $self->{hdr_row}++;
448             }
449             else
450             {
451 3215         2471 my $cn = 0;
452 3215         2221 my $rn = 0;
453 3215         2185 foreach my $col ( @{$self->{row}} )
  3215         4226  
454             {
455             # need to find the first undefined column
456 48127         62816 $cn++ while defined $self->{data}[0][$cn];
457              
458 48127         64898 for ( my $cnn = 0 ; $cnn < $col->{attr}{colspan} ; $cnn++, $cn++ )
459             {
460 48147         60183 for ( my $rnn = 0 ; $rnn < $col->{attr}{rowspan} ; $rnn++ )
461             {
462 48167   100     58245 $self->{data}[$rnn] ||= [];
463 48167         124095 $self->{data}[$rnn][$cn] = $col->{text};
464             }
465             }
466             }
467             }
468              
469             # if we're one row past the header, we're done with the header
470             $self->finish_header()
471 3382 50 66     8561 if ! $self->{in_hdr} && $self->{prev_hdr};
472              
473             # output the data if we're not in a header
474             $self->callback( 'row', $self->{line},
475 3215         5935 fix_texts( $self->{req}, shift @{$self->{data}} ) )
476 3382 100       5813 unless $self->{in_hdr};
477              
478 3382         47625 $self->{in_hdr} = 0;
479 3382         9268 $self->{row} = undef;
480             }
481              
482             # collect the possible multiple header rows into one array and
483             # send it off
484             sub finish_header
485             {
486 208     208 0 201 my $self = shift;
487              
488 208 100       470 return unless $self->{hdr};
489              
490 1481 50       987 my @header = map { join( ' ', grep { defined $_ && $_ ne '' } @{$_}) }
  2146         6501  
  1481         1227  
491 104         108 @{ $self->{hdr} };
  104         211  
492              
493             # if we're trying to match header columns, check that here.
494 104 100       292 if ( defined $self->{req} )
495             {
496 42         113 fix_texts( $self->{req}, \@header );
497 42         98 $self->callback( 'hdr', $self->{hdr_line}, \@header );
498             }
499              
500             else
501             {
502 62 100       195 if ( $self->match_hdr( @header ) )
503             {
504             # haven't done this callback yet...
505 34         69 $self->callback( 'start', $self->{start_line} );
506              
507 34         19953518 fix_texts( $self->{req}, \@header );
508 34         118 $self->callback( 'hdr', $self->{hdr_line}, \@header );
509             }
510              
511             # no match. reach up to the controlling parser and turn off
512             # processing of this table. this is kind of kludgy!
513             else
514             {
515 28         105 $self->{parser}->process(0);
516             }
517             }
518              
519              
520 104         32586017 $self->{hdr} = undef;
521 104         563 $self->{prev_hdr} = undef;
522 104         264 $self->{hdr_row} = 0;
523             }
524              
525             DESTROY
526             {
527 215     215   93623 my $self = shift;
528              
529             # if we're actually parsing this table, do something.
530 215 100       1839 if ( $self->{process} )
531             {
532             # just in case
533 104         172 $self->end_row();
534              
535             # just in case there's no table body
536 104         429 $self->finish_header();
537              
538 104         240 $self->callback( 'end', $self->{line} );
539             }
540             }
541              
542             sub fix_texts
543             {
544 3346     3346 0 3408 my ( $req, $texts ) = @_;
545              
546 3346         3488 for ( @$texts )
547             {
548             local $HTML::Entities::entity2char{nbsp} =
549 49998         50908 $HTML::Entities::entity2char{nbsp};
550              
551             $HTML::Entities::entity2char{nbsp} = ' '
552 49998 50       58678 if $req->{DecodeNBSP};
553              
554             chomp $_
555 49998 100       55808 if $req->{Chomp};
556              
557             decode_entities( $_ )
558 49998 100       90994 if $req->{Decode};
559              
560              
561 49998 100       71925 if ( $req->{Trim} )
562             {
563 6529         8375 s/^\s+//;
564 6529         12819 s/\s+$//;
565             }
566             }
567              
568 3346         6005 $texts;
569             }
570              
571             sub text
572             {
573 62098     62098 0 46702 my $self = shift;
574              
575 62098         212124 $self->{text} = shift;
576             }
577              
578 0     0 0 0 sub id { $_[0]->{id} }
579 122     122 0 543 sub ids { $_[0]->{ids} }
580 244     244 0 680 sub process { $_[0]->{process} }
581              
582             1;
583              
584             __END__