File Coverage

lib/HTML/SummaryBasic.pm
Criterion Covered Total %
statement 68 82 82.9
branch 21 36 58.3
condition 4 9 44.4
subroutine 7 7 100.0
pod 0 1 0.0
total 100 135 74.0


line stmt bran cond sub pod time code
1 1     1   601 use strict;
  1         1  
  1         34  
2 1     1   5 use warnings;
  1         2  
  1         61  
3            
4             package HTML::SummaryBasic;
5            
6             our $VERSION = 0.2;
7            
8             =head1 NAME
9            
10             HTML::SummaryBasic - Basic summary info from HTML.
11            
12             =head1 SYNOPSIS
13            
14             use HTML::SummaryBasic;
15             my $p = new HTML::SummaryBasic {
16             PATH => "input.html",
17             # or HTML => '...',
18             NOT_AVAILABLE => undef,
19             };
20             foreach (keys %{$p->{SUMMARY}}){
21             warn "$_ ... $p->{SUMMARY}->{$_}\n";
22             }
23            
24             =head1 DEPENDENCIES
25            
26             use HTML::TokeParser;
27             use HTML::HeadParser;
28            
29             =cut
30            
31 1     1   13 use Carp;
  1         1  
  1         66  
32 1     1   806 use HTML::TokeParser;
  1         13094  
  1         956  
33            
34             =head1 DESCRIPTION
35            
36             From a file or string of HTML, creates a hash of useful summary information from C and C elements of an HTML document.
37            
38             =head1 GLOBAL VARIABLE
39            
40             =item $NOT_AVAILABLE
41            
42             Value for empty fields. Default is C<[Not Available]>. May be over-ridden directly by supplying the constructor with a field of the same name.
43             See L.
44            
45             =cut
46            
47             our $NOT_AVAILABLE = '[Not available]';
48            
49             =head1 CONSTRUCTOR (new)
50            
51             Accepts a hash-like structure...
52            
53             =over 4
54            
55             =item HTML or PATH
56            
57             Ref to a scalar of HTML, or plain string that is the path to an HTML file to process.
58            
59             =item SUMMARY
60            
61             Filled after C is called (see L and
62             L).
63            
64             =item FIELDS
65            
66             An array of C tag Cs whose C value should be
67             placed into the respective slots of the C field after
68             C has been called.
69            
70             =back
71            
72             =head2 THE SUMMARY STRUCTURE
73            
74             A field of the object which is a hash, with key/values as follows:
75            
76             =over 4
77            
78             =item AUTHOR
79            
80             HTML C tag C.
81            
82             =item TITLE
83            
84             Text of the element of the same name.
85            
86             =item DESCRIPTION
87            
88             Content of the C tag named C.
89            
90             =item LAST_MODIFIED_META, LAST_MODIFIED_FILE
91            
92             Time since of the modification of the file,
93             respectively according to any C tag of the same name,
94             with a C prefix; failing that, according to the file system.
95            
96             =item CREATED_META, CREATED_FILE
97            
98             As above, but relating to the creation date of the file.
99            
100             =item FIRST_PARA
101            
102             The first HTML C

element of the document.

103            
104             =item HEADLINE
105            
106             The first C

tag; failing that, the first C

; failing that,

107             the value of C<$NOT_AVAILABLE>.
108            
109             =item PLUS...
110            
111             Any meta-fields specified in the C field.
112            
113             =back
114            
115             =cut
116            
117             sub new {
118 2     2 0 3038 my $class = shift;
119 2 50       5 $class = ref($class)? ref($class) : $class;
120 2         5 my $self = bless {}, $class;
121 2 50       11 my $args = ref($_[0])? shift : {@_};
122            
123             # Defaults
124 2         7 $self->{SUMMARY} = {};
125            
126             # Load parameters
127 2         13 $self->{uc $_} = $args->{$_} foreach keys %$args;
128 2 50       8 croak "Required parameter field missing : $_" if not $self->{PATH};
129            
130 2         5 $self->_get_summary();
131 2         6 return $self;
132             }
133            
134            
135             sub _get_summary {
136 2     2   4 my ($self,$path) = @_;
137            
138 2         2 my ($p,$token, $html);
139            
140 2 50       5 if (defined $path){
141 0 0       0 if (ref $path){
142 0         0 $html = $path;
143 0         0 delete $self->{PATH};
144             } else {
145 0         0 $self->{PATH} = $path;
146             }
147             }
148            
149 2 50       4 if ($self->{PATH}){
150 2 50       5 $html = $self->_load_file()
151             or return undef;
152             }
153            
154             # Get first para
155 2 50       11 if (not $p = new HTML::TokeParser( $html ) ){
156 0         0 warn "HTML::TokeParser could not initiate: $!";
157 0         0 return undef;
158             }
159 2 100       275 if ($token = $p->get_tag('h1')){
160 1         352 $self->{SUMMARY}->{HEADLINE} = $p->get_trimmed_text;
161             } else {
162 1         193 $p = new HTML::TokeParser( $html );
163 1 50       104 if ($token = $p->get_tag('h2')){
164 0         0 $self->{SUMMARY}->{HEADLINE} = $p->get_trimmed_text;
165             } else {
166 1         168 $self->{SUMMARY}->{HEADLINE} = $self->{NOT_AVAILABLE};
167             }
168             }
169 2 50       71 if (not $p = new HTML::TokeParser( $html ) ){
170 0         0 warn "HTML::TokeParser could not initiate: $!";
171 0         0 return undef;
172             }
173 2 100       238 if ($token = $p->get_tag('p')){
174 1         431 $self->{SUMMARY}->{FIRST_PARA} = $p->get_trimmed_text;
175             } else {
176 1         182 $self->{SUMMARY}->{FIRST_PARA} = $self->{NOT_AVAILABLE}
177             }
178            
179 2         57 $p = new HTML::TokeParser( $html );
180 2         223 $p->get_tag('title');
181 2   66     314 $self->{SUMMARY}->{TITLE} = $p->get_text('/title') || $self->{NOT_AVAILABLE};
182            
183             {
184 2         61 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
  2         37  
185             $atime,$mtime,$ctime,$blksize,$blocks) = stat $self->{PATH};
186            
187 2   33     275 $self->{SUMMARY}->{LAST_MODIFIED_FILE} = scalar localtime ( $mtime ) || $self->{NOT_AVAILABLE};
188 2         17 $self->{SUMMARY}->{LAST_MODIFIED_FILE} =~ s/\s+/ /g;
189            
190 2   33     47 $self->{SUMMARY}->{CREATED_FILE} = scalar localtime ( $ctime ) || $self->{NOT_AVAILABLE};
191 2         14 $self->{SUMMARY}->{CREATED_FILE} =~ s/\s+/ /g;
192             }
193            
194 8         21 my $collect = {
195 2         7 map {$_=>1} (
196 2         3 keys %{$self->{FIELDS}},
197             qw(
198             AUTHOR DESCRIPTION
199             LAST-MODIFIED CREATED
200             )
201             )
202             };
203            
204 2         17 $self->{SUMMARY}->{$_} = $self->{NOT_AVAILABLE} foreach keys %$collect;
205            
206 2         10 $p = new HTML::TokeParser( $html );
207 2         257 while (my $tag = $p->get_tag('meta') ){
208 4         290 my $name = uc $tag->[1]->{name};
209             I:
210 4         10 for my $i (1..2){
211 7 100       21 $name =~ s/^X-META-//i if $i == 2;
212 7 100       17 if (exists $collect->{$name} ){
213 4         11 $self->{SUMMARY}->{$name} = $tag->[1]->{content};
214 4         21 last I;
215             }
216             }
217             }
218            
219 2         470 $self->{SUMMARY}->{LAST_MODIFIED_META} = delete $self->{SUMMARY}->{"LAST-MODIFIED"};
220 2         5 $self->{SUMMARY}->{CREATED_META} = delete $self->{SUMMARY}->{"CREATED"};
221            
222 2         14 return 1;
223             }
224            
225            
226             # Return a reference to a scalar of HTML, or C on failure, setting C<$!> with an error message.
227            
228             sub _load_file {
229 2     2   3 my ($self,$path) = @_;
230 2         5 local *IN;
231 2 50       5 return $path if ref $path;
232            
233 2 50       8 if (defined $path){
    50          
234 0         0 $self->{PATH} = $path
235             }
236             elsif (not $self->{PATH}){
237 0         0 warn "load_file requires a path argument, or that the PATH field be set";
238 0         0 return undef;
239             }
240 2 50       81 if (not open IN, $self->{PATH}){
241 0         0 warn "load_file could not open $self->{PATH}";
242 0         0 return undef;
243             }
244 2         49 read IN, $_, -s IN;
245 2         20 close IN;
246 2         11 return \$_;
247             }
248            
249             1;
250            
251             =head1 TODO
252            
253             Maybe work on URI as well as file paths.
254            
255             =head1 SEE ALSO
256            
257             L, L.
258            
259             =head1 AUTHOR
260            
261             Lee Goddard (LGoddard@CPAN.org)
262            
263             =head1 COPYRIGHT
264            
265             Copyright 2000-2001 Lee Goddard.
266            
267             This library is free software; you may use and redistribute it or modify it
268             undef the same terms as Perl itself.
269            
270            
271