line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
1
|
|
|
1
|
|
601
|
use strict;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
34
|
|
2
|
1
|
|
|
1
|
|
5
|
use warnings;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
61
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package HTML::SummaryBasic;
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = 0.2;
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
HTML::SummaryBasic - Basic summary info from HTML.
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use HTML::SummaryBasic;
|
15
|
|
|
|
|
|
|
my $p = new HTML::SummaryBasic {
|
16
|
|
|
|
|
|
|
PATH => "input.html",
|
17
|
|
|
|
|
|
|
# or HTML => '...',
|
18
|
|
|
|
|
|
|
NOT_AVAILABLE => undef,
|
19
|
|
|
|
|
|
|
};
|
20
|
|
|
|
|
|
|
foreach (keys %{$p->{SUMMARY}}){
|
21
|
|
|
|
|
|
|
warn "$_ ... $p->{SUMMARY}->{$_}\n";
|
22
|
|
|
|
|
|
|
}
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 DEPENDENCIES
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use HTML::TokeParser;
|
27
|
|
|
|
|
|
|
use HTML::HeadParser;
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=cut
|
30
|
|
|
|
|
|
|
|
31
|
1
|
|
|
1
|
|
13
|
use Carp;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
66
|
|
32
|
1
|
|
|
1
|
|
806
|
use HTML::TokeParser;
|
|
1
|
|
|
|
|
13094
|
|
|
1
|
|
|
|
|
956
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
From a file or string of HTML, creates a hash of useful summary information from C and C elements of an HTML document.
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 GLOBAL VARIABLE
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item $NOT_AVAILABLE
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
Value for empty fields. Default is C<[Not Available]>. May be over-ridden directly by supplying the constructor with a field of the same name.
|
43
|
|
|
|
|
|
|
See L.
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=cut
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
our $NOT_AVAILABLE = '[Not available]';
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 CONSTRUCTOR (new)
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Accepts a hash-like structure...
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=over 4
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=item HTML or PATH
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Ref to a scalar of HTML, or plain string that is the path to an HTML file to process.
|
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=item SUMMARY
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Filled after C is called (see L and
|
62
|
|
|
|
|
|
|
L).
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item FIELDS
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
An array of C tag Cs whose C value should be
|
67
|
|
|
|
|
|
|
placed into the respective slots of the C field after
|
68
|
|
|
|
|
|
|
C has been called.
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=back
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 THE SUMMARY STRUCTURE
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
A field of the object which is a hash, with key/values as follows:
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=over 4
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=item AUTHOR
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
HTML C tag C.
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=item TITLE
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Text of the element of the same name.
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item DESCRIPTION
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Content of the C tag named C.
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=item LAST_MODIFIED_META, LAST_MODIFIED_FILE
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Time since of the modification of the file,
|
93
|
|
|
|
|
|
|
respectively according to any C tag of the same name,
|
94
|
|
|
|
|
|
|
with a C prefix; failing that, according to the file system.
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=item CREATED_META, CREATED_FILE
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
As above, but relating to the creation date of the file.
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=item FIRST_PARA
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
The first HTML C element of the document.
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item HEADLINE
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
The first C tag; failing that, the first C; failing that,
|
107
|
|
|
|
|
|
|
the value of C<$NOT_AVAILABLE>.
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=item PLUS...
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Any meta-fields specified in the C field.
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=back
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub new {
|
118
|
2
|
|
|
2
|
0
|
3038
|
my $class = shift;
|
119
|
2
|
50
|
|
|
|
5
|
$class = ref($class)? ref($class) : $class;
|
120
|
2
|
|
|
|
|
5
|
my $self = bless {}, $class;
|
121
|
2
|
50
|
|
|
|
11
|
my $args = ref($_[0])? shift : {@_};
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Defaults
|
124
|
2
|
|
|
|
|
7
|
$self->{SUMMARY} = {};
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Load parameters
|
127
|
2
|
|
|
|
|
13
|
$self->{uc $_} = $args->{$_} foreach keys %$args;
|
128
|
2
|
50
|
|
|
|
8
|
croak "Required parameter field missing : $_" if not $self->{PATH};
|
129
|
|
|
|
|
|
|
|
130
|
2
|
|
|
|
|
5
|
$self->_get_summary();
|
131
|
2
|
|
|
|
|
6
|
return $self;
|
132
|
|
|
|
|
|
|
}
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub _get_summary {
|
136
|
2
|
|
|
2
|
|
4
|
my ($self,$path) = @_;
|
137
|
|
|
|
|
|
|
|
138
|
2
|
|
|
|
|
2
|
my ($p,$token, $html);
|
139
|
|
|
|
|
|
|
|
140
|
2
|
50
|
|
|
|
5
|
if (defined $path){
|
141
|
0
|
0
|
|
|
|
0
|
if (ref $path){
|
142
|
0
|
|
|
|
|
0
|
$html = $path;
|
143
|
0
|
|
|
|
|
0
|
delete $self->{PATH};
|
144
|
|
|
|
|
|
|
} else {
|
145
|
0
|
|
|
|
|
0
|
$self->{PATH} = $path;
|
146
|
|
|
|
|
|
|
}
|
147
|
|
|
|
|
|
|
}
|
148
|
|
|
|
|
|
|
|
149
|
2
|
50
|
|
|
|
4
|
if ($self->{PATH}){
|
150
|
2
|
50
|
|
|
|
5
|
$html = $self->_load_file()
|
151
|
|
|
|
|
|
|
or return undef;
|
152
|
|
|
|
|
|
|
}
|
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Get first para
|
155
|
2
|
50
|
|
|
|
11
|
if (not $p = new HTML::TokeParser( $html ) ){
|
156
|
0
|
|
|
|
|
0
|
warn "HTML::TokeParser could not initiate: $!";
|
157
|
0
|
|
|
|
|
0
|
return undef;
|
158
|
|
|
|
|
|
|
}
|
159
|
2
|
100
|
|
|
|
275
|
if ($token = $p->get_tag('h1')){
|
160
|
1
|
|
|
|
|
352
|
$self->{SUMMARY}->{HEADLINE} = $p->get_trimmed_text;
|
161
|
|
|
|
|
|
|
} else {
|
162
|
1
|
|
|
|
|
193
|
$p = new HTML::TokeParser( $html );
|
163
|
1
|
50
|
|
|
|
104
|
if ($token = $p->get_tag('h2')){
|
164
|
0
|
|
|
|
|
0
|
$self->{SUMMARY}->{HEADLINE} = $p->get_trimmed_text;
|
165
|
|
|
|
|
|
|
} else {
|
166
|
1
|
|
|
|
|
168
|
$self->{SUMMARY}->{HEADLINE} = $self->{NOT_AVAILABLE};
|
167
|
|
|
|
|
|
|
}
|
168
|
|
|
|
|
|
|
}
|
169
|
2
|
50
|
|
|
|
71
|
if (not $p = new HTML::TokeParser( $html ) ){
|
170
|
0
|
|
|
|
|
0
|
warn "HTML::TokeParser could not initiate: $!";
|
171
|
0
|
|
|
|
|
0
|
return undef;
|
172
|
|
|
|
|
|
|
}
|
173
|
2
|
100
|
|
|
|
238
|
if ($token = $p->get_tag('p')){
|
174
|
1
|
|
|
|
|
431
|
$self->{SUMMARY}->{FIRST_PARA} = $p->get_trimmed_text;
|
175
|
|
|
|
|
|
|
} else {
|
176
|
1
|
|
|
|
|
182
|
$self->{SUMMARY}->{FIRST_PARA} = $self->{NOT_AVAILABLE}
|
177
|
|
|
|
|
|
|
}
|
178
|
|
|
|
|
|
|
|
179
|
2
|
|
|
|
|
57
|
$p = new HTML::TokeParser( $html );
|
180
|
2
|
|
|
|
|
223
|
$p->get_tag('title');
|
181
|
2
|
|
66
|
|
|
314
|
$self->{SUMMARY}->{TITLE} = $p->get_text('/title') || $self->{NOT_AVAILABLE};
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
{
|
184
|
2
|
|
|
|
|
61
|
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
|
|
2
|
|
|
|
|
37
|
|
185
|
|
|
|
|
|
|
$atime,$mtime,$ctime,$blksize,$blocks) = stat $self->{PATH};
|
186
|
|
|
|
|
|
|
|
187
|
2
|
|
33
|
|
|
275
|
$self->{SUMMARY}->{LAST_MODIFIED_FILE} = scalar localtime ( $mtime ) || $self->{NOT_AVAILABLE};
|
188
|
2
|
|
|
|
|
17
|
$self->{SUMMARY}->{LAST_MODIFIED_FILE} =~ s/\s+/ /g;
|
189
|
|
|
|
|
|
|
|
190
|
2
|
|
33
|
|
|
47
|
$self->{SUMMARY}->{CREATED_FILE} = scalar localtime ( $ctime ) || $self->{NOT_AVAILABLE};
|
191
|
2
|
|
|
|
|
14
|
$self->{SUMMARY}->{CREATED_FILE} =~ s/\s+/ /g;
|
192
|
|
|
|
|
|
|
}
|
193
|
|
|
|
|
|
|
|
194
|
8
|
|
|
|
|
21
|
my $collect = {
|
195
|
2
|
|
|
|
|
7
|
map {$_=>1} (
|
196
|
2
|
|
|
|
|
3
|
keys %{$self->{FIELDS}},
|
197
|
|
|
|
|
|
|
qw(
|
198
|
|
|
|
|
|
|
AUTHOR DESCRIPTION
|
199
|
|
|
|
|
|
|
LAST-MODIFIED CREATED
|
200
|
|
|
|
|
|
|
)
|
201
|
|
|
|
|
|
|
)
|
202
|
|
|
|
|
|
|
};
|
203
|
|
|
|
|
|
|
|
204
|
2
|
|
|
|
|
17
|
$self->{SUMMARY}->{$_} = $self->{NOT_AVAILABLE} foreach keys %$collect;
|
205
|
|
|
|
|
|
|
|
206
|
2
|
|
|
|
|
10
|
$p = new HTML::TokeParser( $html );
|
207
|
2
|
|
|
|
|
257
|
while (my $tag = $p->get_tag('meta') ){
|
208
|
4
|
|
|
|
|
290
|
my $name = uc $tag->[1]->{name};
|
209
|
|
|
|
|
|
|
I:
|
210
|
4
|
|
|
|
|
10
|
for my $i (1..2){
|
211
|
7
|
100
|
|
|
|
21
|
$name =~ s/^X-META-//i if $i == 2;
|
212
|
7
|
100
|
|
|
|
17
|
if (exists $collect->{$name} ){
|
213
|
4
|
|
|
|
|
11
|
$self->{SUMMARY}->{$name} = $tag->[1]->{content};
|
214
|
4
|
|
|
|
|
21
|
last I;
|
215
|
|
|
|
|
|
|
}
|
216
|
|
|
|
|
|
|
}
|
217
|
|
|
|
|
|
|
}
|
218
|
|
|
|
|
|
|
|
219
|
2
|
|
|
|
|
470
|
$self->{SUMMARY}->{LAST_MODIFIED_META} = delete $self->{SUMMARY}->{"LAST-MODIFIED"};
|
220
|
2
|
|
|
|
|
5
|
$self->{SUMMARY}->{CREATED_META} = delete $self->{SUMMARY}->{"CREATED"};
|
221
|
|
|
|
|
|
|
|
222
|
2
|
|
|
|
|
14
|
return 1;
|
223
|
|
|
|
|
|
|
}
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# Return a reference to a scalar of HTML, or C on failure, setting C<$!> with an error message.
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub _load_file {
|
229
|
2
|
|
|
2
|
|
3
|
my ($self,$path) = @_;
|
230
|
2
|
|
|
|
|
5
|
local *IN;
|
231
|
2
|
50
|
|
|
|
5
|
return $path if ref $path;
|
232
|
|
|
|
|
|
|
|
233
|
2
|
50
|
|
|
|
8
|
if (defined $path){
|
|
|
50
|
|
|
|
|
|
234
|
0
|
|
|
|
|
0
|
$self->{PATH} = $path
|
235
|
|
|
|
|
|
|
}
|
236
|
|
|
|
|
|
|
elsif (not $self->{PATH}){
|
237
|
0
|
|
|
|
|
0
|
warn "load_file requires a path argument, or that the PATH field be set";
|
238
|
0
|
|
|
|
|
0
|
return undef;
|
239
|
|
|
|
|
|
|
}
|
240
|
2
|
50
|
|
|
|
81
|
if (not open IN, $self->{PATH}){
|
241
|
0
|
|
|
|
|
0
|
warn "load_file could not open $self->{PATH}";
|
242
|
0
|
|
|
|
|
0
|
return undef;
|
243
|
|
|
|
|
|
|
}
|
244
|
2
|
|
|
|
|
49
|
read IN, $_, -s IN;
|
245
|
2
|
|
|
|
|
20
|
close IN;
|
246
|
2
|
|
|
|
|
11
|
return \$_;
|
247
|
|
|
|
|
|
|
}
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
1;
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head1 TODO
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Maybe work on URI as well as file paths.
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head1 SEE ALSO
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
L, L.
|
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head1 AUTHOR
|
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Lee Goddard (LGoddard@CPAN.org)
|
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head1 COPYRIGHT
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Copyright 2000-2001 Lee Goddard.
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
This library is free software; you may use and redistribute it or modify it
|
268
|
|
|
|
|
|
|
undef the same terms as Perl itself.
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
|