line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME
|
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
HTML::Content::ContentExtractor - Perl module for extracting content from HTML documents.
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use HTML::WordTagRatio::WeightedRatio;
|
8
|
|
|
|
|
|
|
use HTML::Content::HTMLTokenizer;
|
9
|
|
|
|
|
|
|
use HTML::Content::ContentExtractor;
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
my $ranker = new HTML::WordTagRatio::WeightedRatio();
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
my $extractor = new HTML::Content::ContentExtractor($tokenizer,$ranker,"index.html","index.extr");
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
$extractor->Extract();
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 DESCRIPTION
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
HTML::Content::ContentExtractor attempts to extract the content from HTML documents. It attempts to remove tags, scripts and boilerplate text from the documents by trying to find the region of the HTML document that has the maximum ratio of words to tags.
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head2 Methods
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=over 4
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=item * my $extractor = new HTML::Content::ContentExtractor($tokenizer, $ratio, $inputfilename, $extractfilename)
|
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Initializes HTML::Content::ContentExtractor with 1) an object that can tokenize HTML 2) an object that can compute the ratio of Words to Tags 3) an input filename and 4) an output filename.
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=item * $extractor->Extract()
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Attempts to extract content from the $inputfilename.
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=back
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 AUTHOR
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
Jean Tavernier (jj.tavernier@gmail.com)
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head1 COPYRIGHT
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Copyright 2005 Jean Tavernier. All rights reserved.
|
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or
|
46
|
|
|
|
|
|
|
modify it under the same terms as Perl itself.
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 SEE ALSO
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
ContentExtractorDriver.pl (1).
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=cut
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
package HTML::Content::ContentExtractor;
|
55
|
|
|
|
|
|
|
|
56
|
1
|
|
|
1
|
|
470
|
use strict;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
57
|
1
|
|
|
1
|
|
18
|
use warnings;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
42
|
|
58
|
1
|
|
|
1
|
|
15
|
use Carp;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
51
|
|
59
|
1
|
|
|
1
|
|
337
|
use HTML::WordTagRatio::Ratio;
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
23
|
|
60
|
1
|
|
|
1
|
|
379
|
use HTML::Content::HTMLTokenizer;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
61
|
1
|
|
|
1
|
|
4
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
67
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
require Exporter;
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
@ISA = qw(Exporter);
|
66
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export
|
67
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead.
|
68
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants.
|
69
|
|
|
|
|
|
|
@EXPORT = qw();
|
70
|
|
|
|
|
|
|
$VERSION = '0.01';
|
71
|
1
|
|
|
1
|
|
3
|
use fields qw(Tokenizer Ratio Document Output);
|
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
3
|
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# new - constructs ContentExtractor object
|
75
|
|
|
|
|
|
|
# - preconditions: 1st arg is a reference to a HTMLTokenizer object
|
76
|
|
|
|
|
|
|
# 2nd arg is a reference to a Ratio object
|
77
|
|
|
|
|
|
|
# 3rd arg is an HTML file name
|
78
|
|
|
|
|
|
|
# 4th arg is an output file
|
79
|
|
|
|
|
|
|
# - postconditions: ContentExtractor is constructed
|
80
|
|
|
|
|
|
|
sub new
|
81
|
|
|
|
|
|
|
{
|
82
|
0
|
|
|
0
|
1
|
|
my $invocant = shift;
|
83
|
0
|
|
0
|
|
|
|
my $class = ref($invocant) || $invocant;
|
84
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
|
my $self = fields::new($invocant);
|
86
|
0
|
|
|
|
|
|
$self->{Tokenizer} = shift;
|
87
|
0
|
0
|
|
|
|
|
croak "ContentExtractor: first argument must be an HTMLTokenizer" unless $self->{Tokenizer}->isa('HTML::Content::HTMLTokenizer');
|
88
|
0
|
|
|
|
|
|
$self->{Ratio} = shift;
|
89
|
0
|
0
|
|
|
|
|
croak "ContentExtractor: second argument must be an HTMLTokenizer" unless $self->{Ratio}->isa('HTML::WordTagRatio::Ratio');
|
90
|
0
|
|
|
|
|
|
$self->{Document} = shift;
|
91
|
0
|
|
|
|
|
|
$self->{Output} = shift;
|
92
|
|
|
|
|
|
|
|
93
|
0
|
|
|
|
|
|
return bless($self, $class);
|
94
|
|
|
|
|
|
|
}
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub Extract
|
97
|
|
|
|
|
|
|
{
|
98
|
0
|
|
|
0
|
1
|
|
my $self = shift;
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Read document
|
101
|
0
|
0
|
|
|
|
|
open(HTML,$self->{Document}) or croak "ContentExtractor::ExtractContent: Cannot open $self->{Document} ($!)\n";
|
102
|
0
|
|
|
|
|
|
my $html = join("",);
|
103
|
0
|
|
|
|
|
|
close(HTML);
|
104
|
|
|
|
|
|
|
|
105
|
0
|
|
|
|
|
|
my ($N,$T,$seq,$tokens) = $self->{Tokenizer}->Tokenize($html);
|
106
|
|
|
|
|
|
|
|
107
|
0
|
|
|
|
|
|
my ($i,$j,$max) = $self->FindBestRange($N,$T,$seq);
|
108
|
|
|
|
|
|
|
|
109
|
0
|
|
|
|
|
|
$self->PrintContent($tokens,$i,$j);
|
110
|
|
|
|
|
|
|
}
|
111
|
|
|
|
|
|
|
sub FindBestRange
|
112
|
|
|
|
|
|
|
{
|
113
|
0
|
|
|
0
|
0
|
|
my $self = shift;
|
114
|
0
|
|
|
|
|
|
my $tN = shift;
|
115
|
0
|
|
|
|
|
|
my $tT = shift;
|
116
|
0
|
|
|
|
|
|
my $tS = shift;
|
117
|
0
|
|
|
|
|
|
my @N = @{$tN};
|
|
0
|
|
|
|
|
|
|
118
|
0
|
|
|
|
|
|
my @T = @{$tT};
|
|
0
|
|
|
|
|
|
|
119
|
0
|
|
|
|
|
|
my @S = @{$tS};
|
|
0
|
|
|
|
|
|
|
120
|
0
|
|
|
|
|
|
my $best_i = 0;
|
121
|
0
|
|
|
|
|
|
my $best_j = $#N;
|
122
|
0
|
|
|
|
|
|
my $max = 0;
|
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
|
my $WordMarker = $self->{Tokenizer}->GetWordMarker();
|
125
|
|
|
|
|
|
|
|
126
|
0
|
|
|
|
|
|
for (my $i = 0;$i <= $#N;$i++)
|
127
|
|
|
|
|
|
|
{
|
128
|
0
|
|
|
|
|
|
for (my $j = $i + 1; $j <= $#N; $j++)
|
129
|
|
|
|
|
|
|
{
|
130
|
0
|
0
|
0
|
|
|
|
if ($S[$i] eq $WordMarker && $S[$j] eq $WordMarker)
|
131
|
|
|
|
|
|
|
{
|
132
|
|
|
|
|
|
|
# Only compute the score if we have Si = N and Sj = N
|
133
|
|
|
|
|
|
|
# and Si-1 = T and Sj+1 = T
|
134
|
0
|
0
|
0
|
|
|
|
if (
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
135
|
|
|
|
|
|
|
($i == 0 || $S[$i] ne $S[$i - 1])
|
136
|
|
|
|
|
|
|
&&
|
137
|
|
|
|
|
|
|
($j == $#N || $S[$j] ne $S[$j + 1])
|
138
|
|
|
|
|
|
|
)
|
139
|
|
|
|
|
|
|
{
|
140
|
0
|
|
|
|
|
|
my $tmp = $self->{Ratio}->RangeValue($i,$j,\@N,\@T);
|
141
|
0
|
0
|
|
|
|
|
if ($tmp > $max)
|
142
|
|
|
|
|
|
|
{
|
143
|
0
|
|
|
|
|
|
$best_i = $i;
|
144
|
0
|
|
|
|
|
|
$best_j = $j;
|
145
|
0
|
|
|
|
|
|
$max = $tmp;
|
146
|
|
|
|
|
|
|
}
|
147
|
|
|
|
|
|
|
}
|
148
|
|
|
|
|
|
|
}
|
149
|
|
|
|
|
|
|
}
|
150
|
|
|
|
|
|
|
}
|
151
|
0
|
|
|
|
|
|
return ($best_i,$best_j,$max);
|
152
|
|
|
|
|
|
|
}
|
153
|
|
|
|
|
|
|
sub PrintContent
|
154
|
|
|
|
|
|
|
{
|
155
|
0
|
|
|
0
|
0
|
|
my $self = shift;
|
156
|
0
|
|
|
|
|
|
my $tokens = shift;
|
157
|
0
|
|
|
|
|
|
my $i = shift;
|
158
|
0
|
|
|
|
|
|
my $j = shift;
|
159
|
|
|
|
|
|
|
|
160
|
0
|
0
|
|
|
|
|
open(OUTPUT,">$self->{Output}") or croak "ContentExtractor::ExtractContent: Cannot open $self->{Output} ($!)\n";
|
161
|
0
|
|
|
|
|
|
foreach my $key (sort {$a <=> $b} (keys %$tokens))
|
|
0
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
{
|
163
|
0
|
|
|
|
|
|
$$tokens{$key} =~ s/\s+//g;
|
164
|
0
|
0
|
|
|
|
|
if (length($$tokens{$key}) == 0)
|
165
|
|
|
|
|
|
|
{
|
166
|
0
|
|
|
|
|
|
next;
|
167
|
|
|
|
|
|
|
}
|
168
|
|
|
|
|
|
|
|
169
|
0
|
0
|
0
|
|
|
|
if ($key >= $i && $key <= $j)
|
170
|
|
|
|
|
|
|
{
|
171
|
0
|
|
|
|
|
|
print OUTPUT "$$tokens{$key} ";
|
172
|
|
|
|
|
|
|
}
|
173
|
|
|
|
|
|
|
}
|
174
|
0
|
|
|
|
|
|
close(OUTPUT);
|
175
|
|
|
|
|
|
|
}
|
176
|
|
|
|
|
|
|
1; |