File Coverage

blib/lib/HTML/Content/ContentExtractor.pm

Criterion	Covered	Total	%
statement	21	75	28.0
branch	0	18	0.0
condition	0	18	0.0
subroutine	7	11	63.6
pod	2	4	50.0
total	30	126	23.8

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							HTML::Content::ContentExtractor - Perl module for extracting content from HTML documents.
4
5							=head1 SYNOPSIS
6
7							use HTML::WordTagRatio::WeightedRatio;
8							use HTML::Content::HTMLTokenizer;
9							use HTML::Content::ContentExtractor;
10
11							my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');
12
13							my $ranker = new HTML::WordTagRatio::WeightedRatio();
14
15							my $extractor = new HTML::Content::ContentExtractor($tokenizer,$ranker,"index.html","index.extr");
16
17							$extractor->Extract();
18
19							=head1 DESCRIPTION
20
21							HTML::Content::ContentExtractor attempts to extract the content from HTML documents. It attempts to remove tags, scripts and boilerplate text from the documents by trying to find the region of the HTML document that has the maximum ratio of words to tags.
22
23							=head2 Methods
24
25							=over 4
26
27							=item * my $extractor = new HTML::Content::ContentExtractor($tokenizer, $ratio, $inputfilename, $extractfilename)
28
29							Initializes HTML::Content::ContentExtractor with 1) an object that can tokenize HTML 2) an object that can compute the ratio of Words to Tags 3) an input filename and 4) an output filename.
30
31							=item * $extractor->Extract()
32
33							Attempts to extract content from the $inputfilename.
34
35							=back
36
37							=head1 AUTHOR
38
39							Jean Tavernier (jj.tavernier@gmail.com)
40
41							=head1 COPYRIGHT
42
43							Copyright 2005 Jean Tavernier. All rights reserved.
44
45							This library is free software; you can redistribute it and/or
46							modify it under the same terms as Perl itself.
47
48							=head1 SEE ALSO
49
50							ContentExtractorDriver.pl (1).
51
52							=cut
53
54							package HTML::Content::ContentExtractor;
55
56	1			1		470	use strict;
	1					2
	1					33
57	1			1		18	use warnings;
	1					1
	1					42
58	1			1		15	use Carp;
	1					1
	1					51
59	1			1		337	use HTML::WordTagRatio::Ratio;
	1					1
	1					23
60	1			1		379	use HTML::Content::HTMLTokenizer;
	1					2
	1					25
61	1			1		4	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
	1					1
	1					67
62
63							require Exporter;
64
65							@ISA = qw(Exporter);
66							# Items to export into callers namespace by default. Note: do not export
67							# names by default without a very good reason. Use EXPORT_OK instead.
68							# Do not simply export all your public functions/methods/constants.
69							@EXPORT = qw();
70							$VERSION = '0.01';
71	1			1		3	use fields qw(Tokenizer Ratio Document Output);
	1					1
	1					3
72
73
74							# new - constructs ContentExtractor object
75							# - preconditions: 1st arg is a reference to a HTMLTokenizer object
76							# 2nd arg is a reference to a Ratio object
77							# 3rd arg is an HTML file name
78							# 4th arg is an output file
79							# - postconditions: ContentExtractor is constructed
80							sub new
81							{
82	0			0	1		my $invocant = shift;
83	0		0				my $class = ref($invocant) \|\| $invocant;
84
85	0						my $self = fields::new($invocant);
86	0						$self->{Tokenizer} = shift;
87	0	0					croak "ContentExtractor: first argument must be an HTMLTokenizer" unless $self->{Tokenizer}->isa('HTML::Content::HTMLTokenizer');
88	0						$self->{Ratio} = shift;
89	0	0					croak "ContentExtractor: second argument must be an HTMLTokenizer" unless $self->{Ratio}->isa('HTML::WordTagRatio::Ratio');
90	0						$self->{Document} = shift;
91	0						$self->{Output} = shift;
92
93	0						return bless($self, $class);
94							}
95
96							sub Extract
97							{
98	0			0	1		my $self = shift;
99
100							# Read document
101	0	0					open(HTML,$self->{Document}) or croak "ContentExtractor::ExtractContent: Cannot open $self->{Document} ($!)\n";
102	0						my $html = join("",);
103	0						close(HTML);
104
105	0						my ($N,$T,$seq,$tokens) = $self->{Tokenizer}->Tokenize($html);
106
107	0						my ($i,$j,$max) = $self->FindBestRange($N,$T,$seq);
108
109	0						$self->PrintContent($tokens,$i,$j);
110							}
111							sub FindBestRange
112							{
113	0			0	0		my $self = shift;
114	0						my $tN = shift;
115	0						my $tT = shift;
116	0						my $tS = shift;
117	0						my @N = @{$tN};
	0
118	0						my @T = @{$tT};
	0
119	0						my @S = @{$tS};
	0
120	0						my $best_i = 0;
121	0						my $best_j = $#N;
122	0						my $max = 0;
123
124	0						my $WordMarker = $self->{Tokenizer}->GetWordMarker();
125
126	0						for (my $i = 0;$i <= $#N;$i++)
127							{
128	0						for (my $j = $i + 1; $j <= $#N; $j++)
129							{
130	0	0	0				if ($S[$i] eq $WordMarker && $S[$j] eq $WordMarker)
131							{
132							# Only compute the score if we have Si = N and Sj = N
133							# and Si-1 = T and Sj+1 = T
134	0	0	0				if (
			0
			0
135							($i == 0 \|\| $S[$i] ne $S[$i - 1])
136							&&
137							($j == $#N \|\| $S[$j] ne $S[$j + 1])
138							)
139							{
140	0						my $tmp = $self->{Ratio}->RangeValue($i,$j,\@N,\@T);
141	0	0					if ($tmp > $max)
142							{
143	0						$best_i = $i;
144	0						$best_j = $j;
145	0						$max = $tmp;
146							}
147							}
148							}
149							}
150							}
151	0						return ($best_i,$best_j,$max);
152							}
153							sub PrintContent
154							{
155	0			0	0		my $self = shift;
156	0						my $tokens = shift;
157	0						my $i = shift;
158	0						my $j = shift;
159
160	0	0					open(OUTPUT,">$self->{Output}") or croak "ContentExtractor::ExtractContent: Cannot open $self->{Output} ($!)\n";
161	0						foreach my $key (sort {$a <=> $b} (keys %$tokens))
	0
162							{
163	0						$$tokens{$key} =~ s/\s+//g;
164	0	0					if (length($$tokens{$key}) == 0)
165							{
166	0						next;
167							}
168
169	0	0	0				if ($key >= $i && $key <= $j)
170							{
171	0						print OUTPUT "$$tokens{$key} ";
172							}
173							}
174	0						close(OUTPUT);
175							}
176							1;