File Coverage

blib/lib/HTML/Content/ContentExtractor.pm
Criterion Covered Total %
statement 21 75 28.0
branch 0 18 0.0
condition 0 18 0.0
subroutine 7 11 63.6
pod 2 4 50.0
total 30 126 23.8


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             HTML::Content::ContentExtractor - Perl module for extracting content from HTML documents.
4            
5             =head1 SYNOPSIS
6            
7             use HTML::WordTagRatio::WeightedRatio;
8             use HTML::Content::HTMLTokenizer;
9             use HTML::Content::ContentExtractor;
10            
11             my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');
12            
13             my $ranker = new HTML::WordTagRatio::WeightedRatio();
14            
15             my $extractor = new HTML::Content::ContentExtractor($tokenizer,$ranker,"index.html","index.extr");
16            
17             $extractor->Extract();
18            
19             =head1 DESCRIPTION
20            
21             HTML::Content::ContentExtractor attempts to extract the content from HTML documents. It attempts to remove tags, scripts and boilerplate text from the documents by trying to find the region of the HTML document that has the maximum ratio of words to tags.
22            
23             =head2 Methods
24            
25             =over 4
26            
27             =item * my $extractor = new HTML::Content::ContentExtractor($tokenizer, $ratio, $inputfilename, $extractfilename)
28            
29             Initializes HTML::Content::ContentExtractor with 1) an object that can tokenize HTML 2) an object that can compute the ratio of Words to Tags 3) an input filename and 4) an output filename.
30            
31             =item * $extractor->Extract()
32            
33             Attempts to extract content from the $inputfilename.
34            
35             =back
36            
37             =head1 AUTHOR
38            
39             Jean Tavernier (jj.tavernier@gmail.com)
40            
41             =head1 COPYRIGHT
42            
43             Copyright 2005 Jean Tavernier. All rights reserved.
44            
45             This library is free software; you can redistribute it and/or
46             modify it under the same terms as Perl itself.
47            
48             =head1 SEE ALSO
49            
50             ContentExtractorDriver.pl (1).
51            
52             =cut
53            
54             package HTML::Content::ContentExtractor;
55            
56 1     1   470 use strict;
  1         2  
  1         33  
57 1     1   18 use warnings;
  1         1  
  1         42  
58 1     1   15 use Carp;
  1         1  
  1         51  
59 1     1   337 use HTML::WordTagRatio::Ratio;
  1         1  
  1         23  
60 1     1   379 use HTML::Content::HTMLTokenizer;
  1         2  
  1         25  
61 1     1   4 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  1         1  
  1         67  
62            
63             require Exporter;
64            
65             @ISA = qw(Exporter);
66             # Items to export into callers namespace by default. Note: do not export
67             # names by default without a very good reason. Use EXPORT_OK instead.
68             # Do not simply export all your public functions/methods/constants.
69             @EXPORT = qw();
70             $VERSION = '0.01';
71 1     1   3 use fields qw(Tokenizer Ratio Document Output);
  1         1  
  1         3  
72            
73            
74             # new - constructs ContentExtractor object
75             # - preconditions: 1st arg is a reference to a HTMLTokenizer object
76             # 2nd arg is a reference to a Ratio object
77             # 3rd arg is an HTML file name
78             # 4th arg is an output file
79             # - postconditions: ContentExtractor is constructed
80             sub new
81             {
82 0     0 1   my $invocant = shift;
83 0   0       my $class = ref($invocant) || $invocant;
84            
85 0           my $self = fields::new($invocant);
86 0           $self->{Tokenizer} = shift;
87 0 0         croak "ContentExtractor: first argument must be an HTMLTokenizer" unless $self->{Tokenizer}->isa('HTML::Content::HTMLTokenizer');
88 0           $self->{Ratio} = shift;
89 0 0         croak "ContentExtractor: second argument must be an HTMLTokenizer" unless $self->{Ratio}->isa('HTML::WordTagRatio::Ratio');
90 0           $self->{Document} = shift;
91 0           $self->{Output} = shift;
92            
93 0           return bless($self, $class);
94             }
95            
96             sub Extract
97             {
98 0     0 1   my $self = shift;
99            
100             # Read document
101 0 0         open(HTML,$self->{Document}) or croak "ContentExtractor::ExtractContent: Cannot open $self->{Document} ($!)\n";
102 0           my $html = join("",);
103 0           close(HTML);
104            
105 0           my ($N,$T,$seq,$tokens) = $self->{Tokenizer}->Tokenize($html);
106            
107 0           my ($i,$j,$max) = $self->FindBestRange($N,$T,$seq);
108            
109 0           $self->PrintContent($tokens,$i,$j);
110             }
111             sub FindBestRange
112             {
113 0     0 0   my $self = shift;
114 0           my $tN = shift;
115 0           my $tT = shift;
116 0           my $tS = shift;
117 0           my @N = @{$tN};
  0            
118 0           my @T = @{$tT};
  0            
119 0           my @S = @{$tS};
  0            
120 0           my $best_i = 0;
121 0           my $best_j = $#N;
122 0           my $max = 0;
123            
124 0           my $WordMarker = $self->{Tokenizer}->GetWordMarker();
125            
126 0           for (my $i = 0;$i <= $#N;$i++)
127             {
128 0           for (my $j = $i + 1; $j <= $#N; $j++)
129             {
130 0 0 0       if ($S[$i] eq $WordMarker && $S[$j] eq $WordMarker)
131             {
132             # Only compute the score if we have Si = N and Sj = N
133             # and Si-1 = T and Sj+1 = T
134 0 0 0       if (
      0        
      0        
135             ($i == 0 || $S[$i] ne $S[$i - 1])
136             &&
137             ($j == $#N || $S[$j] ne $S[$j + 1])
138             )
139             {
140 0           my $tmp = $self->{Ratio}->RangeValue($i,$j,\@N,\@T);
141 0 0         if ($tmp > $max)
142             {
143 0           $best_i = $i;
144 0           $best_j = $j;
145 0           $max = $tmp;
146             }
147             }
148             }
149             }
150             }
151 0           return ($best_i,$best_j,$max);
152             }
153             sub PrintContent
154             {
155 0     0 0   my $self = shift;
156 0           my $tokens = shift;
157 0           my $i = shift;
158 0           my $j = shift;
159            
160 0 0         open(OUTPUT,">$self->{Output}") or croak "ContentExtractor::ExtractContent: Cannot open $self->{Output} ($!)\n";
161 0           foreach my $key (sort {$a <=> $b} (keys %$tokens))
  0            
162             {
163 0           $$tokens{$key} =~ s/\s+//g;
164 0 0         if (length($$tokens{$key}) == 0)
165             {
166 0           next;
167             }
168            
169 0 0 0       if ($key >= $i && $key <= $j)
170             {
171 0           print OUTPUT "$$tokens{$key} ";
172             }
173             }
174 0           close(OUTPUT);
175             }
176             1;