File Coverage

blib/lib/HTML/ContentExtractor.pm

Criterion	Covered	Total	%
statement	9	196	4.5
branch	0	62	0.0
condition	0	48	0.0
subroutine	3	25	12.0
pod	9	9	100.0
total	21	340	6.1

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::ContentExtractor;
2
3							=head1 NAME
4
5							HTML::ContentExtractor - extract the main content from a web page by analysising the DOM tree!
6
7							=head1 VERSION
8
9							Version 0.03
10
11							=cut
12
13							our $VERSION = '0.03';
14
15							=head1 SYNOPSIS
16
17							use HTML::ContentExtractor;
18							my $extractor = HTML::ContentExtractor->new();
19							my $agent=LWP::UserAgent->new;
20
21							my $url='http://sports.sina.com.cn/g/2007-03-23/16572821174.shtml';
22							my $res=$agent->get($url);
23							my $HTML = $res->decoded_content();
24
25							$extractor->extract($url,$HTML);
26							print $extractor->as_html();
27							print $extractor->as_text();
28
29							=head1 DESCRIPTION
30
31							Web pages often contain clutter (such as ads, unnecessary images and
32							extraneous links) around the body of an article that distracts a user
33							from actual content. This module is used to reduce the noise content
34							in web pages and thus identify the content rich regions.
35
36
37							A web page is first parsed by an HTML parser, which corrects the
38							markup and creates a DOM (Document Object Model) tree. By using a
39							depth-first traversal to navigate the DOM tree, noise nodes are
40							identified and removed, thus the main content is extracted. Some
41							useless nodes (script, style, etc.) are removed; the container nodes
42							(table, div, etc.) which have high link/text ratio (higher than
43							threshold) are removed; (link/text ratio is the ratio of the number of
44							links and non-linked words.) The nodes contain any string in the
45							predefined spam string list are removed.
46
47
48							Please notice the input HTML should be encoded in utf-8 format( so do
49							the spam words), thus the module can handle web pages in any language
50							(I've used it to process English, Chinese, and Japanese web pages).
51
52							=over 4
53
54							=item $e = HTML::ContentExtractor->new(%options);
55
56							Constructs a new C object. The optional
57							%options hash can be used to set the options list below.
58
59							=item $e->table_tags();
60
61							=item $e->table_tags(@tags);
62
63							=item $e->table_tags(\@tags);
64
65							This is used to get/set the table tags array. The tags are used as the
66							container tags.
67
68							=item $e->ignore_tags();
69
70							=item $e->ignore_tags(@tags);
71
72							=item $e->ignore_tags(\@tags);
73
74							This is used to get/set the ignore tags array. The elements of such
75							tags will be removed.
76
77							=item $e->spam_words();
78
79							=item $e->spam_words(@strings);
80
81							=item $e->spam_words(\@strings);
82
83							This is used to get/set the spam words list. The elements have such
84							string will be removed.
85
86							=item $e->link_text_ratio();
87
88							=item $e->link_text_ratio($ratio);
89
90							This is used to get/set the link/text ratio, default is 0.05.
91
92							=item $e->min_text_len();
93
94							=item $e->min_text_len($len);
95
96							This is used to get/set the min text length, default is 20. If length
97							of the text of an elment is less than this value, this element will be
98							removed.
99
100							=item $e->extract($url,$HTML);
101
102							This is used to perform the extraction process. Please notice the
103							input $HTML must be encoded in UTF-8.
104
105							=item $e->as_html();
106
107							Return the extraction result in HTML format.
108
109							=item $e->as_text();
110
111							Return the extraction result in text format.
112
113							=back
114
115
116							=head1 AUTHOR
117
118							Zhang Jun, C<< >>
119
120							=head1 COPYRIGHT & LICENSE
121
122							Copyright 2007 Zhang Jun, all rights reserved.
123
124							This program is free software; you can redistribute it and/or modify it
125							under the same terms as Perl itself.
126
127							=cut
128
129	1			1		26019	use strict;
	1					3
	1					42
130	1			1		6	use warnings;
	1					2
	1					29
131	1			1		1475	use HTML::TreeBuilder;
	1					66117
	1					12
132
133							sub new {
134	0			0	1		my $proto = shift;
135	0		0				my $class = ref($proto) \|\| $proto;
136
137	0						my $self = {};
138	0						bless($self, $class);
139
140	0						return $self->_init(@_);
141							}
142
143							sub _init{
144	0			0			my $self = shift;
145
146	0						$self->{table_tags} = [qw(table form div td tr tbody thead tfoot th col colgroup span iframe center ul h1 h2 h3 p)];
147	0						$self->{ignore_tags} = [qw(script noscript style form button meta input select iframe embed hr img)];
148	0						$self->{spam_words} = ['All rights reserved'];
149	0						$self->{link_text_ratio} = 0.05;
150	0						$self->{min_text_len} = 20;
151
152	0	0					if (@_ != 0) {
153	0	0					if (ref $_[0] eq 'HASH') {
154	0						my $hash=$_[0];
155	0						foreach my $key (keys %$hash) {
156	0						$self->{lc($key)}=$hash->{$key};
157							}
158							}else{
159	0						my %args = @_;
160	0						foreach my $key (keys %args) {
161	0						$self->{lc($key)}=$args{$key};
162							}
163							}
164							}
165
166	0						$self->table_tags($self->{table_tags});
167	0						$self->ignore_tags($self->{ignore_tags});
168	0						return $self;
169							}
170
171							sub min_text_len{
172	0			0	1		my $self=shift;
173	0	0					return $self->{min_text_len} if (@_ == 0);
174
175	0						$self->{min_text_len}=shift;
176							}
177
178							sub link_text_ratio{
179	0			0	1		my $self=shift;
180	0	0					return $self->{link_text_ratio} if (@_ == 0);
181
182	0						$self->{link_text_ratio}=shift;
183							}
184
185							sub spam_words{
186	0			0	1		my $self = shift;
187
188	0	0					if(@_ == 0){
189	0						return @{$self->{spam_words}};
	0
190							}
191
192	0	0					if(ref $_[0] eq 'ARRAY'){
193	0						$self->{spam_words} = $_[0];
194							}else{
195	0						my @array = @_;
196	0						$self->{spam_words} = \@array;
197							}
198							}
199
200							sub ignore_tags{
201	0			0	1		my $self = shift;
202
203	0	0					if(@_ == 0){
204	0						return keys %{$self->{ignore_tags}};
	0
205							}
206
207	0						my $array;
208	0	0					if(ref $_[0] eq 'ARRAY'){
209	0						$array = $_[0];
210							}else{
211	0						$array = \@_;
212							}
213
214	0						my $h={};
215	0						grep {$h->{$_}=1;} @$array;
	0
216	0						$self->{ignore_tags} = $h;
217							}
218
219							sub table_tags{
220	0			0	1		my $self = shift;
221	0	0					if(@_ == 0){
222	0						return keys %{$self->{table_tags}};
	0
223							}
224
225	0						my $array;
226	0	0					if(ref $_[0] eq 'ARRAY'){
227	0						$array = $_[0];
228							}else{
229	0						$array = \@_;
230							}
231
232	0						my $h={};
233	0						grep {$h->{$_}=1;} @$array;
	0
234	0						$self->{table_tags} = $h;
235							}
236
237							#the input should be utf8 encoded html content
238							sub extract{
239	0			0	1		my $self=shift;
240	0						my $url=shift;
241	0						my $HTML=shift;
242
243	0	0					$self->{tree}->delete if($self->{tree});
244
245	0						$HTML=_PreprocessForFragmentIdentifiedPage($url,$HTML);
246	0						_remove_crap($HTML);
247
248	0						$self->{url}=$url;
249	0						$self->{tree} = HTML::TreeBuilder->new();
250	0						$self->{tree} ->parse($HTML);
251	0						$self->{link_count} = _how_many_links($self->{tree});
252	0						$self->{is_index}= _check_if_index($self->{tree});
253	0						$self->_Heuristic_Remove($self->{tree});
254	0						$self->_Table_Remove($self->{tree});
255							}
256
257							sub _is_index{
258	0			0			return $_[0]->{is_index};
259							}
260
261							sub DESTROY{
262	0			0			my $self = shift;
263	0	0					$self->{tree}->delete if($self->{tree});
264							}
265
266							#also the output are in utf8 format
267							sub as_html{
268	0			0	1		my $self=shift;
269	0						my $HTML = $self->{tree}->as_HTML('<>&',"\t");
270	0						return $HTML;
271							}
272
273							sub as_text{
274	0			0	1		my $self=shift;
275	0						my $output = _to_text($self->{tree});
276	0						$output =~ s/[\n\r] +/\n/sg;
277	0						$output =~ s/[\n\r]+/\n/sg;
278	0						$output =~ s/ +/ /sg;
279	0						$output =~ s/\n /\n/sg;
280	0						$output =~ s/^\s+//;
281	0						return $output;
282							}
283
284							sub _link_count{
285	0			0			return $_[0]->{link_count};
286							}
287
288							sub _check_if_index{
289	0			0			my $node=shift;
290
291	0						my $num_links=_how_many_links($node);
292	0						my $txt=_nonlink_words($node);
293
294	0						my $num_words = _count_words_num($txt);
295
296	0						my $ratio=1;
297	0	0					$ratio = $num_links/$num_words unless $num_words==0;
298	0	0	0				if($ratio>0.3 \|\| $num_links>400){
299	0						return 1;
300							}else{
301	0						return 0;
302							}
303							}
304
305							sub _remove_crap{
306	0			0			$_[0] =~ s/ / /isg;
307							}
308
309							sub _Table_Remove{
310	0			0			my $self=shift;
311	0						my $node=shift;
312	0	0					return if not ref $node; # not an element node
313
314	0						my $tag=$node->tag;
315
316	0						my @nodes = $node->content_list(); # depth first recursive travesel
317	0						foreach my $child (@nodes){
318	0						$self->_Table_Remove( $child );
319							}
320
321	0	0					if($self->{table_tags}->{$tag}){
322
323	0						my $num_links=_how_many_links($node);
324	0						my $txt=_nonlink_words($node);
325
326	0						my $num_words = _count_words_num($txt);
327
328	0						my $ratio=1;
329	0	0					$ratio = $num_links/$num_words unless $num_words==0;
330
331	0	0	0				if ($num_words < $self->{min_text_len} and
			0
			0
			0
332							$node->tag ne 'h1' and
333							$node->tag ne 'h2' and
334							$node->tag ne 'h3' and
335							$node->tag ne 'p'){
336	0						$node->delete; return;
	0
337							}
338
339	0	0					if ($ratio > $self->{link_text_ratio}){
340	0						$node->delete; return;
	0
341							}
342
343	0						$txt = lc $txt;
344
345	0						foreach(@{ $self->{spam_words} }){
	0
346	0	0					if(index($txt,$_) != -1){
347	0						$node->delete;
348	0						return;
349							}
350							}
351							}
352							}
353
354							sub _how_many_links{
355	0			0			my $node=shift;
356	0						my $links_r = $node->extract_links();
357	0						my $num_links = scalar(@$links_r);
358	0						return $num_links;
359							}
360
361							sub _nonlink_words{
362	0			0			my $node=shift;
363	0	0					if(not ref $node){
364	0						my $text = $node;
365	0						return $text;
366							}
367	0	0	0				return '' if($node->tag eq 'a'
			0
			0
			0
			0
			0
368							or $node->tag eq 'style'
369							or $node->tag eq 'script'
370							or $node->tag eq 'option'
371							or $node->tag eq 'noscript'
372							or $node->tag eq 'hr'
373							or $node->tag eq 'input'
374							);
375
376	0						my @nodes = $node->content_list(); # breadth first travesel
377	0						my $sum_text="";
378	0						foreach $node (@nodes){
379	0						$sum_text .= _nonlink_words( $node );
380							}
381	0						return $sum_text;
382							}
383
384							sub _Heuristic_Remove{
385	0			0			my $self=shift;
386	0						my $node=shift;
387	0	0					return if not ref $node; # not an element node
388
389	0						my @nodes = $node->content_list(); # depth first recursive travesel
390	0						foreach my $child (@nodes){
391	0						$self->_Heuristic_Remove( $child );
392							}
393
394	0	0					if($self->{ignore_tags}->{$node->tag} ){ # ignore the tags defined in ignore_tags
395	0						$node->delete;
396	0						return;
397							}
398
399	0	0	0				if($node->tag eq 'a' and $node->parent->tag eq 'body'){
400	0						$node->delete;
401							}
402							}
403
404							sub _to_text{
405	0			0			my $node = shift;
406	0	0					if(not ref $node){
407	0						return $node;
408							}
409	0	0					return '' if($node->tag eq 'head');
410	0						my @nodes = $node->content_list(); #breadth firth travesel
411	0						my $text = "";
412	0						foreach my $child (@nodes) {
413	0	0	0				if ( ref $child and $child->can('tag') and $child->tag() eq 'table' ) {
			0
414	0						my $avail = eval { require HTML::TableExtract };
	0
415	0	0					unless ($avail) {
416	0						$text .= _to_text($child) . "\n";
417	0						next;
418							}
419	0						my $table = 'HTML::TableExtract'->new();
420	0						my $content = $child->as_HTML;
421	0						$table->parse($content);
422	0						foreach my $ts ( $table->tables ) {
423	0						foreach my $row ( $ts->rows ) {
424	0						defined and do { s/\s+$//, s/^\s+// }
425	0		0				for @$row;
426	0						$text .= join( ', ', grep { defined } @$row ) . "\n";
	0
427							}
428							}
429							}else {
430	0						$text .= _to_text($child) . "\n";
431							}
432							}
433	0						return $text;
434							}
435
436							sub _count_words_num{
437	0			0			my $text = shift;
438
439	0						$text =~ s/([\x21-\x7e]+)/ $1 /g;
440	0						$text =~ s/([^\x20-\x7e])/ $1 /g;
441	0						$text =~ s/^ +//;
442	0						my @tokens=split(/\s+/,$text);
443
444	0						return scalar(@tokens);
445							}
446
447							# input is the url and HTML
448							# output is the processed HTML
449							sub _PreprocessForFragmentIdentifiedPage{
450	0			0			my $url=shift;
451	0						my $HTML=shift;
452	0	0					if($url!~/\#/){
453	0						return $HTML;
454							}
455
456	0						my ($fragment_id)= $url=~/\#(.+)$/;
457	0						$fragment_id=~s/\///;
458
459	0	0					if($HTML=~/(
460	0						$HTML=$1;
461							}
462	0						return $HTML;
463							}
464
465							1;
466