File Coverage

blib/lib/HTML/ExtractMain.pm

Criterion	Covered	Total	%
statement	4	6	66.6
branch			n/a
condition			n/a
subroutine	2	2	100.0
pod			n/a
total	6	8	75.0

line	stmt	sub	time	code
1				#!perl
2
3				package HTML::ExtractMain;
4	2	2	122872	use Carp qw( carp );
	2		5
	2		264
5	2	2	1164	use HTML::TreeBuilder;
	0
	0
6				use Object::Destroyer 2.0;
7				use Scalar::Util qw( blessed refaddr );
8				use base qw( Exporter );
9				use strict;
10				use warnings;
11
12				our @EXPORT_OK = qw( extract_main_html );
13
14				sub extract_main_html {
15				my $arg = shift;
16
17				unless ( defined $arg ) {
18				carp 'extract_main_html requires HTML content as an argument';
19				return;
20				}
21
22				my $tree;
23				if ( ref $arg and blessed $arg and $arg->isa('HTML::TreeBuilder') ) {
24				$tree = $arg;
25				} else {
26				my $raw_html = $arg;
27
28				$tree = eval { HTML::TreeBuilder->new_from_content($raw_html) };
29				if ( !$tree ) {
30				carp 'check HTML input, could not create new HTML::TreeBuilder';
31				return;
32				}
33				}
34
35				my %options = @_;
36				if ( defined $options{output_type} ) {
37				$options{output_type} = lc( $options{output_type} );
38				} else {
39				$options{output_type} = "xhtml";
40				}
41
42				# Remove any lingering circular references. Details at:
43				# http://www.perl.com/pub/2007/06/07/better-code-through-destruction.html
44				my $sentry = Object::Destroyer->new( $tree, 'delete' );
45
46				# Use the Readability algorithm, inspired by:
47				# http://lab.arc90.com/experiments/readability/js/readability.js
48
49				# Study all the paragraphs and find the chunk that has the best score.
50				# A score is determined by things like: Number of 's, commas,
51				# class names, etc.
52
53				my %parents;
54				foreach my $p ( $tree->find_by_tag_name('p') ) {
55				my $parent = $p->parent;
56				my $parent_id = refaddr($parent);
57
58				if ( !defined $parents{$parent_id} ) {
59				$parents{$parent_id}->{element} = $parent;
60				$parents{$parent_id}->{readability} = 0;
61
62				my $text_to_scan = join q{ },
63				grep {defined}
64				( $parent->attr('class'), $parent->attr('id') );
65
66				if ( $text_to_scan =~ m/\b(?:comment\|meta\|footer\|footnote)\b/ ) {
67				$parents{$parent_id}->{readability} -= 50;
68				} elsif ( $text_to_scan
69				=~ m/\b(post\|hentry\|entry[-]?(content\|text\|body)?\|article[-]?(content\|text\|body)?)\b/
70				) {
71				$parents{$parent_id}->{readability} += 25;
72				}
73				}
74
75				# add point for each para found
76				$parents{$parent_id}->{readability}++;
77
78				# add a point for each comma found in the paragraph
79				foreach my $text_ref ( $p->content_refs_list ) {
80				my $num_commas = ( ${$text_ref} =~ m/,/g );
81				$parents{$parent_id}->{readability} += $num_commas;
82				}
83				}
84
85				my $best_parent;
86				foreach my $id ( keys %parents ) {
87				if ( !$best_parent
88				\|\| $parents{$id}->{readability} > $best_parent->{readability} ) {
89				$best_parent = $parents{$id};
90				}
91				}
92
93				if ($best_parent) {
94				my $best_parent_element = $best_parent->{element};
95				$best_parent_element->detach;
96
97				my $output;
98				if ( $options{output_type} eq 'tree' ) {
99				$output = $best_parent_element;
100				} elsif ( $options{output_type} eq 'html' ) {
101				$output = $best_parent_element->as_HTML;
102				} else {
103				$output = $best_parent_element->as_XML;
104				}
105
106				unless ( $options{output_type} eq 'tree' ) {
107				$output =~ s{^(.)\s$}{$1}s; # kill wrapping
108				$best_parent_element->delete;
109				}
110
111				return $output;
112				} else {
113				return;
114				}
115				}
116
117				=head1 NAME
118
119				HTML::ExtractMain - Extract the main content of a web page
120
121				=head1 VERSION
122
123				Version 0.63
124
125				=cut
126
127				our $VERSION = '0.63';
128
129				=head1 SYNOPSIS
130
131				use HTML::ExtractMain qw( extract_main_html );
132
133				my $html = <<'END';
134				Header
135				Home
136
137				Foo
138				Baz
139
140				Footer
141				END
142
143				my $main_html = extract_main_html($html, output_type => 'xhtml');
144				if (defined $main_html) {
145				# do something with $main_html here
146				# $main_html is ' Foo Baz '
147				}
148
149				=head1 EXPORT
150
151				C is optionally exported
152
153				=head1 FUNCTIONS
154
155				=head2 extract_main_html
156
157				C takes HTML content, and uses the Readability
158				algorithm to detect the main body of the page, usually skipping
159				headers, footers, navigation, etc.
160
161				The first argument is either an HTML string, or an
162				HTML::TreeBuilder tree. (If passed a tree, the tree will be modified
163				and destroyed.)
164
165				Remaining arguments are optional and represent key/value options. The
166				available options are:
167
168				=head3 output_type
169
170				This determines what format to return data in. If not specified then
171				xhtml format will be used. Valid formats are:
172
173				=over 4
174
175				=item C
176
177				=item C
178
179				=item C
180
181				=back
182
183				If C is selected, then an L object will be
184				returned instead of a string.
185
186				If the HTML's main content is found, it's returned in the chosen
187				output format. The returned HTML/XHTML will I look like what you put
188				in. (Source formatting, e.g. indentation, will be removed.)
189
190				If a most relevant block of content is not found, C
191				returns undef.
192
193				=cut
194
195				=head1 AUTHOR
196
197				Anirvan Chatterjee, C<< >>
198
199				=head1 BUGS
200
201				Please report any bugs or feature requests to
202				C, or through the web interface
203				at L.
204				I will be notified, and then you'll automatically be notified of
205				progress on your bug as I make changes.
206
207				=head1 SUPPORT
208
209				You can find documentation for this module with the perldoc command.
210
211				perldoc HTML::ExtractMain
212
213				You can also look for information at:
214
215				=over 4
216
217				=item * RT: CPAN's request tracker
218
219				L
220
221				=item * AnnoCPAN: Annotated CPAN documentation
222
223				L
224
225				=item * CPAN Ratings
226
227				L
228
229				=item * Search CPAN
230
231				L
232
233				=back
234
235				=head1 SEE ALSO
236
237				=over 4
238
239				=item * C
240
241				=item * C
242
243				=back
244
245				=head1 ACKNOWLEDGEMENTS
246
247				The Readability algorithm is ported from Arc90's JavaScript original,
248				built as part of the excellent Readability application, online at
249				L, repository at
250				L.
251
252				=head1 COPYRIGHT & LICENSE
253
254				Copyright 2009-2013 Anirvan Chatterjee, Rupert Lane, kryde, all rights
255				reserved.
256
257				This program is free software; you can redistribute it and/or modify it
258				under the same terms as Perl itself.
259
260				=cut
261
262				1; # End of HTML::ExtractMain
263
264				# Local Variables:
265				# mode: perltidy
266				# End: