blib/lib/HTML/ExtractMain.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 4 | 6 | 66.6 |
branch | n/a | ||
condition | n/a | ||
subroutine | 2 | 2 | 100.0 |
pod | n/a | ||
total | 6 | 8 | 75.0 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | #!perl | ||||||
2 | |||||||
3 | package HTML::ExtractMain; | ||||||
4 | 2 | 2 | 122872 | use Carp qw( carp ); | |||
2 | 5 | ||||||
2 | 264 | ||||||
5 | 2 | 2 | 1164 | use HTML::TreeBuilder; | |||
0 | |||||||
0 | |||||||
6 | use Object::Destroyer 2.0; | ||||||
7 | use Scalar::Util qw( blessed refaddr ); | ||||||
8 | use base qw( Exporter ); | ||||||
9 | use strict; | ||||||
10 | use warnings; | ||||||
11 | |||||||
12 | our @EXPORT_OK = qw( extract_main_html ); | ||||||
13 | |||||||
14 | sub extract_main_html { | ||||||
15 | my $arg = shift; | ||||||
16 | |||||||
17 | unless ( defined $arg ) { | ||||||
18 | carp 'extract_main_html requires HTML content as an argument'; | ||||||
19 | return; | ||||||
20 | } | ||||||
21 | |||||||
22 | my $tree; | ||||||
23 | if ( ref $arg and blessed $arg and $arg->isa('HTML::TreeBuilder') ) { | ||||||
24 | $tree = $arg; | ||||||
25 | } else { | ||||||
26 | my $raw_html = $arg; | ||||||
27 | |||||||
28 | $tree = eval { HTML::TreeBuilder->new_from_content($raw_html) }; | ||||||
29 | if ( !$tree ) { | ||||||
30 | carp 'check HTML input, could not create new HTML::TreeBuilder'; | ||||||
31 | return; | ||||||
32 | } | ||||||
33 | } | ||||||
34 | |||||||
35 | my %options = @_; | ||||||
36 | if ( defined $options{output_type} ) { | ||||||
37 | $options{output_type} = lc( $options{output_type} ); | ||||||
38 | } else { | ||||||
39 | $options{output_type} = "xhtml"; | ||||||
40 | } | ||||||
41 | |||||||
42 | # Remove any lingering circular references. Details at: | ||||||
43 | # http://www.perl.com/pub/2007/06/07/better-code-through-destruction.html | ||||||
44 | my $sentry = Object::Destroyer->new( $tree, 'delete' ); | ||||||
45 | |||||||
46 | # Use the Readability algorithm, inspired by: | ||||||
47 | # http://lab.arc90.com/experiments/readability/js/readability.js | ||||||
48 | |||||||
49 | # Study all the paragraphs and find the chunk that has the best score. | ||||||
50 | # A score is determined by things like: Number of 's, commas, |
||||||
51 | # class names, etc. | ||||||
52 | |||||||
53 | my %parents; | ||||||
54 | foreach my $p ( $tree->find_by_tag_name('p') ) { | ||||||
55 | my $parent = $p->parent; | ||||||
56 | my $parent_id = refaddr($parent); | ||||||
57 | |||||||
58 | if ( !defined $parents{$parent_id} ) { | ||||||
59 | $parents{$parent_id}->{element} = $parent; | ||||||
60 | $parents{$parent_id}->{readability} = 0; | ||||||
61 | |||||||
62 | my $text_to_scan = join q{ }, | ||||||
63 | grep {defined} | ||||||
64 | ( $parent->attr('class'), $parent->attr('id') ); | ||||||
65 | |||||||
66 | if ( $text_to_scan =~ m/\b(?:comment|meta|footer|footnote)\b/ ) { | ||||||
67 | $parents{$parent_id}->{readability} -= 50; | ||||||
68 | } elsif ( $text_to_scan | ||||||
69 | =~ m/\b(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)\b/ | ||||||
70 | ) { | ||||||
71 | $parents{$parent_id}->{readability} += 25; | ||||||
72 | } | ||||||
73 | } | ||||||
74 | |||||||
75 | # add point for each para found | ||||||
76 | $parents{$parent_id}->{readability}++; | ||||||
77 | |||||||
78 | # add a point for each comma found in the paragraph | ||||||
79 | foreach my $text_ref ( $p->content_refs_list ) { | ||||||
80 | my $num_commas = ( ${$text_ref} =~ m/,/g ); | ||||||
81 | $parents{$parent_id}->{readability} += $num_commas; | ||||||
82 | } | ||||||
83 | } | ||||||
84 | |||||||
85 | my $best_parent; | ||||||
86 | foreach my $id ( keys %parents ) { | ||||||
87 | if ( !$best_parent | ||||||
88 | || $parents{$id}->{readability} > $best_parent->{readability} ) { | ||||||
89 | $best_parent = $parents{$id}; | ||||||
90 | } | ||||||
91 | } | ||||||
92 | |||||||
93 | if ($best_parent) { | ||||||
94 | my $best_parent_element = $best_parent->{element}; | ||||||
95 | $best_parent_element->detach; | ||||||
96 | |||||||
97 | my $output; | ||||||
98 | if ( $options{output_type} eq 'tree' ) { | ||||||
99 | $output = $best_parent_element; | ||||||
100 | } elsif ( $options{output_type} eq 'html' ) { | ||||||
101 | $output = $best_parent_element->as_HTML; | ||||||
102 | } else { | ||||||
103 | $output = $best_parent_element->as_XML; | ||||||
104 | } | ||||||
105 | |||||||
106 | unless ( $options{output_type} eq 'tree' ) { | ||||||
107 | $output =~ s{^(.*)\s*$}{$1}s; # kill wrapping | ||||||
108 | $best_parent_element->delete; | ||||||
109 | } | ||||||
110 | |||||||
111 | return $output; | ||||||
112 | } else { | ||||||
113 | return; | ||||||
114 | } | ||||||
115 | } | ||||||
116 | |||||||
117 | =head1 NAME | ||||||
118 | |||||||
119 | HTML::ExtractMain - Extract the main content of a web page | ||||||
120 | |||||||
121 | =head1 VERSION | ||||||
122 | |||||||
123 | Version 0.63 | ||||||
124 | |||||||
125 | =cut | ||||||
126 | |||||||
127 | our $VERSION = '0.63'; | ||||||
128 | |||||||
129 | =head1 SYNOPSIS | ||||||
130 | |||||||
131 | use HTML::ExtractMain qw( extract_main_html ); | ||||||
132 | |||||||
133 | my $html = <<'END'; | ||||||
134 | Header |
||||||
135 | |||||||
136 | |
||||||
137 | Foo |
||||||
138 | Baz |
||||||
139 | |||||||
140 | |||||||
141 | END | ||||||
142 | |||||||
143 | my $main_html = extract_main_html($html, output_type => 'xhtml'); | ||||||
144 | if (defined $main_html) { | ||||||
145 | # do something with $main_html here | ||||||
146 | # $main_html is ' Foo Baz |
||||||
147 | } | ||||||
148 | |||||||
149 | =head1 EXPORT | ||||||
150 | |||||||
151 | C |
||||||
152 | |||||||
153 | =head1 FUNCTIONS | ||||||
154 | |||||||
155 | =head2 extract_main_html | ||||||
156 | |||||||
157 | C |
||||||
158 | algorithm to detect the main body of the page, usually skipping | ||||||
159 | headers, footers, navigation, etc. | ||||||
160 | |||||||
161 | The first argument is either an HTML string, or an | ||||||
162 | HTML::TreeBuilder tree. (If passed a tree, the tree will be modified | ||||||
163 | and destroyed.) | ||||||
164 | |||||||
165 | Remaining arguments are optional and represent key/value options. The | ||||||
166 | available options are: | ||||||
167 | |||||||
168 | =head3 output_type | ||||||
169 | |||||||
170 | This determines what format to return data in. If not specified then | ||||||
171 | xhtml format will be used. Valid formats are: | ||||||
172 | |||||||
173 | =over 4 | ||||||
174 | |||||||
175 | =item C |
||||||
176 | |||||||
177 | =item C | ||||||
178 | |||||||
179 | =item C |
||||||
180 | |||||||
181 | =back | ||||||
182 | |||||||
183 | If C |
||||||
184 | returned instead of a string. | ||||||
185 | |||||||
186 | If the HTML's main content is found, it's returned in the chosen | ||||||
187 | output format. The returned HTML/XHTML will I |
||||||
188 | in. (Source formatting, e.g. indentation, will be removed.) | ||||||
189 | |||||||
190 | If a most relevant block of content is not found, C |
||||||
191 | returns undef. | ||||||
192 | |||||||
193 | =cut | ||||||
194 | |||||||
195 | =head1 AUTHOR | ||||||
196 | |||||||
197 | Anirvan Chatterjee, C<< |
||||||
198 | |||||||
199 | =head1 BUGS | ||||||
200 | |||||||
201 | Please report any bugs or feature requests to | ||||||
202 | C |
||||||
203 | at L |
||||||
204 | I will be notified, and then you'll automatically be notified of | ||||||
205 | progress on your bug as I make changes. | ||||||
206 | |||||||
207 | =head1 SUPPORT | ||||||
208 | |||||||
209 | You can find documentation for this module with the perldoc command. | ||||||
210 | |||||||
211 | perldoc HTML::ExtractMain | ||||||
212 | |||||||
213 | You can also look for information at: | ||||||
214 | |||||||
215 | =over 4 | ||||||
216 | |||||||
217 | =item * RT: CPAN's request tracker | ||||||
218 | |||||||
219 | L |
||||||
220 | |||||||
221 | =item * AnnoCPAN: Annotated CPAN documentation | ||||||
222 | |||||||
223 | L |
||||||
224 | |||||||
225 | =item * CPAN Ratings | ||||||
226 | |||||||
227 | L |
||||||
228 | |||||||
229 | =item * Search CPAN | ||||||
230 | |||||||
231 | L |
||||||
232 | |||||||
233 | =back | ||||||
234 | |||||||
235 | =head1 SEE ALSO | ||||||
236 | |||||||
237 | =over 4 | ||||||
238 | |||||||
239 | =item * C |
||||||
240 | |||||||
241 | =item * C |
||||||
242 | |||||||
243 | =back | ||||||
244 | |||||||
245 | =head1 ACKNOWLEDGEMENTS | ||||||
246 | |||||||
247 | The Readability algorithm is ported from Arc90's JavaScript original, | ||||||
248 | built as part of the excellent Readability application, online at | ||||||
249 | L |
||||||
250 | L |
||||||
251 | |||||||
252 | =head1 COPYRIGHT & LICENSE | ||||||
253 | |||||||
254 | Copyright 2009-2013 Anirvan Chatterjee, Rupert Lane, kryde, all rights | ||||||
255 | reserved. | ||||||
256 | |||||||
257 | This program is free software; you can redistribute it and/or modify it | ||||||
258 | under the same terms as Perl itself. | ||||||
259 | |||||||
260 | =cut | ||||||
261 | |||||||
262 | 1; # End of HTML::ExtractMain | ||||||
263 | |||||||
264 | # Local Variables: | ||||||
265 | # mode: perltidy | ||||||
266 | # End: |