File Coverage

blib/lib/HTML/CruftText.pm

Criterion	Covered	Total	%
statement	15	169	8.8
branch	0	46	0.0
condition	0	9	0.0
subroutine	5	19	26.3
pod	2	2	100.0
total	22	245	8.9

line	stmt	bran	cond	sub	time	code
1						package HTML::CruftText;
2
3	2			2	33460	use 5.012;
	2				8
	2				67
4	2			2	12	use strict;
	2				3
	2				60
5	2			2	11	use warnings;
	2				8
	2				82
6
7	2			2	10104	use Time::HiRes;
	2				12872
	2				13
8	2			2	2524	use List::MoreUtils qw(first_index indexes last_index);
	2				3068
	2				5686
9
10						# STATICS
11
12						# markers -- patterns used to find lines than can help find the text
13						my $_MARKER_PATTERNS = {
14						startclickprintinclude => qr/<\!--\s*startclickprintinclude/pi,
15						endclickprintinclude => qr/<\!--\s*endclickprintinclude/pi,
16						startclickprintexclude => qr/<\!--\s*startclickprintexclude/pi,
17						endclickprintexclude => qr/<\!--\s*endclickprintexclude/pi,
18						sphereitbegin => qr/<\!--\sDISABLEsphereit\sstart/i,
19						sphereitend => qr/<\!--\sDISABLEsphereit\send/i,
20						body => qr/
21						comment => qr/(id\|class)="[^"]comment[^"]"/i,
22						};
23
24						#TODO handle sphereit like we're now handling CLickprint.
25
26						# blank everything within these elements
27						my $_SCRUB_TAGS = [ qw/script style frame applet textarea/ ];
28
29						sub _remove_everything_except_newlines($)
30						{
31	0			0		my $data = shift;
32
33						# Retain the number of newlines
34	0					my $newlines = ($data =~ tr/\n//);
35
36	0					return "\n" x $newlines;
37						}
38
39
40						my $_process_html_comment_regex_clickprint_comments = qr/^\s*(start\|end)clickprint(in\|ex)clude/ios;
41						my $_process_html_comment_regex_brackets = qr/[<>]/os;
42
43						sub _process_html_comment($)
44						{
45	0			0		my $data = shift;
46
47						# Don't touch clickprint comments
48	0	0				if ($data =~ $_process_html_comment_regex_clickprint_comments) {
49	0					return $data;
50						}
51
52						# Replace ">" and "<" to "\|"
53	0					$data =~ s/$_process_html_comment_regex_brackets/\|/g;
54
55						# Prepend every line with comment (not precompiled because trivial)
56	0					$data =~ s/\n/ -->\n/ios;
65
66						sub _remove_tags_in_comments($)
67						{
68	0			0		my $lines = shift;
69
70	0					my $html = join("\n", @{ $lines });
	0
71
72						# Remove ">" and "<" in comments
73	0					$html =~ s/$_remove_tags_in_comments_regex_html_comment/''/eg;
	0
74
75	0					$lines = [ split("\n", $html) ];
76
77	0					return $lines;
78						}
79
80						# make sure that all tags start and close on one line
81						# by adding false <>s as necessary, eg:
82						#
83						#
84						# bar>
85						#
86						# becomes
87						#
88						#
89						#
90						#
91						sub _fix_multiline_tags
92						{
93	0			0		my ( $lines ) = @_;
94
95	0					my $add_start_tag;
96	0					for ( my $i = 0 ; $i < @{ $lines } ; $i++ )
	0
97						{
98	0	0				if ( $add_start_tag )
99						{
100	0					$lines->[ $i ] = "<$add_start_tag " . $lines->[ $i ];
101	0					$add_start_tag = undef;
102						}
103
104	0	0				if ( $lines->[ $i ] =~ /<([^ >])[^>]$/ )
105						{
106	0					$add_start_tag = $1;
107	0					$lines->[ $i ] .= ' >';
108						}
109						}
110						}
111
112						#remove all text not within the tag
113						#Note: Some badly formated web pages will have multiple tags or will not have an open tag.
114						#We go the conservative thing of only deleting stuff before the first tag and stuff after the last tag.
115						sub _remove_nonbody_text
116						{
117	0			0		my ( $lines ) = @_;
118
119	0					my $add_start_tag;
120
121	0					my $state = 'before_body';
122
123	0			0		my $body_open_tag_line_number = first_index { $_ =~ /
	0
	0
124
125	0	0				if ( $body_open_tag_line_number != -1 )
126						{
127
128						#delete everything before
129	0					for ( my $line_number_to_clear = 0 ; $line_number_to_clear < $body_open_tag_line_number ; $line_number_to_clear++ )
130						{
131	0					$lines->[ $line_number_to_clear ] = '';
132						}
133
134	0					$lines->[ $body_open_tag_line_number ] =~ s/^.*?\
135						}
136
137	0			0		my $body_close_tag_line_number = last_index { $_ =~ /<\/body/i } @{ $lines };
	0
	0
138
139	0	0				if ( $body_close_tag_line_number != -1 )
140						{
141
142						#delete everything after
143
144	0					$lines->[ $body_close_tag_line_number ] =~ s/<\/body>.*/<\/body>/i;
145	0					for (
146	0					my $line_number_to_clear = ( $body_close_tag_line_number + 1 ) ;
147						$line_number_to_clear < scalar( @{ $lines } ) ;
148						$line_number_to_clear++
149						)
150						{
151	0					$lines->[ $line_number_to_clear ] = '';
152						}
153						}
154						}
155
156						sub _clickprint_start_line
157						{
158	0			0		my ( $lines ) = @_;
159
160	0					my $i = 0;
161
162	0					my $found_clickprint = 0;
163
164	0		0			while ( ( $i < @{ $lines } ) && !$found_clickprint )
	0
165						{
166	0	0				if ( $lines->[ $i ] =~ $_MARKER_PATTERNS->{ startclickprintinclude } )
167						{
168	0					$found_clickprint = 1;
169						}
170						else
171						{
172	0					$i++;
173						}
174						}
175
176	0	0				if ( !$found_clickprint )
177						{
178	0					return;
179						}
180						else
181						{
182	0					return $i;
183
184						}
185						}
186
187						sub _remove_nonclickprint_text
188						{
189	0			0		my ( $lines, $clickprintmap ) = @_;
190
191	0					my $clickprint_start_line = _clickprint_start_line( $lines );
192
193	0	0				return if !defined( $clickprint_start_line );
194
195						# blank out all line before the first click_print
196
197	0					for ( my $j = 0 ; $j < $clickprint_start_line ; $j++ )
198						{
199	0					$lines->[ $j ] = '';
200						}
201
202	0					my $i = $clickprint_start_line;
203
204	0					my $current_substring = \$lines->[ $i ];
205	0					my $state = "before_clickprint";
206
207	0					while ( $i < @{ $lines } )
	0
208						{
209
210						# print
211						# "i = $i state = $state current_substring = $$current_substring \n";
212
213	0	0				if ( $state eq "before_clickprint" )
214						{
215	0	0				if ( $$current_substring =~ $_MARKER_PATTERNS->{ startclickprintinclude } )
216						{
217	0					$$current_substring =~
218						"s/.*?$_MARKER_PATTERNS->{startclickprintinclude}/$_MARKER_PATTERNS->{startclickprintinclude}/p";
219
220	0					$$current_substring =~ $_MARKER_PATTERNS->{ startclickprintinclude };
221
222	0					$current_substring = \substr( $$current_substring, length( ${^PREMATCH} ) + length( ${^MATCH} ) );
223
224	0					$current_substring = \_get_string_after_comment_end_tags( $current_substring );
225
226	0					$state = "in_click_print";
227						}
228						else
229						{
230	0					$$current_substring = '';
231						}
232						}
233
234	0	0				if ( $state eq 'in_click_print' )
235						{
236
237						# print "in_click_print\n";
238	0	0				if ( $$current_substring =~ $_MARKER_PATTERNS->{ startclickprintexclude } )
		0
239						{
240	0					$current_substring = \substr( $$current_substring, length( ${^MATCH} ) + length( ${^PREMATCH} ) );
241
242	0					$current_substring = \_get_string_after_comment_end_tags( $current_substring );
243	0					$state = "in_click_print_exclude";
244
245						}
246						elsif ( $$current_substring =~ $_MARKER_PATTERNS->{ endclickprintinclude } )
247						{
248	0					$current_substring = \substr( $$current_substring, length( ${^MATCH} ) + length( ${^PREMATCH} ) );
249
250	0					$current_substring = \_get_string_after_comment_end_tags( $current_substring );
251
252	0					$state = 'before_clickprint';
253	0					next;
254						}
255						}
256
257	0	0				if ( $state eq 'in_click_print_exclude' )
258						{
259	0	0				if ( $$current_substring =~ $_MARKER_PATTERNS->{ endclickprintexclude } )
260						{
261	0					my $index = index( $$current_substring, $_MARKER_PATTERNS->{ endclickprintexclude } );
262
263	0					substr( $$current_substring, 0, length( ${^PREMATCH} ), '' );
264
265	0					$current_substring = \substr( $$current_substring, length( ${^MATCH} ) );
266
267	0					$current_substring = \_get_string_after_comment_end_tags( $current_substring );
268
269	0					$state = "in_click_print";
270	0					next;
271						}
272						else
273						{
274	0					$$current_substring = '';
275						}
276						}
277
278	0					$i++;
279	0	0				if ( $i < @{ $lines } )
	0
280						{
281	0					$current_substring = \$lines->[ $i ];
282						}
283						}
284						}
285
286						sub _get_string_after_comment_end_tags
287						{
288	0			0		my ( $current_substring, $i ) = @_;
289
290	0					my $comment_end_pos = 0;
291
292	0	0				if ( $$current_substring =~ /^\s*-->/p )
293						{
294	0					$comment_end_pos = length( ${^MATCH} );
295						}
296	0					return substr( $$current_substring, $comment_end_pos );
297						}
298
299						# remove text wthin script, style, iframe, applet, and textarea tags
300						sub _remove_script_text
301						{
302	0			0		my ( $lines ) = @_;
303
304	0					my $state = 'text';
305	0					my $start_scrub_tag_name;
306
307	0					for ( my $i = 0 ; $i < @{ $lines } ; $i++ )
	0
308						{
309	0					my $line = $lines->[ $i ];
310
311						#print "line $i: $line\n";
312	0					my @scrubs;
313	0					my $start_scrub_pos = 0;
314	0					while ( $line =~ /(<(\/?[a-z]+)[^>]*>)/gi )
315						{
316	0					my $tag = $1;
317	0					my $tag_name = $2;
318
319						#print "found tag $tag_name\n";
320	0	0				if ( $state eq 'text' )
		0
321						{
322	0	0				if ( grep { lc( $tag_name ) eq $_ } @{ $_SCRUB_TAGS } )
	0
	0
323						{
324
325						#print "found scrub tag\n";
326	0					$state = 'scrub_text';
327	0					$start_scrub_pos = pos( $line );
328	0					$start_scrub_tag_name = $tag_name;
329						}
330						}
331						elsif ( $state eq 'scrub_text' )
332						{
333	0	0				if ( lc( $tag_name ) eq lc( "/$start_scrub_tag_name" ) )
334						{
335	0					$state = 'text';
336	0					my $end_scrub_pos = pos( $line ) - length( $tag );
337
338						# delay actual scrubbing of text until the end so that we don't
339						# have to reset the position of the state machine
340	0					push( @scrubs, [ $start_scrub_pos, $end_scrub_pos - $start_scrub_pos ] );
341						}
342						}
343						}
344
345	0	0				if ( $state eq 'scrub_text' )
346						{
347	0					push( @scrubs, [ $start_scrub_pos, length( $line ) - $start_scrub_pos ] );
348						}
349
350	0					my $scrubbed_length = 0;
351	0					for my $scrub ( @scrubs )
352						{
353
354						#print "scrub line $i\n";
355	0					substr( $lines->[ $i ], $scrub->[ 0 ] - $scrubbed_length, $scrub->[ 1 ] ) = '';
356	0					$scrubbed_length += $scrub->[ 1 ];
357						}
358
359						#print "scrubbed line: $lines->[$i]\n";
360						}
361						}
362
363
364						my $_start_time;
365						my $_last_time;
366
367						sub _print_time
368						{
369	0			0		return;
370
371	0					my ( $s ) = @_;
372
373	0					my $t = Time::HiRes::gettimeofday();
374	0		0			$_start_time \|\|= $t;
375	0		0			$_last_time \|\|= $t;
376
377	0					my $elapsed = $t - $_start_time;
378	0					my $incremental = $t - $_last_time;
379
380	0					printf( STDERR "time $s: %f elapsed %f incremental\n", $elapsed, $incremental );
381
382	0					$_last_time = $t;
383						}
384
385						=head1 NAME
386
387						HTML::CruftText - Remove unuseful text from HTML
388
389						=head1 VERSION
390
391						Version 0.02
392
393						=cut
394
395						our $VERSION = '0.02';
396
397
398						=head1 SYNOPSIS
399
400						Removes junk from HTML page text.
401
402						This module uses a regular expression based approach to remove cruft from HTML. I.e. content/text that is very unlikely to be useful or interesting.
403
404
405						use HTML::CruftText;
406
407						open (my $MYINPUTFILE, '
408
409						my @lines = <$MYINPUTFILE>;
410
411						my $de_crufted_lines = HTML::CruftText::clearCruftText( \@lines);
412
413						...
414
415						=head1 DESCRIPTION
416
417						This module was developed for the Media Cloud project (http://mediacloud.org) as the first step in differentiating article text from ads, navigation, and other boilerplate text. Its approach is very conservative and almost never removes legitimate article text. However, it still leaves in a lot of cruft so many users will want to do additional processing.
418
419						Typically, the clearCruftText method is called with an array reference containing the lines of an HTML file. Each line is then altered so that the cruft text is removed. After completion some lines will be entirely blank, while others will have certain text removed. In a few rare cases, additional HTML tags are added. The result is NOT GUARANTEED to be valid, balanced HTML though some HTML is retained because it is extremely useful for further processing. Thus some users will want to run an HTML stripper over the results.
420
421						The following tactics are used to remove cruft text:
422
423						* Nonbody text --anything outside of the tags -- is removed
424
425						* Text within the following tags is removed: