line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HTML::CruftText; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
33460
|
use 5.012; |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
67
|
|
4
|
2
|
|
|
2
|
|
12
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
60
|
|
5
|
2
|
|
|
2
|
|
11
|
use warnings; |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
82
|
|
6
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
10104
|
use Time::HiRes; |
|
2
|
|
|
|
|
12872
|
|
|
2
|
|
|
|
|
13
|
|
8
|
2
|
|
|
2
|
|
2524
|
use List::MoreUtils qw(first_index indexes last_index); |
|
2
|
|
|
|
|
3068
|
|
|
2
|
|
|
|
|
5686
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# STATICS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# markers -- patterns used to find lines than can help find the text |
13
|
|
|
|
|
|
|
my $_MARKER_PATTERNS = { |
14
|
|
|
|
|
|
|
startclickprintinclude => qr/<\!--\s*startclickprintinclude/pi, |
15
|
|
|
|
|
|
|
endclickprintinclude => qr/<\!--\s*endclickprintinclude/pi, |
16
|
|
|
|
|
|
|
startclickprintexclude => qr/<\!--\s*startclickprintexclude/pi, |
17
|
|
|
|
|
|
|
endclickprintexclude => qr/<\!--\s*endclickprintexclude/pi, |
18
|
|
|
|
|
|
|
sphereitbegin => qr/<\!--\s*DISABLEsphereit\s*start/i, |
19
|
|
|
|
|
|
|
sphereitend => qr/<\!--\s*DISABLEsphereit\s*end/i, |
20
|
|
|
|
|
|
|
body => qr/
|
21
|
|
|
|
|
|
|
comment => qr/(id|class)="[^"]*comment[^"]*"/i, |
22
|
|
|
|
|
|
|
}; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
#TODO handle sphereit like we're now handling CLickprint. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# blank everything within these elements |
27
|
|
|
|
|
|
|
my $_SCRUB_TAGS = [ qw/script style frame applet textarea/ ]; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub _remove_everything_except_newlines($) |
30
|
|
|
|
|
|
|
{ |
31
|
0
|
|
|
0
|
|
|
my $data = shift; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Retain the number of newlines |
34
|
0
|
|
|
|
|
|
my $newlines = ($data =~ tr/\n//); |
35
|
|
|
|
|
|
|
|
36
|
0
|
|
|
|
|
|
return "\n" x $newlines; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
my $_process_html_comment_regex_clickprint_comments = qr/^\s*(start|end)clickprint(in|ex)clude/ios; |
41
|
|
|
|
|
|
|
my $_process_html_comment_regex_brackets = qr/[<>]/os; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub _process_html_comment($) |
44
|
|
|
|
|
|
|
{ |
45
|
0
|
|
|
0
|
|
|
my $data = shift; |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Don't touch clickprint comments |
48
|
0
|
0
|
|
|
|
|
if ($data =~ $_process_html_comment_regex_clickprint_comments) { |
49
|
0
|
|
|
|
|
|
return $data; |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Replace ">" and "<" to "|" |
53
|
0
|
|
|
|
|
|
$data =~ s/$_process_html_comment_regex_brackets/|/g; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Prepend every line with comment (not precompiled because trivial) |
56
|
0
|
|
|
|
|
|
$data =~ s/\n/ -->\n/ios; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub _remove_tags_in_comments($) |
67
|
|
|
|
|
|
|
{ |
68
|
0
|
|
|
0
|
|
|
my $lines = shift; |
69
|
|
|
|
|
|
|
|
70
|
0
|
|
|
|
|
|
my $html = join("\n", @{ $lines }); |
|
0
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Remove ">" and "<" in comments |
73
|
0
|
|
|
|
|
|
$html =~ s/$_remove_tags_in_comments_regex_html_comment/''/eg; |
|
0
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
0
|
|
|
|
|
|
$lines = [ split("\n", $html) ]; |
76
|
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
|
return $lines; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# make sure that all tags start and close on one line |
81
|
|
|
|
|
|
|
# by adding false <>s as necessary, eg: |
82
|
|
|
|
|
|
|
# |
83
|
|
|
|
|
|
|
#
|
84
|
|
|
|
|
|
|
# bar> |
85
|
|
|
|
|
|
|
# |
86
|
|
|
|
|
|
|
# becomes |
87
|
|
|
|
|
|
|
# |
88
|
|
|
|
|
|
|
# |
89
|
|
|
|
|
|
|
# |
90
|
|
|
|
|
|
|
# |
91
|
|
|
|
|
|
|
sub _fix_multiline_tags |
92
|
|
|
|
|
|
|
{ |
93
|
0
|
|
|
0
|
|
|
my ( $lines ) = @_; |
94
|
|
|
|
|
|
|
|
95
|
0
|
|
|
|
|
|
my $add_start_tag; |
96
|
0
|
|
|
|
|
|
for ( my $i = 0 ; $i < @{ $lines } ; $i++ ) |
|
0
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
{ |
98
|
0
|
0
|
|
|
|
|
if ( $add_start_tag ) |
99
|
|
|
|
|
|
|
{ |
100
|
0
|
|
|
|
|
|
$lines->[ $i ] = "<$add_start_tag " . $lines->[ $i ]; |
101
|
0
|
|
|
|
|
|
$add_start_tag = undef; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
0
|
0
|
|
|
|
|
if ( $lines->[ $i ] =~ /<([^ >]*)[^>]*$/ ) |
105
|
|
|
|
|
|
|
{ |
106
|
0
|
|
|
|
|
|
$add_start_tag = $1; |
107
|
0
|
|
|
|
|
|
$lines->[ $i ] .= ' >'; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
#remove all text not within the tag |
113
|
|
|
|
|
|
|
#Note: Some badly formated web pages will have multiple tags or will not have an open tag. |
114
|
|
|
|
|
|
|
#We go the conservative thing of only deleting stuff before the first tag and stuff after the last tag. |
115
|
|
|
|
|
|
|
sub _remove_nonbody_text |
116
|
|
|
|
|
|
|
{ |
117
|
0
|
|
|
0
|
|
|
my ( $lines ) = @_; |
118
|
|
|
|
|
|
|
|
119
|
0
|
|
|
|
|
|
my $add_start_tag; |
120
|
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
|
my $state = 'before_body'; |
122
|
|
|
|
|
|
|
|
123
|
0
|
|
|
0
|
|
|
my $body_open_tag_line_number = first_index { $_ =~ /
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
0
|
0
|
|
|
|
|
if ( $body_open_tag_line_number != -1 ) |
126
|
|
|
|
|
|
|
{ |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
#delete everything before |
129
|
0
|
|
|
|
|
|
for ( my $line_number_to_clear = 0 ; $line_number_to_clear < $body_open_tag_line_number ; $line_number_to_clear++ ) |
130
|
|
|
|
|
|
|
{ |
131
|
0
|
|
|
|
|
|
$lines->[ $line_number_to_clear ] = ''; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
$lines->[ $body_open_tag_line_number ] =~ s/^.*?\
|
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
0
|
|
|
0
|
|
|
my $body_close_tag_line_number = last_index { $_ =~ /<\/body/i } @{ $lines }; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
|
139
|
0
|
0
|
|
|
|
|
if ( $body_close_tag_line_number != -1 ) |
140
|
|
|
|
|
|
|
{ |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
#delete everything after |
143
|
|
|
|
|
|
|
|
144
|
0
|
|
|
|
|
|
$lines->[ $body_close_tag_line_number ] =~ s/<\/body>.*/<\/body>/i; |
145
|
0
|
|
|
|
|
|
for ( |
146
|
0
|
|
|
|
|
|
my $line_number_to_clear = ( $body_close_tag_line_number + 1 ) ; |
147
|
|
|
|
|
|
|
$line_number_to_clear < scalar( @{ $lines } ) ; |
148
|
|
|
|
|
|
|
$line_number_to_clear++ |
149
|
|
|
|
|
|
|
) |
150
|
|
|
|
|
|
|
{ |
151
|
0
|
|
|
|
|
|
$lines->[ $line_number_to_clear ] = ''; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub _clickprint_start_line |
157
|
|
|
|
|
|
|
{ |
158
|
0
|
|
|
0
|
|
|
my ( $lines ) = @_; |
159
|
|
|
|
|
|
|
|
160
|
0
|
|
|
|
|
|
my $i = 0; |
161
|
|
|
|
|
|
|
|
162
|
0
|
|
|
|
|
|
my $found_clickprint = 0; |
163
|
|
|
|
|
|
|
|
164
|
0
|
|
0
|
|
|
|
while ( ( $i < @{ $lines } ) && !$found_clickprint ) |
|
0
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
{ |
166
|
0
|
0
|
|
|
|
|
if ( $lines->[ $i ] =~ $_MARKER_PATTERNS->{ startclickprintinclude } ) |
167
|
|
|
|
|
|
|
{ |
168
|
0
|
|
|
|
|
|
$found_clickprint = 1; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
else |
171
|
|
|
|
|
|
|
{ |
172
|
0
|
|
|
|
|
|
$i++; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
0
|
0
|
|
|
|
|
if ( !$found_clickprint ) |
177
|
|
|
|
|
|
|
{ |
178
|
0
|
|
|
|
|
|
return; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
else |
181
|
|
|
|
|
|
|
{ |
182
|
0
|
|
|
|
|
|
return $i; |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub _remove_nonclickprint_text |
188
|
|
|
|
|
|
|
{ |
189
|
0
|
|
|
0
|
|
|
my ( $lines, $clickprintmap ) = @_; |
190
|
|
|
|
|
|
|
|
191
|
0
|
|
|
|
|
|
my $clickprint_start_line = _clickprint_start_line( $lines ); |
192
|
|
|
|
|
|
|
|
193
|
0
|
0
|
|
|
|
|
return if !defined( $clickprint_start_line ); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# blank out all line before the first click_print |
196
|
|
|
|
|
|
|
|
197
|
0
|
|
|
|
|
|
for ( my $j = 0 ; $j < $clickprint_start_line ; $j++ ) |
198
|
|
|
|
|
|
|
{ |
199
|
0
|
|
|
|
|
|
$lines->[ $j ] = ''; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
|
my $i = $clickprint_start_line; |
203
|
|
|
|
|
|
|
|
204
|
0
|
|
|
|
|
|
my $current_substring = \$lines->[ $i ]; |
205
|
0
|
|
|
|
|
|
my $state = "before_clickprint"; |
206
|
|
|
|
|
|
|
|
207
|
0
|
|
|
|
|
|
while ( $i < @{ $lines } ) |
|
0
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
{ |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# print |
211
|
|
|
|
|
|
|
# "i = $i state = $state current_substring = $$current_substring \n"; |
212
|
|
|
|
|
|
|
|
213
|
0
|
0
|
|
|
|
|
if ( $state eq "before_clickprint" ) |
214
|
|
|
|
|
|
|
{ |
215
|
0
|
0
|
|
|
|
|
if ( $$current_substring =~ $_MARKER_PATTERNS->{ startclickprintinclude } ) |
216
|
|
|
|
|
|
|
{ |
217
|
0
|
|
|
|
|
|
$$current_substring =~ |
218
|
|
|
|
|
|
|
"s/.*?$_MARKER_PATTERNS->{startclickprintinclude}/$_MARKER_PATTERNS->{startclickprintinclude}/p"; |
219
|
|
|
|
|
|
|
|
220
|
0
|
|
|
|
|
|
$$current_substring =~ $_MARKER_PATTERNS->{ startclickprintinclude }; |
221
|
|
|
|
|
|
|
|
222
|
0
|
|
|
|
|
|
$current_substring = \substr( $$current_substring, length( ${^PREMATCH} ) + length( ${^MATCH} ) ); |
223
|
|
|
|
|
|
|
|
224
|
0
|
|
|
|
|
|
$current_substring = \_get_string_after_comment_end_tags( $current_substring ); |
225
|
|
|
|
|
|
|
|
226
|
0
|
|
|
|
|
|
$state = "in_click_print"; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
else |
229
|
|
|
|
|
|
|
{ |
230
|
0
|
|
|
|
|
|
$$current_substring = ''; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
0
|
0
|
|
|
|
|
if ( $state eq 'in_click_print' ) |
235
|
|
|
|
|
|
|
{ |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# print "in_click_print\n"; |
238
|
0
|
0
|
|
|
|
|
if ( $$current_substring =~ $_MARKER_PATTERNS->{ startclickprintexclude } ) |
|
|
0
|
|
|
|
|
|
239
|
|
|
|
|
|
|
{ |
240
|
0
|
|
|
|
|
|
$current_substring = \substr( $$current_substring, length( ${^MATCH} ) + length( ${^PREMATCH} ) ); |
241
|
|
|
|
|
|
|
|
242
|
0
|
|
|
|
|
|
$current_substring = \_get_string_after_comment_end_tags( $current_substring ); |
243
|
0
|
|
|
|
|
|
$state = "in_click_print_exclude"; |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
elsif ( $$current_substring =~ $_MARKER_PATTERNS->{ endclickprintinclude } ) |
247
|
|
|
|
|
|
|
{ |
248
|
0
|
|
|
|
|
|
$current_substring = \substr( $$current_substring, length( ${^MATCH} ) + length( ${^PREMATCH} ) ); |
249
|
|
|
|
|
|
|
|
250
|
0
|
|
|
|
|
|
$current_substring = \_get_string_after_comment_end_tags( $current_substring ); |
251
|
|
|
|
|
|
|
|
252
|
0
|
|
|
|
|
|
$state = 'before_clickprint'; |
253
|
0
|
|
|
|
|
|
next; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
0
|
0
|
|
|
|
|
if ( $state eq 'in_click_print_exclude' ) |
258
|
|
|
|
|
|
|
{ |
259
|
0
|
0
|
|
|
|
|
if ( $$current_substring =~ $_MARKER_PATTERNS->{ endclickprintexclude } ) |
260
|
|
|
|
|
|
|
{ |
261
|
0
|
|
|
|
|
|
my $index = index( $$current_substring, $_MARKER_PATTERNS->{ endclickprintexclude } ); |
262
|
|
|
|
|
|
|
|
263
|
0
|
|
|
|
|
|
substr( $$current_substring, 0, length( ${^PREMATCH} ), '' ); |
264
|
|
|
|
|
|
|
|
265
|
0
|
|
|
|
|
|
$current_substring = \substr( $$current_substring, length( ${^MATCH} ) ); |
266
|
|
|
|
|
|
|
|
267
|
0
|
|
|
|
|
|
$current_substring = \_get_string_after_comment_end_tags( $current_substring ); |
268
|
|
|
|
|
|
|
|
269
|
0
|
|
|
|
|
|
$state = "in_click_print"; |
270
|
0
|
|
|
|
|
|
next; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
else |
273
|
|
|
|
|
|
|
{ |
274
|
0
|
|
|
|
|
|
$$current_substring = ''; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
0
|
|
|
|
|
|
$i++; |
279
|
0
|
0
|
|
|
|
|
if ( $i < @{ $lines } ) |
|
0
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
{ |
281
|
0
|
|
|
|
|
|
$current_substring = \$lines->[ $i ]; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
sub _get_string_after_comment_end_tags |
287
|
|
|
|
|
|
|
{ |
288
|
0
|
|
|
0
|
|
|
my ( $current_substring, $i ) = @_; |
289
|
|
|
|
|
|
|
|
290
|
0
|
|
|
|
|
|
my $comment_end_pos = 0; |
291
|
|
|
|
|
|
|
|
292
|
0
|
0
|
|
|
|
|
if ( $$current_substring =~ /^\s*-->/p ) |
293
|
|
|
|
|
|
|
{ |
294
|
0
|
|
|
|
|
|
$comment_end_pos = length( ${^MATCH} ); |
295
|
|
|
|
|
|
|
} |
296
|
0
|
|
|
|
|
|
return substr( $$current_substring, $comment_end_pos ); |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# remove text wthin script, style, iframe, applet, and textarea tags |
300
|
|
|
|
|
|
|
sub _remove_script_text |
301
|
|
|
|
|
|
|
{ |
302
|
0
|
|
|
0
|
|
|
my ( $lines ) = @_; |
303
|
|
|
|
|
|
|
|
304
|
0
|
|
|
|
|
|
my $state = 'text'; |
305
|
0
|
|
|
|
|
|
my $start_scrub_tag_name; |
306
|
|
|
|
|
|
|
|
307
|
0
|
|
|
|
|
|
for ( my $i = 0 ; $i < @{ $lines } ; $i++ ) |
|
0
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
{ |
309
|
0
|
|
|
|
|
|
my $line = $lines->[ $i ]; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
#print "line $i: $line\n"; |
312
|
0
|
|
|
|
|
|
my @scrubs; |
313
|
0
|
|
|
|
|
|
my $start_scrub_pos = 0; |
314
|
0
|
|
|
|
|
|
while ( $line =~ /(<(\/?[a-z]+)[^>]*>)/gi ) |
315
|
|
|
|
|
|
|
{ |
316
|
0
|
|
|
|
|
|
my $tag = $1; |
317
|
0
|
|
|
|
|
|
my $tag_name = $2; |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
#print "found tag $tag_name\n"; |
320
|
0
|
0
|
|
|
|
|
if ( $state eq 'text' ) |
|
|
0
|
|
|
|
|
|
321
|
|
|
|
|
|
|
{ |
322
|
0
|
0
|
|
|
|
|
if ( grep { lc( $tag_name ) eq $_ } @{ $_SCRUB_TAGS } ) |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
{ |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
#print "found scrub tag\n"; |
326
|
0
|
|
|
|
|
|
$state = 'scrub_text'; |
327
|
0
|
|
|
|
|
|
$start_scrub_pos = pos( $line ); |
328
|
0
|
|
|
|
|
|
$start_scrub_tag_name = $tag_name; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
elsif ( $state eq 'scrub_text' ) |
332
|
|
|
|
|
|
|
{ |
333
|
0
|
0
|
|
|
|
|
if ( lc( $tag_name ) eq lc( "/$start_scrub_tag_name" ) ) |
334
|
|
|
|
|
|
|
{ |
335
|
0
|
|
|
|
|
|
$state = 'text'; |
336
|
0
|
|
|
|
|
|
my $end_scrub_pos = pos( $line ) - length( $tag ); |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# delay actual scrubbing of text until the end so that we don't |
339
|
|
|
|
|
|
|
# have to reset the position of the state machine |
340
|
0
|
|
|
|
|
|
push( @scrubs, [ $start_scrub_pos, $end_scrub_pos - $start_scrub_pos ] ); |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
0
|
0
|
|
|
|
|
if ( $state eq 'scrub_text' ) |
346
|
|
|
|
|
|
|
{ |
347
|
0
|
|
|
|
|
|
push( @scrubs, [ $start_scrub_pos, length( $line ) - $start_scrub_pos ] ); |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
|
350
|
0
|
|
|
|
|
|
my $scrubbed_length = 0; |
351
|
0
|
|
|
|
|
|
for my $scrub ( @scrubs ) |
352
|
|
|
|
|
|
|
{ |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
#print "scrub line $i\n"; |
355
|
0
|
|
|
|
|
|
substr( $lines->[ $i ], $scrub->[ 0 ] - $scrubbed_length, $scrub->[ 1 ] ) = ''; |
356
|
0
|
|
|
|
|
|
$scrubbed_length += $scrub->[ 1 ]; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
#print "scrubbed line: $lines->[$i]\n"; |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
my $_start_time; |
365
|
|
|
|
|
|
|
my $_last_time; |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub _print_time |
368
|
|
|
|
|
|
|
{ |
369
|
0
|
|
|
0
|
|
|
return; |
370
|
|
|
|
|
|
|
|
371
|
0
|
|
|
|
|
|
my ( $s ) = @_; |
372
|
|
|
|
|
|
|
|
373
|
0
|
|
|
|
|
|
my $t = Time::HiRes::gettimeofday(); |
374
|
0
|
|
0
|
|
|
|
$_start_time ||= $t; |
375
|
0
|
|
0
|
|
|
|
$_last_time ||= $t; |
376
|
|
|
|
|
|
|
|
377
|
0
|
|
|
|
|
|
my $elapsed = $t - $_start_time; |
378
|
0
|
|
|
|
|
|
my $incremental = $t - $_last_time; |
379
|
|
|
|
|
|
|
|
380
|
0
|
|
|
|
|
|
printf( STDERR "time $s: %f elapsed %f incremental\n", $elapsed, $incremental ); |
381
|
|
|
|
|
|
|
|
382
|
0
|
|
|
|
|
|
$_last_time = $t; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=head1 NAME |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
HTML::CruftText - Remove unuseful text from HTML |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=head1 VERSION |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Version 0.02 |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=cut |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head1 SYNOPSIS |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
Removes junk from HTML page text. |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
This module uses a regular expression based approach to remove cruft from HTML. I.e. content/text that is very unlikely to be useful or interesting. |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
use HTML::CruftText; |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
open (my $MYINPUTFILE, '
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
my @lines = <$MYINPUTFILE>; |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
my $de_crufted_lines = HTML::CruftText::clearCruftText( \@lines); |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
... |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=head1 DESCRIPTION |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
This module was developed for the Media Cloud project (http://mediacloud.org) as the first step in differentiating article text from ads, navigation, and other boilerplate text. Its approach is very conservative and almost never removes legitimate article text. However, it still leaves in a lot of cruft so many users will want to do additional processing. |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
Typically, the clearCruftText method is called with an array reference containing the lines of an HTML file. Each line is then altered so that the cruft text is removed. After completion some lines will be entirely blank, while others will have certain text removed. In a few rare cases, additional HTML tags are added. The result is NOT GUARANTEED to be valid, balanced HTML though some HTML is retained because it is extremely useful for further processing. Thus some users will want to run an HTML stripper over the results. |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
The following tactics are used to remove cruft text: |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
* Nonbody text --anything outside of the tags -- is removed |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
* Text within the following tags is removed: |