line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Copyright (c) 1996-1998 LUB NetLab |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
4
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
5
|
|
|
|
|
|
|
# the Free Software Foundation; either version 1, or (at your option) |
6
|
|
|
|
|
|
|
# any later version. |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
9
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
10
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11
|
|
|
|
|
|
|
# GNU General Public License for more details. |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
14
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
15
|
|
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
16
|
|
|
|
|
|
|
# |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# NO WARRANTY |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
21
|
|
|
|
|
|
|
# FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
22
|
|
|
|
|
|
|
# OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
23
|
|
|
|
|
|
|
# PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED |
24
|
|
|
|
|
|
|
# OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
25
|
|
|
|
|
|
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS |
26
|
|
|
|
|
|
|
# TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE |
27
|
|
|
|
|
|
|
# PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, |
28
|
|
|
|
|
|
|
# REPAIR OR CORRECTION. |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
# IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
31
|
|
|
|
|
|
|
# WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
32
|
|
|
|
|
|
|
# REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, |
33
|
|
|
|
|
|
|
# INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING |
34
|
|
|
|
|
|
|
# OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED |
35
|
|
|
|
|
|
|
# TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY |
36
|
|
|
|
|
|
|
# YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER |
37
|
|
|
|
|
|
|
# PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE |
38
|
|
|
|
|
|
|
# POSSIBILITY OF SUCH DAMAGES. |
39
|
|
|
|
|
|
|
# |
40
|
|
|
|
|
|
|
# Copyright (c) 1996-1998 LUB NetLab |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# $Id: FromHTML.pm 292 2008-11-08 08:54:11Z it-aar $ |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
package Combine::FromHTML; |
45
|
|
|
|
|
|
|
|
46
|
1
|
|
|
1
|
|
824
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
48
|
|
47
|
1
|
|
|
1
|
|
6
|
use Combine::Config; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
22
|
|
48
|
1
|
|
|
1
|
|
5
|
use HTTP::Date; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
49
|
|
49
|
1
|
|
|
1
|
|
6
|
use URI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
50
|
1
|
|
|
1
|
|
6
|
use URI::Escape; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
61
|
|
51
|
1
|
|
|
1
|
|
7
|
use HTML::Entities; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
71
|
|
52
|
1
|
|
|
1
|
|
1083
|
use Encode; |
|
1
|
|
|
|
|
13290
|
|
|
1
|
|
|
|
|
1576
|
|
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Character entities to char mapping. We do NOT convert those |
55
|
|
|
|
|
|
|
# entities with a structural meaning, because most likely |
56
|
|
|
|
|
|
|
# the output of this module will go through postprocessing. |
57
|
|
|
|
|
|
|
# |
58
|
|
|
|
|
|
|
my %Ent2CharMap=( |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# amp => '&', |
61
|
|
|
|
|
|
|
# gt => '>', |
62
|
|
|
|
|
|
|
# lt => '<', |
63
|
|
|
|
|
|
|
# quot => '"', |
64
|
|
|
|
|
|
|
# apos => "'", |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
AElig => 'Æ', |
67
|
|
|
|
|
|
|
Aacute => 'Á', |
68
|
|
|
|
|
|
|
Acirc => 'Â', |
69
|
|
|
|
|
|
|
Agrave => 'À', |
70
|
|
|
|
|
|
|
Aring => 'Å', |
71
|
|
|
|
|
|
|
Atilde => 'Ã', |
72
|
|
|
|
|
|
|
Auml => 'Ä', |
73
|
|
|
|
|
|
|
Ccedil => 'Ç', |
74
|
|
|
|
|
|
|
ETH => 'Ð', |
75
|
|
|
|
|
|
|
Eacute => 'É', |
76
|
|
|
|
|
|
|
Ecirc => 'Ê', |
77
|
|
|
|
|
|
|
Egrave => 'È', |
78
|
|
|
|
|
|
|
Euml => 'Ë', |
79
|
|
|
|
|
|
|
Iacute => 'Í', |
80
|
|
|
|
|
|
|
Icirc => 'Î', |
81
|
|
|
|
|
|
|
Igrave => 'Ì', |
82
|
|
|
|
|
|
|
Iuml => 'Ï', |
83
|
|
|
|
|
|
|
Ntilde => 'Ñ', |
84
|
|
|
|
|
|
|
Oacute => 'Ó', |
85
|
|
|
|
|
|
|
Ocirc => 'Ô', |
86
|
|
|
|
|
|
|
Ograve => 'Ò', |
87
|
|
|
|
|
|
|
Oslash => 'Ø', |
88
|
|
|
|
|
|
|
Otilde => 'Õ', |
89
|
|
|
|
|
|
|
Ouml => 'Ö', |
90
|
|
|
|
|
|
|
THORN => 'Þ', |
91
|
|
|
|
|
|
|
Uacute => 'Ú', |
92
|
|
|
|
|
|
|
Ucirc => 'Û', |
93
|
|
|
|
|
|
|
Ugrave => 'Ù', |
94
|
|
|
|
|
|
|
Uuml => 'Ü', |
95
|
|
|
|
|
|
|
Yacute => 'Ý', |
96
|
|
|
|
|
|
|
aacute => 'á', |
97
|
|
|
|
|
|
|
acirc => 'â', |
98
|
|
|
|
|
|
|
aelig => 'æ', |
99
|
|
|
|
|
|
|
agrave => 'à', |
100
|
|
|
|
|
|
|
aring => 'å', |
101
|
|
|
|
|
|
|
atilde => 'ã', |
102
|
|
|
|
|
|
|
auml => 'ä', |
103
|
|
|
|
|
|
|
ccedil => 'ç', |
104
|
|
|
|
|
|
|
eacute => 'é', |
105
|
|
|
|
|
|
|
ecirc => 'ê', |
106
|
|
|
|
|
|
|
egrave => 'è', |
107
|
|
|
|
|
|
|
eth => 'ð', |
108
|
|
|
|
|
|
|
euml => 'ë', |
109
|
|
|
|
|
|
|
iacute => 'í', |
110
|
|
|
|
|
|
|
icirc => 'î', |
111
|
|
|
|
|
|
|
igrave => 'ì', |
112
|
|
|
|
|
|
|
iuml => 'ï', |
113
|
|
|
|
|
|
|
ntilde => 'ñ', |
114
|
|
|
|
|
|
|
oacute => 'ó', |
115
|
|
|
|
|
|
|
ocirc => 'ô', |
116
|
|
|
|
|
|
|
ograve => 'ò', |
117
|
|
|
|
|
|
|
oslash => 'ø', |
118
|
|
|
|
|
|
|
otilde => 'õ', |
119
|
|
|
|
|
|
|
ouml => 'ö', |
120
|
|
|
|
|
|
|
szlig => 'ß', |
121
|
|
|
|
|
|
|
thorn => 'þ', |
122
|
|
|
|
|
|
|
uacute => 'ú', |
123
|
|
|
|
|
|
|
ucirc => 'û', |
124
|
|
|
|
|
|
|
ugrave => 'ù', |
125
|
|
|
|
|
|
|
uuml => 'ü', |
126
|
|
|
|
|
|
|
yacute => 'ý', |
127
|
|
|
|
|
|
|
yuml => 'ÿ', |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
copy => '©', |
130
|
|
|
|
|
|
|
reg => '®', |
131
|
|
|
|
|
|
|
# nbsp => "\240", |
132
|
|
|
|
|
|
|
nbsp => ' ', |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
iexcl => '¡', |
135
|
|
|
|
|
|
|
cent => '¢', |
136
|
|
|
|
|
|
|
pound => '£', |
137
|
|
|
|
|
|
|
curren => '¤', |
138
|
|
|
|
|
|
|
yen => '¥', |
139
|
|
|
|
|
|
|
brvbar => '¦', |
140
|
|
|
|
|
|
|
sect => '§', |
141
|
|
|
|
|
|
|
uml => '¨', |
142
|
|
|
|
|
|
|
ordf => 'ª', |
143
|
|
|
|
|
|
|
laquo => '«', |
144
|
|
|
|
|
|
|
not => '¬', |
145
|
|
|
|
|
|
|
shy => '', |
146
|
|
|
|
|
|
|
macr => '¯', |
147
|
|
|
|
|
|
|
deg => '°', |
148
|
|
|
|
|
|
|
plusmn => '±', |
149
|
|
|
|
|
|
|
sup1 => '¹', |
150
|
|
|
|
|
|
|
sup2 => '²', |
151
|
|
|
|
|
|
|
sup3 => '³', |
152
|
|
|
|
|
|
|
acute => '´', |
153
|
|
|
|
|
|
|
micro => 'µ', |
154
|
|
|
|
|
|
|
para => '¶', |
155
|
|
|
|
|
|
|
middot => '·', |
156
|
|
|
|
|
|
|
cedil => '¸', |
157
|
|
|
|
|
|
|
ordm => 'º', |
158
|
|
|
|
|
|
|
raquo => '»', |
159
|
|
|
|
|
|
|
frac14 => '¼', |
160
|
|
|
|
|
|
|
frac12 => '½', |
161
|
|
|
|
|
|
|
frac34 => '¾', |
162
|
|
|
|
|
|
|
iquest => '¿', |
163
|
|
|
|
|
|
|
times => '×', |
164
|
|
|
|
|
|
|
divide => '÷', |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
); |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
my $log; |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub trans { |
171
|
0
|
|
|
0
|
0
|
|
my ($html, $xwi, $opt) = @_; |
172
|
0
|
0
|
|
|
|
|
return undef unless ref $xwi; |
173
|
|
|
|
|
|
|
#$opt can be 'HTML', 'TEXT', 'GuessHTML', 'GuessText' |
174
|
0
|
|
|
|
|
|
$xwi->url_rewind; # (BR) |
175
|
0
|
|
0
|
|
|
|
my $url = $xwi->url_get || return undef; # $xwi object must have url field |
176
|
0
|
0
|
|
|
|
|
if ( !defined($log) ) { |
177
|
0
|
|
|
|
|
|
$log = Combine::Config::Get('LogHandle'); |
178
|
|
|
|
|
|
|
} |
179
|
0
|
0
|
|
|
|
|
if ($$html eq '') { |
180
|
0
|
|
|
|
|
|
$html = $xwi->content; |
181
|
|
|
|
|
|
|
} |
182
|
0
|
0
|
|
|
|
|
if ( length($$html) < 10 ) { |
183
|
0
|
|
|
|
|
|
$log->say('FromHTML: short or empty file'); |
184
|
0
|
|
|
|
|
|
return $xwi; |
185
|
|
|
|
|
|
|
} |
186
|
0
|
0
|
|
|
|
|
if ( length($$html) > 1024 ) { # should we check shorter files as well ? |
187
|
0
|
|
|
|
|
|
my $teststring = substr($$html,0,1024); |
188
|
0
|
|
|
|
|
|
my $start_len = 1024; |
189
|
0
|
|
|
|
|
|
$teststring =~ s/[^\s\x20-\xfe]+//g; |
190
|
0
|
|
|
|
|
|
my $len = length($teststring); |
191
|
0
|
0
|
|
|
|
|
if ( $len > ( 0.9 * $start_len ) ) { # this is some kind of text |
192
|
0
|
|
|
|
|
|
my @rows = split(/\n/,$teststring); |
193
|
0
|
|
|
|
|
|
shift(@rows); |
194
|
0
|
|
|
|
|
|
my ($i,$uu,$b64,$r); |
195
|
0
|
|
|
|
|
|
$uu=0; $b64=0; |
|
0
|
|
|
|
|
|
|
196
|
0
|
0
|
|
|
|
|
my $n = $#rows>10 ? 10 : $#rows; |
197
|
0
|
|
|
|
|
|
for ($i=0;$i<$n;$i++) { |
198
|
0
|
|
|
|
|
|
$r = shift(@rows); |
199
|
0
|
0
|
0
|
|
|
|
$uu++ if (length($r)==61) and (substr($r,0,1) eq "M"); |
200
|
0
|
0
|
0
|
|
|
|
$b64++ if (length($r)==72) and ($r!~/\s/); |
201
|
0
|
0
|
0
|
|
|
|
if ( ( $uu == 10 ) or ( $b64 == 10 ) ) { |
202
|
|
|
|
|
|
|
# this is probably uuencoded or base64 encoded |
203
|
0
|
|
|
|
|
|
$log->say('FromHTML: probably uuencoded or base64 encoded'); |
204
|
0
|
|
|
|
|
|
return $xwi; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} else { |
208
|
|
|
|
|
|
|
# this is most likely a binary file => don't parse it |
209
|
|
|
|
|
|
|
# DISABLED since it creates problems with certain charactersets |
210
|
|
|
|
|
|
|
# $log->say('FromHTML: most likely a binary file'); |
211
|
|
|
|
|
|
|
# return $xwi; |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
0
|
|
|
|
|
|
$html = $$html; |
216
|
0
|
0
|
|
|
|
|
if ($xwi->truncated()) { |
217
|
0
|
|
|
|
|
|
my $last_blank = rindex($html, ' '); |
218
|
0
|
0
|
|
|
|
|
if ($last_blank > 0) { |
219
|
0
|
|
|
|
|
|
$html = substr($html, 0, $last_blank); |
220
|
|
|
|
|
|
|
} else { |
221
|
|
|
|
|
|
|
# What ! No blanks ! This is some weird text => don't parse it |
222
|
0
|
|
|
|
|
|
$log->say('FromHTML: No blanks - Not processing'); |
223
|
0
|
|
|
|
|
|
return $xwi; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
0
|
0
|
|
|
|
|
if ( $opt =~ /^Guess/ ) { |
228
|
0
|
0
|
0
|
|
|
|
if ( ($url =~ /\..?html?$|\/$/i) || |
229
|
|
|
|
|
|
|
($html =~ /<\s*html\s*|<\s*head\s*|<\s*body\s*/i) ) { |
230
|
0
|
|
|
|
|
|
$opt = 'HTML'; |
231
|
|
|
|
|
|
|
} else { |
232
|
0
|
|
|
|
|
|
$opt = 'Text'; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
0
|
0
|
|
|
|
|
if ($opt =~ /Text/i) { |
237
|
0
|
|
|
|
|
|
$html =~ s/[\s\240\n]+/ /sg; # compress whitespace?? |
238
|
0
|
|
|
|
|
|
$xwi->text(\$html); |
239
|
0
|
|
|
|
|
|
return $xwi; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
#clean character entities #1..#255 to utf-8/latin1 |
243
|
0
|
|
|
|
|
|
my $html_utf8; |
244
|
0
|
|
|
|
|
|
if (1) { |
245
|
0
|
|
|
|
|
|
my $c; |
246
|
0
|
|
|
|
|
|
$html_utf8=HTML::Entities::decode_entities($html); |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
0
|
|
|
|
|
|
my $rtext; |
250
|
|
|
|
|
|
|
##Plugin for extracting only relevant text and discarding base templates |
251
|
0
|
|
|
|
|
|
my $relTextPlugin = Combine::Config::Get('relTextPlugin'); |
252
|
0
|
0
|
0
|
|
|
|
if (defined($relTextPlugin) && $relTextPlugin ne '') { |
253
|
0
|
|
|
|
|
|
eval "require $relTextPlugin"; |
254
|
0
|
|
|
|
|
|
$rtext = $relTextPlugin->extrText($html_utf8); |
255
|
0
|
0
|
|
|
|
|
if (defined($rtext)) { |
256
|
0
|
|
|
|
|
|
$xwi->text(\$rtext); |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
## |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
#Only do for HTML files |
262
|
|
|
|
|
|
|
# General modifications to the HTML code before extracting our information |
263
|
|
|
|
|
|
|
|
264
|
0
|
0
|
|
|
|
|
if ( Combine::Config::Get('useTidy') ) { |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# print "Doing Tidy\n"; |
268
|
0
|
|
|
|
|
|
require HTML::Tidy; |
269
|
0
|
|
|
|
|
|
my $tidy = new HTML::Tidy ( {config_file => Combine::Config::Get('baseConfigDir') . '/tidy.cfg'} ); |
270
|
|
|
|
|
|
|
# $tidy->ignore( type => TIDY_WARNING ); |
271
|
|
|
|
|
|
|
# if (!eval{$html = $tidy->clean( $html . "\n" )}) { print "TIDY ERR in eval\n"; } |
272
|
0
|
|
|
|
|
|
my $thtml; |
273
|
0
|
0
|
|
|
|
|
if (!eval{$thtml = $tidy->clean( $html_utf8 . "\n" )}) { |
|
0
|
|
|
|
|
|
|
274
|
0
|
|
|
|
|
|
print "TIDY ERR in eval\n"; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
# for my $message ( $tidy->messages ) { |
277
|
|
|
|
|
|
|
# print $message->as_string; #LOG! |
278
|
|
|
|
|
|
|
# } |
279
|
0
|
|
|
|
|
|
$html = Encode::decode('UTF-8', $thtml); # convert to Perl internal representation |
280
|
|
|
|
|
|
|
} else { |
281
|
0
|
|
|
|
|
|
$html_utf8 =~ s/<\!\-\-.*?\-\->/ /sgo; # replace all comments (including multiline) with whitespace |
282
|
0
|
|
|
|
|
|
$html = $html_utf8; |
283
|
|
|
|
|
|
|
} |
284
|
0
|
0
|
|
|
|
|
if ( ! Encode::is_utf8($html) ) { |
285
|
0
|
|
|
|
|
|
$log->say('WARN HTML content not in UTF-8'); |
286
|
|
|
|
|
|
|
} ## |
287
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
|
$html =~ s// /sigo; # remove all the scripts (including multiline) |
289
|
0
|
|
|
|
|
|
$html =~ s// /sigo; # remove all the scripts (including multiline) |
290
|
0
|
|
|
|
|
|
$html =~ s// /sigo; # remove all the style scripts (including multiline) |
291
|
|
|
|
|
|
|
## $html =~ s/[\s\240]+/ /g; # compress whitespace |
292
|
|
|
|
|
|
|
|
293
|
0
|
|
|
|
|
|
my $xwicontent=$html; |
294
|
0
|
|
|
|
|
|
$xwi->content(\$xwicontent); |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# #Split into HEAD and BODY |
297
|
|
|
|
|
|
|
# my $head=''; |
298
|
|
|
|
|
|
|
## if ($html =~ s|^(.*?)<\s*body\s*|
|
299
|
|
|
|
|
|
|
## where the frameset is outside the see http://poseidon.csd.auth.gr/EN/ |
300
|
|
|
|
|
|
|
# if ( $html =~ s|^(.*?<\s*\/head[^>]*>)||i ) { ??? |
301
|
|
|
|
|
|
|
# $head=$1; |
302
|
|
|
|
|
|
|
# } |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
#Parsing and extraction of data |
305
|
0
|
0
|
|
|
|
|
if ($html =~ /([^<]+)<\/title>/i) { # extract title |
306
|
0
|
|
|
|
|
|
my $tmp = $1; |
307
|
|
|
|
|
|
|
# $tmp =~ s/\s+/ /g; #needed AA0? |
308
|
|
|
|
|
|
|
# $tmp = HTML::Entities::decode_entities($tmp); |
309
|
0
|
|
|
|
|
|
$xwi->title($tmp); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
#Extract META tags |
313
|
0
|
|
|
|
|
|
while ( $html =~ m//sgi ) { |
314
|
0
|
|
|
|
|
|
my $tag = $1; |
315
|
0
|
|
|
|
|
|
my $key=''; |
316
|
0
|
|
|
|
|
|
my $val=''; |
317
|
0
|
|
|
|
|
|
$tag =~ s/[\n\r]/ /g; |
318
|
0
|
|
|
|
|
|
foreach my $attr ('name','content') { |
319
|
0
|
|
|
|
|
|
my $str=''; |
320
|
|
|
|
|
|
|
|
321
|
0
|
0
|
|
|
|
|
if ($tag =~ /$attr\s*=\s*[\"]/i) { |
|
|
0
|
|
|
|
|
|
322
|
0
|
0
|
|
|
|
|
if ($tag =~ s/$attr\s*=\s*\"([^\"]+?)\"//i) { |
323
|
0
|
|
|
|
|
|
$str = $1; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
} elsif ($tag =~ /$attr\s*=\s*[\']/i) { |
326
|
0
|
0
|
|
|
|
|
if ($tag =~ s/$attr\s*=\s*\'([^\']+?)\'//i) { |
327
|
0
|
|
|
|
|
|
$str = $1; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
} else { |
330
|
0
|
0
|
|
|
|
|
if ($tag =~ s/$attr\s*=\s*([^\s]+?)\s//i) { |
331
|
0
|
|
|
|
|
|
$str = $1; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
} |
334
|
0
|
0
|
|
|
|
|
next if($str =~ /^$/); |
335
|
0
|
0
|
|
|
|
|
if ($attr =~ /name/i) { |
|
|
0
|
|
|
|
|
|
336
|
0
|
|
|
|
|
|
$key=lc($str); |
337
|
|
|
|
|
|
|
} elsif ($attr =~ /content/i) { |
338
|
0
|
|
|
|
|
|
$val=$str; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
} |
341
|
0
|
0
|
0
|
|
|
|
next if(($key =~ /^$/) || ($val =~ /^$/)); |
342
|
|
|
|
|
|
|
# $xwi->meta_add($key,HTML::Entities::decode_entities($val)); |
343
|
0
|
|
|
|
|
|
$xwi->meta_add($key,$val); |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
#END extract META tags |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=begin comment |
348
|
|
|
|
|
|
|
This feature is temporarily disabled |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
my $summary = ""; |
351
|
|
|
|
|
|
|
$xwi->meta_rewind; |
352
|
|
|
|
|
|
|
my ($name,$content); |
353
|
|
|
|
|
|
|
while(1) { |
354
|
|
|
|
|
|
|
($name,$content) = $xwi->meta_get; |
355
|
|
|
|
|
|
|
if (!defined($name)) { last; } |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
#If abstract, description or DC.Description is not a list of keywords: add it to summary |
358
|
|
|
|
|
|
|
if ( $name eq 'description' || $name eq 'dc.description' || $name eq 'abstract' ) { |
359
|
|
|
|
|
|
|
my @kom = split(', ',$content); |
360
|
|
|
|
|
|
|
my @dot = split(' ',$content); |
361
|
|
|
|
|
|
|
if ( $#kom < $#dot ) { #If several meta-fields check if they overlap or are the same## |
362
|
|
|
|
|
|
|
$summary .= $content . ' '; |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
#Generate Summary |
368
|
|
|
|
|
|
|
my $sumlength = Combine::Config::Get('SummaryLength'); |
369
|
|
|
|
|
|
|
# print "SUM1: $summary\nHTML: $html\n"; |
370
|
|
|
|
|
|
|
if ( $sumlength > 0 ) { |
371
|
|
|
|
|
|
|
if ( ($sumlength - length($summary)) > 0 ) { |
372
|
|
|
|
|
|
|
require HTML::Summary; |
373
|
|
|
|
|
|
|
require HTML::TreeBuilder; |
374
|
|
|
|
|
|
|
my $html_summarizer = new HTML::Summary( LENGTH => $sumlength - length($summary), USE_META => 0 ); |
375
|
|
|
|
|
|
|
my $tree = new HTML::TreeBuilder; |
376
|
|
|
|
|
|
|
$tree->parse( Encode::encode('latin1',$html) ); |
377
|
|
|
|
|
|
|
# $tree->parse( $html ); |
378
|
|
|
|
|
|
|
$tree->eof(); |
379
|
|
|
|
|
|
|
## $summary .= $html_summarizer->generate ( $tree ); |
380
|
|
|
|
|
|
|
my $t .= Encode::decode('latin1',$html_summarizer->generate ( $tree )); |
381
|
|
|
|
|
|
|
$tree = $tree->delete; |
382
|
|
|
|
|
|
|
$summary .= $t; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
if (length($summary)>2) { |
385
|
|
|
|
|
|
|
# $summary =~ s/[^\w\s,\.\!\?:;\'\"]//gs; |
386
|
|
|
|
|
|
|
$summary =~ s/[^\p{IsAlnum}\s,\.\!\?:;\'\"]//gs; |
387
|
|
|
|
|
|
|
$summary =~ s/[\s\240]+/ /g; |
388
|
|
|
|
|
|
|
$xwi->meta_add("Rsummary",$summary); |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=end comment |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=cut |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# extract links |
397
|
1
|
|
|
1
|
|
734
|
use Combine::HTMLExtractor; |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
608
|
|
398
|
0
|
|
|
|
|
|
my ($alt, $linktext, $linkurl, $base); |
399
|
0
|
|
|
|
|
|
$base = $xwi->base; #Set by UA.pm |
400
|
0
|
|
|
|
|
|
my $lx = new Combine::HTMLExtractor(undef,undef,1); |
401
|
|
|
|
|
|
|
# print "INPUT: $html\n"; |
402
|
|
|
|
|
|
|
# $html = HTML::Entities::decode_entities( Encode::encode('latin1',$html) ); |
403
|
0
|
|
|
|
|
|
$html = HTML::Entities::decode_entities( $html ); |
404
|
0
|
|
|
|
|
|
$lx->parse(\$html); |
405
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
|
my %Tags = ( a => 1, area => 1, frame => 1, img => 1, headings => 1, text => 1 ); |
407
|
0
|
|
|
|
|
|
for my $link ( @{$lx->links} ) { |
|
0
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
# print "GOTLINK: $$link{tag} = $$link{_TEXT}\n"; |
409
|
0
|
0
|
|
|
|
|
next unless exists($Tags{$$link{tag}}); |
410
|
0
|
0
|
|
|
|
|
my $linktext = $$link{_TEXT} ? $$link{_TEXT} : ''; |
411
|
0
|
0
|
0
|
|
|
|
if ( ($$link{tag} eq 'headings') ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
412
|
0
|
0
|
|
|
|
|
if ( $linktext !~ /^\s*$/ ) { |
413
|
0
|
|
|
|
|
|
$linktext =~ s/^[\s;]+//; |
414
|
0
|
|
|
|
|
|
$linktext =~ s/[\s;]+$//; |
415
|
|
|
|
|
|
|
# $xwi->heading_add(Encode::decode('latin1',$linktext)); |
416
|
0
|
|
|
|
|
|
$xwi->heading_add($linktext); |
417
|
|
|
|
|
|
|
} |
418
|
0
|
|
|
|
|
|
next; |
419
|
|
|
|
|
|
|
} elsif ( ($$link{tag} eq 'text') ) { |
420
|
0
|
0
|
|
|
|
|
if (!defined($rtext)) { |
421
|
|
|
|
|
|
|
# $linktext = Encode::decode('latin1',$linktext); |
422
|
0
|
|
|
|
|
|
$linktext =~ s/[\s\240]+/ /g; # compress whitespace?? |
423
|
0
|
|
|
|
|
|
$xwi->text(\$linktext); |
424
|
|
|
|
|
|
|
#print "HT=$linktext\n"; |
425
|
|
|
|
|
|
|
} |
426
|
0
|
|
|
|
|
|
next; |
427
|
|
|
|
|
|
|
} elsif ( ($$link{tag} eq 'frame') || ($$link{tag} eq 'img') ) { |
428
|
0
|
|
|
|
|
|
$linkurl = $$link{src}; |
429
|
0
|
|
0
|
|
|
|
$linktext .= $$link{alt} || ''; |
430
|
|
|
|
|
|
|
} else { |
431
|
0
|
|
|
|
|
|
$linkurl = $$link{href}; |
432
|
|
|
|
|
|
|
} |
433
|
0
|
|
|
|
|
|
$linktext =~ s/\[IMG\]//g; |
434
|
0
|
0
|
|
|
|
|
if ( $linkurl !~ /^#/ ) { # Throw away links within a document |
435
|
0
|
|
|
|
|
|
$linkurl =~ s/\?\s+/?/; #to be handled in normalize?? |
436
|
0
|
|
|
|
|
|
my $urlstr = URI->new_abs($linkurl, $base)->canonical->as_string; |
437
|
|
|
|
|
|
|
# $xwi->link_add($urlstr, 0, 0, Encode::decode('latin1',$linktext), $$link{tag}); |
438
|
0
|
|
|
|
|
|
$xwi->link_add($urlstr, 0, 0, $linktext, $$link{tag}); |
439
|
|
|
|
|
|
|
# print "ADD: $$link{tag}; $urlstr; |$linktext|\n"; |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
0
|
|
|
|
|
|
return $xwi; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
1; |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
__END__ |