| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# Copyright (c) 1996-1998 LUB NetLab |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
|
4
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
|
5
|
|
|
|
|
|
|
# the Free Software Foundation; either version 1, or (at your option) |
|
6
|
|
|
|
|
|
|
# any later version. |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
|
9
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11
|
|
|
|
|
|
|
# GNU General Public License for more details. |
|
12
|
|
|
|
|
|
|
# |
|
13
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
14
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
|
15
|
|
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|
16
|
|
|
|
|
|
|
# |
|
17
|
|
|
|
|
|
|
# |
|
18
|
|
|
|
|
|
|
# NO WARRANTY |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
|
21
|
|
|
|
|
|
|
# FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
|
22
|
|
|
|
|
|
|
# OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
|
23
|
|
|
|
|
|
|
# PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED |
|
24
|
|
|
|
|
|
|
# OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
|
25
|
|
|
|
|
|
|
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS |
|
26
|
|
|
|
|
|
|
# TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE |
|
27
|
|
|
|
|
|
|
# PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, |
|
28
|
|
|
|
|
|
|
# REPAIR OR CORRECTION. |
|
29
|
|
|
|
|
|
|
# |
|
30
|
|
|
|
|
|
|
# IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
|
31
|
|
|
|
|
|
|
# WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
|
32
|
|
|
|
|
|
|
# REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, |
|
33
|
|
|
|
|
|
|
# INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING |
|
34
|
|
|
|
|
|
|
# OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED |
|
35
|
|
|
|
|
|
|
# TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY |
|
36
|
|
|
|
|
|
|
# YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER |
|
37
|
|
|
|
|
|
|
# PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE |
|
38
|
|
|
|
|
|
|
# POSSIBILITY OF SUCH DAMAGES. |
|
39
|
|
|
|
|
|
|
# |
|
40
|
|
|
|
|
|
|
# Copyright (c) 1996-1998 LUB NetLab |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# $Id: FromHTML.pm 292 2008-11-08 08:54:11Z it-aar $ |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
package Combine::FromHTML; |
|
45
|
|
|
|
|
|
|
|
|
46
|
1
|
|
|
1
|
|
824
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
48
|
|
|
47
|
1
|
|
|
1
|
|
6
|
use Combine::Config; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
22
|
|
|
48
|
1
|
|
|
1
|
|
5
|
use HTTP::Date; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
49
|
|
|
49
|
1
|
|
|
1
|
|
6
|
use URI; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
38
|
|
|
50
|
1
|
|
|
1
|
|
6
|
use URI::Escape; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
61
|
|
|
51
|
1
|
|
|
1
|
|
7
|
use HTML::Entities; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
71
|
|
|
52
|
1
|
|
|
1
|
|
1083
|
use Encode; |
|
|
1
|
|
|
|
|
13290
|
|
|
|
1
|
|
|
|
|
1576
|
|
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Character entities to char mapping. We do NOT convert those |
|
55
|
|
|
|
|
|
|
# entities with a structural meaning, because most likely |
|
56
|
|
|
|
|
|
|
# the output of this module will go through postprocessing. |
|
57
|
|
|
|
|
|
|
# |
|
58
|
|
|
|
|
|
|
my %Ent2CharMap=( |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# amp => '&', |
|
61
|
|
|
|
|
|
|
# gt => '>', |
|
62
|
|
|
|
|
|
|
# lt => '<', |
|
63
|
|
|
|
|
|
|
# quot => '"', |
|
64
|
|
|
|
|
|
|
# apos => "'", |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
AElig => 'Æ', |
|
67
|
|
|
|
|
|
|
Aacute => 'Á', |
|
68
|
|
|
|
|
|
|
Acirc => 'Â', |
|
69
|
|
|
|
|
|
|
Agrave => 'À', |
|
70
|
|
|
|
|
|
|
Aring => 'Å', |
|
71
|
|
|
|
|
|
|
Atilde => 'Ã', |
|
72
|
|
|
|
|
|
|
Auml => 'Ä', |
|
73
|
|
|
|
|
|
|
Ccedil => 'Ç', |
|
74
|
|
|
|
|
|
|
ETH => 'Ð', |
|
75
|
|
|
|
|
|
|
Eacute => 'É', |
|
76
|
|
|
|
|
|
|
Ecirc => 'Ê', |
|
77
|
|
|
|
|
|
|
Egrave => 'È', |
|
78
|
|
|
|
|
|
|
Euml => 'Ë', |
|
79
|
|
|
|
|
|
|
Iacute => 'Í', |
|
80
|
|
|
|
|
|
|
Icirc => 'Î', |
|
81
|
|
|
|
|
|
|
Igrave => 'Ì', |
|
82
|
|
|
|
|
|
|
Iuml => 'Ï', |
|
83
|
|
|
|
|
|
|
Ntilde => 'Ñ', |
|
84
|
|
|
|
|
|
|
Oacute => 'Ó', |
|
85
|
|
|
|
|
|
|
Ocirc => 'Ô', |
|
86
|
|
|
|
|
|
|
Ograve => 'Ò', |
|
87
|
|
|
|
|
|
|
Oslash => 'Ø', |
|
88
|
|
|
|
|
|
|
Otilde => 'Õ', |
|
89
|
|
|
|
|
|
|
Ouml => 'Ö', |
|
90
|
|
|
|
|
|
|
THORN => 'Þ', |
|
91
|
|
|
|
|
|
|
Uacute => 'Ú', |
|
92
|
|
|
|
|
|
|
Ucirc => 'Û', |
|
93
|
|
|
|
|
|
|
Ugrave => 'Ù', |
|
94
|
|
|
|
|
|
|
Uuml => 'Ü', |
|
95
|
|
|
|
|
|
|
Yacute => 'Ý', |
|
96
|
|
|
|
|
|
|
aacute => 'á', |
|
97
|
|
|
|
|
|
|
acirc => 'â', |
|
98
|
|
|
|
|
|
|
aelig => 'æ', |
|
99
|
|
|
|
|
|
|
agrave => 'à', |
|
100
|
|
|
|
|
|
|
aring => 'å', |
|
101
|
|
|
|
|
|
|
atilde => 'ã', |
|
102
|
|
|
|
|
|
|
auml => 'ä', |
|
103
|
|
|
|
|
|
|
ccedil => 'ç', |
|
104
|
|
|
|
|
|
|
eacute => 'é', |
|
105
|
|
|
|
|
|
|
ecirc => 'ê', |
|
106
|
|
|
|
|
|
|
egrave => 'è', |
|
107
|
|
|
|
|
|
|
eth => 'ð', |
|
108
|
|
|
|
|
|
|
euml => 'ë', |
|
109
|
|
|
|
|
|
|
iacute => 'í', |
|
110
|
|
|
|
|
|
|
icirc => 'î', |
|
111
|
|
|
|
|
|
|
igrave => 'ì', |
|
112
|
|
|
|
|
|
|
iuml => 'ï', |
|
113
|
|
|
|
|
|
|
ntilde => 'ñ', |
|
114
|
|
|
|
|
|
|
oacute => 'ó', |
|
115
|
|
|
|
|
|
|
ocirc => 'ô', |
|
116
|
|
|
|
|
|
|
ograve => 'ò', |
|
117
|
|
|
|
|
|
|
oslash => 'ø', |
|
118
|
|
|
|
|
|
|
otilde => 'õ', |
|
119
|
|
|
|
|
|
|
ouml => 'ö', |
|
120
|
|
|
|
|
|
|
szlig => 'ß', |
|
121
|
|
|
|
|
|
|
thorn => 'þ', |
|
122
|
|
|
|
|
|
|
uacute => 'ú', |
|
123
|
|
|
|
|
|
|
ucirc => 'û', |
|
124
|
|
|
|
|
|
|
ugrave => 'ù', |
|
125
|
|
|
|
|
|
|
uuml => 'ü', |
|
126
|
|
|
|
|
|
|
yacute => 'ý', |
|
127
|
|
|
|
|
|
|
yuml => 'ÿ', |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
copy => '©', |
|
130
|
|
|
|
|
|
|
reg => '®', |
|
131
|
|
|
|
|
|
|
# nbsp => "\240", |
|
132
|
|
|
|
|
|
|
nbsp => ' ', |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
iexcl => '¡', |
|
135
|
|
|
|
|
|
|
cent => '¢', |
|
136
|
|
|
|
|
|
|
pound => '£', |
|
137
|
|
|
|
|
|
|
curren => '¤', |
|
138
|
|
|
|
|
|
|
yen => '¥', |
|
139
|
|
|
|
|
|
|
brvbar => '¦', |
|
140
|
|
|
|
|
|
|
sect => '§', |
|
141
|
|
|
|
|
|
|
uml => '¨', |
|
142
|
|
|
|
|
|
|
ordf => 'ª', |
|
143
|
|
|
|
|
|
|
laquo => '«', |
|
144
|
|
|
|
|
|
|
not => '¬', |
|
145
|
|
|
|
|
|
|
shy => '', |
|
146
|
|
|
|
|
|
|
macr => '¯', |
|
147
|
|
|
|
|
|
|
deg => '°', |
|
148
|
|
|
|
|
|
|
plusmn => '±', |
|
149
|
|
|
|
|
|
|
sup1 => '¹', |
|
150
|
|
|
|
|
|
|
sup2 => '²', |
|
151
|
|
|
|
|
|
|
sup3 => '³', |
|
152
|
|
|
|
|
|
|
acute => '´', |
|
153
|
|
|
|
|
|
|
micro => 'µ', |
|
154
|
|
|
|
|
|
|
para => '¶', |
|
155
|
|
|
|
|
|
|
middot => '·', |
|
156
|
|
|
|
|
|
|
cedil => '¸', |
|
157
|
|
|
|
|
|
|
ordm => 'º', |
|
158
|
|
|
|
|
|
|
raquo => '»', |
|
159
|
|
|
|
|
|
|
frac14 => '¼', |
|
160
|
|
|
|
|
|
|
frac12 => '½', |
|
161
|
|
|
|
|
|
|
frac34 => '¾', |
|
162
|
|
|
|
|
|
|
iquest => '¿', |
|
163
|
|
|
|
|
|
|
times => '×', |
|
164
|
|
|
|
|
|
|
divide => '÷', |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
); |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
my $log; |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub trans { |
|
171
|
0
|
|
|
0
|
0
|
|
my ($html, $xwi, $opt) = @_; |
|
172
|
0
|
0
|
|
|
|
|
return undef unless ref $xwi; |
|
173
|
|
|
|
|
|
|
#$opt can be 'HTML', 'TEXT', 'GuessHTML', 'GuessText' |
|
174
|
0
|
|
|
|
|
|
$xwi->url_rewind; # (BR) |
|
175
|
0
|
|
0
|
|
|
|
my $url = $xwi->url_get || return undef; # $xwi object must have url field |
|
176
|
0
|
0
|
|
|
|
|
if ( !defined($log) ) { |
|
177
|
0
|
|
|
|
|
|
$log = Combine::Config::Get('LogHandle'); |
|
178
|
|
|
|
|
|
|
} |
|
179
|
0
|
0
|
|
|
|
|
if ($$html eq '') { |
|
180
|
0
|
|
|
|
|
|
$html = $xwi->content; |
|
181
|
|
|
|
|
|
|
} |
|
182
|
0
|
0
|
|
|
|
|
if ( length($$html) < 10 ) { |
|
183
|
0
|
|
|
|
|
|
$log->say('FromHTML: short or empty file'); |
|
184
|
0
|
|
|
|
|
|
return $xwi; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
0
|
0
|
|
|
|
|
if ( length($$html) > 1024 ) { # should we check shorter files as well ? |
|
187
|
0
|
|
|
|
|
|
my $teststring = substr($$html,0,1024); |
|
188
|
0
|
|
|
|
|
|
my $start_len = 1024; |
|
189
|
0
|
|
|
|
|
|
$teststring =~ s/[^\s\x20-\xfe]+//g; |
|
190
|
0
|
|
|
|
|
|
my $len = length($teststring); |
|
191
|
0
|
0
|
|
|
|
|
if ( $len > ( 0.9 * $start_len ) ) { # this is some kind of text |
|
192
|
0
|
|
|
|
|
|
my @rows = split(/\n/,$teststring); |
|
193
|
0
|
|
|
|
|
|
shift(@rows); |
|
194
|
0
|
|
|
|
|
|
my ($i,$uu,$b64,$r); |
|
195
|
0
|
|
|
|
|
|
$uu=0; $b64=0; |
|
|
0
|
|
|
|
|
|
|
|
196
|
0
|
0
|
|
|
|
|
my $n = $#rows>10 ? 10 : $#rows; |
|
197
|
0
|
|
|
|
|
|
for ($i=0;$i<$n;$i++) { |
|
198
|
0
|
|
|
|
|
|
$r = shift(@rows); |
|
199
|
0
|
0
|
0
|
|
|
|
$uu++ if (length($r)==61) and (substr($r,0,1) eq "M"); |
|
200
|
0
|
0
|
0
|
|
|
|
$b64++ if (length($r)==72) and ($r!~/\s/); |
|
201
|
0
|
0
|
0
|
|
|
|
if ( ( $uu == 10 ) or ( $b64 == 10 ) ) { |
|
202
|
|
|
|
|
|
|
# this is probably uuencoded or base64 encoded |
|
203
|
0
|
|
|
|
|
|
$log->say('FromHTML: probably uuencoded or base64 encoded'); |
|
204
|
0
|
|
|
|
|
|
return $xwi; |
|
205
|
|
|
|
|
|
|
} |
|
206
|
|
|
|
|
|
|
} |
|
207
|
|
|
|
|
|
|
} else { |
|
208
|
|
|
|
|
|
|
# this is most likely a binary file => don't parse it |
|
209
|
|
|
|
|
|
|
# DISABLED since it creates problems with certain charactersets |
|
210
|
|
|
|
|
|
|
# $log->say('FromHTML: most likely a binary file'); |
|
211
|
|
|
|
|
|
|
# return $xwi; |
|
212
|
|
|
|
|
|
|
} |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
|
|
215
|
0
|
|
|
|
|
|
$html = $$html; |
|
216
|
0
|
0
|
|
|
|
|
if ($xwi->truncated()) { |
|
217
|
0
|
|
|
|
|
|
my $last_blank = rindex($html, ' '); |
|
218
|
0
|
0
|
|
|
|
|
if ($last_blank > 0) { |
|
219
|
0
|
|
|
|
|
|
$html = substr($html, 0, $last_blank); |
|
220
|
|
|
|
|
|
|
} else { |
|
221
|
|
|
|
|
|
|
# What ! No blanks ! This is some weird text => don't parse it |
|
222
|
0
|
|
|
|
|
|
$log->say('FromHTML: No blanks - Not processing'); |
|
223
|
0
|
|
|
|
|
|
return $xwi; |
|
224
|
|
|
|
|
|
|
} |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
0
|
0
|
|
|
|
|
if ( $opt =~ /^Guess/ ) { |
|
228
|
0
|
0
|
0
|
|
|
|
if ( ($url =~ /\..?html?$|\/$/i) || |
|
229
|
|
|
|
|
|
|
($html =~ /<\s*html\s*|<\s*head\s*|<\s*body\s*/i) ) { |
|
230
|
0
|
|
|
|
|
|
$opt = 'HTML'; |
|
231
|
|
|
|
|
|
|
} else { |
|
232
|
0
|
|
|
|
|
|
$opt = 'Text'; |
|
233
|
|
|
|
|
|
|
} |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
|
|
236
|
0
|
0
|
|
|
|
|
if ($opt =~ /Text/i) { |
|
237
|
0
|
|
|
|
|
|
$html =~ s/[\s\240\n]+/ /sg; # compress whitespace?? |
|
238
|
0
|
|
|
|
|
|
$xwi->text(\$html); |
|
239
|
0
|
|
|
|
|
|
return $xwi; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
#clean character entities #1..#255 to utf-8/latin1 |
|
243
|
0
|
|
|
|
|
|
my $html_utf8; |
|
244
|
0
|
|
|
|
|
|
if (1) { |
|
245
|
0
|
|
|
|
|
|
my $c; |
|
246
|
0
|
|
|
|
|
|
$html_utf8=HTML::Entities::decode_entities($html); |
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
0
|
|
|
|
|
|
my $rtext; |
|
250
|
|
|
|
|
|
|
##Plugin for extracting only relevant text and discarding base templates |
|
251
|
0
|
|
|
|
|
|
my $relTextPlugin = Combine::Config::Get('relTextPlugin'); |
|
252
|
0
|
0
|
0
|
|
|
|
if (defined($relTextPlugin) && $relTextPlugin ne '') { |
|
253
|
0
|
|
|
|
|
|
eval "require $relTextPlugin"; |
|
254
|
0
|
|
|
|
|
|
$rtext = $relTextPlugin->extrText($html_utf8); |
|
255
|
0
|
0
|
|
|
|
|
if (defined($rtext)) { |
|
256
|
0
|
|
|
|
|
|
$xwi->text(\$rtext); |
|
257
|
|
|
|
|
|
|
} |
|
258
|
|
|
|
|
|
|
} |
|
259
|
|
|
|
|
|
|
## |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
#Only do for HTML files |
|
262
|
|
|
|
|
|
|
# General modifications to the HTML code before extracting our information |
|
263
|
|
|
|
|
|
|
|
|
264
|
0
|
0
|
|
|
|
|
if ( Combine::Config::Get('useTidy') ) { |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# print "Doing Tidy\n"; |
|
268
|
0
|
|
|
|
|
|
require HTML::Tidy; |
|
269
|
0
|
|
|
|
|
|
my $tidy = new HTML::Tidy ( {config_file => Combine::Config::Get('baseConfigDir') . '/tidy.cfg'} ); |
|
270
|
|
|
|
|
|
|
# $tidy->ignore( type => TIDY_WARNING ); |
|
271
|
|
|
|
|
|
|
# if (!eval{$html = $tidy->clean( $html . "\n" )}) { print "TIDY ERR in eval\n"; } |
|
272
|
0
|
|
|
|
|
|
my $thtml; |
|
273
|
0
|
0
|
|
|
|
|
if (!eval{$thtml = $tidy->clean( $html_utf8 . "\n" )}) { |
|
|
0
|
|
|
|
|
|
|
|
274
|
0
|
|
|
|
|
|
print "TIDY ERR in eval\n"; |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
# for my $message ( $tidy->messages ) { |
|
277
|
|
|
|
|
|
|
# print $message->as_string; #LOG! |
|
278
|
|
|
|
|
|
|
# } |
|
279
|
0
|
|
|
|
|
|
$html = Encode::decode('UTF-8', $thtml); # convert to Perl internal representation |
|
280
|
|
|
|
|
|
|
} else { |
|
281
|
0
|
|
|
|
|
|
$html_utf8 =~ s/<\!\-\-.*?\-\->/ /sgo; # replace all comments (including multiline) with whitespace |
|
282
|
0
|
|
|
|
|
|
$html = $html_utf8; |
|
283
|
|
|
|
|
|
|
} |
|
284
|
0
|
0
|
|
|
|
|
if ( ! Encode::is_utf8($html) ) { |
|
285
|
0
|
|
|
|
|
|
$log->say('WARN HTML content not in UTF-8'); |
|
286
|
|
|
|
|
|
|
} ## |
|
287
|
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
|
$html =~ s// /sigo; # remove all the scripts (including multiline) |
|
289
|
0
|
|
|
|
|
|
$html =~ s// /sigo; # remove all the scripts (including multiline) |
|
290
|
0
|
|
|
|
|
|
$html =~ s// /sigo; # remove all the style scripts (including multiline) |
|
291
|
|
|
|
|
|
|
## $html =~ s/[\s\240]+/ /g; # compress whitespace |
|
292
|
|
|
|
|
|
|
|
|
293
|
0
|
|
|
|
|
|
my $xwicontent=$html; |
|
294
|
0
|
|
|
|
|
|
$xwi->content(\$xwicontent); |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# #Split into HEAD and BODY |
|
297
|
|
|
|
|
|
|
# my $head=''; |
|
298
|
|
|
|
|
|
|
## if ($html =~ s|^(.*?)<\s*body\s*|
|
|
299
|
|
|
|
|
|
|
## where the frameset is outside the see http://poseidon.csd.auth.gr/EN/ |
|
300
|
|
|
|
|
|
|
# if ( $html =~ s|^(.*?<\s*\/head[^>]*>)||i ) { ??? |
|
301
|
|
|
|
|
|
|
# $head=$1; |
|
302
|
|
|
|
|
|
|
# } |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
#Parsing and extraction of data |
|
305
|
0
|
0
|
|
|
|
|
if ($html =~ /([^<]+)<\/title>/i) { # extract title |
|
306
|
0
|
|
|
|
|
|
my $tmp = $1; |
|
307
|
|
|
|
|
|
|
# $tmp =~ s/\s+/ /g; #needed AA0? |
|
308
|
|
|
|
|
|
|
# $tmp = HTML::Entities::decode_entities($tmp); |
|
309
|
0
|
|
|
|
|
|
$xwi->title($tmp); |
|
310
|
|
|
|
|
|
|
} |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
#Extract META tags |
|
313
|
0
|
|
|
|
|
|
while ( $html =~ m//sgi ) { |
|
314
|
0
|
|
|
|
|
|
my $tag = $1; |
|
315
|
0
|
|
|
|
|
|
my $key=''; |
|
316
|
0
|
|
|
|
|
|
my $val=''; |
|
317
|
0
|
|
|
|
|
|
$tag =~ s/[\n\r]/ /g; |
|
318
|
0
|
|
|
|
|
|
foreach my $attr ('name','content') { |
|
319
|
0
|
|
|
|
|
|
my $str=''; |
|
320
|
|
|
|
|
|
|
|
|
321
|
0
|
0
|
|
|
|
|
if ($tag =~ /$attr\s*=\s*[\"]/i) { |
|
|
|
0
|
|
|
|
|
|
|
322
|
0
|
0
|
|
|
|
|
if ($tag =~ s/$attr\s*=\s*\"([^\"]+?)\"//i) { |
|
323
|
0
|
|
|
|
|
|
$str = $1; |
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
} elsif ($tag =~ /$attr\s*=\s*[\']/i) { |
|
326
|
0
|
0
|
|
|
|
|
if ($tag =~ s/$attr\s*=\s*\'([^\']+?)\'//i) { |
|
327
|
0
|
|
|
|
|
|
$str = $1; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
} else { |
|
330
|
0
|
0
|
|
|
|
|
if ($tag =~ s/$attr\s*=\s*([^\s]+?)\s//i) { |
|
331
|
0
|
|
|
|
|
|
$str = $1; |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
} |
|
334
|
0
|
0
|
|
|
|
|
next if($str =~ /^$/); |
|
335
|
0
|
0
|
|
|
|
|
if ($attr =~ /name/i) { |
|
|
|
0
|
|
|
|
|
|
|
336
|
0
|
|
|
|
|
|
$key=lc($str); |
|
337
|
|
|
|
|
|
|
} elsif ($attr =~ /content/i) { |
|
338
|
0
|
|
|
|
|
|
$val=$str; |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
} |
|
341
|
0
|
0
|
0
|
|
|
|
next if(($key =~ /^$/) || ($val =~ /^$/)); |
|
342
|
|
|
|
|
|
|
# $xwi->meta_add($key,HTML::Entities::decode_entities($val)); |
|
343
|
0
|
|
|
|
|
|
$xwi->meta_add($key,$val); |
|
344
|
|
|
|
|
|
|
} |
|
345
|
|
|
|
|
|
|
#END extract META tags |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=begin comment |
|
348
|
|
|
|
|
|
|
This feature is temporarily disabled |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
my $summary = ""; |
|
351
|
|
|
|
|
|
|
$xwi->meta_rewind; |
|
352
|
|
|
|
|
|
|
my ($name,$content); |
|
353
|
|
|
|
|
|
|
while(1) { |
|
354
|
|
|
|
|
|
|
($name,$content) = $xwi->meta_get; |
|
355
|
|
|
|
|
|
|
if (!defined($name)) { last; } |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
#If abstract, description or DC.Description is not a list of keywords: add it to summary |
|
358
|
|
|
|
|
|
|
if ( $name eq 'description' || $name eq 'dc.description' || $name eq 'abstract' ) { |
|
359
|
|
|
|
|
|
|
my @kom = split(', ',$content); |
|
360
|
|
|
|
|
|
|
my @dot = split(' ',$content); |
|
361
|
|
|
|
|
|
|
if ( $#kom < $#dot ) { #If several meta-fields check if they overlap or are the same## |
|
362
|
|
|
|
|
|
|
$summary .= $content . ' '; |
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
} |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
#Generate Summary |
|
368
|
|
|
|
|
|
|
my $sumlength = Combine::Config::Get('SummaryLength'); |
|
369
|
|
|
|
|
|
|
# print "SUM1: $summary\nHTML: $html\n"; |
|
370
|
|
|
|
|
|
|
if ( $sumlength > 0 ) { |
|
371
|
|
|
|
|
|
|
if ( ($sumlength - length($summary)) > 0 ) { |
|
372
|
|
|
|
|
|
|
require HTML::Summary; |
|
373
|
|
|
|
|
|
|
require HTML::TreeBuilder; |
|
374
|
|
|
|
|
|
|
my $html_summarizer = new HTML::Summary( LENGTH => $sumlength - length($summary), USE_META => 0 ); |
|
375
|
|
|
|
|
|
|
my $tree = new HTML::TreeBuilder; |
|
376
|
|
|
|
|
|
|
$tree->parse( Encode::encode('latin1',$html) ); |
|
377
|
|
|
|
|
|
|
# $tree->parse( $html ); |
|
378
|
|
|
|
|
|
|
$tree->eof(); |
|
379
|
|
|
|
|
|
|
## $summary .= $html_summarizer->generate ( $tree ); |
|
380
|
|
|
|
|
|
|
my $t .= Encode::decode('latin1',$html_summarizer->generate ( $tree )); |
|
381
|
|
|
|
|
|
|
$tree = $tree->delete; |
|
382
|
|
|
|
|
|
|
$summary .= $t; |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
if (length($summary)>2) { |
|
385
|
|
|
|
|
|
|
# $summary =~ s/[^\w\s,\.\!\?:;\'\"]//gs; |
|
386
|
|
|
|
|
|
|
$summary =~ s/[^\p{IsAlnum}\s,\.\!\?:;\'\"]//gs; |
|
387
|
|
|
|
|
|
|
$summary =~ s/[\s\240]+/ /g; |
|
388
|
|
|
|
|
|
|
$xwi->meta_add("Rsummary",$summary); |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=end comment |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=cut |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
# extract links |
|
397
|
1
|
|
|
1
|
|
734
|
use Combine::HTMLExtractor; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
608
|
|
|
398
|
0
|
|
|
|
|
|
my ($alt, $linktext, $linkurl, $base); |
|
399
|
0
|
|
|
|
|
|
$base = $xwi->base; #Set by UA.pm |
|
400
|
0
|
|
|
|
|
|
my $lx = new Combine::HTMLExtractor(undef,undef,1); |
|
401
|
|
|
|
|
|
|
# print "INPUT: $html\n"; |
|
402
|
|
|
|
|
|
|
# $html = HTML::Entities::decode_entities( Encode::encode('latin1',$html) ); |
|
403
|
0
|
|
|
|
|
|
$html = HTML::Entities::decode_entities( $html ); |
|
404
|
0
|
|
|
|
|
|
$lx->parse(\$html); |
|
405
|
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
|
my %Tags = ( a => 1, area => 1, frame => 1, img => 1, headings => 1, text => 1 ); |
|
407
|
0
|
|
|
|
|
|
for my $link ( @{$lx->links} ) { |
|
|
0
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
# print "GOTLINK: $$link{tag} = $$link{_TEXT}\n"; |
|
409
|
0
|
0
|
|
|
|
|
next unless exists($Tags{$$link{tag}}); |
|
410
|
0
|
0
|
|
|
|
|
my $linktext = $$link{_TEXT} ? $$link{_TEXT} : ''; |
|
411
|
0
|
0
|
0
|
|
|
|
if ( ($$link{tag} eq 'headings') ) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
412
|
0
|
0
|
|
|
|
|
if ( $linktext !~ /^\s*$/ ) { |
|
413
|
0
|
|
|
|
|
|
$linktext =~ s/^[\s;]+//; |
|
414
|
0
|
|
|
|
|
|
$linktext =~ s/[\s;]+$//; |
|
415
|
|
|
|
|
|
|
# $xwi->heading_add(Encode::decode('latin1',$linktext)); |
|
416
|
0
|
|
|
|
|
|
$xwi->heading_add($linktext); |
|
417
|
|
|
|
|
|
|
} |
|
418
|
0
|
|
|
|
|
|
next; |
|
419
|
|
|
|
|
|
|
} elsif ( ($$link{tag} eq 'text') ) { |
|
420
|
0
|
0
|
|
|
|
|
if (!defined($rtext)) { |
|
421
|
|
|
|
|
|
|
# $linktext = Encode::decode('latin1',$linktext); |
|
422
|
0
|
|
|
|
|
|
$linktext =~ s/[\s\240]+/ /g; # compress whitespace?? |
|
423
|
0
|
|
|
|
|
|
$xwi->text(\$linktext); |
|
424
|
|
|
|
|
|
|
#print "HT=$linktext\n"; |
|
425
|
|
|
|
|
|
|
} |
|
426
|
0
|
|
|
|
|
|
next; |
|
427
|
|
|
|
|
|
|
} elsif ( ($$link{tag} eq 'frame') || ($$link{tag} eq 'img') ) { |
|
428
|
0
|
|
|
|
|
|
$linkurl = $$link{src}; |
|
429
|
0
|
|
0
|
|
|
|
$linktext .= $$link{alt} || ''; |
|
430
|
|
|
|
|
|
|
} else { |
|
431
|
0
|
|
|
|
|
|
$linkurl = $$link{href}; |
|
432
|
|
|
|
|
|
|
} |
|
433
|
0
|
|
|
|
|
|
$linktext =~ s/\[IMG\]//g; |
|
434
|
0
|
0
|
|
|
|
|
if ( $linkurl !~ /^#/ ) { # Throw away links within a document |
|
435
|
0
|
|
|
|
|
|
$linkurl =~ s/\?\s+/?/; #to be handled in normalize?? |
|
436
|
0
|
|
|
|
|
|
my $urlstr = URI->new_abs($linkurl, $base)->canonical->as_string; |
|
437
|
|
|
|
|
|
|
# $xwi->link_add($urlstr, 0, 0, Encode::decode('latin1',$linktext), $$link{tag}); |
|
438
|
0
|
|
|
|
|
|
$xwi->link_add($urlstr, 0, 0, $linktext, $$link{tag}); |
|
439
|
|
|
|
|
|
|
# print "ADD: $$link{tag}; $urlstr; |$linktext|\n"; |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
} |
|
442
|
|
|
|
|
|
|
|
|
443
|
0
|
|
|
|
|
|
return $xwi; |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
1; |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
__END__ |