line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package MMM::Text::Search; |
3
|
1
|
|
|
1
|
|
1429
|
use File::Copy; |
|
1
|
|
|
|
|
9474
|
|
|
1
|
|
|
|
|
67
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
#$Id: Search.pm,v 1.50 2004/12/13 18:45:15 maxim Exp $ |
6
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
7
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $verbose_flag ); |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
126
|
|
8
|
|
|
|
|
|
|
require Exporter; |
9
|
|
|
|
|
|
|
require AutoLoader; |
10
|
|
|
|
|
|
|
@ISA = qw(Exporter AutoLoader); |
11
|
|
|
|
|
|
|
@EXPORT = qw( |
12
|
|
|
|
|
|
|
); |
13
|
|
|
|
|
|
|
$VERSION = '0.07'; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# |
16
|
|
|
|
|
|
|
# Perl module for indexing and searching text files and web pages. |
17
|
|
|
|
|
|
|
# (Max Muzi, Apr-Sep 1999) |
18
|
|
|
|
|
|
|
# |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# Note on implementation: |
21
|
|
|
|
|
|
|
# The technique used for indexing is substantially derived from that |
22
|
|
|
|
|
|
|
# exposed by Tim Kientzle on Dr. Dobbs magazine. (Actually IndexWords() |
23
|
|
|
|
|
|
|
# has been cut'n'pasted from his scripts.) |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# |
26
|
|
|
|
|
|
|
|
27
|
1
|
|
|
1
|
|
1551
|
use DB_File; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use Fcntl; |
29
|
|
|
|
|
|
|
require 5.005; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
$verbose_flag = 0; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $debug_flag = 0; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $errstr = undef; |
36
|
|
|
|
|
|
|
my $syntax_error = undef; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub errstr { $errstr }; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub new { # constructor! (see the docs for usage [sorry, there're no docs ]) |
41
|
|
|
|
|
|
|
my $pkg = shift; |
42
|
|
|
|
|
|
|
my $arg = shift; |
43
|
|
|
|
|
|
|
my $opt = undef; |
44
|
|
|
|
|
|
|
if (ref($arg) ne "HASH") { |
45
|
|
|
|
|
|
|
if (-f $arg) { |
46
|
|
|
|
|
|
|
$opt->{IndexDB} = $arg; |
47
|
|
|
|
|
|
|
$opt->{Verbose} = shift; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
else { |
50
|
|
|
|
|
|
|
die "usage: \$obj = new MMM::Text::Search ( '/index/path' or \$hashref)\n" |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
} else { |
53
|
|
|
|
|
|
|
$opt = $arg; |
54
|
|
|
|
|
|
|
}; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
$verbose_flag = $opt->{Debug} || $opt->{Verbose} ; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my $indexdbpath = $opt->{IndexDB} || $opt->{IndexPath} ; |
59
|
|
|
|
|
|
|
my $filemask = $opt->{FileMask} ; |
60
|
|
|
|
|
|
|
my $dirs = ( ref($opt->{Dirs}) eq "ARRAY" ) ? $opt->{Dirs} : [ ]; |
61
|
|
|
|
|
|
|
my $followsymlinks = defined $opt->{FollowSymLinks}; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
my $opturls = $opt->{Urls} || $opt->{URLs}; |
64
|
|
|
|
|
|
|
my $urls = ( ref($opturls) eq "ARRAY" ) ? $opturls : [ ]; |
65
|
|
|
|
|
|
|
my $level = int $opt->{Level}; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
my $locationsdbpath = $indexdbpath; |
68
|
|
|
|
|
|
|
$locationsdbpath =~ s/(\.db)*$/\-locations.db/; |
69
|
|
|
|
|
|
|
my $titlesdbpath = $indexdbpath; |
70
|
|
|
|
|
|
|
$titlesdbpath =~ s/(\.db)*$/\-titles.db/; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
my $minwordsize = $opt->{MinWordSize} || 1; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
my $self = { |
76
|
|
|
|
|
|
|
indexdbpath => $indexdbpath, |
77
|
|
|
|
|
|
|
locationsdbpath => $locationsdbpath, |
78
|
|
|
|
|
|
|
titlesdbpath => $titlesdbpath, |
79
|
|
|
|
|
|
|
filemask => length($filemask) ? qr/$filemask/ : undef, |
80
|
|
|
|
|
|
|
dirs => $dirs, |
81
|
|
|
|
|
|
|
followsymlinks => $followsymlinks, |
82
|
|
|
|
|
|
|
minwordsize => $minwordsize, |
83
|
|
|
|
|
|
|
ignorelimit => $opt->{IgnoreLimit} || (2/3), |
84
|
|
|
|
|
|
|
urls => $urls, |
85
|
|
|
|
|
|
|
level => $level, |
86
|
|
|
|
|
|
|
url_exclude => $opt->{UrlExludeMask} || "(?i).*\.(zip|exe|gz|arj|bin|hqx)", |
87
|
|
|
|
|
|
|
file_reader => $opt->{FileReader}, |
88
|
|
|
|
|
|
|
use_inode => $opt->{UseInodeAsKey}, |
89
|
|
|
|
|
|
|
no_reset => $opt->{UseInodeAsKey} && $opt->{NoReset} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
}; |
92
|
|
|
|
|
|
|
DEBUG("filemask=$filemask, indexfile=$indexdbpath, ignorelimit=$self->{ignorelimit}\n"); |
93
|
|
|
|
|
|
|
DEBUG("dirs = [", join(",", @$dirs),"], "); |
94
|
|
|
|
|
|
|
DEBUG("urls = [", join(",", @$urls),"] \n"); |
95
|
|
|
|
|
|
|
bless($self, $pkg); |
96
|
|
|
|
|
|
|
return $self; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub _add_keys_to_match_hash { |
100
|
|
|
|
|
|
|
# extract file-codes from $keys and update corresponding $hash elements (score) |
101
|
|
|
|
|
|
|
my ($keys, $hash) = @_; |
102
|
|
|
|
|
|
|
my $key; |
103
|
|
|
|
|
|
|
foreach $key ( unpack("N*",$keys) ) { |
104
|
|
|
|
|
|
|
# DEBUG($key, " "); |
105
|
|
|
|
|
|
|
# ignored words (stop-words) only include file-id 0 (see FlushCache() below) |
106
|
|
|
|
|
|
|
return 0 if $key == 0 ; |
107
|
|
|
|
|
|
|
$hash->{$key}++ |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
return 1; |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub _push_words_from_hash { |
113
|
|
|
|
|
|
|
my ($hash,$array, $regexp) = @_; |
114
|
|
|
|
|
|
|
my $w; |
115
|
|
|
|
|
|
|
for $w(keys %$hash) { |
116
|
|
|
|
|
|
|
push @$array,$w if $w =~ $regexp; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
#notes on advanced_query(); |
123
|
|
|
|
|
|
|
# - queries containing stop-words may yields bizzare results.. |
124
|
|
|
|
|
|
|
# - score is not always correct |
125
|
|
|
|
|
|
|
# - error handling should be improved... :-) |
126
|
|
|
|
|
|
|
sub advanced_query { |
127
|
|
|
|
|
|
|
# perform queries such as "( a and ( b or c ) ) and ( d and e) " |
128
|
|
|
|
|
|
|
my $self = shift; |
129
|
|
|
|
|
|
|
my $expr = shift; |
130
|
|
|
|
|
|
|
my $indexdbpath= $self->{indexdbpath}; |
131
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
132
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
133
|
|
|
|
|
|
|
my %indexdb; |
134
|
|
|
|
|
|
|
my %locationsdb; |
135
|
|
|
|
|
|
|
my %titlesdb; |
136
|
|
|
|
|
|
|
return undef unless (-f $indexdbpath && -r _); |
137
|
|
|
|
|
|
|
return undef unless (-f $locationsdbpath && -r _); |
138
|
|
|
|
|
|
|
return undef unless (-f $titlesdbpath && -r _); |
139
|
|
|
|
|
|
|
return undef unless |
140
|
|
|
|
|
|
|
tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && |
141
|
|
|
|
|
|
|
tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && |
142
|
|
|
|
|
|
|
tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); |
143
|
|
|
|
|
|
|
my @ignored = (); |
144
|
|
|
|
|
|
|
my @words = (); |
145
|
|
|
|
|
|
|
my $verbose_flag_tmp = $verbose_flag; |
146
|
|
|
|
|
|
|
$verbose_flag = shift; # undocumented debug switch |
147
|
|
|
|
|
|
|
chomp $expr; |
148
|
|
|
|
|
|
|
undef $syntax_error; #reset error |
149
|
|
|
|
|
|
|
DEBUG("********** _match_expression() debug **********\n"); |
150
|
|
|
|
|
|
|
my $match = _match_expression($expr, \%indexdb, \@ignored); |
151
|
|
|
|
|
|
|
DEBUG("********** end debug **********\n"); |
152
|
|
|
|
|
|
|
if ($syntax_error) { |
153
|
|
|
|
|
|
|
$errstr = $syntax_error; |
154
|
|
|
|
|
|
|
$verbose_flag = $verbose_flag_tmp; |
155
|
|
|
|
|
|
|
return undef; |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
my $result = _make_result_hash($match,\%locationsdb, \%titlesdb, \@words, \@ignored); |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
untie(%indexdb); |
160
|
|
|
|
|
|
|
untie(%locationsdb); |
161
|
|
|
|
|
|
|
untie(%titlesdb); |
162
|
|
|
|
|
|
|
$verbose_flag = $verbose_flag_tmp; |
163
|
|
|
|
|
|
|
return $result; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub _match_expression { # recursively apply a keyword-search expression to indexdb |
167
|
|
|
|
|
|
|
# $expr may be either a string or a ref to an array of tokens |
168
|
|
|
|
|
|
|
# a ref to a "score" hash is returned (or undef sometimes) |
169
|
|
|
|
|
|
|
my ($expr, $index, $ignored) = @_; |
170
|
|
|
|
|
|
|
my $parsed = _parse_expression($expr); |
171
|
|
|
|
|
|
|
# _parse_expression() returns a reference to an array of three elements: |
172
|
|
|
|
|
|
|
# [ operator, left_expr, right_expr] |
173
|
|
|
|
|
|
|
# if right_expr is not defined then expr was atomic and left_expr is a string, |
174
|
|
|
|
|
|
|
# otherwise both right_expr and left_expr are references to arrays of tokens |
175
|
|
|
|
|
|
|
if ( not $parsed) { |
176
|
|
|
|
|
|
|
DEBUG("Syntax error :-( \n"); |
177
|
|
|
|
|
|
|
return undef; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
my ( $op, $left,$right) = @$parsed; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
if ($left && not $right) { |
182
|
|
|
|
|
|
|
$left =~ s/^\s*\(?\s*|\s*\)?\s*$//g; |
183
|
|
|
|
|
|
|
DEBUG("Looking up >$left<\n"); |
184
|
|
|
|
|
|
|
my %matches = (); |
185
|
|
|
|
|
|
|
my $word = $left; |
186
|
|
|
|
|
|
|
my $rc = 0; |
187
|
|
|
|
|
|
|
my $keys = $index->{lc $word}; # get file-id's from indexdb |
188
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%matches); |
189
|
|
|
|
|
|
|
# if $rc is false then $word is a stop-word, see _add_keys_to_match_hash() for more info |
190
|
|
|
|
|
|
|
if (not $rc) { |
191
|
|
|
|
|
|
|
DEBUG("$word ignored\n"); |
192
|
|
|
|
|
|
|
push @$ignored, $word; |
193
|
|
|
|
|
|
|
return undef; |
194
|
|
|
|
|
|
|
# what should we do now? gotta think it over... |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
return \%matches; |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
DEBUG("Evaluating >$left< --$op-- >$right<\n"); |
200
|
|
|
|
|
|
|
my $left_match = _match_expression($left, $index, $ignored); |
201
|
|
|
|
|
|
|
my $right_match = _match_expression($right, $index, $ignored); |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
return undef if ($syntax_error); |
204
|
|
|
|
|
|
|
my %matches = (); |
205
|
|
|
|
|
|
|
my $file = undef; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
if ($op eq 'AND' ) { |
208
|
|
|
|
|
|
|
%matches = ( %$left_match ); |
209
|
|
|
|
|
|
|
for $file( keys %matches) { |
210
|
|
|
|
|
|
|
delete $matches{$file} unless $right_match->{$file} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
return \%matches; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
if ($op eq 'AND NOT') { |
215
|
|
|
|
|
|
|
%matches = ( %$left_match ); |
216
|
|
|
|
|
|
|
for $file( keys %matches) { |
217
|
|
|
|
|
|
|
delete $matches{$file} if $right_match->{$file} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
return \%matches; |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
if ($op eq 'OR') { |
222
|
|
|
|
|
|
|
%matches = ( %$left_match ); |
223
|
|
|
|
|
|
|
for $file( keys %$right_match) { |
224
|
|
|
|
|
|
|
if ($matches{$file}) { |
225
|
|
|
|
|
|
|
$matches{$file} +=$right_match->{$file}; |
226
|
|
|
|
|
|
|
} else { |
227
|
|
|
|
|
|
|
$matches{$file} =$right_match->{$file}; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
return \%matches; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
return undef; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub _parse_expression { |
236
|
|
|
|
|
|
|
my $arg = shift; |
237
|
|
|
|
|
|
|
my $tokens = undef; # this is an arry ref |
238
|
|
|
|
|
|
|
if (ref($arg) ne 'ARRAY') { |
239
|
|
|
|
|
|
|
$tokens = [ |
240
|
|
|
|
|
|
|
$arg =~ m/( \( | \)| \bAND\s+NOT\b | \bAND\b | \bOR\b | \"[^\"]+\" | \b\w+\b) /xig |
241
|
|
|
|
|
|
|
]; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
# important!! "AND NOT" is treated as a single logical operator... |
244
|
|
|
|
|
|
|
# this means that things like "not a and b" aren't well-formed, |
245
|
|
|
|
|
|
|
# while "b and not a" is |
246
|
|
|
|
|
|
|
else { $tokens = $arg; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
my $left = undef; # array ref (oppure stringa se è un espressione atomica) |
249
|
|
|
|
|
|
|
my $right = undef; # array ref ! |
250
|
|
|
|
|
|
|
my $op = 'OR'; |
251
|
|
|
|
|
|
|
my $depth = 0; |
252
|
|
|
|
|
|
|
my $pos = 0; |
253
|
|
|
|
|
|
|
my $tok; |
254
|
|
|
|
|
|
|
my $len = int @$tokens; |
255
|
|
|
|
|
|
|
DEBUG("expr = ", join(" + ", @$tokens),"\n"); |
256
|
|
|
|
|
|
|
while (1) { |
257
|
|
|
|
|
|
|
if ($len == 1) { |
258
|
|
|
|
|
|
|
return [ undef, $tokens->[0], undef ]; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
DEBUG("$tok : depth=$depth pos=$pos len=$len\n"); |
261
|
|
|
|
|
|
|
if ($depth == 0 && ($pos == $len) ) { |
262
|
|
|
|
|
|
|
if ($tokens->[0] eq '(' && $tokens->[$len-1] eq ')') { |
263
|
|
|
|
|
|
|
# take off outer parentheses... |
264
|
|
|
|
|
|
|
shift @$tokens; |
265
|
|
|
|
|
|
|
pop @$tokens; |
266
|
|
|
|
|
|
|
$len -= 2; |
267
|
|
|
|
|
|
|
$pos = 0; |
268
|
|
|
|
|
|
|
$depth = 0; |
269
|
|
|
|
|
|
|
DEBUG("expr = ", join(" + ", @$tokens),"\n"); |
270
|
|
|
|
|
|
|
} else { # ahhhh... this expression won't be parsed... |
271
|
|
|
|
|
|
|
$syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; |
272
|
|
|
|
|
|
|
DEBUG("atom not atomic\n"); |
273
|
|
|
|
|
|
|
return undef; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
} elsif ( $pos == $len ) { |
277
|
|
|
|
|
|
|
$syntax_error = "Non-matching parentheses (\"".join(' ', @$tokens)."\")"; |
278
|
|
|
|
|
|
|
DEBUG("non matching parentheses\n"); |
279
|
|
|
|
|
|
|
return undef; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
$tok = $tokens->[$pos++]; |
282
|
|
|
|
|
|
|
if ($tok eq '(') { $depth++; next; } |
283
|
|
|
|
|
|
|
if ($tok eq ')') { $depth--; next; } |
284
|
|
|
|
|
|
|
next if $depth; |
285
|
|
|
|
|
|
|
if ($tok =~ /\b(AND\s+NOT|AND|OR)\b/i) { |
286
|
|
|
|
|
|
|
if ($pos == 1 || $pos == $len) { |
287
|
|
|
|
|
|
|
$syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; |
288
|
|
|
|
|
|
|
return undef |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
$op = uc $1; $op =~ s/\s+/ /g; |
291
|
|
|
|
|
|
|
$left = [ @$tokens[0..$pos-2] ]; |
292
|
|
|
|
|
|
|
$right = [ @$tokens[$pos..$len-1] ]; |
293
|
|
|
|
|
|
|
DEBUG("right = ", join(" + ", @$right),"\n"); |
294
|
|
|
|
|
|
|
DEBUG("left = ", join(" + ", @$left),"\n"); |
295
|
|
|
|
|
|
|
return [ $op, $left, $right ]; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
} |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
sub query { # simple query.... altavista +/- prefixes are recognized... |
303
|
|
|
|
|
|
|
# */? globbing also works but |
304
|
|
|
|
|
|
|
# slows query down significantly |
305
|
|
|
|
|
|
|
# globbing implicitly discards +/- prefix (it's a BUG!!!) |
306
|
|
|
|
|
|
|
my $self = shift; |
307
|
|
|
|
|
|
|
my $indexdbpath= $self->{indexdbpath}; |
308
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
309
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
310
|
|
|
|
|
|
|
my %indexdb; |
311
|
|
|
|
|
|
|
my %locationsdb; |
312
|
|
|
|
|
|
|
my %titlesdb; |
313
|
|
|
|
|
|
|
return undef unless (-f $indexdbpath && -r _); |
314
|
|
|
|
|
|
|
return undef unless (-f $locationsdbpath && -r _); |
315
|
|
|
|
|
|
|
return undef unless (-f $titlesdbpath && -r _); |
316
|
|
|
|
|
|
|
return undef unless |
317
|
|
|
|
|
|
|
tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && |
318
|
|
|
|
|
|
|
tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && |
319
|
|
|
|
|
|
|
tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); |
320
|
|
|
|
|
|
|
my %matches; |
321
|
|
|
|
|
|
|
my %limit; |
322
|
|
|
|
|
|
|
my %exclude; |
323
|
|
|
|
|
|
|
my @ignored; |
324
|
|
|
|
|
|
|
my $key; |
325
|
|
|
|
|
|
|
my $word; |
326
|
|
|
|
|
|
|
my $mustbe_words = 0; |
327
|
|
|
|
|
|
|
my @words = (); |
328
|
|
|
|
|
|
|
my $glob_regexp = undef; |
329
|
|
|
|
|
|
|
for (@_) { # globbing feature... e.g. uni* passw? |
330
|
|
|
|
|
|
|
if ( /\*|\?/) { |
331
|
|
|
|
|
|
|
s/\*/\.\*/g; |
332
|
|
|
|
|
|
|
s/\?/\./g; |
333
|
|
|
|
|
|
|
$glob_regexp = $glob_regexp ? $glob_regexp."|^$_\$" : "^$_\$" ; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
else { |
336
|
|
|
|
|
|
|
push @words, $_; |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
if ($glob_regexp) { |
340
|
|
|
|
|
|
|
my $regexp = qr/$glob_regexp/; |
341
|
|
|
|
|
|
|
# collect all words in db matching $glob_regexp and append them to the query |
342
|
|
|
|
|
|
|
_push_words_from_hash(\%indexdb, \@words, $regexp); |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
DEBUG("looking up ", join(", ", @words ), "\n"); |
346
|
|
|
|
|
|
|
foreach $word (@words) { |
347
|
|
|
|
|
|
|
my $rc = 0; |
348
|
|
|
|
|
|
|
# DEBUG($word); |
349
|
|
|
|
|
|
|
if ($word =~ /^-(.*)/) { |
350
|
|
|
|
|
|
|
my $keys = $indexdb{lc $1}; |
351
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%exclude); |
352
|
|
|
|
|
|
|
} elsif ($word =~ /^\+(.*)/) { |
353
|
|
|
|
|
|
|
$mustbe_words++; |
354
|
|
|
|
|
|
|
my $keys = $indexdb{lc $1}; |
355
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%limit); |
356
|
|
|
|
|
|
|
} else { |
357
|
|
|
|
|
|
|
my $keys = $indexdb{lc $word}; |
358
|
|
|
|
|
|
|
$rc = _add_keys_to_match_hash($keys,\%matches); |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
# DEBUG("\n"); |
361
|
|
|
|
|
|
|
if (not $rc) { push @ignored, $word } |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
if ($mustbe_words) { |
365
|
|
|
|
|
|
|
for $key(keys %limit) { |
366
|
|
|
|
|
|
|
next unless $limit{$key} >= $mustbe_words; |
367
|
|
|
|
|
|
|
$matches{$key} += $limit{$key} ; |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
for $key(keys %matches) { |
370
|
|
|
|
|
|
|
delete $matches{$key} unless $limit{$key}; |
371
|
|
|
|
|
|
|
} |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
for $key(keys %exclude) { |
374
|
|
|
|
|
|
|
delete $matches{$key}; |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
my $result = _make_result_hash(\%matches,\%locationsdb, \%titlesdb, \@words, \@ignored); |
377
|
|
|
|
|
|
|
untie(%indexdb); |
378
|
|
|
|
|
|
|
untie(%locationsdb); |
379
|
|
|
|
|
|
|
untie(%titlesdb); |
380
|
|
|
|
|
|
|
return $result; |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
sub _make_result_hash { |
385
|
|
|
|
|
|
|
# hash-ref hash-ref hash-ref array-ref array-ref |
386
|
|
|
|
|
|
|
my ( $match, $locationsdb, $titlesdb, $words, $ignored ) = @_; |
387
|
|
|
|
|
|
|
my $result = { |
388
|
|
|
|
|
|
|
searched => $words, |
389
|
|
|
|
|
|
|
ignored => $ignored, |
390
|
|
|
|
|
|
|
entries => [] |
391
|
|
|
|
|
|
|
}; |
392
|
|
|
|
|
|
|
my $key; |
393
|
|
|
|
|
|
|
foreach $key (keys %$match) { |
394
|
|
|
|
|
|
|
my $ckey = pack("xN",$key); |
395
|
|
|
|
|
|
|
my $name = $locationsdb->{$ckey}; |
396
|
|
|
|
|
|
|
my $title = $titlesdb->{$ckey}; |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
push @{ $result->{entries} }, { |
399
|
|
|
|
|
|
|
location => $name, |
400
|
|
|
|
|
|
|
score => $match->{$key}, |
401
|
|
|
|
|
|
|
title => $title |
402
|
|
|
|
|
|
|
}; |
403
|
|
|
|
|
|
|
DEBUG("$name: $match->{$key}\n"); |
404
|
|
|
|
|
|
|
} |
405
|
|
|
|
|
|
|
return $result; |
406
|
|
|
|
|
|
|
} |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub DEBUG (@) { $verbose_flag && print STDERR @_ }; |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub tie_hash { |
417
|
|
|
|
|
|
|
my ($hashref, $file ,$perm) = @_; |
418
|
|
|
|
|
|
|
$perm = (O_RDWR|O_CREAT) unless defined $perm; |
419
|
|
|
|
|
|
|
my $rc = tied(%$hashref); |
420
|
|
|
|
|
|
|
return $rc if $rc; |
421
|
|
|
|
|
|
|
$rc = tie(%$hashref,'DB_File',$file, $perm, 0644, $DB_File::DB_BTREE) ; |
422
|
|
|
|
|
|
|
if ($debug_flag) { |
423
|
|
|
|
|
|
|
my $count = int keys %$hashref; |
424
|
|
|
|
|
|
|
DEBUG("tie $hashref ($rc) ($count keys)\n"); |
425
|
|
|
|
|
|
|
} elsif ($verbose_flag) { |
426
|
|
|
|
|
|
|
DEBUG("tie $hashref ($rc)\n"); |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
return $rc; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
sub untie_hash { |
434
|
|
|
|
|
|
|
my ($hashref, $file ) = @_; |
435
|
|
|
|
|
|
|
if ($debug_flag) { |
436
|
|
|
|
|
|
|
my $count = int keys %$hashref; |
437
|
|
|
|
|
|
|
DEBUG("untie $hashref ($count keys)\n") |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
untie(%$hashref); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
1; |
444
|
|
|
|
|
|
|
#__END__ |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=head1 NAME |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
MMM::Text::Search - Perl module for indexing and searching text files and web objects |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head1 SYNOPSIS |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
use MMM::Text::Search; |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
my $srch = new MMM::Text::Search { #for indexing... |
455
|
|
|
|
|
|
|
#index main file location... |
456
|
|
|
|
|
|
|
IndexPath => "/tmp/myindex.db", |
457
|
|
|
|
|
|
|
#local files... (optional) |
458
|
|
|
|
|
|
|
FileMask => '(?i)(\.txt|\.htm.?)$', |
459
|
|
|
|
|
|
|
Dirs => [ "/usr/doc", "/tmp" ] , |
460
|
|
|
|
|
|
|
FollowSymLinks => 0|1, (default = 0) |
461
|
|
|
|
|
|
|
#web objects... (optional) |
462
|
|
|
|
|
|
|
URLs => [ "http://localhost/", ... ], |
463
|
|
|
|
|
|
|
Level => recursion-level (0=unlimited) |
464
|
|
|
|
|
|
|
#common options... |
465
|
|
|
|
|
|
|
IgnoreLimit => 0.3, (default = 2/3) |
466
|
|
|
|
|
|
|
Verbose => 0|1 |
467
|
|
|
|
|
|
|
}; |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
$srch->start_indexing_session(); |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
$srch->commit_indexing_session(); |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
$srch->index_default_locations(); |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
$srch->index_content( { title => '...', |
476
|
|
|
|
|
|
|
content=> '...', |
477
|
|
|
|
|
|
|
id => '...' } ); |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
$srch->makeindex; |
480
|
|
|
|
|
|
|
(Obsolete.) |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
my $srch = new MMM::Text::Search ( #for searching.... |
487
|
|
|
|
|
|
|
"/tmp/myindex.db", verbose_flag ); |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
my $hashref = $srch->query("pizza","ciao", "-pasta" ); |
490
|
|
|
|
|
|
|
my $hashref = $srch->advanced_query("(pizza OR ciao) AND NOT pasta"); |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
$srch->errstr() # returns last error |
493
|
|
|
|
|
|
|
# (only query syntax-errors for the moment being) |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
$srch->dump_word_stats(\*FH) |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=head1 DESCRIPTION |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item * |
502
|
|
|
|
|
|
|
Indexing |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
When a session is closed the following files will have been created |
505
|
|
|
|
|
|
|
(assuming IndexPath = /path/myindex.db, see constructor): |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
/path/myindex.db word index database |
509
|
|
|
|
|
|
|
/path/myindex-locations.db filename/URL database |
510
|
|
|
|
|
|
|
/path/myindex-titles.db html title database |
511
|
|
|
|
|
|
|
/path/myindex.stopwords stop-words list |
512
|
|
|
|
|
|
|
/path/myindex.filelist readable list of indexed files/URLs |
513
|
|
|
|
|
|
|
/path/myindex.deadlinks broken http links |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
[... lots of important things missing ... ] |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
start_indexing_session() starts session. |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
commit_indexing_session() commits and closes current session. |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
index_default_locations() indexes all files and URLs specified on construction. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
index_content() pushes content into indexing engine. |
524
|
|
|
|
|
|
|
Argument must have the following structure |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
{ title => '...', content=> '...', id => '...' } |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
makeindex() is obsolete. |
530
|
|
|
|
|
|
|
Equivalent to: |
531
|
|
|
|
|
|
|
$srch->start_indexing_session(); |
532
|
|
|
|
|
|
|
$srch->index_default_locations(); |
533
|
|
|
|
|
|
|
$srch->commit_indexing_session(); |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
dump_word_stats(\*FH) dumps all words sorted by occurence frequency using |
539
|
|
|
|
|
|
|
FH file handle (or STDOUT if no parameter is specified). Stop-words get a |
540
|
|
|
|
|
|
|
frequency value of 1. |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=item * |
543
|
|
|
|
|
|
|
Searching |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
Both query() and advanced_query() return a reference to a hash with |
546
|
|
|
|
|
|
|
the following structure: |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
( |
549
|
|
|
|
|
|
|
ignored => [ string, string, ... ], # ignored words |
550
|
|
|
|
|
|
|
searched => [ string, string, ... ], # words searched for |
551
|
|
|
|
|
|
|
entries => [ hashref, hashref, ... ] # list of records |
552
|
|
|
|
|
|
|
# found |
553
|
|
|
|
|
|
|
) |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
The 'entries' element is a reference to an array of hashes, each having |
556
|
|
|
|
|
|
|
the following structure: |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
( |
559
|
|
|
|
|
|
|
location => string, # file path or URL or anything |
560
|
|
|
|
|
|
|
score => number, # score |
561
|
|
|
|
|
|
|
title => string # HTML title |
562
|
|
|
|
|
|
|
) |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 NOTES |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
Note on implementation: |
567
|
|
|
|
|
|
|
The technique used for indexing is substantially derived from that |
568
|
|
|
|
|
|
|
exposed by Tim Kientzle on Dr. Dobbs magazine. |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
=head1 BUGS |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
Many, I guess. |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=head1 AUTHOR |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
Max Muzi |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=head1 SEE ALSO |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
perl(1). |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=cut |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
# |
587
|
|
|
|
|
|
|
#-------------------- the following code is only used when indexing ---------------- |
588
|
|
|
|
|
|
|
# |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
sub dump_word_stats { |
591
|
|
|
|
|
|
|
my $self = shift; |
592
|
|
|
|
|
|
|
my $fh = shift || \*STDOUT; |
593
|
|
|
|
|
|
|
my $indexdbpath= $self->{indexdbpath}; |
594
|
|
|
|
|
|
|
my %indexdb; |
595
|
|
|
|
|
|
|
die unless (-f $indexdbpath && -r _); |
596
|
|
|
|
|
|
|
tie_hash(\%indexdb,$indexdbpath, O_RDONLY ); |
597
|
|
|
|
|
|
|
my %index = ( %indexdb ); |
598
|
|
|
|
|
|
|
my $w; |
599
|
|
|
|
|
|
|
for $w( sort { length($index{$b}) <=> length($index{$a}) } |
600
|
|
|
|
|
|
|
keys %index ) { |
601
|
|
|
|
|
|
|
print $fh $w, "\t", length($index{$w}) / 2, "\n"; |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
untie_hash(\%indexdb); |
604
|
|
|
|
|
|
|
} |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
sub start_indexing_session |
608
|
|
|
|
|
|
|
{ |
609
|
|
|
|
|
|
|
my $self = shift; |
610
|
|
|
|
|
|
|
$self->rollback_indexing_session; |
611
|
|
|
|
|
|
|
my $key = 0; |
612
|
|
|
|
|
|
|
my $indexdbpath = $self->{indexdbpath}; |
613
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
614
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
my $filemask = $self->{filemask}; |
617
|
|
|
|
|
|
|
my $keyref = \$key; |
618
|
|
|
|
|
|
|
my $filelistfile = $indexdbpath; |
619
|
|
|
|
|
|
|
$filelistfile =~ s/(\.db)?$/\.filelist/; |
620
|
|
|
|
|
|
|
open FILELIST, ">".$filelistfile; |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
my $session = { |
623
|
|
|
|
|
|
|
indexdbpath => $indexdbpath, |
624
|
|
|
|
|
|
|
locationsdbpath => $locationsdbpath, |
625
|
|
|
|
|
|
|
titlesdbpath => $titlesdbpath, |
626
|
|
|
|
|
|
|
indexdb => { }, |
627
|
|
|
|
|
|
|
locationsdb => { }, |
628
|
|
|
|
|
|
|
titlesdb => { }, |
629
|
|
|
|
|
|
|
cachedb => { }, |
630
|
|
|
|
|
|
|
filemask => $filemask, |
631
|
|
|
|
|
|
|
current_key => 16, # first 16 values are reserved (0 = word is ignored) |
632
|
|
|
|
|
|
|
bytes => 0, |
633
|
|
|
|
|
|
|
count => 0, |
634
|
|
|
|
|
|
|
filecount => 0, |
635
|
|
|
|
|
|
|
listfh => \*FILELIST, |
636
|
|
|
|
|
|
|
status_THE => 0, |
637
|
|
|
|
|
|
|
followsymlinks => $self->{followsymlinks}, |
638
|
|
|
|
|
|
|
minwordsize => $self->{minwordsize}, |
639
|
|
|
|
|
|
|
ignoreword => {}, |
640
|
|
|
|
|
|
|
autoignore => 1, |
641
|
|
|
|
|
|
|
ignorelimit => $self->{ignorelimit} || (2/3), |
642
|
|
|
|
|
|
|
level => $self->{level}, |
643
|
|
|
|
|
|
|
url_exclude => $self->{url_exclude}, |
644
|
|
|
|
|
|
|
file_reader => $self->{file_reader}, |
645
|
|
|
|
|
|
|
use_inode => $self->{use_inode}, |
646
|
|
|
|
|
|
|
no_reset => $self->{no_reset}, |
647
|
|
|
|
|
|
|
}; |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
unlink $indexdbpath."~"; |
650
|
|
|
|
|
|
|
unlink $locationsdbpath."~"; |
651
|
|
|
|
|
|
|
unlink $titlesdbpath."~"; |
652
|
|
|
|
|
|
|
if( $self->{no_reset} ) |
653
|
|
|
|
|
|
|
{ |
654
|
|
|
|
|
|
|
copy( $indexdbpath, $indexdbpath."~" ); |
655
|
|
|
|
|
|
|
copy( $locationsdbpath, $locationsdbpath."~" ); |
656
|
|
|
|
|
|
|
copy( $titlesdbpath, $titlesdbpath."~" ); |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
tie_hash($session->{indexdb}, $indexdbpath."~" ) or die "$indexdbpath: $!\n"; |
659
|
|
|
|
|
|
|
tie_hash($session->{locationsdb}, $locationsdbpath."~" ) or die $!; |
660
|
|
|
|
|
|
|
tie_hash($session->{titlesdb},$titlesdbpath."~" ) or die $!; |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
my $ignorefile = $indexdbpath; |
663
|
|
|
|
|
|
|
$ignorefile =~ s/(\.db)?$/\.stopwords/; |
664
|
|
|
|
|
|
|
if (-r $ignorefile) { # read *-stopwords.dat file |
665
|
|
|
|
|
|
|
open F, $ignorefile; |
666
|
|
|
|
|
|
|
while () { |
667
|
|
|
|
|
|
|
chomp; |
668
|
|
|
|
|
|
|
s/^\s+|\s+$//g; |
669
|
|
|
|
|
|
|
$session->{ignoreword}->{$_} = 1; |
670
|
|
|
|
|
|
|
} |
671
|
|
|
|
|
|
|
close F; |
672
|
|
|
|
|
|
|
my $count = int keys %{ $session->{ignoreword} }; |
673
|
|
|
|
|
|
|
DEBUG("using stop-words from $ignorefile ($count words)\n"); |
674
|
|
|
|
|
|
|
$session->{autoignore} = 0; |
675
|
|
|
|
|
|
|
} |
676
|
|
|
|
|
|
|
$session->{ignorefile} = $ignorefile; |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
my $time = time(); |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
$session->{start_time} = $time; |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
$self->{session} = $session; |
683
|
|
|
|
|
|
|
} |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
sub index_default_locations |
686
|
|
|
|
|
|
|
{ |
687
|
|
|
|
|
|
|
my $self = shift; |
688
|
|
|
|
|
|
|
my $session = $self->{session}; |
689
|
|
|
|
|
|
|
return unless $session; |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
my $dirs = $self->{dirs}; |
692
|
|
|
|
|
|
|
my $urls = $self->{urls}; |
693
|
|
|
|
|
|
|
my $filecount = 0; |
694
|
|
|
|
|
|
|
DEBUG("Counting files...\n") if int @$dirs; |
695
|
|
|
|
|
|
|
my $dir; |
696
|
|
|
|
|
|
|
for $dir( sort @$dirs) { $filecount += IndexDir($session, $dir, 1); } |
697
|
|
|
|
|
|
|
$session->{filecount} = $filecount; |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
for $dir( sort @$dirs) { IndexDir($session, $dir); } |
700
|
|
|
|
|
|
|
for my $url( sort @$urls) { IndexWeb($session, $url); } |
701
|
|
|
|
|
|
|
} |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
sub index_content |
704
|
|
|
|
|
|
|
{ |
705
|
|
|
|
|
|
|
my $self = shift; |
706
|
|
|
|
|
|
|
my $session = $self->{session}; |
707
|
|
|
|
|
|
|
return unless $session; |
708
|
|
|
|
|
|
|
my $info = shift; |
709
|
|
|
|
|
|
|
if( ref($info) ne 'HASH' ) |
710
|
|
|
|
|
|
|
{ warn("usage: \$src->index_content( { content=>'...', id=>'...', title=>'...' } )\n"); |
711
|
|
|
|
|
|
|
return undef; |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
IndexFile( $session, $info->{id}, $info->{content}, $info->{title} ); |
714
|
|
|
|
|
|
|
return 1; |
715
|
|
|
|
|
|
|
} |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
sub rollback_indexing_session |
718
|
|
|
|
|
|
|
{ |
719
|
|
|
|
|
|
|
my $self = shift; |
720
|
|
|
|
|
|
|
my $session = $self->{session}; |
721
|
|
|
|
|
|
|
return unless $session; |
722
|
|
|
|
|
|
|
untie_hash($session->{indexdb}); |
723
|
|
|
|
|
|
|
untie_hash($session->{locationsdb}); |
724
|
|
|
|
|
|
|
untie_hash($session->{titlesdb}); |
725
|
|
|
|
|
|
|
my $indexdbpath = $self->{indexdbpath}; |
726
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
727
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
unlink $indexdbpath."~"; |
730
|
|
|
|
|
|
|
unlink $locationsdbpath."~"; |
731
|
|
|
|
|
|
|
unlink $titlesdbpath."~"; |
732
|
|
|
|
|
|
|
$self->{session} = undef; |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
sub DESTROY |
736
|
|
|
|
|
|
|
{ |
737
|
|
|
|
|
|
|
my $self = shift; |
738
|
|
|
|
|
|
|
$self->rollback_indexing_session; |
739
|
|
|
|
|
|
|
} |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
sub commit_indexing_session |
742
|
|
|
|
|
|
|
{ |
743
|
|
|
|
|
|
|
my $self = shift; |
744
|
|
|
|
|
|
|
my $session = $self->{session}; |
745
|
|
|
|
|
|
|
return unless $session; |
746
|
|
|
|
|
|
|
FlushCache($session->{cachedb}, $session->{indexdb}, $session); |
747
|
|
|
|
|
|
|
my $time = time()-$session->{start_time}; |
748
|
|
|
|
|
|
|
DEBUG("$session->{bytes} bytes read, $session->{count} files processed in $time seconds\n"); |
749
|
|
|
|
|
|
|
untie_hash($session->{indexdb}); |
750
|
|
|
|
|
|
|
untie_hash($session->{locationsdb}); |
751
|
|
|
|
|
|
|
untie_hash($session->{titlesdb}); |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
my $indexdbpath = $self->{indexdbpath}; |
754
|
|
|
|
|
|
|
my $locationsdbpath = $self->{locationsdbpath}; |
755
|
|
|
|
|
|
|
my $titlesdbpath = $self->{titlesdbpath}; |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
rename $indexdbpath."~", $indexdbpath; |
758
|
|
|
|
|
|
|
rename $locationsdbpath."~", $locationsdbpath ; |
759
|
|
|
|
|
|
|
rename $titlesdbpath."~", $titlesdbpath; |
760
|
|
|
|
|
|
|
close $session->{listfh}; |
761
|
|
|
|
|
|
|
if ( $session->{autoignore} ) { |
762
|
|
|
|
|
|
|
my $ignorefile = $session->{ignorefile}; |
763
|
|
|
|
|
|
|
open F, ">".$ignorefile; #write *-stopwords.dat file |
764
|
|
|
|
|
|
|
print F join( "\n", sort keys %{ $session->{ignoreword} } ); |
765
|
|
|
|
|
|
|
close F; |
766
|
|
|
|
|
|
|
} |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
$self->{session} = undef; |
769
|
|
|
|
|
|
|
} |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
sub makeindex |
774
|
|
|
|
|
|
|
{ |
775
|
|
|
|
|
|
|
my $self = shift; |
776
|
|
|
|
|
|
|
$self->start_indexing_session(); |
777
|
|
|
|
|
|
|
$self->index_default_locations(); |
778
|
|
|
|
|
|
|
$self->commit_indexing_session(); |
779
|
|
|
|
|
|
|
} |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
sub IndexDir { |
783
|
|
|
|
|
|
|
my ($session, $dir, $only_recurse) = @_; |
784
|
|
|
|
|
|
|
my $followsymlinks = $session->{followsymlinks}; |
785
|
|
|
|
|
|
|
my $file_reader = $session->{file_reader}; |
786
|
|
|
|
|
|
|
opendir D, $dir; |
787
|
|
|
|
|
|
|
# DEBUG "D $dir\n"; |
788
|
|
|
|
|
|
|
my @files = readdir D; |
789
|
|
|
|
|
|
|
close D; |
790
|
|
|
|
|
|
|
my $e; |
791
|
|
|
|
|
|
|
my $count = 0; |
792
|
|
|
|
|
|
|
my $text; |
793
|
|
|
|
|
|
|
for $e(@files) { |
794
|
|
|
|
|
|
|
next if $e =~ /^\.\.?/; |
795
|
|
|
|
|
|
|
my $path = $dir."/".$e; |
796
|
|
|
|
|
|
|
if (-d $path) { |
797
|
|
|
|
|
|
|
unless ($followsymlinks) { |
798
|
|
|
|
|
|
|
next if -l $path ; |
799
|
|
|
|
|
|
|
} |
800
|
|
|
|
|
|
|
$count += IndexDir($session,$path, $only_recurse); |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
elsif (-f _ ) { |
803
|
|
|
|
|
|
|
my $filemask = $session->{filemask}; |
804
|
|
|
|
|
|
|
if ($filemask) { |
805
|
|
|
|
|
|
|
next unless $e =~ $filemask; |
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
unless ($only_recurse) |
808
|
|
|
|
|
|
|
{ |
809
|
|
|
|
|
|
|
if( $file_reader ) |
810
|
|
|
|
|
|
|
{ |
811
|
|
|
|
|
|
|
$text = $file_reader->read( $path ); |
812
|
|
|
|
|
|
|
IndexFile($session,$path,$text); |
813
|
|
|
|
|
|
|
} else |
814
|
|
|
|
|
|
|
{ |
815
|
|
|
|
|
|
|
IndexFile($session,$path); |
816
|
|
|
|
|
|
|
} |
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
$count ++; |
819
|
|
|
|
|
|
|
} |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
return $count; |
822
|
|
|
|
|
|
|
} |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
sub IndexFile { |
827
|
|
|
|
|
|
|
my ($session, $file, $text, $title ) = @_; |
828
|
|
|
|
|
|
|
my $cachedb = $session->{cachedb}; |
829
|
|
|
|
|
|
|
my $locationsdb = $session->{locationsdb}; |
830
|
|
|
|
|
|
|
my $key = $session->{current_key}; |
831
|
|
|
|
|
|
|
if( $session->{use_inode} ) |
832
|
|
|
|
|
|
|
{ |
833
|
|
|
|
|
|
|
$key = (stat($file))[1]; |
834
|
|
|
|
|
|
|
} |
835
|
|
|
|
|
|
|
my $no_of_files = $session->{filecount}; |
836
|
|
|
|
|
|
|
if( $session->{no_reset} ) |
837
|
|
|
|
|
|
|
{ |
838
|
|
|
|
|
|
|
if( exists $locationsdb->{pack"xN",$key} ) |
839
|
|
|
|
|
|
|
{ |
840
|
|
|
|
|
|
|
warn("key $key already in locationsdb. Skipping\n"); |
841
|
|
|
|
|
|
|
return; |
842
|
|
|
|
|
|
|
} |
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
DEBUG $session->{count}+1, "/$no_of_files $file (id=$key)\n"; |
845
|
|
|
|
|
|
|
my $fh = $session->{listfh}; |
846
|
|
|
|
|
|
|
print $fh "$key\t$file\n"; |
847
|
|
|
|
|
|
|
local $/; |
848
|
|
|
|
|
|
|
unless (defined $text) { |
849
|
|
|
|
|
|
|
undef $/; |
850
|
|
|
|
|
|
|
open(FILE, $file); |
851
|
|
|
|
|
|
|
($text) = ; # Read entire file |
852
|
|
|
|
|
|
|
close FILE; |
853
|
|
|
|
|
|
|
} |
854
|
|
|
|
|
|
|
my $filesize = length($text); |
855
|
|
|
|
|
|
|
if ($file =~ /\.s?htm.?/i ) { |
856
|
|
|
|
|
|
|
$text =~ /]*>([^<]+)<\/title/i ; |
857
|
|
|
|
|
|
|
$title = $1; |
858
|
|
|
|
|
|
|
$title =~ s/\s+/ /g; |
859
|
|
|
|
|
|
|
$text =~ s/<[^>]*>//g; # strip all HTML tags |
860
|
|
|
|
|
|
|
} |
861
|
|
|
|
|
|
|
if( defined $title ) |
862
|
|
|
|
|
|
|
{ |
863
|
|
|
|
|
|
|
$session->{titlesdb}->{pack"xN",$key} = $title; # put title in db |
864
|
|
|
|
|
|
|
DEBUG("* \"$title\"\n"); |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
# index all the words under the current file-id |
867
|
|
|
|
|
|
|
my($wordsIndexed) = &IndexWords($cachedb, $text,$key, $session); |
868
|
|
|
|
|
|
|
$session->{current_key}++; |
869
|
|
|
|
|
|
|
DEBUG "* $wordsIndexed words\n"; |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# map file-id (key) to this filename |
872
|
|
|
|
|
|
|
$locationsdb->{pack"xN",$key} = $file; # leading null is here for |
873
|
|
|
|
|
|
|
# historical reasons :-) |
874
|
|
|
|
|
|
|
$session->{bytes} += $filesize; |
875
|
|
|
|
|
|
|
$session->{count}++; |
876
|
|
|
|
|
|
|
$session->{_temp_size} += $filesize; |
877
|
|
|
|
|
|
|
if ($session->{_temp_size} > 2000000 ) { |
878
|
|
|
|
|
|
|
my $rc = 0; |
879
|
|
|
|
|
|
|
$rc = FlushCache($cachedb, $session->{indexdb}, $session); |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
if (! $rc ) { |
882
|
|
|
|
|
|
|
tie_hash($session->{indexdb}, $session->{indexdbpath}) or die $!; |
883
|
|
|
|
|
|
|
untie_hash($session->{indexdb}); |
884
|
|
|
|
|
|
|
$rc = FlushCache($cachedb, $session->{indexdb}, $session); |
885
|
|
|
|
|
|
|
die $! if not $rc; |
886
|
|
|
|
|
|
|
} |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
$session->{_temp_size} = 0; |
889
|
|
|
|
|
|
|
$session->{cachedb} = {}; |
890
|
|
|
|
|
|
|
} |
891
|
|
|
|
|
|
|
} |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
sub IndexWords { |
894
|
|
|
|
|
|
|
my ($db, $words, $fileKey, $session) = @_; |
895
|
|
|
|
|
|
|
# hash content file-id options |
896
|
|
|
|
|
|
|
my (%worduniq); # for unique-ifying word list |
897
|
|
|
|
|
|
|
my $minwordsize = $session->{minwordsize}; |
898
|
|
|
|
|
|
|
my (@words) = split( /[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words); # split into an array of words |
899
|
|
|
|
|
|
|
@words = grep { $worduniq{$_}++ == 0 } # remove duplicates |
900
|
|
|
|
|
|
|
grep { length > $minwordsize } # must be longer than one character |
901
|
|
|
|
|
|
|
grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # strip leading punct |
902
|
|
|
|
|
|
|
grep { /[a-zA-Z0-9\xc0-\xff]/ } # must have an alphanumeric |
903
|
|
|
|
|
|
|
@words; |
904
|
|
|
|
|
|
|
# " foreach (sort @words) { " |
905
|
|
|
|
|
|
|
for (@words) { # no need to sort here, |
906
|
|
|
|
|
|
|
my $a = $db->{$_}; # we will sort when cache is flushed |
907
|
|
|
|
|
|
|
$a .= pack "N",$fileKey; # appending packed file-id's |
908
|
|
|
|
|
|
|
$db->{$_} = $a; |
909
|
|
|
|
|
|
|
} |
910
|
|
|
|
|
|
|
return int @words; |
911
|
|
|
|
|
|
|
} |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
sub FlushCache { |
916
|
|
|
|
|
|
|
my ($source, $dest, $session) = @_; |
917
|
|
|
|
|
|
|
# flush source hashe into dest.... |
918
|
|
|
|
|
|
|
# %$dest is supposed to be tied, otherwise the whole |
919
|
|
|
|
|
|
|
# thing doens't make much sense... :-) |
920
|
|
|
|
|
|
|
my $scount = int keys %$source ; |
921
|
|
|
|
|
|
|
my $ucount = 0; |
922
|
|
|
|
|
|
|
my $acount = 0; |
923
|
|
|
|
|
|
|
if ($scount == 0) { |
924
|
|
|
|
|
|
|
die "error: 0 words in cache\n"; |
925
|
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
|
# my $wordcount = int keys %$dest; |
927
|
|
|
|
|
|
|
# if ($wordcount < $session->{wordcount}) { |
928
|
|
|
|
|
|
|
# warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; |
929
|
|
|
|
|
|
|
# return undef; |
930
|
|
|
|
|
|
|
# } |
931
|
|
|
|
|
|
|
# $session->{wordcount} = $wordcount; |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
# DEBUG("$wordcount words in database\n"); |
934
|
|
|
|
|
|
|
my $objref = tied %$dest ; |
935
|
|
|
|
|
|
|
DEBUG("flushing $scount words into $dest ($objref)\n"); |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
my $filecount = $session->{count}; |
938
|
|
|
|
|
|
|
my $autoignore = $session->{autoignore}; |
939
|
|
|
|
|
|
|
my $ignorethreshold = int ( $filecount * $session->{ignorelimit} ); |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
my $w; |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
WORD: |
944
|
|
|
|
|
|
|
for $w(sort keys %$source) { |
945
|
|
|
|
|
|
|
my $data = $source->{$w}; |
946
|
|
|
|
|
|
|
if ($session->{ignoreword}->{$w} ) { |
947
|
|
|
|
|
|
|
DEBUG("ignoring '$w' \n"); |
948
|
|
|
|
|
|
|
$data = pack("N*", ( 0 ) ); # id = 0 means $w is a stop-word |
949
|
|
|
|
|
|
|
} |
950
|
|
|
|
|
|
|
elsif (defined $dest->{$w}) { |
951
|
|
|
|
|
|
|
my %uniq = (); |
952
|
|
|
|
|
|
|
my $keys = $dest->{$w} . $data ; |
953
|
|
|
|
|
|
|
my $keycount = length($keys)/2; # dividing by 2 |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
$ucount++; |
956
|
|
|
|
|
|
|
## my @keys = unpack("n*", $keys) ; |
957
|
|
|
|
|
|
|
## my $keycount = @keys; |
958
|
|
|
|
|
|
|
## |
959
|
|
|
|
|
|
|
## if ($keys[0] == 0 ) { # skip ignored word |
960
|
|
|
|
|
|
|
## DEBUG("skipping '$w' \n"); |
961
|
|
|
|
|
|
|
## next WORD; |
962
|
|
|
|
|
|
|
## } els |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
if ($autoignore && ($filecount > 100) |
965
|
|
|
|
|
|
|
&& ($keycount > $ignorethreshold ) ) { |
966
|
|
|
|
|
|
|
DEBUG("word '$w' will be ignored (found in $keycount of $filecount files)\n"); |
967
|
|
|
|
|
|
|
# ignored words are associated to file-id 0 |
968
|
|
|
|
|
|
|
## @keys = ( 0 ); |
969
|
|
|
|
|
|
|
$keys = pack("N*", 0); |
970
|
|
|
|
|
|
|
$session->{ignoreword}->{$w} = 1; |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
## @keys = grep { $uniq{$_}++ == 0} @keys; |
973
|
|
|
|
|
|
|
## $data = pack("n*", @keys); |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
$data = $keys; |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
## if ($verbose_flag && ( $w eq "the" ) ) { |
978
|
|
|
|
|
|
|
## my $len = int(@keys); |
979
|
|
|
|
|
|
|
## if ($len < $session->{status_THE} ) { |
980
|
|
|
|
|
|
|
## die "panic: problem with word 'the'"; |
981
|
|
|
|
|
|
|
## } |
982
|
|
|
|
|
|
|
## $session->{status_THE} = $len; |
983
|
|
|
|
|
|
|
## DEBUG("word 'the' found in $len files \n"); |
984
|
|
|
|
|
|
|
## } |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
} else { |
987
|
|
|
|
|
|
|
$acount++; |
988
|
|
|
|
|
|
|
} |
989
|
|
|
|
|
|
|
$dest->{$w} = $data; |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
# if ($dest->{$w} ne $data) { |
992
|
|
|
|
|
|
|
# warn "unexpected error: \$w=$w\n"; |
993
|
|
|
|
|
|
|
# return undef; |
994
|
|
|
|
|
|
|
# } |
995
|
|
|
|
|
|
|
} |
996
|
|
|
|
|
|
|
DEBUG("$ucount words updated, $acount new words added\n"); |
997
|
|
|
|
|
|
|
if ($debug_flag) { |
998
|
|
|
|
|
|
|
my $wordcount = int keys %$dest; |
999
|
|
|
|
|
|
|
if ($wordcount < $session->{wordcount}) { |
1000
|
|
|
|
|
|
|
warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; |
1001
|
|
|
|
|
|
|
return undef; |
1002
|
|
|
|
|
|
|
} |
1003
|
|
|
|
|
|
|
$session->{wordcount} = $wordcount; |
1004
|
|
|
|
|
|
|
DEBUG("$wordcount words in database\n"); |
1005
|
|
|
|
|
|
|
} |
1006
|
|
|
|
|
|
|
return 1; |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
|
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
sub IndexWeb { |
1014
|
|
|
|
|
|
|
my ($session, $url) = @_; |
1015
|
|
|
|
|
|
|
require MMM::Text::Search::Inet; |
1016
|
|
|
|
|
|
|
my $req = new HTTPRequest { AutoRedirect => 1 }; |
1017
|
|
|
|
|
|
|
my %fetched = (); |
1018
|
|
|
|
|
|
|
$req->set_url($url); |
1019
|
|
|
|
|
|
|
my $host = $req->host(); |
1020
|
|
|
|
|
|
|
$session->{req} = $req; |
1021
|
|
|
|
|
|
|
$session->{fetched} = \%fetched; |
1022
|
|
|
|
|
|
|
$session->{host} = $host; |
1023
|
|
|
|
|
|
|
my $deadlinksfile = $session->{indexdbpath}; |
1024
|
|
|
|
|
|
|
$deadlinksfile =~ s/(\.db)?$/\.deadlinks/; |
1025
|
|
|
|
|
|
|
open DL, ">".$deadlinksfile; |
1026
|
|
|
|
|
|
|
$session->{deadlinksfh} = \*DL; |
1027
|
|
|
|
|
|
|
recursive_fetch($session, $url, "", 0); |
1028
|
|
|
|
|
|
|
} |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
sub recursive_fetch { |
1033
|
|
|
|
|
|
|
my ($session, $URL, $parent, $level) = @_; |
1034
|
|
|
|
|
|
|
my $req = $session->{req}; |
1035
|
|
|
|
|
|
|
$req->reset(); |
1036
|
|
|
|
|
|
|
$req->set_url($URL); |
1037
|
|
|
|
|
|
|
my $url = $req->url(); |
1038
|
|
|
|
|
|
|
return unless $req->host() eq $session->{host}; |
1039
|
|
|
|
|
|
|
return if $session->{fetched}->{$url}; |
1040
|
|
|
|
|
|
|
$session->{fetched}->{$url} = 1; |
1041
|
|
|
|
|
|
|
return unless $req->get_page(); |
1042
|
|
|
|
|
|
|
my $status = $req->status(); |
1043
|
|
|
|
|
|
|
DEBUG( ">>> $url ($status)\n"); |
1044
|
|
|
|
|
|
|
if ( $status != 200 ) { |
1045
|
|
|
|
|
|
|
my $fh = $session->{deadlinksfh}; |
1046
|
|
|
|
|
|
|
my $url = $req->url(); |
1047
|
|
|
|
|
|
|
print $fh $status, "\t", |
1048
|
|
|
|
|
|
|
$url, "(", $req->{_URL},")", |
1049
|
|
|
|
|
|
|
"\t", $parent, "\n"; |
1050
|
|
|
|
|
|
|
return; |
1051
|
|
|
|
|
|
|
}; |
1052
|
|
|
|
|
|
|
my $base = $req->base_url(); |
1053
|
|
|
|
|
|
|
my $content_ref = $req->content_ref(); |
1054
|
|
|
|
|
|
|
my $header = $req->header(); |
1055
|
|
|
|
|
|
|
IndexFile($session, $url, $$content_ref); |
1056
|
|
|
|
|
|
|
return if ($session->{level} && $level >= $session->{level}); |
1057
|
|
|
|
|
|
|
$$content_ref =~ s///gs; #remove comments |
1058
|
|
|
|
|
|
|
my @links = $$content_ref =~/href=([^>\s]+)/ig; #extract hyperlinks |
1059
|
|
|
|
|
|
|
my $count = 0; |
1060
|
|
|
|
|
|
|
my $exclude_re = $session->{url_exclude}; |
1061
|
|
|
|
|
|
|
for(@links) { |
1062
|
|
|
|
|
|
|
s/\"|\'//g; |
1063
|
|
|
|
|
|
|
next if m/^(ftp|mailto|gopher|news):/; |
1064
|
|
|
|
|
|
|
next if m/^$exclude_re$/o; |
1065
|
|
|
|
|
|
|
my $link = /^http/ ? $_ : join("/",$base,$_); |
1066
|
|
|
|
|
|
|
$link =~ s/#.*//; |
1067
|
|
|
|
|
|
|
$count++; |
1068
|
|
|
|
|
|
|
recursive_fetch($session,$link, $url, $level + 1); |
1069
|
|
|
|
|
|
|
} |
1070
|
|
|
|
|
|
|
} |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
1; |
1074
|
|
|
|
|
|
|
__END__ |