| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  |  | 
| 2 |  |  |  |  |  |  | package MMM::Text::Search; | 
| 3 | 1 |  |  | 1 |  | 1429 | use File::Copy; | 
|  | 1 |  |  |  |  | 9474 |  | 
|  | 1 |  |  |  |  | 67 |  | 
| 4 |  |  |  |  |  |  |  | 
| 5 |  |  |  |  |  |  | #$Id: Search.pm,v 1.50 2004/12/13 18:45:15 maxim Exp $ | 
| 6 | 1 |  |  | 1 |  | 7 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 31 |  | 
| 7 | 1 |  |  | 1 |  | 5 | use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $verbose_flag  ); | 
|  | 1 |  |  |  |  | 5 |  | 
|  | 1 |  |  |  |  | 126 |  | 
| 8 |  |  |  |  |  |  | require Exporter; | 
| 9 |  |  |  |  |  |  | require AutoLoader; | 
| 10 |  |  |  |  |  |  | @ISA = qw(Exporter AutoLoader); | 
| 11 |  |  |  |  |  |  | @EXPORT = qw( | 
| 12 |  |  |  |  |  |  | ); | 
| 13 |  |  |  |  |  |  | $VERSION = '0.07'; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | # | 
| 16 |  |  |  |  |  |  | # Perl module for indexing and searching text files and web pages. | 
| 17 |  |  |  |  |  |  | # 		(Max Muzi, Apr-Sep 1999) | 
| 18 |  |  |  |  |  |  | # | 
| 19 |  |  |  |  |  |  | # | 
| 20 |  |  |  |  |  |  | # Note on implementation: | 
| 21 |  |  |  |  |  |  | # The technique used for indexing is substantially derived from that | 
| 22 |  |  |  |  |  |  | # exposed by Tim Kientzle on Dr. Dobbs magazine. (Actually IndexWords() | 
| 23 |  |  |  |  |  |  | # has been cut'n'pasted from his scripts.) | 
| 24 |  |  |  |  |  |  | # | 
| 25 |  |  |  |  |  |  | # | 
| 26 |  |  |  |  |  |  |  | 
| 27 | 1 |  |  | 1 |  | 1551 | use DB_File; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | use Fcntl; | 
| 29 |  |  |  |  |  |  | require 5.005; | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | $verbose_flag = 0; | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | my $debug_flag = 0; | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | my $errstr = undef; | 
| 36 |  |  |  |  |  |  | my $syntax_error = undef; | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | sub errstr { $errstr }; | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | sub new { 			 # constructor!  (see the docs for usage [sorry, there're no docs ]) | 
| 41 |  |  |  |  |  |  | my $pkg = shift; | 
| 42 |  |  |  |  |  |  | my $arg = shift; | 
| 43 |  |  |  |  |  |  | my $opt = undef; | 
| 44 |  |  |  |  |  |  | if (ref($arg) ne "HASH") { | 
| 45 |  |  |  |  |  |  | if (-f $arg) { | 
| 46 |  |  |  |  |  |  | $opt->{IndexDB} = $arg; | 
| 47 |  |  |  |  |  |  | $opt->{Verbose} = shift; | 
| 48 |  |  |  |  |  |  | } | 
| 49 |  |  |  |  |  |  | else { | 
| 50 |  |  |  |  |  |  | die "usage:   \$obj = new MMM::Text::Search ( '/index/path' or \$hashref)\n" | 
| 51 |  |  |  |  |  |  | } | 
| 52 |  |  |  |  |  |  | } else { | 
| 53 |  |  |  |  |  |  | $opt = $arg; | 
| 54 |  |  |  |  |  |  | }; | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | $verbose_flag = $opt->{Debug} || $opt->{Verbose} ; | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | my $indexdbpath = $opt->{IndexDB} || $opt->{IndexPath} ; | 
| 59 |  |  |  |  |  |  | my $filemask 	= $opt->{FileMask} ; | 
| 60 |  |  |  |  |  |  | my $dirs 	= ( ref($opt->{Dirs}) eq "ARRAY" ) ? $opt->{Dirs} : [ ]; | 
| 61 |  |  |  |  |  |  | my $followsymlinks = defined $opt->{FollowSymLinks}; | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | my $opturls =  $opt->{Urls} ||  $opt->{URLs}; | 
| 64 |  |  |  |  |  |  | my $urls 	= ( ref($opturls) eq "ARRAY" ) ? $opturls : [ ]; | 
| 65 |  |  |  |  |  |  | my $level	= int $opt->{Level}; | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | my $locationsdbpath = $indexdbpath; | 
| 68 |  |  |  |  |  |  | $locationsdbpath =~ s/(\.db)*$/\-locations.db/; | 
| 69 |  |  |  |  |  |  | my $titlesdbpath = $indexdbpath; | 
| 70 |  |  |  |  |  |  | $titlesdbpath =~ s/(\.db)*$/\-titles.db/; | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | my $minwordsize = $opt->{MinWordSize} || 1; | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | my $self = { | 
| 76 |  |  |  |  |  |  | indexdbpath 	=> $indexdbpath, | 
| 77 |  |  |  |  |  |  | locationsdbpath 	=> $locationsdbpath, | 
| 78 |  |  |  |  |  |  | titlesdbpath	=> $titlesdbpath, | 
| 79 |  |  |  |  |  |  | filemask 	=> length($filemask) ? qr/$filemask/ : undef, | 
| 80 |  |  |  |  |  |  | dirs 		=> $dirs, | 
| 81 |  |  |  |  |  |  | followsymlinks  => $followsymlinks, | 
| 82 |  |  |  |  |  |  | minwordsize	=> $minwordsize, | 
| 83 |  |  |  |  |  |  | ignorelimit	=> $opt->{IgnoreLimit} || (2/3), | 
| 84 |  |  |  |  |  |  | urls		=> $urls, | 
| 85 |  |  |  |  |  |  | level		=> $level, | 
| 86 |  |  |  |  |  |  | url_exclude	=> $opt->{UrlExludeMask} || "(?i).*\.(zip|exe|gz|arj|bin|hqx)", | 
| 87 |  |  |  |  |  |  | file_reader     => $opt->{FileReader}, | 
| 88 |  |  |  |  |  |  | use_inode       => $opt->{UseInodeAsKey}, | 
| 89 |  |  |  |  |  |  | no_reset        => $opt->{UseInodeAsKey} && $opt->{NoReset} | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | }; | 
| 92 |  |  |  |  |  |  | DEBUG("filemask=$filemask, indexfile=$indexdbpath, ignorelimit=$self->{ignorelimit}\n"); | 
| 93 |  |  |  |  |  |  | DEBUG("dirs = [", join(",", @$dirs),"], "); | 
| 94 |  |  |  |  |  |  | DEBUG("urls = [", join(",", @$urls),"] \n"); | 
| 95 |  |  |  |  |  |  | bless($self, $pkg); | 
| 96 |  |  |  |  |  |  | return $self; | 
| 97 |  |  |  |  |  |  | } | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | sub _add_keys_to_match_hash { | 
| 100 |  |  |  |  |  |  | # extract file-codes from $keys and update corresponding $hash elements (score) | 
| 101 |  |  |  |  |  |  | my ($keys, $hash) = @_; | 
| 102 |  |  |  |  |  |  | my $key; | 
| 103 |  |  |  |  |  |  | foreach $key ( unpack("N*",$keys) ) { | 
| 104 |  |  |  |  |  |  | #		DEBUG($key, " "); | 
| 105 |  |  |  |  |  |  | # ignored words (stop-words) only include file-id 0 (see FlushCache() below) | 
| 106 |  |  |  |  |  |  | return 0 if  $key == 0 ; | 
| 107 |  |  |  |  |  |  | $hash->{$key}++ | 
| 108 |  |  |  |  |  |  | } | 
| 109 |  |  |  |  |  |  | return 1; | 
| 110 |  |  |  |  |  |  | } | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | sub _push_words_from_hash { | 
| 113 |  |  |  |  |  |  | my ($hash,$array, $regexp) = @_; | 
| 114 |  |  |  |  |  |  | my $w; | 
| 115 |  |  |  |  |  |  | for $w(keys %$hash) { | 
| 116 |  |  |  |  |  |  | push @$array,$w if $w =~ $regexp; | 
| 117 |  |  |  |  |  |  | } | 
| 118 |  |  |  |  |  |  | } | 
| 119 |  |  |  |  |  |  |  | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | #notes on advanced_query(); | 
| 123 |  |  |  |  |  |  | # - queries containing stop-words may yields bizzare results.. | 
| 124 |  |  |  |  |  |  | # - score is not always correct | 
| 125 |  |  |  |  |  |  | # - error handling should be improved... :-) | 
| 126 |  |  |  |  |  |  | sub advanced_query { | 
| 127 |  |  |  |  |  |  | # perform queries such as  "( a and ( b or c ) ) and ( d and e) " | 
| 128 |  |  |  |  |  |  | my $self = shift; | 
| 129 |  |  |  |  |  |  | my $expr = shift; | 
| 130 |  |  |  |  |  |  | my $indexdbpath= $self->{indexdbpath}; | 
| 131 |  |  |  |  |  |  | my $locationsdbpath = $self->{locationsdbpath}; | 
| 132 |  |  |  |  |  |  | my $titlesdbpath = $self->{titlesdbpath}; | 
| 133 |  |  |  |  |  |  | my %indexdb; | 
| 134 |  |  |  |  |  |  | my %locationsdb; | 
| 135 |  |  |  |  |  |  | my %titlesdb; | 
| 136 |  |  |  |  |  |  | return undef unless (-f $indexdbpath && -r _); | 
| 137 |  |  |  |  |  |  | return undef unless (-f $locationsdbpath && -r _); | 
| 138 |  |  |  |  |  |  | return undef unless (-f $titlesdbpath && -r _); | 
| 139 |  |  |  |  |  |  | return undef unless | 
| 140 |  |  |  |  |  |  | tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && | 
| 141 |  |  |  |  |  |  | tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && | 
| 142 |  |  |  |  |  |  | tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); | 
| 143 |  |  |  |  |  |  | my @ignored = (); | 
| 144 |  |  |  |  |  |  | my @words = (); | 
| 145 |  |  |  |  |  |  | my $verbose_flag_tmp = $verbose_flag; | 
| 146 |  |  |  |  |  |  | $verbose_flag = shift; # undocumented debug switch | 
| 147 |  |  |  |  |  |  | chomp $expr; | 
| 148 |  |  |  |  |  |  | undef $syntax_error; #reset error | 
| 149 |  |  |  |  |  |  | DEBUG("********** _match_expression() debug **********\n"); | 
| 150 |  |  |  |  |  |  | my $match = _match_expression($expr, \%indexdb, \@ignored); | 
| 151 |  |  |  |  |  |  | DEBUG("**********         end debug         **********\n"); | 
| 152 |  |  |  |  |  |  | if ($syntax_error) { | 
| 153 |  |  |  |  |  |  | $errstr = $syntax_error; | 
| 154 |  |  |  |  |  |  | $verbose_flag = $verbose_flag_tmp; | 
| 155 |  |  |  |  |  |  | return undef; | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  | my $result =  _make_result_hash($match,\%locationsdb, \%titlesdb, \@words, \@ignored); | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | untie(%indexdb); | 
| 160 |  |  |  |  |  |  | untie(%locationsdb); | 
| 161 |  |  |  |  |  |  | untie(%titlesdb); | 
| 162 |  |  |  |  |  |  | $verbose_flag = $verbose_flag_tmp; | 
| 163 |  |  |  |  |  |  | return $result; | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | sub _match_expression { 	 # recursively apply a keyword-search expression to indexdb | 
| 167 |  |  |  |  |  |  | # $expr may be either a string or a ref to an array of tokens | 
| 168 |  |  |  |  |  |  | # a ref to a "score" hash is returned (or undef sometimes) | 
| 169 |  |  |  |  |  |  | my ($expr, $index, $ignored) = @_; | 
| 170 |  |  |  |  |  |  | my $parsed = _parse_expression($expr); | 
| 171 |  |  |  |  |  |  | # _parse_expression() returns a reference to an array of three elements: | 
| 172 |  |  |  |  |  |  | # 			[ operator, left_expr, right_expr] | 
| 173 |  |  |  |  |  |  | #  if right_expr is not defined then expr was atomic and left_expr is a string, | 
| 174 |  |  |  |  |  |  | #  otherwise both right_expr and left_expr are references to arrays of tokens | 
| 175 |  |  |  |  |  |  | if ( not $parsed) { | 
| 176 |  |  |  |  |  |  | DEBUG("Syntax error :-( \n"); | 
| 177 |  |  |  |  |  |  | return undef; | 
| 178 |  |  |  |  |  |  | } | 
| 179 |  |  |  |  |  |  | my ( $op, $left,$right) = @$parsed; | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | if ($left && not $right) { | 
| 182 |  |  |  |  |  |  | $left =~ s/^\s*\(?\s*|\s*\)?\s*$//g; | 
| 183 |  |  |  |  |  |  | DEBUG("Looking up >$left<\n"); | 
| 184 |  |  |  |  |  |  | my %matches = (); | 
| 185 |  |  |  |  |  |  | my $word = $left; | 
| 186 |  |  |  |  |  |  | my $rc = 0; | 
| 187 |  |  |  |  |  |  | my $keys = $index->{lc $word}; # get file-id's from indexdb | 
| 188 |  |  |  |  |  |  | $rc = _add_keys_to_match_hash($keys,\%matches); | 
| 189 |  |  |  |  |  |  | # if $rc is false then $word  is a stop-word, see _add_keys_to_match_hash() for more info | 
| 190 |  |  |  |  |  |  | if (not $rc) { | 
| 191 |  |  |  |  |  |  | DEBUG("$word ignored\n"); | 
| 192 |  |  |  |  |  |  | push @$ignored, $word; | 
| 193 |  |  |  |  |  |  | return undef; | 
| 194 |  |  |  |  |  |  | # what should we do now? gotta think it over... | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  | return \%matches; | 
| 197 |  |  |  |  |  |  | } | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | DEBUG("Evaluating >$left< --$op-- >$right<\n"); | 
| 200 |  |  |  |  |  |  | my $left_match  = _match_expression($left, $index, $ignored); | 
| 201 |  |  |  |  |  |  | my $right_match = _match_expression($right, $index, $ignored); | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | return undef if ($syntax_error); | 
| 204 |  |  |  |  |  |  | my %matches = (); | 
| 205 |  |  |  |  |  |  | my $file = undef; | 
| 206 |  |  |  |  |  |  |  | 
| 207 |  |  |  |  |  |  | if ($op eq 'AND' ) { | 
| 208 |  |  |  |  |  |  | %matches = ( %$left_match ); | 
| 209 |  |  |  |  |  |  | for $file( keys %matches) { | 
| 210 |  |  |  |  |  |  | delete $matches{$file} unless $right_match->{$file} | 
| 211 |  |  |  |  |  |  | } | 
| 212 |  |  |  |  |  |  | return \%matches; | 
| 213 |  |  |  |  |  |  | } | 
| 214 |  |  |  |  |  |  | if ($op eq 'AND NOT') { | 
| 215 |  |  |  |  |  |  | %matches = ( %$left_match ); | 
| 216 |  |  |  |  |  |  | for $file( keys %matches) { | 
| 217 |  |  |  |  |  |  | delete $matches{$file} if $right_match->{$file} | 
| 218 |  |  |  |  |  |  | } | 
| 219 |  |  |  |  |  |  | return \%matches; | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  | if ($op eq 'OR')  { | 
| 222 |  |  |  |  |  |  | %matches = (  %$left_match ); | 
| 223 |  |  |  |  |  |  | for $file( keys	%$right_match) { | 
| 224 |  |  |  |  |  |  | if ($matches{$file}) { | 
| 225 |  |  |  |  |  |  | $matches{$file} +=$right_match->{$file}; | 
| 226 |  |  |  |  |  |  | } else { | 
| 227 |  |  |  |  |  |  | $matches{$file} =$right_match->{$file}; | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  | } | 
| 230 |  |  |  |  |  |  | return \%matches; | 
| 231 |  |  |  |  |  |  | } | 
| 232 |  |  |  |  |  |  | return undef; | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | sub _parse_expression { | 
| 236 |  |  |  |  |  |  | my $arg = shift; | 
| 237 |  |  |  |  |  |  | my $tokens = undef;  # this is an arry ref | 
| 238 |  |  |  |  |  |  | if (ref($arg) ne 'ARRAY') { | 
| 239 |  |  |  |  |  |  | $tokens = [ | 
| 240 |  |  |  |  |  |  | $arg =~  m/( \( | \)| \bAND\s+NOT\b | \bAND\b | \bOR\b | \"[^\"]+\" | \b\w+\b) /xig | 
| 241 |  |  |  |  |  |  | ]; | 
| 242 |  |  |  |  |  |  | } | 
| 243 |  |  |  |  |  |  | # important!!	"AND NOT" is treated as a single logical operator... | 
| 244 |  |  |  |  |  |  | # 		this means that things like "not a and b" aren't well-formed, | 
| 245 |  |  |  |  |  |  | #		while "b and not a" is | 
| 246 |  |  |  |  |  |  | else { $tokens = $arg; | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  | my $left =  undef; # array ref  (oppure stringa se è un espressione atomica) | 
| 249 |  |  |  |  |  |  | my $right = undef; # array ref ! | 
| 250 |  |  |  |  |  |  | my $op =    'OR'; | 
| 251 |  |  |  |  |  |  | my $depth = 0; | 
| 252 |  |  |  |  |  |  | my $pos = 0; | 
| 253 |  |  |  |  |  |  | my $tok; | 
| 254 |  |  |  |  |  |  | my $len = int @$tokens; | 
| 255 |  |  |  |  |  |  | DEBUG("expr = ", join(" + ", @$tokens),"\n"); | 
| 256 |  |  |  |  |  |  | while (1) { | 
| 257 |  |  |  |  |  |  | if ($len == 1) { | 
| 258 |  |  |  |  |  |  | return [ undef, $tokens->[0], undef ]; | 
| 259 |  |  |  |  |  |  | } | 
| 260 |  |  |  |  |  |  | DEBUG("$tok : depth=$depth pos=$pos len=$len\n"); | 
| 261 |  |  |  |  |  |  | if ($depth == 0 && ($pos == $len) ) { | 
| 262 |  |  |  |  |  |  | if ($tokens->[0] eq '(' && $tokens->[$len-1] eq ')') { | 
| 263 |  |  |  |  |  |  | # take off outer parentheses... | 
| 264 |  |  |  |  |  |  | shift @$tokens; | 
| 265 |  |  |  |  |  |  | pop @$tokens; | 
| 266 |  |  |  |  |  |  | $len  -= 2; | 
| 267 |  |  |  |  |  |  | $pos   = 0; | 
| 268 |  |  |  |  |  |  | $depth = 0; | 
| 269 |  |  |  |  |  |  | DEBUG("expr = ", join(" + ", @$tokens),"\n"); | 
| 270 |  |  |  |  |  |  | } else { # ahhhh... this expression won't be parsed... | 
| 271 |  |  |  |  |  |  | $syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; | 
| 272 |  |  |  |  |  |  | DEBUG("atom not atomic\n"); | 
| 273 |  |  |  |  |  |  | return undef; | 
| 274 |  |  |  |  |  |  | } | 
| 275 |  |  |  |  |  |  |  | 
| 276 |  |  |  |  |  |  | } elsif ( $pos == $len ) { | 
| 277 |  |  |  |  |  |  | $syntax_error = "Non-matching parentheses (\"".join(' ', @$tokens)."\")"; | 
| 278 |  |  |  |  |  |  | DEBUG("non matching parentheses\n"); | 
| 279 |  |  |  |  |  |  | return undef; | 
| 280 |  |  |  |  |  |  | } | 
| 281 |  |  |  |  |  |  | $tok = $tokens->[$pos++]; | 
| 282 |  |  |  |  |  |  | if ($tok eq '(') { $depth++; next; } | 
| 283 |  |  |  |  |  |  | if ($tok eq ')') { $depth--; next; } | 
| 284 |  |  |  |  |  |  | next if $depth; | 
| 285 |  |  |  |  |  |  | if ($tok  =~ /\b(AND\s+NOT|AND|OR)\b/i) { | 
| 286 |  |  |  |  |  |  | if ($pos == 1 || $pos == $len)  { | 
| 287 |  |  |  |  |  |  | $syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; | 
| 288 |  |  |  |  |  |  | return undef | 
| 289 |  |  |  |  |  |  | } | 
| 290 |  |  |  |  |  |  | $op = uc $1; $op =~ s/\s+/ /g; | 
| 291 |  |  |  |  |  |  | $left = [ @$tokens[0..$pos-2]    ]; | 
| 292 |  |  |  |  |  |  | $right =  [ @$tokens[$pos..$len-1] ]; | 
| 293 |  |  |  |  |  |  | DEBUG("right = ", join(" + ", @$right),"\n"); | 
| 294 |  |  |  |  |  |  | DEBUG("left  = ", join(" + ", @$left),"\n"); | 
| 295 |  |  |  |  |  |  | return [ $op, $left, $right ]; | 
| 296 |  |  |  |  |  |  | } | 
| 297 |  |  |  |  |  |  | } | 
| 298 |  |  |  |  |  |  | } | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  |  | 
| 302 |  |  |  |  |  |  | sub query { 	 # simple query....  	altavista +/- prefixes are recognized... | 
| 303 |  |  |  |  |  |  | #			*/? globbing also works but | 
| 304 |  |  |  |  |  |  | #			slows query down significantly | 
| 305 |  |  |  |  |  |  | #			globbing implicitly discards +/- prefix (it's a BUG!!!) | 
| 306 |  |  |  |  |  |  | my $self = shift; | 
| 307 |  |  |  |  |  |  | my $indexdbpath= $self->{indexdbpath}; | 
| 308 |  |  |  |  |  |  | my $locationsdbpath = $self->{locationsdbpath}; | 
| 309 |  |  |  |  |  |  | my $titlesdbpath = $self->{titlesdbpath}; | 
| 310 |  |  |  |  |  |  | my %indexdb; | 
| 311 |  |  |  |  |  |  | my %locationsdb; | 
| 312 |  |  |  |  |  |  | my %titlesdb; | 
| 313 |  |  |  |  |  |  | return undef unless (-f $indexdbpath && -r _); | 
| 314 |  |  |  |  |  |  | return undef unless (-f $locationsdbpath && -r _); | 
| 315 |  |  |  |  |  |  | return undef unless (-f $titlesdbpath && -r _); | 
| 316 |  |  |  |  |  |  | return undef unless | 
| 317 |  |  |  |  |  |  | tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && | 
| 318 |  |  |  |  |  |  | tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && | 
| 319 |  |  |  |  |  |  | tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); | 
| 320 |  |  |  |  |  |  | my %matches; | 
| 321 |  |  |  |  |  |  | my %limit; | 
| 322 |  |  |  |  |  |  | my %exclude; | 
| 323 |  |  |  |  |  |  | my @ignored; | 
| 324 |  |  |  |  |  |  | my $key; | 
| 325 |  |  |  |  |  |  | my $word; | 
| 326 |  |  |  |  |  |  | my $mustbe_words = 0; | 
| 327 |  |  |  |  |  |  | my @words = (); | 
| 328 |  |  |  |  |  |  | my $glob_regexp = undef; | 
| 329 |  |  |  |  |  |  | for (@_) {		# globbing feature... e.g. uni* passw? | 
| 330 |  |  |  |  |  |  | if ( /\*|\?/) { | 
| 331 |  |  |  |  |  |  | s/\*/\.\*/g; | 
| 332 |  |  |  |  |  |  | s/\?/\./g; | 
| 333 |  |  |  |  |  |  | $glob_regexp = $glob_regexp ? $glob_regexp."|^$_\$" : "^$_\$" ; | 
| 334 |  |  |  |  |  |  | } | 
| 335 |  |  |  |  |  |  | else { | 
| 336 |  |  |  |  |  |  | push @words, $_; | 
| 337 |  |  |  |  |  |  | } | 
| 338 |  |  |  |  |  |  | } | 
| 339 |  |  |  |  |  |  | if ($glob_regexp) { | 
| 340 |  |  |  |  |  |  | my $regexp = qr/$glob_regexp/; | 
| 341 |  |  |  |  |  |  | # collect  all words in db matching $glob_regexp and append them to the query | 
| 342 |  |  |  |  |  |  | _push_words_from_hash(\%indexdb, \@words, $regexp); | 
| 343 |  |  |  |  |  |  | } | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | DEBUG("looking up ", join(", ", @words ), "\n"); | 
| 346 |  |  |  |  |  |  | foreach $word (@words) { | 
| 347 |  |  |  |  |  |  | my $rc = 0; | 
| 348 |  |  |  |  |  |  | #		DEBUG($word); | 
| 349 |  |  |  |  |  |  | if ($word =~ /^-(.*)/) { | 
| 350 |  |  |  |  |  |  | my $keys = $indexdb{lc $1}; | 
| 351 |  |  |  |  |  |  | $rc = _add_keys_to_match_hash($keys,\%exclude); | 
| 352 |  |  |  |  |  |  | } elsif ($word =~ /^\+(.*)/) { | 
| 353 |  |  |  |  |  |  | $mustbe_words++; | 
| 354 |  |  |  |  |  |  | my $keys = $indexdb{lc $1}; | 
| 355 |  |  |  |  |  |  | $rc = _add_keys_to_match_hash($keys,\%limit); | 
| 356 |  |  |  |  |  |  | } else { | 
| 357 |  |  |  |  |  |  | my $keys = $indexdb{lc $word}; | 
| 358 |  |  |  |  |  |  | $rc = _add_keys_to_match_hash($keys,\%matches); | 
| 359 |  |  |  |  |  |  | } | 
| 360 |  |  |  |  |  |  | #		DEBUG("\n"); | 
| 361 |  |  |  |  |  |  | if (not $rc) { push @ignored, $word } | 
| 362 |  |  |  |  |  |  | } | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | if ($mustbe_words) { | 
| 365 |  |  |  |  |  |  | for $key(keys %limit) { | 
| 366 |  |  |  |  |  |  | next unless $limit{$key} >= $mustbe_words; | 
| 367 |  |  |  |  |  |  | $matches{$key}  += $limit{$key} ; | 
| 368 |  |  |  |  |  |  | } | 
| 369 |  |  |  |  |  |  | for $key(keys %matches) { | 
| 370 |  |  |  |  |  |  | delete $matches{$key} unless $limit{$key}; | 
| 371 |  |  |  |  |  |  | } | 
| 372 |  |  |  |  |  |  | } | 
| 373 |  |  |  |  |  |  | for $key(keys %exclude) { | 
| 374 |  |  |  |  |  |  | delete $matches{$key}; | 
| 375 |  |  |  |  |  |  | } | 
| 376 |  |  |  |  |  |  | my $result =  _make_result_hash(\%matches,\%locationsdb, \%titlesdb, \@words, \@ignored); | 
| 377 |  |  |  |  |  |  | untie(%indexdb); | 
| 378 |  |  |  |  |  |  | untie(%locationsdb); | 
| 379 |  |  |  |  |  |  | untie(%titlesdb); | 
| 380 |  |  |  |  |  |  | return $result; | 
| 381 |  |  |  |  |  |  | } | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  | sub _make_result_hash { | 
| 385 |  |  |  |  |  |  | #            hash-ref  hash-ref   hash-ref    array-ref   array-ref | 
| 386 |  |  |  |  |  |  | my ( $match,   $locationsdb,  $titlesdb,  $words,     $ignored  ) = @_; | 
| 387 |  |  |  |  |  |  | my $result = { | 
| 388 |  |  |  |  |  |  | searched =>  $words, | 
| 389 |  |  |  |  |  |  | ignored  =>  $ignored, | 
| 390 |  |  |  |  |  |  | entries	 =>  [] | 
| 391 |  |  |  |  |  |  | }; | 
| 392 |  |  |  |  |  |  | my $key; | 
| 393 |  |  |  |  |  |  | foreach $key (keys %$match) { | 
| 394 |  |  |  |  |  |  | my $ckey = pack("xN",$key); | 
| 395 |  |  |  |  |  |  | my $name = $locationsdb->{$ckey}; | 
| 396 |  |  |  |  |  |  | my $title = $titlesdb->{$ckey}; | 
| 397 |  |  |  |  |  |  |  | 
| 398 |  |  |  |  |  |  | push @{ $result->{entries} }, { | 
| 399 |  |  |  |  |  |  | location => $name, | 
| 400 |  |  |  |  |  |  | score 	 => $match->{$key}, | 
| 401 |  |  |  |  |  |  | title	 => $title | 
| 402 |  |  |  |  |  |  | }; | 
| 403 |  |  |  |  |  |  | DEBUG("$name:  $match->{$key}\n"); | 
| 404 |  |  |  |  |  |  | } | 
| 405 |  |  |  |  |  |  | return $result; | 
| 406 |  |  |  |  |  |  | } | 
| 407 |  |  |  |  |  |  |  | 
| 408 |  |  |  |  |  |  |  | 
| 409 |  |  |  |  |  |  |  | 
| 410 |  |  |  |  |  |  |  | 
| 411 |  |  |  |  |  |  |  | 
| 412 |  |  |  |  |  |  |  | 
| 413 |  |  |  |  |  |  |  | 
| 414 |  |  |  |  |  |  | sub DEBUG (@) { $verbose_flag && print STDERR @_ }; | 
| 415 |  |  |  |  |  |  |  | 
| 416 |  |  |  |  |  |  | sub tie_hash { | 
| 417 |  |  |  |  |  |  | my ($hashref, $file ,$perm) = @_; | 
| 418 |  |  |  |  |  |  | $perm = (O_RDWR|O_CREAT) unless defined $perm; | 
| 419 |  |  |  |  |  |  | my $rc = tied(%$hashref); | 
| 420 |  |  |  |  |  |  | return $rc if $rc; | 
| 421 |  |  |  |  |  |  | $rc = tie(%$hashref,'DB_File',$file, $perm, 0644, $DB_File::DB_BTREE) ; | 
| 422 |  |  |  |  |  |  | if ($debug_flag) { | 
| 423 |  |  |  |  |  |  | my $count = int keys %$hashref; | 
| 424 |  |  |  |  |  |  | DEBUG("tie $hashref ($rc) ($count keys)\n"); | 
| 425 |  |  |  |  |  |  | } elsif ($verbose_flag) { | 
| 426 |  |  |  |  |  |  | DEBUG("tie $hashref ($rc)\n"); | 
| 427 |  |  |  |  |  |  | } | 
| 428 |  |  |  |  |  |  |  | 
| 429 |  |  |  |  |  |  |  | 
| 430 |  |  |  |  |  |  | return $rc; | 
| 431 |  |  |  |  |  |  | } | 
| 432 |  |  |  |  |  |  |  | 
| 433 |  |  |  |  |  |  | sub untie_hash { | 
| 434 |  |  |  |  |  |  | my ($hashref, $file ) = @_; | 
| 435 |  |  |  |  |  |  | if ($debug_flag) { | 
| 436 |  |  |  |  |  |  | my $count = int keys %$hashref; | 
| 437 |  |  |  |  |  |  | DEBUG("untie $hashref ($count keys)\n") | 
| 438 |  |  |  |  |  |  | } | 
| 439 |  |  |  |  |  |  | untie(%$hashref); | 
| 440 |  |  |  |  |  |  | } | 
| 441 |  |  |  |  |  |  |  | 
| 442 |  |  |  |  |  |  |  | 
| 443 |  |  |  |  |  |  | 1; | 
| 444 |  |  |  |  |  |  | #__END__ | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | =head1 NAME | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | MMM::Text::Search - Perl module for indexing and searching text files and web objects | 
| 449 |  |  |  |  |  |  |  | 
| 450 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 451 |  |  |  |  |  |  |  | 
| 452 |  |  |  |  |  |  | use MMM::Text::Search; | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | my $srch = new MMM::Text::Search {	#for indexing... | 
| 455 |  |  |  |  |  |  | #index main file location... | 
| 456 |  |  |  |  |  |  | IndexPath => "/tmp/myindex.db", | 
| 457 |  |  |  |  |  |  | #local files... (optional) | 
| 458 |  |  |  |  |  |  | FileMask  => '(?i)(\.txt|\.htm.?)$', | 
| 459 |  |  |  |  |  |  | Dirs	  => [ "/usr/doc", "/tmp" ] , | 
| 460 |  |  |  |  |  |  | FollowSymLinks => 0|1, (default = 0) | 
| 461 |  |  |  |  |  |  | #web objects... (optional) | 
| 462 |  |  |  |  |  |  | URLs	  => [ "http://localhost/", ... ], | 
| 463 |  |  |  |  |  |  | Level	  => recursion-level (0=unlimited) | 
| 464 |  |  |  |  |  |  | #common options... | 
| 465 |  |  |  |  |  |  | IgnoreLimit =>	0.3,   (default = 2/3) | 
| 466 |  |  |  |  |  |  | Verbose => 0|1 | 
| 467 |  |  |  |  |  |  | }; | 
| 468 |  |  |  |  |  |  |  | 
| 469 |  |  |  |  |  |  | $srch->start_indexing_session(); | 
| 470 |  |  |  |  |  |  |  | 
| 471 |  |  |  |  |  |  | $srch->commit_indexing_session(); | 
| 472 |  |  |  |  |  |  |  | 
| 473 |  |  |  |  |  |  | $srch->index_default_locations(); | 
| 474 |  |  |  |  |  |  |  | 
| 475 |  |  |  |  |  |  | $srch->index_content( { title =>   '...', | 
| 476 |  |  |  |  |  |  | content=>  '...', | 
| 477 |  |  |  |  |  |  | id =>      '...'  } ); | 
| 478 |  |  |  |  |  |  |  | 
| 479 |  |  |  |  |  |  | $srch->makeindex; | 
| 480 |  |  |  |  |  |  | (Obsolete.) | 
| 481 |  |  |  |  |  |  |  | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  |  | 
| 484 |  |  |  |  |  |  |  | 
| 485 |  |  |  |  |  |  |  | 
| 486 |  |  |  |  |  |  | my $srch = new MMM::Text::Search (  #for searching.... | 
| 487 |  |  |  |  |  |  | "/tmp/myindex.db", verbose_flag ); | 
| 488 |  |  |  |  |  |  |  | 
| 489 |  |  |  |  |  |  | my $hashref = $srch->query("pizza","ciao", "-pasta" ); | 
| 490 |  |  |  |  |  |  | my $hashref = $srch->advanced_query("(pizza OR ciao) AND NOT pasta"); | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | $srch->errstr()	# returns last error | 
| 493 |  |  |  |  |  |  | # (only query syntax-errors for the moment being) | 
| 494 |  |  |  |  |  |  |  | 
| 495 |  |  |  |  |  |  |  | 
| 496 |  |  |  |  |  |  | $srch->dump_word_stats(\*FH) | 
| 497 |  |  |  |  |  |  |  | 
| 498 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 499 |  |  |  |  |  |  |  | 
| 500 |  |  |  |  |  |  |  | 
| 501 |  |  |  |  |  |  | =item	* | 
| 502 |  |  |  |  |  |  | Indexing | 
| 503 |  |  |  |  |  |  |  | 
| 504 |  |  |  |  |  |  | When a session is closed the following files will have been created | 
| 505 |  |  |  |  |  |  | (assuming IndexPath = /path/myindex.db, see constructor): | 
| 506 |  |  |  |  |  |  |  | 
| 507 |  |  |  |  |  |  |  | 
| 508 |  |  |  |  |  |  | /path/myindex.db	     word index database | 
| 509 |  |  |  |  |  |  | /path/myindex-locations.db   filename/URL database | 
| 510 |  |  |  |  |  |  | /path/myindex-titles.db	     html title database | 
| 511 |  |  |  |  |  |  | /path/myindex.stopwords	     stop-words list | 
| 512 |  |  |  |  |  |  | /path/myindex.filelist	     readable list of indexed files/URLs | 
| 513 |  |  |  |  |  |  | /path/myindex.deadlinks	     broken http links | 
| 514 |  |  |  |  |  |  |  | 
| 515 |  |  |  |  |  |  | [... lots of important things missing ... ] | 
| 516 |  |  |  |  |  |  |  | 
| 517 |  |  |  |  |  |  | start_indexing_session() starts session. | 
| 518 |  |  |  |  |  |  |  | 
| 519 |  |  |  |  |  |  | commit_indexing_session() commits and closes current session. | 
| 520 |  |  |  |  |  |  |  | 
| 521 |  |  |  |  |  |  | index_default_locations() indexes all files and URLs specified on construction. | 
| 522 |  |  |  |  |  |  |  | 
| 523 |  |  |  |  |  |  | index_content() pushes content into indexing engine. | 
| 524 |  |  |  |  |  |  | Argument must have the following structure | 
| 525 |  |  |  |  |  |  |  | 
| 526 |  |  |  |  |  |  | { title =>   '...', content=>  '...', id =>      '...'  } | 
| 527 |  |  |  |  |  |  |  | 
| 528 |  |  |  |  |  |  |  | 
| 529 |  |  |  |  |  |  | makeindex() is obsolete. | 
| 530 |  |  |  |  |  |  | Equivalent to: | 
| 531 |  |  |  |  |  |  | $srch->start_indexing_session(); | 
| 532 |  |  |  |  |  |  | $srch->index_default_locations(); | 
| 533 |  |  |  |  |  |  | $srch->commit_indexing_session(); | 
| 534 |  |  |  |  |  |  |  | 
| 535 |  |  |  |  |  |  |  | 
| 536 |  |  |  |  |  |  |  | 
| 537 |  |  |  |  |  |  |  | 
| 538 |  |  |  |  |  |  | dump_word_stats(\*FH) dumps all words sorted by occurence frequency using | 
| 539 |  |  |  |  |  |  | FH file handle (or STDOUT if no parameter is specified). Stop-words get a | 
| 540 |  |  |  |  |  |  | frequency value of 1. | 
| 541 |  |  |  |  |  |  |  | 
| 542 |  |  |  |  |  |  | =item * | 
| 543 |  |  |  |  |  |  | Searching | 
| 544 |  |  |  |  |  |  |  | 
| 545 |  |  |  |  |  |  | Both query() and advanced_query() return a reference to a hash with | 
| 546 |  |  |  |  |  |  | the following structure: | 
| 547 |  |  |  |  |  |  |  | 
| 548 |  |  |  |  |  |  | ( | 
| 549 |  |  |  |  |  |  | ignored  => [ string, string, ... ], # ignored words | 
| 550 |  |  |  |  |  |  | searched => [ string, string, ... ], # words searched for | 
| 551 |  |  |  |  |  |  | entries    => [  hashref, hashref, ... ] # list of records | 
| 552 |  |  |  |  |  |  | # found | 
| 553 |  |  |  |  |  |  | ) | 
| 554 |  |  |  |  |  |  |  | 
| 555 |  |  |  |  |  |  | The 'entries' element is a reference to an array of hashes, each having | 
| 556 |  |  |  |  |  |  | the following structure: | 
| 557 |  |  |  |  |  |  |  | 
| 558 |  |  |  |  |  |  | ( | 
| 559 |  |  |  |  |  |  | location => string,  # file path or URL or anything | 
| 560 |  |  |  |  |  |  | score    => number,  # score | 
| 561 |  |  |  |  |  |  | title    => string   # HTML title | 
| 562 |  |  |  |  |  |  | ) | 
| 563 |  |  |  |  |  |  |  | 
| 564 |  |  |  |  |  |  | =head1 NOTES | 
| 565 |  |  |  |  |  |  |  | 
| 566 |  |  |  |  |  |  | Note on implementation: | 
| 567 |  |  |  |  |  |  | The technique used for indexing is substantially derived from that | 
| 568 |  |  |  |  |  |  | exposed by Tim Kientzle on Dr. Dobbs magazine. | 
| 569 |  |  |  |  |  |  |  | 
| 570 |  |  |  |  |  |  | =head1 BUGS | 
| 571 |  |  |  |  |  |  |  | 
| 572 |  |  |  |  |  |  | Many, I guess. | 
| 573 |  |  |  |  |  |  |  | 
| 574 |  |  |  |  |  |  | =head1 AUTHOR | 
| 575 |  |  |  |  |  |  |  | 
| 576 |  |  |  |  |  |  | Max Muzi | 
| 577 |  |  |  |  |  |  |  | 
| 578 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 579 |  |  |  |  |  |  |  | 
| 580 |  |  |  |  |  |  | perl(1). | 
| 581 |  |  |  |  |  |  |  | 
| 582 |  |  |  |  |  |  | =cut | 
| 583 |  |  |  |  |  |  |  | 
| 584 |  |  |  |  |  |  |  | 
| 585 |  |  |  |  |  |  |  | 
| 586 |  |  |  |  |  |  | # | 
| 587 |  |  |  |  |  |  | #-------------------- the following code is only used when indexing ---------------- | 
| 588 |  |  |  |  |  |  | # | 
| 589 |  |  |  |  |  |  |  | 
| 590 |  |  |  |  |  |  | sub dump_word_stats { | 
| 591 |  |  |  |  |  |  | my $self = shift; | 
| 592 |  |  |  |  |  |  | my $fh = shift || \*STDOUT; | 
| 593 |  |  |  |  |  |  | my $indexdbpath= $self->{indexdbpath}; | 
| 594 |  |  |  |  |  |  | my %indexdb; | 
| 595 |  |  |  |  |  |  | die unless (-f $indexdbpath && -r _); | 
| 596 |  |  |  |  |  |  | tie_hash(\%indexdb,$indexdbpath, O_RDONLY ); | 
| 597 |  |  |  |  |  |  | my %index = ( %indexdb ); | 
| 598 |  |  |  |  |  |  | my $w; | 
| 599 |  |  |  |  |  |  | for $w( sort { length($index{$b}) <=> length($index{$a}) } | 
| 600 |  |  |  |  |  |  | keys %index ) { | 
| 601 |  |  |  |  |  |  | print $fh $w, "\t", length($index{$w}) / 2, "\n"; | 
| 602 |  |  |  |  |  |  | } | 
| 603 |  |  |  |  |  |  | untie_hash(\%indexdb); | 
| 604 |  |  |  |  |  |  | } | 
| 605 |  |  |  |  |  |  |  | 
| 606 |  |  |  |  |  |  |  | 
| 607 |  |  |  |  |  |  | sub start_indexing_session | 
| 608 |  |  |  |  |  |  | { | 
| 609 |  |  |  |  |  |  | my $self = shift; | 
| 610 |  |  |  |  |  |  | $self->rollback_indexing_session; | 
| 611 |  |  |  |  |  |  | my $key = 0; | 
| 612 |  |  |  |  |  |  | my $indexdbpath = $self->{indexdbpath}; | 
| 613 |  |  |  |  |  |  | my $locationsdbpath = $self->{locationsdbpath}; | 
| 614 |  |  |  |  |  |  | my $titlesdbpath = $self->{titlesdbpath}; | 
| 615 |  |  |  |  |  |  |  | 
| 616 |  |  |  |  |  |  | my $filemask 	= $self->{filemask}; | 
| 617 |  |  |  |  |  |  | my $keyref = \$key; | 
| 618 |  |  |  |  |  |  | my $filelistfile = $indexdbpath; | 
| 619 |  |  |  |  |  |  | $filelistfile =~  s/(\.db)?$/\.filelist/; | 
| 620 |  |  |  |  |  |  | open FILELIST, ">".$filelistfile; | 
| 621 |  |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | my $session = { | 
| 623 |  |  |  |  |  |  | indexdbpath 	=> $indexdbpath, | 
| 624 |  |  |  |  |  |  | locationsdbpath 	=> $locationsdbpath, | 
| 625 |  |  |  |  |  |  | titlesdbpath 	=> $titlesdbpath, | 
| 626 |  |  |  |  |  |  | indexdb 	=> { }, | 
| 627 |  |  |  |  |  |  | locationsdb 	=> { }, | 
| 628 |  |  |  |  |  |  | titlesdb 	=> { }, | 
| 629 |  |  |  |  |  |  | cachedb 	=> { }, | 
| 630 |  |  |  |  |  |  | filemask 	=> $filemask, | 
| 631 |  |  |  |  |  |  | current_key	=> 16, # first 16 values are reserved (0 = word is ignored) | 
| 632 |  |  |  |  |  |  | bytes		=> 0, | 
| 633 |  |  |  |  |  |  | count 		=> 0, | 
| 634 |  |  |  |  |  |  | filecount	=> 0, | 
| 635 |  |  |  |  |  |  | listfh		=> \*FILELIST, | 
| 636 |  |  |  |  |  |  | status_THE 	=> 0, | 
| 637 |  |  |  |  |  |  | followsymlinks	=> $self->{followsymlinks}, | 
| 638 |  |  |  |  |  |  | minwordsize	=> $self->{minwordsize}, | 
| 639 |  |  |  |  |  |  | ignoreword	=> {}, | 
| 640 |  |  |  |  |  |  | autoignore	=> 1, | 
| 641 |  |  |  |  |  |  | ignorelimit	=> $self->{ignorelimit} || (2/3), | 
| 642 |  |  |  |  |  |  | level		=> $self->{level}, | 
| 643 |  |  |  |  |  |  | url_exclude 	=> $self->{url_exclude}, | 
| 644 |  |  |  |  |  |  | file_reader  => $self->{file_reader}, | 
| 645 |  |  |  |  |  |  | use_inode    => $self->{use_inode}, | 
| 646 |  |  |  |  |  |  | no_reset     => $self->{no_reset}, | 
| 647 |  |  |  |  |  |  | }; | 
| 648 |  |  |  |  |  |  |  | 
| 649 |  |  |  |  |  |  | unlink $indexdbpath."~"; | 
| 650 |  |  |  |  |  |  | unlink $locationsdbpath."~"; | 
| 651 |  |  |  |  |  |  | unlink $titlesdbpath."~"; | 
| 652 |  |  |  |  |  |  | if( $self->{no_reset} ) | 
| 653 |  |  |  |  |  |  | { | 
| 654 |  |  |  |  |  |  | copy( $indexdbpath, $indexdbpath."~" ); | 
| 655 |  |  |  |  |  |  | copy( $locationsdbpath, $locationsdbpath."~" ); | 
| 656 |  |  |  |  |  |  | copy( $titlesdbpath, $titlesdbpath."~" ); | 
| 657 |  |  |  |  |  |  | } | 
| 658 |  |  |  |  |  |  | tie_hash($session->{indexdb}, $indexdbpath."~" )   or die "$indexdbpath: $!\n"; | 
| 659 |  |  |  |  |  |  | tie_hash($session->{locationsdb}, $locationsdbpath."~" )   or die $!; | 
| 660 |  |  |  |  |  |  | tie_hash($session->{titlesdb},$titlesdbpath."~" ) or die $!; | 
| 661 |  |  |  |  |  |  |  | 
| 662 |  |  |  |  |  |  | my $ignorefile = $indexdbpath; | 
| 663 |  |  |  |  |  |  | $ignorefile =~ s/(\.db)?$/\.stopwords/; | 
| 664 |  |  |  |  |  |  | if (-r $ignorefile) {  # read *-stopwords.dat file | 
| 665 |  |  |  |  |  |  | open F, $ignorefile; | 
| 666 |  |  |  |  |  |  | while () { | 
| 667 |  |  |  |  |  |  | chomp; | 
| 668 |  |  |  |  |  |  | s/^\s+|\s+$//g; | 
| 669 |  |  |  |  |  |  | $session->{ignoreword}->{$_} = 1; | 
| 670 |  |  |  |  |  |  | } | 
| 671 |  |  |  |  |  |  | close F; | 
| 672 |  |  |  |  |  |  | my $count = int keys %{ $session->{ignoreword} }; | 
| 673 |  |  |  |  |  |  | DEBUG("using stop-words from $ignorefile ($count words)\n"); | 
| 674 |  |  |  |  |  |  | $session->{autoignore} = 0; | 
| 675 |  |  |  |  |  |  | } | 
| 676 |  |  |  |  |  |  | $session->{ignorefile} = $ignorefile; | 
| 677 |  |  |  |  |  |  |  | 
| 678 |  |  |  |  |  |  | my $time = time(); | 
| 679 |  |  |  |  |  |  |  | 
| 680 |  |  |  |  |  |  | $session->{start_time} = $time; | 
| 681 |  |  |  |  |  |  |  | 
| 682 |  |  |  |  |  |  | $self->{session} = $session; | 
| 683 |  |  |  |  |  |  | } | 
| 684 |  |  |  |  |  |  |  | 
| 685 |  |  |  |  |  |  | sub index_default_locations | 
| 686 |  |  |  |  |  |  | { | 
| 687 |  |  |  |  |  |  | my $self = shift; | 
| 688 |  |  |  |  |  |  | my $session = $self->{session}; | 
| 689 |  |  |  |  |  |  | return unless $session; | 
| 690 |  |  |  |  |  |  |  | 
| 691 |  |  |  |  |  |  | my $dirs 	= $self->{dirs}; | 
| 692 |  |  |  |  |  |  | my $urls	= $self->{urls}; | 
| 693 |  |  |  |  |  |  | my $filecount = 0; | 
| 694 |  |  |  |  |  |  | DEBUG("Counting files...\n") if int @$dirs; | 
| 695 |  |  |  |  |  |  | my $dir; | 
| 696 |  |  |  |  |  |  | for $dir( sort  @$dirs) { $filecount += IndexDir($session, $dir, 1); } | 
| 697 |  |  |  |  |  |  | $session->{filecount} = $filecount; | 
| 698 |  |  |  |  |  |  |  | 
| 699 |  |  |  |  |  |  | for $dir( sort  @$dirs) { IndexDir($session, $dir); } | 
| 700 |  |  |  |  |  |  | for my $url( sort  @$urls) { IndexWeb($session, $url); } | 
| 701 |  |  |  |  |  |  | } | 
| 702 |  |  |  |  |  |  |  | 
| 703 |  |  |  |  |  |  | sub index_content | 
| 704 |  |  |  |  |  |  | { | 
| 705 |  |  |  |  |  |  | my $self = shift; | 
| 706 |  |  |  |  |  |  | my $session = $self->{session}; | 
| 707 |  |  |  |  |  |  | return unless $session; | 
| 708 |  |  |  |  |  |  | my $info = shift; | 
| 709 |  |  |  |  |  |  | if( ref($info) ne 'HASH'  ) | 
| 710 |  |  |  |  |  |  | {	warn("usage: \$src->index_content( { content=>'...', id=>'...', title=>'...' } )\n"); | 
| 711 |  |  |  |  |  |  | return undef; | 
| 712 |  |  |  |  |  |  | } | 
| 713 |  |  |  |  |  |  | IndexFile( $session, $info->{id}, $info->{content}, $info->{title} ); | 
| 714 |  |  |  |  |  |  | return 1; | 
| 715 |  |  |  |  |  |  | } | 
| 716 |  |  |  |  |  |  |  | 
| 717 |  |  |  |  |  |  | sub rollback_indexing_session | 
| 718 |  |  |  |  |  |  | { | 
| 719 |  |  |  |  |  |  | my $self = shift; | 
| 720 |  |  |  |  |  |  | my $session = $self->{session}; | 
| 721 |  |  |  |  |  |  | return unless $session; | 
| 722 |  |  |  |  |  |  | untie_hash($session->{indexdb}); | 
| 723 |  |  |  |  |  |  | untie_hash($session->{locationsdb}); | 
| 724 |  |  |  |  |  |  | untie_hash($session->{titlesdb}); | 
| 725 |  |  |  |  |  |  | my $indexdbpath = $self->{indexdbpath}; | 
| 726 |  |  |  |  |  |  | my $locationsdbpath = $self->{locationsdbpath}; | 
| 727 |  |  |  |  |  |  | my $titlesdbpath = $self->{titlesdbpath}; | 
| 728 |  |  |  |  |  |  |  | 
| 729 |  |  |  |  |  |  | unlink $indexdbpath."~"; | 
| 730 |  |  |  |  |  |  | unlink $locationsdbpath."~"; | 
| 731 |  |  |  |  |  |  | unlink $titlesdbpath."~"; | 
| 732 |  |  |  |  |  |  | $self->{session} = undef; | 
| 733 |  |  |  |  |  |  | } | 
| 734 |  |  |  |  |  |  |  | 
| 735 |  |  |  |  |  |  | sub DESTROY | 
| 736 |  |  |  |  |  |  | { | 
| 737 |  |  |  |  |  |  | my $self = shift; | 
| 738 |  |  |  |  |  |  | $self->rollback_indexing_session; | 
| 739 |  |  |  |  |  |  | } | 
| 740 |  |  |  |  |  |  |  | 
| 741 |  |  |  |  |  |  | sub commit_indexing_session | 
| 742 |  |  |  |  |  |  | { | 
| 743 |  |  |  |  |  |  | my $self = shift; | 
| 744 |  |  |  |  |  |  | my $session = $self->{session}; | 
| 745 |  |  |  |  |  |  | return unless $session; | 
| 746 |  |  |  |  |  |  | FlushCache($session->{cachedb}, $session->{indexdb}, $session); | 
| 747 |  |  |  |  |  |  | my $time = time()-$session->{start_time}; | 
| 748 |  |  |  |  |  |  | DEBUG("$session->{bytes} bytes read, $session->{count} files processed in $time seconds\n"); | 
| 749 |  |  |  |  |  |  | untie_hash($session->{indexdb}); | 
| 750 |  |  |  |  |  |  | untie_hash($session->{locationsdb}); | 
| 751 |  |  |  |  |  |  | untie_hash($session->{titlesdb}); | 
| 752 |  |  |  |  |  |  |  | 
| 753 |  |  |  |  |  |  | my $indexdbpath = $self->{indexdbpath}; | 
| 754 |  |  |  |  |  |  | my $locationsdbpath = $self->{locationsdbpath}; | 
| 755 |  |  |  |  |  |  | my $titlesdbpath = $self->{titlesdbpath}; | 
| 756 |  |  |  |  |  |  |  | 
| 757 |  |  |  |  |  |  | rename $indexdbpath."~", $indexdbpath; | 
| 758 |  |  |  |  |  |  | rename $locationsdbpath."~", $locationsdbpath ; | 
| 759 |  |  |  |  |  |  | rename $titlesdbpath."~", $titlesdbpath; | 
| 760 |  |  |  |  |  |  | close $session->{listfh}; | 
| 761 |  |  |  |  |  |  | if ( $session->{autoignore} ) { | 
| 762 |  |  |  |  |  |  | my $ignorefile = $session->{ignorefile}; | 
| 763 |  |  |  |  |  |  | open  F, ">".$ignorefile; #write *-stopwords.dat file | 
| 764 |  |  |  |  |  |  | print F join( "\n", sort keys %{ $session->{ignoreword} } ); | 
| 765 |  |  |  |  |  |  | close F; | 
| 766 |  |  |  |  |  |  | } | 
| 767 |  |  |  |  |  |  |  | 
| 768 |  |  |  |  |  |  | $self->{session} = undef; | 
| 769 |  |  |  |  |  |  | } | 
| 770 |  |  |  |  |  |  |  | 
| 771 |  |  |  |  |  |  |  | 
| 772 |  |  |  |  |  |  |  | 
| 773 |  |  |  |  |  |  | sub makeindex | 
| 774 |  |  |  |  |  |  | { | 
| 775 |  |  |  |  |  |  | my $self = shift; | 
| 776 |  |  |  |  |  |  | $self->start_indexing_session(); | 
| 777 |  |  |  |  |  |  | $self->index_default_locations(); | 
| 778 |  |  |  |  |  |  | $self->commit_indexing_session(); | 
| 779 |  |  |  |  |  |  | } | 
| 780 |  |  |  |  |  |  |  | 
| 781 |  |  |  |  |  |  |  | 
| 782 |  |  |  |  |  |  | sub IndexDir { | 
| 783 |  |  |  |  |  |  | my ($session, $dir, $only_recurse) = @_; | 
| 784 |  |  |  |  |  |  | my $followsymlinks = $session->{followsymlinks}; | 
| 785 |  |  |  |  |  |  | my $file_reader = $session->{file_reader}; | 
| 786 |  |  |  |  |  |  | opendir D, $dir; | 
| 787 |  |  |  |  |  |  | #	DEBUG "D $dir\n"; | 
| 788 |  |  |  |  |  |  | my @files = readdir D; | 
| 789 |  |  |  |  |  |  | close D; | 
| 790 |  |  |  |  |  |  | my $e; | 
| 791 |  |  |  |  |  |  | my $count = 0; | 
| 792 |  |  |  |  |  |  | my $text; | 
| 793 |  |  |  |  |  |  | for $e(@files) { | 
| 794 |  |  |  |  |  |  | next if $e =~ /^\.\.?/; | 
| 795 |  |  |  |  |  |  | my $path = $dir."/".$e; | 
| 796 |  |  |  |  |  |  | if (-d $path) { | 
| 797 |  |  |  |  |  |  | unless ($followsymlinks) { | 
| 798 |  |  |  |  |  |  | next if -l $path ; | 
| 799 |  |  |  |  |  |  | } | 
| 800 |  |  |  |  |  |  | $count += IndexDir($session,$path, $only_recurse); | 
| 801 |  |  |  |  |  |  | } | 
| 802 |  |  |  |  |  |  | elsif (-f _ ) { | 
| 803 |  |  |  |  |  |  | my $filemask = $session->{filemask}; | 
| 804 |  |  |  |  |  |  | if ($filemask) { | 
| 805 |  |  |  |  |  |  | next unless $e =~ $filemask; | 
| 806 |  |  |  |  |  |  | } | 
| 807 |  |  |  |  |  |  | unless ($only_recurse) | 
| 808 |  |  |  |  |  |  | { | 
| 809 |  |  |  |  |  |  | if( $file_reader ) | 
| 810 |  |  |  |  |  |  | { | 
| 811 |  |  |  |  |  |  | $text = $file_reader->read( $path ); | 
| 812 |  |  |  |  |  |  | IndexFile($session,$path,$text); | 
| 813 |  |  |  |  |  |  | } else | 
| 814 |  |  |  |  |  |  | { | 
| 815 |  |  |  |  |  |  | IndexFile($session,$path); | 
| 816 |  |  |  |  |  |  | } | 
| 817 |  |  |  |  |  |  | } | 
| 818 |  |  |  |  |  |  | $count ++; | 
| 819 |  |  |  |  |  |  | } | 
| 820 |  |  |  |  |  |  | } | 
| 821 |  |  |  |  |  |  | return $count; | 
| 822 |  |  |  |  |  |  | } | 
| 823 |  |  |  |  |  |  |  | 
| 824 |  |  |  |  |  |  |  | 
| 825 |  |  |  |  |  |  |  | 
| 826 |  |  |  |  |  |  | sub IndexFile { | 
| 827 |  |  |  |  |  |  | my ($session, $file, $text, $title ) = @_; | 
| 828 |  |  |  |  |  |  | my $cachedb = $session->{cachedb}; | 
| 829 |  |  |  |  |  |  | my $locationsdb = $session->{locationsdb}; | 
| 830 |  |  |  |  |  |  | my $key = $session->{current_key}; | 
| 831 |  |  |  |  |  |  | if( $session->{use_inode} ) | 
| 832 |  |  |  |  |  |  | { | 
| 833 |  |  |  |  |  |  | $key = (stat($file))[1]; | 
| 834 |  |  |  |  |  |  | } | 
| 835 |  |  |  |  |  |  | my $no_of_files = $session->{filecount}; | 
| 836 |  |  |  |  |  |  | if(  $session->{no_reset} ) | 
| 837 |  |  |  |  |  |  | { | 
| 838 |  |  |  |  |  |  | if( exists $locationsdb->{pack"xN",$key} ) | 
| 839 |  |  |  |  |  |  | { | 
| 840 |  |  |  |  |  |  | warn("key $key already in locationsdb. Skipping\n"); | 
| 841 |  |  |  |  |  |  | return; | 
| 842 |  |  |  |  |  |  | } | 
| 843 |  |  |  |  |  |  | } | 
| 844 |  |  |  |  |  |  | DEBUG $session->{count}+1, "/$no_of_files $file (id=$key)\n"; | 
| 845 |  |  |  |  |  |  | my $fh = $session->{listfh}; | 
| 846 |  |  |  |  |  |  | print $fh "$key\t$file\n"; | 
| 847 |  |  |  |  |  |  | local $/; | 
| 848 |  |  |  |  |  |  | unless (defined $text) { | 
| 849 |  |  |  |  |  |  | undef $/; | 
| 850 |  |  |  |  |  |  | open(FILE, $file); | 
| 851 |  |  |  |  |  |  | ($text) = ; 		# Read entire file | 
| 852 |  |  |  |  |  |  | close FILE; | 
| 853 |  |  |  |  |  |  | } | 
| 854 |  |  |  |  |  |  | my $filesize =  length($text); | 
| 855 |  |  |  |  |  |  | if ($file =~ /\.s?htm.?/i ) { | 
| 856 |  |  |  |  |  |  | $text =~ /]*>([^<]+)<\/title/i ; | 
| 857 |  |  |  |  |  |  | $title = $1; | 
| 858 |  |  |  |  |  |  | $title =~ s/\s+/ /g; | 
| 859 |  |  |  |  |  |  | $text =~ s/<[^>]*>//g; 		# strip all HTML tags | 
| 860 |  |  |  |  |  |  | } | 
| 861 |  |  |  |  |  |  | if( defined $title ) | 
| 862 |  |  |  |  |  |  | { | 
| 863 |  |  |  |  |  |  | $session->{titlesdb}->{pack"xN",$key} = $title;  # put title in db | 
| 864 |  |  |  |  |  |  | DEBUG("* \"$title\"\n"); | 
| 865 |  |  |  |  |  |  | } | 
| 866 |  |  |  |  |  |  | # index all the words under the current file-id | 
| 867 |  |  |  |  |  |  | my($wordsIndexed) = &IndexWords($cachedb, $text,$key, $session); | 
| 868 |  |  |  |  |  |  | $session->{current_key}++; | 
| 869 |  |  |  |  |  |  | DEBUG "* $wordsIndexed words\n"; | 
| 870 |  |  |  |  |  |  |  | 
| 871 |  |  |  |  |  |  | # map file-id (key) to this filename | 
| 872 |  |  |  |  |  |  | $locationsdb->{pack"xN",$key} = $file;   	# leading null is here for | 
| 873 |  |  |  |  |  |  | # historical reasons :-) | 
| 874 |  |  |  |  |  |  | $session->{bytes} += $filesize; | 
| 875 |  |  |  |  |  |  | $session->{count}++; | 
| 876 |  |  |  |  |  |  | $session->{_temp_size} += $filesize; | 
| 877 |  |  |  |  |  |  | if ($session->{_temp_size} > 2000000 ) { | 
| 878 |  |  |  |  |  |  | my $rc = 0; | 
| 879 |  |  |  |  |  |  | $rc = FlushCache($cachedb, $session->{indexdb}, $session); | 
| 880 |  |  |  |  |  |  |  | 
| 881 |  |  |  |  |  |  | if (! $rc ) { | 
| 882 |  |  |  |  |  |  | tie_hash($session->{indexdb}, $session->{indexdbpath}) or die $!; | 
| 883 |  |  |  |  |  |  | untie_hash($session->{indexdb}); | 
| 884 |  |  |  |  |  |  | $rc = FlushCache($cachedb, $session->{indexdb}, $session); | 
| 885 |  |  |  |  |  |  | die $! if not $rc; | 
| 886 |  |  |  |  |  |  | } | 
| 887 |  |  |  |  |  |  |  | 
| 888 |  |  |  |  |  |  | $session->{_temp_size} = 0; | 
| 889 |  |  |  |  |  |  | $session->{cachedb} = {}; | 
| 890 |  |  |  |  |  |  | } | 
| 891 |  |  |  |  |  |  | } | 
| 892 |  |  |  |  |  |  |  | 
| 893 |  |  |  |  |  |  | sub IndexWords { | 
| 894 |  |  |  |  |  |  | my ($db, $words, $fileKey, $session) = @_; | 
| 895 |  |  |  |  |  |  | #      hash  content  file-id   options | 
| 896 |  |  |  |  |  |  | my (%worduniq); # for unique-ifying word list | 
| 897 |  |  |  |  |  |  | my $minwordsize = $session->{minwordsize}; | 
| 898 |  |  |  |  |  |  | my (@words) = split( /[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words); # split into an array of words | 
| 899 |  |  |  |  |  |  | @words = grep { $worduniq{$_}++ == 0 } 		# remove duplicates | 
| 900 |  |  |  |  |  |  | grep { length  > $minwordsize } 		# must be longer than one character | 
| 901 |  |  |  |  |  |  | grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ }	# strip leading punct | 
| 902 |  |  |  |  |  |  | grep { /[a-zA-Z0-9\xc0-\xff]/ } 		# must have an alphanumeric | 
| 903 |  |  |  |  |  |  | @words; | 
| 904 |  |  |  |  |  |  | #   "  foreach (sort @words) { " | 
| 905 |  |  |  |  |  |  | for (@words) {     			# no need to sort here, | 
| 906 |  |  |  |  |  |  | my $a = $db->{$_};		# we will sort when cache is flushed | 
| 907 |  |  |  |  |  |  | $a .= pack "N",$fileKey;	# appending packed file-id's | 
| 908 |  |  |  |  |  |  | $db->{$_} = $a; | 
| 909 |  |  |  |  |  |  | } | 
| 910 |  |  |  |  |  |  | return int @words; | 
| 911 |  |  |  |  |  |  | } | 
| 912 |  |  |  |  |  |  |  | 
| 913 |  |  |  |  |  |  |  | 
| 914 |  |  |  |  |  |  |  | 
| 915 |  |  |  |  |  |  | sub FlushCache { | 
| 916 |  |  |  |  |  |  | my ($source, $dest, $session) = @_; | 
| 917 |  |  |  |  |  |  | # flush source hashe into dest.... | 
| 918 |  |  |  |  |  |  | # %$dest is supposed to be tied, otherwise the whole | 
| 919 |  |  |  |  |  |  | # thing doens't make much sense... :-) | 
| 920 |  |  |  |  |  |  | my $scount = int  keys %$source ; | 
| 921 |  |  |  |  |  |  | my $ucount = 0; | 
| 922 |  |  |  |  |  |  | my $acount = 0; | 
| 923 |  |  |  |  |  |  | if ($scount == 0) { | 
| 924 |  |  |  |  |  |  | die "error: 0 words in cache\n"; | 
| 925 |  |  |  |  |  |  | } | 
| 926 |  |  |  |  |  |  | #	my $wordcount = int keys %$dest; | 
| 927 |  |  |  |  |  |  | #	if ($wordcount < $session->{wordcount}) { | 
| 928 |  |  |  |  |  |  | #		warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; | 
| 929 |  |  |  |  |  |  | #		return undef; | 
| 930 |  |  |  |  |  |  | #	} | 
| 931 |  |  |  |  |  |  | #	$session->{wordcount} = $wordcount; | 
| 932 |  |  |  |  |  |  |  | 
| 933 |  |  |  |  |  |  | #	DEBUG("$wordcount words in database\n"); | 
| 934 |  |  |  |  |  |  | my $objref = tied %$dest ; | 
| 935 |  |  |  |  |  |  | DEBUG("flushing $scount words into $dest ($objref)\n"); | 
| 936 |  |  |  |  |  |  |  | 
| 937 |  |  |  |  |  |  | my $filecount = $session->{count}; | 
| 938 |  |  |  |  |  |  | my $autoignore = $session->{autoignore}; | 
| 939 |  |  |  |  |  |  | my $ignorethreshold = int ( $filecount * $session->{ignorelimit} ); | 
| 940 |  |  |  |  |  |  |  | 
| 941 |  |  |  |  |  |  | my $w; | 
| 942 |  |  |  |  |  |  |  | 
| 943 |  |  |  |  |  |  | WORD: | 
| 944 |  |  |  |  |  |  | for $w(sort keys %$source) { | 
| 945 |  |  |  |  |  |  | my $data = $source->{$w}; | 
| 946 |  |  |  |  |  |  | if ($session->{ignoreword}->{$w} ) { | 
| 947 |  |  |  |  |  |  | DEBUG("ignoring '$w' \n"); | 
| 948 |  |  |  |  |  |  | $data = pack("N*", ( 0 ) ); # id = 0 means $w is a stop-word | 
| 949 |  |  |  |  |  |  | } | 
| 950 |  |  |  |  |  |  | elsif (defined $dest->{$w}) { | 
| 951 |  |  |  |  |  |  | my %uniq = (); | 
| 952 |  |  |  |  |  |  | my $keys =  $dest->{$w} . $data ; | 
| 953 |  |  |  |  |  |  | my $keycount = length($keys)/2; # dividing by 2 | 
| 954 |  |  |  |  |  |  |  | 
| 955 |  |  |  |  |  |  | $ucount++; | 
| 956 |  |  |  |  |  |  | ##			my @keys = unpack("n*", $keys) ; | 
| 957 |  |  |  |  |  |  | ##			my $keycount = @keys; | 
| 958 |  |  |  |  |  |  | ## | 
| 959 |  |  |  |  |  |  | ##			if ($keys[0] == 0 ) {  # skip ignored word | 
| 960 |  |  |  |  |  |  | ##				DEBUG("skipping '$w' \n"); | 
| 961 |  |  |  |  |  |  | ##				next WORD; | 
| 962 |  |  |  |  |  |  | ##			} els | 
| 963 |  |  |  |  |  |  |  | 
| 964 |  |  |  |  |  |  | if ($autoignore && ($filecount > 100) | 
| 965 |  |  |  |  |  |  | && ($keycount > $ignorethreshold ) ) { | 
| 966 |  |  |  |  |  |  | DEBUG("word '$w' will be ignored (found in $keycount of $filecount files)\n"); | 
| 967 |  |  |  |  |  |  | # ignored words are associated to file-id 0 | 
| 968 |  |  |  |  |  |  | ##				@keys = ( 0 ); | 
| 969 |  |  |  |  |  |  | $keys = pack("N*", 0); | 
| 970 |  |  |  |  |  |  | $session->{ignoreword}->{$w} = 1; | 
| 971 |  |  |  |  |  |  | } | 
| 972 |  |  |  |  |  |  | ##			@keys = grep { $uniq{$_}++ == 0} @keys; | 
| 973 |  |  |  |  |  |  | ##			$data = pack("n*", @keys); | 
| 974 |  |  |  |  |  |  |  | 
| 975 |  |  |  |  |  |  | $data = $keys; | 
| 976 |  |  |  |  |  |  |  | 
| 977 |  |  |  |  |  |  | ##			if ($verbose_flag && ( $w eq "the" ) )  { | 
| 978 |  |  |  |  |  |  | ##				my $len = int(@keys); | 
| 979 |  |  |  |  |  |  | ##				if ($len < $session->{status_THE} ) { | 
| 980 |  |  |  |  |  |  | ##						die "panic: problem with word 'the'"; | 
| 981 |  |  |  |  |  |  | ##				} | 
| 982 |  |  |  |  |  |  | ##				$session->{status_THE} = $len; | 
| 983 |  |  |  |  |  |  | ##				DEBUG("word 'the' found in $len files \n"); | 
| 984 |  |  |  |  |  |  | ##			} | 
| 985 |  |  |  |  |  |  |  | 
| 986 |  |  |  |  |  |  | } else { | 
| 987 |  |  |  |  |  |  | $acount++; | 
| 988 |  |  |  |  |  |  | } | 
| 989 |  |  |  |  |  |  | $dest->{$w} = $data; | 
| 990 |  |  |  |  |  |  |  | 
| 991 |  |  |  |  |  |  | #		if ($dest->{$w} ne $data) { | 
| 992 |  |  |  |  |  |  | #			warn "unexpected error: \$w=$w\n"; | 
| 993 |  |  |  |  |  |  | #			return undef; | 
| 994 |  |  |  |  |  |  | #		} | 
| 995 |  |  |  |  |  |  | } | 
| 996 |  |  |  |  |  |  | DEBUG("$ucount words updated, $acount new words added\n"); | 
| 997 |  |  |  |  |  |  | if ($debug_flag) { | 
| 998 |  |  |  |  |  |  | my $wordcount = int keys %$dest; | 
| 999 |  |  |  |  |  |  | if ($wordcount < $session->{wordcount}) { | 
| 1000 |  |  |  |  |  |  | warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; | 
| 1001 |  |  |  |  |  |  | return undef; | 
| 1002 |  |  |  |  |  |  | } | 
| 1003 |  |  |  |  |  |  | $session->{wordcount} = $wordcount; | 
| 1004 |  |  |  |  |  |  | DEBUG("$wordcount words in database\n"); | 
| 1005 |  |  |  |  |  |  | } | 
| 1006 |  |  |  |  |  |  | return 1; | 
| 1007 |  |  |  |  |  |  | } | 
| 1008 |  |  |  |  |  |  |  | 
| 1009 |  |  |  |  |  |  |  | 
| 1010 |  |  |  |  |  |  |  | 
| 1011 |  |  |  |  |  |  |  | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 |  |  |  |  |  |  | sub IndexWeb { | 
| 1014 |  |  |  |  |  |  | my ($session, $url) = @_; | 
| 1015 |  |  |  |  |  |  | require MMM::Text::Search::Inet; | 
| 1016 |  |  |  |  |  |  | my $req = new HTTPRequest { AutoRedirect => 1 }; | 
| 1017 |  |  |  |  |  |  | my %fetched = (); | 
| 1018 |  |  |  |  |  |  | $req->set_url($url); | 
| 1019 |  |  |  |  |  |  | my $host = $req->host(); | 
| 1020 |  |  |  |  |  |  | $session->{req} = $req; | 
| 1021 |  |  |  |  |  |  | $session->{fetched} = \%fetched; | 
| 1022 |  |  |  |  |  |  | $session->{host} = $host; | 
| 1023 |  |  |  |  |  |  | my $deadlinksfile = $session->{indexdbpath}; | 
| 1024 |  |  |  |  |  |  | $deadlinksfile =~ s/(\.db)?$/\.deadlinks/; | 
| 1025 |  |  |  |  |  |  | open DL, ">".$deadlinksfile; | 
| 1026 |  |  |  |  |  |  | $session->{deadlinksfh} = \*DL; | 
| 1027 |  |  |  |  |  |  | recursive_fetch($session, $url, "", 0); | 
| 1028 |  |  |  |  |  |  | } | 
| 1029 |  |  |  |  |  |  |  | 
| 1030 |  |  |  |  |  |  |  | 
| 1031 |  |  |  |  |  |  |  | 
| 1032 |  |  |  |  |  |  | sub recursive_fetch { | 
| 1033 |  |  |  |  |  |  | my ($session, $URL, $parent, $level) = @_; | 
| 1034 |  |  |  |  |  |  | my $req = $session->{req}; | 
| 1035 |  |  |  |  |  |  | $req->reset(); | 
| 1036 |  |  |  |  |  |  | $req->set_url($URL); | 
| 1037 |  |  |  |  |  |  | my $url =  $req->url(); | 
| 1038 |  |  |  |  |  |  | return unless $req->host() eq $session->{host}; | 
| 1039 |  |  |  |  |  |  | return if $session->{fetched}->{$url}; | 
| 1040 |  |  |  |  |  |  | $session->{fetched}->{$url} = 1; | 
| 1041 |  |  |  |  |  |  | return unless $req->get_page(); | 
| 1042 |  |  |  |  |  |  | my $status =  $req->status(); | 
| 1043 |  |  |  |  |  |  | DEBUG( ">>> $url ($status)\n"); | 
| 1044 |  |  |  |  |  |  | if ( $status != 200 ) { | 
| 1045 |  |  |  |  |  |  | my $fh = $session->{deadlinksfh}; | 
| 1046 |  |  |  |  |  |  | my $url = $req->url(); | 
| 1047 |  |  |  |  |  |  | print $fh $status, "\t", | 
| 1048 |  |  |  |  |  |  | $url, "(", $req->{_URL},")", | 
| 1049 |  |  |  |  |  |  | "\t", $parent, "\n"; | 
| 1050 |  |  |  |  |  |  | return; | 
| 1051 |  |  |  |  |  |  | }; | 
| 1052 |  |  |  |  |  |  | my $base =  $req->base_url(); | 
| 1053 |  |  |  |  |  |  | my $content_ref = $req->content_ref(); | 
| 1054 |  |  |  |  |  |  | my $header  = $req->header(); | 
| 1055 |  |  |  |  |  |  | IndexFile($session, $url, $$content_ref); | 
| 1056 |  |  |  |  |  |  | return if ($session->{level} && $level >= $session->{level}); | 
| 1057 |  |  |  |  |  |  | $$content_ref =~ s///gs;	#remove comments | 
| 1058 |  |  |  |  |  |  | my @links = $$content_ref =~/href=([^>\s]+)/ig; #extract hyperlinks | 
| 1059 |  |  |  |  |  |  | my $count = 0; | 
| 1060 |  |  |  |  |  |  | my $exclude_re = $session->{url_exclude}; | 
| 1061 |  |  |  |  |  |  | for(@links) { | 
| 1062 |  |  |  |  |  |  | s/\"|\'//g; | 
| 1063 |  |  |  |  |  |  | next if m/^(ftp|mailto|gopher|news):/; | 
| 1064 |  |  |  |  |  |  | next if m/^$exclude_re$/o; | 
| 1065 |  |  |  |  |  |  | my $link = /^http/ ? $_ : join("/",$base,$_); | 
| 1066 |  |  |  |  |  |  | $link =~ s/#.*//; | 
| 1067 |  |  |  |  |  |  | $count++; | 
| 1068 |  |  |  |  |  |  | recursive_fetch($session,$link, $url, $level +  1); | 
| 1069 |  |  |  |  |  |  | } | 
| 1070 |  |  |  |  |  |  | } | 
| 1071 |  |  |  |  |  |  |  | 
| 1072 |  |  |  |  |  |  |  | 
| 1073 |  |  |  |  |  |  | 1; | 
| 1074 |  |  |  |  |  |  | __END__ |