File Coverage

blib/lib/MMM/Text/Search.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1
2				package MMM::Text::Search;
3	1	1	1429	use File::Copy;
	1		9474
	1		67
4
5				#$Id: Search.pm,v 1.50 2004/12/13 18:45:15 maxim Exp $
6	1	1	7	use strict;
	1		2
	1		31
7	1	1	5	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $verbose_flag );
	1		5
	1		126
8				require Exporter;
9				require AutoLoader;
10				@ISA = qw(Exporter AutoLoader);
11				@EXPORT = qw(
12				);
13				$VERSION = '0.07';
14
15				#
16				# Perl module for indexing and searching text files and web pages.
17				# (Max Muzi, Apr-Sep 1999)
18				#
19				#
20				# Note on implementation:
21				# The technique used for indexing is substantially derived from that
22				# exposed by Tim Kientzle on Dr. Dobbs magazine. (Actually IndexWords()
23				# has been cut'n'pasted from his scripts.)
24				#
25				#
26
27	1	1	1551	use DB_File;
	0
	0
28				use Fcntl;
29				require 5.005;
30
31				$verbose_flag = 0;
32
33				my $debug_flag = 0;
34
35				my $errstr = undef;
36				my $syntax_error = undef;
37
38				sub errstr { $errstr };
39
40				sub new { # constructor! (see the docs for usage [sorry, there're no docs ])
41				my $pkg = shift;
42				my $arg = shift;
43				my $opt = undef;
44				if (ref($arg) ne "HASH") {
45				if (-f $arg) {
46				$opt->{IndexDB} = $arg;
47				$opt->{Verbose} = shift;
48				}
49				else {
50				die "usage: \$obj = new MMM::Text::Search ( '/index/path' or \$hashref)\n"
51				}
52				} else {
53				$opt = $arg;
54				};
55
56				$verbose_flag = $opt->{Debug} \|\| $opt->{Verbose} ;
57
58				my $indexdbpath = $opt->{IndexDB} \|\| $opt->{IndexPath} ;
59				my $filemask = $opt->{FileMask} ;
60				my $dirs = ( ref($opt->{Dirs}) eq "ARRAY" ) ? $opt->{Dirs} : [ ];
61				my $followsymlinks = defined $opt->{FollowSymLinks};
62
63				my $opturls = $opt->{Urls} \|\| $opt->{URLs};
64				my $urls = ( ref($opturls) eq "ARRAY" ) ? $opturls : [ ];
65				my $level = int $opt->{Level};
66
67				my $locationsdbpath = $indexdbpath;
68				$locationsdbpath =~ s/(\.db)*$/\-locations.db/;
69				my $titlesdbpath = $indexdbpath;
70				$titlesdbpath =~ s/(\.db)*$/\-titles.db/;
71
72				my $minwordsize = $opt->{MinWordSize} \|\| 1;
73
74
75				my $self = {
76				indexdbpath => $indexdbpath,
77				locationsdbpath => $locationsdbpath,
78				titlesdbpath => $titlesdbpath,
79				filemask => length($filemask) ? qr/$filemask/ : undef,
80				dirs => $dirs,
81				followsymlinks => $followsymlinks,
82				minwordsize => $minwordsize,
83				ignorelimit => $opt->{IgnoreLimit} \|\| (2/3),
84				urls => $urls,
85				level => $level,
86				url_exclude => $opt->{UrlExludeMask} \|\| "(?i).*\.(zip\|exe\|gz\|arj\|bin\|hqx)",
87				file_reader => $opt->{FileReader},
88				use_inode => $opt->{UseInodeAsKey},
89				no_reset => $opt->{UseInodeAsKey} && $opt->{NoReset}
90
91				};
92				DEBUG("filemask=$filemask, indexfile=$indexdbpath, ignorelimit=$self->{ignorelimit}\n");
93				DEBUG("dirs = [", join(",", @$dirs),"], ");
94				DEBUG("urls = [", join(",", @$urls),"] \n");
95				bless($self, $pkg);
96				return $self;
97				}
98
99				sub _add_keys_to_match_hash {
100				# extract file-codes from $keys and update corresponding $hash elements (score)
101				my ($keys, $hash) = @_;
102				my $key;
103				foreach $key ( unpack("N*",$keys) ) {
104				# DEBUG($key, " ");
105				# ignored words (stop-words) only include file-id 0 (see FlushCache() below)
106				return 0 if $key == 0 ;
107				$hash->{$key}++
108				}
109				return 1;
110				}
111
112				sub _push_words_from_hash {
113				my ($hash,$array, $regexp) = @_;
114				my $w;
115				for $w(keys %$hash) {
116				push @$array,$w if $w =~ $regexp;
117				}
118				}
119
120
121
122				#notes on advanced_query();
123				# - queries containing stop-words may yields bizzare results..
124				# - score is not always correct
125				# - error handling should be improved... :-)
126				sub advanced_query {
127				# perform queries such as "( a and ( b or c ) ) and ( d and e) "
128				my $self = shift;
129				my $expr = shift;
130				my $indexdbpath= $self->{indexdbpath};
131				my $locationsdbpath = $self->{locationsdbpath};
132				my $titlesdbpath = $self->{titlesdbpath};
133				my %indexdb;
134				my %locationsdb;
135				my %titlesdb;
136				return undef unless (-f $indexdbpath && -r _);
137				return undef unless (-f $locationsdbpath && -r _);
138				return undef unless (-f $titlesdbpath && -r _);
139				return undef unless
140				tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) &&
141				tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) &&
142				tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY );
143				my @ignored = ();
144				my @words = ();
145				my $verbose_flag_tmp = $verbose_flag;
146				$verbose_flag = shift; # undocumented debug switch
147				chomp $expr;
148				undef $syntax_error; #reset error
149				DEBUG("******** _match_expression() debug ********\n");
150				my $match = _match_expression($expr, \%indexdb, \@ignored);
151				DEBUG("******** end debug ********\n");
152				if ($syntax_error) {
153				$errstr = $syntax_error;
154				$verbose_flag = $verbose_flag_tmp;
155				return undef;
156				}
157				my $result = _make_result_hash($match,\%locationsdb, \%titlesdb, \@words, \@ignored);
158
159				untie(%indexdb);
160				untie(%locationsdb);
161				untie(%titlesdb);
162				$verbose_flag = $verbose_flag_tmp;
163				return $result;
164				}
165
166				sub _match_expression { # recursively apply a keyword-search expression to indexdb
167				# $expr may be either a string or a ref to an array of tokens
168				# a ref to a "score" hash is returned (or undef sometimes)
169				my ($expr, $index, $ignored) = @_;
170				my $parsed = _parse_expression($expr);
171				# _parse_expression() returns a reference to an array of three elements:
172				# [ operator, left_expr, right_expr]
173				# if right_expr is not defined then expr was atomic and left_expr is a string,
174				# otherwise both right_expr and left_expr are references to arrays of tokens
175				if ( not $parsed) {
176				DEBUG("Syntax error :-( \n");
177				return undef;
178				}
179				my ( $op, $left,$right) = @$parsed;
180
181				if ($left && not $right) {
182				$left =~ s/^\s$?\s\|\s$?\s$//g;
183				DEBUG("Looking up >$left<\n");
184				my %matches = ();
185				my $word = $left;
186				my $rc = 0;
187				my $keys = $index->{lc $word}; # get file-id's from indexdb
188				$rc = _add_keys_to_match_hash($keys,\%matches);
189				# if $rc is false then $word is a stop-word, see _add_keys_to_match_hash() for more info
190				if (not $rc) {
191				DEBUG("$word ignored\n");
192				push @$ignored, $word;
193				return undef;
194				# what should we do now? gotta think it over...
195				}
196				return \%matches;
197				}
198
199				DEBUG("Evaluating >$left< --$op-- >$right<\n");
200				my $left_match = _match_expression($left, $index, $ignored);
201				my $right_match = _match_expression($right, $index, $ignored);
202
203				return undef if ($syntax_error);
204				my %matches = ();
205				my $file = undef;
206
207				if ($op eq 'AND' ) {
208				%matches = ( %$left_match );
209				for $file( keys %matches) {
210				delete $matches{$file} unless $right_match->{$file}
211				}
212				return \%matches;
213				}
214				if ($op eq 'AND NOT') {
215				%matches = ( %$left_match );
216				for $file( keys %matches) {
217				delete $matches{$file} if $right_match->{$file}
218				}
219				return \%matches;
220				}
221				if ($op eq 'OR') {
222				%matches = ( %$left_match );
223				for $file( keys %$right_match) {
224				if ($matches{$file}) {
225				$matches{$file} +=$right_match->{$file};
226				} else {
227				$matches{$file} =$right_match->{$file};
228				}
229				}
230				return \%matches;
231				}
232				return undef;
233				}
234
235				sub _parse_expression {
236				my $arg = shift;
237				my $tokens = undef; # this is an arry ref
238				if (ref($arg) ne 'ARRAY') {
239				$tokens = [
240				$arg =~ m/( $ \| $\| \bAND\s+NOT\b \| \bAND\b \| \bOR\b \| \"[^\"]+\" \| \b\w+\b) /xig
241				];
242				}
243				# important!! "AND NOT" is treated as a single logical operator...
244				# this means that things like "not a and b" aren't well-formed,
245				# while "b and not a" is
246				else { $tokens = $arg;
247				}
248				my $left = undef; # array ref (oppure stringa se � un espressione atomica)
249				my $right = undef; # array ref !
250				my $op = 'OR';
251				my $depth = 0;
252				my $pos = 0;
253				my $tok;
254				my $len = int @$tokens;
255				DEBUG("expr = ", join(" + ", @$tokens),"\n");
256				while (1) {
257				if ($len == 1) {
258				return [ undef, $tokens->[0], undef ];
259				}
260				DEBUG("$tok : depth=$depth pos=$pos len=$len\n");
261				if ($depth == 0 && ($pos == $len) ) {
262				if ($tokens->[0] eq '(' && $tokens->[$len-1] eq ')') {
263				# take off outer parentheses...
264				shift @$tokens;
265				pop @$tokens;
266				$len -= 2;
267				$pos = 0;
268				$depth = 0;
269				DEBUG("expr = ", join(" + ", @$tokens),"\n");
270				} else { # ahhhh... this expression won't be parsed...
271				$syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")";
272				DEBUG("atom not atomic\n");
273				return undef;
274				}
275
276				} elsif ( $pos == $len ) {
277				$syntax_error = "Non-matching parentheses (\"".join(' ', @$tokens)."\")";
278				DEBUG("non matching parentheses\n");
279				return undef;
280				}
281				$tok = $tokens->[$pos++];
282				if ($tok eq '(') { $depth++; next; }
283				if ($tok eq ')') { $depth--; next; }
284				next if $depth;
285				if ($tok =~ /\b(AND\s+NOT\|AND\|OR)\b/i) {
286				if ($pos == 1 \|\| $pos == $len) {
287				$syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")";
288				return undef
289				}
290				$op = uc $1; $op =~ s/\s+/ /g;
291				$left = [ @$tokens[0..$pos-2] ];
292				$right = [ @$tokens[$pos..$len-1] ];
293				DEBUG("right = ", join(" + ", @$right),"\n");
294				DEBUG("left = ", join(" + ", @$left),"\n");
295				return [ $op, $left, $right ];
296				}
297				}
298				}
299
300
301
302				sub query { # simple query.... altavista +/- prefixes are recognized...
303				# */? globbing also works but
304				# slows query down significantly
305				# globbing implicitly discards +/- prefix (it's a BUG!!!)
306				my $self = shift;
307				my $indexdbpath= $self->{indexdbpath};
308				my $locationsdbpath = $self->{locationsdbpath};
309				my $titlesdbpath = $self->{titlesdbpath};
310				my %indexdb;
311				my %locationsdb;
312				my %titlesdb;
313				return undef unless (-f $indexdbpath && -r _);
314				return undef unless (-f $locationsdbpath && -r _);
315				return undef unless (-f $titlesdbpath && -r _);
316				return undef unless
317				tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) &&
318				tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) &&
319				tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY );
320				my %matches;
321				my %limit;
322				my %exclude;
323				my @ignored;
324				my $key;
325				my $word;
326				my $mustbe_words = 0;
327				my @words = ();
328				my $glob_regexp = undef;
329				for (@_) { # globbing feature... e.g. uni* passw?
330				if ( /\*\|\?/) {
331				s/\/\.\/g;
332				s/\?/\./g;
333				$glob_regexp = $glob_regexp ? $glob_regexp."\|^$_\$" : "^$_\$" ;
334				}
335				else {
336				push @words, $_;
337				}
338				}
339				if ($glob_regexp) {
340				my $regexp = qr/$glob_regexp/;
341				# collect all words in db matching $glob_regexp and append them to the query
342				_push_words_from_hash(\%indexdb, \@words, $regexp);
343				}
344
345				DEBUG("looking up ", join(", ", @words ), "\n");
346				foreach $word (@words) {
347				my $rc = 0;
348				# DEBUG($word);
349				if ($word =~ /^-(.*)/) {
350				my $keys = $indexdb{lc $1};
351				$rc = _add_keys_to_match_hash($keys,\%exclude);
352				} elsif ($word =~ /^\+(.*)/) {
353				$mustbe_words++;
354				my $keys = $indexdb{lc $1};
355				$rc = _add_keys_to_match_hash($keys,\%limit);
356				} else {
357				my $keys = $indexdb{lc $word};
358				$rc = _add_keys_to_match_hash($keys,\%matches);
359				}
360				# DEBUG("\n");
361				if (not $rc) { push @ignored, $word }
362				}
363
364				if ($mustbe_words) {
365				for $key(keys %limit) {
366				next unless $limit{$key} >= $mustbe_words;
367				$matches{$key} += $limit{$key} ;
368				}
369				for $key(keys %matches) {
370				delete $matches{$key} unless $limit{$key};
371				}
372				}
373				for $key(keys %exclude) {
374				delete $matches{$key};
375				}
376				my $result = _make_result_hash(\%matches,\%locationsdb, \%titlesdb, \@words, \@ignored);
377				untie(%indexdb);
378				untie(%locationsdb);
379				untie(%titlesdb);
380				return $result;
381				}
382
383
384				sub _make_result_hash {
385				# hash-ref hash-ref hash-ref array-ref array-ref
386				my ( $match, $locationsdb, $titlesdb, $words, $ignored ) = @_;
387				my $result = {
388				searched => $words,
389				ignored => $ignored,
390				entries => []
391				};
392				my $key;
393				foreach $key (keys %$match) {
394				my $ckey = pack("xN",$key);
395				my $name = $locationsdb->{$ckey};
396				my $title = $titlesdb->{$ckey};
397
398				push @{ $result->{entries} }, {
399				location => $name,
400				score => $match->{$key},
401				title => $title
402				};
403				DEBUG("$name: $match->{$key}\n");
404				}
405				return $result;
406				}
407
408
409
410
411
412
413
414				sub DEBUG (@) { $verbose_flag && print STDERR @_ };
415
416				sub tie_hash {
417				my ($hashref, $file ,$perm) = @_;
418				$perm = (O_RDWR\|O_CREAT) unless defined $perm;
419				my $rc = tied(%$hashref);
420				return $rc if $rc;
421				$rc = tie(%$hashref,'DB_File',$file, $perm, 0644, $DB_File::DB_BTREE) ;
422				if ($debug_flag) {
423				my $count = int keys %$hashref;
424				DEBUG("tie $hashref ($rc) ($count keys)\n");
425				} elsif ($verbose_flag) {
426				DEBUG("tie $hashref ($rc)\n");
427				}
428
429
430				return $rc;
431				}
432
433				sub untie_hash {
434				my ($hashref, $file ) = @_;
435				if ($debug_flag) {
436				my $count = int keys %$hashref;
437				DEBUG("untie $hashref ($count keys)\n")
438				}
439				untie(%$hashref);
440				}
441
442
443				1;
444				#__END__
445
446				=head1 NAME
447
448				MMM::Text::Search - Perl module for indexing and searching text files and web objects
449
450				=head1 SYNOPSIS
451
452				use MMM::Text::Search;
453
454				my $srch = new MMM::Text::Search { #for indexing...
455				#index main file location...
456				IndexPath => "/tmp/myindex.db",
457				#local files... (optional)
458				FileMask => '(?i)(\.txt\|\.htm.?)$',
459				Dirs => [ "/usr/doc", "/tmp" ] ,
460				FollowSymLinks => 0\|1, (default = 0)
461				#web objects... (optional)
462				URLs => [ "http://localhost/", ... ],
463				Level => recursion-level (0=unlimited)
464				#common options...
465				IgnoreLimit => 0.3, (default = 2/3)
466				Verbose => 0\|1
467				};
468
469				$srch->start_indexing_session();
470
471				$srch->commit_indexing_session();
472
473				$srch->index_default_locations();
474
475				$srch->index_content( { title => '...',
476				content=> '...',
477				id => '...' } );
478
479				$srch->makeindex;
480				(Obsolete.)
481
482
483
484
485
486				my $srch = new MMM::Text::Search ( #for searching....
487				"/tmp/myindex.db", verbose_flag );
488
489				my $hashref = $srch->query("pizza","ciao", "-pasta" );
490				my $hashref = $srch->advanced_query("(pizza OR ciao) AND NOT pasta");
491
492				$srch->errstr() # returns last error
493				# (only query syntax-errors for the moment being)
494
495
496				$srch->dump_word_stats(\*FH)
497
498				=head1 DESCRIPTION
499
500
501				=item *
502				Indexing
503
504				When a session is closed the following files will have been created
505				(assuming IndexPath = /path/myindex.db, see constructor):
506
507
508				/path/myindex.db word index database
509				/path/myindex-locations.db filename/URL database
510				/path/myindex-titles.db html title database
511				/path/myindex.stopwords stop-words list
512				/path/myindex.filelist readable list of indexed files/URLs
513				/path/myindex.deadlinks broken http links
514
515				[... lots of important things missing ... ]
516
517				start_indexing_session() starts session.
518
519				commit_indexing_session() commits and closes current session.
520
521				index_default_locations() indexes all files and URLs specified on construction.
522
523				index_content() pushes content into indexing engine.
524				Argument must have the following structure
525
526				{ title => '...', content=> '...', id => '...' }
527
528
529				makeindex() is obsolete.
530				Equivalent to:
531				$srch->start_indexing_session();
532				$srch->index_default_locations();
533				$srch->commit_indexing_session();
534
535
536
537
538				dump_word_stats(\*FH) dumps all words sorted by occurence frequency using
539				FH file handle (or STDOUT if no parameter is specified). Stop-words get a
540				frequency value of 1.
541
542				=item *
543				Searching
544
545				Both query() and advanced_query() return a reference to a hash with
546				the following structure:
547
548				(
549				ignored => [ string, string, ... ], # ignored words
550				searched => [ string, string, ... ], # words searched for
551				entries => [ hashref, hashref, ... ] # list of records
552				# found
553				)
554
555				The 'entries' element is a reference to an array of hashes, each having
556				the following structure:
557
558				(
559				location => string, # file path or URL or anything
560				score => number, # score
561				title => string # HTML title
562				)
563
564				=head1 NOTES
565
566				Note on implementation:
567				The technique used for indexing is substantially derived from that
568				exposed by Tim Kientzle on Dr. Dobbs magazine.
569
570				=head1 BUGS
571
572				Many, I guess.
573
574				=head1 AUTHOR
575
576				Max Muzi
577
578				=head1 SEE ALSO
579
580				perl(1).
581
582				=cut
583
584
585
586				#
587				#-------------------- the following code is only used when indexing ----------------
588				#
589
590				sub dump_word_stats {
591				my $self = shift;
592				my $fh = shift \|\| \*STDOUT;
593				my $indexdbpath= $self->{indexdbpath};
594				my %indexdb;
595				die unless (-f $indexdbpath && -r _);
596				tie_hash(\%indexdb,$indexdbpath, O_RDONLY );
597				my %index = ( %indexdb );
598				my $w;
599				for $w( sort { length($index{$b}) <=> length($index{$a}) }
600				keys %index ) {
601				print $fh $w, "\t", length($index{$w}) / 2, "\n";
602				}
603				untie_hash(\%indexdb);
604				}
605
606
607				sub start_indexing_session
608				{
609				my $self = shift;
610				$self->rollback_indexing_session;
611				my $key = 0;
612				my $indexdbpath = $self->{indexdbpath};
613				my $locationsdbpath = $self->{locationsdbpath};
614				my $titlesdbpath = $self->{titlesdbpath};
615
616				my $filemask = $self->{filemask};
617				my $keyref = \$key;
618				my $filelistfile = $indexdbpath;
619				$filelistfile =~ s/(\.db)?$/\.filelist/;
620				open FILELIST, ">".$filelistfile;
621
622				my $session = {
623				indexdbpath => $indexdbpath,
624				locationsdbpath => $locationsdbpath,
625				titlesdbpath => $titlesdbpath,
626				indexdb => { },
627				locationsdb => { },
628				titlesdb => { },
629				cachedb => { },
630				filemask => $filemask,
631				current_key => 16, # first 16 values are reserved (0 = word is ignored)
632				bytes => 0,
633				count => 0,
634				filecount => 0,
635				listfh => \*FILELIST,
636				status_THE => 0,
637				followsymlinks => $self->{followsymlinks},
638				minwordsize => $self->{minwordsize},
639				ignoreword => {},
640				autoignore => 1,
641				ignorelimit => $self->{ignorelimit} \|\| (2/3),
642				level => $self->{level},
643				url_exclude => $self->{url_exclude},
644				file_reader => $self->{file_reader},
645				use_inode => $self->{use_inode},
646				no_reset => $self->{no_reset},
647				};
648
649				unlink $indexdbpath."~";
650				unlink $locationsdbpath."~";
651				unlink $titlesdbpath."~";
652				if( $self->{no_reset} )
653				{
654				copy( $indexdbpath, $indexdbpath."~" );
655				copy( $locationsdbpath, $locationsdbpath."~" );
656				copy( $titlesdbpath, $titlesdbpath."~" );
657				}
658				tie_hash($session->{indexdb}, $indexdbpath."~" ) or die "$indexdbpath: $!\n";
659				tie_hash($session->{locationsdb}, $locationsdbpath."~" ) or die $!;
660				tie_hash($session->{titlesdb},$titlesdbpath."~" ) or die $!;
661
662				my $ignorefile = $indexdbpath;
663				$ignorefile =~ s/(\.db)?$/\.stopwords/;
664				if (-r $ignorefile) { # read *-stopwords.dat file
665				open F, $ignorefile;
666				while () {
667				chomp;
668				s/^\s+\|\s+$//g;
669				$session->{ignoreword}->{$_} = 1;
670				}
671				close F;
672				my $count = int keys %{ $session->{ignoreword} };
673				DEBUG("using stop-words from $ignorefile ($count words)\n");
674				$session->{autoignore} = 0;
675				}
676				$session->{ignorefile} = $ignorefile;
677
678				my $time = time();
679
680				$session->{start_time} = $time;
681
682				$self->{session} = $session;
683				}
684
685				sub index_default_locations
686				{
687				my $self = shift;
688				my $session = $self->{session};
689				return unless $session;
690
691				my $dirs = $self->{dirs};
692				my $urls = $self->{urls};
693				my $filecount = 0;
694				DEBUG("Counting files...\n") if int @$dirs;
695				my $dir;
696				for $dir( sort @$dirs) { $filecount += IndexDir($session, $dir, 1); }
697				$session->{filecount} = $filecount;
698
699				for $dir( sort @$dirs) { IndexDir($session, $dir); }
700				for my $url( sort @$urls) { IndexWeb($session, $url); }
701				}
702
703				sub index_content
704				{
705				my $self = shift;
706				my $session = $self->{session};
707				return unless $session;
708				my $info = shift;
709				if( ref($info) ne 'HASH' )
710				{ warn("usage: \$src->index_content( { content=>'...', id=>'...', title=>'...' } )\n");
711				return undef;
712				}
713				IndexFile( $session, $info->{id}, $info->{content}, $info->{title} );
714				return 1;
715				}
716
717				sub rollback_indexing_session
718				{
719				my $self = shift;
720				my $session = $self->{session};
721				return unless $session;
722				untie_hash($session->{indexdb});
723				untie_hash($session->{locationsdb});
724				untie_hash($session->{titlesdb});
725				my $indexdbpath = $self->{indexdbpath};
726				my $locationsdbpath = $self->{locationsdbpath};
727				my $titlesdbpath = $self->{titlesdbpath};
728
729				unlink $indexdbpath."~";
730				unlink $locationsdbpath."~";
731				unlink $titlesdbpath."~";
732				$self->{session} = undef;
733				}
734
735				sub DESTROY
736				{
737				my $self = shift;
738				$self->rollback_indexing_session;
739				}
740
741				sub commit_indexing_session
742				{
743				my $self = shift;
744				my $session = $self->{session};
745				return unless $session;
746				FlushCache($session->{cachedb}, $session->{indexdb}, $session);
747				my $time = time()-$session->{start_time};
748				DEBUG("$session->{bytes} bytes read, $session->{count} files processed in $time seconds\n");
749				untie_hash($session->{indexdb});
750				untie_hash($session->{locationsdb});
751				untie_hash($session->{titlesdb});
752
753				my $indexdbpath = $self->{indexdbpath};
754				my $locationsdbpath = $self->{locationsdbpath};
755				my $titlesdbpath = $self->{titlesdbpath};
756
757				rename $indexdbpath."~", $indexdbpath;
758				rename $locationsdbpath."~", $locationsdbpath ;
759				rename $titlesdbpath."~", $titlesdbpath;
760				close $session->{listfh};
761				if ( $session->{autoignore} ) {
762				my $ignorefile = $session->{ignorefile};
763				open F, ">".$ignorefile; #write *-stopwords.dat file
764				print F join( "\n", sort keys %{ $session->{ignoreword} } );
765				close F;
766				}
767
768				$self->{session} = undef;
769				}
770
771
772
773				sub makeindex
774				{
775				my $self = shift;
776				$self->start_indexing_session();
777				$self->index_default_locations();
778				$self->commit_indexing_session();
779				}
780
781
782				sub IndexDir {
783				my ($session, $dir, $only_recurse) = @_;
784				my $followsymlinks = $session->{followsymlinks};
785				my $file_reader = $session->{file_reader};
786				opendir D, $dir;
787				# DEBUG "D $dir\n";
788				my @files = readdir D;
789				close D;
790				my $e;
791				my $count = 0;
792				my $text;
793				for $e(@files) {
794				next if $e =~ /^\.\.?/;
795				my $path = $dir."/".$e;
796				if (-d $path) {
797				unless ($followsymlinks) {
798				next if -l $path ;
799				}
800				$count += IndexDir($session,$path, $only_recurse);
801				}
802				elsif (-f _ ) {
803				my $filemask = $session->{filemask};
804				if ($filemask) {
805				next unless $e =~ $filemask;
806				}
807				unless ($only_recurse)
808				{
809				if( $file_reader )
810				{
811				$text = $file_reader->read( $path );
812				IndexFile($session,$path,$text);
813				} else
814				{
815				IndexFile($session,$path);
816				}
817				}
818				$count ++;
819				}
820				}
821				return $count;
822				}
823
824
825
826				sub IndexFile {
827				my ($session, $file, $text, $title ) = @_;
828				my $cachedb = $session->{cachedb};
829				my $locationsdb = $session->{locationsdb};
830				my $key = $session->{current_key};
831				if( $session->{use_inode} )
832				{
833				$key = (stat($file))[1];
834				}
835				my $no_of_files = $session->{filecount};
836				if( $session->{no_reset} )
837				{
838				if( exists $locationsdb->{pack"xN",$key} )
839				{
840				warn("key $key already in locationsdb. Skipping\n");
841				return;
842				}
843				}
844				DEBUG $session->{count}+1, "/$no_of_files $file (id=$key)\n";
845				my $fh = $session->{listfh};
846				print $fh "$key\t$file\n";
847				local $/;
848				unless (defined $text) {
849				undef $/;
850				open(FILE, $file);
851				($text) = ; # Read entire file
852				close FILE;
853				}
854				my $filesize = length($text);
855				if ($file =~ /\.s?htm.?/i ) {
856				$text =~ /]*>([^<]+)<\/title/i ;
857				$title = $1;
858				$title =~ s/\s+/ /g;
859				$text =~ s/<[^>]*>//g; # strip all HTML tags
860				}
861				if( defined $title )
862				{
863				$session->{titlesdb}->{pack"xN",$key} = $title; # put title in db
864				DEBUG("* \"$title\"\n");
865				}
866				# index all the words under the current file-id
867				my($wordsIndexed) = &IndexWords($cachedb, $text,$key, $session);
868				$session->{current_key}++;
869				DEBUG "* $wordsIndexed words\n";
870
871				# map file-id (key) to this filename
872				$locationsdb->{pack"xN",$key} = $file; # leading null is here for
873				# historical reasons :-)
874				$session->{bytes} += $filesize;
875				$session->{count}++;
876				$session->{_temp_size} += $filesize;
877				if ($session->{_temp_size} > 2000000 ) {
878				my $rc = 0;
879				$rc = FlushCache($cachedb, $session->{indexdb}, $session);
880
881				if (! $rc ) {
882				tie_hash($session->{indexdb}, $session->{indexdbpath}) or die $!;
883				untie_hash($session->{indexdb});
884				$rc = FlushCache($cachedb, $session->{indexdb}, $session);
885				die $! if not $rc;
886				}
887
888				$session->{_temp_size} = 0;
889				$session->{cachedb} = {};
890				}
891				}
892
893				sub IndexWords {
894				my ($db, $words, $fileKey, $session) = @_;
895				# hash content file-id options
896				my (%worduniq); # for unique-ifying word list
897				my $minwordsize = $session->{minwordsize};
898				my (@words) = split( /[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words); # split into an array of words
899				@words = grep { $worduniq{$_}++ == 0 } # remove duplicates
900				grep { length > $minwordsize } # must be longer than one character
901				grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # strip leading punct
902				grep { /[a-zA-Z0-9\xc0-\xff]/ } # must have an alphanumeric
903				@words;
904				# " foreach (sort @words) { "
905				for (@words) { # no need to sort here,
906				my $a = $db->{$_}; # we will sort when cache is flushed
907				$a .= pack "N",$fileKey; # appending packed file-id's
908				$db->{$_} = $a;
909				}
910				return int @words;
911				}
912
913
914
915				sub FlushCache {
916				my ($source, $dest, $session) = @_;
917				# flush source hashe into dest....
918				# %$dest is supposed to be tied, otherwise the whole
919				# thing doens't make much sense... :-)
920				my $scount = int keys %$source ;
921				my $ucount = 0;
922				my $acount = 0;
923				if ($scount == 0) {
924				die "error: 0 words in cache\n";
925				}
926				# my $wordcount = int keys %$dest;
927				# if ($wordcount < $session->{wordcount}) {
928				# warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n";
929				# return undef;
930				# }
931				# $session->{wordcount} = $wordcount;
932
933				# DEBUG("$wordcount words in database\n");
934				my $objref = tied %$dest ;
935				DEBUG("flushing $scount words into $dest ($objref)\n");
936
937				my $filecount = $session->{count};
938				my $autoignore = $session->{autoignore};
939				my $ignorethreshold = int ( $filecount * $session->{ignorelimit} );
940
941				my $w;
942
943				WORD:
944				for $w(sort keys %$source) {
945				my $data = $source->{$w};
946				if ($session->{ignoreword}->{$w} ) {
947				DEBUG("ignoring '$w' \n");
948				$data = pack("N*", ( 0 ) ); # id = 0 means $w is a stop-word
949				}
950				elsif (defined $dest->{$w}) {
951				my %uniq = ();
952				my $keys = $dest->{$w} . $data ;
953				my $keycount = length($keys)/2; # dividing by 2
954
955				$ucount++;
956				## my @keys = unpack("n*", $keys) ;
957				## my $keycount = @keys;
958				##
959				## if ($keys[0] == 0 ) { # skip ignored word
960				## DEBUG("skipping '$w' \n");
961				## next WORD;
962				## } els
963
964				if ($autoignore && ($filecount > 100)
965				&& ($keycount > $ignorethreshold ) ) {
966				DEBUG("word '$w' will be ignored (found in $keycount of $filecount files)\n");
967				# ignored words are associated to file-id 0
968				## @keys = ( 0 );
969				$keys = pack("N*", 0);
970				$session->{ignoreword}->{$w} = 1;
971				}
972				## @keys = grep { $uniq{$_}++ == 0} @keys;
973				## $data = pack("n*", @keys);
974
975				$data = $keys;
976
977				## if ($verbose_flag && ( $w eq "the" ) ) {
978				## my $len = int(@keys);
979				## if ($len < $session->{status_THE} ) {
980				## die "panic: problem with word 'the'";
981				## }
982				## $session->{status_THE} = $len;
983				## DEBUG("word 'the' found in $len files \n");
984				## }
985
986				} else {
987				$acount++;
988				}
989				$dest->{$w} = $data;
990
991				# if ($dest->{$w} ne $data) {
992				# warn "unexpected error: \$w=$w\n";
993				# return undef;
994				# }
995				}
996				DEBUG("$ucount words updated, $acount new words added\n");
997				if ($debug_flag) {
998				my $wordcount = int keys %$dest;
999				if ($wordcount < $session->{wordcount}) {
1000				warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n";
1001				return undef;
1002				}
1003				$session->{wordcount} = $wordcount;
1004				DEBUG("$wordcount words in database\n");
1005				}
1006				return 1;
1007				}
1008
1009
1010
1011
1012
1013				sub IndexWeb {
1014				my ($session, $url) = @_;
1015				require MMM::Text::Search::Inet;
1016				my $req = new HTTPRequest { AutoRedirect => 1 };
1017				my %fetched = ();
1018				$req->set_url($url);
1019				my $host = $req->host();
1020				$session->{req} = $req;
1021				$session->{fetched} = \%fetched;
1022				$session->{host} = $host;
1023				my $deadlinksfile = $session->{indexdbpath};
1024				$deadlinksfile =~ s/(\.db)?$/\.deadlinks/;
1025				open DL, ">".$deadlinksfile;
1026				$session->{deadlinksfh} = \*DL;
1027				recursive_fetch($session, $url, "", 0);
1028				}
1029
1030
1031
1032				sub recursive_fetch {
1033				my ($session, $URL, $parent, $level) = @_;
1034				my $req = $session->{req};
1035				$req->reset();
1036				$req->set_url($URL);
1037				my $url = $req->url();
1038				return unless $req->host() eq $session->{host};
1039				return if $session->{fetched}->{$url};
1040				$session->{fetched}->{$url} = 1;
1041				return unless $req->get_page();
1042				my $status = $req->status();
1043				DEBUG( ">>> $url ($status)\n");
1044				if ( $status != 200 ) {
1045				my $fh = $session->{deadlinksfh};
1046				my $url = $req->url();
1047				print $fh $status, "\t",
1048				$url, "(", $req->{_URL},")",
1049				"\t", $parent, "\n";
1050				return;
1051				};
1052				my $base = $req->base_url();
1053				my $content_ref = $req->content_ref();
1054				my $header = $req->header();
1055				IndexFile($session, $url, $$content_ref);
1056				return if ($session->{level} && $level >= $session->{level});
1057				$$content_ref =~ s///gs; #remove comments
1058				my @links = $$content_ref =~/href=([^>\s]+)/ig; #extract hyperlinks
1059				my $count = 0;
1060				my $exclude_re = $session->{url_exclude};
1061				for(@links) {
1062				s/\"\|\'//g;
1063				next if m/^(ftp\|mailto\|gopher\|news):/;
1064				next if m/^$exclude_re$/o;
1065				my $link = /^http/ ? $_ : join("/",$base,$_);
1066				$link =~ s/#.*//;
1067				$count++;
1068				recursive_fetch($session,$link, $url, $level + 1);
1069				}
1070				}
1071
1072
1073				1;
1074				__END__