File Coverage

blib/lib/Pod/POM/Web/Indexer.pm

Criterion	Covered	Total	%
statement	22	24	91.6
branch			n/a
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	30	32	93.7

line	stmt	sub	time	code
1				package Pod::POM::Web::Indexer;
2
3	1	1	20	use strict;
	1		2
	1		28
4	1	1	10	use warnings;
	1		2
	1		37
5	1	1	5	no warnings 'uninitialized';
	1		1
	1		35
6
7	1	1	5	use Pod::POM;
	1		1
	1		47
8	1	1	4	use List::Util qw/min max/;
	1		2
	1		70
9	1	1	6	use List::MoreUtils qw/part/;
	1		1
	1		9
10	1	1	781	use Time::HiRes qw/time/;
	1		1022
	1		4
11	1	1	491	use Search::Indexer 0.75;
	0
	0
12				use BerkeleyDB;
13
14				use parent 'Pod::POM::Web';
15				our $VERSION = 1.22;
16
17				#----------------------------------------------------------------------
18				# Initializations
19				#----------------------------------------------------------------------
20
21				my $defaut_max_size_for_indexing = 300 << 10; # 300K
22
23				my $ignore_dirs = qr[
24				auto \| unicore \| DateTime/TimeZone \| DateTime/Locale ]x;
25
26				my $ignore_headings = qr[
27				SYNOPSIS \| DESCRIPTION \| METHODS \| FUNCTIONS \|
28				BUGS \| AUTHOR \| SEE\ ALSO \| COPYRIGHT \| LICENSE ]x;
29
30				(my $index_dir = __FILE__) =~ s[Indexer\.pm$][index];
31
32				my $id_regex = qr/(?![0-9]) # don't start with a digit
33				\w\w+ # start with 2 or more word chars ..
34				(?:::\w+)* # .. and possibly ::some::more::components
35				/x;
36
37				my $wregex = qr/(?: # either a Perl variable:
38				(?:\$\#?\|\@\|\%) # initial sigil
39				(?: # followed by
40				$id_regex # an id
41				\| # or
42				\^\w # builtin var with '^' prefix
43				\| # or
44				(?:[\#\$](?!\w))# just '$$' or '$#'
45				\| # or
46				[^{\w\s\$] # builtin vars with 1 special char
47				)
48				\| # or
49				$id_regex # a plain word or module name
50				)/x;
51
52
53				my @stopwords = (
54				'a' .. 'z', '_', '0' .. '9',
55				qw/__data__ __end__ $class $self
56				above after all also always an and any are as at
57				be because been before being both but by
58				can cannot could
59				die do don done
60				defined do does doesn
61				each else elsif eq
62				for from
63				ge gt
64				has have how
65				if in into is isn it item its
66				keys
67				last le lt
68				many may me method might must my
69				ne new next no nor not
70				of on only or other our
71				package perl pl pm pod push
72				qq qr qw
73				ref return
74				see set shift should since so some something sub such
75				text than that the their them then these they this those to tr
76				undef unless until up us use used uses using
77				values
78				was we what when which while will with would
79				you your/
80				);
81
82
83				#----------------------------------------------------------------------
84				# RETRIEVING
85				#----------------------------------------------------------------------
86
87
88				sub fulltext {
89				my ($self, $search_string) = @_;
90
91				my $indexer = eval {
92				new Search::Indexer(dir => $index_dir,
93				wregex => $wregex,
94				preMatch => '[[',
95				postMatch => ']]');
96				} or die <<__EOHTML__;
97				No fulltext index found ($@).
98				<p>
99				Please ask your system administrator to run the
100				command
101				</p>
102				<pre>
103				perl -MPod::POM::Web::Indexer -e "Pod::POM::Web::Indexer->new->index"
104				</pre>
105
106				Indexing may take about half an hour and will use about
107				10 MB on your hard disk.
108				__EOHTML__
109
110
111
112				my $lib = "$self->{root_url}/lib";
113				my $html = <<__EOHTML__;
114				<html>
115				<head>
116				<link href="$lib/GvaScript.css" rel="stylesheet" type="text/css">
117				<link href="$lib/PodPomWeb.css" rel="stylesheet" type="text/css">
118				<style>
119				.src {font-size:70%; float: right}
120				.sep {font-size:110%; font-weight: bolder; color: magenta;
121				padding-left: 8px; padding-right: 8px}
122				.hl {background-color: lightpink}
123				</style>
124				</head>
125				<body>
126				__EOHTML__
127
128
129				# force Some::Module::Name into "Some::Module::Name" to prevent
130				# interpretation of ':' as a field name by Query::Parser
131				$search_string =~ s/(^\|\s)([\w]+(?:::\w+)+)(\s\|$)/$1"$2"$3/g;
132
133				my $result = $indexer->search($search_string, 'implicit_plus');
134
135				my $killedWords = join ", ", @{$result->{killedWords}};
136				$killedWords &&= " (ignoring words : $killedWords)";
137				my $regex = $result->{regex};
138
139				my $scores = $result->{scores};
140				my @doc_ids = sort {$scores->{$b} <=> $scores->{$a}} keys %$scores;
141
142				my $nav_links = $self->paginate_results(\@doc_ids);
143
144				$html .= "<b>Fulltext search</b> for '$search_string'$killedWords<br>"
145				. "$nav_links<hr>\n";
146
147				$self->_tie_docs(DB_RDONLY);
148
149				foreach my $id (@doc_ids) {
150				my ($mtime, $path, $description) = split "\t", $self->{_docs}{$id};
151				my $score = $scores->{$id};
152				my @filenames = $self->find_source($path);
153				my $buf = join "\n", map {$self->slurp_file($_)} @filenames;
154
155				my $excerpts = $indexer->excerpts($buf, $regex);
156				foreach (@$excerpts) {
157				s/&/&/g, s/</</g, s/>/>/g; # replace entities
158				s/\[\[/<span class='hl'>/g, s/\]\]/<\/span>/g; # highlight
159				}
160				$excerpts = join "<span class='sep'>/</span>", @$excerpts;
161				$html .= <<__EOHTML__;
162				<p>
163				<a href="$self->{root_url}/source/$path" class="src">source</a>
164				<a href="$self->{root_url}/$path">$path</a>
165				(<small>$score</small>) <em>$description</em>
166				<br>
167				<small>$excerpts</small>
168				</p>
169				__EOHTML__
170				}
171
172				$html .= "<hr>$nav_links\n";
173				return $self->send_html($html);
174				}
175
176
177
178				sub paginate_results {
179				my ($self, $doc_ids_ref) = @_;
180
181				my $n_docs = @$doc_ids_ref;
182				my $count = $self->{params}{count} \|\| 50;
183				my $start_record = $self->{params}{start} \|\| 0;
184				my $end_record = min($start_record + $count - 1, $n_docs - 1);
185				@$doc_ids_ref = @$doc_ids_ref[$start_record ... $end_record];
186				my $prev_idx = max($start_record - $count, 0);
187				my $next_idx = $start_record + $count;
188				my $base_url = "?source=fulltext&search=$self->{params}{search}";
189				my $prev_link
190				= $start_record > 0 ? uri_escape("$base_url&start=$prev_idx") : "";
191				my $next_link
192				= $next_idx < $n_docs ? uri_escape("$base_url&start=$next_idx") : "";
193				$_ += 1 for $start_record, $end_record;
194				my $nav_links = "";
195				$nav_links .= "<a href='$prev_link'>[Previous <<]</a> " if $prev_link;
196				$nav_links .= "Results <b>$start_record</b> to <b>$end_record</b> "
197				. "from <b>$n_docs</b>";
198				$nav_links .= " <a href='$next_link'>[>> Next]</a> " if $next_link;
199				return $nav_links;
200				}
201
202
203
204
205
206				sub modlist { # called by Ajax
207				my ($self, $search_string) = @_;
208
209				$self->_tie_docs(DB_RDONLY);
210
211				length($search_string) >= 2 or die "module_list: arg too short";
212				my $regex = qr/^\d+\t(\Q$search_string\E[^\t]*)/i;
213
214				my @modules;
215				foreach my $val (values %{$self->{_docs}}) {
216				$val =~ $regex or next;
217				(my $module = $1) =~ s[/][::]g;
218				push @modules, $module;
219				}
220
221				my $json_names = "[" . join(",", map {qq{"$_"}} sort @modules) . "]";
222				return $self->send_content({content => $json_names,
223				mime_type => 'application/x-json'});
224				}
225
226
227				sub get_abstract { # override from Web.pm
228				my ($self, $path) = @_;
229				if (!$self->{_path_to_descr}) {
230				eval {$self->_tie_docs(DB_RDONLY); 1}
231				or return; # database not found
232				$self->{_path_to_descr} = {
233				map {(split /\t/, $_)[1,2]} values %{$self->{_docs}}
234				};
235				}
236				my $description = $self->{_path_to_descr}->{$path} or return;
237				(my $abstract = $description) =~ s/^.?-\s//;
238				return $abstract;
239				}
240
241
242				#----------------------------------------------------------------------
243				# INDEXING
244				#----------------------------------------------------------------------
245
246				sub import { # export the "index" function if called from command-line
247				my $class = shift;
248				my ($package, $filename) = caller;
249
250				no strict 'refs';
251				*{'main::index'} = sub {$class->new->index(@_)}
252				if $package eq 'main' and $filename eq '-e';
253				}
254
255
256				sub index {
257				my ($self, %options) = @_;
258
259				# check invalid options
260				die "invalid option : $_"
261				if grep {!/^-(from_scratch\|max_size\|positions)$/} keys %options;
262
263				# make sure index dir exists
264				-d $index_dir or mkdir $index_dir or die "mkdir $index_dir: $!";
265
266				# if -from_scratch, throw away old index
267				if ($options{-from_scratch}) {
268				unlink $_ or die "unlink $_ : $!" foreach glob("$index_dir/*.bdb");
269				}
270
271				# store global info for indexing methods
272				$self->{_seen_path} = {};
273				$self->{_last_doc_id} = 0;
274				$self->{_max_size_for_indexing} = $options{-max_size}
275				\|\| $defaut_max_size_for_indexing;
276
277				# tie to docs.bdb, storing {$doc_id => "$mtime\t$pathname\t$description"}
278				$self->_tie_docs(DB_CREATE);
279
280				# build in-memory reverse index of info contained in %{$self->{_docs}}
281				$self->{_max_doc_id} = 0;
282				$self->{_previous_index} = {};
283				while (my ($id, $doc_descr) = each %{$self->{_docs}}) {
284				$self->{_max_doc_id} = max($id, $self->{_max_doc_id});
285				my ($mtime, $path, $description) = split /\t/, $doc_descr;
286				$self->{_previous_index}{$path}
287				= {id => $id, mtime => $mtime, description => $description};
288				}
289
290				# open the index
291				$self->{_indexer} = new Search::Indexer(dir => $index_dir,
292				writeMode => 1,
293				positions => $options{-positions},
294				wregex => $wregex,
295				stopwords => \@stopwords);
296
297				# main indexing loop
298				$self->index_dir($_) foreach @Pod::POM::Web::search_dirs;
299
300				$self->{_indexer} = $self->{_docs} = undef;
301				}
302
303
304				sub index_dir {
305				my ($self, $rootdir, $path) = @_;
306				return if $path =~ /$ignore_dirs/;
307
308				my $dir = $rootdir;
309				if ($path) {
310				$dir .= "/$path";
311				return print STDERR "SKIP DIR $dir (already in \@INC)\n"
312				if grep {m[^\Q$dir\E]} @Pod::POM::Web::search_dirs;
313				}
314
315				chdir $dir or return print STDERR "SKIP DIR $dir (chdir $dir: $!)\n";
316
317				print STDERR "DIR $dir\n";
318				opendir my $dh, "." or die $^E;
319				my ($dirs, $files) = part { -d $_ ? 0 : 1} grep {!/^\./} readdir $dh;
320				$dirs \|\|= [], $files \|\|= [];
321				closedir $dh;
322
323				my %extensions;
324				foreach my $file (sort @$files) {
325				next unless $file =~ s/\.(pm\|pod)$//;
326				$extensions{$file}{$1} = 1;
327				}
328
329				foreach my $base (keys %extensions) {
330				$self->index_file($path, $base, $extensions{$base});
331				}
332
333				my @subpaths = map {$path ? "$path/$_" : $_} @$dirs;
334				$self->index_dir($rootdir, $_) foreach @subpaths;
335				}
336
337
338				sub index_file {
339				my ($self, $path, $file, $has_ext) = @_;
340
341				my $fullpath = $path ? "$path/$file" : $file;
342				return print STDERR "SKIP $fullpath (shadowing)\n"
343				if $self->{_seen_path}{$fullpath};
344
345				$self->{_seen_path}{$fullpath} = 1;
346				my $max_mtime = 0;
347				my ($size, $mtime, @filenames);
348				EXT:
349				foreach my $ext (qw/pm pod/) {
350				next EXT unless $has_ext->{$ext};
351				my $filename = "$file.$ext";
352				($size, $mtime) = (stat $filename)[7, 9] or die "stat $filename: $!";
353				$size < $self->{_max_size_for_indexing} or
354				print STDERR "$filename too big ($size bytes), skipped " and next EXT;
355				$mtime = max($max_mtime, $mtime);
356				push @filenames, $filename;
357				}
358
359				if ($mtime <= $self->{_previous_index}{$fullpath}{mtime}) {
360				return print STDERR "SKIP $fullpath (index up to date)\n";
361				}
362
363				if (@filenames) {
364				my $old_doc_id = $self->{_previous_index}{$fullpath}{id};
365				my $doc_id = $old_doc_id \|\| ++$self->{_max_doc_id};
366
367				print STDERR "INDEXING $fullpath (id $doc_id) ... ";
368
369				my $t0 = time;
370				my $buf = join "\n", map {$self->slurp_file($_)} @filenames;
371				my ($description) = ($buf =~ /^=head1\sNAME\s(.*)$/m);
372				$description \|\|= '';
373				$description =~ s/\t/ /g;
374				$buf =~ s/^=head1\s+($ignore_headings).*$//m; # remove full line of those
375				$buf =~ s/^=(head\d\|item)//mg; # just remove command of =head* or =item
376				$buf =~ s/^=\w.*//mg; # remove full line of all other commands
377
378				if ($old_doc_id) {
379				# Here we should remove the old document from the index. But
380				# we no longer have the document source! So we cheat with the current
381				# doc buffer, hoping that most words are similar. This step sounds
382				# ridiculous but is necessary to avoid having twice the same
383				# doc listed twice in inverted lists.
384				$self->{_indexer}->remove($old_doc_id, $buf);
385				}
386
387				$self->{_indexer}->add($doc_id, $buf);
388				my $interval = time - $t0;
389				printf STDERR "%0.3f s.", $interval;
390
391				$self->{_docs}{$doc_id} = "$mtime\t$fullpath\t$description";
392				}
393
394				print STDERR "\n";
395
396				}
397
398
399				#----------------------------------------------------------------------
400				# UTILITIES
401				#----------------------------------------------------------------------
402
403				sub _tie_docs {
404				my ($self, $mode) = @_;
405
406				# tie to docs.bdb, storing {$doc_id => "$mtime\t$pathname\t$description"}
407				tie %{$self->{_docs}}, 'BerkeleyDB::Hash',
408				-Filename => "$index_dir/docs.bdb",
409				-Flags => $mode
410				or die "open $index_dir/docs.bdb : $^E $BerkeleyDB::Error";
411				}
412
413
414
415				sub uri_escape {
416				my $uri = shift;
417				$uri =~ s{([^;\/?:@&=\$,A-Za-z0-9\-_.!~*'()])}
418				{sprintf("%%%02X", ord($1)) }ge;
419				return $uri;
420				}
421
422
423				1;
424
425				__END__
426
427				=head1 NAME
428
429				Pod::POM::Web::Indexer - fulltext search for Pod::POM::Web
430
431				=head1 SYNOPSIS
432
433				perl -MPod::POM::Web::Indexer -e index
434
435				=head1 DESCRIPTION
436
437				Adds fulltext search capabilities to the
438				L<Pod::POM::Web\|Pod::POM::Web> application.
439				This requires L<Search::Indexer\|Search::Indexer> to be installed.
440
441				Queries may include plain terms, "exact phrases",
442				'+' or '-' prefixes, boolean operators and parentheses.
443				See L<Search::QueryParser\|Search::QueryParser> for details.
444
445
446				=head1 METHODS
447
448				=head2 index
449
450				Pod::POM::Web::Indexer->new->index(%options)
451
452				Walks through directories in C<@INC> and indexes
453				all C<.pm> and C<.pod> files, skipping shadowed files
454				(files for which a similar loading path was already
455				found in previous C<@INC> directories), and skipping
456				files that are too big.
457
458				Default indexing is incremental : files whose modification
459				time has not changed since the last indexing operation will
460				not be indexed again.
461
462				Options can be
463
464				=over
465
466				=item -max_size
467
468				Size limit (in bytes) above which files will not be indexed.
469				The default value is 300K.
470				Files of size above this limit are usually not worth
471				indexing because they only contain big configuration tables
472				(like for example C<Module::CoreList> or C<Unicode::Charname>).
473
474				=item -from_scratch
475
476				If true, the previous index is deleted, so all files will be freshly
477				indexed. If false (the default), indexation is incremental, i.e. files
478				whose modification time has not changed will not be re-indexed.
479
480				=item -positions
481
482				If true, the indexer will also store word positions in documents, so
483				that it can later answer to "exact phrase" queries.
484
485				So if C<-positions> are on, a search for C<"more than one way"> will
486				only return documents which contain that exact sequence of contiguous
487				words; whereas if C<-positions> are off, the query is equivalent to
488				C<more AND than AND one AND way>, i.e. it returns all documents which
489				contain these words anywhere and in any order.
490
491				The option is off by default, because it requires much more disk
492				space, and does not seem to be very relevant for searching
493				Perl documentation.
494
495				=back
496
497				The C<index> function is exported into the C<main::> namespace if perl
498				is called with the C<-e> flag, so that you can write
499
500				perl -MPod::POM::Web::Indexer -e index
501
502
503				=head1 PERFORMANCES
504
505				On my machine, indexing a module takes an average of 0.2 seconds,
506				except for some long and complex sources (this is why sources
507				above 300K are ignored by default, see options above).
508				Here are the worst figures (in seconds) :
509
510				Date/Manip 39.655
511				DBI 30.73
512				Pod/perlfunc 29.502
513				Module/CoreList 27.287
514				CGI 16.922
515				Config 13.445
516				CPAN 12.598
517				Pod/perlapi 10.906
518				CGI/FormBuilder 8.592
519				Win32/TieRegistry 7.338
520				Spreadsheet/WriteExcel 7.132
521				Pod/perldiag 5.771
522				Parse/RecDescent 5.405
523				Bit/Vector 4.768
524
525				The index will be stored in an F<index> subdirectory
526				under the module installation directory.
527				The total index size should be around 10MB if C<-positions> are off,
528				and between 30MB and 50MB if C<-positions> are on, depending on
529				how many modules are installed.
530
531
532				=head1 TODO
533
534				- highlights in shown documents
535				- paging
536
537				=cut
538
539