File Coverage

blib/lib/MyConText.pm
Criterion Covered Total %
statement 15 221 6.7
branch 0 86 0.0
condition 0 3 0.0
subroutine 5 21 23.8
pod 9 14 64.2
total 29 345 8.4


line stmt bran cond sub pod time code
1              
2             =head1 NAME
3              
4             MyConText - Indexing documents with MySQL as storage
5              
6             =cut
7              
8             package MyConText;
9 11     11   7740 use strict;
  11         19  
  11         341  
10              
11 11     11   48 use vars qw($errstr $VERSION);
  11         17  
  11         1723  
12             $errstr = undef; # class-level error text; open() and create() reset it on entry
13             $VERSION = '0.49';
14              
15             my %DEFAULT_PARAMS = ( # defaults: create() overlays caller options, open() overlays stored values
16             'num_of_docs' => 0, # statistical value, should be maintained
17             'word_length' => 30, # max length of words we index
18              
19             'protocol' => 40, # we only support protocol with the same numbers
20              
21             'blob_direct_fetch' => 20, # with the blob store, when we stop searching
22             # and fetch everything at once
23             'data_table' => undef, # table where the actual index is stored
24             'name_length' => 255, # for filenames or URLs, what's the max length
25              
26             'word_id_bits' => 16, # num of bits for word_id (column store)
27             'doc_id_bits' => 16, # num of bits for doc_id
28             'count_bits' => 8, # num of bits for count value
29             'position_bits' => 32, # num of bits for word positions
30              
31             'backend' => 'blob', # what database backend (way the data is
32             # stored) we use
33             'frontend' => 'none', # what application frontend we use (how
34             # the index behaves externally)
35             'filter' => 'map { lc $_ }', # Perl source text; string-eval'ed in front of the splitter
36             'splitter' => ' $data =~ /(\w{2,$word_length})/g',
37             # can use the $data and $word_length
38             # variables
39             'init_env' => 'use locale' # Perl source text; string-eval'ed by index_document
40             );
41             my %backend_types = ( # backend name => package that implements it
42             'blob' => 'MyConText::Blob',
43             'column' => 'MyConText::Column',
44             'phrase' => 'MyConText::Phrase',
45             );
46             my %frontend_types = ( # frontend name => package the index object is reblessed into
47             'none' => 'MyConText',
48             'default' => 'MyConText',
49             'file' => 'MyConText::File',
50             'string' => 'MyConText::String',
51             'url' => 'MyConText::URL',
52             'table' => 'MyConText::Table',
53             );
54              
55 11     11   51 use vars qw! %BITS_TO_PACK %BITS_TO_INT %BITS_TO_PRECISION %PRECISION_TO_BITS !;
  11         28  
  11         17422  
56             %BITS_TO_PACK = qw! 0 A0 8 C 16 S 32 L !; # bit width => pack template letter (users of this table are not visible here)
57             %BITS_TO_INT = qw! 8 tinyint 16 smallint 24 mediumint 32 int 64 bigint !; # bit width => SQL integer type name
58             %BITS_TO_PRECISION = qw! 8 4 16 6 24 9 32 11 !;
59             %PRECISION_TO_BITS = map { ( $BITS_TO_PRECISION{$_} => $_ ) } keys %BITS_TO_PRECISION; # inverse of %BITS_TO_PRECISION
60              
61             # open($class, $dbh, $TABLE): reads the parameters of an existing index from
62             # table $TABLE and returns the index object; on failure sets $errstr and returns nothing
63             sub open {
64 0     0 1   my ($class, $dbh, $TABLE) = @_;
65 0           $errstr = undef;
66              
67             # the $dbh is either a real dbh or a DBI->connect parameters arrayref
68 0           my $mydbh = 0;
69 0 0         if (ref $dbh eq 'ARRAY') {
70             $dbh = DBI->connect(@$dbh) or
71 0 0         do { $errstr = $DBI::errstr; return; };
  0            
  0            
72            
73 0           $mydbh = 1; # remember that the connection was opened here
74             }
75              
76             # load the parameters to the object
77 0           my %PARAMS = %DEFAULT_PARAMS;
78 0           my $sth = $dbh->prepare("select * from $TABLE"); # $TABLE is interpolated into the SQL as-is
79 0           $sth->{'PrintError'} = 0;
80 0           $sth->{'RaiseError'} = 0;
81 0 0         $sth->execute or do {
82 0 0         if (not grep { $TABLE eq $_ }
  0            
83             MyConText->list_context_indexes($dbh)) {
84 0           $errstr = "ConText index $TABLE doesn't exist.";
85             }
86 0           else { $errstr = $sth->errstr; } # the table is a known index, so report the statement error
87 0           return;
88             };
89 0           while (my ($param, $value) = $sth->fetchrow_array) {
90 0           $PARAMS{$param} = $value; # stored values override the defaults
91             }
92 0           my $self = bless {
93             'dbh' => $dbh,
94             'table' => $TABLE,
95             %PARAMS,
96             }, $class;
97 0           my $data_table = $self->{'data_table'};
98            
99             # we should disconnect if we've opened the dbh here
100 0 0         if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; }
  0            
101              
102             # some basic sanity check
103             defined $dbh->selectrow_array("select count(*) from $data_table")
104 0 0         or do { $errstr = "Table $data_table not found in the database\n"; return; };
  0            
  0            
105              
106              
107             # load and set the application frontend
108 0           my $front_module = $frontend_types{$PARAMS{'frontend'}};
109 0 0         if (defined $front_module) {
110 0 0         if ($front_module ne $class) { # load the module only when it differs from the invoking class
111 0           eval "use $front_module";
112 0 0         die $@ if $@; # a frontend module that fails to load is fatal, not reported via $errstr
113             }
114 0           bless $self, $front_module;
115 0           $self->_open_tables;
116             }
117 0           else { $errstr = "Specified frontend type `$PARAMS{'frontend'}' is unknown\n"; return; }
  0            
118              
119             # load and set the backend (actual database access) module
120 0           my $back_module = $backend_types{$PARAMS{'backend'}};
121 0 0         if (defined $back_module) {
122 0           eval "use $back_module";
123 0 0         die $@ if $@; # a backend module that fails to load is fatal, not reported via $errstr
124 0           $self->{'db_backend'} = $back_module->open($self);
125             }
126 0           else { $errstr = "Specified backend type `$PARAMS{'backend'}' is unknown\n"; return; }
  0            
127              
128             # finally, return the object
129 0           $self;
130             }
131              
132             # create($class, $dbh, $TABLE, %OPTIONS): creates the parameter table $TABLE plus the
133             # frontend and backend tables, stores the parameters, then returns $class->open($dbh, $TABLE).
134             # On failure sets $errstr, drops the tables listed in 'created_tables' and returns nothing.
134             sub create {
135 0     0 1   my ($class, $dbh, $TABLE, %OPTIONS) = @_;
136 0           $errstr = undef;
137 0           my $mydbh = 0;
138 0 0         if (ref $dbh eq 'ARRAY') { # $dbh may be an arrayref of DBI->connect parameters
139             $dbh = DBI->connect(@$dbh) or
140 0 0         do { $errstr = $DBI::errstr; return; };
  0            
  0            
141 0           $mydbh = 1; # recorded, but not read again inside this sub
142             }
143              
144 0           my $self = bless {
145             'dbh' => $dbh,
146             'table' => $TABLE,
147             %DEFAULT_PARAMS,
148             %OPTIONS
149             }, $class;
150              
151 0 0         $self->{'data_table'} = $TABLE.'_data'
152             unless defined $self->{'data_table'};
153              
154 0           my $CREATE_PARAM = <<EOF;
155             create table $TABLE (
156             param varchar(16) binary not null,
157             value varchar(255),
158             primary key (param)
159             )
160             EOF
161 0 0         $dbh->do($CREATE_PARAM) or do { $errstr = $dbh->errstr; return; };
  0            
  0            
162 0           push @{$self->{'created_tables'}}, $TABLE; # remembered so clean_failed_create can drop it
  0            
163              
164             # load and set the frontend database structures
165 0           my $front_module = $frontend_types{$self->{'frontend'}};
166 0 0         if (defined $front_module) {
167 0           eval "use $front_module";
168 0 0         die $@ if $@; # a frontend module that fails to load is fatal, not reported via $errstr
169 0           bless $self, $front_module;
170 0           $errstr = $self->_create_tables; # a defined return value is treated as an error message
171 0 0         if (defined $errstr) { $self->clean_failed_create; return; }
  0            
  0            
172             }
173 0           else { $errstr = "Specified frontend type `$self->{'frontend'}' is unknown\n"; $self->clean_failed_create; return; }
  0            
  0            
174              
175             # create the backend database structures
176 0           my $back_module = $backend_types{$self->{'backend'}};
177 0 0         if (defined $back_module) {
178 0           eval "use $back_module";
179 0 0         die $@ if $@; # a backend module that fails to load is fatal, not reported via $errstr
180 0           $errstr = $back_module->_create_tables($self); # a defined return value is treated as an error message
181 0 0         if (defined $errstr) { $self->clean_failed_create; return; }
  0            
  0            
182             }
183 0           else { $errstr = "Specified backend type `$self->{'backend'}' is unknown\n"; $self->clean_failed_create; return; }
  0            
  0            
184            
185 0           for (grep { not ref $self->{$_} } keys %$self) { # store every non-reference slot as a parameter row
  0            
186 0           $dbh->do("insert into $TABLE values (?, ?)", {}, $_, $self->{$_}); # result of the insert is not checked
187             }
188            
189 0           return $class->open($dbh, $TABLE);
190             }
191              
192 0     0     sub _create_tables {} # empty in this class; create() calls it after reblessing into the frontend package
193 0     0     sub _open_tables {} # empty in this class; open() calls it after reblessing into the frontend package
194              
195             sub clean_failed_create { # drops every table recorded in $self->{'created_tables'}
196 0     0 0   my $self = shift;
197 0           my $dbh = $self->{'dbh'};
198 0           for my $table (@{$self->{'created_tables'}}) {
  0            
199 0           $dbh->do("drop table $table"); # result of the drop is not checked
200             }
201             }
202              
203             sub drop { # drops every table named by an object slot whose key is 'table' or ends in '_table'; returns 1
204 0     0 1   my $self = shift;
205 0           my $dbh = $self->{'dbh'};
206 0           for my $tag (keys %$self) {
207 0 0         next unless $tag =~ /(^|_)table$/; # e.g. 'table' and 'data_table'
208 0           $dbh->do("drop table $self->{$tag}"); # result of the drop is not checked
209             }
210 0           1;
211             }
212             sub errstr { # called on an object: returns its 'errstr' slot; called on the class: returns the package $errstr
213 0     0 0   my $self = shift;
214 0 0         ref $self ? $self->{'errstr'} : $errstr;
215             }
216              
217             sub list_context_indexes { # returns the sorted names of tables that hold a 'data_table' parameter row
218 0     0 0   my ($class, $dbh) = @_;
219 0           my %tables = map { ( $_->[0] => 1 ) }
  0            
220 0           @{$dbh->selectall_arrayref('show tables')};
221 0           my %indexes = ();
222 0           for my $table (keys %tables) {
223 0           local $dbh->{'PrintError'} = 0; # probing tables without param/value columns is expected to fail quietly
224 0           local $dbh->{'RaiseError'} = 0;
225 0 0         if ($dbh->selectrow_array("select param, value from $table
226             where param = 'data_table'")) {
227 0           $indexes{$table} = 1;
228             }
229             }
230 0           return sort keys %indexes;
231             }
232              
233             sub index_document { # ($self, $id, $data): hands the document to the backend's parse_and_index_data and returns its result
234 0     0 1   my ($self, $id, $data) = @_;
235 0 0         return unless defined $id;
236              
237 0           my $dbh = $self->{'dbh'};
238              
239 0           my $param_table = $self->{'table'};
240              
241 0           my $adding_doc = 0;
242              
243 0           my $adding = 0; # not referenced again in this sub
244 0 0 0       if (not defined $self->{'max_doc_id'} or $id > $self->{'max_doc_id'}) { # an id above the known maximum means a new document
245 0           $self->{'max_doc_id'} = $id;
246 0 0         my $update_max_doc_id_sth =
247             ( defined $self->{'update_max_doc_id_sth'}
248             ? $self->{'update_max_doc_id_sth'}
249             : $self->{'update_max_doc_id_sth'} = $dbh->prepare("replace into $param_table values (?, ?)")); # statement handle is cached on the object
250 0           $update_max_doc_id_sth->execute('max_doc_id', $id);
251 0           $adding_doc = 1;
252             }
253              
254 0           my $init_env = $self->{'init_env'}; # use packages, etc.
255 0 0         eval $init_env if defined $init_env; # string eval of the stored init_env parameter
256 0 0         print STDERR "Init_env failed with $@\n" if $@; # failure is only reported, indexing continues
257              
258 0 0         $data = '' unless defined $data;
259 0           return $self->{'db_backend'}->parse_and_index_data($adding_doc,
260             $id, $data);
261             }
262              
263             # used for backends that need a count for each of the words:
263             # builds a word => number-of-occurrences hash and passes it to the backend
264             sub parse_and_index_data_count {
265 0     0 0   my ($backend, $adding_doc, $id, $data) = @_;
266             ## note that this is run with backend object
267 0           my $self = $backend->{'ctx'};
268              
269 0           my $word_length = $self->{'word_length'}; # must stay in scope: the default splitter text refers to $word_length
270             # this needs to get parametrized (lc, il2_to_ascii, parsing of
271             # HTML tags, ...)
272            
273 0           my %words;
274              
275 11     11   10459 use locale;
  11         2408  
  11         50  
276 0           my $filter = $self->{'filter'} . ' ' . $self->{'splitter'}; # Perl source: filter applied to the splitter's word list
277 0           for my $word ( eval $filter ) { # string eval; the default splitter text reads $data from this scope
278 0 0         $words{$word} = 0 if not defined $words{$word};
279 0           $words{$word}++;
280             }
281              
282 0           my @result;
283 0 0         if ($adding_doc) {
284 0           @result = $backend->add_document($id, \%words);
285             }
286             else {
287 0           @result = $backend->update_document($id, \%words);
288             }
289              
290 0 0         if (wantarray) { # list context gets the whole result, scalar context its first element
291 0           return @result;
292             }
293 0           return $result[0];
294             }
295              
296             # used for backends where list of occurrences is needed:
296             # builds a word => [positions] hash (positions counted from 1) and passes it to the backend
297             sub parse_and_index_data_list {
298 0     0 0   my ($backend, $adding_doc, $id, $data) = @_;
299             ## note that this is run with backend object
300 0           my $self = $backend->{'ctx'};
301              
302 0           my $word_length = $self->{'word_length'}; # must stay in scope: the default splitter text refers to $word_length
303             # this needs to get parametrized (lc, il2_to_ascii, parsing of
304             # HTML tags, ...)
305            
306 0           my %words;
307              
308 11     11   2267 use locale;
  11         22  
  11         41  
309 0           my $filter = $self->{'filter'} . ' ' . $self->{'splitter'}; # Perl source: filter applied to the splitter's word list
310              
311 0           my $i = 0;
312 0           for my $word ( eval $filter ) { # string eval; the default splitter text reads $data from this scope
313 0           push @{$words{$word}}, ++$i; # record the running word position
  0            
314             }
315              
316 0           my @result;
317 0 0         if ($adding_doc) {
318 0           @result = $backend->add_document($id, \%words);
319             }
320             else {
321 0           @result = $backend->update_document($id, \%words);
322             }
323              
324 0 0         if (wantarray) { # list context gets the whole result, scalar context its first element
325 0           return @result;
326             }
327 0           return $result[0];
328             }
329             sub delete_document { # forwards all arguments to the backend's delete_document and returns its result
330 0     0 1   my $self = shift;
331 0           $self->{'db_backend'}->delete_document(@_);
332             }
333              
334             sub contains_hashref { # runs the query words through the index filter, then asks the backend's contains_hashref
335 0     0 1   my $self = shift;
336 0           my $filter = $self->{'filter'};
337 0           $self->{'db_backend'}->contains_hashref(eval $filter.' @_'); # string eval of "<filter> @_", e.g. 'map { lc $_ } @_'
338             }
339             sub contains { # returns the keys of contains_hashref's result as a list
340 0     0 1   my $self = shift;
341 0           my $res = $self->contains_hashref(@_);
342 0 0         if (not $self->{'count_bits'}) { return keys %$res; } # when count_bits is false the keys are returned unsorted
  0            
343 0           return sort { $res->{$b} <=> $res->{$a} } keys %$res; # highest value first
  0            
344             }
345             sub econtains_hashref { # combines per-word contains_hashref results for '+word', 'word' and '-word' arguments; returns a hashref
346 0     0 1   my $self = shift;
347 0           my $docs = {};
348 0           my $word_num = 0;
349              
350 0           my $is_some_plus = grep /^\+/, @_;
351              
352 0           for my $word ( map { /^\+(.+)$/s } @_) { # '+word': keep only documents present for every such word
  0            
353 0           $word_num++;
354 0           my $oneword = $self->contains_hashref($word);
355 0 0         if ($word_num == 1) { $docs = $oneword; next; } # the first '+' word seeds the result
  0            
  0            
356 0           for my $doc (keys %$oneword) {
357 0 0         $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc};
358             }
359 0           for my $doc (keys %$docs) {
360 0 0         delete $docs->{$doc} unless defined $oneword->{$doc};
361             }
362             }
363              
364 0           for my $word ( map { /^([^+-].*)$/s } @_) { # unprefixed words
  0            
365 0           my $oneword = $self->contains_hashref($word);
366 0           for my $doc (keys %$oneword) {
367 0 0         if ($is_some_plus) { # with '+' words present, plain words only add to documents already selected
368 0 0         $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc};
369             }
370             else { # without '+' words, plain words add documents as well
371 0 0         $docs->{$doc} = 0 unless defined $docs->{$doc};
372 0           $docs->{$doc} += $oneword->{$doc};
373             }
374             }
375             }
376              
377 0           for my $word ( map { /^-(.+)$/s } @_) { # '-word': remove every document that contains it
  0            
378 0           my $oneword = $self->contains_hashref($word);
379 0           for my $doc (keys %$oneword) {
380 0           delete $docs->{$doc};
381             }
382             }
383 0           $docs;
384             }
385             sub econtains { # returns the keys of econtains_hashref's result as a list
386 0     0 1   my $self = shift;
387 0           my $res = $self->econtains_hashref(@_);
388 0 0         if (not $self->{'count_bits'}) { return keys %$res; } # when count_bits is false the keys are returned unsorted
  0            
389 0           return sort { $res->{$b} <=> $res->{$a} } keys %$res; # highest value first
  0            
390             }
391              
392             1;
393              
394             =head1 SYNOPSIS
395              
396             use MyConText;
397             use DBI;
398             # connect to database (regular DBI)
399             my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd');
400             # create a new index
401             my $ctx = MyConText->create($dbh, 'ctx_web_1',
402             'frontend' => 'string', 'backend' => 'blob');
403             # or open existing one
404             # my $ctx = MyConText->open($dbh, 'ctx_web_1');
405              
406             # index documents
407             $ctx->index_document('krtek', 'krtek leze pod zemi');
408             $ctx->index_document('jezek', 'Jezek ma ostre bodliny.');
409              
410             # search for matches
411             my @documents = $ctx->contains('krtek');
412             my @docs = $ctx->econtains('+krtek', '-Jezek');
413              
414              
415             =head1 DESCRIPTION
416              
417             MyConText is a poor man's solution for indexing contents of documents.
418             It uses the MySQL database to store the information about words and
419             documents and provides Perl interface for indexing new documents,
420             making changes and searching for matches. For MyConText, a document
421             is nearly anything -- Perl scalar, file, Web document, database field.
422              
423             The basic style of interface is shown above. What you need is a MySQL
424             database and a DBI with DBD::mysql. Then you create a MyConText index
425             -- a set of tables that maintain all necessary information. Once created
426             it can be accessed many times, either for updating the index (adding
427             documents) or searching.
428              
429             MyConText uses one basic table to store parameters of the index. Second
430             table is used to store the actual information about documents and words,
431             and depending on the type of the index (specified during index creation)
432             there may be more tables to store additional information (like
433             conversion from external string names (eg. URL's) to internal numeric
434             form). For a user, these internal thingies and internal behaviour of the
435             index are not important. The important part is the API, the methods to
436             index document and ask questions about words in documents. However,
437             certain understanding of how it all works may be useful when you are
438             deciding if this module is for you and what type of index will best
439             suit your needs.
440              
441             =head2 Frontends
442              
443             From the user, application point of view, the MyConText index stores
444             documents that are named in a certain way, allows adding new documents,
445             and provides methods to ask: "give me list of names of documents that
446             contain this list of words". The MyConText index doesn't store the
447             documents itself. Instead, it stores information about words in the
448             documents in such a structured way that it makes easy and fast to look
449             up what documents contain certain words and return names of the
450             documents.
451              
452             MyConText provides a couple of predefined frontend classes that specify
453             various types of documents (and the way they relate to their names).
454              
455             =over 4
456              
457             =item default
458              
459             By default, user specifies the integer number of the document and the
460             content (body) of the document. The code would for example read
461              
462             $ctx->index_document(53, 'zastavujeme vyplaty vkladu');
463              
464             and MyConText will remember that the document 53 contains three words.
465             When looking for all documents containing word (string) vklad, a call
466              
467             my @docs = $ctx->contains('vklad%');
468              
469             would return numbers of all documents containing words starting with
470             'vklad', 53 among them.
471              
472             So here it's user's responsibility to maintain a relation between the
473             document numbers and their content, to know that a document 53 is about
474             vklady. Perhaps the documents are already stored somewhere and have
475             a unique numeric id.
476              
477             =item string
478              
479             Frontend B<string> allows the user to specify the names of the documents as
480             strings, instead of numbers. Still the user has to specify both the
481             name of the document and the content:
482              
483             $ctx->index_document('upozorneni',
484             'Odstrante z dosadu deti!');
485              
486             After that,
487              
488             $ctx->contains('deti')
489              
490             will return 'upozorneni' as one of the names of documents with word
491             'deti' in it.
492              
493             =item file
494              
495             To index files, use the frontend B<file>. Here the content of the document
496             is clearly the content of the file specified by the filename, so in
497             a call to index_document, only the name is needed -- the content of the
498             file is read by the MyConText transparently:
499              
500             $ctx->index_document('/usr/doc/FAQ/Linux-FAQ');
501             my @files = $ctx->contains('penguin');
502              
503             =item url
504              
505             Web document can be indexed by the frontend B<url>. MyConText uses LWP to
506             get the document and then parses it normally:
507              
508             $ctx->index_document('http://www.perl.com/');
509              
510             =item table
511              
512             You can have a MyConText index that indexes char or blob fields in MySQL
513             table. Since MySQL doesn't support triggers, you have to call the
514             index_document method of MyConText any time something changes in the
515             table. So the sequence probably will be
516              
517             $dbh->do('insert into the_table (id, data, other_fields)
518             values (?, ?, ?)', {}, $name, $data, $date_or_something);
519             $ctx->index_document($name);
520              
521             When calling contains, the id (name) of the record will be returned. If
522             the id in the_table is numeric, it's directly used as the internal
523             numeric id, otherwise a string's way of converting the id to numeric
524             form is used.
525              
526             =back
527              
528             The structure of MyConText is very flexible and adding new frontend
529             (what will be indexed) is very easy.
530              
531             =head2 Backends
532              
533             While frontend specifies what is indexed and how the user sees the
534             collection of documents, backend is about low level database way of
535             actually storing the information in the tables. Three types are
536             available:
537              
538             =over 4
539              
540             =item blob
541              
542             For each word, a blob holding list of all documents containing that word
543             is stored in the table, with the count (number of occurrences)
544             associated with each document number. That makes it for very compact
545             storage. Since the document names (for example URL) are internally
546             converted to numbers, storing and fetching the data is fast. However,
547             updating the information is very slow, since information concerning one
548             document is spread across all table, without any direct database access.
549             Updating a document (or merely reindexing it) requires update of all
550             blobs, which is slow.
551              
552             The list of documents is stored sorted by document name so that
553             fetching an information about a document for one word is relatively
554             easy, still a need to update (or at least scan) all records in the table
555             makes this storage unsuitable for collections of documents that often
556             change.
557              
558             =item column
559              
560             The B<column> backend stores a word/document pair in database fields,
561             indexing both, thus allowing both fast retrieval and updates -- it's
562             easy to delete all records describing one document and insert new ones.
563             However, the database indexes that have to be maintained are large.
564              
565             Both B<blob> and B<column> backends only store a count -- number of
566             occurrences of the word in the document (and even this can be switched
567             off, yielding just a yes/no information about the word's presence).
568             This allows questions like
569              
570             all documents containing words 'voda' or 'Mattoni'
571             but not a word 'kyselka'
572              
573             but you cannot ask whether a document contains a phrase 'kyselka
574             Mattoni' because such information is not maintained by these types of
575             backends.
576              
577             =item phrase
578              
579             To allow phrase matching, a B<phrase> backend is available. For each word
580             and document number it stores a blob of lists of positions of the word
581             in the document. A query
582              
583             $ctx->contains('kyselk%', 'Mattoni');
584              
585             then only returns those documents (document names/numbers) where word
586             kyselka (or kyselky, or so) is just before word Mattoni.
587              
588             =back
589              
590             =head2 Mixing frontends and backends
591              
592             Any frontend can be used with any backend in one MyConText index. You
593             can index Web documents with B<url> frontend and B<phrase> backend
594             to be able to find phrases in the documents. And you can use the
595             default, number based document scheme with B<blob> backend to use the disk
596             space as efficiently as possible -- this is useful for example for
597             mailing-list archives, where we need to index huge number of documents
598             that do not change at all.
599              
600             Finding optimal combination is very important and may require some
601             analysis of the document collection and manipulation, as well as the
602             speed and storage requirements. Benchmarking on actual target platform
603             is very useful during the design phase.
604              
605             =head1 METHODS
606              
607             The following methods are available on the user side as MyConText API.
608              
609             =over 4
610              
611             =item create
612              
613             my $ctx = MyConText->create($dbh, $index_name, %opts);
614              
615             The class method B<create> creates index of given name (the name of the
616             index is the name of its basic parameter table) and all necessary
617             tables, returns an object -- newly created index. The options that may
618             be specified after the index name define the frontend and backend types,
619             storage parameters (how many bits for what values), etc. See below for
620             list of create options and discussion of their use.
621              
622             =item open
623              
624             my $ctx = MyConText->open($dbh, $index_name);
625              
626             Opens and returns an object accessing the specified MyConText index. Since all
627             the index parameters and information are stored in the $index_name table
628             (including names of all other needed tables), the database handler and
629             the name of the parameter table are the only needed arguments.
630              
631             =item index_document
632              
633             $ctx->index_document(45, 'Sleva pri nakupu stribra.');
634             $ctx->index_document('http://www.mozilla.org/');
635              
636             For the default and B<string> frontends, two arguments are expected -- the
637             name (number or string) of the document and its content. For B<file> and
638             B<url> frontends only the name of the document is needed. The method
639             returns number of words indexed (subject to wild change).
640              
641             =item delete_document
642              
643             $ctx->delete_document('http://www.mozilla.org/');
644              
645             Removes information about document from the index. Note that for B<blob>
646             backend this is very time consuming process.
647              
648             =item contains
649              
650             my @docs = $ctx->contains('sleva', 'strib%');
651              
652             Returns list of names (numbers or strings, depending on the frontend)
653             of documents that contain some of specified words.
654              
655             =item econtains
656              
657             my @docs = $ctx->econtains('sleva', '+strib%', '-zlato');
658              
659             Econtains stands for extended contains and allows words to be prefixed
660             by plus or minus signs to specify that the word must or mustn't be
661             present in the document for it to match.
662              
663             =item contains_hashref, econtains_hashref
664              
665             Similar to B<contains> and B<econtains>, only instead of list of document
666             names, these methods return a hash reference to a hash where keys are
667             the document names and values are the number of occurrences of the
668             words.
669              
670             =item drop
671              
672             Removes all tables associated with the index, including the base
673             parameter table. Effectively destroying the index from the database.
674              
675             =back
676              
677             =head1 INDEX OPTIONS
678              
679             Here we list the options that may be passed to MyConText->create call.
680             These allow you to specify the style and storage parameters in great detail.
681              
682             =over 4
683              
684             =item backend
685              
686             The backend type, default B<blob>, possible values blob, column and phrase
687             (see above for explanation).
688              
689             =item frontend
690              
691             The frontend type. The default frontend requires the user to specify
692             numeric id of the document together with the content of the document,
693             other possible values are string, file and url (see above for
694             more info).
695              
696             =item word_length
697              
698             Maximum length of words that may be indexed, default 30.
699              
700             =item data_table
701              
702             Name of the table where the actual data about word/document relation is
703             stored. By default, the name of the index (of the base table) with _data
704             suffix is used.
705              
706             =item name_length
707              
708             Any frontend that uses strings as names of documents needs to maintain
709             a conversion table from these names to internal integer ids. This value
710             specifies maximum length of these string names (URLs, file names, ...).
711              
712             =item blob_direct_fetch
713              
714             Only for blob backend. When looking for information about specific
715             document in the list stored in the blob, the blob backend uses division
716             of interval to find the correct place in the blob. When the interval
717             gets equal or shorter than this value, all values are fetched from the
718             database and the final search is done in Perl code sequentially.
719              
720             =item word_id_bits
721              
722             With column or phrase backends, MyConText maintains a numeric id for each
723             word to optimize the space requirements. The word_id_bits parameter
724             specifies the number of bits to reserve for this conversion and thus
725             effectively limits number of distinct words that may be indexed. The
726             default is 16 bits and possible values are 8, 16, 24 or 32 bits.
727              
728             =item word_id_table
729              
730             Name of the table that holds conversion from words to their numeric id
731             (for column and phrase backends). By default is the name of the index
732             with _words suffix.
733              
734             =item doc_id_bits
735              
736             A number of bits to hold a numeric id of the document (that is either
737             provided by the user (with default frontend) or generated by the module
738             to accomplish the conversion from the string name of the document). This
739             value limits the maximum number of documents to hold. The default is 16
740             bits and possible values are 8, 16 and 32 bits for blob backend and 8,
741             16, 24 and 32 bits for column and phrase backends.
742              
743             =item doc_id_table
744              
745             Name of the table that holds conversion from string names of documents
746             to their numeric id, by default the name of the index with _docid
747             suffix.
748              
749             =item count_bits
750              
751             Number of bits reserved for storing number of occurrences of each word
752             in the document. The default is 8 and possible values are the same as
753             with doc_id_bits.
754              
755             =item position_bits
756              
757             With phrase backend, MyConText stores positions of each word of the
758             documents. This value specifies how much space should be reserved for
759             this purpose. The default is 32 bits and possible values are 8, 16 or 32
760             bits. This value limits the maximum number of words of each document
761             that can be stored.
762              
763             =item splitter
764              
765             MyConText allows the user to provide any Perl code that will be used to
766             split the content of the document to words. The code will be evalled
767             inside of the MyConText code. The default is
768              
769             $data =~ /(\w{2,$word_length})/g
770              
771             and shows that the input is stored in the variable C<$data> and the code
772             may access any other variable available in the perl_and_index_data_*
773             methods (see source), especially C<$word_length> to get the maximum length
774             of words and C<$backend> to get the backend object.
775              
776             The default value also shows that by default, the minimum length of
777             words indexed is 2.
778              
779             =item filter
780              
781             The output words of splitter (and also any parameter of (e)contains*
782             methods) are sent to filter that may do further processing. Filter is
783             again a Perl code, the default is
784              
785             map { lc $_ }
786              
787             showing that the filter operates on input list and by default does
788             conversion to lowercase (yielding case insensitive index).
789              
790             =item init_env
791              
792             Because user defined splitter or filter may depend on other things that
793             it is reasonable to set before the actual processing of words, you can
794             use yet another Perl hook to set things up. The default is
795              
796             use locale
797              
798             =item table_name
799              
800             For table frontend; this is the name of the table that will be indexed.
801              
802             =item column_name
803              
804             For table frontend; this is the name of the column in the table_name
805             that contains the documents -- data to be indexed. It can also have
806             a form table.column that will be used if the table_name option is not
807             specified.
808              
809             =item column_id_name
810              
811             For table frontend; this is the name of the field in table_name that
812             holds names (ids) of the records. If not specified, a field that has
813             primary key on it is used. If this field is numeric, its values are
814             directly used as identifiers, otherwise a conversion to numeric values
815             is made.
816              
817             =back
818              
819             =head1 ERROR HANDLING
820              
821             The create and open methods return the MyConText object on success, upon
822             failure they return undef and set error message in $MyConText::errstr
823             variable.
824              
825             All other methods return reasonable (documented above) value on success,
826             failure is signaled by unreasonable (typically undef or null) return
827             value; the error message may then be retrieved by $ctx->errstr method
828             call.
829              
830             =head1 VERSION
831              
832             This documentation describes MyConText module version 0.49.
833              
834             =head1 BUGS
835              
836             Error handling needs more polishing.
837              
838             We do not check if the stored values are larger than specified by the
839             *_bits parameters.
840              
841             No CGI administration tool at the moment.
842              
843             Econtains doesn't work with phrase backend.
844              
845             No scoring algorithm implemented.
846              
847             No support for stop words at the moment.
848              
849             =head1 AUTHOR
850              
851             (c) 1999 Jan Pazdziora, adelton@fi.muni.cz,
852             http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk
853             University in Brno, Czech Republic
854              
855             All rights reserved. This package is free software; you can
856             redistribute it and/or modify it under the same terms as Perl itself.
857              
858             =head1 SEE ALSO
859              
860             DBI(3), mycontextadmin(1).
861              
862             =head1 OTHER PRODUCTS and why I've written this module
863              
864             I'm aware of DBIx::TextIndex module and about UdmSearch utility, and
865             about htdig and glimpse on the non-database side of the world.
866              
867             To me, using a database gives reasonable maintenance benefits. With
868             products that use their own files to store the information (even if the
869             storage algorithms are efficient and well thought of), you always
870             struggle with permissions on files and directories for various users,
871             with files that somebody accidentally deleted or mangled, and making the
872             index available remotely is not trivial.
873              
874             That's why I've wanted a module that will use a database as a storage
875             backend. With MySQL, you get remote access and access control for free,
876             and on many web servers MySQL is part of the standard equipment. So
877             using it for text indexes seemed natural.
878              
879             However, existing DBIx::TextIndex and UdmSearch are too narrowly aimed for
880             me. The first only supports indexing of data that is stored in the
881             database, but you may not always want or need to store the documents in
882             the database as well. The UdmSearch on the other hand is only for web
883             documents, making it unsuitable for indexing mailing-list archives or
884             local data.
885              
886             I believe that MyConText is reasonably flexible and still very
887             efficient. It doesn't enforce its own idea of what is good for you --
888             the number of options is big and you can always extend the module with
889             your own backend or frontend if you feel that those provided are not
890             sufficient. Or you can extend existing by adding one or two parameters
891             that will add new features. Of course, patches are always welcome.
892             MyConText is a tool that can be deployed in many projects. It's not
893             a complete environment since different people have different needs. On
894             the other hand, the methods that it provides make it easy to build
895             a complete solution on top of this in very short course of time.
896              
897             I was primarily inspired by the ConText cartridge of Oracle server. Since
898             MySQL doesn't support triggers, it turned out that a Perl interface would be
899             needed. Of course, porting this module to (for example) PostgreSQL
900             should be easy, so different name is probably needed. On the other hand,
901             the code is sometimes very MySQL specific to make the module work
902             efficiently, so I didn't want a name that would suggest that it's
903             a generic tool that will work with any SQL database.
904              
905             =cut
906