line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
=head1 NAME |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
MyConText - Indexing documents with MySQL as storage |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=cut |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
package MyConText; |
9
|
11
|
|
|
11
|
|
7740
|
use strict; |
|
11
|
|
|
|
|
19
|
|
|
11
|
|
|
|
|
341
|
|
10
|
|
|
|
|
|
|
|
11
|
11
|
|
|
11
|
|
48
|
use vars qw($errstr $VERSION); |
|
11
|
|
|
|
|
17
|
|
|
11
|
|
|
|
|
1723
|
|
12
|
|
|
|
|
|
|
$errstr = undef; |
13
|
|
|
|
|
|
|
$VERSION = '0.49'; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# Defaults for every index parameter.  create() merges the caller's
# options over these; open() overlays whatever is stored in the index's
# parameter table.
my %DEFAULT_PARAMS = (
    num_of_docs       => 0,         # statistical value, should be maintained
    word_length       => 30,        # max length of words we index

    protocol          => 40,        # we only support protocol with the same numbers

    blob_direct_fetch => 20,        # with the blob store, when we stop searching
                                    # and fetch everything at once
    data_table        => undef,     # table where the actual index is stored
    name_length       => 255,       # for filenames or URLs, what's the max length

    word_id_bits      => 16,        # num of bits for word_id (column store)
    doc_id_bits       => 16,        # num of bits for doc_id
    count_bits        => 8,         # num of bits for count value
    position_bits     => 32,        # num of bits for word positions

    backend           => 'blob',    # what database backend (way the data is
                                    # stored) we use
    frontend          => 'none',    # what application frontend we use (how
                                    # the index behaves externally)
    filter            => 'map { lc $_ }',
    splitter          => ' $data =~ /(\w{2,$word_length})/g',
                                    # can use the $data and $word_length
                                    # variables
    init_env          => 'use locale',
);

# Backend name (the 'backend' parameter) => class implementing the storage.
my %backend_types = (
    blob   => 'MyConText::Blob',
    column => 'MyConText::Column',
    phrase => 'MyConText::Phrase',
);

# Frontend name (the 'frontend' parameter) => class the index object is
# blessed into; 'none' and 'default' both mean this class itself.
my %frontend_types = (
    none    => 'MyConText',
    default => 'MyConText',
    file    => 'MyConText::File',
    string  => 'MyConText::String',
    url     => 'MyConText::URL',
    table   => 'MyConText::Table',
);

# Lookup tables keyed by a bit width: pack() template letter, MySQL
# integer column type, and a precision value; %PRECISION_TO_BITS is
# the inverse of %BITS_TO_PRECISION.
use vars qw(%BITS_TO_PACK %BITS_TO_INT %BITS_TO_PRECISION %PRECISION_TO_BITS);

%BITS_TO_PACK = (
    0  => 'A0',
    8  => 'C',
    16 => 'S',
    32 => 'L',
);
%BITS_TO_INT = (
    8  => 'tinyint',
    16 => 'smallint',
    24 => 'mediumint',
    32 => 'int',
    64 => 'bigint',
);
%BITS_TO_PRECISION = (
    8  => '4',
    16 => '6',
    24 => '9',
    32 => '11',
);
# the precision values are unique, so flipping the pairs inverts the map
%PRECISION_TO_BITS = reverse %BITS_TO_PRECISION;
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# Open reads in the information about an existing index and creates
# an object in memory.
#
# Arguments: the class, either a connected DBI handle or an arrayref of
# DBI->connect parameters, and the name of the index (which is the name
# of its parameter table).
# Returns the index object, blessed into the frontend class named by the
# stored 'frontend' parameter.  On failure returns nothing and leaves
# the reason in $errstr; a failure to load a frontend or backend module
# is fatal (die).
sub open {
    my ($class, $dbh, $TABLE) = @_;
    $errstr = undef;

    # the $dbh is either a real dbh or a DBI->connect parameters arrayref
    my $mydbh = 0;
    if (ref $dbh eq 'ARRAY') {
        # nothing in this file loads DBI, and a caller that only hands
        # us connect parameters need not have loaded it either
        require DBI;
        $dbh = DBI->connect(@$dbh) or
            do { $errstr = $DBI::errstr; return; };
        $mydbh = 1;
    }

    # common failure exit: record the reason and do not leak a
    # connection that was opened here
    my $fail = sub {
        my ($reason) = @_;
        $errstr = $reason;
        $dbh->disconnect if $mydbh;
        return;
    };

    # load the parameters to the object
    my %PARAMS = %DEFAULT_PARAMS;
    my $sth = $dbh->prepare("select * from $TABLE");
    $sth->{'PrintError'} = 0;
    $sth->{'RaiseError'} = 0;
    unless ($sth->execute) {
        # tell "no such index" apart from any other database error
        my $is_index = grep { $TABLE eq $_ }
            MyConText->list_context_indexes($dbh);
        return $fail->($is_index
            ? $sth->errstr
            : "ConText index $TABLE doesn't exist.");
    }
    while (my ($param, $value) = $sth->fetchrow_array) {
        $PARAMS{$param} = $value;
    }
    my $self = bless {
        'dbh' => $dbh,
        'table' => $TABLE,
        %PARAMS,
        }, $class;
    my $data_table = $self->{'data_table'};

    # we should disconnect if we've opened the dbh here
    if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; }

    # some basic sanity check
    defined $dbh->selectrow_array("select count(*) from $data_table")
        or return $fail->("Table $data_table not found in the database\n");

    # load and set the application frontend
    my $front_module = $frontend_types{$PARAMS{'frontend'}};
    defined $front_module
        or return $fail->("Specified frontend type `$PARAMS{'frontend'}' is unknown\n");
    if ($front_module ne $class) {
        # the module name comes from the fixed %frontend_types table,
        # never directly from the parameter table
        eval "use $front_module";
        die $@ if $@;
    }
    bless $self, $front_module;
    $self->_open_tables;

    # load and set the backend (actual database access) module
    my $back_module = $backend_types{$PARAMS{'backend'}};
    defined $back_module
        or return $fail->("Specified backend type `$PARAMS{'backend'}' is unknown\n");
    eval "use $back_module";
    die $@ if $@;
    $self->{'db_backend'} = $back_module->open($self);

    # finally, return the object
    $self;
}
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Create creates tables in the database according to the options, then
# calls open to load the object to memory.
#
# Arguments: the class, either a connected DBI handle or an arrayref of
# DBI->connect parameters, the index name (name of the parameter table)
# and a list of option => value pairs that override %DEFAULT_PARAMS.
# Returns what open() returns for the new index.  On failure returns
# nothing, leaves the reason in $errstr and drops the tables recorded
# in 'created_tables'; a failure to load a frontend or backend module
# is fatal (die).
sub create {
    my ($class, $dbh, $TABLE, %OPTIONS) = @_;
    $errstr = undef;
    my $mydbh = 0;
    if (ref $dbh eq 'ARRAY') {
        # nothing in this file loads DBI, and a caller that only hands
        # us connect parameters need not have loaded it either
        require DBI;
        $dbh = DBI->connect(@$dbh) or
            do { $errstr = $DBI::errstr; return; };
        $mydbh = 1;
    }

    my $self = bless {
        'dbh' => $dbh,
        'table' => $TABLE,
        %DEFAULT_PARAMS,
        %OPTIONS
        }, $class;

    $self->{'data_table'} = $TABLE.'_data'
        unless defined $self->{'data_table'};

    # common failure exit: record the reason, optionally drop what has
    # been created so far, and do not leak a connection opened here
    my $fail = sub {
        my ($reason, $drop_created) = @_;
        $errstr = $reason;
        $self->clean_failed_create if $drop_created;
        $dbh->disconnect if $mydbh;
        return;
    };

    # param is 32 characters wide because the default parameter name
    # 'blob_direct_fetch' is 17 characters long and does not fit the
    # 16 characters used previously
    my $CREATE_PARAM = <<EOF;
        create table $TABLE (
            param varchar(32) binary not null,
            value varchar(255),
            primary key (param)
        )
EOF
    $dbh->do($CREATE_PARAM) or return $fail->($dbh->errstr);
    push @{$self->{'created_tables'}}, $TABLE;

    # load and set the frontend database structures
    my $front_module = $frontend_types{$self->{'frontend'}};
    defined $front_module
        or return $fail->("Specified frontend type `$self->{'frontend'}' is unknown\n", 1);
    eval "use $front_module";
    die $@ if $@;
    bless $self, $front_module;
    # _create_tables returns an error message, or undef when all is well
    $errstr = $self->_create_tables;
    return $fail->($errstr, 1) if defined $errstr;

    # create the backend database structures
    my $back_module = $backend_types{$self->{'backend'}};
    defined $back_module
        or return $fail->("Specified backend type `$self->{'backend'}' is unknown\n", 1);
    eval "use $back_module";
    die $@ if $@;
    $errstr = $back_module->_create_tables($self);
    return $fail->($errstr, 1) if defined $errstr;

    # store every plain (non-reference) value of the object as an index
    # parameter; failures here are ignored, as they always were
    for my $param (grep { not ref $self->{$_} } keys %$self) {
        $dbh->do("insert into $TABLE values (?, ?)", {}, $param, $self->{$param});
    }

    my $ctx = $class->open($dbh, $TABLE);
    unless (defined $ctx) {
        # open has already set $errstr
        $dbh->disconnect if $mydbh;
        return;
    }
    # open was given a live handle, so it cannot know that the
    # connection was made here; set the same flag open sets for the
    # connections it makes itself
    $ctx->{'disconnect_on_destroy'} = 1 if $mydbh;
    return $ctx;
}
191
|
|
|
|
|
|
|
|
192
|
0
|
|
|
0
|
|
|
# Hook for frontends that need tables of their own; this class needs
# none.  Returning nothing means "no error" to create().
sub _create_tables { return; }
193
|
0
|
|
|
0
|
|
|
# Hook called by open() after the object is blessed into its frontend
# class; this class has nothing to do here.
sub _open_tables { return; }
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Drops every table listed in the object's 'created_tables' list, in
# the order in which they were recorded.  Used by create() to undo a
# partially built index; errors from the individual drops are ignored.
sub clean_failed_create {
    my ($ctx) = @_;
    my $handle = $ctx->{'dbh'};
    foreach my $name (@{ $ctx->{'created_tables'} }) {
        $handle->do("drop table $name");
    }
}
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Drops all tables of the index: every attribute of the object whose
# name is 'table' or ends in '_table' is taken to hold a table name.
# Errors from the individual drops are ignored.  Always returns 1.
sub drop {
    my ($ctx) = @_;
    my $handle = $ctx->{'dbh'};
    my @table_keys = grep { /(^|_)table$/ } keys %$ctx;
    foreach my $key (@table_keys) {
        $handle->do("drop table $ctx->{$key}");
    }
    return 1;
}
212
|
|
|
|
|
|
|
# Returns the last error message: the object's own 'errstr' attribute
# when called on an object, the package-wide $errstr when called as a
# class method.
sub errstr {
    my ($invocant) = @_;
    return $invocant->{'errstr'} if ref $invocant;
    return $errstr;
}
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# Lists the MyConText indexes found in a database.
#
# Arguments: the class and a connected database handle.
# Returns the sorted names of all tables that answer a query for a
# row with param = 'data_table', i.e. that look like a MyConText
# parameter table.  Returns an empty list when the table list cannot
# be fetched.
sub list_context_indexes {
    my ($class, $dbh) = @_;

    # selectall_arrayref gives undef when the statement fails; treat
    # that as "no tables" rather than dereferencing undef
    my $rows = $dbh->selectall_arrayref('show tables') || [];
    my %tables = map { ( $_->[0] => 1 ) } @$rows;

    # most tables are not parameter tables, so the probe below is
    # expected to fail often -- keep the handle quiet while probing
    local $dbh->{'PrintError'} = 0;
    local $dbh->{'RaiseError'} = 0;

    my %indexes = ();
    for my $table (keys %tables) {
        if ($dbh->selectrow_array("select param, value from $table
            where param = 'data_table'")) {
            $indexes{$table} = 1;
        }
    }
    return sort keys %indexes;
}
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Indexes (or re-indexes) one document.
#
# Arguments: the numeric id of the document and its content; undefined
# content is indexed as an empty string.
# Returns whatever the backend's parse_and_index_data returns, or
# nothing when no id is given.
#
# A document whose id is greater than any id seen so far is treated as
# a new document (the stored 'max_doc_id' parameter is updated and the
# backend is told to add); any other id is treated as an update.
sub index_document {
    my ($self, $id, $data) = @_;
    return unless defined $id;

    my $dbh = $self->{'dbh'};
    my $param_table = $self->{'table'};

    my $adding_doc = 0;
    if (not defined $self->{'max_doc_id'} or $id > $self->{'max_doc_id'}) {
        $self->{'max_doc_id'} = $id;
        # the statement is prepared once and kept on the object
        my $update_max_doc_id_sth = $self->{'update_max_doc_id_sth'}
            ||= $dbh->prepare("replace into $param_table values (?, ?)");
        $update_max_doc_id_sth->execute('max_doc_id', $id);
        $adding_doc = 1;
    }

    # use packages, etc.
    # NOTE(review): a lexically scoped pragma such as the default
    # 'use locale' ends with this string eval and does not reach the
    # code that parses the document -- confirm what init_env is meant
    # to achieve.  The code is run as-is; it must come from a trusted
    # source.
    my $init_env = $self->{'init_env'};
    if (defined $init_env) {
        eval $init_env;
        # only look at $@ when the eval actually ran, otherwise a
        # message left by some earlier eval would be reported
        print STDERR "Init_env failed with $@\n" if $@;
    }

    $data = '' unless defined $data;
    return $self->{'db_backend'}->parse_and_index_data($adding_doc,
        $id, $data);
}
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Used for backends that need a count for each of the words.
#
# Note that this is run with the backend object, not the index object;
# the index is reached through the backend's 'ctx' attribute.
# Arguments: true when the document is new (false for an update), the
# document id and the document content.
# Splits the content into words with the index's filter and splitter
# code, counts the occurrences of every word and passes the hash
# word => count to the backend's add_document or update_document.
# Returns that method's result: the whole list in list context, its
# first element otherwise.
sub parse_and_index_data_count {
    my ($backend, $adding_doc, $id, $data) = @_;
    my $self = $backend->{'ctx'};

    # $data and $word_length must keep exactly these names: the
    # filter/splitter code evaluated below refers to them
    my $word_length = $self->{'word_length'};
    # this needs to get parametrized (lc, il2_to_ascii, parsing of
    # HTML tags, ...)

    use locale;
    my $filter = $self->{'filter'} . ' ' . $self->{'splitter'};
    my @found = eval $filter;
    # broken filter code used to pass unnoticed and index no words
    print STDERR "Filter failed with $@\n" if $@;

    my %words;
    $words{$_}++ for @found;

    my @result = $adding_doc
        ? $backend->add_document($id, \%words)
        : $backend->update_document($id, \%words);

    return wantarray ? @result : $result[0];
}
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Used for backends where a list of occurrences is needed.
#
# Note that this is run with the backend object, not the index object;
# the index is reached through the backend's 'ctx' attribute.
# Arguments: true when the document is new (false for an update), the
# document id and the document content.
# Splits the content into words with the index's filter and splitter
# code and passes the hash word => [ positions ] to the backend's
# add_document or update_document; positions count the words of the
# document from 1.
# Returns that method's result: the whole list in list context, its
# first element otherwise.
sub parse_and_index_data_list {
    my ($backend, $adding_doc, $id, $data) = @_;
    my $self = $backend->{'ctx'};

    # $data and $word_length must keep exactly these names: the
    # filter/splitter code evaluated below refers to them
    my $word_length = $self->{'word_length'};
    # this needs to get parametrized (lc, il2_to_ascii, parsing of
    # HTML tags, ...)

    use locale;
    my $filter = $self->{'filter'} . ' ' . $self->{'splitter'};
    my @found = eval $filter;
    # broken filter code used to pass unnoticed and index no words
    print STDERR "Filter failed with $@\n" if $@;

    my %words;
    my $position = 0;
    for my $word (@found) {
        push @{$words{$word}}, ++$position;
    }

    my @result = $adding_doc
        ? $backend->add_document($id, \%words)
        : $backend->update_document($id, \%words);

    return wantarray ? @result : $result[0];
}
329
|
|
|
|
|
|
|
# Removes documents from the index by handing all arguments over to
# the backend's delete_document; returns what the backend returns.
sub delete_document {
    my $ctx = shift;
    my $backend = $ctx->{'db_backend'};
    return $backend->delete_document(@_);
}
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Runs the index's filter code over the given words and asks the
# backend which documents contain them; returns what the backend's
# contains_hashref returns.
sub contains_hashref {
    my $ctx = shift;
    # the filter code is applied to the remaining arguments, so the
    # evaluated text ends in a literal ' @_'
    my $code = $ctx->{'filter'} . ' @_';
    my @words = eval $code;
    return $ctx->{'db_backend'}->contains_hashref(@words);
}
339
|
|
|
|
|
|
|
# Returns the names of the documents reported by contains_hashref for
# the given words.  When the index keeps counts ('count_bits' is true)
# the names are ordered by descending value from that hash, otherwise
# they come in no particular order.
sub contains {
    my $ctx = shift;
    my $hits = $ctx->contains_hashref(@_);
    return keys %$hits unless $ctx->{'count_bits'};
    return sort { $hits->{$b} <=> $hits->{$a} } keys %$hits;
}
345
|
|
|
|
|
|
|
# Extended search.  Every argument is one word, optionally prefixed:
#   +word   the document must contain the word,
#   -word   the document must not contain the word,
#   word    the word only adds to the document's score.
# Each word is looked up on its own with contains_hashref.  Returns a
# hash reference document => summed score.
sub econtains_hashref {
    my $ctx = shift;

    my @required = map { /^\+(.+)$/s } @_;
    my @optional = map { /^([^+-].*)$/s } @_;
    my @excluded = map { /^-(.+)$/s } @_;
    # counts every argument starting with a plus, even a bare '+'
    my $any_required = grep /^\+/, @_;

    # required words: the result is the intersection of their documents
    my $docs = {};
    my $seen = 0;
    foreach my $word (@required) {
        my $hits = $ctx->contains_hashref($word);
        unless ($seen++) {
            # the hash of the first word is adopted as it is, not copied
            $docs = $hits;
            next;
        }
        foreach my $doc (keys %$docs) {
            if (not defined $hits->{$doc}) {
                delete $docs->{$doc};
            }
            elsif (defined $docs->{$doc}) {
                $docs->{$doc} += $hits->{$doc};
            }
        }
    }

    # plain words: with required words present they only raise the
    # score of documents already selected, otherwise they select
    # documents themselves
    foreach my $word (@optional) {
        my $hits = $ctx->contains_hashref($word);
        foreach my $doc (keys %$hits) {
            next if $any_required and not defined $docs->{$doc};
            $docs->{$doc} = 0 unless defined $docs->{$doc};
            $docs->{$doc} += $hits->{$doc};
        }
    }

    # excluded words: remove every document that contains one
    foreach my $word (@excluded) {
        my $hits = $ctx->contains_hashref($word);
        delete @{$docs}{ keys %$hits };
    }

    return $docs;
}
385
|
|
|
|
|
|
|
# Returns the names of the documents selected by econtains_hashref for
# the given (optionally +/- prefixed) words.  When the index keeps
# counts ('count_bits' is true) the names are ordered by descending
# score, otherwise they come in no particular order.
sub econtains {
    my $ctx = shift;
    my $hits = $ctx->econtains_hashref(@_);
    return keys %$hits unless $ctx->{'count_bits'};
    return sort { $hits->{$b} <=> $hits->{$a} } keys %$hits;
}
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
1; |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=head1 SYNOPSIS |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
use MyConText; |
397
|
|
|
|
|
|
|
use DBI; |
398
|
|
|
|
|
|
|
# connect to database (regular DBI) |
399
|
|
|
|
|
|
|
my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); |
400
|
|
|
|
|
|
|
# create a new index |
401
|
|
|
|
|
|
|
my $ctx = MyConText->create($dbh, 'ctx_web_1', |
402
|
|
|
|
|
|
|
'frontend' => 'string', 'backend' => 'blob'); |
403
|
|
|
|
|
|
|
# or open existing one |
404
|
|
|
|
|
|
|
# my $ctx = MyConText->open($dbh, 'ctx_web_1'); |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
# index documents |
407
|
|
|
|
|
|
|
$ctx->index_document('krtek', 'krtek leze pod zemi'); |
408
|
|
|
|
|
|
|
$ctx->index_document('jezek', 'Jezek ma ostre bodliny.'); |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# search for matches |
411
|
|
|
|
|
|
|
my @documents = $ctx->contains('krtek'); |
412
|
|
|
|
|
|
|
my @docs = $ctx->econtains('+krtek', '-Jezek'); |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
=head1 DESCRIPTION |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
MyConText is a poor man's solution for indexing contents of documents. |
418
|
|
|
|
|
|
|
It uses the MySQL database to store the information about words and |
419
|
|
|
|
|
|
|
documents and provides Perl interface for indexing new documents, |
420
|
|
|
|
|
|
|
making changes and searching for matches. For MyConText, a document |
421
|
|
|
|
|
|
|
is nearly anything -- Perl scalar, file, Web document, database field. |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
The basic style of interface is shown above. What you need is a MySQL |
424
|
|
|
|
|
|
|
database and a DBI with DBD::mysql. Then you create a MyConText index |
425
|
|
|
|
|
|
|
-- a set of tables that maintain all necessary information. Once created |
426
|
|
|
|
|
|
|
it can be accessed many times, either for updating the index (adding |
427
|
|
|
|
|
|
|
documents) or searching. |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
MyConText uses one basic table to store parameters of the index. Second |
430
|
|
|
|
|
|
|
table is used to store the actual information about documents and words, |
431
|
|
|
|
|
|
|
and depending on the type of the index (specified during index creation) |
432
|
|
|
|
|
|
|
there may be more tables to store additional information (like |
433
|
|
|
|
|
|
|
conversion from external string names (eg. URL's) to internal numeric |
434
|
|
|
|
|
|
|
form). For a user, these internal thingies and internal behaviour of the |
435
|
|
|
|
|
|
|
index are not important. The important part is the API, the methods to |
436
|
|
|
|
|
|
|
index document and ask questions about words in documents. However, |
437
|
|
|
|
|
|
|
certain understanding of how it all works may be useful when you are |
438
|
|
|
|
|
|
|
deciding if this module is for you and what type of index will best |
439
|
|
|
|
|
|
|
suit your needs. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=head2 Frontends |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
From the user, application point of view, the MyConText index stores |
444
|
|
|
|
|
|
|
documents that are named in a certain way, allows adding new documents, |
445
|
|
|
|
|
|
|
and provides methods to ask: "give me list of names of documents that |
446
|
|
|
|
|
|
|
contain this list of words". The MyConText index doesn't store the |
447
|
|
|
|
|
|
|
documents itself. Instead, it stores information about words in the |
448
|
|
|
|
|
|
|
documents in such a structured way that it makes easy and fast to look |
449
|
|
|
|
|
|
|
up what documents contain certain words and return names of the |
450
|
|
|
|
|
|
|
documents. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
MyConText provides a couple of predefined frontend classes that specify |
453
|
|
|
|
|
|
|
various types of documents (and the way they relate to their names). |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=over 4 |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=item default |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
By default, user specifies the integer number of the document and the |
460
|
|
|
|
|
|
|
content (body) of the document. The code would for example read |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
$ctx->index_document(53, 'zastavujeme vyplaty vkladu'); |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
and MyConText will remember that the document 53 contains three words. |
465
|
|
|
|
|
|
|
When looking for all documents containing word (string) vklad, a call |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
my @docs = $ctx->contains('vklad%'); |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
would return numbers of all documents containing words starting with |
470
|
|
|
|
|
|
|
'vklad', 53 among them. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
So here it's user's responsibility to maintain a relation between the |
473
|
|
|
|
|
|
|
document numbers and their content, to know that a document 53 is about |
474
|
|
|
|
|
|
|
vklady. Perhaps the documents are already stored somewhere and have |
475
|
|
|
|
|
|
|
unique numeric id. |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=item string |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Frontend B<string> allows the user to specify the names of the documents as |
480
|
|
|
|
|
|
|
strings, instead of numbers. Still the user has to specify both the |
481
|
|
|
|
|
|
|
name of the document and the content: |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
$ctx->index_document('upozorneni', |
484
|
|
|
|
|
|
|
'Odstrante z dosadu deti!'); |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
After that, |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
$ctx->contains('deti') |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
will return 'upozorneni' as one of the names of documents with word |
491
|
|
|
|
|
|
|
'deti' in it. |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=item file |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
To index files, use the frontend B<file>. Here the content of the document |
496
|
|
|
|
|
|
|
is clearly the content of the file specified by the filename, so in |
497
|
|
|
|
|
|
|
a call to index_document, only the name is needed -- the content of the |
498
|
|
|
|
|
|
|
file is read by the MyConText transparently: |
499
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
$ctx->index_document('/usr/doc/FAQ/Linux-FAQ'); |
501
|
|
|
|
|
|
|
my @files = $ctx->contains('penguin'); |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=item url |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
Web documents can be indexed by the frontend B<url>. MyConText uses LWP to |
506
|
|
|
|
|
|
|
get the document and then parses it normally: |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
$ctx->index_document('http://www.perl.com/'); |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=item table |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
You can have a MyConText index that indexes char or blob fields in MySQL |
513
|
|
|
|
|
|
|
table. Since MySQL doesn't support triggers, you have to call the |
514
|
|
|
|
|
|
|
index_document method of MyConText any time something changes in the |
515
|
|
|
|
|
|
|
table. So the sequence probably will be |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
$dbh->do('insert into the_table (id, data, other_fields) |
518
|
|
|
|
|
|
|
values (?, ?, ?)', {}, $name, $data, $date_or_something); |
519
|
|
|
|
|
|
|
$ctx->index_document($name); |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
When calling contains, the id (name) of the record will be returned. If |
522
|
|
|
|
|
|
|
the id in the_table is numeric, it's directly used as the internal |
523
|
|
|
|
|
|
|
numeric id, otherwise a string's way of converting the id to numeric |
524
|
|
|
|
|
|
|
form is used. |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=back |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
The structure of MyConText is very flexible and adding new frontend |
529
|
|
|
|
|
|
|
(what will be indexed) is very easy. |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=head2 Backends |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
While frontend specifies what is indexed and how the user sees the |
534
|
|
|
|
|
|
|
collection of documents, backend is about low level database way of |
535
|
|
|
|
|
|
|
actually storing the information in the tables. Three types are |
536
|
|
|
|
|
|
|
available: |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=over 4 |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=item blob |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
For each word, a blob holding list of all documents containing that word |
543
|
|
|
|
|
|
|
is stored in the table, with the count (number of occurrences) |
544
|
|
|
|
|
|
|
associated with each document number. That makes it for very compact |
545
|
|
|
|
|
|
|
storage. Since the document names (for example URL) are internally |
546
|
|
|
|
|
|
|
converted to numbers, storing and fetching the data is fast. However, |
547
|
|
|
|
|
|
|
updating the information is very slow, since information concerning one |
548
|
|
|
|
|
|
|
document is spread across all table, without any direct database access. |
549
|
|
|
|
|
|
|
Updating a document (or merely reindexing it) requires update of all |
550
|
|
|
|
|
|
|
blobs, which is slow. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
The list of documents is stored sorted by document name so that |
553
|
|
|
|
|
|
|
fetching an information about a document for one word is relatively |
554
|
|
|
|
|
|
|
easy, still a need to update (or at least scan) all records in the table |
555
|
|
|
|
|
|
|
makes this storage unsuitable for collections of documents that often |
556
|
|
|
|
|
|
|
change. |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=item column |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
The B<column> backend stores a word/document pair in database fields, |
561
|
|
|
|
|
|
|
indexing both, thus allowing both fast retrieval and updates -- it's |
562
|
|
|
|
|
|
|
easy to delete all records describing one document and insert new ones. |
563
|
|
|
|
|
|
|
However, the database indexes that have to be maintained are large. |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Both B<blob> and B<column> backends only store a count -- number of |
566
|
|
|
|
|
|
|
occurrences of the word in the document (and even this can be switched |
567
|
|
|
|
|
|
|
off, yielding just a yes/no information about the word's presence). |
568
|
|
|
|
|
|
|
This allows questions like |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
all documents containing words 'voda' or 'Mattoni' |
571
|
|
|
|
|
|
|
but not a word 'kyselka' |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
but you cannot ask whether a document contains a phrase 'kyselka |
574
|
|
|
|
|
|
|
Mattoni' because such information is not maintained by these types of |
575
|
|
|
|
|
|
|
backends. |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=item phrase |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
To allow phrase matching, a B<phrase> backend is available. For each word |
580
|
|
|
|
|
|
|
and document number it stores a blob of lists of positions of the word |
581
|
|
|
|
|
|
|
in the document. A query |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
$ctx->contains('kyselk%', 'Mattoni'); |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
then only returns those documents (document names/numbers) where word |
586
|
|
|
|
|
|
|
kyselka (or kyselky, or so) is just before word Mattoni. |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=back |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=head2 Mixing frontends and backends |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
Any frontend can be used with any backend in one MyConText index. You |
593
|
|
|
|
|
|
|
can index Web documents with B<url> frontend and B<phrase> backend |
594
|
|
|
|
|
|
|
to be able to find phrases in the documents. And you can use the |
595
|
|
|
|
|
|
|
default, number based document scheme with B<blob> backend to use the disk |
596
|
|
|
|
|
|
|
space as efficiently as possible -- this is useful for example for |
597
|
|
|
|
|
|
|
mailing-list archives, where we need to index huge number of documents |
598
|
|
|
|
|
|
|
that do not change at all. |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
Finding optimal combination is very important and may require some |
601
|
|
|
|
|
|
|
analysis of the document collection and manipulation, as well as the |
602
|
|
|
|
|
|
|
speed and storage requirements. Benchmarking on actual target platform |
603
|
|
|
|
|
|
|
is very useful during the design phase. |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head1 METHODS |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
The following methods are available on the user side as MyConText API. |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
=over 4 |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
=item create |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
my $ctx = MyConText->create($dbh, $index_name, %opts); |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
The class method B<create> creates index of given name (the name of the |
616
|
|
|
|
|
|
|
index is the name of its basic parameter table) and all necessary |
617
|
|
|
|
|
|
|
tables, returns an object -- newly created index. The options that may |
618
|
|
|
|
|
|
|
be specified after the index name define the frontend and backend types, |
619
|
|
|
|
|
|
|
storage parameters (how many bits for what values), etc. See below for |
620
|
|
|
|
|
|
|
list of create options and discussion of their use. |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=item open |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
my $ctx = MyConText->open($dbh, $index_name); |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
Opens and returns object, accessing specified MyConText index. Since all |
627
|
|
|
|
|
|
|
the index parameters and information are stored in the $index_name table |
628
|
|
|
|
|
|
|
(including names of all other needed tables), the database handler and |
629
|
|
|
|
|
|
|
the name of the parameter table are the only needed arguments. |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=item index_document |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
$ctx->index_document(45, 'Sleva pri nakupu stribra.'); |
634
|
|
|
|
|
|
|
$ctx->index_document('http://www.mozilla.org/'); |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
For the default and B<string> frontends, two arguments are expected -- the |
637
|
|
|
|
|
|
|
name (number or string) of the document and its content. For B<file> and |
638
|
|
|
|
|
|
|
B<url> frontends only the name of the document is needed. The method |
639
|
|
|
|
|
|
|
returns number of words indexed (subject to wild change). |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=item delete_document |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
$ctx->delete_document('http://www.mozilla.org/'); |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
Removes information about document from the index. Note that for B<blob> |
646
|
|
|
|
|
|
|
backend this is very time consuming process. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=item contains |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
my @docs = $ctx->contains('sleva', 'strib%'); |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
Returns list of names (numbers or strings, depending on the frontend) |
653
|
|
|
|
|
|
|
of documents that contain some of specified words. |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
=item econtains |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
my @docs = $ctx->econtains('sleva', '+strib%', '-zlato'); |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
Econtains stands for extended contains and allows words to be prefixed |
660
|
|
|
|
|
|
|
by plus or minus signs to specify that the word must or mustn't be |
661
|
|
|
|
|
|
|
present in the document for it to match. |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=item contains_hashref, econtains_hashref |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
Similar to B<contains> and B<econtains>, only instead of list of document |
666
|
|
|
|
|
|
|
names, these methods return a hash reference to a hash where keys are |
667
|
|
|
|
|
|
|
the document names and values are the number of occurrences of the |
668
|
|
|
|
|
|
|
words. |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=item drop |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
Removes all tables associated with the index, including the base |
673
|
|
|
|
|
|
|
parameter table. Effectively destroying the index from the database. |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=back |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
=head1 INDEX OPTIONS |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
Here we list the options that may be passed to MyConText->create call. |
680
|
|
|
|
|
|
|
These allow to specify the style and storage parameters in great detail. |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=over 4 |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=item backend |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
The backend type, default B<blob>, possible values blob, column and phrase |
687
|
|
|
|
|
|
|
(see above for explanation). |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=item frontend |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
The frontend type. The default frontend requires the user to specify |
692
|
|
|
|
|
|
|
numeric id of the document together with the content of the document, |
693
|
|
|
|
|
|
|
other possible values are string, file and url (see above for |
694
|
|
|
|
|
|
|
more info). |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
=item word_length |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
Maximum length of words that may be indexed, default 30. |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=item data_table |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
Name of the table where the actual data about word/document relation is |
703
|
|
|
|
|
|
|
stored. By default, the name of the index (of the base table) with _data |
704
|
|
|
|
|
|
|
suffix is used. |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=item name_length |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
Any frontend that uses strings as names of documents needs to maintain |
709
|
|
|
|
|
|
|
a conversion table from these names to internal integer ids. This value |
710
|
|
|
|
|
|
|
specifies maximum length of these string names (URLs, file names, ...). |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
=item blob_direct_fetch |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
Only for blob backend. When looking for information about specific |
715
|
|
|
|
|
|
|
document in the list stored in the blob, the blob backend uses division |
716
|
|
|
|
|
|
|
of interval to find the correct place in the blob. When the interval |
717
|
|
|
|
|
|
|
gets equal or shorter than this value, all values are fetched from the |
718
|
|
|
|
|
|
|
database and the final search is done in Perl code sequentially. |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=item word_id_bits |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
With column or phrase backends, MyConText maintains a numeric id for each |
723
|
|
|
|
|
|
|
word to optimize the space requirements. The word_id_bits parameter |
724
|
|
|
|
|
|
|
specifies the number of bits to reserve for this conversion and thus |
725
|
|
|
|
|
|
|
effectively limits number of distinct words that may be indexed. The |
726
|
|
|
|
|
|
|
default is 16 bits and possible values are 8, 16, 24 or 32 bits. |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=item word_id_table |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
Name of the table that holds conversion from words to their numeric id |
731
|
|
|
|
|
|
|
(for column and phrase backends). By default it is the name of the index |
732
|
|
|
|
|
|
|
with _words suffix. |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
=item doc_id_bits |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
A number of bits to hold a numeric id of the document (that is either |
737
|
|
|
|
|
|
|
provided by the user (with default frontend) or generated by the module |
738
|
|
|
|
|
|
|
to accomplish the conversion from the string name of the document). This |
739
|
|
|
|
|
|
|
value limits the maximum number of documents to hold. The default is 16 |
740
|
|
|
|
|
|
|
bits and possible values are 8, 16 and 32 bits for blob backend and 8, |
741
|
|
|
|
|
|
|
16, 24 and 32 bits for column and phrase backends. |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
=item doc_id_table |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
Name of the table that holds conversion from string names of documents |
746
|
|
|
|
|
|
|
to their numeric id, by default the name of the index with _docid |
747
|
|
|
|
|
|
|
suffix. |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=item count_bits |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
Number of bits reserved for storing number of occurrences of each word |
752
|
|
|
|
|
|
|
in the document. The default is 8 and possible values are the same as |
753
|
|
|
|
|
|
|
with doc_id_bits. |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=item position_bits |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
With phrase backend, MyConText stores positions of each word of the |
758
|
|
|
|
|
|
|
documents. This value specifies how much space should be reserved for |
759
|
|
|
|
|
|
|
this purpose. The default is 32 bits and possible values are 8, 16 or 32 |
760
|
|
|
|
|
|
|
bits. This value limits the maximum number of words of each document |
761
|
|
|
|
|
|
|
that can be stored. |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=item splitter |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
MyConText allows the user to provide any Perl code that will be used to |
766
|
|
|
|
|
|
|
split the content of the document to words. The code will be evalled |
767
|
|
|
|
|
|
|
inside of the MyConText code. The default is |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
$data =~ /(\w{2,$word_length})/g |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
and shows that the input is stored in the variable C<$data> and the code |
772
|
|
|
|
|
|
|
may access any other variable available in the perl_and_index_data_* |
773
|
|
|
|
|
|
|
methods (see source), especially C<$word_length> to get the maximum length |
774
|
|
|
|
|
|
|
of words and C<$backend> to get the backend object. |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
The default value also shows that by default, the minimum length of |
777
|
|
|
|
|
|
|
words indexed is 2. |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
=item filter |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
The output words of splitter (and also any parameter of (e)contains* |
782
|
|
|
|
|
|
|
methods) are sent to filter that may do further processing. Filter is |
783
|
|
|
|
|
|
|
again a Perl code, the default is |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
map { lc $_ } |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
showing that the filter operates on input list and by default does |
788
|
|
|
|
|
|
|
conversion to lowercase (yielding case insensitive index). |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=item init_env |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
Because user defined splitter or filter may depend on other things that |
793
|
|
|
|
|
|
|
it is reasonable to set before the actual processing of words, you can |
794
|
|
|
|
|
|
|
use yet another Perl hook to set things up. The default is |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
use locale |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=item table_name |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
For table frontend; this is the name of the table that will be indexed. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
=item column_name |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
For table frontend; this is the name of the column in the table_name |
805
|
|
|
|
|
|
|
that contains the documents -- data to be indexed. It can also have |
806
|
|
|
|
|
|
|
a form table.column that will be used if the table_name option is not |
807
|
|
|
|
|
|
|
specified. |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
=item column_id_name |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
For table frontend; this is the name of the field in table_name that |
812
|
|
|
|
|
|
|
holds names (ids) of the records. If not specified, a field that has |
813
|
|
|
|
|
|
|
primary key on it is used. If this field is numeric, its values are |
814
|
|
|
|
|
|
|
directly used as identifiers, otherwise a conversion to numeric values |
815
|
|
|
|
|
|
|
is made. |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
=back |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head1 ERROR HANDLING |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
The create and open methods return the MyConText object on success, upon |
822
|
|
|
|
|
|
|
failure they return undef and set error message in $MyConText::errstr |
823
|
|
|
|
|
|
|
variable. |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
All other methods return reasonable (documented above) value on success, |
826
|
|
|
|
|
|
|
failure is signalized by unreasonable (typically undef or null) return |
827
|
|
|
|
|
|
|
value; the error message may then be retrieved by $ctx->errstr method |
828
|
|
|
|
|
|
|
call. |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=head1 VERSION |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
This documentation describes MyConText module version 0.49. |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
=head1 BUGS |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
Error handling needs more polishing. |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
We do not check if the stored values are larger than specified by the |
839
|
|
|
|
|
|
|
*_bits parameters. |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
No CGI administration tool at the moment. |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
Econtains doesn't work with phrase backend. |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
No scoring algorithm implemented. |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
No support for stop words at the moment. |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=head1 AUTHOR |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
(c) 1999 Jan Pazdziora, adelton@fi.muni.cz, |
852
|
|
|
|
|
|
|
http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk |
853
|
|
|
|
|
|
|
University in Brno, Czech Republic |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
All rights reserved. This package is free software; you can |
856
|
|
|
|
|
|
|
redistribute it and/or modify it under the same terms as Perl itself. |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
=head1 SEE ALSO |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
DBI(3), mycontextadmin(1). |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
=head1 OTHER PRODUCTS and why I've written this module |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
I'm aware of DBIx::TextIndex module and about UdmSearch utility, and |
865
|
|
|
|
|
|
|
about htdig and glimpse on the non-database side of the world. |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
To me, using a database gives reasonable maintenance benefits. With |
868
|
|
|
|
|
|
|
products that use their own files to store the information (even if the |
869
|
|
|
|
|
|
|
storage algorithms are efficient and well thought of), you always |
870
|
|
|
|
|
|
|
struggle with permissions on files and directories for various users, |
871
|
|
|
|
|
|
|
with files that somebody accidentally deleted or mangled, and making the |
872
|
|
|
|
|
|
|
index available remotely is not trivial. |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
That's why I've wanted a module that will use a database as a storage |
875
|
|
|
|
|
|
|
backend. With MySQL, you get remote access and access control for free, |
876
|
|
|
|
|
|
|
and on many web servers MySQL is part of the standard equipment. So |
877
|
|
|
|
|
|
|
using it for text indexes seemed natural. |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
However, existing DBIx::TextIndex and UdmSearch are too narrow-aimed to |
880
|
|
|
|
|
|
|
me. The first only supports indexing of data that is stored in the |
881
|
|
|
|
|
|
|
database, but you may not always want or need to store the documents in |
882
|
|
|
|
|
|
|
the database as well. The UdmSearch on the other hand is only for web |
883
|
|
|
|
|
|
|
documents, making it unsuitable for indexing mailing-list archives or |
884
|
|
|
|
|
|
|
local data. |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
I believe that MyConText is reasonably flexible and still very |
887
|
|
|
|
|
|
|
efficient. It doesn't enforce its own idea of what is good for you -- |
888
|
|
|
|
|
|
|
the number of options is big and you can always extend the module with |
889
|
|
|
|
|
|
|
your own backend or frontend if you feel that those provided are not |
890
|
|
|
|
|
|
|
sufficient. Or you can extend existing by adding one or two parameters |
891
|
|
|
|
|
|
|
that will add new features. Of course, patches are always welcome. |
892
|
|
|
|
|
|
|
MyConText is a tool that can be deployed in many projects. It's not |
893
|
|
|
|
|
|
|
a complete environment since different people have different needs. On |
894
|
|
|
|
|
|
|
the other hand, the methods that it provides make it easy to build |
895
|
|
|
|
|
|
|
a complete solution on top of this in very short course of time. |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
I was primarily inspired by the ConText cartridge of Oracle server. Since |
898
|
|
|
|
|
|
|
MySQL doesn't support triggers, it showed up that Perl interface will be |
899
|
|
|
|
|
|
|
needed. Of course, porting this module to (for example) PostgreSQL |
900
|
|
|
|
|
|
|
should be easy, so different name is probably needed. On the other hand, |
901
|
|
|
|
|
|
|
the code is sometimes very MySQL specific to make the module work |
902
|
|
|
|
|
|
|
efficiently, so I didn't want a name that would suggest that it's |
903
|
|
|
|
|
|
|
a generic tool that will work with any SQL database. |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=cut |
906
|
|
|
|
|
|
|
|