| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package MyConText::Blob; |
|
3
|
1
|
|
|
1
|
|
637
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
2098
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
# Open in the backend just sets the object |
|
6
|
|
|
|
|
|
|
# Constructor of the blob backend. No database work happens here: the
# supplied context object is simply remembered inside the new instance.
#   $class - package name the new object is blessed into
#   $ctx   - context object, stored under the 'ctx' key
# Returns the blessed hash reference.
sub open {
	my $class = shift;
	my $ctx = shift;
	my $self = { 'ctx' => $ctx };
	bless $self, $class;
	return $self;
}
|
10
|
|
|
|
|
|
|
# Create creates the table(s) according to the parameters |
|
11
|
|
|
|
|
|
|
# Creates the data table used by the blob backend.
#   $class - the sub is invoked as a class method; the value is not used
#   $ctx   - context hash; 'data_table', 'word_length' and 'dbh' are read,
#            and the table name is appended to 'created_tables' on success
# Returns the database handle's errstr when the create statement fails,
# and nothing (bare return) on success.
sub _create_tables {
	my ($class, $ctx) = @_;
	# Interpolating heredoc: the table name and the width of the word
	# column come from the context.
	my $CREATE_DATA = <<EOF;
		create table $ctx->{'data_table'} (
			word varchar($ctx->{'word_length'}) binary
				default '' not null,
			idx longblob default '' not null,
			primary key (word)
		)
EOF
	my $dbh = $ctx->{'dbh'};
	$dbh->do($CREATE_DATA) or return $dbh->errstr;
	# Remember what was created, presumably so the caller can clean up
	# later -- the consumer of 'created_tables' is not visible here.
	push @{$ctx->{'created_tables'}}, $ctx->{'data_table'};
	return;
}
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Adds the words of one document to the index.
#   $self  - backend object; its 'ctx' supplies 'dbh', 'data_table',
#            'doc_id_bits' and 'count_bits'
#   $id    - id of the document being added
#   $words - hash reference mapping word => number of occurrences
# For every word a packed (document id, count) record is appended to the
# word's blob; when the update touches no row the word is inserted as a
# new row instead.
# Returns the sum of the occurrence counts in $words.
sub add_document {
	my ($self, $id, $words) = @_;
	my $ctx = $self->{'ctx'};
	my $dbh = $ctx->{'dbh'};
	my $data_table = $ctx->{'data_table'};

	# Statement handles are cached on the object. ||= is used on purpose:
	# "defined X ? X : X = prepare(...)" parses as
	# "(defined X ? X : X) = prepare(...)" and re-prepares on every call.
	my $update_sth = ( $self->{'adding_update_sth'} ||= $dbh->prepare(
		"update $data_table set idx = concat(idx, ?)
			where word = ?") );

	my $insert_sth = ( $self->{'insert_sth'} ||= $dbh->prepare("
		insert into $data_table values (?, ?)") );

	# pack template of one record: document id followed by the count
	my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}}
		. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}};
	my $num_words = 0;
	for my $word ( keys %$words ) {
		my $value = pack $packstring, $id, $words->{$word};
		my $rows = $update_sth->execute($value, $word);
		# DBI reports "no rows affected" as the true value "0E0", which
		# compares numerically equal to 0: the word is new to the table.
		# NOTE(review): an undef result (failed update) also compares
		# equal to 0 and falls through to the insert.
		$insert_sth->execute($word, $value) if $rows == 0;
		$num_words += $words->{$word};
	}

	return $num_words;
}
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Removes documents from the index: every id passed after the object is
# re-indexed through update_document with an empty word list.
sub delete_document {
	my ($self, @doc_ids) = @_;
	foreach my $doc_id (@doc_ids) {
		$self->update_document($doc_id, {});
	}
}
|
64
|
|
|
|
|
|
|
# Replaces the index records of one document.
#   $self  - backend object; its 'ctx' supplies 'dbh', 'data_table',
#            'doc_id_bits' and 'count_bits'
#   $id    - id of the document being (re)indexed
#   $words - hash reference mapping word => number of occurrences; an
#            empty hash removes the document from every blob. The hash
#            is not modified.
# Every word already present in the data table is visited: its record for
# $id is replaced, inserted or cut out of the blob. Words of the document
# that have no row yet are inserted afterwards. The table is write-locked
# for the duration.
# NOTE(review): the lock is not released if a DBI call dies in between
# (e.g. with RaiseError set).
# Returns the number of distinct words stored for the document.
sub update_document {
	my ($self, $id, $words) = @_;
	my $ctx = $self->{'ctx'};
	my $dbh = $ctx->{'dbh'};
	my $data_table = $ctx->{'data_table'};

	# Statement handles are cached on the object. ||= is used on purpose:
	# "defined X ? X : X = prepare(...)" parses as
	# "(defined X ? X : X) = prepare(...)" and re-prepares on every call.
	my $insert_sth = ( $self->{'insert_sth'} ||= $dbh->prepare("
		insert into $data_table values (?, ?)") );

	my $update_sth = ( $self->{'update_update_sth'} ||=
		$dbh->prepare("update $data_table set idx =
			concat(substring(idx, 1, ?), ?, substring(idx, ?))
			where word = ?") );

	# Work on a shallow copy so the caller's hash is left untouched.
	my %pending = %{ $words || {} };

	$dbh->do("lock tables $data_table write");

	my $select_sth = $dbh->prepare("select word from $data_table");
	$select_sth->execute;

	# pack template and byte length of one (document id, count) record
	my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}}
		. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}};
	my ($packnulls) = pack $packstring, 0, 0;
	my $packlength = length $packnulls;
	my $num_words = 0;
	while (my ($word) = $select_sth->fetchrow_array) {
		my $in_document = defined $pending{$word};
		my $value = ($in_document ?
			pack($packstring, $id, $pending{$word}) : '');

		# find_position returns the byte offset of the record for
		# document $id within the blob of $word, plus a flag telling
		# whether such a record is already there (replace) or not
		# (insert at that offset).
		my ($pos, $shift) = $self->find_position($word, $id);
		if (not defined $pos) {
			$insert_sth->execute($word, $value);
		}
		elsif ($in_document or $shift) {
			# $pos is a 0-based byte offset while SQL substring()
			# counts from 1, so the tail of the blob starts at
			# $pos + 1; an existing record is skipped on top of that.
			my $spos = $pos + 1;
			$spos += $packlength if $shift;
			$update_sth->execute($pos, $value, $spos, $word);
		}
		# otherwise the word neither is nor was in the document: the
		# update would rewrite the blob with itself, so it is skipped

		delete $pending{$word};
		# count only words that really belong to the document; $value
		# itself is always defined
		$num_words++ if $in_document;
	}

	# whatever is left has no row in the data table yet
	for my $word ( keys %pending ) {
		my $value = pack $packstring, $id, $pending{$word};
		$insert_sth->execute($word, $value);
		$num_words++;
	}
	$dbh->do("unlock tables");

	return $num_words;
}
|
125
|
|
|
|
|
|
|
# Locates the record of document $id inside the blob stored for $word.
#   $self - backend object; its 'ctx' supplies 'dbh', 'data_table',
#           'doc_id_bits', 'count_bits' and optionally 'max_doc_id' and
#           'blob_direct_fetch'
#   $word - word whose blob is searched
#   $id   - document id looked for
# The blob is treated as a sequence of fixed-size packed (document id,
# count) records ordered by document id. The interval of candidate records
# is narrowed by probing single records; the next probe is guessed from
# the ratio of the wanted id to the id just read. Once the interval is no
# larger than 'blob_direct_fetch' records it is fetched in one go and
# scanned.
# Returns (byte offset, found flag): the 0-based offset of the record when
# found (flag 1), otherwise the offset at which it would have to be
# inserted (flag 0). Returns an empty list when the word has no row or a
# fetch yields nothing.
sub find_position {
	my ($self, $word, $id) = @_;

	my $ctx = $self->{'ctx'};
	my $dbh = $ctx->{'dbh'};
	my $data_table = $ctx->{'data_table'};

	# Statement handles are cached on the object. ||= is used on purpose:
	# "defined X ? X : X = prepare(...)" parses as
	# "(defined X ? X : X) = prepare(...)" and re-prepares on every call.
	my $get_length_sth = ( $self->{'get_length_sth'} ||= $dbh->prepare(
		"select length(idx) from $data_table where word = ?") );
	my $length = $dbh->selectrow_array($get_length_sth, {}, $word);
	if (not defined $length) { return; }

	# pack template and byte length of one (document id, count) record
	my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}}
		. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}};
	my ($packnulls) = pack $packstring, 0, 0;
	my $packlength = length $packnulls;

	# from here on $length counts records, not bytes
	$length = int($length/$packlength);

	my ($bot, $top, $med, $val) = (0, $length);

	# First guess: the middle of the blob, or proportional to the id when
	# the largest document id is known (a zero maximum carries no
	# information and would divide by zero).
	if (not $ctx->{'max_doc_id'})
		{ $med = int(($top - $bot) / 2); }
	else
		{ $med = int($top * $id / $ctx->{'max_doc_id'}); }

	my $blob_direct_fetch = $ctx->{'blob_direct_fetch'};
	# we divide the interval
	while ($bot != $top) {
		# keep the probe inside [$bot, $top)
		$med = $top - 1 if $med >= $top;
		$med = $bot if $med < $bot;

		# one statement serves both the bulk fetch and the single probe
		my $get_interval_sth = ( $self->{'get_interval_sth'} ||=
			$dbh->prepare("select substring(idx,?,?) from $data_table where word = ?") );

		if ($top - $bot <= $blob_direct_fetch) {
			# small enough: read all remaining candidates at once
			my $alldata = $dbh->selectrow_array($get_interval_sth,
				{},
				$bot * $packlength + 1,
				($top - $bot) * $packlength,
				$word);
			return unless defined $alldata;

			# @docs becomes a flat list: id, count, id, count, ...
			my @docs;
			my $i = 0;
			while ($i < length $alldata) {
				push @docs, unpack $packstring,
					substr $alldata, $i, $packlength;
				$i += $packlength;
			}
			for (my $i = 0; $i < @docs; $i += 2) {
				if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); }
				if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); }
			}
			return ($top * $packlength, 0);
		}

		# Probe one whole record. Fetching $packlength bytes (instead
		# of a fixed 2) keeps the unpack working for any id width.
		($val) = $dbh->selectrow_array($get_interval_sth, {},
			($med * $packlength) + 1, $packlength, $word);
		if (not defined $val) { return; }
		($val) = unpack $packstring, $val;

		if (not defined $val) { return; }
		if ($val == $id) { return ($med * $packlength, 1); }

		elsif ($val < $id) { $bot = $med + 1; }
		else { $top = $med; }

		# Next guess by proportion; a probed id of 0 gives no ratio (and
		# would divide by zero), so fall back to the middle.
		$med = ($val ? int($med * $id / $val) : int(($bot + $top) / 2));
	}
	return ($bot * $packlength, 0);
}
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# Looks up words in the index and sums up their occurrences per document.
#   $self - backend object; its 'ctx' supplies 'dbh', 'data_table',
#           'doc_id_bits' and 'count_bits'
#   @_    - the remaining arguments are word patterns; each one is matched
#           with SQL "like", so a pattern may select several rows
# Returns a hash reference mapping document id => total count over all
# matched rows.
sub contains_hashref {
	my $self = shift;
	my $ctx = $self->{'ctx'};
	my $dbh = $ctx->{'dbh'};
	my $data_table = $ctx->{'data_table'};

	# pack template and byte length of one (document id, count) record
	my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}}
		. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}};
	my ($packnulls) = pack $packstring, 0, 0;
	my $packlength = length $packnulls;

	# The statement handle is cached on the object. ||= is used on
	# purpose: "defined X ? X : X = prepare(...)" parses as
	# "(defined X ? X : X) = prepare(...)" and re-prepares on every call.
	my $sth = ( $self->{'get_idx_sth'} ||= $dbh->prepare(
		"select idx from $data_table where word like ?") );

	my $out = {};
	for my $word (@_) {
		$sth->execute($word);
		while (my ($blob) = $sth->fetchrow_array) {
			next unless defined $blob;
			# decode the blob one record at a time
			for (my $i = 0; $i < length $blob; $i += $packlength) {
				my ($doc, $count) = unpack $packstring,
					substr($blob, $i, $packlength);
				# a truncated trailing record yields no count
				next unless defined $count;
				$out->{$doc} += $count;
			}
		}
		$sth->finish;
	}
	return $out;
}
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# Install MyConText::parse_and_index_data_count under the name
# parse_and_index_data in this package (glob assignment of a code ref).
*parse_and_index_data = \&MyConText::parse_and_index_data_count; |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
1; |
|
250
|
|
|
|
|
|
|
|