line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package MyConText::Blob; |
3
|
1
|
|
|
1
|
|
637
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
2098
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
# Open in the backend just sets the object |
6
|
|
|
|
|
|
|
sub open { |
7
|
0
|
|
|
0
|
0
|
|
my ($class, $ctx) = @_; |
8
|
0
|
|
|
|
|
|
return bless { 'ctx' => $ctx }, $class; |
9
|
|
|
|
|
|
|
} |
10
|
|
|
|
|
|
|
# Create creates the table(s) according to the parameters |
11
|
|
|
|
|
|
|
sub _create_tables { |
12
|
0
|
|
|
0
|
|
|
my ($class, $ctx) = @_; |
13
|
0
|
|
|
|
|
|
my $CREATE_DATA = <
|
14
|
|
|
|
|
|
|
create table $ctx->{'data_table'} ( |
15
|
|
|
|
|
|
|
word varchar($ctx->{'word_length'}) binary |
16
|
|
|
|
|
|
|
default '' not null, |
17
|
|
|
|
|
|
|
idx longblob default '' not null, |
18
|
|
|
|
|
|
|
primary key (word) |
19
|
|
|
|
|
|
|
) |
20
|
|
|
|
|
|
|
EOF |
21
|
0
|
|
|
|
|
|
my $dbh = $ctx->{'dbh'}; |
22
|
0
|
0
|
|
|
|
|
$dbh->do($CREATE_DATA) or return $dbh->errstr; |
23
|
0
|
|
|
|
|
|
push @{$ctx->{'created_tables'}}, $ctx->{'data_table'}; |
|
0
|
|
|
|
|
|
|
24
|
0
|
|
|
|
|
|
return; |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub add_document { |
28
|
0
|
|
|
0
|
0
|
|
my ($self, $id, $words) = @_; |
29
|
0
|
|
|
|
|
|
my $ctx = $self->{'ctx'}; |
30
|
0
|
|
|
|
|
|
my $dbh = $ctx->{'dbh'}; |
31
|
0
|
|
|
|
|
|
my $data_table = $ctx->{'data_table'}; |
32
|
|
|
|
|
|
|
|
33
|
0
|
0
|
|
|
|
|
my $update_sth = ( defined $self->{'adding_update_sth'} |
34
|
|
|
|
|
|
|
? $self->{'adding_update_sth'} |
35
|
|
|
|
|
|
|
: $self->{'adding_update_sth'} = $dbh->prepare( |
36
|
|
|
|
|
|
|
"update $data_table set idx = concat(idx, ?) |
37
|
|
|
|
|
|
|
where word = ?") ); |
38
|
|
|
|
|
|
|
|
39
|
0
|
0
|
|
|
|
|
my $insert_sth = ( defined $self->{'insert_sth'} |
40
|
|
|
|
|
|
|
? $self->{'insert_sth'} |
41
|
|
|
|
|
|
|
: $self->{'insert_sth'} = $dbh->prepare(" |
42
|
|
|
|
|
|
|
insert into $data_table values (?, ?)") ); |
43
|
|
|
|
|
|
|
|
44
|
0
|
|
|
|
|
|
my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}} |
45
|
|
|
|
|
|
|
. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}}; |
46
|
0
|
|
|
|
|
|
my $num_words = 0; |
47
|
0
|
|
|
|
|
|
for my $word ( keys %$words ) { |
48
|
|
|
|
|
|
|
### print STDERR "$word($id) adding\n"; |
49
|
|
|
|
|
|
|
# here we will want to parametrize the bit size of the |
50
|
|
|
|
|
|
|
# data |
51
|
0
|
|
|
|
|
|
my $value = pack $packstring, $id, $words->{$word}; |
52
|
0
|
|
|
|
|
|
my $rows = $update_sth->execute($value, $word); |
53
|
0
|
0
|
|
|
|
|
$insert_sth->execute($word, $value) if $rows == 0; |
54
|
0
|
|
|
|
|
|
$num_words += $words->{$word}; |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
0
|
|
|
|
|
|
return $num_words; |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub delete_document { |
61
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
62
|
0
|
|
|
|
|
|
for my $id (@_) { $self->update_document($id, {}); } |
|
0
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
sub update_document { |
65
|
0
|
|
|
0
|
0
|
|
my ($self, $id, $words) = @_; |
66
|
0
|
|
|
|
|
|
my $ctx = $self->{'ctx'}; |
67
|
0
|
|
|
|
|
|
my $dbh = $ctx->{'dbh'}; |
68
|
0
|
|
|
|
|
|
my $data_table = $ctx->{'data_table'}; |
69
|
|
|
|
|
|
|
|
70
|
0
|
0
|
|
|
|
|
my $insert_sth = ( defined $self->{'insert_sth'} |
71
|
|
|
|
|
|
|
? $self->{'insert_sth'} |
72
|
|
|
|
|
|
|
: $self->{'insert_sth'} = $dbh->prepare(" |
73
|
|
|
|
|
|
|
insert into $data_table values (?, ?)") ); |
74
|
|
|
|
|
|
|
|
75
|
0
|
0
|
|
|
|
|
my $update_sth = ( defined $self->{'update_update_sth'} |
76
|
|
|
|
|
|
|
? $self->{'update_update_sth'} |
77
|
|
|
|
|
|
|
: $self->{'update_update_sth'} = |
78
|
|
|
|
|
|
|
$dbh->prepare("update $data_table set idx = |
79
|
|
|
|
|
|
|
concat(substring(idx, 1, ?), ?, substring(idx, ?)) |
80
|
|
|
|
|
|
|
where word = ?") ); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
0
|
|
|
|
|
|
$dbh->do("lock tables $data_table write"); |
84
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
|
my $select_sth = $dbh->prepare("select word from $data_table"); |
86
|
0
|
|
|
|
|
|
$select_sth->execute; |
87
|
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
|
my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}} |
89
|
|
|
|
|
|
|
. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}}; |
90
|
0
|
|
|
|
|
|
my ($packnulls) = pack $packstring, 0, 0; |
91
|
0
|
|
|
|
|
|
my $packlength = length $packnulls; |
92
|
0
|
|
|
|
|
|
my $num_words = 0; |
93
|
0
|
|
|
|
|
|
while (my ($word) = $select_sth->fetchrow_array) { |
94
|
0
|
0
|
|
|
|
|
my $value = (defined $words->{$word} ? |
95
|
|
|
|
|
|
|
pack($packstring, $id, $words->{$word}) : ''); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# the method find_position finds the position of the |
98
|
|
|
|
|
|
|
# "record" for document $id with word $word; returned is |
99
|
|
|
|
|
|
|
# the position in bytes and yes/no values specifying if |
100
|
|
|
|
|
|
|
# the record is already present in the blob; if it is, |
101
|
|
|
|
|
|
|
# we need to replace it, otherwise just insert. |
102
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
|
my ($pos, $shift) = $self->find_position($word, $id); |
104
|
0
|
0
|
|
|
|
|
if (not defined $pos) { |
105
|
0
|
|
|
|
|
|
$insert_sth->execute($word, $value); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
else { |
108
|
0
|
|
|
|
|
|
my $spos = $pos + 1; # I'm not sure why this |
109
|
0
|
0
|
|
|
|
|
$spos += $packlength if $shift; |
110
|
0
|
|
|
|
|
|
$update_sth->execute($pos, $value, $spos, $word); |
111
|
|
|
|
|
|
|
} |
112
|
0
|
|
|
|
|
|
delete $words->{$word}; |
113
|
0
|
0
|
|
|
|
|
$num_words++ if defined $value; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
|
for my $word ( keys %$words ) { |
117
|
0
|
|
|
|
|
|
my $value = pack $packstring, $id, $words->{$word}; |
118
|
0
|
|
|
|
|
|
$insert_sth->execute($word, $value); |
119
|
0
|
|
|
|
|
|
$num_words++; |
120
|
|
|
|
|
|
|
} |
121
|
0
|
|
|
|
|
|
$dbh->do("unlock tables"); |
122
|
|
|
|
|
|
|
|
123
|
0
|
|
|
|
|
|
return $num_words; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
sub find_position { |
126
|
0
|
|
|
0
|
0
|
|
my ($self, $word, $id) = @_; |
127
|
|
|
|
|
|
|
# here, with the calculation of where in the blob we have the |
128
|
|
|
|
|
|
|
# docid and where the count of words and how long they are, we |
129
|
|
|
|
|
|
|
# should really look at the parameters (num of bits of various |
130
|
|
|
|
|
|
|
# structures and values) given to create |
131
|
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
my $ctx = $self->{'ctx'}; |
133
|
0
|
|
|
|
|
|
my $dbh = $ctx->{'dbh'}; |
134
|
0
|
|
|
|
|
|
my $data_table = $ctx->{'data_table'}; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Sth to read the length of the blob holding the document/count info |
137
|
0
|
0
|
|
|
|
|
my $get_length_sth = ( defined $self->{'get_length_sth'} |
138
|
|
|
|
|
|
|
? $self->{'get_length_sth'} |
139
|
|
|
|
|
|
|
: $self->{'get_length_sth'} = $dbh->prepare("select |
140
|
|
|
|
|
|
|
length(idx) from $data_table where word = ?")); |
141
|
0
|
|
|
|
|
|
my $length = $dbh->selectrow_array($get_length_sth, {}, $word); |
142
|
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
|
my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}} |
144
|
|
|
|
|
|
|
. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}}; |
145
|
0
|
|
|
|
|
|
my ($packnulls) = pack $packstring, 0, 0; |
146
|
0
|
|
|
|
|
|
my $packlength = length $packnulls; |
147
|
|
|
|
|
|
|
|
148
|
0
|
0
|
|
|
|
|
if (not defined $length) { return; } |
|
0
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
|
$length = int($length/$packlength); |
150
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
|
my ($bot, $top, $med, $val) = (0, $length); |
152
|
|
|
|
|
|
|
|
153
|
0
|
0
|
|
|
|
|
if (not defined $ctx->{'max_doc_id'}) |
154
|
0
|
|
|
|
|
|
{ $med = int(($top - $bot) / 2); } |
155
|
|
|
|
|
|
|
else |
156
|
0
|
|
|
|
|
|
{ $med = int($top * $id / $ctx->{'max_doc_id'}); } |
157
|
|
|
|
|
|
|
|
158
|
0
|
|
|
|
|
|
my $blob_direct_fetch = $ctx->{'blob_direct_fetch'}; |
159
|
|
|
|
|
|
|
# we divide the interval |
160
|
0
|
|
|
|
|
|
while ($bot != $top) { |
161
|
0
|
0
|
|
|
|
|
$med = $top - 1 if $med >= $top; |
162
|
0
|
0
|
|
|
|
|
$med = $bot if $med < $bot; |
163
|
|
|
|
|
|
|
|
164
|
0
|
0
|
|
|
|
|
if ($top - $bot <= $blob_direct_fetch) { |
165
|
0
|
0
|
|
|
|
|
my $get_interval_sth = ( |
166
|
|
|
|
|
|
|
defined $self->{'get_interval_sth'} |
167
|
|
|
|
|
|
|
? $self->{'get_interval_sth'} |
168
|
|
|
|
|
|
|
: $self->{'get_interval_sth'} = $dbh->prepare("select substring(idx,?,?) from $data_table where word = ?")); |
169
|
0
|
|
|
|
|
|
my $alldata = $dbh->selectrow_array($get_interval_sth, |
170
|
|
|
|
|
|
|
{}, |
171
|
|
|
|
|
|
|
$bot * $packlength + 1, |
172
|
|
|
|
|
|
|
($top - $bot) * $packlength, |
173
|
|
|
|
|
|
|
$word); |
174
|
0
|
0
|
|
|
|
|
return unless defined $alldata; |
175
|
|
|
|
|
|
|
|
176
|
0
|
|
|
|
|
|
my @docs; |
177
|
0
|
|
|
|
|
|
my $i = 0; |
178
|
0
|
|
|
|
|
|
while ($i < length $alldata) { |
179
|
0
|
|
|
|
|
|
push @docs, unpack $packstring, |
180
|
|
|
|
|
|
|
substr $alldata, $i, $packlength; |
181
|
0
|
|
|
|
|
|
$i += $packlength; |
182
|
|
|
|
|
|
|
} |
183
|
0
|
|
|
|
|
|
for (my $i = 0; $i < @docs; $i += 2) { |
184
|
0
|
0
|
|
|
|
|
if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); } |
|
0
|
|
|
|
|
|
|
185
|
0
|
0
|
|
|
|
|
if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); } |
|
0
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
} |
187
|
0
|
|
|
|
|
|
return ($top * $packlength, 0); |
188
|
|
|
|
|
|
|
} |
189
|
0
|
|
|
|
|
|
($val) = $dbh->selectrow_array( |
190
|
|
|
|
|
|
|
"select substring(idx, ?, 2) from $data_table |
191
|
|
|
|
|
|
|
where word = ?", {}, ($med * $packlength) + 1, $word); |
192
|
0
|
|
|
|
|
|
($val) = unpack $packstring, $val; |
193
|
|
|
|
|
|
|
|
194
|
0
|
0
|
|
|
|
|
if (not defined $val) { return; } |
|
0
|
|
|
|
|
|
|
195
|
0
|
0
|
|
|
|
|
if ($val == $id) { return ($med * $packlength, 1); } |
|
0
|
0
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
0
|
|
|
|
|
|
elsif ($val < $id) { $bot = $med + 1; } |
198
|
0
|
|
|
|
|
|
else { $top = $med; } |
199
|
|
|
|
|
|
|
|
200
|
0
|
|
|
|
|
|
$med = int($med * $id / $val); |
201
|
|
|
|
|
|
|
} |
202
|
0
|
|
|
|
|
|
return ($bot * $packlength, 0); |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub contains_hashref { |
206
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
207
|
0
|
|
|
|
|
|
my $ctx = $self->{'ctx'}; |
208
|
0
|
|
|
|
|
|
my $dbh = $ctx->{'dbh'}; |
209
|
0
|
|
|
|
|
|
my $data_table = $ctx->{'data_table'}; |
210
|
|
|
|
|
|
|
|
211
|
0
|
|
|
|
|
|
my $packstring = $MyConText::BITS_TO_PACK{$ctx->{'doc_id_bits'}} |
212
|
|
|
|
|
|
|
. $MyConText::BITS_TO_PACK{$ctx->{'count_bits'}}; |
213
|
0
|
|
|
|
|
|
my ($packnulls) = pack $packstring, 0, 0; |
214
|
0
|
|
|
|
|
|
my $packlength = length $packnulls; |
215
|
|
|
|
|
|
|
|
216
|
0
|
0
|
|
|
|
|
my $sth = ( defined $self->{'get_idx_sth'} ? |
217
|
|
|
|
|
|
|
$self->{'get_idx_sth'} : |
218
|
|
|
|
|
|
|
$self->{'get_idx_sth'} = |
219
|
|
|
|
|
|
|
$dbh->prepare( |
220
|
|
|
|
|
|
|
"select idx from $data_table where word like ?" |
221
|
|
|
|
|
|
|
)); |
222
|
|
|
|
|
|
|
|
223
|
0
|
|
|
|
|
|
my $out = {}; |
224
|
0
|
|
|
|
|
|
for my $word (@_) { |
225
|
0
|
|
|
|
|
|
$sth->execute($word); |
226
|
0
|
|
|
|
|
|
while (my ($blob) = $sth->fetchrow_array) { |
227
|
0
|
0
|
|
|
|
|
next unless defined $blob; |
228
|
0
|
|
|
|
|
|
my @data; |
229
|
0
|
|
|
|
|
|
my $i = 0; |
230
|
0
|
|
|
|
|
|
while ($i < length $blob) { |
231
|
0
|
|
|
|
|
|
push @data, unpack $packstring, |
232
|
|
|
|
|
|
|
substr $blob, $i, $packlength; |
233
|
0
|
|
|
|
|
|
$i += $packlength; |
234
|
|
|
|
|
|
|
} |
235
|
0
|
|
|
|
|
|
while (@data) { |
236
|
0
|
|
|
|
|
|
my $doc = shift @data; |
237
|
0
|
|
|
|
|
|
my $count = shift @data; |
238
|
0
|
0
|
|
|
|
|
unless (defined $out->{$doc}) { $out->{$doc} = 0; } |
|
0
|
|
|
|
|
|
|
239
|
0
|
|
|
|
|
|
$out->{$doc} += $count; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
} |
242
|
0
|
|
|
|
|
|
$sth->finish; |
243
|
|
|
|
|
|
|
} |
244
|
0
|
|
|
|
|
|
$out; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
*parse_and_index_data = \&MyConText::parse_and_index_data_count; |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
1; |
250
|
|
|
|
|
|
|
|