line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package KinoSearch1::Index::MultiReader; |
2
|
34
|
|
|
34
|
|
200
|
use strict; |
|
34
|
|
|
|
|
77
|
|
|
34
|
|
|
|
|
2688
|
|
3
|
34
|
|
|
34
|
|
519
|
use warnings; |
|
34
|
|
|
|
|
81
|
|
|
34
|
|
|
|
|
1511
|
|
4
|
34
|
|
|
34
|
|
194
|
use KinoSearch1::Util::ToolSet; |
|
34
|
|
|
|
|
77
|
|
|
34
|
|
|
|
|
8526
|
|
5
|
34
|
|
|
34
|
|
202
|
use base qw( KinoSearch1::Index::IndexReader ); |
|
34
|
|
|
|
|
72
|
|
|
34
|
|
|
|
|
3796
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
BEGIN {
    # Instance variable names paired with their default values.  The list is
    # handed, in this order, to the init_instance_vars() class method (not
    # defined in this file -- presumably supplied by KinoSearch1::Util::Class;
    # confirm there for exact semantics).
    my @instance_var_defaults = (
        invindex    => undef,
        sub_readers => undef,
        starts      => undef,
        max_doc     => 0,
        norms_cache => undef,
    );
    __PACKAGE__->init_instance_vars(@instance_var_defaults);
}
16
|
|
|
|
|
|
|
|
17
|
34
|
|
|
34
|
|
235
|
use KinoSearch1::Index::FieldInfos; |
|
34
|
|
|
|
|
70
|
|
|
34
|
|
|
|
|
1689
|
|
18
|
34
|
|
|
34
|
|
192
|
use KinoSearch1::Index::SegReader; |
|
34
|
|
|
|
|
77
|
|
|
34
|
|
|
|
|
891
|
|
19
|
34
|
|
|
34
|
|
41025
|
use KinoSearch1::Index::MultiTermDocs; |
|
34
|
|
|
|
|
91
|
|
|
34
|
|
|
|
|
85425
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Constructor: alias new() directly to KinoSearch1::Util::Class::new.
# Note: IndexReader's new() cannot simply be inherited here without running
# into recursion problems.
*new = *KinoSearch1::Util::Class::new;
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Fill in empty containers for any unset sub_readers / starts / norms_cache
# slot, then compute the per-reader offsets via _init_sub_readers().
# Returns whatever _init_sub_readers() returns.
sub init_instance {
    my ($self) = @_;

    # Each pass evaluates [] afresh, so the two slots never share an array.
    for my $array_slot (qw( sub_readers starts )) {
        $self->{$array_slot} ||= [];
    }
    $self->{norms_cache} ||= {};

    $self->_init_sub_readers;
}
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Walk the sub-readers in order, recording for each one the running total of
# max_doc values of the readers before it (its starting offset), and store
# both the offsets list and the grand total on the object.
sub _init_sub_readers {
    my ($self) = @_;

    my $running_total = 0;
    my @offsets       = map {
        my $offset = $running_total;
        $running_total += $_->max_doc;
        $offset;
    } @{ $self->{sub_readers} };

    $self->{starts}  = \@offsets;
    $self->{max_doc} = $running_total;
}
45
|
|
|
|
|
|
|
|
46
|
3
|
|
|
3
|
0
|
134
|
# Accessor: the stored max_doc value (the sum computed by
# _init_sub_readers()).
sub max_doc {
    my ($self) = @_;
    return $self->{max_doc};
}
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Sum of num_docs() over every sub-reader.
sub num_docs {
    my ($self) = @_;

    my $total = 0;
    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        $total += $sub_reader->num_docs;
    }

    return $total;
}
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# Build a MultiTermDocs over this reader's sub-readers and offsets, seek it
# to $term, and return it.
sub term_docs {
    my $self = shift;
    my $term = shift;

    my @constructor_args = (
        sub_readers => $self->{sub_readers},
        starts      => $self->{starts},
    );
    my $multi_term_docs
        = KinoSearch1::Index::MultiTermDocs->new(@constructor_args);
    $multi_term_docs->seek($term);

    return $multi_term_docs;
}
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Sum of doc_freq($term) over every sub-reader.
sub doc_freq {
    my $self = shift;
    my $term = shift;

    my $total = 0;
    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        $total += $sub_reader->doc_freq($term);
    }

    return $total;
}
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Locate the sub-reader responsible for $doc_num, convert the number to that
# reader's local numbering by subtracting the reader's start offset, and
# delegate to the sub-reader's fetch_doc().
sub fetch_doc {
    my $self    = shift;
    my $doc_num = shift;

    my $tick       = $self->_reader_index($doc_num);
    my $local_num  = $doc_num - $self->{starts}[$tick];
    my $sub_reader = $self->{sub_readers}[$tick];

    return $sub_reader->fetch_doc($local_num);
}
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Forward delete_docs_by_term($term) to every sub-reader, in order.
sub delete_docs_by_term {
    my $self = shift;
    my $term = shift;

    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        $sub_reader->delete_docs_by_term($term);
    }
}
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Forward commit_deletions() to every sub-reader, in order.
sub commit_deletions {
    my ($self) = @_;

    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        $sub_reader->commit_deletions;
    }
}
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Determine which sub-reader a document resides in.
#
# Binary-searches the ascending list of start offsets for $doc_num and
# returns an index into it.  On an exact match, the index is advanced past
# any following entries that share the same start offset, so the last such
# entry is returned.  Otherwise the index of the last entry whose start is
# below $doc_num is returned; that is -1 when there is no such entry
# (including when the offsets list is empty).
sub _reader_index {
    my ( $self, $doc_num ) = @_;

    my $starts = $self->{starts};
    my $last   = $#{$starts};
    my $low    = 0;
    my $high   = $last;

    until ( $high < $low ) {
        my $probe       = ( $low + $high ) >> 1;
        my $probe_start = $starts->[$probe];

        if ( $doc_num < $probe_start ) {
            $high = $probe - 1;
            next;
        }
        if ( $doc_num > $probe_start ) {
            $low = $probe + 1;
            next;
        }

        # Exact hit: step over any later entries with an identical start.
        $probe++
            while $probe < $last && $starts->[ $probe + 1 ] == $probe_start;
        return $probe;
    }

    return $high;
}
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# Return a NormsReader for $field_num covering all sub-readers, building and
# caching it in norms_cache on first request.
#
# The bytes are the concatenation, in sub-reader order, of the bytes from
# each sub-reader's norms reader; sub-readers that return a false value from
# norms_reader() contribute nothing.
sub norms_reader {

    # TODO refactor and minimize copying
    my ( $self, $field_num ) = @_;

    return $self->{norms_cache}{$field_num}
        if exists $self->{norms_cache}{$field_num};

    my $bytes = '';
    for my $seg_reader ( @{ $self->{sub_readers} } ) {
        my $seg_norms_reader = $seg_reader->norms_reader($field_num);
        next unless $seg_norms_reader;
        $bytes .= ${ $seg_norms_reader->get_bytes };
    }

    my $norms_reader = KinoSearch1::Index::NormsReader->new(
        bytes   => $bytes,
        max_doc => $self->max_doc,
    );
    $self->{norms_cache}{$field_num} = $norms_reader;

    return $norms_reader;
}
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Create a fresh FieldInfos object and consolidate into it the field infos
# generated by each sub-reader (in sub-reader order).
sub generate_field_infos {
    my ($self) = @_;

    my $merged_finfos = KinoSearch1::Index::FieldInfos->new;

    my @per_reader_finfos;
    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        push @per_reader_finfos, $sub_reader->generate_field_infos;
    }
    $merged_finfos->consolidate(@per_reader_finfos);

    return $merged_finfos;
}
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Return an array ref holding the distinct field names reported by all
# sub-readers.  The order of the names is whatever keys() yields.
sub get_field_names {
    my ($self) = @_;

    my %seen;
    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        my $names = $sub_reader->get_field_names;
        $seen{$_} = 1 for @$names;
    }

    return [ keys %seen ];
}
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Choose which sub-readers should be merged.
#
# Arguments: $all -- if true, every sub-reader is returned.
#
# Returns nothing when there are no sub-readers.  Otherwise the sub-readers
# are ordered by ascending num_docs and scanned while keeping a running
# total of their doc counts; position $i is marked whenever the running
# total is below fibonacci( $i + 5 ).  The readers from the smallest up to
# the last marked position are returned, or nothing if no position was
# marked.
#
# num_docs() is called exactly once per sub-reader; the value is reused for
# both the sort and the running total.
sub segreaders_to_merge {
    my ( $self, $all ) = @_;
    return unless @{ $self->{sub_readers} };
    return @{ $self->{sub_readers} } if $all;

    # sort by ascending size in docs, computing each size only once
    my @sized_readers = sort { $a->[0] <=> $b->[0] }
        map { [ scalar( $_->num_docs ), $_ ] } @{ $self->{sub_readers} };
    my @sorted_sub_readers = map { $_->[1] } @sized_readers;

    # find sparsely populated segments
    my $total_docs = 0;
    my $threshold  = -1;
    for my $i ( 0 .. $#sized_readers ) {
        $total_docs += $sized_readers[$i][0];
        if ( $total_docs < fibonacci( $i + 5 ) ) {
            $threshold = $i;
        }
    }

    # if any of the segments are sparse, return their readers
    if ( $threshold > -1 ) {
        return @sorted_sub_readers[ 0 .. $threshold ];
    }
    else {
        return;
    }
}
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# Generate fibonacci series.
#
# Results are memoized in %fibo_cache, keyed by the argument.  Any argument
# below 2 is returned unchanged; otherwise the result is the sum of the two
# preceding values.
my %fibo_cache;

sub fibonacci {
    my ($n) = @_;

    if ( !exists $fibo_cache{$n} ) {
        $fibo_cache{$n}
            = $n < 2
            ? $n
            : fibonacci( $n - 1 ) + fibonacci( $n - 2 );
    }

    return $fibo_cache{$n};
}
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# Call close() on every sub-reader, but only when the object's
# close_invindex slot is true; otherwise do nothing.
# NOTE(review): close_invindex is not among the instance variables declared
# in this file -- presumably it comes from the base class; confirm.
sub close {
    my ($self) = @_;
    return unless $self->{close_invindex};

    for my $sub_reader ( @{ $self->{sub_readers} } ) {
        $sub_reader->close;
    }
}
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
1; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
__END__ |