line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::Index::SegmentsReader; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Plucene::Index::SegmentsReader - reads the segments |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
my $segs_reader = Plucene::Index::SegmentsReader |
10
|
|
|
|
|
|
|
->new($dir, Plucene::Index::SegmentReader @readers); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
my $num_docs = $segs_reader->num_docs; |
13
|
|
|
|
|
|
|
my $doc = $segs_reader->document($id); |
14
|
|
|
|
|
|
|
my $norms = $seg_reader->norms($field); |
15
|
|
|
|
|
|
|
my $doc_freq = $segs_reader->doc_freq($term); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my Plucene::Index::SegmentsTermEnum $term_enum |
18
|
|
|
|
|
|
|
= $segs_reader->terms($term); |
19
|
|
|
|
|
|
|
my Plucene::Index::SegmentsTermDocs $term_docs |
20
|
|
|
|
|
|
|
= $segs_reader->term_docs; |
21
|
|
|
|
|
|
|
my Plucene::Index::SegmentsTermPositions $term_positions |
22
|
|
|
|
|
|
|
= $segs_reader->term_positions; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
if ($segs_reader->is_deleted($id)) { ... } |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 DESCRIPTION |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This is the segments reader class. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 METHODS |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=cut |
33
|
|
|
|
|
|
|
|
34
|
18
|
|
|
18
|
|
95
|
use strict; |
|
18
|
|
|
|
|
38
|
|
|
18
|
|
|
|
|
576
|
|
35
|
18
|
|
|
18
|
|
90
|
use warnings; |
|
18
|
|
|
|
|
44
|
|
|
18
|
|
|
|
|
493
|
|
36
|
|
|
|
|
|
|
|
37
|
18
|
|
|
18
|
|
99
|
use List::Util qw(sum); |
|
18
|
|
|
|
|
62
|
|
|
18
|
|
|
|
|
1770
|
|
38
|
18
|
|
|
18
|
|
10561
|
use Plucene::Index::SegmentsTermEnum; |
|
18
|
|
|
|
|
68
|
|
|
18
|
|
|
|
|
638
|
|
39
|
|
|
|
|
|
|
|
40
|
18
|
|
|
18
|
|
259
|
use base qw(Plucene::Index::Reader Class::Accessor::Fast); |
|
18
|
|
|
|
|
45
|
|
|
18
|
|
|
|
|
2631
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
__PACKAGE__->mk_accessors(qw(max_doc)); |
43
|
|
|
|
|
|
|
|
44
|
18
|
|
|
18
|
|
173
|
use Memoize; |
|
18
|
|
|
|
|
44
|
|
|
18
|
|
|
|
|
32410
|
|
45
|
|
|
|
|
|
|
memoize("norms"); # Saves messing with normsCache |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head2 new |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
my $segs_reader = Plucene::Index::SegmentsReader |
50
|
|
|
|
|
|
|
->new($dir, Plucene::Index::SegmentReader @readers); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
This will create a new Plucene::Index::SegmentsReader object with the passed |
53
|
|
|
|
|
|
|
directory and Plucene::Index::SegmentReader objects. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=cut |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub new { |
58
|
41
|
|
|
41
|
1
|
220
|
my ($class, $dir, @readers) = @_; |
59
|
41
|
|
|
|
|
403
|
my $self = $class->SUPER::new($dir); |
60
|
41
|
|
|
|
|
263
|
$self->{readers} = \@readers; |
61
|
41
|
|
|
|
|
153
|
$self->{max_doc} = 0; |
62
|
41
|
|
|
|
|
147
|
for my $reader (@readers) { |
63
|
129
|
|
|
|
|
228
|
push @{ $self->{starts} }, $self->{max_doc}; |
|
129
|
|
|
|
|
387
|
|
64
|
129
|
|
|
|
|
569
|
$self->{max_doc} += $reader->max_doc; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
41
|
|
|
|
|
886
|
return $self; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head2 num_docs |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
my $num_docs = $segs_reader->num_docs; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
This will return the number of documents in all the segments in the Reader. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub num_docs { |
79
|
2
|
|
|
2
|
1
|
804
|
my $self = shift; |
80
|
2
|
100
|
|
|
|
12
|
return $self->{num_docs} if exists $self->{num_docs}; |
81
|
1
|
|
|
|
|
2
|
return $self->{num_docs} = sum(map $_->num_docs, @{ $self->{readers} }); |
|
1
|
|
|
|
|
5
|
|
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head2 document |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
my $doc = $segs_reader->document($id); |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
This will return the document at the passed document id. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=cut |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub document { |
93
|
110
|
|
|
110
|
1
|
749
|
my ($self, $n) = @_; |
94
|
110
|
|
|
|
|
347
|
my $i = $self->_reader_index($n); |
95
|
110
|
|
|
|
|
597
|
return $self->{readers}[$i]->document($n - $self->{starts}[$i]); |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=head2 is_deleted |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
if ($segs_reader->is_deleted($id)) { ... } |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=cut |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub is_deleted { |
105
|
2
|
|
|
2
|
1
|
11
|
my ($self, $n) = @_; |
106
|
2
|
|
|
|
|
8
|
my $i = $self->_reader_index($n); |
107
|
2
|
|
|
|
|
13
|
return $self->{readers}[$i]->is_deleted($n - $self->{starts}[$i]); |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub _do_delete { |
111
|
1
|
|
|
1
|
|
3
|
my ($self, $n) = @_; |
112
|
1
|
|
|
|
|
2
|
delete $self->{num_docs}; # Invalidate cache |
113
|
1
|
|
|
|
|
6
|
my $i = $self->_reader_index($n); |
114
|
1
|
|
|
|
|
7
|
return $self->{readers}[$i]->_do_delete($n - $self->{starts}[$i]); |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub _do_close { |
118
|
2
|
|
|
2
|
|
5
|
my $self = shift; |
119
|
2
|
|
|
|
|
4
|
$_->close for @{ $self->{readers} }; |
|
2
|
|
|
|
|
15
|
|
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
sub _reader_index { |
123
|
113
|
|
|
113
|
|
210
|
my ($self, $n) = @_; |
124
|
113
|
|
|
|
|
207
|
my ($lo, $hi) = (0, $#{ $self->{readers} }); |
|
113
|
|
|
|
|
326
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Binary search |
127
|
113
|
|
|
|
|
371
|
while ($hi >= $lo) { |
128
|
218
|
|
|
|
|
420
|
my $mid = int(($lo + $hi) / 2); |
129
|
218
|
|
|
|
|
422
|
my $mid_val = $self->{starts}[$mid]; |
130
|
218
|
100
|
|
|
|
618
|
if ($n < $mid_val) { $hi = $mid - 1; } |
|
88
|
100
|
|
|
|
256
|
|
131
|
115
|
|
|
|
|
312
|
elsif ($n > $mid_val) { $lo = $mid + 1; } |
132
|
15
|
|
|
|
|
47
|
else { return $mid; } |
133
|
|
|
|
|
|
|
} |
134
|
98
|
|
|
|
|
201
|
return $hi; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head2 norms |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
my $norms = $seg_reader->norms($field); |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
This returns the norms for the passed field. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=cut |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub norms { |
146
|
|
|
|
|
|
|
my ($self, $field) = @_; |
147
|
|
|
|
|
|
|
my $bytes = "\0" x $self->max_doc; |
148
|
|
|
|
|
|
|
for my $i (0 .. $#{ $self->{readers} }) { |
149
|
|
|
|
|
|
|
my $norm = $self->{readers}[$i]->norms($field); |
150
|
|
|
|
|
|
|
substr($bytes, $self->{starts}[$i], length $norm) = $norm; |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
return $bytes; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head2 terms |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
my Plucene::Index::SegmentsTermEnum $term_enum |
158
|
|
|
|
|
|
|
= $segs_reader->terms($term); |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
This will return the Plucene::Index::SegmentsTermEnum onject for the |
161
|
|
|
|
|
|
|
passed in term. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=cut |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
sub terms { |
166
|
1
|
|
|
1
|
1
|
9
|
my ($self, $term) = @_; |
167
|
1
|
|
|
|
|
12
|
return Plucene::Index::SegmentsTermEnum->new($self->{readers}, |
168
|
|
|
|
|
|
|
$self->{starts}, $term); |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head2 doc_freq |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
my $doc_freq = $segs_reader->doc_freq($term); |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
This returns the number of documents containing the passed term. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=cut |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub doc_freq { |
180
|
51
|
|
|
51
|
1
|
133
|
my ($self, $term) = @_; |
181
|
51
|
|
|
|
|
144
|
return sum map $_->doc_freq($term), @{ $self->{readers} }; |
|
51
|
|
|
|
|
329
|
|
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
=head2 term_docs |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
my Plucene::Index::SegmentsTermDocs $term_docs |
187
|
|
|
|
|
|
|
= $segs_reader->term_docs; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
This will return the Plucene::Index::SegmentsTermDocs object. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=cut |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub term_docs { |
194
|
50
|
|
|
50
|
1
|
757
|
my $self = shift; |
195
|
50
|
|
|
|
|
134
|
my $term = shift; |
196
|
50
|
|
|
|
|
461
|
my $docs = |
197
|
|
|
|
|
|
|
Plucene::Index::SegmentsTermDocs->new($self->{readers}, $self->{starts}); |
198
|
50
|
100
|
|
|
|
206
|
if ($term) { $docs->seek($term) } |
|
49
|
|
|
|
|
199
|
|
199
|
50
|
|
|
|
|
177
|
return $docs; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head2 term_positions |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
my Plucene::Index::SegmentsTermPositions $term_positions |
206
|
|
|
|
|
|
|
= $segs_reader->term_positions; |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
This will return the Plucene::Index::SegmentsTermPositions object. |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=cut |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
sub term_positions { |
213
|
7
|
|
|
7
|
1
|
14
|
my $self = shift; |
214
|
7
|
|
|
|
|
16
|
my $term = shift; |
215
|
7
|
|
|
|
|
59
|
my $pos = |
216
|
|
|
|
|
|
|
Plucene::Index::SegmentsTermPositions->new($self->{readers}, |
217
|
|
|
|
|
|
|
$self->{starts}); |
218
|
7
|
100
|
|
|
|
25
|
if ($term) { $pos->seek($term) } |
|
6
|
|
|
|
|
26
|
|
219
|
7
|
|
|
|
|
23
|
return $pos; |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
package Plucene::Index::SegmentsTermDocs; |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub new { |
225
|
57
|
|
|
57
|
|
182
|
my ($class, $readers, $starts) = @_; |
226
|
57
|
|
|
|
|
567
|
bless { |
227
|
|
|
|
|
|
|
readers => $readers, |
228
|
|
|
|
|
|
|
starts => $starts, |
229
|
|
|
|
|
|
|
seg_term_docs => [], |
230
|
|
|
|
|
|
|
base => 0, |
231
|
|
|
|
|
|
|
pointer => 0, |
232
|
|
|
|
|
|
|
current => undef, |
233
|
|
|
|
|
|
|
}, $class; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub doc { |
237
|
105
|
|
|
105
|
|
566
|
my $self = shift; |
238
|
105
|
|
|
|
|
370
|
return $self->{base} + $self->{current}->doc; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
96
|
|
|
96
|
|
757
|
sub freq { return shift->{current}->freq; } |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
sub seek { |
244
|
55
|
|
|
55
|
|
150
|
my ($self, $term) = @_; |
245
|
55
|
|
|
|
|
200
|
$self->{term} = $term; |
246
|
55
|
|
|
|
|
120
|
$self->{base} = 0; |
247
|
55
|
|
|
|
|
122
|
$self->{pointer} = 0; |
248
|
55
|
|
|
|
|
128
|
$self->{current} = undef; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
223
|
|
|
223
|
|
537
|
sub _at_end { $_[0]->{pointer} >= @{ $_[0]->{readers} } } |
|
223
|
|
|
|
|
1350
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub _set_base_and_advance { |
254
|
171
|
|
|
171
|
|
315
|
my $self = shift; |
255
|
171
|
|
|
|
|
499
|
$self->{base} = $self->{starts}[ $self->{pointer} ]; |
256
|
171
|
|
|
|
|
564
|
$self->{current} = $self->term_docs($self->{pointer}++); |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
sub next { |
260
|
132
|
|
|
132
|
|
193
|
my $self = shift; |
261
|
132
|
100
|
100
|
|
|
702
|
return 1 if $self->{current} && $self->{current}->next; |
262
|
27
|
100
|
|
|
|
151
|
unless ($self->_at_end) { |
263
|
23
|
|
|
|
|
90
|
$self->_set_base_and_advance; |
264
|
23
|
|
|
|
|
104
|
return $self->next; |
265
|
|
|
|
|
|
|
} |
266
|
4
|
|
|
|
|
29
|
return 0; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub read { |
270
|
112
|
|
|
112
|
|
247
|
my ($self) = @_; |
271
|
112
|
|
|
|
|
325
|
my ($docs, $freqs) = ([], []); |
272
|
112
|
|
|
|
|
206
|
while (1) { |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Get a .current, somehow |
275
|
260
|
|
|
|
|
873
|
while (!$self->{current}) { |
276
|
196
|
100
|
|
|
|
591
|
goto done if $self->_at_end; # Don't fall off |
277
|
148
|
|
|
|
|
490
|
$self->_set_base_and_advance; |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
212
|
|
|
|
|
955
|
my ($new_docs, $new_freqs) = $self->{current}->read($docs, $freqs); |
281
|
212
|
100
|
|
|
|
721
|
if (!scalar @$new_docs) { |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# It's empty |
284
|
148
|
|
|
|
|
426
|
undef $self->{current}; |
285
|
|
|
|
|
|
|
} else { |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# Correct the doc positions to the appropriate base |
288
|
64
|
|
|
|
|
386
|
$_ += $self->{base} for @$new_docs; |
289
|
64
|
|
|
|
|
727
|
push @$docs, @$new_docs; |
290
|
64
|
|
|
|
|
164
|
push @$freqs, @$new_freqs; |
291
|
64
|
|
|
|
|
555
|
goto done; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
112
|
|
|
|
|
511
|
done: return ($docs, $freqs); |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
sub skip_to { |
299
|
0
|
|
|
0
|
|
0
|
my ($self, $target) = @_; |
300
|
0
|
|
0
|
|
|
0
|
$self->next || return while $target > $self->doc; |
301
|
0
|
|
|
|
|
0
|
return 1; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
sub term_docs { |
305
|
171
|
|
|
171
|
|
355
|
my ($self, $i) = @_; |
306
|
171
|
50
|
|
|
|
599
|
return unless $self->{term}; |
307
|
171
|
50
|
|
|
|
1161
|
$self->{seg_term_docs}[$i] = $self->term_docs_r($self->{readers}[$i]) |
308
|
|
|
|
|
|
|
unless exists $self->{seg_term_docs}[$i]; |
309
|
171
|
|
|
|
|
551
|
my $result = $self->{seg_term_docs}[$i]; |
310
|
171
|
|
|
|
|
674
|
$result->seek($self->{term}); |
311
|
171
|
|
|
|
|
1594
|
return $result; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
sub term_docs_r { |
315
|
153
|
|
|
153
|
|
305
|
my ($self, $reader) = @_; |
316
|
153
|
|
|
|
|
614
|
return $reader->term_docs; |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
package Plucene::Index::SegmentsTermPositions; |
320
|
18
|
|
|
18
|
|
133
|
use base 'Plucene::Index::SegmentsTermDocs'; |
|
18
|
|
|
|
|
63
|
|
|
18
|
|
|
|
|
14315
|
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub term_docs_r { |
323
|
18
|
|
|
18
|
|
39
|
my ($self, $reader) = @_; |
324
|
18
|
|
|
|
|
103
|
return $reader->term_positions; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
sub next_position { |
328
|
421
|
|
|
421
|
|
3719
|
return shift->{current}->next_position; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
1; |