line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::Index::SegmentReader; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Plucene::Index::SegmentReader - the Segment reader |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
my $seg_reader = |
10
|
|
|
|
|
|
|
Plucene::Index::SegmentReader->new( Plucene::Index::SegmentInfo $si); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
my @files = $seg_reader->files; |
13
|
|
|
|
|
|
|
my @terms = $seg_reader->terms; |
14
|
|
|
|
|
|
|
my $doc = $seg_reader->document($id); |
15
|
|
|
|
|
|
|
my $doc_freq = $seg_reader->doc_freq($term); |
16
|
|
|
|
|
|
|
my $max_doc = $seg_reader->max_doc; |
17
|
|
|
|
|
|
|
my $norms = $seg_reader->norms($field, $offset); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
my Plucene::Index::SegmentTermDocs $docs |
20
|
|
|
|
|
|
|
= $seg_reader->term_docs($term); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my Plucene::Index::SegmentTermPositions $pos |
23
|
|
|
|
|
|
|
= $seg_reader->term_positions($term); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
my Plucene::Store::InputStream $stream |
26
|
|
|
|
|
|
|
= $seg_reader->norm_stream($field); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
if ($seg_reader->is_deleted($id)) { .. } |
29
|
|
|
|
|
|
|
if ($seg_reader->has_deletions(Plucene::Index::SegmentInfo $si)) |
30
|
|
|
|
|
|
|
{ ... } |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
The segment reader class. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 METHODS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=cut |
39
|
|
|
|
|
|
|
|
40
|
18
|
|
|
18
|
|
98
|
use strict; |
|
18
|
|
|
|
|
57
|
|
|
18
|
|
|
|
|
647
|
|
41
|
18
|
|
|
18
|
|
92
|
use warnings; |
|
18
|
|
|
|
|
99
|
|
|
18
|
|
|
|
|
471
|
|
42
|
|
|
|
|
|
|
|
43
|
18
|
|
|
18
|
|
20246
|
use File::Slurp; |
|
18
|
|
|
|
|
354111
|
|
|
18
|
|
|
|
|
1714
|
|
44
|
18
|
|
|
18
|
|
27298
|
use Plucene::Bitvector; |
|
18
|
|
|
|
|
56
|
|
|
18
|
|
|
|
|
614
|
|
45
|
18
|
|
|
18
|
|
11236
|
use Plucene::Index::FieldInfos; |
|
18
|
|
|
|
|
61
|
|
|
18
|
|
|
|
|
582
|
|
46
|
18
|
|
|
18
|
|
12648
|
use Plucene::Index::FieldsReader; |
|
18
|
|
|
|
|
62
|
|
|
18
|
|
|
|
|
801
|
|
47
|
18
|
|
|
18
|
|
11151
|
use Plucene::Index::SegmentTermDocs; |
|
18
|
|
|
|
|
68
|
|
|
18
|
|
|
|
|
174
|
|
48
|
18
|
|
|
18
|
|
13156
|
use Plucene::Index::SegmentTermPositions; |
|
18
|
|
|
|
|
53
|
|
|
18
|
|
|
|
|
168
|
|
49
|
18
|
|
|
18
|
|
11043
|
use Plucene::Index::TermInfosReader; |
|
18
|
|
|
|
|
65
|
|
|
18
|
|
|
|
|
708
|
|
50
|
18
|
|
|
18
|
|
9016
|
use Plucene::Utils; |
|
18
|
|
|
|
|
47
|
|
|
18
|
|
|
|
|
922
|
|
51
|
18
|
|
|
18
|
|
119
|
use Plucene::Store::InputStream; |
|
18
|
|
|
|
|
34
|
|
|
18
|
|
|
|
|
387
|
|
52
|
18
|
|
|
18
|
|
114
|
use Plucene::Store::OutputStream; |
|
18
|
|
|
|
|
42
|
|
|
18
|
|
|
|
|
437
|
|
53
|
|
|
|
|
|
|
|
54
|
18
|
|
|
18
|
|
96
|
use base qw(Plucene::Index::Reader Class::Accessor::Fast); |
|
18
|
|
|
|
|
36
|
|
|
18
|
|
|
|
|
34610
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
__PACKAGE__->mk_accessors( |
57
|
|
|
|
|
|
|
qw(field_infos fields_reader deleted_docs freq_stream prox_stream directory) |
58
|
|
|
|
|
|
|
); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 new |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
my $seg_reader = |
63
|
|
|
|
|
|
|
Plucene::Index::SegmentReader->new( Plucene::Index::SegmentInfo $si); |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
This will create a new Plucene::Index::SegmentReader object. |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=cut |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub new { |
70
|
544
|
|
|
544
|
1
|
1144
|
my ($class, $si) = @_; |
71
|
544
|
|
|
|
|
2489
|
my $self = $class->SUPER::new($si->dir); |
72
|
544
|
|
|
|
|
2316
|
my $segment = $self->{segment} = $si->name; |
73
|
544
|
|
|
|
|
7623
|
$self->field_infos( |
74
|
|
|
|
|
|
|
Plucene::Index::FieldInfos->new($self->{directory}, "$segment.fnm")); |
75
|
544
|
|
|
|
|
7310
|
$self->fields_reader( |
76
|
|
|
|
|
|
|
Plucene::Index::FieldsReader->new( |
77
|
|
|
|
|
|
|
$self->{directory}, $segment, $self->{field_infos})); |
78
|
|
|
|
|
|
|
|
79
|
544
|
|
|
|
|
7676
|
$self->{tis} = |
80
|
|
|
|
|
|
|
Plucene::Index::TermInfosReader->new($self->{directory}, $segment, |
81
|
|
|
|
|
|
|
$self->{field_infos}); |
82
|
|
|
|
|
|
|
|
83
|
544
|
100
|
|
|
|
2102
|
if ($self->has_deletions($si)) { |
84
|
3
|
|
|
|
|
93
|
my $stream = |
85
|
|
|
|
|
|
|
Plucene::Store::InputStream->new("$self->{directory}/$segment.del"); |
86
|
3
|
|
|
|
|
25
|
$self->deleted_docs(Plucene::Bitvector->read($stream)); |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
$self->freq_stream( |
90
|
544
|
|
|
|
|
20927
|
[ unpack "(w)*", read_file("$self->{directory}/$segment.frq") ]); |
91
|
544
|
|
|
|
|
399191
|
$self->prox_stream( |
92
|
|
|
|
|
|
|
[ unpack "(w)*", read_file("$self->{directory}/$segment.prx") ]); |
93
|
544
|
|
|
|
|
1159983
|
$self->_open_norms; |
94
|
544
|
|
|
|
|
8166
|
return $self; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub _do_close { |
98
|
16
|
|
|
16
|
|
25
|
my $self = shift; |
99
|
16
|
100
|
|
|
|
68
|
if ($self->{deleted_docs_dirty}) { |
100
|
2
|
|
|
|
|
10
|
my $file = "$self->{directory}/$self->{segment}"; |
101
|
|
|
|
|
|
|
do_locked { |
102
|
2
|
|
|
2
|
|
24
|
my $stream = Plucene::Store::OutputStream->new($file . ".tmp"); |
103
|
2
|
|
|
|
|
10
|
$self->deleted_docs->write($stream); |
104
|
2
|
|
|
|
|
11
|
$stream->close; |
105
|
2
|
|
|
|
|
142
|
rename $file . ".tmp", $file . ".del"; |
106
|
|
|
|
|
|
|
} |
107
|
2
|
|
|
|
|
25
|
"$self->{directory}/commit.lock"; |
108
|
2
|
|
|
|
|
16
|
$self->{deleted_docs_dirty} = 0; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head2 has_deletions |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
if ($seg_reader->has_deletions(Plucene::Index::SegmentInfo $si)) |
115
|
|
|
|
|
|
|
{ ... } |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=cut |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub has_deletions { |
120
|
563
|
|
|
563
|
1
|
2334
|
-e ($_[1]->dir . "/" . $_[1]->name . ".del"); |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub _do_delete { |
124
|
41
|
|
|
41
|
|
58
|
my ($self, $doc_num) = @_; |
125
|
41
|
100
|
|
|
|
94
|
$self->{deleted_docs} = Plucene::Bitvector->new(size => $self->max_doc) |
126
|
|
|
|
|
|
|
unless $self->deleted_docs; |
127
|
41
|
|
|
|
|
293
|
$self->deleted_docs->set($doc_num); |
128
|
41
|
|
|
|
|
487
|
$self->{deleted_docs_dirty} = 1; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 files |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
my @files = $seg_reader->files; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=cut |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub files { |
138
|
274
|
|
|
274
|
1
|
445
|
my $self = shift; |
139
|
274
|
|
|
|
|
830
|
my $segment = $self->{segment}; |
140
|
274
|
|
|
|
|
2176
|
my @files = map "$segment.$_", qw( fnm fdx fdt tii tis frq prx); |
141
|
274
|
50
|
|
|
|
4936
|
push @files, "$segment.del" if -e "$self->{directory}/$segment.del"; |
142
|
274
|
|
|
|
|
1132
|
my @fi = $self->field_infos->fields; |
143
|
274
|
|
66
|
|
|
8164
|
($fi[$_]->is_indexed && push @files, "$segment.f$_") for 0 .. $#fi; |
144
|
274
|
|
|
|
|
15265
|
return @files; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=head2 terms |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
my @terms = $seg_reader->terms; |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=cut |
152
|
|
|
|
|
|
|
|
153
|
278
|
|
|
278
|
1
|
2804
|
sub terms { shift->{tis}->terms(@_) } |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head2 document |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
my $doc = $seg_reader->document($id); |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=cut |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
sub document { |
162
|
499
|
|
|
499
|
1
|
826
|
my ($self, $id) = @_; |
163
|
499
|
50
|
|
|
|
4414
|
die "Attempt to access deleted document $id" if $self->is_deleted($id); |
164
|
499
|
|
|
|
|
1806
|
return $self->{fields_reader}->doc($id); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head2 is_deleted |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
if ($seg_reader->is_deleted($id)) { .. } |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub is_deleted { |
174
|
1442
|
100
|
|
1442
|
1
|
7282
|
$_[0]->{deleted_docs} ? $_[0]->{deleted_docs}->get($_[1]) : 0; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head2 term_docs |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
my Plucene::Index::SegmentTermDocs $docs |
180
|
|
|
|
|
|
|
= $seg_reader->term_docs($term); |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
This will return the Plucene::Index::SegmentTermDocs object for the |
183
|
|
|
|
|
|
|
given term. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub term_docs { |
188
|
295
|
|
|
295
|
1
|
1235
|
my ($self, $term) = @_; |
189
|
295
|
|
|
|
|
2398
|
my $docs = Plucene::Index::SegmentTermDocs->new($self); |
190
|
295
|
100
|
|
|
|
4669
|
if ($term) { $docs->seek($term) } |
|
141
|
|
|
|
|
2402
|
|
191
|
295
|
|
|
|
|
2217
|
return $docs; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head2 term_positions |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
my Plucene::Index::SegmentTermPositions $pos |
197
|
|
|
|
|
|
|
= $seg_reader->term_positions($term); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This will return the Plucene::Index::SegmentTermPositions object for the |
200
|
|
|
|
|
|
|
given term. |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
=cut |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub term_positions { |
205
|
25
|
|
|
25
|
1
|
53
|
my ($self, $term) = @_; |
206
|
25
|
|
|
|
|
161
|
my $pos = Plucene::Index::SegmentTermPositions->new($self); |
207
|
25
|
100
|
|
|
|
107
|
$pos->seek($term) if $term; |
208
|
25
|
|
|
|
|
142
|
return $pos; |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=head2 doc_freq |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
my $doc_freq = $seg_reader->doc_freq($term); |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
This returns the number of documents containing the passed term. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=cut |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub doc_freq { |
221
|
304
|
|
|
304
|
1
|
943
|
my ($self, $term) = @_; |
222
|
304
|
100
|
|
|
|
1590
|
my $ti = $self->{tis}->get($term) or return 0; |
223
|
224
|
|
|
|
|
2629
|
return $ti->doc_freq; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=head2 num_docs |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
my $num_docs = $seg_reader->num_docs; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
This is the number of documents, excluding deleted ones. |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
=cut |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
sub num_docs { |
235
|
276
|
|
|
276
|
1
|
375
|
my $self = shift; |
236
|
276
|
|
|
|
|
551
|
my $num = $self->max_doc; |
237
|
276
|
100
|
|
|
|
837
|
$num -= $self->deleted_docs->count if $self->deleted_docs; |
238
|
276
|
|
|
|
|
1721
|
$num; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head2 max_doc |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
my $max_doc = $seg_reader->max_doc; |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=cut |
246
|
|
|
|
|
|
|
|
247
|
1629
|
|
|
1629
|
1
|
6318
|
sub max_doc { $_[0]->fields_reader->size; } |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 norms |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
my $norms = $seg_reader->norms($field, $offset); |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
This returns the byte-encoded normalisation factor for the passed |
254
|
|
|
|
|
|
|
field. This is used by the search code to score documents. |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Note we are not using the 'offset' and 'bytes' arguments per the Java. |
257
|
|
|
|
|
|
|
Instead, callers should use substr to put the result of "norms" into |
258
|
|
|
|
|
|
|
the appropriate place in a string. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=cut |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
sub norms { |
263
|
269
|
|
|
269
|
1
|
1894
|
my ($self, $field, $offset) = @_; |
264
|
269
|
50
|
|
|
|
1329
|
my $norm = $self->{norms}->{$field} or return; |
265
|
269
|
|
66
|
|
|
1854
|
return $norm->{bytes} ||= $self->_norm_read_from_stream($field); |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub _norm_read_from_stream { |
269
|
260
|
|
|
260
|
|
548
|
my ($self, $field) = @_; |
270
|
260
|
50
|
|
|
|
854
|
my $ns = $self->norm_stream($field) or return; |
271
|
260
|
|
|
|
|
968
|
$ns->read(my $output, $self->max_doc); |
272
|
260
|
|
|
|
|
1554
|
return $output; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=head2 norm_stream |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
my Plucene::Store::InputStream $stream |
278
|
|
|
|
|
|
|
= $seg_reader->norm_stream($field); |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
This will return the Plucene::Store::InputStream for the passed field. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=cut |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
sub norm_stream { |
285
|
658
|
|
|
658
|
1
|
3905
|
my ($self, $field) = @_; |
286
|
658
|
50
|
|
|
|
2748
|
my $norm = $self->{norms}->{$field} or return; |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# Clone the norm's filehandle |
289
|
658
|
|
|
|
|
2740
|
my $clon = $norm->{in}->clone; |
290
|
658
|
|
|
|
|
2500
|
$clon->seek(0, 0); |
291
|
658
|
|
|
|
|
7643
|
return $clon; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
sub _open_norms { |
295
|
544
|
|
|
544
|
|
1161
|
my $self = shift; |
296
|
544
|
|
|
|
|
2241
|
for my $fi (grep $_->is_indexed, $self->field_infos->fields) { |
297
|
891
|
|
|
|
|
32963
|
my $file = "$self->{directory}/$self->{segment}.f" . $fi->number; |
298
|
891
|
50
|
|
|
|
14387
|
my $fh = Plucene::Store::InputStream->new($file) or die $file . " :" . $!; |
299
|
891
|
|
|
|
|
5519
|
$self->{norms}{ $fi->name } = Plucene::Index::Norm->new($fh); |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
package Plucene::Index::Norm; |
304
|
|
|
|
|
|
|
|
305
|
891
|
|
|
891
|
|
25175
|
sub new { bless { in => $_[1] }, $_[0] } |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# They have bytes, too, but we're not worrying about that. |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
1; |