line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package KinoSearch1::Index::SegWriter; |
2
|
34
|
|
|
34
|
|
196
|
use strict; |
|
34
|
|
|
|
|
73
|
|
|
34
|
|
|
|
|
1260
|
|
3
|
34
|
|
|
34
|
|
196
|
use warnings; |
|
34
|
|
|
|
|
75
|
|
|
34
|
|
|
|
|
859
|
|
4
|
34
|
|
|
34
|
|
188
|
use KinoSearch1::Util::ToolSet; |
|
34
|
|
|
|
|
74
|
|
|
34
|
|
|
|
|
4799
|
|
5
|
34
|
|
|
34
|
|
327
|
use base qw( KinoSearch1::Util::Class ); |
|
34
|
|
|
|
|
77
|
|
|
34
|
|
|
|
|
4525
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
BEGIN {
    # Declare the instance variables for this class.  The first four are
    # accepted as constructor params; the rest are initialized internally.
    __PACKAGE__->init_instance_vars(
        # constructor params / members
        invindex   => undef,
        seg_name   => undef,
        finfos     => undef,
        field_sims => undef,
        # members
        norm_outstreams => undef,
        fields_writer   => undef,
        postings_writer => undef,
        doc_count       => 0,
    );

    # Generate read-only accessors: get_seg_name(), get_doc_count().
    __PACKAGE__->ready_get(qw( seg_name doc_count ));
}
22
|
|
|
|
|
|
|
|
23
|
34
|
|
|
34
|
|
14205
|
use KinoSearch1::Analysis::TokenBatch; |
|
34
|
|
|
|
|
94
|
|
|
34
|
|
|
|
|
903
|
|
24
|
34
|
|
|
34
|
|
36190
|
use KinoSearch1::Index::FieldsWriter; |
|
34
|
|
|
|
|
105
|
|
|
34
|
|
|
|
|
959
|
|
25
|
34
|
|
|
34
|
|
21613
|
use KinoSearch1::Index::PostingsWriter; |
|
34
|
|
|
|
|
117
|
|
|
34
|
|
|
|
|
1271
|
|
26
|
34
|
|
|
34
|
|
20089
|
use KinoSearch1::Index::CompoundFileWriter; |
|
34
|
|
|
|
|
111
|
|
|
34
|
|
|
|
|
1018
|
|
27
|
|
|
|
|
|
|
use KinoSearch1::Index::IndexFileNames |
28
|
34
|
|
|
34
|
|
457
|
qw( @COMPOUND_EXTENSIONS SORTFILE_EXTENSION ); |
|
34
|
|
|
|
|
73
|
|
|
34
|
|
|
|
|
48156
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub init_instance {
    my $self     = shift;
    my $invindex = $self->{invindex};
    my $seg_name = $self->{seg_name};
    my $finfos   = $self->{finfos};

    # Open one norms outstream per indexed field, clobbering any leftover
    # norms file from an earlier run.  The array is indexed by field number.
    my $outstreams = [];
    $self->{norm_outstreams} = $outstreams;
    for my $field_info ( $finfos->get_infos ) {
        next unless $field_info->get_indexed;
        my $field_num = $field_info->get_field_num;
        my $filename  = "$seg_name.f$field_num";
        if ( $invindex->file_exists($filename) ) {
            $invindex->delete_file($filename);
        }
        $outstreams->[$field_num] = $invindex->open_outstream($filename);
    }

    # Writer for stored fields.
    $self->{fields_writer} = KinoSearch1::Index::FieldsWriter->new(
        invindex => $invindex,
        seg_name => $seg_name,
    );

    # Writer for the postings data (term dictionary, positions).
    $self->{postings_writer} = KinoSearch1::Index::PostingsWriter->new(
        invindex => $invindex,
        seg_name => $seg_name,
    );
}
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Add a document to the segment. |
60
|
|
|
|
|
|
|
# Add a document to the segment: invert each indexed field, write its norm
# byte, feed postings to the PostingsWriter, then store the doc's fields.
sub add_doc {
    my ( $self, $doc ) = @_;
    my $norm_outstreams = $self->{norm_outstreams};
    my $field_sims      = $self->{field_sims};
    my $doc_boost       = $doc->get_boost;
    # NOTE: a dead local reading $self->{postings_cache} was removed here --
    # it was never used, and no such instance var is declared in BEGIN.

    for my $indexed_field ( grep { $_->get_indexed } $doc->get_fields ) {
        my $field_name  = $indexed_field->get_name;
        my $token_batch = KinoSearch1::Analysis::TokenBatch->new;

        # if the field has content, put it in the TokenBatch
        if ( $indexed_field->get_value_len ) {
            $token_batch->append( $indexed_field->get_value, 0,
                $indexed_field->get_value_len );
        }

        # analyze the field
        if ( $indexed_field->get_analyzed ) {
            $token_batch
                = $indexed_field->get_analyzer()->analyze($token_batch);
        }

        # invert the doc
        $token_batch->build_posting_list( $self->{doc_count},
            $indexed_field->get_field_num );

        # prepare to store the term vector, if the field is vectorized
        if ( $indexed_field->get_vectorized and $indexed_field->get_stored ) {
            $indexed_field->set_tv_string( $token_batch->get_tv_string );
        }

        # encode a norm into a byte, write it to an outstream
        my $norm_val
            = $doc_boost
            * $indexed_field->get_boost
            * $field_sims->{$field_name}
            ->lengthnorm( $token_batch->get_size );
        my $outstream = $norm_outstreams->[ $indexed_field->get_field_num ];
        $outstream->lu_write( 'a',
            $field_sims->{$field_name}->encode_norm($norm_val) );

        # feed PostingsWriter
        $self->{postings_writer}->add_postings( $token_batch->get_postings );
    }

    # store fields
    $self->{fields_writer}->add_doc($doc);

    $self->{doc_count}++;
}
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# Bulk-add every surviving document from an existing segment to this one.
sub add_segment {
    my ( $self, $seg_reader ) = @_;

    # Build a map from the source segment's doc numbers to ours, skipping
    # deleted docs, plus a map reconciling the two segments' field numbers.
    my $deldocs = $seg_reader->get_deldocs;
    my $doc_map = $deldocs->generate_doc_map( $seg_reader->max_doc,
        $self->{doc_count} );
    my $field_num_map
        = $self->{finfos}->generate_field_num_map( $seg_reader->get_finfos );

    # Hand the whole slab of documents to each sub-writer in turn.
    $self->_merge_norms( $seg_reader, $doc_map );
    $self->{fields_writer}
        ->add_segment( $seg_reader, $doc_map, $field_num_map );
    $self->{postings_writer}->add_segment( $seg_reader, $doc_map );

    # Only non-deleted docs count toward our total.
    $self->{doc_count} += $seg_reader->num_docs;
}
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Bulk write norms. |
132
|
|
|
|
|
|
|
# Bulk write norms for every indexed field when absorbing a segment.
sub _merge_norms {
    my ( $self, $seg_reader, $doc_map ) = @_;
    my $outstreams = $self->{norm_outstreams};
    my $sims       = $self->{field_sims};

    for my $field ( grep { $_->get_indexed } $self->{finfos}->get_infos ) {
        my $name         = $field->get_name;
        my $outstream    = $outstreams->[ $field->get_field_num ];
        my $norms_reader = $seg_reader->norms_reader($name);

        if ( defined $norms_reader ) {
            # The field was indexed in the source segment: copy its norms,
            # remapping doc numbers to account for deletions.
            _write_remapped_norms( $outstream, $doc_map,
                $norms_reader->get_bytes );
        }
        else {
            # The field isn't in the input segment, so write one default
            # norm byte (lengthnorm of zero tokens) per incoming doc.
            my $zero_byte
                = $sims->{$name}->encode_norm( $sims->{$name}->lengthnorm(0) );
            my $num_docs = $seg_reader->num_docs;
            $outstream->lu_write( "a$num_docs", $zero_byte x $num_docs );
        }
    }
}
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Finish writing the segment. |
160
|
|
|
|
|
|
|
# Finish writing the segment: flush postings and field infos, close all
# sub-writers, consolidate everything into a compound file, and clean up.
sub finish {
    my $self = shift;
    my ( $invindex, $seg_name ) = @{$self}{ 'invindex', 'seg_name' };

    # write Term Dictionary, positions.
    $self->{postings_writer}->write_postings;

    # write FieldInfos, replacing any stale .fnm file first.
    my $fnm_file = "$seg_name.fnm";
    if ( $invindex->file_exists($fnm_file) ) {
        $invindex->delete_file($fnm_file);
    }
    my $finfos_outstream = $invindex->open_outstream("$seg_name.fnm");
    $self->{finfos}->write_infos($finfos_outstream);
    $finfos_outstream->close;

    # close down all the writers, so we can open the files they've finished.
    $self->{postings_writer}->finish;
    $self->{fields_writer}->finish;
    for my $norms_stream ( @{ $self->{norm_outstreams} } ) {
        # the array is sparse -- only indexed field nums have streams
        next unless defined $norms_stream;
        $norms_stream->close;
    }

    # consolidate compound file - if we actually added any docs
    my @compound_files = map {"$seg_name.$_"} @COMPOUND_EXTENSIONS;
    if ( $self->{doc_count} ) {
        my $cf_writer = KinoSearch1::Index::CompoundFileWriter->new(
            invindex => $invindex,
            filename => "$seg_name.tmp",
        );
        # fold the per-field norms files into the compound file as well
        push @compound_files,
            map  { "$seg_name.f" . $_->get_field_num }
            grep { $_->get_indexed } $self->{finfos}->get_infos;
        $cf_writer->add_file($_) for @compound_files;
        $cf_writer->finish;
        # atomically swap the finished compound file into place
        $invindex->rename_file( "$seg_name.tmp", "$seg_name.cfs" );
    }

    # delete files that are no longer needed;
    $invindex->delete_file($_) for @compound_files;
    my $sort_file = "$seg_name" . SORTFILE_EXTENSION;
    if ( $invindex->file_exists($sort_file) ) {
        $invindex->delete_file($sort_file);
    }
}
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
1; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
__END__ |