File Coverage

blib/lib/KinoSearch1/Index/SegWriter.pm
Criterion Covered Total %
statement 114 114 100.0
branch 15 18 83.3
condition 1 3 33.3
subroutine 15 15 100.0
pod 1 4 25.0
total 146 154 94.8


line stmt bran cond sub pod time code
1             package KinoSearch1::Index::SegWriter;
2 34     34   196 use strict;
  34         73  
  34         1260  
3 34     34   196 use warnings;
  34         75  
  34         859  
4 34     34   188 use KinoSearch1::Util::ToolSet;
  34         74  
  34         4799  
5 34     34   327 use base qw( KinoSearch1::Util::Class );
  34         77  
  34         4525  
6              
7             BEGIN {
8 34     34   372 __PACKAGE__->init_instance_vars(
9             # constructor params / members
10             invindex => undef,
11             seg_name => undef,
12             finfos => undef,
13             field_sims => undef,
14             # members
15             norm_outstreams => undef,
16             fields_writer => undef,
17             postings_writer => undef,
18             doc_count => 0,
19             );
20 34         416 __PACKAGE__->ready_get(qw( seg_name doc_count ));
21             }
22              
23 34     34   14205 use KinoSearch1::Analysis::TokenBatch;
  34         94  
  34         903  
24 34     34   36190 use KinoSearch1::Index::FieldsWriter;
  34         105  
  34         959  
25 34     34   21613 use KinoSearch1::Index::PostingsWriter;
  34         117  
  34         1271  
26 34     34   20089 use KinoSearch1::Index::CompoundFileWriter;
  34         111  
  34         1018  
27             use KinoSearch1::Index::IndexFileNames
28 34     34   457 qw( @COMPOUND_EXTENSIONS SORTFILE_EXTENSION );
  34         73  
  34         48156  
29              
30             sub init_instance {
31 62     62 1 139 my $self = shift;
32 62         363 my ( $invindex, $seg_name, $finfos )
33 62         144 = @{$self}{ 'invindex', 'seg_name', 'finfos' };
34              
35             # init norms
36 62         192 my $norm_outstreams = $self->{norm_outstreams} = [];
37 128         379 my @indexed_field_nums = map { $_->get_field_num }
  132         384  
38 62         302 grep { $_->get_indexed } $finfos->get_infos;
39 62         190 for my $field_num (@indexed_field_nums) {
40 128         355 my $filename = "$seg_name.f$field_num";
41 128 100       438 $invindex->delete_file($filename)
42             if $invindex->file_exists($filename);
43 128         471 $norm_outstreams->[$field_num] = $invindex->open_outstream($filename);
44             }
45              
46             # init FieldsWriter
47 62         623 $self->{fields_writer} = KinoSearch1::Index::FieldsWriter->new(
48             invindex => $invindex,
49             seg_name => $seg_name,
50             );
51              
52             # init PostingsWriter
53 62         645 $self->{postings_writer} = KinoSearch1::Index::PostingsWriter->new(
54             invindex => $invindex,
55             seg_name => $seg_name,
56             );
57             }
58              
59             # Add a document to the segment.
60             sub add_doc {
61 14017     14017 0 18598 my ( $self, $doc ) = @_;
62 14017         19128 my $norm_outstreams = $self->{norm_outstreams};
63 14017         16695 my $postings_cache = $self->{postings_cache};
64 14017         16832 my $field_sims = $self->{field_sims};
65 14017         32977 my $doc_boost = $doc->get_boost;
66              
67 14017         32485 for my $indexed_field ( grep { $_->get_indexed } $doc->get_fields ) {
  15390         36849  
68 15376         36644 my $field_name = $indexed_field->get_name;
69 15376         70315 my $token_batch = KinoSearch1::Analysis::TokenBatch->new;
70              
71             # if the field has content, put it in the TokenBatch
72 15376 100       45646 if ( $indexed_field->get_value_len ) {
73 15307         102131 $token_batch->append( $indexed_field->get_value, 0,
74             $indexed_field->get_value_len );
75             }
76              
77             # analyze the field
78 15376 100       96807 if ( $indexed_field->get_analyzed ) {
79 15364         36506 $token_batch
80             = $indexed_field->get_analyzer()->analyze($token_batch);
81             }
82              
83             # invert the doc
84 15376         83462 $token_batch->build_posting_list( $self->{doc_count},
85             $indexed_field->get_field_num );
86              
87             # prepare to store the term vector, if the field is vectorized
88 15376 50 33     43231 if ( $indexed_field->get_vectorized and $indexed_field->get_stored ) {
89 15376         64638 $indexed_field->set_tv_string( $token_batch->get_tv_string );
90             }
91              
92             # encode a norm into a byte, write it to an outstream
93 15376         43815 my $norm_val
94             = $doc_boost
95             * $indexed_field->get_boost
96             * $field_sims->{$field_name}
97             ->lengthnorm( $token_batch->get_size );
98 15376         42794 my $outstream = $norm_outstreams->[ $indexed_field->get_field_num ];
99 15376         73280 $outstream->lu_write( 'a',
100             $field_sims->{$field_name}->encode_norm($norm_val) );
101              
102             # feed PostingsWriter
103 15376         69854 $self->{postings_writer}->add_postings( $token_batch->get_postings );
104             }
105              
106             # store fields
107 14017         45670 $self->{fields_writer}->add_doc($doc);
108              
109 14017         97476 $self->{doc_count}++;
110             }
111              
112             sub add_segment {
113 16     16 0 37 my ( $self, $seg_reader ) = @_;
114              
115             # prepare to bulk add
116 16         59 my $deldocs = $seg_reader->get_deldocs;
117 16         54 my $doc_map = $deldocs->generate_doc_map( $seg_reader->max_doc,
118             $self->{doc_count} );
119 16         122 my $field_num_map
120             = $self->{finfos}->generate_field_num_map( $seg_reader->get_finfos );
121              
122             # bulk add the slab of documents to the various writers
123 16         111 $self->_merge_norms( $seg_reader, $doc_map );
124 16         86 $self->{fields_writer}
125             ->add_segment( $seg_reader, $doc_map, $field_num_map );
126 16         93 $self->{postings_writer}->add_segment( $seg_reader, $doc_map );
127              
128 16         84 $self->{doc_count} += $seg_reader->num_docs;
129             }
130              
131             # Bulk write norms.
132             sub _merge_norms {
133 16     16   31 my ( $self, $seg_reader, $doc_map ) = @_;
134 16         38 my $norm_outstreams = $self->{norm_outstreams};
135 16         26 my $field_sims = $self->{field_sims};
136 16         65 my @indexed_fields = grep { $_->get_indexed } $self->{finfos}->get_infos;
  24         69  
137              
138 16         35 for my $field (@indexed_fields) {
139 24         73 my $field_name = $field->get_name;
140 24         67 my $outstream = $norm_outstreams->[ $field->get_field_num ];
141 24         82 my $norms_reader = $seg_reader->norms_reader($field_name);
142             # if the field was indexed before, copy the norms
143 24 100       61 if ( defined $norms_reader ) {
144 18         77 _write_remapped_norms( $outstream, $doc_map,
145             $norms_reader->get_bytes );
146             }
147             else {
148             # the field isn't in the input segment, so write a default
149 6         27 my $zeronorm = $field_sims->{$field_name}->lengthnorm(0);
150 6         18 my $num_docs = $seg_reader->num_docs;
151 6         31 my $normstring
152             = $field_sims->{$field_name}->encode_norm($zeronorm)
153             x $num_docs;
154 6         37 $outstream->lu_write( "a$num_docs", $normstring );
155             }
156             }
157             }
158              
159             # Finish writing the segment.
160             sub finish {
161 62     62 0 135 my $self = shift;
162 62         154 my ( $invindex, $seg_name ) = @{$self}{ 'invindex', 'seg_name' };
  62         200  
163              
164             # write Term Dictionary, positions.
165 62         390 $self->{postings_writer}->write_postings;
166              
167             # write FieldInfos
168 62         277 my $fnm_file = "$seg_name.fnm";
169 62 50       249 $invindex->delete_file($fnm_file) if $invindex->file_exists($fnm_file);
170 62         359 my $finfos_outstream = $invindex->open_outstream("$seg_name.fnm");
171 62         479 $self->{finfos}->write_infos($finfos_outstream);
172 62         246 $finfos_outstream->close;
173              
174             # close down all the writers, so we can open the files they've finished.
175 62         780 $self->{postings_writer}->finish;
176 62         375 $self->{fields_writer}->finish;
177 62         124 for ( @{ $self->{norm_outstreams} } ) {
  62         1726  
178 132 100       615 $_->close if defined;
179             }
180              
181             # consolidate compound file - if we actually added any docs
182 62         1899 my @compound_files = map {"$seg_name.$_"} @COMPOUND_EXTENSIONS;
  434         1095  
183 62 100       288 if ( $self->{doc_count} ) {
184 61         788 my $compound_file_writer
185             = KinoSearch1::Index::CompoundFileWriter->new(
186             invindex => $invindex,
187             filename => "$seg_name.tmp",
188             );
189 126         404 push @compound_files, map { "$seg_name.f" . $_->get_field_num }
  130         393  
190 61         425 grep { $_->get_indexed } $self->{finfos}->get_infos;
191 61         372 $compound_file_writer->add_file($_) for @compound_files;
192 61         287 $compound_file_writer->finish;
193 61         461 $invindex->rename_file( "$seg_name.tmp", "$seg_name.cfs" );
194             }
195              
196             # delete files that are no longer needed;
197 62         440 $invindex->delete_file($_) for @compound_files;
198 62         227 my $sort_file_name = "$seg_name" . SORTFILE_EXTENSION;
199 62 50       239 $invindex->delete_file($sort_file_name)
200             if $invindex->file_exists($sort_file_name);
201             }
202              
203             1;
204              
205             __END__