File Coverage

blib/lib/KinoSearch1/InvIndexer.pm
Criterion Covered Total %
statement 201 217 92.6
branch 47 70 67.1
condition 9 15 60.0
subroutine 39 39 100.0
pod 7 8 87.5
total 303 349 86.8


line stmt bran cond sub pod time code
1             package KinoSearch1::InvIndexer;
2 34     34   735650 use strict;
  34         1463  
  34         1237  
3 34     34   192 use warnings;
  34         70  
  34         898  
4 34     34   4698 use KinoSearch1::Util::ToolSet;
  34         86  
  34         5900  
5 34     34   210 use base qw( KinoSearch1::Util::Class );
  34         67  
  34         7508  
6              
7 34     34   226 use constant UNINITIALIZED => 0;
  34         68  
  34         2237  
8 34     34   183 use constant INITIALIZED => 1;
  34         70  
  34         1615  
9 34     34   186 use constant FINISHED => 2;
  34         63  
  34         3217  
10              
11             BEGIN {
12 34     34   379 __PACKAGE__->init_instance_vars(
13             # constructor args / members
14             create => undef,
15             invindex => undef,
16             analyzer => undef,
17              
18             # members
19             reader => undef,
20             analyzers => undef,
21             sinfos => undef,
22             finfos => undef,
23             doc_template => undef,
24             frozen_doc => undef,
25             similarity => undef,
26             field_sims => undef,
27             seg_writer => undef,
28             write_lock => undef,
29             state => UNINITIALIZED,
30             );
31             }
32              
33 34     34   23014 use Storable qw( freeze thaw );
  34         94469  
  34         2776  
34 34     34   10407 use File::Spec::Functions qw( catfile tmpdir );
  34         10865  
  34         2218  
35              
36 34     34   12099 use KinoSearch1::Document::Doc;
  34         85  
  34         884  
37 34     34   10497 use KinoSearch1::Document::Field;
  34         189  
  34         1106  
38 34     34   12015 use KinoSearch1::Analysis::Analyzer;
  34         99  
  34         918  
39 34     34   15478 use KinoSearch1::Store::FSInvIndex;
  34         108  
  34         979  
40 34     34   261 use KinoSearch1::Index::FieldInfos;
  34         68  
  34         1680  
41 34     34   201 use KinoSearch1::Index::FieldsReader;
  34         77  
  34         1213  
42 34     34   13576 use KinoSearch1::Index::IndexReader;
  34         118  
  34         943  
43 34     34   218 use KinoSearch1::Index::SegInfos;
  34         68  
  34         715  
44 34     34   31181 use KinoSearch1::Index::SegWriter;
  34         118  
  34         1491  
45 34         2317 use KinoSearch1::Index::IndexFileNames qw(
46             WRITE_LOCK_NAME
47             COMMIT_LOCK_NAME
48             WRITE_LOCK_TIMEOUT
49             COMMIT_LOCK_TIMEOUT
50 34     34   249 );
  34         80  
51 34     34   15466 use KinoSearch1::Search::Similarity;
  34         91  
  34         84382  
52              
53             sub init_instance {
54 63     63 1 138 my $self = shift;
55 63         300 $self->{analyzers} = {};
56 63         188 $self->{field_sims} = {};
57              
58             # use a no-op Analyzer if not supplied
59 63   66     331 $self->{analyzer} ||= KinoSearch1::Analysis::Analyzer->new;
60              
61             # create a few members
62 63         1923 $self->{similarity} = KinoSearch1::Search::Similarity->new;
63 63         849 $self->{sinfos} = KinoSearch1::Index::SegInfos->new;
64 63         581 $self->{doc_template} = KinoSearch1::Document::Doc->new;
65              
66             # confirm or create an InvIndex object
67 63         135 my $invindex;
68 63 100 66     1200 if ( blessed( $self->{invindex} )
    50          
69             and $self->{invindex}->isa('KinoSearch1::Store::InvIndex') )
70             {
71 47         125 $invindex = $self->{invindex};
72 47 100       297 $self->{create} = $invindex->get_create
73             unless defined $self->{create};
74             }
75             elsif ( defined $self->{invindex} ) {
76 16         151 $invindex = $self->{invindex} = KinoSearch1::Store::FSInvIndex->new(
77             create => $self->{create},
78             path => $self->{invindex},
79             );
80             }
81             else {
82 0         0 croak("Required parameter 'invindex' not supplied");
83             }
84              
85             # get a write lock for this invindex.
86 63         483 my $write_lock = $invindex->make_lock(
87             lock_name => WRITE_LOCK_NAME,
88             timeout => WRITE_LOCK_TIMEOUT,
89             );
90 63 50       444 if ( $write_lock->obtain ) {
91             # only assign if successful, otherwise DESTROY unlocks (bad!)
92 63         188 $self->{write_lock} = $write_lock;
93             }
94             else {
95 0         0 croak( "invindex locked: " . $write_lock->get_lock_name );
96             }
97              
98             # read/write SegInfos
99 63         137 eval {
100             $invindex->run_while_locked(
101             lock_name => COMMIT_LOCK_NAME,
102             timeout => COMMIT_LOCK_TIMEOUT,
103             do_body => sub {
104 63 100   63   548 $self->{create}
105             ? $self->{sinfos}->write_infos($invindex)
106             : $self->{sinfos}->read_infos($invindex);
107             },
108 63         834 );
109             };
110 63 50       487 if ($@) {
111 0 0       0 $self->{create}
112             ? croak("failed to create invindex: $@")
113             : croak("failed to open existing invindex: $@");
114             }
115              
116             # get a finfos and maybe a reader
117 63 100       247 if ( $self->{create} ) {
118 45         486 $self->{finfos} = KinoSearch1::Index::FieldInfos->new;
119             }
120             else {
121 18         196 $self->{reader}
122             = KinoSearch1::Index::IndexReader->new( invindex => $invindex );
123 18         116 $self->{finfos} = $self->{reader}->generate_field_infos;
124             }
125              
126             # more initialization is coming after fields are spec'd...
127             }
128              
129             sub _delayed_init {
130 62     62   119 my $self = shift;
131 62         245 my ( $invindex, $finfos, $field_sims )
132 62         141 = @{$self}{qw( invindex finfos field_sims )};
133              
134 62 50       271 confess("finish has been called")
135             if $self->{state} == FINISHED;
136 62 50       234 confess("internal error: already initialized")
137             if $self->{state} == INITIALIZED;
138 62         140 $self->{state} = INITIALIZED;
139              
140             # create a cloning template
141 62         185 my $doc = $self->{doc_template};
142 62         358 for my $field ( $doc->get_fields ) {
143 127         387 $field->set_field_num( $finfos->get_field_num( $field->get_name ) );
144             }
145 62         393 $self->{frozen_doc} = freeze($doc);
146              
147             # set sim for each field
148 62         9364 my $main_sim = $self->{similarity};
149 62         313 for my $finfo ( $finfos->get_infos ) {
150 132   66     436 $field_sims->{ $finfo->get_name } ||= $main_sim;
151             }
152              
153             # name a new segment and create a SegWriter
154 62         293 my $out_seg_name = $self->_new_seg_name;
155 62         321 $self->{seg_writer} = KinoSearch1::Index::SegWriter->new(
156             invindex => $invindex,
157             seg_name => $out_seg_name,
158             finfos => $finfos->clone,
159             field_sims => $field_sims,
160             );
161             }
162              
163             sub spec_field {
164 128     128 1 739 my $self = shift;
165              
166             # don't allow new fields to be spec'd once the seg is in motion
167 128 50       443 croak("Too late to spec field (new_doc has been called)")
168             unless $self->{state} == UNINITIALIZED;
169              
170             # detect or define a Field object
171 128         186 my $field;
172 128 50       468 if ( blessed( $_[0] ) ) {
173 0         0 $field = shift;
174             }
175             else {
176 128         199 eval { $field = KinoSearch1::Document::Field->new(@_) };
  128         2086  
177 128 50       372 croak $@ if $@;
178             }
179              
180             # cache fnm_bits and fdt_bits
181 128         587 $field->set_fnm_bits(
182             KinoSearch1::Index::FieldInfos->encode_fnm_bits($field) );
183 128         768 $field->set_fdt_bits(
184             KinoSearch1::Index::FieldsReader->encode_fdt_bits($field) );
185              
186             # establish which analyzer will be used against the field
187 128   66     431 $self->{analyzers}{ $field->get_name }
188             = ( $field->get_analyzer || $self->{analyzer} );
189              
190             # don't copy the analyzer into the template, so that it can be overridden
191 128         474 $field->set_analyzer(undef);
192              
193             # add the field to the finfos and the template.
194 128         530 $self->{finfos}->add_field($field);
195 128         655 $self->{doc_template}->add_field($field);
196             }
197              
198             sub new_doc {
199 14017     14017 1 18744 my $self = shift;
200 14017 100       44479 $self->_delayed_init unless $self->{state} == INITIALIZED;
201 14017         37398 return thaw( $self->{frozen_doc} );
202             }
203              
204             sub set_similarity {
205 1 50   1 0 9 if ( @_ == 3 ) {
206 1         3 my ( $self, $field_name, $sim ) = @_;
207 1         4 $self->{field_sims}{$field_name} = $sim;
208             }
209             else {
210 0         0 $_[0]->{similarity} = $_[1];
211             }
212             }
213              
214             sub add_doc {
215 14017     14017 1 18109 my ( $self, $doc ) = @_;
216              
217             # assign analyzers
218 14017         31158 for my $field ( $doc->get_fields ) {
219 15390 100       34811 if ( $field->get_analyzed ) {
220 15370 50       45819 next if $field->get_analyzer;
221 15370         36642 my $fieldname = $field->get_name;
222 15370         49864 $field->set_analyzer( $self->{analyzers}{$fieldname} );
223             }
224             }
225              
226             # add doc to output segment
227 14017         47983 $self->{seg_writer}->add_doc($doc);
228             }
229              
230             sub add_invindexes {
231 1     1 1 8 my ( $self, @invindexes ) = @_;
232 1 50       6 confess("Can't call add_invindexes after new_doc")
233             if $self->{state} == INITIALIZED;
234              
235             # verify or obtain InvIndex objects
236 1         4 for (@invindexes) {
237 2 50       7 if ( !a_isa_b( $_, 'KinoSearch1::Store::InvIndex' ) ) {
238 0         0 $_ = KinoSearch1::Store::FSInvIndex->new( path => $_ );
239             }
240             }
241              
242             # get a reader for each invindex
243             my @readers
244 1         4 = map { KinoSearch1::Index::IndexReader->new( invindex => $_ ) }
  2         17  
245             @invindexes;
246              
247             # merge finfos and init
248 1         3 for my $reader (@readers) {
249 2         9 $self->{finfos}->consolidate( $reader->get_finfos );
250             }
251 1         5 $self->_delayed_init;
252              
253             # add all segments in each of the supplied invindexes
254 1         3 my $seg_writer = $self->{seg_writer};
255 1         3 for my $reader (@readers) {
256 2         62 $seg_writer->add_segment($_) for $reader->segreaders_to_merge('all');
257             }
258             }
259              
260             sub delete_docs_by_term {
261 1     1 1 3 my ( $self, $term ) = @_;
262 1 50       5 confess("Not a KinoSearch1::Index::Term")
263             unless a_isa_b( $term, 'KinoSearch1::Index::Term' );
264 1 50       4 return unless $self->{reader};
265 1 50       7 $self->_delayed_init unless $self->{state} == INITIALIZED;
266 1         7 $self->{reader}->delete_docs_by_term($term);
267              
268             }
269              
270             our %finish_defaults = ( optimize => 0, );
271              
272             sub finish {
273 63     63 1 430 my $self = shift;
274 63 50       870 confess kerror() unless verify_args( \%finish_defaults, @_ );
275 63         730 my %args = ( %finish_defaults, @_ );
276              
277             # if no changes were made to the index, don't write anything
278 63 100       289 if ( $self->{state} == UNINITIALIZED ) {
279 3 100       14 if ( !$args{optimize} ) {
280 1         4 return;
281             }
282             else {
283 2         8 $self->_delayed_init;
284             }
285             }
286              
287 62         269 my ( $invindex, $sinfos, $seg_writer )
288 62         140 = @{$self}{qw( invindex sinfos seg_writer )};
289              
290             # perform segment merging
291             my @to_merge
292 62 100       408 = $self->{reader}
293             ? $self->{reader}->segreaders_to_merge( $args{optimize} )
294             : ();
295 62         235 $seg_writer->add_segment($_) for @to_merge;
296 62         207 $sinfos->delete_segment( $_->get_seg_name ) for @to_merge;
297              
298             # finish the segment
299 62         408 $seg_writer->finish;
300              
301             # now that the seg is complete, write its info to the 'segments' file
302 62         457 my $doc_count = $seg_writer->get_doc_count;
303 62 100       216 if ($doc_count) {
304 61         283 $sinfos->add_info(
305             KinoSearch1::Index::SegInfo->new(
306             seg_name => $seg_writer->get_seg_name,
307             doc_count => $doc_count,
308             invindex => $invindex,
309             )
310             );
311             }
312              
313             # commit changes to the invindex
314             $invindex->run_while_locked(
315             lock_name => COMMIT_LOCK_NAME,
316             timeout => COMMIT_LOCK_TIMEOUT,
317             do_body => sub {
318 62 100   62   374 $self->{reader}->commit_deletions if defined $self->{reader};
319 62         331 $sinfos->write_infos($invindex);
320             },
321 62         805 );
322              
323 62         495 my @files_to_delete = $self->_generate_deletions_list( \@to_merge );
324 62         282 push @files_to_delete, $self->_read_delqueue;
325              
326             # close reader, so that we can delete its files if appropriate
327 62 100       347 $self->{reader}->close if defined $self->{reader};
328              
329 62         261 $self->_purge_merged(@files_to_delete);
330 62         249 $self->_release_locks;
331 62         474 $self->{state} = FINISHED;
332             }
333              
334             # Given an array of SegReaders, return a list of their files.
335             sub _generate_deletions_list {
336 62     62   144 my ( $self, $readers_to_merge ) = @_;
337 62         136 my $invindex = $self->{invindex};
338 62         169 my @segs_to_merge = map { $_->get_seg_name } @$readers_to_merge;
  14         150  
339 28         84 my @deletions = grep { $invindex->file_exists($_) }
  14         56  
340 62         147 map { ( "$_.cfs", "$_.del" ) } @segs_to_merge;
341 62         179 return @deletions;
342             }
343              
344             # Retrieve a list of files that weren't successfully deleted before.
345             sub _read_delqueue {
346 62     62   118 my ( $self, $readers_to_merge ) = @_;
347 62         146 my $invindex = $self->{invindex};
348 62         101 my @deletions;
349              
350 62 50       237 if ( $invindex->file_exists('delqueue') ) {
351 0         0 my $instream = $invindex->open_instream('delqueue');
352 0         0 my $num_in_queue = $instream->lu_read('i');
353 0         0 @deletions = $instream->lu_read("T$num_in_queue");
354 0         0 $instream->close;
355             }
356              
357 62         180 return @deletions;
358             }
359              
360             # Delete segments that have been folded into the new segment.
361             sub _purge_merged {
362 62     62   155 my ( $self, @deletions ) = @_;
363 62         139 my $invindex = $self->{invindex};
364              
365 62         135 my @delqueue;
366 62         213 for my $deletion (@deletions) {
367 14         26 eval { $invindex->delete_file($deletion) };
  14         50  
368             # Win32: if the deletion fails (because a reader is open), queue it
369 14 50 33     58 if ( $@ and $invindex->file_exists($deletion) ) {
370 0         0 push @delqueue, $deletion;
371             }
372             }
373              
374 62         260 $self->_write_delqueue(@delqueue);
375             }
376              
377             sub _write_delqueue {
378 62     62   210 my ( $self, @delqueue ) = @_;
379 62         138 my $invindex = $self->{invindex};
380 62         122 my $num_files = scalar @delqueue;
381              
382 62 50       356 if ($num_files) {
    50          
383             # we have files that weren't successfully deleted, so write list
384 0         0 my $outstream = $invindex->open_outstream('delqueue.new');
385 0         0 $outstream->lu_write( "iT$num_files", $num_files, @delqueue );
386 0         0 $outstream->close;
387 0         0 $invindex->rename_file( 'delqueue.new', 'delqueue' );
388             }
389             elsif ( $invindex->file_exists('delqueue') ) {
390             # no files to delete, so delete the delqueue file if it's there
391 0         0 $invindex->delete_file('delqueue');
392             }
393             }
394              
395             # Release the write lock - if it's there.
396             sub _release_locks {
397 123     123   237 my $self = shift;
398 123 100       6287 if ( defined $self->{write_lock} ) {
399 63 50       300 $self->{write_lock}->release if $self->{write_lock}->is_locked;
400 63         268 undef $self->{write_lock};
401             }
402             }
403              
404             # Generate segment names (no longer Lucene compatible, as of 0.06).
405             sub _new_seg_name {
406 62     62   145 my $self = shift;
407              
408 62         343 my $counter = $self->{sinfos}->get_counter;
409 62         413 $self->{sinfos}->set_counter( ++$counter );
410              
411 62         216 return "_$counter";
412             }
413              
414 61     61   4480 sub DESTROY { shift->_release_locks }
415              
416             1;
417              
418             __END__