line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Dezi::Indexer; |
2
|
1
|
|
|
1
|
|
1947
|
use Moose; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
10
|
|
3
|
1
|
|
|
1
|
|
7142
|
use MooseX::StrictConstructor; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
4
|
|
|
|
|
|
|
with 'Dezi::Role'; |
5
|
1
|
|
|
1
|
|
3152
|
use Types::Standard qw( Str Int Bool Maybe InstanceOf ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
11
|
|
6
|
1
|
|
|
1
|
|
1284
|
use Dezi::Types qw( DeziInvIndex DeziIndexerConfig ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
7
|
1
|
|
|
1
|
|
512
|
use Scalar::Util qw( blessed ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
67
|
|
8
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
58
|
|
9
|
1
|
|
|
1
|
|
5
|
use Data::Dump qw( dump ); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
46
|
|
10
|
1
|
|
|
1
|
|
867
|
use Dezi::Indexer::Config; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Dezi::InvIndex; |
12
|
|
|
|
|
|
|
use SWISH::3 qw( :constants ); |
13
|
|
|
|
|
|
|
use Try::Tiny; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use namespace::autoclean; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.014'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# The inverted index this Indexer writes to. Values are checked (and
# coerced where the type allows it) against the DeziInvIndex type.
has invindex => (
    is     => 'rw',
    isa    => DeziInvIndex,
    coerce => 1,
);

# Name of the class that invindex() is expected to be an instance of.
has invindex_class => (
    is      => 'rw',
    isa     => Str,
    default => sub {'Dezi::InvIndex'},
);

# Indexer configuration. Built on first access when none was supplied.
has config => (
    is      => 'rw',
    isa     => DeziIndexerConfig,
    coerce  => 1,
    lazy    => 1,
    default => sub { Dezi::Indexer::Config->new() },
);

# Running total of documents handed to process().
has count => (
    is  => 'rw',
    isa => Int,
);

# When true, any existing InvIndex is overwritten.
has clobber => (
    is      => 'rw',
    isa     => Bool,
    default => 0,
);

# Number of indexed docs at which in-memory changes should be written out.
has flush => (
    is  => 'rw',
    isa => Int,
);

# Epoch time recorded by start(). Read-only through the accessor.
has started => (
    is  => 'ro',
    isa => Int,
);

# The SWISH::3 parser object; created by init_swish3() at construction
# time unless one is passed in.
has swish3 => (
    is      => 'rw',
    isa     => Maybe [ InstanceOf ['SWISH::3'] ],
    builder => 'init_swish3',
);

# Dry-run flag.
has test_mode => (
    is      => 'rw',
    isa     => Bool,
    default => sub {0},
);

# Passed to the swish3 analyzer's set_tokenize(); off by default.
has use_swish3_tokenizer => (
    is      => 'rw',
    isa     => Bool,
    default => sub {0},
);
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=pod |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 NAME |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Dezi::Indexer - base indexer class |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 SYNOPSIS |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
use Dezi::Indexer; |
57
|
|
|
|
|
|
|
my $indexer = Dezi::Indexer->new( |
58
|
|
|
|
|
|
|
invindex => Dezi::InvIndex->new, |
59
|
|
|
|
|
|
|
config => Dezi::Indexer::Config->new, |
60
|
|
|
|
|
|
|
count => 0, |
61
|
|
|
|
|
|
|
clobber => 1, |
62
|
|
|
|
|
|
|
flush => 10000, |
63
|
|
|
|
|
|
|
started => time() |
64
|
|
|
|
|
|
|
); |
65
|
|
|
|
|
|
|
$indexer->start; |
66
|
|
|
|
|
|
|
for my $doc (@list_of_docs) { |
67
|
|
|
|
|
|
|
$indexer->process($doc); |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
$indexer->finish; |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head1 DESCRIPTION |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Dezi::Indexer is a base class implementing the simplest of indexing |
74
|
|
|
|
|
|
|
APIs. It is intended to be subclassed, along with InvIndex, for each |
75
|
|
|
|
|
|
|
IR backend library. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head1 METHODS |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=head2 new( I<params> ) |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
Constructor. See the SYNOPSIS for default options. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
I<params> may include the following keys, each of which is also an |
84
|
|
|
|
|
|
|
accessor method: |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=over |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item clobber |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Overwrite any existing InvIndex. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
=item config |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
A Dezi::Indexer::Config object or file name. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=item flush |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
The number of indexed docs at which in-memory changes |
99
|
|
|
|
|
|
|
should be written to disk. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=item invindex |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
A Dezi::InvIndex object. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=item test_mode |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Dry run mode, just prints info on stderr but does not |
108
|
|
|
|
|
|
|
build index. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=back |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head2 BUILD |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Setup object. Called internally by new(). |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=cut |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# BUILD()
#
# Moose construction hook, called internally by new(). In order it:
#   1. lets a config IndexFile value override the path of a supplied
#      invindex,
#   2. makes sure invindex() holds an instance of invindex_class(),
#      creating or re-creating the object when it does not,
#   3. merges the config into the swish3 object.
#
# Takes no arguments besides the invocant.
sub BUILD {
    my $self = shift;

    my $index_file = $self->config->IndexFile;

    # if our invindex path != config->IndexFile,
    # prefer config.
    # The invindex attribute is optional, so only compare paths when one
    # was actually supplied: calling ->path on undef is fatal.
    if (    $index_file
        and defined $self->invindex
        and $index_file ne $self->invindex->path )
    {
        $self->warnings
            and warn sprintf(
            "Overriding invindex->path '%s' with IndexFile value from config '%s'\n",
            $self->invindex->path, $index_file );
        $self->invindex->path($index_file);
    }

    # make sure our invindex class matches invindex_class
    my $invindex       = $self->invindex;
    my $invindex_class = $self->invindex_class;
    if (   !$invindex
        or !blessed($invindex)
        or !$invindex->isa($invindex_class) )
    {

        # Class::Load is called fully qualified and is not imported at
        # the top of this file, so load it explicitly rather than
        # relying on some other module having pulled it in.
        require Class::Load;
        Class::Load::load_class($invindex_class);

        if ( !$invindex ) {

            # nothing was supplied: build a fresh invindex, pointing it
            # at the config IndexFile when there is one.
            $self->invindex(
                $invindex_class->new(
                    $index_file ? ( path => $index_file ) : ()
                )
            );
        }
        else {

            # wrong class: re-create using the stringified original
            # as the path.
            $self->invindex(
                $invindex_class->new( path => $invindex . "" ) );
        }
    }

    # merge any manual config with swish3 header
    $self->_merge_swish3_header_with_config();

}
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=head2 init_swish3 |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Returns a SWISH::3 object that uses B<swish3_handler>. This builder |
156
|
|
|
|
|
|
|
method is called on Indexer construction if B<swish3> is uninitialized. |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=cut |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# init_swish3()
#
# Builder for the swish3 attribute. Returns a new SWISH::3 object whose
# handler callback forwards its arguments to $self->swish3_handler.
sub init_swish3 {
    my $self = shift;

    # The Indexer stores the SWISH::3 object and the SWISH::3 object
    # stores this closure, so a strong $self inside the closure would
    # form a reference cycle and the Indexer would never be freed.
    # Scalar::Util is already loaded by this file; weaken is called
    # fully qualified because only blessed() is imported.
    Scalar::Util::weaken($self);

    return SWISH::3->new(
        handler => sub {

            # $self is only undef here if the Indexer has been destroyed
            # while something else kept the SWISH::3 object alive.
            confess "Indexer was destroyed before swish3 handler was called"
                unless defined $self;
            $self->swish3_handler(@_);
        }
    );
}
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# _merge_swish3_header_with_config()
#
# Feeds configuration into the swish3 object in three steps: the
# invindex header file (when readable), then this Indexer's own config,
# then the tokenizer switch. Returns whatever set_tokenize() returns.
sub _merge_swish3_header_with_config {
    my ($self) = @_;

    # 1. any existing header file.
    my $header_path = $self->invindex->header_file;
    $self->swish3->config->add($header_path) if -r $header_path;

    # 2. merge config in this Indexer
    my $indexer_config = $self->config->as_swish3_config();
    $self->swish3->config->add($indexer_config);

    # 3. conditionally turn off tokenizer, preferring engine to do it.
    my $tokenize = $self->use_swish3_tokenizer;
    return $self->swish3->analyzer->set_tokenize($tokenize);
}
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head2 start |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Opens the invindex() object and sets the started() time to time(). |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Subclasses should always call SUPER::start() if they override |
191
|
|
|
|
|
|
|
this method since it provides sanity checking on the InvIndex. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=cut |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# start()
#
# Validates the invindex, refuses to open an existing index whose header
# Format does not correspond to this Indexer class, opens the invindex,
# records the start time and returns it.
sub start {
    my $self = shift;

    my $invindex = $self->invindex;
    confess "No invindex object defined" unless $invindex;

    unless ( blessed($invindex) and $invindex->can('open') ) {
        confess "Invalid invindex $invindex: "
            . "either not blessed object or does not implement 'open' method";
    }

    # sanity check. if this is an existing index
    # does our Format match what already exists?
    # (a failure to read the header is treated as "no existing index")
    my $existing_header = try { $invindex->get_header };
    if ($existing_header) {
        my $format = $existing_header->Index->{Format};
        $self->isa( 'Dezi::' . $format . '::Indexer' )
            or confess "Fatal error: found existing invindex '$invindex' "
            . "with format $format.\n"
            . "You tried to open it with "
            . ref($self);
    }

    $invindex->open;

    # 'started' is a read-only attribute, so the slot is written directly
    $self->{started} = time();

    # for backcompat use swish3 name
    my $index_path = $invindex->path;
    $index_path->file('swish_last_start')->touch() if -d $index_path;

    return $self->{started};
}
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head2 process( I<doc> ) |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
I<doc> should be a Dezi::Indexer::Doc-derived object. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
process() should implement whatever the particular IR library |
234
|
|
|
|
|
|
|
API requires. The default action calls B<swish3_handler> on I<doc>. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# process( $doc )
#
# Parses one document through the swish3 object.
#
# $doc must be a blessed object that isa Dezi::Indexer::Doc; anything
# else croaks. start() is called first if started() is still false.
# The stringified $doc is passed to swish3->parse_buffer, the count slot
# is incremented, and $doc itself is returned.
sub process {
    my ( $self, $doc ) = @_;

    croak "Dezi::Indexer::Doc object required"
        if !$doc
        or !blessed($doc)
        or !$doc->isa('Dezi::Indexer::Doc');

    $self->started or $self->start;

    my $buffer = "$doc";
    $self->swish3->parse_buffer($buffer);
    $self->{count}++;

    return $doc;
}
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=head2 swish3_handler( I<swish3_payload> ) |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
This method is called on every document passed to process(). See |
255
|
|
|
|
|
|
|
the L<SWISH::3> documentation for what to expect in I<swish3_payload>. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
This is an abstract method. Subclasses must implement it. |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=cut |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# swish3_handler( $swish3_payload )
#
# Placeholder for the per-document callback. This implementation always
# throws (with a stack trace), naming the invocant in the message.
sub swish3_handler {
    confess "$_[0] must implement swish3_handler";
}
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head2 finish |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Closes the invindex(). |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=cut |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# finish()
#
# Calls close() on the invindex and returns whatever close() returns.
sub finish {
    my ($self) = @_;

    my $invindex = $self->invindex;
    return $invindex->close;
}
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=head2 count |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Returns the number of documents processed. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=head2 started |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
The time at which the Indexer start() method was called. Returns a Unix epoch |
281
|
|
|
|
|
|
|
integer. |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
1; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
__END__ |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
=head1 AUTHOR |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
Peter Karman, E<lt>perl@peknet.comE<gt> |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head1 BUGS |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-dezi-app at rt.cpan.org>, or through |
298
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>. |
299
|
|
|
|
|
|
|
I will be notified, and then you'll |
300
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
=head1 SUPPORT |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
perldoc Dezi |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
You can also look for information at: |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=over 4 |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=item * Mailing list |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
L<http://lists.swish-e.org/listinfo/users> |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App> |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
L<http://annocpan.org/dist/Dezi-App> |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=item * CPAN Ratings |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/Dezi-App> |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=item * Search CPAN |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
L<http://search.cpan.org/dist/Dezi-App/> |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=back |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
Copyright 2008-2009 by Peter Karman |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
340
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=head1 SEE ALSO |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
L<http://swish-e.org/> |