line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Dezi::Lucy::Indexer; |
2
|
1
|
|
|
1
|
|
22866
|
use Moose; |
|
1
|
|
|
|
|
503725
|
|
|
1
|
|
|
|
|
7
|
|
3
|
|
|
|
|
|
|
extends 'Dezi::Indexer'; |
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
8522
|
use Dezi::Lucy::InvIndex; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Lucy::Index::Indexer; |
8
|
|
|
|
|
|
|
use Lucy::Plan::Schema; |
9
|
|
|
|
|
|
|
use Lucy::Plan::FullTextType; |
10
|
|
|
|
|
|
|
use Lucy::Plan::StringType; |
11
|
|
|
|
|
|
|
use Lucy::Analysis::PolyAnalyzer; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use Carp; |
14
|
|
|
|
|
|
|
use SWISH::3 qw( :constants ); |
15
|
|
|
|
|
|
|
use Scalar::Util qw( blessed ); |
16
|
|
|
|
|
|
|
use Data::Dump qw( dump ); |
17
|
|
|
|
|
|
|
use Search::Tools::UTF8; |
18
|
|
|
|
|
|
|
use Path::Class::File::Lockable; |
19
|
|
|
|
|
|
|
use Sys::Hostname qw( hostname ); |
20
|
|
|
|
|
|
|
use Digest::MD5 (); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our $VERSION = '0.014'; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Boolean flag handed to every Lucy::Plan::FullTextType we construct
# as its C<highlightable> option. Off by default.
has highlightable_fields => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# Built-in SWISH::3 document properties. Used as a hash ref keyed by
# property name; the values are method names called on the doc object
# in swish3_handler().
my $BUILT_IN_PROPS = SWISH_DOC_PROP_MAP();
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 NAME |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
Dezi::Lucy::Indexer - Dezi::App Apache Lucy indexer |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 SYNOPSIS |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
use Dezi::Lucy::Indexer; |
36
|
|
|
|
|
|
|
my $indexer = Dezi::Lucy::Indexer->new( |
37
|
|
|
|
|
|
|
config => Dezi::Indexer::Config->new(), |
38
|
|
|
|
|
|
|
invindex => Dezi::Lucy::InvIndex->new(), |
39
|
|
|
|
|
|
|
highlightable_fields => 0, |
40
|
|
|
|
|
|
|
); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 DESCRIPTION |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Dezi::Lucy::Indexer is an Apache Lucy based indexer |
45
|
|
|
|
|
|
|
class based on L<SWISH::3>. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 CONSTANTS |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
All the L<SWISH::3> constants are imported into this namespace, |
50
|
|
|
|
|
|
|
including: |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=over |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item SWISH_DOC_PROP_MAP |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=item SWISH_INDEX_STEMMER_LANG |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=item SWISH_INDEX_NAME |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item SWISH_INDEX_FORMAT |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=back |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 METHODS |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Only new and overridden methods are documented here. See |
67
|
|
|
|
|
|
|
the L<Dezi::Indexer> documentation. |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head2 BUILD |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Implements basic object set up. Called internally by new(). |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
In addition to the attributes documented in Dezi::Indexer, |
74
|
|
|
|
|
|
|
this class implements the following attributes: |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=over |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=item highlightable_fields |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Value should be 0 or 1. Default is 0. Passed directly to the |
81
|
|
|
|
|
|
|
constructor for Lucy::Plan::FullTextType objects as the value |
82
|
|
|
|
|
|
|
for the C<highlightable> option. |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=back |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=cut |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Moose post-construction hook. Makes sure the invindex is a
# Dezi::Lucy::InvIndex (re-wrapping the same path when it is not) and
# then builds the Lucy schema/analyzers/indexer.
sub BUILD {
    my ($self) = @_;

    # coerce a foreign invindex object into our format subclass
    if ( !$self->invindex->isa('Dezi::Lucy::InvIndex') ) {
        my $coerced
            = Dezi::Lucy::InvIndex->new( path => $self->invindex->path );
        $self->invindex($coerced);
    }

    $self->_build_lucy_delegates();
}
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Build everything Lucy needs for this indexing run and cache it on $self:
#
#   $self->{_lang}              stemmer language from the swish3 index
#                               config, or 'none' (re-used when the header
#                               is written at finish time)
#   $self->{__lucy}{analyzers}  named analyzers: store, store_lc,
#                               fulltext, fulltext_lc
#   $self->{__lucy}{schema}     the Lucy::Plan::Schema
#   $self->{_fields}            field definitions keyed by field name
#   $self->{lucy}               the Lucy::Index::Indexer (only created if
#                               one is not already set)
#
# Dies (confess) if the config defines a PropertyName that collides with
# a built-in property, or if the hostname cannot be determined.
sub _build_lucy_delegates {
    my $self     = shift;
    my $s3config = $self->swish3->config;
    my $lang = $s3config->get_index->get( SWISH_INDEX_STEMMER_LANG() )
        || 'none';
    $self->{_lang} = $lang;    # cache for finish()
    my $schema      = Lucy::Plan::Schema->new();
    my $analyzers   = {};
    my $case_folder = Lucy::Analysis::CaseFolder->new;
    my $tokenizer   = Lucy::Analysis::RegexTokenizer->new;

    # splits a serialized multi-value string on the token-position
    # bumper character that swish3_handler() joins values with
    my $multival_tokenizer
        = Lucy::Analysis::RegexTokenizer->new(
        pattern => '[^' . SWISH_TOKENPOS_BUMPER() . ']+' );

    # mimic StringType fields that require case and/or multival parsing.
    $analyzers->{store_lc} = Lucy::Analysis::PolyAnalyzer->new(
        analyzers => [ $multival_tokenizer, $case_folder ] );
    $analyzers->{store} = $multival_tokenizer;

    # stemming means we fold case and tokenize too.
    # Only a two-character language code enables the stemmer.
    if ( $lang and $lang =~ m/^\w\w$/ ) {
        my $stemmer
            = Lucy::Analysis::SnowballStemmer->new( language => $lang );
        $analyzers->{fulltext_lc}
            = Lucy::Analysis::PolyAnalyzer->new( analyzers =>
                [ $multival_tokenizer, $case_folder, $tokenizer, $stemmer ] );
        $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $tokenizer, $stemmer ] );
    }
    else {
        $analyzers->{fulltext_lc}
            = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $case_folder, $tokenizer, ],
            );
        $analyzers->{fulltext} = Lucy::Analysis::PolyAnalyzer->new(
            analyzers => [ $multival_tokenizer, $tokenizer ] );
    }

    # cache our objects for later
    $self->{__lucy}->{analyzers} = $analyzers;
    $self->{__lucy}->{schema}    = $schema;

    # build the Lucy fields, which are a merger of MetaNames+PropertyNames
    my %fields;

    my $metanames     = $s3config->get_metanames;
    my $meta_keys     = $metanames->keys;
    my $properties    = $s3config->get_properties;
    my $property_keys = $properties->keys;

    # merge first by name so we pair correctly in _create_field_def()
    my %tmpfields;
    for my $name (@$meta_keys) {
        my $mn = $metanames->get($name);
        $tmpfields{$name}->{meta} = $mn;
    }
    for my $name (@$property_keys) {
        if ( exists $BUILT_IN_PROPS->{$name} ) {
            confess
                "$name is a built-in PropertyName and should not be defined in config";
        }
        my $pr = $properties->get($name);
        $tmpfields{$name}->{prop} = $pr;
    }

    # build out field definitions
    for my $n ( keys %tmpfields ) {
        my %fdef = $self->_create_field_def( $tmpfields{$n}->{meta},
            $tmpfields{$n}->{prop} );
        $fields{ $fdef{name} } = $fdef{def};
    }

    $self->{_fields} = \%fields;

    for my $name ( keys %fields ) {
        my $def = $fields{$name};

        # if a field is purely an alias, skip it.
        if (    defined $def->{is_meta_alias}
            and defined $def->{is_prop_alias} )
        {
            $def->{store_as}->{ $def->{is_meta_alias} } = 1;
            $def->{store_as}->{ $def->{is_prop_alias} } = 1;
            next;
        }

        # no type means the field is alias-only (or neither meta nor
        # prop): nothing to add to the schema under this name.
        my $type = $self->_get_lucy_field_type($def) or next;

        $schema->spec_field( name => $name, type => $type );

        $def->{store_as}->{$name} = 1;
    }

    # build in the built-ins
    $self->debug and warn dump \%fields;

    for my $name ( keys %$BUILT_IN_PROPS ) {
        if ( exists $fields{$name} ) {
            my $def = $fields{$name};

            #carp "found $name in built-in props: " . dump($field);

            # in theory this should never happen.
            if ( !$def->{is_prop} ) {
                confess
                    "$name is a built-in PropertyName but not defined as a PropertyName in config";
            }
        }

        # default property
        else {
            $schema->spec_field(
                name => $name,
                type => Lucy::Plan::StringType->new( sortable => 1, )
            );
        }
    }

    #dump( \%fields );

    # TODO can pass lucy in? make 'lucy' attribute public?
    my $hostname = hostname() or confess "Can't get unique hostname";
    my $manager = Lucy::Index::IndexManager->new( host => $hostname );

    # invindex->path is stringified explicitly because Lucy wants a
    # plain string for the index location.
    $self->{lucy} ||= Lucy::Index::Indexer->new(
        schema  => $schema,
        index   => $self->invindex->path . "",
        create  => 1,
        manager => $manager,
    );

}
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
# Map one field definition (as produced by _create_field_def) onto a
# Lucy::Plan::FullTextType.
#
# Side effect: any alias target found on the definition is recorded in
# $def->{store_as}.
#
# Returns nothing (bare return) when the field is only an alias on the
# one side it is defined for; otherwise returns the field type. A
# definition that is neither meta nor prop yields undef.
sub _get_lucy_field_type {
    my ( $self, $def ) = @_;
    my $analyzers = $self->{__lucy}->{analyzers};
    my $is_meta   = $def->{is_meta};
    my $is_prop   = $def->{is_prop};
    my $type;

    if ($is_meta) {
        if ( !$is_prop ) {

            # MetaName only: searchable but never stored.
            if ( defined $def->{is_meta_alias} ) {
                $def->{store_as}->{ $def->{is_meta_alias} } = 1;
                return;
            }

            $type = Lucy::Plan::FullTextType->new(
                analyzer      => $analyzers->{fulltext_lc},
                stored        => 0,
                boost         => $def->{bias} || 1.0,
                highlightable => $self->highlightable_fields,
            );
        }
        else {

            # MetaName and PropertyName: the field may be an alias on
            # one side and real on the other. The caller must already
            # have filtered out fields that are aliases on both sides.
            # A meta alias takes precedence over a prop alias.
            my $alias_target
                = defined $def->{is_meta_alias}
                ? $def->{is_meta_alias}
                : $def->{is_prop_alias};
            if ( defined $alias_target ) {
                $def->{store_as}->{$alias_target} = 1;
            }

            my $analyzer
                = $def->{ignore_case}
                ? $analyzers->{fulltext_lc}
                : $analyzers->{fulltext};

            $type = Lucy::Plan::FullTextType->new(
                analyzer      => $analyzer,
                highlightable => $self->highlightable_fields,
                sortable      => $def->{sortable},
                boost         => $def->{bias} || 1.0,
            );
        }
    }
    elsif ($is_prop) {

        # PropertyName only: split on the multi-value separator but not
        # into words; optionally case-folded.
        if ( defined $def->{is_prop_alias} ) {
            $def->{store_as}->{ $def->{is_prop_alias} } = 1;
            return;
        }

        my $analyzer_key = $def->{ignore_case} ? 'store_lc' : 'store';

        $type = Lucy::Plan::FullTextType->new(
            analyzer      => $analyzers->{$analyzer_key},
            highlightable => $self->highlightable_fields,
            sortable      => $def->{sortable},
            boost         => $def->{bias} || 1.0,
        );
    }

    if ( $self->debug ) {
        warn sprintf( "field def %s => field type %s", dump($def), $type );
    }

    return $type;

}
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Build a field definition from a metaname object, a propname object,
# or both (at least one is required; when both are given they must
# share the same name).
#
# Returns a two-pair list: ( name => $field_name, def => \%field_def ).
# %field_def may contain: is_meta, is_meta_alias, bias, store_as,
# is_prop, is_prop_alias, sortable, ignore_case, verbatim, max.
#
# Dies (confess) when neither object is supplied or the names disagree.
sub _create_field_def {
    my ( $self, $metaname, $propname ) = @_;
    if ( !$metaname and !$propname ) {
        confess "Must have one of metaname or propname objects";
    }

    # the metaname, when present, decides the field name
    my $name = $metaname ? $metaname->name : $propname->name;
    my %field_def = ();
    if ($metaname) {
        if ( $metaname->name ne $name ) {
            confess "Mismatched metaname for '$name': " . $metaname->name;
        }
        my $alias = $metaname->alias_for;
        $field_def{is_meta}           = 1;
        $field_def{is_meta_alias}     = $alias;
        $field_def{bias}              = $metaname->bias;
        $field_def{store_as}->{$name} = 1;

        # allow for aliases to built-ins
        if ( exists $BUILT_IN_PROPS->{$name} ) {
            $field_def{is_prop}  = 1;
            $field_def{sortable} = 1;
        }
    }
    if ($propname) {
        if ( $propname->name ne $name ) {

            # ": " separator added so the two names do not run together
            confess "Mismatched propname for '$name': " . $propname->name;
        }
        my $prop_alias = $propname->alias_for;
        $field_def{is_prop}       = 1;
        $field_def{is_prop_alias} = $prop_alias;
        if ( $propname->sort ) {
            $field_def{sortable} = 1;
        }

        # copy the remaining property flags through verbatim
        for my $attr (qw( ignore_case verbatim max )) {
            $field_def{$attr} = $propname->$attr;
        }
    }
    return ( name => $name, def => \%field_def );
}
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
# Register a field discovered while indexing (i.e. one that was not in
# the config when the schema was first built).
#
# Creates the field definition, caches it in $self->{_fields} unless a
# definition already exists under that name, and adds the field to the
# Lucy schema when it maps onto a Lucy type.
#
# Returns the newly created definition hash ref.
sub _add_new_field {
    my ( $self, $metaname, $propname ) = @_;
    my $fields    = $self->{_fields};
    my %field_def = $self->_create_field_def( $metaname, $propname );
    my $name      = $field_def{name};
    my $def       = $field_def{def};
    $fields->{$name} ||= $def;

    # _get_lucy_field_type() does a bare return for alias-only fields.
    # Calling it inside the spec_field() argument list (list context)
    # would silently drop the value and leave a dangling 'type' key, so
    # fetch it in scalar context and skip the schema when there is no
    # type -- the same rule _build_lucy_delegates() applies.
    my $type = $self->_get_lucy_field_type($def);
    if ($type) {
        $self->{__lucy}->{schema}->spec_field(
            name => $name,
            type => $type,
        );
    }
    return $def;
}
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=head2 swish3_handler( I<swish3_data> ) |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
Called by the SWISH::3::handler() function for every document being |
375
|
|
|
|
|
|
|
indexed. |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# Per-document callback: turn one parsed SWISH::3 data object into a
# Lucy document and add it to the index, replacing any previously
# indexed document with the same swishdocpath.
sub swish3_handler {
    my ( $self, $data ) = @_;
    my $s3config   = $data->config;
    my $conf_props = $s3config->get_properties;
    my $conf_metas = $s3config->get_metanames;

    # parsed values, keyed by the field name they will be stored under
    my %doc;
    my $docinfo = $data->doc;

    # Swish built-in fields first: each built-in maps to a method on
    # the doc object.
    for my $builtin ( keys %$BUILT_IN_PROPS ) {
        my $accessor = $BUILT_IN_PROPS->{$builtin};
        $doc{$builtin} = [ $docinfo->$accessor ];
    }

    # fields parsed from document
    my $props = $data->properties;
    my $metas = $data->metanames;

    # field def cache
    my $fields = $self->{_fields};

    # metanames seen in this document but not yet known to us
    # (e.g. added via UndefinedMetaTags) get registered on the fly
    for my $mname ( keys %$metas ) {
        next if exists $fields->{$mname};

        my $prop
            = exists $props->{$mname} ? $conf_props->get($mname) : undef;
        $self->_add_new_field( $conf_metas->get($mname), $prop );
    }

    for my $fname ( sort keys %$fields ) {
        my $field = $fields->{$fname};

        # aliases carry no values of their own
        next if $field->{is_prop_alias} or $field->{is_meta_alias};

        for my $target ( keys %{ $field->{store_as} } ) {

            # prefer properties over metanames because
            # properties have verbatim flag, which affects
            # the stored whitespace.
            if ( $field->{is_prop} and !exists $BUILT_IN_PROPS->{$fname} ) {
                push( @{ $doc{$target} }, @{ $props->{$fname} } );
            }
            elsif ( $field->{is_meta} ) {
                push( @{ $doc{$target} }, @{ $metas->{$fname} } );
            }
            else {
                croak "field '$fname' is neither a PropertyName nor MetaName";
            }
        }
    }

    # serialize the doc with our tokenpos_bump char
    my $separator = SWISH_TOKENPOS_BUMPER();
    for my $field_name ( keys %doc ) {
        my $joined = join( $separator, @{ $doc{$field_name} } );
        $doc{$field_name} = to_utf8($joined);
    }

    if ( $self->debug ) {
        carp dump \%doc;
    }

    # make sure we delete any existing doc with same URI
    my $lucy = $self->{lucy};
    $lucy->delete_by_term(
        field => 'swishdocpath',
        term  => $doc{swishdocpath}
    );

    $lucy->add_doc( \%doc );
}
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=head2 finish |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
Calls commit() on the internal Lucy::Index::Indexer object, |
463
|
|
|
|
|
|
|
writes the C<swish.xml> header file and calls the superclass finish() |
464
|
|
|
|
|
|
|
method. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=cut |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
# alphabet for the random part of the index UUID (see _finish_lucy)
my @chars = ( 'a' .. 'z', 'A' .. 'Z', 0 .. 9 );

# Wraps the superclass finish(): commits the Lucy index first, then
# runs the parent method, and marks this indexer as finished.
# Returns 0 when already finished, otherwise the total document count.
around finish => sub {
    my $orig = shift;
    my $self = shift;

    if ( $self->{_is_finished} ) {
        return 0;
    }

    my $total_docs = $self->_finish_lucy();
    $self->$orig(@_);
    $self->{_is_finished} = 1;

    return $total_docs;
};
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# Commit the pending Lucy changes and write the index header file.
#
# The header file is locked for the duration so that the commit and
# the header write happen as one unit. The lock is released whether or
# not that unit succeeds; on failure the original error is re-thrown.
#
# Croaks if the header file is already locked.
# Returns the total number of documents in the index after the commit.
sub _finish_lucy {
    my $self = shift;

    # get a lock on our header file till
    # this entire transaction is complete.
    # Note that we trust the Lucy locking feature
    # to have prevented any other process
    # from getting a lock on the invindex itself,
    # but we want to make sure nothing interrupts
    # us from writing our own header after calling ->commit().
    my $invindex  = $self->invindex;
    my $header    = $invindex->header_file->stringify;
    my $lock_file = Path::Class::File::Lockable->new($header);
    if ( $lock_file->locked ) {
        croak "Lock file found on $header -- cannot commit indexing changes";
    }
    $lock_file->lock;

    my ( $doc_count, $uuid );
    my $completed = eval {

        # commit our changes
        $self->{lucy}->commit();

        # get total doc count
        my $polyreader
            = Lucy::Index::PolyReader->open( index => "$invindex", );
        $doc_count = $polyreader->doc_count();

        # write header
        # the current config should contain any existing header + runtime config
        my $idx_cfg = $self->swish3->config->get_index;

        # poor man's uuid: current time plus 24 random characters, hashed
        $uuid = Digest::MD5::md5_hex(
            time() . join( "", @chars[ map { rand @chars } ( 1 .. 24 ) ] ) );

        $idx_cfg->set( SWISH_INDEX_NAME(),         "$invindex" );
        $idx_cfg->set( SWISH_INDEX_FORMAT(),       'Lucy' );
        $idx_cfg->set( SWISH_INDEX_STEMMER_LANG(), $self->{_lang} );
        $idx_cfg->set( 'DeziVersion',              $invindex->version );
        $idx_cfg->set( "DocCount",                 $doc_count );
        $idx_cfg->set( "UUID",                     $uuid );

        $self->swish3->config->write($header);
        1;
    };

    if ( !$completed ) {

        # Do not leave a stale lock behind: it would make every later
        # run croak above until someone removes it by hand.
        my $err = $@ || "unknown error while committing index";
        {
            local $@;
            eval { $lock_file->unlock; 1 }
                or carp "failed to release lock on $header";
        }
        die $err;
    }

    # transaction complete
    $lock_file->unlock;

    $self->debug and carp "wrote $header with uuid $uuid";
    $self->debug and carp "$doc_count docs indexed";
    $self->swish3(undef);    # invalidate this indexer

    return $doc_count;
}
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=head2 get_lucy |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
Returns the internal Lucy::Index::Indexer object. |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=cut |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
# Accessor for the internal Lucy indexer object stored in the 'lucy' slot.
sub get_lucy {
    my ($self) = @_;
    return $self->{lucy};
}
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=head2 abort |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
Sets the internal Lucy::Index::Indexer to undef, |
548
|
|
|
|
|
|
|
which should release any locks on the index. |
549
|
|
|
|
|
|
|
Also flags the Dezi::Lucy::Indexer object |
550
|
|
|
|
|
|
|
as stale. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=cut |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
# Drop the Lucy indexer, mark this object as finished so finish()
# becomes a no-op, and clear the swish3 attribute.
sub abort {
    my ($self) = @_;
    @{$self}{qw( lucy _is_finished )} = ( undef, 1 );
    $self->swish3(undef);
}
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
1; |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
__END__ |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=head2 MetaNames and PropertyNames |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
Some implementation notes about MetaNames and PropertyNames. |
570
|
|
|
|
|
|
|
See also L<http://dezi.org/2014/07/18/metanames-and-propertynames/>. |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=over |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=item |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
A field defined as either a MetaName, PropertyName or both, can be searched. |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=item |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
Fields are matched against tag names in your XML/HTML documents. See also the TagAlias, UndefinedMetaTags, UndefinedXMLAttributes, and XMLClassAttributes directives. |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
=item |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
You can alias field names with MetaNamesAlias and PropertyNamesAlias. |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
=item |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
MetaNames are tokenized and case-insensitive and (optionally, with FuzzyIndexingMode) stemmed. |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=item |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
PropertyNames are stored, case-sensitive strings. |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
=item |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
If a field is defined as both a MetaName and PropertyName, then it will be tokenized. |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
=item |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
If a field is defined only as a MetaName, it will be parsed but not stored. That means you can search on the field but when you try and retrieve the field's value from the results, it will cause a fatal error. |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=item |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
If a field is defined only as a PropertyName, it will be parsed and stored, but it will not be tokenized. That means the field's contents are stored without being split up into words. |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
=item |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
You can control the parsing and storage of PropertyName-only fields with the following additional directives: |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
=over |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=item PropertyNamesCompareCase |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
case sensitive search |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
=item PropertyNamesIgnoreCase |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
case insensitive search (default) |
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
=item PropertyNamesNoStripChars |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
preserve whitespace |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=back |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
=item |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
There are two default MetaNames defined: swishdefault and swishtitle. |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
=item |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
There are two default PropertyNames defined: swishtitle and swishdescription. |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
=item |
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
The libswish3 XML and HTML parsers will automatically treat a <title> tag as swishtitle. Likewise they will treat <body> tag as swishdescription. |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
=item |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
Things get complicated quickly when defining fields. Experiment with small test cases to arrive at the configuration that works best with your application. |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=back |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
=head1 AUTHOR |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
Peter Karman, E<lt>karpet@dezi.orgE<gt> |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=head1 BUGS |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-dezi-app at rt.cpan.org>, or through |
651
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Dezi-App>. |
652
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=head1 SUPPORT |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
perldoc Dezi::App |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
You can also look for information at: |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=over 4 |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=item * Website |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
L<http://dezi.org/> |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=item * IRC |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
#dezisearch at freenode |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=item * Mailing list |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
L<https://groups.google.com/forum/#!forum/dezi-search> |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Dezi-App> |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
L<http://annocpan.org/dist/Dezi-App> |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=item * CPAN Ratings |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
L<http://cpanratings.perl.org/d/Dezi-App> |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
=item * Search CPAN |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
L<https://metacpan.org/dist/Dezi-App/> |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
=back |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
Copyright 2014 by Peter Karman |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
699
|
|
|
|
|
|
|
it under the terms of the GPL v2 or later. |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
=head1 SEE ALSO |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
L<http://dezi.org/>, L<http://swish-e.org/>, L<http://lucy.apache.org/> |
704
|
|
|
|
|
|
|
|