line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Plucene::Simple; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Plucene::Simple - An interface to Plucene |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Plucene::Simple; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# create an index |
12
|
|
|
|
|
|
|
my $plucy = Plucene::Simple->open($index_path); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# add to the index |
15
|
|
|
|
|
|
|
$plucy->add( |
16
|
|
|
|
|
|
|
$id1 => { $field => $term1 }, |
17
|
|
|
|
|
|
|
$id2 => { $field => $term2 }, |
18
|
|
|
|
|
|
|
); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# or ... |
21
|
|
|
|
|
|
|
$plucy->index_document($id => $data); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# search an existing index |
24
|
|
|
|
|
|
|
my $plucy = Plucene::Simple->open($index_path); |
25
|
|
|
|
|
|
|
my @results = $plucy->search($search_string); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# optimize the index |
28
|
|
|
|
|
|
|
$plucy->optimize; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# remove something from the index |
31
|
|
|
|
|
|
|
$plucy->delete_document($id); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# is something in the index? |
34
|
|
|
|
|
|
|
if ($plucy->indexed($id)) { ... } |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
This provides a simple interface to L<Plucene>. Plucene is large and |
39
|
|
|
|
|
|
|
multi-featured, and it is expected that users will subclass it, and tie |
40
|
|
|
|
|
|
|
all the pieces together to suit their own needs. Plucene::Simple is, |
41
|
|
|
|
|
|
|
therefore, just one way to use Plucene. It's not expected that it will |
42
|
|
|
|
|
|
|
do exactly what *you* want, but you can always use it as an example of |
43
|
|
|
|
|
|
|
how to build your own interface. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 INDEXING |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head2 open |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
You make a new Plucene::Simple object like so: |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my $plucy = Plucene::Simple->open($index_path); |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
If this index doesn't exist, then it will be created for you, otherwise you |
54
|
|
|
|
|
|
|
will be adding to an existing one. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Then you can add your documents to the index: |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head2 add |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Every document must be indexed with a unique key (which will be returned |
61
|
|
|
|
|
|
|
from searches). |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
A document can be made up of many fields, which can be added as |
64
|
|
|
|
|
|
|
a hashref: |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
$plucy->add($key, \%data); |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
$plucy->add( |
69
|
|
|
|
|
|
|
chap1 => { |
70
|
|
|
|
|
|
|
title => "Moby-Dick", |
71
|
|
|
|
|
|
|
author => "Herman Melville", |
72
|
|
|
|
|
|
|
text => "Call me Ishmael ..." |
73
|
|
|
|
|
|
|
}, |
74
|
|
|
|
|
|
|
chap2 => { |
75
|
|
|
|
|
|
|
title => "Boo-Hoo", |
76
|
|
|
|
|
|
|
author => "Lydia Lee", |
77
|
|
|
|
|
|
|
text => "...", |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
); |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head2 index_document |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Alternatively, if you do not want to index lots of metadata, but rather |
84
|
|
|
|
|
|
|
just simple text, you can use the index_document() method. |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
$plucy->index_document($key, $data); |
87
|
|
|
|
|
|
|
$plucy->index_document(chap1 => 'Call me Ishmael ...'); |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head2 delete_document |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
$plucy->delete_document($id); |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 optimize |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
$plucy->optimize; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Plucene is set-up to perform insertions quickly. After a bunch of inserts |
98
|
|
|
|
|
|
|
it is good to optimize() the index for better search speed. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 SEARCHING |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head2 search |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
my @ids = $plucy->search('ishmael'); |
105
|
|
|
|
|
|
|
# ("chap1", ...) |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
This will return the IDs of each document matching the search term. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
If you have indexed your documents with fields, you can also search with |
110
|
|
|
|
|
|
|
the field name as a prefix: |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
my @ids = $plucy->search("author:lee"); |
113
|
|
|
|
|
|
|
# ("chap2" ...) |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my @results = $plucy->search($search_string); |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
This will search the index with the given query, and return a list of |
118
|
|
|
|
|
|
|
document ids. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Searches can be much more powerful than this - see L<Plucene::QueryParser> for |
121
|
|
|
|
|
|
|
further details. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 search_during |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
my @results = $plucy->search_during($search_string, $date1, $date2); |
126
|
|
|
|
|
|
|
my @results = $plucy->search_during("to:Fred", "2001-01-01" => "2003-12-31"); |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
If your documents were given an ISO 'date' field when indexing, |
129
|
|
|
|
|
|
|
search_during() will restrict the results to all documents between the |
130
|
|
|
|
|
|
|
specified dates. Any document without a 'date' field will be ignored. |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 indexed |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
if ($plucy->indexed($id)) { ... } |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
This returns true if there is a document with the given ID in the index. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=cut |
139
|
|
|
|
|
|
|
|
140
|
6
|
|
|
6
|
|
3777
|
use strict; |
|
6
|
|
|
|
|
10
|
|
|
6
|
|
|
|
|
172
|
|
141
|
6
|
|
|
6
|
|
29
|
use warnings; |
|
6
|
|
|
|
|
7
|
|
|
6
|
|
|
|
|
217
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
our $VERSION = '1.04'; |
144
|
|
|
|
|
|
|
|
145
|
6
|
|
|
6
|
|
4814
|
use Plucene::Analysis::SimpleAnalyzer; |
|
6
|
|
|
|
|
71985
|
|
|
6
|
|
|
|
|
168
|
|
146
|
6
|
|
|
6
|
|
4715
|
use Plucene::Analysis::WhitespaceAnalyzer; |
|
6
|
|
|
|
|
7115
|
|
|
6
|
|
|
|
|
149
|
|
147
|
6
|
|
|
6
|
|
4501
|
use Plucene::Document; |
|
6
|
|
|
|
|
2052
|
|
|
6
|
|
|
|
|
200
|
|
148
|
6
|
|
|
6
|
|
5251
|
use Plucene::Document::DateSerializer; |
|
6
|
|
|
|
|
112355
|
|
|
6
|
|
|
|
|
405
|
|
149
|
6
|
|
|
6
|
|
5681
|
use Plucene::Document::Field; |
|
6
|
|
|
|
|
4747
|
|
|
6
|
|
|
|
|
57
|
|
150
|
6
|
|
|
6
|
|
6009
|
use Plucene::Index::Reader; |
|
6
|
|
|
|
|
640608
|
|
|
6
|
|
|
|
|
191
|
|
151
|
6
|
|
|
6
|
|
5474
|
use Plucene::Index::Writer; |
|
6
|
|
|
|
|
218772
|
|
|
6
|
|
|
|
|
201
|
|
152
|
6
|
|
|
6
|
|
5181
|
use Plucene::QueryParser; |
|
6
|
|
|
|
|
93768
|
|
|
6
|
|
|
|
|
65
|
|
153
|
6
|
|
|
6
|
|
6325
|
use Plucene::Search::DateFilter; |
|
6
|
|
|
|
|
10749
|
|
|
6
|
|
|
|
|
212
|
|
154
|
6
|
|
|
6
|
|
4993
|
use Plucene::Search::HitCollector; |
|
6
|
|
|
|
|
1167
|
|
|
6
|
|
|
|
|
168
|
|
155
|
6
|
|
|
6
|
|
5221
|
use Plucene::Search::IndexSearcher; |
|
6
|
|
|
|
|
42991
|
|
|
6
|
|
|
|
|
184
|
|
156
|
|
|
|
|
|
|
|
157
|
6
|
|
|
6
|
|
72
|
use Carp; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
379
|
|
158
|
6
|
|
|
6
|
|
5879
|
use File::Spec::Functions qw(catfile); |
|
6
|
|
|
|
|
5736
|
|
|
6
|
|
|
|
|
499
|
|
159
|
6
|
|
|
6
|
|
38
|
use Time::Piece; |
|
6
|
|
|
|
|
13
|
|
|
6
|
|
|
|
|
56
|
|
160
|
6
|
|
|
6
|
|
5728
|
use Time::Piece::Range; |
|
6
|
|
|
|
|
18995
|
|
|
6
|
|
|
|
|
7504
|
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Constructor: build a handle bound to an index directory.
#
#   my $plucy = Plucene::Simple->open($index_path);
#
# $dir is only recorded on the object here; this sub does not touch the
# filesystem itself.
#
# Croaks with "No directory given" when $dir is undefined or empty.
sub open {
    my ($class, $dir) = @_;

    # Check definedness and length rather than truth, so that a directory
    # literally named "0" is accepted instead of being rejected as missing.
    croak "No directory given" unless defined $dir and length $dir;

    return bless { _dir => $dir }, $class;
}
167
|
|
|
|
|
|
|
|
168
|
84
|
|
|
84
|
|
1829
|
# Private accessor: the value stored under the object's `_dir` key.
sub _dir {
    my $self = shift;
    return $self->{_dir};
}
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Private helper: turn a query string into whatever Plucene::QueryParser's
# parse() produces.
#
#   $query   - the raw search string
#   $default - passed to the parser as its `default` option
#
# The parser is built with a Plucene::Analysis::SimpleAnalyzer.
sub _parsed_query {
    my ($self, $query, $default) = @_;

    my %parser_config = (
        analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
        default  => $default,
    );

    return Plucene::QueryParser->new(\%parser_config)->parse($query);
}
178
|
|
|
|
|
|
|
|
179
|
18
|
|
|
18
|
|
68
|
# Private helper: a fresh Plucene::Search::IndexSearcher over this
# object's index directory. A new searcher is built on every call.
sub _searcher {
    my $self = shift;
    return Plucene::Search::IndexSearcher->new($self->_dir);
}
180
|
|
|
|
|
|
|
|
181
|
16
|
|
|
16
|
|
47
|
# Private helper: open a Plucene::Index::Reader on this object's index
# directory. A new reader is opened on every call.
sub _reader {
    my $self = shift;
    return Plucene::Index::Reader->open($self->_dir);
}
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Run a query against the index and return the matching document ids,
# ordered by descending score.
#
#   my @ids = $plucy->search($search_string);
#
# Returns the empty list when $sstring is false. The query is parsed
# with 'text' as the default field.
sub search {
    my ($self, $sstring) = @_;
    return () unless $sstring;

    my $searcher = $self->_searcher;
    my @scored;    # [ document, score ] pairs gathered by the collector

    my $on_hit = sub {
        my (undef, $doc_num, $score) = @_;

        # A document that cannot be fetched is skipped rather than
        # aborting the whole search.
        my $found = eval { $searcher->doc($doc_num) };
        $found and push @scored, [ $found, $score ];
    };
    my $collector = Plucene::Search::HitCollector->new(collect => $on_hit);

    $searcher->search_hc($self->_parsed_query($sstring, 'text'), $collector);

    my @ranked = sort { $b->[1] <=> $a->[1] } @scored;
    return map { $_->[0]->get("id")->string } @ranked;
}
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Like search(), but restricted to documents whose '_date_' field lies
# between two dates.
#
#   my @ids = $plucy->search_during($search_string, $date1, $date2);
#
# $date1 and $date2 are parsed with the format "%Y-%m-%d". Returns the
# empty list when $sstring is false or nothing matches; otherwise the
# ids are returned in the order the hits object yields them.
sub search_during {
    my ($self, $sstring, $date1, $date2) = @_;
    return () unless $sstring;

    my $day_format = "%Y-%m-%d";
    my $range      = Time::Piece::Range->new(
        Time::Piece->strptime($date1, $day_format),
        Time::Piece->strptime($date2, $day_format),
    );

    my $filter = Plucene::Search::DateFilter->new({
        field => '_date_',
        from  => $range->start,
        to    => $range->end,
    });

    # Unlike _parsed_query(), this parser uses a WhitespaceAnalyzer.
    my $parser = Plucene::QueryParser->new({
        analyzer => Plucene::Analysis::WhitespaceAnalyzer->new(),
        default  => "text",
    });
    my $query = $parser->parse($sstring);

    my $hits = $self->_searcher->search($query, $filter);
    return () if !$hits->length;

    my @matches = map { $hits->doc($_) } 0 .. ($hits->length - 1);
    return map { $_->get("id")->string } @matches;
}
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Private helper: a Plucene::Index::Writer for this object's directory,
# using a SimpleAnalyzer.
#
# The third constructor argument is 1 when no "segments" file exists in
# the directory yet and 0 when one does.
# NOTE(review): presumably that flag means "create a new index" -- confirm
# against Plucene::Index::Writer.
sub _writer {
    my $self = shift;

    my $segments_file = catfile($self->_dir, "segments");
    my $create_flag   = -e $segments_file ? 0 : 1;

    return Plucene::Index::Writer->new(
        $self->_dir,
        Plucene::Analysis::SimpleAnalyzer->new(),
        $create_flag,
    );
}
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Add one or more documents to the index.
#
#   $plucy->add($id1 => \%fields1, $id2 => \%fields2, ...);
#
# Arguments are consumed as (id, hashref) pairs. For each pair a document
# is built as follows:
#
#   * the id is stored as a Keyword field named "id";
#   * a "date" entry is parsed as "%Y-%m-%d" and stored twice: as a Keyword
#     "_date_" (via freeze_date()) and as a Keyword "date" (ymd form). A
#     date that fails to parse is replaced by the current time;
#   * every other entry except "text" becomes an UnStored field under its
#     own name;
#   * a combined UnStored "text" field holds the caller's own "text" value
#     followed by the values of those other entries, skipping any whose
#     key starts with an underscore.
#
# The caller's hashrefs are never modified: the combined text is built in
# a local variable, so passing the same hashref to add() twice indexes the
# same content both times.
sub add {
    my ($self, @data) = @_;
    my $writer = $self->_writer;

    while (my ($id, $terms) = splice @data, 0, 2) {
        my $doc = Plucene::Document->new;
        $doc->add(Plucene::Document::Field->Keyword(id => $id));

        # Start from the caller's own text (possibly undef) and append to
        # this private copy only.
        my $text = $terms->{text};

        foreach my $key (keys %$terms) {
            next if $key eq 'text';    # added once, after the loop

            if ($key eq "date") {
                my $date =
                    eval { Time::Piece->strptime($terms->{date}, "%Y-%m-%d") };

                # Unparseable date: fall back to "now" instead of failing.
                $date = Time::Piece->new if $@;

                $doc->add(
                    Plucene::Document::Field->Keyword("_date_", freeze_date($date)));
                $doc->add(Plucene::Document::Field->Keyword("date", $date->ymd));
            }
            else {
                $doc->add(
                    Plucene::Document::Field->UnStored($key => $terms->{$key}));
                $text .= " " . $terms->{$key} unless $key =~ /^_/;
            }
        }

        $doc->add(Plucene::Document::Field->UnStored(text => $text));
        $writer->add_document($doc);
    }

    # Drop the writer explicitly once all documents have been handed over.
    undef $writer;
}
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# Index a single piece of plain text under an id.
#
#   $plucy->index_document($id => $data);
#
# The document gets two fields: a Keyword "id" holding $id and an UnStored
# "text" holding $data.
sub index_document {
    my ($self, $id, $data) = @_;

    my $writer   = $self->_writer;
    my $document = Plucene::Document->new;

    # Each entry is [ constructor name, field name, value ].
    for my $spec ([ Keyword => id => $id ], [ UnStored => 'text' => $data ]) {
        my ($constructor, @field_args) = @$spec;
        $document->add(Plucene::Document::Field->$constructor(@field_args));
    }

    $writer->add_document($document);

    # Drop the writer explicitly now that the document has been handed over.
    undef $writer;
}
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Remove the document(s) whose "id" field equals $id.
#
#   $plucy->delete_document($id);
#
# Opens a reader, deletes by term on the "id" field, then closes the
# reader. Returns whatever the reader's close() returns.
sub delete_document {
    my ($self, $id) = @_;

    my $reader = $self->_reader;
    $reader->delete_term(
        Plucene::Index::Term->new({
            field => "id",
            text  => $id,
        })
    );

    return $reader->close;
}
272
|
|
|
|
|
|
|
|
273
|
4
|
|
|
4
|
1
|
242917
|
# Optimize the index by calling optimize() on a freshly built writer.
#
#   $plucy->optimize;
sub optimize {
    my $self = shift;
    return $self->_writer->optimize();
}
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# Report whether a document with the given id is in the index.
#
#   if ($plucy->indexed($id)) { ... }
#
# Returns the reader's doc_freq() for the term id:$id.
sub indexed {
    my ($self, $id) = @_;

    my %id_term = (field => 'id', text => $id);
    my $term    = Plucene::Index::Term->new({%id_term});

    return $self->_reader->doc_freq($term);
}
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head1 COPYRIGHT |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Copyright (C) 2003-2004 Kasei Limited |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=cut |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
1; |