| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Plucene::Simple; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Plucene::Simple - An interface to Plucene |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use Plucene::Simple; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# create an index |
|
12
|
|
|
|
|
|
|
my $plucy = Plucene::Simple->open($index_path); |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# add to the index |
|
15
|
|
|
|
|
|
|
$plucy->add( |
|
16
|
|
|
|
|
|
|
$id1 => { $field => $term1 }, |
|
17
|
|
|
|
|
|
|
$id2 => { $field => $term2 }, |
|
18
|
|
|
|
|
|
|
); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# or ... |
|
21
|
|
|
|
|
|
|
$plucy->index_document($id => $data); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# search an existing index |
|
24
|
|
|
|
|
|
|
my $plucy = Plucene::Simple->open($index_path); |
|
25
|
|
|
|
|
|
|
my @results = $plucy->search($search_string); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# optimize the index |
|
28
|
|
|
|
|
|
|
$plucy->optimize; |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# remove something from the index |
|
31
|
|
|
|
|
|
|
$plucy->delete_document($id); |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# is something in the index? |
|
34
|
|
|
|
|
|
|
if ($plucy->indexed($id)) { ... } |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
This provides a simple interface to L<Plucene>. Plucene is large and |
|
39
|
|
|
|
|
|
|
multi-featured, and it is expected that users will subclass it, and tie |
|
40
|
|
|
|
|
|
|
all the pieces together to suit their own needs. Plucene::Simple is, |
|
41
|
|
|
|
|
|
|
therefore, just one way to use Plucene. It's not expected that it will |
|
42
|
|
|
|
|
|
|
do exactly what *you* want, but you can always use it as an example of |
|
43
|
|
|
|
|
|
|
how to build your own interface. |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 INDEXING |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head2 open |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
You make a new Plucene::Simple object like so: |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my $plucy = Plucene::Simple->open($index_path); |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
If this index doesn't exist, then it will be created for you, otherwise you |
|
54
|
|
|
|
|
|
|
will be adding to an existing one. |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Then you can add your documents to the index: |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head2 add |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Every document must be indexed with a unique key (which will be returned |
|
61
|
|
|
|
|
|
|
from searches). |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
A document can be made up of many fields, which can be added as |
|
64
|
|
|
|
|
|
|
a hashref: |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
$plucy->add($key, \%data); |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
$plucy->add( |
|
69
|
|
|
|
|
|
|
chap1 => { |
|
70
|
|
|
|
|
|
|
title => "Moby-Dick", |
|
71
|
|
|
|
|
|
|
author => "Herman Melville", |
|
72
|
|
|
|
|
|
|
text => "Call me Ishmael ..." |
|
73
|
|
|
|
|
|
|
}, |
|
74
|
|
|
|
|
|
|
chap2 => { |
|
75
|
|
|
|
|
|
|
title => "Boo-Hoo", |
|
76
|
|
|
|
|
|
|
author => "Lydia Lee", |
|
77
|
|
|
|
|
|
|
text => "...", |
|
78
|
|
|
|
|
|
|
} |
|
79
|
|
|
|
|
|
|
); |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head2 index_document |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Alternatively, if you do not want to index lots of metadata, but rather |
|
84
|
|
|
|
|
|
|
just simple text, you can use the index_document() method. |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
$plucy->index_document($key, $data); |
|
87
|
|
|
|
|
|
|
$plucy->index_document(chap1 => 'Call me Ishmael ...'); |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head2 delete_document |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
$plucy->delete_document($id); |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 optimize |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
$plucy->optimize; |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Plucene is set up to perform insertions quickly. After a bunch of inserts |
|
98
|
|
|
|
|
|
|
it is good to optimize() the index for better search speed. |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 SEARCHING |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head2 search |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
my @ids = $plucy->search('ishmael'); |
|
105
|
|
|
|
|
|
|
# ("chap1", ...) |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
This will return the IDs of each document matching the search term. |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
If you have indexed your documents with fields, you can also search with |
|
110
|
|
|
|
|
|
|
the field name as a prefix: |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
my @ids = $plucy->search("author:lee"); |
|
113
|
|
|
|
|
|
|
# ("chap2" ...) |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
my @results = $plucy->search($search_string); |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
This will search the index with the given query, and return a list of |
|
118
|
|
|
|
|
|
|
document ids. |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
Searches can be much more powerful than this - see L<Plucene> for |
|
121
|
|
|
|
|
|
|
further details. |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 search_during |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
my @results = $plucy->search_during($search_string, $date1, $date2); |
|
126
|
|
|
|
|
|
|
my @results = $plucy->search_during("to:Fred", "2001-01-01" => "2003-12-31"); |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
If your documents were given an ISO 'date' field when indexing, |
|
129
|
|
|
|
|
|
|
search_during() will restrict the results to all documents between the |
|
130
|
|
|
|
|
|
|
specified dates. Any document without a 'date' field will be ignored. |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head2 indexed |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
if ($plucy->indexed($id)) { ... } |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
This returns true if there is a document with the given ID in the index. |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=cut |
|
139
|
|
|
|
|
|
|
|
|
140
|
6
|
|
|
6
|
|
3777
|
use strict; |
|
|
6
|
|
|
|
|
10
|
|
|
|
6
|
|
|
|
|
172
|
|
|
141
|
6
|
|
|
6
|
|
29
|
use warnings; |
|
|
6
|
|
|
|
|
7
|
|
|
|
6
|
|
|
|
|
217
|
|
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
our $VERSION = '1.04'; |
|
144
|
|
|
|
|
|
|
|
|
145
|
6
|
|
|
6
|
|
4814
|
use Plucene::Analysis::SimpleAnalyzer; |
|
|
6
|
|
|
|
|
71985
|
|
|
|
6
|
|
|
|
|
168
|
|
|
146
|
6
|
|
|
6
|
|
4715
|
use Plucene::Analysis::WhitespaceAnalyzer; |
|
|
6
|
|
|
|
|
7115
|
|
|
|
6
|
|
|
|
|
149
|
|
|
147
|
6
|
|
|
6
|
|
4501
|
use Plucene::Document; |
|
|
6
|
|
|
|
|
2052
|
|
|
|
6
|
|
|
|
|
200
|
|
|
148
|
6
|
|
|
6
|
|
5251
|
use Plucene::Document::DateSerializer; |
|
|
6
|
|
|
|
|
112355
|
|
|
|
6
|
|
|
|
|
405
|
|
|
149
|
6
|
|
|
6
|
|
5681
|
use Plucene::Document::Field; |
|
|
6
|
|
|
|
|
4747
|
|
|
|
6
|
|
|
|
|
57
|
|
|
150
|
6
|
|
|
6
|
|
6009
|
use Plucene::Index::Reader; |
|
|
6
|
|
|
|
|
640608
|
|
|
|
6
|
|
|
|
|
191
|
|
|
151
|
6
|
|
|
6
|
|
5474
|
use Plucene::Index::Writer; |
|
|
6
|
|
|
|
|
218772
|
|
|
|
6
|
|
|
|
|
201
|
|
|
152
|
6
|
|
|
6
|
|
5181
|
use Plucene::QueryParser; |
|
|
6
|
|
|
|
|
93768
|
|
|
|
6
|
|
|
|
|
65
|
|
|
153
|
6
|
|
|
6
|
|
6325
|
use Plucene::Search::DateFilter; |
|
|
6
|
|
|
|
|
10749
|
|
|
|
6
|
|
|
|
|
212
|
|
|
154
|
6
|
|
|
6
|
|
4993
|
use Plucene::Search::HitCollector; |
|
|
6
|
|
|
|
|
1167
|
|
|
|
6
|
|
|
|
|
168
|
|
|
155
|
6
|
|
|
6
|
|
5221
|
use Plucene::Search::IndexSearcher; |
|
|
6
|
|
|
|
|
42991
|
|
|
|
6
|
|
|
|
|
184
|
|
|
156
|
|
|
|
|
|
|
|
|
157
|
6
|
|
|
6
|
|
72
|
use Carp; |
|
|
6
|
|
|
|
|
11
|
|
|
|
6
|
|
|
|
|
379
|
|
|
158
|
6
|
|
|
6
|
|
5879
|
use File::Spec::Functions qw(catfile); |
|
|
6
|
|
|
|
|
5736
|
|
|
|
6
|
|
|
|
|
499
|
|
|
159
|
6
|
|
|
6
|
|
38
|
use Time::Piece; |
|
|
6
|
|
|
|
|
13
|
|
|
|
6
|
|
|
|
|
56
|
|
|
160
|
6
|
|
|
6
|
|
5728
|
use Time::Piece::Range; |
|
|
6
|
|
|
|
|
18995
|
|
|
|
6
|
|
|
|
|
7504
|
|
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Constructor. Stores the index directory in a new object blessed into
# $class; this sub itself only records the path.
#
#   $class - class to bless the new object into
#   $dir   - path of the index directory (must be a true value)
#
# Croaks with "No directory given" when $dir is false.
sub open {
    my $class = shift;
    my $dir   = shift;
    croak "No directory given" unless $dir;
    my %self = (_dir => $dir);
    return bless \%self, $class;
}
|
167
|
|
|
|
|
|
|
|
|
168
|
84
|
|
|
84
|
|
1829
|
# Accessor: the index directory that open() stored in the object.
sub _dir {
    my $self = shift;
    return $self->{_dir};
}
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Parse a query string with a Plucene::QueryParser that is configured
# with a SimpleAnalyzer.
#
#   $query   - the query string to parse
#   $default - handed to the parser as its 'default' option
#              (search() passes 'text')
#
# Returns whatever the parser's parse() method returns.
sub _parsed_query {
    my ($self, $query, $default) = @_;
    my %config = (
        analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
        default  => $default,
    );
    return Plucene::QueryParser->new(\%config)->parse($query);
}
|
178
|
|
|
|
|
|
|
|
|
179
|
18
|
|
|
18
|
|
68
|
# Build a new Plucene::Search::IndexSearcher for this object's index
# directory. A fresh searcher is created on every call.
sub _searcher {
    my $self = shift;
    return Plucene::Search::IndexSearcher->new($self->_dir);
}
|
180
|
|
|
|
|
|
|
|
|
181
|
16
|
|
|
16
|
|
47
|
# Open a new Plucene::Index::Reader on this object's index directory.
# A fresh reader is opened on every call.
sub _reader {
    my $self = shift;
    return Plucene::Index::Reader->open($self->_dir);
}
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Search the index and return the "id" field of every matching document,
# ordered by descending score.
#
#   $sstring - query string; parsed with 'text' as the default field
#
# Returns an empty list when $sstring is false, without opening a
# searcher.
sub search {
    my ($self, $sstring) = @_;
    return () unless $sstring;

    my $searcher = $self->_searcher;
    my @scored;    # [ document, score ] pairs gathered by the collector
    my $collector = Plucene::Search::HitCollector->new(
        collect => sub {
            my (undef, $doc_num, $score) = @_;

            # A hit whose document cannot be fetched is silently skipped.
            my $found = eval { $searcher->doc($doc_num) };
            push @scored, [ $found, $score ] if $found;
        });
    $searcher->search_hc($self->_parsed_query($sstring, 'text'), $collector);

    my @ranked = sort { $b->[1] <=> $a->[1] } @scored;
    return map { $_->[0]->get("id")->string } @ranked;
}
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Search the index, restricted by a date filter on the '_date_' field,
# and return the "id" field of every hit.
#
#   $sstring        - query string; default field is "text"
#   $date1, $date2  - dates in "%Y-%m-%d" form; they are turned into a
#                     Time::Piece::Range whose start/end bound the filter
#
# Returns an empty list when $sstring is false or when nothing matches.
# Note that the query is parsed with a WhitespaceAnalyzer here, whereas
# search() goes through _parsed_query() and its SimpleAnalyzer.
sub search_during {
    my ($self, $sstring, $date1, $date2) = @_;
    return () unless $sstring;

    my $format = "%Y-%m-%d";
    my $range  = Time::Piece::Range->new(
        Time::Piece->strptime($date1, $format),
        Time::Piece->strptime($date2, $format),
    );
    my %filter_args = (
        field => '_date_',
        from  => $range->start,
        to    => $range->end,
    );
    my $filter = Plucene::Search::DateFilter->new(\%filter_args);

    my %parser_args = (
        analyzer => Plucene::Analysis::WhitespaceAnalyzer->new(),
        default  => "text",
    );
    my $parser = Plucene::QueryParser->new(\%parser_args);
    my $query  = $parser->parse($sstring);

    my $hits = $self->_searcher->search($query, $filter);
    return () unless $hits->length;

    my $last = $hits->length - 1;
    my @docs = map { $hits->doc($_) } 0 .. $last;
    return map { $_->get("id")->string } @docs;
}
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Create a Plucene::Index::Writer for the index directory, using a
# SimpleAnalyzer. The third constructor argument is 1 when no "segments"
# file exists in the directory yet and 0 when one does.
sub _writer {
    my $self = shift;

    my $segments = catfile($self->_dir, "segments");
    my $create   = (-e $segments) ? 0 : 1;

    return Plucene::Index::Writer->new(
        $self->_dir,
        Plucene::Analysis::SimpleAnalyzer->new(),
        $create,
    );
}
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Add one or more documents to the index.
#
#   @data - flat list of ($id => \%fields) pairs
#
# For every pair a document is built with:
#   * a Keyword field "id" holding $id;
#   * for a "date" field: Keyword fields "_date_" (the frozen date) and
#     "date" (its ymd form); a value that cannot be parsed as "%Y-%m-%d"
#     is replaced by the current time;
#   * for every other field except "text": an UnStored field of that name;
#   * an UnStored field "text" holding the supplied "text" value (if any)
#     followed by the values of all other non-date fields whose name does
#     not start with "_", each preceded by a space.
#
# The caller's hashrefs are only read, never modified, so the same data
# can safely be passed to add() again.
sub add {
    my ($self, @data) = @_;
    my $writer = $self->_writer;

    while (my ($id, $terms) = splice @data, 0, 2) {
        # Shallow copy: the combined text is accumulated locally rather
        # than written back into the caller's hash. A missing hashref
        # (odd-length argument list) is treated as "no fields".
        my %field = %{ defined $terms ? $terms : {} };
        my $text  = $field{text};

        my $doc = Plucene::Document->new;
        $doc->add(Plucene::Document::Field->Keyword(id => $id));

        foreach my $key (keys %field) {
            # "text" is added once, after the loop, with the other
            # fields' values appended to it.
            next if $key eq 'text';

            if ($key eq 'date') {
                my $date =
                    eval { Time::Piece->strptime($field{date}, "%Y-%m-%d") };

                # Unparseable date: fall back to the current time.
                $date = Time::Piece->new unless defined $date;

                $doc->add(
                    Plucene::Document::Field->Keyword(
                        "_date_", freeze_date($date)));
                $doc->add(
                    Plucene::Document::Field->Keyword("date", $date->ymd));
            }
            else {
                $doc->add(
                    Plucene::Document::Field->UnStored($key => $field{$key}));
                $text .= " " . $field{$key} unless $key =~ /^_/;
            }
        }

        $doc->add(Plucene::Document::Field->UnStored(text => $text));
        $writer->add_document($doc);
    }

    # Drop the writer explicitly before returning (presumably so that its
    # destructor finalizes the index - TODO confirm against
    # Plucene::Index::Writer).
    undef $writer;
}
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# Index a single piece of plain text.
#
#   $id   - stored in the document's Keyword field "id"
#   $data - stored in the document's UnStored field "text"
#
# A new writer is obtained for the call and dropped again at the end.
sub index_document {
    my ($self, $id, $data) = @_;

    my $writer   = $self->_writer;
    my $document = Plucene::Document->new;

    my $id_field = Plucene::Document::Field->Keyword(id => $id);
    $document->add($id_field);

    my $text_field = Plucene::Document::Field->UnStored('text' => $data);
    $document->add($text_field);

    $writer->add_document($document);
    undef $writer;
}
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
# Remove from the index whatever matches the term id:$id, using a freshly
# opened reader which is closed again afterwards.
#
#   $id - the id the document was indexed under
sub delete_document {
    my ($self, $id) = @_;

    my $reader  = $self->_reader;
    my $id_term = Plucene::Index::Term->new({ field => "id", text => $id });

    $reader->delete_term($id_term);
    $reader->close;
}
|
272
|
|
|
|
|
|
|
|
|
273
|
4
|
|
|
4
|
1
|
242917
|
# Obtain a writer for the index and call its optimize() method,
# returning whatever that call returns.
sub optimize {
    my $self   = shift;
    my $writer = $self->_writer;
    return $writer->optimize();
}
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# Report whether a document with the given id is in the index.
#
#   $id - the id to look up
#
# Returns the reader's doc_freq() for the term id:$id, so the result is
# true when at least one document carries that id.
sub indexed {
    my ($self, $id) = @_;

    my $id_term = Plucene::Index::Term->new({ text => $id, field => 'id' });
    my $reader  = $self->_reader;

    return $reader->doc_freq($id_term);
}
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Copyright (C) 2003-2004 Kasei Limited |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=cut |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
1; |