File Coverage

blib/lib/Plucene/Simple.pm
Criterion Covered Total %
statement 117 119 98.3
branch 15 20 75.0
condition n/a
subroutine 31 31 100.0
pod 8 8 100.0
total 171 178 96.0


line stmt bran cond sub pod time code
1             package Plucene::Simple;
2              
3             =head1 NAME
4              
5             Plucene::Simple - An interface to Plucene
6              
7             =head1 SYNOPSIS
8              
9             use Plucene::Simple;
10              
11             # create an index
12             my $plucy = Plucene::Simple->open($index_path);
13              
14             # add to the index
15             $plucy->add(
16             $id1 => { $field => $term1 },
17             $id2 => { $field => $term2 },
18             );
19              
20             # or ...
21             $plucy->index_document($id => $data);
22              
23             # search an existing index
24             my $plucy = Plucene::Simple->open($index_path);
25             my @results = $plucy->search($search_string);
26              
27             # optimize the index
28             $plucy->optimize;
29              
30             # remove something from the index
31             $plucy->delete_document($id);
32              
33             # is something in the index?
34             if ($plucy->indexed($id)) { ... }
35            
36             =head1 DESCRIPTION
37              
38             This provides a simple interface to L<Plucene>. Plucene is large and
39             multi-featured, and it is expected that users will subclass it, and tie
40             all the pieces together to suit their own needs. Plucene::Simple is,
41             therefore, just one way to use Plucene. It's not expected that it will
42             do exactly what *you* want, but you can always use it as an example of
43             how to build your own interface.
44              
45             =head1 INDEXING
46              
47             =head2 open
48              
49             You make a new Plucene::Simple object like so:
50              
51             my $plucy = Plucene::Simple->open($index_path);
52              
53             If this index doesn't exist, then it will be created for you, otherwise you
54             will be adding to an existing one.
55            
56             Then you can add your documents to the index:
57              
58             =head2 add
59              
60             Every document must be indexed with a unique key (which will be returned
61             from searches).
62              
63             A document can be made up of many fields, which can be added as
64             a hashref:
65              
66             $plucy->add($key, \%data);
67              
68             $plucy->add(
69             chap1 => {
70             title => "Moby-Dick",
71             author => "Herman Melville",
72             text => "Call me Ishmael ..."
73             },
74             chap2 => {
75             title => "Boo-Hoo",
76             author => "Lydia Lee",
77             text => "...",
78             }
79             );
80              
81             =head2 index_document
82              
83             Alternatively, if you do not want to index lots of metadata, but rather
84             just simple text, you can use the index_document() method.
85              
86             $plucy->index_document($key, $data);
87             $plucy->index_document(chap1 => 'Call me Ishmael ...');
88              
89             =head2 delete_document
90              
91             $plucy->delete_document($id);
92              
93             =head2 optimize
94              
95             $plucy->optimize;
96              
97             Plucene is set up to perform insertions quickly. After a bunch of inserts
98             it is good to optimize() the index for better search speed.
99              
100             =head1 SEARCHING
101              
102             =head2 search
103              
104             my @ids = $plucy->search('ishmael');
105             # ("chap1", ...)
106              
107             This will return the IDs of each document matching the search term.
108              
109             If you have indexed your documents with fields, you can also search with
110             the field name as a prefix:
111              
112             my @ids = $plucy->search("author:lee");
113             # ("chap2" ...)
114              
115             my @results = $plucy->search($search_string);
116              
117             This will search the index with the given query, and return a list of
118             document ids.
119              
120             Searches can be much more powerful than this - see L<Plucene::QueryParser> for
121             further details.
122              
123             =head2 search_during
124              
125             my @results = $plucy->search_during($search_string, $date1, $date2);
126             my @results = $plucy->search_during("to:Fred", "2001-01-01" => "2003-12-31");
127              
128             If your documents were given an ISO 'date' field when indexing,
129             search_during() will restrict the results to all documents between the
130             specified dates. Any document without a 'date' field will be ignored.
131              
132             =head2 indexed
133              
134             if ($plucy->indexed($id)) { ... }
135              
136             This returns true if there is a document with the given ID in the index.
137              
138             =cut
139              
140 6     6   3777 use strict;
  6         10  
  6         172  
141 6     6   29 use warnings;
  6         7  
  6         217  
142              
143             our $VERSION = '1.04';
144              
145 6     6   4814 use Plucene::Analysis::SimpleAnalyzer;
  6         71985  
  6         168  
146 6     6   4715 use Plucene::Analysis::WhitespaceAnalyzer;
  6         7115  
  6         149  
147 6     6   4501 use Plucene::Document;
  6         2052  
  6         200  
148 6     6   5251 use Plucene::Document::DateSerializer;
  6         112355  
  6         405  
149 6     6   5681 use Plucene::Document::Field;
  6         4747  
  6         57  
150 6     6   6009 use Plucene::Index::Reader;
  6         640608  
  6         191  
151 6     6   5474 use Plucene::Index::Writer;
  6         218772  
  6         201  
152 6     6   5181 use Plucene::QueryParser;
  6         93768  
  6         65  
153 6     6   6325 use Plucene::Search::DateFilter;
  6         10749  
  6         212  
154 6     6   4993 use Plucene::Search::HitCollector;
  6         1167  
  6         168  
155 6     6   5221 use Plucene::Search::IndexSearcher;
  6         42991  
  6         184  
156              
157 6     6   72 use Carp;
  6         11  
  6         379  
158 6     6   5879 use File::Spec::Functions qw(catfile);
  6         5736  
  6         499  
159 6     6   38 use Time::Piece;
  6         13  
  6         56  
160 6     6   5728 use Time::Piece::Range;
  6         18995  
  6         7504  
161              
162             sub open {
163 21     21 1 185339 my ($class, $dir) = @_;
164 21 50       97 $dir or croak "No directory given";
165 21         132 bless { _dir => $dir }, $class;
166             }
167              
168 84     84   1829 sub _dir { shift->{_dir} }
169              
170             sub _parsed_query {
171 14     14   37 my ($self, $query, $default) = @_;
172 14         120 my $parser = Plucene::QueryParser->new({
173             analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
174             default => $default
175             });
176 14         556 $parser->parse($query);
177             }
178              
179 18     18   68 sub _searcher { Plucene::Search::IndexSearcher->new(shift->_dir) }
180              
181 16     16   47 sub _reader { Plucene::Index::Reader->open(shift->_dir) }
182              
183             sub search {
184 15     15 1 78768 my ($self, $sstring) = @_;
185 15 100       57 return () unless $sstring;
186 14         26 my @docs;
187 14         54 my $searcher = $self->_searcher;
188             my $hc = Plucene::Search::HitCollector->new(
189             collect => sub {
190 17     17   95995 my ($self, $doc, $score) = @_;
191 17         36 my $res = eval { $searcher->doc($doc) };
  17         63  
192 17 50       3251 push @docs, [ $res, $score ] if $res;
193 14         26684 });
194 14         164 $searcher->search_hc($self->_parsed_query($sstring, 'text'), $hc);
195 14         17061 return map $_->[0]->get("id")->string, sort { $b->[1] <=> $a->[1] } @docs;
  8         63  
196             }
197              
198             sub search_during {
199 4     4 1 175 my ($self, $sstring, $date1, $date2) = @_;
200 4 50       18 return () unless $sstring;
201 4         47 my $range = Time::Piece::Range->new(
202             Time::Piece->strptime($date1, "%Y-%m-%d"),
203             Time::Piece->strptime($date2, "%Y-%m-%d"));
204 4         1026 my $filter = Plucene::Search::DateFilter->new({
205             field => '_date_',
206             from => $range->start,
207             to => $range->end,
208             });
209 4         747 my $qp = Plucene::QueryParser->new({
210             analyzer => Plucene::Analysis::WhitespaceAnalyzer->new(),
211             default => "text"
212             });
213 4         207 my $query = $qp->parse($sstring);
214 4         8100 my $hits = $self->_searcher->search($query, $filter);
215 4 100       95240 return () unless $hits->length;
216 3         49 my @docs = map $hits->doc($_), 0 .. ($hits->length - 1);
217 3         2325 return map $_->get("id")->string, @docs;
218             }
219              
220             sub _writer {
221 25     25   122 my $self = shift;
222 25 100       111 return Plucene::Index::Writer->new(
223             $self->_dir,
224             Plucene::Analysis::SimpleAnalyzer->new(),
225             -e catfile($self->_dir, "segments") ? 0 : 1
226             );
227             }
228              
229             sub add {
230 13     13 1 86388 my ($self, @data) = @_;
231 13         117 my $writer = $self->_writer;
232 13         22690 while (my ($id, $terms) = splice @data, 0, 2) {
233 38         134078 my $doc = Plucene::Document->new;
234 38         578 $doc->add(Plucene::Document::Field->Keyword(id => $id));
235 38         1344 foreach my $key (keys %$terms) {
236 64 100       318 if ($key eq 'text') {
    100          
237 2         5 next; # gets added at the end anyway
238             } elsif ($key eq "date") {
239 5         7 my $date = eval { Time::Piece->strptime($terms->{date}, "%Y-%m-%d") };
  5         77  
240 5 50       186 do { $date = Time::Piece->new; $terms->{date} = $date->ymd; } if $@;
  0         0  
  0         0  
241 5         25 $doc->add(
242             Plucene::Document::Field->Keyword("_date_", freeze_date($date)));
243 5         578 $doc->add(Plucene::Document::Field->Keyword("date", $date->ymd));
244             } else {
245 57         237 $doc->add(Plucene::Document::Field->UnStored($key => $terms->{$key}));
246 57 50       1690 $terms->{text} .= " " . $terms->{$key} unless $key =~ /^_/;
247             }
248             }
249 38         297 $doc->add(Plucene::Document::Field->UnStored(text => $terms->{text}));
250 38         984 $writer->add_document($doc);
251             }
252 13         132731 undef $writer;
253             }
254              
255             sub index_document {
256 8     8 1 483825 my ($self, $id, $data) = @_;
257 8         45 my $writer = $self->_writer;
258 8         7646 my $doc = Plucene::Document->new;
259 8         171 $doc->add(Plucene::Document::Field->Keyword(id => $id));
260 8         313 $doc->add(Plucene::Document::Field->UnStored('text' => $data));
261 8         220 $writer->add_document($doc);
262 8         301647 undef $writer;
263             }
264              
265             sub delete_document {
266 14     14 1 172512 my ($self, $id) = @_;
267 14         57 my $reader = $self->_reader;
268 14         24213 $reader->delete_term(
269             Plucene::Index::Term->new({ field => "id", text => $id }));
270 14         27131 $reader->close;
271             }
272              
273 4     4 1 242917 sub optimize { shift->_writer->optimize() }
274              
275             sub indexed {
276 2     2 1 3083 my ($self, $id) = @_;
277 2         18 my $term = Plucene::Index::Term->new({ field => 'id', text => $id });
278 2         24 return $self->_reader->doc_freq($term);
279             }
280              
281             =head1 COPYRIGHT
282              
283             Copyright (C) 2003-2004 Kasei Limited
284              
285             =cut
286              
287             1;