File Coverage

blib/lib/Plucene/Simple.pm

Criterion	Covered	Total	%
statement	117	119	98.3
branch	15	20	75.0
condition			n/a
subroutine	31	31	100.0
pod	8	8	100.0
total	171	178	96.0

line	stmt	bran	sub	pod	time	code
1						package Plucene::Simple;
2
3						=head1 NAME
4
5						Plucene::Simple - An interface to Plucene
6
7						=head1 SYNOPSIS
8
9						use Plucene::Simple;
10
11						# create an index
12						my $plucy = Plucene::Simple->open($index_path);
13
14						# add to the index
15						$plucy->add(
16						$id1 => { $field => $term1 },
17						$id2 => { $field => $term2 },
18						);
19
20						# or ...
21						$plucy->index_document($id => $data);
22
23						# search an existing index
24						my $plucy = Plucene::Simple->open($index_path);
25						my @results = $plucy->search($search_string);
26
27						# optimize the index
28						$plucy->optimize;
29
30						# remove something from the index
31						$plucy->delete_document($id);
32
33						# is something in the index?
34						if ($plucy->indexed($id)) { ... }
35
36						=head1 DESCRIPTION
37
38						This provides a simple interface to L<Plucene>. Plucene is large and
39						multi-featured, and it is expected that users will subclass it, and tie
40						all the pieces together to suit their own needs. Plucene::Simple is,
41						therefore, just one way to use Plucene. It's not expected that it will
42						do exactly what you want, but you can always use it as an example of
43						how to build your own interface.
44
45						=head1 INDEXING
46
47						=head2 open
48
49						You make a new Plucene::Simple object like so:
50
51						my $plucy = Plucene::Simple->open($index_path);
52
53						If this index doesn't exist, then it will be created for you, otherwise you
54						will be adding to an existing one.
55
56						Then you can add your documents to the index:
57
58						=head2 add
59
60						Every document must be indexed with a unique key (which will be returned
61						from searches).
62
63						A document can be made up of many fields, which can be added as
64						a hashref:
65
66						$plucy->add($key, \%data);
67
68						$plucy->add(
69						chap1 => {
70						title => "Moby-Dick",
71						author => "Herman Melville",
72						text => "Call me Ishmael ..."
73						},
74						chap2 => {
75						title => "Boo-Hoo",
76						author => "Lydia Lee",
77						text => "...",
78						}
79						);
80
81						=head2 index_document
82
83						Alternatively, if you do not want to index lots of metadata, but rather
84						just simple text, you can use the index_document() method.
85
86						$plucy->index_document($key, $data);
87						$plucy->index_document(chap1 => 'Call me Ishmael ...');
88
89						=head2 delete_document
90
91						$plucy->delete_document($id);
92
93						=head2 optimize
94
95						$plucy->optimize;
96
97						Plucene is set up to perform insertions quickly. After a bunch of inserts
98						it is good to optimize() the index for better search speed.
99
100						=head1 SEARCHING
101
102						=head2 search
103
104						my @ids = $plucy->search('ishmael');
105						# ("chap1", ...)
106
107						This will return the IDs of each document matching the search term.
108
109						If you have indexed your documents with fields, you can also search with
110						the field name as a prefix:
111
112						my @ids = $plucy->search("author:lee");
113						# ("chap2" ...)
114
115						my @results = $plucy->search($search_string);
116
117						This will search the index with the given query, and return a list of
118						document ids.
119
120						Searches can be much more powerful than this - see L<Plucene> for
121						further details.
122
123						=head2 search_during
124
125						my @results = $plucy->search_during($search_string, $date1, $date2);
126						my @results = $plucy->search_during("to:Fred", "2001-01-01" => "2003-12-31");
127
128						If your documents were given an ISO 'date' field when indexing,
129						search_during() will restrict the results to all documents between the
130						specified dates. Any document without a 'date' field will be ignored.
131
132						=head2 indexed
133
134						if ($plucy->indexed($id)) { ... }
135
136						This returns true if there is a document with the given ID in the index.
137
138						=cut
139
140	6		6		3777	use strict;
	6				10
	6				172
141	6		6		29	use warnings;
	6				7
	6				217
142
143						our $VERSION = '1.04';
144
145	6		6		4814	use Plucene::Analysis::SimpleAnalyzer;
	6				71985
	6				168
146	6		6		4715	use Plucene::Analysis::WhitespaceAnalyzer;
	6				7115
	6				149
147	6		6		4501	use Plucene::Document;
	6				2052
	6				200
148	6		6		5251	use Plucene::Document::DateSerializer;
	6				112355
	6				405
149	6		6		5681	use Plucene::Document::Field;
	6				4747
	6				57
150	6		6		6009	use Plucene::Index::Reader;
	6				640608
	6				191
151	6		6		5474	use Plucene::Index::Writer;
	6				218772
	6				201
152	6		6		5181	use Plucene::QueryParser;
	6				93768
	6				65
153	6		6		6325	use Plucene::Search::DateFilter;
	6				10749
	6				212
154	6		6		4993	use Plucene::Search::HitCollector;
	6				1167
	6				168
155	6		6		5221	use Plucene::Search::IndexSearcher;
	6				42991
	6				184
156
157	6		6		72	use Carp;
	6				11
	6				379
158	6		6		5879	use File::Spec::Functions qw(catfile);
	6				5736
	6				499
159	6		6		38	use Time::Piece;
	6				13
	6				56
160	6		6		5728	use Time::Piece::Range;
	6				18995
	6				7504
161
162						sub open {
163	21		21	1	185339	my ($class, $dir) = @_;
164	21	50			97	$dir or croak "No directory given";
165	21				132	bless { _dir => $dir }, $class;
166						}
167
168	84		84		1829	sub _dir { shift->{_dir} }
169
170						sub _parsed_query {
171	14		14		37	my ($self, $query, $default) = @_;
172	14				120	my $parser = Plucene::QueryParser->new({
173						analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
174						default => $default
175						});
176	14				556	$parser->parse($query);
177						}
178
179	18		18		68	sub _searcher { Plucene::Search::IndexSearcher->new(shift->_dir) }
180
181	16		16		47	sub _reader { Plucene::Index::Reader->open(shift->_dir) }
182
183						sub search {
184	15		15	1	78768	my ($self, $sstring) = @_;
185	15	100			57	return () unless $sstring;
186	14				26	my @docs;
187	14				54	my $searcher = $self->_searcher;
188						my $hc = Plucene::Search::HitCollector->new(
189						collect => sub {
190	17		17		95995	my ($self, $doc, $score) = @_;
191	17				36	my $res = eval { $searcher->doc($doc) };
	17				63
192	17	50			3251	push @docs, [ $res, $score ] if $res;
193	14				26684	});
194	14				164	$searcher->search_hc($self->_parsed_query($sstring, 'text'), $hc);
195	14				17061	return map $_->[0]->get("id")->string, sort { $b->[1] <=> $a->[1] } @docs;
	8				63
196						}
197
198						sub search_during {
199	4		4	1	175	my ($self, $sstring, $date1, $date2) = @_;
200	4	50			18	return () unless $sstring;
201	4				47	my $range = Time::Piece::Range->new(
202						Time::Piece->strptime($date1, "%Y-%m-%d"),
203						Time::Piece->strptime($date2, "%Y-%m-%d"));
204	4				1026	my $filter = Plucene::Search::DateFilter->new({
205						field => '_date_',
206						from => $range->start,
207						to => $range->end,
208						});
209	4				747	my $qp = Plucene::QueryParser->new({
210						analyzer => Plucene::Analysis::WhitespaceAnalyzer->new(),
211						default => "text"
212						});
213	4				207	my $query = $qp->parse($sstring);
214	4				8100	my $hits = $self->_searcher->search($query, $filter);
215	4	100			95240	return () unless $hits->length;
216	3				49	my @docs = map $hits->doc($_), 0 .. ($hits->length - 1);
217	3				2325	return map $_->get("id")->string, @docs;
218						}
219
220						sub _writer {
221	25		25		122	my $self = shift;
222	25	100			111	return Plucene::Index::Writer->new(
223						$self->_dir,
224						Plucene::Analysis::SimpleAnalyzer->new(),
225						-e catfile($self->_dir, "segments") ? 0 : 1
226						);
227						}
228
229						sub add {
230	13		13	1	86388	my ($self, @data) = @_;
231	13				117	my $writer = $self->_writer;
232	13				22690	while (my ($id, $terms) = splice @data, 0, 2) {
233	38				134078	my $doc = Plucene::Document->new;
234	38				578	$doc->add(Plucene::Document::Field->Keyword(id => $id));
235	38				1344	foreach my $key (keys %$terms) {
236	64	100			318	if ($key eq 'text') {
		100
237	2				5	next; # gets added at the end anyway
238						} elsif ($key eq "date") {
239	5				7	my $date = eval { Time::Piece->strptime($terms->{date}, "%Y-%m-%d") };
	5				77
240	5	50			186	do { $date = Time::Piece->new; $terms->{date} = $date->ymd; } if $@;
	0				0
	0				0
241	5				25	$doc->add(
242						Plucene::Document::Field->Keyword("_date_", freeze_date($date)));
243	5				578	$doc->add(Plucene::Document::Field->Keyword("date", $date->ymd));
244						} else {
245	57				237	$doc->add(Plucene::Document::Field->UnStored($key => $terms->{$key}));
246	57	50			1690	$terms->{text} .= " " . $terms->{$key} unless $key =~ /^_/;
247						}
248						}
249	38				297	$doc->add(Plucene::Document::Field->UnStored(text => $terms->{text}));
250	38				984	$writer->add_document($doc);
251						}
252	13				132731	undef $writer;
253						}
254
255						sub index_document {
256	8		8	1	483825	my ($self, $id, $data) = @_;
257	8				45	my $writer = $self->_writer;
258	8				7646	my $doc = Plucene::Document->new;
259	8				171	$doc->add(Plucene::Document::Field->Keyword(id => $id));
260	8				313	$doc->add(Plucene::Document::Field->UnStored('text' => $data));
261	8				220	$writer->add_document($doc);
262	8				301647	undef $writer;
263						}
264
265						sub delete_document {
266	14		14	1	172512	my ($self, $id) = @_;
267	14				57	my $reader = $self->_reader;
268	14				24213	$reader->delete_term(
269						Plucene::Index::Term->new({ field => "id", text => $id }));
270	14				27131	$reader->close;
271						}
272
273	4		4	1	242917	sub optimize { shift->_writer->optimize() }
274
275						sub indexed {
276	2		2	1	3083	my ($self, $id) = @_;
277	2				18	my $term = Plucene::Index::Term->new({ field => 'id', text => $id });
278	2				24	return $self->_reader->doc_freq($term);
279						}
280
281						=head1 COPYRIGHT
282
283						Copyright (C) 2003-2004 Kasei Limited
284
285						=cut
286
287						1;