| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Plucene::SearchEngine::Query; |
|
2
|
1
|
|
|
1
|
|
68079
|
use 5.006; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
45
|
|
|
3
|
1
|
|
|
1
|
|
7
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
42
|
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
37
|
|
|
5
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
87
|
|
|
6
|
1
|
|
|
1
|
|
8
|
use UNIVERSAL::require; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
33
|
|
|
7
|
1
|
|
|
1
|
|
960
|
use Lucene::QueryParser; |
|
|
1
|
|
|
|
|
15393
|
|
|
|
1
|
|
|
|
|
77
|
|
|
8
|
1
|
|
|
1
|
|
1005
|
use Plucene::Search::IndexSearcher; |
|
|
1
|
|
|
|
|
8990
|
|
|
|
1
|
|
|
|
|
20
|
|
|
9
|
1
|
|
|
1
|
|
39
|
use Plucene::Search::HitCollector; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
9
|
|
|
10
|
1
|
|
|
1
|
|
988
|
use Plucene::QueryParser; |
|
|
1
|
|
|
|
|
2872
|
|
|
|
1
|
|
|
|
|
11
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Plucene::SearchEngine::Query - A higher level abstraction for Plucene |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Plucene::SearchEngine::Query; |
|
21
|
|
|
|
|
|
|
my $query = Plucene::SearchEngine::Query->new( |
|
22
|
|
|
|
|
|
|
dir => "/var/plucene/foo" |
|
23
|
|
|
|
|
|
|
); |
|
24
|
|
|
|
|
|
|
my @docs = $query->search("some stuff"); |
|
25
|
|
|
|
|
|
|
for my $id (@docs) { |
|
26
|
|
|
|
|
|
|
my $snippeter = $query->snippeter( retrieve_text_for_doc($id) ); |
|
27
|
|
|
|
|
|
|
print "Doc $id \n"; |
|
28
|
|
|
|
|
|
|
print "" . $snippeter->as_html . " "; |
|
29
|
|
|
|
|
|
|
} |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Plucene is an extremely powerful library for building search engines, but |
|
34
|
|
|
|
|
|
|
each time I build a search engine with it, I always find myself doing the |
|
35
|
|
|
|
|
|
|
same things. This module provides an abstraction layer around Plucene - |
|
36
|
|
|
|
|
|
|
not quite as abstracted as L<Plucene::Simple>, but more abstracted than |
|
37
|
|
|
|
|
|
|
Plucene itself. |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 METHODS |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=cut |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head2 new |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
Plucene::SearchEngine::Query->new( |
|
46
|
|
|
|
|
|
|
dir => "/var/plucene/foo", |
|
47
|
|
|
|
|
|
|
analyzer => "Plucene::Analysis::SimpleAnalyzer", |
|
48
|
|
|
|
|
|
|
default => "text", |
|
49
|
|
|
|
|
|
|
expand_docs => sub { shift; @_ }, |
|
50
|
|
|
|
|
|
|
snippeter => "Text::Context", |
|
51
|
|
|
|
|
|
|
) |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
This prepares for searching the index. The only mandatory argument is |
|
54
|
|
|
|
|
|
|
C<dir>, which tells Plucene where the index is to be found. The |
|
55
|
|
|
|
|
|
|
C<expand_docs> and C<snippeter> arguments are explained below; |
|
56
|
|
|
|
|
|
|
C<analyzer> specifies which Plucene analysis class to use when tokenising |
|
57
|
|
|
|
|
|
|
the search terms, and the C<default> argument denotes the default field |
|
58
|
|
|
|
|
|
|
for unqualified query terms. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=cut |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Constructor: build a query object for the Plucene index stored in a
# directory.
#
# Named arguments:
#   dir         - mandatory; must name an existing directory.
#   analyzer    - analysis class name
#                 (default "Plucene::Analysis::SimpleAnalyzer").
#   default     - default field for unqualified terms (default "text").
#   expand_docs - code ref invoked by search() as ($self, @docs)
#                 (default \&expand_docs).
#   snippeter   - snippet class name (default "Text::Context").
#
# Returns the blessed object. Croaks when dir is missing or is not a
# directory, or when the analyzer or snippeter class cannot be loaded.
sub new {
    my ($class, %args) = @_;
    croak("No directory given!") unless $args{dir};
    croak("$args{dir} isn't a directory") unless -d $args{dir};

    # Caller-supplied arguments come last so they override the defaults.
    my $self = bless {
        analyzer    => "Plucene::Analysis::SimpleAnalyzer",
        default     => "text",
        expand_docs => \&expand_docs,
        snippeter   => "Text::Context",
        %args,
    }, $class;

    # Load both pluggable classes up front.  Failures are reported with
    # croak, like the argument checks above, and include the reason that
    # the failed require left in $@.
    $self->{analyzer}->require
        or croak("Couldn't require analyzer: $self->{analyzer}: $@");
    $self->{snippeter}->require
        or croak("Couldn't require snippet class: $self->{snippeter}: $@");

    return $self;
}
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Lazily create the index searcher and the query parser, caching both on
# the object so that repeated searches reuse them.  Returns the parser.
sub prepare_search {
    my $self = shift;

    unless ($self->{searcher}) {
        $self->{searcher} =
            Plucene::Search::IndexSearcher->new( $self->{dir} );
    }

    unless ($self->{parser}) {
        # The analyzer is stored as a class name; instantiate it here.
        $self->{parser} = Plucene::QueryParser->new({
            analyzer => $self->{analyzer}->new,
            default  => $self->{default}
        });
    }

    return $self->{parser};
}
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head2 search |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
@docs = $queryer->search("foo bar"); |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Returns a set of documents matching the search query. The default |
|
94
|
|
|
|
|
|
|
way of "expanding" these search results is to sort them by score, |
|
95
|
|
|
|
|
|
|
and then return the value of the C<id> field from the Plucene index. |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Those more familiar with Plucene can have alternative data structures |
|
98
|
|
|
|
|
|
|
returned by providing a different C<expand_docs> parameter to the |
|
99
|
|
|
|
|
|
|
constructor. For instance, the default doesn't actually B<return> the |
|
100
|
|
|
|
|
|
|
score, so if you want to get at it, you can say: |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
expand_docs => sub { my ($self, @docs) = @_; return @docs } |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
This will return a list of array references; the first element in each |
|
105
|
|
|
|
|
|
|
array ref will be the C<Plucene::Document> object, and the second will |
|
106
|
|
|
|
|
|
|
be the score. |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Or, if you're dealing with C<Class::DBI>-derived classes, you might |
|
109
|
|
|
|
|
|
|
like to try: |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
expand_docs => sub { my ($self, @docs) = @_; |
|
112
|
|
|
|
|
|
|
sort { $b->date <=> $a->date } # Sort by date descending |
|
113
|
|
|
|
|
|
|
map { My::Class->retrieve($_->[0]->get("id")->string) } |
|
114
|
|
|
|
|
|
|
@docs; |
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
The choice is yours. |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Run a query against the index.
#
#   $query - the query string; stored as {orig_query} so that snippeter()
#            can re-parse it later, and parsed into {query}.
#
# Every hit whose document can be fetched is collected as [$doc, $score];
# the collected list is then handed to the {expand_docs} callback as
# ($self, @docs), and whatever that callback returns is returned here.
# Exceptions raised while fetching a document propagate to the caller.
sub search {
    my ($self, $query) = @_;
    $self->{orig_query} = $query;
    $self->prepare_search;
    $self->{query} = $self->{parser}->parse($query);

    my @docs;
    my $searcher = $self->{searcher};
    my $hc = Plucene::Search::HitCollector->new(
        collect => sub {
            # The first argument is the collector itself; it gets its own
            # name so that it does not shadow the outer $self.
            my ($collector, $doc, $score) = @_;
            # No eval-and-rethrow here: a failure in doc() simply
            # propagates, which cannot be lost to a clobbered $@.
            my $res = $searcher->doc($doc);
            push @docs, [$res, $score] if $res;
        });
    $searcher->search_hc($self->{query}, $hc);
    return $self->{expand_docs}->($self, @docs);
}
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
# Default result expander: order the [$doc, $score] pairs by descending
# score and return the string value of each document's "id" field.
sub expand_docs {
    my ($self, @docs) = @_;
    my @by_score = sort { $b->[1] <=> $a->[1] } @docs;
    return map { $_->[0]->get("id")->string } @by_score;
}
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Flatten a parsed query ($ast, an array ref of node hashes) into the list
# of positive search terms.
#
# Nodes whose type is "PROHIBITED" are dropped, as are nodes that name a
# field other than the object's default field.  Of the remaining nodes,
# a "SUBQUERY" is flattened recursively, a "PHRASE" is split on
# whitespace into its words, and anything else contributes its term.
sub _unlucene {
    my ($self, $ast) = @_;
    my @terms;

    for my $node (@{$ast}) {
        next if $node->{type} eq "PROHIBITED";
        next if exists($node->{field})
            and $node->{field} ne $self->{default};

        if ($node->{query} eq "SUBQUERY") {
            push @terms, $self->_unlucene($node->{subquery});
        }
        elsif ($node->{query} ne "PHRASE") {
            push @terms, $node->{term};
        }
        else {
            push @terms, split /\s+/, $node->{term};
        }
    }

    return @terms;
}
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 snippeter |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
$self->snippeter($doc_text) |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Given the searchable text of a document, returns a snippeter class |
|
162
|
|
|
|
|
|
|
(C<Text::Context>, C<Text::Context::EitherSide>, etc.) object primed with |
|
163
|
|
|
|
|
|
|
the positive parts of the query. |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
When you call the rendering method (say, C<as_html>) on this object, |
|
166
|
|
|
|
|
|
|
you'll get the text snippet highlighting where the search terms appear |
|
167
|
|
|
|
|
|
|
in the document. |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Build a snippet object for one document.
#
#   $body - the searchable text of the document.
#
# Returns a new instance of the configured {snippeter} class, constructed
# with $body followed by the positive terms of the most recent query.
# Croaks if search() has not been called on this object yet.
sub snippeter {
    my ($self, $body) = @_;
    # Test definedness rather than truth, so that a search for "0" still
    # counts as a search having been done.
    croak "It doesn't look like you've actually done a search yet"
        unless defined $self->{orig_query};
    # We can't actually use the original parser, because it may have
    # tokenized us funny. (Porter stemming, etc.)
    my @terms = $self->_unlucene(parse_query($self->{orig_query}));
    return $self->{snippeter}->new($body, @terms);
}
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
1; |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 AUTHOR |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Simon Cozens, C |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
L, L, L. |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=cut |