File Coverage

blib/lib/Wiki/Toolkit/Search/Base.pm

Criterion	Covered	Total	%
statement	9	55	16.3
branch	0	12	0.0
condition	0	3	0.0
subroutine	3	20	15.0
pod	10	10	100.0
total	22	100	22.0

line	stmt	bran	cond	sub	pod	time	code
1							package Wiki::Toolkit::Search::Base;
2
3	1			1		635	use strict;
	1					2
	1					24
4	1			1		3	use Carp "croak";
	1					1
	1					43
5
6	1			1		3	use vars qw( @ISA $VERSION );
	1					15
	1					565
7
# Die on behalf of an abstract method that a subclass has not overridden.
#
# Intended to be called as $invocant->_abstract from inside the stub method.
# The message names the stub (read from the call stack) and the class of the
# invocant.  The invocant may be an object or a plain class name.
#
# Croaks always; never returns.
sub _abstract {
    my $invocant = shift;

    # Frame 1 is the sub that called us; element 3 is its qualified name.
    my $who = (caller(1))[3];
    $who = "(unknown method)" unless defined $who;

    # ref() yields "" for a plain class name, so fall back to the invocant
    # itself when the stub was invoked as a class method.
    my $class = ref($invocant) || $invocant;
    $class = "(unknown)" unless defined $class and length $class;

    croak "$who is an abstract method which the $class class has not provided";
}
13
14							$VERSION = 0.01;
15
16							=head1 NAME
17
18							Wiki::Toolkit::Search::Base - Base class for Wiki::Toolkit search plugins.
19
20							=head1 SYNOPSIS
21
22							my $search = Wiki::Toolkit::Search::XXX->new( @args );
23							my %wombat_nodes = $search->search_nodes("wombat");
24
25							This class details the methods that need to be overridden by search plugins.
26
27							=cut
28
29							=head1 METHODS
30
31							=head2 C<new>
32
33							my $search = Wiki::Toolkit::Search::XXX->new( @args );
34
35							Creates a new searcher. By default the arguments are just passed to
36							C<_init>, so you may wish to override that instead.
37
38							=cut
39
# Constructor.  Blesses an empty hash into the invoking class, then hands
# every remaining argument to _init; whatever _init returns is the result.
sub new {
    my $class = shift;
    my $self  = bless {}, $class;
    return $self->_init(@_);
}
46
# Default initialiser: treats the arguments as key/value pairs, copies each
# pair into the object hash, and returns the object.
sub _init {
    my $self = shift;
    my %args = @_;

    $self->{$_} = $args{$_} for keys %args;

    return $self;
}
52
53							=head2 C<search_nodes>
54
55							# Find all the nodes which contain the word 'expert'.
56							my %results = $search->search_nodes('expert');
57
58							Returns a (possibly empty) hash whose keys are the node names and
59							whose values are the scores in some kind of relevance-scoring system I
60							haven't entirely come up with yet. For OR searches, this could
61							initially be the number of terms that appear in the node, perhaps.
62
63							Defaults to AND searches (if $and_or is not supplied, or is anything
64							other than C<OR> or C<or>).
65
66							Searches are case-insensitive.
67
68							=cut
69
# Search for nodes matching a term string.
#
#   $termstr - the raw search string; split into terms by $self->analyze
#   $and_or  - optional; "or" in any letter case selects an OR search,
#              anything else (including undef) selects AND
#
# Returns whatever $self->_do_search returns when given the normalised mode
# ("and" or "or") and a reference to the array of terms.
sub search_nodes {
    my ($self, $termstr, $and_or) = @_;

    # Test definedness before lower-casing: lc(undef) is the empty string, so
    # lower-casing first would hide the undef and warn under "use warnings".
    $and_or = ( defined $and_or and lc($and_or) eq "or" ) ? "or" : "and";

    # Extract individual search terms.
    my @terms = $self->analyze($termstr);

    return $self->_do_search($and_or, \@terms);
}
83
# Backend hook that performs the search itself.  Not implemented here: the
# call is passed straight to _abstract on the invocant.
sub _do_search {
    my $self = shift;
    return $self->_abstract;
}
85
86							=head2 C<analyze>
87
88							@terms = $self->analyze($string)
89
90							Splits a string into a set of terms for indexing and searching. Typically
91							this is done case-insensitively, splitting at word boundaries, and extracting
92							words that contain at least 2 word characters.
93
94							=cut
95
# Break a string into search/index terms.
#
# The string is lower-cased and split at word boundaries.  Pieces that are a
# single character long, or that consist only of non-word characters, are
# thrown away.  Returns the surviving pieces in their original order.
sub analyze {
    my ($self, $string) = @_;

    my @terms;
    for my $piece ( split( /\b/, lc($string) ) ) {
        next unless length($piece) > 1;    # ignore single characters
        next if $piece =~ /^\W*$/;         # ignore pure punctuation/space
        push @terms, $piece;
    }

    return @terms;
}
105
106							=head2 C<fuzzy_title_match>
107
108							$wiki->write_node( "King's Cross St Pancras", "A station." );
109							my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" );
110
111							Returns a (possibly empty) hash whose keys are the node names and
112							whose values are the scores in some kind of relevance-scoring system I
113							haven't entirely come up with yet.
114
115							Note that even if an exact match is found, any other similar enough
116							matches will also be returned. However, any exact match is guaranteed
117							to have the highest relevance score.
118
119							The matching is done against "canonicalised" forms of the search
120							string and the node titles in the database: stripping vowels, repeated
121							letters and non-word characters, and lowercasing.
122
123							=cut
124
# Fuzzy-match a string against node titles.
#
# Computes the canonical form of the string with canonicalise_title and
# passes both the original string and that canonical form to _fuzzy_match,
# whose result is returned.
sub fuzzy_title_match {
    my $self   = shift;
    my $string = shift;

    my $squashed = $self->canonicalise_title($string);

    return $self->_fuzzy_match( $string, $squashed );
}
130
# Backend hook that performs the fuzzy title lookup.  Not implemented here:
# the call is passed straight to _abstract on the invocant.
sub _fuzzy_match {
    my $self = shift;
    return $self->_abstract;
}
132
133							=head2 C<index_node>
134
135							$search->index_node( $node, $content, $metadata );
136
137							Indexes or reindexes the given node in the search engine indexes.
138							You must supply both the node name and its content, but metadata is
139							optional.
140
141							If you do supply metadata, it will be used if and only if your chosen
142							search backend supports metadata indexing (see
143							C<supports_metadata_indexing>). It should be a reference to a hash
144							where the keys are the names of the metadata fields and the values are
145							either scalars or references to arrays of scalars. For example:
146
147							$search->index_node( "Calthorpe Arms", "Nice pub in Bloomsbury.",
148							{ category => [ "Pubs", "Bloomsbury" ],
149							postcode => "WC1X 8JR" } );
150
151							=cut
152
# Index (or reindex) a node.
#
#   $node    - node name; must be defined and non-empty (the name "0" is valid)
#   $content - node content; must be defined
#
# The terms produced by analyze("$content $node") are handed to _index_node,
# then the canonicalised title is handed to _index_fuzzy.  Returns whatever
# _index_fuzzy returns.
#
# Croaks if the node name or the content is missing.
sub index_node {
    my ($self, $node, $content) = @_;

    # Test definedness and length rather than truth, so "0" is accepted.
    croak "Must supply a node name"
        unless defined $node and length $node;
    croak "Must supply node content" unless defined $content;

    # Index the individual words in the node content and title.
    my @keys = $self->analyze("$content $node");
    $self->_index_node( $node, $content, \@keys );

    return $self->_index_fuzzy( $node, $self->canonicalise_title($node) );
}
163
# Backend hook that stores a node's terms in the index.  Not implemented
# here: the call is passed straight to _abstract on the invocant.
sub _index_node {
    my $self = shift;
    return $self->_abstract;
}
# Backend hook that stores a node's canonicalised title for fuzzy matching.
# Not implemented here: the call is passed straight to _abstract on the
# invocant.
sub _index_fuzzy {
    my $self = shift;
    return $self->_abstract;
}
166
167							=head2 B<canonicalise_title>
168
169							$fuzzy = $self->canonicalise_title( $node );
170
171							Returns the node title as suitable for fuzzy searching: with punctuation
172							and spaces removed, vowels removed, and double letters squashed.
173
174							=cut
175
# Reduce a node title to the canonical form used for fuzzy matching.
#
# The title is lower-cased, stripped of non-word characters, stripped of the
# letters a, e, i, o, u and y, and runs of a repeated letter a-z are
# collapsed to a single letter.
#
# Returns the canonical string, or "" when the title is undefined or empty.
sub canonicalise_title {
    my ($self, $title) = @_;

    # Test definedness and length rather than truth, so the title "0" is
    # canonicalised instead of being treated as missing.
    return "" unless defined $title and length $title;

    my $canonical = lc($title);
    $canonical =~ s/\W//g;          # remove non-word characters
    $canonical =~ s/[aeiouy]//g;    # remove vowels and 'y'
    $canonical =~ tr/a-z//s;        # collapse doubled (or tripled, etc) letters

    return $canonical;
}
185
186							=head2 C<delete_node>
187
188							$search->delete_node($node);
189
190							Removes the given node from the search indexes. NOTE: It's up to you to
191							make sure the node is removed from the backend store. Croaks on error.
192
193							=cut
194
# Remove a node from the search indexes.
#
#   $node - node name; must be defined and non-empty (the name "0" is valid)
#
# Delegates to _delete_node and returns its result.
# Croaks if no node name is supplied.
sub delete_node {
    my ($self, $node) = @_;

    # Test definedness and length rather than truth, so "0" is accepted.
    croak "Must supply a node name"
        unless defined $node and length $node;

    return $self->_delete_node($node);
}
200
# Backend hook that removes a node from the index.  Not implemented here:
# the call is passed straight to _abstract on the invocant.
sub _delete_node {
    my $self = shift;
    return $self->_abstract;
}
202
203							=head2 C<supports_phrase_searches>
204
205							if ( $search->supports_phrase_searches ) {
206							return $search->search_nodes( '"fox in socks"' );
207							}
208
209							Returns true if this search backend supports phrase searching, and
210							false otherwise.
211
212							=cut
213
# Capability query for phrase searching.  Not implemented here: the call is
# passed straight to _abstract on the invocant.
sub supports_phrase_searches {
    my $self = shift;
    return $self->_abstract;
}
215
216							=head2 C<supports_fuzzy_searches>
217
218							if ( $search->supports_fuzzy_searches ) {
219							return $search->fuzzy_title_match("Kings Cross St Pancreas");
220							}
221
222							Returns true if this search backend supports fuzzy title matching, and
223							false otherwise.
224
225							=cut
226
# Capability query for fuzzy title matching.  Not implemented here: the call
# is passed straight to _abstract on the invocant.
sub supports_fuzzy_searches {
    my $self = shift;
    return $self->_abstract;
}
228
229							=head2 C<supports_metadata_indexing>
230
231							if ( $search->supports_metadata_indexing ) {
232							print "This search backend indexes metadata as well as content.";
233							}
234
235							Returns true if this search backend supports metadata indexing, and
236							false otherwise.
237
238							=cut
239
# Capability query for metadata indexing.  The base class answers false (0).
sub supports_metadata_indexing {
    return 0;
}
241
242							=head1 SEE ALSO
243
244							L<Wiki::Toolkit>
245
246							=cut
247
248							1;