| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Wiki::Toolkit::Search::Base; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 635 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 24 |  | 
| 4 | 1 |  |  | 1 |  | 3 | use Carp "croak"; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 43 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 1 |  |  | 1 |  | 3 | use vars qw( @ISA $VERSION ); | 
|  | 1 |  |  |  |  | 15 |  | 
|  | 1 |  |  |  |  | 565 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | sub _abstract {    # croaks on behalf of a stub method that a subclass has not overridden | 
| 9 | 0 |  |  | 0 |  |  | my $who = (caller(1))[3];    # fully-qualified name of the sub that called _abstract | 
| 10 | 0 |  |  |  |  |  | croak "$who is an abstract method which the ".(ref($_[0]) ? ref($_[0]) : $_[0]).    # ref() is empty for a class-name invocant, so fall back to the string itself | 
| 11 |  |  |  |  |  |  | " class has not provided"; | 
| 12 |  |  |  |  |  |  | } | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | $VERSION = 0.01;    # package version, declared via "use vars" above | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | =head1 NAME | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | Wiki::Toolkit::Search::Base - Base class for Wiki::Toolkit search plugins. | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | my $search = Wiki::Toolkit::Search::XXX->new( @args ); | 
| 23 |  |  |  |  |  |  | my %wombat_nodes = $search->search_nodes("wombat"); | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | This class details the methods that need to be overridden by search plugins. | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | =cut | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | =head1 METHODS | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | =head2 C<new> | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | my $search = Wiki::Toolkit::Search::XXX->new( @args ); | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | Creates a new searcher. By default the arguments are just passed to | 
| 36 |  |  |  |  |  |  | C<_init>, so you may wish to override that instead. | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | =cut | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | sub new {    # constructor: blesses an empty hash into $class, then delegates to _init | 
| 41 | 0 |  |  | 0 | 1 |  | my ($class, @args) = @_; | 
| 42 | 0 |  |  |  |  |  | my $self = {}; | 
| 43 | 0 |  |  |  |  |  | bless $self, $class; | 
| 44 | 0 |  |  |  |  |  | return $self->_init(@args);    # the value returned to the caller is whatever _init returns | 
| 45 |  |  |  |  |  |  | } | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | sub _init {    # default initialiser: treats the arguments as key/value pairs | 
| 48 | 0 |  |  | 0 |  |  | my ($self, %args) = @_; | 
| 49 | 0 |  |  |  |  |  | @{$self}{keys %args} = values %args;    # hash slice: copies every pair into the object hash | 
|  | 0 |  |  |  |  |  |  | 
| 50 | 0 |  |  |  |  |  | return $self; | 
| 51 |  |  |  |  |  |  | } | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | =head2 C<search_nodes> | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | # Find all the nodes which contain the word 'expert'. | 
| 56 |  |  |  |  |  |  | my %results = $search->search_nodes('expert'); | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | Returns a (possibly empty) hash whose keys are the node names and | 
| 59 |  |  |  |  |  |  | whose values are the scores in some kind of relevance-scoring system I | 
| 60 |  |  |  |  |  |  | haven't entirely come up with yet. For OR searches, this could | 
| 61 |  |  |  |  |  |  | initially be the number of terms that appear in the node, perhaps. | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | Defaults to AND searches (if $and_or is not supplied, or is anything | 
| 64 |  |  |  |  |  |  | other than C<OR> or C<or>). | 
| 65 |  |  |  |  |  |  |  | 
| 66 |  |  |  |  |  |  | Searches are case-insensitive. | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | =cut | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | sub search_nodes {    # normalises the AND/OR mode, tokenises the query, then delegates to _do_search | 
| 71 | 0 |  |  | 0 | 1 |  | my ($self, $termstr, $and_or) = @_; | 
| 72 |  |  |  |  |  |  |  | 
| 73 | 0 |  |  |  |  |  | $and_or = lc($and_or) if defined $and_or;    # lowercase only a supplied mode, so the defined test below stays meaningful | 
| 74 | 0 | 0 | 0 |  |  |  | unless ( defined $and_or and $and_or eq "or" ) {    # anything other than "or" (any case) means AND | 
| 75 | 0 |  |  |  |  |  | $and_or = "and"; | 
| 76 |  |  |  |  |  |  | } | 
| 77 |  |  |  |  |  |  |  | 
| 78 |  |  |  |  |  |  | # Extract individual search terms. | 
| 79 | 0 |  |  |  |  |  | my @terms = $self->analyze($termstr); | 
| 80 |  |  |  |  |  |  |  | 
| 81 | 0 |  |  |  |  |  | return $self->_do_search($and_or, \@terms);    # $and_or is always "and" or "or" here; terms are passed by reference | 
| 82 |  |  |  |  |  |  | } | 
| 83 |  |  |  |  |  |  |  | 
| 84 | 0 |  |  | 0 |  |  | sub _do_search { shift->_abstract };    # abstract stub: search_nodes calls this as _do_search($and_or, \@terms) | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | =head2 C<analyze> | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | @terms = $self->analyze($string) | 
| 89 |  |  |  |  |  |  |  | 
| 90 |  |  |  |  |  |  | Splits a string into a set of terms for indexing and searching. Typically | 
| 91 |  |  |  |  |  |  | this is done case-insensitively, splitting at word boundaries, and extracting | 
| 92 |  |  |  |  |  |  | terms longer than one character that contain at least one word character. | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | =cut | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  | sub analyze {    # returns the list of lowercased terms extracted from $string | 
| 97 | 0 |  |  | 0 | 1 |  | my ($self, $string) = @_; | 
| 98 | 0 | 0 |  |  |  |  | return grep { length > 1            # ignore single characters | 
|  | 0 |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | and ! /^\W*$/ }       # and things composed entirely | 
| 100 |  |  |  |  |  |  | # of non-word characters | 
| 101 |  |  |  |  |  |  | split( /\b/,                 # split at word boundaries | 
| 102 |  |  |  |  |  |  | lc($string)      # be case-insensitive | 
| 103 |  |  |  |  |  |  | ); | 
| 104 |  |  |  |  |  |  | } | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | =head2 C<fuzzy_title_match> | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | $wiki->write_node( "King's Cross St Pancras", "A station." ); | 
| 109 |  |  |  |  |  |  | my %matches = $search->fuzzy_title_match( "Kings Cross St. Pancras" ); | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | Returns a (possibly empty) hash whose keys are the node names and | 
| 112 |  |  |  |  |  |  | whose values are the scores in some kind of relevance-scoring system I | 
| 113 |  |  |  |  |  |  | haven't entirely come up with yet. | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | Note that even if an exact match is found, any other similar enough | 
| 116 |  |  |  |  |  |  | matches will also be returned. However, any exact match is guaranteed | 
| 117 |  |  |  |  |  |  | to have the highest relevance score. | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | The matching is done against "canonicalised" forms of the search | 
| 120 |  |  |  |  |  |  | string and the node titles in the database: stripping vowels, repeated | 
| 121 |  |  |  |  |  |  | letters and non-word characters, and lowercasing. | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | =cut | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | sub fuzzy_title_match {    # canonicalises the search string and delegates to _fuzzy_match | 
| 126 | 0 |  |  | 0 | 1 |  | my ($self, $string) = @_; | 
| 127 | 0 |  |  |  |  |  | my $canonical = $self->canonicalise_title( $string ); | 
| 128 | 0 |  |  |  |  |  | $self->_fuzzy_match($string, $canonical);    # last statement, so its value is the return value | 
| 129 |  |  |  |  |  |  | } | 
| 130 |  |  |  |  |  |  |  | 
| 131 | 0 |  |  | 0 |  |  | sub _fuzzy_match { shift->_abstract };    # abstract stub: fuzzy_title_match calls this as _fuzzy_match($string, $canonical) | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | =head2 C<index_node> | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | $search->index_node( $node, $content, $metadata ); | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | Indexes or reindexes the given node in the search engine indexes. | 
| 138 |  |  |  |  |  |  | You must supply both the node name and its content, but metadata is | 
| 139 |  |  |  |  |  |  | optional. | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | If you do supply metadata, it will be used if and only if your chosen | 
| 142 |  |  |  |  |  |  | search backend supports metadata indexing (see | 
| 143 |  |  |  |  |  |  | C<supports_metadata_indexing>).  It should be a reference to a hash | 
| 144 |  |  |  |  |  |  | where the keys are the names of the metadata fields and the values are | 
| 145 |  |  |  |  |  |  | either scalars or references to arrays of scalars.  For example: | 
| 146 |  |  |  |  |  |  |  | 
| 147 |  |  |  |  |  |  | $search->index_node( "Calthorpe Arms", "Nice pub in Bloomsbury.", | 
| 148 |  |  |  |  |  |  | { category => [ "Pubs", "Bloomsbury" ], | 
| 149 |  |  |  |  |  |  | postcode => "WC1X 8JR" } ); | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | =cut | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | sub index_node {    # validates the arguments, then indexes both the words and the fuzzy title | 
| 154 | 0 |  |  | 0 | 1 |  | my ($self, $node, $content) = @_;    # any further arguments (e.g. metadata) are not read here | 
| 155 | 0 | 0 |  |  |  |  | croak "Must supply a node name" unless $node; | 
| 156 | 0 | 0 |  |  |  |  | croak "Must supply node content" unless defined $content;    # empty content is accepted, undef is not | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | # Index the individual words in the node content and title. | 
| 159 | 0 |  |  |  |  |  | my @keys = $self->analyze("$content $node"); | 
| 160 | 0 |  |  |  |  |  | $self->_index_node($node, $content, \@keys); | 
| 161 | 0 |  |  |  |  |  | $self->_index_fuzzy($node, $self->canonicalise_title( $node ));    # last statement, so its value is the return value | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  |  | 
| 164 | 0 |  |  | 0 |  |  | sub _index_node  { shift->_abstract };    # abstract stub: index_node calls this as _index_node($node, $content, \@keys) | 
| 165 | 0 |  |  | 0 |  |  | sub _index_fuzzy { shift->_abstract };    # abstract stub: index_node calls this as _index_fuzzy($node, $canonical_title) | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | =head2 B<canonicalise_title> | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | $fuzzy = $self->canonicalise_title( $node ); | 
| 170 |  |  |  |  |  |  |  | 
| 171 |  |  |  |  |  |  | Returns the node title as suitable for fuzzy searching: with punctuation | 
| 172 |  |  |  |  |  |  | and spaces removed, vowels removed, and double letters squashed. | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | =cut | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | sub canonicalise_title {    # reduces a title to the skeleton form used for fuzzy matching | 
| 177 | 0 |  |  | 0 | 1 |  | my ($self, $title) = @_; | 
| 178 | 0 | 0 |  |  |  |  | return "" unless defined $title and length $title;    # only undef or empty short-circuits; a title of "0" is real input | 
| 179 | 0 |  |  |  |  |  | my $canonical = lc($title); | 
| 180 | 0 |  |  |  |  |  | $canonical =~ s/\W//g;         # remove non-word characters | 
| 181 | 0 |  |  |  |  |  | $canonical =~ s/[aeiouy]//g;   # remove vowels and 'y' | 
| 182 | 0 |  |  |  |  |  | $canonical =~ tr/a-z//s;       # collapse doubled (or tripled, etc) letters | 
| 183 | 0 |  |  |  |  |  | return $canonical; | 
| 184 |  |  |  |  |  |  | } | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | =head2 C<delete_node> | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | $search->delete_node($node); | 
| 189 |  |  |  |  |  |  |  | 
| 190 |  |  |  |  |  |  | Removes the given node from the search indexes.  NOTE: It's up to you to | 
| 191 |  |  |  |  |  |  | make sure the node is removed from the backend store.  Croaks on error. | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | =cut | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | sub delete_node {    # validates the node name, then delegates to _delete_node | 
| 196 | 0 |  |  | 0 | 1 |  | my ($self, $node) = @_; | 
| 197 | 0 | 0 |  |  |  |  | croak "Must supply a node name" unless $node; | 
| 198 | 0 |  |  |  |  |  | $self->_delete_node($node);    # last statement, so its value is the return value | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  |  | 
| 201 | 0 |  |  | 0 |  |  | sub _delete_node { shift->_abstract };    # abstract stub: delete_node calls this as _delete_node($node) | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | =head2 C<supports_phrase_searches> | 
| 204 |  |  |  |  |  |  |  | 
| 205 |  |  |  |  |  |  | if ( $search->supports_phrase_searches ) { | 
| 206 |  |  |  |  |  |  | return $search->search_nodes( '"fox in socks"' ); | 
| 207 |  |  |  |  |  |  | } | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | Returns true if this search backend supports phrase searching, and | 
| 210 |  |  |  |  |  |  | false otherwise. | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | =cut | 
| 213 |  |  |  |  |  |  |  | 
| 214 | 0 |  |  | 0 | 1 |  | sub supports_phrase_searches { shift->_abstract };    # abstract: croaks here unless a subclass overrides it | 
| 215 |  |  |  |  |  |  |  | 
| 216 |  |  |  |  |  |  | =head2 C<supports_fuzzy_searches> | 
| 217 |  |  |  |  |  |  |  | 
| 218 |  |  |  |  |  |  | if ( $search->supports_fuzzy_searches ) { | 
| 219 |  |  |  |  |  |  | return $search->fuzzy_title_match("Kings Cross St Pancreas"); | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | Returns true if this search backend supports fuzzy title matching, and | 
| 223 |  |  |  |  |  |  | false otherwise. | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | =cut | 
| 226 |  |  |  |  |  |  |  | 
| 227 | 0 |  |  | 0 | 1 |  | sub supports_fuzzy_searches { shift->_abstract };    # abstract: croaks here unless a subclass overrides it | 
| 228 |  |  |  |  |  |  |  | 
| 229 |  |  |  |  |  |  | =head2 C<supports_metadata_indexing> | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | if ( $search->supports_metadata_indexing ) { | 
| 232 |  |  |  |  |  |  | print "This search backend indexes metadata as well as content."; | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  |  | 
| 235 |  |  |  |  |  |  | Returns true if this search backend supports metadata indexing, and | 
| 236 |  |  |  |  |  |  | false otherwise. | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | =cut | 
| 239 |  |  |  |  |  |  |  | 
| 240 | 0 |  |  | 0 | 1 |  | sub supports_metadata_indexing { 0; };    # concrete default: this base implementation always returns false | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | L<Wiki::Toolkit> | 
| 245 |  |  |  |  |  |  |  | 
| 246 |  |  |  |  |  |  | =cut | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | 1;    # a module file must end by returning a true value |