line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Search::Tools::QueryParser; |
2
|
26
|
|
|
26
|
|
73591
|
use Moo; |
|
26
|
|
|
|
|
87342
|
|
|
26
|
|
|
|
|
136
|
|
3
|
|
|
|
|
|
|
extends 'Search::Tools::Object'; |
4
|
26
|
|
|
26
|
|
16215
|
use Carp; |
|
26
|
|
|
|
|
53
|
|
|
26
|
|
|
|
|
1267
|
|
5
|
26
|
|
|
26
|
|
2043
|
use Data::Dump qw( dump ); |
|
26
|
|
|
|
|
22277
|
|
|
26
|
|
|
|
|
1043
|
|
6
|
26
|
|
|
26
|
|
11825
|
use Search::Query::Parser; |
|
26
|
|
|
|
|
2986430
|
|
|
26
|
|
|
|
|
780
|
|
7
|
26
|
|
|
26
|
|
4282
|
use Encode; |
|
26
|
|
|
|
|
71077
|
|
|
26
|
|
|
|
|
1715
|
|
8
|
26
|
|
|
26
|
|
143
|
use Data::Dump; |
|
26
|
|
|
|
|
43
|
|
|
26
|
|
|
|
|
993
|
|
9
|
26
|
|
|
26
|
|
9663
|
use Search::Tools::Query; |
|
26
|
|
|
|
|
68
|
|
|
26
|
|
|
|
|
753
|
|
10
|
26
|
|
|
26
|
|
153
|
use Search::Tools::UTF8; |
|
26
|
|
|
|
|
42
|
|
|
26
|
|
|
|
|
2135
|
|
11
|
26
|
|
|
26
|
|
136
|
use Search::Tools::XML; |
|
26
|
|
|
|
|
45
|
|
|
26
|
|
|
|
|
457
|
|
12
|
26
|
|
|
26
|
|
113
|
use Search::Tools::RegEx; |
|
26
|
|
|
|
|
51
|
|
|
26
|
|
|
|
|
520
|
|
13
|
|
|
|
|
|
|
|
14
|
26
|
|
|
26
|
|
109
|
use namespace::autoclean; |
|
26
|
|
|
|
|
41
|
|
|
26
|
|
|
|
|
100
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our $VERSION = '1.006'; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $XML = Search::Tools::XML->new(); |
19
|
|
|
|
|
|
|
my $C2E = $XML->char2ent_map; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# we turn locale pragma on in a small block |
22
|
|
|
|
|
|
|
# because we don't want it to mess up our regex building |
23
|
|
|
|
|
|
|
# or taint vars in other areas. We just want to use setlocale() |
24
|
|
|
|
|
|
|
# and make sure we get correct ->utf8 encoding |
25
|
|
|
|
|
|
|
my ( $locale, $lang, $charset ); |
26
|
|
|
|
|
|
|
{ |
27
|
26
|
|
|
26
|
|
12069
|
use POSIX qw(locale_h); |
|
26
|
|
|
|
|
117853
|
|
|
26
|
|
|
|
|
128
|
|
28
|
|
|
|
|
|
|
$locale = setlocale(LC_CTYPE); |
29
|
|
|
|
|
|
|
( $lang, $charset ) = split( m/\./, $locale ); |
30
|
|
|
|
|
|
|
$charset ||= q/UTF-8/; #
|
31
|
|
|
|
|
|
|
$lang = q/en_US/ if $lang =~ m/^(posix|c)$/i; |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my %Defaults = ( |
35
|
|
|
|
|
|
|
and_word => q/and|near\d*/, |
36
|
|
|
|
|
|
|
charset => $charset, |
37
|
|
|
|
|
|
|
default_field => "", |
38
|
|
|
|
|
|
|
ignore_case => 1, |
39
|
|
|
|
|
|
|
ignore_fields => {}, |
40
|
|
|
|
|
|
|
ignore_first_char => quotemeta(q/'-/), |
41
|
|
|
|
|
|
|
ignore_last_char => quotemeta(q/'-/), |
42
|
|
|
|
|
|
|
lang => $lang, |
43
|
|
|
|
|
|
|
locale => $locale, |
44
|
|
|
|
|
|
|
not_word => q/not/, |
45
|
|
|
|
|
|
|
or_word => q/or/, |
46
|
|
|
|
|
|
|
phrase_delim => q/"/, |
47
|
|
|
|
|
|
|
query_class => 'Search::Tools::Query', |
48
|
|
|
|
|
|
|
query_dialect => "Search::Query::Dialect::Native", |
49
|
|
|
|
|
|
|
stemmer => undef, |
50
|
|
|
|
|
|
|
stopwords => [], |
51
|
|
|
|
|
|
|
tag_re => $XML->tag_re, |
52
|
|
|
|
|
|
|
term_re => qr/\w+(?:[\'\-]\w+)*/, |
53
|
|
|
|
|
|
|
term_min_length => 1, |
54
|
|
|
|
|
|
|
treat_uris_like_phrases => 1, |
55
|
|
|
|
|
|
|
whitespace => $XML->html_whitespace, |
56
|
|
|
|
|
|
|
wildcard => q/*/, |
57
|
|
|
|
|
|
|
word_characters => q/\w/ . quotemeta(q/'-/), |
58
|
|
|
|
|
|
|
); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
for my $attr ( keys %Defaults ) { |
61
|
|
|
|
|
|
|
has( $attr => ( is => 'rw', default => sub { $Defaults{$attr} } ) ); |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
has 'start_bound' => ( is => 'ro' ); |
64
|
|
|
|
|
|
|
has 'end_bound' => ( is => 'ro' ); |
65
|
|
|
|
|
|
|
has 'plain_phrase_bound' => ( is => 'ro' ); |
66
|
|
|
|
|
|
|
has 'html_phrase_bound' => ( is => 'ro' ); |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub get_defaults { |
69
|
0
|
|
|
0
|
1
|
0
|
return {%Defaults}; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub BUILD { |
73
|
51
|
|
|
51
|
1
|
213
|
my $self = shift; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# TODO handle case where both term_re and word_characters are defined |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# charset/locale/lang are a bit interdependent |
78
|
|
|
|
|
|
|
# so make sure charset/lang are set if locale is explicitly passed. |
79
|
51
|
100
|
|
|
|
168
|
if ( $self->{locale} ne $Defaults{locale} ) { |
80
|
1
|
|
|
|
|
4
|
( $self->{lang}, $self->{charset} ) = split( m/\./, $self->{locale} ); |
81
|
1
|
50
|
|
|
|
4
|
$self->{lang} = 'en_US' if $self->{lang} =~ m/^(posix|c)$/i; |
82
|
1
|
|
33
|
|
|
3
|
$self->{charset} ||= $Defaults{charset}; |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# make sure ignore_fields is a hash ref |
86
|
51
|
50
|
|
|
|
171
|
if ( ref( $self->{ignore_fields} ) eq 'ARRAY' ) { |
87
|
|
|
|
|
|
|
$self->{ignore_fields} |
88
|
0
|
|
|
|
|
0
|
= { map { $_ => $_ } @{ $self->{ignore_fields} } }; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
51
|
|
|
|
|
162
|
$self->_setup_regex_builder; |
92
|
|
|
|
|
|
|
|
93
|
51
|
|
|
|
|
393
|
return $self; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub parse { |
97
|
63
|
|
|
63
|
1
|
7403
|
my $self = shift; |
98
|
63
|
|
|
|
|
109
|
my $query_str = shift; |
99
|
63
|
50
|
|
|
|
177
|
confess "query required" unless defined $query_str; |
100
|
63
|
50
|
|
|
|
160
|
if ( ref $query_str ) { |
101
|
0
|
|
|
|
|
0
|
croak "query must be a scalar string"; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
#$query_str = to_utf8( $query_str, $self->charset ); |
105
|
63
|
|
|
|
|
158
|
my $extracted = $self->_extract_terms($query_str); |
106
|
63
|
|
|
|
|
186
|
my %regex; |
107
|
63
|
|
|
|
|
135
|
TERM: for my $term ( @{ $extracted->{terms} } ) { |
|
63
|
|
|
|
|
191
|
|
108
|
145
|
|
|
|
|
1747
|
my ( $plain, $html, $escaped ) = $self->_build_regex($term); |
109
|
145
|
|
|
|
|
493
|
my $is_phrase = $term =~ m/\ /; |
110
|
145
|
|
|
|
|
223
|
my @phrase_terms; |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# if the term is a phrase, |
113
|
|
|
|
|
|
|
# build regex for each term in the phrase |
114
|
145
|
100
|
|
|
|
322
|
if ($is_phrase) { |
115
|
32
|
|
|
|
|
155
|
my @pts = split( /\ /, $term ); |
116
|
32
|
|
|
|
|
81
|
for my $pt (@pts) { |
117
|
87
|
|
|
|
|
987
|
my ( $pt_plain, $pt_html, $pt_esc ) |
118
|
|
|
|
|
|
|
= $self->_build_regex($pt); |
119
|
87
|
|
|
|
|
2735
|
push @phrase_terms, |
120
|
|
|
|
|
|
|
Search::Tools::RegEx->new( |
121
|
|
|
|
|
|
|
plain => $pt_plain, |
122
|
|
|
|
|
|
|
html => $pt_html, |
123
|
|
|
|
|
|
|
term => $pt, |
124
|
|
|
|
|
|
|
term_re => qr/$pt_esc/i, |
125
|
|
|
|
|
|
|
is_phrase => 0, |
126
|
|
|
|
|
|
|
); |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
} |
129
|
145
|
|
|
|
|
6723
|
$regex{$term} = Search::Tools::RegEx->new( |
130
|
|
|
|
|
|
|
plain => $plain, |
131
|
|
|
|
|
|
|
html => $html, |
132
|
|
|
|
|
|
|
term => $term, |
133
|
|
|
|
|
|
|
term_re => qr/$escaped/i, |
134
|
|
|
|
|
|
|
is_phrase => $is_phrase, |
135
|
|
|
|
|
|
|
phrase_terms => \@phrase_terms, |
136
|
|
|
|
|
|
|
); |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
return $self->{query_class}->new( |
140
|
|
|
|
|
|
|
dialect => $extracted->{dialect}, |
141
|
|
|
|
|
|
|
terms => $extracted->{terms}, |
142
|
|
|
|
|
|
|
fields => $extracted->{fields}, |
143
|
63
|
|
|
|
|
1350
|
str => to_utf8( $query_str, $self->charset ), |
144
|
|
|
|
|
|
|
regex => \%regex, |
145
|
|
|
|
|
|
|
qp => $self, |
146
|
|
|
|
|
|
|
); |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub _extract_terms { |
150
|
63
|
|
|
63
|
|
106
|
my $self = shift; |
151
|
63
|
|
|
|
|
88
|
my $query = shift; |
152
|
63
|
50
|
|
|
|
136
|
confess "need query to extract terms" unless defined $query; |
153
|
63
|
|
|
|
|
215
|
my $stopwords = $self->stopwords; |
154
|
63
|
|
|
|
|
131
|
my $and_word = $self->and_word; |
155
|
63
|
|
|
|
|
142
|
my $or_word = $self->or_word; |
156
|
63
|
|
|
|
|
187
|
my $not_word = $self->not_word; |
157
|
63
|
|
|
|
|
127
|
my $wildcard = $self->wildcard; |
158
|
63
|
|
|
|
|
153
|
my $phrase = $self->phrase_delim; |
159
|
63
|
|
|
|
|
155
|
my $igf = $self->ignore_first_char; |
160
|
63
|
|
|
|
|
137
|
my $igl = $self->ignore_last_char; |
161
|
63
|
|
|
|
|
151
|
my $wordchar = $self->word_characters; |
162
|
63
|
|
|
|
|
118
|
my $default_field = $self->default_field; |
163
|
63
|
|
|
|
|
117
|
my $esc_wildcard = quotemeta($wildcard); |
164
|
63
|
|
|
|
|
2051
|
my $word_re = qr/(($esc_wildcard)?[$wordchar]+($esc_wildcard)?)/; |
165
|
63
|
|
|
|
|
234
|
my $min_length = $self->term_min_length; |
166
|
63
|
|
|
|
|
100
|
my $raw_query = $query; |
167
|
|
|
|
|
|
|
|
168
|
63
|
100
|
|
|
|
190
|
$stopwords = [ split( /\s+/, $stopwords ) ] unless ref $stopwords; |
169
|
63
|
|
|
|
|
145
|
my %stophash = map { to_utf8( lc($_), $self->charset ) => 1 } @$stopwords; |
|
15
|
|
|
|
|
75
|
|
170
|
63
|
|
|
|
|
105
|
my ( %words, %uniq, $c ); |
171
|
63
|
|
|
|
|
1756
|
my $parser = Search::Query::Parser->new( |
172
|
|
|
|
|
|
|
and_regex => qr{$and_word}i, |
173
|
|
|
|
|
|
|
or_regex => qr{$or_word}i, |
174
|
|
|
|
|
|
|
not_regex => qr{$not_word}i, |
175
|
|
|
|
|
|
|
default_field => $default_field, |
176
|
|
|
|
|
|
|
query_class => $self->query_dialect, |
177
|
|
|
|
|
|
|
); |
178
|
|
|
|
|
|
|
|
179
|
63
|
|
|
|
|
113907
|
my $baked_query = $raw_query; |
180
|
63
|
50
|
|
|
|
302
|
$baked_query = lc($baked_query) if $self->ignore_case; |
181
|
63
|
|
|
|
|
4408
|
$baked_query = to_utf8( $baked_query, $self->charset ); |
182
|
63
|
50
|
|
|
|
203
|
my $dialect = $parser->parse($baked_query) or croak $parser->error; |
183
|
63
|
50
|
|
|
|
138895
|
$self->debug && carp "parsetree: " . Data::Dump::dump( $dialect->tree ); |
184
|
63
|
|
|
|
|
611
|
my $fields_searched |
185
|
|
|
|
|
|
|
= $self->_get_value_from_tree( \%uniq, $dialect->tree, $c ); |
186
|
|
|
|
|
|
|
|
187
|
63
|
50
|
|
|
|
1130
|
$self->debug && carp "parsed: " . Data::Dump::dump( \%uniq ); |
188
|
|
|
|
|
|
|
|
189
|
63
|
|
|
|
|
420
|
my $count = scalar( keys %uniq ); |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# parse uniq into word tokens |
192
|
|
|
|
|
|
|
# including removing stop words |
193
|
|
|
|
|
|
|
|
194
|
63
|
50
|
|
|
|
846
|
$self->debug && carp "word_re: $word_re"; |
195
|
|
|
|
|
|
|
|
196
|
63
|
|
|
|
|
477
|
U: for my $u ( sort { $uniq{$a} <=> $uniq{$b} } keys %uniq ) { |
|
171
|
|
|
|
|
331
|
|
197
|
|
|
|
|
|
|
|
198
|
152
|
|
|
|
|
283
|
my $n = $uniq{$u}; |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
# only phrases have space |
201
|
|
|
|
|
|
|
# but due to our word_re, a single non-spaced string |
202
|
|
|
|
|
|
|
# might actually be multiple word tokens |
203
|
152
|
|
100
|
|
|
564
|
my $isphrase = $u =~ m/\s/ || 0; |
204
|
|
|
|
|
|
|
|
205
|
152
|
50
|
|
|
|
427
|
if ( $self->treat_uris_like_phrases ) { |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# special case: treat email addresses, uris, as phrase |
208
|
152
|
|
100
|
|
|
2336
|
$isphrase ||= $u =~ m/[$wordchar][\@\.\\\/][$wordchar]/ || 0; |
|
|
|
100
|
|
|
|
|
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
152
|
50
|
|
|
|
2278
|
$self->debug && carp "$u -> isphrase = $isphrase"; |
212
|
|
|
|
|
|
|
|
213
|
152
|
|
|
|
|
901
|
my @w = (); |
214
|
|
|
|
|
|
|
|
215
|
152
|
|
|
|
|
435
|
TOK: for my $w ( split( m/\s+/, to_utf8( $u, $self->charset ) ) ) { |
216
|
|
|
|
|
|
|
|
217
|
199
|
50
|
|
|
|
591
|
next TOK unless $w =~ m/\S/; |
218
|
|
|
|
|
|
|
|
219
|
199
|
|
|
|
|
584
|
$w =~ s/\Q$phrase\E//g; |
220
|
|
|
|
|
|
|
|
221
|
199
|
|
|
|
|
876
|
while ( $w =~ m/$word_re/g ) { |
222
|
208
|
|
|
|
|
378
|
my $tok = _untaint($1); |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# strip ignorable chars |
225
|
208
|
50
|
|
|
|
1022
|
$tok =~ s/^[$igf]+// if length($igf); |
226
|
208
|
50
|
|
|
|
812
|
$tok =~ s/[$igl]+$// if length($igl); |
227
|
|
|
|
|
|
|
|
228
|
208
|
50
|
|
|
|
392
|
unless ($tok) { |
229
|
0
|
0
|
|
|
|
0
|
$self->debug && carp "no token for '$w' $word_re"; |
230
|
0
|
|
|
|
|
0
|
next TOK; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
208
|
50
|
|
|
|
2987
|
$self->debug && carp "found token: $tok"; |
234
|
|
|
|
|
|
|
|
235
|
208
|
100
|
|
|
|
1306
|
if ( exists $stophash{ lc($tok) } ) { |
236
|
13
|
50
|
|
|
|
156
|
$self->debug && carp "$tok = stopword"; |
237
|
13
|
100
|
|
|
|
81
|
next TOK unless $isphrase; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
202
|
100
|
|
|
|
374
|
unless ($isphrase) { |
241
|
115
|
50
|
|
|
|
1169
|
next TOK if $tok =~ m/^($and_word|$or_word|$not_word)$/i; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# if tainting was on, odd things can happen. |
245
|
|
|
|
|
|
|
# so check one more time |
246
|
202
|
|
|
|
|
571
|
$tok = to_utf8( $tok, $self->charset ); |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# final sanity check |
249
|
202
|
50
|
|
|
|
445
|
if ( !Encode::is_utf8($tok) ) { |
250
|
0
|
|
|
|
|
0
|
carp "$tok is NOT utf8"; |
251
|
0
|
|
|
|
|
0
|
next TOK; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
#$self->debug && carp "pushing $tok into wordlist"; |
255
|
202
|
|
|
|
|
777
|
push( @w, $tok ); |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
152
|
100
|
|
|
|
382
|
next U unless @w; |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
#$self->debug && carp "joining \@w: " . Data::Dump::dump(\@w); |
264
|
146
|
100
|
|
|
|
288
|
if ($isphrase) { |
265
|
32
|
|
|
|
|
140
|
$words{ join( ' ', @w ) } = $n + $count++; |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
else { |
268
|
114
|
|
|
|
|
206
|
for (@w) { |
269
|
115
|
|
|
|
|
327
|
$words{$_} = $n + $count++; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
63
|
50
|
|
|
|
940
|
$self->debug && carp "tokenized: " . Data::Dump::dump( \%words ); |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# make sure we don't have 'foo' and 'foo*' |
278
|
63
|
|
|
|
|
480
|
for ( keys %words ) { |
279
|
147
|
100
|
|
|
|
482
|
if ( $_ =~ m/$esc_wildcard/ ) { |
280
|
12
|
|
|
|
|
99
|
( my $copy = $_ ) =~ s,$esc_wildcard,,g; |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# delete the more exact of the two |
283
|
|
|
|
|
|
|
# since the * will match both |
284
|
12
|
|
|
|
|
28
|
delete( $words{$copy} ); |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
147
|
100
|
|
|
|
372
|
if ( length $_ < $min_length ) { |
288
|
1
|
50
|
|
|
|
24
|
$self->debug and carp "token too short: '$_'"; |
289
|
1
|
|
|
|
|
9
|
delete $words{$_}; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
63
|
50
|
|
|
|
940
|
$self->debug && carp "wildcards removed: " . Data::Dump::dump( \%words ); |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# if any words need to be stemmed |
297
|
63
|
100
|
|
|
|
510
|
if ( $self->stemmer ) { |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# split each $word into words |
300
|
|
|
|
|
|
|
# stem each word |
301
|
|
|
|
|
|
|
# if stem ne word, break into chars and find first N common |
302
|
|
|
|
|
|
|
# rejoin $uniq |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
#carp "stemming ON\n"; |
305
|
|
|
|
|
|
|
|
306
|
8
|
|
|
|
|
22
|
K: for ( keys %words ) { |
307
|
15
|
|
|
|
|
47
|
my (@w) = split /\s+/; |
308
|
15
|
|
|
|
|
27
|
W: for my $w (@w) { |
309
|
27
|
|
|
|
|
38
|
my $func = $self->stemmer; |
310
|
27
|
|
|
|
|
52
|
my $f = &$func( $self, $w ); |
311
|
27
|
50
|
33
|
|
|
171
|
if ( !defined $f or !length $f ) { |
312
|
0
|
|
|
|
|
0
|
next W; |
313
|
|
|
|
|
|
|
} |
314
|
27
|
|
|
|
|
95
|
$f = to_utf8($f); |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
#warn "w: $w\nf: $f\n"; |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# add wildcard to indicate chars were lost |
319
|
27
|
|
|
|
|
70
|
$w = $f . $wildcard; |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
} |
322
|
15
|
|
|
|
|
34
|
my $new = join ' ', @w; |
323
|
15
|
50
|
|
|
|
63
|
if ( $new ne $_ ) { |
324
|
15
|
|
|
|
|
39
|
$words{$new} = $words{$_}; |
325
|
15
|
|
|
|
|
33
|
delete $words{$_}; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
63
|
50
|
|
|
|
866
|
$self->debug && carp "stemming done: " . Data::Dump::dump( \%words ); |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# sort keeps query in same order as we entered |
334
|
|
|
|
|
|
|
return { |
335
|
63
|
|
|
|
|
664
|
terms => [ sort { $words{$a} <=> $words{$b} } keys %words ], |
|
159
|
|
|
|
|
581
|
|
336
|
|
|
|
|
|
|
fields => [ keys %$fields_searched ], |
337
|
|
|
|
|
|
|
dialect => $dialect, |
338
|
|
|
|
|
|
|
query => $raw_query, |
339
|
|
|
|
|
|
|
}; |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# stolen nearly verbatim from Taint::Runtime |
344
|
|
|
|
|
|
|
# apparently regex can be tainted when running under 'use locale'. |
345
|
|
|
|
|
|
|
# as of version 0.24 this should not be needed but until I can find a way |
346
|
|
|
|
|
|
|
# to easily test the Taint feature, we just do this. It's low overhead. |
347
|
|
|
|
|
|
|
sub _untaint { |
348
|
208
|
|
|
208
|
|
362
|
my $str = shift; |
349
|
208
|
50
|
|
|
|
426
|
my $ref = ref($str) ? $str : \$str; |
350
|
208
|
50
|
|
|
|
445
|
if ( !defined $$ref ) { |
351
|
0
|
|
|
|
|
0
|
$$ref = undef; |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
else { |
354
|
|
|
|
|
|
|
$$ref |
355
|
|
|
|
|
|
|
= ( $$ref =~ /(.*)/ ) |
356
|
|
|
|
|
|
|
? $1 |
357
|
208
|
50
|
|
|
|
695
|
: do { confess("Couldn't find data to untaint") }; |
|
0
|
|
|
|
|
0
|
|
358
|
|
|
|
|
|
|
} |
359
|
208
|
50
|
|
|
|
484
|
return ref($str) ? 1 : $str; |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
sub _get_value_from_tree { |
363
|
63
|
|
|
63
|
|
2816
|
my $self = shift; |
364
|
63
|
|
|
|
|
90
|
my $uniq = shift; |
365
|
63
|
|
|
|
|
99
|
my $parseTree = shift; |
366
|
63
|
|
|
|
|
87
|
my $c = shift; |
367
|
63
|
|
|
|
|
108
|
my %fields = (); |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# we only want the values from non minus queries |
370
|
63
|
|
|
|
|
109
|
for my $node ( '+', '' ) { |
371
|
126
|
100
|
|
|
|
273
|
next unless exists $parseTree->{$node}; |
372
|
|
|
|
|
|
|
|
373
|
63
|
|
|
|
|
110
|
my @branches = @{ $parseTree->{$node} }; |
|
63
|
|
|
|
|
145
|
|
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
#warn dump \@branches; |
376
|
|
|
|
|
|
|
|
377
|
63
|
|
|
|
|
115
|
for my $leaf (@branches) { |
378
|
150
|
|
|
|
|
221
|
my $v = $leaf->{value}; |
379
|
150
|
50
|
|
|
|
269
|
if ( !defined $v ) { |
380
|
0
|
|
|
|
|
0
|
croak "undefined value in query tree: " . dump($leaf); |
381
|
|
|
|
|
|
|
} |
382
|
150
|
50
|
66
|
|
|
316
|
if ( defined $leaf->{field} |
383
|
|
|
|
|
|
|
and exists $self->ignore_fields->{ $leaf->{field} } ) |
384
|
|
|
|
|
|
|
{ |
385
|
0
|
|
|
|
|
0
|
next; |
386
|
|
|
|
|
|
|
} |
387
|
150
|
|
|
|
|
191
|
my $field = $leaf->{field}; |
388
|
150
|
100
|
|
|
|
244
|
if ( defined $field ) { |
389
|
3
|
|
|
|
|
8
|
$fields{$field}++; |
390
|
|
|
|
|
|
|
} |
391
|
150
|
50
|
|
|
|
347
|
if ( ref $v eq 'HASH' ) { |
|
|
100
|
|
|
|
|
|
392
|
0
|
|
|
|
|
0
|
my $f = $self->_get_value_from_tree( $uniq, $v, $c ); |
393
|
0
|
|
|
|
|
0
|
$fields{$_} = $f->{$_} for ( keys %$f ); |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
elsif ( ref $v eq 'ARRAY' ) { |
396
|
1
|
|
|
|
|
3
|
for my $value (@$v) { |
397
|
2
|
|
|
|
|
3
|
$value =~ s/\s+/ /g; |
398
|
2
|
|
|
|
|
7
|
$uniq->{$value} = ++$c; |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
else { |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# if the $leaf is a proximity query, |
404
|
|
|
|
|
|
|
# ignore the "phrase-ness" of it and split |
405
|
|
|
|
|
|
|
# on whitespace. This is a compromise, |
406
|
|
|
|
|
|
|
# mitigated by the tendency of HeatMap |
407
|
|
|
|
|
|
|
# to reward proximity anyway. |
408
|
149
|
100
|
66
|
|
|
281
|
if ( $leaf->{proximity} and $leaf->{proximity} > 1 ) { |
409
|
1
|
|
|
|
|
7
|
my @tokens = split( m/\ +/, $v ); |
410
|
1
|
|
|
|
|
5
|
$uniq->{$_} = ++$c for @tokens; |
411
|
1
|
|
|
|
|
3
|
next; |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# collapse any whitespace |
415
|
148
|
|
|
|
|
362
|
$v =~ s,\s+,\ ,g; |
416
|
|
|
|
|
|
|
|
417
|
148
|
|
|
|
|
405
|
$uniq->{$v} = ++$c; |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
} |
421
|
63
|
|
|
|
|
177
|
return \%fields; |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
sub _setup_regex_builder { |
425
|
51
|
|
|
51
|
|
77
|
my $self = shift; |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# TODO optional for term_re |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
# a search for a '<' or '>' should still highlight, |
430
|
|
|
|
|
|
|
# since < or > can be indexed as literal < and > |
431
|
|
|
|
|
|
|
# but this causes a great deal of hassle |
432
|
|
|
|
|
|
|
# so we just ignore them. |
433
|
51
|
|
|
|
|
150
|
my $wordchars = $self->word_characters; |
434
|
51
|
|
|
|
|
115
|
$wordchars =~ s,[<>&],,g; |
435
|
51
|
|
|
|
|
95
|
$self->{html_safe_wordchars} = $wordchars; # remember for build |
436
|
51
|
|
|
|
|
161
|
my $ignore_first = $self->ignore_first_char; |
437
|
51
|
|
|
|
|
117
|
my $ignore_last = $self->ignore_last_char; |
438
|
51
|
|
|
|
|
196
|
my $html_whitespace = $self->whitespace; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# what's the boundary between a word and a not-word? |
441
|
|
|
|
|
|
|
# by default: |
442
|
|
|
|
|
|
|
# the beginning of a string |
443
|
|
|
|
|
|
|
# the end of a string |
444
|
|
|
|
|
|
|
# whatever we've defined as WhiteSpace |
445
|
|
|
|
|
|
|
# any character that is not a WordChar |
446
|
|
|
|
|
|
|
# any character we explicitly ignore at start or end of word |
447
|
|
|
|
|
|
|
# |
448
|
|
|
|
|
|
|
# the \A and \Z (beginning and end) should help if the word butts up |
449
|
|
|
|
|
|
|
# against the beginning or end of a tagset |
450
|
|
|
|
|
|
|
# like Word or Word |
451
|
|
|
|
|
|
|
|
452
|
51
|
|
|
|
|
212
|
my @start_bound = ( |
453
|
|
|
|
|
|
|
'\A', |
454
|
|
|
|
|
|
|
'[>]', |
455
|
|
|
|
|
|
|
'(?:&[\w\#]+;)', # because a ; might be a legitimate wordchar |
456
|
|
|
|
|
|
|
# and we treat a char entity like a single char. |
457
|
|
|
|
|
|
|
# if &char; resolves to a legit wordchar |
458
|
|
|
|
|
|
|
# this might give unexpected results. |
459
|
|
|
|
|
|
|
# NOTE that etc is in $WhiteSpace |
460
|
|
|
|
|
|
|
$html_whitespace, |
461
|
|
|
|
|
|
|
'[^' . $wordchars . ']' |
462
|
|
|
|
|
|
|
); |
463
|
51
|
50
|
|
|
|
561
|
push( @start_bound, qr/[$ignore_first]+/i ) if length $ignore_first; |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
my @end_bound |
466
|
51
|
|
|
|
|
291
|
= ( '\Z', '[<&]', $html_whitespace, '[^' . $wordchars . ']' ); |
467
|
51
|
50
|
|
|
|
371
|
push( @end_bound, qr/[$ignore_last]+/i ) if length $ignore_last; |
468
|
|
|
|
|
|
|
|
469
|
51
|
|
33
|
|
|
401
|
$self->{start_bound} ||= join( '|', @start_bound ); |
470
|
|
|
|
|
|
|
|
471
|
51
|
|
33
|
|
|
2815
|
$self->{end_bound} ||= join( '|', @end_bound ); |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
# the whitespace in a query phrase might be: |
474
|
|
|
|
|
|
|
# any ignore_last_char, followed by |
475
|
|
|
|
|
|
|
# one or more nonwordchar or whitespace, followed by |
476
|
|
|
|
|
|
|
# any ignore_first_char |
477
|
|
|
|
|
|
|
# define for both text and html |
478
|
|
|
|
|
|
|
# NOTE the first/last swap for plain vs html |
479
|
|
|
|
|
|
|
# is intentional because of how regex are built. |
480
|
|
|
|
|
|
|
|
481
|
51
|
50
|
|
|
|
3442
|
my @plain_phrase_bound = ( |
|
|
50
|
|
|
|
|
|
482
|
|
|
|
|
|
|
( length($ignore_last) ? qr/[$ignore_last]*/i : '' ), |
483
|
|
|
|
|
|
|
qr/(?:[\s\x20]|[^$wordchars])+/is, |
484
|
|
|
|
|
|
|
( length($ignore_first) ? qr/[$ignore_first]?/i : '' ), |
485
|
|
|
|
|
|
|
); |
486
|
51
|
|
|
|
|
252
|
$self->{plain_phrase_bound} = join( '', @plain_phrase_bound ); |
487
|
|
|
|
|
|
|
|
488
|
51
|
50
|
|
|
|
1859
|
my @html_phrase_bound = ( |
|
|
50
|
|
|
|
|
|
489
|
|
|
|
|
|
|
( length($ignore_first) ? qr/[$ignore_first]*/i : '' ), |
490
|
|
|
|
|
|
|
qr/(?:$html_whitespace|[^$wordchars])+/is, |
491
|
|
|
|
|
|
|
( length($ignore_last) ? qr/[$ignore_last]?/i : '' ), |
492
|
|
|
|
|
|
|
); |
493
|
51
|
|
|
|
|
195610
|
$self->{html_phrase_bound} = join( '', @html_phrase_bound ); |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
sub _build_regex { |
498
|
232
|
|
|
232
|
|
411
|
my $self = shift; |
499
|
232
|
50
|
|
|
|
533
|
my $q = shift or croak "need query to build()"; |
500
|
232
|
|
|
|
|
413
|
my $wild = $self->{html_safe_wordchars}; |
501
|
232
|
|
|
|
|
368
|
my $st_bound = $self->{start_bound}; |
502
|
232
|
|
|
|
|
367
|
my $end_bound = $self->{end_bound}; |
503
|
232
|
|
|
|
|
407
|
my $wc = $self->{html_safe_wordchars}; |
504
|
232
|
|
|
|
|
314
|
my $ppb = $self->{plain_phrase_bound}; |
505
|
232
|
|
|
|
|
347
|
my $hpb = $self->{html_phrase_bound}; |
506
|
232
|
|
|
|
|
532
|
my $wildcard = $self->wildcard; |
507
|
232
|
|
|
|
|
358
|
my $wild_esc = quotemeta($wildcard); |
508
|
232
|
|
|
|
|
391
|
my $tag_re = $self->tag_re; |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
# define simple pattern for plain text |
511
|
|
|
|
|
|
|
# and complex pattern for HTML markup |
512
|
232
|
|
|
|
|
312
|
my ( $plain, $html ); |
513
|
232
|
|
|
|
|
366
|
my $escaped = quotemeta($q); |
514
|
232
|
|
|
|
|
943
|
$escaped =~ s/\\[$wild_esc]/[$wc]*/g; # wildcard |
515
|
232
|
|
|
|
|
660
|
$escaped =~ s/\\[\s]/$ppb/g; # whitespace |
516
|
|
|
|
|
|
|
|
517
|
232
|
|
|
|
|
28886
|
$plain = qr/ |
518
|
|
|
|
|
|
|
( |
519
|
|
|
|
|
|
|
\A|$ppb |
520
|
|
|
|
|
|
|
) |
521
|
|
|
|
|
|
|
( |
522
|
|
|
|
|
|
|
${escaped} |
523
|
|
|
|
|
|
|
) |
524
|
|
|
|
|
|
|
( |
525
|
|
|
|
|
|
|
\Z|$ppb |
526
|
|
|
|
|
|
|
) |
527
|
|
|
|
|
|
|
/xis; |
528
|
|
|
|
|
|
|
|
529
|
232
|
|
|
|
|
1423
|
my (@char) = split( m//, $q ); |
530
|
|
|
|
|
|
|
|
531
|
232
|
|
|
|
|
358
|
my $counter = -1; |
532
|
|
|
|
|
|
|
|
533
|
232
|
|
|
|
|
426
|
CHAR: foreach my $c (@char) { |
534
|
1461
|
|
|
|
|
1738
|
$counter++; |
535
|
|
|
|
|
|
|
|
536
|
1461
|
|
100
|
|
|
3201
|
my $ent = $C2E->{$c} || undef; |
537
|
1461
|
|
|
|
|
1799
|
my $num = ord($c); |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
# if this is a special regexp char, protect it |
540
|
1461
|
|
|
|
|
1724
|
$c = quotemeta($c); |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# if it's a *, replace it with the Wild class |
543
|
1461
|
100
|
|
|
|
2177
|
$c = "[$wild]*" if $c eq $wild_esc; |
544
|
|
|
|
|
|
|
|
545
|
1461
|
100
|
|
|
|
2014
|
if ( $c eq '\ ' ) { |
546
|
55
|
|
|
|
|
140
|
$c = $hpb . $tag_re . '*'; |
547
|
55
|
|
|
|
|
114
|
next CHAR; |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
|
550
|
1406
|
|
|
|
|
1434
|
my $aka; |
551
|
1406
|
100
|
|
|
|
1928
|
if ($ent) { |
552
|
1404
|
100
|
|
|
|
2517
|
$aka = $ent eq "$num;" ? $ent : "$ent|$num;"; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
else { |
555
|
2
|
|
|
|
|
5
|
$aka = "$num;"; |
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# make $c into a regexp |
559
|
1406
|
100
|
|
|
|
12396
|
$c = qr/$c|$aka/i unless $c eq "[$wild]*"; |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# any char might be followed by zero or more tags, unless it's the last char |
562
|
1406
|
100
|
|
|
|
5104
|
$c .= $tag_re . '*' unless $counter == $#char; |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# re-join the chars into a single string |
567
|
232
|
|
|
|
|
707
|
my $safe = join( "\n", @char ); # use \n to make it legible in debugging |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
# for debugging legibility we include newlines, so make sure we s//x in matches |
570
|
232
|
|
|
|
|
119020
|
$html = qr/ |
571
|
|
|
|
|
|
|
( |
572
|
|
|
|
|
|
|
${st_bound} |
573
|
|
|
|
|
|
|
) |
574
|
|
|
|
|
|
|
( |
575
|
|
|
|
|
|
|
${safe} |
576
|
|
|
|
|
|
|
) |
577
|
|
|
|
|
|
|
( |
578
|
|
|
|
|
|
|
${end_bound} |
579
|
|
|
|
|
|
|
) |
580
|
|
|
|
|
|
|
/xis; |
581
|
|
|
|
|
|
|
|
582
|
232
|
|
|
|
|
2737
|
return ( $plain, $html, $escaped ); |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
sub _build_term_re { |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# this based on SWISH::PhraseHighlight::set_match_regexp() |
588
|
|
|
|
|
|
|
|
589
|
0
|
|
|
0
|
|
|
my $self = shift; |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
#dump $self; |
592
|
|
|
|
|
|
|
|
593
|
0
|
|
|
|
|
|
my $wc = $self->word_characters; |
594
|
|
|
|
|
|
|
$self->{_wc_regexp} |
595
|
0
|
|
|
|
|
|
= qr/[^$wc]+/io; # regexp for splitting into swish-words |
596
|
|
|
|
|
|
|
|
597
|
0
|
|
|
|
|
|
my $igf = $self->ignore_first_char; |
598
|
0
|
|
|
|
|
|
my $igl = $self->ignore_last_char; |
599
|
0
|
|
|
|
|
|
for ( $igf, $igl ) { |
600
|
0
|
0
|
|
|
|
|
if ($_) { |
601
|
0
|
|
|
|
|
|
$_ = "[$_]*"; |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
else { |
604
|
0
|
|
|
|
|
|
$_ = ''; |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
} |
607
|
|
|
|
|
|
|
|
608
|
0
|
|
|
|
|
|
$self->{_ignoreFirst} = $igf; |
609
|
0
|
|
|
|
|
|
$self->{_ignoreLast} = $igl; |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
1; |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
__END__ |