File Coverage

blib/lib/Search/Tools/QueryParser.pm

Criterion	Covered	Total	%
statement	233	260	89.6
branch	82	126	65.0
condition	17	27	62.9
subroutine	19	21	90.4
pod	3	3	100.0
total	354	437	81.0

line	stmt	bran	cond	sub	pod	time	code
1							package Search::Tools::QueryParser;
2	26			26		90717	use Moo;
	26					129412
	26					206
3							extends 'Search::Tools::Object';
4	26			26		23376	use Carp;
	26					69
	26					1843
5	26			26		2916	use Data::Dump qw( dump );
	26					31222
	26					1569
6	26			26		16929	use Search::Query::Parser;
	26					4155494
	26					1251
7	26			26		6802	use Encode;
	26					110667
	26					2683
8	26			26		232	use Data::Dump;
	26					65
	26					1373
9	26			26		15209	use Search::Tools::Query;
	26					106
	26					1123
10	26			26		225	use Search::Tools::UTF8;
	26					61
	26					2849
11	26			26		222	use Search::Tools::XML;
	26					70
	26					653
12	26			26		161	use Search::Tools::RegEx;
	26					73
	26					686
13
14	26			26		156	use namespace::autoclean;
	26					63
	26					134
15
16							our $VERSION = '1.007';
17
18							my $XML = Search::Tools::XML->new();
19							my $C2E = $XML->char2ent_map;
20
21							# we turn locale pragma on in a small block
22							# because we don't want it to mess up our regex building
23							# or taint vars in other areas. We just want to use setlocale()
24							# and make sure we get correct ->utf8 encoding
25							my ( $locale, $lang, $charset );
26							{
27	26			26		17507	use POSIX qw(locale_h);
	26					164357
	26					204
28							$locale = setlocale(LC_CTYPE);
29							( $lang, $charset ) = split( m/\./, $locale );
30							$charset \|\|= q/UTF-8/; #
31							$lang = q/en_US/ if $lang =~ m/^(posix\|c)$/i;
32							}
33
34							my %Defaults = (
35							and_word => q/and\|near\d*/,
36							charset => $charset,
37							default_field => "",
38							ignore_case => 1,
39							ignore_fields => {},
40							ignore_first_char => quotemeta(q/'-/),
41							ignore_last_char => quotemeta(q/'-/),
42							lang => $lang,
43							locale => $locale,
44							not_word => q/not/,
45							or_word => q/or/,
46							phrase_delim => q/"/,
47							query_class => 'Search::Tools::Query',
48							query_dialect => "Search::Query::Dialect::Native",
49							stemmer => undef,
50							stopwords => [],
51							tag_re => $XML->tag_re,
52							term_re => qr/\w+(?:[\'\-]\w+)*/,
53							term_min_length => 1,
54							treat_uris_like_phrases => 1,
55							whitespace => $XML->html_whitespace,
56							wildcard => q/*/,
57							word_characters => q/\w/ . quotemeta(q/'-/),
58							);
59
60							for my $attr ( keys %Defaults ) {
61							has( $attr => ( is => 'rw', default => sub { $Defaults{$attr} } ) );
62							}
63							has 'start_bound' => ( is => 'ro' );
64							has 'end_bound' => ( is => 'ro' );
65							has 'plain_phrase_bound' => ( is => 'ro' );
66							has 'html_phrase_bound' => ( is => 'ro' );
67
68							sub get_defaults {
69	0			0	1	0	return {%Defaults};
70							}
71
72							sub BUILD {
73	51			51	1	375	my $self = shift;
74
75							# TODO handle case where both term_re and word_characters are defined
76
77							# charset/locale/lang are a bit interdependent
78							# so make sure charset/lang are set if locale is explicitly passed.
79	51	100				299	if ( $self->{locale} ne $Defaults{locale} ) {
80	1					10	( $self->{lang}, $self->{charset} ) = split( m/\./, $self->{locale} );
81	1	50				12	$self->{lang} = 'en_US' if $self->{lang} =~ m/^(posix\|c)$/i;
82	1		33			6	$self->{charset} \|\|= $Defaults{charset};
83							}
84
85							# make sure ignore_fields is a hash ref
86	51	50				242	if ( ref( $self->{ignore_fields} ) eq 'ARRAY' ) {
87							$self->{ignore_fields}
88	0					0	= { map { $_ => $_ } @{ $self->{ignore_fields} } };
	0					0
	0					0
89							}
90
91	51					260	$self->_setup_regex_builder;
92
93	51					590	return $self;
94							}
95
96							sub parse {
97	63			63	1	11614	my $self = shift;
98	63					154	my $query_str = shift;
99	63	50				263	confess "query required" unless defined $query_str;
100	63	50				234	if ( ref $query_str ) {
101	0					0	croak "query must be a scalar string";
102							}
103
104							#$query_str = to_utf8( $query_str, $self->charset );
105	63					259	my $extracted = $self->_extract_terms($query_str);
106	63					262	my %regex;
107	63					189	TERM: for my $term ( @{ $extracted->{terms} } ) {
	63					310
108	145					2570	my ( $plain, $html, $escaped ) = $self->_build_regex($term);
109	145					690	my $is_phrase = $term =~ m/\ /;
110	145					317	my @phrase_terms;
111
112							# if the term is a phrase,
113							# build regex for each term in the phrase
114	145	100				470	if ($is_phrase) {
115	32					215	my @pts = split( /\ /, $term );
116	32					108	for my $pt (@pts) {
117	87					1448	my ( $pt_plain, $pt_html, $pt_esc )
118							= $self->_build_regex($pt);
119	87					3997	push @phrase_terms,
120							Search::Tools::RegEx->new(
121							plain => $pt_plain,
122							html => $pt_html,
123							term => $pt,
124							term_re => qr/$pt_esc/i,
125							is_phrase => 0,
126							);
127							}
128							}
129	145					9856	$regex{$term} = Search::Tools::RegEx->new(
130							plain => $plain,
131							html => $html,
132							term => $term,
133							term_re => qr/$escaped/i,
134							is_phrase => $is_phrase,
135							phrase_terms => \@phrase_terms,
136							);
137
138							}
139							return $self->{query_class}->new(
140							dialect => $extracted->{dialect},
141							terms => $extracted->{terms},
142							fields => $extracted->{fields},
143	63					2072	str => to_utf8( $query_str, $self->charset ),
144							regex => \%regex,
145							qp => $self,
146							);
147							}
148
149							sub _extract_terms {
150	63			63		143	my $self = shift;
151	63					137	my $query = shift;
152	63	50				205	confess "need query to extract terms" unless defined $query;
153	63					335	my $stopwords = $self->stopwords;
154	63					210	my $and_word = $self->and_word;
155	63					210	my $or_word = $self->or_word;
156	63					269	my $not_word = $self->not_word;
157	63					209	my $wildcard = $self->wildcard;
158	63					222	my $phrase = $self->phrase_delim;
159	63					238	my $igf = $self->ignore_first_char;
160	63					200	my $igl = $self->ignore_last_char;
161	63					226	my $wordchar = $self->word_characters;
162	63					206	my $default_field = $self->default_field;
163	63					170	my $esc_wildcard = quotemeta($wildcard);
164	63					2817	my $word_re = qr/(($esc_wildcard)?[$wordchar]+($esc_wildcard)?)/;
165	63					363	my $min_length = $self->term_min_length;
166	63					240	my $raw_query = $query;
167
168	63	100				302	$stopwords = [ split( /\s+/, $stopwords ) ] unless ref $stopwords;
169	63					234	my %stophash = map { to_utf8( lc($_), $self->charset ) => 1 } @$stopwords;
	15					97
170	63					191	my ( %words, %uniq, $c );
171	63					2596	my $parser = Search::Query::Parser->new(
172							and_regex => qr{$and_word}i,
173							or_regex => qr{$or_word}i,
174							not_regex => qr{$not_word}i,
175							default_field => $default_field,
176							query_class => $self->query_dialect,
177							);
178
179	63					160290	my $baked_query = $raw_query;
180	63	50				496	$baked_query = lc($baked_query) if $self->ignore_case;
181	63					6289	$baked_query = to_utf8( $baked_query, $self->charset );
182	63	50				333	my $dialect = $parser->parse($baked_query) or croak $parser->error;
183	63	50				197856	$self->debug && carp "parsetree: " . Data::Dump::dump( $dialect->tree );
184	63					931	my $fields_searched
185							= $self->_get_value_from_tree( \%uniq, $dialect->tree, $c );
186
187	63	50				1719	$self->debug && carp "parsed: " . Data::Dump::dump( \%uniq );
188
189	63					679	my $count = scalar( keys %uniq );
190
191							# parse uniq into word tokens
192							# including removing stop words
193
194	63	50				1351	$self->debug && carp "word_re: $word_re";
195
196	63					740	U: for my $u ( sort { $uniq{$a} <=> $uniq{$b} } keys %uniq ) {
	168					538
197
198	152					412	my $n = $uniq{$u};
199
200							# only phrases have space
201							# but due to our word_re, a single non-spaced string
202							# might actually be multiple word tokens
203	152		100			871	my $isphrase = $u =~ m/\s/ \|\| 0;
204
205	152	50				649	if ( $self->treat_uris_like_phrases ) {
206
207							# special case: treat email addresses, uris, as phrase
208	152		100			3317	$isphrase \|\|= $u =~ m/[$wordchar][\@\.\\\/][$wordchar]/ \|\| 0;
			100
209							}
210
211	152	50				3587	$self->debug && carp "$u -> isphrase = $isphrase";
212
213	152					1274	my @w = ();
214
215	152					708	TOK: for my $w ( split( m/\s+/, to_utf8( $u, $self->charset ) ) ) {
216
217	199	50				869	next TOK unless $w =~ m/\S/;
218
219	199					950	$w =~ s/\Q$phrase\E//g;
220
221	199					1380	while ( $w =~ m/$word_re/g ) {
222	208					598	my $tok = _untaint($1);
223
224							# strip ignorable chars
225	208	50				1529	$tok =~ s/^[$igf]+// if length($igf);
226	208	50				1280	$tok =~ s/[$igl]+$// if length($igl);
227
228	208	50				594	unless ($tok) {
229	0	0				0	$self->debug && carp "no token for '$w' $word_re";
230	0					0	next TOK;
231							}
232
233	208	50				4614	$self->debug && carp "found token: $tok";
234
235	208	100				1925	if ( exists $stophash{ lc($tok) } ) {
236	13	50				243	$self->debug && carp "$tok = stopword";
237	13	100				114	next TOK unless $isphrase;
238							}
239
240	202	100				551	unless ($isphrase) {
241	115	50				1798	next TOK if $tok =~ m/^($and_word\|$or_word\|$not_word)$/i;
242							}
243
244							# if tainting was on, odd things can happen.
245							# so check one more time
246	202					887	$tok = to_utf8( $tok, $self->charset );
247
248							# final sanity check
249	202	50				661	if ( !Encode::is_utf8($tok) ) {
250	0					0	carp "$tok is NOT utf8";
251	0					0	next TOK;
252							}
253
254							#$self->debug && carp "pushing $tok into wordlist";
255	202					1216	push( @w, $tok );
256
257							}
258
259							}
260
261	152	100				550	next U unless @w;
262
263							#$self->debug && carp "joining \@w: " . Data::Dump::dump(\@w);
264	146	100				405	if ($isphrase) {
265	32					215	$words{ join( ' ', @w ) } = $n + $count++;
266							}
267							else {
268	114					305	for (@w) {
269	115					573	$words{$_} = $n + $count++;
270							}
271							}
272
273							}
274
275	63	50				1355	$self->debug && carp "tokenized: " . Data::Dump::dump( \%words );
276
277							# make sure we don't have 'foo' and 'foo*'
278	63					672	for ( keys %words ) {
279	147	100				761	if ( $_ =~ m/$esc_wildcard/ ) {
280	12					103	( my $copy = $_ ) =~ s,$esc_wildcard,,g;
281
282							# delete the more exact of the two
283							# since the * will match both
284	12					40	delete( $words{$copy} );
285							}
286
287	147	100				550	if ( length $_ < $min_length ) {
288	1	50				19	$self->debug and carp "token too short: '$_'";
289	1					8	delete $words{$_};
290							}
291
292							}
293
294	63	50				1372	$self->debug && carp "wildcards removed: " . Data::Dump::dump( \%words );
295
296							# if any words need to be stemmed
297	63	100				763	if ( $self->stemmer ) {
298
299							# split each $word into words
300							# stem each word
301							# if stem ne word, break into chars and find first N common
302							# rejoin $uniq
303
304							#carp "stemming ON\n";
305
306	8					31	K: for ( keys %words ) {
307	15					61	my (@w) = split /\s+/;
308	15					33	W: for my $w (@w) {
309	27					57	my $func = $self->stemmer;
310	27					64	my $f = &$func( $self, $w );
311	27	50	33			211	if ( !defined $f or !length $f ) {
312	0					0	next W;
313							}
314	27					113	$f = to_utf8($f);
315
316							#warn "w: $w\nf: $f\n";
317
318							# add wildcard to indicate chars were lost
319	27					88	$w = $f . $wildcard;
320
321							}
322	15					40	my $new = join ' ', @w;
323	15	50				83	if ( $new ne $_ ) {
324	15					47	$words{$new} = $words{$_};
325	15					45	delete $words{$_};
326							}
327							}
328
329							}
330
331	63	50				1249	$self->debug && carp "stemming done: " . Data::Dump::dump( \%words );
332
333							# sort keeps query in same order as we entered
334							return {
335	63					1030	terms => [ sort { $words{$a} <=> $words{$b} } keys %words ],
	156					862
336							fields => [ keys %$fields_searched ],
337							dialect => $dialect,
338							query => $raw_query,
339							};
340
341							}
342
343							# stolen nearly verbatim from Taint::Runtime
344							# apparently regex can be tainted when running under 'use locale'.
345							# as of version 0.24 this should not be needed but until I can find a way
346							# to easily test the Taint feature, we just do this. It's low overhead.
347							sub _untaint {
348	208			208		570	my $str = shift;
349	208	50				663	my $ref = ref($str) ? $str : \$str;
350	208	50				564	if ( !defined $$ref ) {
351	0					0	$$ref = undef;
352							}
353							else {
354							$$ref
355							= ( $$ref =~ /(.*)/ )
356							? $1
357	208	50				983	: do { confess("Couldn't find data to untaint") };
	0					0
358							}
359	208	50				695	return ref($str) ? 1 : $str;
360							}
361
362							sub _get_value_from_tree {
363	63			63		4407	my $self = shift;
364	63					131	my $uniq = shift;
365	63					139	my $parseTree = shift;
366	63					142	my $c = shift;
367	63					167	my %fields = ();
368
369							# we only want the values from non minus queries
370	63					170	for my $node ( '+', '' ) {
371	126	100				439	next unless exists $parseTree->{$node};
372
373	63					130	my @branches = @{ $parseTree->{$node} };
	63					228
374
375							#warn dump \@branches;
376
377	63					175	for my $leaf (@branches) {
378	150					346	my $v = $leaf->{value};
379	150	50				398	if ( !defined $v ) {
380	0					0	croak "undefined value in query tree: " . dump($leaf);
381							}
382	150	50	66			467	if ( defined $leaf->{field}
383							and exists $self->ignore_fields->{ $leaf->{field} } )
384							{
385	0					0	next;
386							}
387	150					291	my $field = $leaf->{field};
388	150	100				356	if ( defined $field ) {
389	3					11	$fields{$field}++;
390							}
391	150	50				552	if ( ref $v eq 'HASH' ) {
		100
392	0					0	my $f = $self->_get_value_from_tree( $uniq, $v, $c );
393	0					0	$fields{$_} = $f->{$_} for ( keys %$f );
394							}
395							elsif ( ref $v eq 'ARRAY' ) {
396	1					4	for my $value (@$v) {
397	2					6	$value =~ s/\s+/ /g;
398	2					7	$uniq->{$value} = ++$c;
399							}
400							}
401							else {
402
403							# if the $leaf is a proximity query,
404							# ignore the "phrase-ness" of it and split
405							# on whitespace. This is a compromise,
406							# mitigated by the tendency of HeatMap
407							# to reward proximity anyway.
408	149	100	66			507	if ( $leaf->{proximity} and $leaf->{proximity} > 1 ) {
409	1					8	my @tokens = split( m/\ +/, $v );
410	1					6	$uniq->{$_} = ++$c for @tokens;
411	1					4	next;
412							}
413
414							# collapse any whitespace
415	148					566	$v =~ s,\s+,\ ,g;
416
417	148					689	$uniq->{$v} = ++$c;
418							}
419							}
420							}
421	63					302	return \%fields;
422							}
423
424							sub _setup_regex_builder {
425	51			51		124	my $self = shift;
426
427							# TODO optional for term_re
428
429							# a search for a '<' or '>' should still highlight,
430							# since < or > can be indexed as literal < and >
431							# but this causes a great deal of hassle
432							# so we just ignore them.
433	51					222	my $wordchars = $self->word_characters;
434	51					176	$wordchars =~ s,[<>&],,g;
435	51					162	$self->{html_safe_wordchars} = $wordchars; # remember for build
436	51					192	my $ignore_first = $self->ignore_first_char;
437	51					173	my $ignore_last = $self->ignore_last_char;
438	51					303	my $html_whitespace = $self->whitespace;
439
440							# what's the boundary between a word and a not-word?
441							# by default:
442							# the beginning of a string
443							# the end of a string
444							# whatever we've defined as WhiteSpace
445							# any character that is not a WordChar
446							# any character we explicitly ignore at start or end of word
447							#
448							# the \A and \Z (beginning and end) should help if the word butts up
449							# against the beginning or end of a tagset
450							# like Word or Word
451
452	51					358	my @start_bound = (
453							'\A',
454							'[>]',
455							'(?:&[\w\#]+;)', # because a ; might be a legitimate wordchar
456							# and we treat a char entity like a single char.
457							# if &char; resolves to a legit wordchar
458							# this might give unexpected results.
459							# NOTE that etc is in $WhiteSpace
460							$html_whitespace,
461							'[^' . $wordchars . ']'
462							);
463	51	50				979	push( @start_bound, qr/[$ignore_first]+/i ) if length $ignore_first;
464
465							my @end_bound
466	51					409	= ( '\Z', '[<&]', $html_whitespace, '[^' . $wordchars . ']' );
467	51	50				624	push( @end_bound, qr/[$ignore_last]+/i ) if length $ignore_last;
468
469	51		33			593	$self->{start_bound} \|\|= join( '\|', @start_bound );
470
471	51		33			461	$self->{end_bound} \|\|= join( '\|', @end_bound );
472
473							# the whitespace in a query phrase might be:
474							# any ignore_last_char, followed by
475							# one or more nonwordchar or whitespace, followed by
476							# any ignore_first_char
477							# define for both text and html
478							# NOTE the first/last swap for plain vs html
479							# is intentional because of how regex are built.
480
481	51	50				5083	my @plain_phrase_bound = (
		50
482							( length($ignore_last) ? qr/[$ignore_last]*/i : '' ),
483							qr/(?:[\s\x20]\|[^$wordchars])+/is,
484							( length($ignore_first) ? qr/[$ignore_first]?/i : '' ),
485							);
486	51					372	$self->{plain_phrase_bound} = join( '', @plain_phrase_bound );
487
488	51	50				2835	my @html_phrase_bound = (
		50
489							( length($ignore_first) ? qr/[$ignore_first]*/i : '' ),
490							qr/(?:$html_whitespace\|[^$wordchars])+/is,
491							( length($ignore_last) ? qr/[$ignore_last]?/i : '' ),
492							);
493	51					261128	$self->{html_phrase_bound} = join( '', @html_phrase_bound );
494
495							}
496
497							sub _build_regex {
498	232			232		582	my $self = shift;
499	232	50				765	my $q = shift or croak "need query to build()";
500	232					605	my $wild = $self->{html_safe_wordchars};
501	232					541	my $st_bound = $self->{start_bound};
502	232					511	my $end_bound = $self->{end_bound};
503	232					546	my $wc = $self->{html_safe_wordchars};
504	232					463	my $ppb = $self->{plain_phrase_bound};
505	232					471	my $hpb = $self->{html_phrase_bound};
506	232					797	my $wildcard = $self->wildcard;
507	232					521	my $wild_esc = quotemeta($wildcard);
508	232					553	my $tag_re = $self->tag_re;
509
510							# define simple pattern for plain text
511							# and complex pattern for HTML markup
512	232					452	my ( $plain, $html );
513	232					479	my $escaped = quotemeta($q);
514	232					1403	$escaped =~ s/\\[$wild_esc]/[$wc]*/g; # wildcard
515	232					933	$escaped =~ s/\\[\s]/$ppb/g; # whitespace
516
517	232					40141	$plain = qr/
518							(
519							\A\|$ppb
520							)
521							(
522							${escaped}
523							)
524							(
525							\Z\|$ppb
526							)
527							/xis;
528
529	232					2146	my (@char) = split( m//, $q );
530
531	232					526	my $counter = -1;
532
533	232					633	CHAR: foreach my $c (@char) {
534	1461					2490	$counter++;
535
536	1461		100			4672	my $ent = $C2E->{$c} \|\| undef;
537	1461					2552	my $num = ord($c);
538
539							# if this is a special regexp char, protect it
540	1461					2409	$c = quotemeta($c);
541
542							# if it's a *, replace it with the Wild class
543	1461	100				3077	$c = "[$wild]*" if $c eq $wild_esc;
544
545	1461	100				2891	if ( $c eq '\ ' ) {
546	55					213	$c = $hpb . $tag_re . '*';
547	55					160	next CHAR;
548							}
549
550	1406					1982	my $aka;
551	1406	100				2694	if ($ent) {
552	1404	100				3600	$aka = $ent eq "&#$num;" ? $ent : "$ent\|&#$num;";
553							}
554							else {
555	2					6	$aka = "&#$num;";
556							}
557
558							# make $c into a regexp
559	1406	100				18415	$c = qr/$c\|$aka/i unless $c eq "[$wild]*";
560
561							# any char might be followed by zero or more tags, unless it's the last char
562	1406	100				7552	$c .= $tag_re . '*' unless $counter == $#char;
563
564							}
565
566							# re-join the chars into a single string
567	232					1079	my $safe = join( "\n", @char ); # use \n to make it legible in debugging
568
569							# for debugging legibility we include newlines, so make sure we s//x in matches
570	232					169069	$html = qr/
571							(
572							${st_bound}
573							)
574							(
575							${safe}
576							)
577							(
578							${end_bound}
579							)
580							/xis;
581
582	232					3995	return ( $plain, $html, $escaped );
583							}
584
585							sub _build_term_re {
586
587							# this based on SWISH::PhraseHighlight::set_match_regexp()
588
589	0			0			my $self = shift;
590
591							#dump $self;
592
593	0						my $wc = $self->word_characters;
594							$self->{_wc_regexp}
595	0						= qr/[^$wc]+/io; # regexp for splitting into swish-words
596
597	0						my $igf = $self->ignore_first_char;
598	0						my $igl = $self->ignore_last_char;
599	0						for ( $igf, $igl ) {
600	0	0					if ($_) {
601	0						$_ = "[$_]*";
602							}
603							else {
604	0						$_ = '';
605							}
606							}
607
608	0						$self->{_ignoreFirst} = $igf;
609	0						$self->{_ignoreLast} = $igl;
610
611							}
612
613							1;
614
615							__END__