File Coverage

blib/lib/Search/Tools/Query.pm

Criterion	Covered	Total	%
statement	114	126	90.4
branch	15	24	62.5
condition			n/a
subroutine	24	25	96.0
pod	11	11	100.0
total	164	186	88.1

line	stmt	bran	sub	pod	time	code
1						package Search::Tools::Query;
2	26		26		1599	use Moo;
	26				12461
	26				218
3						extends 'Search::Tools::Object';
4						use overload
5	9		9		1066	'""' => sub { $_[0]->str; },
6	36		36		7248	'bool' => sub {1},
7	26		26		13147	fallback => 1;
	26				89
	26				399
8	26		26		3273	use Carp;
	26				106
	26				2026
9	26		26		232	use Data::Dump qw( dump );
	26				74
	26				1586
10	26		26		12546	use Search::Tools::RegEx;
	26				85
	26				997
11	26		26		5522	use Search::Tools::UTF8;
	26				78
	26				3165
12	26		26		5838	use Search::Tools::Tokenizer;
	26				96
	26				932
13	26		26		6337	use Search::Tools::XML;
	26				79
	26				1120
14
15	26		26		224	use namespace::autoclean;
	26				67
	26				155
16
17						our $VERSION = '1.007';
18
19						has 'terms' => ( is => 'ro' );
20						has 'fields' => ( is => 'ro' );
21						has 'dialect' => ( is => 'ro' );
22						has 'str' => ( is => 'ro' );
23						has 'regex' => ( is => 'ro' );
24						has 'qp' => ( is => 'ro' );
25
26						=head1 NAME
27
28						Search::Tools::Query - objectified string for highlighting, snipping, etc.
29
30						=head1 SYNOPSIS
31
32						use Search::Tools::QueryParser;
33						my $qparser = Search::Tools::QueryParser->new;
34						my $query = $qparser->parse(q(the quick color:brown "fox jumped"));
35						my $fields = $query->fields; # ['color']
36						my $terms = $query->terms; # ['quick', 'brown', '"fox jumped"']
37						my $regex = $query->regex_for($terms->[0]); # S::T::RegEx
38						my $tree = $query->tree; # the Search::Query::Dialect tree()
39						print "$query\n"; # the quick color:brown "fox jumped"
40						print $query->str . "\n"; # same thing
41
42
43						=head1 DESCRIPTION
44
45
46						=head1 METHODS
47
48						=head2 fields
49
50						Array ref of fields from the original query string.
51						See Search::Tools::QueryParser for controls over ignore_fields().
52
53						=head2 terms
54
55						Array ref of key words from the original query string.
56						See Search::Tools::QueryParser for controls over ignore_fields()
57						and tokenizing regex.
58
59						B
60						Only positive words are extracted by QueryParser.
61						In other words, if you search for:
62
63						foo not bar
64
65						then only C is returned. Likewise:
66
67						+foo -bar
68
69						would return only C.
70
71						=head2 str
72
73						The original string.
74
75						=head2 regex
76
77						The hash ref of terms to Search::Tools::RegEx objects.
78
79						=head2 dialect
80
81						The internal Search::Query::Dialect object. See tree()
82						and str_clean() which delegate to the dialect object.
83
84						=head2 qp
85
86						The Search::Tools::QueryParser object used to generate the Query.
87
88						=head2 num_terms
89
90						Returns the number of terms().
91
92						=cut
93
94						sub num_terms {
95	38		38	1	2610	return scalar @{ shift->{terms} };
	38				190
96						}
97
98						=head2 unique_terms
99
100						Returns array ref of unique terms from query.
101						If stemming was on in the QueryParser,
102						all terms have already been stemmed as part
103						of the parsing process.
104
105						=cut
106
107						sub unique_terms {
108	2		2	1	8	my $self = shift;
109	2				5	my @t = @{ $self->{terms} };
	2				12
110	2				6	my %uniq;
111	2				9	for my $t (@t) {
112	4				17	my $re = $self->regex_for($t);
113	4	50			22	if ( $re->is_phrase ) {
114	4				11	for my $pt ( @{ $re->phrase_terms } ) {
	4				21
115	8				44	$uniq{ $pt->term }++;
116						}
117						}
118						else {
119	0				0	$uniq{ $re->term }++;
120						}
121						}
122	2				30	return [ keys %uniq ];
123						}
124
125						=head2 num_unique_terms
126
127						Returns number of unique_terms().
128
129						=cut
130
131						sub num_unique_terms {
132	2		2	1	8	return scalar( @{ $_[0]->unique_terms } );
	2				12
133						}
134
135						=head2 phrases
136
137						Return array ref of RegEx objects for all terms where is_phrase
138						is true.
139
140						=cut
141
142						sub phrases {
143	30		30	1	84	my $self = shift;
144	30				71	my @p;
145	30				72	for my $t ( keys %{ $self->{regex} } ) {
	30				198
146	57	100			313	if ( $self->{regex}->{$t}->is_phrase ) {
147	21				68	push @p, $self->{regex}->{$t};
148						}
149						}
150	30				175	return \@p;
151						}
152
153						=head2 non_phrases
154
155						Return array ref of RegEx objects for all terms where is_phrase
156						is false.
157
158						=cut
159
160						sub non_phrases {
161	0		0	1	0	my $self = shift;
162	0				0	my @p;
163	0				0	for my $t ( keys %{ $self->{regex} } ) {
	0				0
164	0	0			0	if ( !$self->{regex}->{$t}->is_phrase ) {
165	0				0	push @p, $self->{regex}->{$t};
166						}
167						}
168	0				0	return \@p;
169						}
170
171						=head2 tree
172
173						Returns the internal Search::Query::Dialect tree().
174
175						=cut
176
177						sub tree {
178	1		1	1	673	my $self = shift;
179	1				8	return $self->dialect->tree();
180						}
181
182						=head2 str_clean
183
184						Returns the internal Search::Query::Dialect stringify().
185
186						=cut
187
188						sub str_clean {
189	1		1	1	2	my $self = shift;
190	1				11	return $self->dialect->stringify();
191						}
192
193						=head2 regex_for(I)
194
195						Returns a Search::Tools::RegEx object for I.
196
197						=cut
198
199						sub regex_for {
200	116		116	1	39890	my $self = shift;
201	116				248	my $term = shift;
202	116	50			335	unless ( defined $term ) {
203	0				0	croak "term required";
204						}
205	116	50			384	my $regex = $self->{regex} or croak "regex not defined for query";
206	116	50			391	if ( !exists $regex->{$term} ) {
207	0				0	croak "no regex for $term";
208						}
209	116				480	return $regex->{$term};
210						}
211
212						=head2 regexp_for
213
214						Alias for regex_for(). The author has come to prefer "regex"
215						instead of "regexp" because it's one less keystroke.
216
217						=cut
218
219						*regexp_for = \®ex_for;
220
221						=head2 matches_text( I )
222
223						Returns the number of matches for the query against I.
224
225						=head2 matches_html( I )
226
227						Returns the number of matches for the query against I.
228
229						=cut
230
231						sub _matches_stemmed {
232	4		4		10	my $self = shift;
233	4				15	my $text = to_utf8( $_[0] );
234	4				8	my $count = 0;
235	4				13	my $qp = $self->qp;
236	4				9	my $stemmer = $qp->stemmer;
237	4				12	my $wildcard = $qp->wildcard;
238	4				112	my $tokenizer = Search::Tools::Tokenizer->new(
239						re => $qp->term_re,
240						debug => $self->debug,
241						);
242
243						# stem the whole text, creating a new buffer to
244						# match against. This covers both the cases where
245						# a term is a phrase and where it is not.
246	4				7	my @buf;
247						my $buf_maker = sub {
248	30		30		356	push @buf, $stemmer->( $qp, $_[0]->str );
249	4				24	};
250	4				159	$tokenizer->tokenize( $text, $buf_maker );
251	4				127	my $new_text = join( " ", @buf );
252
253	4				9	for my $term ( @{ $self->{terms} } ) {
	4				14
254	4				14	my $re = $self->{regex}->{$term}->{plain};
255	4				58	$count += $new_text =~ m/$re/;
256						}
257	4				65	return $count;
258						}
259
260						sub _matches {
261	2		2		4	my $self = shift;
262	2				4	my $style = shift;
263	2				7	my $text = to_utf8( $_[0] );
264	2				4	my $count = 0;
265	2				3	for my $term ( @{ $self->{terms} } ) {
	2				6
266	2				7	my $regex = $self->{regex}->{$term}->{$style};
267	2				41	$count += $text =~ m/$regex/;
268						}
269	2				11	return $count;
270						}
271
272						sub matches_text {
273	3		3	1	538	my $self = shift;
274	3				5	my $text = shift;
275	3	50			12	if ( !defined $text ) {
276	0				0	croak "text required";
277						}
278	3	100			22	return $self->_matches_stemmed($text) if $self->qp->stemmer;
279	1				5	return $self->_matches( 'plain', $text );
280						}
281
282						sub matches_html {
283	3		3	1	289	my $self = shift;
284	3				8	my $html = shift;
285	3	50			12	if ( !defined $html ) {
286	0				0	croak "html required";
287						}
288	3	100			19	if ( $self->qp->stemmer ) {
289	2				17	return $self->_matches_stemmed( Search::Tools::XML->no_html($html) );
290						}
291	1				4	return $self->_matches( 'html', $html );
292						}
293
294						=head2 terms_as_regex([I])
295
296						Returns all terms() as a single qr// regex, pipe-joined in a "OR"
297						logic.
298
299						=cut
300
301						sub terms_as_regex {
302	31		31	1	77	my $self = shift;
303	31				62	my $treat_phrases_as_singles = shift;
304	31	50			105	$treat_phrases_as_singles = 1 unless defined $treat_phrases_as_singles;
305	31				134	my $wildcard = $self->qp->wildcard;
306	31				82	my $wild_esc = quotemeta($wildcard);
307	31				102	my $wc = $self->qp->word_characters;
308	31				63	my @re;
309	31				58	for my $term ( @{ $self->{terms} } ) {
	31				122
310
311	65				147	my $q = quotemeta($term); # quotemeta speeds up the match, too
312						# even though we have to unquote below
313
314	65				392	$q =~ s/\\$wild_esc/[$wc]*/g; # wildcard match is very approximate
315
316						# treat phrases like OR'd words
317						# since that will just create more matches.
318						# if hiliting later, the phrase will be treated as such.
319	65	100			175	if ($treat_phrases_as_singles) {
320	46				180	$q =~ s/(\\ )+/\\|/g;
321						}
322
323						# if keeping phrases together use a less-naive regex instead of a space.
324						else {
325
326						#$q = $self->regex_for($term)->plain();
327						#$q =~ s/(\\ )+/[^$wc]+/g;
328						}
329
330	65				190	push( @re, $q );
331						}
332
333	31				219	my $j = sprintf( '(%s)', join( '\|', @re ) );
334	31				1372	return qr/$j/i;
335						}
336
337						1;
338
339						__END__