File Coverage

blib/lib/KSx/Search/RegexpTermQuery.pm

Criterion	Covered	Total	%
statement	42	126	33.3
branch	4	26	15.3
condition	2	14	14.2
subroutine	11	21	52.3
pod	2	2	100.0
total	61	189	32.2

line	stmt	bran	cond	sub	pod	time	code
1	2			2		1045	use strict;
	2					3
	2					86
2	2			2		12	use warnings;
	2					4
	2					97
3
4							package KSx::Search::RegexpTermQuery;
5	2			2		22	use base qw( KinoSearch::Search::Query );
	2					4
	2					2196
6
7							our $VERSION = '0.05';
8
9	2			2		8421	use Hash::Util::FieldHash::Compat 'fieldhashes';
	2					6406
	2					16
10							fieldhashes \my( %re, %prefix, %field );
11
12							sub new {
13	2			2	1	5746	my ($package, %args) = @_;
14
15	2					15	my $re = delete $args{regexp};
16	2					7	my $field = delete $args{field};
17
18	2					46	my $self = $package->SUPER::new(%args);
19
20	2					529	$re{$self} = $re;
21	2					12	$field{$self} = $field;
22
23							# get the literal prefix of the regexp, if any.
24	2	100	50			80	if($re{$self} =~
	1					21
25							m<^
26							(?: # prefix for qr//'s, without allowing /i :
27							\(\? ([a-hj-z]) (?:-[a-z])?:
28							)?
29							(\\[GA]\|\^) # anchor
30							([^#\$()*+.?[\]\\^]+) # literal pat (no metachars or comments)
31							>x
32							) {{
33	1					2	my ($mod,$anchor,$prefix) = ($1\|\|'',$2,$3);
34	1	50	33			8	$anchor eq '^' and $mod =~ /m/ and last;
35	1					28	for($prefix) {
36	1	50				5	$mod =~ /x/ and s/\s+//g;
37							}
38	1					6	$prefix{$self} = $prefix;
39							}}
40
41	2					10	$self;
42							}
43
44							#sub extract_terms {
45							# my $self = shift;
46							# return @{ $self->{terms} };
47							#}
48
49							sub make_compiler {
50	2			2	1	108	return KSx::Search::RegexpTermCompiler->new(
51							parent => @_
52							);
53							}
54
55
56							package KSx::Search::RegexpTermCompiler;
57	2			2		828	use base qw( KinoSearch::Search::Compiler );
	2					5
	2					2454
58
59	2			2		345	use Hash::Util::FieldHash::Compat 'fieldhashes';
	2					5
	2					19
60
61							fieldhashes \my ( %idf, %raw_impact, #%plists,
62							%terms,
63							%query_norm_factor, % normalized_impact, %tfs );
64							sub new {
65	2			2		10	my($pack, %args) = @_;
66
67	2					6	my $searcher = $args{searchable};
68	2					556	my $reader = $searcher->get_reader;
69	0						my $lex_reader = $reader->fetch("KinoSearch::Index::LexiconReader");
70	0						my $post_reader = $reader->fetch("KinoSearch::Index::PostingsReader");
71
72							# Retrieve the correct Similarity for the Query's field.
73	0						my $sim = $args{similarity} =
74							$searcher->get_schema->fetch_sim($field{$args{parent}});
75
76	0						my $self = $pack->SUPER::new(%args);
77
78	0						my $parent = $args{parent};
79
80							# Get a lexicon and find our place therein
81	0						my( $re, $prefix ) = ($re{$parent}, $prefix{$parent});
82	0	0					ref $re eq 'Regexp' or $re = qr/$re/; # avoid repetitive recompilation
83	0						my $lexcn = $lex_reader->lexicon( field => $field{$parent} );
84	0	0					$lexcn->seek(defined $prefix ? $prefix : '');
85
86							# iterate through it, stopping at terms that match
87	0						my @terms; #my @plists;
88							my %hits; # The keys are the doc nums; the values the tfs.
89
90	0						while () {
91	0						my $term = get_term $lexcn;
92
93							# sift out unwanted terms
94	0	0	0				last if defined $prefix and index( $term, $prefix ) != 0;
95	0	0					next unless $term =~ $re;
96
97							# for terms that match...
98
99	0						push @terms, $term;
100
101							# We have to iterate through the documents in each posting list,
102							# recording the doc numbers, so we can calc the doc freq later on.
103							# E.g., if there are two documents, one containing ‘dog’ and ‘dot,’
104							# and the other containing just ‘dog,’ and the re is /^do.*/, then
105							# the doc freq has to be 2, since the re matches two docs. The doc
106							# freqs of the individual terms are 1 and 2, so we can’t add or
107							# average them.
108	0						my $plist = $post_reader->posting_list(
109							term => $term,
110							field => $field{$parent},
111							);
112	0						my $posting; my $weight;
113	0						while (my $doc_num = $plist ->next) {
114							# For efficiency’s sake, we’ll collect the results now, to
115							# avoid iterating through postings (the slowest part of search-
116							# ing) more than once, even though this code probably belongs
117							# in RegexpTermScorer
118	0		0				my $posting \|\|= $plist->get_posting;
119	0		0				$hits{$doc_num} +=
120							$weight \|\|= $posting->get_freq * $posting->get_weight
121							}
122
123							} continue {
124	0	0					last unless $lexcn->next ;
125							}
126	0						my $doc_freq = scalar keys %hits;
127
128							# Save the hits and terms for later
129							# $plists{$self} = \@plists;
130	0						$tfs{$self} = \%hits;
131	0						$terms{$self} = \@terms;
132
133							# Calculate and store the IDF
134	0						my $max_doc = $searcher->doc_max;
135	0	0					my $idf = $idf{$self} = $max_doc
136							? 1 + log( $max_doc / ( 1 + $doc_freq ) )
137							: 1
138							;
139
140	0						$raw_impact{$self} = $idf * $parent->get_boost;
141
142							# make final preparations
143	0						$self->perform_query_normalization($searcher);
144
145	0						$self;
146							}
147
148							sub perform_query_normalization {
149							# copied from KinoSearch::Search::Weight originally
150	0			0			my ( $self, $searcher ) = @_;
151	0						my $sim = $self->get_similarity;
152
153	0						my $factor = $self->sum_of_squared_weights; # factor = ( tf_q * idf_t )
154	0						$factor = $sim->query_norm($factor); # factor /= norm_q
155	0						$self->normalize($factor); # impact *= factor
156							}
157
158	0			0			sub get_value { shift->get_parent->get_boost }
159
160	0			0			sub sum_of_squared_weights { $raw_impact{+shift}**2 }
161
162							sub normalize { # copied from TermQuery
163	0			0			my ( $self, $query_norm_factor ) = @_;
164	0						$query_norm_factor{$self} = $query_norm_factor;
165
166							# Multiply raw impact by ( tf_q * idf_q / norm_q )
167							#
168							# Note: factoring in IDF a second time is correct. See formula.
169	0						$normalized_impact{$self}
170							= $raw_impact{$self} * $idf{$self} * $query_norm_factor;
171							}
172
173							sub make_matcher {
174	0			0			my $self = shift;
175
176	0						return KSx::Search::RegexpTermScorer->new(
177							# posting_lists => $plists{$self},
178							@_,
179							compiler => $self,
180							);
181							}
182
183							sub highlight_spans { # plagiarised form of TermWeight’s routine
184	0			0			my ($self, %args) = @_;
185	0						my $doc_vector = $args{doc_vec};
186	0						my $field_name = $args{field};
187	0	0					return if $field{$self->get_parent} ne $field_name;
188	0						my $searcher = $args{searcher};
189	0						my $terms = $terms{$self};
190
191	0						require KinoSearch::Search::Span;
192
193	0						my @posits;
194	0						my $weight_val = $self->get_value;
195	0						for (@$terms) {
196	0						my $term_vector
197							= $doc_vector->term_vector( field => $field_name, term => $_ );
198	0	0					next unless defined $term_vector;
199	0						my $starts = $term_vector->get_start_offsets->to_arrayref;
200	0						my $ends = $term_vector->get_end_offsets->to_arrayref;
201	0						while (@$starts) {
202	0						my $start = shift @$starts;
203	0						push @posits, KinoSearch::Search::Span->new(
204							offset => $start,
205							length => shift(@$ends)-$start,
206							weight => $weight_val,
207							);
208							}
209							}
210
211	0						return \@posits;
212							}
213
214
215							package KSx::Search::RegexpTermScorer;
216	2			2		2406	use base 'KinoSearch::Search::Matcher';
	2					5
	2					2345
217
218	2			2		439	use Hash::Util::FieldHash::Compat 'fieldhashes';
	2					5
	2					20
219							fieldhashes\my( %doc_nums, %pos, %wv, %sim, %compiler );
220
221							sub new {
222	0			0			my ($class, %args) = @_;
223							# my $plists = delete $args{posting_lists};
224	0						my $compiler = delete $args{compiler};
225	0						my $reader = delete $args{reader};
226	0						my $need_score = delete $args{need_score};
227	0						my $self = $class->SUPER::new(%args);
228	0						$sim{$self} = $compiler->get_similarity;
229
230	0						my $tfs = $tfs{$compiler};
231	0						$doc_nums{$self} = [ sort { $a <=> $b } keys %$tfs ];
	0
232
233	0						$pos{$self} = -1;
234	0						$wv {$self} = $compiler->get_value;
235	0						$compiler{$self} = $compiler;
236
237	0						$self
238							}
239
240							sub next {
241	0			0			my $self = shift;
242	0						my $doc_nums = $doc_nums{$self};
243	0	0					return 0 if $pos{$self} >= $#$doc_nums;
244	0						return $$doc_nums[ ++$pos{$self} ];
245							}
246
247							sub get_doc_num {
248	0			0			my $self = shift;
249	0						my $pos = $pos{$self};
250	0						my $doc_nums = $doc_nums{$self};
251	0	0					return $pos < scalar @$doc_nums ? $$doc_nums[$pos] : 0;
252							}
253
254							sub score {
255	0			0			my $self = shift;
256	0						my $pos = $pos{$self};
257	0						my $doc_nums = $doc_nums{$self};
258	0						return $wv{$self} * $sim{$self}->tf(
259							$tfs{$compiler{$self}}{$$doc_nums[$pos]}
260							);
261							}
262
263
264							1;
265
266							__END__