File Coverage

blib/lib/KSx/Search/RegexpTermQuery.pm
Criterion Covered Total %
statement 42 126 33.3
branch 4 26 15.3
condition 2 14 14.2
subroutine 11 21 52.3
pod 2 2 100.0
total 61 189 32.2


line stmt bran cond sub pod time code
1 2     2   1045 use strict;
  2         3  
  2         86  
2 2     2   12 use warnings;
  2         4  
  2         97  
3              
4             package KSx::Search::RegexpTermQuery;
5 2     2   22 use base qw( KinoSearch::Search::Query );
  2         4  
  2         2196  
6              
7             our $VERSION = '0.05';
8              
9 2     2   8421 use Hash::Util::FieldHash::Compat 'fieldhashes';
  2         6406  
  2         16  
10             fieldhashes \my( %re, %prefix, %field );
11              
12             sub new {
13 2     2 1 5746 my ($package, %args) = @_;
14              
15 2         15 my $re = delete $args{regexp};
16 2         7 my $field = delete $args{field};
17              
18 2         46 my $self = $package->SUPER::new(%args);
19              
20 2         529 $re{$self} = $re;
21 2         12 $field{$self} = $field;
22              
23             # get the literal prefix of the regexp, if any.
24 2 100 50     80 if($re{$self} =~
  1         21  
25             m<^
26             (?: # prefix for qr//'s, without allowing /i :
27             \(\? ([a-hj-z]*) (?:-[a-z]*)?:
28             )?
29             (\\[GA]|\^) # anchor
30             ([^#\$()*+.?[\]\\^]+) # literal pat (no metachars or comments)
31             >x
32             ) {{
33 1         2 my ($mod,$anchor,$prefix) = ($1||'',$2,$3);
34 1 50 33     8 $anchor eq '^' and $mod =~ /m/ and last;
35 1         28 for($prefix) {
36 1 50       5 $mod =~ /x/ and s/\s+//g;
37             }
38 1         6 $prefix{$self} = $prefix;
39             }}
40              
41 2         10 $self;
42             }
43              
44             #sub extract_terms {
45             # my $self = shift;
46             # return @{ $self->{terms} };
47             #}
48              
49             sub make_compiler {
50 2     2 1 108 return KSx::Search::RegexpTermCompiler->new(
51             parent => @_
52             );
53             }
54              
55              
56             package KSx::Search::RegexpTermCompiler;
57 2     2   828 use base qw( KinoSearch::Search::Compiler );
  2         5  
  2         2454  
58              
59 2     2   345 use Hash::Util::FieldHash::Compat 'fieldhashes';
  2         5  
  2         19  
60              
61             fieldhashes \my ( %idf, %raw_impact, #%plists,
62             %terms,
63             %query_norm_factor, % normalized_impact, %tfs );
64             sub new {
65 2     2   10 my($pack, %args) = @_;
66              
67 2         6 my $searcher = $args{searchable};
68 2         556 my $reader = $searcher->get_reader;
69 0           my $lex_reader = $reader->fetch("KinoSearch::Index::LexiconReader");
70 0           my $post_reader = $reader->fetch("KinoSearch::Index::PostingsReader");
71              
72             # Retrieve the correct Similarity for the Query's field.
73 0           my $sim = $args{similarity} =
74             $searcher->get_schema->fetch_sim($field{$args{parent}});
75              
76 0           my $self = $pack->SUPER::new(%args);
77              
78 0           my $parent = $args{parent};
79              
80             # Get a lexicon and find our place therein
81 0           my( $re, $prefix ) = ($re{$parent}, $prefix{$parent});
82 0 0         ref $re eq 'Regexp' or $re = qr/$re/; # avoid repetitive recompilation
83 0           my $lexcn = $lex_reader->lexicon( field => $field{$parent} );
84 0 0         $lexcn->seek(defined $prefix ? $prefix : '');
85            
86             # iterate through it, stopping at terms that match
87 0           my @terms; #my @plists;
88             my %hits; # The keys are the doc nums; the values the tfs.
89              
90 0           while () {
91 0           my $term = get_term $lexcn;
92            
93             # sift out unwanted terms
94 0 0 0       last if defined $prefix and index( $term, $prefix ) != 0;
95 0 0         next unless $term =~ $re;
96              
97             # for terms that match...
98              
99 0           push @terms, $term;
100              
101             # We have to iterate through the documents in each posting list,
102             # recording the doc numbers, so we can calc the doc freq later on.
103             # E.g., if there are two documents, one containing ‘dog’ and ‘dot,’
104             # and the other containing just ‘dog,’ and the re is /^do.*/, then
105             # the doc freq has to be 2, since the re matches two docs. The doc
106             # freqs of the individual terms are 1 and 2, so we can’t add or
107             # average them.
108 0           my $plist = $post_reader->posting_list(
109             term => $term,
110             field => $field{$parent},
111             );
112 0           my $posting; my $weight;
113 0           while (my $doc_num = $plist ->next) {
114             # For efficiency’s sake, we’ll collect the results now, to
115             # avoid iterating through postings (the slowest part of search-
116             # ing) more than once, even though this code probably belongs
117             # in RegexpTermScorer
118 0   0       my $posting ||= $plist->get_posting;
119 0   0       $hits{$doc_num} +=
120             $weight ||= $posting->get_freq * $posting->get_weight
121             }
122              
123             } continue {
124 0 0         last unless $lexcn->next ;
125             }
126 0           my $doc_freq = scalar keys %hits;
127              
128             # Save the hits and terms for later
129             # $plists{$self} = \@plists;
130 0           $tfs{$self} = \%hits;
131 0           $terms{$self} = \@terms;
132              
133             # Calculate and store the IDF
134 0           my $max_doc = $searcher->doc_max;
135 0 0         my $idf = $idf{$self} = $max_doc
136             ? 1 + log( $max_doc / ( 1 + $doc_freq ) )
137             : 1
138             ;
139              
140 0           $raw_impact{$self} = $idf * $parent->get_boost;
141              
142             # make final preparations
143 0           $self->perform_query_normalization($searcher);
144              
145 0           $self;
146             }
147              
148             sub perform_query_normalization {
149             # copied from KinoSearch::Search::Weight originally
150 0     0     my ( $self, $searcher ) = @_;
151 0           my $sim = $self->get_similarity;
152              
153 0           my $factor = $self->sum_of_squared_weights; # factor = ( tf_q * idf_t )
154 0           $factor = $sim->query_norm($factor); # factor /= norm_q
155 0           $self->normalize($factor); # impact *= factor
156             }
157              
158 0     0     sub get_value { shift->get_parent->get_boost }
159              
160 0     0     sub sum_of_squared_weights { $raw_impact{+shift}**2 }
161              
162             sub normalize { # copied from TermQuery
163 0     0     my ( $self, $query_norm_factor ) = @_;
164 0           $query_norm_factor{$self} = $query_norm_factor;
165              
166             # Multiply raw impact by ( tf_q * idf_q / norm_q )
167             #
168             # Note: factoring in IDF a second time is correct. See formula.
169 0           $normalized_impact{$self}
170             = $raw_impact{$self} * $idf{$self} * $query_norm_factor;
171             }
172              
173             sub make_matcher {
174 0     0     my $self = shift;
175              
176 0           return KSx::Search::RegexpTermScorer->new(
177             # posting_lists => $plists{$self},
178             @_,
179             compiler => $self,
180             );
181             }
182              
183             sub highlight_spans { # plagiarised form of TermWeight’s routine
184 0     0     my ($self, %args) = @_;
185 0           my $doc_vector = $args{doc_vec};
186 0           my $field_name = $args{field};
187 0 0         return if $field{$self->get_parent} ne $field_name;
188 0           my $searcher = $args{searcher};
189 0           my $terms = $terms{$self};
190              
191 0           require KinoSearch::Search::Span;
192              
193 0           my @posits;
194 0           my $weight_val = $self->get_value;
195 0           for (@$terms) {
196 0           my $term_vector
197             = $doc_vector->term_vector( field => $field_name, term => $_ );
198 0 0         next unless defined $term_vector;
199 0           my $starts = $term_vector->get_start_offsets->to_arrayref;
200 0           my $ends = $term_vector->get_end_offsets->to_arrayref;
201 0           while (@$starts) {
202 0           my $start = shift @$starts;
203 0           push @posits, KinoSearch::Search::Span->new(
204             offset => $start,
205             length => shift(@$ends)-$start,
206             weight => $weight_val,
207             );
208             }
209             }
210              
211 0           return \@posits;
212             }
213              
214              
215             package KSx::Search::RegexpTermScorer;
216 2     2   2406 use base 'KinoSearch::Search::Matcher';
  2         5  
  2         2345  
217              
218 2     2   439 use Hash::Util::FieldHash::Compat 'fieldhashes';
  2         5  
  2         20  
219             fieldhashes\my( %doc_nums, %pos, %wv, %sim, %compiler );
220              
221             sub new {
222 0     0     my ($class, %args) = @_;
223             # my $plists = delete $args{posting_lists};
224 0           my $compiler = delete $args{compiler};
225 0           my $reader = delete $args{reader};
226 0           my $need_score = delete $args{need_score};
227 0           my $self = $class->SUPER::new(%args);
228 0           $sim{$self} = $compiler->get_similarity;
229              
230 0           my $tfs = $tfs{$compiler};
231 0           $doc_nums{$self} = [ sort { $a <=> $b } keys %$tfs ];
  0            
232            
233 0           $pos{$self} = -1;
234 0           $wv {$self} = $compiler->get_value;
235 0           $compiler{$self} = $compiler;
236            
237 0           $self
238             }
239              
240             sub next {
241 0     0     my $self = shift;
242 0           my $doc_nums = $doc_nums{$self};
243 0 0         return 0 if $pos{$self} >= $#$doc_nums;
244 0           return $$doc_nums[ ++$pos{$self} ];
245             }
246              
247             sub get_doc_num {
248 0     0     my $self = shift;
249 0           my $pos = $pos{$self};
250 0           my $doc_nums = $doc_nums{$self};
251 0 0         return $pos < scalar @$doc_nums ? $$doc_nums[$pos] : 0;
252             }
253              
254             sub score {
255 0     0     my $self = shift;
256 0           my $pos = $pos{$self};
257 0           my $doc_nums = $doc_nums{$self};
258 0           return $wv{$self} * $sim{$self}->tf(
259             $tfs{$compiler{$self}}{$$doc_nums[$pos]}
260             );
261             }
262              
263              
264             1;
265              
266             __END__