line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
46
|
|
|
46
|
|
2850918
|
use strict; |
|
46
|
|
|
|
|
115
|
|
|
46
|
|
|
|
|
1754
|
|
2
|
46
|
|
|
46
|
|
398
|
use warnings; |
|
46
|
|
|
|
|
165
|
|
|
46
|
|
|
|
|
2229
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package KinoSearch::Test::TestUtils; |
5
|
46
|
|
|
46
|
|
397
|
use base qw( Exporter ); |
|
46
|
|
|
|
|
101
|
|
|
46
|
|
|
|
|
7355
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
8
|
|
|
|
|
|
|
working_dir |
9
|
|
|
|
|
|
|
create_working_dir |
10
|
|
|
|
|
|
|
remove_working_dir |
11
|
|
|
|
|
|
|
create_index |
12
|
|
|
|
|
|
|
create_uscon_index |
13
|
|
|
|
|
|
|
test_index_loc |
14
|
|
|
|
|
|
|
persistent_test_index_loc |
15
|
|
|
|
|
|
|
init_test_index_loc |
16
|
|
|
|
|
|
|
get_uscon_docs |
17
|
|
|
|
|
|
|
utf8_test_strings |
18
|
|
|
|
|
|
|
test_analyzer |
19
|
|
|
|
|
|
|
doc_ids_from_td_coll |
20
|
|
|
|
|
|
|
modulo_set |
21
|
|
|
|
|
|
|
); |
22
|
|
|
|
|
|
|
|
23
|
46
|
|
|
46
|
|
26578
|
use KinoSearch; |
|
46
|
|
|
|
|
132
|
|
|
46
|
|
|
|
|
2382
|
|
24
|
46
|
|
|
46
|
|
23923
|
use KinoSearch::Test; |
|
46
|
|
|
|
|
114
|
|
|
46
|
|
|
|
|
1442
|
|
25
|
|
|
|
|
|
|
|
26
|
46
|
|
|
46
|
|
418
|
use lib 'sample'; |
|
46
|
|
|
|
|
97
|
|
|
46
|
|
|
|
|
473
|
|
27
|
46
|
|
|
46
|
|
25341
|
use KinoSearch::Test::USConSchema; |
|
46
|
|
|
|
|
131
|
|
|
46
|
|
|
|
|
2023
|
|
28
|
|
|
|
|
|
|
|
29
|
46
|
|
|
46
|
|
289
|
use File::Spec::Functions qw( catdir catfile curdir ); |
|
46
|
|
|
|
|
95
|
|
|
46
|
|
|
|
|
3510
|
|
30
|
46
|
|
|
46
|
|
57702
|
use Encode qw( _utf8_off ); |
|
46
|
|
|
|
|
646193
|
|
|
46
|
|
|
|
|
4765
|
|
31
|
46
|
|
|
46
|
|
429
|
use File::Path qw( rmtree ); |
|
46
|
|
|
|
|
87
|
|
|
46
|
|
|
|
|
2822
|
|
32
|
46
|
|
|
46
|
|
335
|
use Carp; |
|
46
|
|
|
|
|
97
|
|
|
46
|
|
|
|
|
90060
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
my $working_dir = catfile( curdir(), 'kinosearch_test' ); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Return a directory within the system's temp directory where we will put all |
37
|
|
|
|
|
|
|
# testing scratch files. |
38
|
3
|
|
|
3
|
0
|
84
|
sub working_dir {$working_dir} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub create_working_dir { |
41
|
1
|
50
|
|
1
|
0
|
132
|
mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!"; |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# Verify that this user owns the working dir, then zap it. Returns true upon |
45
|
|
|
|
|
|
|
# success. |
46
|
|
|
|
|
|
|
sub remove_working_dir { |
47
|
2
|
100
|
|
2
|
0
|
76
|
return unless -d $working_dir; |
48
|
1
|
|
|
|
|
23028
|
rmtree $working_dir; |
49
|
1
|
|
|
|
|
10
|
return 1; |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Return a location for a test index to be used by a single test file. If |
53
|
|
|
|
|
|
|
# the test file crashes it cannot clean up after itself, so we put the cleanup |
54
|
|
|
|
|
|
|
# routine in a single test file to be run at or near the end of the test |
55
|
|
|
|
|
|
|
# suite. |
56
|
|
|
|
|
|
|
sub test_index_loc { |
57
|
9
|
|
|
9
|
0
|
97
|
return catdir( $working_dir, 'test_index' ); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Return a location for a test index intended to be shared by multiple test |
61
|
|
|
|
|
|
|
# files. It will be cleaned as above. |
62
|
|
|
|
|
|
|
sub persistent_test_index_loc { |
63
|
3
|
|
|
3
|
0
|
195
|
return catdir( $working_dir, 'persistent_test_index' ); |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Destroy anything left over in the test_index location, then create the |
67
|
|
|
|
|
|
|
# directory. Finally, return the path. |
68
|
|
|
|
|
|
|
sub init_test_index_loc { |
69
|
9
|
|
|
9
|
0
|
85
|
my $dir = test_index_loc(); |
70
|
9
|
|
|
|
|
10376
|
rmtree $dir; |
71
|
9
|
50
|
|
|
|
175
|
die "Can't clean up '$dir'" if -e $dir; |
72
|
9
|
50
|
|
|
|
839
|
mkdir $dir or die "Can't mkdir '$dir': $!"; |
73
|
9
|
|
|
|
|
51
|
return $dir; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Build a RAM index, using the supplied array of strings as source material. |
77
|
|
|
|
|
|
|
# The index will have a single field: "content". |
78
|
|
|
|
|
|
|
sub create_index { |
79
|
27
|
|
|
27
|
0
|
240886
|
my $folder = KinoSearch::Store::RAMFolder->new; |
80
|
27
|
|
|
|
|
1157
|
my $indexer = KinoSearch::Index::Indexer->new( |
81
|
|
|
|
|
|
|
index => $folder, |
82
|
|
|
|
|
|
|
schema => KinoSearch::Test::TestSchema->new, |
83
|
|
|
|
|
|
|
); |
84
|
27
|
|
|
|
|
53959
|
$indexer->add_doc( { content => $_ } ) for @_; |
85
|
27
|
|
|
|
|
231606
|
$indexer->commit; |
86
|
27
|
|
|
|
|
2818
|
return $folder; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Slurp us constitition docs and build hashrefs. |
90
|
|
|
|
|
|
|
sub get_uscon_docs { |
91
|
|
|
|
|
|
|
|
92
|
2
|
|
|
2
|
0
|
26
|
my $uscon_dir = catdir( 'sample', 'us_constitution' ); |
93
|
2
|
50
|
|
|
|
1282
|
opendir( my $uscon_dh, $uscon_dir ) |
94
|
|
|
|
|
|
|
or die "couldn't opendir '$uscon_dir': $!"; |
95
|
2
|
|
|
|
|
340
|
my @filenames = grep {/\.txt$/} sort readdir $uscon_dh; |
|
112
|
|
|
|
|
262
|
|
96
|
2
|
50
|
|
|
|
63
|
closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!"; |
97
|
|
|
|
|
|
|
|
98
|
2
|
|
|
|
|
6
|
my %docs; |
99
|
|
|
|
|
|
|
|
100
|
2
|
|
|
|
|
9
|
for my $filename (@filenames) { |
101
|
104
|
|
|
|
|
671
|
my $filepath = catfile( $uscon_dir, $filename ); |
102
|
104
|
50
|
|
|
|
5216
|
open( my $fh, '<', $filepath ) |
103
|
|
|
|
|
|
|
or die "couldn't open file '$filepath': $!"; |
104
|
104
|
|
|
|
|
137
|
my $content = do { local $/; <$fh> }; |
|
104
|
|
|
|
|
386
|
|
|
104
|
|
|
|
|
53727
|
|
105
|
104
|
50
|
|
|
|
690
|
$content =~ /(.*?)\n\n(.*)/s |
106
|
|
|
|
|
|
|
or die "Can't extract title/bodytext from '$filepath'"; |
107
|
104
|
|
|
|
|
324
|
my $title = $1; |
108
|
104
|
|
|
|
|
292
|
my $bodytext = $2; |
109
|
104
|
|
|
|
|
9502
|
$bodytext =~ s/\s+/ /sg; |
110
|
104
|
50
|
|
|
|
636
|
my $category |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
111
|
|
|
|
|
|
|
= $filename =~ /art/ ? 'article' |
112
|
|
|
|
|
|
|
: $filename =~ /amend/ ? 'amendment' |
113
|
|
|
|
|
|
|
: $filename =~ /preamble/ ? 'preamble' |
114
|
|
|
|
|
|
|
: confess "Can't derive category for $filename"; |
115
|
|
|
|
|
|
|
|
116
|
104
|
|
|
|
|
2698
|
$docs{$filename} = { |
117
|
|
|
|
|
|
|
title => $title, |
118
|
|
|
|
|
|
|
bodytext => $bodytext, |
119
|
|
|
|
|
|
|
url => "/us_constitution/$filename", |
120
|
|
|
|
|
|
|
category => $category, |
121
|
|
|
|
|
|
|
}; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
2
|
|
|
|
|
34
|
return \%docs; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub create_uscon_index { |
128
|
1
|
|
|
1
|
0
|
5
|
my $folder = KinoSearch::Store::FSFolder->new( |
129
|
|
|
|
|
|
|
path => persistent_test_index_loc() ); |
130
|
1
|
|
|
|
|
80
|
my $schema = KinoSearch::Test::USConSchema->new; |
131
|
1
|
|
|
|
|
13
|
my $indexer = KinoSearch::Index::Indexer->new( |
132
|
|
|
|
|
|
|
schema => $schema, |
133
|
|
|
|
|
|
|
index => $folder, |
134
|
|
|
|
|
|
|
truncate => 1, |
135
|
|
|
|
|
|
|
create => 1, |
136
|
|
|
|
|
|
|
); |
137
|
|
|
|
|
|
|
|
138
|
1
|
|
|
|
|
104
|
$indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 ); |
139
|
1
|
|
|
|
|
250320
|
$indexer->commit; |
140
|
1
|
|
|
|
|
4
|
undef $indexer; |
141
|
|
|
|
|
|
|
|
142
|
1
|
|
|
|
|
117
|
$indexer = KinoSearch::Index::Indexer->new( |
143
|
|
|
|
|
|
|
schema => $schema, |
144
|
|
|
|
|
|
|
index => $folder, |
145
|
|
|
|
|
|
|
); |
146
|
1
|
|
|
|
|
7
|
my $source_docs = get_uscon_docs(); |
147
|
|
|
|
|
|
|
$indexer->add_doc( { content => $_->{bodytext} } ) |
148
|
1
|
|
|
|
|
687
|
for values %$source_docs; |
149
|
1
|
|
|
|
|
28842
|
$indexer->commit; |
150
|
1
|
|
|
|
|
4
|
undef $indexer; |
151
|
|
|
|
|
|
|
|
152
|
1
|
|
|
|
|
233
|
$indexer = KinoSearch::Index::Indexer->new( |
153
|
|
|
|
|
|
|
schema => $schema, |
154
|
|
|
|
|
|
|
index => $folder, |
155
|
|
|
|
|
|
|
); |
156
|
1
|
|
|
|
|
11
|
my @chars = ( 'a' .. 'z' ); |
157
|
1
|
|
|
|
|
6
|
for ( 0 .. 1000 ) { |
158
|
1001
|
|
|
|
|
1898
|
my $content = ''; |
159
|
1001
|
|
|
|
|
2043
|
for my $num_words ( 1 .. int( rand(20) ) ) { |
160
|
9570
|
|
|
|
|
15882
|
for ( 1 .. ( int( rand(10) ) + 10 ) ) { |
161
|
139035
|
|
|
|
|
227414
|
$content .= @chars[ rand(@chars) ]; |
162
|
|
|
|
|
|
|
} |
163
|
9570
|
|
|
|
|
14177
|
$content .= ' '; |
164
|
|
|
|
|
|
|
} |
165
|
1001
|
|
|
|
|
73037
|
$indexer->add_doc( { content => $content } ); |
166
|
|
|
|
|
|
|
} |
167
|
1
|
|
|
|
|
9
|
$indexer->optimize; |
168
|
1
|
|
|
|
|
98
|
$indexer->commit; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# Return 3 strings useful for verifying UTF-8 integrity. |
172
|
|
|
|
|
|
|
sub utf8_test_strings { |
173
|
5
|
|
|
5
|
0
|
136802
|
my $smiley = "\x{263a}"; |
174
|
5
|
|
|
|
|
14
|
my $not_a_smiley = $smiley; |
175
|
5
|
|
|
|
|
70
|
_utf8_off($not_a_smiley); |
176
|
5
|
|
|
|
|
13
|
my $frowny = $not_a_smiley; |
177
|
5
|
|
|
|
|
25
|
utf8::upgrade($frowny); |
178
|
5
|
|
|
|
|
22
|
return ( $smiley, $not_a_smiley, $frowny ); |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Verify an Analyzer's transform, transform_text, and split methods. |
182
|
|
|
|
|
|
|
sub test_analyzer { |
183
|
9
|
|
|
9
|
0
|
2733
|
my ( $analyzer, $source, $expected, $message ) = @_; |
184
|
|
|
|
|
|
|
|
185
|
9
|
|
|
|
|
212
|
my $inversion = KinoSearch::Analysis::Inversion->new( text => $source ); |
186
|
9
|
|
|
|
|
235
|
$inversion = $analyzer->transform($inversion); |
187
|
9
|
|
|
|
|
48
|
my @got; |
188
|
9
|
|
|
|
|
168
|
while ( my $token = $inversion->next ) { |
189
|
13
|
|
|
|
|
114
|
push @got, $token->get_text; |
190
|
|
|
|
|
|
|
} |
191
|
9
|
|
|
|
|
76
|
Test::More::is_deeply( \@got, $expected, "analyze: $message" ); |
192
|
|
|
|
|
|
|
|
193
|
9
|
|
|
|
|
11897
|
$inversion = $analyzer->transform_text($source); |
194
|
9
|
|
|
|
|
127
|
@got = (); |
195
|
9
|
|
|
|
|
90
|
while ( my $token = $inversion->next ) { |
196
|
13
|
|
|
|
|
96
|
push @got, $token->get_text; |
197
|
|
|
|
|
|
|
} |
198
|
9
|
|
|
|
|
56
|
Test::More::is_deeply( \@got, $expected, "transform_text: $message" ); |
199
|
|
|
|
|
|
|
|
200
|
9
|
|
|
|
|
11672
|
@got = @{ $analyzer->split($source) }; |
|
9
|
|
|
|
|
297
|
|
201
|
9
|
|
|
|
|
72
|
Test::More::is_deeply( \@got, $expected, "split: $message" ); |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# Extract all doc nums from a SortCollector. Return two sorted array refs: |
205
|
|
|
|
|
|
|
# by_score and by_id. |
206
|
|
|
|
|
|
|
sub doc_ids_from_td_coll { |
207
|
1223
|
|
|
1223
|
0
|
498958
|
my $collector = shift; |
208
|
1223
|
|
|
|
|
2080
|
my @by_score; |
209
|
1223
|
|
|
|
|
29624
|
my $match_docs = $collector->pop_match_docs; |
210
|
17616
|
50
|
|
|
|
54919
|
my @by_score_then_id = map { $_->get_doc_id } |
|
19040
|
|
|
|
|
127458
|
|
211
|
|
|
|
|
|
|
sort { |
212
|
1223
|
|
|
|
|
5595
|
$b->get_score <=> $a->get_score |
213
|
|
|
|
|
|
|
|| $a->get_doc_id <=> $b->get_doc_id |
214
|
|
|
|
|
|
|
} @$match_docs; |
215
|
1223
|
|
|
|
|
3643
|
my @by_id = sort { $a <=> $b } @by_score_then_id; |
|
37343
|
|
|
|
|
41009
|
|
216
|
1223
|
|
|
|
|
21753
|
return ( \@by_score_then_id, \@by_id ); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Use a modulus to generate a set of numbers. |
220
|
|
|
|
|
|
|
sub modulo_set { |
221
|
3229
|
|
|
3229
|
0
|
2401714
|
my ( $interval, $max ) = @_; |
222
|
3229
|
|
|
|
|
4835
|
my @out; |
223
|
3229
|
|
|
|
|
10929
|
for ( my $doc = $interval; $doc < $max; $doc += $interval ) { |
224
|
45031
|
|
|
|
|
106110
|
push @out, $doc; |
225
|
|
|
|
|
|
|
} |
226
|
3229
|
|
|
|
|
11216
|
return \@out; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
1; |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
__END__ |