line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
23
|
|
|
23
|
|
201160
|
use strict; |
|
23
|
|
|
|
|
61
|
|
|
23
|
|
|
|
|
929
|
|
2
|
23
|
|
|
23
|
|
129
|
use warnings; |
|
23
|
|
|
|
|
54
|
|
|
23
|
|
|
|
|
1060
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package KinoSearch1::Test::TestUtils; |
5
|
23
|
|
|
23
|
|
138
|
use base qw( Exporter ); |
|
23
|
|
|
|
|
48
|
|
|
23
|
|
|
|
|
2990
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
8
|
|
|
|
|
|
|
working_dir |
9
|
|
|
|
|
|
|
create_working_dir |
10
|
|
|
|
|
|
|
remove_working_dir |
11
|
|
|
|
|
|
|
create_index |
12
|
|
|
|
|
|
|
create_persistent_test_index |
13
|
|
|
|
|
|
|
test_index_loc |
14
|
|
|
|
|
|
|
persistent_test_index_loc |
15
|
|
|
|
|
|
|
init_test_index_loc |
16
|
|
|
|
|
|
|
get_uscon_docs |
17
|
|
|
|
|
|
|
utf8_test_strings |
18
|
|
|
|
|
|
|
test_analyzer |
19
|
|
|
|
|
|
|
); |
20
|
|
|
|
|
|
|
|
21
|
23
|
|
|
23
|
|
14087
|
use KinoSearch1::InvIndexer; |
|
23
|
|
|
|
|
82
|
|
|
23
|
|
|
|
|
809
|
|
22
|
23
|
|
|
23
|
|
24012
|
use KinoSearch1::Store::RAMInvIndex; |
|
23
|
|
|
|
|
79
|
|
|
23
|
|
|
|
|
676
|
|
23
|
23
|
|
|
23
|
|
150
|
use KinoSearch1::Store::FSInvIndex; |
|
23
|
|
|
|
|
49
|
|
|
23
|
|
|
|
|
504
|
|
24
|
23
|
|
|
23
|
|
9152
|
use KinoSearch1::Analysis::Tokenizer; |
|
23
|
|
|
|
|
59
|
|
|
23
|
|
|
|
|
572
|
|
25
|
23
|
|
|
23
|
|
133
|
use KinoSearch1::Analysis::TokenBatch; |
|
23
|
|
|
|
|
48
|
|
|
23
|
|
|
|
|
444
|
|
26
|
23
|
|
|
23
|
|
14633
|
use KinoSearch1::Analysis::PolyAnalyzer; |
|
23
|
|
|
|
|
83
|
|
|
23
|
|
|
|
|
926
|
|
27
|
|
|
|
|
|
|
|
28
|
23
|
|
|
23
|
|
217
|
use File::Spec::Functions qw( catdir catfile curdir ); |
|
23
|
|
|
|
|
52
|
|
|
23
|
|
|
|
|
1598
|
|
29
|
23
|
|
|
23
|
|
23497
|
use Encode qw( _utf8_off ); |
|
23
|
|
|
|
|
289406
|
|
|
23
|
|
|
|
|
2431
|
|
30
|
23
|
|
|
23
|
|
222
|
use File::Path qw( rmtree ); |
|
23
|
|
|
|
|
50
|
|
|
23
|
|
|
|
|
1437
|
|
31
|
23
|
|
|
23
|
|
139
|
use Carp; |
|
23
|
|
|
|
|
56
|
|
|
23
|
|
|
|
|
33436
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my $working_dir = catfile( curdir(), 'kinosearch_test' ); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Return a directory within the system's temp directory where we will put all |
36
|
|
|
|
|
|
|
# testing scratch files. |
37
|
3
|
|
|
3
|
0
|
55
|
sub working_dir {$working_dir} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub create_working_dir { |
40
|
1
|
50
|
|
1
|
0
|
139
|
mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!"; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Verify that this user owns the working dir, then zap it. Returns true upon |
44
|
|
|
|
|
|
|
# success. |
45
|
|
|
|
|
|
|
sub remove_working_dir { |
46
|
2
|
100
|
|
2
|
0
|
82
|
return unless -d $working_dir; |
47
|
1
|
|
|
|
|
1205
|
rmtree $working_dir; |
48
|
1
|
|
|
|
|
4
|
return 1; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Return a location for a test index to be used by a single test file. If |
52
|
|
|
|
|
|
|
# the test file crashes it cannot clean up after itself, so we put the cleanup |
53
|
|
|
|
|
|
|
# routine in a single test file to be run at or near the end of the test |
54
|
|
|
|
|
|
|
# suite. |
55
|
|
|
|
|
|
|
sub test_index_loc { |
56
|
2
|
|
|
2
|
0
|
27
|
return catdir( $working_dir, 'test_index' ); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Return a location for a test index intended to be shared by multiple test |
60
|
|
|
|
|
|
|
# files. It will be cleaned as above. |
61
|
|
|
|
|
|
|
sub persistent_test_index_loc { |
62
|
5
|
|
|
5
|
0
|
79
|
return catdir( $working_dir, 'persistent_test_index' ); |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Destroy anything left over in the test_index location, then create the |
66
|
|
|
|
|
|
|
# directory. Finally, return the path. |
67
|
|
|
|
|
|
|
sub init_test_index_loc { |
68
|
1
|
|
|
1
|
0
|
10
|
my $dir = test_index_loc(); |
69
|
1
|
|
|
|
|
220
|
rmtree $dir; |
70
|
1
|
50
|
|
|
|
13
|
die "Can't clean up '$dir'" if -e $dir; |
71
|
1
|
50
|
|
|
|
100
|
mkdir $dir or die "Can't mkdir '$dir': $!"; |
72
|
1
|
|
|
|
|
4
|
return $dir; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Build a RAM index, using the supplied array of strings as source material. |
76
|
|
|
|
|
|
|
# The index will have a single field: "content". |
77
|
|
|
|
|
|
|
sub create_index { |
78
|
20
|
|
|
20
|
0
|
83093
|
my @docs = @_; |
79
|
|
|
|
|
|
|
|
80
|
20
|
|
|
|
|
442
|
my $tokenizer = KinoSearch1::Analysis::Tokenizer->new; |
81
|
20
|
|
|
|
|
319
|
my $invindex = KinoSearch1::Store::RAMInvIndex->new; |
82
|
20
|
|
|
|
|
242
|
my $invindexer = KinoSearch1::InvIndexer->new( |
83
|
|
|
|
|
|
|
invindex => $invindex, |
84
|
|
|
|
|
|
|
analyzer => $tokenizer, |
85
|
|
|
|
|
|
|
create => 1, |
86
|
|
|
|
|
|
|
); |
87
|
|
|
|
|
|
|
|
88
|
20
|
|
|
|
|
128
|
$invindexer->spec_field( name => 'content' ); |
89
|
|
|
|
|
|
|
|
90
|
20
|
|
|
|
|
72
|
for (@docs) { |
91
|
2530
|
|
|
|
|
7440
|
my $doc = $invindexer->new_doc; |
92
|
2530
|
|
|
|
|
83020
|
$doc->set_value( content => $_ ); |
93
|
2530
|
|
|
|
|
6454
|
$invindexer->add_doc($doc); |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
20
|
|
|
|
|
126
|
$invindexer->finish; |
97
|
|
|
|
|
|
|
|
98
|
20
|
|
|
|
|
112
|
return $invindex; |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Slurp us constitition docs and build hashrefs. |
102
|
|
|
|
|
|
|
sub get_uscon_docs { |
103
|
|
|
|
|
|
|
|
104
|
1
|
|
|
1
|
0
|
7
|
my $uscon_dir = catdir( 't', 'us_constitution' ); |
105
|
1
|
50
|
|
|
|
33
|
opendir( my $uscon_dh, $uscon_dir ) |
106
|
|
|
|
|
|
|
or die "couldn't opendir '$uscon_dir': $!"; |
107
|
1
|
|
|
|
|
137
|
my @filenames = grep {/\.html$/} sort readdir $uscon_dh; |
|
56
|
|
|
|
|
144
|
|
108
|
1
|
50
|
|
|
|
25
|
closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!"; |
109
|
|
|
|
|
|
|
|
110
|
1
|
|
|
|
|
3
|
my %docs; |
111
|
|
|
|
|
|
|
|
112
|
1
|
|
|
|
|
3
|
for my $filename (@filenames) { |
113
|
53
|
100
|
|
|
|
123
|
next if $filename eq 'index.html'; |
114
|
52
|
|
|
|
|
233
|
my $filepath = catfile( $uscon_dir, $filename ); |
115
|
52
|
50
|
|
|
|
1828
|
open( my $fh, '<', $filepath ) |
116
|
|
|
|
|
|
|
or die "couldn't open file '$filepath': $!"; |
117
|
52
|
|
|
|
|
61
|
my $content = do { local $/; <$fh> }; |
|
52
|
|
|
|
|
159
|
|
|
52
|
|
|
|
|
1029
|
|
118
|
52
|
50
|
|
|
|
303
|
$content =~ m#(.*?)#s |
119
|
|
|
|
|
|
|
or die "couldn't isolate title in '$filepath'"; |
120
|
52
|
|
|
|
|
110
|
my $title = $1; |
121
|
52
|
50
|
|
|
|
367
|
$content =~ m# (.*?) #s |
122
|
|
|
|
|
|
|
or die "couldn't isolate bodytext in '$filepath'"; |
123
|
52
|
|
|
|
|
145
|
my $bodytext = $1; |
124
|
52
|
|
|
|
|
730
|
$bodytext =~ s/<.*?>//sg; |
125
|
52
|
|
|
|
|
9788
|
$bodytext =~ s/\s+/ /sg; |
126
|
|
|
|
|
|
|
|
127
|
52
|
|
|
|
|
928
|
$docs{$filename} = { |
128
|
|
|
|
|
|
|
title => $title, |
129
|
|
|
|
|
|
|
bodytext => $bodytext, |
130
|
|
|
|
|
|
|
url => "/us_constitution/$filename", |
131
|
|
|
|
|
|
|
}; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
1
|
|
|
|
|
15
|
return \%docs; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub create_persistent_test_index { |
138
|
1
|
|
|
1
|
0
|
2
|
my $invindexer; |
139
|
1
|
|
|
|
|
21
|
my $polyanalyzer |
140
|
|
|
|
|
|
|
= KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' ); |
141
|
|
|
|
|
|
|
|
142
|
1
|
|
|
|
|
7
|
$invindexer = KinoSearch1::InvIndexer->new( |
143
|
|
|
|
|
|
|
invindex => persistent_test_index_loc(), |
144
|
|
|
|
|
|
|
create => 1, |
145
|
|
|
|
|
|
|
analyzer => $polyanalyzer, |
146
|
|
|
|
|
|
|
); |
147
|
1
|
|
|
|
|
7
|
$invindexer->spec_field( name => 'content' ); |
148
|
1
|
|
|
|
|
2
|
for ( 0 .. 10000 ) { |
149
|
10001
|
|
|
|
|
30026
|
my $doc = $invindexer->new_doc; |
150
|
10001
|
|
|
|
|
280878
|
$doc->set_value( content => "zz$_" ); |
151
|
10001
|
|
|
|
|
32522
|
$invindexer->add_doc($doc); |
152
|
|
|
|
|
|
|
} |
153
|
1
|
|
|
|
|
9
|
$invindexer->finish; |
154
|
1
|
|
|
|
|
2
|
undef $invindexer; |
155
|
|
|
|
|
|
|
|
156
|
1
|
|
|
|
|
16
|
$invindexer = KinoSearch1::InvIndexer->new( |
157
|
|
|
|
|
|
|
invindex => persistent_test_index_loc(), |
158
|
|
|
|
|
|
|
analyzer => $polyanalyzer, |
159
|
|
|
|
|
|
|
); |
160
|
1
|
|
|
|
|
8
|
$invindexer->spec_field( name => 'content' ); |
161
|
1
|
|
|
|
|
5
|
my $source_docs = get_uscon_docs(); |
162
|
1
|
|
|
|
|
10
|
for ( values %$source_docs ) { |
163
|
52
|
|
|
|
|
180
|
my $doc = $invindexer->new_doc; |
164
|
52
|
|
|
|
|
1892
|
$doc->set_value( content => $_->{bodytext} ); |
165
|
52
|
|
|
|
|
166
|
$invindexer->add_doc($doc); |
166
|
|
|
|
|
|
|
} |
167
|
1
|
|
|
|
|
8
|
$invindexer->finish; |
168
|
1
|
|
|
|
|
3
|
undef $invindexer; |
169
|
|
|
|
|
|
|
|
170
|
1
|
|
|
|
|
6
|
$invindexer = KinoSearch1::InvIndexer->new( |
171
|
|
|
|
|
|
|
invindex => persistent_test_index_loc(), |
172
|
|
|
|
|
|
|
analyzer => $polyanalyzer, |
173
|
|
|
|
|
|
|
); |
174
|
1
|
|
|
|
|
7
|
$invindexer->spec_field( name => 'content' ); |
175
|
1
|
|
|
|
|
12
|
my @chars = ( 'a' .. 'z' ); |
176
|
1
|
|
|
|
|
5
|
for ( 0 .. 1000 ) { |
177
|
1001
|
|
|
|
|
1582
|
my $content = ''; |
178
|
1001
|
|
|
|
|
2528
|
for my $num_words ( 1 .. int( rand(20) ) ) { |
179
|
9486
|
|
|
|
|
15834
|
for ( 1 .. ( int( rand(10) ) + 10 ) ) { |
180
|
137431
|
|
|
|
|
223317
|
$content .= @chars[ rand(@chars) ]; |
181
|
|
|
|
|
|
|
} |
182
|
9486
|
|
|
|
|
14962
|
$content .= ' '; |
183
|
|
|
|
|
|
|
} |
184
|
1001
|
|
|
|
|
3543
|
my $doc = $invindexer->new_doc; |
185
|
1001
|
|
|
|
|
32725
|
$doc->set_value( content => $content ); |
186
|
1001
|
|
|
|
|
3042
|
$invindexer->add_doc($doc); |
187
|
|
|
|
|
|
|
} |
188
|
1
|
|
|
|
|
8
|
$invindexer->finish( optimize => 1 ); |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# Return 3 strings useful for verifying UTF-8 integrity. |
192
|
|
|
|
|
|
|
sub utf8_test_strings { |
193
|
1
|
|
|
1
|
0
|
1060
|
my $smiley = "\x{263a}"; |
194
|
1
|
|
|
|
|
3
|
my $not_a_smiley = $smiley; |
195
|
1
|
|
|
|
|
12
|
_utf8_off($not_a_smiley); |
196
|
1
|
|
|
|
|
3
|
my $frowny = $not_a_smiley; |
197
|
1
|
|
|
|
|
4
|
utf8::upgrade($frowny); |
198
|
1
|
|
|
|
|
4
|
return ( $smiley, $not_a_smiley, $frowny ); |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# Verify an Analyzer's analyze() method. |
202
|
|
|
|
|
|
|
sub test_analyzer { |
203
|
5
|
|
|
5
|
0
|
48
|
my ( $analyzer, $source, $expected, $message ) = @_; |
204
|
|
|
|
|
|
|
|
205
|
5
|
|
|
|
|
40
|
my $batch = KinoSearch1::Analysis::TokenBatch->new; |
206
|
5
|
|
|
|
|
40
|
$batch->append( $source, 0, length($source) ); |
207
|
|
|
|
|
|
|
|
208
|
5
|
|
|
|
|
21
|
$batch = $analyzer->analyze($batch); |
209
|
5
|
|
|
|
|
15
|
my @got; |
210
|
5
|
|
|
|
|
26
|
while ( $batch->next ) { |
211
|
14
|
|
|
|
|
86
|
push @got, $batch->get_text; |
212
|
|
|
|
|
|
|
} |
213
|
5
|
|
|
|
|
25
|
Test::More::is_deeply( \@got, $expected, "analyze: $message" ); |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
1; |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
__END__ |