line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
2
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
3
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
4
|
|
|
|
|
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0 |
5
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
6
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
11
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
12
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
14
|
|
|
|
|
|
|
# limitations under the License. |
15
|
|
|
|
|
|
|
|
16
|
47
|
|
|
47
|
|
504536
|
use strict; |
|
47
|
|
|
|
|
60
|
|
|
47
|
|
|
|
|
1222
|
|
17
|
47
|
|
|
47
|
|
158
|
use warnings; |
|
47
|
|
|
|
|
47
|
|
|
47
|
|
|
|
|
2432
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
package Lucy::Test::TestUtils; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.006000_001'; |
22
|
|
|
|
|
|
|
$VERSION = eval $VERSION; |
23
|
|
|
|
|
|
|
|
24
|
47
|
|
|
47
|
|
160
|
use Exporter 'import'; |
|
47
|
|
|
|
|
58
|
|
|
47
|
|
|
|
|
2279
|
|
25
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
26
|
|
|
|
|
|
|
working_dir |
27
|
|
|
|
|
|
|
create_working_dir |
28
|
|
|
|
|
|
|
remove_working_dir |
29
|
|
|
|
|
|
|
uscon_dir |
30
|
|
|
|
|
|
|
create_index |
31
|
|
|
|
|
|
|
create_uscon_index |
32
|
|
|
|
|
|
|
test_index_loc |
33
|
|
|
|
|
|
|
persistent_test_index_loc |
34
|
|
|
|
|
|
|
init_test_index_loc |
35
|
|
|
|
|
|
|
get_uscon_docs |
36
|
|
|
|
|
|
|
utf8_test_strings |
37
|
|
|
|
|
|
|
test_analyzer |
38
|
|
|
|
|
|
|
doc_ids_from_td_coll |
39
|
|
|
|
|
|
|
modulo_set |
40
|
|
|
|
|
|
|
); |
41
|
|
|
|
|
|
|
|
42
|
47
|
|
|
47
|
|
9587
|
use Lucy; |
|
47
|
|
|
|
|
75
|
|
|
47
|
|
|
|
|
1612
|
|
43
|
47
|
|
|
47
|
|
12689
|
use Lucy::Test; |
|
47
|
|
|
|
|
82
|
|
|
47
|
|
|
|
|
1205
|
|
44
|
47
|
|
|
47
|
|
16573
|
use File::Spec::Functions qw( catdir catfile curdir updir ); |
|
47
|
|
|
|
|
22997
|
|
|
47
|
|
|
|
|
2915
|
|
45
|
47
|
|
|
47
|
|
22403
|
use Encode qw( _utf8_off ); |
|
47
|
|
|
|
|
337010
|
|
|
47
|
|
|
|
|
2897
|
|
46
|
47
|
|
|
47
|
|
242
|
use File::Path qw( rmtree ); |
|
47
|
|
|
|
|
58
|
|
|
47
|
|
|
|
|
2334
|
|
47
|
47
|
|
|
47
|
|
184
|
use Carp; |
|
47
|
|
|
|
|
50
|
|
|
47
|
|
|
|
|
53915
|
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
my $working_dir = catfile( curdir(), 'lucy_test' ); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Return a directory within the system's temp directory where we will put all |
52
|
|
|
|
|
|
|
# testing scratch files. |
53
|
3
|
|
|
3
|
0
|
39
|
sub working_dir {$working_dir} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
sub create_working_dir { |
56
|
1
|
50
|
|
1
|
0
|
87
|
mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!"; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Verify that this user owns the working dir, then zap it. Returns true upon |
60
|
|
|
|
|
|
|
# success. |
61
|
|
|
|
|
|
|
sub remove_working_dir { |
62
|
2
|
100
|
|
2
|
0
|
77
|
return unless -d $working_dir; |
63
|
1
|
|
|
|
|
2302
|
rmtree $working_dir; |
64
|
1
|
|
|
|
|
2
|
return 1; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# Return a location for a test index to be used by a single test file. If |
68
|
|
|
|
|
|
|
# the test file crashes it cannot clean up after itself, so we put the cleanup |
69
|
|
|
|
|
|
|
# routine in a single test file to be run at or near the end of the test |
70
|
|
|
|
|
|
|
# suite. |
71
|
|
|
|
|
|
|
sub test_index_loc { |
72
|
9
|
|
|
9
|
0
|
55
|
return catdir( $working_dir, 'test_index' ); |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Return a location for a test index intended to be shared by multiple test |
76
|
|
|
|
|
|
|
# files. It will be cleaned as above. |
77
|
|
|
|
|
|
|
sub persistent_test_index_loc { |
78
|
3
|
|
|
3
|
0
|
836
|
return catdir( $working_dir, 'persistent_test_index' ); |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# Destroy anything left over in the test_index location, then create the |
82
|
|
|
|
|
|
|
# directory. Finally, return the path. |
83
|
|
|
|
|
|
|
sub init_test_index_loc { |
84
|
9
|
|
|
9
|
0
|
42399
|
my $dir = test_index_loc(); |
85
|
9
|
|
|
|
|
6421
|
rmtree $dir; |
86
|
9
|
50
|
|
|
|
114
|
die "Can't clean up '$dir'" if -e $dir; |
87
|
9
|
50
|
|
|
|
445
|
mkdir $dir or die "Can't mkdir '$dir': $!"; |
88
|
9
|
|
|
|
|
33
|
return $dir; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# Build a RAM index, using the supplied array of strings as source material. |
92
|
|
|
|
|
|
|
# The index will have a single field: "content". |
93
|
|
|
|
|
|
|
sub create_index { |
94
|
26
|
|
|
26
|
0
|
121006
|
my $folder = Lucy::Store::RAMFolder->new; |
95
|
26
|
|
|
|
|
1084
|
my $indexer = Lucy::Index::Indexer->new( |
96
|
|
|
|
|
|
|
index => $folder, |
97
|
|
|
|
|
|
|
schema => Lucy::Test::TestSchema->new, |
98
|
|
|
|
|
|
|
); |
99
|
26
|
|
|
|
|
22051
|
$indexer->add_doc( { content => $_ } ) for @_; |
100
|
26
|
|
|
|
|
137619
|
$indexer->commit; |
101
|
26
|
|
|
|
|
1381
|
return $folder; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub uscon_dir { |
105
|
4
|
|
|
4
|
0
|
387
|
my @dirs = ( |
106
|
|
|
|
|
|
|
catdir('sample', 'us_constitution'), |
107
|
|
|
|
|
|
|
catdir(updir(), 'common', 'sample', 'us_constitution'), |
108
|
|
|
|
|
|
|
); |
109
|
|
|
|
|
|
|
|
110
|
4
|
|
|
|
|
13
|
for my $dir (@dirs) { |
111
|
4
|
50
|
|
|
|
74
|
return $dir if -d $dir; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
0
|
|
|
|
|
0
|
die("uscon source dir not found"); |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Slurp us constitition docs and build hashrefs. |
118
|
|
|
|
|
|
|
sub get_uscon_docs { |
119
|
|
|
|
|
|
|
|
120
|
2
|
|
|
2
|
0
|
11
|
my $uscon_dir = uscon_dir(); |
121
|
2
|
50
|
|
|
|
95
|
opendir( my $uscon_dh, $uscon_dir ) |
122
|
|
|
|
|
|
|
or die "couldn't opendir '$uscon_dir': $!"; |
123
|
2
|
|
|
|
|
217
|
my @filenames = grep {/\.txt$/} sort readdir $uscon_dh; |
|
112
|
|
|
|
|
136
|
|
124
|
2
|
50
|
|
|
|
38
|
closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!"; |
125
|
|
|
|
|
|
|
|
126
|
2
|
|
|
|
|
4
|
my %docs; |
127
|
|
|
|
|
|
|
|
128
|
2
|
|
|
|
|
5
|
for my $filename (@filenames) { |
129
|
104
|
|
|
|
|
240
|
my $filepath = catfile( $uscon_dir, $filename ); |
130
|
104
|
50
|
|
|
|
1847
|
open( my $fh, '<', $filepath ) |
131
|
|
|
|
|
|
|
or die "couldn't open file '$filepath': $!"; |
132
|
104
|
|
|
|
|
71
|
my $content = do { local $/; <$fh> }; |
|
104
|
|
|
|
|
221
|
|
|
104
|
|
|
|
|
862
|
|
133
|
104
|
50
|
|
|
|
362
|
$content =~ /\A(.+?)^\s+(.*)/ms |
134
|
|
|
|
|
|
|
or die "Can't extract title/bodytext from '$filepath'"; |
135
|
104
|
|
|
|
|
124
|
my $title = $1; |
136
|
104
|
|
|
|
|
143
|
my $bodytext = $2; |
137
|
104
|
|
|
|
|
4381
|
$bodytext =~ s/\s+/ /sg; |
138
|
104
|
50
|
|
|
|
230
|
my $category |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
139
|
|
|
|
|
|
|
= $filename =~ /art/ ? 'article' |
140
|
|
|
|
|
|
|
: $filename =~ /amend/ ? 'amendment' |
141
|
|
|
|
|
|
|
: $filename =~ /preamble/ ? 'preamble' |
142
|
|
|
|
|
|
|
: confess "Can't derive category for $filename"; |
143
|
|
|
|
|
|
|
|
144
|
104
|
|
|
|
|
782
|
$docs{$filename} = { |
145
|
|
|
|
|
|
|
title => $title, |
146
|
|
|
|
|
|
|
bodytext => $bodytext, |
147
|
|
|
|
|
|
|
url => "/us_constitution/$filename", |
148
|
|
|
|
|
|
|
category => $category, |
149
|
|
|
|
|
|
|
}; |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
2
|
|
|
|
|
12
|
return \%docs; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub _uscon_schema { |
156
|
1
|
|
|
1
|
|
42
|
my $schema = Lucy::Plan::Schema->new; |
157
|
1
|
|
|
|
|
30
|
my $analyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' ); |
158
|
1
|
|
|
|
|
15
|
my $title_type = Lucy::Plan::FullTextType->new( analyzer => $analyzer, ); |
159
|
1
|
|
|
|
|
5
|
my $content_type = Lucy::Plan::FullTextType->new( |
160
|
|
|
|
|
|
|
analyzer => $analyzer, |
161
|
|
|
|
|
|
|
highlightable => 1, |
162
|
|
|
|
|
|
|
); |
163
|
1
|
|
|
|
|
7
|
my $url_type = Lucy::Plan::StringType->new( indexed => 0, ); |
164
|
1
|
|
|
|
|
3
|
my $cat_type = Lucy::Plan::StringType->new; |
165
|
1
|
|
|
|
|
14
|
$schema->spec_field( name => 'title', type => $title_type ); |
166
|
1
|
|
|
|
|
28
|
$schema->spec_field( name => 'content', type => $content_type ); |
167
|
1
|
|
|
|
|
6
|
$schema->spec_field( name => 'url', type => $url_type ); |
168
|
1
|
|
|
|
|
4
|
$schema->spec_field( name => 'category', type => $cat_type ); |
169
|
1
|
|
|
|
|
9
|
return $schema; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
sub create_uscon_index { |
173
|
1
|
|
|
1
|
0
|
3
|
my $folder |
174
|
|
|
|
|
|
|
= Lucy::Store::FSFolder->new( path => persistent_test_index_loc() ); |
175
|
1
|
|
|
|
|
3
|
my $indexer = Lucy::Index::Indexer->new( |
176
|
|
|
|
|
|
|
schema => _uscon_schema(), |
177
|
|
|
|
|
|
|
index => $folder, |
178
|
|
|
|
|
|
|
truncate => 1, |
179
|
|
|
|
|
|
|
create => 1, |
180
|
|
|
|
|
|
|
); |
181
|
|
|
|
|
|
|
|
182
|
1
|
|
|
|
|
69781
|
$indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 ); |
183
|
1
|
|
|
|
|
73367
|
$indexer->commit; |
184
|
1
|
|
|
|
|
74
|
undef $indexer; |
185
|
|
|
|
|
|
|
|
186
|
1
|
|
|
|
|
8
|
$indexer = Lucy::Index::Indexer->new( index => $folder ); |
187
|
1
|
|
|
|
|
6
|
my $source_docs = get_uscon_docs(); |
188
|
|
|
|
|
|
|
$indexer->add_doc( { content => $_->{bodytext} } ) |
189
|
1
|
|
|
|
|
12501
|
for values %$source_docs; |
190
|
1
|
|
|
|
|
7977
|
$indexer->commit; |
191
|
1
|
|
|
|
|
39
|
undef $indexer; |
192
|
|
|
|
|
|
|
|
193
|
1
|
|
|
|
|
7
|
$indexer = Lucy::Index::Indexer->new( index => $folder ); |
194
|
1
|
|
|
|
|
9
|
my @chars = ( 'a' .. 'z' ); |
195
|
1
|
|
|
|
|
5
|
for ( 0 .. 1000 ) { |
196
|
1001
|
|
|
|
|
1166
|
my $content = ''; |
197
|
1001
|
|
|
|
|
1520
|
for my $num_words ( 1 .. int( rand(20) ) ) { |
198
|
9851
|
|
|
|
|
9376
|
for ( 1 .. ( int( rand(10) ) + 10 ) ) { |
199
|
143456
|
|
|
|
|
110990
|
$content .= @chars[ rand(@chars) ]; |
200
|
|
|
|
|
|
|
} |
201
|
9851
|
|
|
|
|
6152
|
$content .= ' '; |
202
|
|
|
|
|
|
|
} |
203
|
1001
|
|
|
|
|
37307
|
$indexer->add_doc( { content => $content } ); |
204
|
|
|
|
|
|
|
} |
205
|
1
|
|
|
|
|
4
|
$indexer->optimize; |
206
|
1
|
|
|
|
|
232629
|
$indexer->commit; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# Return 3 strings useful for verifying UTF-8 integrity. |
210
|
|
|
|
|
|
|
sub utf8_test_strings { |
211
|
4
|
|
|
4
|
0
|
78027
|
my $smiley = "\x{263a}"; |
212
|
4
|
|
|
|
|
7
|
my $not_a_smiley = $smiley; |
213
|
4
|
|
|
|
|
37
|
_utf8_off($not_a_smiley); |
214
|
4
|
|
|
|
|
6
|
my $frowny = $not_a_smiley; |
215
|
4
|
|
|
|
|
13
|
utf8::upgrade($frowny); |
216
|
4
|
|
|
|
|
11
|
return ( $smiley, $not_a_smiley, $frowny ); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Verify an Analyzer's transform, transform_text, and split methods. |
220
|
|
|
|
|
|
|
sub test_analyzer { |
221
|
11
|
|
|
11
|
0
|
2791
|
my ( $analyzer, $source, $expected, $message ) = @_; |
222
|
|
|
|
|
|
|
|
223
|
11
|
|
|
|
|
151
|
my $inversion = Lucy::Analysis::Inversion->new( text => $source ); |
224
|
11
|
|
|
|
|
235
|
$inversion = $analyzer->transform($inversion); |
225
|
11
|
|
|
|
|
35
|
my @got; |
226
|
11
|
|
|
|
|
162
|
while ( my $token = $inversion->next ) { |
227
|
18
|
|
|
|
|
109
|
push @got, $token->get_text; |
228
|
|
|
|
|
|
|
} |
229
|
11
|
|
|
|
|
116
|
Test::More::is_deeply( \@got, $expected, "analyze: $message" ); |
230
|
|
|
|
|
|
|
|
231
|
11
|
|
|
|
|
6856
|
$inversion = $analyzer->transform_text($source); |
232
|
11
|
|
|
|
|
124
|
@got = (); |
233
|
11
|
|
|
|
|
81
|
while ( my $token = $inversion->next ) { |
234
|
18
|
|
|
|
|
84
|
push @got, $token->get_text; |
235
|
|
|
|
|
|
|
} |
236
|
11
|
|
|
|
|
49
|
Test::More::is_deeply( \@got, $expected, "transform_text: $message" ); |
237
|
|
|
|
|
|
|
|
238
|
11
|
|
|
|
|
4549
|
@got = @{ $analyzer->split($source) }; |
|
11
|
|
|
|
|
257
|
|
239
|
11
|
|
|
|
|
59
|
Test::More::is_deeply( \@got, $expected, "split: $message" ); |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Extract all doc nums from a SortCollector. Return two sorted array refs: |
243
|
|
|
|
|
|
|
# by_score and by_id. |
244
|
|
|
|
|
|
|
sub doc_ids_from_td_coll { |
245
|
1484
|
|
|
1484
|
0
|
306168
|
my $collector = shift; |
246
|
1484
|
|
|
|
|
1286
|
my @by_score; |
247
|
1484
|
|
|
|
|
14712
|
my $match_docs = $collector->pop_match_docs; |
248
|
24842
|
|
|
|
|
28557
|
my @by_score_then_id = map { $_->get_doc_id } |
249
|
|
|
|
|
|
|
sort { |
250
|
1484
|
50
|
|
|
|
3600
|
$b->get_score <=> $a->get_score |
|
27290
|
|
|
|
|
83307
|
|
251
|
|
|
|
|
|
|
|| $a->get_doc_id <=> $b->get_doc_id |
252
|
|
|
|
|
|
|
} @$match_docs; |
253
|
1484
|
|
|
|
|
2313
|
my @by_id = sort { $a <=> $b } @by_score_then_id; |
|
50985
|
|
|
|
|
29713
|
|
254
|
1484
|
|
|
|
|
15345
|
return ( \@by_score_then_id, \@by_id ); |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# Use a modulus to generate a set of numbers. |
258
|
|
|
|
|
|
|
sub modulo_set { |
259
|
3973
|
|
|
3973
|
0
|
1539277
|
my ( $interval, $max ) = @_; |
260
|
3973
|
|
|
|
|
2859
|
my @out; |
261
|
3973
|
|
|
|
|
6223
|
for ( my $doc = $interval; $doc < $max; $doc += $interval ) { |
262
|
53362
|
|
|
|
|
64784
|
push @out, $doc; |
263
|
|
|
|
|
|
|
} |
264
|
3973
|
|
|
|
|
6762
|
return \@out; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
1; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
__END__ |