File Coverage

buildlib/Lucy/Test/TestUtils.pm

Criterion	Covered	Total	%
statement	133	134	99.2
branch	16	26	61.5
condition			n/a
subroutine	24	24	100.0
pod	0	14	0.0
total	173	198	87.3

line	stmt	bran	sub	pod	time	code
1						# Licensed to the Apache Software Foundation (ASF) under one or more
2						# contributor license agreements. See the NOTICE file distributed with
3						# this work for additional information regarding copyright ownership.
4						# The ASF licenses this file to You under the Apache License, Version 2.0
5						# (the "License"); you may not use this file except in compliance with
6						# the License. You may obtain a copy of the License at
7						#
8						# http://www.apache.org/licenses/LICENSE-2.0
9						#
10						# Unless required by applicable law or agreed to in writing, software
11						# distributed under the License is distributed on an "AS IS" BASIS,
12						# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13						# See the License for the specific language governing permissions and
14						# limitations under the License.
15
16	47		47		504536	use strict;
	47				60
	47				1222
17	47		47		158	use warnings;
	47				47
	47				2432
18
19						package Lucy::Test::TestUtils;
20
21						our $VERSION = '0.006000_001';
22						$VERSION = eval $VERSION;
23
24	47		47		160	use Exporter 'import';
	47				58
	47				2279
25						our @EXPORT_OK = qw(
26						working_dir
27						create_working_dir
28						remove_working_dir
29						uscon_dir
30						create_index
31						create_uscon_index
32						test_index_loc
33						persistent_test_index_loc
34						init_test_index_loc
35						get_uscon_docs
36						utf8_test_strings
37						test_analyzer
38						doc_ids_from_td_coll
39						modulo_set
40						);
41
42	47		47		9587	use Lucy;
	47				75
	47				1612
43	47		47		12689	use Lucy::Test;
	47				82
	47				1205
44	47		47		16573	use File::Spec::Functions qw( catdir catfile curdir updir );
	47				22997
	47				2915
45	47		47		22403	use Encode qw( _utf8_off );
	47				337010
	47				2897
46	47		47		242	use File::Path qw( rmtree );
	47				58
	47				2334
47	47		47		184	use Carp;
	47				50
	47				53915
48
49						my $working_dir = catfile( curdir(), 'lucy_test' );
50
51						# Return a directory within the system's temp directory where we will put all
52						# testing scratch files.
53	3		3	0	39	sub working_dir {$working_dir}
54
55						sub create_working_dir {
56	1	50	1	0	87	mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";
57						}
58
59						# Verify that this user owns the working dir, then zap it. Returns true upon
60						# success.
61						sub remove_working_dir {
62	2	100	2	0	77	return unless -d $working_dir;
63	1				2302	rmtree $working_dir;
64	1				2	return 1;
65						}
66
67						# Return a location for a test index to be used by a single test file. If
68						# the test file crashes it cannot clean up after itself, so we put the cleanup
69						# routine in a single test file to be run at or near the end of the test
70						# suite.
71						sub test_index_loc {
72	9		9	0	55	return catdir( $working_dir, 'test_index' );
73						}
74
75						# Return a location for a test index intended to be shared by multiple test
76						# files. It will be cleaned as above.
77						sub persistent_test_index_loc {
78	3		3	0	836	return catdir( $working_dir, 'persistent_test_index' );
79						}
80
81						# Destroy anything left over in the test_index location, then create the
82						# directory. Finally, return the path.
83						sub init_test_index_loc {
84	9		9	0	42399	my $dir = test_index_loc();
85	9				6421	rmtree $dir;
86	9	50			114	die "Can't clean up '$dir'" if -e $dir;
87	9	50			445	mkdir $dir or die "Can't mkdir '$dir': $!";
88	9				33	return $dir;
89						}
90
91						# Build a RAM index, using the supplied array of strings as source material.
92						# The index will have a single field: "content".
93						sub create_index {
94	26		26	0	121006	my $folder = Lucy::Store::RAMFolder->new;
95	26				1084	my $indexer = Lucy::Index::Indexer->new(
96						index => $folder,
97						schema => Lucy::Test::TestSchema->new,
98						);
99	26				22051	$indexer->add_doc( { content => $_ } ) for @_;
100	26				137619	$indexer->commit;
101	26				1381	return $folder;
102						}
103
104						sub uscon_dir {
105	4		4	0	387	my @dirs = (
106						catdir('sample', 'us_constitution'),
107						catdir(updir(), 'common', 'sample', 'us_constitution'),
108						);
109
110	4				13	for my $dir (@dirs) {
111	4	50			74	return $dir if -d $dir;
112						}
113
114	0				0	die("uscon source dir not found");
115						}
116
117						# Slurp us constitition docs and build hashrefs.
118						sub get_uscon_docs {
119
120	2		2	0	11	my $uscon_dir = uscon_dir();
121	2	50			95	opendir( my $uscon_dh, $uscon_dir )
122						or die "couldn't opendir '$uscon_dir': $!";
123	2				217	my @filenames = grep {/\.txt$/} sort readdir $uscon_dh;
	112				136
124	2	50			38	closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";
125
126	2				4	my %docs;
127
128	2				5	for my $filename (@filenames) {
129	104				240	my $filepath = catfile( $uscon_dir, $filename );
130	104	50			1847	open( my $fh, '<', $filepath )
131						or die "couldn't open file '$filepath': $!";
132	104				71	my $content = do { local $/; <$fh> };
	104				221
	104				862
133	104	50			362	$content =~ /\A(.+?)^\s+(.*)/ms
134						or die "Can't extract title/bodytext from '$filepath'";
135	104				124	my $title = $1;
136	104				143	my $bodytext = $2;
137	104				4381	$bodytext =~ s/\s+/ /sg;
138	104	50			230	my $category
		100
		100
139						= $filename =~ /art/ ? 'article'
140						: $filename =~ /amend/ ? 'amendment'
141						: $filename =~ /preamble/ ? 'preamble'
142						: confess "Can't derive category for $filename";
143
144	104				782	$docs{$filename} = {
145						title => $title,
146						bodytext => $bodytext,
147						url => "/us_constitution/$filename",
148						category => $category,
149						};
150						}
151
152	2				12	return \%docs;
153						}
154
155						sub _uscon_schema {
156	1		1		42	my $schema = Lucy::Plan::Schema->new;
157	1				30	my $analyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );
158	1				15	my $title_type = Lucy::Plan::FullTextType->new( analyzer => $analyzer, );
159	1				5	my $content_type = Lucy::Plan::FullTextType->new(
160						analyzer => $analyzer,
161						highlightable => 1,
162						);
163	1				7	my $url_type = Lucy::Plan::StringType->new( indexed => 0, );
164	1				3	my $cat_type = Lucy::Plan::StringType->new;
165	1				14	$schema->spec_field( name => 'title', type => $title_type );
166	1				28	$schema->spec_field( name => 'content', type => $content_type );
167	1				6	$schema->spec_field( name => 'url', type => $url_type );
168	1				4	$schema->spec_field( name => 'category', type => $cat_type );
169	1				9	return $schema;
170						}
171
172						sub create_uscon_index {
173	1		1	0	3	my $folder
174						= Lucy::Store::FSFolder->new( path => persistent_test_index_loc() );
175	1				3	my $indexer = Lucy::Index::Indexer->new(
176						schema => _uscon_schema(),
177						index => $folder,
178						truncate => 1,
179						create => 1,
180						);
181
182	1				69781	$indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 );
183	1				73367	$indexer->commit;
184	1				74	undef $indexer;
185
186	1				8	$indexer = Lucy::Index::Indexer->new( index => $folder );
187	1				6	my $source_docs = get_uscon_docs();
188						$indexer->add_doc( { content => $_->{bodytext} } )
189	1				12501	for values %$source_docs;
190	1				7977	$indexer->commit;
191	1				39	undef $indexer;
192
193	1				7	$indexer = Lucy::Index::Indexer->new( index => $folder );
194	1				9	my @chars = ( 'a' .. 'z' );
195	1				5	for ( 0 .. 1000 ) {
196	1001				1166	my $content = '';
197	1001				1520	for my $num_words ( 1 .. int( rand(20) ) ) {
198	9851				9376	for ( 1 .. ( int( rand(10) ) + 10 ) ) {
199	143456				110990	$content .= @chars[ rand(@chars) ];
200						}
201	9851				6152	$content .= ' ';
202						}
203	1001				37307	$indexer->add_doc( { content => $content } );
204						}
205	1				4	$indexer->optimize;
206	1				232629	$indexer->commit;
207						}
208
209						# Return 3 strings useful for verifying UTF-8 integrity.
210						sub utf8_test_strings {
211	4		4	0	78027	my $smiley = "\x{263a}";
212	4				7	my $not_a_smiley = $smiley;
213	4				37	_utf8_off($not_a_smiley);
214	4				6	my $frowny = $not_a_smiley;
215	4				13	utf8::upgrade($frowny);
216	4				11	return ( $smiley, $not_a_smiley, $frowny );
217						}
218
219						# Verify an Analyzer's transform, transform_text, and split methods.
220						sub test_analyzer {
221	11		11	0	2791	my ( $analyzer, $source, $expected, $message ) = @_;
222
223	11				151	my $inversion = Lucy::Analysis::Inversion->new( text => $source );
224	11				235	$inversion = $analyzer->transform($inversion);
225	11				35	my @got;
226	11				162	while ( my $token = $inversion->next ) {
227	18				109	push @got, $token->get_text;
228						}
229	11				116	Test::More::is_deeply( \@got, $expected, "analyze: $message" );
230
231	11				6856	$inversion = $analyzer->transform_text($source);
232	11				124	@got = ();
233	11				81	while ( my $token = $inversion->next ) {
234	18				84	push @got, $token->get_text;
235						}
236	11				49	Test::More::is_deeply( \@got, $expected, "transform_text: $message" );
237
238	11				4549	@got = @{ $analyzer->split($source) };
	11				257
239	11				59	Test::More::is_deeply( \@got, $expected, "split: $message" );
240						}
241
242						# Extract all doc nums from a SortCollector. Return two sorted array refs:
243						# by_score and by_id.
244						sub doc_ids_from_td_coll {
245	1484		1484	0	306168	my $collector = shift;
246	1484				1286	my @by_score;
247	1484				14712	my $match_docs = $collector->pop_match_docs;
248	24842				28557	my @by_score_then_id = map { $_->get_doc_id }
249						sort {
250	1484	50			3600	$b->get_score <=> $a->get_score
	27290				83307
251						\|\| $a->get_doc_id <=> $b->get_doc_id
252						} @$match_docs;
253	1484				2313	my @by_id = sort { $a <=> $b } @by_score_then_id;
	50985				29713
254	1484				15345	return ( \@by_score_then_id, \@by_id );
255						}
256
257						# Use a modulus to generate a set of numbers.
258						sub modulo_set {
259	3973		3973	0	1539277	my ( $interval, $max ) = @_;
260	3973				2859	my @out;
261	3973				6223	for ( my $doc = $interval; $doc < $max; $doc += $interval ) {
262	53362				64784	push @out, $doc;
263						}
264	3973				6762	return \@out;
265						}
266
267						1;
268
269						__END__