File Coverage

buildlib/KinoSearch1/Test/TestUtils.pm

Criterion	Covered	Total	%
statement	129	129	100.0
branch	12	20	60.0
condition			n/a
subroutine	24	24	100.0
pod	0	11	0.0
total	165	184	89.6

line	stmt	bran	sub	pod	time	code
1	23		23		201160	use strict;
	23				61
	23				929
2	23		23		129	use warnings;
	23				54
	23				1060
3
4						package KinoSearch1::Test::TestUtils;
5	23		23		138	use base qw( Exporter );
	23				48
	23				2990
6
7						our @EXPORT_OK = qw(
8						working_dir
9						create_working_dir
10						remove_working_dir
11						create_index
12						create_persistent_test_index
13						test_index_loc
14						persistent_test_index_loc
15						init_test_index_loc
16						get_uscon_docs
17						utf8_test_strings
18						test_analyzer
19						);
20
21	23		23		14087	use KinoSearch1::InvIndexer;
	23				82
	23				809
22	23		23		24012	use KinoSearch1::Store::RAMInvIndex;
	23				79
	23				676
23	23		23		150	use KinoSearch1::Store::FSInvIndex;
	23				49
	23				504
24	23		23		9152	use KinoSearch1::Analysis::Tokenizer;
	23				59
	23				572
25	23		23		133	use KinoSearch1::Analysis::TokenBatch;
	23				48
	23				444
26	23		23		14633	use KinoSearch1::Analysis::PolyAnalyzer;
	23				83
	23				926
27
28	23		23		217	use File::Spec::Functions qw( catdir catfile curdir );
	23				52
	23				1598
29	23		23		23497	use Encode qw( _utf8_off );
	23				289406
	23				2431
30	23		23		222	use File::Path qw( rmtree );
	23				50
	23				1437
31	23		23		139	use Carp;
	23				56
	23				33436
32
33						my $working_dir = catfile( curdir(), 'kinosearch_test' );
34
35						# Return the scratch directory, located under the current directory, where
36						# we will put all testing scratch files.
37	3		3	0	55	sub working_dir {$working_dir}
38
39						sub create_working_dir {
40	1	50	1	0	139	mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";
41						}
42
43						# Zap the working dir if it exists. Returns true after attempting the
44						# removal, or nothing if there was no such directory.
45						sub remove_working_dir {
46	2	100	2	0	82	return unless -d $working_dir;
47	1				1205	rmtree $working_dir;
48	1				4	return 1;
49						}
50
51						# Return a location for a test index to be used by a single test file. If
52						# the test file crashes it cannot clean up after itself, so we put the cleanup
53						# routine in a single test file to be run at or near the end of the test
54						# suite.
55						sub test_index_loc {
56	2		2	0	27	return catdir( $working_dir, 'test_index' );
57						}
58
59						# Return a location for a test index intended to be shared by multiple test
60						# files. It will be cleaned as above.
61						sub persistent_test_index_loc {
62	5		5	0	79	return catdir( $working_dir, 'persistent_test_index' );
63						}
64
65						# Destroy anything left over in the test_index location, then create the
66						# directory. Finally, return the path.
67						sub init_test_index_loc {
68	1		1	0	10	my $dir = test_index_loc();
69	1				220	rmtree $dir;
70	1	50			13	die "Can't clean up '$dir'" if -e $dir;
71	1	50			100	mkdir $dir or die "Can't mkdir '$dir': $!";
72	1				4	return $dir;
73						}
74
75						# Build a RAM index, using the supplied array of strings as source material.
76						# The index will have a single field: "content".
77						sub create_index {
78	20		20	0	83093	my @docs = @_;
79
80	20				442	my $tokenizer = KinoSearch1::Analysis::Tokenizer->new;
81	20				319	my $invindex = KinoSearch1::Store::RAMInvIndex->new;
82	20				242	my $invindexer = KinoSearch1::InvIndexer->new(
83						invindex => $invindex,
84						analyzer => $tokenizer,
85						create => 1,
86						);
87
88	20				128	$invindexer->spec_field( name => 'content' );
89
90	20				72	for (@docs) {
91	2530				7440	my $doc = $invindexer->new_doc;
92	2530				83020	$doc->set_value( content => $_ );
93	2530				6454	$invindexer->add_doc($doc);
94						}
95
96	20				126	$invindexer->finish;
97
98	20				112	return $invindex;
99						}
100
101						# Slurp US Constitution docs and build hashrefs.
102						sub get_uscon_docs {
103
104	1		1	0	7	my $uscon_dir = catdir( 't', 'us_constitution' );
105	1	50			33	opendir( my $uscon_dh, $uscon_dir )
106						or die "couldn't opendir '$uscon_dir': $!";
107	1				137	my @filenames = grep {/\.html$/} sort readdir $uscon_dh;
	56				144
108	1	50			25	closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";
109
110	1				3	my %docs;
111
112	1				3	for my $filename (@filenames) {
113	53	100			123	next if $filename eq 'index.html';
114	52				233	my $filepath = catfile( $uscon_dir, $filename );
115	52	50			1828	open( my $fh, '<', $filepath )
116						or die "couldn't open file '$filepath': $!";
117	52				61	my $content = do { local $/; <$fh> };
	52				159
	52				1029
118	52	50			303	$content =~ m#<title>(.*?)</title>#s
119						or die "couldn't isolate title in '$filepath'";
120	52				110	my $title = $1;
121	52	50			367	$content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
122						or die "couldn't isolate bodytext in '$filepath'";
123	52				145	my $bodytext = $1;
124	52				730	$bodytext =~ s/<.*?>//sg;
125	52				9788	$bodytext =~ s/\s+/ /sg;
126
127	52				928	$docs{$filename} = {
128						title => $title,
129						bodytext => $bodytext,
130						url => "/us_constitution/$filename",
131						};
132						}
133
134	1				15	return \%docs;
135						}
136
137						sub create_persistent_test_index {
138	1		1	0	2	my $invindexer;
139	1				21	my $polyanalyzer
140						= KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );
141
142	1				7	$invindexer = KinoSearch1::InvIndexer->new(
143						invindex => persistent_test_index_loc(),
144						create => 1,
145						analyzer => $polyanalyzer,
146						);
147	1				7	$invindexer->spec_field( name => 'content' );
148	1				2	for ( 0 .. 10000 ) {
149	10001				30026	my $doc = $invindexer->new_doc;
150	10001				280878	$doc->set_value( content => "zz$_" );
151	10001				32522	$invindexer->add_doc($doc);
152						}
153	1				9	$invindexer->finish;
154	1				2	undef $invindexer;
155
156	1				16	$invindexer = KinoSearch1::InvIndexer->new(
157						invindex => persistent_test_index_loc(),
158						analyzer => $polyanalyzer,
159						);
160	1				8	$invindexer->spec_field( name => 'content' );
161	1				5	my $source_docs = get_uscon_docs();
162	1				10	for ( values %$source_docs ) {
163	52				180	my $doc = $invindexer->new_doc;
164	52				1892	$doc->set_value( content => $_->{bodytext} );
165	52				166	$invindexer->add_doc($doc);
166						}
167	1				8	$invindexer->finish;
168	1				3	undef $invindexer;
169
170	1				6	$invindexer = KinoSearch1::InvIndexer->new(
171						invindex => persistent_test_index_loc(),
172						analyzer => $polyanalyzer,
173						);
174	1				7	$invindexer->spec_field( name => 'content' );
175	1				12	my @chars = ( 'a' .. 'z' );
176	1				5	for ( 0 .. 1000 ) {
177	1001				1582	my $content = '';
178	1001				2528	for my $num_words ( 1 .. int( rand(20) ) ) {
179	9486				15834	for ( 1 .. ( int( rand(10) ) + 10 ) ) {
180	137431				223317	$content .= @chars[ rand(@chars) ];
181						}
182	9486				14962	$content .= ' ';
183						}
184	1001				3543	my $doc = $invindexer->new_doc;
185	1001				32725	$doc->set_value( content => $content );
186	1001				3042	$invindexer->add_doc($doc);
187						}
188	1				8	$invindexer->finish( optimize => 1 );
189						}
190
191						# Return 3 strings useful for verifying UTF-8 integrity.
192						sub utf8_test_strings {
193	1		1	0	1060	my $smiley = "\x{263a}";
194	1				3	my $not_a_smiley = $smiley;
195	1				12	_utf8_off($not_a_smiley);
196	1				3	my $frowny = $not_a_smiley;
197	1				4	utf8::upgrade($frowny);
198	1				4	return ( $smiley, $not_a_smiley, $frowny );
199						}
200
201						# Verify an Analyzer's analyze() method.
202						sub test_analyzer {
203	5		5	0	48	my ( $analyzer, $source, $expected, $message ) = @_;
204
205	5				40	my $batch = KinoSearch1::Analysis::TokenBatch->new;
206	5				40	$batch->append( $source, 0, length($source) );
207
208	5				21	$batch = $analyzer->analyze($batch);
209	5				15	my @got;
210	5				26	while ( $batch->next ) {
211	14				86	push @got, $batch->get_text;
212						}
213	5				25	Test::More::is_deeply( \@got, $expected, "analyze: $message" );
214						}
215
216						1;
217
218						__END__