File Coverage

buildlib/KinoSearch/Test/TestUtils.pm
Criterion Covered Total %
statement 126 126 100.0
branch 15 24 62.5
condition n/a
subroutine 24 24 100.0
pod 0 13 0.0
total 165 187 88.2


line stmt bran cond sub pod time code
1 46     46   2850918 use strict;
  46         115  
  46         1754  
2 46     46   398 use warnings;
  46         165  
  46         2229  
3              
4             package KinoSearch::Test::TestUtils;
5 46     46   397 use base qw( Exporter );
  46         101  
  46         7355  
6              
7             our @EXPORT_OK = qw(
8             working_dir
9             create_working_dir
10             remove_working_dir
11             create_index
12             create_uscon_index
13             test_index_loc
14             persistent_test_index_loc
15             init_test_index_loc
16             get_uscon_docs
17             utf8_test_strings
18             test_analyzer
19             doc_ids_from_td_coll
20             modulo_set
21             );
22              
23 46     46   26578 use KinoSearch;
  46         132  
  46         2382  
24 46     46   23923 use KinoSearch::Test;
  46         114  
  46         1442  
25              
26 46     46   418 use lib 'sample';
  46         97  
  46         473  
27 46     46   25341 use KinoSearch::Test::USConSchema;
  46         131  
  46         2023  
28              
29 46     46   289 use File::Spec::Functions qw( catdir catfile curdir );
  46         95  
  46         3510  
30 46     46   57702 use Encode qw( _utf8_off );
  46         646193  
  46         4765  
31 46     46   429 use File::Path qw( rmtree );
  46         87  
  46         2822  
32 46     46   335 use Carp;
  46         97  
  46         90060  
33              
34             my $working_dir = catfile( curdir(), 'kinosearch_test' );
35              
36             # Return a directory within the system's temp directory where we will put all
37             # testing scratch files.
38 3     3 0 84 sub working_dir {$working_dir}
39              
40             sub create_working_dir {
41 1 50   1 0 132 mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";
42             }
43              
44             # Verify that this user owns the working dir, then zap it. Returns true upon
45             # success.
46             sub remove_working_dir {
47 2 100   2 0 76 return unless -d $working_dir;
48 1         23028 rmtree $working_dir;
49 1         10 return 1;
50             }
51              
52             # Return a location for a test index to be used by a single test file. If
53             # the test file crashes it cannot clean up after itself, so we put the cleanup
54             # routine in a single test file to be run at or near the end of the test
55             # suite.
56             sub test_index_loc {
57 9     9 0 97 return catdir( $working_dir, 'test_index' );
58             }
59              
60             # Return a location for a test index intended to be shared by multiple test
61             # files. It will be cleaned as above.
62             sub persistent_test_index_loc {
63 3     3 0 195 return catdir( $working_dir, 'persistent_test_index' );
64             }
65              
66             # Destroy anything left over in the test_index location, then create the
67             # directory. Finally, return the path.
68             sub init_test_index_loc {
69 9     9 0 85 my $dir = test_index_loc();
70 9         10376 rmtree $dir;
71 9 50       175 die "Can't clean up '$dir'" if -e $dir;
72 9 50       839 mkdir $dir or die "Can't mkdir '$dir': $!";
73 9         51 return $dir;
74             }
75              
76             # Build a RAM index, using the supplied array of strings as source material.
77             # The index will have a single field: "content".
78             sub create_index {
79 27     27 0 240886 my $folder = KinoSearch::Store::RAMFolder->new;
80 27         1157 my $indexer = KinoSearch::Index::Indexer->new(
81             index => $folder,
82             schema => KinoSearch::Test::TestSchema->new,
83             );
84 27         53959 $indexer->add_doc( { content => $_ } ) for @_;
85 27         231606 $indexer->commit;
86 27         2818 return $folder;
87             }
88              
89             # Slurp us constitition docs and build hashrefs.
90             sub get_uscon_docs {
91              
92 2     2 0 26 my $uscon_dir = catdir( 'sample', 'us_constitution' );
93 2 50       1282 opendir( my $uscon_dh, $uscon_dir )
94             or die "couldn't opendir '$uscon_dir': $!";
95 2         340 my @filenames = grep {/\.txt$/} sort readdir $uscon_dh;
  112         262  
96 2 50       63 closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";
97              
98 2         6 my %docs;
99              
100 2         9 for my $filename (@filenames) {
101 104         671 my $filepath = catfile( $uscon_dir, $filename );
102 104 50       5216 open( my $fh, '<', $filepath )
103             or die "couldn't open file '$filepath': $!";
104 104         137 my $content = do { local $/; <$fh> };
  104         386  
  104         53727  
105 104 50       690 $content =~ /(.*?)\n\n(.*)/s
106             or die "Can't extract title/bodytext from '$filepath'";
107 104         324 my $title = $1;
108 104         292 my $bodytext = $2;
109 104         9502 $bodytext =~ s/\s+/ /sg;
110 104 50       636 my $category
    100          
    100          
111             = $filename =~ /art/ ? 'article'
112             : $filename =~ /amend/ ? 'amendment'
113             : $filename =~ /preamble/ ? 'preamble'
114             : confess "Can't derive category for $filename";
115              
116 104         2698 $docs{$filename} = {
117             title => $title,
118             bodytext => $bodytext,
119             url => "/us_constitution/$filename",
120             category => $category,
121             };
122             }
123              
124 2         34 return \%docs;
125             }
126              
127             sub create_uscon_index {
128 1     1 0 5 my $folder = KinoSearch::Store::FSFolder->new(
129             path => persistent_test_index_loc() );
130 1         80 my $schema = KinoSearch::Test::USConSchema->new;
131 1         13 my $indexer = KinoSearch::Index::Indexer->new(
132             schema => $schema,
133             index => $folder,
134             truncate => 1,
135             create => 1,
136             );
137              
138 1         104 $indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 );
139 1         250320 $indexer->commit;
140 1         4 undef $indexer;
141              
142 1         117 $indexer = KinoSearch::Index::Indexer->new(
143             schema => $schema,
144             index => $folder,
145             );
146 1         7 my $source_docs = get_uscon_docs();
147             $indexer->add_doc( { content => $_->{bodytext} } )
148 1         687 for values %$source_docs;
149 1         28842 $indexer->commit;
150 1         4 undef $indexer;
151              
152 1         233 $indexer = KinoSearch::Index::Indexer->new(
153             schema => $schema,
154             index => $folder,
155             );
156 1         11 my @chars = ( 'a' .. 'z' );
157 1         6 for ( 0 .. 1000 ) {
158 1001         1898 my $content = '';
159 1001         2043 for my $num_words ( 1 .. int( rand(20) ) ) {
160 9570         15882 for ( 1 .. ( int( rand(10) ) + 10 ) ) {
161 139035         227414 $content .= @chars[ rand(@chars) ];
162             }
163 9570         14177 $content .= ' ';
164             }
165 1001         73037 $indexer->add_doc( { content => $content } );
166             }
167 1         9 $indexer->optimize;
168 1         98 $indexer->commit;
169             }
170              
171             # Return 3 strings useful for verifying UTF-8 integrity.
172             sub utf8_test_strings {
173 5     5 0 136802 my $smiley = "\x{263a}";
174 5         14 my $not_a_smiley = $smiley;
175 5         70 _utf8_off($not_a_smiley);
176 5         13 my $frowny = $not_a_smiley;
177 5         25 utf8::upgrade($frowny);
178 5         22 return ( $smiley, $not_a_smiley, $frowny );
179             }
180              
181             # Verify an Analyzer's transform, transform_text, and split methods.
182             sub test_analyzer {
183 9     9 0 2733 my ( $analyzer, $source, $expected, $message ) = @_;
184              
185 9         212 my $inversion = KinoSearch::Analysis::Inversion->new( text => $source );
186 9         235 $inversion = $analyzer->transform($inversion);
187 9         48 my @got;
188 9         168 while ( my $token = $inversion->next ) {
189 13         114 push @got, $token->get_text;
190             }
191 9         76 Test::More::is_deeply( \@got, $expected, "analyze: $message" );
192              
193 9         11897 $inversion = $analyzer->transform_text($source);
194 9         127 @got = ();
195 9         90 while ( my $token = $inversion->next ) {
196 13         96 push @got, $token->get_text;
197             }
198 9         56 Test::More::is_deeply( \@got, $expected, "transform_text: $message" );
199              
200 9         11672 @got = @{ $analyzer->split($source) };
  9         297  
201 9         72 Test::More::is_deeply( \@got, $expected, "split: $message" );
202             }
203              
204             # Extract all doc nums from a SortCollector. Return two sorted array refs:
205             # by_score and by_id.
206             sub doc_ids_from_td_coll {
207 1223     1223 0 498958 my $collector = shift;
208 1223         2080 my @by_score;
209 1223         29624 my $match_docs = $collector->pop_match_docs;
210 17616 50       54919 my @by_score_then_id = map { $_->get_doc_id }
  19040         127458  
211             sort {
212 1223         5595 $b->get_score <=> $a->get_score
213             || $a->get_doc_id <=> $b->get_doc_id
214             } @$match_docs;
215 1223         3643 my @by_id = sort { $a <=> $b } @by_score_then_id;
  37343         41009  
216 1223         21753 return ( \@by_score_then_id, \@by_id );
217             }
218              
219             # Use a modulus to generate a set of numbers.
220             sub modulo_set {
221 3229     3229 0 2401714 my ( $interval, $max ) = @_;
222 3229         4835 my @out;
223 3229         10929 for ( my $doc = $interval; $doc < $max; $doc += $interval ) {
224 45031         106110 push @out, $doc;
225             }
226 3229         11216 return \@out;
227             }
228              
229             1;
230              
231             __END__