File Coverage

buildlib/Lucy/Test/TestUtils.pm
Criterion Covered Total %
statement 133 134 99.2
branch 16 26 61.5
condition n/a
subroutine 24 24 100.0
pod 0 14 0.0
total 173 198 87.3


line stmt bran cond sub pod time code
1             # Licensed to the Apache Software Foundation (ASF) under one or more
2             # contributor license agreements. See the NOTICE file distributed with
3             # this work for additional information regarding copyright ownership.
4             # The ASF licenses this file to You under the Apache License, Version 2.0
5             # (the "License"); you may not use this file except in compliance with
6             # the License. You may obtain a copy of the License at
7             #
8             # http://www.apache.org/licenses/LICENSE-2.0
9             #
10             # Unless required by applicable law or agreed to in writing, software
11             # distributed under the License is distributed on an "AS IS" BASIS,
12             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13             # See the License for the specific language governing permissions and
14             # limitations under the License.
15              
16 47     47   504536 use strict;
  47         60  
  47         1222  
17 47     47   158 use warnings;
  47         47  
  47         2432  
18              
19             package Lucy::Test::TestUtils;
20              
21             our $VERSION = '0.006000_001';
22             $VERSION = eval $VERSION;
23              
24 47     47   160 use Exporter 'import';
  47         58  
  47         2279  
25             our @EXPORT_OK = qw(
26             working_dir
27             create_working_dir
28             remove_working_dir
29             uscon_dir
30             create_index
31             create_uscon_index
32             test_index_loc
33             persistent_test_index_loc
34             init_test_index_loc
35             get_uscon_docs
36             utf8_test_strings
37             test_analyzer
38             doc_ids_from_td_coll
39             modulo_set
40             );
41              
42 47     47   9587 use Lucy;
  47         75  
  47         1612  
43 47     47   12689 use Lucy::Test;
  47         82  
  47         1205  
44 47     47   16573 use File::Spec::Functions qw( catdir catfile curdir updir );
  47         22997  
  47         2915  
45 47     47   22403 use Encode qw( _utf8_off );
  47         337010  
  47         2897  
46 47     47   242 use File::Path qw( rmtree );
  47         58  
  47         2334  
47 47     47   184 use Carp;
  47         50  
  47         53915  
48              
49             my $working_dir = catfile( curdir(), 'lucy_test' );
50              
51             # Return a directory within the system's temp directory where we will put all
52             # testing scratch files.
53 3     3 0 39 sub working_dir {$working_dir}
54              
55             sub create_working_dir {
56 1 50   1 0 87 mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";
57             }
58              
59             # Verify that this user owns the working dir, then zap it. Returns true upon
60             # success.
61             sub remove_working_dir {
62 2 100   2 0 77 return unless -d $working_dir;
63 1         2302 rmtree $working_dir;
64 1         2 return 1;
65             }
66              
67             # Return a location for a test index to be used by a single test file. If
68             # the test file crashes it cannot clean up after itself, so we put the cleanup
69             # routine in a single test file to be run at or near the end of the test
70             # suite.
71             sub test_index_loc {
72 9     9 0 55 return catdir( $working_dir, 'test_index' );
73             }
74              
75             # Return a location for a test index intended to be shared by multiple test
76             # files. It will be cleaned as above.
77             sub persistent_test_index_loc {
78 3     3 0 836 return catdir( $working_dir, 'persistent_test_index' );
79             }
80              
81             # Destroy anything left over in the test_index location, then create the
82             # directory. Finally, return the path.
83             sub init_test_index_loc {
84 9     9 0 42399 my $dir = test_index_loc();
85 9         6421 rmtree $dir;
86 9 50       114 die "Can't clean up '$dir'" if -e $dir;
87 9 50       445 mkdir $dir or die "Can't mkdir '$dir': $!";
88 9         33 return $dir;
89             }
90              
91             # Build a RAM index, using the supplied array of strings as source material.
92             # The index will have a single field: "content".
93             sub create_index {
94 26     26 0 121006 my $folder = Lucy::Store::RAMFolder->new;
95 26         1084 my $indexer = Lucy::Index::Indexer->new(
96             index => $folder,
97             schema => Lucy::Test::TestSchema->new,
98             );
99 26         22051 $indexer->add_doc( { content => $_ } ) for @_;
100 26         137619 $indexer->commit;
101 26         1381 return $folder;
102             }
103              
104             sub uscon_dir {
105 4     4 0 387 my @dirs = (
106             catdir('sample', 'us_constitution'),
107             catdir(updir(), 'common', 'sample', 'us_constitution'),
108             );
109              
110 4         13 for my $dir (@dirs) {
111 4 50       74 return $dir if -d $dir;
112             }
113              
114 0         0 die("uscon source dir not found");
115             }
116              
117             # Slurp us constitition docs and build hashrefs.
118             sub get_uscon_docs {
119              
120 2     2 0 11 my $uscon_dir = uscon_dir();
121 2 50       95 opendir( my $uscon_dh, $uscon_dir )
122             or die "couldn't opendir '$uscon_dir': $!";
123 2         217 my @filenames = grep {/\.txt$/} sort readdir $uscon_dh;
  112         136  
124 2 50       38 closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";
125              
126 2         4 my %docs;
127              
128 2         5 for my $filename (@filenames) {
129 104         240 my $filepath = catfile( $uscon_dir, $filename );
130 104 50       1847 open( my $fh, '<', $filepath )
131             or die "couldn't open file '$filepath': $!";
132 104         71 my $content = do { local $/; <$fh> };
  104         221  
  104         862  
133 104 50       362 $content =~ /\A(.+?)^\s+(.*)/ms
134             or die "Can't extract title/bodytext from '$filepath'";
135 104         124 my $title = $1;
136 104         143 my $bodytext = $2;
137 104         4381 $bodytext =~ s/\s+/ /sg;
138 104 50       230 my $category
    100          
    100          
139             = $filename =~ /art/ ? 'article'
140             : $filename =~ /amend/ ? 'amendment'
141             : $filename =~ /preamble/ ? 'preamble'
142             : confess "Can't derive category for $filename";
143              
144 104         782 $docs{$filename} = {
145             title => $title,
146             bodytext => $bodytext,
147             url => "/us_constitution/$filename",
148             category => $category,
149             };
150             }
151              
152 2         12 return \%docs;
153             }
154              
155             sub _uscon_schema {
156 1     1   42 my $schema = Lucy::Plan::Schema->new;
157 1         30 my $analyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );
158 1         15 my $title_type = Lucy::Plan::FullTextType->new( analyzer => $analyzer, );
159 1         5 my $content_type = Lucy::Plan::FullTextType->new(
160             analyzer => $analyzer,
161             highlightable => 1,
162             );
163 1         7 my $url_type = Lucy::Plan::StringType->new( indexed => 0, );
164 1         3 my $cat_type = Lucy::Plan::StringType->new;
165 1         14 $schema->spec_field( name => 'title', type => $title_type );
166 1         28 $schema->spec_field( name => 'content', type => $content_type );
167 1         6 $schema->spec_field( name => 'url', type => $url_type );
168 1         4 $schema->spec_field( name => 'category', type => $cat_type );
169 1         9 return $schema;
170             }
171              
172             sub create_uscon_index {
173 1     1 0 3 my $folder
174             = Lucy::Store::FSFolder->new( path => persistent_test_index_loc() );
175 1         3 my $indexer = Lucy::Index::Indexer->new(
176             schema => _uscon_schema(),
177             index => $folder,
178             truncate => 1,
179             create => 1,
180             );
181              
182 1         69781 $indexer->add_doc( { content => "zz$_" } ) for ( 0 .. 10000 );
183 1         73367 $indexer->commit;
184 1         74 undef $indexer;
185              
186 1         8 $indexer = Lucy::Index::Indexer->new( index => $folder );
187 1         6 my $source_docs = get_uscon_docs();
188             $indexer->add_doc( { content => $_->{bodytext} } )
189 1         12501 for values %$source_docs;
190 1         7977 $indexer->commit;
191 1         39 undef $indexer;
192              
193 1         7 $indexer = Lucy::Index::Indexer->new( index => $folder );
194 1         9 my @chars = ( 'a' .. 'z' );
195 1         5 for ( 0 .. 1000 ) {
196 1001         1166 my $content = '';
197 1001         1520 for my $num_words ( 1 .. int( rand(20) ) ) {
198 9851         9376 for ( 1 .. ( int( rand(10) ) + 10 ) ) {
199 143456         110990 $content .= @chars[ rand(@chars) ];
200             }
201 9851         6152 $content .= ' ';
202             }
203 1001         37307 $indexer->add_doc( { content => $content } );
204             }
205 1         4 $indexer->optimize;
206 1         232629 $indexer->commit;
207             }
208              
209             # Return 3 strings useful for verifying UTF-8 integrity.
210             sub utf8_test_strings {
211 4     4 0 78027 my $smiley = "\x{263a}";
212 4         7 my $not_a_smiley = $smiley;
213 4         37 _utf8_off($not_a_smiley);
214 4         6 my $frowny = $not_a_smiley;
215 4         13 utf8::upgrade($frowny);
216 4         11 return ( $smiley, $not_a_smiley, $frowny );
217             }
218              
219             # Verify an Analyzer's transform, transform_text, and split methods.
220             sub test_analyzer {
221 11     11 0 2791 my ( $analyzer, $source, $expected, $message ) = @_;
222              
223 11         151 my $inversion = Lucy::Analysis::Inversion->new( text => $source );
224 11         235 $inversion = $analyzer->transform($inversion);
225 11         35 my @got;
226 11         162 while ( my $token = $inversion->next ) {
227 18         109 push @got, $token->get_text;
228             }
229 11         116 Test::More::is_deeply( \@got, $expected, "analyze: $message" );
230              
231 11         6856 $inversion = $analyzer->transform_text($source);
232 11         124 @got = ();
233 11         81 while ( my $token = $inversion->next ) {
234 18         84 push @got, $token->get_text;
235             }
236 11         49 Test::More::is_deeply( \@got, $expected, "transform_text: $message" );
237              
238 11         4549 @got = @{ $analyzer->split($source) };
  11         257  
239 11         59 Test::More::is_deeply( \@got, $expected, "split: $message" );
240             }
241              
242             # Extract all doc nums from a SortCollector. Return two sorted array refs:
243             # by_score and by_id.
244             sub doc_ids_from_td_coll {
245 1484     1484 0 306168 my $collector = shift;
246 1484         1286 my @by_score;
247 1484         14712 my $match_docs = $collector->pop_match_docs;
248 24842         28557 my @by_score_then_id = map { $_->get_doc_id }
249             sort {
250 1484 50       3600 $b->get_score <=> $a->get_score
  27290         83307  
251             || $a->get_doc_id <=> $b->get_doc_id
252             } @$match_docs;
253 1484         2313 my @by_id = sort { $a <=> $b } @by_score_then_id;
  50985         29713  
254 1484         15345 return ( \@by_score_then_id, \@by_id );
255             }
256              
257             # Use a modulus to generate a set of numbers.
258             sub modulo_set {
259 3973     3973 0 1539277 my ( $interval, $max ) = @_;
260 3973         2859 my @out;
261 3973         6223 for ( my $doc = $interval; $doc < $max; $doc += $interval ) {
262 53362         64784 push @out, $doc;
263             }
264 3973         6762 return \@out;
265             }
266              
267             1;
268              
269             __END__