File Coverage

blib/lib/CDB_File/Generator.pm

Criterion	Covered	Total	%
statement	56	73	76.7
branch	5	14	35.7
condition	0	3	0.0
subroutine	9	13	69.2
pod	5	7	71.4
total	75	110	68.1

line	stmt	bran	cond	sub	pod	time	code
1							package CDB_File::Generator;
2							$REVISION=q$Revision: 1.13 $ ;
3	1			1		746	use vars qw($VERSION);
	1					2
	1					71
4							$VERSION='0.030';
5
6							=head1 NAME
7
8							CDB_File::Generator - generate massive sorted CDB files simply.
9
10							=head1 SYNOPSIS
11
12							use CDB_File::Generator;
13							$gen = new CDB::Generator "my.cdb";
14							$gen->("Fred", "Martha");
15							$gen->("Fred", "Olivia");
16							$gen->("Fred", "Jenny");
17							$gen->("Roger", "Joe");
18							$gen->("Roger", "Jenny");
19							$gen = undef;
20							use CDB_File;
21
22
23							=head1 DESCRIPTION
24
25							This is a class which makes generating sorted large (much bigger than
26							memory, but the speed will depend on the efficiency of your sort
27							command. If you haven't got one, for example, it won't work at all.)
28							CDB files on the fly very easy
29
30							=cut
31
32							#this lets us have lots of unique temporary files.
33
34	1			1		5	use Carp;
	1					2
	1					58
35	1			1		994	use IO::File;
	1					11625
	1					164
36	1			1		9	use strict;
	1					2
	1					56
37
38							BEGIN {
39	1			1		1291	my $tempfile_no=0;
40	2			2	0	9	sub next_tmp_file () { $tempfile_no++ };
41							}
42
43							=head1 METHODS
44
45							=head2 Generator::new $cdbfile [$cdbmaketemp [{$tmpname [$sorttmpname] \| $tmpdir}]]
46
47							The new function creates a generator for a given filename, optionally
48							specifying where it sould put it's temporary files.
49
50							=cut
51
52							my $deftemp="/tmp/cdb_generator_tmp.$$.";
53
54							sub new ($$@) {
55	1			1	1	42	my $class=shift;
56	1					3	my $self = bless {}, $class;
57	1					3	my $cdbfilename = shift;
58	1					7	$self->{"cdbfile"} = $cdbfilename;
59	1					7	$self->{"tmpoutfile"} = $deftemp . next_tmp_file();
60	1					4	$self->{"tmpsortfile"} = $deftemp . next_tmp_file();
61	1					2	my $tmpmake = $cdbfilename;
62	1	50				10	$tmpmake =~ s/.cdb$/.tmp/ or $tmpmake = $tmpmake . ".tmp";
63	1	50				13	Carp::croak "file $tmpmake for cdb temfile exists" if -e $tmpmake;
64	1					3	$self->{"tmpmakefile"} = $tmpmake;
65	1	50				11	my $fh = new IO::File $self->{"tmpoutfile"}, '>'
66							or die "couldn't create output file";
67	1					271	$self->{"fh"} = $fh;
68	1					4	$self->{"added"} = 0;
69	1					3	return $self;
70							}
71
72							=head2 $gen->add($key, $value)
73
74							Adds a value to the CDB being created
75
76							=cut
77
78							sub add ($$$) {
79	5			5	1	48	my ($self, $key, $value) = @_;
80	5					9	my $fh = $self->{"fh"};
81
82							#change newlines so sort can sort everything as lines.
83							#change tabs so we can use them as a separator
84	5					7	$key =~ s,\\,\\\\,g; #now we have all \s in even numbered groups
85	5					7	$key =~ s,\n,\\n,g; #an odd \ followed by a newline is a nl
86	5					6	$key =~ s,\t,\\t,g; #an odd \ followed by a tab is a nl
87	5					6	$value =~ s,\\,\\\\,g; #....
88	5					7	$value =~ s,\n,\\n,g;
89	5					7	$value =~ s,\t,\\t,g; # ....
90
91	5					27	print $fh $key, "\t", $value, "\n";
92
93	5					14	$self->{"added"} ++;
94							}
95
96							=head2 $gen->DESTROY
97
98							This is not normally called by the user, but rather by the completion
99							of the cdbfile being writen out and that block of the program being
100							exited or by the program completing. When it us run, it calls the
101							finish method which ends the CDB creation. See below.
102
103							=cut
104
105							sub DESTROY ($) {
106	0			0		0	my $self=shift;
107	0	0	0			0	$self->{"added"} && $self->finish() unless $self->{"abort"};
108	0					0	foreach my $del ( "tmpoutfile", "tmpsortfile", "tmpmakefile") {
109	0					0	unlink $self->{$del};
110							}
111							}
112
113							=head2 finish
114
115							Finish ends of the cdb creation. First it closes the output temporary
116							file, then it sorts it to another file and finally it calls C
117							to complete the creation job.
118
119							In the current implementation this uses C and deletes repeats of
120							the same key with the same value.
121
122							In order to increase database portability, by default all sorting is
123							done in the 'C' locale, even if the current program is working in
124							another locale. This is "the right thing" in many cases. Where you
125							are dealing with real word keys it won't be the right thing. In this case, use the locale function to set the locale.
126
127							=cut
128
129							sub locale ($) {
130	0			0	0	0	my $self=shift;
131	0					0	my $locale=shift;
132	0					0	$self->{locale}=$locale;
133							}
134
135							sub finish ($) {
136	1			1	1	9	my $self=shift;
137	1					56	close $self->{"fh"};
138
139							{
140	1					3	local $ENV{LC_ALL};
	1					7
141	1	50				6	if ($self->{locale}) {
142	0					0	$ENV{LC_ALL}=$self->{locale};
143							} else {
144	1					8	$ENV{LC_ALL}='C';
145							}
146	1					15392	system 'sort' , '-u', '-o' ,$self->{"tmpsortfile"} , $self->{"tmpoutfile"};
147							}
148
149	1	50				67	my $fh = new IO::File $self->{"tmpsortfile"}
150							or die "couldn't open sorted output file";
151
152	1					372	my $cdbmakeout = new IO::File ( '\|cdbmake ' . $self->{"cdbfile"}
153							. ' ' . $self->{"tmpmakefile"} );
154
155	1					10887	while (<$fh>) {
156	1					35	my ($key, $value) = m/^(.)\t(.)$/;
157
158							#the \G s allow for multiple new lines in a row
159							#odd numbered slash with t is tab
160	1					8	$key =~ s,((?:\G\|^\|[^\\])(?:\\\\)*)\\t,$1\t,g;
161							#odd numbered slash with t is tab
162	1					7	$key =~ s,((?:\G\|^\|[^\\])(?:\\\\)*)\\n,$1\n,g;
163							#pairs of slashes match a single slash
164	1					2	$key =~ s,\\\\,\\,g;
165
166							#same for value....
167	1					8	$value =~ s,((?:\G\|^\|[^\\])(?:\\\\)*)\\t,$1\t,g;
168	1					7	$value =~ s,((?:\G\|^\|[^\\])(?:\\\\)*)\\n,$1\n,g;
169	1					7	$value =~ s,\\\\,\\,g;
170
171	1					216	print $cdbmakeout &gen_cdb_input($key, $value);
172							}
173	0						print $cdbmakeout "\n";
174	0						$cdbmakeout->close;
175	0						$fh->close;
176
177							# $self->{"abort"} = 1;
178							#FIXME return codes etc..
179	0	0					unlink $self->{"tmpsortfile"} , $self->{"tmpoutfile"}
180							or warn "trouble deleting temp files "
181							. $self->{"tmpsortfile"} . $self->{"tmpoutfile"};
182	0						$self->{"added"} = 0;
183							}
184
185							=head2 $gen->abort
186
187							If you decide not to create the CDB file you were creating, you have
188							to call this method. Otherwise, it will be created as your program
189							exits (or possibly earlier)
190
191							=cut
192
193							sub abort ($) {
194	0			0	1		shift->{"abort"} = 1
195							}
196
197							=head2 gen_cdb_input($key,$value)
198
199							This is a little utility function which formats a cdbmake input line.
200
201							=cut
202
203							sub gen_cdb_input ($$) {
204	0			0	1		my $key=shift;
205	0						my $value=shift;
206
207	0						return "+" . length($key) . "," . length($value) . ":"
208							. $key . "->" . $value . "\n";
209							}
210
211							=head1 BUGS
212
213							We use the external programs C and C. These almost
214							certainly improve our performance on large databases (and those are
215							all we care about), but they make portability difficult.. Possibly
216							system independent alternatives should be written and used where
217							needed.
218
219							We should write out to the sort file with some encoding that gets rid
220							of new lines and then read back, de-coding that to feed it to cdbmake..
221
222							=cut
223