line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package CDB_File::Generator; |
2
|
|
|
|
|
|
|
$REVISION=q$Revision: 1.13 $ ; |
3
|
1
|
|
|
1
|
|
746
|
use vars qw($VERSION); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
71
|
|
4
|
|
|
|
|
|
|
$VERSION='0.030'; |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
CDB_File::Generator - generate massive sorted CDB files simply. |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use CDB_File::Generator; |
13
|
|
|
|
|
|
|
$gen = new CDB::Generator "my.cdb"; |
14
|
|
|
|
|
|
|
$gen->("Fred", "Martha"); |
15
|
|
|
|
|
|
|
$gen->("Fred", "Olivia"); |
16
|
|
|
|
|
|
|
$gen->("Fred", "Jenny"); |
17
|
|
|
|
|
|
|
$gen->("Roger", "Joe"); |
18
|
|
|
|
|
|
|
$gen->("Roger", "Jenny"); |
19
|
|
|
|
|
|
|
$gen = undef; |
20
|
|
|
|
|
|
|
use CDB_File; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
This is a class which makes generating sorted large (much bigger than |
26
|
|
|
|
|
|
|
memory, but the speed will depend on the efficiency of your sort |
27
|
|
|
|
|
|
|
command. If you haven't got one, for example, it won't work at all.) |
28
|
|
|
|
|
|
|
CDB files on the fly very easy |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=cut |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
#this lets us have lots of unique temporary files. |
33
|
|
|
|
|
|
|
|
34
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
58
|
|
35
|
1
|
|
|
1
|
|
994
|
use IO::File; |
|
1
|
|
|
|
|
11625
|
|
|
1
|
|
|
|
|
164
|
|
36
|
1
|
|
|
1
|
|
9
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
56
|
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
BEGIN { |
39
|
1
|
|
|
1
|
|
1291
|
my $tempfile_no=0; |
40
|
2
|
|
|
2
|
0
|
9
|
sub next_tmp_file () { $tempfile_no++ }; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 METHODS |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head2 Generator::new $cdbfile [$cdbmaketemp [{$tmpname [$sorttmpname] | $tmpdir}]] |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
The new function creates a generator for a given filename, optionally |
48
|
|
|
|
|
|
|
specifying where it sould put it's temporary files. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my $deftemp="/tmp/cdb_generator_tmp.$$."; |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
sub new ($$@) { |
55
|
1
|
|
|
1
|
1
|
42
|
my $class=shift; |
56
|
1
|
|
|
|
|
3
|
my $self = bless {}, $class; |
57
|
1
|
|
|
|
|
3
|
my $cdbfilename = shift; |
58
|
1
|
|
|
|
|
7
|
$self->{"cdbfile"} = $cdbfilename; |
59
|
1
|
|
|
|
|
7
|
$self->{"tmpoutfile"} = $deftemp . next_tmp_file(); |
60
|
1
|
|
|
|
|
4
|
$self->{"tmpsortfile"} = $deftemp . next_tmp_file(); |
61
|
1
|
|
|
|
|
2
|
my $tmpmake = $cdbfilename; |
62
|
1
|
50
|
|
|
|
10
|
$tmpmake =~ s/.cdb$/.tmp/ or $tmpmake = $tmpmake . ".tmp"; |
63
|
1
|
50
|
|
|
|
13
|
Carp::croak "file $tmpmake for cdb temfile exists" if -e $tmpmake; |
64
|
1
|
|
|
|
|
3
|
$self->{"tmpmakefile"} = $tmpmake; |
65
|
1
|
50
|
|
|
|
11
|
my $fh = new IO::File $self->{"tmpoutfile"}, '>' |
66
|
|
|
|
|
|
|
or die "couldn't create output file"; |
67
|
1
|
|
|
|
|
271
|
$self->{"fh"} = $fh; |
68
|
1
|
|
|
|
|
4
|
$self->{"added"} = 0; |
69
|
1
|
|
|
|
|
3
|
return $self; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 $gen->add($key, $value) |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Adds a value to the CDB being created |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub add ($$$) { |
79
|
5
|
|
|
5
|
1
|
48
|
my ($self, $key, $value) = @_; |
80
|
5
|
|
|
|
|
9
|
my $fh = $self->{"fh"}; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
#change newlines so sort can sort everything as lines. |
83
|
|
|
|
|
|
|
#change tabs so we can use them as a separator |
84
|
5
|
|
|
|
|
7
|
$key =~ s,\\,\\\\,g; #now we have all \s in even numbered groups |
85
|
5
|
|
|
|
|
7
|
$key =~ s,\n,\\n,g; #an odd \ followed by a newline is a nl |
86
|
5
|
|
|
|
|
6
|
$key =~ s,\t,\\t,g; #an odd \ followed by a tab is a nl |
87
|
5
|
|
|
|
|
6
|
$value =~ s,\\,\\\\,g; #.... |
88
|
5
|
|
|
|
|
7
|
$value =~ s,\n,\\n,g; |
89
|
5
|
|
|
|
|
7
|
$value =~ s,\t,\\t,g; # .... |
90
|
|
|
|
|
|
|
|
91
|
5
|
|
|
|
|
27
|
print $fh $key, "\t", $value, "\n"; |
92
|
|
|
|
|
|
|
|
93
|
5
|
|
|
|
|
14
|
$self->{"added"} ++; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=head2 $gen->DESTROY |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
This is not normally called by the user, but rather by the completion |
99
|
|
|
|
|
|
|
of the cdbfile being writen out and that block of the program being |
100
|
|
|
|
|
|
|
exited or by the program completing. When it us run, it calls the |
101
|
|
|
|
|
|
|
finish method which ends the CDB creation. See below. |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=cut |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub DESTROY ($) { |
106
|
0
|
|
|
0
|
|
0
|
my $self=shift; |
107
|
0
|
0
|
0
|
|
|
0
|
$self->{"added"} && $self->finish() unless $self->{"abort"}; |
108
|
0
|
|
|
|
|
0
|
foreach my $del ( "tmpoutfile", "tmpsortfile", "tmpmakefile") { |
109
|
0
|
|
|
|
|
0
|
unlink $self->{$del}; |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 finish |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Finish ends of the cdb creation. First it closes the output temporary |
116
|
|
|
|
|
|
|
file, then it sorts it to another file and finally it calls C |
117
|
|
|
|
|
|
|
to complete the creation job. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
In the current implementation this uses C and deletes repeats of |
120
|
|
|
|
|
|
|
the same key with the same value. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
In order to increase database portability, by default all sorting is |
123
|
|
|
|
|
|
|
done in the 'C' locale, even if the current program is working in |
124
|
|
|
|
|
|
|
another locale. This is "the right thing" in many cases. Where you |
125
|
|
|
|
|
|
|
are dealing with real word keys it won't be the right thing. In this case, use the locale function to set the locale. |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=cut |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub locale ($) { |
130
|
0
|
|
|
0
|
0
|
0
|
my $self=shift; |
131
|
0
|
|
|
|
|
0
|
my $locale=shift; |
132
|
0
|
|
|
|
|
0
|
$self->{locale}=$locale; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub finish ($) { |
136
|
1
|
|
|
1
|
1
|
9
|
my $self=shift; |
137
|
1
|
|
|
|
|
56
|
close $self->{"fh"}; |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
{ |
140
|
1
|
|
|
|
|
3
|
local $ENV{LC_ALL}; |
|
1
|
|
|
|
|
7
|
|
141
|
1
|
50
|
|
|
|
6
|
if ($self->{locale}) { |
142
|
0
|
|
|
|
|
0
|
$ENV{LC_ALL}=$self->{locale}; |
143
|
|
|
|
|
|
|
} else { |
144
|
1
|
|
|
|
|
8
|
$ENV{LC_ALL}='C'; |
145
|
|
|
|
|
|
|
} |
146
|
1
|
|
|
|
|
15392
|
system 'sort' , '-u', '-o' ,$self->{"tmpsortfile"} , $self->{"tmpoutfile"}; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
1
|
50
|
|
|
|
67
|
my $fh = new IO::File $self->{"tmpsortfile"} |
150
|
|
|
|
|
|
|
or die "couldn't open sorted output file"; |
151
|
|
|
|
|
|
|
|
152
|
1
|
|
|
|
|
372
|
my $cdbmakeout = new IO::File ( '|cdbmake ' . $self->{"cdbfile"} |
153
|
|
|
|
|
|
|
. ' ' . $self->{"tmpmakefile"} ); |
154
|
|
|
|
|
|
|
|
155
|
1
|
|
|
|
|
10887
|
while (<$fh>) { |
156
|
1
|
|
|
|
|
35
|
my ($key, $value) = m/^(.*)\t(.*)$/; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
#the \G s allow for multiple new lines in a row |
159
|
|
|
|
|
|
|
#odd numbered slash with t is tab |
160
|
1
|
|
|
|
|
8
|
$key =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\t,$1\t,g; |
161
|
|
|
|
|
|
|
#odd numbered slash with t is tab |
162
|
1
|
|
|
|
|
7
|
$key =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\n,$1\n,g; |
163
|
|
|
|
|
|
|
#pairs of slashes match a single slash |
164
|
1
|
|
|
|
|
2
|
$key =~ s,\\\\,\\,g; |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
#same for value.... |
167
|
1
|
|
|
|
|
8
|
$value =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\t,$1\t,g; |
168
|
1
|
|
|
|
|
7
|
$value =~ s,((?:\G|^|[^\\])(?:\\\\)*)\\n,$1\n,g; |
169
|
1
|
|
|
|
|
7
|
$value =~ s,\\\\,\\,g; |
170
|
|
|
|
|
|
|
|
171
|
1
|
|
|
|
|
216
|
print $cdbmakeout &gen_cdb_input($key, $value); |
172
|
|
|
|
|
|
|
} |
173
|
0
|
|
|
|
|
|
print $cdbmakeout "\n"; |
174
|
0
|
|
|
|
|
|
$cdbmakeout->close; |
175
|
0
|
|
|
|
|
|
$fh->close; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# $self->{"abort"} = 1; |
178
|
|
|
|
|
|
|
#FIXME return codes etc.. |
179
|
0
|
0
|
|
|
|
|
unlink $self->{"tmpsortfile"} , $self->{"tmpoutfile"} |
180
|
|
|
|
|
|
|
or warn "trouble deleting temp files " |
181
|
|
|
|
|
|
|
. $self->{"tmpsortfile"} . $self->{"tmpoutfile"}; |
182
|
0
|
|
|
|
|
|
$self->{"added"} = 0; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head2 $gen->abort |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
If you decide not to create the CDB file you were creating, you have |
188
|
|
|
|
|
|
|
to call this method. Otherwise, it will be created as your program |
189
|
|
|
|
|
|
|
exits (or possibly earlier) |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=cut |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub abort ($) { |
194
|
0
|
|
|
0
|
1
|
|
shift->{"abort"} = 1 |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head2 gen_cdb_input($key,$value) |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
This is a little utility function which formats a cdbmake input line. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=cut |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
sub gen_cdb_input ($$) { |
204
|
0
|
|
|
0
|
1
|
|
my $key=shift; |
205
|
0
|
|
|
|
|
|
my $value=shift; |
206
|
|
|
|
|
|
|
|
207
|
0
|
|
|
|
|
|
return "+" . length($key) . "," . length($value) . ":" |
208
|
|
|
|
|
|
|
. $key . "->" . $value . "\n"; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head1 BUGS |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
We use the external programs C and C. These almost |
214
|
|
|
|
|
|
|
certainly improve our performance on large databases (and those are |
215
|
|
|
|
|
|
|
all we care about), but they make portability difficult.. Possibly |
216
|
|
|
|
|
|
|
system independent alternatives should be written and used where |
217
|
|
|
|
|
|
|
needed. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
We should write out to the sort file with some encoding that gets rid |
220
|
|
|
|
|
|
|
of new lines and then read back, de-coding that to feed it to cdbmake.. |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=cut |
223
|
|
|
|
|
|
|
|