File Coverage

blib/lib/CDB_File.pm

Criterion	Covered	Total	%
statement	19	19	100.0
branch	1	2	50.0
condition			n/a
subroutine	7	7	100.0
pod	0	1	0.0
total	27	29	93.1

line	stmt	bran	sub	pod	time	code
1						package CDB_File;
2
3	4		4		276324	use strict;
	4				35
	4				116
4
5	4		4		21	use XSLoader ();
	4				10
	4				50
6	4		4		16	use Exporter ();
	4				6
	4				1073
7
8						our @ISA = qw(Exporter);
9						our $VERSION = '1.03';
10						our @EXPORT_OK = qw(create);
11
12						=head1 NAME
13
14						CDB_File - Perl extension for access to cdb databases
15
16						=head1 SYNOPSIS
17
18						use CDB_File;
19						$c = tie %h, 'CDB_File', 'file.cdb' or die "tie failed: $!\n";
20
21						$fh = $c->handle;
22						sysseek $fh, $c->datapos, 0 or die ...;
23						sysread $fh, $x, $c->datalen;
24						undef $c;
25						untie %h;
26
27						$t = CDB_File->new('t.cdb', "t.$$") or die ...;
28						$t->insert('key', 'value');
29						$t->finish;
30
31						CDB_File::create %t, $file, "$file.$$";
32
33						or
34
35						use CDB_File 'create';
36						create %t, $file, "$file.$$";
37
38						=head1 DESCRIPTION
39
40						B<CDB_File> is a module which provides a Perl interface to Dan
41						Bernstein's B<cdb> package:
42
43						cdb is a fast, reliable, lightweight package for creating and
44						reading constant databases.
45
46						=head2 Reading from a cdb
47
48						After the C<tie> shown above, accesses to C<%h> will refer
49						to the B<cdb> file C<file.cdb>, as described in L<perlfunc/tie>.
50
51						Low level access to the database is provided by the three methods
52						C<handle>, C<datapos>, and C<datalen>. To use them, you must remember
53						the C<CDB_File> object returned by the C<tie> call: C<$c> in the
54						example above. The C<datapos> and C<datalen> methods return the
55						file offset position and length respectively of the most recently
56						visited key (for example, via C<exists>).
57
58						Beware that if you create an extra reference to the C<CDB_File> object
59						(like C<$c> in the example above) you must destroy it (with C<undef>)
60						before calling C<untie> on the hash. This ensures that the object's
61						C<DESTROY> method is called. Note that C<perl -w> will check this for
62						you; see L<perltie> for further details.
63
64						=head2 Creating a cdb
65
66						A B<cdb> file is created in three steps. First call C<new CDB_File
67						($final, $tmp)>, where C<$final> is the name of the database to be
68						created, and C<$tmp> is the name of a temporary file which can be
69						atomically renamed to C<$final>. Secondly, call the C<insert> method
70						once for each (I<key>, I<value>) pair. Finally, call the C<finish>
71						method to complete the creation and renaming of the B<cdb> file.
72
73						Alternatively, call the C<insert()> method with multiple key/value
74						pairs. This can be significantly faster because there is less crossing
75						over the bridge from perl to C code. One simple way to do this is to pass
76						in an entire hash, as in: C<< $cdbmaker->insert(%hash); >>.
77
78						A simpler interface to B<cdb> file creation is provided by
79						C<CDB_File::create %t, $final, $tmp>. This creates a B<cdb> file named
80						C<$final> containing the contents of C<%t>. As before, C<$tmp> must
81						name a temporary file which can be atomically renamed to C<$final>.
82						C<CDB_File::create> may be imported.
83
84						=head1 EXAMPLES
85
86						These are all complete programs.
87
88						1. Convert a Berkeley DB (B-tree) database to B<cdb> format.
89
90						use CDB_File;
91						use DB_File;
92
93						tie %h, DB_File, $ARGV[0], O_RDONLY, undef, $DB_BTREE or
94						die "$0: can't tie to $ARGV[0]: $!\n";
95
96						CDB_File::create %h, $ARGV[1], "$ARGV[1].$$" or
97						die "$0: can't create cdb: $!\n";
98
99						2. Convert a flat file to B<cdb> format. In this example, the flat
100						file consists of one key per line, separated by a colon from the value.
101						Blank lines and lines beginning with B<#> are skipped.
102
103						use CDB_File;
104
105						$cdb = new CDB_File("data.cdb", "data.$$") or
106						die "$0: new CDB_File failed: $!\n";
107						while (<>) {
108						next if /^$/ or /^#/;
109						chop;
110						($k, $v) = split /:/, $_, 2;
111						if (defined $v) {
112						$cdb->insert($k, $v);
113						} else {
114						warn "bogus line: $_\n";
115						}
116						}
117						$cdb->finish or die "$0: CDB_File finish failed: $!\n";
118
119						3. Perl version of B<cdbdump>.
120
121						use CDB_File;
122
123						tie %data, 'CDB_File', $ARGV[0] or
124						die "$0: can't tie to $ARGV[0]: $!\n";
125						while (($k, $v) = each %data) {
126						print '+', length $k, ',', length $v, ":$k->$v\n";
127						}
128						print "\n";
129
130						4. For really enormous data values, you can use C<handle>, C<datapos>,
131						and C<datalen>, in combination with C<sysseek> and C<sysread>, to
132						avoid reading the values into memory. Here is the script F<bun-x.pl>,
133						which can extract uncompressed files and directories from a B<bun>
134						file.
135
136						use CDB_File;
137
138						sub unnetstrings {
139						my($netstrings) = @_;
140						my @result;
141						while ($netstrings =~ s/^([0-9]+)://) {
142						push @result, substr($netstrings, 0, $1, '');
143						$netstrings =~ s/^,//;
144						}
145						return @result;
146						}
147
148						my $chunk = 8192;
149
150						sub extract {
151						my($file, $t, $b) = @_;
152						my $head = $$b{"H$file"};
153						my ($code, $type) = $head =~ m/^([0-9]+)(.)/;
154						if ($type eq "/") {
155						mkdir $file, 0777;
156						} elsif ($type eq "_") {
157						my ($total, $now, $got, $x);
158						open OUT, ">$file" or die "open for output: $!\n";
159						exists $$b{"D$code"} or die "corrupt bun file\n";
160						my $fh = $t->handle;
161						sysseek $fh, $t->datapos, 0;
162						$total = $t->datalen;
163						while ($total) {
164						$now = ($total > $chunk) ? $chunk : $total;
165						$got = sysread $fh, $x, $now;
166						if (not $got) { die "read error\n"; }
167						$total -= $got;
168						print OUT $x;
169						}
170						close OUT;
171						} else {
172						print STDERR "warning: skipping unknown file type\n";
173						}
174						}
175
176						die "usage\n" if @ARGV != 1;
177
178						my (%b, $t);
179						$t = tie %b, 'CDB_File', $ARGV[0] or die "tie: $!\n";
180						map { extract $_, $t, \%b } unnetstrings $b{""};
181
182						5. Although a B<cdb> file is constant, you can simulate updating it
183						in Perl. This is an expensive operation, as you have to create a
184						new database, and copy into it everything that's unchanged from the
185						old database. (As compensation, the update does not affect database
186						readers. The old database is available for them, till the moment the
187						new one is C<finish>ed.)
188
189						use CDB_File;
190
191						$file = 'data.cdb';
192						$new = new CDB_File($file, "$file.$$") or
193						die "$0: new CDB_File failed: $!\n";
194
195						# Add the new values; remember which keys we've seen.
196						while (<>) {
197						chop;
198						($k, $v) = split;
199						$new->insert($k, $v);
200						$seen{$k} = 1;
201						}
202
203						# Add any old values that haven't been replaced.
204						tie %old, 'CDB_File', $file or die "$0: can't tie to $file: $!\n";
205						while (($k, $v) = each %old) {
206						$new->insert($k, $v) unless $seen{$k};
207						}
208
209						$new->finish or die "$0: CDB_File finish failed: $!\n";
210
211						=head1 REPEATED KEYS
212
213						Most users can ignore this section.
214
215						A B<cdb> file can contain repeated keys. If the C<insert> method is
216						called more than once with the same key during the creation of a B<cdb>
217						file, that key will be repeated.
218
219						Here's an example.
220
221						$cdb = new CDB_File ("$file.cdb", "$file.$$") or die ...;
222						$cdb->insert('cat', 'gato');
223						$cdb->insert('cat', 'chat');
224						$cdb->finish;
225
226						Normally, any attempt to access a key retrieves the first value
227						stored under that key. This code snippet always prints B<gato>.
228
229						$catref = tie %catalogue, CDB_File, "$file.cdb" or die ...;
230						print "$catalogue{cat}";
231
232						However, all the usual ways of iterating over a hash---C<keys>,
233						C<values>, and C<each>---do the Right Thing, even in the presence of
234						repeated keys. This code snippet prints B<cat cat gato chat>.
235
236						print join(' ', keys %catalogue, values %catalogue);
237
238						And these two both print B<cat:gato cat:chat>, although the second is
239						more efficient.
240
241						foreach $key (keys %catalogue) {
242						print "$key:$catalogue{$key} ";
243						}
244
245						while (($key, $val) = each %catalogue) {
246						print "$key:$val ";
247						}
248
249						The C<multi_get> method retrieves all the values associated with a key.
250						It returns a reference to an array containing all the values. This code
251						prints B<gato chat>.
252
253						print "@{$catref->multi_get('cat')}";
254
255						C<multi_get> always returns an array reference. If the key was not
256						found in the database, it will be a reference to an empty array. To
257						test whether the key was found, you must test the array, and not the
258						reference.
259
260						$x = $catref->multi_get($key);
261						warn "$key not found\n" unless $x; # WRONG; message never printed
262						warn "$key not found\n" unless @$x; # Correct
263
264						The C<fetch_all> method returns a hashref of all keys with the first
265						value in the cdb. This is useful for quickly loading a cdb file where
266						there is a 1:1 key mapping. In practice it proved to be about 400%
267						faster than iterating a tied hash.
268
269						# Slow
270						my %copy = %tied_cdb;
271
272						# Much Faster
273						my $copy_hashref = $catref->fetch_all();
274
275						=head1 RETURN VALUES
276
277						The routines C<tie>, C<new>, and C<finish> return B<undef> if the
278						attempted operation failed; C<$!> contains the reason for failure.
279
280						=head1 DIAGNOSTICS
281
282						The following fatal errors may occur. (See L<perlfunc/eval> if
283						you want to trap them.)
284
285						=over 4
286
287						=item Modification of a CDB_File attempted
288
289						You attempted to modify a hash tied to a B<CDB_File>.
290
291						=item CDB database too large
292
293						You attempted to create a B<cdb> file larger than 4 gigabytes.
294
295						=item [ Write to | Read of | Seek in ] CDB_File failed: <error string>
296
297						If B<error string> is B<Protocol error>, you tried to C<use CDB_File> to
298						access something that isn't a B<cdb> file. Otherwise a serious OS level
299						problem occurred, for example, you have run out of disk space.
300
301						=back
302
303						=head1 PERFORMANCE
304
305						Sometimes you need to get the most performance possible out of a
306						library. Rumour has it that perl's tie() interface is slow. In order
307						to get around that you can use CDB_File in an object oriented
308						fashion, rather than via tie().
309
310						my $cdb = CDB_File->TIEHASH('/path/to/cdbfile.cdb');
311
312						if ($cdb->EXISTS('key')) {
313						print "Key is: ", $cdb->FETCH('key'), "\n";
314						}
315
316						For more information on the methods available on tied hashes see
317						L<perltie>.
318
319						=head1 THE ALGORITHM
320
321						This algorithm is described at L<http://cr.yp.to/cdb/cdb.txt>. It is
322						small enough that it is included inline in the event that the
323						internet loses the page:
324
325						=head2 A structure for constant databases
326
327						Copyright (c) 1996 D. J. Bernstein, L
328
329						A cdb is an associative array: it maps strings ("keys") to strings
330						("data").
331
332						A cdb contains 256 pointers to linearly probed open hash tables. The
333						hash tables contain pointers to (key,data) pairs. A cdb is stored in
334						a single file on disk:
335
336						+----------------+---------+-------+-------+-----+---------+
337						| p0 p1 ... p255 | records | hash0 | hash1 | ... | hash255 |
338						+----------------+---------+-------+-------+-----+---------+
339
340						Each of the 256 initial pointers states a position and a length. The
341						position is the starting byte position of the hash table. The length
342						is the number of slots in the hash table.
343
344						Records are stored sequentially, without special alignment. A record
345						states a key length, a data length, the key, and the data.
346
347						Each hash table slot states a hash value and a byte position. If the
348						byte position is 0, the slot is empty. Otherwise, the slot points to
349						a record whose key has that hash value.
350
351						Positions, lengths, and hash values are 32-bit quantities, stored in
352						little-endian form in 4 bytes. Thus a cdb must fit into 4 gigabytes.
353
354						A record is located as follows. Compute the hash value of the key in
355						the record. The hash value modulo 256 is the number of a hash table.
356						The hash value divided by 256, modulo the length of that table, is a
357						slot number. Probe that slot, the next higher slot, and so on, until
358						you find the record or run into an empty slot.
359
360						The cdb hash function is C<h = ((h << 5) + h) ^ c>, with a starting
361						hash of 5381.
362
363
364						=head1 BUGS
365
366						The C<create()> interface could be done with C<TIEHASH>.
367
368						=head1 SEE ALSO
369
370						cdb(3)
371
372						=head1 AUTHOR
373
374						Tim Goodwin. B<CDB_File> began on 1997-01-08.
375
376						Work provided through 2008 by Matt Sergeant,
377
378						Now maintained by Todd Rinaldo,
379
380						=cut
381
382						XSLoader::load( 'CDB_File', $VERSION );
383
384						sub CLEAR {
385	2		2		14	require Carp;
386	2				288	Carp::croak("Modification of a CDB_File attempted");
387						}
388
389						sub DELETE {
390	1		1		627	&CLEAR;
391						}
392
393						sub STORE {
394	1		1		7283	&CLEAR;
395						}
396
397						# Must be preloaded for the prototype.
398
399						sub create(\%$$) {
400	4		4	0	34485	my ( $RHdata, $fn, $fntemp ) = @_;
401
402	4	50			271	my $cdb = CDB_File->new( $fn, $fntemp ) or return undef;
403	4				17	my ( $k, $v );
404	4				84	$cdb->insert(%$RHdata);
405	4				18003	$cdb->finish;
406	4				62	return 1;
407						}
408
409						1;