File Coverage

blib/lib/CDB_File.pm
Criterion Covered Total %
statement 20 20 100.0
branch 1 4 25.0
condition 4 13 30.7
subroutine 7 7 100.0
pod 0 1 0.0
total 32 45 71.1


line stmt bran cond sub pod time code
1             package CDB_File;
2              
3 4     4   280192 use strict;
  4         30  
  4         92  
4              
5 4     4   17 use XSLoader ();
  4         8  
  4         38  
6 4     4   26 use Exporter ();
  4         4  
  4         1102  
7              
8             our @ISA = qw(Exporter);
9             our $VERSION = '1.04';
10             our @EXPORT_OK = qw(create);
11              
12             =head1 NAME
13              
14             CDB_File - Perl extension for access to cdb databases
15              
16             =head1 SYNOPSIS
17              
18             use CDB_File;
19             $c = tie(%h, 'CDB_File', 'file.cdb') or die "tie failed: $!\n";
20              
21             # If accessing a utf8 stored CDB_File
22             $c = tie(%h, 'CDB_File', 'file.cdb', utf8 => 1) or die "tie failed: $!\n";
23              
24             $fh = $c->handle;
25             sysseek $fh, $c->datapos, 0 or die ...;
26             sysread $fh, $x, $c->datalen;
27             undef $c;
28             untie %h;
29              
30             $t = CDB_File->new('t.cdb', "t.$$") or die ...;
31             $t->insert('key', 'value');
32             $t->finish;
33              
34             CDB_File::create %t, $file, "$file.$$";
35              
36             or
37              
38             use CDB_File 'create';
39             create %t, $file, "$file.$$";
40              
41             # If you want to store the data in utf8 mode.
42             create %t, $file, "$file.$$", utf8 => 1;
43             =head1 DESCRIPTION
44              
45             B<CDB_File> is a module which provides a Perl interface to Dan
46             Bernstein's B<cdb> package:
47              
48             cdb is a fast, reliable, lightweight package for creating and
49             reading constant databases.
50              
51             =head2 Reading from a cdb
52              
53             After the C<tie> shown above, accesses to C<%h> will refer
54             to the B<cdb> file C<file.cdb>, as described in L<perlfunc/tie>.
55              
56             Low level access to the database is provided by the three methods
57             C<handle>, C<datapos>, and C<datalen>. To use them, you must remember
58             the C<CDB_File> object returned by the C<tie> call: C<$c> in the
59             example above. The C<datapos> and C<datalen> methods return the
60             file offset position and length respectively of the most recently
61             visited key (for example, via C<exists>).
62              
63             Beware that if you create an extra reference to the C<CDB_File> object
64             (like C<$c> in the example above) you must destroy it (with C<undef>)
65             before calling C<untie> on the hash. This ensures that the object's
66             C<DESTROY> method is called. Note that C<perl -w> will check this for
67             you; see L<perltie> for further details.
68              
69             =head2 Creating a cdb
70              
71             A B<cdb> file is created in three steps. First call C<new CDB_File
72             ($final, $tmp)>, where C<$final> is the name of the database to be
73             created, and C<$tmp> is the name of a temporary file which can be
74             atomically renamed to C<$final>. Secondly, call the C<insert> method
75             once for each (I<key>, I<value>) pair. Finally, call the C<finish>
76             method to complete the creation and renaming of the B<cdb> file.
77              
78             Alternatively, call the C<insert()> method with multiple key/value
79             pairs. This can be significantly faster because there is less crossing
80             over the bridge from perl to C code. One simple way to do this is to pass
81             in an entire hash, as in: C<< $cdbmaker->insert(%hash); >>.
82              
83             A simpler interface to B<cdb> file creation is provided by
84             C<CDB_File::create %t, $final, $tmp>. This creates a B<cdb> file named
85             C<$final> containing the contents of C<%t>. As before, C<$tmp> must
86             name a temporary file which can be atomically renamed to C<$final>.
87             C<CDB_File::create> may be imported.
88              
89             =head2 UTF8 support.
90              
91             When CDB_File was created in 1997 (prior even to Perl 5.6), Perl SVs
92             didn't really deal with UTF8. In order to properly store mixed
93             bytes and utf8 data in the file, we would normally need to store a bit
94             for each string which clarifies the encoding of the key / values.
95             This would be useful since Perl hash keys are downgraded to bytes when
96             possible so as to normalize the hash key access regardless of encoding.
97              
98             The CDB_File format is used outside of Perl and so must maintain file
99             format compatibility with those systems. As a result this module provides
100             a utf8 mode which must be enabled at database generation and then later
101             at read. Keys will always be stored as UTF8 strings which is the opposite
102             of how Perl stores the strings. This approach had to be taken to assure no
103             data corruption happened due to accidentally downgraded SVs before they
104             are stored or on retrieval.
105              
106             You can enable utf8 mode by passing C<< utf8 => 1 >> to B<new>, B<tie>,
107             or B<create>. All returned SVs while in this mode will be encoded in utf8.
108             This feature is not available below 5.14 due to lack of Perl macro support.
109              
110             B<NOTE:> read/write of databases not stored in utf8 mode will often be
111             incompatible with any non-ascii data.
112              
113             =head1 EXAMPLES
114              
115             These are all complete programs.
116              
117             1. Convert a Berkeley DB (B-tree) database to B<cdb> format.
118              
119             use CDB_File;
120             use DB_File;
121              
122             tie %h, DB_File, $ARGV[0], O_RDONLY, undef, $DB_BTREE or
123             die "$0: can't tie to $ARGV[0]: $!\n";
124              
125             CDB_File::create %h, $ARGV[1], "$ARGV[1].$$" or
126             die "$0: can't create cdb: $!\n";
127              
128             2. Convert a flat file to B<cdb> format. In this example, the flat
129             file consists of one key per line, separated by a colon from the value.
130             Blank lines and lines beginning with B<#> are skipped.
131              
132             use CDB_File;
133              
134             $cdb = new CDB_File("data.cdb", "data.$$") or
135             die "$0: new CDB_File failed: $!\n";
136             while (<>) {
137             next if /^$/ or /^#/;
138             chop;
139             ($k, $v) = split /:/, $_, 2;
140             if (defined $v) {
141             $cdb->insert($k, $v);
142             } else {
143             warn "bogus line: $_\n";
144             }
145             }
146             $cdb->finish or die "$0: CDB_File finish failed: $!\n";
147              
148             3. Perl version of B<cdbdump>.
149              
150             use CDB_File;
151              
152             tie %data, 'CDB_File', $ARGV[0] or
153             die "$0: can't tie to $ARGV[0]: $!\n";
154             while (($k, $v) = each %data) {
155             print '+', length $k, ',', length $v, ":$k->$v\n";
156             }
157             print "\n";
158              
159             4. For really enormous data values, you can use C<handle>, C<datapos>,
160             and C<datalen>, in combination with C<sysseek> and C<sysread>, to
161             avoid reading the values into memory. Here is the script F<bun-x.pl>,
162             which can extract uncompressed files and directories from a B<bun>
163             file.
164              
165             use CDB_File;
166              
167             sub unnetstrings {
168             my($netstrings) = @_;
169             my @result;
170             while ($netstrings =~ s/^([0-9]+)://) {
171             push @result, substr($netstrings, 0, $1, '');
172             $netstrings =~ s/^,//;
173             }
174             return @result;
175             }
176              
177             my $chunk = 8192;
178              
179             sub extract {
180             my($file, $t, $b) = @_;
181             my $head = $$b{"H$file"};
182             my ($code, $type) = $head =~ m/^([0-9]+)(.)/;
183             if ($type eq "/") {
184             mkdir $file, 0777;
185             } elsif ($type eq "_") {
186             my ($total, $now, $got, $x);
187             open OUT, ">$file" or die "open for output: $!\n";
188             exists $$b{"D$code"} or die "corrupt bun file\n";
189             my $fh = $t->handle;
190             sysseek $fh, $t->datapos, 0;
191             $total = $t->datalen;
192             while ($total) {
193             $now = ($total > $chunk) ? $chunk : $total;
194             $got = sysread $fh, $x, $now;
195             if (not $got) { die "read error\n"; }
196             $total -= $got;
197             print OUT $x;
198             }
199             close OUT;
200             } else {
201             print STDERR "warning: skipping unknown file type\n";
202             }
203             }
204              
205             die "usage\n" if @ARGV != 1;
206              
207             my (%b, $t);
208             $t = tie %b, 'CDB_File', $ARGV[0] or die "tie: $!\n";
209             map { extract $_, $t, \%b } unnetstrings $b{""};
210              
211             5. Although a B<cdb> file is constant, you can simulate updating it
212             in Perl. This is an expensive operation, as you have to create a
213             new database, and copy into it everything that's unchanged from the
214             old database. (As compensation, the update does not affect database
215             readers. The old database is available for them, till the moment the
216             new one is C<finish>ed.)
217              
218             use CDB_File;
219              
220             $file = 'data.cdb';
221             $new = new CDB_File($file, "$file.$$") or
222             die "$0: new CDB_File failed: $!\n";
223              
224             # Add the new values; remember which keys we've seen.
225             while (<>) {
226             chop;
227             ($k, $v) = split;
228             $new->insert($k, $v);
229             $seen{$k} = 1;
230             }
231              
232             # Add any old values that haven't been replaced.
233             tie %old, 'CDB_File', $file or die "$0: can't tie to $file: $!\n";
234             while (($k, $v) = each %old) {
235             $new->insert($k, $v) unless $seen{$k};
236             }
237              
238             $new->finish or die "$0: CDB_File finish failed: $!\n";
239              
240             =head1 REPEATED KEYS
241              
242             Most users can ignore this section.
243              
244             A B<cdb> file can contain repeated keys. If the C<insert> method is
245             called more than once with the same key during the creation of a B<cdb>
246             file, that key will be repeated.
247              
248             Here's an example.
249              
250             $cdb = new CDB_File ("$file.cdb", "$file.$$") or die ...;
251             $cdb->insert('cat', 'gato');
252             $cdb->insert('cat', 'chat');
253             $cdb->finish;
254              
255             Normally, any attempt to access a key retrieves the first value
256             stored under that key. This code snippet always prints B<gato>.
257              
258             $catref = tie %catalogue, CDB_File, "$file.cdb" or die ...;
259             print "$catalogue{cat}";
260              
261             However, all the usual ways of iterating over a hash---C<keys>,
262             C<values>, and C<each>---do the Right Thing, even in the presence of
263             repeated keys. This code snippet prints B<cat cat gato chat>.
264              
265             print join(' ', keys %catalogue, values %catalogue);
266              
267             And these two both print B<cat:gato cat:chat>, although the second is
268             more efficient.
269              
270             foreach $key (keys %catalogue) {
271             print "$key:$catalogue{$key} ";
272             }
273              
274             while (($key, $val) = each %catalogue) {
275             print "$key:$val ";
276             }
277              
278             The C<multi_get> method retrieves all the values associated with a key.
279             It returns a reference to an array containing all the values. This code
280             prints B<gato chat>.
281              
282             print "@{$catref->multi_get('cat')}";
283              
284             C<multi_get> always returns an array reference. If the key was not
285             found in the database, it will be a reference to an empty array. To
286             test whether the key was found, you must test the array, and not the
287             reference.
288              
289             $x = $catref->multi_get($key);
290             warn "$key not found\n" unless $x; # WRONG; message never printed
291             warn "$key not found\n" unless @$x; # Correct
292              
293             The C<fetch_all> method returns a hashref of all keys with the first
294             value in the cdb. This is useful for quickly loading a cdb file where
295             there is a 1:1 key mapping. In practice it proved to be about 400%
296             faster than iterating a tied hash.
297              
298             # Slow
299             my %copy = %tied_cdb;
300              
301             # Much Faster
302             my $copy_hashref = $catref->fetch_all();
303              
304             =head1 RETURN VALUES
305              
306             The routines C<tie>, C<new>, and C<finish> return B<undef> if the
307             attempted operation failed; C<$!> contains the reason for failure.
308              
309             =head1 DIAGNOSTICS
310              
311             The following fatal errors may occur. (See L<perlfunc/eval> if
312             you want to trap them.)
313              
314             =over 4
315              
316             =item Modification of a CDB_File attempted
317              
318             You attempted to modify a hash tied to a B<CDB_File>.
319              
320             =item CDB database too large
321              
322             You attempted to create a B<cdb> file larger than 4 gigabytes.
323              
324             =item [ Write to | Read of | Seek in ] CDB_File failed: <error string>
325              
326             If B<error string> is B<Protocol error>, you tried to C<use CDB_File> to
327             access something that isn't a B<cdb> file. Otherwise a serious OS level
328             problem occurred, for example, you have run out of disk space.
329              
330             =back
331              
332             =head1 PERFORMANCE
333              
334             Sometimes you need to get the most performance possible out of a
335             library. Rumour has it that perl's tie() interface is slow. In order
336             to get around that you can use CDB_File in an object oriented
337             fashion, rather than via tie().
338              
339             my $cdb = CDB_File->TIEHASH('/path/to/cdbfile.cdb');
340              
341             if ($cdb->EXISTS('key')) {
342             print "Key is: ", $cdb->FETCH('key'), "\n";
343             }
344              
345             For more information on the methods available on tied hashes see
346             L<perltie>.
347              
348             =head1 THE ALGORITHM
349              
350             This algorithm is described at L<http://cr.yp.to/cdb/cdb.txt>. It is
351             small enough that it is included inline in the event that the
352             internet loses the page:
353              
354             =head2 A structure for constant databases
355              
356             Copyright (c) 1996 D. J. Bernstein, <djb@pobox.com>
357              
358             A cdb is an associative array: it maps strings ("keys") to strings
359             ("data").
360              
361             A cdb contains 256 pointers to linearly probed open hash tables. The
362             hash tables contain pointers to (key,data) pairs. A cdb is stored in
363             a single file on disk:
364              
365             +----------------+---------+-------+-------+-----+---------+
366             | p0 p1 ... p255 | records | hash0 | hash1 | ... | hash255 |
367             +----------------+---------+-------+-------+-----+---------+
368              
369             Each of the 256 initial pointers states a position and a length. The
370             position is the starting byte position of the hash table. The length
371             is the number of slots in the hash table.
372              
373             Records are stored sequentially, without special alignment. A record
374             states a key length, a data length, the key, and the data.
375              
376             Each hash table slot states a hash value and a byte position. If the
377             byte position is 0, the slot is empty. Otherwise, the slot points to
378             a record whose key has that hash value.
379              
380             Positions, lengths, and hash values are 32-bit quantities, stored in
381             little-endian form in 4 bytes. Thus a cdb must fit into 4 gigabytes.
382              
383             A record is located as follows. Compute the hash value of the key in
384             the record. The hash value modulo 256 is the number of a hash table.
385             The hash value divided by 256, modulo the length of that table, is a
386             slot number. Probe that slot, the next higher slot, and so on, until
387             you find the record or run into an empty slot.
388              
389             The cdb hash function is C<< h = ((h << 5) + h) ^ c >>, with a starting
390             hash of 5381.
391              
392              
393             =head1 BUGS
394              
395             The C<create()> interface could be done with C<TIEHASH>.
396              
397             =head1 SEE ALSO
398              
399             cdb(3)
400              
401             =head1 AUTHOR
402              
403             Tim Goodwin, <tjg@star.le.ac.uk>. B<CDB_File> began on 1997-01-08.
404              
405             Work provided through 2008 by Matt Sergeant, <matt@sergeant.org>
406              
407             Now maintained by Todd Rinaldo, <toddr@cpan.org>
408              
409             =cut
410              
411             XSLoader::load( 'CDB_File', $VERSION );
412              
413             sub CLEAR {
414 2     2   11 require Carp;
415 2         237 Carp::croak("Modification of a CDB_File attempted");
416             }
417              
418             sub DELETE {
419 1     1   527 goto &CLEAR;
420             }
421              
422             sub STORE {
423 1     1   6676 goto &CLEAR;
424             }
425              
426             # Must be preloaded for the prototype.
427              
428             sub create(\%$$;$$) {
429 4     4 0 31060 my ( $RHdata, $fn, $fntemp, $option_key, $is_utf8 ) = @_;
430              
431 4 0 33     19 die("utf8 CDB_Files are not supported below Perl 5.14") if $option_key && $option_key eq 'utf8' && $is_utf8 && $] < "5.014";
      33        
      0        
432              
433 4 50 50     248 my $cdb = CDB_File->new( $fn, $fntemp, $option_key || '', $is_utf8 || 0 ) or return undef;
      50        
434             {
435 4         14 $cdb->insert(%$RHdata);
  4         68  
436             }
437 4         41183 $cdb->finish;
438 4         54 return 1;
439             }
440              
441             1;