| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WARC::Index::File::CDX::Builder;			# -*- CPerl -*- | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 74537 | use strict; | 
|  | 1 |  |  |  |  | 14 |  | 
|  | 1 |  |  |  |  | 40 |  | 
| 4 | 1 |  |  | 1 |  | 6 | use warnings; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 72 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | require WARC::Index::Builder; | 
| 7 |  |  |  |  |  |  | our @ISA = qw(WARC::Index::Builder); | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | require WARC; *WARC::Index::File::CDX::Builder::VERSION = \$WARC::VERSION; | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 1 |  |  | 1 |  | 604 | use URI; | 
|  | 1 |  |  |  |  | 7057 |  | 
|  | 1 |  |  |  |  | 32 |  | 
| 12 | 1 |  |  | 1 |  | 7 | use Carp; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 69 |  | 
| 13 | 1 |  |  | 1 |  | 6 | use Cwd qw//; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 17 |  | 
| 14 | 1 |  |  | 1 |  | 5 | use File::Spec; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 25 |  | 
| 15 | 1 |  |  | 1 |  | 4 | use Fcntl qw/:seek/; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 1416 |  | 
| 16 |  |  |  |  |  |  | require File::Spec::Unix; | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | our %Record_Field_Handlers = | 
| 19 |  |  |  |  |  |  | # each handler is called with WARC::Record and index builder objects and | 
| 20 |  |  |  |  |  |  | # returns the text value for that field or undef, which is written as '-' | 
| 21 |  |  |  |  |  |  | (a => sub { (shift)->field('WARC-Target-URI') }, | 
| 22 |  |  |  |  |  |  | k => sub { (shift)->field('WARC-Payload-Digest') }, | 
| 23 |  |  |  |  |  |  | u => sub { (shift)->id }, | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | b => sub { my $date = (shift)->date; | 
| 26 |  |  |  |  |  |  | $date =~ y/-T:Z//d; substr $date, 0, 14 }, | 
| 27 |  |  |  |  |  |  | N => sub { my $uri = (shift)->field('WARC-Target-URI'); | 
| 28 |  |  |  |  |  |  | return undef unless $uri; | 
| 29 |  |  |  |  |  |  | $uri = new URI ($uri); | 
| 30 |  |  |  |  |  |  | return undef unless $uri->can('host') && $uri->can('path'); | 
| 31 |  |  |  |  |  |  | my $surt_host = join ',', reverse split /[.]/, $uri->host; | 
| 32 |  |  |  |  |  |  | return $surt_host.')'.$uri->path | 
| 33 |  |  |  |  |  |  | }, | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | g => sub { my $record = shift; my $builder = shift; | 
| 36 |  |  |  |  |  |  | return $builder->_get_relvolname($record->volume) }, | 
| 37 |  |  |  |  |  |  | S => sub { (shift)->{sl_packed_size} }, | 
| 38 |  |  |  |  |  |  | v => sub { my $record = shift; | 
| 39 |  |  |  |  |  |  | return undef if $record->volume->filename !~ m/[.]warc\z/; | 
| 40 |  |  |  |  |  |  | return $record->offset }, | 
| 41 |  |  |  |  |  |  | V => sub { my $record = shift; | 
| 42 |  |  |  |  |  |  | return undef if $record->volume->filename =~ m/[.]warc\z/; | 
| 43 |  |  |  |  |  |  | return $record->offset }, | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | # HTTP responses only | 
| 46 |  |  |  |  |  |  | m => sub { my $response = (shift)->replay; | 
| 47 |  |  |  |  |  |  | return undef unless UNIVERSAL::can($response, 'headers'); | 
| 48 |  |  |  |  |  |  | $response->headers->content_type }, | 
| 49 |  |  |  |  |  |  | r => sub { my $response = (shift)->replay; | 
| 50 |  |  |  |  |  |  | return undef unless UNIVERSAL::can($response, 'headers'); | 
| 51 |  |  |  |  |  |  | $response->headers->header('Location') }, | 
| 52 |  |  |  |  |  |  | s => sub { my $response = (shift)->replay; | 
| 53 |  |  |  |  |  |  | return undef unless UNIVERSAL::can($response, 'code'); | 
| 54 |  |  |  |  |  |  | $response->code }, | 
| 55 |  |  |  |  |  |  | ); | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | # This implementation uses a hash as the underlying structure. | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | #  Keys defined by this class: | 
| 60 |  |  |  |  |  |  | # | 
| 61 |  |  |  |  |  |  | #   file_name | 
| 62 |  |  |  |  |  |  | #	Name of CDX file where records will be appended. | 
| 63 |  |  |  |  |  |  | #   file | 
| 64 |  |  |  |  |  |  | #	Handle opened for writing/appending on that file. | 
| 65 |  |  |  |  |  |  | #   fields | 
| 66 |  |  |  |  |  |  | #	CDX field letters to be written. | 
| 67 |  |  |  |  |  |  | #   fieldgen | 
| 68 |  |  |  |  |  |  | #	Array of handlers to call to produce field values. | 
| 69 |  |  |  |  |  |  | #   delimiter | 
| 70 |  |  |  |  |  |  | #	Field delimiter used in CDX file. Default is space; cannot be set | 
| 71 |  |  |  |  |  |  | #	 as an option but can be read from an existing CDX file header. | 
| 72 |  |  |  |  |  |  | #   volnames | 
| 73 |  |  |  |  |  |  | #	Hash mapping volume names to relative paths from the CDX file. | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | sub _get_relvolname { | 
| 76 | 37 |  |  | 37 |  | 67 | my $self = shift; | 
| 77 | 37 |  |  |  |  | 81 | my $name = (shift)->filename; | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 37 | 100 |  |  |  | 156 | return $self->{volnames}->{$name} if defined $self->{volnames}{$name}; | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | # otherwise ... | 
| 82 | 4 |  |  |  |  | 75 | my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{file_name}); | 
| 83 | 4 |  |  |  |  | 354 | my $relname = File::Spec->abs2rel | 
| 84 |  |  |  |  |  |  | ($name, File::Spec->catpath($vol, $cdx_dirs, undef)); | 
| 85 | 4 |  |  |  |  | 40 | my ($rvol, $rel_dirs, $rel_file) = File::Spec->splitpath($relname); | 
| 86 | 4 |  |  |  |  | 18 | my @rel_dirs = File::Spec->splitdir($rel_dirs); | 
| 87 | 4 |  |  |  |  | 30 | my $warcfilename = File::Spec::Unix->catpath | 
| 88 |  |  |  |  |  |  | ($rvol, File::Spec::Unix->catdir(@rel_dirs), $rel_file); | 
| 89 | 4 |  |  |  |  | 23 | return $self->{volnames}{$name} = $warcfilename; | 
| 90 |  |  |  |  |  |  | } | 
| 91 |  |  |  |  |  |  |  | 
| 92 |  |  |  |  |  |  | sub _new { | 
| 93 | 7 |  |  | 7 |  | 5475 | my $class = shift; | 
| 94 | 7 |  |  |  |  | 28 | my %args = @_; | 
| 95 |  |  |  |  |  |  |  | 
| 96 | 7 | 100 |  |  |  | 278 | croak "required parameter 'into' missing" unless $args{into}; | 
| 97 |  |  |  |  |  |  |  | 
| 98 |  |  |  |  |  |  | my $ob = { delimiter => ' ', fields => [qw/N b a m s k r M S V g u/], | 
| 99 | 6 |  |  |  |  | 386 | file_name => Cwd::abs_path($args{into}) }; | 
| 100 |  |  |  |  |  |  |  | 
| 101 | 6 | 100 |  |  |  | 35 | $ob->{fields} = $args{fields} if $args{fields}; | 
| 102 |  |  |  |  |  |  |  | 
| 103 | 6 | 50 |  |  |  | 358 | open my $fh, '+>>', $args{into} or croak $args{into}.': '.$!; | 
| 104 |  |  |  |  |  |  | { | 
| 105 | 6 |  |  |  |  | 21 | local $/ = "\012"; | 
|  | 6 |  |  |  |  | 38 |  | 
| 106 | 6 | 50 |  |  |  | 57 | seek $fh, 0, SEEK_SET or croak 'seek '.$args{into}.': '.$!; | 
| 107 | 6 |  |  |  |  | 115 | my $header = <$fh>; | 
| 108 | 6 | 100 |  |  |  | 25 | if ($header) { | 
| 109 |  |  |  |  |  |  | $header =~ m/^(.)CDX((?:\1[[:alpha:]])+)/ | 
| 110 | 3 | 100 |  |  |  | 438 | or croak $args{into}.' exists but lacks CDX header'; | 
| 111 | 1 |  |  |  |  | 5 | $ob->{delimiter} = $1; | 
| 112 | 1 |  |  |  |  | 18 | $ob->{fields} = [split /\Q$1/, $2]; | 
| 113 | 1 |  |  |  |  | 3 | shift @{$ob->{fields}};	# remove leading empty field | 
|  | 1 |  |  |  |  | 3 |  | 
| 114 |  |  |  |  |  |  | } else { | 
| 115 |  |  |  |  |  |  | # write CDX header | 
| 116 | 3 |  |  |  |  | 11 | print $fh ' CDX ', join(' ', @{$ob->{fields}}), "\012"; | 
|  | 3 |  |  |  |  | 24 |  | 
| 117 |  |  |  |  |  |  | } | 
| 118 | 4 | 50 |  |  |  | 121 | seek $fh, 0, SEEK_END or croak 'seek '.$args{into}.': '.$!; | 
| 119 |  |  |  |  |  |  | } | 
| 120 | 4 |  |  |  |  | 18 | $ob->{file} = $fh; | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | $ob->{fieldgen} = | 
| 123 | 4 | 100 |  | 29 |  | 11 | [map { $Record_Field_Handlers{$_} or sub { undef } } @{$ob->{fields}}]; | 
|  | 30 |  |  |  |  | 105 |  | 
|  | 29 |  |  |  |  | 60 |  | 
|  | 4 |  |  |  |  | 13 |  | 
| 124 |  |  |  |  |  |  |  | 
| 125 | 4 |  |  |  |  | 82 | bless $ob, $class | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  |  | 
| 128 |  |  |  |  |  |  | # inherit add | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | sub _add_record { | 
| 131 | 37 |  |  | 37 |  | 76 | my $self = shift; | 
| 132 | 37 |  |  |  |  | 62 | my $record = shift; | 
| 133 |  |  |  |  |  |  |  | 
| 134 | 356 | 100 | 100 |  |  | 1077 | my $line = join $self->{delimiter}, map { defined $_ && $_ ne '' ? $_ : '-' } | 
| 135 | 37 |  |  |  |  | 74 | map { $_->($record, $self) } @{$self->{fieldgen}}; | 
|  | 372 |  |  |  |  | 2760 |  | 
|  | 37 |  |  |  |  | 105 |  | 
| 136 |  |  |  |  |  |  |  | 
| 137 | 37 |  |  |  |  | 111 | print {$self->{file}} $line, "\012"; | 
|  | 37 |  |  |  |  | 256 |  | 
| 138 |  |  |  |  |  |  | } | 
| 139 |  |  |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | sub flush { | 
| 141 | 4 |  |  | 4 | 1 | 33 | my $self = shift; | 
| 142 |  |  |  |  |  |  |  | 
| 143 | 4 |  |  |  |  | 138 | seek $self->{file}, 0, SEEK_END | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | 1; | 
| 147 |  |  |  |  |  |  | __END__ |