File Coverage

blib/lib/WARC/Volume.pm
Criterion Covered Total %
statement 52 52 100.0
branch 3 4 75.0
condition 3 8 37.5
subroutine 17 17 100.0
pod 5 5 100.0
total 80 86 93.0


line stmt bran cond sub pod time code
1             package WARC::Volume; # -*- CPerl -*-
2              
3 25     25   72152 use strict;
  25         57  
  25         739  
4 25     25   117 use warnings;
  25         48  
  25         616  
5              
6 25     25   149 use Carp;
  25         47  
  25         1399  
7 25     25   142 use Cwd qw//;
  25         66  
  25         1017  
8              
9             our @ISA = qw();
10              
11 25     25   547 use WARC; *WARC::Volume::VERSION = \$WARC::VERSION;
  25         62  
  25         1090  
12              
13 25     25   9996 use WARC::Record;
  25         70  
  25         897  
14 25     25   10272 use WARC::Record::FromVolume;
  25         77  
  25         1106  
15              
16             =head1 NAME
17              
18             WARC::Volume - Web ARChive file access for Perl
19              
20             =head1 SYNOPSIS
21              
22             use WARC::Volume;
23              
24             $volume = mount WARC::Volume ($filename);
25              
26             $filename = $volume->filename;
27              
28             $handle = $volume->open;
29              
30             $record = $volume->first_record;
31              
32             $record = $volume->record_at($offset);
33              
34             =cut
35              
36 25     25   152 use overload '""' => 'filename';
  25         51  
  25         134  
37 25     25   1688 use overload fallback => 1;
  25         46  
  25         86  
38              
39             # This implementation is almost laughably simple, needing to store only a
40             # single data value: the absolute filename of the WARC file. As such, the
41             # underlying implementation, is, in fact, a blessed string.
42              
43             =head1 DESCRIPTION
44              
45             A C object represents a WARC file in the filesystem and
46             provides access to the WARC records within as C objects.
47              
48             =head2 Methods
49              
50             =over
51              
52             =item $volume = mount WARC::Volume ($filename)
53              
54             Construct a C object. The parameter is the name of an
55             existing WARC file. An exception is raised if the first record does not
56             have a valid WARC header.
57              
58             =cut
59              
60             sub mount {
61 47     47 1 36827 my $class = shift;
62 47         81 my $filename = shift;
63              
64 47         2499 my $fullfilename = Cwd::abs_path($filename);
65 47         179 my $ob = bless \$fullfilename, $class;
66              
67 47         165 $ob->first_record;
68              
69 47         763 return $ob;
70             }
71              
72             =item $volume-Efilename
73              
74             Return the filename for this volume.
75              
76             =cut
77              
78 1000     1000 1 12256 sub filename { ${(shift)} }
  1000         13566  
79              
80             =item $volume-Eopen
81              
82             Return a readable and seekable file handle for this volume. The returned
83             value may be a tied handle. Do not assume that it is an C.
84              
85             =cut
86              
87             sub open {
88 810     810 1 1125 my $self = shift;
89 810         1393 my $filename = $$self;
90              
91 810 100       28960 open my $fh, '<', $filename or die "$filename: $!";
92 809         4937 binmode $fh, ':raw'; # WARC files contain binary data and UTF-8 headers
93 809         2555 return $fh;
94             }
95              
96             =item $volume-Efirst_record
97              
98             Construct and return a C object representing the first WARC
99             record in $volume. This should be a "warcinfo" record, but it is not
100             required to be so.
101              
102             =cut
103              
104 85     85 1 7984 sub first_record { (shift)->record_at(0) }
105              
106             =item $volume-Erecord_at( $offset )
107              
108             Construct and return a C object representing the WARC record
109             beginning at $offset within $volume. An exception is raised if an
110             appropriate magic number is not found at $offset.
111              
112             =cut
113              
114 96     96 1 2583 sub record_at { _read WARC::Record::FromVolume @_ }
115              
116             =back
117              
118             =cut
119              
120             # $volume->_file_tag
121             #
122             # Return a "file tag" for this volume.
123             #
124             # This is an internal procedure. The exact definition of "file tag" is
125             # platform-dependent, but it will be the same value if both file names can
126             # be proven to be the same underlying file.
127              
128             BEGIN {
129 25     25   5502 use constant ();
  25         53  
  25         1854  
130              
131 25     25   77 my $have_valid_inodes = 0;
132              
133             # We accept DEV:INO as valid if two files in the same directory have the
134             # same DEV and different INO values. We use two modules from this
135             # library for this test and retrieve their actual locations from %INC.
136 25         638 my @stat_record = stat $INC{'WARC/Record.pm'};
137 25         407 my @stat_volume = stat $INC{'WARC/Volume.pm'};
138              
139 25 50 50     393 $have_valid_inodes = 1
      33        
      33        
140             if (scalar @stat_record && scalar @stat_volume # both stat calls worked
141             && $stat_record[0] == $stat_volume[0] # both have same DEV
142             && $stat_record[1] != $stat_volume[1]); # different INO values
143              
144 25         3036 constant->import(HAVE_VALID_INODES => $have_valid_inodes);
145             }
146             sub _file_tag {
147 521     521   619 if (HAVE_VALID_INODES) {
148             # Two modules have been found to have distinct inode numbers, therefore
149             # we are probably running in a POSIX environment. Use the dev:ino
150             # pair from the stat builtin as file tag.
151              
152             # POSIX requires that this be sufficient to distinguish files, although
153             # there are situations, particularly in complex network environments,
154             # where two different dev:ino pairs may correspond to the same file.
155             # Such situations can be avoided with careful administration.
156 521         1077 return join ':', ((stat shift)[0, 1])
157             } else {
158             # Use the absolute filename and assume no links on other platforms
159              
160             # The file name stored in the WARC::Volume object is already absolute.
161             return (shift)->filename
162             }
163             }
164              
165             1;
166             __END__