File Coverage

blib/lib/WARC/Record/Logical/Block.pm
Criterion Covered Total %
statement 159 159 100.0
branch 64 66 96.9
condition 28 29 96.5
subroutine 21 21 100.0
pod n/a
total 272 275 98.9


line stmt bran cond sub pod time code
1             package WARC::Record::Logical::Block; # -*- CPerl -*-
2              
3 2     2   72173 use strict;
  2         14  
  2         61  
4 2     2   20 use warnings;
  2         6  
  2         99  
5              
6             our @ISA = qw();
7              
8 2     2   427 use WARC; *WARC::Record::Logical::Block::VERSION = \$WARC::VERSION;
  2         5  
  2         86  
9              
10 2     2   12 use Carp;
  2         3  
  2         126  
11 2     2   13 use Fcntl qw/SEEK_SET SEEK_CUR SEEK_END/;
  2         3  
  2         113  
12              
13             # This implementation uses an array as the underlying object.
14              
15 2     2   11 use constant { PARENT => 0, SEGMENT => 1, HANDLE => 2 };
  2         3  
  2         178  
16 2     2   14 use constant OBJECT_INIT => undef, 0, undef;
  2         4  
  2         100  
17              
18             # Invariant: HANDLE is always valid if SEGMENT is within range.
19              
20 2     2   534 BEGIN { require WARC::Record::Logical; }
21             BEGIN { $WARC::Record::Logical::Block::{$_} = $WARC::Record::Logical::{$_}
22 2     2   781 for WARC::Record::Logical::SEGMENT_INDEX; }
23              
24             sub _dbg_dump {
25 60     60   3753 my $self = shift;
26              
27 60         107 my $out = 'logical record block';
28             $out .= ' @['.($self->[SEGMENT]).' / '
29 60         203 .($#{$self->[PARENT]{segments}}).']';
  60         239  
30 60 100       117 $out .= ' [EOF]' if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  60         178  
31 60         101 $out .= "\n";
32 60 50 66     253 $out .= ' '.((tied $self->[HANDLE])->_dbg_dump)
33             if $self->[HANDLE] && tied $self->[HANDLE];
34              
35 60         220 return $out;
36             }
37              
38             sub TIEHANDLE {
39 21     21   7522 my $class = shift;
40 21         36 my $parent = shift;
41              
42 21         51 my $ob = [OBJECT_INIT];
43 21         41 $ob->[PARENT] = $parent;
44 21         73 $ob->[HANDLE] = $parent->{segments}[0][SEG_REC]->open_block;
45              
46 21         5028 bless $ob, $class;
47             }
48              
49             # advance to next segment; return false if already at last segment
50             sub _next_segment {
51 77     77   197 my $self = shift;
52              
53 77 100       199 unless ($self->[PARENT]{segments}[1+$self->[SEGMENT]])
54 15         23 { $self->[SEGMENT]++; return 0 }
  15         59  
55              
56 62         157 close $self->[HANDLE];
57             $self->[HANDLE] =
58 62         329 $self->[PARENT]{segments}[++$self->[SEGMENT]][SEG_REC]->open_block;
59 62         10403 return $self->[HANDLE];
60             }
61              
62             sub READLINE {
63 57     57   5119 my $self = shift;
64              
65 57 100       114 return undef if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  57         218  
66              
67 52 100       224 if (not defined $/) { # file slurp; we might run out of memory...
    100          
    100          
68 4         14 my $data = readline $self->[HANDLE];
69 4 100       12 $data = '' unless defined $data;
70 4         11 $data .= readline $self->[HANDLE] while $self->_next_segment;
71 4 100       29 return (length($data) ? $data : undef);
72             } elsif (ref $/) { # record read; fill a block...
73 2     2   351 use bytes;
  2         6  
  2         17  
74 10         14 my $rec_len = 0+${$/};
  10         25  
75 10         35 my $rec = readline $self->[HANDLE];
76 10 100       26 $rec = '' unless defined $rec;
77 10         31 while (length $rec < $rec_len) {
78 9         16 local $/ = \(do {$rec_len - length $rec});
  9         30  
79 9 100       24 last unless $self->_next_segment;
80 8         37 $rec .= readline $self->[HANDLE];
81             }
82 10 100       74 return (length ($rec) ? $rec : undef);
83             } elsif ($/ eq '') { # paragraph read; delimiter is empty line...
84 11         22 my $para = '';
85 11         41 my $input = readline $self->[HANDLE];
86              
87 11   100     1465 while (((defined $input and $para .= $input) or $self->_next_segment)
      100        
88             and ("\n\n" ne substr $para, -2))
89             # segment boundary was in the middle of a line, continuing read is safe
90 13         393 { $input = readline $self->[HANDLE] }
91             # paragraph delimiter may or may not span segments...
92 11 100       529 if (eof $self->[HANDLE]) {
93             # next segment may begin with newlines, if so, they must be read
94 6         130 my $ch; my $end_pos = $self->TELL;
  6         19  
95 6   100     63 $end_pos = $self->TELL, $para .= "\n"
96             while (defined ($ch = $self->GETC) and $ch eq "\n");
97 6         19 $self->SEEK($end_pos, SEEK_SET);
98 6 100       213 $input = defined($ch) ? '' : undef;
99             }
100 11 100       106 return (length ($para) ? $para : undef);
101             } else { # ordinary line read...
102 2     2   545 use bytes; # a line delimiter may be split on any octet boundary
  2         6  
  2         10  
103 27         48 my $line = '';
104 27         96 my $input = readline $self->[HANDLE];
105              
106             # read more data until we have a complete line or reach EOF
107 27   100     358 while (((defined $input and $line .= $input) or $self->_next_segment)
      100        
108             and ($/ ne substr $line, -length $/)) {
109 49 100       726 if (length $/ > 1) {
110             # each number N in this array indicates that a length-N prefix of
111             # $/ currently matches at the end of the line buffer
112             my @prefixes =
113 22         49 grep {substr($/, 0, $_) eq substr($line, -$_)} 1 .. length $/;
  128         283  
114 22         59 while (@prefixes) {
115 39         82 my $count = $self->READ($input, 1);
116 39 100       75 return $line unless $count;
117 38         61 $line .= $input;
118 38 100       148 return $line if $/ eq substr $line, -length $/;
119 34         58 unshift @prefixes, 0; @prefixes = # sieve prefixes
120 34         55 grep {++$_; substr($/, 0, $_) eq substr($line, -$_)} @prefixes;
  92         117  
  92         240  
121             } # loop iterates until no prefix of $/ matches the tail of $line
122             }
123              
124 44         254 $input = readline $self->[HANDLE];
125             }
126 22 100       56 return $input unless length $line;
127 21         189 return $line;
128             }
129             }
130              
131             # This sub must rely on the aliasing effect of @_.
132             sub READ {
133 70     70   489 my $self = shift;
134             # args now: 0: buffer 1: length 2: offset into buffer or undef
135 70         96 my $length = $_[1];
136 70   100     208 my $offset = $_[2] || 0;
137              
138 70 100       107 return 0 if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  70         179  
139              
140 68         105 my $buf = ''; my $count = 1; my $bpos = 0;
  68         98  
  68         126  
141 68   100     178 while ($length && ($count || $self->_next_segment)) {
      100        
142 99         238 $count = read $self->[HANDLE], $buf, $length, $bpos;
143 99 50       901 return undef unless defined $count;
144 99         140 $length -= $count; $bpos += $count;
  99         225  
145             }
146              
147 68 100       130 $_[0] = '' unless defined $_[0];
148 68 100       140 $_[0] .= "\0" x ($offset - length($_[0])) if $offset > length $_[0];
149 68         117 substr $_[0], $offset, (length($_[0]) - $offset), $buf;
150 68         157 return $bpos;
151             }
152              
153             sub GETC {
154 22     22   488 my $self = shift;
155              
156 22         30 my $ch;
157 22 100       44 return undef unless $self->READ($ch, 1);
158 18         70 return $ch;
159             }
160              
161             sub EOF {
162 15     15   345 my $self = shift;
163              
164 15 100       29 return 1 if $self->[SEGMENT] > $#{$self->[PARENT]{segments}};
  15         94  
165 2 100       4 return 0 if $self->[SEGMENT] < $#{$self->[PARENT]{segments}};
  2         11  
166 1         5 return eof $self->[HANDLE];
167             }
168              
169             sub SEEK {
170 12     12   26 my $self = shift;
171 12         20 my $offset = shift;
172 12         19 my $whence = shift;
173              
174 12         24 my $segments = $self->[PARENT]{segments};
175              
176 12         16 my $npos;
177              
178 12 100       37 if ($whence == SEEK_SET) { $npos = $offset }
  8 100       14  
    100          
179 1         3 elsif ($whence == SEEK_CUR) { $npos = $self->TELL + $offset }
180             elsif ($whence == SEEK_END)
181 2         7 { $npos = ($segments->[-1][SEG_BASE]
182             + $segments->[-1][SEG_LENGTH] + $offset) }
183 1         259 else { croak "unknown WHENCE $whence in call to seek" }
184              
185             # This function must be able to seek backwards, even if the underlying
186             # handles cannot, to support paragraph reads. Seeking generally requires
187             # finding the new segment, switching to that segment, and seeking
188             # forwards to the desired offset within the new segment.
189              
190             # Special handling for seek to or past end-of-file
191 11 100       32 if ($npos >= ($segments->[-1][SEG_BASE] + $segments->[-1][SEG_LENGTH])) {
192 4         9 $self->[SEGMENT] = @$segments;
193 4         14 close $self->[HANDLE]; $self->[HANDLE] = undef;
  4         65  
194 4         101 return 1;
195             }
196              
197 7         14 my $new_segment_index = @$segments;
198 7         17 for (my $i = 0; $i < $new_segment_index; $i++) {
199 23 100 100     90 $new_segment_index = $i # which also exits this loop
200             if $npos >= $segments->[$i][SEG_BASE]
201             && $npos < ($segments->[$i][SEG_BASE] + $segments->[$i][SEG_LENGTH]);
202             }
203              
204 7         12 my $new_segment = $segments->[$new_segment_index];
205 7 100       21 return 0 unless $new_segment;
206              
207 6         10 $self->[SEGMENT] = $new_segment_index;
208 6         20 close $self->[HANDLE];
209 6         60 $self->[HANDLE] = $new_segment->[SEG_REC]->open_block;
210 6         3188 seek $self->[HANDLE], $npos - $new_segment->[SEG_BASE], SEEK_SET;
211             }
212              
213             sub TELL {
214 15     15   417 my $self = shift;
215              
216 15         27 my $segments = $self->[PARENT]{segments};
217              
218             return ($segments->[$self->[SEGMENT]][SEG_BASE] + (tell $self->[HANDLE]))
219 15 100       24 if $self->[SEGMENT] <= $#{$segments};
  15         84  
220 3         12 return ($segments->[-1][SEG_BASE] + $segments->[-1][SEG_LENGTH]);
221             }
222              
223             sub CLOSE {
224 1     1   3 my $self = shift;
225 1         3 $self->[SEGMENT] = 1+$#{$self->[PARENT]{segments}};
  1         3  
226 1         4 close $self->[HANDLE]; $self->[HANDLE] = undef;
  1         5  
227             }
228              
229             1;
230             __END__