File Coverage

blib/lib/Mail/Mbox/MessageParser/Grep.pm
Criterion Covered Total %
statement 22 198 11.1
branch 3 52 5.7
condition 0 15 0.0
subroutine 7 22 31.8
pod 4 4 100.0
total 36 291 12.3


line stmt bran cond sub pod time code
1             package Mail::Mbox::MessageParser::Grep;
2              
3 38     38   924754 use strict;
  38         90  
  38         1035  
4 38     38   179 use Carp;
  38         78  
  38         1837  
5              
6 38     38   562 use Mail::Mbox::MessageParser;
  38         61  
  38         615  
7 38     38   258 use Mail::Mbox::MessageParser::Config;
  38         139  
  38         901  
8              
9 38     38   293 use vars qw( $VERSION $_DEBUG @ISA );
  38         55  
  38         1960  
10 38     38   185 use vars qw( $_CACHE );
  38         108  
  38         82817  
11              
12             @ISA = qw( Exporter Mail::Mbox::MessageParser );
13              
14             $VERSION = sprintf "%d.%02d%02d", q/1.70.5/ =~ /(\d+)/g;
15              
16             *_ENTRY_STILL_VALID = \&Mail::Mbox::MessageParser::MetaInfo::_ENTRY_STILL_VALID;
17             sub _ENTRY_STILL_VALID;
18              
19             *_CACHE = \$Mail::Mbox::MessageParser::MetaInfo::_CACHE;
20              
21             *_DEBUG = \$Mail::Mbox::MessageParser::_DEBUG;
22             *_dprint = \&Mail::Mbox::MessageParser::_dprint;
23             sub _dprint;
24              
25             #-------------------------------------------------------------------------------
26              
27             sub new
28             {
29 15     15 1 48 my ($proto, $self) = @_;
30              
31 15 50       54 carp "Need file_name option" unless defined $self->{'file_name'};
32 15 50       53 carp "Need file_handle option" unless defined $self->{'file_handle'};
33              
34             return "Mail::Mbox::MessageParser::Grep not configured to use GNU grep. Perhaps it is not installed"
35 15 50       87 unless defined $Mail::Mbox::MessageParser::Config{'programs'}{'grep'};
36              
37 0           bless ($self, __PACKAGE__);
38              
39 0           $self->_init();
40              
41 0           return $self;
42             }
43              
44             #-------------------------------------------------------------------------------
45              
46             sub _init
47             {
48 0     0     my $self = shift;
49              
50             # Reading grep data provides us with an array of potential email starting
51             # locations. However, due to included emails and attachments, we have to
52             # validate these locations as actually being the start of emails. As a
53             # result, there may be more "chunks" in the array than emails. So
54             # CHUNK_INDEX >= email_number-1.
55 0           $self->{'CHUNK_INDEX'} = -1;
56              
57 0           $self->{'READ_BUFFER'} = '';
58 0           $self->{'START_OF_EMAIL'} = 0;
59 0           $self->{'END_OF_EMAIL'} = 0;
60              
61 0           $self->SUPER::_init();
62              
63 0           $self->_initialize_cache_entry();
64             }
65              
66             #-------------------------------------------------------------------------------
67              
68             sub reset
69             {
70 0     0 1   my $self = shift;
71              
72 0           $self->{'CHUNK_INDEX'} = 0;
73              
74 0           $self->{'READ_BUFFER'} = '';
75 0           $self->{'START_OF_EMAIL'} = 0;
76 0           $self->{'END_OF_EMAIL'} = 0;
77              
78 0           $self->SUPER::reset();
79             }
80              
81             #-------------------------------------------------------------------------------
82              
83             sub end_of_file
84             {
85 0     0 1   my $self = shift;
86              
87             # Reset eof in case the file was appended to. Hopefully this works all the
88             # time. See perldoc -f seek for details.
89 0 0         seek($self->{'file_handle'},0,1) if eof $self->{'file_handle'};
90              
91             return eof $self->{'file_handle'} &&
92 0   0       $self->{'END_OF_EMAIL'} == length($self->{'READ_BUFFER'});
93             }
94              
95             #-------------------------------------------------------------------------------
96              
97             sub _read_prologue
98             {
99 0     0     my $self = shift;
100              
101 0           _dprint "Reading mailbox prologue using grep";
102              
103 0           $self->_read_until_match(
104             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,0);
105              
106 0           my $start_of_email = pos($self->{'READ_BUFFER'});
107 0           $self->{'prologue'} = substr($self->{'READ_BUFFER'}, 0, $start_of_email);
108              
109             # Set up for read_next_email
110 0           $self->{'END_OF_EMAIL'} = $start_of_email;
111             }
112              
113             #-------------------------------------------------------------------------------
114              
115             sub read_next_email
116             {
117 0     0 1   my $self = shift;
118              
119 0 0 0       unless (defined $self->{'file_name'} && _ENTRY_STILL_VALID($self->{'file_name'}))
120             {
121             # Patch up the data structures for the Perl implementation
122 0           undef $self->{'CHUNK_INDEX'};
123             $self->{'CURRENT_LINE_NUMBER'} =
124 0           $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'line_number'};
125             $self->{'CURRENT_OFFSET'} =
126 0           $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'offset'};
127             $self->{'READ_CHUNK_SIZE'} =
128 0           $Mail::Mbox::MessageParser::Config{'read_chunk_size'};
129              
130             # Invalidate the remaining data
131 0           $#{ $_CACHE->{$self->{'file_name'}}{'emails'} } = $self->{'email_number'};
  0            
132              
133 0           bless ($self, 'Mail::Mbox::MessageParser::Perl');
134              
135 0           return $self->read_next_email();
136             }
137              
138 0 0         return undef if $self->end_of_file(); ## no critic (ProhibitExplicitReturnUndef)
139              
140             $self->{'email_line_number'} =
141 0           $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'line_number'};
142             $self->{'email_offset'} =
143 0           $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'offset'};
144              
145 0           $self->{'START_OF_EMAIL'} = $self->{'END_OF_EMAIL'};
146              
147             # Slurp in an entire multipart email (but continue looking for the next
148             # header so that we can get any following newlines as well)
149 0 0         unless ($self->_read_header())
150             {
151 0           return $self->_extract_email_and_finalize();
152             }
153              
154 0 0         unless ($self->_read_email_parts())
155             {
156             # Could issue a warning here, but I'm not sure how to do this cleanly for
157             # a work-only module like this. Maybe something like CGI's cgi_error()?
158 0           _dprint "Inconsistent multi-part message. Could not find ending for " .
159             "boundary \"" . $self->_multipart_boundary() . "\"";
160              
161             # Try to read the content length and use that
162             my $email_header = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
163 0           $self->{'START_OF_BODY'} - $self->{'START_OF_EMAIL'});
164              
165             my $content_length = Mail::Mbox::MessageParser::_GET_HEADER_FIELD(
166 0           \$email_header, 'Content-Length:', $self->{'endline'});
167              
168 0 0         if (defined $content_length)
169             {
170 0           $content_length =~ s/Content-Length: *(\d+).*/$1/i;
171 0           pos($self->{'READ_BUFFER'}) = $self->{'START_OF_EMAIL'} + $content_length;
172             }
173             # Otherwise use the start of the body
174             else
175             {
176 0           pos($self->{'READ_BUFFER'}) = $self->{'START_OF_BODY'};
177             }
178              
179             # Reset the search and look for the start of the next email.
180 0           $self->_read_rest_of_email();
181              
182 0           return $self->_extract_email_and_finalize();
183             }
184              
185 0           $self->_read_rest_of_email();
186              
187 0           return $self->_extract_email_and_finalize();
188             }
189              
190             #-------------------------------------------------------------------------------
191              
192             sub _read_rest_of_email
193             {
194 0     0     my $self = shift;
195              
196             # Look for the start of the next email
197 0           while (1)
198             {
199 0           while ($self->{'READ_BUFFER'} =~
200             m/$Mail::Mbox::MessageParser::Config{'from_pattern'}/mg)
201             {
202 0           $self->{'END_OF_EMAIL'} = pos($self->{'READ_BUFFER'}) - length($1);
203              
204 0           my $endline = $self->{'endline'};
205              
206             # Keep looking if the header we found is part of a "Begin Included
207             # Message".
208 0           my $end_of_string = '';
209 0           my $backup_amount = 100;
210             do
211             {
212 0           $backup_amount *= 2;
213             $end_of_string = substr($self->{'READ_BUFFER'},
214 0           $self->{'END_OF_EMAIL'}-$backup_amount, $backup_amount);
215             } while (index($end_of_string, "$endline$endline") == -1 &&
216 0   0       $backup_amount < $self->{'END_OF_EMAIL'});
217              
218 0 0         next if $end_of_string =~
219             /$endline-----(?: Begin Included Message |Original Message)-----$endline[^\r\n]*(?:$endline)*$/i;
220              
221 0 0         next unless $end_of_string =~ /$endline$endline$/;
222              
223             # Found the next email!
224 0           return;
225             }
226              
227             # Didn't find next email in current buffer. Most likely we need to read some
228             # more of the mailbox. Shift the current email to the front of the buffer
229             # unless we've already done so.
230 0           my $shift_amount = $self->{'START_OF_EMAIL'};
231             $self->{'READ_BUFFER'} =
232 0           substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'});
233 0           $self->{'START_OF_EMAIL'} -= $shift_amount;
234 0           $self->{'START_OF_BODY'} -= $shift_amount;
235 0           pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
236              
237             # Start looking at the end of the buffer, but back up some in case the
238             # edge of the newly read buffer contains the start of a new header. I
239             # believe the RFC says header lines can be at most 90 characters long.
240 0 0         unless ($self->_read_until_match(
241             qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,90))
242             {
243 0           $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
244 0           return;
245             }
246              
247 0           redo;
248             }
249             }
250              
251             #-------------------------------------------------------------------------------
252              
253             sub _multipart_boundary
254             {
255 0     0     my $self = shift;
256              
257 0           my $endline = $self->{'endline'};
258              
259 0 0         if (substr($self->{'READ_BUFFER'},$self->{'START_OF_EMAIL'},
260             $self->{'START_OF_BODY'}-$self->{'START_OF_EMAIL'}) =~
261             /^(content-type: *multipart[^\n\r]*$endline( [^\n\r]*$endline)*)/im)
262             {
263 0           my $content_type_header = $1;
264 0           $content_type_header =~ s/$endline//g;
265              
266 0 0 0       if ($content_type_header =~ /boundary *= *"([^"]*)"/i ||
267             $content_type_header =~ /boundary *= *([-0-9A-Za-z'()+_,.\/:=? ]*[-0-9A-Za-z'()+_,.\/:=?])/i)
268             {
269 0           return $1
270             }
271             }
272              
273 0           return undef; ## no critic (ProhibitExplicitReturnUndef)
274             }
275              
276             #-------------------------------------------------------------------------------
277              
278             sub _read_email_parts
279             {
280 0     0     my $self = shift;
281              
282 0           my $boundary = $self->_multipart_boundary();
283              
284 0 0         return 1 unless defined $boundary;
285              
286             # RFC 1521 says the boundary can be no longer than 70 characters. Back up a
287             # little more than that.
288 0           my $endline = $self->{'endline'};
289 0 0         $self->_read_until_match(qr/^--\Q$boundary\E--$endline/m,76)
290             or return 0;
291              
292 0           return 1;
293             }
294              
295             #-------------------------------------------------------------------------------
296              
297             sub _extract_email_and_finalize
298             {
299 0     0     my $self = shift;
300              
301 0           $self->{'email_length'} = $self->{'END_OF_EMAIL'}-$self->{'START_OF_EMAIL'};
302              
303             my $email = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
304 0           $self->{'email_length'});
305              
306 0           while ($self->{'email_length'} >
307             $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'length'})
308             {
309 0           $self->_adjust_cache_data();
310             }
311              
312 0           $self->{'email_number'}++;
313              
314 0           $self->SUPER::read_next_email();
315              
316 0           return \$email;
317             }
318              
319             #-------------------------------------------------------------------------------
320              
321             sub _read_header
322             {
323 0     0     my $self = shift;
324              
325 0 0         $self->_read_until_match(qr/$self->{'endline'}$self->{'endline'}/m,0)
326             or return 0;
327              
328             $self->{'START_OF_BODY'} =
329 0           pos($self->{'READ_BUFFER'}) + length("$self->{'endline'}$self->{'endline'}");
330              
331 0           return 1;
332             }
333              
334             #-------------------------------------------------------------------------------
335              
336             # The search position is at the start of the pattern when this function
337             # returns 1.
338             sub _read_until_match
339             {
340 0     0     my $self = shift;
341 0           my $pattern = shift;
342 0           my $backup = shift;
343              
344             # Start looking at the end of the buffer, but back up some in case the edge
345             # of the newly read buffer contains part of the pattern.
346 0 0 0       if (!defined pos($self->{'READ_BUFFER'}) ||
347             pos($self->{'READ_BUFFER'}) - $backup <= 0) {
348 0           pos($self->{'READ_BUFFER'}) = 0;
349             } else {
350 0           pos($self->{'READ_BUFFER'}) -= $backup;
351             }
352              
353 0           while (1)
354             {
355 0 0         if ($self->{'READ_BUFFER'} =~ m/($pattern)/mg)
356             {
357 0           pos($self->{'READ_BUFFER'}) -= length($1);
358 0           return 1;
359             }
360              
361 0           pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
362              
363 0 0         unless ($self->_read_chunk()) {
364 0           $self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
365 0           return 0;
366             }
367              
368 0 0         if (pos($self->{'READ_BUFFER'}) - $backup <= 0) {
369 0           pos($self->{'READ_BUFFER'}) = 0;
370             } else {
371 0           pos($self->{'READ_BUFFER'}) -= $backup;
372             }
373             }
374             }
375              
376             #-------------------------------------------------------------------------------
377              
378             # Maintains pos($self->{'READ_BUFFER'})
379             sub _read_chunk
380             {
381 0     0     my $self = shift;
382              
383 0           my $search_position = pos($self->{'READ_BUFFER'});
384              
385             # Reading the prologue, so use the offset of the first email
386 0 0         if ($self->{'CHUNK_INDEX'} == -1)
387             {
388 0           my $length_to_read = $_CACHE->{$self->{'file_name'}}{'emails'}[0]{'offset'};
389 0           my $total_amount_read = 0;
390              
391 0           do {
392             $total_amount_read += read($self->{'file_handle'}, $self->{'READ_BUFFER'},
393 0           $length_to_read-$total_amount_read, length($self->{'READ_BUFFER'}));
394             } while ($total_amount_read != $length_to_read);
395              
396 0           pos($self->{'READ_BUFFER'}) = $search_position;
397              
398 0           $self->{'CHUNK_INDEX'}++;
399             }
400              
401 0           my $last_email_index = $#{$_CACHE->{$self->{'file_name'}}{'emails'}};
  0            
402              
403 0 0         return 0 if $self->{'CHUNK_INDEX'} == $last_email_index+1;
404              
405             my $length_to_read =
406 0           $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'CHUNK_INDEX'}]{'length'};
407 0           my $total_amount_read = 0;
408              
409 0           do {
410             $total_amount_read += read($self->{'file_handle'}, $self->{'READ_BUFFER'},
411 0           $length_to_read-$total_amount_read, length($self->{'READ_BUFFER'}));
412             } while ($total_amount_read != $length_to_read);
413              
414 0           pos($self->{'READ_BUFFER'}) = $search_position;
415              
416 0           $self->{'CHUNK_INDEX'}++;
417              
418 0           return 1;
419             }
420              
421             #-------------------------------------------------------------------------------
422              
423             sub _adjust_cache_data
424             {
425 0     0     my $self = shift;
426              
427 0           my $last_email_index = $#{$_CACHE->{$self->{'file_name'}}{'emails'}};
  0            
428              
429             die<
430             Error: Cannot adjust cache data. Please email the author with your mailbox to
431             have him fix the problem. In the meantime, disable the grep implementation.
432             EOF
433 0 0         if $self->{'email_number'} == $last_email_index;
434              
435             $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'length'} +=
436 0           $_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}+1]{'length'};
437              
438 0 0         if($self->{'email_number'}+2 <= $last_email_index)
439             {
440 0           @{$_CACHE->{$self->{'file_name'}}{'emails'}}
441             [$self->{'email_number'}+1..$last_email_index-1] =
442 0           @{$_CACHE->{$self->{'file_name'}}{'emails'}}
443 0           [$self->{'email_number'}+2..$last_email_index];
444             }
445              
446 0           pop @{$_CACHE->{$self->{'file_name'}}{'emails'}};
  0            
447              
448 0           $self->{'CHUNK_INDEX'}--;
449             }
450              
451             #-------------------------------------------------------------------------------
452              
453             sub _initialize_cache_entry
454             {
455 0     0     my $self = shift;
456            
457 0           my @stat = stat $self->{'file_name'};
458            
459 0           my $size = $stat[7];
460 0           my $time_stamp = $stat[9];
461              
462 0           $_CACHE->{$self->{'file_name'}}{'size'} = $size;
463 0           $_CACHE->{$self->{'file_name'}}{'time_stamp'} = $time_stamp;
464             $_CACHE->{$self->{'file_name'}}{'emails'} =
465 0           _READ_GREP_DATA($self->{'file_name'});
466             }
467              
468             #-------------------------------------------------------------------------------
469              
470             sub _READ_GREP_DATA
471             {
472 0     0     my $filename = shift;
473              
474 0           my @lines_and_offsets;
475              
476 0           _dprint "Reading grep data";
477              
478             {
479 0           my @grep_results;
  0            
480              
481 0           @grep_results = `unset LC_ALL LC_COLLATE LANG LC_CTYPE LC_MESSAGES; $Mail::Mbox::MessageParser::Config{'programs'}{'grep'} --extended-regexp --line-number --byte-offset --binary-files=text "^From [^:]+(:[0-9][0-9]){1,2}( *([A-Z]{2,6}|[+-]?[0-9]{4})){1,3}( remote from .*)?\r?\$" "$filename"`;
482              
483 0           _dprint "Read " . scalar(@grep_results) . " lines of grep data";
484              
485 0           foreach my $match_result (@grep_results)
486             {
487 0           my ($line_number, $byte_offset) = $match_result =~ /^(\d+):(\d+):/;
488 0           push @lines_and_offsets,
489             {'line number' => $line_number,'byte offset' => $byte_offset};
490             }
491             }
492              
493 0           my @emails;
494              
495 0           for(my $match_number = 0; $match_number <= $#lines_and_offsets; $match_number++)
496             {
497 0 0         if ($match_number == $#lines_and_offsets)
498             {
499 0           my $filesize = -s $filename;
500             $emails[$match_number]{'length'} =
501 0           $filesize - $lines_and_offsets[$match_number]{'byte offset'};
502             }
503             else
504             {
505             $emails[$match_number]{'length'} =
506             $lines_and_offsets[$match_number+1]{'byte offset'} -
507 0           $lines_and_offsets[$match_number]{'byte offset'};
508             }
509              
510             $emails[$match_number]{'line_number'} =
511 0           $lines_and_offsets[$match_number]{'line number'};
512              
513             $emails[$match_number]{'offset'} =
514 0           $lines_and_offsets[$match_number]{'byte offset'};
515              
516 0           $emails[$match_number]{'validated'} = 0;
517             }
518              
519 0           return \@emails;
520             }
521              
522             1;
523              
524             __END__