File Coverage

blib/lib/Mail/Mbox/MessageParser/Grep.pm

Criterion	Covered	Total	%
statement	22	198	11.1
branch	3	52	5.7
condition	0	15	0.0
subroutine	7	22	31.8
pod	4	4	100.0
total	36	291	12.3

line	stmt	bran	cond	sub	pod	time	code
1							package Mail::Mbox::MessageParser::Grep;
2
3	42			42		832413	use strict;
	42					96
	42					1071
4	42			42		177	use Carp;
	42					77
	42					2000
5
6	42			42		637	use Mail::Mbox::MessageParser;
	42					75
	42					679
7	42			42		248	use Mail::Mbox::MessageParser::Config;
	42					77
	42					1099
8
9	42			42		301	use vars qw( $VERSION $_DEBUG @ISA );
	42					78
	42					2056
10	42			42		219	use vars qw( $_CACHE );
	42					102
	42					90866
11
12							@ISA = qw( Exporter Mail::Mbox::MessageParser );
13
14							$VERSION = sprintf "%d.%02d%02d", q/1.70.5/ =~ /(\d+)/g;
15
16							*_ENTRY_STILL_VALID = \&Mail::Mbox::MessageParser::MetaInfo::_ENTRY_STILL_VALID;
17							sub _ENTRY_STILL_VALID;
18
19							*_CACHE = \$Mail::Mbox::MessageParser::MetaInfo::_CACHE;
20
21							*_DEBUG = \$Mail::Mbox::MessageParser::_DEBUG;
22							*_dprint = \&Mail::Mbox::MessageParser::_dprint;
23							sub _dprint;
24
25							#-------------------------------------------------------------------------------
26
27							sub new
28							{
29	16			16	1	44	my ($proto, $self) = @_;
30
31	16	50				47	carp "Need file_name option" unless defined $self->{'file_name'};
32	16	50				47	carp "Need file_handle option" unless defined $self->{'file_handle'};
33
34							return "Mail::Mbox::MessageParser::Grep not configured to use GNU grep. Perhaps it is not installed"
35	16	50				67	unless defined $Mail::Mbox::MessageParser::Config{'programs'}{'grep'};
36
37	0						bless ($self, __PACKAGE__);
38
39	0						$self->_init();
40
41	0						return $self;
42							}
43
44							#-------------------------------------------------------------------------------
45
46							sub _init
47							{
48	0			0			my $self = shift;
49
50							# Reading grep data provides us with an array of potential email starting
51							# locations. However, due to included emails and attachments, we have to
52							# validate these locations as actually being the start of emails. As a
53							# result, there may be more "chunks" in the array than emails. So
54							# CHUNK_INDEX >= email_number-1.
55	0						$self->{'CHUNK_INDEX'} = -1;
56
57	0						$self->{'READ_BUFFER'} = '';
58	0						$self->{'START_OF_EMAIL'} = 0;
59	0						$self->{'END_OF_EMAIL'} = 0;
60
61	0						$self->SUPER::_init();
62
63	0						$self->_initialize_cache_entry();
64							}
65
66							#-------------------------------------------------------------------------------
67
68							sub reset
69							{
70	0			0	1		my $self = shift;
71
72	0						$self->{'CHUNK_INDEX'} = 0;
73
74	0						$self->{'READ_BUFFER'} = '';
75	0						$self->{'START_OF_EMAIL'} = 0;
76	0						$self->{'END_OF_EMAIL'} = 0;
77
78	0						$self->SUPER::reset();
79							}
80
81							#-------------------------------------------------------------------------------
82
83							sub end_of_file
84							{
85	0			0	1		my $self = shift;
86
87							# Reset eof in case the file was appended to. Hopefully this works all the
88							# time. See perldoc -f seek for details.
89	0	0					seek($self->{'file_handle'},0,1) if eof $self->{'file_handle'};
90
91							return eof $self->{'file_handle'} &&
92	0		0				$self->{'END_OF_EMAIL'} == length($self->{'READ_BUFFER'});
93							}
94
95							#-------------------------------------------------------------------------------
96
97							sub _read_prologue
98							{
99	0			0			my $self = shift;
100
101	0						_dprint "Reading mailbox prologue using grep";
102
103	0						$self->_read_until_match(
104							qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,0);
105
106	0						my $start_of_email = pos($self->{'READ_BUFFER'});
107	0						$self->{'prologue'} = substr($self->{'READ_BUFFER'}, 0, $start_of_email);
108
109							# Set up for read_next_email
110	0						$self->{'END_OF_EMAIL'} = $start_of_email;
111							}
112
113							#-------------------------------------------------------------------------------
114
115							sub read_next_email
116							{
117	0			0	1		my $self = shift;
118
119	0	0	0				unless (defined $self->{'file_name'} && _ENTRY_STILL_VALID($self->{'file_name'}))
120							{
121							# Patch up the data structures for the Perl implementation
122	0						undef $self->{'CHUNK_INDEX'};
123							$self->{'CURRENT_LINE_NUMBER'} =
124	0						$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'line_number'};
125							$self->{'CURRENT_OFFSET'} =
126	0						$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'offset'};
127							$self->{'READ_CHUNK_SIZE'} =
128	0						$Mail::Mbox::MessageParser::Config{'read_chunk_size'};
129
130							# Invalidate the remaining data
131	0						$#{ $_CACHE->{$self->{'file_name'}}{'emails'} } = $self->{'email_number'};
	0
132
133	0						bless ($self, 'Mail::Mbox::MessageParser::Perl');
134
135	0						return $self->read_next_email();
136							}
137
138	0	0					return undef if $self->end_of_file(); ## no critic (ProhibitExplicitReturnUndef)
139
140							$self->{'email_line_number'} =
141	0						$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'line_number'};
142							$self->{'email_offset'} =
143	0						$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'offset'};
144
145	0						$self->{'START_OF_EMAIL'} = $self->{'END_OF_EMAIL'};
146
147							# Slurp in an entire multipart email (but continue looking for the next
148							# header so that we can get any following newlines as well)
149	0	0					unless ($self->_read_header())
150							{
151	0						return $self->_extract_email_and_finalize();
152							}
153
154	0	0					unless ($self->_read_email_parts())
155							{
156							# Could issue a warning here, but I'm not sure how to do this cleanly for
157							# a work-only module like this. Maybe something like CGI's cgi_error()?
158	0						_dprint "Inconsistent multi-part message. Could not find ending for " .
159							"boundary \"" . $self->_multipart_boundary() . "\"";
160
161							# Try to read the content length and use that
162							my $email_header = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
163	0						$self->{'START_OF_BODY'} - $self->{'START_OF_EMAIL'});
164
165							my $content_length = Mail::Mbox::MessageParser::_GET_HEADER_FIELD(
166	0						\$email_header, 'Content-Length:', $self->{'endline'});
167
168	0	0					if (defined $content_length)
169							{
170	0						$content_length =~ s/Content-Length: (\d+)./$1/i;
171	0						pos($self->{'READ_BUFFER'}) = $self->{'START_OF_EMAIL'} + $content_length;
172							}
173							# Otherwise use the start of the body
174							else
175							{
176	0						pos($self->{'READ_BUFFER'}) = $self->{'START_OF_BODY'};
177							}
178
179							# Reset the search and look for the start of the next email.
180	0						$self->_read_rest_of_email();
181
182	0						return $self->_extract_email_and_finalize();
183							}
184
185	0						$self->_read_rest_of_email();
186
187	0						return $self->_extract_email_and_finalize();
188							}
189
190							#-------------------------------------------------------------------------------
191
192							sub _read_rest_of_email
193							{
194	0			0			my $self = shift;
195
196							# Look for the start of the next email
197	0						while (1)
198							{
199	0						while ($self->{'READ_BUFFER'} =~
200							m/$Mail::Mbox::MessageParser::Config{'from_pattern'}/mg)
201							{
202	0						$self->{'END_OF_EMAIL'} = pos($self->{'READ_BUFFER'}) - length($1);
203
204	0						my $endline = $self->{'endline'};
205
206							# Keep looking if the header we found is part of a "Begin Included
207							# Message".
208	0						my $end_of_string = '';
209	0						my $backup_amount = 100;
210							do
211							{
212	0						$backup_amount *= 2;
213							$end_of_string = substr($self->{'READ_BUFFER'},
214	0						$self->{'END_OF_EMAIL'}-$backup_amount, $backup_amount);
215							} while (index($end_of_string, "$endline$endline") == -1 &&
216	0		0				$backup_amount < $self->{'END_OF_EMAIL'});
217
218	0	0					next if $end_of_string =~
219							/$endline-----(?: Begin Included Message \|Original Message)-----$endline[^\r\n](?:$endline)$/i;
220
221	0	0					next unless $end_of_string =~ /$endline$endline$/;
222
223							# Found the next email!
224	0						return;
225							}
226
227							# Didn't find next email in current buffer. Most likely we need to read some
228							# more of the mailbox. Shift the current email to the front of the buffer
229							# unless we've already done so.
230	0						my $shift_amount = $self->{'START_OF_EMAIL'};
231							$self->{'READ_BUFFER'} =
232	0						substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'});
233	0						$self->{'START_OF_EMAIL'} -= $shift_amount;
234	0						$self->{'START_OF_BODY'} -= $shift_amount;
235	0						pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
236
237							# Start looking at the end of the buffer, but back up some in case the
238							# edge of the newly read buffer contains the start of a new header. I
239							# believe the RFC says header lines can be at most 90 characters long.
240	0	0					unless ($self->_read_until_match(
241							qr/$Mail::Mbox::MessageParser::Config{'from_pattern'}/m,90))
242							{
243	0						$self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
244	0						return;
245							}
246
247	0						redo;
248							}
249							}
250
251							#-------------------------------------------------------------------------------
252
253							sub _multipart_boundary
254							{
255	0			0			my $self = shift;
256
257	0						my $endline = $self->{'endline'};
258
259	0	0					if (substr($self->{'READ_BUFFER'},$self->{'START_OF_EMAIL'},
260							$self->{'START_OF_BODY'}-$self->{'START_OF_EMAIL'}) =~
261							/^(content-type: multipart[^\n\r]$endline( [^\n\r]$endline))/im)
262							{
263	0						my $content_type_header = $1;
264	0						$content_type_header =~ s/$endline//g;
265
266	0	0	0				if ($content_type_header =~ /boundary = "([^"]*)"/i \|\|
267							$content_type_header =~ /boundary = ([-0-9A-Za-z'()+_,.\/:=? ]*[-0-9A-Za-z'()+_,.\/:=?])/i)
268							{
269	0						return $1
270							}
271							}
272
273	0						return undef; ## no critic (ProhibitExplicitReturnUndef)
274							}
275
276							#-------------------------------------------------------------------------------
277
278							sub _read_email_parts
279							{
280	0			0			my $self = shift;
281
282	0						my $boundary = $self->_multipart_boundary();
283
284	0	0					return 1 unless defined $boundary;
285
286							# RFC 1521 says the boundary can be no longer than 70 characters. Back up a
287							# little more than that.
288	0						my $endline = $self->{'endline'};
289	0	0					$self->_read_until_match(qr/^--\Q$boundary\E--$endline/m,76)
290							or return 0;
291
292	0						return 1;
293							}
294
295							#-------------------------------------------------------------------------------
296
297							sub _extract_email_and_finalize
298							{
299	0			0			my $self = shift;
300
301	0						$self->{'email_length'} = $self->{'END_OF_EMAIL'}-$self->{'START_OF_EMAIL'};
302
303							my $email = substr($self->{'READ_BUFFER'}, $self->{'START_OF_EMAIL'},
304	0						$self->{'email_length'});
305
306	0						while ($self->{'email_length'} >
307							$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'length'})
308							{
309	0						$self->_adjust_cache_data();
310							}
311
312	0						$self->{'email_number'}++;
313
314	0						$self->SUPER::read_next_email();
315
316	0						return \$email;
317							}
318
319							#-------------------------------------------------------------------------------
320
321							sub _read_header
322							{
323	0			0			my $self = shift;
324
325	0	0					$self->_read_until_match(qr/$self->{'endline'}$self->{'endline'}/m,0)
326							or return 0;
327
328							$self->{'START_OF_BODY'} =
329	0						pos($self->{'READ_BUFFER'}) + length("$self->{'endline'}$self->{'endline'}");
330
331	0						return 1;
332							}
333
334							#-------------------------------------------------------------------------------
335
336							# The search position is at the start of the pattern when this function
337							# returns 1.
338							sub _read_until_match
339							{
340	0			0			my $self = shift;
341	0						my $pattern = shift;
342	0						my $backup = shift;
343
344							# Start looking at the end of the buffer, but back up some in case the edge
345							# of the newly read buffer contains part of the pattern.
346	0	0	0				if (!defined pos($self->{'READ_BUFFER'}) \|\|
347							pos($self->{'READ_BUFFER'}) - $backup <= 0) {
348	0						pos($self->{'READ_BUFFER'}) = 0;
349							} else {
350	0						pos($self->{'READ_BUFFER'}) -= $backup;
351							}
352
353	0						while (1)
354							{
355	0	0					if ($self->{'READ_BUFFER'} =~ m/($pattern)/mg)
356							{
357	0						pos($self->{'READ_BUFFER'}) -= length($1);
358	0						return 1;
359							}
360
361	0						pos($self->{'READ_BUFFER'}) = length($self->{'READ_BUFFER'});
362
363	0	0					unless ($self->_read_chunk()) {
364	0						$self->{'END_OF_EMAIL'} = length($self->{'READ_BUFFER'});
365	0						return 0;
366							}
367
368	0	0					if (pos($self->{'READ_BUFFER'}) - $backup <= 0) {
369	0						pos($self->{'READ_BUFFER'}) = 0;
370							} else {
371	0						pos($self->{'READ_BUFFER'}) -= $backup;
372							}
373							}
374							}
375
376							#-------------------------------------------------------------------------------
377
378							# Maintains pos($self->{'READ_BUFFER'})
379							sub _read_chunk
380							{
381	0			0			my $self = shift;
382
383	0						my $search_position = pos($self->{'READ_BUFFER'});
384
385							# Reading the prologue, so use the offset of the first email
386	0	0					if ($self->{'CHUNK_INDEX'} == -1)
387							{
388	0						my $length_to_read = $_CACHE->{$self->{'file_name'}}{'emails'}[0]{'offset'};
389	0						my $total_amount_read = 0;
390
391	0						do {
392							$total_amount_read += read($self->{'file_handle'}, $self->{'READ_BUFFER'},
393	0						$length_to_read-$total_amount_read, length($self->{'READ_BUFFER'}));
394							} while ($total_amount_read != $length_to_read);
395
396	0						pos($self->{'READ_BUFFER'}) = $search_position;
397
398	0						$self->{'CHUNK_INDEX'}++;
399							}
400
401	0						my $last_email_index = $#{$_CACHE->{$self->{'file_name'}}{'emails'}};
	0
402
403	0	0					return 0 if $self->{'CHUNK_INDEX'} == $last_email_index+1;
404
405							my $length_to_read =
406	0						$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'CHUNK_INDEX'}]{'length'};
407	0						my $total_amount_read = 0;
408
409	0						do {
410							$total_amount_read += read($self->{'file_handle'}, $self->{'READ_BUFFER'},
411	0						$length_to_read-$total_amount_read, length($self->{'READ_BUFFER'}));
412							} while ($total_amount_read != $length_to_read);
413
414	0						pos($self->{'READ_BUFFER'}) = $search_position;
415
416	0						$self->{'CHUNK_INDEX'}++;
417
418	0						return 1;
419							}
420
421							#-------------------------------------------------------------------------------
422
423							sub _adjust_cache_data
424							{
425	0			0			my $self = shift;
426
427	0						my $last_email_index = $#{$_CACHE->{$self->{'file_name'}}{'emails'}};
	0
428
429							die<
430							Error: Cannot adjust cache data. Please email the author with your mailbox to
431							have him fix the problem. In the meantime, disable the grep implementation.
432							EOF
433	0	0					if $self->{'email_number'} == $last_email_index;
434
435							$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}]{'length'} +=
436	0						$_CACHE->{$self->{'file_name'}}{'emails'}[$self->{'email_number'}+1]{'length'};
437
438	0	0					if($self->{'email_number'}+2 <= $last_email_index)
439							{
440	0						@{$_CACHE->{$self->{'file_name'}}{'emails'}}
441							[$self->{'email_number'}+1..$last_email_index-1] =
442	0						@{$_CACHE->{$self->{'file_name'}}{'emails'}}
443	0						[$self->{'email_number'}+2..$last_email_index];
444							}
445
446	0						pop @{$_CACHE->{$self->{'file_name'}}{'emails'}};
	0
447
448	0						$self->{'CHUNK_INDEX'}--;
449							}
450
451							#-------------------------------------------------------------------------------
452
453							sub _initialize_cache_entry
454							{
455	0			0			my $self = shift;
456
457	0						my @stat = stat $self->{'file_name'};
458
459	0						my $size = $stat[7];
460	0						my $time_stamp = $stat[9];
461
462	0						$_CACHE->{$self->{'file_name'}}{'size'} = $size;
463	0						$_CACHE->{$self->{'file_name'}}{'time_stamp'} = $time_stamp;
464							$_CACHE->{$self->{'file_name'}}{'emails'} =
465	0						_READ_GREP_DATA($self->{'file_name'});
466							}
467
468							#-------------------------------------------------------------------------------
469
470							sub _READ_GREP_DATA
471							{
472	0			0			my $filename = shift;
473
474	0						my @lines_and_offsets;
475
476	0						_dprint "Reading grep data";
477
478							{
479	0						my @grep_results;
	0
480
481	0						@grep_results = `unset LC_ALL LC_COLLATE LANG LC_CTYPE LC_MESSAGES; $Mail::Mbox::MessageParser::Config{'programs'}{'grep'} --extended-regexp --line-number --byte-offset --binary-files=text "^From [^:]+(:[0-9][0-9]){1,2}( ([A-Z]{2,6}\|[+-]?[0-9]{4})){1,3}( remote from .)?\r?\$" "$filename"`;
482
483	0						_dprint "Read " . scalar(@grep_results) . " lines of grep data";
484
485	0						foreach my $match_result (@grep_results)
486							{
487	0						my ($line_number, $byte_offset) = $match_result =~ /^(\d+):(\d+):/;
488	0						push @lines_and_offsets,
489							{'line number' => $line_number,'byte offset' => $byte_offset};
490							}
491							}
492
493	0						my @emails;
494
495	0						for(my $match_number = 0; $match_number <= $#lines_and_offsets; $match_number++)
496							{
497	0	0					if ($match_number == $#lines_and_offsets)
498							{
499	0						my $filesize = -s $filename;
500							$emails[$match_number]{'length'} =
501	0						$filesize - $lines_and_offsets[$match_number]{'byte offset'};
502							}
503							else
504							{
505							$emails[$match_number]{'length'} =
506							$lines_and_offsets[$match_number+1]{'byte offset'} -
507	0						$lines_and_offsets[$match_number]{'byte offset'};
508							}
509
510							$emails[$match_number]{'line_number'} =
511	0						$lines_and_offsets[$match_number]{'line number'};
512
513							$emails[$match_number]{'offset'} =
514	0						$lines_and_offsets[$match_number]{'byte offset'};
515
516	0						$emails[$match_number]{'validated'} = 0;
517							}
518
519	0						return \@emails;
520							}
521
522							1;
523
524							__END__