File Coverage

blib/lib/Mail/Field/Received.pm

Criterion	Covered	Total	%
statement	168	280	60.0
branch	82	192	42.7
condition	57	87	65.5
subroutine	12	12	100.0
pod	7	7	100.0
total	326	578	56.4

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/perl -w
2							#
3							# Mail::Field::Received --
4							# mostly RFC822-compliant parser of Received headers
5							#
6							# Copyright (c) 2000 Adam Spiers . All rights
7							# reserved. This program is free software; you can redistribute it and/or
8							# modify it under the same terms as Perl itself.
9							#
10							# $Id: Received.pm,v 1.28 2003/03/17 23:45:17 adams Exp $
11							#
12
13							require 5.005;
14
15							package Mail::Field::Received;
16
17	1			1		40316	use strict;
	1					1
	1					35
18
19	1			1		3021	use Mail::Field ();
	1					5044
	1					23
20	1			1		9	use Carp;
	1					8
	1					69
21
22	1			1		5	use vars qw($VERSION @ISA @EXPORT_OK);
	1					2
	1					226
23							@ISA = qw(Exporter Mail::Field Mail::Field::Generic);
24							@EXPORT_OK = qw(%RC &diagnose);
25
26							$VERSION = '0.26';
27
28							=head1 NAME
29
30							Mail::Field::Received -- mostly RFC822-compliant parser of Received headers
31
32							=head1 SYNOPSIS
33
34							use Mail::Field;
35
36							my $received = Mail::Field->new('Received', $header);
37							my $results = $received->parse_tree();
38							my $parsed_ok = $received->parsed_ok();
39							my $diagnostics = $received->diagnostics();
40
41							=head1 DESCRIPTION
42
43							I Instead ask Mail::Field for new
44							instances based on the field name!
45
46							Mail::Field::Received provides subroutines for parsing Received
47							headers from e-mails. It mostly complies with RFC822, but deviates to
48							accomodate a number of broken MTAs which are in common use. It also
49							attempts to extract useful information which MTAs often embed within
50							the C<(comments)>.
51
52							It is a subclass derived from the Mail::Field and Mail::Field::Generic
53							classes.
54
55							=head1 ROUTINES
56
57							=over 4
58
59							=cut
60
61							INIT: {
62							bless([])->register('Received');
63							}
64
65							##
66
67							=item * B
68
69							Returns current debugging level obtained via the C method.
70							If a parameter is given, the debugging level is changed. The default
71							level is 3.
72
73							=cut
74
75							my $debug = 3;
76
77							sub debug {
78	2			2	1	1780	my $self = shift;
79	2	100				37	if (@_) {
80	1					3	$debug = shift;
81							}
82	2					8	return $debug;
83							}
84
85							##
86
87							=item * B
88
89							$received->diagnose("foo", "\n");
90
91							Appends stuff to the parser's diagnostics buffer.
92
93							=cut
94
95							sub diagnose {
96	70			70	1	107	my $self = shift;
97	70					136	my (@msgs) = @_;
98	70					231	$self->{Diags} .= join '', @msgs;
99							}
100
101							=item * B
102
103							my $diagnostics = $received->diagnostics();
104
105							Returns the contents of the parser's diagnostics buffer.
106
107							=cut
108
109							sub diagnostics {
110	1			1	1	2	my $self = shift;
111	1		50			17	return $self->{Diags} \|\| '';
112							}
113
114							##
115
116							# Here be all the roughly (!) RFC822-compliant regexps. They
117							# sometimes deviate from RFC822 to allow for many common MTAs which
118							# don't comply either.
119							#
120							# N.B. we need lots of butt-ugly extra ()s to avoid a nasty bug with
121							# (?-x:) in many recent Perls (fixed by 5.005_63 it seems, maybe earlier).
122
123	1			1		4	use vars qw(%RC);
	1					2
	1					5663
124							%RC = ();
125
126							# Atoms consist of all CHARs except SPACE, CTLs, and SPECIALs.
127							$RC{atom} = qr/(?:[\041\043-\047\052\053\055-\071\075\077\101-\132\136-\176]+)/;
128
129							$RC{ctext} = qr/[\000-\014\016-\047\052-\133\135-\177]/;
130							$RC{dtext} = qr/[\000-\014\016-\132\136-\177]/;
131							$RC{quoted_pair} = qr/(?:\\[\000-\177])/;
132							$RC{qtext} = qr/[\000-\014\016-\041\043-\133\135-\177]/;
133							$RC{quoted_str} = qr/(?:"(?:$RC{qtext}\|$RC{quoted_pair})*")/;
134
135							# Comments can be arbitrarily nested but I can't be bothered to
136							# support that here; it's too much effort and no-one will nest more than
137							# once ... I hope!
138							$RC{comment_base}= qr/($(?:$RC{ctext}\|$RC{quoted_pair})*$)/;
139							$RC{comment} = qr/($(?:$RC{ctext}\|$RC{quoted_pair}\|$RC{comment_base})*$)/;
140
141							$RC{word} = qr/(?:$RC{atom}\|$RC{quoted_str})/;
142							$RC{words} = qr/($RC{atom}(\s+$RC{atom})*\|$RC{quoted_str})/;
143
144							# ' 1' isn't 2DIGIT according to RFC822 but some MTAs use it anyway
145							$RC{TWO_DIGIT} = qr/((?:\d\|(?<= )\| )\d)/;
146
147							# This could be improved upon. I left the common triples in, even
148							# though [A-Z]{3} makes them redundant.
149							$RC{zone_name} = qr/(UT\|GMT\|[CEMPW][DES]T\|[A-Z]\|[A-Z]{3})/;
150
151							$RC{zone} = qr/(
152							([+-]?[01]\d(?:00\|15\|30\|45))(?:
153							)(?:\s(?:$RC{zone_name}\|$$RC{zone_name}$))?
154							\|
155							(?:$RC{zone_name})(?:
156							))/x;
157							$RC{hms} = qr/($RC{TWO_DIGIT}:(\d\d)(?::(\d\d))?)/;
158							# Note: case-insensitivity is not RFC-compliant here, but some MTAs
159							# write days/months in all lower case.
160							$RC{month} = qr/(Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)/i;
161							$RC{week_day} = qr/(Mon\|Tue\|Wed\|Thu\|Fri\|Sat\|Sun)/i;
162							$RC{year} = qr/((?:19\|20)?\d{2}\|100)/; # god-DAMN the incompetence!
163							$RC{year_day1} = qr/(?:$RC{TWO_DIGIT}\s$RC{month})/;
164							$RC{year_day2} = qr/(?:$RC{month}\s$RC{TWO_DIGIT})/;
165							$RC{day_of_year} = qr/(?:$RC{year_day1}\|$RC{year_day2})/;
166							$RC{date_time1} = qr/(?:$RC{hms}\s+$RC{year}\s+(?:$RC{zone})?)/;
167							$RC{date_time2} = qr/(?:$RC{hms}\s+$RC{zone}\s+$RC{year})/;
168							$RC{date_time3} = qr/(?:$RC{year}\s+$RC{hms}\s+(?:$RC{zone})?)/;
169							$RC{date_time} = qr/(
170							(?: $RC{week_day} ,? \s* )?
171							($RC{day_of_year}) \s+
172							($RC{date_time1}\|$RC{date_time2}\|$RC{date_time3})
173							)/x;
174
175							$RC{ipv4_addr} = qr/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/;
176							# check valid with inet_aton()
177
178							$RC{domain_lit} = qr/(?:\[(?:$RC{dtext}\|$RC{quoted_pair})*\])/;
179							$RC{sub_domain} = qr/(?:$RC{atom}\|$RC{domain_lit})/;
180							$RC{domain} = qr/(?:$RC{sub_domain}(?:\.$RC{sub_domain})*)/;
181							$RC{local_part} = qr/(?:$RC{word}(?:\.$RC{word})*)/;
182
183							# This is the RFC822 addr-spec ...
184							$RC{addr_spec} = qr/($RC{local_part})\@($RC{domain})/;
185
186							# ... but many MTAs are non-compliant:
187							$RC{addr_spec2} = qr/($RC{local_part})(?:\@($RC{domain}))?/;
188							$RC{addr_spec3} = qr/$RC{addr_spec2}\|($RC{domain})/;
189							$RC{addr_spec4} = qr/((?:$RC{words}\s+)?<$RC{addr_spec3}>\|$RC{addr_spec3})
190							(?:,\s?\.\.\.)?/x;
191							$RC{addr_spec5} = qr/(?:(?:($RC{local_part})\@)?($RC{domain}))/;
192
193							# RFC822 dictates that msg-id is "<" addr-spec ">" but in practice
194							# many MTAs do not adhere to this for the "id" part of Received headers.
195							$RC{msg_id} = qr/(<$RC{addr_spec2}>\|\#?[\w\.-]+)/;
196
197							$RC{from1} = qr/((?i:from) \s+ (<$RC{addr_spec}>))/x;
198							$RC{from2} = qr/((?i:from) \s+ ($RC{addr_spec5})?)/x;
199							$RC{by} = qr/((?i:by) \s+ ($RC{domain}))/x;
200							$RC{via} = qr/((?i:via) \s+ ($RC{atom}))/x;
201							$RC{with} = qr/((?i:with) \s ($RC{atom})?)/x; # sometimes empty atom
202							$RC{id} = qr/((?i:id) \s+ $RC{msg_id}(?::(\d+))?)/x;
203							$RC{for} = qr/((?i:for) \s+ $RC{addr_spec4})/x;
204							$RC{sent_by} = qr/((?i:sent \s by) \s+ $RC{addr_spec4})/x;
205							$RC{convert} = qr/((?i:convert) \s+ ($RC{atom}))/x;
206
207							##
208
209							sub set {
210	1			1	1	93	my $self = shift;
211	1					7	return $self;
212							}
213
214							##
215
216							=item * B
217
218							The actual parser. Returns the object (Mail::Field barfs otherwise).
219
220							=cut
221
222							sub parse {
223	6			6	1	1374	my ($self, $recv) = @_;
224
225	6					12	$self->{Text} = $recv;
226	6					12	$self->{Diags} = '';
227
228	6					17	my %parsed = (whole => $recv);
229
230							# \234 sometimes crops up for some unknown reason. Huh?!
231	6					20	$recv =~ tr/\234//d;
232
233							# From RFC822:
234							# received = "Received" ":" ; one per relay
235							# ["from" domain] ; sending host
236							# ["by" domain] ; receiving host
237							# ["via" atom] ; physical path
238							# *("with" atom) ; link/mail protocol
239							# ["id" msg-id] ; receiver msg id
240							# ["for" addr-spec] ; initial form
241							# ";" date-time ; time received
242							#
243							# Sadly many many MTAs are broken, however, so we have to deal with
244							# a lot of special cases. Improvements to this section are very welcome.
245
246	6					15	my %expecting = map { $_ => 1 }
	54					97
247							(qw/from by via with id convert for sent_by date_time/);
248
249	6					17	for ($recv) {
250	6					10	my $last_section = '';
251
252							TOKEN:
253	6					7	while (1) {
254	43	50				377	$self->diagnose("---- Expecting: ", (join ' ', sort keys %expecting),
255							"\n") if $debug >= 5;
256	43	50				126	$self->diagnose("---- Last section: $last_section\n")
257							if $debug >= 6;
258
259	43	100				420	if (/\G$RC{comment}/cg) {
260	7					20	my $comment = $1;
261	7	50				37	$self->diagnose("Got comment $comment\n") if $debug >= 4;
262
263	7	100				18	push @{$parsed{$last_section}{comments}}, $comment
	4					15
264							if $last_section;
265	7					8	push @{$parsed{comments}}, $comment;
	7					19
266
267	7	100				20	if ($last_section eq 'from') {
268							FROMCOMMENT:
269							{
270	2	50				3	if ($comment =~ /\(
	2					177
271							(?:(?:($RC{local_part})\@)?($RC{domain})\s+)?
272							(?:\[ $RC{ipv4_addr} \])(?:
273							)\)/x)
274							{
275	2	50				9	if ($1) {
276	0	0				0	$self->diagnose("Got `from' ident in comments: $1\n")
277							if $debug >= 3;
278	0					0	$parsed{from}{ident} = $1;
279							}
280
281	2	50				9	if ($2) {
282	2	50				16	$self->diagnose("Got `from' domain in comments: $2\n")
283							if $debug >= 3;
284	2					9	$parsed{from}{domain} = $2;
285							}
286
287	2	50				8	if ($3) {
288	2	50				13	$self->diagnose("Got `from' IP address in comments: $3\n")
289							if $debug >= 3;
290	2					6	$parsed{from}{address} = $3;
291							}
292
293	2					5	last FROMCOMMENT;
294							}
295
296	0	0				0	if ($comment =~ /(HELO\|EHLO)(?:\s+\|=)($RC{domain})/i) {
297							# HELO domain is in comments, not outside, so swap
298	0	0				0	$self->diagnose("Got `from' $1 domain in comments: $2\n")
299							if $debug >= 3;
300	0					0	@{$parsed{from}}{qw/domain HELO/}
	0					0
301							= ($parsed{from}{HELO}, $2);
302							}
303
304	0	0				0	if ($comment =~ /$RC{ipv4_addr}\]?(?::(\d{1,5}))?/) {
305	0	0				0	$self->diagnose("Got `from' IP address in comments: $1\n")
306							if $debug >= 3;
307
308	0					0	$parsed{from}{address} = $1;
309
310	0	0				0	if ($2) {
311	0					0	$parsed{from}{port} = $2;
312	0	0				0	$self->diagnose("Got `from' port in comments: $1\n")
313							if $debug >= 3;
314							}
315							}
316							}
317	2					10	$parsed{from}{whole} .= " $comment\n";
318							}
319
320	7					18	next TOKEN;
321							}
322
323	36	100				117	if (/\G(\s+)/cg) {
324	11	50				27	$self->diagnose("Got whitespace: <$1>\n") if $debug >= 7;
325	11					21	next TOKEN;
326							}
327
328	25	50	66			328	if ($expecting{from} and /\G$RC{from1}/cg) {
329	0		0			0	print map { ($_ \|\| '__undef__') . "\n---\n" } $1, $2, $3, $4, $5, $6;
	0					0
330	0	0				0	$self->diagnose("Got from type1: $1\n") if $debug >= 2;
331	0					0	$last_section = 'from';
332
333	0					0	$parsed{from}{whole} = $1;
334	0					0	$parsed{from}{from} = $2;
335	0	0				0	$parsed{from}{ident} = $3 if $3;
336	0					0	$parsed{from}{HELO} = $4;
337
338	0					0	delete $expecting{from};
339	0					0	delete @expecting{grep /^after_/, keys %expecting};
340	0					0	$expecting{after_from}++;
341	0					0	next TOKEN;
342							}
343
344	25	100	100			369	if ($expecting{from} and /\G$RC{from2}/cg) {
345	2	50				27	$self->diagnose("Got from type2: $1\n") if $debug >= 2;
346	2					4	$last_section = 'from';
347
348	2					8	$parsed{from}{whole} = $1;
349	2					6	$parsed{from}{from} = $2;
350	2	50				9	$parsed{from}{ident} = $3 if $3;
351	2					6	$parsed{from}{HELO} = $4;
352
353	2					6	delete $expecting{from};
354	2					11	delete @expecting{grep /^after_/, keys %expecting};
355	2					17	$expecting{after_from}++;
356	2					9	next TOKEN;
357							}
358
359	23	50	66			111	if ($expecting{after_from} and /\G($RC{domain_lit})/cg) {
360	0	0				0	$self->diagnose("Got address from bad `from': $1\n") if $debug >= 3;
361	0					0	$parsed{from}{address} = $1;
362	0					0	delete $expecting{after_from};
363	0					0	next TOKEN;
364							}
365
366	23	50	66			78	if ($expecting{after_from} and $parsed{from}{whole} eq 'from mail' and
			33
367							/\G(pickup service)/cg) {
368	0	0				0	$self->diagnose("Got bad `from': appending: $1\n")
369							if $debug >= 3;
370	0					0	$parsed{from}{whole} .= $1;
371	0					0	delete $expecting{after_from};
372	0					0	next TOKEN;
373							}
374
375							# Deal with incompetence from the fucking /imbeciles/ at M$.
376	23	50	66			69	if ($expecting{after_from} and $parsed{whole} =~ /Microsoft SMTPSVC/ and
			33
377							/\G-\s+$RC{ipv4_addr}/cg) {
378	0	0				0	$self->diagnose("Got IP from bad M\$ from: $1\n") if $debug >= 3;
379	0					0	$parsed{from}{address} = $1;
380	0					0	delete $expecting{after_from};
381	0					0	next TOKEN;
382							}
383
384	23	50	66			111	if ($expecting{after_from} and /\G, claiming to be ($RC{word})/cg) {
385	0	0				0	$self->diagnose("Got HELO: $1 from brain-dead MTA\n") if $debug >= 3;
386	0					0	$parsed{allow_parse_fail}++; # More brain-dead MTAs
387	0					0	$parsed{from}{HELO} = $1;
388	0					0	delete $expecting{after_from};
389	0					0	next TOKEN;
390							}
391
392	23	100	100			183	if ($expecting{by} and /\G$RC{by},?/cg) {
393	2	50				15	$self->diagnose("Got by: $1\n") if $debug >= 2;
394	2					3	$last_section = 'by';
395
396	2					9	$parsed{by}{whole} = $1;
397	2					8	$parsed{by}{domain} = $2;
398
399	2					5	delete @expecting{qw/by/};
400	2					15	delete @expecting{grep /^after_/, keys %expecting};
401	2					5	$expecting{after_by}++;
402	2					9	next TOKEN;
403							}
404
405	21	50	66			113	if ($expecting{after_by} and /\G($RC{domain_lit})/cg) {
406	0	0				0	$self->diagnose("Got address from bad `by': $1\n") if $debug >= 3;
407	0					0	$parsed{by}{address} = $1;
408	0					0	delete $expecting{after_by};
409	0					0	next TOKEN;
410							}
411
412	21	50	66			61	if ($expecting{after_by} and /\G(Sendmail)/cg) {
413	0	0				0	$self->diagnose("Got MTA from bad `by': $1\n") if $debug >= 3;
414	0					0	$parsed{by}{MTA} = $1;
415
416	0	0				0	if ($expecting{via}) {
417	0					0	$parsed{via}{via} = $1;
418							}
419
420	0					0	delete $expecting{after_by};
421	0					0	next TOKEN;
422							}
423
424	21	50	66			129	if ($expecting{via} and /\G$RC{via}/cg) {
425	0	0				0	$self->diagnose("Got via: $1\n") if $debug >= 2;
426	0					0	$last_section = 'via';
427
428	0					0	$parsed{via}{whole} = $1;
429	0					0	$parsed{via}{via} = $2;
430
431	0					0	delete $expecting{via};
432	0					0	delete @expecting{grep /^after_/, keys %expecting};
433	0					0	$expecting{after_via}++;
434	0					0	next TOKEN;
435							}
436
437	21	50	33			57	if ($expecting{after_via} and /\G\[$RC{ipv4_addr}\]/cg) {
438	0	0				0	$self->diagnose("Got address from bad `via': $1\n") if $debug >= 3;
439	0					0	$parsed{via}{address} = $1;
440	0					0	delete $expecting{after_via};
441	0					0	next TOKEN;
442							}
443
444	21	50	66			82	if (! $expecting{from} and /\Gfrom\s+stdin/cg) {
445	0	0				0	$self->diagnose("Got `from stdin'\n") if $debug >= 3;
446	0					0	$parsed{from}{stdin} = 'yep';
447	0					0	next TOKEN;
448							}
449
450	21	50	66			77	if ($expecting{with} and
451							m!
452							\G((?i:with) \s
453							(P:(stdio\|smtp)/R:(inet\|bind)_hosts/T:(smtp\|inet_zone_bind_smtp)))
454							!cgx) {
455	0	0				0	$self->diagnose("Got weird with: $1\n") if $debug >= 2;
456	0					0	$last_section = 'with';
457
458	0					0	$parsed{with}{whole} = $1;
459	0					0	$parsed{with}{with} = $2;
460
461	0					0	delete @expecting{grep /^after_/, keys %expecting};
462	0					0	$expecting{after_with}++;
463							# I've seen the `from' bit come after the `with' bit sometimes.
464							# Why oh why ...
465	0					0	$expecting{from}++;
466	0					0	next TOKEN;
467							}
468
469	21	100	100			173	if ($expecting{with} and /\G$RC{with}/cg) {
470	2	50				79	$self->diagnose("Got with: $1\n") if $debug >= 2;
471	2					5	$last_section = 'with';
472
473	2					27	$parsed{with}{whole} = $1;
474	2					6	$parsed{with}{with} = $2;
475	2	50				16	$parsed{with}{with} .= $3 if $3;
476
477	2					16	delete @expecting{grep /^after_/, keys %expecting};
478	2					5	$expecting{after_with}++;
479							# I've seen the `from' bit come after the `with' bit sometimes.
480							# Why oh why ...
481	2					5	$expecting{from}++;
482	2					7	next TOKEN;
483							}
484
485	19	100	66			63	if ($expecting{after_with} && $parsed{with}{with}) {
486
487							# Microsoft SMTPSVC uses two atoms -- yet /another/ example of
488							# Microsoft not following standards ... gasp
489
490	2	50				10	if ($parsed{with}{with} eq 'Microsoft') {
491	0	0				0	if (/\GSMTPSVC(?:$([\d\.]+)$)?/cg) {
		0
492	0	0				0	$self->diagnose("Got M\$ SMTPSVC version from bad `with'",
		0
493							$1 ? ": $1" : '',
494							"\n")
495							if $debug >= 3;
496	0					0	delete $expecting{after_with};
497	0					0	next TOKEN;
498							}
499							elsif (/\GMAPI/cg) {
500	0	0				0	$self->diagnose("Got Microsoft MAPI from bad `with'\n")
501							if $debug >= 3;
502	0					0	delete $expecting{after_with};
503	0					0	next TOKEN;
504							}
505							}
506
507							# More brain damage ...
508
509	2	50	33			10	if ($parsed{with}{with} eq 'Internet' and
510							/\GMail Service\s*$([\d\.]+)$/cg) {
511	0	0				0	$self->diagnose("Got Internet Mail Service version from bad `with': $1\n")
512							if $debug >= 3;
513	0					0	delete $expecting{after_with};
514	0					0	next TOKEN;
515							}
516
517	2	50	33			20	if ($parsed{with}{with} eq 'WorldClient' and
518							/\G($RC{domain_lit})/cg) {
519	0	0				0	$self->diagnose("Got WorldClient address from bad `with': $1\n")
520							if $debug >= 3;
521	0					0	delete $expecting{after_with};
522	0					0	next TOKEN;
523							}
524
525	2	50	33			9	if ($parsed{with}{with} eq 'Local' and
526							/\GSMTP/cg) {
527	0	0				0	$self->diagnose("Got Local SMTP from bad `with'\n")
528							if $debug >= 3;
529	0					0	delete $expecting{after_with};
530	0					0	next TOKEN;
531							}
532
533							}
534
535	19	100	100			350	if ($expecting{id} and /\G$RC{id}/cg) {
536	2	50				17	$self->diagnose("Got id: $1\n") if $debug >= 2;
537	2					5	$last_section = 'id';
538
539	2					8	$parsed{id}{whole} = $1;
540	2					6	$parsed{id}{id} = $2;
541	2	50				9	$parsed{id}{port} = $3 if $3;
542
543	2					7	delete @expecting{qw/by via with/};
544	2					17	delete @expecting{grep /^after_/, keys %expecting};
545	2					9	next TOKEN;
546							}
547
548	17	50	66			129	if ($expecting{convert} and /\G$RC{convert}/cg) {
549	0	0				0	$self->diagnose("Got convert: $1\n") if $debug >= 2;
550	0					0	$last_section = 'convert';
551
552	0					0	$parsed{convert}{whole} = $1;
553
554	0					0	delete @expecting{qw/from by via with convert/};
555	0					0	delete @expecting{grep /^after_/, keys %expecting};
556	0					0	next TOKEN;
557							}
558
559	17	100	100			527	if ($expecting{for} and
560							/\G$RC{for}(\s+bugtraq\@securityfocus\.com)?/cgi) {
561	1	50				11	$self->diagnose("Got for: $1\n") if $debug >= 2;
562	1					2	$last_section = 'for';
563
564	1					5	$parsed{for}{whole} = $1;
565	1					4	$parsed{for}{for} = $2;
566	1	50				6	$parsed{for}{bugtraq} = $3 if $3;
567
568	1					5	delete @expecting{qw/from by convert for/};
569	1					5	delete @expecting{grep /^after_/, keys %expecting};
570	1					8	next TOKEN;
571							}
572
573	16	50	66			472	if ($expecting{sent_by} and /\G$RC{sent_by}/cg) {
574	0	0				0	$self->diagnose("Got sent by: $1\n") if $debug >= 2;
575	0					0	$last_section = 'sent_by';
576
577	0					0	$parsed{sent_by}{whole} = $1;
578	0					0	$parsed{sent_by}{sent_by} = $2;
579
580	0					0	delete @expecting{qw/from by via with convert for sent_by/};
581	0					0	delete @expecting{grep /^after_/, keys %expecting};
582	0					0	next TOKEN;
583							}
584
585	16	100	100			902	if ($expecting{date_time} and /\G((?:on\s+)?$RC{date_time})/cg) {
586	5	50				32	$self->diagnose("Got date_time: $1\n") if $debug >= 2;
587	5					8	$last_section = 'date_time';
588
589							# Eugh. This is horrible. Maybe I should have used
590							# Parse::RecDescent after all ...
591
592	5					11	@{$parsed{date_time}}{qw/whole date_time week_day day_of_year rest/}
	5					44
593							= ($1, $2, $3, $4, $9);
594
595	5	50				56	if (" $parsed{date_time}{day_of_year}" =~ $RC{year_day1}) {
		0
596	5					8	@{$parsed{date_time}}{qw/month_day month/} = ($1, $2);
	5					23
597							}
598							elsif (" $parsed{date_time}{day_of_year}" =~ $RC{year_day2}) {
599	0					0	@{$parsed{date_time}}{qw/month month_day/} = ($1, $2);
	0					0
600							}
601							else {
602	0					0	$self->diagnose("Couldn't parse day_of_year: <$parsed{date_time}{day_of_year}>");
603	0					0	$parsed{parse_failed}++;
604							}
605
606	5	100				179	if ($parsed{date_time}{rest} =~ $RC{date_time1}) {
		100
		50
607	1					4	@{$parsed{date_time}}{qw/hms hour minute second year/}
	1					9
608							= ($1, $2, $3, $4, $5);
609	1	50				7	$parsed{date_time}{zone} = $6 if defined $6;
610							}
611							elsif ($parsed{date_time}{rest} =~ $RC{date_time2}) {
612	1					3	@{$parsed{date_time}}{qw/hms hour minute second zone year/}
	1					29
613							= ($1, $2, $3, $4, $5, $10);
614							}
615							elsif ($parsed{date_time}{rest} =~ $RC{date_time3}) {
616	3					13	@{$parsed{date_time}}{qw/year hms hour minute second/}
	3					29
617							= ($1, $2, $3, $4, $5);
618	3	50				20	$parsed{date_time}{zone} = $6 if defined $6;
619							}
620							else {
621	0					0	$self->diagnose("Couldn't parse rest of date_time: <$parsed{date_time}{rest}>");
622	0					0	$parsed{parse_failed}++;
623							}
624
625	5					21	%expecting = (after_date_time => 1);
626	5					19	next TOKEN;
627							}
628
629	11	50	66			59	if ($expecting{after_date_time} and /\G((mail.from\|env.from).+)/cg) {
630	0	0				0	$self->diagnose("Got random crap after date: $1\n") if $debug >= 3;
631	0					0	$parsed{after_date_time} = $1;
632	0					0	next TOKEN;
633							}
634
635							# Reluctantly allow semi-colons in random places
636	11	100				41	if (/\G(;\s+)/cg) {
637	5	50				11	$self->diagnose("Got semi-colon: <$1>\n") if $debug >= 7;
638	5					17	next TOKEN;
639							}
640
641	6		100			88	my $old_pos = pos() \|\| 0;
642	6					19	my @start = ($old_pos - 35, $old_pos);
643	6	100				17	$start[0] = 0 if $start[0] < 0;
644	6					10	my $length = $old_pos - $start[0];
645	6	100				19	if (/\G(.{1,35})/cg) {
646	1	50				12	$self->diagnose("** Ran out of things to match at position $old_pos:\n",
647							substr($_, $start[0], $length), "<<<\n",
648							' ' x ($length - 3), ">>>$1\n\n")
649							if $debug >= 1;
650	1					3	$parsed{parse_failed}++;
651							}
652	6					26	last TOKEN;
653							}
654							}
655
656	6					19	$self->{parse_tree} = \%parsed;
657
658	6		66			45	my $failed = $parsed{parse_failed} && ! $parsed{allow_parse_fail};
659	6	100				18	$self->{parsed_ok} = $failed ? 0 : 1;
660	6					31	return $self;
661							}
662
663							##
664
665							=item * B
666
667							if ($received->parsed_ok()) {
668							...
669							}
670
671							Returns true if the parse succeed, or if it failed, but was permitted
672							to fail for some reason, such as encountering evidence of a known
673							broken (non-RFC822-compliant) format mid-parse.
674
675							=cut
676
677							sub parsed_ok {
678	6			6	1	17	my $self = shift;
679	6	50				19	croak "Header not parsed yet" unless $self->{parse_tree};
680	6					33	return $self->{parsed_ok};
681							}
682
683							##
684
685							=item * B
686
687							my $parse_tree = $received->parse_tree();
688
689							Returns the actual parse tree, which is where you get all the useful
690							information. It is returned as a hashref whose keys are strings like
691							`from', `by', `with', `id', `via' etc., corresponding to the
692							components of Received headers as defined by RFC822:
693
694							received = "Received" ":" ; one per relay
695							["from" domain] ; sending host
696							["by" domain] ; receiving host
697							["via" atom] ; physical path
698							*("with" atom) ; link/mail protocol
699							["id" msg-id] ; receiver msg id
700							["for" addr-spec] ; initial form
701							";" date-time ; time received
702
703							The corresponding values are more hashrefs which are mini-parse-trees
704							for these individual components. A typical parse tree looks something
705							like:
706
707							{
708							'by' => {
709							'domain' => 'host5.hostingcheck.com',
710							'whole' => 'by host5.hostingcheck.com',
711							'comments' => [
712							'(8.9.3/8.9.3)'
713							],
714							},
715							'date_time' => {
716							'year' => 2000,
717							'week_day' => 'Tue',
718							'minute' => 57,
719							'day_of_year' => '1 Feb',
720							'month_day' => ' 1',
721							'zone' => '-0500',
722							'second' => 18,
723							'hms' => '21:57:18',
724							'date_time' => 'Tue, 1 Feb 2000 21:57:18 -0500',
725							'hour' => 21,
726							'month' => 'Feb',
727							'rest' => '2000 21:57:18 -0500',
728							'whole' => 'Tue, 1 Feb 2000 21:57:18 -0500'
729							},
730							'with' => {
731							'with' => 'ESMTP',
732							'whole' => 'with ESMTP'
733							},
734							'from' => {
735							'domain' => 'mediacons.tecc.co.uk',
736							'HELO' => 'tr909.mediaconsult.com',
737							'from' => 'tr909.mediaconsult.com',
738							'address' => '193.128.6.132',
739							'comments' => [
740							'(mediacons.tecc.co.uk [193.128.6.132])',
741							],
742							'whole' => 'from tr909.mediaconsult.com (mediacons.tecc.co.uk [193.128.6.132])
743							'
744							},
745							'id' => {
746							'id' => 'VAA24164',
747							'whole' => 'id VAA24164'
748							},
749							'comments' => [
750							'(mediacons.tecc.co.uk [193.128.6.132])',
751							'(8.9.3/8.9.3)'
752							],
753							'for' => {
754							'for' => '',
755							'whole' => 'for '
756							},
757							'whole' => 'from tr909.mediaconsult.com (mediacons.tecc.co.uk [193.128.6.132]) by host5.hostingcheck.com (8.9.3/8.9.3) with ESMTP id VAA24164 for ; Tue, 1 Feb 2000 21:57:18 -0500'
758							}
759
760							=cut
761
762							sub parse_tree {
763	5			5	1	12	my $self = shift;
764	5	50				23	croak "Header not parsed yet" unless $self->{parse_tree};
765	5					26	return $self->{parse_tree};
766							}
767
768							=back
769
770							=head1 BUGS
771
772							Doesn't use Parse::RecDescent, which it maybe should.
773
774							Doesn't offer a `strict RFC822' parsing mode. To implement that would
775							be a royal pain in the arse, unless we move to Parse::RecDescent.
776
777							=head1 SEE ALSO
778
779							L, L
780
781							=head1 AUTHOR
782
783							Adam Spiers
784
785							=head1 LICENSE
786
787							All rights reserved. This program is free software; you can redistribute
788							it and/or modify it under the same terms as Perl itself.
789
790							=cut