File Coverage

blib/lib/IMAP/BodyStructure.pm

Criterion	Covered	Total	%
statement	172	181	95.0
branch	62	72	86.1
condition	15	24	62.5
subroutine	26	28	92.8
pod	8	8	100.0
total	283	313	90.4

line	stmt	bran	cond	sub	pod	time	code
1							package IMAP::BodyStructure;
2	1			1		23413	use strict;
	1					2
	1					28
3
4							# $Id: BodyStructure.pm,v 1.17 2006/05/02 16:56:36 kappa Exp $
5
6							=head1 NAME
7
8							IMAP::BodyStructure - IMAP4-compatible BODYSTRUCTURE and ENVELOPE parser
9
10							=head1 SYNOPSIS
11
12							use IMAP::BodyStructure;
13
14							# $imap is a low-level IMAP-client with an ability to fetch items
15							# by message uids
16
17							my $bs = new IMAP::BodyStructure
18							$imap->imap_fetch($msg_uid,
19							'BODYSTRUCTURE', 1)->[0]->{BODYSTRUCTURE};
20
21							print "[UID:$msg_uid] message is in Russian. Sure.\n"
22							if $bs->charset =~ /(?:koi8-r\|windows-1251)/i;
23
24							my $part = $bs->part_at('1.3');
25							$part->type =~ m#^image/#
26							and print "The 3rd part is an image named \""
27							. $part->filename . "\"\n";
28
29							=head1 DESCRIPTION
30
31							An IMAP4-compatible IMAP server MUST include a full MIME-parser which
32							parses the messages inside IMAP mailboxes and is accessible via
33							BODYSTRUCTURE fetch item. This module provides a Perl interface to
34							parse the output of IMAP4 MIME-parser. Hope no one will have problems
35							with parsing this doc.
36
37							It is a rather straightforward C-style parser and is
38							therefore much, much faster then the venerable L
39							which is based on a L grammar. I believe it is also
40							more correct when parsing nested multipart C parts. See
41							testsuite if interested.
42
43							I'd also like to emphasize that I
44							client!> You will need to employ one from CPAN, there are many. A
45							section with examples of getting to a BODYSTRUCTURE fetch item with
46							various Perl IMAP clients available on CPAN would greatly
47							enhance this document.
48
49							=head1 INTERFACE
50
51							=cut
52
53	1			1		19	use 5.005;
	1					3
54
55	1			1		4	use vars qw/$VERSION/;
	1					6
	1					105
56
57							$VERSION = '1.02';
58
59							sub _get_envelope($\$);
60							sub _get_bodystructure(\$;$$);
61							sub _get_npairs(\$);
62							sub _get_ndisp(\$);
63							sub _get_nstring(\$);
64
65							=head2 METHODS
66
67							=over 4
68
69							=item new($)
70
71							The constructor does most of the work here. It initializes the
72							hierarchial data structure representing all the message parts and their
73							properties. It takes one argument which should be a string returned
74							by IMAP server in reply to a FETCH command with BODYSTRUCTURE item.
75
76							All the parts on all the levels are represented by IMAP::BodyStructure
77							objects and that enables the uniform access to them. It is a direct
78							implementation of the Composite Design Pattern.
79
80							=cut
81
82	1					14	use fields qw/type encoding size disp params parts desc bodystructure
83	1			1		205764	part_id cid textlines md5 lang loc envelope/;
	1					3337
84
85							sub new {
86	13			13	1	630	my $class = shift;
87	13		33			85	$class = ref $class \|\| $class;
88	13					25	my $imap_str= shift;
89
90	13					42	return _get_bodystructure($imap_str, $class);
91							}
92
93							=item type()
94
95							Returns the MIME type of the part. Expect something like C
96							or C.
97
98							=item encoding()
99
100							Returns the MIME encoding of the part. This is usually one of '7bit',
101							'8bit', 'base64' or 'quoted-printable'.
102
103							=item size()
104
105							Returns the size of the part in octets. It is I the size of the
106							data in the part, which may be encoded as quoted-printable leaving us
107							without an obvious method of calculating the exact size of original
108							data.
109
110							=cut
111
112							for my $field (qw/type encoding size/) {
113	1			1		384	no strict 'refs';
	1					3
	1					3720
114	123			123		474	*$field = sub { return $_[0]->{$field} };
115							}
116
117							=item disp()
118
119							Returns the content-disposition of the part. One of 'inline' or
120							'attachment', usually. Defaults to inline, but you should remember
121							that if there IS a disposition but you cannot recognize it than act as
122							if it's 'attachment'. And use case-insensitive comparisons.
123
124							=cut
125
126							sub disp {
127	2			2	1	2	my $self = shift;
128
129	2	50	50			15	return $self->{disp} ? $self->{disp}->[0] \|\| 'inline' : 'inline';
130							}
131
132							=item charset()
133
134							Returns the charset of the part OR the charset of the first nested
135							part. This looks like a good heuristic really. Charset is something
136							resembling 'UTF-8', 'US-ASCII', 'ISO-8859-13' or 'KOI8-R'. The standard
137							does not say it should be uppercase, by the way.
138
139							Can be undefined.
140
141							=cut
142
143							sub charset {
144	2			2	1	514	my $self = shift;
145
146							# get charset from params OR dive into the first part
147							return $self->{params}->{charset}
148	2		0			15	\|\| ($self->{parts} && @{$self->{parts}} && $self->{parts}->[0]->charset)
149							\|\| undef; # please oh please, no '' or '0' charsets
150							}
151
152							=item filename()
153
154							Returns the filename specified as a part of Content-Disposition
155							header.
156
157							Can be undefined.
158
159							=cut
160
161							sub filename {
162	2			2	1	4	my $self = shift;
163
164	2					10	return $self->{disp}->[1]->{filename};
165							}
166
167							=item description()
168
169							Returns the description of the part.
170
171							=cut
172
173							sub description {
174	0			0	1	0	my $self = shift;
175
176	0					0	return $self->{desc};
177							}
178
179							=item parts(;$)
180
181							This sub acts differently depending on whether you pass it an
182							argument or not.
183
184							Without any arguments it returns a list of parts in list context and
185							the number in scalar context.
186
187							Specifying a scalar argument allows you to get an individual part with
188							that index.
189
190							I
191							etc. but IMAP::BodyStructure objects containing information about the
192							message parts which was extracted from parsing BODYSTRUCTURE IMAP
193							response!>
194
195							=cut
196
197							sub parts {
198	14			14	1	23	my $self = shift;
199	14					19	my $arg = shift;
200
201	14	100				28	if (defined $arg) {
202	13					55	return $self->{parts}->[$arg];
203							} else {
204	1	50				3	return wantarray ? @{$self->{parts}} : scalar @{$self->{parts}};
	0					0
	1					5
205							}
206							}
207
208							=item part_at($)
209
210							This method returns a message part by its path. A path to a part in
211							the hierarchy is a dot-separated string of part indices. See L for
212							an example. A nested C does not add a hierarchy level
213							UNLESS it is a single part of another C part (with no
214							C levels in between). Instead, it has an additional
215							C<.TEXT> part which refers to the internal IMAP::BodyStructure object.
216							Look, here is an outline of an example message structure with part
217							paths alongside each part.
218
219							multipart/mixed 1
220							text/plain 1.1
221							application/msword 1.2
222							message/rfc822 1.3
223							multipart/alternative 1.3.TEXT
224							text/plain 1.3.1
225							multipart/related 1.3.2
226							text/html 1.3.2.1
227							image/png 1.3.2.2
228							image/png 1.3.2.3
229
230							This is a text email with two attachments, one being an MS Word document,
231							and the other is itself a message (probably a forward) which is composed in a
232							graphical MUA and contains two alternative representations, one
233							plain text fallback and one HTML with images (bundled as a
234							C).
235
236							Another one with several levels of C. This one is hard
237							to compose in a modern MUA, however.
238
239							multipart/mixed 1
240							text/plain 1.1
241							message/rfc822 1.2
242							message/rfc822 1.2.TEXT
243							text/plain 1.2.1
244
245							=cut
246
247							sub part_at {
248	33			33	1	3632	my $self = shift;
249	33					52	my $path = shift;
250
251	33					115	return $self->_part_at(split /\./, $path);
252							}
253
254							sub _part_at {
255	77			77		100	my $self = shift;
256	77					378	my @parts = @_;
257
258	77	100				240	return $self unless @parts; # (cond ((null? l) s)
259
260	60					86	my $part_num = shift @parts; # (car l)
261
262	60	100				167	if ($self->type =~ /^multipart\//) {
		100
263	33	100				87	if (exists $self->{parts}->[$part_num - 1]) {
264	27					74	return $self->{parts}->[$part_num - 1]->_part_at(@parts);
265							} else {
266	6					31	return;
267							}
268							} elsif ($self->type eq 'message/rfc822') {
269	22	100				61	return $self->{bodystructure} if $part_num eq 'TEXT';
270
271	18	100				34	if ($self->{bodystructure}->type =~ m{^ multipart/ \| ^ message/rfc822 \z}xms) {
272	12					28	return $self->{bodystructure}->_part_at($part_num, @parts);
273							} else {
274	6	100				23	return $part_num == 1 ? $self->{bodystructure}->_part_at(@parts) : undef;
275							}
276							} else {
277							# there's no included parts in single non-rfc822 parts
278							# so if you still want one you get undef
279	5	100	66			55	if ($part_num && $part_num ne '1' \|\| @parts) {
			100
280	3					26	return;
281							} else {
282	2					17	return $self;
283							}
284							}
285							}
286
287							=item part_path()
288
289							Returns the part path to the current part.
290
291							=back
292
293							=head2 DATA MEMBERS
294
295							These are additional pieces of information returned by IMAP server and
296							parsed. They are rarely used, though (and rarely defined too, btw), so
297							I chose not to provide access methods for them.
298
299							=over 4
300
301							=item params
302
303							This is a hashref of MIME parameters. The only interesting param is
304							charset and there's a shortcut method for it.
305
306							=item lang
307
308							Content language.
309
310							=item loc
311
312							Content location.
313
314							=item cid
315
316							Content ID.
317
318							=item md5
319
320							Content MD5. No one seems to bother with calculating and it is usually
321							undefined.
322
323							=back
324
325							B and B members exist only in singlepart parts.
326
327							=cut
328
329							sub part_path {
330	0			0	1	0	my $self = shift;
331
332	0					0	return $self->{part_id};
333							}
334
335							sub _get_envelope($\$) {
336	7			7		515	eval "$_[0]::Envelope->new(\$_[1])";
337							}
338
339							sub _get_bodystructure(\$;$$) {
340	66			66		77	my $str = shift;
341	66		50			130	my $class = shift \|\| __PACKAGE__;
342	66					76	my $id = shift;
343
344	66					161	my __PACKAGE__ $bs = fields::new($class);
345	66		100			15534	$bs->{part_id} = $id \|\| 1; # !defined $id --> top-level message
346							# and single-part has one part with part_id 1
347
348	66	100				152	my $id_prefix = $id ? "$id." : '';
349
350	66	100				369	$$str =~ m/\G\s(?:\(BODYSTRUCTURE\s)?\(/gc
351							or return 0;
352
353	53					122	$bs->{parts} = [];
354	53	100				157	if ($$str =~ /\G(?=\()/gc) {
355							# multipart
356	13					24	$bs->{type} = 'multipart/';
357	13					18	my $part_id = 1;
358	13					22	$id_prefix =~ s/\.?TEXT//;
359	13					77	while (my $part_bs = _get_bodystructure($$str, $class, $id_prefix . $part_id++)) {
360	33					34	push @{$bs->{parts}}, $part_bs;
	33					149
361							}
362
363	13					36	$bs->{type} .= lc(_get_nstring($$str));
364	13					26	$bs->{params} = _get_npairs($$str);
365	13					26	$bs->{disp} = _get_ndisp($$str);
366	13					27	$bs->{lang} = _get_nstring($$str);
367	13					27	$bs->{loc} = _get_nstring($$str);
368							} else {
369	40					93	$bs->{type} = lc (_get_nstring($$str) . '/' . _get_nstring($$str));
370	40					143	$bs->{params} = _get_npairs($$str);
371	40					84	$bs->{cid} = _get_nstring($$str);
372	40					104	$bs->{desc} = _get_nstring($$str);
373	40					82	$bs->{encoding} = _get_nstring($$str);
374	40					114	$bs->{size} = _get_nstring($$str);
375
376	40	100				198	if ($bs->{type} eq 'message/rfc822') {
		100
377	7					24	$bs->{envelope} = _get_envelope($class, $$str);
378	7	100				31	if ($id_prefix =~ s/\.?TEXT//) {
379	1					4	$bs->{bodystructure} = _get_bodystructure($$str, $class, $id_prefix . '1');
380							} else {
381	6					26	$bs->{bodystructure} = _get_bodystructure($$str, $class, $id_prefix . 'TEXT');
382							}
383	7					16	$bs->{textlines} = _get_nstring($$str);
384							} elsif ($bs->{type} =~ /^text\//) {
385	23					45	$bs->{textlines} = _get_nstring($$str);
386							}
387
388	40					88	$bs->{md5} = _get_nstring($$str);
389	40					86	$bs->{disp} = _get_ndisp($$str);
390	40					73	$bs->{lang} = _get_nstring($$str);
391	40					68	$bs->{loc} = _get_nstring($$str);
392							}
393
394	53					122	$$str =~ m/\G\s*\)/gc;
395
396	53					198	return $bs;
397							}
398
399							sub _get_ndisp(\$) {
400	53			53		59	my $str = shift;
401
402	53					92	$$str =~ /\G\s+/gc;
403
404	53	100				175	if ($$str =~ /\GNIL/gc) {
		50
405	23					66	return undef;
406							} elsif ($$str =~ m/\G\s*\(/gc) {
407	30					33	my @disp;
408
409	30					48	$disp[0] = _get_nstring($$str);
410	30					114	$disp[1] = _get_npairs($$str);
411
412	30					63	$$str =~ m/\G\s*\)/gc;
413	30					66	return \@disp;
414							}
415
416	0					0	return 0;
417							}
418
419							sub _get_npairs(\$) {
420	83			83		109	my $str = shift;
421
422	83					159	$$str =~ /\G\s+/gc;
423
424	83	100				297	if ($$str =~ /\GNIL/gc) {
		50
425	27					41	return undef;
426							} elsif ($$str =~ m/\G\s*\(/gc) {
427	56					64	my %r;
428	56					58	while ('fareva') {
429	114					212	my ($key, $data) = (_get_nstring($$str), _get_nstring($$str));
430	114	100				274	$key or last;
431
432	58					169	$r{$key} = $data;
433							}
434
435	56					126	$$str =~ m/\G\s*\)/gc;
436	56					136	return \%r;
437							}
438
439	0					0	return 0;
440							}
441
442							sub _get_nstring(\$) {
443	789			789		7109	my $str = $_[0];
444
445							# nstring = string / nil
446							# nil = "NIL"
447							# string = quoted / literal
448							# quoted = DQUOTE *QUOTED-CHAR DQUOTE
449							# QUOTED-CHAR = /
450							# "\" quoted-specials
451							# quoted-specials = DQUOTE / "\"
452							# literal = "{" number "}" CRLF *CHAR8
453							# ; Number represents the number of CHAR8s
454
455							# astring = 1*(any CHAR except "(" / ")" / "{" / SP / CTL / list-wildcards / quoted-specials)
456
457	789					1354	$$str =~ /\G\s+/gc;
458
459	789	100				3267	if ($$str =~ /\GNIL/gc) {
		100
		100
		100
460	208					387	return undef;
461							} elsif ($$str =~ m/\G(\"(?>[^\\\"](?:\\.[^\\\"])*)\")/gc) { # delimited re ala Regexp::Common::delimited + (?>...)
462	340					556	return _unescape($1);
463							} elsif ($$str =~ /\G\{(\d+)\}\r\n/gc) {
464	5					13	my $pos = pos($$str);
465	5					51	my $data = substr $$str, $pos, $1;
466	5					14	pos($$str) = $pos + $1;
467	5					14	return $data;
468							} elsif ($$str =~ /\G([^"\{ \%\*\"\\\x00-\x1F]+)/gc) {
469	71					216	return $1;
470							}
471
472	165					308	return 0;
473							}
474
475							sub _unescape {
476	340			340		632	my $str = shift;
477
478	340					897	$str =~ s/^"//;
479	340					850	$str =~ s/"$//;
480	340					443	$str =~ s/\\\"/\"/g;
481	340					425	$str =~ s/\\\\/\\/g;
482
483	340					976	return $str;
484							}
485
486							=over 4
487
488							=item get_enveleope($)
489
490							Parses a string into IMAP::BodyStructure::Envelope object. See below.
491
492							=back
493
494							=head2 IMAP::BodyStructure::Envelope CLASS
495
496							Every message on an IMAP server has an envelope. You can get it
497							using ENVELOPE fetch item or, and this is relevant, from BODYSTRUCTURE
498							response in case there are some nested messages (parts with type of
499							C). So, if we have a part with such a type then the
500							corresponding IMAP::BodyStructure object always has
501							B data member which is, in turn, an object of
502							IMAP::BodyStructure::Envelope.
503
504							You can of course use this satellite class on its own, this is very
505							useful when generating meaningful message lists in IMAP folders.
506
507							=cut
508
509							package IMAP::BodyStructure::Envelope;
510
511							sub _get_nstring(\$); # proto
512
513							*_get_nstring = \&IMAP::BodyStructure::_get_nstring;
514
515							sub _get_naddrlist(\$);
516							sub _get_naddress(\$);
517
518	1			1		10	use vars qw/@envelope_addrs/;
	1					2
	1					107
519							@envelope_addrs = qw/from sender reply_to to cc bcc/;
520
521							=head2 METHODS
522
523							=over 4
524
525							=item new($)
526
527							The constructor create Envelope object from string which should be an
528							IMAP server respone to a fetch with ENVELOPE item or a substring of
529							BODYSTRUCTURE response for a message with message/rfc822 parts inside.
530
531							=back
532
533							=head2 DATA MEMBERS
534
535							=over 4
536
537							=item date
538
539							Date of the message as specified in the envelope. Not the IMAP
540							INTERNALDATE, be careful!
541
542							=item subject
543
544							Subject of the message, may be RFC2047 encoded, of course.
545
546							=item message_id
547
548							=item in_reply_to
549
550							Message-IDs of the current message and the message in reply to which
551							this one was composed.
552
553							=item to, from, cc, bcc, sender, reply_to
554
555							These are the so called address-lists or just arrays of addresses.
556							Remember, a message may be addressed to lots of people.
557
558							Each address is a hash of four elements:
559
560							=over 4
561
562							=item name
563
564							The informal part, "A.U.Thor" from "A.U.Thor, "
565
566							=item sroute
567
568							Source-routing information, not used. (By the way, IMAP4r1 spec was
569							born after the last email address with sroute passed away.)
570
571							=item account
572
573							The part before @.
574
575							=item host
576
577							The part after @.
578
579							=item full
580
581							The full address for display purposes.
582
583							=back
584
585							=back
586
587							=cut
588
589	1			1		7	use fields qw/from sender reply_to to cc bcc date subject in_reply_to message_id/;
	1					4
	1					7
590
591							sub new(\$) {
592	7			7		15	my $class = shift;
593	7					12	my $str = shift;
594
595	7	50				37	$$str =~ m/\G\s(?:\(ENVELOPE)?\s\(/gc
596							or return 0;
597
598	7					18	my __PACKAGE__ $self = fields::new($class);
599
600	7					535	$self->{'date'} = _get_nstring($$str);
601	7					20	$self->{'subject'} = _get_nstring($$str);
602
603	7					18	foreach my $header (@envelope_addrs) {
604	42					73	$self->{$header} = _get_naddrlist($$str);
605							}
606
607	7					13	$self->{'in_reply_to'} = _get_nstring($$str);
608	7					44	$self->{'message_id'} = _get_nstring($$str);
609
610	7					18	$$str =~ m/\G\s*\)/gc;
611
612	7					62	return $self;
613							}
614
615							sub _get_naddress(\$) {
616	32			32		38	my $str = shift;
617
618	32	50				111	if ($$str =~ /\GNIL/gc) {
		100
619	0					0	return undef;
620							} elsif ($$str =~ m/\G\s*\(/gc) {
621	16					29	my %addr = (
622							name => _get_nstring($$str),
623							sroute => _get_nstring($$str),
624							account => _get_nstring($$str),
625							host => _get_nstring($$str),
626							);
627							$addr{address} = ($addr{account}
628	16	50	100			97	? "$addr{account}@" . ($addr{host} \|\| '')
629							: '');
630
631	16					33	$addr{full} = _format_address($addr{name}, $addr{address});
632
633	16					43	$$str =~ m/\G\s*\)/gc;
634	16					49	return \%addr;
635							}
636	16					39	return 0;
637							}
638
639							sub _get_naddrlist(\$) {
640	42			42		47	my $str = shift;
641
642	42					74	$$str =~ /\G\s+/gc;
643
644	42	100				111	if ($$str =~ /\GNIL/gc) {
		50
645	26					63	return undef;
646							} elsif ($$str =~ m/\G\s*\(/gc) {
647	16					24	my @addrs = ();
648	16					30	while (my $addr = _get_naddress($$str)) {
649	16					40	push @addrs, $addr;
650							}
651
652	16					32	$$str =~ m/\G\s*\)/gc;
653	16					58	return \@addrs;
654							}
655	0					0	return 0;
656							}
657
658							my $rfc2822_atext = q(a-zA-Z0-9!#$%&'*+/=?^_`{\|}~-); # simple non-interpolating string (think apostrophs)
659							my $rfc2822_atom = qr/[$rfc2822_atext]+/; # straight from rfc2822
660
661	1			1		1175	use constant EMPTY_STR => q{};
	1					3
	1					386
662							sub _format_address {
663	16			16		27	my ($phrase, $email) = @_;
664
665	16	100	66			65	if (defined $phrase && $phrase ne EMPTY_STR) {
666	13	50				34	if ($phrase !~ /^ \s* " [^"]+ " \s* \z/xms) {
667							# $phrase is not already quoted
668
669	13					20	$phrase =~ s/ (["\\]) /\\$1/xmsg;
670
671	13	100				222	if ($phrase !~ m/^ \s* $rfc2822_atom (?: \s+ $rfc2822_atom)* \s* \z/xms) {
672	3					10	$phrase = qq{"$phrase"};
673							}
674							}
675
676	13	50				57	return $email ? "$phrase <$email>" : $phrase;
677							} else {
678	3		50			9	return $email \|\| '';
679							}
680							}
681
682							1;
683
684							__END__