File Coverage

blib/lib/IMAP/BodyStructure.pm
Criterion Covered Total %
statement 172 181 95.0
branch 62 72 86.1
condition 15 24 62.5
subroutine 26 28 92.8
pod 8 8 100.0
total 283 313 90.4


line stmt bran cond sub pod time code
1             package IMAP::BodyStructure;
2 1     1   23413 use strict;
  1         2  
  1         28  
3              
4             # $Id: BodyStructure.pm,v 1.17 2006/05/02 16:56:36 kappa Exp $
5              
6             =head1 NAME
7              
8             IMAP::BodyStructure - IMAP4-compatible BODYSTRUCTURE and ENVELOPE parser
9              
10             =head1 SYNOPSIS
11            
12             use IMAP::BodyStructure;
13              
14             # $imap is a low-level IMAP-client with an ability to fetch items
15             # by message uids
16              
17             my $bs = new IMAP::BodyStructure
18             $imap->imap_fetch($msg_uid,
19             'BODYSTRUCTURE', 1)->[0]->{BODYSTRUCTURE};
20              
21             print "[UID:$msg_uid] message is in Russian. Sure.\n"
22             if $bs->charset =~ /(?:koi8-r|windows-1251)/i;
23              
24             my $part = $bs->part_at('1.3');
25             $part->type =~ m#^image/#
26             and print "The 3rd part is an image named \""
27             . $part->filename . "\"\n";
28              
29             =head1 DESCRIPTION
30              
31             An IMAP4-compatible IMAP server MUST include a full MIME-parser which
32             parses the messages inside IMAP mailboxes and is accessible via
33             BODYSTRUCTURE fetch item. This module provides a Perl interface to
34             parse the output of IMAP4 MIME-parser. Hope no one will have problems
35             with parsing this doc.
36              
37             It is a rather straightforward C-style parser and is
38             therefore much, much faster then the venerable L
39             which is based on a L grammar. I believe it is also
40             more correct when parsing nested multipart C parts. See
41             testsuite if interested.
42              
43             I'd also like to emphasize that I
44             client!> You will need to employ one from CPAN, there are many. A
45             section with examples of getting to a BODYSTRUCTURE fetch item with
46             various Perl IMAP clients available on CPAN would greatly
47             enhance this document.
48              
49             =head1 INTERFACE
50              
51             =cut
52              
53 1     1   19 use 5.005;
  1         3  
54              
55 1     1   4 use vars qw/$VERSION/;
  1         6  
  1         105  
56              
57             $VERSION = '1.02';
58              
59             sub _get_envelope($\$);
60             sub _get_bodystructure(\$;$$);
61             sub _get_npairs(\$);
62             sub _get_ndisp(\$);
63             sub _get_nstring(\$);
64              
65             =head2 METHODS
66              
67             =over 4
68              
69             =item new($)
70              
71             The constructor does most of the work here. It initializes the
72             hierarchial data structure representing all the message parts and their
73             properties. It takes one argument which should be a string returned
74             by IMAP server in reply to a FETCH command with BODYSTRUCTURE item.
75              
76             All the parts on all the levels are represented by IMAP::BodyStructure
77             objects and that enables the uniform access to them. It is a direct
78             implementation of the Composite Design Pattern.
79              
80             =cut
81              
82 1         14 use fields qw/type encoding size disp params parts desc bodystructure
83 1     1   205764 part_id cid textlines md5 lang loc envelope/;
  1         3337  
84              
85             sub new {
86 13     13 1 630 my $class = shift;
87 13   33     85 $class = ref $class || $class;
88 13         25 my $imap_str= shift;
89              
90 13         42 return _get_bodystructure($imap_str, $class);
91             }
92              
93             =item type()
94              
95             Returns the MIME type of the part. Expect something like C
96             or C.
97              
98             =item encoding()
99              
100             Returns the MIME encoding of the part. This is usually one of '7bit',
101             '8bit', 'base64' or 'quoted-printable'.
102              
103             =item size()
104              
105             Returns the size of the part in octets. It is I the size of the
106             data in the part, which may be encoded as quoted-printable leaving us
107             without an obvious method of calculating the exact size of original
108             data.
109              
110             =cut
111              
112             for my $field (qw/type encoding size/) {
113 1     1   384 no strict 'refs';
  1         3  
  1         3720  
114 123     123   474 *$field = sub { return $_[0]->{$field} };
115             }
116              
117             =item disp()
118              
119             Returns the content-disposition of the part. One of 'inline' or
120             'attachment', usually. Defaults to inline, but you should remember
121             that if there IS a disposition but you cannot recognize it than act as
122             if it's 'attachment'. And use case-insensitive comparisons.
123              
124             =cut
125              
126             sub disp {
127 2     2 1 2 my $self = shift;
128              
129 2 50 50     15 return $self->{disp} ? $self->{disp}->[0] || 'inline' : 'inline';
130             }
131              
132             =item charset()
133              
134             Returns the charset of the part OR the charset of the first nested
135             part. This looks like a good heuristic really. Charset is something
136             resembling 'UTF-8', 'US-ASCII', 'ISO-8859-13' or 'KOI8-R'. The standard
137             does not say it should be uppercase, by the way.
138              
139             Can be undefined.
140              
141             =cut
142              
143             sub charset {
144 2     2 1 514 my $self = shift;
145              
146             # get charset from params OR dive into the first part
147             return $self->{params}->{charset}
148 2   0     15 || ($self->{parts} && @{$self->{parts}} && $self->{parts}->[0]->charset)
149             || undef; # please oh please, no '' or '0' charsets
150             }
151              
152             =item filename()
153              
154             Returns the filename specified as a part of Content-Disposition
155             header.
156              
157             Can be undefined.
158              
159             =cut
160              
161             sub filename {
162 2     2 1 4 my $self = shift;
163              
164 2         10 return $self->{disp}->[1]->{filename};
165             }
166              
167             =item description()
168              
169             Returns the description of the part.
170              
171             =cut
172              
173             sub description {
174 0     0 1 0 my $self = shift;
175              
176 0         0 return $self->{desc};
177             }
178              
179             =item parts(;$)
180              
181             This sub acts differently depending on whether you pass it an
182             argument or not.
183              
184             Without any arguments it returns a list of parts in list context and
185             the number in scalar context.
186              
187             Specifying a scalar argument allows you to get an individual part with
188             that index.
189              
190             I
191             etc. but IMAP::BodyStructure objects containing information about the
192             message parts which was extracted from parsing BODYSTRUCTURE IMAP
193             response!>
194              
195             =cut
196              
197             sub parts {
198 14     14 1 23 my $self = shift;
199 14         19 my $arg = shift;
200              
201 14 100       28 if (defined $arg) {
202 13         55 return $self->{parts}->[$arg];
203             } else {
204 1 50       3 return wantarray ? @{$self->{parts}} : scalar @{$self->{parts}};
  0         0  
  1         5  
205             }
206             }
207              
208             =item part_at($)
209              
210             This method returns a message part by its path. A path to a part in
211             the hierarchy is a dot-separated string of part indices. See L for
212             an example. A nested C does not add a hierarchy level
213             UNLESS it is a single part of another C part (with no
214             C levels in between). Instead, it has an additional
215             C<.TEXT> part which refers to the internal IMAP::BodyStructure object.
216             Look, here is an outline of an example message structure with part
217             paths alongside each part.
218              
219             multipart/mixed 1
220             text/plain 1.1
221             application/msword 1.2
222             message/rfc822 1.3
223             multipart/alternative 1.3.TEXT
224             text/plain 1.3.1
225             multipart/related 1.3.2
226             text/html 1.3.2.1
227             image/png 1.3.2.2
228             image/png 1.3.2.3
229              
230             This is a text email with two attachments, one being an MS Word document,
231             and the other is itself a message (probably a forward) which is composed in a
232             graphical MUA and contains two alternative representations, one
233             plain text fallback and one HTML with images (bundled as a
234             C).
235              
236             Another one with several levels of C. This one is hard
237             to compose in a modern MUA, however.
238              
239             multipart/mixed 1
240             text/plain 1.1
241             message/rfc822 1.2
242             message/rfc822 1.2.TEXT
243             text/plain 1.2.1
244              
245             =cut
246              
247             sub part_at {
248 33     33 1 3632 my $self = shift;
249 33         52 my $path = shift;
250              
251 33         115 return $self->_part_at(split /\./, $path);
252             }
253              
254             sub _part_at {
255 77     77   100 my $self = shift;
256 77         378 my @parts = @_;
257              
258 77 100       240 return $self unless @parts; # (cond ((null? l) s)
259              
260 60         86 my $part_num = shift @parts; # (car l)
261              
262 60 100       167 if ($self->type =~ /^multipart\//) {
    100          
263 33 100       87 if (exists $self->{parts}->[$part_num - 1]) {
264 27         74 return $self->{parts}->[$part_num - 1]->_part_at(@parts);
265             } else {
266 6         31 return;
267             }
268             } elsif ($self->type eq 'message/rfc822') {
269 22 100       61 return $self->{bodystructure} if $part_num eq 'TEXT';
270              
271 18 100       34 if ($self->{bodystructure}->type =~ m{^ multipart/ | ^ message/rfc822 \z}xms) {
272 12         28 return $self->{bodystructure}->_part_at($part_num, @parts);
273             } else {
274 6 100       23 return $part_num == 1 ? $self->{bodystructure}->_part_at(@parts) : undef;
275             }
276             } else {
277             # there's no included parts in single non-rfc822 parts
278             # so if you still want one you get undef
279 5 100 66     55 if ($part_num && $part_num ne '1' || @parts) {
      100        
280 3         26 return;
281             } else {
282 2         17 return $self;
283             }
284             }
285             }
286              
287             =item part_path()
288              
289             Returns the part path to the current part.
290              
291             =back
292              
293             =head2 DATA MEMBERS
294              
295             These are additional pieces of information returned by IMAP server and
296             parsed. They are rarely used, though (and rarely defined too, btw), so
297             I chose not to provide access methods for them.
298              
299             =over 4
300              
301             =item params
302              
303             This is a hashref of MIME parameters. The only interesting param is
304             charset and there's a shortcut method for it.
305              
306             =item lang
307              
308             Content language.
309              
310             =item loc
311              
312             Content location.
313              
314             =item cid
315              
316             Content ID.
317              
318             =item md5
319              
320             Content MD5. No one seems to bother with calculating and it is usually
321             undefined.
322              
323             =back
324              
325             B and B members exist only in singlepart parts.
326              
327             =cut
328              
329             sub part_path {
330 0     0 1 0 my $self = shift;
331              
332 0         0 return $self->{part_id};
333             }
334              
335             sub _get_envelope($\$) {
336 7     7   515 eval "$_[0]::Envelope->new(\$_[1])";
337             }
338              
339             sub _get_bodystructure(\$;$$) {
340 66     66   77 my $str = shift;
341 66   50     130 my $class = shift || __PACKAGE__;
342 66         76 my $id = shift;
343              
344 66         161 my __PACKAGE__ $bs = fields::new($class);
345 66   100     15534 $bs->{part_id} = $id || 1; # !defined $id --> top-level message
346             # and single-part has one part with part_id 1
347              
348 66 100       152 my $id_prefix = $id ? "$id." : '';
349              
350 66 100       369 $$str =~ m/\G\s*(?:\(BODYSTRUCTURE\s*)?\(/gc
351             or return 0;
352              
353 53         122 $bs->{parts} = [];
354 53 100       157 if ($$str =~ /\G(?=\()/gc) {
355             # multipart
356 13         24 $bs->{type} = 'multipart/';
357 13         18 my $part_id = 1;
358 13         22 $id_prefix =~ s/\.?TEXT//;
359 13         77 while (my $part_bs = _get_bodystructure($$str, $class, $id_prefix . $part_id++)) {
360 33         34 push @{$bs->{parts}}, $part_bs;
  33         149  
361             }
362              
363 13         36 $bs->{type} .= lc(_get_nstring($$str));
364 13         26 $bs->{params} = _get_npairs($$str);
365 13         26 $bs->{disp} = _get_ndisp($$str);
366 13         27 $bs->{lang} = _get_nstring($$str);
367 13         27 $bs->{loc} = _get_nstring($$str);
368             } else {
369 40         93 $bs->{type} = lc (_get_nstring($$str) . '/' . _get_nstring($$str));
370 40         143 $bs->{params} = _get_npairs($$str);
371 40         84 $bs->{cid} = _get_nstring($$str);
372 40         104 $bs->{desc} = _get_nstring($$str);
373 40         82 $bs->{encoding} = _get_nstring($$str);
374 40         114 $bs->{size} = _get_nstring($$str);
375              
376 40 100       198 if ($bs->{type} eq 'message/rfc822') {
    100          
377 7         24 $bs->{envelope} = _get_envelope($class, $$str);
378 7 100       31 if ($id_prefix =~ s/\.?TEXT//) {
379 1         4 $bs->{bodystructure} = _get_bodystructure($$str, $class, $id_prefix . '1');
380             } else {
381 6         26 $bs->{bodystructure} = _get_bodystructure($$str, $class, $id_prefix . 'TEXT');
382             }
383 7         16 $bs->{textlines} = _get_nstring($$str);
384             } elsif ($bs->{type} =~ /^text\//) {
385 23         45 $bs->{textlines} = _get_nstring($$str);
386             }
387              
388 40         88 $bs->{md5} = _get_nstring($$str);
389 40         86 $bs->{disp} = _get_ndisp($$str);
390 40         73 $bs->{lang} = _get_nstring($$str);
391 40         68 $bs->{loc} = _get_nstring($$str);
392             }
393              
394 53         122 $$str =~ m/\G\s*\)/gc;
395              
396 53         198 return $bs;
397             }
398              
399             sub _get_ndisp(\$) {
400 53     53   59 my $str = shift;
401              
402 53         92 $$str =~ /\G\s+/gc;
403              
404 53 100       175 if ($$str =~ /\GNIL/gc) {
    50          
405 23         66 return undef;
406             } elsif ($$str =~ m/\G\s*\(/gc) {
407 30         33 my @disp;
408              
409 30         48 $disp[0] = _get_nstring($$str);
410 30         114 $disp[1] = _get_npairs($$str);
411              
412 30         63 $$str =~ m/\G\s*\)/gc;
413 30         66 return \@disp;
414             }
415            
416 0         0 return 0;
417             }
418              
419             sub _get_npairs(\$) {
420 83     83   109 my $str = shift;
421              
422 83         159 $$str =~ /\G\s+/gc;
423              
424 83 100       297 if ($$str =~ /\GNIL/gc) {
    50          
425 27         41 return undef;
426             } elsif ($$str =~ m/\G\s*\(/gc) {
427 56         64 my %r;
428 56         58 while ('fareva') {
429 114         212 my ($key, $data) = (_get_nstring($$str), _get_nstring($$str));
430 114 100       274 $key or last;
431              
432 58         169 $r{$key} = $data;
433             }
434              
435 56         126 $$str =~ m/\G\s*\)/gc;
436 56         136 return \%r;
437             }
438            
439 0         0 return 0;
440             }
441              
442             sub _get_nstring(\$) {
443 789     789   7109 my $str = $_[0];
444              
445             # nstring = string / nil
446             # nil = "NIL"
447             # string = quoted / literal
448             # quoted = DQUOTE *QUOTED-CHAR DQUOTE
449             # QUOTED-CHAR = /
450             # "\" quoted-specials
451             # quoted-specials = DQUOTE / "\"
452             # literal = "{" number "}" CRLF *CHAR8
453             # ; Number represents the number of CHAR8s
454              
455             # astring = 1*(any CHAR except "(" / ")" / "{" / SP / CTL / list-wildcards / quoted-specials)
456              
457 789         1354 $$str =~ /\G\s+/gc;
458              
459 789 100       3267 if ($$str =~ /\GNIL/gc) {
    100          
    100          
    100          
460 208         387 return undef;
461             } elsif ($$str =~ m/\G(\"(?>[^\\\"]*(?:\\.[^\\\"]*)*)\")/gc) { # delimited re ala Regexp::Common::delimited + (?>...)
462 340         556 return _unescape($1);
463             } elsif ($$str =~ /\G\{(\d+)\}\r\n/gc) {
464 5         13 my $pos = pos($$str);
465 5         51 my $data = substr $$str, $pos, $1;
466 5         14 pos($$str) = $pos + $1;
467 5         14 return $data;
468             } elsif ($$str =~ /\G([^"\(\)\{ \%\*\"\\\x00-\x1F]+)/gc) {
469 71         216 return $1;
470             }
471              
472 165         308 return 0;
473             }
474              
475             sub _unescape {
476 340     340   632 my $str = shift;
477              
478 340         897 $str =~ s/^"//;
479 340         850 $str =~ s/"$//;
480 340         443 $str =~ s/\\\"/\"/g;
481 340         425 $str =~ s/\\\\/\\/g;
482              
483 340         976 return $str;
484             }
485              
486             =over 4
487              
488             =item get_enveleope($)
489              
490             Parses a string into IMAP::BodyStructure::Envelope object. See below.
491              
492             =back
493              
494             =head2 IMAP::BodyStructure::Envelope CLASS
495              
496             Every message on an IMAP server has an envelope. You can get it
497             using ENVELOPE fetch item or, and this is relevant, from BODYSTRUCTURE
498             response in case there are some nested messages (parts with type of
499             C). So, if we have a part with such a type then the
500             corresponding IMAP::BodyStructure object always has
501             B data member which is, in turn, an object of
502             IMAP::BodyStructure::Envelope.
503              
504             You can of course use this satellite class on its own, this is very
505             useful when generating meaningful message lists in IMAP folders.
506              
507             =cut
508              
509             package IMAP::BodyStructure::Envelope;
510              
511             sub _get_nstring(\$); # proto
512              
513             *_get_nstring = \&IMAP::BodyStructure::_get_nstring;
514              
515             sub _get_naddrlist(\$);
516             sub _get_naddress(\$);
517              
518 1     1   10 use vars qw/@envelope_addrs/;
  1         2  
  1         107  
519             @envelope_addrs = qw/from sender reply_to to cc bcc/;
520              
521             =head2 METHODS
522              
523             =over 4
524              
525             =item new($)
526              
527             The constructor create Envelope object from string which should be an
528             IMAP server respone to a fetch with ENVELOPE item or a substring of
529             BODYSTRUCTURE response for a message with message/rfc822 parts inside.
530              
531             =back
532              
533             =head2 DATA MEMBERS
534              
535             =over 4
536              
537             =item date
538              
539             Date of the message as specified in the envelope. Not the IMAP
540             INTERNALDATE, be careful!
541              
542             =item subject
543              
544             Subject of the message, may be RFC2047 encoded, of course.
545              
546             =item message_id
547              
548             =item in_reply_to
549              
550             Message-IDs of the current message and the message in reply to which
551             this one was composed.
552              
553             =item to, from, cc, bcc, sender, reply_to
554              
555             These are the so called address-lists or just arrays of addresses.
556             Remember, a message may be addressed to lots of people.
557              
558             Each address is a hash of four elements:
559              
560             =over 4
561              
562             =item name
563              
564             The informal part, "A.U.Thor" from "A.U.Thor, "
565              
566             =item sroute
567              
568             Source-routing information, not used. (By the way, IMAP4r1 spec was
569             born after the last email address with sroute passed away.)
570              
571             =item account
572              
573             The part before @.
574              
575             =item host
576              
577             The part after @.
578              
579             =item full
580              
581             The full address for display purposes.
582              
583             =back
584              
585             =back
586              
587             =cut
588              
589 1     1   7 use fields qw/from sender reply_to to cc bcc date subject in_reply_to message_id/;
  1         4  
  1         7  
590              
591             sub new(\$) {
592 7     7   15 my $class = shift;
593 7         12 my $str = shift;
594            
595 7 50       37 $$str =~ m/\G\s*(?:\(ENVELOPE)?\s*\(/gc
596             or return 0;
597              
598 7         18 my __PACKAGE__ $self = fields::new($class);
599              
600 7         535 $self->{'date'} = _get_nstring($$str);
601 7         20 $self->{'subject'} = _get_nstring($$str);
602              
603 7         18 foreach my $header (@envelope_addrs) {
604 42         73 $self->{$header} = _get_naddrlist($$str);
605             }
606              
607 7         13 $self->{'in_reply_to'} = _get_nstring($$str);
608 7         44 $self->{'message_id'} = _get_nstring($$str);
609              
610 7         18 $$str =~ m/\G\s*\)/gc;
611              
612 7         62 return $self;
613             }
614              
615             sub _get_naddress(\$) {
616 32     32   38 my $str = shift;
617              
618 32 50       111 if ($$str =~ /\GNIL/gc) {
    100          
619 0         0 return undef;
620             } elsif ($$str =~ m/\G\s*\(/gc) {
621 16         29 my %addr = (
622             name => _get_nstring($$str),
623             sroute => _get_nstring($$str),
624             account => _get_nstring($$str),
625             host => _get_nstring($$str),
626             );
627             $addr{address} = ($addr{account}
628 16 50 100     97 ? "$addr{account}@" . ($addr{host} || '')
629             : '');
630              
631 16         33 $addr{full} = _format_address($addr{name}, $addr{address});
632              
633 16         43 $$str =~ m/\G\s*\)/gc;
634 16         49 return \%addr;
635             }
636 16         39 return 0;
637             }
638              
639             sub _get_naddrlist(\$) {
640 42     42   47 my $str = shift;
641            
642 42         74 $$str =~ /\G\s+/gc;
643              
644 42 100       111 if ($$str =~ /\GNIL/gc) {
    50          
645 26         63 return undef;
646             } elsif ($$str =~ m/\G\s*\(/gc) {
647 16         24 my @addrs = ();
648 16         30 while (my $addr = _get_naddress($$str)) {
649 16         40 push @addrs, $addr;
650             }
651              
652 16         32 $$str =~ m/\G\s*\)/gc;
653 16         58 return \@addrs;
654             }
655 0         0 return 0;
656             }
657              
658             my $rfc2822_atext = q(a-zA-Z0-9!#$%&'*+/=?^_`{|}~-); # simple non-interpolating string (think apostrophs)
659             my $rfc2822_atom = qr/[$rfc2822_atext]+/; # straight from rfc2822
660              
661 1     1   1175 use constant EMPTY_STR => q{};
  1         3  
  1         386  
662             sub _format_address {
663 16     16   27 my ($phrase, $email) = @_;
664              
665 16 100 66     65 if (defined $phrase && $phrase ne EMPTY_STR) {
666 13 50       34 if ($phrase !~ /^ \s* " [^"]+ " \s* \z/xms) {
667             # $phrase is not already quoted
668              
669 13         20 $phrase =~ s/ (["\\]) /\\$1/xmsg;
670              
671 13 100       222 if ($phrase !~ m/^ \s* $rfc2822_atom (?: \s+ $rfc2822_atom)* \s* \z/xms) {
672 3         10 $phrase = qq{"$phrase"};
673             }
674             }
675              
676 13 50       57 return $email ? "$phrase <$email>" : $phrase;
677             } else {
678 3   50     9 return $email || '';
679             }
680             }
681              
682             1;
683              
684             __END__