File Coverage

blib/lib/Email/MIME/ContentType.pm
Criterion Covered Total %
statement 212 241 87.9
branch 109 158 68.9
condition 30 48 62.5
subroutine 17 17 100.0
pod 4 4 100.0
total 372 468 79.4


line stmt bran cond sub pod time code
1 4     4   327540 use strict;
  4         41  
  4         131  
2 4     4   22 use warnings;
  4         6  
  4         214  
3             package Email::MIME::ContentType;
4             # ABSTRACT: Parse and build a MIME Content-Type or Content-Disposition Header
5             $Email::MIME::ContentType::VERSION = '1.024';
6 4     4   181 use Carp;
  4         17  
  4         309  
7 4     4   2218 use Encode 2.87 qw(encode find_mime_encoding);
  4         55374  
  4         566  
8 4     4   39 use Exporter 5.57 'import';
  4         56  
  4         159  
9 4     4   2024 use Text::Unidecode;
  4         10221  
  4         6283  
10              
11             our @EXPORT = qw(parse_content_type parse_content_disposition build_content_type build_content_disposition);
12              
13             #pod =head1 SYNOPSIS
14             #pod
15             #pod use Email::MIME::ContentType;
16             #pod
17             #pod # Content-Type: text/plain; charset="us-ascii"; format=flowed
18             #pod my $ct = 'text/plain; charset="us-ascii"; format=flowed';
19             #pod my $data = parse_content_type($ct);
20             #pod
21             #pod $data = {
22             #pod type => "text",
23             #pod subtype => "plain",
24             #pod attributes => {
25             #pod charset => "us-ascii",
26             #pod format => "flowed"
27             #pod }
28             #pod };
29             #pod
30             #pod my $ct_new = build_content_type($data);
31             #pod # text/plain; charset=us-ascii; format=flowed
32             #pod
33             #pod
34             #pod # Content-Type: application/x-stuff;
35             #pod # title*0*=us-ascii'en'This%20is%20even%20more%20;
36             #pod # title*1*=%2A%2A%2Afun%2A%2A%2A%20;
37             #pod # title*2="isn't it!"
38             #pod my $ct = q(application/x-stuff;
39             #pod title*0*=us-ascii'en'This%20is%20even%20more%20;
40             #pod title*1*=%2A%2A%2Afun%2A%2A%2A%20;
41             #pod title*2="isn't it!");
42             #pod my $data = parse_content_type($ct);
43             #pod
44             #pod $data = {
45             #pod type => "application",
46             #pod subtype => "x-stuff",
47             #pod attributes => {
48             #pod title => "This is even more ***fun*** isn't it!"
49             #pod }
50             #pod };
51             #pod
52             #pod
53             #pod # Content-Disposition: attachment; filename=genome.jpeg;
54             #pod # modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
55             #pod my $cd = q(attachment; filename=genome.jpeg;
56             #pod modification-date="Wed, 12 Feb 1997 16:29:51 -0500");
57             #pod my $data = parse_content_disposition($cd);
58             #pod
59             #pod $data = {
60             #pod type => "attachment",
61             #pod attributes => {
62             #pod filename => "genome.jpeg",
63             #pod "modification-date" => "Wed, 12 Feb 1997 16:29:51 -0500"
64             #pod }
65             #pod };
66             #pod
67             #pod my $cd_new = build_content_disposition($data);
68             #pod # attachment; filename=genome.jpeg; modification-date="Wed, 12 Feb 1997 16:29:51 -0500"
69             #pod
70             #pod =cut
71              
72             our $STRICT_PARAMS = 1;
73              
74             my $ct_default = 'text/plain; charset=us-ascii';
75              
76             my $re_token = qr/[\x21\x23-\x27\x2A\x2B\x2D\x2E\x30-\x39\x41-\x5A\x5E-\x7E]+/; # US-ASCII except SPACE, CTLs and tspecials ()<>@,;:\\"/[]?=
77             my $re_token_non_strict = qr/([\x00-\x08\x0B\x0C\x0E-\x1F\x7E-\xFF]+|$re_token)/; # allow CTLs and above ASCII
78              
79             my $re_qtext = qr/[\x01-\x08\x0B\x0C\x0E-\x1F\x21\x23-\x5B\x5D-\x7E\x7F]/; # US-ASCII except CR, LF, white space, backslash and quote
80             my $re_quoted_pair = qr/\\[\x00-\x7F]/;
81             my $re_quoted_string = qr/"((?:[ \t]*(?:$re_qtext|$re_quoted_pair))*[ \t]*)"/;
82              
83             my $re_qtext_non_strict = qr/[\x80-\xFF]|$re_qtext/;
84             my $re_quoted_pair_non_strict = qr/\\[\x00-\xFF]/;
85             my $re_quoted_string_non_strict = qr/"((?:[ \t]*(?:$re_qtext_non_strict|$re_quoted_pair_non_strict))*[ \t]*)"/;
86              
87             my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
88             my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
89             my $re_exvalue = qr/($re_charset)?'(?:$re_language)?'(.*)/;
90              
91             sub parse_content_type {
92 83     83 1 71051 my $ct = shift;
93              
94             # If the header isn't there or is empty, give default answer.
95 83 100 66     1357 return parse_content_type($ct_default) unless defined $ct and length $ct;
96              
97 81         721 _unfold_lines($ct);
98 81         173 _clean_comments($ct);
99              
100             # It is also recommend (sic.) that this default be assumed when a
101             # syntactically invalid Content-Type header field is encountered.
102 81 50       762 unless ($ct =~ s/^($re_token)\/($re_token)//) {
103 0 0 0     0 unless ($STRICT_PARAMS and $ct =~ s/^($re_token_non_strict)\/($re_token_non_strict)//) {
104 0         0 carp "Invalid Content-Type '$ct'";
105 0         0 return parse_content_type($ct_default);
106             }
107             }
108              
109 81         327 my ($type, $subtype) = (lc $1, lc $2);
110              
111 81         158 _clean_comments($ct);
112 81         220 $ct =~ s/\s+$//;
113              
114 81         128 my $attributes = {};
115 81 50 100     343 if ($STRICT_PARAMS and length $ct and $ct !~ /^;/) {
      66        
116 0         0 carp "Missing semicolon before first Content-Type parameter '$ct'";
117             } else {
118 81         213 $attributes = _process_rfc2231(_parse_attributes($ct));
119             }
120              
121             return {
122 81         432 type => $type,
123             subtype => $subtype,
124             attributes => $attributes,
125              
126             # This is dumb. Really really dumb. For backcompat. -- rjbs,
127             # 2013-08-10
128             discrete => $type,
129             composite => $subtype,
130             };
131             }
132              
133             my $cd_default = 'attachment';
134              
135             sub parse_content_disposition {
136 34     34 1 75496 my $cd = shift;
137              
138 34 100 66     215 return parse_content_disposition($cd_default) unless defined $cd and length $cd;
139              
140 32         111 _unfold_lines($cd);
141 32         94 _clean_comments($cd);
142              
143 32 50       311 unless ($cd =~ s/^($re_token)//) {
144 0 0 0     0 unless ($STRICT_PARAMS and $cd =~ s/^($re_token_non_strict)//) {
145 0         0 carp "Invalid Content-Disposition '$cd'";
146 0         0 return parse_content_disposition($cd_default);
147             }
148             }
149              
150 32         134 my $type = lc $1;
151              
152 32         80 _clean_comments($cd);
153 32         114 $cd =~ s/\s+$//;
154              
155 32         66 my $attributes = {};
156 32 50 100     207 if ($STRICT_PARAMS and length $cd and $cd !~ /^;/) {
      66        
157 0         0 carp "Missing semicolon before first Content-Disposition parameter '$cd'";
158             } else {
159 32         90 $attributes = _process_rfc2231(_parse_attributes($cd));
160             }
161              
162             return {
163 32         161 type => $type,
164             attributes => $attributes,
165             };
166             }
167              
168             my $re_invalid_for_quoted_value = qr/[\x00-\x08\x0A-\x1F\x7F-\xFF]/; # non-US-ASCII and CTLs without SPACE and TAB
169             my $re_escape_extended_value = qr/[\x00-\x20\x7F-\xFF\*'%()<>@,;:\\"\/\[\]?=]/; # non-US-ASCII, SPACE, CTLs, *'% and tspecials ()<>@,;:\\"/[]?=
170              
171             sub build_content_type {
172 15     15 1 44723 my $ct = shift;
173              
174 15 50       723 croak 'Missing Content-Type \'type\' parameter' unless exists $ct->{type};
175 15 50       40 croak 'Missing Content-Type \'subtype\' parameter' unless exists $ct->{subtype};
176              
177 15 50       696 croak 'Invalid Content-Type \'type\' parameter' if $ct->{type} !~ /^(?:$re_token)*$/;
178 15 50       111 croak 'Invalid Content-Type \'subtype\' parameter' if $ct->{subtype} !~ /^(?:$re_token)*$/;
179              
180 15 50       61 croak 'Too long Content-Type \'type\' and \'subtype\' parameters' if length($ct->{type}) + length($ct->{subtype}) > 76;
181              
182 15         30 my ($extra) = grep !/(?:type|subtype|attributes)/, sort keys %{$ct};
  15         243  
183 15 50       49 croak "Extra Content-Type '$extra' parameter" if defined $extra;
184              
185 15         49 my $ret = $ct->{type} . '/' . $ct->{subtype};
186 15 50       64 my $attrs = exists $ct->{attributes} ? _build_attributes($ct->{attributes}) : '';
187 15 100       51 $ret .= "; $attrs" if length($attrs);
188 15         128 return $ret;
189             }
190              
191             sub build_content_disposition {
192 13     13 1 26982 my $cd = shift;
193              
194 13 50       52 croak 'Missing Content-Type \'type\' parameter' unless exists $cd->{type};
195              
196 13 50       195 croak 'Invalid Content-Type \'type\' parameter' if $cd->{type} !~ /^(?:$re_token)*$/;
197              
198 13 50       51 croak 'Too long Content-Type \'type\' parameter' if length($cd->{type}) > 77;
199              
200 13         29 my ($extra) = grep !/(?:type|attributes)/, sort keys %{$cd};
  13         201  
201 13 50       56 croak "Extra Content-Type '$extra' parameter" if defined $extra;
202              
203 13         30 my $ret = $cd->{type};
204 13 50       63 my $attrs = exists $cd->{attributes} ? _build_attributes($cd->{attributes}) : '';
205 13 100       53 $ret .= "; $attrs" if length($attrs);
206 13         51 return $ret;
207             }
208              
209             sub _build_attributes {
210 28     28   46 my $attributes = shift;
211              
212 28         51 my $ret = '';
213              
214 28         47 foreach my $key (sort keys %{$attributes}) {
  28         103  
215 38         87 my $value = $attributes->{$key};
216 38         78 my $ascii_value = $value;
217 38         95 my @continuous_value;
218             my $extended_value_charset;
219              
220 38 50       11587 croak "Invalid attribute '$key'" if $key =~ /$re_escape_extended_value/; # complement to attribute-char in 8bit space
221 38 50       95 croak "Undefined attribute '$key'" unless defined $value;
222              
223 4 100   4   43 if ($value =~ /\P{ASCII}/) {
  4         27  
  4         424  
  38         998  
224 7         44 $ascii_value = unidecode($value);
225 7         3928 $ascii_value =~ s/\P{ASCII}/_/g;
226 7         73 @continuous_value = map { encode('UTF-8', $_) } split //, $value;
  197         7835  
227 7         277 $extended_value_charset = 'UTF-8';
228             }
229              
230 38 100 100     591 if ($ascii_value !~ /^(?:$re_token)*$/ or $ascii_value =~ /'/) {
231 20 50       105 if ($ascii_value =~ /$re_invalid_for_quoted_value/) {
232 0 0       0 @continuous_value = split //, $value unless @continuous_value;
233 0         0 $ascii_value =~ s/[\n\r]/ /g;
234 0         0 $ascii_value =~ s/$re_invalid_for_quoted_value/_/g;
235             }
236 20         72 $ascii_value =~ s/(["\\])/\\$1/g;
237 20         55 $ascii_value = "\"$ascii_value\"";
238             }
239              
240 38 100       112 if (length($key) + length($ascii_value) > 75) { # length(" $key=$ascii_value;") > 78
241 3 50       11 croak "Too long attribute '$key'" if length($key) > 71; # length(" $key=...;") > 78
242 3 100       14 my $pos = $ascii_value =~ /"$/ ? 71 : 72;
243 3         15 substr($ascii_value, $pos - length($key), length($ascii_value) + length($key) - 72, '...');
244 3 50       69 @continuous_value = split //, $value unless @continuous_value;
245             }
246              
247 38 100       98 if (@continuous_value) {
248 10         22 my $needs_quote;
249 10 100       29 unless (defined $extended_value_charset) {
250 3 100       7 $needs_quote = 1 if grep { $_ !~ /^(?:$re_token)*$/ or $_ =~ /'/ } @continuous_value;
  204 100       1039  
251 3 50 66     81 $extended_value_charset = 'US-ASCII' if $needs_quote and grep /$re_invalid_for_quoted_value/, @continuous_value;
252             }
253              
254 10         28 my $add_param_len = 4; # for '; *='
255 10 100       39 if (defined $extended_value_charset) {
    100          
256 7         123 $_ =~ s/($re_escape_extended_value)/sprintf('%%%02X', ord($1))/eg foreach @continuous_value;
  71         485  
257 7         35 substr($continuous_value[0], 0, 0, "$extended_value_charset''");
258 7         16 $add_param_len += 1; # for '*' - charset
259             } elsif ($needs_quote) {
260 2         43 $_ =~ s/(["\\])/\\$1/g foreach @continuous_value;
261 2         7 $add_param_len += 2; # for quotes
262             }
263              
264 10 100 100     100 if ($value =~ /\P{ASCII}/ and length(my $oneparameter = "; $key*=" . join '', @continuous_value) <= 78) {
265 4         13 $ret .= $oneparameter;
266             } else {
267 6         17 my $buf = '';
268 6         14 my $count = 0;
269 6         14 foreach (@continuous_value) {
270 308 100       721 if (length($key) + length($count) + length($buf) + length($_) + $add_param_len > 78) {
271 6 100       19 $buf = "\"$buf\"" if $needs_quote;
272 6         21 my $parameter = "; $key*$count";
273 6 100       19 $parameter .= '*' if defined $extended_value_charset;
274 6         16 $parameter .= "=$buf";
275 6 50       19 croak "Too long attribute '$key'" if length($parameter) > 78;
276 6         14 $ret .= $parameter;
277 6         12 $buf = '';
278 6         12 $count++;
279             }
280 308         452 $buf .= $_;
281             }
282 6 50       18 if (length($buf)) {
283 6 100       18 $buf = "\"$buf\"" if $needs_quote;
284 6         19 my $parameter = "; $key*$count";
285 6 100       19 $parameter .= '*' if defined $extended_value_charset;
286 6         15 $parameter .= "=$buf";
287 6 50       20 croak "Too long attribute '$key'" if length($parameter) > 78;
288 6         14 $ret .= $parameter;
289             }
290             }
291             }
292              
293 38         551 $ret .= "; $key=$ascii_value";
294             }
295              
296 28 100       395 substr($ret, 0, 2, '') if length $ret;
297 28         92 return $ret;
298             }
299              
300             sub _unfold_lines {
301 113     113   1090 $_[0] =~ s/(?:\r\n|[\r\n])(?=[ \t])//g;
302             }
303              
304             sub _clean_comments {
305 950     950   1819 my $ret = ($_[0] =~ s/^\s+//);
306 950         1658 while (length $_[0]) {
307 774 100       1951 last unless $_[0] =~ s/^\(//;
308 32         46 my $level = 1;
309 32         59 while (length $_[0]) {
310 408         532 my $ch = substr $_[0], 0, 1, '';
311 408 100       846 if ($ch eq '(') {
    100          
    100          
312 10         18 $level++;
313             } elsif ($ch eq ')') {
314 42         44 $level--;
315 42 100       72 last if $level == 0;
316             } elsif ($ch eq '\\') {
317 12         20 substr $_[0], 0, 1, '';
318             }
319             }
320 32 0 33     54 carp "Unbalanced comment" if $level != 0 and $STRICT_PARAMS;
321 32         78 $ret |= ($_[0] =~ s/^\s+//);
322             }
323 950         1365 return $ret;
324             }
325              
326             sub _process_rfc2231 {
327 113     113   197 my ($attribs) = @_;
328 113         159 my %cont;
329             my %encoded;
330              
331 113         148 foreach (keys %{$attribs}) {
  113         341  
332 179 100       496 next unless $_ =~ m/^(.*)\*([0-9]+)\*?$/;
333 64         161 my ($attr, $sec) = ($1, $2);
334 64         147 $cont{$attr}->[$sec] = $attribs->{$_};
335 64 100       186 $encoded{$attr}->[$sec] = 1 if $_ =~ m/\*$/;
336 64         128 delete $attribs->{$_};
337             }
338              
339 113         536 foreach (keys %cont) {
340 24         38 my $key = $_;
341 24 100       75 $key .= '*' if $encoded{$_};
342 24         37 $attribs->{$key} = join '', @{$cont{$_}};
  24         113  
343             }
344              
345 113         565 foreach (keys %{$attribs}) {
  113         218  
346 136 100       355 next unless $_ =~ m/^(.*)\*$/;
347 33         62 my $key = $1;
348 33 50 33     518 next unless defined $attribs->{$_} and $attribs->{$_} =~ m/^$re_exvalue$/;
349 33         108 my ($charset, $value) = ($1, $2);
350 33         142 $value =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
  311         922  
351 33 100       85 if (length $charset) {
352 21         101 my $enc = find_mime_encoding($charset);
353 21 50       5358 if (defined $enc) {
354 21         183 $value = $enc->decode($value);
355             } else {
356 0         0 carp "Unknown charset '$charset' in attribute '$key' value";
357             }
358             }
359 33         142 $attribs->{$key} = $value;
360 33         91 delete $attribs->{$_};
361             }
362              
363 113         375 return $attribs;
364             }
365              
366             sub _parse_attributes {
367 113     113   187 local $_ = shift;
368 113 50 66     508 substr($_, 0, 0, '; ') if length $_ and $_ !~ /^;/;
369 113         190 my $attribs = {};
370              
371 113         217 while (length $_) {
372 183 100 33     753 s/^;// or $STRICT_PARAMS and do {
373 0         0 carp "Missing semicolon before parameter '$_'";
374 0         0 return $attribs;
375             };
376              
377 183         408 _clean_comments($_);
378              
379 183 100       505 unless (length $_) {
380             # Some mail software generates a Content-Type like this:
381             # "Content-Type: text/plain;"
382             # RFC 1521 section 3 says a parameter must exist if there is a
383             # semicolon.
384 4 50       11 carp "Extra semicolon after last parameter" if $STRICT_PARAMS;
385 4         12 return $attribs;
386             }
387              
388 179         223 my $attribute;
389 179 100       942 if (s/^($re_token)=//) {
390 178         414 $attribute = lc $1;
391             } else {
392 1 50       4 if ($STRICT_PARAMS) {
393 0         0 carp "Illegal parameter '$_'";
394 0         0 return $attribs;
395             }
396 1 50       32 if (s/^($re_token_non_strict)=//) {
397 0         0 $attribute = lc $1;
398             } else {
399 1 50       9 unless (s/^([^;=\s]+)\s*=//) {
400 0         0 carp "Cannot parse parameter '$_'";
401 0         0 return $attribs;
402             }
403 1         4 $attribute = lc $1;
404             }
405             }
406              
407 179         381 _clean_comments($_);
408 179         301 my $value = _extract_attribute_value();
409 179         405 $attribs->{$attribute} = $value;
410 179         271 _clean_comments($_);
411             }
412              
413 109         233 return $attribs;
414             }
415              
416             sub _extract_attribute_value { # EXPECTS AND MODIFIES $_
417 179     179   212 my $value;
418 179         324 while (length $_) {
419 183 100       2392 if (s/^($re_token)//) {
    50          
    0          
    0          
    0          
420 110         261 $value .= $1;
421             } elsif (s/^$re_quoted_string//) {
422 73         193 my $sub = $1;
423 73         160 $sub =~ s/\\(.)/$1/g;
424 73         149 $value .= $sub;
425             } elsif ($STRICT_PARAMS) {
426 0         0 my $char = substr $_, 0, 1;
427 0         0 carp "Unquoted '$char' not allowed";
428 0         0 return;
429             } elsif (s/^($re_token_non_strict)//) {
430 0         0 $value .= $1;
431             } elsif (s/^$re_quoted_string_non_strict//) {
432 0         0 my $sub = $1;
433 0         0 $sub =~ s/\\(.)/$1/g;
434 0         0 $value .= $sub;
435             }
436              
437 183         334 my $erased = _clean_comments($_);
438 183 100 100     696 last if !length $_ or /^;/;
439 6 50       13 if ($STRICT_PARAMS) {
440 0         0 my $char = substr $_, 0, 1;
441 0         0 carp "Extra '$char' found after parameter";
442 0         0 return;
443             }
444              
445 6 50       11 if ($erased) {
446             # Sometimes semicolon is missing, so check for = char
447 6 100       56 last if m/^$re_token_non_strict=/;
448 4         9 $value .= ' ';
449             }
450              
451 4         11 $value .= substr $_, 0, 1, '';
452             }
453 179         426 return $value;
454             }
455              
456             1;
457              
458             #pod =func parse_content_type
459             #pod
460             #pod This routine is exported by default.
461             #pod
462             #pod This routine parses email content type headers according to section 5.1 of RFC
463             #pod 2045 and also RFC 2231 (Character Set and Parameter Continuations). It returns
464             #pod a hash as above, with entries for the C, the C, and a hash of
465             #pod C.
466             #pod
467             #pod For backward compatibility with a really unfortunate misunderstanding of RFC
468             #pod 2045 by the early implementors of this module, C and C are
469             #pod also present in the returned hashref, with the values of C and C
470             #pod respectively.
471             #pod
472             #pod =func parse_content_disposition
473             #pod
474             #pod This routine is exported by default.
475             #pod
476             #pod This routine parses email Content-Disposition headers according to RFC 2183 and
477             #pod RFC 2231. It returns a hash as above, with entries for the C, and a hash
478             #pod of C.
479             #pod
480             #pod =func build_content_type
481             #pod
482             #pod This routine is exported by default.
483             #pod
484             #pod This routine builds email Content-Type header according to RFC 2045 and RFC 2231.
485             #pod It takes a hash as above, with entries for the C, the C, and
486             #pod optionally also a hash of C. It returns a string representing
487             #pod Content-Type header. Non-ASCII attributes are encoded to UTF-8 according to
488             #pod Character Set section of RFC 2231. Attribute which has more then 78 ASCII
489             #pod characters is split into more attributes accorrding to Parameter Continuations
490             #pod of RFC 2231. For compatibility reasons with clients which do not support
491             #pod RFC 2231, output string contains also truncated ASCII version of any too long or
492             #pod non-ASCII attribute. Encoding to ASCII is done via Text::Unidecode module.
493             #pod
494             #pod =func build_content_disposition
495             #pod
496             #pod This routine is exported by default.
497             #pod
498             #pod This routine builds email Content-Disposition header according to RFC 2182 and
499             #pod RFC 2231. It takes a hash as above, with entries for the C, and
500             #pod optionally also a hash of C. It returns a string representing
501             #pod Content-Disposition header. Non-ASCII or too long attributes are handled in
502             #pod the same way like in L.
503             #pod
504             #pod =head1 WARNINGS
505             #pod
506             #pod This is not a valid content-type header, according to both RFC 1521 and RFC
507             #pod 2045:
508             #pod
509             #pod Content-Type: type/subtype;
510             #pod
511             #pod If a semicolon appears, a parameter must. C will carp if
512             #pod it encounters a header of this type, but you can suppress this by setting
513             #pod C<$Email::MIME::ContentType::STRICT_PARAMS> to a false value. Please consider
514             #pod localizing this assignment!
515             #pod
516             #pod Same applies for C.
517             #pod
518             #pod =cut
519              
520             __END__