File Coverage

lib/Sisimai/RFC2045.pm
Criterion Covered Total %
statement 183 183 100.0
branch 109 120 90.8
condition 57 64 89.0
subroutine 16 16 100.0
pod 9 9 100.0
total 374 392 95.4


line stmt bran cond sub pod time code
1             package Sisimai::RFC2045;
2 88     88   137984 use v5.26;
  88         352  
3 88     88   503 use strict;
  88         172  
  88         2354  
4 88     88   401 use warnings;
  88         588  
  88         5434  
5 88     88   1379 use Encode;
  88         24686  
  88         10428  
6 88     88   47473 use MIME::Base64 ();
  88         74908  
  88         2923  
7 88     88   42801 use MIME::QuotedPrint ();
  88         27568  
  88         2666  
8 88     88   1094 use Sisimai::String;
  88         169  
  88         272367  
9              
10             sub is_encoded {
11             # Check that the argument is MIME-Encoded string or not
12             # @param [String] argv0 String to be checked
13             # @return [Boolean] 0: Not MIME encoded string
14             # 1: MIME encoded string
15 10383     10383 1 393090 my $class = shift;
16 10383   100     26585 my $argv0 = shift || return 0;
17 10382         17526 my $text1 = $$argv0; $text1 =~ y/"//d;
  10382         17559  
18 10382         24056 my @piece = ($text1);
19 10382         18603 my $mime1 = 0;
20              
21             # Multiple MIME-Encoded strings in a line
22 10382 100       40786 @piece = split(' ', $text1) if rindex($text1, ' ') > -1;
23 10382         29847 while( my $e = shift @piece ) {
24             # Check all the string in the array
25 24295 100       76914 next unless $e =~ /[ \t]*=[?][-_0-9A-Za-z]+[?][BbQq][?].+[?]=?[ \t]*/;
26 1295         3645 $mime1 = 1;
27             }
28 10382         38921 return $mime1;
29             }
30              
31             sub decodeH {
32             # Decode MIME-Encoded string in an email header
33             # @param [Array] argvs Reference to an array including MIME-Encoded text
34             # @return [String] MIME-Decoded text
35 3251     3251 1 15589 my $class = shift;
36 3251   100     9348 my $argvs = shift || return '';
37              
38 3250         8415 my $ctxcharset = '';
39 3250         6489 my $qbencoding = '';
40 3250         5287 my @textblocks;
41              
42 3250         10660 while( my $e = shift @$argvs ) {
43             # Check and decode each element
44 3469         20885 s/\A[ \t]+//g, s/[ \t]+\z//g, y/"//d for $e;
45              
46 3469 100       10006 if( __PACKAGE__->is_encoded(\$e) ) {
47             # =?utf-8?B?55m954yr44Gr44KD44KT44GT?=
48 496 100       3996 next unless $e =~ m{\A(.*)=[?]([-_0-9A-Za-z]+)[?]([BbQq])[?](.+)[?]=?(.*)\z};
49 476   66     3712 $ctxcharset ||= lc $2;
50 476   66     3230 $qbencoding ||= uc $3;
51              
52 476         1938 push @textblocks, $1;
53 476 100       3947 push @textblocks, $qbencoding eq 'B'
54             ? MIME::Base64::decode($4)
55             : MIME::QuotedPrint::decode($4);
56 476         1094 $textblocks[-1] =~ y/\r\n//d;
57 476         8045 push @textblocks, $5;
58              
59             } else {
60 2973 100       13495 push @textblocks, scalar @textblocks ? ' '.$e : $e;
61             }
62             }
63 3250 50       8124 return '' unless scalar @textblocks;
64              
65 3250         11714 my $p = join('', @textblocks);
66 3250 100 66     11889 if( $ctxcharset && $qbencoding ) {
67             # utf-8 => utf8
68 412 100       1440 $ctxcharset = 'utf8' if $ctxcharset eq 'utf-8';
69              
70 412 100       1291 unless( $ctxcharset eq 'utf8' ) {
71             # Characterset is not UTF-8
72 128         278 eval { Encode::from_to($p, $ctxcharset, 'utf8') };
  128         1760  
73 128 50       30807 $p = 'FAILED TO CONVERT THE SUBJECT' if $@;
74             }
75             }
76 3250         13959 return $p;
77             }
78              
79             sub decodeB {
80             # Decode MIME BASE64 Encoded string
81             # @param [String] argv0 MIME Encoded text
82             # @return [String] MIME-Decoded text
83 47     47 1 550 my $class = shift;
84 47   100     221 my $argv0 = shift // return "";
85              
86 46 50       919 my $p = $$argv0 =~ m|([+/=0-9A-Za-z\r\n]+)| ? MIME::Base64::decode($1) : '';
87 46         207 return \$p;
88             }
89              
90             sub decodeQ {
91             # Decode MIME Quoted-Printable Encoded string
92             # @param [String] argv0 Entire MIME-Encoded text
93             # @param [String] argv1 The value of Content-Type: header
94             # @return [String] MIME Decoded text
95 286     286 1 3230 my $class = shift;
96 286   100     906 my $argv0 = shift // return "";
97              
98 285   100     5908 my $p = MIME::QuotedPrint::decode($$argv0) || '';
99 285         1075 return \$p;
100             }
101              
102             sub parameter {
103             # Find a value of specified field name from Content-Type: header
104             # @param [String] argv0 The value of Content-Type: header
105             # @param [String] argv1 Lower-cased attribute name of the parameter
106             # @return [String] The value of the parameter
107             # @since v5.0.0
108 12714     12714 1 18045 my $class = shift;
109 12714   100     30200 my $argv0 = shift || return "";
110 10326   100     26869 my $argv1 = shift || '';
111              
112 10326 100       24058 my $parameterq = length $argv1 > 0 ? $argv1.'=' : '';
113 10326 100       20532 my $paramindex = length $argv1 > 0 ? index($argv0, $parameterq) : 0;
114 10326 100       21829 return '' if $paramindex == -1;
115              
116             # Find the value of the parameter name specified in $argv1
117 10192         37613 my $foundtoken = [split(';', substr($argv0, $paramindex + length($parameterq)), 2)]->[0];
118 10192 100       31120 $foundtoken = lc $foundtoken unless $argv1 eq 'boundary';
119 10192         20422 $foundtoken =~ y/"'//d;
120 10192         29193 return $foundtoken;
121             }
122              
123             sub boundary {
124             # Get a boundary string
125             # @param [String] argv0 The value of Content-Type header
126             # @param [Integer] start -1: boundary string itself
127             # 0: Start of boundary
128             # 1: End of boundary
129             # @return [String] Boundary string
130 4309     4309 1 11391 my $class = shift;
131 4309   100     10251 my $argv0 = shift || return "";
132 4265   100     9298 my $start = shift // -1;
133 4265   100     15118 my $btext = __PACKAGE__->parameter($argv0, 'boundary') || return '';
134              
135             # Content-Type: multipart/mixed; boundary=Apple-Mail-5--931376066
136             # Content-Type: multipart/report; report-type=delivery-status;
137             # boundary="n6H9lKZh014511.1247824040/mx.example.jp"
138 4133 100       12078 $btext = '--'.$btext if $start > -1;
139 4133 100       9426 $btext = $btext.'--' if $start > 0;
140 4133         13072 return $btext;
141             }
142              
143             sub haircut {
144             # Cut header fields except Content-Type, Content-Transfer-Encoding from multipart/* block
145             # @param [String] block multipart/* block text
146             # @param [Boolean] heads 1 = Returns only Content-(Type|Transfer-Encoding) headers
147             # @return [Array] Two headers and body part of multipart/* block
148             # @since v5.0.0
149 8910     8910 1 20979 my $class = shift;
150 8910   100     19305 my $block = shift // return undef;
151 8909   100     25762 my $heads = shift // undef;
152              
153 8909         34844 my($upperchunk, $lowerchunk) = split("\n\n", $$block, 2);
154 8909 50       18605 return ['', ''] unless $upperchunk;
155 8909 100       26251 return ['', ''] unless index($upperchunk, 'Content-Type:') > -1;
156              
157 6521         13819 my $headerpart = ['', '']; # ["text/plain; charset=iso-2022-jp; ...", "quoted-printable"]
158 6521         11366 my $multipart1 = []; # [@$headerpart, "body"]
159              
160 6521         18185 for my $e ( split("\n", $upperchunk) ) {
161             # Remove fields except Content-Type:, and Content-Transfer-Encoding: in each part
162             # of multipart/* block such as the following:
163             # Date: Thu, 29 Apr 2018 22:22:22 +0900
164             # MIME-Version: 1.0
165             # Message-ID: ...
166             # Content-Transfer-Encoding: quoted-printable
167             # Content-Type: text/plain; charset=us-ascii
168 12492 100 100     43869 if( index($e, 'Content-Type:') == 0 ) {
    100          
    100          
169             # Content-Type: ***
170 6521         18523 my $v = [split(' ', $e, 2)]->[-1];
171 6521 100       25235 $headerpart->[0] = index($v, 'boundary=') > -1 ? $v : lc $v;
172              
173             } elsif( index($e, 'Content-Transfer-Encoding:') == 0 ) {
174             # Content-Transfer-Encoding: ***
175 1461         6717 $headerpart->[1] = lc [split(' ', $e, 2)]->[-1];
176              
177             } elsif( index($e, 'boundary=') > -1 || index($e, 'charset=') > -1 ) {
178             # "Content-Type" field has boundary="..." or charset="utf-8"
179 273 50       878 next unless length $headerpart->[0];
180 273         810 $headerpart->[0] .= " ".$e;
181 273         2511 $headerpart->[0] =~ s/\s\s+/ /g;
182             }
183             }
184 6521 100       17886 return $headerpart if $heads;
185              
186 6520         13373 my $mediatypev = lc $headerpart->[0];
187 6520         10661 my $ctencoding = $headerpart->[1];
188 6520         18684 push @$multipart1, @$headerpart, '';
189              
190 6520         8419 UPPER: while(1) {
191             # Make a body part at the 2nd element of $multipart1
192 6520         12060 $multipart1->[2] = sprintf("Content-Type: %s\n", $headerpart->[0]);
193              
194             # Do not append Content-Transfer-Encoding: header when the part is the original message:
195             # Content-Type is message/rfc822 or text/rfc822-headers, or message/delivery-status or
196             # message/feedback-report
197 6520 100       16909 last if index($mediatypev, '/rfc822') > -1;
198 4348 100       11763 last if index($mediatypev, '/delivery-status') > -1;
199 2378 100       7211 last if index($mediatypev, '/feedback-report') > -1;
200 2312 100       6960 last if length $ctencoding == 0;
201              
202 794         2017 $multipart1->[2] .= sprintf("Content-Transfer-Encoding: %s\n", $ctencoding);
203 794         1435 last;
204             }
205              
206             # Append LF before the lower chunk into the 2nd element of $multipart1
207 6520 100 100     30291 $multipart1->[2] .= "\n" if $lowerchunk ne "" && substr($lowerchunk, 0, 1) ne "\n";
208 6520         19871 $multipart1->[2] .= $lowerchunk;
209 6520         16509 return $multipart1;
210             }
211              
212             sub levelout {
213             # Split argv1: multipart/* blocks by a boundary string in argv0
214             # @param [String] argv0 The value of Content-Type header
215             # @param [String] argv1 A pointer to multipart/* message blocks
216             # @return [Array] List of each part of multipart/*
217             # @since v5.0.0
218 2829     2829 1 9571 my $class = shift;
219 2829 50 100     7768 my $argv0 = shift || return []; return [] unless length $argv0;
  2828         8921  
220 2828 50 100     7456 my $argv1 = shift || return []; return [] unless length $$argv1;
  2827         7065  
221              
222 2827   50     15178 my $boundary01 = __PACKAGE__->boundary($argv0, 0) || return [];
223 2827         112717 my $multiparts = [split(/\Q$boundary01\E\n/, $$argv1)];
224 2827         7704 my $partstable = [];
225              
226             # Remove empty or useless preamble and epilogue of multipart/* block
227 2827 100       10385 shift @$multiparts if length $multiparts->[0] < 8;
228 2827 100       7935 return [] if scalar @$multiparts == 0;
229 2825 100       9506 pop @$multiparts if length $multiparts->[-1] < 8;
230              
231 2825         10954 while( my $e = shift @$multiparts ) {
232             # Check each part and breaks up internal multipart/* block
233 8907         23501 my $f = __PACKAGE__->haircut(\$e);
234 8907 100       22539 if( index($f->[0], 'multipart/') > -1 ) {
235             # There is nested multipart/* block
236 462   50     1671 my $boundary02 = __PACKAGE__->boundary($f->[0], -1) || next;
237 462         22621 my $bodyinside = [split(/\n\n/, $f->[-1], 2)]->[-1];
238 462 50 33     3052 next if length $bodyinside < 9 || index($bodyinside, $boundary02) < 0;
239              
240 462         38553 my $v = __PACKAGE__->levelout($f->[0], \$bodyinside);
241 462 50       2868 push @$partstable, @$v if scalar @$v;
242              
243             } else {
244             # The part is not a multipart/* block
245 8445 100       18082 my $b = length $f->[-1] ? $f->[-1] : $e;
246 8445 100       35184 my $v = [$f->[0], $f->[1], length $f->[0] ? [split("\n\n", $b, 2)]->[-1] : $b];
247 8445         35579 push @$partstable, $v;
248             }
249             }
250 2825 50       12297 return [] unless scalar @$partstable;
251              
252             # Remove $boundary01.'--' and strings from the boundary to the end of the body part.
253 2825         6130 chomp $boundary01;
254 2825         6947 my $b = $partstable->[-1]->[2];
255 2825         9021 my $p = index($b, $boundary01.'--');
256 2825 100       12796 substr($partstable->[-1]->[2], $p, length $b, "") if $p > -1;
257              
258 2825         8491 return $partstable;
259             }
260              
261             sub makeflat {
262             # Make flat multipart/* part blocks and decode
263             # @param [String] argv0 The value of Content-Type header
264             # @param [String] argv1 A pointer to multipart/* message blocks
265             # @return [String] Message body
266 2412     2412 1 11501 my $class = shift;
267 2412   100     16009 my $argv0 = shift // return undef;
268 2411   100     6549 my $argv1 = shift // return undef;
269 2410 100 100     13053 return undef if index($argv0, 'multipart/') < 0 || index($argv0, 'boundary=') < 0;
270              
271 2364         18115 my $iso2022set = qr/charset=["']?(iso-2022-[-a-z0-9]+)['"]?\b/;
272 2364         10813 my $multiparts = __PACKAGE__->levelout($argv0, $argv1);
273 2364         5441 my $flattenout = '';
274 2364         14213 my $delimiters = ["/delivery-status", "/rfc822", "/feedback-report", "/partial"];
275              
276 2364         9425 while( my $e = shift @$multiparts ) {
277             # Pick only the following parts Sisimai::Lhost will use, and decode each part
278             # - text/plain, text/rfc822-headers
279             # - message/delivery-status, message/rfc822, message/partial, message/feedback-report
280 8443         11243 my $istexthtml = 0;
281 8443   100     23611 my $mediatypev = __PACKAGE__->parameter($e->[0]) || 'text/plain';
282 8443 100 100     29326 next if index($mediatypev, 'text/') != 0 && index($mediatypev, 'message/') != 0;
283              
284 8409 100       17172 if( $mediatypev eq 'text/html' ) {
285             # Skip text/html part when the value of Content-Type: header in an internal part of
286             # multipart/* includes multipart/alternative;
287 12 50       46 next if index($argv0, 'multipart/alternative') > -1;
288 12         34 $istexthtml = 1;
289             }
290 8409   100     24036 my $ctencoding = $e->[1] || '';
291 8409         13520 my $bodyinside = $e->[2];
292 8409         11914 my $bodystring = '';
293              
294 8409 100       16422 if( length $ctencoding ) {
295             # Check the value of Content-Transfer-Encoding: header
296 1425 100       6517 if( $ctencoding eq 'base64' ) {
    100          
    100          
297             # Content-Transfer-Encoding: base64
298 34         184 $bodystring = __PACKAGE__->decodeB(\$bodyinside)->$*;
299 34         75 my $dontset = 0; while( my $first10 = substr($bodystring, 0, 10) ) {
  34         237  
300             # Don't pick the decoded part as an error message when the part is
301             # - BASE64 encoded.
302             # - the value of the charset is not utf-8.
303             # - NOT a plain text.
304 34 100       363 last if Sisimai::String->aligned(\$e->[0], ['charset', '=', 'utf-8']);
305 10 100       76 last unless $first10 =~ /[\x00-\x08\x0E-\x1F\x7F-]/;
306 5         12 $dontset = 1; last;
  5         12  
307             }
308 34 100       134 next if $dontset;
309              
310             } elsif( $ctencoding eq 'quoted-printable') {
311             # Content-Transfer-Encoding: quoted-printable
312 222         957 $bodystring = __PACKAGE__->decodeQ(\$bodyinside)->$*;
313              
314             } elsif( $ctencoding eq '7bit' ) {
315             # Content-Transfer-Encoding: 7bit
316 880 100       9954 if( lc $e->[0] =~ $iso2022set ) {
317             # Content-Type: text/plain; charset=ISO-2022-JP
318 67         765 $bodystring = Sisimai::String->to_utf8(\$bodyinside, $1)->$*;
319              
320             } else {
321             # No "charset" parameter in the value of Content-Type: header
322 813         1711 $bodystring = $bodyinside;
323             }
324             } else {
325             # Content-Transfer-Encoding: 8bit, binary, and so on
326 289         686 $bodystring = $bodyinside;
327             }
328              
329             # Try to delete HTML tags inside of text/html part whenever possible
330 1420 100       4164 $bodystring = Sisimai::String->to_plain(\$bodystring)->$* if $istexthtml;
331 1420 100       3419 next unless $bodystring;
332 1409 100       4756 $bodystring =~ s|\r\n|\n|g if index($bodystring, "\r\n") > -1; # Convert CRLF to LF
333              
334             } else {
335             # There is no Content-Transfer-Encoding header in the part
336 6984         14752 $bodystring .= $bodyinside;
337             }
338            
339 8393 100       14214 if( grep { index($mediatypev, $_) > 0 } @$delimiters ) {
  33572         63070  
340             # Add Content-Type: header of each part (will be used as a delimiter at Sisimai::Lhost) into
341             # the body inside when the value of Content-Type: is message/delivery-status, message/rfc822,
342             # or text/rfc822-headers
343 4213         8860 $bodystring = sprintf("Content-Type: %s\n%s", $mediatypev, $bodystring);
344             }
345              
346             # Append "\n" when the last character of $bodystring is not LF
347 8393 100       20670 $bodystring .= "\n\n" unless substr($bodystring, -2, 2) eq "\n\n";
348 8393         42417 $flattenout .= $bodystring;
349             }
350 2364         15652 return \$flattenout;
351             }
352              
353             1;
354             __END__