File Coverage

blib/lib/Encode/IMAPUTF7.pm

Criterion	Covered	Total	%
statement	44	50	88.0
branch	13	22	59.0
condition			n/a
subroutine	7	8	87.5
pod	3	3	100.0
total	67	83	80.7

line	stmt	bran	sub	pod	time	code
1						package Encode::IMAPUTF7 1.07;
2	2		2		274286	use strict;
	2				4
	2				82
3	2		2		18	use warnings;
	2				5
	2				124
4
5	2		2		524	use parent qw(Encode::Encoding);
	2				422
	2				12
6
7						# ABSTRACT: modification of UTF-7 encoding for IMAP
8
9						__PACKAGE__->Define('IMAP-UTF-7', 'imap-utf-7');
10
11	2		2		25877	use Encode ();
	2				6
	2				31
12	2		2		1123	use MIME::Base64;
	2				3115
	2				1835
13
14						#pod =head1 SYNOPSIS
15						#pod
16						#pod use Encode qw/encode decode/;
17						#pod use Encode::IMAPUTF7;
18						#pod
19						#pod print encode('IMAP-UTF-7', 'Répertoire');
20						#pod print decode('IMAP-UTF-7', 'R&AOk-pertoire');
21						#pod
22						#pod =head1 ABSTRACT
23						#pod
24						#pod IMAP mailbox names are encoded in a modified UTF-7 when names contains
25						#pod international characters outside of the printable ASCII range. The modified
26						#pod UTF-7 encoding is defined in RFC2060 (section 5.1.3).
27						#pod
28						#pod =head2 RFC2060 - section 5.1.3 - Mailbox International Naming Convention
29						#pod
30						#pod By convention, international mailbox names are specified using a modified
31						#pod version of the UTF-7 encoding described in [UTF-7]. The purpose of these
32						#pod modifications is to correct the following problems with UTF-7:
33						#pod
34						#pod =for :list
35						#pod 1. UTF-7 uses the "+" character for shifting; this conflicts with the common
36						#pod use of "+" in mailbox names, in particular USENET newsgroup names.
37						#pod 2. UTF-7's encoding is BASE64 which uses the "/" character; this conflicts
38						#pod with the use of "/" as a popular hierarchy delimiter.
39						#pod 3. UTF-7 prohibits the unencoded usage of "\"; this conflicts with the use of
40						#pod "\" as a popular hierarchy delimiter.
41						#pod 4. UTF-7 prohibits the unencoded usage of "~"; this conflicts with the use of
42						#pod "~" in some servers as a home directory indicator.
43						#pod 5. UTF-7 permits multiple alternate forms to represent the same string; in
44						#pod particular, printable US-ASCII chararacters can be represented in encoded
45						#pod form.
46						#pod
47						#pod In modified UTF-7, printable US-ASCII characters except for "&" represent
48						#pod themselves; that is, characters with octet values 0x20-0x25 and 0x27-0x7e. The
49						#pod character "&" (0x26) is represented by the two-octet sequence "&-".
50						#pod
51						#pod All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all Unicode 16-bit
52						#pod octets) are represented in modified BASE64, with a further modification from
53						#pod [UTF-7] that "," is used instead of "/". Modified BASE64 MUST NOT be used to
54						#pod represent any printing US-ASCII character which can represent itself.
55						#pod
56						#pod "&" is used to shift to modified BASE64 and "-" to shift back to US- ASCII.
57						#pod All names start in US-ASCII, and MUST end in US-ASCII (that is, a name that
58						#pod ends with a Unicode 16-bit octet MUST end with a "- ").
59						#pod
60						#pod For example, here is a mailbox name which mixes English, Japanese,
61						#pod and Chinese text: C<~peter/mail/&ZeVnLIqe-/&U,BTFw->
62						#pod
63						#pod =cut
64
65						# Algorithms taken from Unicode::String by Gisle Aas
66						# Code directly borrowed from Encode::Unicode::UTF7 by Dan Kogai
67
68						# Directly from the definition in RFC2060:
69						# Ampersand (\x26) is represented as a special case
70						my $re_asis = qr/(?:[\x20-\x25\x27-\x7e])/; # printable US-ASCII except "&" represents itself
71						my $re_encoded = qr/(?:[^\x20-\x7e])/; # Everything else are represented by modified base64
72
73						my $e_utf16 = Encode::find_encoding("UTF-16BE");
74
75	0		0	1	0	sub needs_lines { 1 };
76
77						sub encode($$;$) {
78	6		6	1	607558	my ( $obj, $str, $chk ) = @_;
79	6				29	my $len = length($str);
80	6				16	pos($str) = 0;
81	6				15	my $bytes = '';
82	6				19	while ( pos($str) < $len ) {
83	739	100			2683	if ( $str =~ /\G($re_asis+)/ogc ) {
		100
		50
84	367				1076	$bytes .= $1;
85						} elsif ( $str =~ /\G&/ogc ) {
86	5				9	$bytes .= "&-";
87						} elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
88	367				702	my $s = $1;
89	367				1825	my $base64 = encode_base64( $e_utf16->encode($s), '' );
90	367				1135	$base64 =~ s/=+$//;
91	367				803	$base64 =~ s/\//,/g;
92	367				1068	$bytes .= "&$base64-";
93						} else {
94	0				0	die "This should not happen! (pos=" . pos($str) . ")";
95						}
96						}
97	6	50			12	$_[1] = '' if $chk;
98	6				78	return $bytes;
99						}
100
101						sub decode($$;$) {
102	1		1	1	67	my ( $obj, $bytes, $chk ) = @_;
103	1				12	my $len = length($bytes);
104	1				2	my $str = "";
105	1				5	pos($bytes) = 0;
106	1				5	while ( pos($bytes) < $len ) {
107	726	100			2918	if ( $bytes =~ /\G([^&]+)/ogc ) {
		50
		50
		0
108	363				1051	$str .= $1;
109						} elsif ( $bytes =~ /\G\&-/ogc ) {
110	0				0	$str .= "&";
111						} elsif ( $bytes =~ /\G\&([A-Za-z0-9+,]+)-?/ogsc ) {
112	363				665	my $base64 = $1;
113	363				978	$base64 =~ s/,/\//g;
114	363				823	my $pad = length($base64) % 4;
115	363	100			848	$base64 .= "=" x ( 4 - $pad ) if $pad;
116	363				2771	$str .= $e_utf16->decode( decode_base64($base64) );
117						} elsif ( $bytes =~ /\G\&/ogc ) {
118	0	0			0	$^W and warn "Bad IMAP-UTF7 data escape";
119	0				0	$str .= "&";
120						} else {
121	0				0	die "This should not happen " . pos($bytes);
122						}
123						}
124	1	50			6	$_[1] = '' if $chk;
125	1				74	return $str;
126						}
127
128						1;
129
130						__END__