File Coverage

blib/lib/MIME/Words.pm

Criterion	Covered	Total	%
statement	73	76	96.0
branch	15	18	83.3
condition	5	8	62.5
subroutine	12	13	92.3
pod	3	3	100.0
total	108	118	91.5

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::Words;
2
3							=head1 NAME
4
5							MIME::Words - deal with RFC 2047 encoded words
6
7
8							=head1 SYNOPSIS
9
10							Before reading further, you should see L to make sure that
11							you understand where this module fits into the grand scheme of things.
12							Go on, do it now. I'll wait.
13
14							Ready? Ok...
15
16
17							use MIME::Words qw(:all);
18
19							### Decode the string into another string, forgetting the charsets:
20							$decoded = decode_mimewords(
21							'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
22							);
23
24							### Split string into array of decoded [DATA,CHARSET] pairs:
25							@decoded = decode_mimewords(
26							'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
27							);
28
29							### Encode a single unsafe word:
30							$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
31
32							### Encode a string, trying to find the unsafe words inside it:
33							$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
34
35
36
37							=head1 DESCRIPTION
38
39							Fellow Americans, you probably won't know what the hell this module
40							is for. Europeans, Russians, et al, you probably do. C<:-)>.
41
42							For example, here's a valid MIME header you might get:
43
44							From: =?US-ASCII?Q?Keith_Moore?=
45							To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=
46							CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard
47							Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
48							=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
49							=?US-ASCII?Q?.._cool!?=
50
51							The fields basically decode to (sorry, I can only approximate the
52							Latin characters with 7 bit sequences /o and 'e):
53
54							From: Keith Moore
55							To: Keld J/orn Simonsen
56							CC: Andr'e Pirard
57							Subject: If you can read this you understand the example... cool!
58
59
60							=head1 PUBLIC INTERFACE
61
62							=over 4
63
64							=cut
65
66							require 5.001;
67
68							### Pragmas:
69	29			29		82125	use strict;
	29					57
	29					773
70	29			29		135	use re 'taint';
	29					51
	29					1531
71	29			29		135	use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
	29					44
	29					1967
72
73							### Exporting:
74	29			29		142	use Exporter;
	29					54
	29					1978
75							%EXPORT_TAGS = (all => [qw(decode_mimewords
76							encode_mimeword
77							encode_mimewords
78							)]);
79							Exporter::export_ok_tags('all');
80
81							### Inheritance:
82							@ISA = qw(Exporter);
83
84							### Other modules:
85	29			29		20662	use MIME::Base64;
	29					18345
	29					1602
86	29			29		40871	use MIME::QuotedPrint;
	29					5841
	29					27916
87
88
89
90							#------------------------------
91							#
92							# Globals...
93							#
94							#------------------------------
95
96							### The package version, both in 1.23 style and usable by MakeMaker:
97							$VERSION = "5.507";
98
99							### Nonprintables (controls + x7F + 8bit):
100							my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
101
102
103							#------------------------------
104
105							# _decode_Q STRING
106							# Private: used by _decode_header() to decode "Q" encoding, which is
107							# almost, but not exactly, quoted-printable. :-P
108							sub _decode_Q {
109	54			54		88	my $str = shift;
110	54					123	local $1;
111	54					126	$str =~ s/_/\x20/g; # RFC-1522, Q rule 2
112	54					205	$str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
	272					1023
113	54					175	$str;
114							}
115
116							# _encode_Q STRING
117							# Private: used by _encode_header() to decode "Q" encoding, which is
118							# almost, but not exactly, quoted-printable. :-P
119							sub _encode_Q {
120	25			25		47	my $str = shift;
121	25					69	local $1;
122	25					172	$str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
	238					860
123	25					200	$str;
124							}
125
126							# _decode_B STRING
127							# Private: used by _decode_header() to decode "B" encoding.
128							sub _decode_B {
129	12			12		39	my $str = shift;
130	12					71	decode_base64($str);
131							}
132
133							# _encode_B STRING
134							# Private: used by _decode_header() to decode "B" encoding.
135							sub _encode_B {
136	0			0		0	my $str = shift;
137	0					0	encode_base64($str, '');
138							}
139
140
141
142							#------------------------------
143
144							=item decode_mimewords ENCODED
145
146							I
147							Go through the string looking for RFC 2047-style "Q"
148							(quoted-printable, sort of) or "B" (base64) encoding, and decode them.
149
150							B splits the ENCODED string into a list of decoded
151							C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
152							data are returned in a 1-element array C<[DATA]>, giving an effective
153							CHARSET of C.
154
155							$enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ';
156							foreach (decode_mimewords($enc)) {
157							print "", ($_->[1] \|\| 'US-ASCII'), ": ", $_->[0], "\n";
158							}
159
160							B joins the "data" elements of the above
161							list together, and returns that. I
162							and probably I what you want, but if you know that all charsets
163							in the ENCODED string are identical, it might be useful to you.
164							(Before you use this, please see L,
165							which is probably what you want.)
166
167							In the event of a syntax error, $@ will be set to a description
168							of the error, but parsing will continue as best as possible (so as to
169							get I back when decoding headers).
170							$@ will be false if no error was detected.
171
172							Any arguments past the ENCODED string are taken to define a hash of options:
173
174							=cut
175
176							sub decode_mimewords {
177	131			131	1	9432	my $encstr = shift;
178	131					177	my @tokens;
179	131					423	local($1,$2,$3);
180	131					214	$@ = ''; ### error-return
181
182							### Collapse boundaries between adjacent encoded words:
183	131					346	$encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
184	131					332	pos($encstr) = 0;
185							### print STDOUT "ENC = [", $encstr, "]\n";
186
187							### Decode:
188	131					229	my ($charset, $encoding, $enc, $dec);
189	131					162	while (1) {
190	337	100				806	last if (pos($encstr) >= length($encstr));
191	206					283	my $pos = pos($encstr); ### save it
192
193							### Case 1: are we looking at "=?..?..?="?
194	206	100				769	if ($encstr =~ m{\G # from where we left off..
195							=\?([^?]*) # "=?" + charset +
196							\?([bq]) # "?" + encoding +
197							\?([^?]+) # "?" + data maybe with spcs +
198							\?= # "?="
199							}xgi) {
200	66					235	($charset, $encoding, $enc) = ($1, lc($2), $3);
201	66	100				238	$dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
202	66					176	push @tokens, [$dec, $charset];
203	66					116	next;
204							}
205
206							### Case 2: are we looking at a bad "=?..." prefix?
207							### We need this to detect problems for case 3, which stops at "=?":
208	140					266	pos($encstr) = $pos; # reset the pointer.
209	140	100				433	if ($encstr =~ m{\G=\?}xg) {
210	4					15	$@ .= qq\|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n\|;
211	4					9	push @tokens, ['=?'];
212	4					7	next;
213							}
214
215							### Case 3: are we looking at ordinary text?
216	136					225	pos($encstr) = $pos; # reset the pointer.
217	136	50				1167	if ($encstr =~ m{\G # from where we left off...
218							(.*? # shortest possible string,
219							\n*) # followed by 0 or more NLs,
220							(?=(\Z\|=\?)) # terminated by "=?" or EOS
221							}sxg) {
222	136	50				380	length($1) or die "MIME::Words: internal logic err: empty token\n";
223	136					424	push @tokens, [$1];
224	136					222	next;
225							}
226
227							### Case 4: bug!
228	0					0	die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
229							"Please alert developer.\n";
230							}
231	131	100				594	return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
	86					399
232							}
233
234							#------------------------------
235
236							=item encode_mimeword RAW, [ENCODING], [CHARSET]
237
238							I
239							Encode a single RAW "word" that has unsafe characters.
240							The "word" will be encoded in its entirety.
241
242							### Encode "<>":
243							$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
244
245							You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
246							You may specify the CHARSET, which defaults to C.
247
248							=cut
249
250							sub encode_mimeword {
251	25			25	1	62	my $word = shift;
252	25		50			100	my $encoding = uc(shift \|\| 'Q');
253	25		50			81	my $charset = uc(shift \|\| 'ISO-8859-1');
254	25	50				91	my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
255	25					98	"=?$charset?$encoding?" . &$encfunc($word) . "?=";
256							}
257
258							#------------------------------
259
260							=item encode_mimewords RAW, [OPTS]
261
262							I
263							Given a RAW string, try to find and encode all "unsafe" sequences
264							of characters:
265
266							### Encode a string with some unsafe "words":
267							$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
268
269							Returns the encoded string.
270							Any arguments past the RAW string are taken to define a hash of options:
271
272							=over 4
273
274							=item Charset
275
276							Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
277							a.k.a. "Latin-1".
278
279							=item Encoding
280
281							The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
282
283							=back
284
285							B this is a quick-and-dirty solution, intended for character
286							sets which overlap ASCII. B
287							rules regarding the use of encoded words in message headers>.
288							You may want to roll your own variant,
289							using C, for your application.
290							I
291
292							=cut
293
294							sub encode_mimewords {
295	6			6	1	1808	my ($rawstr, %params) = @_;
296	6		100			50	my $charset = $params{Charset} \|\| 'ISO-8859-1';
297	6		50			45	my $encoding = lc($params{Encoding} \|\| 'q');
298
299							### Encode any "words" with unsafe characters.
300							### We limit such words to 18 characters, to guarantee that the
301							### worst-case encoding give us no more than 54 + ~10 < 75 characters
302	6					13	my $word;
303	6					22	local $1;
304	6					45	$rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]+\s*)}{ ### get next "word"
305	51					126	$word = $1;
306	51	100				677	(($word !~ /(?:[$NONPRINT])\|(?:^\s+$)/o)
307							? $word ### no unsafe chars
308							: encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
309							}xeg;
310	6					160	$rawstr =~ s/\?==\?/?= =?/g;
311	6					113	$rawstr;
312							}
313
314							1;
315							__END__