File Coverage

blib/lib/MIME/Words.pm

Criterion	Covered	Total	%
statement	73	76	96.0
branch	15	18	83.3
condition	5	8	62.5
subroutine	12	13	92.3
pod	3	3	100.0
total	108	118	91.5

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::Words;
2
3							=head1 NAME
4
5							MIME::Words - deal with RFC 2047 encoded words
6
7
8							=head1 SYNOPSIS
9
10							Before reading further, you should see L to make sure that
11							you understand where this module fits into the grand scheme of things.
12							Go on, do it now. I'll wait.
13
14							Ready? Ok...
15
16
17							use MIME::Words qw(:all);
18
19							### Decode the string into another string, forgetting the charsets:
20							$decoded = decode_mimewords(
21							'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
22							);
23
24							### Split string into array of decoded [DATA,CHARSET] pairs:
25							@decoded = decode_mimewords(
26							'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ',
27							);
28
29							### Encode a single unsafe word:
30							$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
31
32							### Encode a string, trying to find the unsafe words inside it:
33							$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
34
35
36
37							=head1 DESCRIPTION
38
39							Fellow Americans, you probably won't know what the hell this module
40							is for. Europeans, Russians, et al, you probably do. C<:-)>.
41
42							For example, here's a valid MIME header you might get:
43
44							From: =?US-ASCII?Q?Keith_Moore?=
45							To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=
46							CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard
47							Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
48							=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
49							=?US-ASCII?Q?.._cool!?=
50
51							The fields basically decode to (sorry, I can only approximate the
52							Latin characters with 7 bit sequences /o and 'e):
53
54							From: Keith Moore
55							To: Keld J/orn Simonsen
56							CC: Andr'e Pirard
57							Subject: If you can read this you understand the example... cool!
58
59
60							=head1 PUBLIC INTERFACE
61
62							=over 4
63
64							=cut
65
66							require 5.001;
67
68							### Pragmas:
69	31			31		168116	use strict;
	31					36
	31					834
70	31			31		107	use re 'taint';
	31					36
	31					1383
71	31			31		109	use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
	31					38
	31					1796
72
73							### Exporting:
74	31			31		117	use Exporter;
	31					39
	31					1923
75							%EXPORT_TAGS = (all => [qw(decode_mimewords
76							encode_mimeword
77							encode_mimewords
78							)]);
79							Exporter::export_ok_tags('all');
80
81							### Inheritance:
82							@ISA = qw(Exporter);
83
84							### Other modules:
85	31			31		12810	use MIME::Base64;
	31					16371
	31					1589
86	31			31		12444	use MIME::QuotedPrint;
	31					5028
	31					23226
87
88
89
90							#------------------------------
91							#
92							# Globals...
93							#
94							#------------------------------
95
96							### The package version, both in 1.23 style and usable by MakeMaker:
97							$VERSION = "5.509";
98
99							### Nonprintables (controls + x7F + 8bit):
100							my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
101
102
103							#------------------------------
104
105							# _decode_Q STRING
106							# Private: used by _decode_header() to decode "Q" encoding, which is
107							# almost, but not exactly, quoted-printable. :-P
108							sub _decode_Q {
109	54			54		46	my $str = shift;
110	54					73	local $1;
111	54					87	$str =~ s/_/\x20/g; # RFC-1522, Q rule 2
112	54					131	$str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
	272					510
113	54					102	$str;
114							}
115
116							# _encode_Q STRING
117							# Private: used by _encode_header() to decode "Q" encoding, which is
118							# almost, but not exactly, quoted-printable. :-P
119							sub _encode_Q {
120	25			25		16	my $str = shift;
121	25					30	local $1;
122	25					103	$str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
	238					336
123	25					92	$str;
124							}
125
126							# _decode_B STRING
127							# Private: used by _decode_header() to decode "B" encoding.
128							sub _decode_B {
129	12			12		21	my $str = shift;
130	12					64	decode_base64($str);
131							}
132
133							# _encode_B STRING
134							# Private: used by _decode_header() to decode "B" encoding.
135							sub _encode_B {
136	0			0		0	my $str = shift;
137	0					0	encode_base64($str, '');
138							}
139
140
141
142							#------------------------------
143
144							=item decode_mimewords ENCODED
145
146							I
147							Go through the string looking for RFC 2047-style "Q"
148							(quoted-printable, sort of) or "B" (base64) encoding, and decode them.
149
150							B splits the ENCODED string into a list of decoded
151							C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
152							data are returned in a 1-element array C<[DATA]>, giving an effective
153							CHARSET of C.
154
155							$enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ';
156							foreach (decode_mimewords($enc)) {
157							print "", ($_->[1] \|\| 'US-ASCII'), ": ", $_->[0], "\n";
158							}
159
160							B joins the "data" elements of the above
161							list together, and returns that. I
162							and probably I what you want, but if you know that all charsets
163							in the ENCODED string are identical, it might be useful to you.
164							(Before you use this, please see L,
165							which is probably what you want.)
166
167							In the event of a syntax error, $@ will be set to a description
168							of the error, but parsing will continue as best as possible (so as to
169							get I back when decoding headers).
170							$@ will be false if no error was detected.
171
172							Any arguments past the ENCODED string are taken to define a hash of options:
173
174							=cut
175
176							sub decode_mimewords {
177	132			132	1	5312	my $encstr = shift;
178	132					112	my @tokens;
179	132					289	local($1,$2,$3);
180	132					132	$@ = ''; ### error-return
181
182							### Collapse boundaries between adjacent encoded words:
183	132					284	$encstr =~ s{(\?\=)\s*(\=\?)}{$1$2}gs;
184	132					299	pos($encstr) = 0;
185							### print STDOUT "ENC = [", $encstr, "]\n";
186
187							### Decode:
188	132					149	my ($charset, $encoding, $enc, $dec);
189	132					106	while (1) {
190	339	100				544	last if (pos($encstr) >= length($encstr));
191	207					184	my $pos = pos($encstr); ### save it
192
193							### Case 1: are we looking at "=?..?..?="?
194	207	100				524	if ($encstr =~ m{\G # from where we left off..
195							=\?([^?]*) # "=?" + charset +
196							\?([bq]) # "?" + encoding +
197							\?([^?]+) # "?" + data maybe with spcs +
198							\?= # "?="
199							}xgi) {
200	66					156	($charset, $encoding, $enc) = ($1, lc($2), $3);
201	66	100				160	$dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
202	66					110	push @tokens, [$dec, $charset];
203	66					63	next;
204							}
205
206							### Case 2: are we looking at a bad "=?..." prefix?
207							### We need this to detect problems for case 3, which stops at "=?":
208	141					175	pos($encstr) = $pos; # reset the pointer.
209	141	100				272	if ($encstr =~ m{\G=\?}xg) {
210	4					9	$@ .= qq\|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n\|;
211	4					8	push @tokens, ['=?'];
212	4					4	next;
213							}
214
215							### Case 3: are we looking at ordinary text?
216	137					139	pos($encstr) = $pos; # reset the pointer.
217	137	50				772	if ($encstr =~ m{\G # from where we left off...
218							(.*? # shortest possible string,
219							\n*) # followed by 0 or more NLs,
220							(?=(\Z\|=\?)) # terminated by "=?" or EOS
221							}sxg) {
222	137	50				333	length($1) or die "MIME::Words: internal logic err: empty token\n";
223	137					284	push @tokens, [$1];
224	137					146	next;
225							}
226
227							### Case 4: bug!
228	0					0	die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
229							"Please alert developer.\n";
230							}
231	132	100				497	return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
	86					219
232							}
233
234							#------------------------------
235
236							=item encode_mimeword RAW, [ENCODING], [CHARSET]
237
238							I
239							Encode a single RAW "word" that has unsafe characters.
240							The "word" will be encoded in its entirety.
241
242							### Encode "<>":
243							$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
244
245							You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
246							You may specify the CHARSET, which defaults to C.
247
248							=cut
249
250							sub encode_mimeword {
251	25			25	1	132	my $word = shift;
252	25		50			43	my $encoding = uc(shift \|\| 'Q');
253	25		50			42	my $charset = uc(shift \|\| 'ISO-8859-1');
254	25	50				39	my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
255	25					47	"=?$charset?$encoding?" . &$encfunc($word) . "?=";
256							}
257
258							#------------------------------
259
260							=item encode_mimewords RAW, [OPTS]
261
262							I
263							Given a RAW string, try to find and encode all "unsafe" sequences
264							of characters:
265
266							### Encode a string with some unsafe "words":
267							$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
268
269							Returns the encoded string.
270							Any arguments past the RAW string are taken to define a hash of options:
271
272							=over 4
273
274							=item Charset
275
276							Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
277							a.k.a. "Latin-1".
278
279							=item Encoding
280
281							The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
282
283							=back
284
285							B this is a quick-and-dirty solution, intended for character
286							sets which overlap ASCII. B
287							rules regarding the use of encoded words in message headers>.
288							You may want to roll your own variant,
289							using C, for your application.
290							I
291
292							=cut
293
294							sub encode_mimewords {
295	6			6	1	937	my ($rawstr, %params) = @_;
296	6		100			28	my $charset = $params{Charset} \|\| 'ISO-8859-1';
297	6		50			27	my $encoding = lc($params{Encoding} \|\| 'q');
298
299							### Encode any "words" with unsafe characters.
300							### We limit such words to 18 characters, to guarantee that the
301							### worst-case encoding give us no more than 54 + ~10 < 75 characters
302	6					5	my $word;
303	6					14	local $1;
304	6					32	$rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]+\s*)}{ ### get next "word"
305	51					53	$word = $1;
306	51	100				232	(($word !~ /(?:[$NONPRINT])\|(?:^\s+$)/o)
307							? $word ### no unsafe chars
308							: encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
309							}xeg;
310	6					22	$rawstr =~ s/\?==\?/?= =?/g;
311	6					26	$rawstr;
312							}
313
314							1;
315							__END__