File Coverage

blib/lib/MIME/WordDecoder.pm

Criterion	Covered	Total	%
statement	103	142	72.5
branch	17	44	38.6
condition	14	32	43.7
subroutine	28	34	82.3
pod	7	9	77.7
total	169	261	64.7

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::WordDecoder;
2
3							=head1 NAME
4
5							MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6
7							WARNING: Most of this module is deprecated and may disappear. The only
8							function you should use for MIME decoding is "mime_to_perl_string".
9
10							=head1 SYNOPSIS
11
12							See L for the basics of encoded words.
13							See L<"DESCRIPTION"> for how this class works.
14
15							use MIME::WordDecoder;
16
17
18							### Get the default word-decoder (used by unmime()):
19							$wd = default MIME::WordDecoder;
20
21							### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22							$wd = supported MIME::WordDecoder "ISO-8859-1";
23
24
25							### Decode a MIME string (e.g., into Latin1) via the default decoder:
26							$str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27
28							### Decode a string using the default decoder, non-OO style:
29							$str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30
31							### Decode a string to an internal Perl string, non-OO style
32							### The result is likely to have the UTF8 flag ON.
33							$str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34
35							=head1 DESCRIPTION
36
37							WARNING: Most of this module is deprecated and may disappear. It
38							duplicates (badly) the function of the standard 'Encode' module. The
39							only function you should rely on is mime_to_perl_string.
40
41							A MIME::WordDecoder consists, fundamentally, of a hash which maps
42							a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43							knows how to take bytes in that character set and turn them into
44							the target string representation. Ideally, this target representation
45							would be Unicode, but we don't want to overspecify the translation
46							that takes place: if you want to convert MIME strings directly to Big5,
47							that's your own decision.
48
49							The subroutine will be invoked with two arguments: DATA (the data in
50							the given character set), and CHARSET (the upcased character set name).
51
52							For example:
53
54							### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55							sub keep7bit {
56							local $_ = shift;
57							tr/\x00-\x7F/#/c;
58							$_;
59							}
60
61							Here's a decoder which uses that:
62
63							### Construct a decoder:
64							$wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65							'ISO-8859-1' => \&keep7bit,
66							'ISO-8859-2' => \&keep7bit,
67							'Big5' => "WARN",
68							'*' => "DIE"});
69
70							### Convert some MIME text to a pure ASCII string...
71							$ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72
73							### ...which will now hold: "To: Keld J#rn Simonsen "
74
75							The UTF-8 built-in decoder decodes everything into Perl's internal
76							string format, possibly turning on the internal UTF8 flag. Use it like
77							this:
78
79							$wd = supported MIME::WordDecoder 'UTF-8';
80							$perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81							# perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82
83							Generally, you should use the UTF-8 decoder in preference to "unmime".
84
85							=head1 PUBLIC INTERFACE
86
87							=over
88
89							=cut
90
91	25			25		56520	use strict;
	25					38
	25					641
92	25			25		76	use Carp qw( carp croak );
	25					28
	25					1023
93	25			25		7221	use MIME::Words qw(decode_mimewords);
	25					38
	25					1098
94	25			25		102	use Exporter;
	25					43
	25					678
95	25			25		80	use vars qw(@ISA @EXPORT);
	25					29
	25					14479
96
97							@ISA = qw(Exporter);
98							@EXPORT = qw( unmime mime_to_perl_string );
99
100
101
102							#------------------------------
103							#
104							# Globals
105							#
106							#------------------------------
107
108							### Decoders.
109							my %DecoderFor = ();
110
111							### Standard handlers.
112							my %Handler =
113							(
114							KEEP => sub {$_[0]},
115							IGNORE => sub {''},
116							WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117							DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118							);
119
120							### Global default decoder. We init it below.
121							my $Default;
122
123							### Global UTF8 decoder.
124							my $DefaultUTF8;
125
126							#------------------------------
127
128							=item default [DECODER]
129
130							I
131							Get/set the default DECODER object.
132
133							=cut
134
135							sub default {
136	25			25	1	110	my $class = shift;
137	25	50				51	if (@_) {
138	25					35	$Default = shift;
139							}
140	25					81	$Default;
141							}
142
143							#------------------------------
144
145							=item supported CHARSET, [DECODER]
146
147							I
148							If just CHARSET is given, returns a decoder object which maps
149							data into that character set (the character set is forced to
150							all-uppercase).
151
152							$wd = supported MIME::WordDecoder "ISO-8859-1";
153
154							If DECODER is given, installs such an object:
155
156							MIME::WordDecoder->supported("ISO-8859-1" =>
157							(new MIME::WordDecoder::ISO_8859 "1"));
158
159							You should not override this method.
160
161							=cut
162
163							sub supported {
164	215			215	1	1195	my ($class, $charset, $decoder) = @_;
165	215	50				326	$DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166	215					443	$DecoderFor{uc($charset)};
167							}
168
169							#------------------------------
170
171							=item new [\@HANDLERS]
172
173							I
174							If \@HANDLERS is given, then @HANDLERS is passed to handler()
175							to initialize the internal map.
176
177							=cut
178
179							sub new {
180	476			476	1	370	my ($class, $h) = @_;
181	476					808	my $self = bless { MWD_Map=>{} }, $class;
182
183							### Init the map:
184	476					1556	$self->handler(@$h);
185
186							### Add fallbacks:
187	476		66			1556	$self->{MWD_Map}{'*'} \|\|= $Handler{WARN};
188	476		66			1125	$self->{MWD_Map}{'raw'} \|\|= $self->{MWD_Map}{'US-ASCII'};
189	476					503	$self;
190							}
191
192							#------------------------------
193
194							=item handler CHARSET=>\&SUBREF, ...
195
196							I
197							Set the handler SUBREF for a given CHARSET, for as many pairs
198							as you care to supply.
199
200							When performing the translation of a MIME-encoded string, a
201							given SUBREF will be invoked when translating a block of text
202							in character set CHARSET. The subroutine will be invoked with
203							the following arguments:
204
205							DATA - the data in the given character set.
206							CHARSET - the upcased character set name, which may prove useful
207							if you are using the same SUBREF for multiple CHARSETs.
208							DECODER - the decoder itself, if it contains configuration information
209							that your handler function needs.
210
211							For example:
212
213							$wd = new MIME::WordDecoder;
214							$wd->handler('US-ASCII' => "KEEP");
215							$wd->handler('ISO-8859-1' => \&handle_latin1,
216							'ISO-8859-2' => \&handle_latin1,
217							'*' => "DIE");
218
219							Notice that, much as with %SIG, the SUBREF can also be taken from
220							a set of special keywords:
221
222							KEEP Pass data through unchanged.
223							IGNORE Ignore data in this character set, without warning.
224							WARN Ignore data in this character set, with warning.
225							DIE Fatal exception with "can't handle character set" message.
226
227							The subroutine for the special CHARSET of 'raw' is used for raw
228							(non-MIME-encoded) text, which is supposed to be US-ASCII.
229							The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230							at the time of construction.
231
232							The subroutine for the special CHARSET of '*' is used for any
233							unrecognized character set. The default action for '*' is WARN.
234
235							=cut
236
237							sub handler {
238	951			951	1	651	my $self = shift;
239
240							### Copy the hash, and edit it:
241	951					1409	while (@_) {
242	928					678	my $c = shift;
243	928					553	my $sub = shift;
244	928					1074	$self->{MWD_Map}{$c} = $self->real_handler($sub);
245							}
246	951					627	$self;
247							}
248
249							#------------------------------
250
251							=item decode STRING
252
253							I
254							Decode a STRING which might contain MIME-encoded components into a
255							local representation (e.g., UTF-8, etc.).
256
257							=cut
258
259							sub decode {
260	94			94	1	937	my ($self, $str) = @_;
261	94	50				172	defined($str) or return undef;
262							join('', map {
263							### Get the data and (upcased) charset:
264	94					236	my $data = $_->[0];
	119					146
265	119	100				194	my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266	119					121	$charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267
268							### Get the handler; guess if never seen before:
269							defined($self->{MWD_Map}{$charset}) or
270	119	100	100			402	$self->{MWD_Map}{$charset} =
271							($self->real_handler($self->guess_handler($charset)) \|\| 0);
272	119		66			323	my $subr = $self->{MWD_Map}{$charset} \|\| $self->{MWD_Map}{'*'};
273
274							### Map this chunk:
275	119					193	&$subr($data, $charset, $self);
276							} decode_mimewords($str));
277							}
278
279							#------------------------------
280							#
281							# guess_handler CHARSET
282							#
283							# Instance method.
284							# An unrecognized charset has been seen. Guess a handler subref
285							# for the given charset, returning false if there is none.
286							# Successful mappings will be cached in the main map.
287							#
288							sub guess_handler {
289	14			14	0	41	undef;
290							}
291
292							#------------------------------
293							#
294							# real_handler HANDLER
295							#
296							# Instance method.
297							# Translate the given handler, which might be a subref or a string.
298							#
299							sub real_handler {
300	944			944	0	745	my ($self, $sub) = @_;
301							(!$sub) or
302							(ref($sub) eq 'CODE') or
303	944	100	33			3174	$sub = ($Handler{$sub} \|\| croak "bad named handler: $sub\n");
			100
304	944					1866	$sub;
305							}
306
307							#------------------------------
308
309							=item unmime STRING
310
311							I
312							Decode the given STRING using the default() decoder.
313							See L.
314
315							You should consider using the UTF-8 decoder instead. It decodes
316							MIME strings into Perl's internal string format.
317
318							=cut
319
320							sub unmime($) {
321	0			0	1	0	my $str = shift;
322	0					0	$Default->decode($str);
323							}
324
325							=item mime_to_perl_string
326
327							I
328							Decode the given STRING into an internal Perl Unicode string.
329							You should use this function in preference to all others.
330
331							The result of mime_to_perl_string is likely to have Perl's
332							UTF8 flag set.
333
334							=cut
335
336							sub mime_to_perl_string($) {
337	1			1	1	802	my $str = shift;
338	1					4	$DecoderFor{'UTF-8'}->decode($str);
339							}
340
341							=back
342
343							=cut
344
345
346
347
348
349							=head1 SUBCLASSES
350
351							=over
352
353							=cut
354
355							#------------------------------------------------------------
356							#------------------------------------------------------------
357
358							=item MIME::WordDecoder::ISO_8859
359
360							A simple decoder which keeps US-ASCII and the 7-bit characters
361							of ISO-8859 character sets and UTF8, and also keeps 8-bit
362							characters from the indicated character set.
363
364							### Construct:
365							$wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366
367							### What to translate unknown characters to (can also use empty):
368							### Default is "?".
369							$wd->unknown("?");
370
371							### Collapse runs of unknown characters to a single unknown()?
372							### Default is false.
373							$wd->collapse(1);
374
375
376							According to B
377							(ca. November 2000):
378
379							ISO 8859 is a full series of 10 (and soon even more) standardized
380							multilingual single-byte coded (8bit) graphic character sets for
381							writing in alphabetic languages:
382
383							1. Latin1 (West European)
384							2. Latin2 (East European)
385							3. Latin3 (South European)
386							4. Latin4 (North European)
387							5. Cyrillic
388							6. Arabic
389							7. Greek
390							8. Hebrew
391							9. Latin5 (Turkish)
392							10. Latin6 (Nordic)
393
394							The ISO 8859 charsets are not even remotely as complete as the truly
395							great Unicode but they have been around and usable for quite a while
396							(first registered Internet charsets for use with MIME) and have
397							already offered a major improvement over the plain 7bit US-ASCII.
398
399							Characters 0 to 127 are always identical with US-ASCII and the
400							positions 128 to 159 hold some less used control characters: the
401							so-called C1 set from ISO 6429.
402
403							=cut
404
405							package MIME::WordDecoder::ISO_8859;
406
407	25			25		113	use strict;
	25					35
	25					479
408	25			25		75	use vars qw(@ISA);
	25					27
	25					15795
409							@ISA = qw( MIME::WordDecoder );
410
411
412							#------------------------------
413							#
414							# HANDLERS
415							#
416							#------------------------------
417
418							### Keep 7bit characters.
419							### Turn all else to the special \x00.
420							sub h_keep7bit {
421	2			2		3	local $_ = $_[0];
422							# my $unknown = $_[2]->{MWDI_Unknown};
423
424	2					3	s{[\x80-\xFF]}{\x00}g;
425	2					3	$_;
426							}
427
428							### Note: should use Unicode::String, converting/manipulating
429							### everything into full Unicode form.
430
431							### Keep 7bit UTF8 characters (ASCII).
432							### Keep ISO-8859-1 if this decoder is for Latin-1.
433							### Turn all else to the special \x00.
434							sub h_utf8 {
435	0			0		0	local $_ = $_[0];
436							# my $unknown = $_[2]->{MWDI_Unknown};
437	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
438							#print STDERR "UTF8 in: <$_>\n";
439
440	0					0	local($1,$2,$3);
441	0					0	my $tgt = '';
442	0		0			0	while (m{\G(
443							([\x00-\x7F]) \| # 0xxxxxxx
444							([\xC0-\xDF] [\x80-\xBF]) \| # 110yyyyy 10xxxxxx
445							([\xE0-\xEF] [\x80-\xBF]{2}) \| # 1110zzzz 10yyyyyy 10xxxxxx
446							([\xF0-\xF7] [\x80-\xBF]{3}) \| # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447							. # error; synch
448							)}gcsx and ($1 ne '')) {
449
450	0	0	0			0	if (defined($2)) { $tgt .= $2 }
	0	0				0
451	0					0	elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452	0					0	else { $tgt .= "\x00" }
453							}
454
455							#print STDERR "UTF8 out: <$tgt>\n";
456	0					0	$tgt;
457							}
458
459							### Keep characters which are 7bit in UTF8 (ASCII).
460							### Keep ISO-8859-1 if this decoder is for Latin-1.
461							### Turn all else to the special \x00.
462							sub h_utf16 {
463	0			0		0	local $_ = $_[0];
464							# my $unknown = $_[2]->{MWDI_Unknown};
465	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
466							#print STDERR "UTF16 in: <$_>\n";
467
468	0					0	local($1,$2,$3,$4,$5);
469	0					0	my $tgt = '';
470	0		0			0	while (m{\G(
471							( \x00 ([\x00-\x7F])) \| # 00000000 0xxxxxxx
472							( \x00 ([\x80-\xFF])) \| # 00000000 1xxxxxxx
473							( [^\x00] [\x00-\xFF]) \| # etc
474							)
475							}gcsx and ($1 ne '')) {
476
477	0	0	0			0	if (defined($2)) { $tgt .= $3 }
	0	0				0
478	0					0	elsif (defined($4) && $latin1) { $tgt .= $5 }
479	0					0	else { $tgt .= "\x00" }
480							}
481
482							#print STDERR "UTF16 out: <$tgt>\n";
483	0					0	$tgt;
484							}
485
486
487							#------------------------------
488							#
489							# PUBLIC INTERFACE
490							#
491							#------------------------------
492
493							#------------------------------
494							#
495							# new NUMBER
496							#
497							sub new {
498	450			450		849	my ($class, $num) = @_;
499
500	450					593	my $self = $class->SUPER::new();
501	450					544	$self->handler('raw' => 'KEEP',
502							'US-ASCII' => 'KEEP');
503
504	450					415	$self->{MWDI_Num} = $num;
505	450					397	$self->{MWDI_Unknown} = "?";
506	450					339	$self->{MWDI_Collapse} = 0;
507	450					899	$self;
508							}
509
510							#------------------------------
511							#
512							# guess_handler CHARSET
513							#
514							sub guess_handler {
515	2			2		3	my ($self, $charset) = @_;
516							return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517	2	100	66			18	($1 eq $self->{MWDI_Num}));
518	1	50				7	return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519	0	0				0	return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520	0	0				0	return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521	0					0	undef;
522							}
523
524							#------------------------------
525							#
526							# unknown [REPLACEMENT]
527							#
528							sub unknown {
529	0			0		0	my $self = shift;
530	0	0				0	$self->{MWDI_Unknown} = shift if @_;
531	0					0	$self->{MWDI_Unknown};
532							}
533
534							#------------------------------
535							#
536							# collapse [YESNO]
537							#
538							sub collapse {
539	0			0		0	my $self = shift;
540	0	0				0	$self->{MWDI_Collapse} = shift if @_;
541	0					0	$self->{MWDI_Collapse};
542							}
543
544							#------------------------------
545							#
546							# decode STRING
547							#
548							sub decode {
549	10			10		6387	my $self = shift;
550
551							### Do inherited action:
552	10					26	my $basic = $self->SUPER::decode(@_);
553	10	50				22	defined($basic) or return undef;
554
555							### Translate/consolidate illegal characters:
556	10	50				19	$basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557	10					8	$basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558	10					15	$basic;
559							}
560
561							#------------------------------------------------------------
562							#------------------------------------------------------------
563
564							=item MIME::WordDecoder::US_ASCII
565
566							A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567							You're probably better off using ISO-8859-1.
568
569							=cut
570
571							package MIME::WordDecoder::US_ASCII;
572
573	25			25		113	use strict;
	25					29
	25					459
574	25			25		74	use vars qw(@ISA);
	25					24
	25					3223
575							@ISA = qw( MIME::WordDecoder::ISO_8859 );
576
577							sub new {
578	25			25		40	my ($class) = @_;
579	25					107	return $class->SUPER::new("1");
580							}
581
582							sub decode {
583	0			0		0	my $self = shift;
584
585							### Do inherited action:
586	0					0	my $basic = $self->SUPER::decode(@_);
587	0	0				0	defined($basic) or return undef;
588
589							### Translate/consolidate 8-bit characters:
590	0	0				0	$basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591	0					0	$basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592	0					0	$basic;
593							}
594
595							=back
596
597							=cut
598
599							package MIME::WordDecoder::UTF_8;
600	25			25		142	use strict;
	25					25
	25					410
601	25			25		12636	use Encode qw();
	25					182119
	25					533
602	25			25		129	use Carp qw( carp );
	25					29
	25					1105
603	25			25		93	use vars qw(@ISA);
	25					23
	25					5317
604
605							@ISA = qw( MIME::WordDecoder );
606
607							sub h_convert_to_utf8
608							{
609	96			96		107	my ($data, $charset, $decoder) = @_;
610	96	100				187	$charset = 'US-ASCII' if ($charset eq 'raw');
611	96					226	my $enc = Encode::find_encoding($charset);
612	96	50				6910	if (!$enc) {
613	0					0	carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614	0					0	return '';
615							}
616	96					503	my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617	96					498	return $ans;
618							}
619
620							sub new {
621	25			25		70	my ($class) = @_;
622	25					96	my $self = $class->SUPER::new();
623	25					59	$self->handler('*' => \&h_convert_to_utf8);
624							}
625
626
627							#------------------------------------------------------------
628							#------------------------------------------------------------
629
630							package MIME::WordDecoder;
631
632							### Now we can init the default handler.
633							$Default = (MIME::WordDecoder::ISO_8859->new('1'));
634
635
636							### Add US-ASCII handler:
637							$DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638
639							### Add ISO-8859-{1..15} handlers:
640							for (1..15) {
641							$DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642							}
643
644							### UTF-8
645							$DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646
647							1; # end the module
648							__END__