File Coverage

blib/lib/MIME/WordDecoder.pm

Criterion	Covered	Total	%
statement	103	142	72.5
branch	17	44	38.6
condition	14	32	43.7
subroutine	28	34	82.3
pod	7	9	77.7
total	169	261	64.7

line	stmt	bran	cond	sub	pod	time	code
1							package MIME::WordDecoder;
2
3							=head1 NAME
4
5							MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6
7							WARNING: Most of this module is deprecated and may disappear. The only
8							function you should use for MIME decoding is "mime_to_perl_string".
9
10							=head1 SYNOPSIS
11
12							See L for the basics of encoded words.
13							See L<"DESCRIPTION"> for how this class works.
14
15							use MIME::WordDecoder;
16
17
18							### Get the default word-decoder (used by unmime()):
19							$wd = default MIME::WordDecoder;
20
21							### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22							$wd = supported MIME::WordDecoder "ISO-8859-1";
23
24
25							### Decode a MIME string (e.g., into Latin1) via the default decoder:
26							$str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27
28							### Decode a string using the default decoder, non-OO style:
29							$str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30
31							### Decode a string to an internal Perl string, non-OO style
32							### The result is likely to have the UTF8 flag ON.
33							$str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34
35							=head1 DESCRIPTION
36
37							WARNING: Most of this module is deprecated and may disappear. It
38							duplicates (badly) the function of the standard 'Encode' module. The
39							only function you should rely on is mime_to_perl_string.
40
41							A MIME::WordDecoder consists, fundamentally, of a hash which maps
42							a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43							knows how to take bytes in that character set and turn them into
44							the target string representation. Ideally, this target representation
45							would be Unicode, but we don't want to overspecify the translation
46							that takes place: if you want to convert MIME strings directly to Big5,
47							that's your own decision.
48
49							The subroutine will be invoked with two arguments: DATA (the data in
50							the given character set), and CHARSET (the upcased character set name).
51
52							For example:
53
54							### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55							sub keep7bit {
56							local $_ = shift;
57							tr/\x00-\x7F/#/c;
58							$_;
59							}
60
61							Here's a decoder which uses that:
62
63							### Construct a decoder:
64							$wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65							'ISO-8859-1' => \&keep7bit,
66							'ISO-8859-2' => \&keep7bit,
67							'Big5' => "WARN",
68							'*' => "DIE"});
69
70							### Convert some MIME text to a pure ASCII string...
71							$ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72
73							### ...which will now hold: "To: Keld J#rn Simonsen "
74
75							The UTF-8 built-in decoder decodes everything into Perl's internal
76							string format, possibly turning on the internal UTF8 flag. Use it like
77							this:
78
79							$wd = supported MIME::WordDecoder 'UTF-8';
80							$perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81							# perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82
83							Generally, you should use the UTF-8 decoder in preference to "unmime".
84
85							=head1 PUBLIC INTERFACE
86
87							=over
88
89							=cut
90
91	29			29		141290	use strict;
	29					82
	29					2876
92	29			29		151	use Carp qw( carp croak );
	29					54
	29					1960
93	29			29		13406	use MIME::Words qw(decode_mimewords);
	29					96
	29					2036
94	29			29		251	use Exporter;
	29					69
	29					1265
95	29			29		148	use vars qw(@ISA @EXPORT);
	29					60
	29					30153
96
97							@ISA = qw(Exporter);
98							@EXPORT = qw( unmime mime_to_perl_string );
99
100
101
102							#------------------------------
103							#
104							# Globals
105							#
106							#------------------------------
107
108							### Decoders.
109							my %DecoderFor = ();
110
111							### Standard handlers.
112							my %Handler =
113							(
114							KEEP => sub {$_[0]},
115							IGNORE => sub {''},
116							WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117							DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118							);
119
120							### Global default decoder. We init it below.
121							my $Default;
122
123							### Global UTF8 decoder.
124							my $DefaultUTF8;
125
126							#------------------------------
127
128							=item default [DECODER]
129
130							I
131							Get/set the default DECODER object.
132
133							=cut
134
135							sub default {
136	25			25	1	316	my $class = shift;
137	25	50				83	if (@_) {
138	25					145	$Default = shift;
139							}
140	25					53	$Default;
141							}
142
143							#------------------------------
144
145							=item supported CHARSET, [DECODER]
146
147							I
148							If just CHARSET is given, returns a decoder object which maps
149							data into that character set (the character set is forced to
150							all-uppercase).
151
152							$wd = supported MIME::WordDecoder "ISO-8859-1";
153
154							If DECODER is given, installs such an object:
155
156							MIME::WordDecoder->supported("ISO-8859-1" =>
157							(new MIME::WordDecoder::ISO_8859 "1"));
158
159							You should not override this method.
160
161							=cut
162
163							sub supported {
164	218			218	1	139446	my ($class, $charset, $decoder) = @_;
165	218	50				529	$DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166	218					3200	$DecoderFor{uc($charset)};
167							}
168
169							#------------------------------
170
171							=item new [\@HANDLERS]
172
173							I
174							If \@HANDLERS is given, then @HANDLERS is passed to handler()
175							to initialize the internal map.
176
177							=cut
178
179							sub new {
180	548			548	1	1013	my ($class, $h) = @_;
181	548					1461	my $self = bless { MWD_Map=>{} }, $class;
182
183							### Init the map:
184	548					1770	$self->handler(@$h);
185
186							### Add fallbacks:
187	548		66			2788	$self->{MWD_Map}{'*'} \|\|= $Handler{WARN};
188	548		66			2292	$self->{MWD_Map}{'raw'} \|\|= $self->{MWD_Map}{'US-ASCII'};
189	548					1005	$self;
190							}
191
192							#------------------------------
193
194							=item handler CHARSET=>\&SUBREF, ...
195
196							I
197							Set the handler SUBREF for a given CHARSET, for as many pairs
198							as you care to supply.
199
200							When performing the translation of a MIME-encoded string, a
201							given SUBREF will be invoked when translating a block of text
202							in character set CHARSET. The subroutine will be invoked with
203							the following arguments:
204
205							DATA - the data in the given character set.
206							CHARSET - the upcased character set name, which may prove useful
207							if you are using the same SUBREF for multiple CHARSETs.
208							DECODER - the decoder itself, if it contains configuration information
209							that your handler function needs.
210
211							For example:
212
213							$wd = new MIME::WordDecoder;
214							$wd->handler('US-ASCII' => "KEEP");
215							$wd->handler('ISO-8859-1' => \&handle_latin1,
216							'ISO-8859-2' => \&handle_latin1,
217							'*' => "DIE");
218
219							Notice that, much as with %SIG, the SUBREF can also be taken from
220							a set of special keywords:
221
222							KEEP Pass data through unchanged.
223							IGNORE Ignore data in this character set, without warning.
224							WARN Ignore data in this character set, with warning.
225							DIE Fatal exception with "can't handle character set" message.
226
227							The subroutine for the special CHARSET of 'raw' is used for raw
228							(non-MIME-encoded) text, which is supposed to be US-ASCII.
229							The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230							at the time of construction.
231
232							The subroutine for the special CHARSET of '*' is used for any
233							unrecognized character set. The default action for '*' is WARN.
234
235							=cut
236
237							sub handler {
238	1095			1095	1	1509	my $self = shift;
239
240							### Copy the hash, and edit it:
241	1095					2110	while (@_) {
242	1068					1616	my $c = shift;
243	1068					1411	my $sub = shift;
244	1068					2301	$self->{MWD_Map}{$c} = $self->real_handler($sub);
245							}
246	1095					1769	$self;
247							}
248
249							#------------------------------
250
251							=item decode STRING
252
253							I
254							Decode a STRING which might contain MIME-encoded components into a
255							local representation (e.g., UTF-8, etc.).
256
257							=cut
258
259							sub decode {
260	95			95	1	1521	my ($self, $str) = @_;
261	95	50				250	defined($str) or return undef;
262							join('', map {
263							### Get the data and (upcased) charset:
264	95					370	my $data = $_->[0];
	120					235
265	120	100				385	my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266	120					253	$charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267
268							### Get the handler; guess if never seen before:
269							defined($self->{MWD_Map}{$charset}) or
270	120	100	100			490	$self->{MWD_Map}{$charset} =
271							($self->real_handler($self->guess_handler($charset)) \|\| 0);
272	120		66			538	my $subr = $self->{MWD_Map}{$charset} \|\| $self->{MWD_Map}{'*'};
273
274							### Map this chunk:
275	120					339	&$subr($data, $charset, $self);
276							} decode_mimewords($str));
277							}
278
279							#------------------------------
280							#
281							# guess_handler CHARSET
282							#
283							# Instance method.
284							# An unrecognized charset has been seen. Guess a handler subref
285							# for the given charset, returning false if there is none.
286							# Successful mappings will be cached in the main map.
287							#
288							sub guess_handler {
289	15			15	0	134	undef;
290							}
291
292							#------------------------------
293							#
294							# real_handler HANDLER
295							#
296							# Instance method.
297							# Translate the given handler, which might be a subref or a string.
298							#
299							sub real_handler {
300	1085			1085	0	1800	my ($self, $sub) = @_;
301							(!$sub) or
302							(ref($sub) eq 'CODE') or
303	1085	100	33			4124	$sub = ($Handler{$sub} \|\| croak "bad named handler: $sub\n");
			100
304	1085					3213	$sub;
305							}
306
307							#------------------------------
308
309							=item unmime STRING
310
311							I
312							Decode the given STRING using the default() decoder.
313							See L.
314
315							You should consider using the UTF-8 decoder instead. It decodes
316							MIME strings into Perl's internal string format.
317
318							=cut
319
320							sub unmime($) {
321	0			0	1	0	my $str = shift;
322	0					0	$Default->decode($str);
323							}
324
325							=item mime_to_perl_string
326
327							I
328							Decode the given STRING into an internal Perl Unicode string.
329							You should use this function in preference to all others.
330
331							The result of mime_to_perl_string is likely to have Perl's
332							UTF8 flag set.
333
334							=cut
335
336							sub mime_to_perl_string($) {
337	1			1	1	2204	my $str = shift;
338	1					5	$DecoderFor{'UTF-8'}->decode($str);
339							}
340
341							=back
342
343							=cut
344
345
346
347
348
349							=head1 SUBCLASSES
350
351							=over
352
353							=cut
354
355							#------------------------------------------------------------
356							#------------------------------------------------------------
357
358							=item MIME::WordDecoder::ISO_8859
359
360							A simple decoder which keeps US-ASCII and the 7-bit characters
361							of ISO-8859 character sets and UTF8, and also keeps 8-bit
362							characters from the indicated character set.
363
364							### Construct:
365							$wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366
367							### What to translate unknown characters to (can also use empty):
368							### Default is "?".
369							$wd->unknown("?");
370
371							### Collapse runs of unknown characters to a single unknown()?
372							### Default is false.
373							$wd->collapse(1);
374
375
376							According to B
377							(ca. November 2000):
378
379							ISO 8859 is a full series of 10 (and soon even more) standardized
380							multilingual single-byte coded (8bit) graphic character sets for
381							writing in alphabetic languages:
382
383							1. Latin1 (West European)
384							2. Latin2 (East European)
385							3. Latin3 (South European)
386							4. Latin4 (North European)
387							5. Cyrillic
388							6. Arabic
389							7. Greek
390							8. Hebrew
391							9. Latin5 (Turkish)
392							10. Latin6 (Nordic)
393
394							The ISO 8859 charsets are not even remotely as complete as the truly
395							great Unicode but they have been around and usable for quite a while
396							(first registered Internet charsets for use with MIME) and have
397							already offered a major improvement over the plain 7bit US-ASCII.
398
399							Characters 0 to 127 are always identical with US-ASCII and the
400							positions 128 to 159 hold some less used control characters: the
401							so-called C1 set from ISO 6429.
402
403							=cut
404
405							package MIME::WordDecoder::ISO_8859;
406
407	29			29		234	use strict;
	29					92
	29					849
408	29			29		153	use vars qw(@ISA);
	29					50
	29					33522
409							@ISA = qw( MIME::WordDecoder );
410
411
412							#------------------------------
413							#
414							# HANDLERS
415							#
416							#------------------------------
417
418							### Keep 7bit characters.
419							### Turn all else to the special \x00.
420							sub h_keep7bit {
421	2			2		6	local $_ = $_[0];
422							# my $unknown = $_[2]->{MWDI_Unknown};
423
424	2					6	s{[\x80-\xFF]}{\x00}g;
425	2					7	$_;
426							}
427
428							### Note: should use Unicode::String, converting/manipulating
429							### everything into full Unicode form.
430
431							### Keep 7bit UTF8 characters (ASCII).
432							### Keep ISO-8859-1 if this decoder is for Latin-1.
433							### Turn all else to the special \x00.
434							sub h_utf8 {
435	0			0		0	local $_ = $_[0];
436							# my $unknown = $_[2]->{MWDI_Unknown};
437	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
438							#print STDERR "UTF8 in: <$_>\n";
439
440	0					0	local($1,$2,$3);
441	0					0	my $tgt = '';
442	0		0			0	while (m{\G(
443							([\x00-\x7F]) \| # 0xxxxxxx
444							([\xC0-\xDF] [\x80-\xBF]) \| # 110yyyyy 10xxxxxx
445							([\xE0-\xEF] [\x80-\xBF]{2}) \| # 1110zzzz 10yyyyyy 10xxxxxx
446							([\xF0-\xF7] [\x80-\xBF]{3}) \| # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447							. # error; synch
448							)}gcsx and ($1 ne '')) {
449
450	0	0	0			0	if (defined($2)) { $tgt .= $2 }
	0	0				0
451	0					0	elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452	0					0	else { $tgt .= "\x00" }
453							}
454
455							#print STDERR "UTF8 out: <$tgt>\n";
456	0					0	$tgt;
457							}
458
459							### Keep characters which are 7bit in UTF8 (ASCII).
460							### Keep ISO-8859-1 if this decoder is for Latin-1.
461							### Turn all else to the special \x00.
462							sub h_utf16 {
463	0			0		0	local $_ = $_[0];
464							# my $unknown = $_[2]->{MWDI_Unknown};
465	0					0	my $latin1 = ($_[2]->{MWDI_Num} == 1);
466							#print STDERR "UTF16 in: <$_>\n";
467
468	0					0	local($1,$2,$3,$4,$5);
469	0					0	my $tgt = '';
470	0		0			0	while (m{\G(
471							( \x00 ([\x00-\x7F])) \| # 00000000 0xxxxxxx
472							( \x00 ([\x80-\xFF])) \| # 00000000 1xxxxxxx
473							( [^\x00] [\x00-\xFF]) \| # etc
474							)
475							}gcsx and ($1 ne '')) {
476
477	0	0	0			0	if (defined($2)) { $tgt .= $3 }
	0	0				0
478	0					0	elsif (defined($4) && $latin1) { $tgt .= $5 }
479	0					0	else { $tgt .= "\x00" }
480							}
481
482							#print STDERR "UTF16 out: <$tgt>\n";
483	0					0	$tgt;
484							}
485
486
487							#------------------------------
488							#
489							# PUBLIC INTERFACE
490							#
491							#------------------------------
492
493							#------------------------------
494							#
495							# new NUMBER
496							#
497							sub new {
498	518			518		153440	my ($class, $num) = @_;
499
500	518					1167	my $self = $class->SUPER::new();
501	518					1922	$self->handler('raw' => 'KEEP',
502							'US-ASCII' => 'KEEP');
503
504	518					885	$self->{MWDI_Num} = $num;
505	518					858	$self->{MWDI_Unknown} = "?";
506	518					845	$self->{MWDI_Collapse} = 0;
507	518					1613	$self;
508							}
509
510							#------------------------------
511							#
512							# guess_handler CHARSET
513							#
514							sub guess_handler {
515	2			2		6	my ($self, $charset) = @_;
516							return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517	2	100	66			26	($1 eq $self->{MWDI_Num}));
518	1	50				10	return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519	0	0				0	return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520	0	0				0	return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521	0					0	undef;
522							}
523
524							#------------------------------
525							#
526							# unknown [REPLACEMENT]
527							#
528							sub unknown {
529	0			0		0	my $self = shift;
530	0	0				0	$self->{MWDI_Unknown} = shift if @_;
531	0					0	$self->{MWDI_Unknown};
532							}
533
534							#------------------------------
535							#
536							# collapse [YESNO]
537							#
538							sub collapse {
539	0			0		0	my $self = shift;
540	0	0				0	$self->{MWDI_Collapse} = shift if @_;
541	0					0	$self->{MWDI_Collapse};
542							}
543
544							#------------------------------
545							#
546							# decode STRING
547							#
548							sub decode {
549	10			10		17025	my $self = shift;
550
551							### Do inherited action:
552	10					43	my $basic = $self->SUPER::decode(@_);
553	10	50				36	defined($basic) or return undef;
554
555							### Translate/consolidate illegal characters:
556	10	50				29	$basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557	10					20	$basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558	10					30	$basic;
559							}
560
561							#------------------------------------------------------------
562							#------------------------------------------------------------
563
564							=item MIME::WordDecoder::US_ASCII
565
566							A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567							You're probably better off using ISO-8859-1.
568
569							=cut
570
571							package MIME::WordDecoder::US_ASCII;
572
573	29			29		280	use strict;
	29					88
	29					1040
574	29			29		151	use vars qw(@ISA);
	29					58
	29					7052
575							@ISA = qw( MIME::WordDecoder::ISO_8859 );
576
577							sub new {
578	29			29		78	my ($class) = @_;
579	29					161	return $class->SUPER::new("1");
580							}
581
582							sub decode {
583	0			0		0	my $self = shift;
584
585							### Do inherited action:
586	0					0	my $basic = $self->SUPER::decode(@_);
587	0	0				0	defined($basic) or return undef;
588
589							### Translate/consolidate 8-bit characters:
590	0	0				0	$basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591	0					0	$basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592	0					0	$basic;
593							}
594
595							=back
596
597							=cut
598
599							package MIME::WordDecoder::UTF_8;
600	29			29		244	use strict;
	29					63
	29					857
601	29			29		15555	use Encode qw();
	29					488089
	29					1344
602	29			29		267	use Carp qw( carp );
	29					102
	29					2103
603	29			29		204	use vars qw(@ISA);
	29					56
	29					11577
604
605							@ISA = qw( MIME::WordDecoder );
606
607							sub h_convert_to_utf8
608							{
609	97			97		261	my ($data, $charset, $decoder) = @_;
610	97	100				283	$charset = 'US-ASCII' if ($charset eq 'raw');
611	97					412	my $enc = Encode::find_encoding($charset);
612	97	50				13735	if (!$enc) {
613	0					0	carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614	0					0	return '';
615							}
616	97					619	my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617	97					781	return $ans;
618							}
619
620							sub new {
621	29			29		114	my ($class) = @_;
622	29					307	my $self = $class->SUPER::new();
623	29					162	$self->handler('*' => \&h_convert_to_utf8);
624							}
625
626
627							#------------------------------------------------------------
628							#------------------------------------------------------------
629
630							package MIME::WordDecoder;
631
632							### Now we can init the default handler.
633							$Default = (MIME::WordDecoder::ISO_8859->new('1'));
634
635
636							### Add US-ASCII handler:
637							$DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638
639							### Add ISO-8859-{1..15} handlers:
640							for (1..15) {
641							$DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642							}
643
644							### UTF-8
645							$DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646
647							1; # end the module
648							__END__