File Coverage

blib/lib/MIME/WordDecoder.pm
Criterion Covered Total %
statement 103 142 72.5
branch 17 44 38.6
condition 14 32 43.7
subroutine 28 34 82.3
pod 7 9 77.7
total 169 261 64.7


line stmt bran cond sub pod time code
1             package MIME::WordDecoder;
2              
3             =head1 NAME
4              
5             MIME::WordDecoder - decode RFC 2047 encoded words to a local representation
6              
7             WARNING: Most of this module is deprecated and may disappear. The only
8             function you should use for MIME decoding is "mime_to_perl_string".
9              
10             =head1 SYNOPSIS
11              
12             See L for the basics of encoded words.
13             See L<"DESCRIPTION"> for how this class works.
14              
15             use MIME::WordDecoder;
16              
17              
18             ### Get the default word-decoder (used by unmime()):
19             $wd = default MIME::WordDecoder;
20              
21             ### Get a word-decoder which maps to ISO-8859-1 (Latin1):
22             $wd = supported MIME::WordDecoder "ISO-8859-1";
23              
24              
25             ### Decode a MIME string (e.g., into Latin1) via the default decoder:
26             $str = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
27              
28             ### Decode a string using the default decoder, non-OO style:
29             $str = unmime('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
30              
31             ### Decode a string to an internal Perl string, non-OO style
32             ### The result is likely to have the UTF8 flag ON.
33             $str = mime_to_perl_string('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
34              
35             =head1 DESCRIPTION
36              
37             WARNING: Most of this module is deprecated and may disappear. It
38             duplicates (badly) the function of the standard 'Encode' module. The
39             only function you should rely on is mime_to_perl_string.
40              
41             A MIME::WordDecoder consists, fundamentally, of a hash which maps
42             a character set name (US-ASCII, ISO-8859-1, etc.) to a subroutine which
43             knows how to take bytes in that character set and turn them into
44             the target string representation. Ideally, this target representation
45             would be Unicode, but we don't want to overspecify the translation
46             that takes place: if you want to convert MIME strings directly to Big5,
47             that's your own decision.
48              
49             The subroutine will be invoked with two arguments: DATA (the data in
50             the given character set), and CHARSET (the upcased character set name).
51              
52             For example:
53              
54             ### Keep 7-bit characters as-is, convert 8-bit characters to '#':
55             sub keep7bit {
56             local $_ = shift;
57             tr/\x00-\x7F/#/c;
58             $_;
59             }
60              
61             Here's a decoder which uses that:
62              
63             ### Construct a decoder:
64             $wd = MIME::WordDecoder->new({'US-ASCII' => "KEEP", ### sub { $_[0] }
65             'ISO-8859-1' => \&keep7bit,
66             'ISO-8859-2' => \&keep7bit,
67             'Big5' => "WARN",
68             '*' => "DIE"});
69              
70             ### Convert some MIME text to a pure ASCII string...
71             $ascii = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
72              
73             ### ...which will now hold: "To: Keld J#rn Simonsen "
74              
75             The UTF-8 built-in decoder decodes everything into Perl's internal
76             string format, possibly turning on the internal UTF8 flag. Use it like
77             this:
78              
79             $wd = supported MIME::WordDecoder 'UTF-8';
80             $perl_string = $wd->decode('To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ');
81             # perl_string will be a valid UTF-8 string with the "UTF8" flag set.
82              
83             Generally, you should use the UTF-8 decoder in preference to "unmime".
84              
85             =head1 PUBLIC INTERFACE
86              
87             =over
88              
89             =cut
90              
91 29     29   141290 use strict;
  29         82  
  29         2876  
92 29     29   151 use Carp qw( carp croak );
  29         54  
  29         1960  
93 29     29   13406 use MIME::Words qw(decode_mimewords);
  29         96  
  29         2036  
94 29     29   251 use Exporter;
  29         69  
  29         1265  
95 29     29   148 use vars qw(@ISA @EXPORT);
  29         60  
  29         30153  
96              
97             @ISA = qw(Exporter);
98             @EXPORT = qw( unmime mime_to_perl_string );
99              
100              
101              
102             #------------------------------
103             #
104             # Globals
105             #
106             #------------------------------
107              
108             ### Decoders.
109             my %DecoderFor = ();
110              
111             ### Standard handlers.
112             my %Handler =
113             (
114             KEEP => sub {$_[0]},
115             IGNORE => sub {''},
116             WARN => sub { carp "ignoring text in character set `$_[1]'\n" },
117             DIE => sub { croak "can't handle text in character set `$_[1]'\n" },
118             );
119              
120             ### Global default decoder. We init it below.
121             my $Default;
122              
123             ### Global UTF8 decoder.
124             my $DefaultUTF8;
125              
126             #------------------------------
127              
128             =item default [DECODER]
129              
130             I
131             Get/set the default DECODER object.
132              
133             =cut
134              
135             sub default {
136 25     25 1 316 my $class = shift;
137 25 50       83 if (@_) {
138 25         145 $Default = shift;
139             }
140 25         53 $Default;
141             }
142              
143             #------------------------------
144              
145             =item supported CHARSET, [DECODER]
146              
147             I
148             If just CHARSET is given, returns a decoder object which maps
149             data into that character set (the character set is forced to
150             all-uppercase).
151              
152             $wd = supported MIME::WordDecoder "ISO-8859-1";
153              
154             If DECODER is given, installs such an object:
155              
156             MIME::WordDecoder->supported("ISO-8859-1" =>
157             (new MIME::WordDecoder::ISO_8859 "1"));
158              
159             You should not override this method.
160              
161             =cut
162              
163             sub supported {
164 218     218 1 139446 my ($class, $charset, $decoder) = @_;
165 218 50       529 $DecoderFor{uc($charset)} = $decoder if (@_ > 2);
166 218         3200 $DecoderFor{uc($charset)};
167             }
168              
169             #------------------------------
170              
171             =item new [\@HANDLERS]
172              
173             I
174             If \@HANDLERS is given, then @HANDLERS is passed to handler()
175             to initialize the internal map.
176              
177             =cut
178              
179             sub new {
180 548     548 1 1013 my ($class, $h) = @_;
181 548         1461 my $self = bless { MWD_Map=>{} }, $class;
182              
183             ### Init the map:
184 548         1770 $self->handler(@$h);
185              
186             ### Add fallbacks:
187 548   66     2788 $self->{MWD_Map}{'*'} ||= $Handler{WARN};
188 548   66     2292 $self->{MWD_Map}{'raw'} ||= $self->{MWD_Map}{'US-ASCII'};
189 548         1005 $self;
190             }
191              
192             #------------------------------
193              
194             =item handler CHARSET=>\&SUBREF, ...
195              
196             I
197             Set the handler SUBREF for a given CHARSET, for as many pairs
198             as you care to supply.
199              
200             When performing the translation of a MIME-encoded string, a
201             given SUBREF will be invoked when translating a block of text
202             in character set CHARSET. The subroutine will be invoked with
203             the following arguments:
204              
205             DATA - the data in the given character set.
206             CHARSET - the upcased character set name, which may prove useful
207             if you are using the same SUBREF for multiple CHARSETs.
208             DECODER - the decoder itself, if it contains configuration information
209             that your handler function needs.
210              
211             For example:
212              
213             $wd = new MIME::WordDecoder;
214             $wd->handler('US-ASCII' => "KEEP");
215             $wd->handler('ISO-8859-1' => \&handle_latin1,
216             'ISO-8859-2' => \&handle_latin1,
217             '*' => "DIE");
218              
219             Notice that, much as with %SIG, the SUBREF can also be taken from
220             a set of special keywords:
221              
222             KEEP Pass data through unchanged.
223             IGNORE Ignore data in this character set, without warning.
224             WARN Ignore data in this character set, with warning.
225             DIE Fatal exception with "can't handle character set" message.
226              
227             The subroutine for the special CHARSET of 'raw' is used for raw
228             (non-MIME-encoded) text, which is supposed to be US-ASCII.
229             The handler for 'raw' defaults to whatever was specified for 'US-ASCII'
230             at the time of construction.
231              
232             The subroutine for the special CHARSET of '*' is used for any
233             unrecognized character set. The default action for '*' is WARN.
234              
235             =cut
236              
237             sub handler {
238 1095     1095 1 1509 my $self = shift;
239              
240             ### Copy the hash, and edit it:
241 1095         2110 while (@_) {
242 1068         1616 my $c = shift;
243 1068         1411 my $sub = shift;
244 1068         2301 $self->{MWD_Map}{$c} = $self->real_handler($sub);
245             }
246 1095         1769 $self;
247             }
248              
249             #------------------------------
250              
251             =item decode STRING
252              
253             I
254             Decode a STRING which might contain MIME-encoded components into a
255             local representation (e.g., UTF-8, etc.).
256              
257             =cut
258              
259             sub decode {
260 95     95 1 1521 my ($self, $str) = @_;
261 95 50       250 defined($str) or return undef;
262             join('', map {
263             ### Get the data and (upcased) charset:
264 95         370 my $data = $_->[0];
  120         235  
265 120 100       385 my $charset = (defined($_->[1]) ? uc($_->[1]) : 'raw');
266 120         253 $charset =~ s/\*\w+\Z//; ### RFC2184 language suffix
267              
268             ### Get the handler; guess if never seen before:
269             defined($self->{MWD_Map}{$charset}) or
270 120 100 100     490 $self->{MWD_Map}{$charset} =
271             ($self->real_handler($self->guess_handler($charset)) || 0);
272 120   66     538 my $subr = $self->{MWD_Map}{$charset} || $self->{MWD_Map}{'*'};
273              
274             ### Map this chunk:
275 120         339 &$subr($data, $charset, $self);
276             } decode_mimewords($str));
277             }
278              
279             #------------------------------
280             #
281             # guess_handler CHARSET
282             #
283             # Instance method.
284             # An unrecognized charset has been seen. Guess a handler subref
285             # for the given charset, returning false if there is none.
286             # Successful mappings will be cached in the main map.
287             #
288             sub guess_handler {
289 15     15 0 134 undef;
290             }
291              
292             #------------------------------
293             #
294             # real_handler HANDLER
295             #
296             # Instance method.
297             # Translate the given handler, which might be a subref or a string.
298             #
299             sub real_handler {
300 1085     1085 0 1800 my ($self, $sub) = @_;
301             (!$sub) or
302             (ref($sub) eq 'CODE') or
303 1085 100 33     4124 $sub = ($Handler{$sub} || croak "bad named handler: $sub\n");
      100        
304 1085         3213 $sub;
305             }
306              
307             #------------------------------
308              
309             =item unmime STRING
310              
311             I
312             Decode the given STRING using the default() decoder.
313             See L.
314              
315             You should consider using the UTF-8 decoder instead. It decodes
316             MIME strings into Perl's internal string format.
317              
318             =cut
319              
320             sub unmime($) {
321 0     0 1 0 my $str = shift;
322 0         0 $Default->decode($str);
323             }
324              
325             =item mime_to_perl_string
326              
327             I
328             Decode the given STRING into an internal Perl Unicode string.
329             You should use this function in preference to all others.
330              
331             The result of mime_to_perl_string is likely to have Perl's
332             UTF8 flag set.
333              
334             =cut
335              
336             sub mime_to_perl_string($) {
337 1     1 1 2204 my $str = shift;
338 1         5 $DecoderFor{'UTF-8'}->decode($str);
339             }
340              
341             =back
342              
343             =cut
344              
345              
346              
347              
348              
349             =head1 SUBCLASSES
350              
351             =over
352              
353             =cut
354              
355             #------------------------------------------------------------
356             #------------------------------------------------------------
357              
358             =item MIME::WordDecoder::ISO_8859
359              
360             A simple decoder which keeps US-ASCII and the 7-bit characters
361             of ISO-8859 character sets and UTF8, and also keeps 8-bit
362             characters from the indicated character set.
363              
364             ### Construct:
365             $wd = new MIME::WordDecoder::ISO_8859 2; ### ISO-8859-2
366              
367             ### What to translate unknown characters to (can also use empty):
368             ### Default is "?".
369             $wd->unknown("?");
370              
371             ### Collapse runs of unknown characters to a single unknown()?
372             ### Default is false.
373             $wd->collapse(1);
374              
375              
376             According to B
377             (ca. November 2000):
378              
379             ISO 8859 is a full series of 10 (and soon even more) standardized
380             multilingual single-byte coded (8bit) graphic character sets for
381             writing in alphabetic languages:
382              
383             1. Latin1 (West European)
384             2. Latin2 (East European)
385             3. Latin3 (South European)
386             4. Latin4 (North European)
387             5. Cyrillic
388             6. Arabic
389             7. Greek
390             8. Hebrew
391             9. Latin5 (Turkish)
392             10. Latin6 (Nordic)
393              
394             The ISO 8859 charsets are not even remotely as complete as the truly
395             great Unicode but they have been around and usable for quite a while
396             (first registered Internet charsets for use with MIME) and have
397             already offered a major improvement over the plain 7bit US-ASCII.
398              
399             Characters 0 to 127 are always identical with US-ASCII and the
400             positions 128 to 159 hold some less used control characters: the
401             so-called C1 set from ISO 6429.
402              
403             =cut
404              
405             package MIME::WordDecoder::ISO_8859;
406              
407 29     29   234 use strict;
  29         92  
  29         849  
408 29     29   153 use vars qw(@ISA);
  29         50  
  29         33522  
409             @ISA = qw( MIME::WordDecoder );
410              
411              
412             #------------------------------
413             #
414             # HANDLERS
415             #
416             #------------------------------
417              
418             ### Keep 7bit characters.
419             ### Turn all else to the special \x00.
420             sub h_keep7bit {
421 2     2   6 local $_ = $_[0];
422             # my $unknown = $_[2]->{MWDI_Unknown};
423              
424 2         6 s{[\x80-\xFF]}{\x00}g;
425 2         7 $_;
426             }
427              
428             ### Note: should use Unicode::String, converting/manipulating
429             ### everything into full Unicode form.
430              
431             ### Keep 7bit UTF8 characters (ASCII).
432             ### Keep ISO-8859-1 if this decoder is for Latin-1.
433             ### Turn all else to the special \x00.
434             sub h_utf8 {
435 0     0   0 local $_ = $_[0];
436             # my $unknown = $_[2]->{MWDI_Unknown};
437 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
438             #print STDERR "UTF8 in: <$_>\n";
439              
440 0         0 local($1,$2,$3);
441 0         0 my $tgt = '';
442 0   0     0 while (m{\G(
443             ([\x00-\x7F]) | # 0xxxxxxx
444             ([\xC0-\xDF] [\x80-\xBF]) | # 110yyyyy 10xxxxxx
445             ([\xE0-\xEF] [\x80-\xBF]{2}) | # 1110zzzz 10yyyyyy 10xxxxxx
446             ([\xF0-\xF7] [\x80-\xBF]{3}) | # 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
447             . # error; synch
448             )}gcsx and ($1 ne '')) {
449              
450 0 0 0     0 if (defined($2)) { $tgt .= $2 }
  0 0       0  
451 0         0 elsif (defined($3) && $latin1) { $tgt .= "\x00" }
452 0         0 else { $tgt .= "\x00" }
453             }
454              
455             #print STDERR "UTF8 out: <$tgt>\n";
456 0         0 $tgt;
457             }
458              
459             ### Keep characters which are 7bit in UTF8 (ASCII).
460             ### Keep ISO-8859-1 if this decoder is for Latin-1.
461             ### Turn all else to the special \x00.
462             sub h_utf16 {
463 0     0   0 local $_ = $_[0];
464             # my $unknown = $_[2]->{MWDI_Unknown};
465 0         0 my $latin1 = ($_[2]->{MWDI_Num} == 1);
466             #print STDERR "UTF16 in: <$_>\n";
467              
468 0         0 local($1,$2,$3,$4,$5);
469 0         0 my $tgt = '';
470 0   0     0 while (m{\G(
471             ( \x00 ([\x00-\x7F])) | # 00000000 0xxxxxxx
472             ( \x00 ([\x80-\xFF])) | # 00000000 1xxxxxxx
473             ( [^\x00] [\x00-\xFF]) | # etc
474             )
475             }gcsx and ($1 ne '')) {
476              
477 0 0 0     0 if (defined($2)) { $tgt .= $3 }
  0 0       0  
478 0         0 elsif (defined($4) && $latin1) { $tgt .= $5 }
479 0         0 else { $tgt .= "\x00" }
480             }
481              
482             #print STDERR "UTF16 out: <$tgt>\n";
483 0         0 $tgt;
484             }
485              
486              
487             #------------------------------
488             #
489             # PUBLIC INTERFACE
490             #
491             #------------------------------
492              
493             #------------------------------
494             #
495             # new NUMBER
496             #
497             sub new {
498 518     518   153440 my ($class, $num) = @_;
499              
500 518         1167 my $self = $class->SUPER::new();
501 518         1922 $self->handler('raw' => 'KEEP',
502             'US-ASCII' => 'KEEP');
503              
504 518         885 $self->{MWDI_Num} = $num;
505 518         858 $self->{MWDI_Unknown} = "?";
506 518         845 $self->{MWDI_Collapse} = 0;
507 518         1613 $self;
508             }
509              
510             #------------------------------
511             #
512             # guess_handler CHARSET
513             #
514             sub guess_handler {
515 2     2   6 my ($self, $charset) = @_;
516             return 'KEEP' if (($charset =~ /^ISO[-_]?8859[-_](\d+)$/) &&
517 2 100 66     26 ($1 eq $self->{MWDI_Num}));
518 1 50       10 return \&h_keep7bit if ($charset =~ /^ISO[-_]?8859/);
519 0 0       0 return \&h_utf8 if ($charset =~ /^UTF[-_]?8$/);
520 0 0       0 return \&h_utf16 if ($charset =~ /^UTF[-_]?16$/);
521 0         0 undef;
522             }
523              
524             #------------------------------
525             #
526             # unknown [REPLACEMENT]
527             #
528             sub unknown {
529 0     0   0 my $self = shift;
530 0 0       0 $self->{MWDI_Unknown} = shift if @_;
531 0         0 $self->{MWDI_Unknown};
532             }
533              
534             #------------------------------
535             #
536             # collapse [YESNO]
537             #
538             sub collapse {
539 0     0   0 my $self = shift;
540 0 0       0 $self->{MWDI_Collapse} = shift if @_;
541 0         0 $self->{MWDI_Collapse};
542             }
543              
544             #------------------------------
545             #
546             # decode STRING
547             #
548             sub decode {
549 10     10   17025 my $self = shift;
550              
551             ### Do inherited action:
552 10         43 my $basic = $self->SUPER::decode(@_);
553 10 50       36 defined($basic) or return undef;
554              
555             ### Translate/consolidate illegal characters:
556 10 50       29 $basic =~ tr{\x00}{\x00}c if $self->{MWDI_Collapse};
557 10         20 $basic =~ s{\x00}{$self->{MWDI_Unknown}}g;
558 10         30 $basic;
559             }
560              
561             #------------------------------------------------------------
562             #------------------------------------------------------------
563              
564             =item MIME::WordDecoder::US_ASCII
565              
566             A subclass of the ISO-8859-1 decoder which discards 8-bit characters.
567             You're probably better off using ISO-8859-1.
568              
569             =cut
570              
571             package MIME::WordDecoder::US_ASCII;
572              
573 29     29   280 use strict;
  29         88  
  29         1040  
574 29     29   151 use vars qw(@ISA);
  29         58  
  29         7052  
575             @ISA = qw( MIME::WordDecoder::ISO_8859 );
576              
577             sub new {
578 29     29   78 my ($class) = @_;
579 29         161 return $class->SUPER::new("1");
580             }
581              
582             sub decode {
583 0     0   0 my $self = shift;
584              
585             ### Do inherited action:
586 0         0 my $basic = $self->SUPER::decode(@_);
587 0 0       0 defined($basic) or return undef;
588              
589             ### Translate/consolidate 8-bit characters:
590 0 0       0 $basic =~ tr{\x80-\xFF}{}c if $self->{MWDI_Collapse};
591 0         0 $basic =~ s{[\x80-\xFF]}{$self->{MWDI_Unknown}}g;
592 0         0 $basic;
593             }
594              
595             =back
596              
597             =cut
598              
599             package MIME::WordDecoder::UTF_8;
600 29     29   244 use strict;
  29         63  
  29         857  
601 29     29   15555 use Encode qw();
  29         488089  
  29         1344  
602 29     29   267 use Carp qw( carp );
  29         102  
  29         2103  
603 29     29   204 use vars qw(@ISA);
  29         56  
  29         11577  
604              
605             @ISA = qw( MIME::WordDecoder );
606              
607             sub h_convert_to_utf8
608             {
609 97     97   261 my ($data, $charset, $decoder) = @_;
610 97 100       283 $charset = 'US-ASCII' if ($charset eq 'raw');
611 97         412 my $enc = Encode::find_encoding($charset);
612 97 50       13735 if (!$enc) {
613 0         0 carp "Unable to convert text in character set `$charset' to UTF-8... ignoring\n";
614 0         0 return '';
615             }
616 97         619 my $ans = $enc->decode($data, Encode::FB_PERLQQ);
617 97         781 return $ans;
618             }
619              
620             sub new {
621 29     29   114 my ($class) = @_;
622 29         307 my $self = $class->SUPER::new();
623 29         162 $self->handler('*' => \&h_convert_to_utf8);
624             }
625              
626              
627             #------------------------------------------------------------
628             #------------------------------------------------------------
629              
630             package MIME::WordDecoder;
631              
632             ### Now we can init the default handler.
633             $Default = (MIME::WordDecoder::ISO_8859->new('1'));
634              
635              
636             ### Add US-ASCII handler:
637             $DecoderFor{"US-ASCII"} = MIME::WordDecoder::US_ASCII->new;
638              
639             ### Add ISO-8859-{1..15} handlers:
640             for (1..15) {
641             $DecoderFor{"ISO-8859-$_"} = MIME::WordDecoder::ISO_8859->new($_);
642             }
643              
644             ### UTF-8
645             $DecoderFor{'UTF-8'} = MIME::WordDecoder::UTF_8->new();
646              
647             1; # end the module
648             __END__