lib/Encode/Repair.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 13 | 15 | 86.6 |
branch | n/a | ||
condition | n/a | ||
subroutine | 5 | 5 | 100.0 |
pod | n/a | ||
total | 18 | 20 | 90.0 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Encode::Repair; | ||||||
2 | our $VERSION = '0.0.2'; | ||||||
3 | 2 | 2 | 170414 | use strict; | |||
2 | 7 | ||||||
2 | 91 | ||||||
4 | 2 | 2 | 12 | use warnings; | |||
2 | 4 | ||||||
2 | 141 | ||||||
5 | |||||||
6 | our @EXPORT_OK = qw(repair_double learn_recoding repair_encoding); | ||||||
7 | 2 | 2 | 12 | use Exporter qw(import); | |||
2 | 9 | ||||||
2 | 80 | ||||||
8 | 2 | 2 | 949 | use Encode qw(encode decode); | |||
2 | 10665 | ||||||
2 | 260 | ||||||
9 | 2 | 2 | 2155 | use Algorithm::Loops qw(NestedLoops MapCar); | |||
0 | |||||||
0 | |||||||
10 | |||||||
11 | # since Algorithm::Loops already provides MapCar, it is very easy to implement | ||||||
12 | # zip() with it, instead of introducing another dependency (on | ||||||
13 | # List::MoreUtils, specifically) | ||||||
14 | sub zip { | ||||||
15 | MapCar { @_ == 2 ? @_ : () } @_; | ||||||
16 | } | ||||||
17 | |||||||
18 | my %subs = ( | ||||||
19 | encode => \&encode, | ||||||
20 | decode => \&decode, | ||||||
21 | ); | ||||||
22 | |||||||
23 | sub repair_encoding { | ||||||
24 | my ($str, $actions) = @_; | ||||||
25 | for (my $i = 0; $i < @$actions; $i += 2) { | ||||||
26 | my $type = $actions->[$i]; | ||||||
27 | my $encoding = $actions->[$i+1]; | ||||||
28 | no warnings 'utf8'; | ||||||
29 | $str = $subs{$type}->($encoding, $str); | ||||||
30 | } | ||||||
31 | $str; | ||||||
32 | } | ||||||
33 | |||||||
34 | sub repair_double { | ||||||
35 | my ($buf, $options) = @_; | ||||||
36 | my $via = 'ISO-8859-1'; | ||||||
37 | $via = $options->{via} if $options && exists $options->{via}; | ||||||
38 | repair_encoding($buf, [ | ||||||
39 | 'decode', 'UTF-8', | ||||||
40 | 'encode', $via, | ||||||
41 | 'decode', 'UTF-8', | ||||||
42 | ]); | ||||||
43 | } | ||||||
44 | |||||||
45 | sub learn_recoding { | ||||||
46 | my %args = @_; | ||||||
47 | my $source = $args{from}; | ||||||
48 | my $target = $args{to}; | ||||||
49 | my $encodings = $args{encodings}; | ||||||
50 | my $maxdepth = $args{depth} || 5; | ||||||
51 | my $search_mode = $args{search} || 'first'; | ||||||
52 | return [] if $source eq $target; | ||||||
53 | |||||||
54 | my @result; | ||||||
55 | for my $depth (1..$maxdepth) { | ||||||
56 | my $iter = NestedLoops( [($encodings) x $depth] ); | ||||||
57 | my @ed = (qw(encode decode)) x (int($depth / 2) + 1); | ||||||
58 | my @de = (qw(decode encode)) x (int($depth / 2) + 1); | ||||||
59 | while (my @steps = $iter->()) { | ||||||
60 | no warnings 'uninitialized'; | ||||||
61 | for my $steps ([zip \@ed, \@steps], [zip \@de, \@steps]) { | ||||||
62 | # use Data::Dumper; | ||||||
63 | # warn Dumper($steps); | ||||||
64 | if (eval {repair_encoding($source, $steps)} eq $target) { | ||||||
65 | if (lc($search_mode) eq 'first') { | ||||||
66 | return $steps; | ||||||
67 | } else { | ||||||
68 | push @result, $steps; | ||||||
69 | } | ||||||
70 | } | ||||||
71 | } | ||||||
72 | } | ||||||
73 | return \@result if @result && lc($search_mode) eq 'shallow'; | ||||||
74 | } | ||||||
75 | return \@result if @result; | ||||||
76 | return; | ||||||
77 | } | ||||||
78 | |||||||
79 | 1; | ||||||
80 | |||||||
81 | =encoding utf-8 | ||||||
82 | |||||||
83 | =head1 NAME | ||||||
84 | |||||||
85 | Encode::Repair - Repair wrongly encoded text strings | ||||||
86 | |||||||
87 | =head1 SYNOPSIS | ||||||
88 | |||||||
89 | # Simple usage | ||||||
90 | use Encode::Repair qw(repair_double); | ||||||
91 | binmode STDOUT, ':encoding(UTF-8)'; | ||||||
92 | |||||||
93 | # prints: small ae: ä | ||||||
94 | print repair_double("small ae: \xc3\x83\xc2\xa4\n"); | ||||||
95 | |||||||
96 | # prints: beta: β | ||||||
97 | print repair_double("beta: \xc4\xaa\xc2\xb2\n", {via => 'Latin-7'}); | ||||||
98 | |||||||
99 | |||||||
100 | # Advanced usage | ||||||
101 | # assumes you have a sample text both correctly decoded in a | ||||||
102 | # character string, and as a wrongly encoded buffer | ||||||
103 | |||||||
104 | use Encode::Repair qw(repair_encoding learn_recoding); | ||||||
105 | use charnames qw(:full); | ||||||
106 | binmode STDOUT, ':encoding(UTF-8)'; | ||||||
107 | |||||||
108 | my $recoding_pattern = learn_recoding( | ||||||
109 | from => "beta: \xc4\xaa\xc2\xb2", | ||||||
110 | to => "beta: \N{GREEK SMALL LETTER BETA}", | ||||||
111 | encodings => ['UTF-8', 'Latin-1', 'Latin-7'], | ||||||
112 | ); | ||||||
113 | if ($recoding_pattern) { | ||||||
114 | my $mojibake = "\304\252\302\273\304\252\302\261\304\252\302" | ||||||
115 | ."\274\304\252\342\200\234\304\252\302\261"; | ||||||
116 | print repair_encoding($mojibake, $recoding_pattern), "\n"; | ||||||
117 | } else { | ||||||
118 | print "Sorry, could not help you :-(\n"; | ||||||
119 | } | ||||||
120 | |||||||
121 | |||||||
122 | =head1 DESCRIPTION | ||||||
123 | |||||||
124 | Sometimes software or humans mess up the character encoding of text. In some | ||||||
125 | cases it is possible to reconstruct the original text. This module helps you | ||||||
126 | to do it. | ||||||
127 | |||||||
128 | It covers the rather common case that a program assumes a wrong character | ||||||
129 | encoding on reading some input, and converts it to Mojibake (see | ||||||
130 | L |
||||||
131 | |||||||
132 | If you use this module on a regular basis, it most likely indicates that | ||||||
133 | something is wrong in your processs. It should only be used for one-time tasks | ||||||
134 | such as migrating a database to a new system. | ||||||
135 | |||||||
136 | =head1 FUNCTIONS | ||||||
137 | |||||||
138 | =over | ||||||
139 | |||||||
140 | =item repair_double | ||||||
141 | |||||||
142 | Repairs the common case when a UTF-8 string was read as another encoding, | ||||||
143 | and was encoded as UTF-8 again. The other encoding defaults to ISO-8859-1 aka | ||||||
144 | Latin-1, and can be overridden with the C |
||||||
145 | |||||||
146 | my $repaired = repair_double($buffer, {via => 'ISO-8859-2' }); | ||||||
147 | |||||||
148 | It expects an octet string as input, and returns a decoded character string. | ||||||
149 | |||||||
150 | =item learn_recoding | ||||||
151 | |||||||
152 | Given a sample of text twice, once correctly decoded and once mistreated, | ||||||
153 | attemps to find a sequence of encoding and decoding that turns the mistreated | ||||||
154 | text into the correct form. | ||||||
155 | |||||||
156 | my $coding_pattern = learn_recoding( | ||||||
157 | from => $mistreated_buffer, | ||||||
158 | to => $correct_string, | ||||||
159 | encodings => \@involved_encodings, | ||||||
160 | depth => 5, | ||||||
161 | search => 'first', | ||||||
162 | ); | ||||||
163 | |||||||
164 | C |
||||||
165 | encodings involved in the process that messes up the encoding. If you don't | ||||||
166 | know these, try it with C |
||||||
167 | system uses by default. | ||||||
168 | |||||||
169 | C |
||||||
170 | example C |
||||||
171 | slow down the program significantly, although smaller depths are tried first. | ||||||
172 | |||||||
173 | The return value is C |
||||||
174 | returns the encoding/decoding steps suitable for feeding into C |
||||||
175 | It contains a list of even size, where elements with even indexes are either | ||||||
176 | C<'encode'> or C<'decode'>, and those with odd indexes contain the name of the | ||||||
177 | encoding. | ||||||
178 | |||||||
179 | With C |
||||||
180 | sequence. | ||||||
181 | WIth the default of C<'first'> it returns the first possible sequence. With | ||||||
182 | C<'shallow'> it searches for the first working sequence and all other | ||||||
183 | sequences of the same length, and then returns an array reference containing | ||||||
184 | array references to all sequences. With the value C<'all'>, all possible | ||||||
185 | sequences are searched and returned, but often that's a very bad idea, because | ||||||
186 | it also finds sequences where parts of the sequence undo the work of other | ||||||
187 | sequences (something like C<[qw(encode latin-1 decode latin-1)]>). | ||||||
188 | |||||||
189 | Since Version 0.0.2 C |
||||||
190 | encoding and decoding. So even if C<['decode', 'UTF-8', 'decode', 'UTF-8']> is | ||||||
191 | a working input, C |
||||||
192 | 'Latin-1', 'decode', 'UTF-8']> instead. So you might have to include C |
||||||
193 | in your encoding list even if it is not strictly involved. | ||||||
194 | |||||||
195 | =item repair_encoding | ||||||
196 | |||||||
197 | Takes an input string and an encoding/decoding pattern (as returned from | ||||||
198 | C |
||||||
199 | |||||||
200 | =back | ||||||
201 | |||||||
202 | =head1 Troubleshooting | ||||||
203 | |||||||
204 | If C |
||||||
205 | value (for example to 7). If that doesn't help, check that the two input | ||||||
206 | strings actually corespond. C |
||||||
207 | trailing newline characters or spaces will cause it to fail. | ||||||
208 | |||||||
209 | If C |
||||||
210 | you used for learning was not long enough, or not representative. For example | ||||||
211 | if your system uses both ISO-8859-1 and ISO-8859-15 (which are quite similar), | ||||||
212 | C |
||||||
213 | least one character that's in ISO-8859-15 but not in ISO-8859-1, like the | ||||||
214 | Euro sign (€). | ||||||
215 | |||||||
216 | =head1 Further Reading | ||||||
217 | |||||||
218 | This document tries to stick to the terminology introduced in the L |
||||||
219 | module. | ||||||
220 | |||||||
221 | If you want to learn more about the way text is encoded and how perl handles | ||||||
222 | that, take a look at L |
||||||
223 | |||||||
224 | =head1 LICENSE AND COPYRIGHT | ||||||
225 | |||||||
226 | Copyright (C) 2008, 2009 by Moritz Lenz, L |
||||||
227 | moritz@faui2k3.org. | ||||||
228 | |||||||
229 | This is free software; you my use it under the terms of the Artistic License 2 | ||||||
230 | as published by The Perl Foundation. | ||||||
231 | |||||||
232 | The code examples distributed with this package are an exception, and may be | ||||||
233 | used, modified and redistributed without any limitations. | ||||||
234 | |||||||
235 | Encode::Repair is distributed in the hope that it will be useful, but WITHOUT | ||||||
236 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||||||
237 | FOR A PARTICULAR PURPOSE. | ||||||
238 | |||||||
239 | =head1 Development | ||||||
240 | |||||||
241 | The source code is stored in a public git repository at | ||||||
242 | L |
||||||
243 | issue tracker linked from this site. | ||||||
244 | |||||||
245 | If you find a case of messed-up encodings that can be repaired deterministically | ||||||
246 | and that's not covered by this module, please contact the author, providing a | ||||||
247 | hex dump of both input and output, and as much information of the encoding and | ||||||
248 | decoding process as you have. | ||||||
249 | |||||||
250 | Patches are also very welcome. | ||||||
251 | |||||||
252 | =cut |