line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
Class::DBI::utf8 - A Class:::DBI subclass that knows about UTF-8 |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
This module is a Class::DBI plugin: |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
package Foo; |
10
|
|
|
|
|
|
|
use base qw( Class::DBI ); |
11
|
|
|
|
|
|
|
use Class::DBI::utf8; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
... |
14
|
|
|
|
|
|
|
__PACKAGE__->columns( All => qw( id text other ) ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# the text column contains utf8-encoded character data |
17
|
|
|
|
|
|
|
__PACKAGE__->utf8_columns(qw( text )); |
18
|
|
|
|
|
|
|
... |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# create an object with a nasty character. |
21
|
|
|
|
|
|
|
my $foo = Foo->create({ |
22
|
|
|
|
|
|
|
text => "a \x{2264} b for some a", |
23
|
|
|
|
|
|
|
}); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# search for utf8 chars. |
26
|
|
|
|
|
|
|
Foo->search( text => "a \x{2264} b for some a" ); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 DESCRIPTION |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Rather than have to think about things like character sets, I prefer to have |
31
|
|
|
|
|
|
|
my objects just Do The Right Thing. I also want utf-8 encoded byte strings in |
32
|
|
|
|
|
|
|
the database whenever possible. Using this subclass of Class::DBI, I can just |
33
|
|
|
|
|
|
|
put perl strings into the properties of an object, and the right thing will |
34
|
|
|
|
|
|
|
always go into the database and come out again. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
For example, without Class::DBI::utf8, |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
MyObject->create({ id => 1, text => "\x{2264}" }); # a less-than-or-equal-to symbol |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
..will create a row in the database containing (probably) the utf-8 byte |
41
|
|
|
|
|
|
|
encoding of the less-than-or-equal-to symbol. But when trying to retrieve the |
42
|
|
|
|
|
|
|
object again.. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
my $broken = MyObject->retrieve( 1 ); |
45
|
|
|
|
|
|
|
my $text = $broken->text; |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
... $text will (probably) contain 3 characters and look nothing like a |
48
|
|
|
|
|
|
|
less-than-or-equal-to symbol. Likewise, you will be unable to search properly |
49
|
|
|
|
|
|
|
for strings containing non-ascii characters. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Creating objects with simpler non-ascii characters from the latin-1 range |
52
|
|
|
|
|
|
|
will lead to even stranger behaviours: |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
my $e_acute = "\x{e9}"; # an e-acute |
55
|
|
|
|
|
|
|
MyObject->create({ text => $e_acute }); |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
utf8::upgrade($e_acute); # still the same letter, but with a different |
58
|
|
|
|
|
|
|
# internal representation |
59
|
|
|
|
|
|
|
MyObject->create({ text => $e_acute }); |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
This will create two rows in the database - the first containing the latin-1 |
62
|
|
|
|
|
|
|
encoded bytes of an e-acute character (or the database may refuse to let you |
63
|
|
|
|
|
|
|
create the row, if it's been set up to require utf-8), the latter containing |
64
|
|
|
|
|
|
|
the utf-8 encoded bytes of an e-acute. In the latter case you won't get an |
65
|
|
|
|
|
|
|
e-acute back out again if you retrieve the row; You'll get a string |
66
|
|
|
|
|
|
|
containing two characters, one for each byte of the utf-8 encoding. |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Because of this, if you're handling data from an outside source, you won't |
69
|
|
|
|
|
|
|
really have any clear idea of what will be going into the database at all. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Fortunately, simply adding the lines: |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
use Class::DBI::utf8; |
74
|
|
|
|
|
|
|
__PACKAGE__->utf8_columns("text"); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
will make all these operations work much more as expected - the database will |
77
|
|
|
|
|
|
|
always contain utf-8 bytes, you will always get back the characters you put |
78
|
|
|
|
|
|
|
in, and you will instantly become the most popular person at work. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
This module assumes that the underlying database and driver don't know |
81
|
|
|
|
|
|
|
anything about character sets, and just store bytes. Some databases, for |
82
|
|
|
|
|
|
|
instance postgresql and later versions of mysql, allow you to create tables |
83
|
|
|
|
|
|
|
with utf-8 character sets, but the Perl DB drivers don't respect this and |
84
|
|
|
|
|
|
|
still require you to pass utf-8 bytes, and return utf-8 bytes and hence |
85
|
|
|
|
|
|
|
still need special handling with Class::DBI. |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
Class::DBI::utf8 will do the right thing in both cases, and I would |
88
|
|
|
|
|
|
|
suggest you tell the database to use utf-8 encoding as well as using |
89
|
|
|
|
|
|
|
Class::DBI::utf8 where possible. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head1 CAVEATS |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
This module requires perl 5.8.0 or later - if you're still using 5.6, and you |
94
|
|
|
|
|
|
|
want to use unicode, I suggest you don't. It's not nice. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Be aware that utf-8 encoded strings will commonly have a byte length greater |
97
|
|
|
|
|
|
|
than their character length - this is because non-ascii characters such as |
98
|
|
|
|
|
|
|
e-actute will encode to two bytes, and other characters can be encoded to |
99
|
|
|
|
|
|
|
other numbers of bytes, although 2 or 3 bytes are typical. If your database |
100
|
|
|
|
|
|
|
has an underlying data type of a limited length, for instance a CHAR(10), you |
101
|
|
|
|
|
|
|
may not be able to store 10 characters in it. |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Internally, the module is futzing with the _utf8_on and _utf8_off methods. If |
104
|
|
|
|
|
|
|
you don't know I doing that is probably a bad idea, you should read into |
105
|
|
|
|
|
|
|
it before you start trying to do this sort of thing yourself. I'd prefer to |
106
|
|
|
|
|
|
|
use encode_utf8 and decode_utf8, but I have my reasons for doing it this way |
107
|
|
|
|
|
|
|
- mostly, it's so that we can allow for DBD drivers that do know about |
108
|
|
|
|
|
|
|
character sets. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Finally, the database may have some internal string-handling functions, for |
111
|
|
|
|
|
|
|
instance LOWER(), UPPER(), various sorting functions, etc. I the database |
112
|
|
|
|
|
|
|
is properly utf-8 aware, it I do the right thing to the utf-8 encoded |
113
|
|
|
|
|
|
|
strings in the database if you use these functions. But I've never seen a |
114
|
|
|
|
|
|
|
database do the right thing. Likewise, there are all sorts of nasty |
115
|
|
|
|
|
|
|
normalisation considerations when performing searches that are outside of the |
116
|
|
|
|
|
|
|
scope of these docs to discuss, but which can really ruin your day. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head1 BUGS |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
I've attempted to make the module keep doing the Right Thing even when the |
121
|
|
|
|
|
|
|
DBD driver for the database knows what it's doing, ie, if you give it |
122
|
|
|
|
|
|
|
sensible perl strings it'll store the right thing in the database and recover |
123
|
|
|
|
|
|
|
the right thing from the database. However, I've been forced to assume that, |
124
|
|
|
|
|
|
|
in this eventuality, the database driver will hand back strings that already |
125
|
|
|
|
|
|
|
have the utf-8 bit set. If they don't, things I break. On the bright |
126
|
|
|
|
|
|
|
side, they'll break really fast. I also find it extremely unlikely that |
127
|
|
|
|
|
|
|
anyone would bother reducing strings to latin1 internally. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Also, I've been forced to override the _do_search method to make searching |
130
|
|
|
|
|
|
|
for utf8 strings work, so if you override it locally as well, bad things |
131
|
|
|
|
|
|
|
will happen. Sorry. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Incredible popularity and fame gained through understanding of utf-8 may not |
134
|
|
|
|
|
|
|
actually be real. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 SEE ALSO |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
L |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=head1 AUTHOR |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Tom Insam |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Copyright Fotango 2005. All rights reserved. |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under |
147
|
|
|
|
|
|
|
the same terms as Perl itself. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=cut |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
package Class::DBI::utf8; |
153
|
1
|
|
|
1
|
|
116516
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
39
|
|
154
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
42
|
|
155
|
1
|
|
|
1
|
|
1235
|
use Encode qw( encode_utf8 decode_utf8 ); |
|
1
|
|
|
|
|
16304
|
|
|
1
|
|
|
|
|
109
|
|
156
|
1
|
|
|
1
|
|
1082
|
use utf8; |
|
1
|
|
|
|
|
10
|
|
|
1
|
|
|
|
|
5
|
|
157
|
1
|
|
|
1
|
|
48
|
use base qw( Exporter Class::Data::Inheritable ); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
132
|
|
158
|
1
|
|
|
1
|
|
5
|
use Class::DBI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
19
|
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
our $VERSION = 0.2; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# export the following functions.. |
164
|
|
|
|
|
|
|
our @EXPORT = (qw( utf8_all_columns utf8_columns )); |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# add an accessor to store which columns are utf8-enabled |
167
|
|
|
|
|
|
|
Class::DBI->mk_classdata('_utf8_columns'); |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
sub utf8_all_columns { |
170
|
0
|
|
|
0
|
0
|
0
|
my $class = shift; |
171
|
0
|
|
|
|
|
0
|
$class->utf8_columns( $class->columns('All') ); |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
sub utf8_columns { |
175
|
6
|
|
|
6
|
0
|
1350
|
my $class = shift; |
176
|
|
|
|
|
|
|
# the default |
177
|
6
|
100
|
|
|
|
36
|
$class->_utf8_columns([]) unless $class->_utf8_columns; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# a getter? |
180
|
6
|
100
|
|
|
|
86
|
return @{ $class->_utf8_columns } unless @_; |
|
5
|
|
|
|
|
19
|
|
181
|
|
|
|
|
|
|
|
182
|
1
|
|
|
|
|
4
|
my @columns = @_; |
183
|
1
|
|
|
|
|
1
|
push @{ $class->_utf8_columns }, @columns; |
|
1
|
|
|
|
|
3
|
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
$class->add_trigger($_ => sub { |
186
|
7
|
|
|
7
|
|
2387456
|
my ($self) = @_; |
187
|
7
|
|
|
|
|
22
|
for (@columns) { |
188
|
14
|
50
|
|
|
|
43
|
next if ref($self->{$_}); |
189
|
14
|
100
|
|
|
|
78
|
utf8::upgrade( $self->{$_} ) if defined($self->{$_}); |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
1
|
|
|
|
|
16
|
}) for qw( before_create before_update ); |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
$class->add_trigger(select => sub { |
195
|
8
|
|
|
8
|
|
135016
|
my ($self) = @_; |
196
|
|
|
|
|
|
|
|
197
|
8
|
|
|
|
|
24
|
for (@columns) { |
198
|
16
|
50
|
|
|
|
198
|
next if ref($self->{$_}); |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
# flip the bit.. |
201
|
16
|
100
|
|
|
|
86
|
Encode::_utf8_on($self->{$_}) if defined($self->{$_}); |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# ..sanity check |
204
|
16
|
50
|
66
|
|
|
123
|
if (defined($self->{$_}) and !utf8::valid($self->{$_})) { |
205
|
|
|
|
|
|
|
# if we're in an eval, let's at least not _completely_ stuff |
206
|
|
|
|
|
|
|
# the process. Turn the bit off again. |
207
|
0
|
|
|
|
|
0
|
Encode::_utf8_off($self->{$_}); |
208
|
|
|
|
|
|
|
# ..and die |
209
|
0
|
|
|
|
|
0
|
$self->_croak("Invalid UTF8 from database in column '$_'"); |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
1
|
|
|
|
|
86
|
}); |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
sub import { |
217
|
1
|
|
|
1
|
|
7
|
my $class = shift; |
218
|
1
|
|
|
|
|
2
|
local $Exporter::ExportLevel = 1; |
219
|
1
|
50
|
33
|
|
|
7
|
if ($_[0] && $_[0] eq "-nosearch") { |
220
|
0
|
|
|
|
|
0
|
shift; # ignore option |
221
|
0
|
|
|
|
|
0
|
return $class->SUPER::import(@_); |
222
|
|
|
|
|
|
|
} |
223
|
1
|
50
|
|
|
|
17
|
if (caller(0)->isa('Class::DBI')) { |
224
|
1
|
|
|
|
|
11
|
caller(0)->add_searcher(search => "Class::DBI::utf8::Search"); |
225
|
|
|
|
|
|
|
} |
226
|
1
|
|
|
|
|
217
|
$class->SUPER::import(@_); |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
package Class::DBI::utf8::Search; |
231
|
1
|
|
|
1
|
|
589
|
use base 'Class::DBI::Search::Basic'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
364
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
sub bind { |
234
|
5
|
|
|
5
|
|
77066
|
my $self = shift; |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# for fast lookup of which cols are utf8 |
237
|
5
|
|
|
|
|
23
|
my %hash = map { $_ => 1 } $self->class->utf8_columns; |
|
10
|
|
|
|
|
63
|
|
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# get name => values of columns to search for |
240
|
5
|
|
|
|
|
92
|
my $search_for = $self->_search_for(); |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# make an array that says whether the value at that position should be |
243
|
|
|
|
|
|
|
# upgraded to utf8. This relies on ->bind() sorting the keys from _search_for() |
244
|
|
|
|
|
|
|
# in the same way. |
245
|
5
|
50
|
|
|
|
288
|
my @utf8cols = map { $hash{$_} && defined($search_for->{$_}) } sort keys %$search_for; |
|
5
|
|
|
|
|
126
|
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# take copy of array to avoid upgrading the original values; we only want to |
248
|
|
|
|
|
|
|
# upgrade the values for the search. |
249
|
5
|
|
|
|
|
271
|
my @bind = @{ $self->SUPER::bind(@_) }; |
|
5
|
|
|
|
|
25
|
|
250
|
|
|
|
|
|
|
|
251
|
5
|
|
|
|
|
69
|
my $i = 0; |
252
|
5
|
|
|
|
|
18
|
for (@bind) { |
253
|
5
|
50
|
|
|
|
17
|
if (shift @utf8cols) { |
254
|
5
|
|
|
|
|
8
|
my $copy = $_; |
255
|
5
|
|
|
|
|
22
|
utf8::upgrade($copy); |
256
|
5
|
|
|
|
|
11
|
$bind[$i] = $copy; |
257
|
|
|
|
|
|
|
} |
258
|
5
|
|
|
|
|
14
|
$i++; |
259
|
|
|
|
|
|
|
} |
260
|
5
|
|
|
|
|
52
|
\@bind; |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
1; |