File Coverage

lib/Class/DBI/utf8.pm
Criterion Covered Total %
statement 61 67 91.0
branch 15 22 68.1
condition 3 6 50.0
subroutine 12 13 92.3
pod 0 2 0.0
total 91 110 82.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Class::DBI::utf8 - A Class:::DBI subclass that knows about UTF-8
4              
5             =head1 SYNOPSIS
6              
7             This module is a Class::DBI plugin:
8              
9             package Foo;
10             use base qw( Class::DBI );
11             use Class::DBI::utf8;
12              
13             ...
14             __PACKAGE__->columns( All => qw( id text other ) );
15              
16             # the text column contains utf8-encoded character data
17             __PACKAGE__->utf8_columns(qw( text ));
18             ...
19            
20             # create an object with a nasty character.
21             my $foo = Foo->create({
22             text => "a \x{2264} b for some a",
23             });
24            
25             # search for utf8 chars.
26             Foo->search( text => "a \x{2264} b for some a" );
27              
28             =head1 DESCRIPTION
29              
30             Rather than have to think about things like character sets, I prefer to have
31             my objects just Do The Right Thing. I also want utf-8 encoded byte strings in
32             the database whenever possible. Using this subclass of Class::DBI, I can just
33             put perl strings into the properties of an object, and the right thing will
34             always go into the database and come out again.
35              
36             For example, without Class::DBI::utf8,
37              
38             MyObject->create({ id => 1, text => "\x{2264}" }); # a less-than-or-equal-to symbol
39              
40             ..will create a row in the database containing (probably) the utf-8 byte
41             encoding of the less-than-or-equal-to symbol. But when trying to retrieve the
42             object again..
43              
44             my $broken = MyObject->retrieve( 1 );
45             my $text = $broken->text;
46              
47             ... $text will (probably) contain 3 characters and look nothing like a
48             less-than-or-equal-to symbol. Likewise, you will be unable to search properly
49             for strings containing non-ascii characters.
50              
51             Creating objects with simpler non-ascii characters from the latin-1 range
52             will lead to even stranger behaviours:
53              
54             my $e_acute = "\x{e9}"; # an e-acute
55             MyObject->create({ text => $e_acute });
56              
57             utf8::upgrade($e_acute); # still the same letter, but with a different
58             # internal representation
59             MyObject->create({ text => $e_acute });
60              
61             This will create two rows in the database - the first containing the latin-1
62             encoded bytes of an e-acute character (or the database may refuse to let you
63             create the row, if it's been set up to require utf-8), the latter containing
64             the utf-8 encoded bytes of an e-acute. In the latter case you won't get an
65             e-acute back out again if you retrieve the row; You'll get a string
66             containing two characters, one for each byte of the utf-8 encoding.
67              
68             Because of this, if you're handling data from an outside source, you won't
69             really have any clear idea of what will be going into the database at all.
70              
71             Fortunately, simply adding the lines:
72              
73             use Class::DBI::utf8;
74             __PACKAGE__->utf8_columns("text");
75              
76             will make all these operations work much more as expected - the database will
77             always contain utf-8 bytes, you will always get back the characters you put
78             in, and you will instantly become the most popular person at work.
79              
80             This module assumes that the underlying database and driver don't know
81             anything about character sets, and just store bytes. Some databases, for
82             instance postgresql and later versions of mysql, allow you to create tables
83             with utf-8 character sets, but the Perl DB drivers don't respect this and
84             still require you to pass utf-8 bytes, and return utf-8 bytes and hence
85             still need special handling with Class::DBI.
86              
87             Class::DBI::utf8 will do the right thing in both cases, and I would
88             suggest you tell the database to use utf-8 encoding as well as using
89             Class::DBI::utf8 where possible.
90              
91             =head1 CAVEATS
92              
93             This module requires perl 5.8.0 or later - if you're still using 5.6, and you
94             want to use unicode, I suggest you don't. It's not nice.
95              
96             Be aware that utf-8 encoded strings will commonly have a byte length greater
97             than their character length - this is because non-ascii characters such as
98             e-actute will encode to two bytes, and other characters can be encoded to
99             other numbers of bytes, although 2 or 3 bytes are typical. If your database
100             has an underlying data type of a limited length, for instance a CHAR(10), you
101             may not be able to store 10 characters in it.
102              
103             Internally, the module is futzing with the _utf8_on and _utf8_off methods. If
104             you don't know I doing that is probably a bad idea, you should read into
105             it before you start trying to do this sort of thing yourself. I'd prefer to
106             use encode_utf8 and decode_utf8, but I have my reasons for doing it this way
107             - mostly, it's so that we can allow for DBD drivers that do know about
108             character sets.
109              
110             Finally, the database may have some internal string-handling functions, for
111             instance LOWER(), UPPER(), various sorting functions, etc. I the database
112             is properly utf-8 aware, it I do the right thing to the utf-8 encoded
113             strings in the database if you use these functions. But I've never seen a
114             database do the right thing. Likewise, there are all sorts of nasty
115             normalisation considerations when performing searches that are outside of the
116             scope of these docs to discuss, but which can really ruin your day.
117              
118             =head1 BUGS
119              
120             I've attempted to make the module keep doing the Right Thing even when the
121             DBD driver for the database knows what it's doing, ie, if you give it
122             sensible perl strings it'll store the right thing in the database and recover
123             the right thing from the database. However, I've been forced to assume that,
124             in this eventuality, the database driver will hand back strings that already
125             have the utf-8 bit set. If they don't, things I break. On the bright
126             side, they'll break really fast. I also find it extremely unlikely that
127             anyone would bother reducing strings to latin1 internally.
128              
129             Also, I've been forced to override the _do_search method to make searching
130             for utf8 strings work, so if you override it locally as well, bad things
131             will happen. Sorry.
132              
133             Incredible popularity and fame gained through understanding of utf-8 may not
134             actually be real.
135              
136             =head1 SEE ALSO
137              
138             L
139              
140             =head1 AUTHOR
141              
142             Tom Insam
143              
144             Copyright Fotango 2005. All rights reserved.
145              
146             This module is free software; you can redistribute it and/or modify it under
147             the same terms as Perl itself.
148              
149             =cut
150              
151              
152             package Class::DBI::utf8;
153 1     1   116516 use warnings;
  1         3  
  1         39  
154 1     1   6 use strict;
  1         2  
  1         42  
155 1     1   1235 use Encode qw( encode_utf8 decode_utf8 );
  1         16304  
  1         109  
156 1     1   1082 use utf8;
  1         10  
  1         5  
157 1     1   48 use base qw( Exporter Class::Data::Inheritable );
  1         2  
  1         132  
158 1     1   5 use Class::DBI;
  1         2  
  1         19  
159              
160             our $VERSION = 0.2;
161              
162              
163             # export the following functions..
164             our @EXPORT = (qw( utf8_all_columns utf8_columns ));
165              
166             # add an accessor to store which columns are utf8-enabled
167             Class::DBI->mk_classdata('_utf8_columns');
168              
169             sub utf8_all_columns {
170 0     0 0 0 my $class = shift;
171 0         0 $class->utf8_columns( $class->columns('All') );
172             }
173              
174             sub utf8_columns {
175 6     6 0 1350 my $class = shift;
176             # the default
177 6 100       36 $class->_utf8_columns([]) unless $class->_utf8_columns;
178              
179             # a getter?
180 6 100       86 return @{ $class->_utf8_columns } unless @_;
  5         19  
181              
182 1         4 my @columns = @_;
183 1         1 push @{ $class->_utf8_columns }, @columns;
  1         3  
184              
185             $class->add_trigger($_ => sub {
186 7     7   2387456 my ($self) = @_;
187 7         22 for (@columns) {
188 14 50       43 next if ref($self->{$_});
189 14 100       78 utf8::upgrade( $self->{$_} ) if defined($self->{$_});
190             }
191              
192 1         16 }) for qw( before_create before_update );
193              
194             $class->add_trigger(select => sub {
195 8     8   135016 my ($self) = @_;
196              
197 8         24 for (@columns) {
198 16 50       198 next if ref($self->{$_});
199              
200             # flip the bit..
201 16 100       86 Encode::_utf8_on($self->{$_}) if defined($self->{$_});
202              
203             # ..sanity check
204 16 50 66     123 if (defined($self->{$_}) and !utf8::valid($self->{$_})) {
205             # if we're in an eval, let's at least not _completely_ stuff
206             # the process. Turn the bit off again.
207 0         0 Encode::_utf8_off($self->{$_});
208             # ..and die
209 0         0 $self->_croak("Invalid UTF8 from database in column '$_'");
210             }
211             }
212 1         86 });
213              
214             }
215              
216             sub import {
217 1     1   7 my $class = shift;
218 1         2 local $Exporter::ExportLevel = 1;
219 1 50 33     7 if ($_[0] && $_[0] eq "-nosearch") {
220 0         0 shift; # ignore option
221 0         0 return $class->SUPER::import(@_);
222             }
223 1 50       17 if (caller(0)->isa('Class::DBI')) {
224 1         11 caller(0)->add_searcher(search => "Class::DBI::utf8::Search");
225             }
226 1         217 $class->SUPER::import(@_);
227             }
228              
229              
230             package Class::DBI::utf8::Search;
231 1     1   589 use base 'Class::DBI::Search::Basic';
  1         2  
  1         364  
232              
233             sub bind {
234 5     5   77066 my $self = shift;
235              
236             # for fast lookup of which cols are utf8
237 5         23 my %hash = map { $_ => 1 } $self->class->utf8_columns;
  10         63  
238              
239             # get name => values of columns to search for
240 5         92 my $search_for = $self->_search_for();
241            
242             # make an array that says whether the value at that position should be
243             # upgraded to utf8. This relies on ->bind() sorting the keys from _search_for()
244             # in the same way.
245 5 50       288 my @utf8cols = map { $hash{$_} && defined($search_for->{$_}) } sort keys %$search_for;
  5         126  
246            
247             # take copy of array to avoid upgrading the original values; we only want to
248             # upgrade the values for the search.
249 5         271 my @bind = @{ $self->SUPER::bind(@_) };
  5         25  
250              
251 5         69 my $i = 0;
252 5         18 for (@bind) {
253 5 50       17 if (shift @utf8cols) {
254 5         8 my $copy = $_;
255 5         22 utf8::upgrade($copy);
256 5         11 $bind[$i] = $copy;
257             }
258 5         14 $i++;
259             }
260 5         52 \@bind;
261             }
262              
263             1;