File Coverage

blib/lib/File/BOM.pm
Criterion Covered Total %
statement 155 158 98.1
branch 69 76 90.7
condition 11 12 91.6
subroutine 28 28 100.0
pod 6 10 60.0
total 269 284 94.7


line stmt bran cond sub pod time code
1             package File::BOM;
2              
3             =head1 NAME
4              
5             File::BOM - Utilities for handling Byte Order Marks
6              
7             =head1 SYNOPSIS
8              
9             use File::BOM qw( :all )
10              
11             =head2 high-level functions
12              
13             # read a file with encoding from the BOM:
14             open_bom(FH, $file)
15             open_bom(FH, $file, ':utf8') # the same but with a default encoding
16              
17             # get encoding too
18             $encoding = open_bom(FH, $file, ':utf8');
19              
20             # open a potentially unseekable file:
21             ($encoding, $spillage) = open_bom(FH, $file, ':utf8');
22              
23             # change encoding of an open handle according to BOM
24             $encoding = defuse(*HANDLE);
25             ($encoding, $spillage) = defuse(*HANDLE);
26              
27             # Decode a string according to leading BOM:
28             $unicode = decode_from_bom($string_with_bom);
29            
30             # Decode a string and get the encoding:
31             ($unicode, $encoding) = decode_from_bom($string_with_bom)
32              
33             =head2 PerlIO::via interface
34              
35             # Read the Right Thing from a unicode file with BOM:
36             open(HANDLE, '<:via(File::BOM)', $filename)
37              
38             # Writing little-endian UTF-16 file with BOM:
39             open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
40              
41              
42             =head2 lower-level functions
43              
44             # read BOM encoding from a filehandle:
45             $encoding = get_encoding_from_filehandle(FH)
46              
47             # Get encoding even if FH is unseekable:
48             ($encoding, $spillage) = get_encoding_from_filehandle(FH);
49              
50             # Get encoding from a known unseekable handle:
51             ($encdoing, $spillage) = get_encoding_from_stream(FH);
52              
53             # get encoding and BOM length from BOM at start of string:
54             ($encoding, $offset) = get_encoding_from_bom($string);
55              
56             =head2 variables
57              
58             # print a BOM for a known encoding
59             print FH $enc2bom{$encoding};
60              
61             # get an encoding from a known BOM
62             $enc = $bom2enc{$bom}
63              
64             =head1 DESCRIPTION
65              
66             This module provides functions for handling unicode byte order marks, which are
67             to be found at the beginning of some files and streams.
68              
69             For details about what a byte order mark is, see
70             L
71              
72             The intention of File::BOM is for files with BOMs to be readable as seamlessly
73             as possible, regardless of the encoding used. To that end, several different
74             interfaces are available, as shown in the synopsis above.
75              
76             =cut
77              
78 16     16   1838904 use strict;
  16         361  
  16         580  
79 16     16   81 use warnings;
  16         32  
  16         637  
80              
81             # We don't want any character semantics at all
82 16     16   104 use bytes;
  16         17  
  16         100  
83              
84 16     16   475 use base qw( Exporter );
  16         20  
  16         1533  
85              
86 16     16   9957 use Readonly;
  16         67349  
  16         903  
87              
88 16     16   125 use Carp qw( croak );
  16         32  
  16         733  
89 16     16   80 use Fcntl qw( :seek );
  16         32  
  16         2413  
90 16     16   127 use Encode qw( :DEFAULT :fallbacks is_utf8 );
  16         32  
  16         2901  
91 16     16   136 use Symbol qw( gensym qualify_to_ref );
  16         53  
  16         36568  
92              
93             my @subs = qw(
94             open_bom
95             defuse
96             decode_from_bom
97             get_encoding_from_bom
98             get_encoding_from_filehandle
99             get_encoding_from_stream
100             );
101              
102             my @vars = qw( %bom2enc %enc2bom );
103              
104             our $VERSION = '0.17';
105              
106             our @EXPORT = ();
107             our @EXPORT_OK = ( @subs, @vars );
108             our %EXPORT_TAGS = (
109             all => \@EXPORT_OK,
110             subs => \@subs,
111             vars => \@vars
112             );
113              
114             =head1 EXPORTS
115              
116             Nothing by default.
117              
118             =head2 symbols
119              
120             =over 4
121              
122             =item * open_bom()
123              
124             =item * defuse()
125              
126             =item * decode_from_bom()
127              
128             =item * get_encoding_from_filehandle()
129              
130             =item * get_encoding_from_stream()
131              
132             =item * get_encoding_from_bom()
133              
134             =item * %bom2enc
135              
136             =item * %enc2bom
137              
138             =back
139              
140             =head2 tags
141              
142             =over 4
143              
144             =item * :all
145              
146             All of the above
147              
148             =item * :subs
149              
150             subroutines only
151              
152             =item * :vars
153              
154             just %bom2enc and %enc2bom
155              
156             =back
157              
158             =cut
159              
160             =head1 VARIABLES
161              
162             =head2 %bom2enc
163              
164             Maps Byte Order marks to their encodings.
165              
166             The keys of this hash are strings which represent the BOMs, the values are their
167             encodings, in a format which is understood by L
168              
169             The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE,
170             UTF-32BE and UTF-32LE
171              
172             =head2 %enc2bom
173              
174             A reverse-lookup hash for bom2enc, with a few aliases used in L, namely utf8, iso-10646-1 and UCS-2.
175              
176             Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly
177             because Encode::encode automatically puts BOMs on output. See L
178              
179             =cut
180              
181             our(%bom2enc, %enc2bom, $MAX_BOM_LENGTH, $bom_re);
182              
183             # length in bytes of the longest BOM
184             $MAX_BOM_LENGTH = 4;
185              
186             Readonly %bom2enc => (
187             map { encode($_, "\x{feff}") => $_ } qw(
188             UTF-8
189             UTF-16BE
190             UTF-16LE
191             UTF-32BE
192             UTF-32LE
193             )
194             );
195              
196             Readonly %enc2bom => (
197             reverse(%bom2enc),
198             map { $_ => encode($_, "\x{feff}") } qw(
199             UCS-2
200             iso-10646-1
201             utf8
202             )
203             );
204              
205             {
206             local $" = '|';
207              
208             my @bombs = sort { length $b <=> length $a } keys %bom2enc;
209              
210             Readonly $MAX_BOM_LENGTH => length $bombs[0];
211              
212             Readonly $bom_re => qr/^(@bombs)/o;
213             }
214              
215             =head1 FUNCTIONS
216              
217             =head2 open_bom
218              
219             $encoding = open_bom(HANDLE, $filename, $default_mode)
220              
221             ($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode)
222              
223             opens HANDLE for reading on $filename, setting the mode to the appropriate
224             encoding for the BOM stored in the file.
225              
226             On failure, a fatal error is raised, see the DIAGNOSTICS section for details on
227             how to catch these. This is in order to allow the return value(s) to be used for
228             other purposes.
229              
230             If the file doesn't contain a BOM, $default_mode is used instead. Hence:
231              
232             open_bom(FH, 'my_file.txt', ':utf8')
233              
234             Opens my_file.txt for reading in an appropriate encoding found from the BOM in
235             that file, or as a UTF-8 file if none is found.
236              
237             In the absence of a $default_mode argument, the following 2 calls should be equivalent:
238              
239             open_bom(FH, 'no_bom.txt');
240              
241             open(FH, '<', 'no_bom.txt');
242              
243             If an undefined value is passed as the handle, a symbol will be generated for it
244             like open() does:
245              
246             # create filehandle on the fly
247             $enc = open_bom(my $fh, $filename, ':utf8');
248             $line = <$fh>;
249              
250             The filehandle will be cued up to read after the BOM. Unseekable files (e.g.
251             fifos) will cause croaking, unless called in list context to catch spillage
252             from the handle. Any spillage will be automatically decoded from the encoding,
253             if found.
254              
255             e.g.
256              
257             # croak if my_socket is unseekable
258             open_bom(FH, 'my_socket');
259              
260             # keep spillage if my_socket is unseekable
261             ($encoding, $spillage) = open_bom(FH, 'my_socket');
262              
263             # discard any spillage from open_bom
264             ($encoding) = open_bom(FH, 'my_socket');
265              
266             =cut
267              
268             sub open_bom (*$;$) {
269 120     120 1 8779578 my($fh, $filename, $mode) = @_;
270 120 100       824 if (defined $fh) {
271 72         336 $fh = qualify_to_ref($fh, caller);
272             }
273             else {
274 48         1673 $fh = $_[0] = gensym();
275             }
276              
277 120         6263 my $enc;
278 120         760 my $spill = '';
279              
280 120 100       12039 open($fh, '<', $filename)
281             or croak "Couldn't read '$filename': $!";
282              
283 111 100       674 if (wantarray) {
284 38         773 ($enc, $spill) = get_encoding_from_filehandle($fh);
285             }
286             else {
287 73         246 $enc = get_encoding_from_filehandle($fh);
288             }
289              
290 111 100       314 if ($enc) {
291 95         492 $mode = ":encoding($enc)";
292              
293 95 100       713 $spill = decode($enc, $spill, FB_CROAK) if $spill;
294             }
295              
296 111 100       1731 if ($mode) {
297 13 100   13   160 binmode($fh, $mode)
  13         28  
  13         134  
  96         2680  
298             or croak "Couldn't set binmode of handle opened on '$filename' "
299             . "to '$mode': $!";
300             }
301              
302 110 100       21596 return wantarray ? ($enc, $spill) : $enc;
303             }
304              
305             =head2 defuse
306              
307             $enc = defuse(FH);
308              
309             ($enc, $spill) = defuse(FH);
310              
311             FH should be a filehandle opened for reading, it will have the relevant encoding
312             layer pushed onto it be binmode if a BOM is found. Spillage should be Unicode,
313             not bytes.
314              
315             Any uncaptured spillage will be silently lost. If the handle is unseekable, use
316             list context to avoid data loss.
317              
318             If no BOM is found, the mode will be unaffected.
319              
320             =cut
321              
322             sub defuse (*) {
323 102     102 1 9490860 my $fh = qualify_to_ref(shift, caller);
324              
325 102         3178 my($enc, $spill) = get_encoding_from_filehandle($fh);
326              
327 102 100       332 if ($enc) {
328 90         2616 binmode($fh, ":encoding($enc)");
329 90 100       8136 $spill = decode($enc, $spill, FB_CROAK) if $spill;
330             }
331              
332 102 100       1252 return wantarray ? ($enc, $spill) : $enc;
333             }
334              
335             =head2 decode_from_bom
336              
337             $unicode_string = decode_from_bom($string, $default, $check)
338              
339             ($unicode_string, $encoding) = decode_from_bom($string, $default, $check)
340              
341             Reads a BOM from the beginning of $string, decodes $string (minus the BOM) and
342             returns it to you as a perl unicode string.
343              
344             if $string doesn't have a BOM, $default is used instead.
345              
346             $check, if supplied, is passed to Encode::decode as the third argument.
347              
348             If there's no BOM and no default, the original string is returned and encoding
349             is ''.
350              
351             See L
352              
353             =cut
354              
355             sub decode_from_bom ($;$$) {
356 219     219 1 208893 my($string, $default, $check) = @_;
357              
358 219 100       645 croak "No string" unless defined $string;
359              
360 216         456 my($enc, $off) = get_encoding_from_bom($string);
361 216   100     624 $enc ||= $default;
362              
363 216         312 my $out;
364 216 100       480 if (defined $enc) {
365 204         684 $out = decode($enc, substr($string, $off), $check);
366             }
367             else {
368 12         48 $out = $string;
369 12         36 $enc = '';
370             }
371              
372 216 100       11112 return wantarray ? ($out, $enc) : $out;
373             }
374              
375             =head2 get_encoding_from_filehandle
376              
377             $encoding = get_encoding_from_filehandle(HANDLE)
378              
379             ($encoding, $spillage) = get_encoding_from_filehandle(HANDLE)
380              
381             Returns the encoding found in the given filehandle.
382              
383             The handle should be opened in a non-unicode way (e.g. mode '<:bytes') so that
384             the BOM can be read in its natural state.
385              
386             After calling, the handle will be set to read at a point after the BOM (or at
387             the beginning of the file if no BOM was found)
388              
389             If called in scalar context, unseekable handles cause a croak().
390              
391             If called in list context, unseekable handles will be read byte-by-byte and any
392             spillage will be returned. See get_encoding_from_stream()
393              
394             =cut
395              
396             sub get_encoding_from_filehandle (*) {
397 303     303 1 96758 my $fh = qualify_to_ref(shift, caller);
398              
399 303         5039 my $enc;
400 303         972 my $spill = '';
401 303 100       3712 if (seek($fh, 0, SEEK_SET)) {
    100          
402 234         753 $enc = _get_encoding_seekable($fh);
403             }
404             elsif (wantarray) {
405 67         1429 ($enc, $spill) = _get_encoding_unseekable($fh);
406             }
407             else {
408 2         40 croak "Unseekable handle: $!";
409             }
410              
411 301 100       1599 return wantarray ? ($enc, $spill) : $enc;
412             }
413              
414             =head2 get_encoding_from_stream
415              
416             ($encoding, $spillage) = get_encoding_from_stream(*FH);
417              
418             Read a BOM from an unrewindable source. This means reading the stream one byte
419             at a time until either a BOM is found or every possible BOM is ruled out. Any
420             non-BOM bytes read from the handle will be returned in $spillage.
421              
422             If a BOM is found and the spillage contains a partial character (judging by the
423             expected character width for the encoding) more bytes will be read from the
424             handle to ensure that a complete character is returned.
425              
426             Spillage is always in bytes, not characters.
427              
428             This function is less efficient than get_encoding_from_filehandle, but should
429             work just as well on a seekable handle as on an unseekable one.
430              
431             =cut
432              
433             sub get_encoding_from_stream (*) {
434 72     72 1 80424 my $fh = qualify_to_ref(shift, caller);
435              
436 72         1764 _get_encoding_unseekable($fh);
437             }
438              
439             # internal:
440             #
441             # Return encoding and seek to position after BOM
442             sub _get_encoding_seekable (*) {
443 239     239   9404 my $fh = shift;
444              
445             # This doesn't work on all platforms:
446             # defined(read($fh, my $bom, $MAX_BOM_LENGTH))
447             # or croak "Couldn't read from handle: $!";
448              
449 239         394 my $bom = eval { _safe_read($fh, $MAX_BOM_LENGTH) };
  239         536  
450 239 100       612 croak "Couldn't read from handle: $@" if $@;
451              
452 236         591 my($enc, $off) = get_encoding_from_bom($bom);
453              
454 236 100       3360 seek($fh, $off, SEEK_SET) or croak "Couldn't reset read position: $!";
455              
456 234         792 return $enc;
457             }
458              
459             # internal:
460             #
461             # Return encoding and non-BOM overspill
462             sub _get_encoding_unseekable (*) {
463 142     142   2752 my $fh = shift;
464              
465 142         728 my $so_far = '';
466 142         4065 for my $c (1 .. $MAX_BOM_LENGTH) {
467             # defined(read($fh, my $byte, 1)) or croak "Couldn't read byte: $!";
468 406         5090 my $byte = eval { _safe_read($fh, 1) };
  406         1230  
469 406 100       915 croak "Couldn't read byte: $@" if $@;
470              
471 403         701 $so_far .= $byte;
472              
473             # find matching BOMs
474 403         3112 my @possible = grep { $so_far eq substr($_, 0, $c) } keys %bom2enc;
  2015         16122  
475              
476 403 100 100     4279 if (@possible == 1 and my $enc = $bom2enc{$so_far}) {
    100          
477             # There's only one match, this must be it
478 96         2296 return ($enc, '');
479             }
480             elsif (@possible == 0) {
481             # might need to backtrack one byte
482 43         300 my $spill = chop $so_far;
483              
484 43 100       657 if (my $enc = $bom2enc{$so_far}) {
485 29         505 my $char_length = _get_char_length($enc, $spill);
486              
487 29         79 my $extra = eval {
488 29         99 _safe_read($fh, $char_length - length $spill);
489             };
490 29 50       102 croak "Coudln't read byte: $@" if $@;
491 29         50 $spill .= $extra;
492              
493 29         301 return ($enc, $spill);
494             }
495             else {
496             # no BOM
497 14         275 return ('', $so_far . $spill);
498             }
499             }
500             }
501             }
502              
503             sub _safe_read {
504 674     674   2120 my ($fh, $count) = @_;
505              
506             # read is supposed to return undef on error, but on some platforms it
507             # seems to just return 0 and set $!
508 674         6849 local $!;
509 674         7981 my $status = read($fh, my $out, $count);
510              
511 674 100 100     2099 die $! if !$status && $!;
512              
513 668         3458 return $out;
514             }
515              
516             =head2 get_encoding_from_bom
517              
518             ($encoding, $offset) = get_encoding_from_bom($string)
519              
520             Returns the encoding and length in bytes of the BOM in $string.
521              
522             If there is no BOM, an empty string is returned and $offset is zero.
523              
524             To get the data from the string, the following should work:
525              
526             use Encode;
527              
528             my($encoding, $offset) = get_encoding_from_bom($string);
529              
530             if ($encoding) {
531             $string = decode($encoding, substr($string, $offset))
532             }
533              
534             =cut
535              
536             sub get_encoding_from_bom ($) {
537 524     524 1 1031 my $bom = shift;
538              
539 524         755 my $encoding = '';
540 524         646 my $offset = 0;
541              
542 524 100       1975 if (my($found) = $bom =~ $bom_re) {
543 434         6064 $encoding = $bom2enc{$found};
544 434         3143 $offset = length($found);
545             }
546              
547 524         2269 return ($encoding, $offset);
548             }
549              
550             # Internal:
551             # Work out character length for given encoding and spillage byte
552             sub _get_char_length ($$) {
553 34     34   304058 my($enc, $byte) = @_;
554              
555 34 100       829 if ($enc eq 'UTF-8') {
    100          
556 3 100       31 if (($byte & 0x80) == 0) {
557 1         15 return 1;
558             }
559             else {
560 2         9 my $length = 0;
561              
562 2         32 1 while (($byte << $length++) & 0xc0) == 0xc0;
563              
564 2         86 return $length;
565             }
566             }
567             elsif ($enc =~ /^UTF-(16|32)/) {
568 30         443 return $1 / 8;
569             }
570             else {
571 1         11 return;
572             }
573             }
574              
575             =head1 PerlIO::via interface
576              
577             File::BOM can be used as a PerlIO::via interface.
578              
579             open(HANDLE, '<:via(File::BOM)', 'my_file.txt');
580              
581             open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt');
582             print "foo\n"; # BOM is written to file here
583              
584             This method is less prone to errors on non-seekable files as spillage is
585             incorporated into an internal buffer, but it doesn't give you any information
586             about the encoding being used, or indeed whether or not a BOM
587             was present.
588              
589             There are a few known problems with this interface, especially surrounding
590             seek() and tell(), please see the BUGS section for more details about this.
591              
592             =head2 Reading
593              
594             The via(File::BOM) layer must be added before the handle is read from, otherwise
595             any BOM will be missed. If there is no BOM, no decoding will be done.
596              
597             Because of a limitation in PerlIO::via, read() always works on bytes, not characters. BOM decoding will still be done but output will be bytes of UTF-8.
598              
599             open(BOM, '<:via(File::BOM)', $file);
600             $bytes_read = read(BOM, $buffer, $length);
601             $unicode = decode('UTF-8', $buffer, Encode::FB_QUIET);
602              
603             # Now $unicode is valid unicode and $buffer contains any left-over bytes
604              
605             =head2 Writing
606              
607             Add the via(File::BOM) layer on top of a unicode encoding layer to print a BOM
608             at the start of the output file. This needs to be done before any data is
609             written. The BOM is written as part of the first print command on the handle, so
610             if you don't print anything to the handle, you won't get a BOM.
611              
612             There is a "Wide character in print" warning generated when the via(File::BOM)
613             layer doesn't receive utf8 on writing. This glitch was resolved in perl version
614             5.8.7, but if your perl version is older than that, you'll need to make sure
615             that the via(File::BOM) layer receives utf8 like this:
616              
617             # This works OK
618             open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename)
619              
620             # This generates warnings with older perls
621             open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
622              
623             =head2 Seeking
624              
625             Seeking with SEEK_SET results in an offset equal to the length of any detected
626             BOM being applied to the position parameter. Thus:
627              
628             # Seek to end of BOM (not start of file!)
629             seek(FILE_BOM_HANDLE, 0, SEEK_SET)
630              
631             =head2 Telling
632              
633             In order to work correctly with seek(), tell() also returns a postion adjusted
634             by the length of the BOM.
635              
636             =cut
637              
638 28 50   28 0 12527 sub PUSHED { bless({offset => 0}, $_[0]) || -1 }
639              
640             sub UTF8 {
641             # There is a bug with this method previous to 5.8.7
642              
643 28 50   28 0 85 if ($] >= 5.008007) {
644 28         1329 return 1;
645             }
646             else {
647 0         0 return 0;
648             }
649             }
650              
651             sub FILL {
652 28     28 0 4742 my($self, $fh) = @_;
653              
654 28         43 my $line;
655 28 100       71 if (not defined $self->{enc}) {
656 15         37 ($self->{enc}, my $spill) = get_encoding_from_filehandle($fh);
657              
658 15 100       44 if ($self->{enc} ne '') {
659 14     1   188 binmode($fh, ":encoding($self->{enc})");
  1         7  
  1         2  
  1         6  
660 14 50       1537 $line .= decode($self->{enc}, $spill, FB_CROAK) if $spill;
661              
662 14         69 $self->{offset} = length $enc2bom{$self->{enc}};
663             }
664              
665 15         353 $line .= <$fh>;
666             }
667             else {
668 13         62 $line = <$fh>;
669             }
670              
671 28         192 return $line;
672             }
673              
674             sub WRITE {
675 21     21   5758 my($self, $buf, $fh) = @_;
676              
677 21 100 66     388 if (tell $fh == 0 and not $self->{wrote_bom}) {
678 13         68 print $fh "\x{feff}";
679 13         26 $self->{wrote_bom} = 1;
680             }
681              
682 21         81 $buf = decode('UTF-8', $buf, FB_CROAK);
683              
684 21         1154 print $fh $buf;
685              
686 21         74 return 1;
687             }
688              
689 28     28 0 12482 sub FLUSH { 0 }
690              
691             sub SEEK {
692 2     2   13 my $self = shift;
693              
694 2         5 my($pos, $whence, $fh) = @_;
695              
696 2 50       6 if ($whence == SEEK_SET) {
697 2         6 $pos += $self->{offset};
698             }
699              
700 2 50       24 if (seek($fh, $pos, $whence)) {
701 2         9 return 0;
702             }
703             else {
704 0         0 return -1;
705             }
706             }
707              
708             sub TELL {
709 1     1   12 my($self, $fh) = @_;
710              
711 1         7 my $pos = tell $fh;
712              
713 1 50       5 if ($pos == -1) {
714 0         0 return -1;
715             }
716             else {
717 1         4 return $pos - $self->{offset};
718             }
719             }
720              
721             1;
722              
723             __END__