File Coverage

blib/lib/File/BOM.pm
Criterion Covered Total %
statement 155 158 98.1
branch 69 76 90.7
condition 11 12 91.6
subroutine 28 28 100.0
pod 6 10 60.0
total 269 284 94.7


line stmt bran cond sub pod time code
1             package File::BOM;
2              
3             =head1 NAME
4              
5             File::BOM - Utilities for handling Byte Order Marks
6              
7             =head1 SYNOPSIS
8              
9             use File::BOM qw( :all )
10              
11             =head2 high-level functions
12              
13             # read a file with encoding from the BOM:
14             open_bom(FH, $file)
15             open_bom(FH, $file, ':utf8') # the same but with a default encoding
16              
17             # get encoding too
18             $encoding = open_bom(FH, $file, ':utf8');
19              
20             # open a potentially unseekable file:
21             ($encoding, $spillage) = open_bom(FH, $file, ':utf8');
22              
23             # change encoding of an open handle according to BOM
24             $encoding = defuse(*HANDLE);
25             ($encoding, $spillage) = defuse(*HANDLE);
26              
27             # Decode a string according to leading BOM:
28             $unicode = decode_from_bom($string_with_bom);
29            
30             # Decode a string and get the encoding:
31             ($unicode, $encoding) = decode_from_bom($string_with_bom)
32              
33             =head2 PerlIO::via interface
34              
35             # Read the Right Thing from a unicode file with BOM:
36             open(HANDLE, '<:via(File::BOM)', $filename)
37              
38             # Writing little-endian UTF-16 file with BOM:
39             open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
40              
41              
42             =head2 lower-level functions
43              
44             # read BOM encoding from a filehandle:
45             $encoding = get_encoding_from_filehandle(FH)
46              
47             # Get encoding even if FH is unseekable:
48             ($encoding, $spillage) = get_encoding_from_filehandle(FH);
49              
50             # Get encoding from a known unseekable handle:
51             ($encdoing, $spillage) = get_encoding_from_stream(FH);
52              
53             # get encoding and BOM length from BOM at start of string:
54             ($encoding, $offset) = get_encoding_from_bom($string);
55              
56             =head2 variables
57              
58             # print a BOM for a known encoding
59             print FH $enc2bom{$encoding};
60              
61             # get an encoding from a known BOM
62             $enc = $bom2enc{$bom}
63              
64             =head1 DESCRIPTION
65              
66             This module provides functions for handling unicode byte order marks, which are
67             to be found at the beginning of some files and streams.
68              
69             For details about what a byte order mark is, see
70             L
71              
72             The intention of File::BOM is for files with BOMs to be readable as seamlessly
73             as possible, regardless of the encoding used. To that end, several different
74             interfaces are available, as shown in the synopsis above.
75              
76             =cut
77              
78 16     16   1659085 use strict;
  16         80  
  16         474  
79 16     16   79 use warnings;
  16         33  
  16         553  
80              
81             # We don't want any character semantics at all
82 16     16   80 use bytes;
  16         28  
  16         83  
83              
84 16     16   386 use base qw( Exporter );
  16         32  
  16         1506  
85              
86 16     16   8872 use Readonly;
  16         61542  
  16         904  
87              
88 16     16   130 use Carp qw( croak );
  16         32  
  16         713  
89 16     16   95 use Fcntl qw( :seek );
  16         18  
  16         1730  
90 16     16   114 use Encode qw( :DEFAULT :fallbacks is_utf8 );
  16         20  
  16         2478  
91 16     16   123 use Symbol qw( gensym qualify_to_ref );
  16         32  
  16         33689  
92              
93             my @subs = qw(
94             open_bom
95             defuse
96             decode_from_bom
97             get_encoding_from_bom
98             get_encoding_from_filehandle
99             get_encoding_from_stream
100             );
101              
102             my @vars = qw( %bom2enc %enc2bom );
103              
104             our $VERSION = '0.16';
105              
106             our @EXPORT = ();
107             our @EXPORT_OK = ( @subs, @vars );
108             our %EXPORT_TAGS = (
109             all => \@EXPORT_OK,
110             subs => \@subs,
111             vars => \@vars
112             );
113              
114             =head1 EXPORTS
115              
116             Nothing by default.
117              
118             =head2 symbols
119              
120             =over 4
121              
122             =item * open_bom()
123              
124             =item * defuse()
125              
126             =item * decode_from_bom()
127              
128             =item * get_encoding_from_filehandle()
129              
130             =item * get_encoding_from_stream()
131              
132             =item * get_encoding_from_bom()
133              
134             =item * %bom2enc
135              
136             =item * %enc2bom
137              
138             =back
139              
140             =head2 tags
141              
142             =over 4
143              
144             =item * :all
145              
146             All of the above
147              
148             =item * :subs
149              
150             subroutines only
151              
152             =item * :vars
153              
154             just %bom2enc and %enc2bom
155              
156             =back
157              
158             =cut
159              
160             =head1 VARIABLES
161              
162             =head2 %bom2enc
163              
164             Maps Byte Order marks to their encodings.
165              
166             The keys of this hash are strings which represent the BOMs, the values are their
167             encodings, in a format which is understood by L
168              
169             The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE,
170             UTF-32BE and UTF-32LE
171              
172             =head2 %enc2bom
173              
174             A reverse-lookup hash for bom2enc, with a few aliases used in L, namely utf8, iso-10646-1 and UCS-2.
175              
176             Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly
177             because Encode::encode automatically puts BOMs on output. See L
178              
179             =cut
180              
181             our(%bom2enc, %enc2bom, $MAX_BOM_LENGTH, $bom_re);
182              
183             # length in bytes of the longest BOM
184             $MAX_BOM_LENGTH = 4;
185              
186             Readonly %bom2enc => (
187             map { encode($_, "\x{feff}") => $_ } qw(
188             UTF-8
189             UTF-16BE
190             UTF-16LE
191             UTF-32BE
192             UTF-32LE
193             )
194             );
195              
196             Readonly %enc2bom => (
197             reverse(%bom2enc),
198             map { $_ => encode($_, "\x{feff}") } qw(
199             UCS-2
200             iso-10646-1
201             utf8
202             )
203             );
204              
205             {
206             local $" = '|';
207              
208             my @bombs = sort { length $b <=> length $a } keys %bom2enc;
209              
210             Readonly $MAX_BOM_LENGTH => length $bombs[0];
211              
212             Readonly $bom_re => qr/^(@bombs)/o;
213             }
214              
215             =head1 FUNCTIONS
216              
217             =head2 open_bom
218              
219             $encoding = open_bom(HANDLE, $filename, $default_mode)
220              
221             ($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode)
222              
223             opens HANDLE for reading on $filename, setting the mode to the appropriate
224             encoding for the BOM stored in the file.
225              
226             On failure, a fatal error is raised, see the DIAGNOSTICS section for details on
227             how to catch these. This is in order to allow the return value(s) to be used for
228             other purposes.
229              
230             If the file doesn't contain a BOM, $default_mode is used instead. Hence:
231              
232             open_bom(FH, 'my_file.txt', ':utf8')
233              
234             Opens my_file.txt for reading in an appropriate encoding found from the BOM in
235             that file, or as a UTF-8 file if none is found.
236              
237             In the absence of a $default_mode argument, the following 2 calls should be equivalent:
238              
239             open_bom(FH, 'no_bom.txt');
240              
241             open(FH, '<', 'no_bom.txt');
242              
243             If an undefined value is passed as the handle, a symbol will be generated for it
244             like open() does:
245              
246             # create filehandle on the fly
247             $enc = open_bom(my $fh, $filename, ':utf8');
248             $line = <$fh>;
249              
250             The filehandle will be cued up to read after the BOM. Unseekable files (e.g.
251             fifos) will cause croaking, unless called in list context to catch spillage
252             from the handle. Any spillage will be automatically decoded from the encoding,
253             if found.
254              
255             e.g.
256              
257             # croak if my_socket is unseekable
258             open_bom(FH, 'my_socket');
259              
260             # keep spillage if my_socket is unseekable
261             ($encoding, $spillage) = open_bom(FH, 'my_socket');
262              
263             # discard any spillage from open_bom
264             ($encoding) = open_bom(FH, 'my_socket');
265              
266             =cut
267              
268             sub open_bom (*$;$) {
269 120     120 1 8548376 my($fh, $filename, $mode) = @_;
270 120 100       883 if (defined $fh) {
271 72         336 $fh = qualify_to_ref($fh, caller);
272             }
273             else {
274 48         1305 $fh = $_[0] = gensym();
275             }
276              
277 120         6179 my $enc;
278 120         635 my $spill = '';
279              
280 120 100       50013 open($fh, '<', $filename)
281             or croak "Couldn't read '$filename': $!";
282              
283 111 100       745 if (wantarray) {
284 38         772 ($enc, $spill) = get_encoding_from_filehandle($fh);
285             }
286             else {
287 73         201 $enc = get_encoding_from_filehandle($fh);
288             }
289              
290 111 100       347 if ($enc) {
291 95         273 $mode = ":encoding($enc)";
292              
293 95 100       567 $spill = decode($enc, $spill, FB_CROAK) if $spill;
294             }
295              
296 111 100       1895 if ($mode) {
297 13 100   13   133 binmode($fh, $mode)
  13         27  
  13         83  
  96         2276  
298             or croak "Couldn't set binmode of handle opened on '$filename' "
299             . "to '$mode': $!";
300             }
301              
302 110 100       20212 return wantarray ? ($enc, $spill) : $enc;
303             }
304              
305             =head2 defuse
306              
307             $enc = defuse(FH);
308              
309             ($enc, $spill) = defuse(FH);
310              
311             FH should be a filehandle opened for reading, it will have the relevant encoding
312             layer pushed onto it be binmode if a BOM is found. Spillage should be Unicode,
313             not bytes.
314              
315             Any uncaptured spillage will be silently lost. If the handle is unseekable, use
316             list context to avoid data loss.
317              
318             If no BOM is found, the mode will be unaffected.
319              
320             =cut
321              
322             sub defuse (*) {
323 102     102 1 9157412 my $fh = qualify_to_ref(shift, caller);
324              
325 102         2918 my($enc, $spill) = get_encoding_from_filehandle($fh);
326              
327 102 100       316 if ($enc) {
328 90         2338 binmode($fh, ":encoding($enc)");
329 90 100       7352 $spill = decode($enc, $spill, FB_CROAK) if $spill;
330             }
331              
332 102 100       1452 return wantarray ? ($enc, $spill) : $enc;
333             }
334              
335             =head2 decode_from_bom
336              
337             $unicode_string = decode_from_bom($string, $default, $check)
338              
339             ($unicode_string, $encoding) = decode_from_bom($string, $default, $check)
340              
341             Reads a BOM from the beginning of $string, decodes $string (minus the BOM) and
342             returns it to you as a perl unicode string.
343              
344             if $string doesn't have a BOM, $default is used instead.
345              
346             $check, if supplied, is passed to Encode::decode as the third argument.
347              
348             If there's no BOM and no default, the original string is returned and encoding
349             is ''.
350              
351             See L
352              
353             =cut
354              
355             sub decode_from_bom ($;$$) {
356 219     219 1 191142 my($string, $default, $check) = @_;
357              
358 219 100       618 croak "No string" unless defined $string;
359              
360 216         420 my($enc, $off) = get_encoding_from_bom($string);
361 216   100     624 $enc ||= $default;
362              
363 216         372 my $out;
364 216 100       432 if (defined $enc) {
365 204         696 $out = decode($enc, substr($string, $off), $check);
366             }
367             else {
368 12         24 $out = $string;
369 12         24 $enc = '';
370             }
371              
372 216 100       10872 return wantarray ? ($out, $enc) : $out;
373             }
374              
375             =head2 get_encoding_from_filehandle
376              
377             $encoding = get_encoding_from_filehandle(HANDLE)
378              
379             ($encoding, $spillage) = get_encoding_from_filehandle(HANDLE)
380              
381             Returns the encoding found in the given filehandle.
382              
383             The handle should be opened in a non-unicode way (e.g. mode '<:bytes') so that
384             the BOM can be read in its natural state.
385              
386             After calling, the handle will be set to read at a point after the BOM (or at
387             the beginning of the file if no BOM was found)
388              
389             If called in scalar context, unseekable handles cause a croak().
390              
391             If called in list context, unseekable handles will be read byte-by-byte and any
392             spillage will be returned. See get_encoding_from_stream()
393              
394             =cut
395              
396             sub get_encoding_from_filehandle (*) {
397 303     303 1 95610 my $fh = qualify_to_ref(shift, caller);
398              
399 303         4257 my $enc;
400 303         842 my $spill = '';
401 303 100       3565 if (seek($fh, 0, SEEK_SET)) {
    100          
402 234         571 $enc = _get_encoding_seekable($fh);
403             }
404             elsif (wantarray) {
405 67         1528 ($enc, $spill) = _get_encoding_unseekable($fh);
406             }
407             else {
408 2         32 croak "Unseekable handle: $!";
409             }
410              
411 301 100       1574 return wantarray ? ($enc, $spill) : $enc;
412             }
413              
414             =head2 get_encoding_from_stream
415              
416             ($encoding, $spillage) = get_encoding_from_stream(*FH);
417              
418             Read a BOM from an unrewindable source. This means reading the stream one byte
419             at a time until either a BOM is found or every possible BOM is ruled out. Any
420             non-BOM bytes read from the handle will be returned in $spillage.
421              
422             If a BOM is found and the spillage contains a partial character (judging by the
423             expected character width for the encoding) more bytes will be read from the
424             handle to ensure that a complete character is returned.
425              
426             Spillage is always in bytes, not characters.
427              
428             This function is less efficient than get_encoding_from_filehandle, but should
429             work just as well on a seekable handle as on an unseekable one.
430              
431             =cut
432              
433             sub get_encoding_from_stream (*) {
434 72     72 1 74532 my $fh = qualify_to_ref(shift, caller);
435              
436 72         1728 _get_encoding_unseekable($fh);
437             }
438              
439             # internal:
440             #
441             # Return encoding and seek to position after BOM
442             sub _get_encoding_seekable (*) {
443 239     239   8661 my $fh = shift;
444              
445             # This doesn't work on all platforms:
446             # defined(read($fh, my $bom, $MAX_BOM_LENGTH))
447             # or croak "Couldn't read from handle: $!";
448              
449 239         335 my $bom = eval { _safe_read($fh, $MAX_BOM_LENGTH) };
  239         531  
450 239 100       564 croak "Couldn't read from handle: $@" if $@;
451              
452 236         525 my($enc, $off) = get_encoding_from_bom($bom);
453              
454 236 100       2854 seek($fh, $off, SEEK_SET) or croak "Couldn't reset read position: $!";
455              
456 234         742 return $enc;
457             }
458              
459             # internal:
460             #
461             # Return encoding and non-BOM overspill
462             sub _get_encoding_unseekable (*) {
463 142     142   2718 my $fh = shift;
464              
465 142         1212 my $so_far = '';
466 142         3667 for my $c (1 .. $MAX_BOM_LENGTH) {
467             # defined(read($fh, my $byte, 1)) or croak "Couldn't read byte: $!";
468 406         4471 my $byte = eval { _safe_read($fh, 1) };
  406         1202  
469 406 100       1010 croak "Couldn't read byte: $@" if $@;
470              
471 403         668 $so_far .= $byte;
472              
473             # find matching BOMs
474 403         3154 my @possible = grep { $so_far eq substr($_, 0, $c) } keys %bom2enc;
  2015         15086  
475              
476 403 100 100     3845 if (@possible == 1 and my $enc = $bom2enc{$so_far}) {
    100          
477             # There's only one match, this must be it
478 96         1899 return ($enc, '');
479             }
480             elsif (@possible == 0) {
481             # might need to backtrack one byte
482 43         155 my $spill = chop $so_far;
483              
484 43 100       653 if (my $enc = $bom2enc{$so_far}) {
485 29         584 my $char_length = _get_char_length($enc, $spill);
486              
487 29         79 my $extra = eval {
488 29         87 _safe_read($fh, $char_length - length $spill);
489             };
490 29 50       99 croak "Coudln't read byte: $@" if $@;
491 29         41 $spill .= $extra;
492              
493 29         157 return ($enc, $spill);
494             }
495             else {
496             # no BOM
497 14         241 return ('', $so_far . $spill);
498             }
499             }
500             }
501             }
502              
503             sub _safe_read {
504 674     674   2460 my ($fh, $count) = @_;
505              
506             # read is supposed to return undef on error, but on some platforms it
507             # seems to just return 0 and set $!
508 674         6735 local $!;
509 674         6948 my $status = read($fh, my $out, $count);
510              
511 674 100 100     2037 die $! if !$status && $!;
512              
513 668         3553 return $out;
514             }
515              
516             =head2 get_encoding_from_bom
517              
518             ($encoding, $offset) = get_encoding_from_bom($string)
519              
520             Returns the encoding and length in bytes of the BOM in $string.
521              
522             If there is no BOM, an empty string is returned and $offset is zero.
523              
524             To get the data from the string, the following should work:
525              
526             use Encode;
527              
528             my($encoding, $offset) = get_encoding_from_bom($string);
529              
530             if ($encoding) {
531             $string = decode($encoding, substr($string, $offset))
532             }
533              
534             =cut
535              
536             sub get_encoding_from_bom ($) {
537 524     524 1 866 my $bom = shift;
538              
539 524         818 my $encoding = '';
540 524         657 my $offset = 0;
541              
542 524 100       1846 if (my($found) = $bom =~ $bom_re) {
543 434         5746 $encoding = $bom2enc{$found};
544 434         3047 $offset = length($found);
545             }
546              
547 524         2601 return ($encoding, $offset);
548             }
549              
550             # Internal:
551             # Work out character length for given encoding and spillage byte
552             sub _get_char_length ($$) {
553 34     34   311938 my($enc, $byte) = @_;
554              
555 34 100       627 if ($enc eq 'UTF-8') {
    100          
556 3 100       31 if (($byte & 0x80) == 0) {
557 1         14 return 1;
558             }
559             else {
560 2         15 my $length = 0;
561              
562 2         27 1 while (($byte << $length++) & 0xc0) == 0xc0;
563              
564 2         80 return $length;
565             }
566             }
567             elsif ($enc =~ /^UTF-(16|32)/) {
568 30         327 return $1 / 8;
569             }
570             else {
571 1         12 return;
572             }
573             }
574              
575             =head1 PerlIO::via interface
576              
577             File::BOM can be used as a PerlIO::via interface.
578              
579             open(HANDLE, '<:via(File::BOM)', 'my_file.txt');
580              
581             open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt)
582             print "foo\n"; # BOM is written to file here
583              
584             This method is less prone to errors on non-seekable files as spillage is
585             incorporated into an internal buffer, but it doesn't give you any information
586             about the encoding being used, or indeed whether or not a BOM
587             was present.
588              
589             There are a few known problems with this interface, especially surrounding
590             seek() and tell(), please see the BUGS section for more details about this.
591              
592             =head2 Reading
593              
594             The via(File::BOM) layer must be added before the handle is read from, otherwise
595             any BOM will be missed. If there is no BOM, no decoding will be done.
596              
597             Because of a limitation in PerlIO::via, read() always works on bytes, not characters. BOM decoding will still be done but output will be bytes of UTF-8.
598              
599             open(BOM, '<:via(File::BOM)', $file)
600             $bytes_read = read(BOM, $buffer, $length);
601             $unicode = decode('UTF-8', $buffer, Encode::FB_QUIET);
602              
603             # Now $unicode is valid unicode and $buffer contains any left-over bytes
604              
605             =head2 Writing
606              
607             Add the via(File::BOM) layer on top of a unicode encoding layer to print a BOM
608             at the start of the output file. This needs to be done before any data is
609             written. The BOM is written as part of the first print command on the handle, so
610             if you don't print anything to the handle, you won't get a BOM.
611              
612             There is a "Wide character in print" warning generated when the via(File::BOM)
613             layer doesn't receive utf8 on writing. This glitch was resolved in perl version
614             5.8.7, but if your perl version is older than that, you'll need to make sure
615             that the via(File::BOM) layer receives utf8 like this:
616              
617             # This works OK
618             open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename)
619              
620             # This generates warnings with older perls
621             open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename)
622              
623             =head2 Seeking
624              
625             Seeking with SEEK_SET results in an offset equal to the length of any detected
626             BOM being applied to the position parameter. Thus:
627              
628             # Seek to end of BOM (not start of file!)
629             seek(FILE_BOM_HANDLE, 0, SEEK_SET)
630              
631             =head2 Telling
632              
633             In order to work correctly with seek(), tell() also returns a postion adjusted
634             by the length of the BOM.
635              
636             =cut
637              
638 28 50   28 0 12318 sub PUSHED { bless({offset => 0}, $_[0]) || -1 }
639              
640             sub UTF8 {
641             # There is a bug with this method previous to 5.8.7
642              
643 28 50   28 0 91 if ($] >= 5.008007) {
644 28         1257 return 1;
645             }
646             else {
647 0         0 return 0;
648             }
649             }
650              
651             sub FILL {
652 28     28 0 4681 my($self, $fh) = @_;
653              
654 28         44 my $line;
655 28 100       74 if (not defined $self->{enc}) {
656 15         37 ($self->{enc}, my $spill) = get_encoding_from_filehandle($fh);
657              
658 15 100       40 if ($self->{enc} ne '') {
659 14     1   185 binmode($fh, ":encoding($self->{enc})");
  1         8  
  1         2  
  1         5  
660 14 50       1528 $line .= decode($self->{enc}, $spill, FB_CROAK) if $spill;
661              
662 14         66 $self->{offset} = length $enc2bom{$self->{enc}};
663             }
664              
665 15         343 $line .= <$fh>;
666             }
667             else {
668 13         61 $line = <$fh>;
669             }
670              
671 28         192 return $line;
672             }
673              
674             sub WRITE {
675 21     21   5653 my($self, $buf, $fh) = @_;
676              
677 21 100 66     420 if (tell $fh == 0 and not $self->{wrote_bom}) {
678 13         65 print $fh "\x{feff}";
679 13         28 $self->{wrote_bom} = 1;
680             }
681              
682 21         79 $buf = decode('UTF-8', $buf, FB_CROAK);
683              
684 21         1154 print $fh $buf;
685              
686 21         71 return 1;
687             }
688              
689 28     28 0 20199 sub FLUSH { 0 }
690              
691             sub SEEK {
692 2     2   13 my $self = shift;
693              
694 2         5 my($pos, $whence, $fh) = @_;
695              
696 2 50       7 if ($whence == SEEK_SET) {
697 2         4 $pos += $self->{offset};
698             }
699              
700 2 50       23 if (seek($fh, $pos, $whence)) {
701 2         9 return 0;
702             }
703             else {
704 0         0 return -1;
705             }
706             }
707              
708             sub TELL {
709 1     1   9 my($self, $fh) = @_;
710              
711 1         7 my $pos = tell $fh;
712              
713 1 50       6 if ($pos == -1) {
714 0         0 return -1;
715             }
716             else {
717 1         6 return $pos - $self->{offset};
718             }
719             }
720              
721             1;
722              
723             __END__