| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package File::BOM; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
File::BOM - Utilities for handling Byte Order Marks |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use File::BOM qw( :all ) |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head2 high-level functions |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# read a file with encoding from the BOM: |
|
14
|
|
|
|
|
|
|
open_bom(FH, $file) |
|
15
|
|
|
|
|
|
|
open_bom(FH, $file, ':utf8') # the same but with a default encoding |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# get encoding too |
|
18
|
|
|
|
|
|
|
$encoding = open_bom(FH, $file, ':utf8'); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# open a potentially unseekable file: |
|
21
|
|
|
|
|
|
|
($encoding, $spillage) = open_bom(FH, $file, ':utf8'); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# change encoding of an open handle according to BOM |
|
24
|
|
|
|
|
|
|
$encoding = defuse(*HANDLE); |
|
25
|
|
|
|
|
|
|
($encoding, $spillage) = defuse(*HANDLE); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Decode a string according to leading BOM: |
|
28
|
|
|
|
|
|
|
$unicode = decode_from_bom($string_with_bom); |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Decode a string and get the encoding: |
|
31
|
|
|
|
|
|
|
($unicode, $encoding) = decode_from_bom($string_with_bom) |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head2 PerlIO::via interface |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Read the Right Thing from a unicode file with BOM: |
|
36
|
|
|
|
|
|
|
open(HANDLE, '<:via(File::BOM)', $filename) |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Writing little-endian UTF-16 file with BOM: |
|
39
|
|
|
|
|
|
|
open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename) |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 lower-level functions |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# read BOM encoding from a filehandle: |
|
45
|
|
|
|
|
|
|
$encoding = get_encoding_from_filehandle(FH) |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Get encoding even if FH is unseekable: |
|
48
|
|
|
|
|
|
|
($encoding, $spillage) = get_encoding_from_filehandle(FH); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# Get encoding from a known unseekable handle: |
|
51
|
|
|
|
|
|
|
($encdoing, $spillage) = get_encoding_from_stream(FH); |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# get encoding and BOM length from BOM at start of string: |
|
54
|
|
|
|
|
|
|
($encoding, $offset) = get_encoding_from_bom($string); |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head2 variables |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# print a BOM for a known encoding |
|
59
|
|
|
|
|
|
|
print FH $enc2bom{$encoding}; |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# get an encoding from a known BOM |
|
62
|
|
|
|
|
|
|
$enc = $bom2enc{$bom} |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
This module provides functions for handling unicode byte order marks, which are |
|
67
|
|
|
|
|
|
|
to be found at the beginning of some files and streams. |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
For details about what a byte order mark is, see |
|
70
|
|
|
|
|
|
|
L |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
The intention of File::BOM is for files with BOMs to be readable as seamlessly |
|
73
|
|
|
|
|
|
|
as possible, regardless of the encoding used. To that end, several different |
|
74
|
|
|
|
|
|
|
interfaces are available, as shown in the synopsis above. |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
|
77
|
|
|
|
|
|
|
|
|
78
|
16
|
|
|
16
|
|
1774606
|
use strict; |
|
|
16
|
|
|
|
|
77
|
|
|
|
16
|
|
|
|
|
479
|
|
|
79
|
16
|
|
|
16
|
|
77
|
use warnings; |
|
|
16
|
|
|
|
|
32
|
|
|
|
16
|
|
|
|
|
549
|
|
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# We don't want any character semantics at all |
|
82
|
16
|
|
|
16
|
|
101
|
use bytes; |
|
|
16
|
|
|
|
|
19
|
|
|
|
16
|
|
|
|
|
76
|
|
|
83
|
|
|
|
|
|
|
|
|
84
|
16
|
|
|
16
|
|
434
|
use base qw( Exporter ); |
|
|
16
|
|
|
|
|
32
|
|
|
|
16
|
|
|
|
|
1220
|
|
|
85
|
|
|
|
|
|
|
|
|
86
|
16
|
|
|
16
|
|
9352
|
use Readonly; |
|
|
16
|
|
|
|
|
64405
|
|
|
|
16
|
|
|
|
|
889
|
|
|
87
|
|
|
|
|
|
|
|
|
88
|
16
|
|
|
16
|
|
124
|
use Carp qw( croak ); |
|
|
16
|
|
|
|
|
35
|
|
|
|
16
|
|
|
|
|
732
|
|
|
89
|
16
|
|
|
16
|
|
92
|
use Fcntl qw( :seek ); |
|
|
16
|
|
|
|
|
34
|
|
|
|
16
|
|
|
|
|
1629
|
|
|
90
|
16
|
|
|
16
|
|
108
|
use Encode qw( :DEFAULT :fallbacks is_utf8 ); |
|
|
16
|
|
|
|
|
35
|
|
|
|
16
|
|
|
|
|
2497
|
|
|
91
|
16
|
|
|
16
|
|
125
|
use Symbol qw( gensym qualify_to_ref ); |
|
|
16
|
|
|
|
|
28
|
|
|
|
16
|
|
|
|
|
34617
|
|
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
my @subs = qw( |
|
94
|
|
|
|
|
|
|
open_bom |
|
95
|
|
|
|
|
|
|
defuse |
|
96
|
|
|
|
|
|
|
decode_from_bom |
|
97
|
|
|
|
|
|
|
get_encoding_from_bom |
|
98
|
|
|
|
|
|
|
get_encoding_from_filehandle |
|
99
|
|
|
|
|
|
|
get_encoding_from_stream |
|
100
|
|
|
|
|
|
|
); |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
my @vars = qw( %bom2enc %enc2bom ); |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
our $VERSION = '0.18'; |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
our @EXPORT = (); |
|
107
|
|
|
|
|
|
|
our @EXPORT_OK = ( @subs, @vars ); |
|
108
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
|
109
|
|
|
|
|
|
|
all => \@EXPORT_OK, |
|
110
|
|
|
|
|
|
|
subs => \@subs, |
|
111
|
|
|
|
|
|
|
vars => \@vars |
|
112
|
|
|
|
|
|
|
); |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 EXPORTS |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Nothing by default. |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=head2 symbols |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=over 4 |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item * open_bom() |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=item * defuse() |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item * decode_from_bom() |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=item * get_encoding_from_filehandle() |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=item * get_encoding_from_stream() |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=item * get_encoding_from_bom() |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=item * %bom2enc |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=item * %enc2bom |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=back |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=head2 tags |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=over 4 |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item * :all |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
All of the above |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=item * :subs |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
subroutines only |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=item * :vars |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
just %bom2enc and %enc2bom |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=back |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=cut |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=head1 VARIABLES |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head2 %bom2enc |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Maps Byte Order marks to their encodings. |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
The keys of this hash are strings which represent the BOMs, the values are their |
|
167
|
|
|
|
|
|
|
encodings, in a format which is understood by L |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE, |
|
170
|
|
|
|
|
|
|
UTF-32BE and UTF-32LE |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head2 %enc2bom |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
A reverse-lookup hash for bom2enc, with a few aliases used in L, namely utf8, iso-10646-1 and UCS-2. |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly |
|
177
|
|
|
|
|
|
|
because Encode::encode automatically puts BOMs on output. See L |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=cut |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
our(%bom2enc, %enc2bom, $MAX_BOM_LENGTH, $bom_re); |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# length in bytes of the longest BOM |
|
184
|
|
|
|
|
|
|
$MAX_BOM_LENGTH = 4; |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
Readonly %bom2enc => ( |
|
187
|
|
|
|
|
|
|
map { encode($_, "\x{feff}") => $_ } qw( |
|
188
|
|
|
|
|
|
|
UTF-8 |
|
189
|
|
|
|
|
|
|
UTF-16BE |
|
190
|
|
|
|
|
|
|
UTF-16LE |
|
191
|
|
|
|
|
|
|
UTF-32BE |
|
192
|
|
|
|
|
|
|
UTF-32LE |
|
193
|
|
|
|
|
|
|
) |
|
194
|
|
|
|
|
|
|
); |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Readonly %enc2bom => ( |
|
197
|
|
|
|
|
|
|
reverse(%bom2enc), |
|
198
|
|
|
|
|
|
|
map { $_ => encode($_, "\x{feff}") } qw( |
|
199
|
|
|
|
|
|
|
UCS-2 |
|
200
|
|
|
|
|
|
|
iso-10646-1 |
|
201
|
|
|
|
|
|
|
utf8 |
|
202
|
|
|
|
|
|
|
) |
|
203
|
|
|
|
|
|
|
); |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
{ |
|
206
|
|
|
|
|
|
|
local $" = '|'; |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
my @bombs = sort { length $b <=> length $a } keys %bom2enc; |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
Readonly $MAX_BOM_LENGTH => length $bombs[0]; |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
Readonly $bom_re => qr/^(@bombs)/o; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head2 open_bom |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
$encoding = open_bom(HANDLE, $filename, $default_mode) |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode) |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
opens HANDLE for reading on $filename, setting the mode to the appropriate |
|
224
|
|
|
|
|
|
|
encoding for the BOM stored in the file. |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
On failure, a fatal error is raised, see the DIAGNOSTICS section for details on |
|
227
|
|
|
|
|
|
|
how to catch these. This is in order to allow the return value(s) to be used for |
|
228
|
|
|
|
|
|
|
other purposes. |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
If the file doesn't contain a BOM, $default_mode is used instead. Hence: |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
open_bom(FH, 'my_file.txt', ':utf8') |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
Opens my_file.txt for reading in an appropriate encoding found from the BOM in |
|
235
|
|
|
|
|
|
|
that file, or as a UTF-8 file if none is found. |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
In the absence of a $default_mode argument, the following 2 calls should be equivalent: |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
open_bom(FH, 'no_bom.txt'); |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
open(FH, '<', 'no_bom.txt'); |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
If an undefined value is passed as the handle, a symbol will be generated for it |
|
244
|
|
|
|
|
|
|
like open() does: |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# create filehandle on the fly |
|
247
|
|
|
|
|
|
|
$enc = open_bom(my $fh, $filename, ':utf8'); |
|
248
|
|
|
|
|
|
|
$line = <$fh>; |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
The filehandle will be cued up to read after the BOM. Unseekable files (e.g. |
|
251
|
|
|
|
|
|
|
fifos) will cause croaking, unless called in list context to catch spillage |
|
252
|
|
|
|
|
|
|
from the handle. Any spillage will be automatically decoded from the encoding, |
|
253
|
|
|
|
|
|
|
if found. |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
e.g. |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# croak if my_socket is unseekable |
|
258
|
|
|
|
|
|
|
open_bom(FH, 'my_socket'); |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# keep spillage if my_socket is unseekable |
|
261
|
|
|
|
|
|
|
($encoding, $spillage) = open_bom(FH, 'my_socket'); |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# discard any spillage from open_bom |
|
264
|
|
|
|
|
|
|
($encoding) = open_bom(FH, 'my_socket'); |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub open_bom (*$;$) { |
|
269
|
120
|
|
|
120
|
1
|
8412354
|
my($fh, $filename, $mode) = @_; |
|
270
|
120
|
100
|
|
|
|
741
|
if (defined $fh) { |
|
271
|
72
|
|
|
|
|
300
|
$fh = qualify_to_ref($fh, caller); |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
else { |
|
274
|
48
|
|
|
|
|
1605
|
$fh = $_[0] = gensym(); |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
|
|
277
|
120
|
|
|
|
|
5568
|
my $enc; |
|
278
|
120
|
|
|
|
|
807
|
my $spill = ''; |
|
279
|
|
|
|
|
|
|
|
|
280
|
120
|
100
|
|
|
|
40036
|
open($fh, '<', $filename) |
|
281
|
|
|
|
|
|
|
or croak "Couldn't read '$filename': $!"; |
|
282
|
|
|
|
|
|
|
|
|
283
|
111
|
100
|
|
|
|
594
|
if (wantarray) { |
|
284
|
38
|
|
|
|
|
891
|
($enc, $spill) = get_encoding_from_filehandle($fh); |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
else { |
|
287
|
73
|
|
|
|
|
226
|
$enc = get_encoding_from_filehandle($fh); |
|
288
|
|
|
|
|
|
|
} |
|
289
|
|
|
|
|
|
|
|
|
290
|
111
|
100
|
|
|
|
301
|
if ($enc) { |
|
291
|
95
|
|
|
|
|
498
|
$mode = ":encoding($enc)"; |
|
292
|
|
|
|
|
|
|
|
|
293
|
95
|
100
|
|
|
|
577
|
$spill = decode($enc, $spill, FB_CROAK) if $spill; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
111
|
100
|
|
|
|
1708
|
if ($mode) { |
|
297
|
13
|
100
|
|
13
|
|
125
|
binmode($fh, $mode) |
|
|
13
|
|
|
|
|
31
|
|
|
|
13
|
|
|
|
|
141
|
|
|
|
96
|
|
|
|
|
2338
|
|
|
298
|
|
|
|
|
|
|
or croak "Couldn't set binmode of handle opened on '$filename' " |
|
299
|
|
|
|
|
|
|
. "to '$mode': $!"; |
|
300
|
|
|
|
|
|
|
} |
|
301
|
|
|
|
|
|
|
|
|
302
|
110
|
100
|
|
|
|
19786
|
return wantarray ? ($enc, $spill) : $enc; |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=head2 defuse |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
$enc = defuse(FH); |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
($enc, $spill) = defuse(FH); |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
FH should be a filehandle opened for reading, it will have the relevant encoding |
|
312
|
|
|
|
|
|
|
layer pushed onto it be binmode if a BOM is found. Spillage should be Unicode, |
|
313
|
|
|
|
|
|
|
not bytes. |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
Any uncaptured spillage will be silently lost. If the handle is unseekable, use |
|
316
|
|
|
|
|
|
|
list context to avoid data loss. |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
If no BOM is found, the mode will be unaffected. |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub defuse (*) { |
|
323
|
102
|
|
|
102
|
1
|
9070092
|
my $fh = qualify_to_ref(shift, caller); |
|
324
|
|
|
|
|
|
|
|
|
325
|
102
|
|
|
|
|
2630
|
my($enc, $spill) = get_encoding_from_filehandle($fh); |
|
326
|
|
|
|
|
|
|
|
|
327
|
102
|
100
|
|
|
|
328
|
if ($enc) { |
|
328
|
90
|
|
|
|
|
2204
|
binmode($fh, ":encoding($enc)"); |
|
329
|
90
|
100
|
|
|
|
7942
|
$spill = decode($enc, $spill, FB_CROAK) if $spill; |
|
330
|
|
|
|
|
|
|
} |
|
331
|
|
|
|
|
|
|
|
|
332
|
102
|
100
|
|
|
|
1432
|
return wantarray ? ($enc, $spill) : $enc; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=head2 decode_from_bom |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
$unicode_string = decode_from_bom($string, $default, $check) |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
($unicode_string, $encoding) = decode_from_bom($string, $default, $check) |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
Reads a BOM from the beginning of $string, decodes $string (minus the BOM) and |
|
342
|
|
|
|
|
|
|
returns it to you as a perl unicode string. |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
if $string doesn't have a BOM, $default is used instead. |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
$check, if supplied, is passed to Encode::decode as the third argument. |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
If there's no BOM and no default, the original string is returned and encoding |
|
349
|
|
|
|
|
|
|
is ''. |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
See L |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=cut |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
sub decode_from_bom ($;$$) { |
|
356
|
219
|
|
|
219
|
1
|
194874
|
my($string, $default, $check) = @_; |
|
357
|
|
|
|
|
|
|
|
|
358
|
219
|
100
|
|
|
|
594
|
croak "No string" unless defined $string; |
|
359
|
|
|
|
|
|
|
|
|
360
|
216
|
|
|
|
|
420
|
my($enc, $off) = get_encoding_from_bom($string); |
|
361
|
216
|
|
100
|
|
|
588
|
$enc ||= $default; |
|
362
|
|
|
|
|
|
|
|
|
363
|
216
|
|
|
|
|
264
|
my $out; |
|
364
|
216
|
100
|
|
|
|
468
|
if (defined $enc) { |
|
365
|
204
|
|
|
|
|
720
|
$out = decode($enc, substr($string, $off), $check); |
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
else { |
|
368
|
12
|
|
|
|
|
36
|
$out = $string; |
|
369
|
12
|
|
|
|
|
36
|
$enc = ''; |
|
370
|
|
|
|
|
|
|
} |
|
371
|
|
|
|
|
|
|
|
|
372
|
216
|
100
|
|
|
|
11100
|
return wantarray ? ($out, $enc) : $out; |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=head2 get_encoding_from_filehandle |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
$encoding = get_encoding_from_filehandle(HANDLE) |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
($encoding, $spillage) = get_encoding_from_filehandle(HANDLE) |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
Returns the encoding found in the given filehandle. |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
The handle should be opened in a non-unicode way (e.g. mode '<:bytes') so that |
|
384
|
|
|
|
|
|
|
the BOM can be read in its natural state. |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
After calling, the handle will be set to read at a point after the BOM (or at |
|
387
|
|
|
|
|
|
|
the beginning of the file if no BOM was found) |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
If called in scalar context, unseekable handles cause a croak(). |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
If called in list context, unseekable handles will be read byte-by-byte and any |
|
392
|
|
|
|
|
|
|
spillage will be returned. See get_encoding_from_stream() |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=cut |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
sub get_encoding_from_filehandle (*) { |
|
397
|
303
|
|
|
303
|
1
|
100130
|
my $fh = qualify_to_ref(shift, caller); |
|
398
|
|
|
|
|
|
|
|
|
399
|
303
|
|
|
|
|
4227
|
my $enc; |
|
400
|
303
|
|
|
|
|
999
|
my $spill = ''; |
|
401
|
303
|
100
|
|
|
|
2872
|
if (seek($fh, 0, SEEK_SET)) { |
|
|
|
100
|
|
|
|
|
|
|
402
|
234
|
|
|
|
|
1198
|
$enc = _get_encoding_seekable($fh); |
|
403
|
|
|
|
|
|
|
} |
|
404
|
|
|
|
|
|
|
elsif (wantarray) { |
|
405
|
67
|
|
|
|
|
1008
|
($enc, $spill) = _get_encoding_unseekable($fh); |
|
406
|
|
|
|
|
|
|
} |
|
407
|
|
|
|
|
|
|
else { |
|
408
|
2
|
|
|
|
|
58
|
croak "Unseekable handle: $!"; |
|
409
|
|
|
|
|
|
|
} |
|
410
|
|
|
|
|
|
|
|
|
411
|
301
|
100
|
|
|
|
1564
|
return wantarray ? ($enc, $spill) : $enc; |
|
412
|
|
|
|
|
|
|
} |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head2 get_encoding_from_stream |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
($encoding, $spillage) = get_encoding_from_stream(*FH); |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
Read a BOM from an unrewindable source. This means reading the stream one byte |
|
419
|
|
|
|
|
|
|
at a time until either a BOM is found or every possible BOM is ruled out. Any |
|
420
|
|
|
|
|
|
|
non-BOM bytes read from the handle will be returned in $spillage. |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
If a BOM is found and the spillage contains a partial character (judging by the |
|
423
|
|
|
|
|
|
|
expected character width for the encoding) more bytes will be read from the |
|
424
|
|
|
|
|
|
|
handle to ensure that a complete character is returned. |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
Spillage is always in bytes, not characters. |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
This function is less efficient than get_encoding_from_filehandle, but should |
|
429
|
|
|
|
|
|
|
work just as well on a seekable handle as on an unseekable one. |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=cut |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
sub get_encoding_from_stream (*) { |
|
434
|
72
|
|
|
72
|
1
|
75012
|
my $fh = qualify_to_ref(shift, caller); |
|
435
|
|
|
|
|
|
|
|
|
436
|
72
|
|
|
|
|
1740
|
_get_encoding_unseekable($fh); |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
# internal: |
|
440
|
|
|
|
|
|
|
# |
|
441
|
|
|
|
|
|
|
# Return encoding and seek to position after BOM |
|
442
|
|
|
|
|
|
|
sub _get_encoding_seekable (*) { |
|
443
|
239
|
|
|
239
|
|
8899
|
my $fh = shift; |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# This doesn't work on all platforms: |
|
446
|
|
|
|
|
|
|
# defined(read($fh, my $bom, $MAX_BOM_LENGTH)) |
|
447
|
|
|
|
|
|
|
# or croak "Couldn't read from handle: $!"; |
|
448
|
|
|
|
|
|
|
|
|
449
|
239
|
|
|
|
|
404
|
my $bom = eval { _safe_read($fh, $MAX_BOM_LENGTH) }; |
|
|
239
|
|
|
|
|
513
|
|
|
450
|
239
|
100
|
|
|
|
539
|
croak "Couldn't read from handle: $@" if $@; |
|
451
|
|
|
|
|
|
|
|
|
452
|
236
|
|
|
|
|
553
|
my($enc, $off) = get_encoding_from_bom($bom); |
|
453
|
|
|
|
|
|
|
|
|
454
|
236
|
100
|
|
|
|
2872
|
seek($fh, $off, SEEK_SET) or croak "Couldn't reset read position: $!"; |
|
455
|
|
|
|
|
|
|
|
|
456
|
234
|
|
|
|
|
782
|
return $enc; |
|
457
|
|
|
|
|
|
|
} |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
# internal: |
|
460
|
|
|
|
|
|
|
# |
|
461
|
|
|
|
|
|
|
# Return encoding and non-BOM overspill |
|
462
|
|
|
|
|
|
|
sub _get_encoding_unseekable (*) { |
|
463
|
142
|
|
|
142
|
|
2842
|
my $fh = shift; |
|
464
|
|
|
|
|
|
|
|
|
465
|
142
|
|
|
|
|
1046
|
my $so_far = ''; |
|
466
|
142
|
|
|
|
|
3908
|
for my $c (1 .. $MAX_BOM_LENGTH) { |
|
467
|
|
|
|
|
|
|
# defined(read($fh, my $byte, 1)) or croak "Couldn't read byte: $!"; |
|
468
|
406
|
|
|
|
|
5323
|
my $byte = eval { _safe_read($fh, 1) }; |
|
|
406
|
|
|
|
|
877
|
|
|
469
|
406
|
100
|
|
|
|
932
|
croak "Couldn't read byte: $@" if $@; |
|
470
|
|
|
|
|
|
|
|
|
471
|
403
|
|
|
|
|
655
|
$so_far .= $byte; |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
# find matching BOMs |
|
474
|
403
|
|
|
|
|
3183
|
my @possible = grep { $so_far eq substr($_, 0, $c) } keys %bom2enc; |
|
|
2015
|
|
|
|
|
14723
|
|
|
475
|
|
|
|
|
|
|
|
|
476
|
403
|
100
|
100
|
|
|
3918
|
if (@possible == 1 and my $enc = $bom2enc{$so_far}) { |
|
|
|
100
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# There's only one match, this must be it |
|
478
|
96
|
|
|
|
|
2376
|
return ($enc, ''); |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
elsif (@possible == 0) { |
|
481
|
|
|
|
|
|
|
# might need to backtrack one byte |
|
482
|
43
|
|
|
|
|
356
|
my $spill = chop $so_far; |
|
483
|
|
|
|
|
|
|
|
|
484
|
43
|
100
|
|
|
|
647
|
if (my $enc = $bom2enc{$so_far}) { |
|
485
|
29
|
|
|
|
|
796
|
my $char_length = _get_char_length($enc, $spill); |
|
486
|
|
|
|
|
|
|
|
|
487
|
29
|
|
|
|
|
87
|
my $extra = eval { |
|
488
|
29
|
|
|
|
|
128
|
_safe_read($fh, $char_length - length $spill); |
|
489
|
|
|
|
|
|
|
}; |
|
490
|
29
|
50
|
|
|
|
111
|
croak "Coudln't read byte: $@" if $@; |
|
491
|
29
|
|
|
|
|
70
|
$spill .= $extra; |
|
492
|
|
|
|
|
|
|
|
|
493
|
29
|
|
|
|
|
301
|
return ($enc, $spill); |
|
494
|
|
|
|
|
|
|
} |
|
495
|
|
|
|
|
|
|
else { |
|
496
|
|
|
|
|
|
|
# no BOM |
|
497
|
14
|
|
|
|
|
235
|
return ('', $so_far . $spill); |
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
} |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
sub _safe_read { |
|
504
|
674
|
|
|
674
|
|
2355
|
my ($fh, $count) = @_; |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# read is supposed to return undef on error, but on some platforms it |
|
507
|
|
|
|
|
|
|
# seems to just return 0 and set $! |
|
508
|
674
|
|
|
|
|
6486
|
local $!; |
|
509
|
674
|
|
|
|
|
7062
|
my $status = read($fh, my $out, $count); |
|
510
|
|
|
|
|
|
|
|
|
511
|
674
|
100
|
100
|
|
|
1966
|
die $! if !$status && $!; |
|
512
|
|
|
|
|
|
|
|
|
513
|
668
|
|
|
|
|
3216
|
return $out; |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=head2 get_encoding_from_bom |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
($encoding, $offset) = get_encoding_from_bom($string) |
|
519
|
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
Returns the encoding and length in bytes of the BOM in $string. |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
If there is no BOM, an empty string is returned and $offset is zero. |
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
To get the data from the string, the following should work: |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
use Encode; |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
my($encoding, $offset) = get_encoding_from_bom($string); |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
if ($encoding) { |
|
531
|
|
|
|
|
|
|
$string = decode($encoding, substr($string, $offset)) |
|
532
|
|
|
|
|
|
|
} |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=cut |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
sub get_encoding_from_bom ($) { |
|
537
|
524
|
|
|
524
|
1
|
1011
|
my $bom = shift; |
|
538
|
|
|
|
|
|
|
|
|
539
|
524
|
|
|
|
|
742
|
my $encoding = ''; |
|
540
|
524
|
|
|
|
|
902
|
my $offset = 0; |
|
541
|
|
|
|
|
|
|
|
|
542
|
524
|
100
|
|
|
|
1870
|
if (my($found) = $bom =~ $bom_re) { |
|
543
|
434
|
|
|
|
|
5980
|
$encoding = $bom2enc{$found}; |
|
544
|
434
|
|
|
|
|
3172
|
$offset = length($found); |
|
545
|
|
|
|
|
|
|
} |
|
546
|
|
|
|
|
|
|
|
|
547
|
524
|
|
|
|
|
2041
|
return ($encoding, $offset); |
|
548
|
|
|
|
|
|
|
} |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
# Internal: |
|
551
|
|
|
|
|
|
|
# Work out character length for given encoding and spillage byte |
|
552
|
|
|
|
|
|
|
sub _get_char_length ($$) { |
|
553
|
34
|
|
|
34
|
|
292823
|
my($enc, $byte) = @_; |
|
554
|
|
|
|
|
|
|
|
|
555
|
34
|
100
|
|
|
|
570
|
if ($enc eq 'UTF-8') { |
|
|
|
100
|
|
|
|
|
|
|
556
|
3
|
100
|
|
|
|
28
|
if (($byte & 0x80) == 0) { |
|
557
|
1
|
|
|
|
|
5
|
return 1; |
|
558
|
|
|
|
|
|
|
} |
|
559
|
|
|
|
|
|
|
else { |
|
560
|
2
|
|
|
|
|
14
|
my $length = 0; |
|
561
|
|
|
|
|
|
|
|
|
562
|
2
|
|
|
|
|
24
|
1 while (($byte << $length++) & 0xc0) == 0xc0; |
|
563
|
|
|
|
|
|
|
|
|
564
|
2
|
|
|
|
|
87
|
return $length; |
|
565
|
|
|
|
|
|
|
} |
|
566
|
|
|
|
|
|
|
} |
|
567
|
|
|
|
|
|
|
elsif ($enc =~ /^UTF-(16|32)/) { |
|
568
|
30
|
|
|
|
|
454
|
return $1 / 8; |
|
569
|
|
|
|
|
|
|
} |
|
570
|
|
|
|
|
|
|
else { |
|
571
|
1
|
|
|
|
|
5
|
return; |
|
572
|
|
|
|
|
|
|
} |
|
573
|
|
|
|
|
|
|
} |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=head1 PerlIO::via interface |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
File::BOM can be used as a PerlIO::via interface. |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
open(HANDLE, '<:via(File::BOM)', 'my_file.txt'); |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt'); |
|
582
|
|
|
|
|
|
|
print "foo\n"; # BOM is written to file here |
|
583
|
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
This method is less prone to errors on non-seekable files as spillage is |
|
585
|
|
|
|
|
|
|
incorporated into an internal buffer, but it doesn't give you any information |
|
586
|
|
|
|
|
|
|
about the encoding being used, or indeed whether or not a BOM |
|
587
|
|
|
|
|
|
|
was present. |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
There are a few known problems with this interface, especially surrounding |
|
590
|
|
|
|
|
|
|
seek() and tell(), please see the BUGS section for more details about this. |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
=head2 Reading |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
The via(File::BOM) layer must be added before the handle is read from, otherwise |
|
595
|
|
|
|
|
|
|
any BOM will be missed. If there is no BOM, no decoding will be done. |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
Because of a limitation in PerlIO::via, read() always works on bytes, not characters. BOM decoding will still be done but output will be bytes of UTF-8. |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
open(BOM, '<:via(File::BOM)', $file); |
|
600
|
|
|
|
|
|
|
$bytes_read = read(BOM, $buffer, $length); |
|
601
|
|
|
|
|
|
|
$unicode = decode('UTF-8', $buffer, Encode::FB_QUIET); |
|
602
|
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
# Now $unicode is valid unicode and $buffer contains any left-over bytes |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head2 Writing |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Add the via(File::BOM) layer on top of a unicode encoding layer to print a BOM |
|
608
|
|
|
|
|
|
|
at the start of the output file. This needs to be done before any data is |
|
609
|
|
|
|
|
|
|
written. The BOM is written as part of the first print command on the handle, so |
|
610
|
|
|
|
|
|
|
if you don't print anything to the handle, you won't get a BOM. |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
There is a "Wide character in print" warning generated when the via(File::BOM) |
|
613
|
|
|
|
|
|
|
layer doesn't receive utf8 on writing. This glitch was resolved in perl version |
|
614
|
|
|
|
|
|
|
5.8.7, but if your perl version is older than that, you'll need to make sure |
|
615
|
|
|
|
|
|
|
that the via(File::BOM) layer receives utf8 like this: |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# This works OK |
|
618
|
|
|
|
|
|
|
open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename) |
|
619
|
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
# This generates warnings with older perls |
|
621
|
|
|
|
|
|
|
open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename) |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=head2 Seeking |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
Seeking with SEEK_SET results in an offset equal to the length of any detected |
|
626
|
|
|
|
|
|
|
BOM being applied to the position parameter. Thus: |
|
627
|
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
# Seek to end of BOM (not start of file!) |
|
629
|
|
|
|
|
|
|
seek(FILE_BOM_HANDLE, 0, SEEK_SET) |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
=head2 Telling |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
In order to work correctly with seek(), tell() also returns a postion adjusted |
|
634
|
|
|
|
|
|
|
by the length of the BOM. |
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=cut |
|
637
|
|
|
|
|
|
|
|
|
638
|
28
|
50
|
|
28
|
0
|
12232
|
sub PUSHED { bless({offset => 0}, $_[0]) || -1 } |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
sub UTF8 { |
|
641
|
|
|
|
|
|
|
# There is a bug with this method previous to 5.8.7 |
|
642
|
|
|
|
|
|
|
|
|
643
|
28
|
50
|
|
28
|
0
|
84
|
if ($] >= 5.008007) { |
|
644
|
28
|
|
|
|
|
1187
|
return 1; |
|
645
|
|
|
|
|
|
|
} |
|
646
|
|
|
|
|
|
|
else { |
|
647
|
0
|
|
|
|
|
0
|
return 0; |
|
648
|
|
|
|
|
|
|
} |
|
649
|
|
|
|
|
|
|
} |
|
650
|
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
sub FILL { |
|
652
|
28
|
|
|
28
|
0
|
4703
|
my($self, $fh) = @_; |
|
653
|
|
|
|
|
|
|
|
|
654
|
28
|
|
|
|
|
41
|
my $line; |
|
655
|
28
|
100
|
|
|
|
72
|
if (not defined $self->{enc}) { |
|
656
|
15
|
|
|
|
|
31
|
($self->{enc}, my $spill) = get_encoding_from_filehandle($fh); |
|
657
|
|
|
|
|
|
|
|
|
658
|
15
|
100
|
|
|
|
36
|
if ($self->{enc} ne '') { |
|
659
|
14
|
|
|
1
|
|
181
|
binmode($fh, ":encoding($self->{enc})"); |
|
|
1
|
|
|
|
|
8
|
|
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
5
|
|
|
660
|
14
|
50
|
|
|
|
1458
|
$line .= decode($self->{enc}, $spill, FB_CROAK) if $spill; |
|
661
|
|
|
|
|
|
|
|
|
662
|
14
|
|
|
|
|
65
|
$self->{offset} = length $enc2bom{$self->{enc}}; |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
|
|
665
|
15
|
|
|
|
|
352
|
$line .= <$fh>; |
|
666
|
|
|
|
|
|
|
} |
|
667
|
|
|
|
|
|
|
else { |
|
668
|
13
|
|
|
|
|
63
|
$line = <$fh>; |
|
669
|
|
|
|
|
|
|
} |
|
670
|
|
|
|
|
|
|
|
|
671
|
28
|
|
|
|
|
183
|
return $line; |
|
672
|
|
|
|
|
|
|
} |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
sub WRITE { |
|
675
|
21
|
|
|
21
|
|
5573
|
my($self, $buf, $fh) = @_; |
|
676
|
|
|
|
|
|
|
|
|
677
|
21
|
100
|
66
|
|
|
355
|
if (tell $fh == 0 and not $self->{wrote_bom}) { |
|
678
|
13
|
|
|
|
|
63
|
print $fh "\x{feff}"; |
|
679
|
13
|
|
|
|
|
25
|
$self->{wrote_bom} = 1; |
|
680
|
|
|
|
|
|
|
} |
|
681
|
|
|
|
|
|
|
|
|
682
|
21
|
|
|
|
|
74
|
$buf = decode('UTF-8', $buf, FB_CROAK); |
|
683
|
|
|
|
|
|
|
|
|
684
|
21
|
|
|
|
|
1189
|
print $fh $buf; |
|
685
|
|
|
|
|
|
|
|
|
686
|
21
|
|
|
|
|
72
|
return 1; |
|
687
|
|
|
|
|
|
|
} |
|
688
|
|
|
|
|
|
|
|
|
689
|
28
|
|
|
28
|
0
|
12075
|
sub FLUSH { 0 } |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
sub SEEK { |
|
692
|
2
|
|
|
2
|
|
14
|
my $self = shift; |
|
693
|
|
|
|
|
|
|
|
|
694
|
2
|
|
|
|
|
4
|
my($pos, $whence, $fh) = @_; |
|
695
|
|
|
|
|
|
|
|
|
696
|
2
|
50
|
|
|
|
7
|
if ($whence == SEEK_SET) { |
|
697
|
2
|
|
|
|
|
4
|
$pos += $self->{offset}; |
|
698
|
|
|
|
|
|
|
} |
|
699
|
|
|
|
|
|
|
|
|
700
|
2
|
50
|
|
|
|
24
|
if (seek($fh, $pos, $whence)) { |
|
701
|
2
|
|
|
|
|
9
|
return 0; |
|
702
|
|
|
|
|
|
|
} |
|
703
|
|
|
|
|
|
|
else { |
|
704
|
0
|
|
|
|
|
0
|
return -1; |
|
705
|
|
|
|
|
|
|
} |
|
706
|
|
|
|
|
|
|
} |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
sub TELL { |
|
709
|
1
|
|
|
1
|
|
9
|
my($self, $fh) = @_; |
|
710
|
|
|
|
|
|
|
|
|
711
|
1
|
|
|
|
|
7
|
my $pos = tell $fh; |
|
712
|
|
|
|
|
|
|
|
|
713
|
1
|
50
|
|
|
|
5
|
if ($pos == -1) { |
|
714
|
0
|
|
|
|
|
0
|
return -1; |
|
715
|
|
|
|
|
|
|
} |
|
716
|
|
|
|
|
|
|
else { |
|
717
|
1
|
|
|
|
|
5
|
return $pos - $self->{offset}; |
|
718
|
|
|
|
|
|
|
} |
|
719
|
|
|
|
|
|
|
} |
|
720
|
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
1; |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
__END__ |