File Coverage

blib/lib/File/LoadLines.pm
Criterion Covered Total %
statement 91 146 62.3
branch 57 98 58.1
condition 16 32 50.0
subroutine 8 8 100.0
pod 2 2 100.0
total 174 286 60.8


line stmt bran cond sub pod time code
1             #! perl
2              
3             package File::LoadLines;
4              
5 14     14   1819441 use warnings;
  14         32  
  14         916  
6 14     14   149 use strict;
  14         35  
  14         436  
7 14     14   71 use Exporter qw(import);
  14         48  
  14         1064  
8             our @EXPORT = qw( loadlines );
9             our @EXPORT_OK = qw( loadblob );
10 14     14   6588 use Encode;
  14         260786  
  14         1608  
11 14     14   129 use Carp;
  14         37  
  14         1058  
12 14     14   681 use utf8;
  14         388  
  14         130  
13              
14             =head1 NAME
15              
16             File::LoadLines - Load lines from files and network
17              
18             =cut
19              
20             our $VERSION = '1.047';
21              
22             =head1 SYNOPSIS
23              
24             use File::LoadLines;
25             my @lines = loadlines("mydata.txt");
26              
27             use File::LoadLines qw(loadblob);
28             my $img = loadblob("https://img.shields.io/badge/Language-Perl-blue");
29              
30             =head1 DESCRIPTION
31              
32             File::LoadLines provides an easy way to load the contents of a text
33             file into an array of lines. It is intended for small to moderate size files
34             like config files that are often produced by weird tools (and users).
35              
36             It will transparently fetch data from the network if the provided file
37             name is a URL.
38              
39             File::LoadLines automatically handles ASCII, Latin-1 and UTF-8 text.
40             When the file has a BOM, it handles UTF-8, UTF-16 LE and BE, and
41             UTF-32 LE and BE.
42              
43             Recognized line terminators are NL (Unix, Linux), CRLF (DOS, Windows)
44             and CR (Mac)
45              
46             Function loadblob(), exported on depand, fetches the content and
47             returns it without processing, equivalent to File::Slurp and ilk.
48              
49             =head1 EXPORT
50              
51             By default the function loadlines() is exported.
52              
53             =head1 FUNCTIONS
54              
55             =head2 loadlines
56              
57             @lines = loadlines("mydata.txt");
58             @lines = loadlines("mydata.txt", $options);
59              
60             The file is opened, read, decoded and split into lines
61             that are returned in the result array. Line terminators are removed.
62              
63             In scalar context, returns an array reference.
64              
65             The first argument may be the name of a file, an opened file handle,
66             or a reference to a string that contains the data.
67             The name of a file on disk may start with C<"file://">, this is ignored.
68             If the name starts with C<"http:"> or C<"https:"> the data will be
69             retrieved using LWP.
70             L like C<"data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="> are
71             also supported.
72              
73             The second argument can be used to influence the behaviour.
74             It is a hash reference of option settings.
75              
76             Note that loadlines() is a I, it reads the whole file into
77             memory and, for splitting, requires temporarily memory for twice the
78             size of the file.
79              
80             =over
81              
82             =item split
83              
84             Enabled by default.
85              
86             The data is split into lines and returned as an array (in list
87             context) or as an array reference (in scalar context).
88              
89             If set to zero, the data is not split into lines but returned as a
90             single string.
91              
92             =item chomp
93              
94             Enabled by default.
95              
96             Line terminators are removed from the resultant lines.
97              
98             If set to zero, the line terminators are not removed.
99              
100             =item encoding
101              
102             If specified, loadlines() will use this encoding to decode the file
103             data if it cannot automatically detect the encoding.
104              
105             If you pass an options hash, File::LoadLines will set C to
106             the encoding it detected and used for this file data.
107              
108             =item blob
109              
110             If specified, the data read is not touched but returned exactly as read.
111              
112             C overrules C and C.
113              
114             =item fail
115              
116             If specified, it should be either C<"hard"> or C<"soft">.
117              
118             If C<"hard">, read errors are signalled using croak exceptions.
119             This is the default.
120              
121             If set to C<"soft">, loadlines() will return an empty result and set
122             the error message in the options hash with key C<"error">.
123              
124             =back
125              
126             =cut
127              
128             sub loadlines {
129 70     70 1 2398047 my ( $filename, $options ) = @_;
130 70 100       341 croak("Missing filename.\n") unless defined $filename;
131 69 100 100     481 croak("Invalid options.\n") if (defined $options && (ref($options) ne "HASH"));
132              
133 68   100     408 $options->{blob} //= 0;
134 68   33     588 $options->{split} //= !$options->{blob};
135 68   66     455 $options->{chomp} //= !$options->{blob};
136 68   100     295 $options->{fail} //= "hard";
137              
138 68         140 my $data; # slurped file data
139             my $encoded; # already encoded
140              
141             # Gather data from the input.
142 68 100       523 if ( ref($filename) ) {
    50          
    50          
    50          
143 7 50 33     34 if ( ref($filename) eq 'GLOB' || ref($filename) eq 'IO::File' ) {
144 0         0 binmode( $filename, ':raw' );
145 0         0 $data = do { local $/; <$filename> };
  0         0  
  0         0  
146 0         0 $filename = "__GLOB__";
147             }
148             else {
149 7         14 $data = $$filename;
150 7         14 $filename = "__STRING__";
151 7         13 $encoded++;
152             }
153             }
154             elsif ( $filename eq '-' ) {
155 0         0 $filename = "__STDIN__";
156 0         0 binmode( STDIN, ':raw' );
157 0         0 $data = do { local $/; };
  0         0  
  0         0  
158             }
159             elsif ( $filename =~ /^https?:/ ) {
160 0         0 require LWP::UserAgent;
161 0         0 my $ua = LWP::UserAgent->new( timeout => 20 );
162 0         0 my $res = $ua->get($filename);
163 0 0       0 if ( $res->is_success ) {
    0          
164 0         0 $data = $res->decoded_content;
165             }
166             elsif ( $options->{fail} eq "soft" ) {
167 0         0 $options->{error} = $res->status_line;
168 0         0 return;
169             }
170             else {
171 0         0 croak("$filename: ", $res->status_line);
172             }
173             }
174             elsif ( $filename =~ /^data:/ ) {
175 0 0       0 unless ( $filename =~ m! ^ data:
176             (? .*? )
177             ,
178             (? .* ) $
179             !sx ) {
180 0 0       0 if ( $options->{fail} eq "soft" ) {
181 0         0 $options->{error} = "Malformed inline data";
182 0         0 return;
183             }
184             else {
185 0         0 croak("Malformed inline data");
186             }
187             }
188 0         0 $data = $+{data};
189 0         0 $filename = "__DATA__";
190 0         0 my $mediatype = $+{mediatype};
191 0         0 my $enc = "";
192 0 0 0     0 if ( $mediatype && $mediatype =~ /^(.*);base64$/ ) {
193 0         0 $mediatype = $1;
194 0         0 $enc = "base64";
195             }
196 0 0       0 $options->{mediatype} = $mediatype if $mediatype;
197 0 0       0 if ( ! $enc ) {
198             # URL encoded.
199 0         0 $data = $+{data};
200 0         0 $data =~ s/\%([0-9a-f][0-9a-f])/chr(hex($1))/ige;
  0         0  
201             }
202             else {
203             # Base64.
204 0         0 require MIME::Base64;
205 0         0 $data = MIME::Base64::decode($data);
206             }
207 0 0 0     0 if ( $mediatype && $mediatype =~ /;charset=([^;]*)/ ) {
208 0         0 $data = decode( $1, $data );
209 0         0 $options->{encoding} = $1;
210 0         0 $encoded++;
211             }
212             }
213             else {
214 61         133 my $name = $filename;
215 61         134 $name =~ s;^file://;;;
216 61         771 $filename = decode_utf8($name);
217             # On MS Windows, non-latin (wide) filenames need special treatment.
218 61 50 66     604 if ( $filename ne $name && $^O =~ /mswin/i ) {
219 0         0 require Win32API::File;
220 0         0 my $fn = encode('UTF-16LE', "$filename").chr(0).chr(0);
221 0         0 my $fh = Win32API::File::CreateFileW
222             ( $fn, Win32API::File::FILE_READ_DATA(), 0, [],
223             Win32API::File::OPEN_EXISTING(), 0, []);
224 0 0       0 croak("$filename: $^E (Win32)\n") if $^E;
225 0 0       0 unless ( Win32API::File::OsFHandleOpen( 'FILE', $fh, "r") ) {
226 0 0       0 $options->{error} = "$!", return if $options->{fail} eq "soft";
227 0         0 croak("$filename: $!\n");
228             }
229 0         0 binmode FILE => ':raw';
230 0         0 $data = do { local $/; readline(\*FILE) };
  0         0  
  0         0  
231             # warn("$filename³: len=", length($data), "\n");
232 0         0 close(FILE);
233             }
234             else {
235 61         101 my $f;
236 61 100       3437 unless ( open( $f, '<:raw', $filename ) ) {
237             $options->{error} = "$!", return
238 1 50       28 if $options->{fail} eq "soft";
239 0         0 croak("$name: $!\n");
240             }
241 60         171 $data = do { local $/; <$f> };
  60         289  
  60         2874  
242             }
243             }
244 67 50       371 $options->{_filesource} = $filename if $options;
245              
246 67         331 my $name = encode_utf8($filename);
247 67 100       683 if ( $options->{blob} ) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
248             # Do not touch.
249 4         17 $options->{encoding} = 'Blob';
250             }
251             elsif ( $encoded ) {
252             # Nothing to do, already dealt with.
253 3   50     12 $options->{encoding} //= 'Perl';
254             }
255              
256             # Detect Byte Order Mark.
257             elsif ( $data =~ /^\xEF\xBB\xBF/ ) {
258 13 50       52 warn("$name is UTF-8 (BOM)\n") if $options->{debug};
259 13         46 $options->{encoding} = 'UTF-8';
260 13         119 $data = decode( "UTF-8", substr($data, 3) );
261             }
262             elsif ( $data =~ /^\xFE\xFF/ ) {
263 4 50       13 warn("$name is UTF-16BE (BOM)\n") if $options->{debug};
264 4         11 $options->{encoding} = 'UTF-16BE';
265 4         25 $data = decode( "UTF-16BE", substr($data, 2) );
266             }
267             elsif ( $data =~ /^\xFF\xFE\x00\x00/ ) {
268 4 50       12 warn("$name is UTF-32LE (BOM)\n") if $options->{debug};
269 4         12 $options->{encoding} = 'UTF-32LE';
270 4         25 $data = decode( "UTF-32LE", substr($data, 4) );
271             }
272             elsif ( $data =~ /^\xFF\xFE/ ) {
273 13 50       53 warn("$name is UTF-16LE (BOM)\n") if $options->{debug};
274 13         46 $options->{encoding} = 'UTF-16LE';
275 13         149 $data = decode( "UTF-16LE", substr($data, 2) );
276             }
277             elsif ( $data =~ /^\x00\x00\xFE\xFF/ ) {
278 4 50       10 warn("$name is UTF-32BE (BOM)\n") if $options->{debug};
279 4         11 $options->{encoding} = 'UTF-32BE';
280 4         22 $data = decode( "UTF-32BE", substr($data, 4) );
281             }
282              
283             # No BOM, did user specify an encoding?
284             elsif ( $options->{encoding} ) {
285             warn("$name is ", $options->{encoding}, " (fallback)\n")
286 1 50       7 if $options->{debug};
287 1         11 $data = decode( $options->{encoding}, $data, 1 );
288             }
289              
290             # Try UTF8, fallback to ISO-8895.1.
291             else {
292 21         52 my $d = eval { decode( "UTF-8", $data, 1 ) };
  21         167  
293 21 100       1286 if ( $@ ) {
    100          
294 5 50       18 warn("$name is ISO-8859.1 (assumed)\n") if $options->{debug};
295 5         18 $options->{encoding} = 'ISO-8859-1';
296 5         19 $data = decode( "iso-8859-1", $data );
297             }
298             elsif ( $d !~ /[^[:ascii:]]/ ) {
299 2 50       11 warn("$name is ASCII (detected)\n") if $options->{debug};
300 2         8 $options->{encoding} = 'ASCII';
301 2         3 $data = $d;
302             }
303             else {
304 14 50       70 warn("$name is UTF-8 (detected)\n") if $options->{debug};
305 14         101 $options->{encoding} = 'UTF-8';
306 14         99 $data = $d;
307             }
308             }
309              
310             # This can be used to add line continuation or comment stripping.
311 67 50       19619 if ( $options->{strip} ) {
312 0         0 $data =~ s/$options->{strip}//g;
313             }
314              
315 67 100       248 return $data unless $options->{split};
316              
317             # Split in lines;
318 63         94 my @lines;
319 63 100       143 if ( $options->{chomp} ) {
320             # Unless empty, make sure there is a final newline.
321 51 100       373 $data .= "\n" if $data =~ /.(?!\r\n|\n|\r)\z/;
322             # We need to maintain trailing newlines.
323 51         841 push( @lines, $1 ) while $data =~ /(.*?)(?:\r\n|\n|\r)/g;
324             }
325             else {
326 12         168 push( @lines, $1 ) while $data =~ /(.*?(?:\r\n|\n|\r))/g;
327             # In case the last line has no terminator.
328 12 100       137 push( @lines, $1 ) if $data =~ /(?:\r\n|\n|\r)([^\r\n]+)\z/;
329             }
330 63         256 undef $data;
331 63 100       397 return wantarray ? @lines : \@lines;
332             }
333              
334             =head2 loadblob
335              
336             use File::LoadLines qw(loadblob);
337             $rawdata = loadblob("raw.dat");
338             $rawdata = loadblob("raw.dat", $options);
339              
340             This is equivalent to calling loadlines() with C<< blob=>1 >> in the options.
341              
342             =cut
343              
344             sub loadblob {
345 2     2 1 166424 my ( $filename, $options ) = @_;
346 2 50       10 croak("Missing filename.\n") unless defined $filename;
347 2 50 33     9 croak("Invalid options.\n")
348             if defined($options) && ref($options) ne "HASH";
349 2   50     15 $options //= {};
350 2         7 $options->{blob} = 1;
351 2         10 loadlines( $filename, $options );
352             }
353              
354             =head1 SEE ALSO
355              
356             There are currently no other modules that handle BOM detection and
357             line splitting.
358              
359             I have a faint hope that future versions of Perl and Raku will deal
360             with this transparently, but I fear the worst.
361              
362             =head1 HINTS
363              
364             When you have raw file data (e.g. from a zip), you can use loadlines()
365             to decode and unpack:
366              
367             open( my $data, '<', \$contents );
368             $lines = loadlines( $data, $options );
369              
370             There is no hard requirement on LWP. If you want to use transparent
371             fetching of data over the network please make sure LWP::UserAgent is
372             available.
373              
374             =head1 AUTHOR
375              
376             Johan Vromans, C<< >>
377              
378             =head1 SUPPORT AND DOCUMENTATION
379              
380             Development of this module takes place on GitHub:
381             https://github.com/sciurius/perl-File-LoadLines.
382              
383             You can find documentation for this module with the perldoc command.
384              
385             perldoc File::LoadLines
386              
387             Please report any bugs or feature requests using the issue tracker on
388             GitHub.
389              
390             =head1 COPYRIGHT & LICENSE
391              
392             Copyright 2018,2020,2024 Johan Vromans, all rights reserved.
393              
394             This program is free software; you can redistribute it and/or modify it
395             under the same terms as Perl itself.
396              
397             =cut
398              
399             1; # End of File::LoadLines