File Coverage

blib/lib/Spreadsheet/Edit/IO.pm
Criterion Covered Total %
statement 210 534 39.3
branch 59 310 19.0
condition 41 181 22.6
subroutine 45 84 53.5
pod 10 11 90.9
total 365 1120 32.5


line stmt bran cond sub pod time code
1             # License: http://creativecommons.org/publicdomain/zero/1.0/
2             # (CC0 or Public Domain). To the extent possible under law, the author,
3             # Jim Avera (email jim.avera at gmail dot com) has waived all copyright and
4             # related or neighboring rights to this document. Attribution is requested
5             # but not required.
6 3     3   23 use strict; use warnings FATAL => 'all'; use utf8;
  3     3   7  
  3     3   114  
  3         18  
  3         6  
  3         111  
  3         17  
  3         7  
  3         21  
7 3     3   96 use feature qw(say state lexical_subs current_sub);
  3         6  
  3         251  
8 3     3   20 no warnings qw(experimental::lexical_subs);
  3         5  
  3         164  
9              
10             package Spreadsheet::Edit::IO;
11              
12             # Allow "use <thismodule. VERSION ..." in development sandbox to not bomb
13 3     3   18 { no strict 'refs'; ${__PACKAGE__."::VER"."SION"} = 1999.999; }
  3         1916  
  3         315  
14             our $VERSION = '1000.009'; # VERSION from Dist::Zilla::Plugin::OurPkgVersion
15             our $DATE = '2023-09-23'; # DATE from Dist::Zilla::Plugin::OurDate
16              
17             # This module is derived from the old never-released Text:CSV::Spreadsheet
18              
19 3     3   26 use Exporter 'import';
  3         5  
  3         225  
20              
21             our @EXPORT = qw/convert_spreadsheet OpenAsCsv cx2let let2cx cxrx2sheetaddr
22             sheetname_from_spec filepath_from_spec
23             form_spec_with_sheetname/;
24              
25             our @EXPORT_OK = qw/can_cvt_spreadsheets can_extract_allsheets can_extract_named_sheet
26             openlibreoffice_path
27             @sane_CSV_read_options @sane_CSV_write_options/;
28              
29             # TODO: Provide "known_attributes" function ala Text::CSV::known_attributes()
30              
31 3     3   1922 use version ();
  3         7206  
  3         99  
32 3     3   21 use Carp;
  3         7  
  3         175  
33              
34 3     3   19 use File::Find ();
  3         8  
  3         65  
35 3     3   2007 use File::Copy ();
  3         8399  
  3         91  
36 3     3   3049 use File::Copy::Recursive ();
  3         14128  
  3         108  
37 3     3   40 use File::Glob qw/bsd_glob/;
  3         6  
  3         349  
38              
39 3     3   913 use Path::Tiny qw/path/;
  3         13032  
  3         186  
40              
41             # Path::Tiny OBVIATES NEED for many but we still need this
42 3     3   52 use File::Spec ();
  3         8  
  3         96  
43 3     3   536 use File::Spec::Functions qw/devnull tmpdir rootdir catdir catfile/;
  3         887  
  3         230  
44              
45             # Still sometimes convenient...
46 3     3   35 use File::Basename qw(basename);
  3         6  
  3         169  
47              
48 3     3   2559 use File::Which qw/which/;
  3         3431  
  3         172  
49 3     3   2250 use URI::file ();
  3         36410  
  3         109  
50 3     3   26 use Guard qw(guard scope_guard);
  3         6  
  3         195  
51 3     3   20 use Fcntl qw(:flock :seek);
  3         7  
  3         441  
52 3     3   23 use Scalar::Util qw/blessed/;
  3         7  
  3         151  
53 3     3   22 use List::Util qw/none all notall first min max/;
  3         8  
  3         243  
54 3     3   22 use Encode qw(encode decode);
  3         7  
  3         149  
55 3     3   17 use File::Glob qw/bsd_glob GLOB_NOCASE/;
  3         8  
  3         186  
56 3     3   28 use Digest::MD5 qw/md5_base64/;
  3         10  
  3         166  
57 3     3   19 use Text::CSV ();
  3         5  
  3         151  
58             # DDI 5.025 is needed for Windows-aware qsh()
59 3     3   30 use Data::Dumper::Interp 5.025 qw/vis visq dvis dvisq ivis ivisq avis qsh qshlist u/;
  3         71  
  3         61  
60              
61 3         28 use Spreadsheet::Edit::Log qw/log_call fmt_call log_methcall fmt_methcall oops/,
62 3     3   1146 ':btw=IO${lno}:';
  3         6  
63             our %SpreadsheetEdit_Log_Options = (
64             is_public_api => sub{
65             $_[1][3] =~ /(?: ::|^ )(?: [a-z][^:]* | OpenAsCsv | ConvertSpreadsheet )$/x
66             },
67             );
68              
69             my $progname = path($0)->basename;
70              
71             sub _get_username(;$) {
72 4     4   11 my ($uid) = @_;
73 4   50     20 $uid //= eval{ $> } // -1; # default to EUID
  4   33     61  
74 4         10 state $answer = {};
75 4   66     24 return $answer->{$uid} //= do {
76             # https://stackoverflow.com/questions/12081246/how-to-get-system-user-full-name-on-windows-in-perl
77 3   33     3806 eval { getpwuid($uid) // $uid }
78             ||
79 3 0 0     8 ($^O =~ /MSWin/ && $uid == (eval{$>}//-1) && eval{ # untested...
  0   0     0  
      0        
      33        
80 0         0 require Win32API::Net;
81 0   0     0 Win32API::Net::UserGetInfo($ENV{LOGONSERVER}||'',Win32::LoginName(),10,my $info={});
82             $info->{fullName}
83 0         0 })
84             ||
85             "UID$uid"
86             };
87             }
88              
89              
90             # A private Libre/Open Office profile dir is needed to avoid conflicts
91             # with interactive sessions, see
92             # https://ask.libreoffice.org/en/question/290306/how-to-start-independent-lo-instance-process
93             #
94             # We use a persistent profile dir shared among processes for a given user
95             # (actually one for each unique external tool which needs one).
96             # Sharing is okay because we get an exclusive lock before actually using it.
97             state $profile_parent_dir = do{ # also used for lockfile
98             my $user = _get_username();
99             my $dname = __PACKAGE__."_${user}_profileparent";
100             $dname =~ s/::/-/g;
101             $dname =~ s/[^-\w.:]/_/g;
102             (my $path = path(File::Spec->tmpdir)->child($dname))->mkpath;
103             $path # Path::Tiny
104             };
105             sub _get_tool_profdir($) {
106 0     0   0 my ($tool_path) = @_;
107 0         0 my $fingerprint = _file_fingerprint($tool_path);
108 0         0 (my $toolname = path($tool_path)->basename(qw/\.\w+$/)) =~ s/[^-\w.:]/_/g;
109 0         0 my $path = $profile_parent_dir->child("${toolname}_$fingerprint");
110 0         0 $path->mkpath;
111 0         0 $path
112             }
113              
114             # Prevent concurrent document conversions.
115             # LO & OO can't handle concurrent access to the same profile.
116             my $locked_fh;
117             sub _get_exclusive_lock($) { # returns lock object
118 0     0   0 my $opts = shift;
119 0         0 my $lockfile_path = $profile_parent_dir->child("LOCKFILE")->canonpath;
120 0         0 my $sleeptime = 1;
121 0         0 my $lock_fh;
122 0         0 while (! defined $lock_fh) {
123             #warn "$$ : ### AAA open $lockfile_path ...\n";
124 0 0       0 open $lock_fh, "+>>", $lockfile_path or die $!;
125             #warn "$$ : ### AA2 open succeeded.\n";
126 0         0 eval { chmod 0666, $lock_fh; }; # sometimes not implemented
  0         0  
127             #warn "$$ : ### AA3 flock ...\n";
128 0 0       0 if (! flock($lock_fh, LOCK_EX|LOCK_NB)) {
129             #warn "$$ : ### AA4 flock FAILED\n";
130 0 0       0 seek($lock_fh, 0, SEEK_SET) or die;
131 0         0 my @lines = <$lock_fh>;
132 0 0       0 close($lock_fh) or die "close:$!"; $lock_fh = undef;
  0         0  
133             #warn "$$ : ### AA6 fh closed...\n";
134 0   0     0 my $owner = $lines[-1] // ""; # pid NNN (progname)
135 0 0       0 { my ($pid) = ($owner =~ /pid (\d+)/) or last;
  0         0  
136 0 0       0 my @s = stat("/proc/$pid") or last;
137 0         0 $owner = _get_username($s[4])." ".$owner;
138             }
139 0 0       0 my $ownermsg = $owner ? " held by $owner" : "";
140             # Carp::longmess ...
141             warn ">> ($$) Waiting for exclusive lock${ownermsg}...\n",
142             " $lockfile_path\n"
143 0 0       0 unless $opts->{silent};
144 0         0 sleep $sleeptime;
145             } else {
146 0         0 $locked_fh = $lock_fh;
147 0 0       0 seek($lock_fh, 0, SEEK_END) or die;
148 0         0 print $lock_fh "pid $$ ($progname)\n"; # always appends anyway on *nix
149             }
150             }
151 0         0 $opts->{lockfile_fh} = $lock_fh;
152             #warn "$$ : ### GOT LOCK\n";
153             }
154             END{
155 3 50   3   746 if (defined $locked_fh) {
156 0         0 flock($locked_fh, LOCK_UN);
157 0         0 close($locked_fh);
158 0         0 $locked_fh = undef;
159 0         0 warn "Did emergency unlock!\n";
160             }
161             #else { warn "(emergency unlock not needed)\n"; }
162             }
163             sub _release_lock($) {
164 0     0   0 my $opts = shift;
165 0   0     0 my $fh = delete($opts->{lockfile_fh}) // oops;
166 0 0       0 oops unless $fh == $locked_fh;
167             #seek($fh, 0, SEEK_SET) or die;
168             #my @x = (<$fh>);
169             #seek($fh, 0, SEEK_SET) or die;
170             #warn dvis "$$ : Lockfile contains: @x\n";
171             ##warn "$$ : ###BBB stalling before unlock...\n"; sleep 3;
172 0         0 truncate($fh,0);
173 0 0       0 flock($fh, LOCK_UN) or die "flock UN: $!";
174 0         0 close $fh;
175 0         0 $locked_fh = undef;
176             #warn "$$ : ###BB0 unlocked and closed.\n";
177             }
178              
179             # Libre Office text converter "charset" numbers
180             my %LO_charsets = (
181             'WINDOWS1252' => 1, 'WINLATIN1' => 1,
182             'APPLEWESTERN' => 2,
183             'DOS/OS2437' => 3,
184             'DOS/OS2850' => 4,
185             'DOS/OS2860' => 5,
186             'DOS/OS2861' => 6,
187             'DOS/OS2863' => 7,
188             'DOS/OS2865' => 8,
189             'SYSTEM' => 9, 'SYSTEMDDEFAULT' => 9,
190             'SYMBOL' => 10,
191             'ASCII' => 11,
192             'ISO88591' => 12,
193             'ISO88592' => 13,
194             'ISO88593' => 14,
195             'ISO88594' => 15,
196             'ISO88595' => 16,
197             'ISO88596' => 17,
198             'ISO88597' => 18,
199             'ISO88598' => 19,
200             'ISO88599' => 20,
201             'ISO885914' => 21,
202             'ISO885915' => 22,
203             'OS2737' => 23,
204             'OS2775' => 24,
205             'OS2852' => 25,
206             'OS2855' => 26,
207             'OS2857' => 27,
208             'OS2862' => 28,
209             'OS2864' => 29,
210             'OS2866' => 30,
211             'OS2869' => 31,
212             'WINDOWS874' => 32,
213             'WINDOWS1250' => 33, 'WINLATIN2' => 33,
214             'WINDOWS1251' => 34,
215             'WINDOWS1253' => 35,
216             'WINDOWS1254' => 36,
217             'WINDOWS1255' => 37,
218             'WINDOWS1256' => 38,
219             'WINDOWS1257' => 39,
220             'WINDOWS1258' => 40,
221             'APPLEARABIC' => 41,
222             'APPLECENTRALEUROPEAN' => 42,
223             'APPLECROATIAN' => 43,
224             'APPLECYRILLIC' => 44,
225             'APPLEDEVANAGARI' => 45,
226             'APPLEFARSI' => 46,
227             'APPLEGREEK' => 47,
228             'APPLEGUJARATI' => 48,
229             'APPLEGURMUKHI' => 49,
230             'APPLEHEBREW' => 50,
231             'APPLEICELANDIC' => 51,
232             'APPLEROMANIAN' => 52,
233             'APPLETHAI' => 53,
234             'APPLETURKISH' => 54,
235             'APPLEUKRAINIAN' => 55,
236             'APPLECHINESESIMPLIFIED' => 56,
237             'APPLECHINESETRADITIONAL' => 57,
238             'APPLEJAPANESE' => 58,
239             'APPLEKOREAN' => 59,
240             'WINDOWS932' => 60,
241             'WINDOWS936' => 61,
242             'WINDOWSWANSUNG949' => 62,
243             'WINDOWS950' => 63,
244             'SHIFTJIS' => 64,
245             'GB2312' => 65,
246             'GBT12345' => 66,
247             'GBK' => 67, 'GB231280' => 67,
248             'BIG5' => 68,
249             'EUCJP' => 69,
250             'EUCCN' => 70,
251             'EUCTW' => 71,
252             'ISO2022JP' => 72,
253             'ISO2022CN' => 73,
254             'KOI8R' => 74,
255             'UTF7' => 75,
256             'UTF8' => 76,
257             'ISO885910' => 77,
258             'ISO885913' => 78,
259             'EUCKR' => 79,
260             'ISO2022KR' => 80,
261             'JIS0201' => 81,
262             'JIS0208' => 82,
263             'JIS0212' => 83,
264             'WINDOWSJOHAB1361' => 84,
265             'GB18030' => 85,
266             'BIG5HKSCS' => 86,
267             'TIS620' => 87,
268             'KOI8U' => 88,
269             'ISCIIDEVANAGARI' => 89,
270             'JAVAUTF8' => 90,
271             'ADOBESTANDARD' => 91,
272             'ADOBESYMBOL' => 92,
273             'PT154' => 93,
274             'UCS4' => 65534,
275             'UCS2' => 65535,
276             );
277              
278             =for Pod::Coverage _name2LOcharsetnum
279             =cut
280              
281             sub _name2LOcharsetnum($) {
282 0     0   0 my ($enc) = @_;
283 0         0 local $_ = uc $enc;
284 0         0 while (! $LO_charsets{$_}) {
285             # successively remove - and other special characters
286 0 0       0 s/\W//a or confess "LO charset: Unknown encoding name '$enc'";
287             }
288 0         0 $LO_charsets{$_}
289             }
290              
291             # convert between 0-based index and spreadsheet column letter code.
292             # Default argument is $_
293             sub cx2let(_) {
294 0     0 1 0 my $cx = shift;
295 0         0 my $ABC="A"; ++$ABC for (1..$cx);
  0         0  
296 0         0 return $ABC
297             }
298             sub let2cx(_) {
299 0     0 1 0 my $ABC = shift;
300 0         0 my $n = ord(substr($ABC,0,1,"")) - ord('A');
301 0         0 while (length $ABC) {
302 0         0 my $letter = substr($ABC,0,1,"");
303 0         0 $n = (($n+1) * 26) + (ord($letter) - ord('A'));
304             }
305 0         0 return $n;
306             }
307             sub cxrx2sheetaddr($$) { # (1,99) -> "B100"
308 0     0 0 0 my ($cx, $rx) = @_;
309 0         0 return cx2let($cx) . ($rx + 1);
310             }
311              
312             =for Pod::Coverage cxrx2sheetaddr oops btw
313             =cut
314              
315             our @sane_CSV_read_options = (
316             # Text::CSV pod says to not specify 'eol' to allow embedded newlines,
317             # and to automatically handle "\n", "\r", or "\r\n".
318             #eol => $/,
319             binary => 1, # Allow reading embedded newlines & unicode etc.
320             sep_char => ",",
321             quote_char => '"',
322             escape_char => '"', # Embedded "s appear as ""
323             allow_whitespace => 0, # Preserve leading & trailing white space
324             auto_diag => 2, # die on errors
325             );
326             our @sane_CSV_write_options = (
327             eol => $/, # Necessary when WRITING csv files
328             binary => 1,
329             sep_char => ",",
330             quote_char => '"',
331             escape_char => '"', # Embedded "s appear as ""
332             allow_whitespace => 0, # Preserve leading & trailing white space
333             auto_diag => 2, # die on errors
334             );
335              
336             my %Saved_Sigs;
337             sub _sighandler {
338 0 0 0 0   0 if (! $Saved_Sigs{$_[0]} or $Saved_Sigs{$_[0]} eq 'DEFAULT') {
339             # The user isn't catching this, so the process will abort without
340             # running destructors: Call exit instead
341 0         0 warn "($$)".__PACKAGE__." caught signal $_[0], exiting\n";
342 0         0 Carp::cluck "($$)".__PACKAGE__." caught signal $_[0], exiting\n";
343 0         0 exit 1;
344             }
345 0         0 $SIG{$_[0]} = $Saved_Sigs{$_[0]};
346 0         0 kill $_[0], $$;
347 0         0 oops "Default (or user-defined) sig $_[0] action was to ignore!";
348             }
349             sub _signals_guard() {
350 0   0 0   0 %Saved_Sigs = ( map{ ($_ => ($SIG{$_} // undef)) } qw/HUP INT QUIT TERM/ );
  0         0  
351 0         0 $SIG{HUP} = $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = \&_sighandler;
352 0     0   0 return guard { @SIG{keys %Saved_Sigs} = (values %Saved_Sigs) }
353 0         0 }
354              
355             # Create a probably-unique fingerprint for a particular file
356             sub _file_fingerprint($) {
357 0     0   0 my $path = shift;
358 0         0 my $ctx = Digest::MD5->new;
359 0         0 $ctx->add($_) for((stat($path))[0,1,9]); # dev,ino,mtime
360 0         0 substr($ctx->b64digest,0,6)
361             }
362              
363             # Find LibreOffice, or failing that OpenOffice
364             our $OLpath_answer = $ENV{SPREADSHEET_EDIT_LOPATH};
365             sub openlibreoffice_path() {
366             return $OLpath_answer if $OLpath_answer;
367             unless ($ENV{SPREADSHEET_EDIT_IGNPATH}) {
368             foreach my $short_name (qw(libreoffice loffice localc)) {
369             if ($OLpath_answer = which($short_name)) {
370             return( ($OLpath_answer=path($OLpath_answer)->canonpath) );
371             }
372             }
373             }
374             # Search for an installation. On Windows it will usually be
375             # C:\Program Files\LibreOffice\...
376             # On *nix, a local/isolated install (i.e. the result of extracting files
377             # from a .deb or other archive) will be
378             # somwehere/opt/libreofficeA.B/...
379             # and "..." is the same standard hierarchy on all platforms.
380             #
381             # If multiple are found try to pick the "latest".
382             my sub _cmp_subpaths($$) {
383             my ($sp1, $sp2) = @_;
384             oops if !defined($sp1);
385             return 1 if !defined($sp2);
386             # Use longest version in the (sub-)path, e.g. "4.4.1/opt/openoffice4/..."
387             my (@v1) = sort { length($a) <=> length($b) } ($sp1 =~ /(\d[.\da-z]*)/ag);
388             my (@v2) = sort { length($a) <=> length($b) } ($sp2 =~ /(\d[.\da-z]*)/ag);
389             my $v1 = $v1[-1]//0;
390             my $v2 = $v2[-1]//0;
391             if ($v1 =~ s/alpha/0/) {
392             return -1 unless $v2 =~ s/alpha/0/;
393             }
394             if ($v2 =~ /alpha/) {
395             return +1
396             }
397             version->parse($v1) <=> version->parse($v2)
398             }
399              
400             # I tried just doing File::Glob::bsd_glob('/*/*/*/opt/libre*/program') but
401             # it silently failed even though the same glob works from the shell. Mmff...
402 3     3   29 no warnings FATAL => 'all';
  3         6  
  3         37803  
403             state $is_MSWin = ($^O eq "MSWin32");
404             my (@search_dirs, $searchfor_re, $maxdepth);
405             if ($is_MSWin) {
406             @search_dirs = ("C:\\Program Files","C:\\Program Files (x86)");
407             $searchfor_re = qr/^Program Files/;
408             $maxdepth = 1;
409             # depth: C:\Program Files\libreofficeXXX/program/
410             # 1
411             } else {
412             @search_dirs = (File::Spec->rootdir());
413             push @search_dirs, $ENV{HOME} if $ENV{HOME};
414             $maxdepth = 4;
415             $searchfor_re = qr/^opt$/;
416             # depth: /*/*/<unpackparent>/opt/libreofficeXXX/program/
417             # 1 2 3 4
418             }
419              
420             my $debug = $ENV{SPREADSHEET_EDIT_FINDDEBUG};
421             my sub _Findvarsmsg() {
422             if (u($_) eq u($File::Find::name) && u($_) eq u($File::Find::fullname)) {
423             return qsh($_)."\n"
424             }
425             if (u($_) eq u($File::Find::name)) {
426             return "\$_=name=".qsh($_)." -> ".qsh($File::Find::fullname)."\n";
427             }
428             "\$_=".qsh($_)." name=".qsh($File::Find::name)." fullname=".qsh($File::Find::fullname)."\n";
429             }
430              
431             my %results;
432             $ENV{SPREADSHEET_EDIT_NOLOSEARCH} or
433             File::Find::find(
434             { wanted => sub{
435             # Undef fullname OR invalid "_" filehandle implies a broken symlink,
436             # see https://github.com/Perl/perl5/issues/21122
437             # Zero size on *nix implies /proc or something similar; do not enter.
438             # File::Find::fullname unreadable implies followed link to inaccessable
439             # (The initial "_" stat may be invalid, so "-l _" is useless)
440 4866704     4866704   18601589 $! = 0;
441             # https://github.com/Perl/perl5/issues/21143
442 4866704         8481339 my $fullname = $File::Find::fullname;
443 4866704 50 33     15116855 if (!defined($fullname) && $is_MSWin) {
444 0 0       0 warn "# _ MSWin undef fullname! ",_Findvarsmsg() if $debug;
445 0         0 stat($_); # lstat was not done. Grr...
446 0         0 $fullname = $File::Find::name;
447 0 0       0 unless (-d _) {
448 0         0 $File::Find::prune = 1; # in case it really is a dir
449 0         0 return;
450             }
451             } else {
452 4866704 100 100     17096678 unless (-d _ or -l _) {
453 4488482 100       10175996 warn "# _ notdir/symlink: ",_Findvarsmsg() if $debug;
454 4488482         55521296 $File::Find::prune = 1; # in case it really is
455 4488482         162594147 return;
456             }
457             }
458 378222 100 33     14234401 if (
      66        
      66        
      66        
      66        
      66        
      66        
      100        
459             !defined($fullname) # broken link, per docs
460             || (! -r _) || (! -x _) # unreadable item or invalid "_" handle
461             # https://github.com/Perl/perl5/issues/21122
462             || (!$is_MSWin && (stat(_))[7] == 0) # zero size ==> /proc etc.
463             || /\$/ # $some_windows_special_thing$
464             || ! -r $fullname # presumably a symlink to unreadable
465             || ! -x _ # or unsearchable dir
466             || m#^/snap/(?!.*ffice)# # snap other than e.g. /snap/libreoffice
467             || m#^/(proc|dev|sys|tmp|boot|run|lost+found|usr/(include|src))$#
468             ) {
469 646 100       3959 warn "# PRUNING ",_Findvarsmsg() if $debug;
470 646         11398 $File::Find::prune = 1;
471             return
472 646         123069 }
473 377576 100       1670058 warn "# DIR: ",_Findvarsmsg() if $debug;
474             # Maximum depth: /*/*/<unpackparent>/opt/libreofficeXXX/program/
475 377576         5678109 my $path = path($_);
476 377576         14740453 my $depth = scalar(() = $path->stringify =~ m#(/)#g);
477 377576 100       15200341 if (basename($_) =~ $searchfor_re) { # ^opt$ or ^Program Files
478 6         42 my $prefix = path($_)->parent->parent;
479 6         5518 for my $o_l (qw/libre open/) {
480 12         136 my $pattern
481             = path($_)->child("${o_l}*/program/soffice*")->canonpath;
482             # eval because I'm suspicious of the glob on Windows
483 12         1361 my @hits; eval{ @hits = sort +bsd_glob($pattern, GLOB_NOCASE) };
  12         38  
  12         9284  
484 12 50       113 if (@hits) {
485             # On windows, use soffice.com not .exe because it writes messages
486             # to stdout not a window. See https://help.libreoffice.org/7.5/en-GB/text/shared/guide/start_parameters.html?&DbPAR=SHARED&System=WIN
487             my $path = (first{ /soffice\.com$/ } @hits) ||
488 0   0     0 (first{ /soffice$/ } @hits);
489 0 0       0 if ($path) {
490 0 0       0 $prefix->subsumes($path) or oops dvis '$prefix $path';
491 0         0 my $subpath = path($path)->relative($prefix);
492 0 0       0 if (_cmp_subpaths($subpath, $results{$o_l}{subpath}) >= 0) {
493 0         0 @{$results{$o_l}}{qw/path subpath/} = ($path, $subpath);
  0         0  
494             # We found where installations are, don't look deeper
495 0         0 $maxdepth = $depth;
496             }
497             }
498             }
499 12 50       97 else { btw dvis '##glob failed: $pattern\n$@' if $@; }
500             }
501             }
502 377576 100       46883298 if ($depth == $maxdepth) {
    50          
503 2070 100       6311 warn "# pruning at maxdepth $depth ",qsh($_),"\n" if $debug;
504 2070         20560 $File::Find::prune = 1;
505 2070         58184 return;
506             }
507 0         0 elsif ($depth > $maxdepth) { oops dvis '$depth $maxdepth $_' }
508             },
509             follow_fast => 1,
510             follow_skip => 2,
511             dangling_symlinks => 0,
512             no_chdir => 1
513             },
514             @search_dirs
515             );
516             $OLpath_answer = path(
517             $results{libre}{path} // $results{open}{path}
518             || (!$ENV{SPREADSHEET_EDIT_IGNPATH} && which("soffice")) # installed OO?
519             || return(undef)
520             )->realpath->canonpath
521             }#openlibreoffice_path
522              
523             sub _openlibre_features() {
524 1     1   4 state $hash;
525 1 50       11 return $hash if defined $hash;
526 1   50     20 my $prog = openlibreoffice_path() // return(($hash={ available => 0 }));
527 0         0 my $raw_version;
528             # This is gross but fast and works in recent versions of LO
529 0 0       0 if (my $fh = eval{ path($prog)->realpath->parent->child("types/offapi.rdb")
  0         0  
530             ->filehandle("<",":raw")} ) {
531 0         0 my $octets; sysread $fh, $octets, 100;
  0         0  
532 0 0       0 if ($octets =~ /Created by LibreOffice (\d+\.\d+\.\w+)/) {
533 0         0 $raw_version = $1;
534             }
535             }
536 0 0       0 unless ($raw_version) {
537 0 0       0 if (qx/$prog --version 2>&1/ =~ /Libre.*? (\d+\.\d+\.\w+)/) {
538 0         0 $raw_version = $1;
539             } else {
540 0         0 warn "$prog --version DID NOT WORK\n";
541             }
542             }
543 0 0       0 unless ($raw_version) {
544 0         0 warn "WARNING: Could not determine version of $prog\n";
545 0         0 $raw_version = "999.01";
546             }
547 0         0 my $version = version->parse("v$raw_version");
548 0         0 $hash = {
549             available => 1,
550             # LibreOffice 7.2 allows extracting all sheets at once
551             allsheets => ($version >= version->parse("v7.2")),
552             # ...but not yet extracting a single sheet by name.
553             # https://bugs.documentfoundation.org/show_bug.cgi?id=135762#c24
554             named_sheet => 0,
555             # Supported output formats are too many to list
556             ousuf_any => 1,
557             raw_version => $raw_version, version => "$version",
558             }
559             }
560              
561 0     0   0 sub _openlibre_supports_allsheets() { _openlibre_features()->{allsheets} }
562 0     0   0 sub _openlibre_supports_named_sheet() { _openlibre_features()->{named_sheet} }
563 0     0   0 sub _openlibre_supports_writing($) { _openlibre_features()->{available} }
564              
565 1     1   2046 sub _ssconvert_features() { return { availble => 0 } } # TODO add back?
566 0     0   0 sub _ssconvert_supports_allsheets() { _ssconvert_features()->{allsheets} }
567 0     0   0 sub _ssconvert_supports_named_sheet() { _ssconvert_features()->{named_sheet} }
568 0     0   0 sub _ssconvert_supports_writing($) { _ssconvert_features()->{available} }
569              
570             # These allow users (e.g. App-diff_spreadsheets tests) to determine
571             # if external tool(s) are available to convert between spreadsheet formats
572             # or to/from csv (CSVs are supported directly so can always be used)
573             # Currently used by t/io.pl to skip tests
574             sub can_cvt_spreadsheets() {
575             _openlibre_features()->{available} || _ssconvert_features()->{availble}
576 1 50   1 1 2630455 }
577             sub can_extract_allsheets() {
578 0 0   0 1 0 _openlibre_supports_allsheets() || _ssconvert_supports_allsheets()
579             }
580             sub can_extract_named_sheet() {
581 0 0 0 0 1 0 can_extract_allsheets() # used to emulate
582             || _openlibre_supports_named_sheet() || _ssconvert_supports_named_sheet()
583             }
584              
585             =for Pod::Coverage can_cvt_spreadsheets can_extract_allsheets
586             -for Pod::Coverage can_extract_named_sheet
587             =cut
588              
589             sub _runcmd($@) {
590 0     0   0 my ($opts, @cmd) = @_;
591 0         0 my $guard = _signals_guard;
592             # This used to fork & exec but that blows up on MSWin32 because the child
593             # pseudo-process executes all END{} blocks everywhere after "exec"
594 0         0 my $redirs = "";
595 0 0       0 if ($opts->{suppress_stderr}) {
596 0         0 $redirs .= " 2>".devnull();
597             }
598 0 0       0 if ($opts->{stdout_to_stderr}) {
599 0         0 confess "Not portable";
600 0         0 $redirs .= " 1>&2";
601             }
602 0 0       0 if ($opts->{stderr_to_stdout}) {
603 0         0 confess "Not portable";
604 0         0 $redirs .= " 2>&1";
605             }
606 0 0       0 if ($opts->{suppress_stdout}) {
607 0         0 $redirs .= " >".devnull();
608             }
609 0         0 my $cmdstr = join(" ", map{qsh} @cmd) . $redirs;
  0         0  
610 0 0       0 if ($redirs) {
611 0         0 foreach (@cmd) {
612 0 0       0 confess "Can not portably pass argument '$_'" if /["']/;
613             }
614 0 0       0 warn "> $cmdstr\n" if $opts->{verbose};
615 0         0 system $cmdstr;
616             } else {
617 0 0       0 warn "> $cmdstr\n" if $opts->{verbose};
618 0         0 system @cmd;
619             }
620 0         0 return $?
621             }
622              
623             sub _fmt_outpath_contents($) {
624 0   0 0   0 my $outpath = $_[0]->{outpath} // oops;
625 0 0       0 return "(outpath does not exist)" unless -e $outpath;
626 0 0       0 return "(outpath is a file)" if -f $outpath;
627 0 0       0 return "(outpath is a STRANGE OBJECT)" unless -d $outpath;
628             "\n outpath contains: "
629 0         0 .join(", ",map{qsh basename $_} path($outpath)->children);
  0         0  
630             }
631              
632             my $tempdir;
633             sub _create_tempdir_if_needed($) {
634 2     2   4 my $opts = shift;
635             # Keep a per-process persistent temp directory, deleted at process exit.
636             # It contains result files when the user did not specify {outpath},
637             # plus a cache of as-yet unrequested sheet .csv files, used when the
638             # external tool can only extract all sheets, not a single sheet by name:
639             #
640             # tempdir/
641             # <ifbase>_<sig>.xlsx etc. # single file returned to user
642             # <ifbase>_<sig>/*.csv # directory returned to user
643             # <ifbase>_<sig>_csvcache/*.csv
644             #
645             # <ifbase> is derived from the intput file name, and <sig> is a fingerprint
646             # based on input file's dev, inode, and modification timestamp.
647             #
648 2   66     12 $tempdir //= do{
649             #(my $template = __PACKAGE__."_XXXXX") =~ s/::/-/g;
650             #Path::Tiny->tempdir($template)
651 1         3 my $pid = $$;
652 1         5 my $user = _get_username();
653 1         9 (my $dname = __PACKAGE__."_${user}_${pid}_tempdir") =~ s/::/-/g;
654 1         17 (my $path = path(File::Spec->tmpdir)->child($dname))->mkpath;
655 1         374 $path
656             };
657             }
658 3 100   3   7623 END{ $tempdir->remove_tree if $tempdir; }
659              
660             # Compose a unique path under $tempdir.
661             # This is *not* a "tempfile" or "tempdir" object which auto-destructs,
662             # in fact it does not even exist yet and we don't know here which it will be.
663             # Either the user must remove it when they are done with it, or it will
664             # be removed when $tempdir is removed at process exit.
665             #
666             sub _path_under_tempdir($@) {
667 0     0   0 my $opts = shift;
668             my %args = (
669 0         0 words => [$opts->{ifbase}, $opts->{sheetname}],
670             @_
671             );
672 0         0 my $bname = join "_", grep{defined} @{$args{words}};
  0         0  
  0         0  
673             # Collisions occur when recursing to emulate Extract-by-name,
674             # or if the user repeatedly reads the same thing, etc.
675 0         0 state $seqnums = {};
676 0 0       0 if ($seqnums->{$bname}++) {
677 0         0 $bname .= "_".$seqnums->{$bname}; # append unique sequence number
678             }
679 0 0       0 $bname .= ".$args{suf}" if $args{suf};
680 0         0 return $tempdir->child($bname);
681             }
682              
683             # Compose csv cache subdir path
684             sub _cachedir($) {
685 0     0   0 my $opts = shift;
686 0         0 _path_under_tempdir($opts,words => [$opts->{ifbase}, "csvcache"]);
687             }
688              
689             ## Copy an ephemeral temp file to a path under tempdir if needed
690             #sub _make_file_permanent($$) {
691             # my ($opts, $path) = @_;
692             # if (eval{ $path->cached_temp }) { # didn't throw
693             # my $suf = $path->basename =~ /\.(\w+)$/a ? $1 : undef;
694             # my $newpath = _path_under_tempdir($opts, suf => $suf);
695             # $path->move($newpath);
696             # return $newpath
697             # } else {
698             # return $path
699             # }
700             #}
701              
702             sub _convert_using_openlibre($$$) {
703 0     0   0 my ($opts, $src, $dst) = @_;
704 0 0   0   0 oops unless all{ $opts->{$_} } qw/cvt_from cvt_to/;
  0         0  
705 0 0 0     0 oops if $opts->{allsheets} && ! _openlibre_supports_allsheets();
706 0 0 0     0 oops if $opts->{sheetname} && ! _openlibre_supports_named_sheet();
707 0         0 my $debug = $opts->{debug};
708              
709 0   0     0 my $prog = openlibreoffice_path() // oops;
710              
711 0         0 my $saved_UserInstallation = $ENV{UserInstallation};
712             # URI format is file://server/path where 'server' is empty. "file://path" is
713             # "never correct, but is often used" en.wikipedia.org/wiki/File_URI_scheme
714             # Correct examples: file::///tmp/something file:///C:/somewhere
715 0         0 $ENV{UserInstallation} = URI::file->new(_get_tool_profdir($prog)->canonpath);
716 0 0       0 warn "Temporarily set UserInstallation=$ENV{UserInstallation}\n" if $debug;
717             scope_guard {
718 0 0   0   0 if (defined $saved_UserInstallation) {
719 0         0 $ENV{UserInstallation} = $saved_UserInstallation;
720             } else {
721             delete $ENV{UserInstallation}
722 0         0 }
723 0         0 };
724              
725             # The --convert-to argument is "suffix:filtername:filteropts"
726              
727             # I think (not certain) that we can only specify the encoding of CSV files,
728             # either as input or output; .xlsx and .ods spreadsheets (which are based
729             # on XML) could in principle use any encoding internally, but I'm not sure
730             # we can control that, nor should anyone ever need to.
731              
732             # REFERENCES:
733             # https://help.libreoffice.org/7.5/en-US/text/shared/guide/start_parameters.html?&DbPAR=SHARED&System=UNIX
734             # http://wiki.openoffice.org/wiki/Documentation/DevGuide/Spreadsheets/Filter_Options
735             # https://wiki.documentfoundation.org/Documentation/DevGuide/Spreadsheet_Documents#Filter_Options_for_the_CSV_Filter
736              
737             # I think we never want to specify the filter unless we have parameters
738             # for it. Currently that is only for csv.
739             # If no filter is specified, the suffix (e.g. 'ods') should be enough
740 0         0 state $suf2ofilter = {
741             csv => "Text - txt - csv (StarCalc)",
742             txt => "Text - txt - csv (StarCalc)",
743             #xls => "MS Excel 97",
744             #xlsx => "Calc MS Excel 2007 XML",
745             #ods => "calc8",
746             };
747              
748 0   0     0 my $ifilter = $opts->{soffice_infilter} //= do{
749 0 0       0 if ($opts->{cvt_from} eq "csv") {
750 0 0       0 my $filter_name = $suf2ofilter->{$opts->{cvt_from}} or oops;
751 0         0 my $enc = $opts->{input_encoding};
752 0         0 my $charset = _name2LOcharsetnum($enc); # dies if unknown enc
753 0         0 my $colformats = "";
754 0 0       0 if (my $cf = $opts->{col_formats}) {
755 0 0       0 $cf = [split /[\/,]/, $cf] unless ref($cf); # fmtA/fmtB/...
756 0         0 for (my $ix=0; $ix <= $#$cf; $ix++) {
757 0   0     0 local $_ = $cf->[$ix] // 1;
758 0 0 0     0 m#^([123459]|10)$#
      0        
      0        
      0        
      0        
      0        
      0        
759             || s#^standard$#1#i
760             || s#^text$#2#i
761             || s#^M+/D+/Y+$#3#i
762             || s#^D+/M+/Y+$#4#i
763             || s#^Y+/M+/D+$#5#i
764             || s#^ignore$#9#i
765             || s#^US.*English$#10#i
766             || croak "Unknown format code '$_' in {col_formats}";
767 0 0       0 $colformats .= "/" if $colformats;
768 0         0 $colformats .= ($ix+1)."/$_";
769             }
770             }
771             $filter_name.":"
772             # Tokens 1-4: FldSep=',' TxtDelim='"' Charset FirstLineNum
773             #. "44,34,$charset,1"
774             . ord($opts->{sep_char}//",").","
775 0   0     0 . ord($opts->{quote_char}//'"').","
      0        
776             . "$charset,1"
777             # Token 5: Cell format codes:
778             # If variable-width cells (the norm): colnum/fmt/colnum/fmt...
779             # colnum: 1-based column number
780             # fmt: 1=Std 2=Text 3=MM/DD/YY 4=DD/MM/YY 5=YY/MM/DD 6-8 unused
781             # 9=ignore field (do not import),
782             # 10=US-English content (e.g. 3.14 not 3,14)
783             # (I'm guessing 1=Std means use current lang [or per Tok 6?])
784             # If fixed-width cells... [something else]
785             . ",$colformats"
786             # Token 6: MS-LCID Language Id; 0 or omitted means UI language
787             . "," # default: false
788             # Token 7: On input: true => Quoted cells are always read a 'text',
789             # effectively disabling Token 8. This must be false to recognize dates
790             # like "Jan 1, 2000" which by necessity must be quoted for the comma,
791             # but will **CORRUPT** zip codes with leading zeroes unless
792             # col_formats overrides (which it does now by default).
793             .",false" # default: false
794             # Token 8: on input: "Detect Special Numbers", i.e. date or time values
795             # in human form, numbers in scientific (expondntial) notation etc.
796             # If false, ONLY decimal numbers (thousands separators ok).
797             .",true" # default: false (for import)
798             # Tokens 9-10: not used on import
799             .",,"
800             # Token 11: Remove spaces; trim leading & trailing spaces when reading
801             ."," # default: false
802             # Token 12: not use on import
803             .","
804             # Token 13: Import "=..." as formulas instead of text?
805             ."," # default: false i.e. do not recognize formulas
806             # Token 14: "Automatically detected since LibreOffice 7.6" [BOM?]
807             .","
808             }
809             else {
810             undef
811 0         0 }
812             };
813              
814 0   0     0 my $ofilter = $opts->{soffice_outfilter} //= do{
815             # OutputFilterName[:paramtoken,paramtoken,...]
816 0 0       0 if ($opts->{cvt_to} eq "csv") {
817 0 0       0 my $filter_name = $suf2ofilter->{$opts->{cvt_to}} or oops;
818 0         0 my $enc = $opts->{output_encoding};
819 0         0 my $charset = _name2LOcharsetnum($enc); # dies if unknown enc
820             $filter_name.":"
821             # Tokens 1-4: FldSep=, TxtDelim=" Charset FirstLineNum
822             #."44,34,$charset,1"
823             . ord($opts->{sep_char}//",").","
824             . ord($opts->{quote_char}//'"').","
825             . "$charset,1"
826             # Token 5: Cell format codes. Only used for import? (see above)
827             # What about fixed-width?
828             .","
829             # Token 6: Language identifier (uses Microsoft lang ids)
830             # 1033 means US-English (omitted => use UI's language)
831             .","
832             # Token 7: QuoteAllTextCells
833             # *** true will "quote" even single-bareword cells, which looks
834             # *** bad and makes t/ tests messier, but preserves information
835             # *** that such cells were not numbers or dates, etc. This ensures
836             # *** that Zip codes, etc. with leading zeroes won't be corrupted
837             # *** if converted back into a spreadsheet
838             # Option #1: Specify true to quote all cells on export, then post-process
839             # the result to un-quote obviously safe cells (for yet more overhead).
840             # Option #2: Specify false, and assume the resulting CSV will never
841             # be imported into a spreadsheet except via us, and we pre-scan
842             # the data to generate {col_formats} so will usually be safe.
843             # 5/30/23: Switching to Option #2...
844             .",false"
845             # Token 8: on output: true to store number as numbers; false to
846             # store number cells as text. No UI equivalent.
847             .",true" # default: documented as true (for export) BUT IS NOT!
848             # Token 9: "Save cell contents as shown"
849             # Generally we DO NOT want this because things like dates
850             # can be formatted many different ways.
851             ##.",".($opts->{raw_values} ? "false" : "true")
852             .",false"
853             # Token 10: "Export cell formulas"
854             .",false"
855             # Token 11: not used for export
856             .","
857             # Token 12: (LO 7.2+) sheet selections:
858             # 0 or absent => the "first" sheet
859             # 1-N => the Nth sheet (arrgh, can not specify name!!)
860             # -1 => export all sheets to files named filebasenamne.Sheetname.csv
861             .",".($opts->{allsheets} ? -1 :
862 0 0 0     0 $opts->{sheetname} ? die("add named-sheet support here") :
    0 0        
863             0)
864             # Token 13: Not used for export
865             .","
866             # Token 14: true to include BOM in the result
867             #.","
868             }
869             else {
870             undef
871 0         0 }
872             };
873              
874             # We can only control the output directory path, not the name of
875             # an individual result file. If $dst is a directory then the result
876             # could theoretically output into it directly, but instead we always
877             # output to an ephemeral temp directory and then move the results to $dst
878             #
879             # With 'allsheets' the resulting files must be renamed to conform to our
880             # external API (namely SHEETNAME.csv).
881             #
882             # ERROR DETECTION: As of LO 7.5 we always get zero exist status and the
883             # only way to detect errors is to notice that no files were written.
884             # https://bugs.documentfoundation.org/show_bug.cgi?id=155415
885             #
886 0         0 my $tdir = Path::Tiny->tempdir(path($dst)->basename."_XXXXX");
887             # will be deleted when $dirpath goes out of scope
888              
889             my @cmd = ($prog,
890             "--headless", "--invisible",
891             "--nolockcheck", "--norestore",
892             "--view", # open read-only in case can't create lockfile
893             $ifilter ? ("--infilter=$ifilter") : (),
894             "--convert-to",
895 0 0       0 $opts->{cvt_to}.($ofilter ? ":$ofilter" : ""),
    0          
896             "--outdir", $tdir->canonpath,
897             path($src)->canonpath);
898              
899 0 0       0 unless ($debug) {
900 0         0 $opts->{suppress_stdout} = 1;
901             #$opts->{suppress_stderr} = 1;
902             }
903              
904 0         0 my $cmdstatus = _runcmd($opts, @cmd);
905              
906 0 0       0 if ($cmdstatus != 0) {
907             # This should never happen, see ERROR DETECTION above
908             confess sprintf("($$) UNEXPECTED FAILURE, wstat=0x%04x\n",$cmdstatus),
909             " converting '$opts->{inpath}' to $opts->{cvt_to}\n",
910 0         0 " Command was: ",join(" ",map{qsh} @cmd);
  0         0  
911             }
912              
913 0         0 my @result_files = path($tdir)->children;
914 0 0       0 btw dvis '>> @result_files' if $debug;
915 0 0       0 if (@result_files == 0) {
916 0 0       0 croak qsh($src)." is missing or unreadable\n", "cmd: @cmd\n"
917             unless -r $src;
918 0         0 croak "Something went wrong, ",path($prog)->basename," produced no output\n",
919             "cmd: @cmd\n"
920             }
921              
922 0 0       0 if ($opts->{allsheets}) {
923             # Rename files to match our API (omit the spreadsheetbasename- prefix)
924 0         0 foreach (@result_files) {
925 0         0 my $dir = $_->parent; # Like dirname but including Volume:
926 0         0 my $base = $_->basename;
927 0 0       0 (my $newbase = $base) =~ s/^\Q$opts->{ifbase}\E-// or oops dvis '$base $opts';
928 0         0 my $newpath = $dir->child($newbase)->canonpath;
929 0         0 my $oldpath = path($_)->canonpath;
930 0 0       0 btw ">> Renaming $oldpath -> $newbase\n" if $debug;
931 0 0       0 rename ($oldpath, $newpath) or oops "$!";
932 0         0 $_ = $newpath; # update @result_files
933             }
934             }
935              
936             # Move the results to $dst
937 0 0       0 if (-e $dst) {
938 0 0       0 croak "$dst must be a directory if it pre-exists\n" unless -d $dst;
939 0 0       0 btw ">> Moving results -> $dst\n" if $debug;
940 0         0 foreach (@result_files) {
941 0 0       0 btw ">>> move $_ -> $dst" if $debug;
942 0         0 File::Copy::move($_, $dst)
943             }
944 0 0       0 btw ">> Now $dst contains: ",avis($dst->children) if $debug;
945             } else {
946 0 0       0 if ($opts->{allsheets}) {
947 0 0       0 btw ">> dirmove $tdir -> $dst\n" if $debug;
948 0 0       0 rename($tdir, $dst) or File::Copy::dirmove($tdir, $dst);
949             } else {
950 0 0       0 croak "Expecting only one result file, not @result_files"
951             if @result_files > 1;
952 0 0       0 btw ">> move $result_files[0] -> $dst\n" if $debug;
953 0         0 File::Copy::move($result_files[0], $dst);
954             }
955             }
956             }#_convert_using_openlibre
957              
958             sub _convert_using_ssconvert($$$) {
959 0     0   0 my ($opts, $src, $dst) = @_;
960 0         0 confess "Deprecated with extreme prejudice"; # no longer supported
961             ##
962             ## foreach (qw/inpath cvt_to /)
963             ## { oops "missing opts->{$_}" unless exists $opts->{$_} }
964             ##
965             ## my $eff_outpath = $opts->{outpath};
966             ## if (my $prog=which("ssconvert")) {
967             ## my $enc = _get_encodings_from_opts($opts);
968             ## $enc //= "UTF-8"; # default
969             ## my @options;
970             ## if ($opts->{cvt_to} eq "csv") {
971             ## push @options, '--export-type=Gnumeric_stf:stf_assistant';
972             ## my @dashO_terms = ("format=preserve", "transliterate-mode=escape");
973             ## push @dashO_terms, "charset='${enc}'" if defined($enc);
974             ## if ($opts->{sheetname}) {
975             ## push @dashO_terms, "sheet='$opts->{sheetname}'";
976             ## }
977             ## if ($opts->{allsheets}) {
978             ## #If both {allsheets} and {sheetname} are specified, only a single
979             ## # .csv file will be in the output directory
980             ## croak "'allsheets' option: 'outpath' must specify an existing directory"
981             ## unless -d $eff_outpath;
982             ## $eff_outpath = catfile($eff_outpath, "%s.csv");
983             ## push @options, "--export-file-per-sheet";
984             ## }
985             ## elsif ($opts->{sheetname}) {
986             ## # handled above
987             ## }
988             ## else {
989             ## # A backwards-incompatible change to ssconvert stopped extracting
990             ## # the "current" sheet by default; now all sheets are concatenated!
991             ## # See https://gitlab.gnome.org/GNOME/gnumeric/issues/461
992             ## # ssconvert verison 1.12.45 supports a new "-O active-sheet=y" option
993             ## ## PORTABILITY BUG: Redirection syntax will not work on windows
994             ## my ($ssver) = (qx/ssconvert --version 2>&1/ =~ /ssconvert version '?(\d[\d\.]*)/);
995             ## if (version::is_lax($ssver) && version->parse($ssver) >= v1.12.45) {
996             ## push @dashO_terms, "active-sheet=y";
997             ## } else {
998             ## croak("Due to an ssconvert bug, a sheetname must be given.\n",
999             ## "(for more information, see comment at ",__FILE__,
1000             ## " near line ", (__LINE__-10), ")\n");
1001             ## }
1002             ## }
1003             ## push @options, '-O', join(" ",@dashO_terms);
1004             ## }
1005             ## elsif ($opts->{cvt_to} eq 'xlsx') {
1006             ## @options = ('--export-type=Gnumeric_Excel:xlsx2');
1007             ## }
1008             ## elsif ($opts->{cvt_to} eq 'xls') {
1009             ## @options = ('--export-type=Gnumeric_Excel:excel_biff8'); # M'soft Excel 97/2000/XP
1010             ## }
1011             ## elsif ($opts->{cvt_to} =~ /^od/) {
1012             ## @options = ('--export-type=Gnumeric_OpenCalc:odf');
1013             ## }
1014             ## elsif ($eff_outpath =~ /\.[a-z]{3,4}$/) {
1015             ## # let ssconvert choose based on the output file suffix
1016             ## }
1017             ## else {
1018             ## croak "unrecognized cvt_to='".u($opts->{cvt_to})."' and no outpath suffix";
1019             ## }
1020             ##
1021             ## my $eff_inpath = $opts->{inpath};
1022             ## if ($opts->{sheetname} && $opts->{inpath} =~ /.csv$/i) {
1023             ## # Control generated sheet name by using a symlink to the input file
1024             ## # See http://stackoverflow.com/questions/22550050/how-to-convert-csv-to-xls-with-ssconvert
1025             ## my $td = catdir($tempdir // oops, "Gnumeric");
1026             ## remove_tree($td); mkdir($td) or die $!;
1027             ## $eff_inpath = catfile($td, $opts->{sheetname});
1028             ## symlink $opts->{inpath}, $eff_inpath or die $!;
1029             ## fixme: handle unimplmented or no-perms symlink failures
1030             ## }
1031             ## my @cmd = ($prog, @options, $eff_inpath, $eff_outpath);
1032             ##
1033             ## my $suppress_stderr = !$opts->{debug};
1034             ## if (0 != _runcmd({%$opts, suppress_stderr => $suppress_stderr}, @cmd)) {
1035             ## # Before showing a complicated ssconvert failure with backtrace,
1036             ## # check to see if the problem is just a non-existent input file
1037             ## { open my $dummy_fh, "<", $eff_inpath or croak "$eff_inpath : $!"; }
1038             ## my $failmsg = "($$) Conversion of '$opts->{inpath}' to $eff_outpath failed\n"."cmd: ".qshlist(@cmd)."\n";
1039             ## if ($suppress_stderr) { # repeat showing all output
1040             ## if (0 == _runcmd({%$opts, suppress_stderr => 0}, @cmd)) {
1041             ## warn "Surprise! Command failed the first time but succeeded on 2nd try!\n";
1042             ## }
1043             ## croak $failmsg;
1044             ## }
1045             ## }
1046             ## elsif (! -e $opts->{outpath}) {
1047             ## croak "($$) Conversion SILENTLY failed\n(using $prog)\n",
1048             ## " cmd: ",qshlist(@cmd),"\n"
1049             ## ;
1050             ## }
1051             ## return ($enc)
1052             ## }
1053             ## else {
1054             ## croak "Can not find ssconvert to convert '$opts->{inpath}' to $opts->{cvt_to}\n",
1055             ## "To install ssconvert: sudo apt-get install gnumeric\n";
1056             ## }
1057             }
1058              
1059             # Extracts |||SHEETNAME or !SHEETNAME or [SHEETNAME] from a path+sheet
1060             # specification, if present.
1061             # (Lots of historical compatibility issues...)
1062             # In scalar context, returns SHEETNAME or undef.
1063             # INTERNAL USE ONLY: In array context, returns (filepath, SHEETNAME or undef)
1064             sub sheetname_from_spec($) {
1065 6     6 1 9 my $spec = shift;
1066 6         11 local $_;
1067 6         17 my $p = path($spec);
1068 6         317 my $parent = $p->parent;
1069 6         413 my ($base,$sn) = ($p->basename =~ /^(.*) (?| \|\|\|([^\!\[\|]+)$
1070             | \!([^\!\[\|]+)$
1071             | \[([^\[\]]+)\]$
1072             )/x);
1073 6 100 33     102 wantarray ? ($parent->child($base//$p->basename)->stringify, $sn) : $sn
1074             }
1075             sub filepath_from_spec($) {
1076 2     2 1 6 my ($path, undef) = sheetname_from_spec($_[0]);
1077 2         99 $path
1078             }
1079             #Tester
1080             #foreach ("", "/a!b/c", "/a!b/c!sheet1", "/a/b/c[sheet2]", "/a/b/c[bozo]d.xls",
1081             # ) {
1082             # foreach($_, basename($_)) {
1083             # my ($fp,$sn) = sheetname_from_spec($_);
1084             # use open ':std', ':locale';
1085             # warn ivis '# $_ → $fp $sn\n';
1086             # my $sn2 = sheetname_from_spec($_);
1087             # die "bug" unless u($sn) eq u($sn2);
1088             # }
1089             #}
1090             #die "TEX";
1091              
1092             # Construct a file + sheetname spec in the preferred form for humans to read
1093             # If sheetname is undef, just return the file path
1094             sub form_spec_with_sheetname($$) {
1095 2     2 1 7 my ($filespec, $sheetname) = @_;
1096 2         7 my $embedded_sheetname = sheetname_from_spec($filespec);
1097 2 0 33     7 croak "conflicting embedded and separate sheetnames given"
      33        
1098             if $embedded_sheetname && $sheetname && $embedded_sheetname ne $sheetname;
1099 2   33     11 $sheetname //= $embedded_sheetname;
1100 2         6 my $filepath = filepath_from_spec($filespec);
1101 2 50       14 $sheetname ? "${filepath}[${sheetname}]" : $filepath
1102             #$sheetname ? "${filepath}|||${sheetname}" : $filepath
1103             }
1104              
1105             our $default_input_encodings = "UTF-8,windows-1252,UTF-16BE,UTF-16LE";
1106             our $default_output_encoding = "UTF-8";
1107              
1108             # Return digested %opts setting
1109             # sheetname, inpath_sans_sheet (as Path::Tiny), encoding or default
1110             sub _process_args($;@) {
1111 2 50   2   8 confess "fix obsolete call to pass linearized options"
1112             if ref($_[0]) eq "HASH";
1113 2 50       7 my $leading_inpath = ( scalar(@_) % 2 == 1 ? shift(@_) : undef );
1114 2         12 my %opts = (
1115             cvt_from => "",
1116             cvt_to => "",
1117             @_,
1118             #verbose => 999, tempdir => "/tmp/J",
1119             );
1120 2 50       6 if (defined $opts{inpath}) {
1121 0 0       0 croak "Initial INPATH arg specified as well as inpath => ... in options"
1122             if defined $leading_inpath;
1123             } else {
1124 2   33     9 $opts{inpath} = $leading_inpath // croak "No inpath was specified";
1125             }
1126 2 50       7 $opts{verbose}=1 if $opts{debug};
1127              
1128             # inpath or outpath may have "!sheetname" appended (or alternate syntaxes),
1129             # but may exist only if a separate 'sheetname' option is not specified.
1130             # Input and output can not both be spreadsheets; one must be a CSV.
1131 2 50       6 if (exists($opts{sheet})) {
1132 0         0 carp "WARNING: Deprecated 'sheet' option found (use 'sheetname' instead)\n";
1133 0 0       0 croak "Both {sheet} and {sheetname} specified" if exists $opts{sheetname};
1134 0         0 $opts{sheetname} = delete $opts{sheet};
1135             }
1136 2         4 { my ($path_sans_sheet, $sheetname, $key);
  2         13  
1137 2         5 for my $thiskey ('inpath', 'outpath') {
1138 4   100     18 my $spec = $opts{$thiskey} || next;
1139 2         11 my ($pssn, $sn) = sheetname_from_spec($spec);
1140 2 50       113 if (defined $sn) {
1141 0 0       0 croak "A sheetname is embeeded in both ",
1142             "'$thiskey' ($opts{$thiskey}) and '$key' ($opts{$key})\n"
1143             if $sheetname;
1144 0         0 ($path_sans_sheet, $sheetname, $key) = ($pssn, $sn, $thiskey);
1145             }
1146             }
1147 2 50       8 if ($opts{sheetname}) {
    50          
1148             croak "'sheetname' option conflicts with embedded sheet name\n",
1149             " sheetname => ", qsh($opts{sheetname}),"\n",
1150             " $key => ", qsh($opts{$key}),"\n"
1151 0 0 0     0 if defined($sheetname) && $sheetname ne $opts{sheetname};
1152             }
1153             elsif (defined $sheetname) {
1154             btw "(extracted sheet name \"$sheetname\" from $key)\n"
1155 0 0       0 if $opts{verbose};
1156 0         0 $opts{sheetname} = $sheetname;
1157             }
1158             $opts{inpath_sans_sheet} = path(
1159             ($key && $key eq 'inpath') ? $path_sans_sheet : $opts{inpath}
1160 2 50 33     10 );
1161             }
1162             # Input file basename sans any .suffix
1163 2         101 $opts{ifbase} = $opts{inpath_sans_sheet}->basename(qr/\.[^.]+/);
1164              
1165 2         124 %opts
1166             }#_process_args
1167              
1168             # Extract the of encoding(s) specified in an iolayers string
1169             # Parse iolayers string, returning ($prefix,[encodings],$suffix)
1170             # For example from ":raw:encodings(utf8,windows-1252):zz" the output
1171             # would be (":raw", [:utf8","windows-1252"], ":zz")
1172             sub _parse_iolayers($) {
1173 0   0 0   0 local $_ = (shift) // "";
1174 0 0       0 /\A(<prefix>.*?)
1175             (<encspec>:utf8|:encoding\(([^\)]+)\))
1176             (<suffix>.*?)\z/ or croak "Invalid iolayers spec '$_'\n";
1177 0         0 (my $prefix, $_, my $suffix) = ($+{prefix}, $+{encspec}, $+{suffix});
1178 0 0 0     0 /^:(utf8)$/ || /^:encoding\(([^\)]+)\)$/ or oops($_);
1179 0         0 my $enclist = [split /,/, $1]; # comma,separated,list,of,encodings
1180 0         0 ($prefix, $enclist, $suffix);
1181             }
1182              
1183             # Detect cvt_from and cvt_to from filenames, or peeking at the data.
1184             # If input is CSV, detect encoding, separator and quote characters;
1185             # add quotes to values with leading zeroes (e.g. Zip codes) which would
1186             # otherwise be corrupted by being read as numbers instead of text strings.
1187             # The modified data is written to a temp file
1188             # Set default output_encoding if not specified
1189             # RETURNS: The effective input path, either inpath_sans_sheet or a tempfile
1190             sub _determine_enc_tofrom($) {
1191             my $opts = shift;
1192             my $debug = $opts->{debug};
1193             # Skip to ==BODY== below
1194              
1195             my sub determine_input_encoding($) {
1196             my $r2octets = shift;
1197             # If user specified one encoding, use it; if user specified list, try them.
1198             # If user did not specify, the default is a list to try.
1199             $opts->{input_encoding} //= $default_input_encodings;
1200             my @enclist = split m#,#, $opts->{input_encoding};
1201             return
1202             if @enclist == 1;
1203             $$r2octets //= $opts->{inpath_sans_sheet}->slurp_raw;
1204             for my $enc (@enclist) {
1205             eval { decode($enc, $$r2octets, Encode::FB_CROAK|Encode::LEAVE_SRC) };
1206             if ($@) {
1207             btw "Input encoding '$enc' did not work...($@)\n" if $debug;
1208             next;
1209             }
1210             btw "Input encoding '$enc' seems to work.\n" if $debug;
1211             @enclist = ($enc);
1212             last
1213             }
1214             #croak "Could not detect encoding of $opts->{inpath_sans_sheet}\n"
1215             confess "Could not detect encoding of $opts->{inpath_sans_sheet}\n"
1216             if @enclist > 1;
1217             $opts->{input_encoding} = $enclist[0];
1218             } #determine_input_encoding
1219              
1220             my sub readparse_csv($@) {
1221             my $fh = shift;
1222             my %csvopts = (
1223             @sane_CSV_read_options,
1224             defined($opts->{quote_char}) ? (quote_char=>$opts->{quote_char}) : (),
1225             defined($opts->{sep_char}) ? (sep_char=>$opts->{sep_char}) : (),
1226             auto_diag => 2, # throw on error
1227             @_
1228             );
1229             $csvopts{escape_char} = $csvopts{quote_char}; # must always be the same
1230              
1231             my $csv = Text::CSV->new (\%csvopts)
1232             or croak "Text::CSV->new: ", Text::CSV->error_diag(),
1233             dvis('\n## %csvopts\n');
1234             my @rows;
1235             while (my $F = $csv->getline( $fh )) {
1236             push(@rows, $F);
1237             }
1238             \@rows
1239             }
1240              
1241             my sub open_input($) {
1242             my $r2octets = shift;
1243             oops unless $opts->{input_encoding};
1244             my $fh;
1245             my $pathish = defined($$r2octets)
1246             ? \$$r2octets : $opts->{inpath_sans_sheet};
1247 1     1   7 open($fh, "<:encoding($opts->{input_encoding})", $pathish)
  1         3  
  1         7  
1248             or die "$pathish : $!";
1249             $fh
1250             }
1251              
1252             my sub determine_csv_q_sep($$) {
1253             my ($r2octets, $r2rows) = @_;
1254             return
1255             if defined($opts->{quote_char}) && defined($opts->{sep_char});
1256              
1257             my $fh = open_input($r2octets);
1258              
1259             # my $chars;
1260             # if (defined($$r2octets)
1261             # $chars = decode($opts->{input_encoding},$$r2octets,Encode::FB_CROAK);
1262             # }
1263              
1264             # Try combinations starting with the most-common '"' and ',' while
1265             # parsing the file for unsafe unquoted values (throws on syntax error).
1266             # The expectation is that the first try usually succeeds
1267             Q:
1268             for my $q (defined($opts->{quote_char})
1269             ? ($opts->{quote_char}) : ("\"", "'")) {
1270             my $found_q;
1271             SEP:
1272             for my $sep (defined($opts->{sep_char})
1273             ? ($opts->{sep_char}) : (",","\t")) {
1274             btw dvis '--- TRYING $q $sep ---' if $debug;
1275             # # Preliminary check for an illegal use of the quote char
1276             # if (defined($chars)
1277             # && $chars =~ /[^${q}${sep}\x{0D}\x{0A}]
1278             # ${q}
1279             # (?=[^${q}${sep}\x{0D}\x{0A}] | \z)/gx) {
1280             # btw ivis '>>>quote_char CAN NOT BE $q with sep=$sep because q exists mid-field before pos ${\(pos($chars))}'
1281             # if $debug;
1282             # next SEP
1283             # }
1284             $$r2rows = eval{ readparse_csv($fh, quote_char=>$q, sep_char=>$sep) };
1285             if ($@ eq "") {
1286             warn ivis '>> Detected quote_char=$q sep_char=$sep\n' if $debug;
1287             $opts->{quote_char} = $q;
1288             $opts->{sep_char} = $sep;
1289             last Q;
1290             }
1291             warn vis '$@\nq=$q sep=$sep did not work...\n' if $debug;
1292             seek $fh, 0, SEEK_SET;
1293             }
1294             }
1295             unless (defined($$r2rows)) {
1296             confess "Input file is not valid CSV (or we have a bug)\n"
1297             }
1298             }#determine_csv_q_sep
1299              
1300             my sub determine_csv_col_formats($$) {
1301             my ($r2octets, $r2rows) = @_;
1302             return
1303             if defined $opts->{col_formats};
1304             $$r2rows //= do{
1305             my $fh = open_input($r2octets);
1306             readparse_csv($fh);
1307             };
1308             my $max_cols = 0; for my $row (@{ $$r2rows }) { $max_cols = @$row if $max_cols < @$row }
1309             state $curr_yy = (localtime(time))[5];
1310             my @col_formats;
1311             my sub recognized($$$$;$) {
1312             my ($cx, $rx, $thing, $format, $as_msg) = @_;
1313             $col_formats[$cx] = $format;
1314             return unless $debug;
1315             $as_msg //= " as ".vis($col_formats[$cx])." format";
1316             if (length($thing) > 35) { $thing = substr($thing,0,32)."..."; }
1317             @_ = ("Recognized ",$thing," in ", cxrx2sheetaddr($cx,$rx), $as_msg);
1318             goto &btw
1319             }
1320             CX:
1321             for my $cx (0..$max_cols-1) {
1322             RX:
1323             for my $rx (0..$#{$$r2rows}) {
1324             my $row = $$r2rows->[$rx];
1325             next if $cx > $#$row; # row has fewer columns than others
1326             for ($row->[$cx]) {
1327             # recognize obvious Y/M/D or M/D/Y or D/M/Y date forms
1328             if (m#\b(?<y>(?:[12]\d)?\d\d)/(?<m>\d\d)/(?<d>\d\d)\b#) {
1329             if ($+{d} > 12 && $+{d} <= 31 && $+{m} >= 1 && $+{m} <= 12
1330             && ($+{y} < 100 || $+{y} >= 1000)) {
1331             recognized($cx,$rx,$_,"YY/MM/DD");
1332             next CX;
1333             }
1334             # If ambiguous YYYY/??/?? we can still assume it is a date and not text
1335             if (length($+{y})==4) {
1336             #recognized($cx,$rx,$_,""," as some kind of date, fmt unknown");
1337             next RX;
1338             }
1339             }
1340             if (m#\b(?<m>\d\d)/(?<d>\d\d)/(?<y>(?:[12]\d)?\d\d)\b#) {
1341             if ($+{y} < 100 || $+{y} >= 1000) {
1342             if ($+{d} > 12 && $+{d} <= 31 && $+{m} >= 1 && $+{m} <= 12) {
1343             recognized($cx,$rx,$_,"MM/DD/YY");
1344             next CX
1345             }
1346             elsif ($+{m} > 12 && $+{m} <= 31 && $+{d} >= 1 && $+{d} <= 12) {
1347             recognized($cx,$rx,$_,"DD/MM/YY");
1348             next CX
1349             }
1350             }
1351             # If ambiguous ??/??/YYYY we can still assume it is a date and not text
1352             if (length($+{y})==4) {
1353             #recognized($cx,$rx,$_,""," as some kind of date, fmt unknown");
1354             next RX;
1355             }
1356             }
1357             # Things to force to be read as text fields:
1358             # 1. Leading zeroes
1359             # 2. Leading ascii minus (\x{2D}) rather than math minus \N{U+2212}.
1360             # This prevents conversion to the Unicode math minus when LO
1361             # reads the CSV. The assumption is that if the input has an ascii
1362             # minus then the original spreadsheet format was "text" not
1363             # numeric.
1364             if (/^[\x{2D}0]/) {
1365             recognized($cx,$rx,$_,"text");
1366             next CX;
1367             }
1368             }
1369             }
1370             }
1371             $opts->{col_formats} = \@col_formats;
1372             }#determine_csv_col_formats
1373              
1374             # ==BODY==
1375             unless ($opts->{cvt_to}) {
1376             if ($opts->{outpath} && $opts->{outpath} =~ /\.([^.]+)$/) {
1377             $opts->{cvt_to} = $1;
1378             }
1379             croak "'cvt_to' was not specified and can not be intuited from 'outpath'"
1380             ,dvis('\n### $opts') ###TEMP
1381             unless $opts->{cvt_to};
1382             }
1383             unless ($opts->{cvt_from}) {
1384             if ($opts->{inpath_sans_sheet} =~ /\.([^.]+)$/) {
1385             $opts->{cvt_from} = $1;
1386             }
1387             }
1388             $opts->{cvt_from} =~ s/^\.txt$/.csv/i if $opts->{cvt_from};
1389              
1390             # Detect file format and, if CSV, encoding
1391             my ($octets, $rows);
1392             if (!$opts->{cvt_from} || $opts->{cvt_from} eq "csv") {
1393             determine_input_encoding(\$octets);
1394             }
1395             if (!$opts->{cvt_from}) {
1396             # Detect the file format by looking at the data. Actually, we only
1397             # support CSV in this case, so this is just a (half-baked) sanity check.
1398             eval {
1399             determine_csv_q_sep(\$octets, \$rows);
1400             if (!$opts->{cvt_from}) {
1401             $rows //= do{
1402             my $fh = open_input(\$octets);
1403             readparse_csv($fh);
1404             };
1405             }
1406             };
1407             if ($@ eq "") {
1408             warn "> Detected $opts->{inpath_sans_sheet} as a seemingly-valid CSV\n"
1409             if $debug;
1410             $opts->{cvt_from} = "csv";
1411             } else {
1412             croak "Can not detect what kind of file ",qsh($opts->{inpath})," is\n";
1413             }
1414             }
1415              
1416             if ($opts->{cvt_from} eq "csv") {
1417             determine_csv_col_formats(\$octets, \$rows);
1418             } else {
1419             oops if defined($octets) or defined($rows);
1420             }
1421              
1422             # Set default ouput_encoding if not specified
1423             $opts->{output_encoding} //= $default_output_encoding
1424             if $opts->{cvt_to} eq "csv";
1425              
1426             }#_determine_enc_tofrom
1427              
1428             sub _tool_extract_all_csvs($$) {
1429 0     0   0 my ($opts, $destdir) = @_;
1430              
1431 0         0 _get_exclusive_lock($opts);
1432 0     0   0 scope_guard { _release_lock($opts); };
  0         0  
1433              
1434 0         0 delete local $opts->{sheetname};
1435 0         0 local $opts->{allsheets} = 1;
1436 0 0       0 if (_openlibre_supports_allsheets()) {
    0          
1437 0         0 _convert_using_openlibre($opts, $opts->{inpath_sans_sheet}, $destdir);
1438             }
1439             elsif (_ssconvert_supports_allsheets()) {
1440 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destdir);
1441             }
1442 0         0 else { confess "Can't extract 'allsheets'. Please install LibreOffice 7.2 or newer" }
1443             }
1444              
1445             sub _tool_can_extract_csv_byname() {
1446 0 0   0   0 _openlibre_supports_named_sheet() || _ssconvert_supports_named_sheet()
1447             }
1448             sub _tool_extract_one_csv($$) {
1449 0     0   0 my ($opts, $destpath) = @_;
1450              
1451 0         0 _get_exclusive_lock($opts);
1452 0     0   0 scope_guard { _release_lock($opts); };
  0         0  
1453              
1454 0 0       0 if (_openlibre_features()->{available}) {
1455 0 0 0     0 oops if $opts->{sheetname} && !_openlibre_supports_named_sheet();
1456 0         0 _convert_using_openlibre($opts, $opts->{inpath_sans_sheet}, $destpath);
1457             } else {
1458 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destpath);
1459             }
1460             }
1461             sub _tool_can_extract_current_sheet() {
1462             _openlibre_features()->{available} || _ssconvert_features()->{available}
1463 0 0   0   0 }
1464              
1465             sub _tool_write_spreadsheet($$) {
1466 0     0   0 my ($opts, $destpath) = @_;
1467              
1468 0         0 _get_exclusive_lock($opts);
1469 0     0   0 scope_guard { _release_lock($opts); };
  0         0  
1470              
1471             # ssconvert allows specifying the sheetname when importing a csv but not LO
1472 0 0 0     0 if ($opts->{sheetname} && _ssconvert_supports_writing($opts->{cvt_to})) {
    0          
    0          
1473 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destpath);
1474             }
1475             elsif (_openlibre_supports_writing($opts->{cvt_to})) {
1476 0 0       0 if ($opts->{sheetname}) {
1477 0         0 carp "WARNING: Sheet name when creating a spreadsheet will be ignored\n";
1478 0         0 delete $opts->{sheetname};
1479             }
1480 0         0 _convert_using_openlibre($opts, $opts->{inpath_sans_sheet}, $destpath);
1481             }
1482             elsif (_ssconvert_supports_writing($opts->{cvt_to})) {
1483 0         0 _convert_using_ssconvert($opts, $opts->{inpath_sans_sheet}, $destpath);
1484             }
1485 0         0 else { croak "Can't create $opts->{cvt_to} spreadsheets. Please install LibreOffice 7.2 or newer" }
1486             }
1487              
1488              
1489             # Extract CSVs for every sheet into {outpath} (setting to tmpdir if not preset).
1490             # If cached CSVs are available they are moved into {outpath}/ .
1491             sub _extract_all_csvs($) {
1492 0     0   0 my ($opts) = @_;
1493 0         0 my $outpath = _final_outpath($opts);
1494 0         0 $outpath->mkpath; # nop if exists, croaks if conflicts with file
1495              
1496 0         0 _tool_extract_all_csvs($opts, $outpath); #logs
1497             }
1498              
1499              
1500             # Extract a single sheet into a CSV at {outpath} (defaulting to temp file).
1501             # If a cached CSV is available it is moved to {outpath}.
1502             sub _extract_one_csv($) {
1503             my ($opts) = @_;
1504             my $cachedirpath = _cachedir($opts);
1505              
1506             my sub _fill_csv_cache() {
1507             $cachedirpath->remove_tree;
1508             $cachedirpath->mkpath;
1509             { #local $opts->{verbose} = 0;
1510             #local $opts->{debug} = 0;
1511             _tool_extract_all_csvs($opts, $cachedirpath);
1512             }
1513             }
1514              
1515             my $outpath = _final_outpath($opts);
1516             $outpath->remove unless -d $outpath;
1517              
1518             if (defined($opts->{sheetname})) {
1519             my $fname = $opts->{sheetname}.".csv";
1520             my $cached_path = $cachedirpath->child($fname);
1521             if (! -e $cached_path) {
1522             if (_tool_can_extract_csv_byname()) {
1523             _tool_extract_one_csv($opts, $outpath); #logs
1524             return
1525             }
1526             warn ">>Emulating extract-by-name by extracting all csvs into cache...\n"
1527             if $opts->{debug};
1528             _fill_csv_cache;
1529             }
1530             croak "Sheet '$opts->{sheetname}' does not exist in $opts->{inpath_sans_sheet}\n"
1531             unless -e $cached_path;
1532             warn "> Moving cached $fname to $outpath\n" if $opts->{verbose};
1533             File::Copy::move($cached_path, $outpath);
1534             return
1535             }
1536             elsif (_tool_can_extract_current_sheet()) {
1537             _tool_extract_one_csv($opts, $outpath); #logs
1538             return
1539             }
1540             else {
1541             _fill_csv_cache;
1542             my @children = $cachedirpath->children;
1543             if (@children == 0) {
1544             croak "$opts->{inpath_sans_sheet} appears to have zero sheets!\n"
1545             }
1546             elsif (@children == 1) {
1547             my $fname = $children[0]->basename;
1548             my $cached_path = $cachedirpath->child($fname);
1549             warn "> Moving cached $fname to $outpath\n" if $opts->{verbose};
1550             File::Copy::move($cached_path, $outpath);
1551             return
1552             }
1553             else {
1554             croak "$opts->{inpath_sans_sheet} contains multiple sheets; you must specify a sheetname\n"
1555             }
1556             }
1557             }
1558             sub _write_spreadsheet($) {
1559 0     0   0 my ($opts) = @_;
1560              
1561 0         0 my $outpath = _final_outpath($opts);
1562 0 0       0 $outpath->remove unless -d $outpath;
1563              
1564 0         0 _tool_write_spreadsheet($opts, $outpath);
1565             }
1566              
1567             # If {outpath} is not set, set it to a unique output path in $tempdir
1568             # Always returns {outpath} as a Path::Tiny object.
1569             sub _final_outpath($) {
1570 0     0   0 my $opts = shift;
1571 0 0       0 if (defined $opts->{outpath}) {
1572 0         0 return path($opts->{outpath});
1573             } else {
1574 0 0       0 my $suf = $opts->{cvt_to} unless $opts->{allsheets};
1575             return(
1576 0         0 ($opts->{outpath}=_path_under_tempdir($opts, suf=>$suf))
1577             );
1578             }
1579             }
1580              
1581             sub convert_spreadsheet(@) {
1582             # Set inpath_sans_sheet, sheetname, ifbase, etc.
1583 2     2 1 7 my %opts = &_process_args;
1584 2 50       25 btw dvis('>>> convert_spreadsheet %opts\n') if $opts{debug};
1585 2         19 my %input_opts = %opts;
1586              
1587 2         12 _create_tempdir_if_needed(\%opts);
1588              
1589             # intuit cvt_from & cvt_to, detect encoding, and pre-process .csv input
1590             # if needed to avoid corruption of leading zeroes.
1591 2         39 _determine_enc_tofrom(\%opts);
1592              
1593 2         6 my $input_enc = $opts{input_encoding};
1594 2         5 my $output_enc = $opts{output_encoding};
1595              
1596             croak "Either input or output must be 'csv'\n"
1597 2 50 33     8 unless $opts{cvt_from} eq 'csv' || $opts{cvt_to} eq 'csv';
1598 2 50       6 if ($opts{allsheets}) {
1599             croak "'allsheets' is allowed only with cvt_to => 'csv'"
1600 0 0 0     0 unless ($opts{cvt_to}//"") eq "csv";
1601             croak "With 'allsheets', a sheet name may not be specified\n"
1602 0 0       0 if $opts{sheetname};
1603             croak "With 'allsheets', 'outpath' must be a directory if it exists\n"
1604 0 0 0     0 if $opts{outpath} && -e $opts{outpath} && ! -d _;
      0        
1605             }
1606              
1607 2         3 my $done;
1608 2 50       7 if ($opts{cvt_from} eq $opts{cvt_to}) { # csv to csv
1609 2 50       7 if (!$opts{allsheets}) {
1610 2 50       4 if ($input_enc ne $output_enc) {
1611             # Special case #1: in & out are CSVs but different encodings.
1612             warn "> Transcoding csv: $input_enc -> $output_enc\n"
1613 0 0       0 if $opts{debug};
1614 0         0 my $octets = $opts{inpath_sans_sheet}->slurp_raw;
1615 0         0 my $chars = decode($input_enc, $octets, Encode::FB_CROAK);
1616 0         0 $octets = encode($output_enc, $chars, Encode::FB_CROAK);
1617 0         0 path(_final_outpath(\%opts))->spew_raw($octets);
1618 0         0 $done = 1;
1619             } else {
1620             # Special case #2: No conversion is needed: Just copy the file or
1621             # return the input path itself as the output
1622 2 50       6 if (defined $opts{outpath}) {
1623             warn "> No conversion needed, copying into ",qsh($opts{outpath}),"\n"
1624 0 0       0 if $opts{verbose};
1625 0         0 $opts{inpath_sans_sheet}->copy($opts{outpath});
1626 0         0 $done = 1;
1627             } else {
1628 2         5 $opts{outpath} = $opts{inpath_sans_sheet};
1629             warn "> No conversion needed, returning ", qsh($opts{outpath}),"\n"
1630 2 50       6 if $opts{verbose};
1631 2         4 $done = 1;
1632             }
1633             }
1634             }
1635             else {
1636             # Special case #2: <allsheets> with input already a csv:
1637             # Leave a symlink to the input in the <outpath> directory.
1638 0 0       0 croak "transcoding not implemented in this situation"
1639             if ($input_enc ne $output_enc);
1640 0         0 my $outpath = path(_final_outpath(\%opts));
1641 0         0 $outpath->mkpath; # nop if exists, croaks if conflicts with file
1642 0         0 my $dest = $outpath->child( $opts{ifbase}.".csv" );
1643 0         0 my $inpath = $opts{inpath_sans_sheet};
1644 0         0 my $s = eval{ symlink($inpath, $dest) };
  0         0  
1645 0 0 0     0 if ($@ or !$s) { # symlink unimplmented or insufficient permissions
1646 0 0       0 btw dvis '>> $@' if $opts{debug};
1647             warn "> No conversion needed! Copying into ", qsh($dest),"\n"
1648 0 0       0 if $opts{verbose};
1649 0         0 $opts{inpath_sans_sheet}->copy($dest);
1650             } else {
1651             warn "> No conversion needed! Left symlink at ", qsh($dest),"\n"
1652 0 0       0 if $opts{verbose};
1653             }
1654 0         0 $done = 1;
1655             }
1656             }
1657 2 50       6 if (! $done) {
1658 0 0       0 if ($opts{allsheets}) {
1659 0         0 _extract_all_csvs(\%opts);
1660             }
1661             else {
1662             # Result will be a single file.
1663 0 0       0 if ($opts{cvt_to} eq "csv") {
1664 0         0 _extract_one_csv(\%opts);
1665             } else {
1666 0         0 _write_spreadsheet(\%opts);
1667             }
1668             }
1669             }
1670             my $result = {
1671             defined($output_enc) ? (encoding => $output_enc):(),
1672 8         43 (map{ my $v = $opts{$_};
1673 8 100       39 ($_ => (blessed($v) ? $v->stringify : $v))
1674 2 50       6 } grep{ defined $opts{$_} }
  10         23  
1675             qw/inpath_sans_sheet outpath cvt_from cvt_to sheetname/)
1676             };
1677             log_call [\%input_opts], [$result, \_fmt_outpath_contents($result)]
1678 2 50       8 if $opts{verbose};
1679              
1680 2         8 $result;
1681             }#convert_spreadsheet
1682              
1683             # Open as a CSV, intuiting input encoding, converting from spreadsheet if
1684             # necessary.
1685             #
1686             # :crlf translation is enabled on the resulting file handle, which converts
1687             # DOS CR,LF to \n while passing *nix bare LF through unmolested.
1688             #
1689             # Input argument(s) are the same as for convert_spreadsheet (except
1690             # outpath may not be specified).
1691             #
1692             # Returns a hash containing the file handle and other information.
1693             sub OpenAsCsv {
1694 2 50   2 1 16 my %opts = (
1695             (@_ == 1 ? (inpath => $_[0]) : (@_)),
1696             cvt_to => 'csv',
1697             );
1698             # TODO: Rename {path} to {inpath} in all usages and rm this cruft;
1699             carp "Obsolete OpenAsCsv usage: Change path to inpath\n"
1700 2 50 33     9 if exists($opts{path}) and !$opts{silent};
1701 2   33     12 $opts{inpath} //= delete $opts{path}; # be compatible with old API
1702              
1703 2         6 my $inpath = delete $opts{inpath};
1704 2 50       8 croak "OpenAsCsv: missing 'inpath' option\n" unless $inpath;
1705 2 50       16 croak "OpenAsCsv: outpath may not be specified\n" if $opts{outpath};
1706              
1707 2         12 my $h = convert_spreadsheet($inpath, %opts, verbose => $opts{debug});
1708 2 50       9 oops "sheetname key bug" if exists $h->{sheet};
1709              
1710 2   33     6 my $csvpath = $h->{outpath} // oops; # same as {inpath} if already a CSV
1711 2 50       81 open my $fh, "<", $csvpath or croak "$csvpath : $!\n";
1712 2 50       46 binmode $fh, ":crlf:encoding(".$h->{encoding}.")" or die "binmode:$!";
1713              
1714             my $r = {
1715             fh => $fh,
1716             csvpath => $csvpath,
1717             inpath => $inpath,
1718 2 100       134 (map{ exists($h->{$_}) ? ($_ => $h->{$_}) : () }
  10         31  
1719             qw/inpath_sans_sheet sheetname encoding tempdir raw_values/),
1720             };
1721              
1722 2         14 return $r;
1723             }
1724              
1725             1;
1726             __END__
1727              
1728             =pod
1729              
1730             =head1 NAME
1731              
1732             Spreadsheet::Edit::IO - convert between spreadsheet and csv files
1733              
1734             =head1 SYNOPSIS
1735              
1736             use Spreadsheet::Edit::IO qw/
1737             convert_spreadsheet OpenAsCsv
1738             cx2let let2cx
1739             @sane_CSV_read_options @sane_CSV_write_options/;
1740              
1741             # Open a CSV file or result of converting a sheet from a spreadsheet
1742             my $hash = OpenAsCsv("/path/to/spreadsheet.odt!Sheet1"); # single-arg form
1743             my $hash = OpenAsCsv(inpath => "/path/to/spreadsheet.odt",
1744             sheetname -> "Sheet1");
1745              
1746             print "Reading ",$hash->{csvpath}," with encoding ",$hash->{encoding},"\n";
1747             while (<$hash->{fh}>) { ... }
1748              
1749             # Convert CSV to spreadsheet in temp file (deleted at process exit)
1750             $hash = convert_spreadsheet(inpath => "mycsv.csv", cvt_to => "xlsx");
1751             print "Output is $hash->{outpath}\n"; # e.g. "/tmp/dwYT6qf/mycsv.xlsx"
1752              
1753             # Convert *all* sheets to CSV files in a temp directory
1754             $hash = convert_spreadsheet(inpath => "mywork.xls", cvt_to => "csv",
1755             allsheets => 1);
1756             opendir $dh, $hash->{outpath};
1757             while (readrir($h)) { say "Output csv file is $_" }
1758              
1759             # Transcode a CSV from windows-1252 to UTF-8
1760             convert_spreadsheet(
1761             inpath => "input.csv", input_encoding => 'windows-1252',
1762             outpath => "output.csv", output_encodoutg => 'UTF-8',
1763             );
1764              
1765             # Translate between 0-based column index and letter code (A, B, etc.)
1766             print cx2let(0); # "A"
1767             print let2cx("A"); # 0
1768             print cx2let(26); # "AA"
1769             print let2cx("ABC"); # 730
1770              
1771             # Extract components from "filepath!SHEETNAME" specifiers
1772             my $path = filepath_from_spec("/path/to/spreasheet.xls!Sheet1")
1773             my $sheetname = sheetname_from_spec("/path/to/spreasheet.xls!Sheet1")
1774              
1775             # Parse a csv file with sane options
1776             my $csv = Text::CSV->new({ @sane_CSV_read_options, eol => $hash->{eol} })
1777             or die "ERROR: ".Text::CSV->error_diag ();
1778             my @rows
1779             while (my $F = $csv->getline( $infh )) {
1780             push @rows, $F;
1781             }
1782             close $infh or die "Error reading ", $hash->csvpath(), ": $!";
1783              
1784             # Write a csv file with sane options
1785             my $ocsv = Text::CSV->new({ @sane_CSV_write_options })
1786             or die "ERROR: ".Text::CSV->error_diag ();
1787             open my $outfh, ">:encoding(utf8)", $outpath
1788             or die "$outpath: $!";
1789             foreach (@rows) { $ocsv->print($outfh, $_) }
1790             close $outfh or die "Error writing $outpath: $!";
1791              
1792             =head1 DESCRIPTION
1793              
1794             Convert between CSV and spreadsheet files using external tools, plus some utility functions
1795              
1796             Currently this uses LibreOffice or OpenOffice (whatever is installed). An external tool is not needed when only CSV files are involved.
1797              
1798             =head2 $hash = OpenAsCsv INPUT
1799              
1800             =head2 $hash = OpenAsCsv inpath => INPUT, sheetname => SHEETNAME, ...
1801              
1802             This is a thin wrapper for C<convert_spreadsheet> followed by C<open>
1803              
1804             If a single argument is given it specifies INPUT; otherwise all arguments must
1805             be specified as key => value pairs, and may include any options supported
1806             by C<convert_spreadsheet>.
1807              
1808             INPUT may be a csv or spreadsheet workbook path; if a spreadsheet,
1809             then a single "sheet" is converted, specified by either a !SHEETNAME suffix
1810             in the INPUT path, a separate C<< sheetname => SHEETNAME >> option,
1811             or unspecified to extract the only sheet (croaks if there is more than one).
1812              
1813             The resulting file handle refers to a guaranteed-seekable CSV file;
1814             this will either be a temporary file (auto-removed at process exit),
1815             or the original INPUT if it was already a seekable csv file.
1816              
1817             RETURNS: A ref to a hash containing the following:
1818              
1819             {
1820             fh => the resulting open file handle
1821             csvpath => the path {fh} refers to, which might be a temporary file
1822             sheetname => sheet name if the input was a spreadsheet
1823             }
1824              
1825             =head2 convert_spreadsheet INPUT, cvt_to=>suffix, OPTIONS
1826              
1827             =head2 convert_spreadsheet INPUT, cvt_to=>"csv", allsheets => 1, OPTIONS
1828              
1829             Convert CSV to spreadsheet or vice-versa, or transcode CSV to CSV.
1830              
1831             RETURNS: A ref to a hash containing:
1832              
1833             {
1834             outpath => path to the output file (or directory with 'allsheets')
1835             (a temporary file/dir or as you specified in OPTIONS).
1836              
1837             encoding => the encoding used when writing .csv files
1838             }
1839              
1840             INPUT is the input file path; it may be a separate first argument as
1841             shown above, or else included in OPTIONS as C<< inpath =E<gt> INPUT >>.
1842              
1843             If C<outpath =E<gt> OUTPATH> is specifed then results are I<always> saved
1844             to that path. With C<allsheets> this is a directory, which will be created
1845             if necessary.
1846              
1847             If C<outpath> is NOT specified in OPTIONS then, with one exception,
1848             results are saved to a temporary file or directory and that path is returned
1849             as C<outpath> in the result hash.
1850             The exception is if no conversion is necessary
1851             (i.e. C<cvt_from> is the same as C<cvt_to>), when the
1852             input file itself is returned as C<outpath>.
1853              
1854             In all cases C<outpath> in the result hash points to the results.
1855              
1856             C<cvt_to> or C<cvt_from> are filename suffixes (sans dot)
1857             e.g. "csv", "xlsx", etc., and need not be specified when indicated by
1858             C<outpath> or INPUT parameters.
1859              
1860             OPTIONS may also include:
1861              
1862             =over 4
1863              
1864             =item sheetname => "sheet name"
1865              
1866             The workbook 'sheet' name used when reading or writing a spreadsheet.
1867             An input sheet name may also be specified as "!sheetname" appended to
1868             the INPUT path.
1869              
1870             =item allsheets => BOOL
1871              
1872             B<All> sheets in the input
1873             are converted to separate .csv files named "SHEETNAME.csv" in
1874             the 'outpath' directory. C<< cvt_to =E<gt> 'csv' >> is also requred.
1875              
1876             =item input_encoding => ENCODING
1877              
1878             Specifies the encoding of INPUT if it is a csv file.
1879              
1880             ENCODING may be a comma-separated list of encoding
1881             names which will be tried in the order until one seems to work.
1882             If only one is specified it will be used without trying it first.
1883             The default is "UTF-8,windows-1252".
1884              
1885             =item output_encoding => ENCODING
1886              
1887             Used when writing csv file(s), defaults to 'UTF-8'.
1888              
1889             =item col_formats => [...]
1890              
1891             This specifies how CSV data is imported into a spreadsheet. Each element
1892             of the array may contain:
1893              
1894             undef, "standard" or "" (LibreOffice will auto-detect)
1895             "text" (imported as unmolested text)
1896             "MM/DD/YY",
1897             "DD/MM/YY",
1898             "YY/MM/DD",
1899             "ignore" (do not import this column)
1900              
1901             Elements may also contain the numeric format codes defined by LibreOffice
1902             at L<https://wiki.documentfoundation.org/Documentation/DevGuide/Spreadsheet_Documents#Filter_Options_for_the_CSV_Filter>
1903              
1904             B<Automatic format detection:>
1905             Input CSV data is pre-scanned to auto-detect column formats
1906             as much as possible. This usually works well as long as dates are
1907             represented unambiguously, e.g. "2021-01-01" or "Jan 1, 2023".
1908              
1909             Specifically, this detects leading zeroes such as in U.S. Zip Codes,
1910             and MM/DD/YY or DD/MM/YY dates when a DD happens to be more than 12.
1911              
1912             =item verbose => BOOL
1913              
1914             =back
1915              
1916             =head3 B<'binmode' Argument For Reading result CSVs>
1917              
1918             It is not possible to control the line-ending style in output CSV files,
1919             but the following incantation will correctly read either DOS/Windows (CR,LF)
1920             or *nix (LF) line endings properly, i.e. as a single \n:
1921              
1922             open my $fh, "<", $resulthash->{outpath};
1923             my $enc = $resulthash->{encoding};
1924             binmode($fh, ":raw:encoding($enc):crlf");
1925              
1926             =head2 @sane_CSV_read_options
1927              
1928             =head2 @sane_CSV_write_options
1929              
1930             These contain options you will always want to use with
1931             S<<< C<< Text::CSV->new() >> >>>.
1932             Specifically, quotes and embedded newlines are handled correctly.
1933              
1934             Not exported by default.
1935              
1936             =head2 cx2let COLUMNINDEX
1937              
1938             =head2 let2cx LETTERCODE
1939              
1940             Functions which translate between spreadsheet-column
1941             letter codes ("A", "B", etc.) and 0-based column indicies.
1942             Not exported by default.
1943              
1944             =head2 filepath_from_spec EXPR
1945              
1946             =head2 sheetname_from_spec EXPR
1947              
1948             Functions which decompose strings containing a spreadsheet path and possibly sheetname
1949             suffix, of the form "FILEPATH!SHEETNAME", "FILEPATH|||SHEETNAME", or "FILEPATH[SHEETNAME]".
1950             C<sheetname_from_spec> returns C<undef> if the input does not have a
1951             a sheetname suffix.
1952             Not exported by default.
1953              
1954             =head2 form_spec_with_sheetname(PATH, SHEENAME)
1955              
1956             Composes a combined string in a "preferred" format (currently "PATH!SHEETNAME").
1957             Not exported by default.
1958              
1959             =head1 Feature Test Functions
1960              
1961             =head2 $bool = can_cvt_spreadsheets();
1962              
1963             =head2 $bool = can_extract_allsheets();
1964              
1965             =head2 $bool = can_extract_named_sheet();
1966              
1967             These functions return false if the corresponding operations
1968             are not possible because LibreOffice (or, someday gnumeric) is not installed
1969             or is an older version which does not have needed capabilities.
1970              
1971             =head2 $path = openlibreoffice_path();
1972              
1973             Returns the detected path of I<soffice> (Apache Open Office or Libre Office)
1974             or undef if not found.
1975              
1976             These are not exported by default.
1977              
1978             =head1 SEE ALSO
1979              
1980             L<Spreadsheet::Edit> and L<Text::CSV>
1981              
1982             =cut
1983