File Coverage

blib/lib/Spreadsheet/Read/Ingester.pm
Criterion Covered Total %
statement 98 105 93.3
branch 22 30 73.3
condition 4 12 33.3
subroutine 13 13 100.0
pod 3 3 100.0
total 140 163 85.8


line stmt bran cond sub pod time code
1             package Spreadsheet::Read::Ingester ;
2             $Spreadsheet::Read::Ingester::VERSION = '0.010';
3 2     2   679168 use strict;
  2         10  
  2         49  
4 2     2   9 use warnings;
  2         3  
  2         43  
5              
6 2     2   538 use Storable;
  2         2950  
  2         90  
7 2     2   12 use File::Spec;
  2         4  
  2         26  
8 2     2   440 use File::Signature;
  2         3235  
  2         32  
9 2     2   441 use File::UserConfig;
  2         32539  
  2         49  
10 2     2   37 use base qw (Spreadsheet::Read);
  2         4  
  2         717  
11              
12             ### Public methods ###
13              
14             # Override constructor
15             sub new {
16 3     3 1 4766 my $s = shift;
17 3         8 my $data = $s->_fetch_data(@_);
18 3         17 return $data;
19             }
20              
21             # wrap functions that require 'Spreadsheet::Read' objects
22             my @funcs = qw (parses rows row col2label cr2cell cell2cr cellrow);
23              
24             foreach my $func (@funcs) {
25 2     2   404509 { no strict;
  2         9  
  2         1529  
26             *$func = sub {
27 2     2   328 my $s = shift;
28 2         5 bless $s, 'Spreadsheet::Read';
29 2         6 my $super_func = "SUPER::$func";
30 2 100       6 if (wantarray) {
31 1         6 my @result = $s->$super_func(shift);
32 1         31 bless $s, 'Spreadsheet::Read::Ingester';
33 1         3 return @result;
34             } else {
35 1         7 my $result = $s->$super_func(shift);
36 1         28 bless $s, 'Spreadsheet::Read::Ingester';
37 1         8 return $result;
38             }
39             }
40             }
41             }
42              
43             # Override add function
44             sub add {
45 1     1 1 38 my $book = shift;
46 1         3 my $data = $book->_fetch_data(@_);
47             $book && (ref $book eq "ARRAY" ||
48 1 50 33     15 ref $book eq __PACKAGE__) && $book->[0]{sheets} or return $data;
      33        
      33        
49              
50 1         3 my $c1 = $book->[0];
51 1         2 my $c2 = $data->[0];
52              
53 1 50       3 unless ($c1->{parsers}) {
54 0         0 $c1->{parsers}[0]{$_} = $c1->{$_} for qw( type parser version );
55 0         0 $book->[$_]{parser} = 0 for 1 .. $c1->{sheets};
56             }
57 1         2 my ($pidx) = (grep { my $p = $c1->{parsers}[$_];
58             $p->{type} eq $c2->{type} &&
59             $p->{parser} eq $c2->{parser} &&
60 1 50 33     2 $p->{version} eq $c2->{version} } 0 .. $#{$c1->{parsers}});
  1         8  
  1         4  
61 1 50       4 unless (defined $pidx) {
62 0         0 $pidx = scalar @{$c1->{parsers}};
  0         0  
63 0         0 $c1->{parsers}[$pidx]{$_} = $c2->{$_} for qw( type parser version );
64             }
65              
66 1         2 foreach my $sn (sort { $c2->{sheet}{$a} <=> $c2->{sheet}{$b} } keys %{$c2->{sheet}}) {
  0         0  
  1         4  
67 1         2 my $s = $sn;
68 1         3 my $v = 2;
69 1         3 while (exists $c1->{sheet}{$s}) {
70 0         0 $s = $sn."[".$v++."]";
71             }
72 1         3 $c1->{sheet}{$s} = $c1->{sheets} + $c2->{sheet}{$sn};
73 1         2 $data->[$c2->{sheet}{$sn}]{parser} = $pidx;
74 1         3 push @$book, $data->[$c2->{sheet}{$sn}];
75             }
76 1         3 $c1->{sheets} += $c2->{sheets};
77              
78 1         13 return $book;
79             }
80              
81             # Fetch data from stored variable, if available
82             sub _fetch_data {
83 4     4   8 my $s = shift;
84 4         6 my $file = shift;
85 4         8 my @args = @_;
86              
87 4         7 my $sig = '';
88 4         7 eval { $sig = File::Signature->new($file)->{digest} };
  4         27  
89              
90 4         492 my %args = @args;
91 4         7 my $suffix;
92 4         17 foreach my $key (sort keys %args) {
93 4         6 $suffix .= $key;
94 4         9 $suffix .= $args{$key};
95             }
96 4 100       11 if ($suffix) {
97 2         4 $sig .= "-$suffix";
98             }
99 4         21 my $configdir = File::UserConfig->new(dist => 'Spreadsheet-Read-Ingester')->configdir;
100 4         1670 my $parsed_file = File::Spec->catfile($configdir, $sig);
101              
102 4         9 my $data;
103              
104             # try to retrieve parsed data
105 4         7 eval { $data = retrieve $parsed_file };
  4         13  
106              
107             # otherwise reingest from raw file
108 4 100       1072 if (!$data) {
109 2         15 $data = $s->SUPER::new($file, @_);
110 2         1978 my $error = $data->[0]{error};
111 2 50       7 die "Unable to read data from file: $file. Error: $error" if $data->[0]{error};
112 2         7 store $data, $parsed_file;
113             }
114              
115 4         357 return $data;
116             }
117              
118             sub cleanup {
119 3     3 1 2002668 my $s = shift;
120 3         7 my $age = shift;
121              
122 3 100       21 if (!defined $age) {
    100          
    50          
123 1         4 $age = 30;
124             } elsif ($age eq '0') {
125 1         4 $age = -1
126             } elsif ($age !~ /^\d+$/) {
127 1         19 warn 'cleanup method accepts only positive integer values or 0';
128 1         68 return;
129             }
130              
131 2         19 my $configdir = File::UserConfig->new(dist => 'Spreadsheet-Read-Ingester')->configdir;
132              
133 2 50       1326 opendir (DIR, $configdir) or die 'Could not open directory.';
134 2         60 my @files = readdir (DIR);
135 2         28 closedir (DIR);
136 2         8 foreach my $file (@files) {
137 12         116 $file = File::Spec->catfile($configdir, $file);
138 12 100       161 next if (-d $file);
139 6 100       86 if (-M $file >= $age) {
140 3 50       186 unlink $file or die 'Cannot remove file: $file';
141             }
142             }
143             }
144              
145             1; # Magic true value
146             # ABSTRACT: ingest and save csv and spreadsheet data to a perl data structure to avoid reparsing
147              
148             __END__
149              
150             =pod
151              
152             =head1 NAME
153              
154             Spreadsheet::Read::Ingester - ingest and save csv and spreadsheet data to a perl data structure to avoid reparsing
155              
156             =head1 SYNOPSIS
157              
158             use Spreadsheet::Read::Ingester;
159              
160             # ingest raw file, store parsed data file, and return data object
161             my $data = Spreadsheet::Read::Ingester->new('/path/to/file');
162              
163             # the returned data object has all the methods of a L<Spreadsheet::Read> object
164             my $num_cols = $data->sheet(1)->maxcol;
165              
166             # delete old data files older than 30 days to save disk space
167             Spreadsheet::Read::Ingester->cleanup;
168              
169             =head1 DESCRIPTION
170              
171             This module is intended to be a drop-in replacement for L<Spreadsheet::Read> and
172             is a simple, unobtrusive wrapper for it.
173              
174             Parsing spreadsheet and csv data files is time consuming, especially with large
175             data sets. If a data file is ingested more than once, much time and processing
176             power is wasted reparsing the same data. To avoid reparsing, this module uses
177             L<Storable> to save a parsed version of the data to disk when a new file is
178             ingested. All subsequent ingestions are retrieved from the stored Perl data
179             structure. Files are saved in the directory determined by L<File::UserConfig>
180             and is a function of the user's OS.
181              
182             The stored data file names are the unique file signatures for the raw data file.
183             The signature is used to detect if the original file changed, in which case the
184             data is reingested from the raw file and a new parsed file is saved using an
185             updated file signature. Arguments passed to the constructor are appended to the
186             name of the file to ensure different parse options are accounted for. Parsed
187             data files are kept indefinitely but can be deleted with the C<cleanup()>
188             method.
189              
190             Consult the L<Spreadsheet::Read> documentation for accessing the data object
191             returned by this module.
192              
193             =head1 METHODS
194              
195             =head2 new( $path_to_file )
196              
197             my $data = Spreadsheet::Read::Ingester->new('/path/to/file');
198              
199             Takes same arguments as the new constructor in L<Spreadsheet::Read> module.
200             Returns an object identical to the object returned by the L<Spreadsheet::Read>
201             module along with its corresponding methods.
202              
203             =head2 cleanup( $file_age_in_days )
204              
205             =head2 cleanup()
206              
207             Spreadsheet::Read::Ingester->cleanup(0);
208              
209             Deletes all stored files from the user's application data directory. Takes an
210             optional argument indicating the minimum number of days old the file must be
211             before it is deleted. Defaults to 30 days. Passing a value of 0 deletes all
212             files.
213              
214             =head1 REQUIRES
215              
216             =over 4
217              
218             =item * L<File::Signature|File::Signature>
219              
220             =item * L<File::Spec|File::Spec>
221              
222             =item * L<File::UserConfig|File::UserConfig>
223              
224             =item * L<Storable|Storable>
225              
226             =item * L<strict|strict>
227              
228             =item * L<warnings|warnings>
229              
230             =back
231              
232             =for :stopwords cpan testmatrix url annocpan anno bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan
233              
234             =head1 SUPPORT
235              
236             =head2 Perldoc
237              
238             You can find documentation for this module with the perldoc command.
239              
240             perldoc Spreadsheet::Read::Ingester
241              
242             =head2 Websites
243              
244             The following websites have more information about this module, and may be of help to you. As always,
245             in addition to those websites please use your favorite search engine to discover more resources.
246              
247             =over 4
248              
249             =item *
250              
251             MetaCPAN
252              
253             A modern, open-source CPAN search engine, useful to view POD in HTML format.
254              
255             L<https://metacpan.org/release/Spreadsheet-Read-Ingester>
256              
257             =back
258              
259             =head2 Source Code
260              
261             The code is open to the world, and available for you to hack on. Please feel free to browse it and play
262             with it, or whatever. If you want to contribute patches, please send me a diff or prod me to pull
263             from your repository :)
264              
265             L<https://github.com/sdondley/Spreadsheet-Read-Ingester>
266              
267             git clone git://github.com/sdondley/Spreadsheet-Read-Ingester.git
268              
269             =head1 BUGS
270              
271             Please report any bugs or feature requests on the bugtracker website
272             L<https://github.com/sdondley/Spreadsheet-Read-Ingester/issues>
273              
274             When submitting a bug or request, please include a test-file or a
275             patch to an existing test-file that illustrates the bug or desired
276             feature.
277              
278             =head1 LIMITATIONS
279              
280             If a new parser is installed (e.g. L<Text::CSV_XS>) and a previous ingestion
281             used a different parser (e.g. L<Text::CSV_PP>), results from the previous parser
282             will be returned. Most likely, this will have no practical consequence. But if
283             you are concerned, you can avoid the problem by specifying the same parser using
284             an environment variable per the L<Spreadsheet::Read> documentation:
285              
286             env SPREADSHEET_READ_CSV=Text::CSV_PP ...
287              
288             Similarly, upgrading to a newer version of a parser can cause the same problem.
289             Currently, the only workaround is to delete the stored data files parsed with
290             the old older parser version.
291              
292             =head1 INSTALLATION
293              
294             See perlmodinstall for information and options on installing Perl modules.
295              
296             =head1 SEE ALSO
297              
298             L<Spreadsheet::Read>
299              
300             =head1 AUTHOR
301              
302             Steve Dondley <s@dondley.com>
303              
304             =head1 COPYRIGHT AND LICENSE
305              
306             This software is copyright (c) 2019 by Steve Dondley.
307              
308             This is free software; you can redistribute it and/or modify it under
309             the same terms as the Perl 5 programming language system itself.
310              
311             =cut