File Coverage

blib/lib/File/Tabular.pm
Criterion Covered Total %
statement 351 387 90.7
branch 131 204 64.2
condition 42 68 61.7
subroutine 44 48 91.6
pod 16 18 88.8
total 584 725 80.5


line stmt bran cond sub pod time code
1             package File::Tabular;
2              
3             # TODO : -optimize _getField (could probably dispense with
4             # "mkRecord", call $self->..)
5             # -BUG : preMatch/postMatch won't work on explicit field searches
6             # -optimize: postpone preMatch/postMatch until display time
7             # -escaping fieldSep : make it optional
8              
9             # - synopsis : example of file cloning with select (e.g. year=2004)
10              
11              
12              
13             our $VERSION = "0.72";
14              
15 1     1   31526 use strict;
  1         3  
  1         31  
16 1     1   4 use warnings;
  1         2  
  1         24  
17 1     1   10 no warnings 'uninitialized';
  1         6  
  1         31  
18 1     1   843 use locale;
  1         226  
  1         5  
19 1     1   30 use Carp;
  1         3  
  1         81  
20             # use Carp::Assert; # dropped because not really needed and not in Perl core
21 1     1   4 use Fcntl ':flock';
  1         1  
  1         103  
22 1     1   785 use Hash::Type;
  1         1615  
  1         33  
23 1     1   912 use Search::QueryParser 0.92;
  1         1576  
  1         32  
24 1     1   1115 use File::Temp;
  1         26359  
  1         153  
25              
26             =head1 NAME
27              
28             File::Tabular - searching and editing flat tabular files
29              
30             =head1 SYNOPSIS
31              
32             use File::Tabular;
33             my $f = new File::Tabular($filename);
34              
35             my $row = $f->fetchrow;
36             print $row->{field1}, $row->{field2};
37              
38             $row = $f->fetchrow(where => 'someWord');
39             $row = $f->fetchrow(where => 'field1 > 4 AND field2 >= "01.01.2001"');
40             $row = $f->fetchrow(where => qr/some\s+(complex\s*)?(regex|regular expression)/i);
41              
42             $f->rewind;
43             my $rows = $f->fetchall(where => 'someField =~ ^[abc]+');
44             print $_->{someField} foreach @$rows;
45              
46             $f->rewind;
47             $rows = $f->fetchall(where => '+field1:someWord -field2:otherWord',
48             orderBy => 'field3, field6:num, field5:-alpha');
49              
50             $f->rewind;
51             my $hashRows = $f->fetchall(where => 'foo AND NOT bar',
52             key => 'someField');
53             print $hashRows->{someKey}{someOtherField};
54              
55             # open for updates, and remember the updates in a journal file
56             $f = new File::Tabular("+<$filename", {journal => ">>$journalFile"});
57              
58             # updates at specific positions (line numbers)
59             $f->splices(4 => 2, undef, # delete 2 lines from position 4
60             7 => 1, {f1 => $v1, f2 => $v2, ...}, # replace line 7
61             9 => 0, { ...}, # insert 1 new line at position 9
62             22 => 0, [{...}, {...}, ...] # insert several lines at pos. 22
63             ...
64             -1 => 0, [{...}, {...}, ...] # append at the end
65             );
66              
67             # shorthand to add new data at the end
68             $f->append({f1 => $v1, f2 => $v2, ...});
69             # same thing, but use the "Hash::Type" associated to the file
70             $f->append($f->ht->new($v1, $v2, ...));
71              
72              
73             $f->clear; # removes all data (but keeps the header line)
74              
75             # updates at specific keys, corresponding to @keyFields
76             $f->writeKeys({key1 => {f1 => $v1, f2 => $v2, ...}, # add or update
77             key2 => undef, # remove
78             ...
79             }, @keyFields);
80              
81              
82             # replay the updates on a backup file
83             my $bck = new File::Tabular("+<$backupFile");
84             $bck->playJournal($journalFile);
85              
86             # get info from associated filehandle
87             printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
88             my $mtime = $f->mtime;
89             printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
90              
91             =head1 DESCRIPTION
92              
93             A I is a flat text file containing data organised
94             in rows (records) and columns (fields).
95              
96             This module provides database-like functionalities for managing
97             tabular files : retrieving, searching, writing, autonumbering, journaling.
98             However, unlike other modules like L, it doesn't try
99             to make it look like a database : rather, the API was designed
100             specifically for work with tabular files.
101             Instead of SQL, search queries are specified in a web-like
102             fashion, with support for regular expressions and cross-field
103             searches. Queries are compiled internally into perl closures
104             before being applied to every data record, which makes it
105             quite fast.
106              
107             Write operations take a list of modifications as argument;
108             then they apply the whole list atomically in a single rewrite
109             of the data file.
110              
111             Here are some of the reasons why you might choose to
112             work with a tabular file rather than a regular database :
113              
114             =over
115              
116             =item *
117              
118             no need to install a database system (not even buy one)!
119              
120             =item *
121              
122             easy portability and data exchange with external tools
123             (text editor, spreadsheet, etc.)
124              
125             =item *
126              
127             search queries immediately ready for a web application
128              
129             =item *
130              
131             good search performance, even with several thousand records
132              
133             =back
134              
135              
136             On the other hand, tabular files will probably be inappropriate if you
137             need very large volumes of data, complex multi-table data models or
138             frequent write operations.
139              
140              
141              
142             =head1 METHODS
143              
144             =over
145              
146             =item C<< new (open1, open2, ..., {opt1 => v1, opt2 => v2, ...}) >>
147              
148             Creates a new tabular file object.
149             The list of arguments C is fed directly to
150             L for opening the associated file.
151             Can also be a reference to an already opened filehandle.
152              
153             The final hash ref is a collection of optional parameters, taken
154             from the following list :
155              
156             =over
157              
158             =item fieldSep
159              
160             field separator : any character except '%' ('|' by default).
161             Escape sequences like C<\t> are admitted.
162              
163             =item recordSep
164              
165             record separator ('\n' by default).
166              
167             =item fieldSepRepl
168              
169             string to substitute if fieldSep is met in the data.
170             (by default, url encoding of B, i.e. '%7C' )
171              
172             =item recordSepRepl
173              
174             string to substitute if recordSep is met in the data
175             (by default, url encoding of B, i.e. '%0A' )
176              
177              
178             =item autoNumField
179              
180             name of field for which autonumbering is turned on (none by default).
181             This is useful to generate keys : when you write a record, the
182             character '#' in that field will be replaced by a fresh number,
183             incremented automatically. This number will be 1 + the
184             largest number read I (it is your responsability to read all
185             records before the first write operation).
186              
187             =item autoNum
188              
189             initial value of the counter for autonumbering (1 by default).
190              
191             =item autoNumChar
192              
193             character that will be substituted by an autonumber when
194             writing records ('#' by default).
195              
196              
197             =item flockMode
198              
199             mode for locking the file, see L. By default,
200             this will be LOCK_EX if B contains 'E' or
201             '+E', LOCK_SH otherwise.
202              
203             =item flockAttempts
204              
205             Number of attempts to lock the file,
206             at 1 second intervals, before returning an error.
207             Zero by default.
208             If nonzero, LOCK_NB is added to flockMode;
209             if zero, a single locking attempt will be made, blocking
210             until the lock is available.
211              
212             =item headers
213              
214             reference to an array of field names.
215             If not present, headers will be read from the first line of
216             the file.
217              
218             =item printHeaders
219              
220             if true, the B will be printed to the file.
221             If not specified, treated as 'true' if
222             B contains 'E'.
223              
224             =item journal
225              
226             name of journaling file, or reference to a list of arguments for
227             L. The journaling file will log all write operations.
228             If specified as a simple file name, it will be be opened in
229             'EE' mode.
230              
231             A journal file can then be replayed through method L
232             (this is useful to recover after a crash, by playing the journal
233             on a backup copy of your data).
234              
235             =item rxDate
236              
237             Regular expression for matching a date.
238             Default value is C<< qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/ >>.
239             This will be used by L to perform appropriate comparisons.
240              
241             =item date2str
242              
243             Ref to a function for transforming dates into strings
244             suitable for sorting (i.e. year-month-day).
245             Default is :
246              
247             sub {my ($d, $m, $y) = ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
248             $y += ($y > 50) ? 1900 : 2000 if defined($y) && $y < 100;
249             return sprintf "%04d%02d%02d", $y, $m, $d;}
250              
251             =item rxNum
252              
253             Regular expression for matching a number.
254             Default value is C<< qr/^[-+]?\d+(?:\.\d*)?$/ >>.
255             This will be used by L to perform appropriate comparisons.
256              
257             =item preMatch/postMatch
258              
259             Strings to insert before or after a match when filtering rows
260             (will only apply to search operator ':' on the whole line, i.e.
261             query C<< "foo OR bar" >> will highlight both "foo" and "bar",
262             but query C<< "~ 'foo' OR someField:bar" >>
263             will not highlight anything; furthermore, a match-all
264             request containing just '*' will not highlight anything either).
265              
266             =item avoidMatchKey
267              
268             If true, searches will avoid to match on the first field. So a request
269             like C<< $ft->fetchall(where => '123 OR 456') >> will not find
270             the record with key 123, unless the word '123' appears somewhere
271             in the other fields. This is useful when queries come from a Web
272             application, and we don't want users to match a purely technical
273             field.
274              
275             This search behaviour will not apply to regex searches. So requests like
276             C<< $ft->fetchall(where => qr/\b(123|456)\b/) >>
277             or
278             C<< $ft->fetchall(where => ' ~ 123 OR ~ 456') >>
279             will actually find the record with key 123.
280              
281             =back
282              
283             =cut
284              
285             ############################################################
286             # CONSTANTS
287             ############################################################
288              
289 1     1   8 use constant BUFSIZE => 1 << 21; # 2MB, used in copyData
  1         1  
  1         209  
290              
291 0         0 use constant DEFAULT => {
292             fieldSep => '|',
293             recordSep => "\n",
294             autoNumField => undef,
295             autoNumChar => '#',
296             autoNum => 1,
297             lockAttempts => 0,
298             rxNum => qr/^[-+]?\d+(?:\.\d*)?$/,
299             rxDate => qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/,
300             date2str => sub {my ($d, $m, $y) =
301             ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
302 0 0 0     0 $y += ($y > 50) ? 1900 : 2000
    0          
303             if defined($y) && $y < 100;
304 0         0 return sprintf "%04d%02d%02d", $y, $m, $d;},
305 1         94 preMatch => '',
306             postMatch => '',
307             avoidMatchKey => undef
308 1     1   6 };
  1         2  
309              
310              
311             use constant {
312 1         13 statType => Hash::Type->new(qw(dev ino mode nlink uid gid rdev size
313             atime mtime ctime blksize blocks)),
314             timeType => Hash::Type->new(qw(sec min hour mday mon year wday yday isdst))
315 1     1   5 };
  1         1  
316              
317              
318             ############################################################
319             # METHODS
320             ############################################################
321              
322             sub new {
323 3     3 1 2342 my $class = shift;
324 3 50       18 my $args = ref $_[-1] eq 'HASH' ? pop : {};
325              
326             # create object with default values
327 3         7 my $self = bless {};
328 3         9 foreach my $option (qw(fieldSep recordSep autoNumField autoNumChar autoNum
329             rxDate rxNum date2str preMatch postMatch
330             avoidMatchKey)) {
331 33   100     172 $self->{$option} = $args->{$option} || DEFAULT->{$option};
332             }
333              
334             # eval to expand escape sequences, for example if fieldSep is given as '\t'
335 3         10 foreach my $option (qw(fieldSep recordSep)) {
336 6         392 $self->{$option} = eval qq{qq{$self->{$option}}};
337             }
338              
339             # field and record separators
340 3 50       15 croak "can't use '%' as field separator" if $self->{fieldSep} =~ /%/;
341            
342 3   33     22 $self->{recordSepRepl} = $args->{recordSepRepl} ||
343             urlEncode($self->{recordSep});
344 3   33     18 $self->{fieldSepRepl} = $args->{fieldSepRepl} ||
345             urlEncode($self->{fieldSep});
346 3         47 $self->{rxFieldSep} = qr/\Q$self->{fieldSep}\E/;
347              
348              
349             # open file and get lock
350 3 50       17 _open($self->{FH}, @_) or croak "open @_ : $! $^E";
351 3   50     16 my $flockAttempts = $args->{flockAttempts} || 0;
352 3 100 66     36 my $flockMode = $args->{flockMode} ||
353             $_[0] =~ />|\+
354 3 50       19 $flockMode |= LOCK_NB if $flockAttempts > 0;
355 3         11 for (my $n = $flockAttempts; $n >= 1; $n--) {
356 0 0       0 last if flock $self->{FH}, $flockMode; # exit loop if flock succeeded
357 0 0       0 $n > 1 ? sleep(1) : croak "could not flock @_: $^E";
358             };
359              
360             # setup journaling
361 3 100       9 if (exists $args->{journal}) {
362 1         3 my $j = {}; # create a fake object for _printRow
363 1         7 $j->{$_} = $self->{$_} foreach qw(fieldSep recordSep
364             fieldSepRepl recordSepRepl);
365 1 50       7 _open($j->{FH}, ref $args->{journal} eq 'ARRAY' ? @{$args->{journal}}
  0 50       0  
366             : ">>$args->{journal}")
367             or croak "open journal $args->{journal} : $^E";
368 1         4 $self->{journal} = bless $j;
369             }
370              
371             # field headers
372 3   100     16 my $h = $args->{headers} || [split($self->{rxFieldSep}, $self->_getLine, -1)];
373 3         24 $self->{ht} = new Hash::Type(@$h);
374 3 50       113 $self->_printRow(@$h) if
    100          
375             exists $args->{printHeaders} ? $args->{printHeaders} : ($_[0] =~ />/);
376              
377             # ready for reading data lines
378 3         11 $self->{dataStart} = tell($self->{FH});
379 3         13 $. = 0; # setting line counter to zero for first dataline
380              
381              
382             # create a closure which takes a (already chomped) line and returns a record
383 3         5 my %tmp; # copy some attributes of $self in order to avoid a cyclic ref
384 3         34 $tmp{$_} = $self->{$_} foreach qw/rxFieldSep fieldSepRepl fieldSep ht/;
385             $self->{mkRecord} = sub {
386 365     365   1829 my @vals = split $tmp{rxFieldSep}, $_[0], -1;
387 365         2062 s/$tmp{fieldSepRepl}/$tmp{fieldSep}/g foreach @vals;
388 365         1290 return $tmp{ht}->new(@vals);
389 3         17 };
390              
391 3         13 return $self;
392             }
393              
394              
395             sub _open { # stupid : because of 'open' strange prototyping,
396             # cannot pass an array directly
397 5 100   5   736 my $result = (ref $_[1] eq 'GLOB') ? $_[0] = $_[1] :
    50          
    50          
398             @_ > 3 ? open($_[0], $_[1], $_[2], @_[3..$#_]) :
399             @_ > 2 ? open($_[0], $_[1], $_[2]) :
400             open($_[0], $_[1]);
401 5 50       48 binmode($_[0], ":crlf") if $result; # portably open text file, see PerlIO
402 5         21 return $result;
403             }
404              
405              
406              
407              
408             sub _getLine {
409 700     700   2416 my $self = shift;
410 700         2185 local $/ = $self->{recordSep};
411 700         2424 my $line = readline $self->{FH};
412 700 100       1279 if (defined $line) {
413 688         746 chomp $line;
414 688         1499 $line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
415             }
416 700         2784 return $line;
417             }
418              
419              
420             sub _printRow { # Internal function to print a data row and automatically deal with
421             # autonumbering, if necessary.
422 335     335   996 my ($self, @vals) = @_;
423              
424 335 100       5763 if ($self->{autoNumField}) { # autoNumbering
425 216         458 my $ix = $self->{ht}{$self->{autoNumField}} - 1;
426 216 100       1166 if ($vals[$ix] =~ s/$self->{autoNumChar}/$self->{autoNum}/) {
    100          
427 8         16 $self->{autoNum} += 1;
428             }
429             elsif ($vals[$ix] =~ m/(\d+)/) {
430 206 100       687 $self->{autoNum} = $1 + 1 if $1 + 1 > $self->{autoNum};
431             }
432             }
433              
434 335         1907 s/\Q$self->{fieldSep}\E/$self->{fieldSepRepl}/ foreach @vals;
435 335         883 my $line = join $self->{fieldSep}, @vals;
436 335         672 $line =~ s/\Q$self->{recordSep}\E/$self->{recordSepRepl}/g;
437 335         488 my $fh = $self->{FH};
438 335         2021 print $fh $line, $self->{recordSep};
439             }
440              
441              
442             =item C<< fetchrow(where => filter) >>
443              
444             returns the next record matching the (optional) filter. If there is
445             no filter, just returns the next record.
446              
447             The filter is either a code reference generated by L,
448             or a string which will be automatically fed as argument to
449             L; this string can contain just a word, a regular
450             expression, a complex boolean query involving field names and
451             operators, etc., as explained below.
452              
453             =cut
454              
455             # _getField($r, $fieldNumber)
456             # Internal method for lazy creation of a record from a line.
457             # Will be called only when a specific field is required.
458             # See creation of $r in method 'fetchrow' just below.
459              
460 133   66 133   1205 sub _getField { tied(%{$_[0]->{record} ||= $_[0]->{mkRecord}($_[0]->{line})})->[$_[1]]; }
  133         2218  
461              
462              
463             sub fetchrow {
464 317     317 1 2111 my $self = shift;
465 317         342 my $filter = undef;
466              
467             # accept fetchrow(where=>filter) or fetchrow({where=>filter}) or fetchrow(filter)
468 317 50       824 my @args = ref $_[0] eq 'HASH' ? @{$_[0]} : @_;
  0         0  
469 317 100       644 if (@args) {
470 293 100       676 shift @args if $args[0] =~ /^where$/i;
471 293 50       568 croak "fetchrow : invalid number of arguments" if @args != 1;
472 293         424 $filter = $args[0];
473 293 100 100     757 $filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
474             }
475              
476 317         707 while (my $line = $self->_getLine) {
477              
478             # create structure $r for _getField
479 687         2006 my $r = {line => $line, record => undef, mkRecord => $self->{mkRecord}};
480              
481 687 100 100     10791 next if $filter and not $filter->($r);
482              
483 305   66     1085 $r->{record} ||= $self->{mkRecord}($r->{line});
484              
485 305 100       4355 if ($self->{autoNumField}) {
486 186         779 my ($n) = $r->{record}{$self->{autoNumField}} =~ m/(\d+)/;
487 186 50 33     2334 $self->{autoNum} = $n+1 if $n and $n+1 > $self->{autoNum};
488             }
489 305         1342 return $r->{record};
490             }
491 12         49 return undef;
492             }
493              
494             =item C<< fetchall(where => filter, orderBy => cmp) >>
495              
496             =item C<< fetchall(where => filter, key => keySpecif) >>
497              
498             finds all next records matching the (optional) filter.
499             If there is no filter, finds all remaining records.
500              
501             The filter is either a code reference generated by L,
502             or a string which will be automatically fed as argument to
503             L.
504              
505             The return value depends on context and on arguments :
506              
507             =over
508              
509             =item *
510              
511             if no B parameter is given, and we are in a scalar context, then
512             C returns a reference to an array of records.
513              
514             The optional B parameter can be a field name, a ref to a list
515             of field names, a string like C<"field1: -alpha, field2:-num, ...">,
516             or, more generally, a user-provided comparison function;
517             see L for a fully detailed explanation.
518              
519             Otherwise, the resulting array is in data source order.
520              
521             =item *
522              
523             if no B parameter is given, and we are in a list context, then
524             C returns a pair : the first item is a reference to an array
525             of records as explained above ; the second item is a reference to an
526             array of line numbers corresponding to those records (first data line
527             has number 0). These line numbers might be useful later
528             if you update the records through the L method.
529             No B is allowed if C is called in
530             list context.
531              
532             =item *
533              
534             if a B parameter is given,
535             then C returns a reference to a hash, whose
536             values are the retrieved records, and whose keys
537             are built according to the B argument.
538             This must be either a single field name (scalar), or
539             a a list of field names (ref to an array of scalars).
540             Values corresponding to those field names will form the
541             key for each entry of the hash;
542             if necessary, multiple values are joined together
543             through L<$;|perlvar/$;>.
544             No B argument is allowed, because hashes have no ordering.
545              
546             =back
547              
548             =cut
549              
550             sub fetchall {
551 8     8 1 1571 my $self = shift;
552 8 50       44 my %args = ref $_[0] eq 'HASH' ? %{$_[0]} : @_;
  0         0  
553              
554 8 50 33     39 croak "fetchall in list context : not allowed with 'orderBy' or 'key' arg"
      66        
555             if wantarray and ($args{orderBy} or $args{key});
556              
557 8 50 33     35 croak "fetchall : args 'orderBy' and 'key' not allowed together"
558             if $args{orderBy} and $args{key};
559              
560 8 50       33 my @k = !$args{key} ? () : ref $args{key} ? @{$args{key}} : ($args{key});
  0 100       0  
561              
562 8         18 my $filter = $args{where};
563 8 100 66     35 $filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
564              
565 8 100       26 if (@k) { # will return a hash of rows
566 2 50       8 croak "fetchall : 'orderBy' not allowed with 'key'" if $args{orderBy};
567 2 50       8 croak "fetchall in list context : not allowed with 'key'" if wantarray;
568 2         4 my $rows = {};
569 2         6 while (my $row = $self->fetchrow($filter)) {
570 19         28 $rows->{join($;, @{$row}{@k})} = $row;
  19         71  
571             }
572 2         14 return $rows;
573             }
574             else { # will return an array of rows
575 6         17 my ($rows, $line_nos) = ([], []);
576 6         19 while (my $row = $self->fetchrow($filter)) {
577 260         432 push @$rows, $row;
578 260 100       945 push @$line_nos, $. - 1 if wantarray;
579             }
580              
581 6 50       18 if ($args{orderBy}) {
582 0 0       0 croak "fetchall in list context : not allowed with 'orderBy'" if wantarray;
583 0         0 my $tmp = ref $args{orderBy};
584 0 0       0 my $cmpFunc = $tmp eq 'ARRAY' ? $self->{ht}->cmp(@{$args{orderBy}}) :
  0 0       0  
585             $tmp eq 'CODE' ? $args{orderBy} :
586             $self->{ht}->cmp($args{orderBy});
587 0         0 $rows = [sort $cmpFunc @$rows];
588             }
589 6 100       48 return wantarray ? ($rows, $line_nos) : $rows;
590             }
591             }
592              
593              
594              
595             =item C<< rewind >>
596              
597             Rewinds the file to the first data line (after the headers)
598              
599             =cut
600              
601             sub rewind {
602 26     26 1 7985 my $self = shift;
603 26         277 seek $self->{FH}, $self->{dataStart}, 0;
604 26         67 $. = 0;
605             }
606              
607              
608              
609             =item C<< ht >>
610              
611             Returns the instance of L associated with
612             the file.
613              
614             =cut
615              
616 318     318 1 390 sub ht { my $self = shift; $self->{ht}; }
  318         964  
617              
618              
619              
620             =item C<< headers >>
621              
622             returns the list of field names
623              
624             =cut
625              
626 314     314 1 432 sub headers { my $self = shift; $self->ht->names; }
  314         557  
627              
628             =item C<< stat >>
629              
630             returns a hash ref corresponding to a call of
631             L on the associated filehandle.
632             Keys of the hash have names as documented in
633             L. Ex:
634              
635             printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
636              
637             =cut
638              
639              
640 5     5 1 30755 sub stat {my $self = shift; statType->new(stat($self->{FH}));}
  5         87  
641              
642              
643             =item C<< atime >>, C<< mtime >>, C<< ctime >>
644              
645             each of these methods returns a hash ref corresponding to a call of
646             L on the last access time, last modified
647             time, or last inode change time of the associated filehandle
648             (see L for explanations).
649             Keys of the hash have names as documented in
650             L. Ex:
651              
652             my $mtime = $f->mtime;
653             printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
654              
655             =cut
656              
657 0     0 1 0 sub atime {my $self = shift; timeType->new(localtime(($self->stat->{atime})));}
  0         0  
658 1     1 1 401 sub mtime {my $self = shift; timeType->new(localtime(($self->stat->{mtime})));}
  1         4  
659 0     0 1 0 sub ctime {my $self = shift; timeType->new(localtime(($self->stat->{ctime})));}
  0         0  
660              
661             =item C<< splices >>
662              
663             splices(pos1 => 2, undef, # delete 2 lines
664             pos2 => 1, row, # replace 1 line
665             pos3 => 0, [row1, row2 ...] # insert lines
666             ...
667             -1 => 0, [row1, ... ] # append lines
668             );
669              
670             # special case : autonum if pos== -1
671              
672              
673             Updates the data, in a spirit similar to
674             L (hence the name of the method). The whole file is
675             rewritten in an atomic operation, deleting, replacing or appending
676             data lines as specified by the "splice instructions". Returns the
677             number of "splice instructions" performed.
678              
679             A splice instruction is a triple composed of :
680              
681             =over
682              
683             =item 1
684              
685             a position (line number) that specifies
686             the place where modifications will occur.
687             Line numbers start at 0.
688             Position -1 means end of data.
689              
690             =item 2
691              
692             a number of lines to delete (might be zero).
693              
694             =item 3
695              
696             a ref to a hash or to a list of hashes containing new data to
697             insert (or C if there is no new data).
698              
699             =back
700              
701             If there are several splice instructions, their positions must be
702             sorted in increasing order (except of course position -1,
703             meaning "end of data", which must appear last).
704              
705             Positions always refer to line numbers in the original file, before
706             any modifications. Therefore, it makes no sense to write
707              
708             splices(10 => 5, undef,
709             12 => 0, $myRow)
710              
711             because after deleting 5 rows at line 10, we cannot insert a new
712             row at line 12.
713              
714             The whole collection of splice instructions
715             may also be passed as an array ref instead of a list.
716              
717             If you intend to fetch rows again after a B, you
718             must L the file first.
719              
720             =cut
721              
722              
723              
724             sub splices {
725 10     10 1 629 my $self = shift;
726 10 100       33 my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
727 10         15 my $nArgs = @$args;
728 10 50       34 croak "splices : number of arguments must be multiple of 3" if $nArgs % 3;
729              
730 10         11 my $TMP = undef; # handle for a tempfile
731              
732 10         15 my $i;
733 10         28 for ($i=0; $i < $nArgs; $i+=3 ) {
734 16         54 my ($pos, $del, $lines) = @$args[$i, $i+1, $i+2];
735              
736 16         42 $self->_journal('SPLICE', $pos, $del, $lines);
737              
738 16 100 66     58 if ($pos == -1) { # we want to append new data at end of file
    100          
739 10 100       214 $TMP ? # if we have a tempfile ...
740             copyData($TMP, $self->{FH}) # copy back all remaining data
741             : seek $self->{FH}, 0, 2; # otherwise goto end of file
742 10         27 $pos = $.; # sync positions (because of test 12 lines below)
743             }
744             elsif ( # we want to put data in the middle of file and ..
745             not $TMP and $self->stat->{size} > $self->{dataStart}) {
746 2 50       94 $TMP = new File::Temp or croak "no tempfile: $^E";
747 2         1168 binmode($TMP, ":crlf");
748              
749 2         9 $self->rewind;
750 2         15 copyData($self->{FH}, $TMP);
751 2         5 $self->rewind;
752 2         87 seek $TMP, 0, 0;
753             }
754              
755 16 50       68 croak "splices : cannot go back to line $pos" if $. > $pos;
756              
757 16         61 local $/ = $self->{recordSep};
758              
759 16         44 while ($. < $pos) { # sync with tempfile
760 10         34 my $line = <$TMP>;
761 10 50       20 croak "splices : no such line : $pos ($.)" unless defined $line;
762 10         13 my $fh = $self->{FH};
763 10         32 print $fh $line;
764             }
765              
766 16         45 while ($del--) { # skip lines to delete from tempfile
767 6         13 my $line = <$TMP>;
768 6 50       19 croak "splices : no line to delete at pos $pos" unless defined $line;
769             }
770              
771 16 50       43 $lines = [$lines] if ref $lines eq 'HASH'; # single line
772 16         57 $self->_printRow(@{$_}{$self->headers}) for @$lines;
  196         2698  
773             }
774 10 100       23 copyData($TMP, $self->{FH}) if $TMP; # copy back all remaining data
775 10         451 truncate $self->{FH}, tell $self->{FH};
776 10         27 $self->_journal('ENDSPLICES');
777 10         42 return $i / 3;
778             }
779              
780              
781              
782             =item C<< append(row1, row2, ...) >>
783              
784             This appends new records at the end of data, i.e. it is
785             a shorthand for
786              
787             splices(-1 => 0, [row1, row2, ...])
788              
789             =cut
790              
791              
792             sub append {
793 4     4 1 659 my $self = shift;
794 4 100       16 my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
795 4         16 $self->splices([-1 => 0, $args]);
796             }
797              
798              
799             =item C<< clear >>
800              
801             removes all data (but keeps the header line)
802              
803             =cut
804              
805             sub clear {
806 4     4 1 869 my $self = shift;
807 4         10 $self->rewind;
808 4         11 $self->_journal('CLEAR');
809 4         125 truncate $self->{FH}, $self->{dataStart};
810             }
811              
812              
813              
814             =item C<< writeKeys({key1 => row1, key2 => ...}, @keyFields) >>
815              
816             Rewrites the whole file, applying modifications as specified
817             in the hash ref passed as first argument. Keys in this hash
818             are compared to keys built from the original data,
819             according to C<@keyFields>. Therefore, C may replace
820             an existing row, if the key corresponding to C was found ;
821             otherwise, a new row is added. If C is C, the
822             corresponding row is deleted from the file.
823              
824             C<@keyFields> must contain the name of one or several
825             fields that build up the primary key. For each data record, the
826             values corresponding to those fields are taken and
827             joined together through L<$;|perlvar/$;>, and then compared to
828             C, C, etc.
829              
830             If you intend to fetch rows again after a B, you
831             must L the file first.
832              
833             =cut
834              
835             sub writeKeys {
836 2     2 1 11 my $self = shift;
837 2         3 my $lstModifs = shift;
838 2         10 my %modifs = %$lstModifs;
839              
840 2 50       10 croak 'writeKeys : missing @keyFields' if not @_;
841              
842             # clone object associated with a temp file
843 2         32 my $clone = bless {%$self};
844 2         9 $clone->{journal} = undef;
845 2         5 $clone->{FH} = undef;
846 2 50       18 $clone->{FH} = new File::Temp or croak "no tempfile: $^E";
847 2         935 binmode($clone->{FH}, ":crlf");
848              
849 2         47 seek $self->{FH}, 0, 0; # rewind to start of FILE (not start of DATA)
850 2         10 copyData($self->{FH}, $clone->{FH});
851 2         7 $self->rewind;
852 2         5 $clone->rewind;
853              
854 2         13 $self->_journal('KEY', $_, $modifs{$_}) foreach keys %modifs;
855 2         8 $self->_journal('ENDKEYS', @_);
856              
857 2         8 while (my $row = $clone->fetchrow) {
858 20         33 my $k = join($; , @{$row}{@_});
  20         86  
859 20 100       162 my $data = exists $modifs{$k} ? $modifs{$k} : $row;
860 20 100       62 $self->_printRow(@{$data}{$self->headers}) if $data;
  18         264  
861 20         121 delete $modifs{$k};
862             #TODO : optimization, exit loop and copyData if no more items in %modifs
863             }
864              
865             # add remaining values (new keys)
866 2         8 $self->_printRow(@{$_}{$self->headers}) foreach grep {$_} values %modifs;
  0         0  
  0         0  
867              
868 2         114 truncate $self->{FH}, tell $self->{FH};
869             }
870              
871              
872             sub _journal { # ($op, @args, \details)
873             # Internal function for recording an update operation in a journal.
874             # The journal can then be replayed through method L.
875 38     38   48 my $self = shift;
876 38 100       102 return if not $self->{journal}; # return if no active journaling
877              
878 19         632 my @t = localtime;
879 19         41 $t[5] += 1900;
880 19         22 $t[4] += 1;
881 19         92 my $t = sprintf "%04d-%02d-%02d %02d:%02d:%02d", @t[5,4,3,2,1,0];
882              
883 19         40 my @args = @_;
884 19         32 my $rows = [];
885 19         43 for (ref $args[-1]) { # last arg is an array of rows or a single row or none
886 19 100       53 /ARRAY/ and do {($rows, $args[-1]) = ($args[-1], scalar(@{$args[-1]}))};
  7         11  
  7         16  
887 19 100       61 /HASH/ and do {($rows, $args[-1]) = ([$args[-1]], 1)};
  2         28  
888             }
889              
890 19         59 $self->{journal}->_printRow($t, 'ROW', @{$_}{$self->headers}) foreach @$rows;
  100         1399  
891 19         50 $self->{journal}->_printRow($t, @args);
892             }
893              
894              
895             =item C<< playJournal(open1, open2, ...) >>
896              
897             Reads a sequence of update instructions from a journal file
898             and applies them to the current tabular file.
899             Arguments C will be passed to L
900             for opening the journal file ; in most cases, just give the filename.
901              
902             The journal file must contain a sequence of instructions
903             as encoded by the automatic journaling function of this module ;
904             to activate journaling, see the C parameter of the
905             L method.
906              
907             =cut
908              
909              
910             sub playJournal {
911 1     1 1 10 my $self = shift;
912 1 50       6 croak "cannot playJournal while journaling is on!" if $self->{journal};
913 1         1 my $J;
914 1 50       4 _open($J, @_) or croak "open @_: $^E";
915              
916 1         3 my @rows = ();
917 1         5 my @splices = ();
918 1         3 my @writeKeys = ();
919              
920 1         6 local $/ = $self->{recordSep};
921              
922 1         46 while (my $line = <$J>) {
923 119         164 chomp $line;
924              
925 119         270 $line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
926 119         683 my ($t, $ins, @vals) = split $self->{rxFieldSep}, $line, -1;
927 119         610 s/$self->{fieldSepRepl}/$self->{fieldSep}/g foreach @vals;
928              
929 119         210 for ($ins) {
930 119 100       238 /^CLEAR/ and do {$self->clear; next };
  2         8  
  2         11  
931 117 100       351 /^ROW/ and do {push @rows, $self->{ht}->new(@vals); next};
  100         326  
  100         1547  
932 17 100       45 /^SPLICE/ and do {my $nRows = pop @vals;
  8         13  
933 8 50 100     37 carp "invalid number of data rows in journal at $line"
934             if ($nRows||0) != @rows;
935 8 100       38 push @splices, @vals, $nRows ? [@rows] : undef;
936 8         17 @rows = ();
937 8         41 next };
938 9 100       35 /^ENDSPLICES/ and do {$self->splices(@splices);
  5         19  
939 5         378 @splices = ();
940 5         62 next};
941 4 100       14 /^KEY/ and do {my $nRows = pop @vals;
  3         5  
942 3 50 100     18 carp "invalid number of data rows in journal at $line"
943             if ($nRows||0) > 1;
944 3 100       10 push @writeKeys, $vals[0], $nRows ? $rows[0] : undef;
945 3         5 @rows = ();
946 3         14 next };
947 1 50       7 /^ENDKEYS/ and do {$self->writeKeys({@writeKeys}, @vals);
  1         8  
948 1         357 @writeKeys = ();
949 1         10 next};
950             }
951             }
952             }
953              
954              
955              
956             =item C<< compileFilter(query [, implicitPlus]) >>
957              
958             Compiles a query into a filter (code reference) that can be passed to
959             L or L.
960              
961             The query can be
962              
963             =over
964              
965             =item *
966              
967             a regular expression compiled through C<< qr/.../ >>. The regex will be applied
968             to whole data lines, and therefore covers all fields at once.
969             This is the fastest way to filter lines, because it avoids systematic
970             splitting into data records.
971              
972             =item *
973              
974             a data structure resulting from a previous call to
975             C
976              
977             =item *
978              
979             a string of shape C<< K_E_Y : value >> (without any spaces before
980             or after ':'). This will be compiled into
981             a regex matching C in the first column.
982             The special spelling is meant to avoid collision with a real field
983             hypothetically named 'KEY'.
984              
985             =item *
986              
987             a string that will be analyzed through C, and
988             then compiled into a filter function. The query string can contain
989             boolean combinators, parenthesis, comparison operators, etc., as
990             documented in L. The optional second argument
991             I is passed to C ;
992             if true, an implicit '+' is added in front of every
993             query item (therefore the whole query is a big AND).
994              
995             Notice that in addition to usual comparison operators,
996             you can also use regular expressions
997             in queries like
998              
999             +field1=~'^[abc]+' +field2!~'foobar$'
1000              
1001             The query compiler needs to distinguish between word and non-word
1002             characters ; therefore it is important to C in your
1003             scripts (see L). The compiler tries to be clever about a
1004             number of details :
1005              
1006             =over
1007              
1008             =item looking for complete words
1009              
1010             Words in queries become regular expressions enclosed by C<\b> (word
1011             boundaries) ; so a query for C will not match C.
1012              
1013             =item supports * for word completion
1014              
1015             A '*' in a word is compiled into regular expression C<\w*> ;
1016             so queries C or C<*bar> will both match C.
1017              
1018             =item case insensitive, accent-insensitive
1019              
1020             Iso-latin-1 accented characters are translated into character
1021             classes, so for example C becomes C.
1022             Furthermore, as shown in this example, the C flag is turned
1023             on (case-insensitive). Therefore this query will also match
1024             C.
1025              
1026             =item numbers and dates in operators
1027              
1028             When compiling a subquery like C<< fieldname >= 'value' >>, the compiler
1029             checks the value against C and C (as specified in the
1030             L method). Depending on these tests, the subquery is translated
1031             into a string comparison, a numerical comparison, or a date
1032             comparison (more precisely, C<< {date2str($a) cmp date2str($b)} >>).
1033              
1034              
1035             =item set of integers
1036              
1037             Operator C<#> means comparison with a set of integers; internally
1038             this is implemented with a bit vector. So query
1039             C will return records where
1040             field C contains one of the listed integers.
1041             The field name may be omitted if it is the first
1042             field (usually the key field).
1043              
1044             =item pre/postMatch
1045              
1046             Words matched by a query can be highlighted; see
1047             parameters C and C in the L method.
1048              
1049             =back
1050              
1051             =back
1052              
1053             =cut
1054              
1055              
1056             sub compileFilter {
1057 8     8 1 13 my $self = shift;
1058 8         10 my $query = shift;
1059 8         10 my $implicitPlus = shift;
1060              
1061 8 50       29 return $self->_cplRegex($query) if ref $query eq 'Regexp';
1062              
1063 8 50       20 unless (ref $query eq 'HASH') { # if HASH, query was already parsed
1064 8         46 $query = Search::QueryParser->new->parse($query, $implicitPlus);
1065             }
1066              
1067 8         1318 my $code = $self->_cplQ($query);
1068 1 50   1   8 eval 'sub {no warnings "numeric"; (' .$code. ') ? $_[0] : undef;}'
  1     1   1  
  1     1   90  
  1     1   8  
  1     1   1  
  1     1   187  
  1     1   7  
  1     1   1  
  1         80  
  1         6  
  1         2  
  1         80  
  1         5  
  1         2  
  1         65  
  1         5  
  1         2  
  1         68  
  1         5  
  1         2  
  1         65  
  1         12  
  1         2  
  1         88  
  8         640  
1069             or croak $@;
1070             }
1071              
1072              
1073             sub _cplRegex {
1074 0     0   0 my $self = shift;
1075 0         0 my $regex = shift;
1076 0     0   0 return eval {sub {$_[0]->{line} =~ $regex}};
  0         0  
  0         0  
1077             }
1078              
1079              
1080             sub _cplQ {
1081 9     9   16 my $self = shift;
1082 9         13 my $q = shift;
1083              
1084 9         12 my $mandatory = join(" and ", map {$self->_cplSubQ($_)} @{$q->{'+'}});
  2         19  
  9         29  
1085 9         11 my $exclude = join(" or ", map {$self->_cplSubQ($_)} @{$q->{'-'}});
  3         7  
  9         23  
1086 9         13 my $optional = join(" or ", map {$self->_cplSubQ($_)} @{$q->{''}});
  7         22  
  9         22  
1087              
1088 9 50 66     53 croak "missing positive criteria in query" if not ($mandatory || $optional);
1089 9   66     40 my $r = "(" . ($mandatory || $optional) . ")";
1090 9 100       22 $r .= " and not ($exclude)" if $exclude;
1091 9         22 return $r;
1092             }
1093              
1094              
1095             sub _cplSubQ {
1096 12     12   17 my $self = shift;
1097 12         12 my $subQ = shift;
1098              
1099 12         38 for ($subQ->{op}) {
1100              
1101             # Either a list of subqueries...
1102             /^\(\)$/
1103 12 100       29 and do {# assert(ref $subQ->{value} eq 'HASH' and not $subQ->{field})
1104             # if DEBUG;
1105 1         6 return $self->_cplQ($subQ->{value}); };
1106              
1107             # ...or a comparison operator with a word or list of words.
1108             # In that case we need to do some preparation for the source of comparison.
1109              
1110             # assert(not ref $subQ->{value} or ref $subQ->{value} eq 'ARRAY') if DEBUG;
1111              
1112             # Data to compare : either ...
1113 11         15 my $src = qq{\$_[0]->{line}}; # ... by default, the whole line ;
1114 11 100       26 if ($subQ->{field}) { # ... or an individual field.
1115 6 100       14 if ($subQ->{field} eq 'K_E_Y') { # Special pseudo field (in first position) :
1116 2         3 $subQ->{op} = '~'; # cheat, replace ':' by a regex operation.
1117 2         7 $subQ->{value} = "^$subQ->{value}(?:\\Q$self->{fieldSep}\\E|\$)";
1118             }
1119             else {
1120 4 50       6 my $fieldNum = $self->ht->{$subQ->{field}} or
1121             croak "invalid field name $subQ->{field} in request";
1122 4         11 $src = qq{_getField(\$_[0], $fieldNum)};
1123             }
1124             }
1125              
1126             /^:$/
1127 11 100       45 and do {my $s = $subQ->{value};
  7         14  
1128              
1129 7   33     65 my $noHighlights = # no result highlighting if ...
1130             $s eq '*' # .. request matches anything
1131             || ! ($self->{preMatch} || $self->{postMatch})
1132             # .. or no highlight was requested
1133             || $subQ->{field}; # .. or request is on specific field
1134              
1135 7         14 $s =~ s[\*][\\w*]g; # replace star by \w* regex
1136 7         13 $s =~ s{[\[\]\(\)+?]}{\Q$&\E}g; # escape other regex chars
1137 7         19 $s =~ s[\s+][\\s+]g; # replace spaces by \s+ regex
1138              
1139              
1140 7         14 $s =~ s/ç/[çc]/g;
1141 7         12 $s =~ s/([áàâä])/[a$1]/ig;
1142 7         14 $s =~ s/([éèêë])/[e$1]/ig;
1143 7         11 $s =~ s/([íìîï])/[i$1]/ig;
1144 7         12 $s =~ s/([óòôö])/[o$1]/ig;
1145 7         13 $s =~ s/([úùûü])/[u$1]/ig;
1146 7         11 $s =~ s/([ýÿ])/[y$1]/ig;
1147              
1148 7 100       28 my $wdIni = ($s =~ /^\w/) ? '\b' : '';
1149 7 100       25 my $wdEnd = ($s =~ /\w$/) ? '\b' : '';
1150 7         10 my $lineIni = "";
1151 7 100 100     35 $lineIni = "(?{avoidMatchKey} and not $subQ->{field};
1152 7         15 $s = "$lineIni$wdIni$s$wdEnd";
1153              
1154 7 50       46 return $noHighlights ? "($src =~ m[$s]i)" :
1155             "($src =~ s[$s][$self->{preMatch}\$&$self->{postMatch}]ig)";
1156             };
1157              
1158              
1159              
1160             /^#$/ # compare source with a list of numbers
1161 4 50       17 and do {
1162 0         0 my $has_state = eval "use feature 'state'; 1"; # true from Perl 5.10
1163 0 0       0 my $decl = $has_state ? "use feature 'state'; state \$numvec"
1164             : "my \$numvec if 0"; # undocumented hack
1165              
1166             # build a block that at first call creates a bit vector; then at
1167             # each call, the data source is compared with the bit vector
1168 0         0 return qq{
1169             do {
1170             $decl;
1171             no warnings qw/uninitialized numeric/;
1172             \$numvec or do {
1173             my \$nums = q{$subQ->{value}};
1174             vec(\$numvec, \$_, 1) = 1 for (\$nums =~ /\\d+/g);
1175             };
1176             vec(\$numvec, int($src), 1);
1177             }
1178             };
1179             };
1180              
1181              
1182             # for all other ops, $subQ->{value} must be a scalar
1183             # assert(not ref $subQ->{value}) if DEBUG;
1184              
1185 4 50 33     47 (/^(!)~$/ or /^()=?~$/) and return "$1($src =~ m[$subQ->{value}])";
1186              
1187             # choose proper comparison according to datatype of $subQ->{value}
1188 0 0       0 my $cmp = ($subQ->{value} =~ $self->{rxDate}) ?
    0          
1189             "(\$self->{date2str}($src) cmp q{" .
1190             $self->{date2str}($subQ->{value}) . "})" :
1191             ($subQ->{value} =~ $self->{rxNum}) ?
1192             "($src <=> $subQ->{value})" :
1193             # otherwise
1194             "($src cmp q{$subQ->{value}})";
1195              
1196 0 0       0 /^=?=$/ and return "$cmp == 0";
1197 0 0       0 /^(?:!=|<>)$/ and return "$cmp != 0";
1198 0 0       0 /^>$/ and return "$cmp > 0";
1199 0 0       0 /^>=$/ and return "$cmp >= 0";
1200 0 0       0 /^<$/ and return "$cmp < 0";
1201 0 0       0 /^<=$/ and return "$cmp <= 0";
1202 0         0 croak "unexpected op $_ ($subQ->{field} / $subQ->{value})";
1203             }
1204             }
1205              
1206             ############################################################
1207             # utility functions
1208             ############################################################
1209              
1210             sub urlEncode {
1211 6     6 0 9 my $s = shift;
1212 6         18 return join "", map {sprintf "%%%02X", ord($_)} split //, $s;
  6         50  
1213             }
1214              
1215             sub copyData { # copy from one filehandle to another
1216 8     8 0 34 my ($f1, $f2) = @_;
1217 8         11 my $buf;
1218 8         101 while (read $f1, $buf, BUFSIZE) {print $f2 $buf;}
  6         94  
1219             }
1220              
1221             =back
1222              
1223             =head1 AUTHOR
1224              
1225             Laurent Dami, Elaurent.dami AT etat ge chE
1226              
1227              
1228             =head1 COPYRIGHT AND LICENSE
1229              
1230             Copyright (C) 2005 by Laurent Dami.
1231              
1232             This library is free software; you can redistribute it and/or modify
1233             it under the same terms as Perl itself.
1234              
1235             =cut
1236              
1237             1;