File Coverage

blib/lib/File/Tabular.pm
Criterion Covered Total %
statement 358 394 90.8
branch 130 206 63.1
condition 42 71 59.1
subroutine 45 49 91.8
pod 16 18 88.8
total 591 738 80.0


line stmt bran cond sub pod time code
1             package File::Tabular;
2              
3             # TODO : -optimize _getField (could probably dispense with
4             # "mkRecord", call $self->..)
5             # -BUG : preMatch/postMatch won't work on explicit field searches
6             # -optimize: postpone preMatch/postMatch until display time
7             # -escaping fieldSep : make it optional
8              
9 1     1   128719 use strict;
  1         2  
  1         49  
10 1     1   6 use warnings;
  1         2  
  1         66  
11 1     1   6 no warnings 'uninitialized';
  1         2  
  1         53  
12 1     1   709 use locale;
  1         1021  
  1         9  
13 1     1   44 use Carp;
  1         1  
  1         71  
14 1     1   4 use Fcntl ':flock';
  1         1  
  1         135  
15 1     1   573 use Hash::Type;
  1         1850  
  1         46  
16 1     1   712 use Search::QueryParser 0.92;
  1         2470  
  1         2669  
17 1     1   15 use File::Temp;
  1         3  
  1         295  
18              
19             our $VERSION = "0.73";
20              
21              
22             =head1 NAME
23              
24             File::Tabular - searching and editing flat tabular files
25              
26             =head1 SYNOPSIS
27              
28             use File::Tabular;
29             my $f = File::Tabular->new($filename);
30              
31             my $row = $f->fetchrow;
32             print $row->{field1}, $row->{field2};
33              
34             $row = $f->fetchrow(where => 'someWord');
35             $row = $f->fetchrow(where => 'field1 > 4 AND field2 >= "01.01.2001"');
36             $row = $f->fetchrow(where => qr/some\s+(complex\s*)?(regex|regular expression)/i);
37              
38             $f->rewind;
39             my $rows = $f->fetchall(where => 'someField =~ ^[abc]+');
40             print $_->{someField} foreach @$rows;
41              
42             $f->rewind;
43             $rows = $f->fetchall(where => '+field1:someWord -field2:otherWord',
44             orderBy => 'field3, field6:num, field5:-alpha');
45              
46             $f->rewind;
47             my $hashRows = $f->fetchall(where => 'foo AND NOT bar',
48             key => 'someField');
49             print $hashRows->{someKey}{someOtherField};
50              
51             # open for updates, and remember the updates in a journal file
52             $f = File::Tabular->new("+<$filename", {journal => $journalFile});
53              
54             # updates at specific positions (line numbers)
55             $f->splices(4 => 2, undef, # delete 2 lines from position 4
56             7 => 1, {f1 => $v1, f2 => $v2, ...}, # replace line 7
57             9 => 0, { ...}, # insert 1 new line at position 9
58             22 => 0, [{...}, {...}, ...] # insert several lines at pos. 22
59             ...
60             -1 => 0, [{...}, {...}, ...] # append at the end
61             );
62              
63             # shorthand to add new data at the end
64             $f->append({f1 => $v1, f2 => $v2, ...});
65             # same thing, but use the "Hash::Type" associated to the file
66             $f->append($f->ht->new($v1, $v2, ...));
67              
68              
69             $f->clear; # removes all data (but keeps the header line)
70              
71             # updates at specific keys, corresponding to @keyFields
72             $f->writeKeys({key1 => {f1 => $v1, f2 => $v2, ...}, # add or update
73             key2 => undef, # remove
74             ...
75             }, @keyFields);
76              
77              
78             # replay the updates on a backup file
79             my $bck = File::Tabular->new("+<$backupFile");
80             $bck->playJournal($journalFile);
81              
82             # get info from associated filehandle
83             printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
84             my $mtime = $f->mtime;
85             printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
86              
87             =head1 DESCRIPTION
88              
89             A I is a flat text file containing data organised
90             in rows (records) and columns (fields).
91              
92             This module provides database-like functionalities for managing
93             tabular files : retrieving, searching, writing, autonumbering, journaling.
94             However, unlike other modules like L, it doesn't try
95             to make it look like a database : rather, the API was designed
96             specifically for work with tabular files.
97             Instead of SQL, search queries are specified in a web-like
98             fashion, with support for regular expressions and cross-field
99             searches. Queries are compiled internally into perl closures
100             before being applied to every data record, which makes it
101             quite fast.
102              
103             Write operations take a list of modifications as argument;
104             then they apply the whole list atomically in a single rewrite
105             of the data file.
106              
107             Here are some of the reasons why you might choose to
108             work with a tabular file rather than a regular database :
109              
110             =over
111              
112             =item *
113              
114             no need to install a database system (not even buy one)!
115              
116             =item *
117              
118             easy portability and data exchange with external tools
119             (text editor, spreadsheet, etc.)
120              
121             =item *
122              
123             search queries immediately ready for a web application
124              
125             =item *
126              
127             good search performance, even with several thousand records
128              
129             =back
130              
131              
132             On the other hand, tabular files will probably be inappropriate if you
133             need very large volumes of data, complex multi-table data models or
134             frequent write operations.
135              
136              
137              
138             =head1 METHODS
139              
140             =over
141              
142             =item C<< new (open1, open2, ..., {opt1 => v1, opt2 => v2, ...}) >>
143              
144             Creates a new tabular file object.
145             The list of arguments C is fed directly to
146             L for opening the associated file.
147             A reference to an already opened filehandle can also be given instead.
148              
149             The final hash ref is a collection of optional parameters, taken
150             from the following list :
151              
152             =over
153              
154             =item fieldSep
155              
156             field separator : any character except '%' ('|' by default).
157             Escape sequences like C<\t> are admitted.
158              
159             =item recordSep
160              
161             record separator ('\n' by default).
162              
163             =item crlf
164              
165             if true, filehandles use the L layer C<:crlf>.
166             This is the default, for unfortunate historical reasons.
167              
168              
169             =item fieldSepRepl
170              
171             string to substitute if fieldSep is met in the data.
172             (by default, url encoding of B, i.e. '%7C' )
173              
174             =item recordSepRepl
175              
176             string to substitute if recordSep is met in the data
177             (by default, url encoding of B, i.e. '%0A' )
178              
179              
180             =item autoNumField
181              
182             name of field for which autonumbering is turned on (none by default).
183             This is useful to generate keys : when you write a record, the
184             character '#' in that field will be replaced by a fresh number,
185             incremented automatically. This number will be 1 + the
186             largest number read I (it is your responsability to read all
187             records before the first write operation).
188              
189             =item autoNum
190              
191             initial value of the counter for autonumbering (1 by default).
192              
193             =item autoNumChar
194              
195             character that will be substituted by an autonumber when
196             writing records ('#' by default).
197              
198              
199             =item flockMode
200              
201             mode for locking the file, see L. By default,
202             this will be LOCK_EX if B contains 'E' or
203             '+E', LOCK_SH otherwise.
204              
205             =item flockAttempts
206              
207             number of attempts to lock the file,
208             at 1 second intervals, before returning an error.
209             Zero by default.
210             If nonzero, LOCK_NB is added to flockMode;
211             if zero, a single locking attempt will be made, blocking
212             until the lock is available.
213              
214             =item headers
215              
216             reference to an array of field names.
217             If not present, headers will be read from the first line of
218             the file.
219              
220             =item printHeaders
221              
222             if true, the B will be printed to the file.
223             If not specified, treated as 'true' if
224             B contains 'E'.
225              
226             =item journal
227              
228             name of journaling file, or reference to a list of arguments for
229             L. The journaling file will log all write operations.
230             If specified as a simple file name, it will be be opened in
231             'EE' mode.
232              
233             A journal file can then be replayed through method L
234             (this is useful to recover after a crash, by playing the journal
235             on a backup copy of your data).
236              
237             =item rxDate
238              
239             Regular expression for matching a date.
240             Default value is C<< qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/ >>.
241             This will be used by L to perform appropriate comparisons.
242              
243             =item date2str
244              
245             Ref to a function for transforming dates into strings
246             suitable for sorting (i.e. year-month-day).
247             Default is :
248              
249             sub {my ($d, $m, $y) = ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
250             $y += ($y > 50) ? 1900 : 2000 if defined($y) && $y < 100;
251             return sprintf "%04d%02d%02d", $y, $m, $d;}
252              
253             =item rxNum
254              
255             Regular expression for matching a number.
256             Default value is C<< qr/^[-+]?\d+(?:\.\d*)?$/ >>.
257             This will be used by L to perform appropriate comparisons.
258              
259             =item preMatch/postMatch
260              
261             Strings to insert before or after a match when filtering rows
262             (will only apply to search operator ':' on the whole line, i.e.
263             query C<< "foo OR bar" >> will highlight both "foo" and "bar",
264             but query C<< "~ 'foo' OR someField:bar" >>
265             will not highlight anything; furthermore, a match-all
266             request containing just '*' will not highlight anything either).
267              
268             =item avoidMatchKey
269              
270             If true, searches will avoid to match on the first field. So a request
271             like C<< $ft->fetchall(where => '123 OR 456') >> will not find
272             the record with key 123, unless the word '123' appears somewhere
273             in the other fields. This is useful when queries come from a Web
274             application, and we don't want users to match a purely technical
275             field.
276              
277             This search behaviour will not apply to regex searches. So requests like
278             C<< $ft->fetchall(where => qr/\b(123|456)\b/) >>
279             or
280             C<< $ft->fetchall(where => ' ~ 123 OR ~ 456') >>
281             will actually find the record with key 123.
282              
283             =back
284              
285             =cut
286              
287             ############################################################
288             # CONSTANTS
289             ############################################################
290              
291 1     1   9 use constant BUFSIZE => 1 << 21; # 2MB, used in copyData
  1         3  
  1         531  
292              
293             use constant DEFAULT => {
294             fieldSep => '|',
295             recordSep => "\n",
296             crlf => 1,
297             autoNumField => undef,
298             autoNumChar => '#',
299             autoNum => 1,
300             lockAttempts => 0,
301             rxNum => qr/^[-+]?\d+(?:\.\d*)?$/,
302             rxDate => qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/,
303 0         0 date2str => sub {my ($d, $m, $y) = ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
304 0 0 0     0 $y += ($y > 50) ? 1900 : 2000 if defined($y) && $y < 100;
    0          
305 0         0 return sprintf "%04d%02d%02d", $y, $m, $d;},
306 1         149 preMatch => '',
307             postMatch => '',
308             avoidMatchKey => undef,
309 1     1   11 };
  1         2  
310              
311              
312             use constant {
313 1         9 statType => Hash::Type->new(qw(dev ino mode nlink uid gid rdev size
314             atime mtime ctime blksize blocks)),
315             timeType => Hash::Type->new(qw(sec min hour mday mon year wday yday isdst))
316 1     1   8 };
  1         2  
317              
318              
319             ############################################################
320             # METHODS
321             ############################################################
322              
323             sub new {
324 3     3 1 162389 my $class = shift;
325 3 50       18 my $args = ref $_[-1] eq 'HASH' ? pop : {};
326              
327             # create object with default values
328 3         10 my $self = bless {};
329 3         6 foreach my $option (keys %{DEFAULT()}) {
  3         33  
330 39   100     182 $self->{$option} = $args->{$option} || DEFAULT->{$option};
331             }
332              
333             # eval to expand escape sequences, for example if fieldSep is given as '\t'
334 3         11 foreach my $option (qw(fieldSep recordSep)) {
335 6         558 $self->{$option} = eval qq{qq{$self->{$option}}};
336             }
337              
338             # field and record separators
339 3 50       20 croak "can't use '%' as field separator" if $self->{fieldSep} =~ /%/;
340              
341 3   33     28 $self->{recordSepRepl} = $args->{recordSepRepl} || urlEncode($self->{recordSep});
342 3   33     17 $self->{fieldSepRepl} = $args->{fieldSepRepl} || urlEncode($self->{fieldSep});
343 3         47 $self->{rxFieldSep} = qr/\Q$self->{fieldSep}\E/;
344              
345              
346             # open file and get lock
347 3 50       19 $self->_open($self->{FH}, @_) or croak "open @_ : $! $^E";
348 3   50     18 my $flockAttempts = $args->{flockAttempts} || 0;
349 3   33     30 my $flockMode = $args->{flockMode} || ($_[0] =~ />|\+
350 3 50       10 $flockMode |= LOCK_NB if $flockAttempts > 0;
351 3         14 for (my $n = $flockAttempts; $n >= 1; $n--) {
352 0 0       0 last if flock $self->{FH}, $flockMode; # exit loop if flock succeeded
353 0 0       0 $n > 1 ? sleep(1) : croak "could not flock @_: $^E";
354             };
355              
356             # setup journaling
357 3 100       9 if (exists $args->{journal}) {
358 1         4 my $j = {}; # create a fake object for _printRow
359 1         9 $j->{$_} = $self->{$_} foreach qw(fieldSep recordSep
360             fieldSepRepl recordSepRepl);
361 1 50       11 $self->_open($j->{FH}, ref $args->{journal} eq 'ARRAY' ? @{$args->{journal}}
  0 50       0  
362             : ">>$args->{journal}")
363             or croak "open journal $args->{journal} : $^E";
364 1         4 $self->{journal} = bless $j;
365             }
366              
367             # field headers
368 3   100     28 my $h = $args->{headers} || [split($self->{rxFieldSep}, $self->_getLine, -1)];
369 3         28 $self->{ht} = Hash::Type->new(@$h);
370 3 50       193 my $must_print_headers = exists $args->{printHeaders} ? $args->{printHeaders} : ($_[0] =~ />/);
371 3 100       16 $self->_printRow(@$h) if $must_print_headers;
372              
373             # ready for reading data lines
374 3         11 $self->{dataStart} = tell($self->{FH});
375 3         10 $. = 0; # setting line counter to zero for first dataline
376              
377              
378             # create a closure which takes a (already chomped) line and returns a record
379 3         6 my %tmp; # copy some attributes of $self in order to avoid a cyclic ref
380 3         23 $tmp{$_} = $self->{$_} foreach qw/rxFieldSep fieldSepRepl fieldSep ht/;
381             $self->{mkRecord} = sub {
382 432     432   2380 my @vals = split $tmp{rxFieldSep}, $_[0], -1;
383 432         2674 s/$tmp{fieldSepRepl}/$tmp{fieldSep}/g foreach @vals;
384 432         1395 return $tmp{ht}->new(@vals);
385 3         19 };
386              
387 3         35 return $self;
388             }
389              
390              
391             sub _open { # CORE::open is prototyped, so it cannot take args from an array. This is a workaround
392 5     5   10 my $self = shift;
393 5 100       587 my $result = (ref $_[1] eq 'GLOB') ? $_[0] = $_[1] :
    50          
    50          
394             @_ > 3 ? open($_[0], $_[1], $_[2], @_[3..$#_]) :
395             @_ > 2 ? open($_[0], $_[1], $_[2]) :
396             open($_[0], $_[1]);
397 5 50 33     78 binmode($_[0], ":crlf") if $result && $self->{crlf};
398 5         23 return $result;
399             }
400              
401              
402             sub _getLine {
403 768     768   2475 my $self = shift;
404 768         2776 local $/ = $self->{recordSep};
405 768         3693 my $line = readline $self->{FH};
406 768 100       1645 if (defined $line) {
407 755         1318 chomp $line;
408 755         2639 $line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
409             }
410 768         2862 return $line;
411             }
412              
413              
414             sub _printRow { # Internal function to print a data row and automatically deal with
415             # autonumbering, if necessary.
416 335     335   1132 my ($self, @vals) = @_;
417              
418             # autoNumbering
419 335 100       6047 if ($self->{autoNumField}) {
420 216         464 my $ix = $self->{ht}{$self->{autoNumField}} - 1;
421 216 100       1638 if ($vals[$ix] =~ s/$self->{autoNumChar}/$self->{autoNum}/) {
    100          
422 8         19 $self->{autoNum} += 1;
423             }
424             elsif ($vals[$ix] =~ m/(\d+)/) {
425 206         657 my $next_autonum = $1 + 1;
426 206 100       522 $self->{autoNum} = $next_autonum if $next_autonum > $self->{autoNum};
427             }
428             }
429              
430             # build row and print it
431 335         2867 s/\Q$self->{fieldSep}\E/$self->{fieldSepRepl}/ foreach @vals;
432 335         900 my $line = join $self->{fieldSep}, @vals;
433 335         1003 $line =~ s/\Q$self->{recordSep}\E/$self->{recordSepRepl}/g;
434 335         622 my $fh = $self->{FH};
435 335         2161 print $fh $line, $self->{recordSep};
436             }
437              
438              
439             =item C<< fetchrow(where => filter) >>
440              
441             returns the next record matching the (optional) filter. If there is
442             no filter, just returns the next record.
443              
444             The filter is either a code reference generated by L,
445             or a string which will be automatically fed as argument to
446             L; this string can contain just a word, a regular
447             expression, a complex boolean query involving field names and
448             operators, etc., as explained below.
449              
450             =cut
451              
452             # _getField($r, $fieldNumber)
453             # Internal method for lazy creation of a record from a line.
454             # Will be called only when a specific field is required.
455             # See creation of $r in method 'fetchrow' just below.
456              
457 200   66 200   1234 sub _getField { tied(%{$_[0]->{record} ||= $_[0]->{mkRecord}($_[0]->{line})})->[$_[1]]; }
  200         1992  
458              
459              
460             sub fetchrow {
461 385     385 1 2530 my $self = shift;
462 385         594 my $filter = undef;
463              
464             # accept fetchrow(where=>filter) or fetchrow({where=>filter}) or fetchrow(filter)
465 385 50       1339 my @args = ref $_[0] eq 'HASH' ? @{$_[0]} : @_;
  0         0  
466 385 100       762 if (@args) {
467 361 100       963 shift @args if $args[0] =~ /^where$/i;
468 361 50       790 croak "fetchrow : invalid number of arguments" if @args != 1;
469 361         547 $filter = $args[0];
470 361 100 100     1028 $filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
471             }
472              
473 385         990 while (my $line = $self->_getLine) {
474              
475             # create structure $r for _getField
476 754         2491 my $r = {line => $line, record => undef, mkRecord => $self->{mkRecord}};
477              
478 754 100 100     13333 next if $filter and not $filter->($r);
479              
480 372   66     3433 $r->{record} ||= $self->{mkRecord}($r->{line});
481              
482 372 100       5972 if ($self->{autoNumField}) {
483 186         755 my ($n) = $r->{record}{$self->{autoNumField}} =~ m/(\d+)/;
484 186 50 33     2549 $self->{autoNum} = $n+1 if $n and $n+1 > $self->{autoNum};
485             }
486 372         1674 return $r->{record};
487             }
488 13         58 return undef;
489             }
490              
491             =item C<< fetchall(where => filter, orderBy => cmp) >>
492              
493             =item C<< fetchall(where => filter, key => keySpecif) >>
494              
495             finds all next records matching the (optional) filter.
496             If there is no filter, finds all remaining records.
497              
498             The filter is either a code reference generated by L,
499             or a string which will be automatically fed as argument to
500             L.
501              
502             The return value depends on context and on arguments :
503              
504             =over
505              
506             =item *
507              
508             if no B parameter is given, and we are in a scalar context, then
509             C returns a reference to an array of records.
510              
511             The optional B parameter can be a field name, a ref to a list
512             of field names, a string like C<"field1: -alpha, field2:-num, ...">,
513             or, more generally, a user-provided comparison function;
514             see L for a fully detailed explanation.
515              
516             Otherwise, the resulting array is in data source order.
517              
518             =item *
519              
520             if no B parameter is given, and we are in a list context, then
521             C returns a pair : the first item is a reference to an array
522             of records as explained above ; the second item is a reference to an
523             array of line numbers corresponding to those records (first data line
524             has number 0). These line numbers might be useful later
525             if you update the records through the L method.
526             No B is allowed if C is called in
527             list context.
528              
529             =item *
530              
531             if a B parameter is given,
532             then C returns a reference to a hash, whose
533             values are the retrieved records, and whose keys
534             are built according to the B argument.
535             This must be either a single field name (scalar), or
536             a a list of field names (ref to an array of scalars).
537             Values corresponding to those field names will form the
538             key for each entry of the hash;
539             if necessary, multiple values are joined together
540             through L<$;|perlvar/$;>.
541             No B argument is allowed, because hashes have no ordering.
542              
543             =back
544              
545             =cut
546              
547             sub fetchall {
548 9     9 1 2189 my $self = shift;
549 9 50       50 my %args = ref $_[0] eq 'HASH' ? %{$_[0]} : @_;
  0         0  
550              
551             croak "fetchall in list context : not allowed with 'orderBy' or 'key' arg"
552 9 50 33     43 if wantarray and ($args{orderBy} or $args{key});
      66        
553              
554             croak "fetchall : args 'orderBy' and 'key' not allowed together"
555 9 0 33     47 if $args{orderBy} and $args{key};
556              
557 9 50       55 my @k = !$args{key} ? () : ref $args{key} ? @{$args{key}} : ($args{key});
  0 100       0  
558              
559 9         22 my $filter = $args{where};
560 9 100 66     50 $filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
561              
562 9 100       31 if (@k) { # will return a hash of rows
563 2 50       9 croak "fetchall : 'orderBy' not allowed with 'key'" if $args{orderBy};
564 2 50       5 croak "fetchall in list context : not allowed with 'key'" if wantarray;
565 2         6 my $rows = {};
566 2         8 while (my $row = $self->fetchrow($filter)) {
567 19         43 $rows->{join($;, @{$row}{@k})} = $row;
  19         94  
568             }
569 2         83 return $rows;
570             }
571             else { # will return an array of rows
572 7         19 my ($rows, $line_nos) = ([], []);
573 7         25 while (my $row = $self->fetchrow($filter)) {
574 327         630 push @$rows, $row;
575 327 100       1036 push @$line_nos, $. - 1 if wantarray;
576             }
577              
578 7 50       27 if ($args{orderBy}) {
579 0 0       0 croak "fetchall in list context : not allowed with 'orderBy'" if wantarray;
580 0         0 my $tmp = ref $args{orderBy};
581 0         0 my $cmpFunc = $tmp eq 'ARRAY' ? $self->{ht}->cmp(@{$args{orderBy}}) :
582             $tmp eq 'CODE' ? $args{orderBy} :
583 0 0       0 $self->{ht}->cmp($args{orderBy});
    0          
584 0         0 $rows = [sort $cmpFunc @$rows];
585             }
586 7 100       115 return wantarray ? ($rows, $line_nos) : $rows;
587             }
588             }
589              
590              
591              
592             =item C<< rewind >>
593              
594             Rewinds the file to the first data line (after the headers)
595              
596             =cut
597              
598             sub rewind {
599 27     27 1 12640 my $self = shift;
600 27         438 seek $self->{FH}, $self->{dataStart}, 0;
601 27         94 $. = 0;
602             }
603              
604              
605              
606             =item C<< ht >>
607              
608             Returns the instance of L associated with
609             the file.
610              
611             =cut
612              
613 319     319 1 501 sub ht { my $self = shift; $self->{ht}; }
  319         910  
614              
615              
616              
617             =item C<< headers >>
618              
619             returns the list of field names
620              
621             =cut
622              
623 314     314 1 549 sub headers { my $self = shift; $self->ht->names; }
  314         629  
624              
625             =item C<< stat >>
626              
627             returns a hash ref corresponding to a call of
628             L on the associated filehandle.
629             Keys of the hash have names as documented in
630             L. Ex:
631              
632             printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
633              
634             =cut
635              
636              
637 5     5 1 30415 sub stat {my $self = shift; statType->new(stat($self->{FH}));}
  5         112  
638              
639              
640             =item C<< atime >>, C<< mtime >>, C<< ctime >>
641              
642             each of these methods returns a hash ref corresponding to a call of
643             L on the last access time, last modified
644             time, or last inode change time of the associated filehandle
645             (see L for explanations).
646             Keys of the hash have names as documented in
647             L. Ex:
648              
649             my $mtime = $f->mtime;
650             printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
651              
652             =cut
653              
654 0     0 1 0 sub atime {my $self = shift; timeType->new(localtime(($self->stat->{atime})));}
  0         0  
655 1     1 1 457 sub mtime {my $self = shift; timeType->new(localtime(($self->stat->{mtime})));}
  1         12  
656 0     0 1 0 sub ctime {my $self = shift; timeType->new(localtime(($self->stat->{ctime})));}
  0         0  
657              
658             =item C<< splices >>
659              
660             splices(pos1 => 2, undef, # delete 2 lines
661             pos2 => 1, row, # replace 1 line
662             pos3 => 0, [row1, row2 ...] # insert lines
663             ...
664             -1 => 0, [row1, ... ] # append lines
665             );
666              
667             # special case : autonum if pos== -1
668              
669              
670             Updates the data, in a spirit similar to
671             L (hence the name of the method). The whole file is
672             rewritten in an atomic operation, deleting, replacing or appending
673             data lines as specified by the "splice instructions". Returns the
674             number of "splice instructions" performed.
675              
676             A splice instruction is a triple composed of :
677              
678             =over
679              
680             =item 1
681              
682             a position (line number) that specifies
683             the place where modifications will occur.
684             Line numbers start at 0.
685             Position -1 means end of data.
686              
687             =item 2
688              
689             a number of lines to delete (might be zero).
690              
691             =item 3
692              
693             a ref to a hash or to a list of hashes containing new data to
694             insert (or C if there is no new data).
695              
696             =back
697              
698             If there are several splice instructions, their positions must be
699             sorted in increasing order (except of course position -1,
700             meaning "end of data", which must appear last).
701              
702             Positions always refer to line numbers in the original file, before
703             any modifications. Therefore, it makes no sense to write
704              
705             splices(10 => 5, undef,
706             12 => 0, $myRow)
707              
708             because after deleting 5 rows at line 10, we cannot insert a new
709             row at line 12.
710              
711             The whole collection of splice instructions
712             may also be passed as an array ref instead of a list.
713              
714             If you intend to fetch rows again after a B, you
715             must L the file first.
716              
717             =cut
718              
719              
720              
721             sub splices {
722 10     10 1 1172 my $self = shift;
723 10 100       35 my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
724 10         20 my $nArgs = @$args;
725 10 50       30 croak "splices : number of arguments must be multiple of 3" if $nArgs % 3;
726              
727 10         19 my $TMP = undef; # handle for a tempfile
728              
729 10         19 my $i;
730 10         29 for ($i=0; $i < $nArgs; $i+=3 ) {
731 16         55 my ($pos, $del, $lines) = @$args[$i, $i+1, $i+2];
732              
733 16         58 $self->_journal('SPLICE', $pos, $del, $lines);
734              
735 16 100 66     67 if ($pos == -1) { # we want to append new data at end of file
    100          
736             $TMP ? # if we have a tempfile ...
737             copyData($TMP, $self->{FH}) # copy back all remaining data
738 10 100       254 : seek $self->{FH}, 0, 2; # otherwise goto end of file
739 10         39 $pos = $.; # sync positions (because of test 12 lines below)
740             }
741             elsif ( # we want to put data in the middle of file and ..
742             not $TMP and $self->stat->{size} > $self->{dataStart}) {
743 2 50       93 $TMP = File::Temp->new or croak "no tempfile: $^E";
744 2 50       1346 binmode($TMP, ":crlf") if $self->{crlf};
745              
746 2         12 $self->rewind;
747 2         9 copyData($self->{FH}, $TMP);
748 2         9 $self->rewind;
749 2         155 seek $TMP, 0, 0;
750             }
751              
752 16 50       83 croak "splices : cannot go back to line $pos" if $. > $pos;
753              
754 16         85 local $/ = $self->{recordSep};
755              
756 16         48 while ($. < $pos) { # sync with tempfile
757 10         44 my $line = <$TMP>;
758 10 50       26 croak "splices : no such line : $pos ($.)" unless defined $line;
759 10         19 my $fh = $self->{FH};
760 10         64 print $fh $line;
761             }
762              
763 16         48 while ($del--) { # skip lines to delete from tempfile
764 6         79 my $line = <$TMP>;
765 6 50       20 croak "splices : no line to delete at pos $pos" unless defined $line;
766             }
767              
768 16 50       66 $lines = [$lines] if ref $lines eq 'HASH'; # single line
769 16         63 $self->_printRow(@{$_}{$self->headers}) for @$lines;
  196         2259  
770             }
771 10 100       30 copyData($TMP, $self->{FH}) if $TMP; # copy back all remaining data
772 10         913 truncate $self->{FH}, tell $self->{FH};
773 10         61 $self->_journal('ENDSPLICES');
774 10         47 return $i / 3;
775             }
776              
777              
778              
779             =item C<< append(row1, row2, ...) >>
780              
781             This appends new records at the end of data, i.e. it is
782             a shorthand for
783              
784             splices(-1 => 0, [row1, row2, ...])
785              
786             =cut
787              
788              
789             sub append {
790 4     4 1 1300 my $self = shift;
791 4 100       22 my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
792 4         20 $self->splices([-1 => 0, $args]);
793             }
794              
795              
796             =item C<< clear >>
797              
798             removes all data (but keeps the header line)
799              
800             =cut
801              
802             sub clear {
803 4     4 1 1247 my $self = shift;
804 4         17 $self->rewind;
805 4         15 $self->_journal('CLEAR');
806 4         494 truncate $self->{FH}, $self->{dataStart};
807             }
808              
809              
810              
811             =item C<< writeKeys({key1 => row1, key2 => ...}, @keyFields) >>
812              
813             Rewrites the whole file, applying modifications as specified
814             in the hash ref passed as first argument. Keys in this hash
815             are compared to keys built from the original data,
816             according to C<@keyFields>. Therefore, C may replace
817             an existing row, if the key corresponding to C was found ;
818             otherwise, a new row is added. If C is C, the
819             corresponding row is deleted from the file.
820              
821             C<@keyFields> must contain the name of one or several
822             fields that build up the primary key. For each data record, the
823             values corresponding to those fields are taken and
824             joined together through L<$;|perlvar/$;>, and then compared to
825             C, C, etc.
826              
827             If you intend to fetch rows again after a B, you
828             must L the file first.
829              
830             =cut
831              
832             sub writeKeys {
833 2     2 1 14 my $self = shift;
834 2         4 my $lstModifs = shift;
835 2         10 my %modifs = %$lstModifs;
836              
837 2 50       10 croak 'writeKeys : missing @keyFields' if not @_;
838              
839             # clone object associated with a temp file
840 2         30 my $clone = bless {%$self};
841 2         8 $clone->{journal} = undef;
842 2         6 $clone->{FH} = undef;
843 2 50       11 $clone->{FH} = File::Temp->new or croak "no tempfile: $^E";
844 2 50       1283 binmode($clone->{FH}, ":crlf") if $self->{crlf};
845              
846 2         14 seek $self->{FH}, 0, 0; # rewind to start of FILE (not start of DATA)
847 2         9 copyData($self->{FH}, $clone->{FH});
848 2         10 $self->rewind;
849 2         7 $clone->rewind;
850              
851 2         15 $self->_journal('KEY', $_, $modifs{$_}) foreach keys %modifs;
852 2         10 $self->_journal('ENDKEYS', @_);
853              
854 2         8 while (my $row = $clone->fetchrow) {
855 20         43 my $k = join($; , @{$row}{@_});
  20         82  
856 20 100       155 my $data = exists $modifs{$k} ? $modifs{$k} : $row;
857 20 100       67 $self->_printRow(@{$data}{$self->headers}) if $data;
  18         248  
858 20         157 delete $modifs{$k};
859             #TODO : optimization, exit loop and copyData if no more items in %modifs
860             }
861              
862             # add remaining values (new keys)
863 2         11 $self->_printRow(@{$_}{$self->headers}) foreach grep {$_} values %modifs;
  0         0  
  0         0  
864              
865 2         257 truncate $self->{FH}, tell $self->{FH};
866             }
867              
868              
869             sub _journal { # ($op, @args, \details)
870             # Internal function for recording an update operation in a journal.
871             # The journal can then be replayed through method L.
872 38     38   72 my $self = shift;
873 38 100       121 return if not $self->{journal}; # return if no active journaling
874              
875 19         429 my @t = localtime;
876 19         61 $t[5] += 1900;
877 19         136 $t[4] += 1;
878 19         108 my $t = sprintf "%04d-%02d-%02d %02d:%02d:%02d", @t[5,4,3,2,1,0];
879              
880 19         53 my @args = @_;
881 19         38 my $rows = [];
882 19         57 for (ref $args[-1]) { # last arg is an array of rows or a single row or none
883 19 100       218 /ARRAY/ and do {($rows, $args[-1]) = ($args[-1], scalar(@{$args[-1]}))};
  7         18  
  7         22  
884 19 100       64 /HASH/ and do {($rows, $args[-1]) = ([$args[-1]], 1)};
  2         8  
885             }
886              
887 19         58 $self->{journal}->_printRow($t, 'ROW', @{$_}{$self->headers}) foreach @$rows;
  100         1158  
888 19         58 $self->{journal}->_printRow($t, @args);
889             }
890              
891              
892             =item C<< playJournal(open1, open2, ...) >>
893              
894             Reads a sequence of update instructions from a journal file
895             and applies them to the current tabular file.
896             Arguments C will be passed to L
897             for opening the journal file ; in most cases, just give the filename.
898              
899             The journal file must contain a sequence of instructions
900             as encoded by the automatic journaling function of this module ;
901             to activate journaling, see the C parameter of the
902             L method.
903              
904             =cut
905              
906              
907             sub playJournal {
908 1     1 1 362 my $self = shift;
909 1 50       18 croak "cannot playJournal while journaling is on!" if $self->{journal};
910 1         3 my $J;
911 1 50       5 $self->_open($J, @_) or croak "open @_: $^E";
912              
913 1         3 my @rows = ();
914 1         3 my @splices = ();
915 1         2 my @writeKeys = ();
916              
917 1         7 local $/ = $self->{recordSep};
918              
919 1         80 while (my $line = <$J>) {
920 119         200 chomp $line;
921              
922 119         462 $line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
923 119         667 my ($t, $ins, @vals) = split $self->{rxFieldSep}, $line, -1;
924 119         979 s/$self->{fieldSepRepl}/$self->{fieldSep}/g foreach @vals;
925              
926 119         231 for ($ins) {
927 119 100       266 /^CLEAR/ and do {$self->clear; next };
  2         10  
  2         19  
928 117 100       300 /^ROW/ and do {push @rows, $self->{ht}->new(@vals); next};
  100         294  
  100         1898  
929 17 100       73 /^SPLICE/ and do {my $nRows = pop @vals;
  8         17  
930 8 50 100     39 carp "invalid number of data rows in journal at $line" if ($nRows||0) != @rows;
931 8 100       40 push @splices, @vals, $nRows ? [@rows] : undef;
932 8         19 @rows = ();
933 8         32 next };
934 9 100       24 /^ENDSPLICES/ and do {$self->splices(@splices);
  5         24  
935 5         487 @splices = ();
936 5         62 next};
937 4 100       12 /^KEY/ and do {my $nRows = pop @vals;
  3         7  
938 3 50 100     17 carp "invalid number of data rows in journal at $line" if ($nRows||0) > 1;
939 3 100       11 push @writeKeys, $vals[0], $nRows ? $rows[0] : undef;
940 3         7 @rows = ();
941 3         11 next };
942 1 50       5 /^ENDKEYS/ and do {$self->writeKeys({@writeKeys}, @vals);
  1         9  
943 1         396 @writeKeys = ();
944 1         10 next};
945             }
946             }
947             }
948              
949              
950              
951             =item C<< compileFilter(query [, implicitPlus]) >>
952              
953             Compiles a query into a filter (code reference) that can be passed to
954             L or L.
955              
956             The query can be
957              
958             =over
959              
960             =item *
961              
962             a regular expression compiled through C<< qr/.../ >>. The regex will be applied
963             to whole data lines, and therefore covers all fields at once.
964             This is the fastest way to filter lines, because it avoids systematic
965             splitting into data records.
966              
967             =item *
968              
969             a data structure resulting from a previous call to
970             C
971              
972             =item *
973              
974             a string of shape C<< K_E_Y : value >> (without any spaces before
975             or after ':'). This will be compiled into
976             a regex matching C in the first column.
977             The special spelling is meant to avoid collision with a real field
978             hypothetically named 'KEY'.
979              
980             =item *
981              
982             a string that will be analyzed through C, and
983             then compiled into a filter function. The query string can contain
984             boolean combinators, parenthesis, comparison operators, etc., as
985             documented in L. The optional second argument
986             I is passed to C ;
987             if true, an implicit '+' is added in front of every
988             query item (therefore the whole query is a big AND).
989              
990             Notice that in addition to usual comparison operators,
991             you can also use regular expressions
992             in queries like
993              
994             +field1=~'^[abc]+' +field2!~'foobar$'
995              
996             The query compiler needs to distinguish between word and non-word
997             characters ; therefore it is important to C in your
998             scripts (see L). The compiler tries to be clever about a
999             number of details :
1000              
1001             =over
1002              
1003             =item looking for complete words
1004              
1005             Words in queries become regular expressions enclosed by C<\b> (word
1006             boundaries) ; so a query for C will not match C.
1007              
1008             =item supports * for word completion
1009              
1010             A '*' in a word is compiled into regular expression C<\w*> ;
1011             so queries C or C<*bar> will both match C.
1012              
1013             =item case insensitive, accent-insensitive
1014              
1015             Iso-latin-1 accented characters are translated into character
1016             classes, so for example C becomes C.
1017             Furthermore, as shown in this example, the C flag is turned
1018             on (case-insensitive). Therefore this query will also match
1019             C.
1020              
1021             =item numbers and dates in operators
1022              
1023             When compiling a subquery like C<< fieldname >= 'value' >>, the compiler
1024             checks the value against C and C (as specified in the
1025             L method). Depending on these tests, the subquery is translated
1026             into a string comparison, a numerical comparison, or a date
1027             comparison (more precisely, C<< {date2str($a) cmp date2str($b)} >>).
1028              
1029              
1030             =item set of integers
1031              
1032             Operator C<#> means comparison with a set of integers; internally
1033             this is implemented with a bit vector. So query
1034             C will return records where
1035             field C contains one of the listed integers.
1036             The field name may be omitted if it is the first
1037             field (usually the key field).
1038              
1039             =item pre/postMatch
1040              
1041             Words matched by a query can be highlighted; see
1042             parameters C and C in the L method.
1043              
1044             =back
1045              
1046             =back
1047              
1048             =cut
1049              
1050              
1051             sub compileFilter {
1052 9     9 1 16 my $self = shift;
1053 9         17 my $query = shift;
1054 9         14 my $implicitPlus = shift;
1055              
1056 9 50       28 return $self->_cplRegex($query) if ref $query eq 'Regexp';
1057              
1058 9 50       24 unless (ref $query eq 'HASH') { # if HASH, query was already parsed
1059 9         53 $query = Search::QueryParser->new->parse($query, $implicitPlus);
1060             }
1061              
1062 9         1996 my $code = $self->_cplQ($query);
1063 1 50   1   9 eval 'sub {no warnings "numeric"; (' .$code. ') ? $_[0] : undef;}'
  1     1   3  
  1     1   102  
  1     1   7  
  1     1   2  
  1     1   269  
  1     1   8  
  1     1   3  
  1     1   95  
  1         12  
  1         4  
  1         153  
  1         10  
  1         3  
  1         151  
  1         10  
  1         2  
  1         150  
  1         9  
  1         2  
  1         99  
  1         9  
  1         2  
  1         175  
  1         12  
  1         2  
  1         126  
  9         1142  
1064             or croak $@;
1065             }
1066              
1067              
1068             sub _cplRegex {
1069 0     0   0 my $self = shift;
1070 0         0 my $regex = shift;
1071 0     0   0 return eval {sub {$_[0]->{line} =~ $regex}};
  0         0  
  0         0  
1072             }
1073              
1074              
1075             sub _cplQ {
1076 10     10   24 my $self = shift;
1077 10         16 my $q = shift;
1078              
1079 10         17 my $mandatory = join(" and ", map {$self->_cplSubQ($_)} @{$q->{'+'}});
  3         25  
  10         34  
1080 10         19 my $exclude = join(" or ", map {$self->_cplSubQ($_)} @{$q->{'-'}});
  3         6  
  10         27  
1081 10         19 my $optional = join(" or ", map {$self->_cplSubQ($_)} @{$q->{''}});
  7         43  
  10         24  
1082              
1083 10 50 66     42 croak "missing positive criteria in query" if not ($mandatory || $optional);
1084 10   66     33 my $r = "(" . ($mandatory || $optional) . ")";
1085 10 100       24 $r .= " and not ($exclude)" if $exclude;
1086 10         28 return $r;
1087             }
1088              
1089              
1090             sub _cplSubQ {
1091 13     13   23 my $self = shift;
1092 13         22 my $subQ = shift;
1093              
1094 13         33 for ($subQ->{op}) {
1095              
1096             # Either a list of subqueries...
1097             /^\(\)$/
1098 13 100       39 and do {# assert(ref $subQ->{value} eq 'HASH' and not $subQ->{field})
1099             # if DEBUG;
1100 1         5 return $self->_cplQ($subQ->{value}); };
1101              
1102             # ...or a comparison operator with a word or list of words.
1103             # In that case we need to do some preparation for the source of comparison.
1104              
1105             # assert(not ref $subQ->{value} or ref $subQ->{value} eq 'ARRAY') if DEBUG;
1106              
1107             # Data to compare : either ...
1108 12         22 my $src = qq{\$_[0]->{line}}; # ... by default, the whole line ;
1109 12 100       55 if ($subQ->{field}) { # ... or an individual field.
1110 7 100       35 if ($subQ->{field} eq 'K_E_Y') { # Special pseudo field (in first position) :
1111 2         5 $subQ->{op} = '~'; # cheat, replace ':' by a regex operation.
1112 2         10 $subQ->{value} = "^$subQ->{value}(?:\\Q$self->{fieldSep}\\E|\$)";
1113             }
1114             else {
1115             my $fieldNum = $self->ht->{$subQ->{field}} or
1116 5 50       11 croak "invalid field name $subQ->{field} in request";
1117 5         11 $src = qq{_getField(\$_[0], $fieldNum)};
1118             }
1119             }
1120              
1121             /^:$/
1122 12 100       40 and do {my $s = $subQ->{value};
  7         19  
1123              
1124             my $noHighlights = # no result highlighting if ...
1125             $s eq '*' # .. request matches anything
1126             || ! ($self->{preMatch} || $self->{postMatch})
1127             # .. or no highlight was requested
1128 7   33     54 || $subQ->{field}; # .. or request is on specific field
1129              
1130 7         18 $s =~ s[\*][\\w*]g; # replace star by \w* regex
1131 7         15 $s =~ s{[\[\]\(\)+?]}{\Q$&\E}g; # escape other regex chars
1132 7         18 $s =~ s[\s+][\\s+]g; # replace spaces by \s+ regex
1133              
1134              
1135 7         13 $s =~ s/ç/[çc]/g;
1136 7         17 $s =~ s/([áàâä])/[a$1]/ig;
1137 7         13 $s =~ s/([éèêë])/[e$1]/ig;
1138 7         51 $s =~ s/([íìîï])/[i$1]/ig;
1139 7         13 $s =~ s/([óòôö])/[o$1]/ig;
1140 7         15 $s =~ s/([úùûü])/[u$1]/ig;
1141 7         17 $s =~ s/([ýÿ])/[y$1]/ig;
1142              
1143 7 100       26 my $wdIni = ($s =~ /^\w/) ? '\b' : '';
1144 7 100       27 my $wdEnd = ($s =~ /\w$/) ? '\b' : '';
1145 7         14 my $lineIni = "";
1146 7 100 100     34 $lineIni = "(?{avoidMatchKey} and not $subQ->{field};
1147 7         18 $s = "$lineIni$wdIni$s$wdEnd";
1148              
1149 7 50       44 return $noHighlights ? "($src =~ m[$s]i)" :
1150             "($src =~ s[$s][$self->{preMatch}\$&$self->{postMatch}]ig)";
1151             };
1152              
1153              
1154              
1155             /^#$/ # compare source with a list of numbers
1156 5 50       17 and do {
1157 0         0 my $has_state = eval "use feature 'state'; 1"; # true from Perl 5.10
1158 0 0       0 my $decl = $has_state ? "use feature 'state'; state \$numvec"
1159             : "my \$numvec if 0"; # undocumented hack
1160              
1161             # build a block that at first call creates a bit vector; then at
1162             # each call, the data source is compared with the bit vector
1163 0         0 return qq{
1164             do {
1165             $decl;
1166             no warnings qw/uninitialized numeric/;
1167             \$numvec or do {
1168             my \$nums = q{$subQ->{value}};
1169             vec(\$numvec, \$_, 1) = 1 for (\$nums =~ /\\d+/g);
1170             };
1171             vec(\$numvec, int($src), 1);
1172             }
1173             };
1174             };
1175              
1176              
1177             # for all other ops, $subQ->{value} must be a scalar
1178             # assert(not ref $subQ->{value}) if DEBUG;
1179              
1180 5 50 33     79 (/^(!)~$/ or /^()=?~$/) and return "$1($src =~ m[$subQ->{value}])";
1181              
1182             # choose proper comparison according to datatype of $subQ->{value}
1183             my $cmp = ($subQ->{value} =~ $self->{rxDate}) ? "(\$self->{date2str}($src) cmp q{" . $self->{date2str}($subQ->{value}) . "})" :
1184 0 0       0 ($subQ->{value} =~ $self->{rxNum}) ? "($src <=> $subQ->{value})" :
    0          
1185             "($src cmp q{$subQ->{value}})";
1186              
1187 0 0       0 /^=?=$/ and return "$cmp == 0";
1188 0 0       0 /^(?:!=|<>)$/ and return "$cmp != 0";
1189 0 0       0 /^>$/ and return "$cmp > 0";
1190 0 0       0 /^>=$/ and return "$cmp >= 0";
1191 0 0       0 /^<$/ and return "$cmp < 0";
1192 0 0       0 /^<=$/ and return "$cmp <= 0";
1193 0         0 croak "unexpected op $_ ($subQ->{field} / $subQ->{value})";
1194             }
1195             }
1196              
1197             ############################################################
1198             # utility functions
1199             ############################################################
1200              
1201             sub urlEncode {
1202 6     6 0 14 my $s = shift;
1203 6         23 return join "", map {sprintf "%%%02X", ord($_)} split //, $s;
  6         83  
1204             }
1205              
1206             sub copyData { # copy from one filehandle to another
1207 8     8 0 45 my ($f1, $f2) = @_;
1208 8         14 my $buf;
1209 8         184 while (read $f1, $buf, BUFSIZE) {print $f2 $buf;}
  6         71  
1210             }
1211              
1212             =back
1213              
1214             =head1 AUTHOR
1215              
1216             Laurent Dami, Elaurent.dami AT etat ge chE
1217              
1218              
1219             =head1 COPYRIGHT AND LICENSE
1220              
1221             Copyright (C) 2005 by Laurent Dami.
1222              
1223             This library is free software; you can redistribute it and/or modify
1224             it under the same terms as Perl itself.
1225              
1226             =cut
1227              
1228             1;