File Coverage

blib/lib/File/Tabular.pm

Criterion	Covered	Total	%
statement	351	387	90.7
branch	131	204	64.2
condition	42	68	61.7
subroutine	44	48	91.6
pod	16	18	88.8
total	584	725	80.5

line	stmt	bran	cond	sub	pod	time	code
1							package File::Tabular;
2
3							# TODO : -optimize _getField (could probably dispense with
4							# "mkRecord", call $self->..)
5							# -BUG : preMatch/postMatch won't work on explicit field searches
6							# -optimize: postpone preMatch/postMatch until display time
7							# -escaping fieldSep : make it optional
8
9							# - synopsis : example of file cloning with select (e.g. year=2004)
10
11
12
13							our $VERSION = "0.72";
14
15	1			1		31526	use strict;
	1					3
	1					31
16	1			1		4	use warnings;
	1					2
	1					24
17	1			1		10	no warnings 'uninitialized';
	1					6
	1					31
18	1			1		843	use locale;
	1					226
	1					5
19	1			1		30	use Carp;
	1					3
	1					81
20							# use Carp::Assert; # dropped because not really needed and not in Perl core
21	1			1		4	use Fcntl ':flock';
	1					1
	1					103
22	1			1		785	use Hash::Type;
	1					1615
	1					33
23	1			1		912	use Search::QueryParser 0.92;
	1					1576
	1					32
24	1			1		1115	use File::Temp;
	1					26359
	1					153
25
26							=head1 NAME
27
28							File::Tabular - searching and editing flat tabular files
29
30							=head1 SYNOPSIS
31
32							use File::Tabular;
33							my $f = new File::Tabular($filename);
34
35							my $row = $f->fetchrow;
36							print $row->{field1}, $row->{field2};
37
38							$row = $f->fetchrow(where => 'someWord');
39							$row = $f->fetchrow(where => 'field1 > 4 AND field2 >= "01.01.2001"');
40							$row = $f->fetchrow(where => qr/some\s+(complex\s*)?(regex\|regular expression)/i);
41
42							$f->rewind;
43							my $rows = $f->fetchall(where => 'someField =~ ^[abc]+');
44							print $_->{someField} foreach @$rows;
45
46							$f->rewind;
47							$rows = $f->fetchall(where => '+field1:someWord -field2:otherWord',
48							orderBy => 'field3, field6:num, field5:-alpha');
49
50							$f->rewind;
51							my $hashRows = $f->fetchall(where => 'foo AND NOT bar',
52							key => 'someField');
53							print $hashRows->{someKey}{someOtherField};
54
55							# open for updates, and remember the updates in a journal file
56							$f = new File::Tabular("+<$filename", {journal => ">>$journalFile"});
57
58							# updates at specific positions (line numbers)
59							$f->splices(4 => 2, undef, # delete 2 lines from position 4
60							7 => 1, {f1 => $v1, f2 => $v2, ...}, # replace line 7
61							9 => 0, { ...}, # insert 1 new line at position 9
62							22 => 0, [{...}, {...}, ...] # insert several lines at pos. 22
63							...
64							-1 => 0, [{...}, {...}, ...] # append at the end
65							);
66
67							# shorthand to add new data at the end
68							$f->append({f1 => $v1, f2 => $v2, ...});
69							# same thing, but use the "Hash::Type" associated to the file
70							$f->append($f->ht->new($v1, $v2, ...));
71
72
73							$f->clear; # removes all data (but keeps the header line)
74
75							# updates at specific keys, corresponding to @keyFields
76							$f->writeKeys({key1 => {f1 => $v1, f2 => $v2, ...}, # add or update
77							key2 => undef, # remove
78							...
79							}, @keyFields);
80
81
82							# replay the updates on a backup file
83							my $bck = new File::Tabular("+<$backupFile");
84							$bck->playJournal($journalFile);
85
86							# get info from associated filehandle
87							printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
88							my $mtime = $f->mtime;
89							printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
90
91							=head1 DESCRIPTION
92
93							A I is a flat text file containing data organised
94							in rows (records) and columns (fields).
95
96							This module provides database-like functionalities for managing
97							tabular files : retrieving, searching, writing, autonumbering, journaling.
98							However, unlike other modules like L, it doesn't try
99							to make it look like a database : rather, the API was designed
100							specifically for work with tabular files.
101							Instead of SQL, search queries are specified in a web-like
102							fashion, with support for regular expressions and cross-field
103							searches. Queries are compiled internally into perl closures
104							before being applied to every data record, which makes it
105							quite fast.
106
107							Write operations take a list of modifications as argument;
108							then they apply the whole list atomically in a single rewrite
109							of the data file.
110
111							Here are some of the reasons why you might choose to
112							work with a tabular file rather than a regular database :
113
114							=over
115
116							=item *
117
118							no need to install a database system (not even buy one)!
119
120							=item *
121
122							easy portability and data exchange with external tools
123							(text editor, spreadsheet, etc.)
124
125							=item *
126
127							search queries immediately ready for a web application
128
129							=item *
130
131							good search performance, even with several thousand records
132
133							=back
134
135
136							On the other hand, tabular files will probably be inappropriate if you
137							need very large volumes of data, complex multi-table data models or
138							frequent write operations.
139
140
141
142							=head1 METHODS
143
144							=over
145
146							=item C<< new (open1, open2, ..., {opt1 => v1, opt2 => v2, ...}) >>
147
148							Creates a new tabular file object.
149							The list of arguments C is fed directly to
150							L for opening the associated file.
151							Can also be a reference to an already opened filehandle.
152
153							The final hash ref is a collection of optional parameters, taken
154							from the following list :
155
156							=over
157
158							=item fieldSep
159
160							field separator : any character except '%' ('\|' by default).
161							Escape sequences like C<\t> are admitted.
162
163							=item recordSep
164
165							record separator ('\n' by default).
166
167							=item fieldSepRepl
168
169							string to substitute if fieldSep is met in the data.
170							(by default, url encoding of B, i.e. '%7C' )
171
172							=item recordSepRepl
173
174							string to substitute if recordSep is met in the data
175							(by default, url encoding of B, i.e. '%0A' )
176
177
178							=item autoNumField
179
180							name of field for which autonumbering is turned on (none by default).
181							This is useful to generate keys : when you write a record, the
182							character '#' in that field will be replaced by a fresh number,
183							incremented automatically. This number will be 1 + the
184							largest number read I (it is your responsability to read all
185							records before the first write operation).
186
187							=item autoNum
188
189							initial value of the counter for autonumbering (1 by default).
190
191							=item autoNumChar
192
193							character that will be substituted by an autonumber when
194							writing records ('#' by default).
195
196
197							=item flockMode
198
199							mode for locking the file, see L. By default,
200							this will be LOCK_EX if B contains 'E' or
201							'+E', LOCK_SH otherwise.
202
203							=item flockAttempts
204
205							Number of attempts to lock the file,
206							at 1 second intervals, before returning an error.
207							Zero by default.
208							If nonzero, LOCK_NB is added to flockMode;
209							if zero, a single locking attempt will be made, blocking
210							until the lock is available.
211
212							=item headers
213
214							reference to an array of field names.
215							If not present, headers will be read from the first line of
216							the file.
217
218							=item printHeaders
219
220							if true, the B will be printed to the file.
221							If not specified, treated as 'true' if
222							B contains 'E'.
223
224							=item journal
225
226							name of journaling file, or reference to a list of arguments for
227							L. The journaling file will log all write operations.
228							If specified as a simple file name, it will be be opened in
229							'EE' mode.
230
231							A journal file can then be replayed through method L
232							(this is useful to recover after a crash, by playing the journal
233							on a backup copy of your data).
234
235							=item rxDate
236
237							Regular expression for matching a date.
238							Default value is C<< qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/ >>.
239							This will be used by L to perform appropriate comparisons.
240
241							=item date2str
242
243							Ref to a function for transforming dates into strings
244							suitable for sorting (i.e. year-month-day).
245							Default is :
246
247							sub {my ($d, $m, $y) = ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
248							$y += ($y > 50) ? 1900 : 2000 if defined($y) && $y < 100;
249							return sprintf "%04d%02d%02d", $y, $m, $d;}
250
251							=item rxNum
252
253							Regular expression for matching a number.
254							Default value is C<< qr/^[-+]?\d+(?:\.\d*)?$/ >>.
255							This will be used by L to perform appropriate comparisons.
256
257							=item preMatch/postMatch
258
259							Strings to insert before or after a match when filtering rows
260							(will only apply to search operator ':' on the whole line, i.e.
261							query C<< "foo OR bar" >> will highlight both "foo" and "bar",
262							but query C<< "~ 'foo' OR someField:bar" >>
263							will not highlight anything; furthermore, a match-all
264							request containing just '*' will not highlight anything either).
265
266							=item avoidMatchKey
267
268							If true, searches will avoid to match on the first field. So a request
269							like C<< $ft->fetchall(where => '123 OR 456') >> will not find
270							the record with key 123, unless the word '123' appears somewhere
271							in the other fields. This is useful when queries come from a Web
272							application, and we don't want users to match a purely technical
273							field.
274
275							This search behaviour will not apply to regex searches. So requests like
276							C<< $ft->fetchall(where => qr/\b(123\|456)\b/) >>
277							or
278							C<< $ft->fetchall(where => ' ~ 123 OR ~ 456') >>
279							will actually find the record with key 123.
280
281							=back
282
283							=cut
284
285							############################################################
286							# CONSTANTS
287							############################################################
288
289	1			1		8	use constant BUFSIZE => 1 << 21; # 2MB, used in copyData
	1					1
	1					209
290
291	0					0	use constant DEFAULT => {
292							fieldSep => '\|',
293							recordSep => "\n",
294							autoNumField => undef,
295							autoNumChar => '#',
296							autoNum => 1,
297							lockAttempts => 0,
298							rxNum => qr/^[-+]?\d+(?:\.\d*)?$/,
299							rxDate => qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/,
300							date2str => sub {my ($d, $m, $y) =
301							($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
302	0	0	0			0	$y += ($y > 50) ? 1900 : 2000
		0
303							if defined($y) && $y < 100;
304	0					0	return sprintf "%04d%02d%02d", $y, $m, $d;},
305	1					94	preMatch => '',
306							postMatch => '',
307							avoidMatchKey => undef
308	1			1		6	};
	1					2
309
310
311							use constant {
312	1					13	statType => Hash::Type->new(qw(dev ino mode nlink uid gid rdev size
313							atime mtime ctime blksize blocks)),
314							timeType => Hash::Type->new(qw(sec min hour mday mon year wday yday isdst))
315	1			1		5	};
	1					1
316
317
318							############################################################
319							# METHODS
320							############################################################
321
322							sub new {
323	3			3	1	2342	my $class = shift;
324	3	50				18	my $args = ref $_[-1] eq 'HASH' ? pop : {};
325
326							# create object with default values
327	3					7	my $self = bless {};
328	3					9	foreach my $option (qw(fieldSep recordSep autoNumField autoNumChar autoNum
329							rxDate rxNum date2str preMatch postMatch
330							avoidMatchKey)) {
331	33		100			172	$self->{$option} = $args->{$option} \|\| DEFAULT->{$option};
332							}
333
334							# eval to expand escape sequences, for example if fieldSep is given as '\t'
335	3					10	foreach my $option (qw(fieldSep recordSep)) {
336	6					392	$self->{$option} = eval qq{qq{$self->{$option}}};
337							}
338
339							# field and record separators
340	3	50				15	croak "can't use '%' as field separator" if $self->{fieldSep} =~ /%/;
341
342	3		33			22	$self->{recordSepRepl} = $args->{recordSepRepl} \|\|
343							urlEncode($self->{recordSep});
344	3		33			18	$self->{fieldSepRepl} = $args->{fieldSepRepl} \|\|
345							urlEncode($self->{fieldSep});
346	3					47	$self->{rxFieldSep} = qr/\Q$self->{fieldSep}\E/;
347
348
349							# open file and get lock
350	3	50				17	_open($self->{FH}, @_) or croak "open @_ : $! $^E";
351	3		50			16	my $flockAttempts = $args->{flockAttempts} \|\| 0;
352	3	100	66			36	my $flockMode = $args->{flockMode} \|\|
353							$_[0] =~ />\|\+
354	3	50				19	$flockMode \|= LOCK_NB if $flockAttempts > 0;
355	3					11	for (my $n = $flockAttempts; $n >= 1; $n--) {
356	0	0				0	last if flock $self->{FH}, $flockMode; # exit loop if flock succeeded
357	0	0				0	$n > 1 ? sleep(1) : croak "could not flock @_: $^E";
358							};
359
360							# setup journaling
361	3	100				9	if (exists $args->{journal}) {
362	1					3	my $j = {}; # create a fake object for _printRow
363	1					7	$j->{$_} = $self->{$_} foreach qw(fieldSep recordSep
364							fieldSepRepl recordSepRepl);
365	1	50				7	_open($j->{FH}, ref $args->{journal} eq 'ARRAY' ? @{$args->{journal}}
	0	50				0
366							: ">>$args->{journal}")
367							or croak "open journal $args->{journal} : $^E";
368	1					4	$self->{journal} = bless $j;
369							}
370
371							# field headers
372	3		100			16	my $h = $args->{headers} \|\| [split($self->{rxFieldSep}, $self->_getLine, -1)];
373	3					24	$self->{ht} = new Hash::Type(@$h);
374	3	50				113	$self->_printRow(@$h) if
		100
375							exists $args->{printHeaders} ? $args->{printHeaders} : ($_[0] =~ />/);
376
377							# ready for reading data lines
378	3					11	$self->{dataStart} = tell($self->{FH});
379	3					13	$. = 0; # setting line counter to zero for first dataline
380
381
382							# create a closure which takes a (already chomped) line and returns a record
383	3					5	my %tmp; # copy some attributes of $self in order to avoid a cyclic ref
384	3					34	$tmp{$_} = $self->{$_} foreach qw/rxFieldSep fieldSepRepl fieldSep ht/;
385							$self->{mkRecord} = sub {
386	365			365		1829	my @vals = split $tmp{rxFieldSep}, $_[0], -1;
387	365					2062	s/$tmp{fieldSepRepl}/$tmp{fieldSep}/g foreach @vals;
388	365					1290	return $tmp{ht}->new(@vals);
389	3					17	};
390
391	3					13	return $self;
392							}
393
394
395							sub _open { # stupid : because of 'open' strange prototyping,
396							# cannot pass an array directly
397	5	100		5		736	my $result = (ref $_[1] eq 'GLOB') ? $_[0] = $_[1] :
		50
		50
398							@_ > 3 ? open($_[0], $_[1], $_[2], @_[3..$#_]) :
399							@_ > 2 ? open($_[0], $_[1], $_[2]) :
400							open($_[0], $_[1]);
401	5	50				48	binmode($_[0], ":crlf") if $result; # portably open text file, see PerlIO
402	5					21	return $result;
403							}
404
405
406
407
408							sub _getLine {
409	700			700		2416	my $self = shift;
410	700					2185	local $/ = $self->{recordSep};
411	700					2424	my $line = readline $self->{FH};
412	700	100				1279	if (defined $line) {
413	688					746	chomp $line;
414	688					1499	$line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
415							}
416	700					2784	return $line;
417							}
418
419
420							sub _printRow { # Internal function to print a data row and automatically deal with
421							# autonumbering, if necessary.
422	335			335		996	my ($self, @vals) = @_;
423
424	335	100				5763	if ($self->{autoNumField}) { # autoNumbering
425	216					458	my $ix = $self->{ht}{$self->{autoNumField}} - 1;
426	216	100				1166	if ($vals[$ix] =~ s/$self->{autoNumChar}/$self->{autoNum}/) {
		100
427	8					16	$self->{autoNum} += 1;
428							}
429							elsif ($vals[$ix] =~ m/(\d+)/) {
430	206	100				687	$self->{autoNum} = $1 + 1 if $1 + 1 > $self->{autoNum};
431							}
432							}
433
434	335					1907	s/\Q$self->{fieldSep}\E/$self->{fieldSepRepl}/ foreach @vals;
435	335					883	my $line = join $self->{fieldSep}, @vals;
436	335					672	$line =~ s/\Q$self->{recordSep}\E/$self->{recordSepRepl}/g;
437	335					488	my $fh = $self->{FH};
438	335					2021	print $fh $line, $self->{recordSep};
439							}
440
441
442							=item C<< fetchrow(where => filter) >>
443
444							returns the next record matching the (optional) filter. If there is
445							no filter, just returns the next record.
446
447							The filter is either a code reference generated by L,
448							or a string which will be automatically fed as argument to
449							L; this string can contain just a word, a regular
450							expression, a complex boolean query involving field names and
451							operators, etc., as explained below.
452
453							=cut
454
455							# _getField($r, $fieldNumber)
456							# Internal method for lazy creation of a record from a line.
457							# Will be called only when a specific field is required.
458							# See creation of $r in method 'fetchrow' just below.
459
460	133		66	133		1205	sub _getField { tied(%{$_[0]->{record} \|\|= $_[0]->{mkRecord}($_[0]->{line})})->[$_[1]]; }
	133					2218
461
462
463							sub fetchrow {
464	317			317	1	2111	my $self = shift;
465	317					342	my $filter = undef;
466
467							# accept fetchrow(where=>filter) or fetchrow({where=>filter}) or fetchrow(filter)
468	317	50				824	my @args = ref $_[0] eq 'HASH' ? @{$_[0]} : @_;
	0					0
469	317	100				644	if (@args) {
470	293	100				676	shift @args if $args[0] =~ /^where$/i;
471	293	50				568	croak "fetchrow : invalid number of arguments" if @args != 1;
472	293					424	$filter = $args[0];
473	293	100	100			757	$filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
474							}
475
476	317					707	while (my $line = $self->_getLine) {
477
478							# create structure $r for _getField
479	687					2006	my $r = {line => $line, record => undef, mkRecord => $self->{mkRecord}};
480
481	687	100	100			10791	next if $filter and not $filter->($r);
482
483	305		66			1085	$r->{record} \|\|= $self->{mkRecord}($r->{line});
484
485	305	100				4355	if ($self->{autoNumField}) {
486	186					779	my ($n) = $r->{record}{$self->{autoNumField}} =~ m/(\d+)/;
487	186	50	33			2334	$self->{autoNum} = $n+1 if $n and $n+1 > $self->{autoNum};
488							}
489	305					1342	return $r->{record};
490							}
491	12					49	return undef;
492							}
493
494							=item C<< fetchall(where => filter, orderBy => cmp) >>
495
496							=item C<< fetchall(where => filter, key => keySpecif) >>
497
498							finds all next records matching the (optional) filter.
499							If there is no filter, finds all remaining records.
500
501							The filter is either a code reference generated by L,
502							or a string which will be automatically fed as argument to
503							L.
504
505							The return value depends on context and on arguments :
506
507							=over
508
509							=item *
510
511							if no B parameter is given, and we are in a scalar context, then
512							C returns a reference to an array of records.
513
514							The optional B parameter can be a field name, a ref to a list
515							of field names, a string like C<"field1: -alpha, field2:-num, ...">,
516							or, more generally, a user-provided comparison function;
517							see L for a fully detailed explanation.
518
519							Otherwise, the resulting array is in data source order.
520
521							=item *
522
523							if no B parameter is given, and we are in a list context, then
524							C returns a pair : the first item is a reference to an array
525							of records as explained above ; the second item is a reference to an
526							array of line numbers corresponding to those records (first data line
527							has number 0). These line numbers might be useful later
528							if you update the records through the L method.
529							No B is allowed if C is called in
530							list context.
531
532							=item *
533
534							if a B parameter is given,
535							then C returns a reference to a hash, whose
536							values are the retrieved records, and whose keys
537							are built according to the B argument.
538							This must be either a single field name (scalar), or
539							a a list of field names (ref to an array of scalars).
540							Values corresponding to those field names will form the
541							key for each entry of the hash;
542							if necessary, multiple values are joined together
543							through L<$;\|perlvar/$;>.
544							No B argument is allowed, because hashes have no ordering.
545
546							=back
547
548							=cut
549
550							sub fetchall {
551	8			8	1	1571	my $self = shift;
552	8	50				44	my %args = ref $_[0] eq 'HASH' ? %{$_[0]} : @_;
	0					0
553
554	8	50	33			39	croak "fetchall in list context : not allowed with 'orderBy' or 'key' arg"
			66
555							if wantarray and ($args{orderBy} or $args{key});
556
557	8	50	33			35	croak "fetchall : args 'orderBy' and 'key' not allowed together"
558							if $args{orderBy} and $args{key};
559
560	8	50				33	my @k = !$args{key} ? () : ref $args{key} ? @{$args{key}} : ($args{key});
	0	100				0
561
562	8					18	my $filter = $args{where};
563	8	100	66			35	$filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
564
565	8	100				26	if (@k) { # will return a hash of rows
566	2	50				8	croak "fetchall : 'orderBy' not allowed with 'key'" if $args{orderBy};
567	2	50				8	croak "fetchall in list context : not allowed with 'key'" if wantarray;
568	2					4	my $rows = {};
569	2					6	while (my $row = $self->fetchrow($filter)) {
570	19					28	$rows->{join($;, @{$row}{@k})} = $row;
	19					71
571							}
572	2					14	return $rows;
573							}
574							else { # will return an array of rows
575	6					17	my ($rows, $line_nos) = ([], []);
576	6					19	while (my $row = $self->fetchrow($filter)) {
577	260					432	push @$rows, $row;
578	260	100				945	push @$line_nos, $. - 1 if wantarray;
579							}
580
581	6	50				18	if ($args{orderBy}) {
582	0	0				0	croak "fetchall in list context : not allowed with 'orderBy'" if wantarray;
583	0					0	my $tmp = ref $args{orderBy};
584	0	0				0	my $cmpFunc = $tmp eq 'ARRAY' ? $self->{ht}->cmp(@{$args{orderBy}}) :
	0	0				0
585							$tmp eq 'CODE' ? $args{orderBy} :
586							$self->{ht}->cmp($args{orderBy});
587	0					0	$rows = [sort $cmpFunc @$rows];
588							}
589	6	100				48	return wantarray ? ($rows, $line_nos) : $rows;
590							}
591							}
592
593
594
595							=item C<< rewind >>
596
597							Rewinds the file to the first data line (after the headers)
598
599							=cut
600
601							sub rewind {
602	26			26	1	7985	my $self = shift;
603	26					277	seek $self->{FH}, $self->{dataStart}, 0;
604	26					67	$. = 0;
605							}
606
607
608
609							=item C<< ht >>
610
611							Returns the instance of L associated with
612							the file.
613
614							=cut
615
616	318			318	1	390	sub ht { my $self = shift; $self->{ht}; }
	318					964
617
618
619
620							=item C<< headers >>
621
622							returns the list of field names
623
624							=cut
625
626	314			314	1	432	sub headers { my $self = shift; $self->ht->names; }
	314					557
627
628							=item C<< stat >>
629
630							returns a hash ref corresponding to a call of
631							L on the associated filehandle.
632							Keys of the hash have names as documented in
633							L. Ex:
634
635							printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
636
637							=cut
638
639
640	5			5	1	30755	sub stat {my $self = shift; statType->new(stat($self->{FH}));}
	5					87
641
642
643							=item C<< atime >>, C<< mtime >>, C<< ctime >>
644
645							each of these methods returns a hash ref corresponding to a call of
646							L on the last access time, last modified
647							time, or last inode change time of the associated filehandle
648							(see L for explanations).
649							Keys of the hash have names as documented in
650							L. Ex:
651
652							my $mtime = $f->mtime;
653							printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
654
655							=cut
656
657	0			0	1	0	sub atime {my $self = shift; timeType->new(localtime(($self->stat->{atime})));}
	0					0
658	1			1	1	401	sub mtime {my $self = shift; timeType->new(localtime(($self->stat->{mtime})));}
	1					4
659	0			0	1	0	sub ctime {my $self = shift; timeType->new(localtime(($self->stat->{ctime})));}
	0					0
660
661							=item C<< splices >>
662
663							splices(pos1 => 2, undef, # delete 2 lines
664							pos2 => 1, row, # replace 1 line
665							pos3 => 0, [row1, row2 ...] # insert lines
666							...
667							-1 => 0, [row1, ... ] # append lines
668							);
669
670							# special case : autonum if pos== -1
671
672
673							Updates the data, in a spirit similar to
674							L (hence the name of the method). The whole file is
675							rewritten in an atomic operation, deleting, replacing or appending
676							data lines as specified by the "splice instructions". Returns the
677							number of "splice instructions" performed.
678
679							A splice instruction is a triple composed of :
680
681							=over
682
683							=item 1
684
685							a position (line number) that specifies
686							the place where modifications will occur.
687							Line numbers start at 0.
688							Position -1 means end of data.
689
690							=item 2
691
692							a number of lines to delete (might be zero).
693
694							=item 3
695
696							a ref to a hash or to a list of hashes containing new data to
697							insert (or C if there is no new data).
698
699							=back
700
701							If there are several splice instructions, their positions must be
702							sorted in increasing order (except of course position -1,
703							meaning "end of data", which must appear last).
704
705							Positions always refer to line numbers in the original file, before
706							any modifications. Therefore, it makes no sense to write
707
708							splices(10 => 5, undef,
709							12 => 0, $myRow)
710
711							because after deleting 5 rows at line 10, we cannot insert a new
712							row at line 12.
713
714							The whole collection of splice instructions
715							may also be passed as an array ref instead of a list.
716
717							If you intend to fetch rows again after a B, you
718							must L the file first.
719
720							=cut
721
722
723
724							sub splices {
725	10			10	1	629	my $self = shift;
726	10	100				33	my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
727	10					15	my $nArgs = @$args;
728	10	50				34	croak "splices : number of arguments must be multiple of 3" if $nArgs % 3;
729
730	10					11	my $TMP = undef; # handle for a tempfile
731
732	10					15	my $i;
733	10					28	for ($i=0; $i < $nArgs; $i+=3 ) {
734	16					54	my ($pos, $del, $lines) = @$args[$i, $i+1, $i+2];
735
736	16					42	$self->_journal('SPLICE', $pos, $del, $lines);
737
738	16	100	66			58	if ($pos == -1) { # we want to append new data at end of file
		100
739	10	100				214	$TMP ? # if we have a tempfile ...
740							copyData($TMP, $self->{FH}) # copy back all remaining data
741							: seek $self->{FH}, 0, 2; # otherwise goto end of file
742	10					27	$pos = $.; # sync positions (because of test 12 lines below)
743							}
744							elsif ( # we want to put data in the middle of file and ..
745							not $TMP and $self->stat->{size} > $self->{dataStart}) {
746	2	50				94	$TMP = new File::Temp or croak "no tempfile: $^E";
747	2					1168	binmode($TMP, ":crlf");
748
749	2					9	$self->rewind;
750	2					15	copyData($self->{FH}, $TMP);
751	2					5	$self->rewind;
752	2					87	seek $TMP, 0, 0;
753							}
754
755	16	50				68	croak "splices : cannot go back to line $pos" if $. > $pos;
756
757	16					61	local $/ = $self->{recordSep};
758
759	16					44	while ($. < $pos) { # sync with tempfile
760	10					34	my $line = <$TMP>;
761	10	50				20	croak "splices : no such line : $pos ($.)" unless defined $line;
762	10					13	my $fh = $self->{FH};
763	10					32	print $fh $line;
764							}
765
766	16					45	while ($del--) { # skip lines to delete from tempfile
767	6					13	my $line = <$TMP>;
768	6	50				19	croak "splices : no line to delete at pos $pos" unless defined $line;
769							}
770
771	16	50				43	$lines = [$lines] if ref $lines eq 'HASH'; # single line
772	16					57	$self->_printRow(@{$_}{$self->headers}) for @$lines;
	196					2698
773							}
774	10	100				23	copyData($TMP, $self->{FH}) if $TMP; # copy back all remaining data
775	10					451	truncate $self->{FH}, tell $self->{FH};
776	10					27	$self->_journal('ENDSPLICES');
777	10					42	return $i / 3;
778							}
779
780
781
782							=item C<< append(row1, row2, ...) >>
783
784							This appends new records at the end of data, i.e. it is
785							a shorthand for
786
787							splices(-1 => 0, [row1, row2, ...])
788
789							=cut
790
791
792							sub append {
793	4			4	1	659	my $self = shift;
794	4	100				16	my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
795	4					16	$self->splices([-1 => 0, $args]);
796							}
797
798
799							=item C<< clear >>
800
801							removes all data (but keeps the header line)
802
803							=cut
804
805							sub clear {
806	4			4	1	869	my $self = shift;
807	4					10	$self->rewind;
808	4					11	$self->_journal('CLEAR');
809	4					125	truncate $self->{FH}, $self->{dataStart};
810							}
811
812
813
814							=item C<< writeKeys({key1 => row1, key2 => ...}, @keyFields) >>
815
816							Rewrites the whole file, applying modifications as specified
817							in the hash ref passed as first argument. Keys in this hash
818							are compared to keys built from the original data,
819							according to C<@keyFields>. Therefore, C may replace
820							an existing row, if the key corresponding to C was found ;
821							otherwise, a new row is added. If C is C, the
822							corresponding row is deleted from the file.
823
824							C<@keyFields> must contain the name of one or several
825							fields that build up the primary key. For each data record, the
826							values corresponding to those fields are taken and
827							joined together through L<$;\|perlvar/$;>, and then compared to
828							C, C, etc.
829
830							If you intend to fetch rows again after a B, you
831							must L the file first.
832
833							=cut
834
835							sub writeKeys {
836	2			2	1	11	my $self = shift;
837	2					3	my $lstModifs = shift;
838	2					10	my %modifs = %$lstModifs;
839
840	2	50				10	croak 'writeKeys : missing @keyFields' if not @_;
841
842							# clone object associated with a temp file
843	2					32	my $clone = bless {%$self};
844	2					9	$clone->{journal} = undef;
845	2					5	$clone->{FH} = undef;
846	2	50				18	$clone->{FH} = new File::Temp or croak "no tempfile: $^E";
847	2					935	binmode($clone->{FH}, ":crlf");
848
849	2					47	seek $self->{FH}, 0, 0; # rewind to start of FILE (not start of DATA)
850	2					10	copyData($self->{FH}, $clone->{FH});
851	2					7	$self->rewind;
852	2					5	$clone->rewind;
853
854	2					13	$self->_journal('KEY', $_, $modifs{$_}) foreach keys %modifs;
855	2					8	$self->_journal('ENDKEYS', @_);
856
857	2					8	while (my $row = $clone->fetchrow) {
858	20					33	my $k = join($; , @{$row}{@_});
	20					86
859	20	100				162	my $data = exists $modifs{$k} ? $modifs{$k} : $row;
860	20	100				62	$self->_printRow(@{$data}{$self->headers}) if $data;
	18					264
861	20					121	delete $modifs{$k};
862							#TODO : optimization, exit loop and copyData if no more items in %modifs
863							}
864
865							# add remaining values (new keys)
866	2					8	$self->_printRow(@{$_}{$self->headers}) foreach grep {$_} values %modifs;
	0					0
	0					0
867
868	2					114	truncate $self->{FH}, tell $self->{FH};
869							}
870
871
872							sub _journal { # ($op, @args, \details)
873							# Internal function for recording an update operation in a journal.
874							# The journal can then be replayed through method L.
875	38			38		48	my $self = shift;
876	38	100				102	return if not $self->{journal}; # return if no active journaling
877
878	19					632	my @t = localtime;
879	19					41	$t[5] += 1900;
880	19					22	$t[4] += 1;
881	19					92	my $t = sprintf "%04d-%02d-%02d %02d:%02d:%02d", @t[5,4,3,2,1,0];
882
883	19					40	my @args = @_;
884	19					32	my $rows = [];
885	19					43	for (ref $args[-1]) { # last arg is an array of rows or a single row or none
886	19	100				53	/ARRAY/ and do {($rows, $args[-1]) = ($args[-1], scalar(@{$args[-1]}))};
	7					11
	7					16
887	19	100				61	/HASH/ and do {($rows, $args[-1]) = ([$args[-1]], 1)};
	2					28
888							}
889
890	19					59	$self->{journal}->_printRow($t, 'ROW', @{$_}{$self->headers}) foreach @$rows;
	100					1399
891	19					50	$self->{journal}->_printRow($t, @args);
892							}
893
894
895							=item C<< playJournal(open1, open2, ...) >>
896
897							Reads a sequence of update instructions from a journal file
898							and applies them to the current tabular file.
899							Arguments C will be passed to L
900							for opening the journal file ; in most cases, just give the filename.
901
902							The journal file must contain a sequence of instructions
903							as encoded by the automatic journaling function of this module ;
904							to activate journaling, see the C parameter of the
905							L method.
906
907							=cut
908
909
910							sub playJournal {
911	1			1	1	10	my $self = shift;
912	1	50				6	croak "cannot playJournal while journaling is on!" if $self->{journal};
913	1					1	my $J;
914	1	50				4	_open($J, @_) or croak "open @_: $^E";
915
916	1					3	my @rows = ();
917	1					5	my @splices = ();
918	1					3	my @writeKeys = ();
919
920	1					6	local $/ = $self->{recordSep};
921
922	1					46	while (my $line = <$J>) {
923	119					164	chomp $line;
924
925	119					270	$line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
926	119					683	my ($t, $ins, @vals) = split $self->{rxFieldSep}, $line, -1;
927	119					610	s/$self->{fieldSepRepl}/$self->{fieldSep}/g foreach @vals;
928
929	119					210	for ($ins) {
930	119	100				238	/^CLEAR/ and do {$self->clear; next };
	2					8
	2					11
931	117	100				351	/^ROW/ and do {push @rows, $self->{ht}->new(@vals); next};
	100					326
	100					1547
932	17	100				45	/^SPLICE/ and do {my $nRows = pop @vals;
	8					13
933	8	50	100			37	carp "invalid number of data rows in journal at $line"
934							if ($nRows\|\|0) != @rows;
935	8	100				38	push @splices, @vals, $nRows ? [@rows] : undef;
936	8					17	@rows = ();
937	8					41	next };
938	9	100				35	/^ENDSPLICES/ and do {$self->splices(@splices);
	5					19
939	5					378	@splices = ();
940	5					62	next};
941	4	100				14	/^KEY/ and do {my $nRows = pop @vals;
	3					5
942	3	50	100			18	carp "invalid number of data rows in journal at $line"
943							if ($nRows\|\|0) > 1;
944	3	100				10	push @writeKeys, $vals[0], $nRows ? $rows[0] : undef;
945	3					5	@rows = ();
946	3					14	next };
947	1	50				7	/^ENDKEYS/ and do {$self->writeKeys({@writeKeys}, @vals);
	1					8
948	1					357	@writeKeys = ();
949	1					10	next};
950							}
951							}
952							}
953
954
955
956							=item C<< compileFilter(query [, implicitPlus]) >>
957
958							Compiles a query into a filter (code reference) that can be passed to
959							L or L.
960
961							The query can be
962
963							=over
964
965							=item *
966
967							a regular expression compiled through C<< qr/.../ >>. The regex will be applied
968							to whole data lines, and therefore covers all fields at once.
969							This is the fastest way to filter lines, because it avoids systematic
970							splitting into data records.
971
972							=item *
973
974							a data structure resulting from a previous call to
975							C
976
977							=item *
978
979							a string of shape C<< K_E_Y : value >> (without any spaces before
980							or after ':'). This will be compiled into
981							a regex matching C in the first column.
982							The special spelling is meant to avoid collision with a real field
983							hypothetically named 'KEY'.
984
985							=item *
986
987							a string that will be analyzed through C, and
988							then compiled into a filter function. The query string can contain
989							boolean combinators, parenthesis, comparison operators, etc., as
990							documented in L. The optional second argument
991							I is passed to C ;
992							if true, an implicit '+' is added in front of every
993							query item (therefore the whole query is a big AND).
994
995							Notice that in addition to usual comparison operators,
996							you can also use regular expressions
997							in queries like
998
999							+field1=~'^[abc]+' +field2!~'foobar$'
1000
1001							The query compiler needs to distinguish between word and non-word
1002							characters ; therefore it is important to C in your
1003							scripts (see L). The compiler tries to be clever about a
1004							number of details :
1005
1006							=over
1007
1008							=item looking for complete words
1009
1010							Words in queries become regular expressions enclosed by C<\b> (word
1011							boundaries) ; so a query for C will not match C.
1012
1013							=item supports * for word completion
1014
1015							A '' in a word is compiled into regular expression C<\w> ;
1016							so queries C or C<*bar> will both match C.
1017
1018							=item case insensitive, accent-insensitive
1019
1020							Iso-latin-1 accented characters are translated into character
1021							classes, so for example C becomes C.
1022							Furthermore, as shown in this example, the C flag is turned
1023							on (case-insensitive). Therefore this query will also match
1024							C.
1025
1026							=item numbers and dates in operators
1027
1028							When compiling a subquery like C<< fieldname >= 'value' >>, the compiler
1029							checks the value against C and C (as specified in the
1030							L method). Depending on these tests, the subquery is translated
1031							into a string comparison, a numerical comparison, or a date
1032							comparison (more precisely, C<< {date2str($a) cmp date2str($b)} >>).
1033
1034
1035							=item set of integers
1036
1037							Operator C<#> means comparison with a set of integers; internally
1038							this is implemented with a bit vector. So query
1039							C will return records where
1040							field C contains one of the listed integers.
1041							The field name may be omitted if it is the first
1042							field (usually the key field).
1043
1044							=item pre/postMatch
1045
1046							Words matched by a query can be highlighted; see
1047							parameters C and C in the L method.
1048
1049							=back
1050
1051							=back
1052
1053							=cut
1054
1055
1056							sub compileFilter {
1057	8			8	1	13	my $self = shift;
1058	8					10	my $query = shift;
1059	8					10	my $implicitPlus = shift;
1060
1061	8	50				29	return $self->_cplRegex($query) if ref $query eq 'Regexp';
1062
1063	8	50				20	unless (ref $query eq 'HASH') { # if HASH, query was already parsed
1064	8					46	$query = Search::QueryParser->new->parse($query, $implicitPlus);
1065							}
1066
1067	8					1318	my $code = $self->_cplQ($query);
1068	1	50		1		8	eval 'sub {no warnings "numeric"; (' .$code. ') ? $_[0] : undef;}'
	1			1		1
	1			1		90
	1			1		8
	1			1		1
	1			1		187
	1			1		7
	1			1		1
	1					80
	1					6
	1					2
	1					80
	1					5
	1					2
	1					65
	1					5
	1					2
	1					68
	1					5
	1					2
	1					65
	1					12
	1					2
	1					88
	8					640
1069							or croak $@;
1070							}
1071
1072
1073							sub _cplRegex {
1074	0			0		0	my $self = shift;
1075	0					0	my $regex = shift;
1076	0			0		0	return eval {sub {$_[0]->{line} =~ $regex}};
	0					0
	0					0
1077							}
1078
1079
1080							sub _cplQ {
1081	9			9		16	my $self = shift;
1082	9					13	my $q = shift;
1083
1084	9					12	my $mandatory = join(" and ", map {$self->_cplSubQ($_)} @{$q->{'+'}});
	2					19
	9					29
1085	9					11	my $exclude = join(" or ", map {$self->_cplSubQ($_)} @{$q->{'-'}});
	3					7
	9					23
1086	9					13	my $optional = join(" or ", map {$self->_cplSubQ($_)} @{$q->{''}});
	7					22
	9					22
1087
1088	9	50	66			53	croak "missing positive criteria in query" if not ($mandatory \|\| $optional);
1089	9		66			40	my $r = "(" . ($mandatory \|\| $optional) . ")";
1090	9	100				22	$r .= " and not ($exclude)" if $exclude;
1091	9					22	return $r;
1092							}
1093
1094
1095							sub _cplSubQ {
1096	12			12		17	my $self = shift;
1097	12					12	my $subQ = shift;
1098
1099	12					38	for ($subQ->{op}) {
1100
1101							# Either a list of subqueries...
1102							/^$/
1103	12	100				29	and do {# assert(ref $subQ->{value} eq 'HASH' and not $subQ->{field})
1104							# if DEBUG;
1105	1					6	return $self->_cplQ($subQ->{value}); };
1106
1107							# ...or a comparison operator with a word or list of words.
1108							# In that case we need to do some preparation for the source of comparison.
1109
1110							# assert(not ref $subQ->{value} or ref $subQ->{value} eq 'ARRAY') if DEBUG;
1111
1112							# Data to compare : either ...
1113	11					15	my $src = qq{\$_[0]->{line}}; # ... by default, the whole line ;
1114	11	100				26	if ($subQ->{field}) { # ... or an individual field.
1115	6	100				14	if ($subQ->{field} eq 'K_E_Y') { # Special pseudo field (in first position) :
1116	2					3	$subQ->{op} = '~'; # cheat, replace ':' by a regex operation.
1117	2					7	$subQ->{value} = "^$subQ->{value}(?:\\Q$self->{fieldSep}\\E\|\$)";
1118							}
1119							else {
1120	4	50				6	my $fieldNum = $self->ht->{$subQ->{field}} or
1121							croak "invalid field name $subQ->{field} in request";
1122	4					11	$src = qq{_getField(\$_[0], $fieldNum)};
1123							}
1124							}
1125
1126							/^:$/
1127	11	100				45	and do {my $s = $subQ->{value};
	7					14
1128
1129	7		33			65	my $noHighlights = # no result highlighting if ...
1130							$s eq '*' # .. request matches anything
1131							\|\| ! ($self->{preMatch} \|\| $self->{postMatch})
1132							# .. or no highlight was requested
1133							\|\| $subQ->{field}; # .. or request is on specific field
1134
1135	7					14	$s =~ s[\][\\w]g; # replace star by \w* regex
1136	7					13	$s =~ s{[\[\]+?]}{\Q$&\E}g; # escape other regex chars
1137	7					19	$s =~ s[\s+][\\s+]g; # replace spaces by \s+ regex
1138
1139
1140	7					14	$s =~ s/�/[�c]/g;
1141	7					12	$s =~ s/([��])/[a$1]/ig;
1142	7					14	$s =~ s/([��])/[e$1]/ig;
1143	7					11	$s =~ s/([��])/[i$1]/ig;
1144	7					12	$s =~ s/([��])/[o$1]/ig;
1145	7					13	$s =~ s/([��])/[u$1]/ig;
1146	7					11	$s =~ s/([��])/[y$1]/ig;
1147
1148	7	100				28	my $wdIni = ($s =~ /^\w/) ? '\b' : '';
1149	7	100				25	my $wdEnd = ($s =~ /\w$/) ? '\b' : '';
1150	7					10	my $lineIni = "";
1151	7	100	100			35	$lineIni = "(?{avoidMatchKey} and not $subQ->{field};
1152	7					15	$s = "$lineIni$wdIni$s$wdEnd";
1153
1154	7	50				46	return $noHighlights ? "($src =~ m[$s]i)" :
1155							"($src =~ s[$s][$self->{preMatch}\$&$self->{postMatch}]ig)";
1156							};
1157
1158
1159
1160							/^#$/ # compare source with a list of numbers
1161	4	50				17	and do {
1162	0					0	my $has_state = eval "use feature 'state'; 1"; # true from Perl 5.10
1163	0	0				0	my $decl = $has_state ? "use feature 'state'; state \$numvec"
1164							: "my \$numvec if 0"; # undocumented hack
1165
1166							# build a block that at first call creates a bit vector; then at
1167							# each call, the data source is compared with the bit vector
1168	0					0	return qq{
1169							do {
1170							$decl;
1171							no warnings qw/uninitialized numeric/;
1172							\$numvec or do {
1173							my \$nums = q{$subQ->{value}};
1174							vec(\$numvec, \$_, 1) = 1 for (\$nums =~ /\\d+/g);
1175							};
1176							vec(\$numvec, int($src), 1);
1177							}
1178							};
1179							};
1180
1181
1182							# for all other ops, $subQ->{value} must be a scalar
1183							# assert(not ref $subQ->{value}) if DEBUG;
1184
1185	4	50	33			47	(/^(!)~$/ or /^()=?~$/) and return "$1($src =~ m[$subQ->{value}])";
1186
1187							# choose proper comparison according to datatype of $subQ->{value}
1188	0	0				0	my $cmp = ($subQ->{value} =~ $self->{rxDate}) ?
		0
1189							"(\$self->{date2str}($src) cmp q{" .
1190							$self->{date2str}($subQ->{value}) . "})" :
1191							($subQ->{value} =~ $self->{rxNum}) ?
1192							"($src <=> $subQ->{value})" :
1193							# otherwise
1194							"($src cmp q{$subQ->{value}})";
1195
1196	0	0				0	/^=?=$/ and return "$cmp == 0";
1197	0	0				0	/^(?:!=\|<>)$/ and return "$cmp != 0";
1198	0	0				0	/^>$/ and return "$cmp > 0";
1199	0	0				0	/^>=$/ and return "$cmp >= 0";
1200	0	0				0	/^<$/ and return "$cmp < 0";
1201	0	0				0	/^<=$/ and return "$cmp <= 0";
1202	0					0	croak "unexpected op $_ ($subQ->{field} / $subQ->{value})";
1203							}
1204							}
1205
1206							############################################################
1207							# utility functions
1208							############################################################
1209
1210							sub urlEncode {
1211	6			6	0	9	my $s = shift;
1212	6					18	return join "", map {sprintf "%%%02X", ord($_)} split //, $s;
	6					50
1213							}
1214
1215							sub copyData { # copy from one filehandle to another
1216	8			8	0	34	my ($f1, $f2) = @_;
1217	8					11	my $buf;
1218	8					101	while (read $f1, $buf, BUFSIZE) {print $f2 $buf;}
	6					94
1219							}
1220
1221							=back
1222
1223							=head1 AUTHOR
1224
1225							Laurent Dami, Elaurent.dami AT etat ge chE
1226
1227
1228							=head1 COPYRIGHT AND LICENSE
1229
1230							Copyright (C) 2005 by Laurent Dami.
1231
1232							This library is free software; you can redistribute it and/or modify
1233							it under the same terms as Perl itself.
1234
1235							=cut
1236
1237							1;