File Coverage

blib/lib/File/Tabular.pm

Criterion	Covered	Total	%
statement	358	394	90.8
branch	130	206	63.1
condition	42	71	59.1
subroutine	45	49	91.8
pod	16	18	88.8
total	591	738	80.0

line	stmt	bran	cond	sub	pod	time	code
1							package File::Tabular;
2
3							# TODO : -optimize _getField (could probably dispense with
4							# "mkRecord", call $self->..)
5							# -BUG : preMatch/postMatch won't work on explicit field searches
6							# -optimize: postpone preMatch/postMatch until display time
7							# -escaping fieldSep : make it optional
8
9	1			1		128719	use strict;
	1					2
	1					49
10	1			1		6	use warnings;
	1					2
	1					66
11	1			1		6	no warnings 'uninitialized';
	1					2
	1					53
12	1			1		709	use locale;
	1					1021
	1					9
13	1			1		44	use Carp;
	1					1
	1					71
14	1			1		4	use Fcntl ':flock';
	1					1
	1					135
15	1			1		573	use Hash::Type;
	1					1850
	1					46
16	1			1		712	use Search::QueryParser 0.92;
	1					2470
	1					2669
17	1			1		15	use File::Temp;
	1					3
	1					295
18
19							our $VERSION = "0.73";
20
21
22							=head1 NAME
23
24							File::Tabular - searching and editing flat tabular files
25
26							=head1 SYNOPSIS
27
28							use File::Tabular;
29							my $f = File::Tabular->new($filename);
30
31							my $row = $f->fetchrow;
32							print $row->{field1}, $row->{field2};
33
34							$row = $f->fetchrow(where => 'someWord');
35							$row = $f->fetchrow(where => 'field1 > 4 AND field2 >= "01.01.2001"');
36							$row = $f->fetchrow(where => qr/some\s+(complex\s*)?(regex\|regular expression)/i);
37
38							$f->rewind;
39							my $rows = $f->fetchall(where => 'someField =~ ^[abc]+');
40							print $_->{someField} foreach @$rows;
41
42							$f->rewind;
43							$rows = $f->fetchall(where => '+field1:someWord -field2:otherWord',
44							orderBy => 'field3, field6:num, field5:-alpha');
45
46							$f->rewind;
47							my $hashRows = $f->fetchall(where => 'foo AND NOT bar',
48							key => 'someField');
49							print $hashRows->{someKey}{someOtherField};
50
51							# open for updates, and remember the updates in a journal file
52							$f = File::Tabular->new("+<$filename", {journal => $journalFile});
53
54							# updates at specific positions (line numbers)
55							$f->splices(4 => 2, undef, # delete 2 lines from position 4
56							7 => 1, {f1 => $v1, f2 => $v2, ...}, # replace line 7
57							9 => 0, { ...}, # insert 1 new line at position 9
58							22 => 0, [{...}, {...}, ...] # insert several lines at pos. 22
59							...
60							-1 => 0, [{...}, {...}, ...] # append at the end
61							);
62
63							# shorthand to add new data at the end
64							$f->append({f1 => $v1, f2 => $v2, ...});
65							# same thing, but use the "Hash::Type" associated to the file
66							$f->append($f->ht->new($v1, $v2, ...));
67
68
69							$f->clear; # removes all data (but keeps the header line)
70
71							# updates at specific keys, corresponding to @keyFields
72							$f->writeKeys({key1 => {f1 => $v1, f2 => $v2, ...}, # add or update
73							key2 => undef, # remove
74							...
75							}, @keyFields);
76
77
78							# replay the updates on a backup file
79							my $bck = File::Tabular->new("+<$backupFile");
80							$bck->playJournal($journalFile);
81
82							# get info from associated filehandle
83							printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
84							my $mtime = $f->mtime;
85							printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
86
87							=head1 DESCRIPTION
88
89							A I is a flat text file containing data organised
90							in rows (records) and columns (fields).
91
92							This module provides database-like functionalities for managing
93							tabular files : retrieving, searching, writing, autonumbering, journaling.
94							However, unlike other modules like L, it doesn't try
95							to make it look like a database : rather, the API was designed
96							specifically for work with tabular files.
97							Instead of SQL, search queries are specified in a web-like
98							fashion, with support for regular expressions and cross-field
99							searches. Queries are compiled internally into perl closures
100							before being applied to every data record, which makes it
101							quite fast.
102
103							Write operations take a list of modifications as argument;
104							then they apply the whole list atomically in a single rewrite
105							of the data file.
106
107							Here are some of the reasons why you might choose to
108							work with a tabular file rather than a regular database :
109
110							=over
111
112							=item *
113
114							no need to install a database system (not even buy one)!
115
116							=item *
117
118							easy portability and data exchange with external tools
119							(text editor, spreadsheet, etc.)
120
121							=item *
122
123							search queries immediately ready for a web application
124
125							=item *
126
127							good search performance, even with several thousand records
128
129							=back
130
131
132							On the other hand, tabular files will probably be inappropriate if you
133							need very large volumes of data, complex multi-table data models or
134							frequent write operations.
135
136
137
138							=head1 METHODS
139
140							=over
141
142							=item C<< new (open1, open2, ..., {opt1 => v1, opt2 => v2, ...}) >>
143
144							Creates a new tabular file object.
145							The list of arguments C is fed directly to
146							L for opening the associated file.
147							A reference to an already opened filehandle can also be given instead.
148
149							The final hash ref is a collection of optional parameters, taken
150							from the following list :
151
152							=over
153
154							=item fieldSep
155
156							field separator : any character except '%' ('\|' by default).
157							Escape sequences like C<\t> are admitted.
158
159							=item recordSep
160
161							record separator ('\n' by default).
162
163							=item crlf
164
165							if true, filehandles use the L layer C<:crlf>.
166							This is the default, for unfortunate historical reasons.
167
168
169							=item fieldSepRepl
170
171							string to substitute if fieldSep is met in the data.
172							(by default, url encoding of B, i.e. '%7C' )
173
174							=item recordSepRepl
175
176							string to substitute if recordSep is met in the data
177							(by default, url encoding of B, i.e. '%0A' )
178
179
180							=item autoNumField
181
182							name of field for which autonumbering is turned on (none by default).
183							This is useful to generate keys : when you write a record, the
184							character '#' in that field will be replaced by a fresh number,
185							incremented automatically. This number will be 1 + the
186							largest number read I (it is your responsability to read all
187							records before the first write operation).
188
189							=item autoNum
190
191							initial value of the counter for autonumbering (1 by default).
192
193							=item autoNumChar
194
195							character that will be substituted by an autonumber when
196							writing records ('#' by default).
197
198
199							=item flockMode
200
201							mode for locking the file, see L. By default,
202							this will be LOCK_EX if B contains 'E' or
203							'+E', LOCK_SH otherwise.
204
205							=item flockAttempts
206
207							number of attempts to lock the file,
208							at 1 second intervals, before returning an error.
209							Zero by default.
210							If nonzero, LOCK_NB is added to flockMode;
211							if zero, a single locking attempt will be made, blocking
212							until the lock is available.
213
214							=item headers
215
216							reference to an array of field names.
217							If not present, headers will be read from the first line of
218							the file.
219
220							=item printHeaders
221
222							if true, the B will be printed to the file.
223							If not specified, treated as 'true' if
224							B contains 'E'.
225
226							=item journal
227
228							name of journaling file, or reference to a list of arguments for
229							L. The journaling file will log all write operations.
230							If specified as a simple file name, it will be be opened in
231							'EE' mode.
232
233							A journal file can then be replayed through method L
234							(this is useful to recover after a crash, by playing the journal
235							on a backup copy of your data).
236
237							=item rxDate
238
239							Regular expression for matching a date.
240							Default value is C<< qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/ >>.
241							This will be used by L to perform appropriate comparisons.
242
243							=item date2str
244
245							Ref to a function for transforming dates into strings
246							suitable for sorting (i.e. year-month-day).
247							Default is :
248
249							sub {my ($d, $m, $y) = ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
250							$y += ($y > 50) ? 1900 : 2000 if defined($y) && $y < 100;
251							return sprintf "%04d%02d%02d", $y, $m, $d;}
252
253							=item rxNum
254
255							Regular expression for matching a number.
256							Default value is C<< qr/^[-+]?\d+(?:\.\d*)?$/ >>.
257							This will be used by L to perform appropriate comparisons.
258
259							=item preMatch/postMatch
260
261							Strings to insert before or after a match when filtering rows
262							(will only apply to search operator ':' on the whole line, i.e.
263							query C<< "foo OR bar" >> will highlight both "foo" and "bar",
264							but query C<< "~ 'foo' OR someField:bar" >>
265							will not highlight anything; furthermore, a match-all
266							request containing just '*' will not highlight anything either).
267
268							=item avoidMatchKey
269
270							If true, searches will avoid to match on the first field. So a request
271							like C<< $ft->fetchall(where => '123 OR 456') >> will not find
272							the record with key 123, unless the word '123' appears somewhere
273							in the other fields. This is useful when queries come from a Web
274							application, and we don't want users to match a purely technical
275							field.
276
277							This search behaviour will not apply to regex searches. So requests like
278							C<< $ft->fetchall(where => qr/\b(123\|456)\b/) >>
279							or
280							C<< $ft->fetchall(where => ' ~ 123 OR ~ 456') >>
281							will actually find the record with key 123.
282
283							=back
284
285							=cut
286
287							############################################################
288							# CONSTANTS
289							############################################################
290
291	1			1		9	use constant BUFSIZE => 1 << 21; # 2MB, used in copyData
	1					3
	1					531
292
293							use constant DEFAULT => {
294							fieldSep => '\|',
295							recordSep => "\n",
296							crlf => 1,
297							autoNumField => undef,
298							autoNumChar => '#',
299							autoNum => 1,
300							lockAttempts => 0,
301							rxNum => qr/^[-+]?\d+(?:\.\d*)?$/,
302							rxDate => qr/^\d\d?\.\d\d?\.\d\d\d?\d?$/,
303	0					0	date2str => sub {my ($d, $m, $y) = ($_[0] =~ /(\d\d?)\.(\d\d?)\.(\d\d\d?\d?)$/);
304	0	0	0			0	$y += ($y > 50) ? 1900 : 2000 if defined($y) && $y < 100;
		0
305	0					0	return sprintf "%04d%02d%02d", $y, $m, $d;},
306	1					149	preMatch => '',
307							postMatch => '',
308							avoidMatchKey => undef,
309	1			1		11	};
	1					2
310
311
312							use constant {
313	1					9	statType => Hash::Type->new(qw(dev ino mode nlink uid gid rdev size
314							atime mtime ctime blksize blocks)),
315							timeType => Hash::Type->new(qw(sec min hour mday mon year wday yday isdst))
316	1			1		8	};
	1					2
317
318
319							############################################################
320							# METHODS
321							############################################################
322
323							sub new {
324	3			3	1	162389	my $class = shift;
325	3	50				18	my $args = ref $_[-1] eq 'HASH' ? pop : {};
326
327							# create object with default values
328	3					10	my $self = bless {};
329	3					6	foreach my $option (keys %{DEFAULT()}) {
	3					33
330	39		100			182	$self->{$option} = $args->{$option} \|\| DEFAULT->{$option};
331							}
332
333							# eval to expand escape sequences, for example if fieldSep is given as '\t'
334	3					11	foreach my $option (qw(fieldSep recordSep)) {
335	6					558	$self->{$option} = eval qq{qq{$self->{$option}}};
336							}
337
338							# field and record separators
339	3	50				20	croak "can't use '%' as field separator" if $self->{fieldSep} =~ /%/;
340
341	3		33			28	$self->{recordSepRepl} = $args->{recordSepRepl} \|\| urlEncode($self->{recordSep});
342	3		33			17	$self->{fieldSepRepl} = $args->{fieldSepRepl} \|\| urlEncode($self->{fieldSep});
343	3					47	$self->{rxFieldSep} = qr/\Q$self->{fieldSep}\E/;
344
345
346							# open file and get lock
347	3	50				19	$self->_open($self->{FH}, @_) or croak "open @_ : $! $^E";
348	3		50			18	my $flockAttempts = $args->{flockAttempts} \|\| 0;
349	3		33			30	my $flockMode = $args->{flockMode} \|\| ($_[0] =~ />\|\+
350	3	50				10	$flockMode \|= LOCK_NB if $flockAttempts > 0;
351	3					14	for (my $n = $flockAttempts; $n >= 1; $n--) {
352	0	0				0	last if flock $self->{FH}, $flockMode; # exit loop if flock succeeded
353	0	0				0	$n > 1 ? sleep(1) : croak "could not flock @_: $^E";
354							};
355
356							# setup journaling
357	3	100				9	if (exists $args->{journal}) {
358	1					4	my $j = {}; # create a fake object for _printRow
359	1					9	$j->{$_} = $self->{$_} foreach qw(fieldSep recordSep
360							fieldSepRepl recordSepRepl);
361	1	50				11	$self->_open($j->{FH}, ref $args->{journal} eq 'ARRAY' ? @{$args->{journal}}
	0	50				0
362							: ">>$args->{journal}")
363							or croak "open journal $args->{journal} : $^E";
364	1					4	$self->{journal} = bless $j;
365							}
366
367							# field headers
368	3		100			28	my $h = $args->{headers} \|\| [split($self->{rxFieldSep}, $self->_getLine, -1)];
369	3					28	$self->{ht} = Hash::Type->new(@$h);
370	3	50				193	my $must_print_headers = exists $args->{printHeaders} ? $args->{printHeaders} : ($_[0] =~ />/);
371	3	100				16	$self->_printRow(@$h) if $must_print_headers;
372
373							# ready for reading data lines
374	3					11	$self->{dataStart} = tell($self->{FH});
375	3					10	$. = 0; # setting line counter to zero for first dataline
376
377
378							# create a closure which takes a (already chomped) line and returns a record
379	3					6	my %tmp; # copy some attributes of $self in order to avoid a cyclic ref
380	3					23	$tmp{$_} = $self->{$_} foreach qw/rxFieldSep fieldSepRepl fieldSep ht/;
381							$self->{mkRecord} = sub {
382	432			432		2380	my @vals = split $tmp{rxFieldSep}, $_[0], -1;
383	432					2674	s/$tmp{fieldSepRepl}/$tmp{fieldSep}/g foreach @vals;
384	432					1395	return $tmp{ht}->new(@vals);
385	3					19	};
386
387	3					35	return $self;
388							}
389
390
391							sub _open { # CORE::open is prototyped, so it cannot take args from an array. This is a workaround
392	5			5		10	my $self = shift;
393	5	100				587	my $result = (ref $_[1] eq 'GLOB') ? $_[0] = $_[1] :
		50
		50
394							@_ > 3 ? open($_[0], $_[1], $_[2], @_[3..$#_]) :
395							@_ > 2 ? open($_[0], $_[1], $_[2]) :
396							open($_[0], $_[1]);
397	5	50	33			78	binmode($_[0], ":crlf") if $result && $self->{crlf};
398	5					23	return $result;
399							}
400
401
402							sub _getLine {
403	768			768		2475	my $self = shift;
404	768					2776	local $/ = $self->{recordSep};
405	768					3693	my $line = readline $self->{FH};
406	768	100				1645	if (defined $line) {
407	755					1318	chomp $line;
408	755					2639	$line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
409							}
410	768					2862	return $line;
411							}
412
413
414							sub _printRow { # Internal function to print a data row and automatically deal with
415							# autonumbering, if necessary.
416	335			335		1132	my ($self, @vals) = @_;
417
418							# autoNumbering
419	335	100				6047	if ($self->{autoNumField}) {
420	216					464	my $ix = $self->{ht}{$self->{autoNumField}} - 1;
421	216	100				1638	if ($vals[$ix] =~ s/$self->{autoNumChar}/$self->{autoNum}/) {
		100
422	8					19	$self->{autoNum} += 1;
423							}
424							elsif ($vals[$ix] =~ m/(\d+)/) {
425	206					657	my $next_autonum = $1 + 1;
426	206	100				522	$self->{autoNum} = $next_autonum if $next_autonum > $self->{autoNum};
427							}
428							}
429
430							# build row and print it
431	335					2867	s/\Q$self->{fieldSep}\E/$self->{fieldSepRepl}/ foreach @vals;
432	335					900	my $line = join $self->{fieldSep}, @vals;
433	335					1003	$line =~ s/\Q$self->{recordSep}\E/$self->{recordSepRepl}/g;
434	335					622	my $fh = $self->{FH};
435	335					2161	print $fh $line, $self->{recordSep};
436							}
437
438
439							=item C<< fetchrow(where => filter) >>
440
441							returns the next record matching the (optional) filter. If there is
442							no filter, just returns the next record.
443
444							The filter is either a code reference generated by L,
445							or a string which will be automatically fed as argument to
446							L; this string can contain just a word, a regular
447							expression, a complex boolean query involving field names and
448							operators, etc., as explained below.
449
450							=cut
451
452							# _getField($r, $fieldNumber)
453							# Internal method for lazy creation of a record from a line.
454							# Will be called only when a specific field is required.
455							# See creation of $r in method 'fetchrow' just below.
456
457	200		66	200		1234	sub _getField { tied(%{$_[0]->{record} \|\|= $_[0]->{mkRecord}($_[0]->{line})})->[$_[1]]; }
	200					1992
458
459
460							sub fetchrow {
461	385			385	1	2530	my $self = shift;
462	385					594	my $filter = undef;
463
464							# accept fetchrow(where=>filter) or fetchrow({where=>filter}) or fetchrow(filter)
465	385	50				1339	my @args = ref $_[0] eq 'HASH' ? @{$_[0]} : @_;
	0					0
466	385	100				762	if (@args) {
467	361	100				963	shift @args if $args[0] =~ /^where$/i;
468	361	50				790	croak "fetchrow : invalid number of arguments" if @args != 1;
469	361					547	$filter = $args[0];
470	361	100	100			1028	$filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
471							}
472
473	385					990	while (my $line = $self->_getLine) {
474
475							# create structure $r for _getField
476	754					2491	my $r = {line => $line, record => undef, mkRecord => $self->{mkRecord}};
477
478	754	100	100			13333	next if $filter and not $filter->($r);
479
480	372		66			3433	$r->{record} \|\|= $self->{mkRecord}($r->{line});
481
482	372	100				5972	if ($self->{autoNumField}) {
483	186					755	my ($n) = $r->{record}{$self->{autoNumField}} =~ m/(\d+)/;
484	186	50	33			2549	$self->{autoNum} = $n+1 if $n and $n+1 > $self->{autoNum};
485							}
486	372					1674	return $r->{record};
487							}
488	13					58	return undef;
489							}
490
491							=item C<< fetchall(where => filter, orderBy => cmp) >>
492
493							=item C<< fetchall(where => filter, key => keySpecif) >>
494
495							finds all next records matching the (optional) filter.
496							If there is no filter, finds all remaining records.
497
498							The filter is either a code reference generated by L,
499							or a string which will be automatically fed as argument to
500							L.
501
502							The return value depends on context and on arguments :
503
504							=over
505
506							=item *
507
508							if no B parameter is given, and we are in a scalar context, then
509							C returns a reference to an array of records.
510
511							The optional B parameter can be a field name, a ref to a list
512							of field names, a string like C<"field1: -alpha, field2:-num, ...">,
513							or, more generally, a user-provided comparison function;
514							see L for a fully detailed explanation.
515
516							Otherwise, the resulting array is in data source order.
517
518							=item *
519
520							if no B parameter is given, and we are in a list context, then
521							C returns a pair : the first item is a reference to an array
522							of records as explained above ; the second item is a reference to an
523							array of line numbers corresponding to those records (first data line
524							has number 0). These line numbers might be useful later
525							if you update the records through the L method.
526							No B is allowed if C is called in
527							list context.
528
529							=item *
530
531							if a B parameter is given,
532							then C returns a reference to a hash, whose
533							values are the retrieved records, and whose keys
534							are built according to the B argument.
535							This must be either a single field name (scalar), or
536							a a list of field names (ref to an array of scalars).
537							Values corresponding to those field names will form the
538							key for each entry of the hash;
539							if necessary, multiple values are joined together
540							through L<$;\|perlvar/$;>.
541							No B argument is allowed, because hashes have no ordering.
542
543							=back
544
545							=cut
546
547							sub fetchall {
548	9			9	1	2189	my $self = shift;
549	9	50				50	my %args = ref $_[0] eq 'HASH' ? %{$_[0]} : @_;
	0					0
550
551							croak "fetchall in list context : not allowed with 'orderBy' or 'key' arg"
552	9	50	33			43	if wantarray and ($args{orderBy} or $args{key});
			66
553
554							croak "fetchall : args 'orderBy' and 'key' not allowed together"
555	9	0	33			47	if $args{orderBy} and $args{key};
556
557	9	50				55	my @k = !$args{key} ? () : ref $args{key} ? @{$args{key}} : ($args{key});
	0	100				0
558
559	9					22	my $filter = $args{where};
560	9	100	66			50	$filter = $self->compileFilter($filter) if $filter and not ref $filter eq 'CODE';
561
562	9	100				31	if (@k) { # will return a hash of rows
563	2	50				9	croak "fetchall : 'orderBy' not allowed with 'key'" if $args{orderBy};
564	2	50				5	croak "fetchall in list context : not allowed with 'key'" if wantarray;
565	2					6	my $rows = {};
566	2					8	while (my $row = $self->fetchrow($filter)) {
567	19					43	$rows->{join($;, @{$row}{@k})} = $row;
	19					94
568							}
569	2					83	return $rows;
570							}
571							else { # will return an array of rows
572	7					19	my ($rows, $line_nos) = ([], []);
573	7					25	while (my $row = $self->fetchrow($filter)) {
574	327					630	push @$rows, $row;
575	327	100				1036	push @$line_nos, $. - 1 if wantarray;
576							}
577
578	7	50				27	if ($args{orderBy}) {
579	0	0				0	croak "fetchall in list context : not allowed with 'orderBy'" if wantarray;
580	0					0	my $tmp = ref $args{orderBy};
581	0					0	my $cmpFunc = $tmp eq 'ARRAY' ? $self->{ht}->cmp(@{$args{orderBy}}) :
582							$tmp eq 'CODE' ? $args{orderBy} :
583	0	0				0	$self->{ht}->cmp($args{orderBy});
		0
584	0					0	$rows = [sort $cmpFunc @$rows];
585							}
586	7	100				115	return wantarray ? ($rows, $line_nos) : $rows;
587							}
588							}
589
590
591
592							=item C<< rewind >>
593
594							Rewinds the file to the first data line (after the headers)
595
596							=cut
597
598							sub rewind {
599	27			27	1	12640	my $self = shift;
600	27					438	seek $self->{FH}, $self->{dataStart}, 0;
601	27					94	$. = 0;
602							}
603
604
605
606							=item C<< ht >>
607
608							Returns the instance of L associated with
609							the file.
610
611							=cut
612
613	319			319	1	501	sub ht { my $self = shift; $self->{ht}; }
	319					910
614
615
616
617							=item C<< headers >>
618
619							returns the list of field names
620
621							=cut
622
623	314			314	1	549	sub headers { my $self = shift; $self->ht->names; }
	314					629
624
625							=item C<< stat >>
626
627							returns a hash ref corresponding to a call of
628							L on the associated filehandle.
629							Keys of the hash have names as documented in
630							L. Ex:
631
632							printf "%d size, %d blocks", $f->stat->{size}, $f->stat->{blocks};
633
634							=cut
635
636
637	5			5	1	30415	sub stat {my $self = shift; statType->new(stat($self->{FH}));}
	5					112
638
639
640							=item C<< atime >>, C<< mtime >>, C<< ctime >>
641
642							each of these methods returns a hash ref corresponding to a call of
643							L on the last access time, last modified
644							time, or last inode change time of the associated filehandle
645							(see L for explanations).
646							Keys of the hash have names as documented in
647							L. Ex:
648
649							my $mtime = $f->mtime;
650							printf "time last modified : %02d:%02d:%02d", @{$mtime}{qw(hour min sec)};
651
652							=cut
653
654	0			0	1	0	sub atime {my $self = shift; timeType->new(localtime(($self->stat->{atime})));}
	0					0
655	1			1	1	457	sub mtime {my $self = shift; timeType->new(localtime(($self->stat->{mtime})));}
	1					12
656	0			0	1	0	sub ctime {my $self = shift; timeType->new(localtime(($self->stat->{ctime})));}
	0					0
657
658							=item C<< splices >>
659
660							splices(pos1 => 2, undef, # delete 2 lines
661							pos2 => 1, row, # replace 1 line
662							pos3 => 0, [row1, row2 ...] # insert lines
663							...
664							-1 => 0, [row1, ... ] # append lines
665							);
666
667							# special case : autonum if pos== -1
668
669
670							Updates the data, in a spirit similar to
671							L (hence the name of the method). The whole file is
672							rewritten in an atomic operation, deleting, replacing or appending
673							data lines as specified by the "splice instructions". Returns the
674							number of "splice instructions" performed.
675
676							A splice instruction is a triple composed of :
677
678							=over
679
680							=item 1
681
682							a position (line number) that specifies
683							the place where modifications will occur.
684							Line numbers start at 0.
685							Position -1 means end of data.
686
687							=item 2
688
689							a number of lines to delete (might be zero).
690
691							=item 3
692
693							a ref to a hash or to a list of hashes containing new data to
694							insert (or C if there is no new data).
695
696							=back
697
698							If there are several splice instructions, their positions must be
699							sorted in increasing order (except of course position -1,
700							meaning "end of data", which must appear last).
701
702							Positions always refer to line numbers in the original file, before
703							any modifications. Therefore, it makes no sense to write
704
705							splices(10 => 5, undef,
706							12 => 0, $myRow)
707
708							because after deleting 5 rows at line 10, we cannot insert a new
709							row at line 12.
710
711							The whole collection of splice instructions
712							may also be passed as an array ref instead of a list.
713
714							If you intend to fetch rows again after a B, you
715							must L the file first.
716
717							=cut
718
719
720
721							sub splices {
722	10			10	1	1172	my $self = shift;
723	10	100				35	my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
724	10					20	my $nArgs = @$args;
725	10	50				30	croak "splices : number of arguments must be multiple of 3" if $nArgs % 3;
726
727	10					19	my $TMP = undef; # handle for a tempfile
728
729	10					19	my $i;
730	10					29	for ($i=0; $i < $nArgs; $i+=3 ) {
731	16					55	my ($pos, $del, $lines) = @$args[$i, $i+1, $i+2];
732
733	16					58	$self->_journal('SPLICE', $pos, $del, $lines);
734
735	16	100	66			67	if ($pos == -1) { # we want to append new data at end of file
		100
736							$TMP ? # if we have a tempfile ...
737							copyData($TMP, $self->{FH}) # copy back all remaining data
738	10	100				254	: seek $self->{FH}, 0, 2; # otherwise goto end of file
739	10					39	$pos = $.; # sync positions (because of test 12 lines below)
740							}
741							elsif ( # we want to put data in the middle of file and ..
742							not $TMP and $self->stat->{size} > $self->{dataStart}) {
743	2	50				93	$TMP = File::Temp->new or croak "no tempfile: $^E";
744	2	50				1346	binmode($TMP, ":crlf") if $self->{crlf};
745
746	2					12	$self->rewind;
747	2					9	copyData($self->{FH}, $TMP);
748	2					9	$self->rewind;
749	2					155	seek $TMP, 0, 0;
750							}
751
752	16	50				83	croak "splices : cannot go back to line $pos" if $. > $pos;
753
754	16					85	local $/ = $self->{recordSep};
755
756	16					48	while ($. < $pos) { # sync with tempfile
757	10					44	my $line = <$TMP>;
758	10	50				26	croak "splices : no such line : $pos ($.)" unless defined $line;
759	10					19	my $fh = $self->{FH};
760	10					64	print $fh $line;
761							}
762
763	16					48	while ($del--) { # skip lines to delete from tempfile
764	6					79	my $line = <$TMP>;
765	6	50				20	croak "splices : no line to delete at pos $pos" unless defined $line;
766							}
767
768	16	50				66	$lines = [$lines] if ref $lines eq 'HASH'; # single line
769	16					63	$self->_printRow(@{$_}{$self->headers}) for @$lines;
	196					2259
770							}
771	10	100				30	copyData($TMP, $self->{FH}) if $TMP; # copy back all remaining data
772	10					913	truncate $self->{FH}, tell $self->{FH};
773	10					61	$self->_journal('ENDSPLICES');
774	10					47	return $i / 3;
775							}
776
777
778
779							=item C<< append(row1, row2, ...) >>
780
781							This appends new records at the end of data, i.e. it is
782							a shorthand for
783
784							splices(-1 => 0, [row1, row2, ...])
785
786							=cut
787
788
789							sub append {
790	4			4	1	1300	my $self = shift;
791	4	100				22	my $args = ref $_[0] eq 'ARRAY' ? $_[0] : \@_;
792	4					20	$self->splices([-1 => 0, $args]);
793							}
794
795
796							=item C<< clear >>
797
798							removes all data (but keeps the header line)
799
800							=cut
801
802							sub clear {
803	4			4	1	1247	my $self = shift;
804	4					17	$self->rewind;
805	4					15	$self->_journal('CLEAR');
806	4					494	truncate $self->{FH}, $self->{dataStart};
807							}
808
809
810
811							=item C<< writeKeys({key1 => row1, key2 => ...}, @keyFields) >>
812
813							Rewrites the whole file, applying modifications as specified
814							in the hash ref passed as first argument. Keys in this hash
815							are compared to keys built from the original data,
816							according to C<@keyFields>. Therefore, C may replace
817							an existing row, if the key corresponding to C was found ;
818							otherwise, a new row is added. If C is C, the
819							corresponding row is deleted from the file.
820
821							C<@keyFields> must contain the name of one or several
822							fields that build up the primary key. For each data record, the
823							values corresponding to those fields are taken and
824							joined together through L<$;\|perlvar/$;>, and then compared to
825							C, C, etc.
826
827							If you intend to fetch rows again after a B, you
828							must L the file first.
829
830							=cut
831
832							sub writeKeys {
833	2			2	1	14	my $self = shift;
834	2					4	my $lstModifs = shift;
835	2					10	my %modifs = %$lstModifs;
836
837	2	50				10	croak 'writeKeys : missing @keyFields' if not @_;
838
839							# clone object associated with a temp file
840	2					30	my $clone = bless {%$self};
841	2					8	$clone->{journal} = undef;
842	2					6	$clone->{FH} = undef;
843	2	50				11	$clone->{FH} = File::Temp->new or croak "no tempfile: $^E";
844	2	50				1283	binmode($clone->{FH}, ":crlf") if $self->{crlf};
845
846	2					14	seek $self->{FH}, 0, 0; # rewind to start of FILE (not start of DATA)
847	2					9	copyData($self->{FH}, $clone->{FH});
848	2					10	$self->rewind;
849	2					7	$clone->rewind;
850
851	2					15	$self->_journal('KEY', $_, $modifs{$_}) foreach keys %modifs;
852	2					10	$self->_journal('ENDKEYS', @_);
853
854	2					8	while (my $row = $clone->fetchrow) {
855	20					43	my $k = join($; , @{$row}{@_});
	20					82
856	20	100				155	my $data = exists $modifs{$k} ? $modifs{$k} : $row;
857	20	100				67	$self->_printRow(@{$data}{$self->headers}) if $data;
	18					248
858	20					157	delete $modifs{$k};
859							#TODO : optimization, exit loop and copyData if no more items in %modifs
860							}
861
862							# add remaining values (new keys)
863	2					11	$self->_printRow(@{$_}{$self->headers}) foreach grep {$_} values %modifs;
	0					0
	0					0
864
865	2					257	truncate $self->{FH}, tell $self->{FH};
866							}
867
868
869							sub _journal { # ($op, @args, \details)
870							# Internal function for recording an update operation in a journal.
871							# The journal can then be replayed through method L.
872	38			38		72	my $self = shift;
873	38	100				121	return if not $self->{journal}; # return if no active journaling
874
875	19					429	my @t = localtime;
876	19					61	$t[5] += 1900;
877	19					136	$t[4] += 1;
878	19					108	my $t = sprintf "%04d-%02d-%02d %02d:%02d:%02d", @t[5,4,3,2,1,0];
879
880	19					53	my @args = @_;
881	19					38	my $rows = [];
882	19					57	for (ref $args[-1]) { # last arg is an array of rows or a single row or none
883	19	100				218	/ARRAY/ and do {($rows, $args[-1]) = ($args[-1], scalar(@{$args[-1]}))};
	7					18
	7					22
884	19	100				64	/HASH/ and do {($rows, $args[-1]) = ([$args[-1]], 1)};
	2					8
885							}
886
887	19					58	$self->{journal}->_printRow($t, 'ROW', @{$_}{$self->headers}) foreach @$rows;
	100					1158
888	19					58	$self->{journal}->_printRow($t, @args);
889							}
890
891
892							=item C<< playJournal(open1, open2, ...) >>
893
894							Reads a sequence of update instructions from a journal file
895							and applies them to the current tabular file.
896							Arguments C will be passed to L
897							for opening the journal file ; in most cases, just give the filename.
898
899							The journal file must contain a sequence of instructions
900							as encoded by the automatic journaling function of this module ;
901							to activate journaling, see the C parameter of the
902							L method.
903
904							=cut
905
906
907							sub playJournal {
908	1			1	1	362	my $self = shift;
909	1	50				18	croak "cannot playJournal while journaling is on!" if $self->{journal};
910	1					3	my $J;
911	1	50				5	$self->_open($J, @_) or croak "open @_: $^E";
912
913	1					3	my @rows = ();
914	1					3	my @splices = ();
915	1					2	my @writeKeys = ();
916
917	1					7	local $/ = $self->{recordSep};
918
919	1					80	while (my $line = <$J>) {
920	119					200	chomp $line;
921
922	119					462	$line =~ s/$self->{recordSepRepl}/$self->{recordSep}/g;
923	119					667	my ($t, $ins, @vals) = split $self->{rxFieldSep}, $line, -1;
924	119					979	s/$self->{fieldSepRepl}/$self->{fieldSep}/g foreach @vals;
925
926	119					231	for ($ins) {
927	119	100				266	/^CLEAR/ and do {$self->clear; next };
	2					10
	2					19
928	117	100				300	/^ROW/ and do {push @rows, $self->{ht}->new(@vals); next};
	100					294
	100					1898
929	17	100				73	/^SPLICE/ and do {my $nRows = pop @vals;
	8					17
930	8	50	100			39	carp "invalid number of data rows in journal at $line" if ($nRows\|\|0) != @rows;
931	8	100				40	push @splices, @vals, $nRows ? [@rows] : undef;
932	8					19	@rows = ();
933	8					32	next };
934	9	100				24	/^ENDSPLICES/ and do {$self->splices(@splices);
	5					24
935	5					487	@splices = ();
936	5					62	next};
937	4	100				12	/^KEY/ and do {my $nRows = pop @vals;
	3					7
938	3	50	100			17	carp "invalid number of data rows in journal at $line" if ($nRows\|\|0) > 1;
939	3	100				11	push @writeKeys, $vals[0], $nRows ? $rows[0] : undef;
940	3					7	@rows = ();
941	3					11	next };
942	1	50				5	/^ENDKEYS/ and do {$self->writeKeys({@writeKeys}, @vals);
	1					9
943	1					396	@writeKeys = ();
944	1					10	next};
945							}
946							}
947							}
948
949
950
951							=item C<< compileFilter(query [, implicitPlus]) >>
952
953							Compiles a query into a filter (code reference) that can be passed to
954							L or L.
955
956							The query can be
957
958							=over
959
960							=item *
961
962							a regular expression compiled through C<< qr/.../ >>. The regex will be applied
963							to whole data lines, and therefore covers all fields at once.
964							This is the fastest way to filter lines, because it avoids systematic
965							splitting into data records.
966
967							=item *
968
969							a data structure resulting from a previous call to
970							C
971
972							=item *
973
974							a string of shape C<< K_E_Y : value >> (without any spaces before
975							or after ':'). This will be compiled into
976							a regex matching C in the first column.
977							The special spelling is meant to avoid collision with a real field
978							hypothetically named 'KEY'.
979
980							=item *
981
982							a string that will be analyzed through C, and
983							then compiled into a filter function. The query string can contain
984							boolean combinators, parenthesis, comparison operators, etc., as
985							documented in L. The optional second argument
986							I is passed to C ;
987							if true, an implicit '+' is added in front of every
988							query item (therefore the whole query is a big AND).
989
990							Notice that in addition to usual comparison operators,
991							you can also use regular expressions
992							in queries like
993
994							+field1=~'^[abc]+' +field2!~'foobar$'
995
996							The query compiler needs to distinguish between word and non-word
997							characters ; therefore it is important to C in your
998							scripts (see L). The compiler tries to be clever about a
999							number of details :
1000
1001							=over
1002
1003							=item looking for complete words
1004
1005							Words in queries become regular expressions enclosed by C<\b> (word
1006							boundaries) ; so a query for C will not match C.
1007
1008							=item supports * for word completion
1009
1010							A '' in a word is compiled into regular expression C<\w> ;
1011							so queries C or C<*bar> will both match C.
1012
1013							=item case insensitive, accent-insensitive
1014
1015							Iso-latin-1 accented characters are translated into character
1016							classes, so for example C becomes C.
1017							Furthermore, as shown in this example, the C flag is turned
1018							on (case-insensitive). Therefore this query will also match
1019							C.
1020
1021							=item numbers and dates in operators
1022
1023							When compiling a subquery like C<< fieldname >= 'value' >>, the compiler
1024							checks the value against C and C (as specified in the
1025							L method). Depending on these tests, the subquery is translated
1026							into a string comparison, a numerical comparison, or a date
1027							comparison (more precisely, C<< {date2str($a) cmp date2str($b)} >>).
1028
1029
1030							=item set of integers
1031
1032							Operator C<#> means comparison with a set of integers; internally
1033							this is implemented with a bit vector. So query
1034							C will return records where
1035							field C contains one of the listed integers.
1036							The field name may be omitted if it is the first
1037							field (usually the key field).
1038
1039							=item pre/postMatch
1040
1041							Words matched by a query can be highlighted; see
1042							parameters C and C in the L method.
1043
1044							=back
1045
1046							=back
1047
1048							=cut
1049
1050
1051							sub compileFilter {
1052	9			9	1	16	my $self = shift;
1053	9					17	my $query = shift;
1054	9					14	my $implicitPlus = shift;
1055
1056	9	50				28	return $self->_cplRegex($query) if ref $query eq 'Regexp';
1057
1058	9	50				24	unless (ref $query eq 'HASH') { # if HASH, query was already parsed
1059	9					53	$query = Search::QueryParser->new->parse($query, $implicitPlus);
1060							}
1061
1062	9					1996	my $code = $self->_cplQ($query);
1063	1	50		1		9	eval 'sub {no warnings "numeric"; (' .$code. ') ? $_[0] : undef;}'
	1			1		3
	1			1		102
	1			1		7
	1			1		2
	1			1		269
	1			1		8
	1			1		3
	1			1		95
	1					12
	1					4
	1					153
	1					10
	1					3
	1					151
	1					10
	1					2
	1					150
	1					9
	1					2
	1					99
	1					9
	1					2
	1					175
	1					12
	1					2
	1					126
	9					1142
1064							or croak $@;
1065							}
1066
1067
1068							sub _cplRegex {
1069	0			0		0	my $self = shift;
1070	0					0	my $regex = shift;
1071	0			0		0	return eval {sub {$_[0]->{line} =~ $regex}};
	0					0
	0					0
1072							}
1073
1074
1075							sub _cplQ {
1076	10			10		24	my $self = shift;
1077	10					16	my $q = shift;
1078
1079	10					17	my $mandatory = join(" and ", map {$self->_cplSubQ($_)} @{$q->{'+'}});
	3					25
	10					34
1080	10					19	my $exclude = join(" or ", map {$self->_cplSubQ($_)} @{$q->{'-'}});
	3					6
	10					27
1081	10					19	my $optional = join(" or ", map {$self->_cplSubQ($_)} @{$q->{''}});
	7					43
	10					24
1082
1083	10	50	66			42	croak "missing positive criteria in query" if not ($mandatory \|\| $optional);
1084	10		66			33	my $r = "(" . ($mandatory \|\| $optional) . ")";
1085	10	100				24	$r .= " and not ($exclude)" if $exclude;
1086	10					28	return $r;
1087							}
1088
1089
1090							sub _cplSubQ {
1091	13			13		23	my $self = shift;
1092	13					22	my $subQ = shift;
1093
1094	13					33	for ($subQ->{op}) {
1095
1096							# Either a list of subqueries...
1097							/^$/
1098	13	100				39	and do {# assert(ref $subQ->{value} eq 'HASH' and not $subQ->{field})
1099							# if DEBUG;
1100	1					5	return $self->_cplQ($subQ->{value}); };
1101
1102							# ...or a comparison operator with a word or list of words.
1103							# In that case we need to do some preparation for the source of comparison.
1104
1105							# assert(not ref $subQ->{value} or ref $subQ->{value} eq 'ARRAY') if DEBUG;
1106
1107							# Data to compare : either ...
1108	12					22	my $src = qq{\$_[0]->{line}}; # ... by default, the whole line ;
1109	12	100				55	if ($subQ->{field}) { # ... or an individual field.
1110	7	100				35	if ($subQ->{field} eq 'K_E_Y') { # Special pseudo field (in first position) :
1111	2					5	$subQ->{op} = '~'; # cheat, replace ':' by a regex operation.
1112	2					10	$subQ->{value} = "^$subQ->{value}(?:\\Q$self->{fieldSep}\\E\|\$)";
1113							}
1114							else {
1115							my $fieldNum = $self->ht->{$subQ->{field}} or
1116	5	50				11	croak "invalid field name $subQ->{field} in request";
1117	5					11	$src = qq{_getField(\$_[0], $fieldNum)};
1118							}
1119							}
1120
1121							/^:$/
1122	12	100				40	and do {my $s = $subQ->{value};
	7					19
1123
1124							my $noHighlights = # no result highlighting if ...
1125							$s eq '*' # .. request matches anything
1126							\|\| ! ($self->{preMatch} \|\| $self->{postMatch})
1127							# .. or no highlight was requested
1128	7		33			54	\|\| $subQ->{field}; # .. or request is on specific field
1129
1130	7					18	$s =~ s[\][\\w]g; # replace star by \w* regex
1131	7					15	$s =~ s{[\[\]+?]}{\Q$&\E}g; # escape other regex chars
1132	7					18	$s =~ s[\s+][\\s+]g; # replace spaces by \s+ regex
1133
1134
1135	7					13	$s =~ s/�/[�c]/g;
1136	7					17	$s =~ s/([��])/[a$1]/ig;
1137	7					13	$s =~ s/([��])/[e$1]/ig;
1138	7					51	$s =~ s/([��])/[i$1]/ig;
1139	7					13	$s =~ s/([��])/[o$1]/ig;
1140	7					15	$s =~ s/([��])/[u$1]/ig;
1141	7					17	$s =~ s/([��])/[y$1]/ig;
1142
1143	7	100				26	my $wdIni = ($s =~ /^\w/) ? '\b' : '';
1144	7	100				27	my $wdEnd = ($s =~ /\w$/) ? '\b' : '';
1145	7					14	my $lineIni = "";
1146	7	100	100			34	$lineIni = "(?{avoidMatchKey} and not $subQ->{field};
1147	7					18	$s = "$lineIni$wdIni$s$wdEnd";
1148
1149	7	50				44	return $noHighlights ? "($src =~ m[$s]i)" :
1150							"($src =~ s[$s][$self->{preMatch}\$&$self->{postMatch}]ig)";
1151							};
1152
1153
1154
1155							/^#$/ # compare source with a list of numbers
1156	5	50				17	and do {
1157	0					0	my $has_state = eval "use feature 'state'; 1"; # true from Perl 5.10
1158	0	0				0	my $decl = $has_state ? "use feature 'state'; state \$numvec"
1159							: "my \$numvec if 0"; # undocumented hack
1160
1161							# build a block that at first call creates a bit vector; then at
1162							# each call, the data source is compared with the bit vector
1163	0					0	return qq{
1164							do {
1165							$decl;
1166							no warnings qw/uninitialized numeric/;
1167							\$numvec or do {
1168							my \$nums = q{$subQ->{value}};
1169							vec(\$numvec, \$_, 1) = 1 for (\$nums =~ /\\d+/g);
1170							};
1171							vec(\$numvec, int($src), 1);
1172							}
1173							};
1174							};
1175
1176
1177							# for all other ops, $subQ->{value} must be a scalar
1178							# assert(not ref $subQ->{value}) if DEBUG;
1179
1180	5	50	33			79	(/^(!)~$/ or /^()=?~$/) and return "$1($src =~ m[$subQ->{value}])";
1181
1182							# choose proper comparison according to datatype of $subQ->{value}
1183							my $cmp = ($subQ->{value} =~ $self->{rxDate}) ? "(\$self->{date2str}($src) cmp q{" . $self->{date2str}($subQ->{value}) . "})" :
1184	0	0				0	($subQ->{value} =~ $self->{rxNum}) ? "($src <=> $subQ->{value})" :
		0
1185							"($src cmp q{$subQ->{value}})";
1186
1187	0	0				0	/^=?=$/ and return "$cmp == 0";
1188	0	0				0	/^(?:!=\|<>)$/ and return "$cmp != 0";
1189	0	0				0	/^>$/ and return "$cmp > 0";
1190	0	0				0	/^>=$/ and return "$cmp >= 0";
1191	0	0				0	/^<$/ and return "$cmp < 0";
1192	0	0				0	/^<=$/ and return "$cmp <= 0";
1193	0					0	croak "unexpected op $_ ($subQ->{field} / $subQ->{value})";
1194							}
1195							}
1196
1197							############################################################
1198							# utility functions
1199							############################################################
1200
1201							sub urlEncode {
1202	6			6	0	14	my $s = shift;
1203	6					23	return join "", map {sprintf "%%%02X", ord($_)} split //, $s;
	6					83
1204							}
1205
1206							sub copyData { # copy from one filehandle to another
1207	8			8	0	45	my ($f1, $f2) = @_;
1208	8					14	my $buf;
1209	8					184	while (read $f1, $buf, BUFSIZE) {print $f2 $buf;}
	6					71
1210							}
1211
1212							=back
1213
1214							=head1 AUTHOR
1215
1216							Laurent Dami, Elaurent.dami AT etat ge chE
1217
1218
1219							=head1 COPYRIGHT AND LICENSE
1220
1221							Copyright (C) 2005 by Laurent Dami.
1222
1223							This library is free software; you can redistribute it and/or modify
1224							it under the same terms as Perl itself.
1225
1226							=cut
1227
1228							1;