File Coverage

blib/lib/Text/FixedLengthMultiline.pm

Criterion	Covered	Total	%
statement	200	211	94.7
branch	97	114	85.0
condition	30	39	76.9
subroutine	20	21	95.2
pod	5	5	100.0
total	352	390	90.2

: Missing data: need to feed next line to fill remining columns.

line	stmt	bran	cond	sub	pod	time	code
1							package Text::FixedLengthMultiline;
2
3	4			4		143474	use utf8;
	4					42
	4					24
4	4			4		137	use strict;
	4					7
	4					124
5	4			4		22	use warnings;
	4					12
	4					113
6
7	4			4		21	use Carp;
	4					6
	4					356
8
9							BEGIN {
10	4			4		83	our $VERSION = '0.071';
11							}
12
13	4			4		21	use constant FIRST => 1;
	4					10
	4					344
14	4			4		29	use constant LAST => 2;
	4					10
	4					166
15	4			4		22	use constant ANY => 3; # FIRST \| LAST
	4					6
	4					15600
16
17							my %continue_styles = (
18							'first' => FIRST,
19							'last' => LAST,
20							'any' => ANY
21							);
22
23
24							=encoding utf8
25
26							=head1 NAME
27
28							Text::FixedLengthMultiline - Parse text data formatted in space separated columns optionnaly on multiple lines
29
30							=head1 SYNOPSIS
31
32							use Text::FixedLengthMultiline;
33
34							#234567890 12345678901234567890 12
35							my $text = <
36							Alice Pretty girl!
37							Bob Good old uncle Bob,
38							very old. 92
39							Charlie Best known as Waldo 14
40							or Wally. Where's
41							he?
42							EOT
43
44							my $fmt = Text::FixedLengthMultiline->new(format => ['!name' => 10, 1, 'comment~' => 20, 1, 'age' => -2 ]);
45
46							# Compute the RegExp that matches the first line
47							my $first_line_re = $fmt->get_first_line_re();
48							# Compute the RegExp that matches a continuation line
49							my $continue_line_re = $fmt->get_continue_line_re();
50
51							my @data;
52							my $err;
53							while ($text =~ /^([^\n]+)$/gm) {
54							my $line = $1;
55							push @data, {} if $line =~ $first_line_re;
56							if (($err = $fmt->parse_line($line, $data[$#data])) > 0) {
57							warn "Parse error at column $err";
58							}
59							}
60
61							=head1 DESCRIPTION
62
63							A row of data can be splitted on multiple lines of text with cell content
64							flowing in the same column space.
65
66							=head1 FORMAT SPECIFICATION
67
68							The format is given at the contruction time as an array ref. Modifying the
69							array content after the construction call is done at your own risks.
70
71							The array contains the ordered sequence of columns. Each colmun can either be:
72
73							=over
74
75							=item *
76
77							a positive integer representing the size of a separating column which is
78							expected to always be filled with spaces.
79
80							=item *
81
82							a string that matches this regexp: /^(?#mandatory)!?(?#name)[:alnum:]\w*(?:(?#multi)~(?#cont).?)?$/
83
84							=over
85
86							=item *
87
88							C means the column is mandatory
89
90							=item *
91
92							C is the column name. This will be the key for the hash after parsing.
93
94							=item *
95
96							C<~> means the column data can be on multiple lines.
97
98							=back
99
100							=back
101
102							=head1 METHODS
103
104							=head2 new()
105
106							Arguments:
107
108							=over
109
110							=item *
111
112							C: an array reference following the L.
113
114							=item *
115
116							C
117
118							=back
119
120							Example:
121
122							my $format = Text::FixedLengthMultiline->new(format => [ 2, col1 => 4, 1, '!col2' => 4 ]);
123
124							=cut
125
126							# TODO add 'continue-style': first/last/any
127							sub new
128							{
129	41			41	1	23045	my $class = shift;
130	41					133	my %params = @_;
131	41	100	66			479	(%params && exists $params{'format'}) or croak('['.__PACKAGE__."] Missing format");
132	40	100				488	ref $params{'format'} eq 'ARRAY' or croak('['.__PACKAGE__."] Invalid format: array ref expected");
133	37					56	my $continue_style = ANY;
134	37	100				108	if (exists $params{'continue_style'}) {
135	23					33	my $style = $params{'continue_style'};
136	23	100				575	croak('['.__PACKAGE__."] Invalid continue_style: first/last/any expected") unless exists $continue_styles{$style};
137	19					527	$continue_style = $continue_styles{$style};
138							}
139							# TODO Check the format, and report errors
140	33		33			200	my $self = {
141							FORMAT => $params{'format'},
142							# Maybe doing a copy would be a good idea...
143							# But we trust the user even if we all know
144							# he's a crazy programmer
145							DEBUG => exists $params{'debug'} && $params{'debug'},
146							CONTINUE_STYLE => $continue_style
147							};
148	33					88	bless $self, $class;
149	33					109	return $self;
150							}
151
152
153							=head2 C
154
155							Parse a table.
156
157							my @table = $fmt->parse_table($text);
158
159							Returns an array of hashes. Each hash is a row of data.
160
161							=cut
162
163							sub parse_table
164							{
165	1			1	1	10	my ($self, $text) = @_;
166	1					3	my $first_re = $self->get_first_line_re();
167	1					2	my @table;
168							my $err;
169	1					1	my $linenum = 1;
170	1					19	(pos $text) = 0;
171	1					8	while ($text =~ /^([^\n]+)$/gm) {
172	6					11	my $line = $1;
173	6	100				62	push @table, {} if $line =~ $first_re;
174	6	50				16	if (($err = $self->parse_line($line, $table[$#table])) > 0) {
175	0					0	croak "Parse error at line $linenum, column $err";
176							}
177							}
178	1					4	return @table;
179							}
180
181
182
183
184							=head2 C
185
186							Parse a line of text and add parsed data to the hash.
187
188							my $error = $fmt->parse_line($line, \%row_data);
189
190							Multiple calls to C with the same hashref may be needed to fully
191							read a "logical line" in case some columns are multiline.
192
193							Returns:
194
195							=over
196
197							=item *
198
199							C<-col>: Parse error. The value is a negative integer indicating the
200							character position in the line where the parse error occured.
201
202							=item *
203
204							C<0>: OK
205
206							=item *
207
208							C
209							The value is the character position of the column where data is expected.
210
211							=back
212
213							=cut
214
215							# TODO: return a RE in case of missing data
216							sub parse_line
217							{
218	40			40	1	16571	my ($self, $line, $data) = @_;
219	40					56	my @fmt = @{$self->{FORMAT}};
	40					116
220	40					52	my $col = 1;
221	40					45	my $ret = 0;
222	40	100				99	$line = '' unless defined $line;
223	40					92	while ($#fmt >= 0) {
224	86					114	my $f = shift @fmt;
225	86					88	my $data_len;
226	86	100				390	if ($f =~ /^\d+$/) {
		50
227							# Spaces to skip
228	36	50				69	next if $f == 0;
229	36					140	$line =~ /^( {0,$f})/;
230	36					54	$data_len = length $1;
231	36	100				127	return -($col+$data_len) if $data_len < $f;
232							} elsif ($f =~ /^(!?)([A-Za-z_]\w*)(?:(~)(.?))?$/) {
233	50					141	my ($mandatory, $field, $multi, $cont) = ($1, $2, $3, $4);
234	50	100				110	$multi = 0 unless defined $multi;
235	50	50	66			137	$cont = ' ' unless defined $cont && $cont ne '';
236	50					58	my $len = shift @fmt;
237	50	50				90	next if $len == 0;
238	50					91	my $d = substr($line, 0, abs $len);
239	50					49	$data_len = length $d;
240	50	100				79	if ($len > 0) {
241	38					128	$d =~ s/ +$//;
242							} else {
243	12					27	$d .= ' ' x -($data_len+$len);
244	12					27	$d =~ s/^ +//;
245							}
246	50	100				104	if ($d ne '') {
247	37	100	100			157	return -$col if !$multi && exists $data->{$field};
248	34	100	100			103	if ($multi && exists $data->{$field}) {
249							# Multilines => concat
250	6					13	$data->{$field} .= "\n" . $d;
251	6	50	33			43	$ret = $col if $ret == 0 && $d =~ /\Q$cont\E$/;
252							} else {
253	28					57	$data->{$field} = $d;
254							}
255							}
256	47	100	100			163	$ret = $col if $mandatory && !exists $data->{$field} && $ret == 0;
			66
257							} else {
258	0					0	warn "Bad format!\n";
259	0					0	return -$col;
260							}
261	69					78	$col += $data_len;
262	69					122	$line = substr($line, $data_len);
263	69	100	66			229	last if $ret != 0 && $line eq '';
264							}
265	23	100				95	return -$col unless $line =~ /^ *$/;
266	20					107	return $ret;
267							}
268
269
270
271
272							sub _dump_line_re()
273							{
274	0			0		0	while ($#_ >= 0) {
275	0					0	print "> [" . (shift @_) ."]\n";
276	0					0	print ' [' . join('] :: [', @{ (shift @_) }) . "]\n";
	0					0
277							}
278							}
279
280							sub _serialize_line_re()
281							{
282							#&_dump_line_re(@_);
283	76			76		117	my $re = '';
284	76					186	while ($#_ > -1) {
285							# Pop the alternatives for the end of the line
286	69					77	my @b = grep(!/^$/, @{ (pop @_) });
	69					250
287							# TODO remove duplicates
288	69	100				162	push @b, $re if $re ne '';
289	69	100				155	if ($#b > 0) {
		100
290	23					73	$re = "(?:" . join('\|', @b) . ")";
291							} elsif ($#b > -1) {
292	42					63	$re = $b[0];
293							} else {
294	4					6	$re = '';
295							}
296							# Pop
297	69					256	$re = (pop @_) . $re;
298							#print "$re\n";
299							}
300	76					236	return $re;
301							}
302
303							sub _parse_column_format($;$)
304							{
305	104			104		148	my ($format, $width) = @_;
306	104	50				449	if ($format =~ /^(!?)([A-Za-z_]\w*)(?:(~)(.?))?$/) {
307	104					683	my %def = (
308							mandatory => $1,
309							name => $2,
310							multi => $3,
311							cont => $4,
312							width => abs $width
313							);
314	104	100				273	$def{multi} = '' unless defined $def{multi};
315	104	100				239	$def{align} = $width > 0 ? 'L' : 'R';
316	104					751	return %def;
317							} else {
318	0					0	return undef;
319							}
320							}
321
322							sub _build_repetition_re($;$;$)
323							{
324	101			101		136	my ($c, $min, $max) = @_;
325	101	50				193	return '' if $max <= 0;
326	101	50				156	if ($max == 1) {
327	0	0				0	$c .= '?' if $min <= 0;
328							} else {
329	101	100				162	if ($min < $max) {
330	45					91	$c .= "{$min,$max}";
331							} else {
332	56					222	$c .= "{$max}";
333							}
334							}
335	101					202	return $c;
336							}
337
338							sub _build_column_re
339							{
340	78			78		95	my $self = shift;
341	78					391	my %def = @_;
342	78		100			407	my $branch_multi = $def{multi} && exists $def{branch_multi} && $def{branch_multi};
343	78	50				184	my $re_label = $self->{DEBUG} ? "(?#_$def{mandatory}$def{name}$def{multi}_)" : '';
344	78	100				311	my $re_spaces = $def{spaces} > 0 ? ' '.($def{spaces} > 1 ? "{$def{spaces}}":'') : '';
		100
345	78					94	my $width = $def{width};
346	78					85	my ($re_col_mand, $re_col_end, $re_col);
347	78	100	100			227	if ($def{mandatory} \|\| $branch_multi) {
348	56					67	$re_col_mand = $re_spaces . $re_label;
349	56	100				129	if ($def{align} eq 'L') { # Left aligned
350	45					100	$re_col_end = &_build_repetition_re('.', 0, $width-1);
351	45	100				89	unless ($branch_multi) {
352	40					164	$re_col_mand .= '\S';
353	40					89	$re_col = &_build_repetition_re('.', $width-1, $width-1);
354							} else {
355	5					11	$re_col = &_build_repetition_re('.', $width, $width);
356	5					10	$re_col_end = '\S' . $re_col_end;
357							}
358							} else {
359	11					28	$re_col_mand .= &_build_repetition_re('.', $width-1, $width-1);
360	11	50				25	unless ($branch_multi) {
361	11					16	$re_col_end = $re_col = '';
362	11					16	$re_col_mand .= '\S';
363							} else {
364	0					0	$re_col_end = '\S';
365	0					0	$re_col = '.';
366							}
367							}
368							} else {
369	22					30	$re_col_mand = '';
370	22	50				75	$re_col_end = $re_spaces . $re_label . '.' . ($width > 1 ? "{0,$width}" : '?');
371	22	50				96	$re_col_end = "(?:$re_col_end)?" if $def{spaces};
372	22	50				57	$re_col = $re_spaces . $re_label . '.' . ($width > 1 ? "{$width}" : '' );
373							}
374							#print "$def{name} => /$re_col_mand/ /$re_col_end/ /$re_col/ (spaces = $def{spaces})\n";
375	78					411	return ($re_col_mand, $re_col_end, $re_col);
376							}
377
378							sub _has_multi(@)
379							{
380	17			17		35	foreach (@_) {
381	43	100				129	return 1 if /!?[_[:alpha:]]\w+~/;
382							}
383	12					38	return 0;
384							}
385
386
387							# @_ is the format
388							# TODO handle the case where all columns are optionnal
389							# The RE is then the union of the cases where one of the colmuns, up to the first multi, is mandatory
390							sub _build_first_line_re
391							{
392	37			37		46	my $self = shift;
393	37					48	my $branch_multi = shift;
394	37					40	my $spaces = 0;
395	37					612	my @re = ();
396	37					49	my $re_acc = ''; # Accumulator
397	37					101	my $multi = '~'; # Force the initialisation of @re
398	37					99	while ($#_ >= 0) {
399	121					155	my $f = shift;
400	121	100				437	if ($f =~ /^\d+$/) {
401	60					152	$spaces += $f;
402							} else {
403	61					114	my %def = &_parse_column_format($f, shift);
404	61	100	100			279	if ($multi && ($branch_multi \|\| $#re == -1)) {
			66
405							# The previous column was a multi. The following fields may not be
406							# on this line but on one of the next ones.
407							# So the end of the line is optionnal.
408							# We are starting a new altenative in the RE.
409	42					80	push @re, $re_acc, [ ];
410	42					70	$re_acc = '';
411							}
412	61					223	my ($re_col_mand, $re_col_end, $re_col) = $self->_build_column_re(%def, spaces => $spaces);
413	61	100				156	if ($def{mandatory}) {
414							# Flush optional columns and append this column
415	40					91	$re[$#re-1] .= $re_acc . $re_col_mand;
416	40	100				83	if ($re_col_end eq '') {
417	8					16	$re[$#re] = [ ];
418							} else {
419	32					72	$re[$#re] = [ $re_col_end ];
420							}
421	40					64	$re_acc = $re_col;
422							} else {
423							# Save column format for later
424	21					28	push @{$re[$#re]}, $re_acc . $re_col_mand . $re_col_end;
	21					57
425	21					38	$re_acc .= $re_col_mand . $re_col;
426							}
427	61					67	$spaces = 0;
428	61					270	$multi = $def{multi};
429							}
430							}
431	37					133	return @re;
432							}
433
434							sub _build_continue_line_re
435							{
436	39			39		51	my $self = shift;
437	39					46	my $spaces = 0;
438	39					45	my $multi = '~'; # Force the initialisation of @re
439	39					102	while ($#_ >= 0) {
440	85					120	my $f = shift;
441	85	100				326	if ($f =~ /^\d+$/) {
442	42					105	$spaces += $f;
443							} else {
444	43					84	my %def = &_parse_column_format($f, shift);
445	43	100				131	unless ($def{multi}) {
446	26					35	$spaces += $def{width};
447	26					109	next;
448							}
449	17					23	my @re;
450	17					20	my ($re_col_end, $re_col);
451	17					63	($re[0], $re_col_end, $re_col) = $self->_build_column_re(%def, spaces => $spaces, branch_multi => &_has_multi(@_));
452	17					86	push @re, [ $re_col_end ];
453	17					21	my @re_end;
454	17	100				100	push @re_end, &_serialize_line_re($self->_build_continue_line_re(@_)) if $self->{CONTINUE_STYLE} & FIRST;
455	17	100				69	push @re_end, &_serialize_line_re($self->_build_first_line_re(1, @_)) if $self->{CONTINUE_STYLE} & LAST;
456	17					79	@re_end = grep !/^$/, @re_end;
457							#pop @re_end if $#re_end == 1 && $re_end[1] eq $re_end[0];
458	17	100				50	push @re, $re_col, [ @re_end ] if (@re_end);
459	17					97	return @re;
460							}
461							}
462	22					56	return ();
463							}
464
465							=head2 C
466
467							Returns a regular expression that matches the first line of a "logical line"
468							of data.
469
470							my $re = $fmt->get_first_line_re();
471
472							=cut
473
474							sub get_first_line_re
475							{
476	27			27	1	291	my $self = shift;
477	27	100				87	if (!exists $self->{FIRST_LINE_RE}) {
478	26					54	my @re;
479	26	100				68	if ($self->{CONTINUE_STYLE} == FIRST) {
480	6					10	@re = $self->_build_first_line_re(0, @{$self->{FORMAT}});
	6					24
481							} else {
482	20					35	@re = $self->_build_first_line_re(1, @{$self->{FORMAT}});
	20					73
483							}
484	26					63	my $re = &_serialize_line_re(@re);
485	26	100				668	$self->{FIRST_LINE_RE} = ($re eq '' ? undef : qr/^$re *$/);
486							}
487	27					185	return $self->{FIRST_LINE_RE};
488							}
489
490							=head2 C
491
492							Returns a regular expression that matches the 2nd line and the following
493							lines of a "logical line".
494
495							my $re = $fmt->get_continue_line_re();
496
497							Returns undef if the format specification does not contains any column that
498							can be splitted on multiples lines.
499
500							=cut
501
502							# continue-style: first (only cont columns can appear on a continue line)
503							sub get_continue_line_re
504							{
505	26			26	1	54	my $self = shift;
506	26	50				127	if (!exists $self->{CONTINUE_LINE_RE}) {
507	26					35	my @re = $self->_build_continue_line_re(@{$self->{FORMAT}});
	26					93
508							#&_dump_line_re(@re);
509	26					57	my $re = &_serialize_line_re(@re);
510	26	100				599	$self->{CONTINUE_LINE_RE} = ($re eq '' ? undef : qr/^$re *$/);
511							}
512	26					169	return $self->{CONTINUE_LINE_RE};
513							}
514
515							1; # Magic for module end
516
517							__END__