File Coverage

blib/lib/Data/Transform/Line.pm

Criterion	Covered	Total	%
statement	82	84	97.6
branch	44	46	95.6
condition	15	18	83.3
subroutine	9	9	100.0
pod	3	3	100.0
total	153	160	95.6

line	stmt	bran	cond	sub	pod	time	code
1							# vim: ts=2 sw=2 expandtab
2							package Data::Transform::Line;
3	2			2		1382	use strict;
	2					2
	2					48
4
5	2			2		339	use Data::Transform;
	2					2
	2					39
6
7	2			2		7	use vars qw($VERSION @ISA);
	2					1
	2					102
8							$VERSION = '0.01';
9							@ISA = qw(Data::Transform);
10
11	2			2		6	use Carp qw(carp croak);
	2					2
	2					1799
12
13							=head1 NAME
14
15							Data::Transform::Line - serialize and parse terminated records (lines)
16
17							=head1 SYNOPSIS
18
19							#!perl
20
21							use POE qw(Wheel::FollowTail Filter::Line);
22
23							POE::Session->create(
24							inline_states => {
25							_start => sub {
26							$_[HEAP]{tailor} = POE::Wheel::FollowTail->new(
27							Filename => "/var/log/system.log",
28							InputEvent => "got_log_line",
29							Filter => POE::Filter::Line->new(),
30							);
31							},
32							got_log_line => sub {
33							print "Log: $_[ARG0]\n";
34							}
35							}
36							);
37
38							POE::Kernel->run();
39							exit;
40
41							=head1 DESCRIPTION
42
43							Data::Transform::Line parses stream data into terminated records. The
44							default parser interprets newlines as the record terminator, and the
45							default serializer appends network newlines (CR/LF, or "\x0D\x0A") to
46							outbound records.
47
48							Data::Transform::Line supports a number of other ways to parse lines.
49							Constructor parameters may specify literal newlines, regular
50							expressions, or that the filter should detect newlines on its own.
51
52							=head1 PUBLIC FILTER METHODS
53
54							Data::Transform::Line's new() method has some interesting parameters.
55
56							=cut
57
58							sub DEBUG () { 0 }
59
60							sub INPUT_BUFFER () { 0 }
61							sub FRAMING_BUFFER () { 1 }
62							sub INPUT_REGEXP () { 2 }
63							sub OUTPUT_LITERAL () { 3 }
64							sub AUTODETECT_STATE () { 4 }
65
66							sub AUTO_STATE_DONE () { 0x00 }
67							sub AUTO_STATE_FIRST () { 0x01 }
68							sub AUTO_STATE_SECOND () { 0x02 }
69
70							=head2 new
71
72							new() accepts a list of named parameters.
73
74							C may be used to parse records that are terminated by
75							some literal string. For example, Data::Transform::Line may be used to
76							parse and emit C-style lines, which are terminated with an ASCII NUL:
77
78							my $c_line_filter = Data::Transform::Line->new(
79							InputLiteral => chr(0),
80							OutputLiteral => chr(0),
81							);
82
83							C allows a filter to put() records with a different
84							record terminator than it parses. This can be useful in applications
85							that must translate record terminators.
86
87							C is a shorthand for the common case where the input and
88							output literals are identical. The previous example may be written
89							as:
90
91							my $c_line_filter = Data::Transform::Line->new(
92							Literal => chr(0),
93							);
94
95							An application can also allow Data::Transform::Line to figure out which
96							newline to use. This is done by specifying C to be
97							undef:
98
99							my $whichever_line_filter = Data::Transform::Line->new(
100							InputLiteral => undef,
101							OutputLiteral => "\n",
102							);
103
104							C may be used in place of C to recognize
105							line terminators based on a regular expression. In this example,
106							input is terminated by two or more consecutive newlines. On output,
107							the paragraph separator is "---" on a line by itself.
108
109							my $paragraph_filter = Data::Transform::Line->new(
110							InputRegexp => "([\x0D\x0A]{2,})",
111							OutputLiteral => "\n---\n",
112							);
113
114							=cut
115
116							sub new {
117	16			16	1	5295	my $type = shift;
118
119	16	100	100			252	croak "$type requires an even number of parameters" if @_ and @_ & 1;
120	15					29	my %params = @_;
121
122							croak "$type cannot have both Regexp and Literal line endings" if (
123							defined $params{Regexp} and defined $params{Literal}
124	15	100	66			136	);
125
126	14					11	my ($input_regexp, $output_literal);
127	14					12	my $autodetect = AUTO_STATE_DONE;
128
129							# Literal newline for both incoming and outgoing. Every other known
130							# parameter conflicts with this one.
131	14	100				18	if (defined $params{Literal}) {
132							croak "A defined Literal must have a nonzero length"
133	6	100				89	unless length($params{Literal});
134	5					6	$input_regexp = quotemeta $params{Literal};
135	5					6	$output_literal = $params{Literal};
136	5	100	100			25	if ( exists $params{InputLiteral } or # undef means something
			100
137							defined $params{InputRegexp } or
138							defined $params{OutputLiteral } ) {
139	3					269	croak "$type cannot have Literal with any other parameter";
140							}
141
142							} else { # Input and output are specified separately, then.
143
144							# Input can be either a literal or a regexp. The regexp may be
145							# compiled or not; we don't rightly care at this point.
146	8	100				16	if (exists $params{InputLiteral}) {
		100
147	5					13	$input_regexp = $params{InputLiteral};
148
149							# InputLiteral is defined. Turn it into a regexp and be done.
150							# Otherwise we will autodetect it.
151	5	100	66			14	if (defined($input_regexp) and length($input_regexp)) {
152	2					2	$input_regexp = quotemeta $input_regexp;
153							}
154							else {
155	3					3	$autodetect = AUTO_STATE_FIRST;
156	3					2	$input_regexp = '';
157							}
158
159							croak "$type cannot have both InputLiteral and InputRegexp"
160	5	100				98	if defined $params{InputRegexp};
161							}
162							elsif (defined $params{InputRegexp}) {
163	2					2	$input_regexp = $params{InputRegexp};
164							# unreachable
165							#croak "$type cannot have both InputLiteral and InputRegexp"
166							# if defined $params{InputLiteral};
167							}
168							else {
169	1					1	$input_regexp = "(\\x0D\\x0A?\|\\x0A\\x0D?)";
170							}
171
172	7	100				9	if (defined $params{OutputLiteral}) {
173	6					6	$output_literal = $params{OutputLiteral};
174							}
175							else {
176	1					1	$output_literal = "\x0D\x0A";
177							}
178							}
179
180	9					17	delete @params{qw(Literal InputLiteral OutputLiteral InputRegexp)};
181	9	50				15	carp("$type ignores unknown parameters: ", join(', ', sort keys %params))
182							if scalar keys %params;
183
184	9					18	my $self = bless [
185							[], # INPUT_BUFFER
186							'', # FRAMING_BUFFER
187							$input_regexp, # INPUT_REGEXP
188							$output_literal, # OUTPUT_LITERAL
189							$autodetect, # AUTODETECT_STATE
190							], $type;
191
192	9					7	DEBUG and warn join ':', @$self;
193
194	9					22	$self;
195							}
196
197							sub clone {
198	3			3	1	679	my $self = shift;
199
200	3					10	my $new = bless [
201							[],
202							'',
203							$self->[INPUT_REGEXP],
204							$self->[OUTPUT_LITERAL],
205							$self->[AUTODETECT_STATE],
206							];
207
208	3					6	return bless $new, ref $self;
209							}
210
211							sub get_pending {
212	7			7	1	11	my $self = shift;
213	7					7	my @ret = @{$self->[INPUT_BUFFER]};
	7					10
214	7	100				12	if (length $self->[FRAMING_BUFFER]) {
215	3					6	unshift @ret, $self->[FRAMING_BUFFER];
216							}
217	7	100				24	return @ret ? [ @ret ] : undef;
218							}
219
220							# get() is inherited from Data::Transform.
221							# get_one_start() is inherited from Data::Transform.
222							# get_one() is inherited from Data::Transform.
223
224							sub _handle_get_data {
225	110			110		71	my ($self, $data) = @_;
226
227	110	100				143	if (defined $data) {
228	42					41	$self->[FRAMING_BUFFER] .= $data;
229							}
230							# Process as many newlines an we can find.
231	110					54	LINE: while (1) {
232
233							# Autodetect is done, or it never started. Parse some buffer!
234	112	100				130	unless ($self->[AUTODETECT_STATE]) {
235	100					65	DEBUG and warn unpack 'H*', $self->[INPUT_REGEXP];
236							last LINE
237	100	100				405	unless $self->[FRAMING_BUFFER] =~ s/^(.*?)$self->[INPUT_REGEXP]//s;
238	36					46	DEBUG and warn "got line: <<", unpack('H*', $1), ">>\n";
239
240	36					77	return $1;
241							}
242
243							# Waiting for the first line ending. Look for a generic newline.
244	12	100				15	if ($self->[AUTODETECT_STATE] & AUTO_STATE_FIRST) {
245							last LINE
246	6	100				23	unless $self->[FRAMING_BUFFER] =~ s/^(.*?)(\x0D\x0A?\|\x0A\x0D?)//;
247
248	3					4	my $line = $1;
249
250							# The newline can be complete under two conditions. First: If
251							# it's two characters. Second: If there's more data in the
252							# framing buffer. Loop around in case there are more lines.
253	3	100	66			13	if ( (length($2) == 2) or
254							(length $self->[FRAMING_BUFFER])
255							) {
256	1					2	DEBUG and warn "detected complete newline after line: <<$1>>\n";
257	1					2	$self->[INPUT_REGEXP] = $2;
258	1					0	$self->[AUTODETECT_STATE] = AUTO_STATE_DONE;
259							}
260
261							# The regexp has matched a potential partial newline. Save it,
262							# and move to the next state. There is no more data in the
263							# framing buffer, so we're done.
264							else {
265	2					1	DEBUG and warn "detected suspicious newline after line: <<$1>>\n";
266	2					2	$self->[INPUT_REGEXP] = $2;
267	2					2	$self->[AUTODETECT_STATE] = AUTO_STATE_SECOND;
268							}
269
270	3					7	return $line;
271							}
272
273							# Waiting for the second line beginning. Bail out if we don't
274							# have anything in the framing buffer.
275	6	50				8	if ($self->[AUTODETECT_STATE] & AUTO_STATE_SECOND) {
276	6	100				14	return unless length $self->[FRAMING_BUFFER];
277
278							# Test the first character to see if it completes the previous
279							# potentially partial newline.
280	2	100				7	if (
		100
281							substr($self->[FRAMING_BUFFER], 0, 1) eq
282							( $self->[INPUT_REGEXP] eq "\x0D" ? "\x0A" : "\x0D" )
283							) {
284
285							# Combine the first character with the previous newline, and
286							# discard the newline from the buffer. This is two statements
287							# for backward compatibility.
288	1					1	DEBUG and warn "completed newline after line: <<$1>>\n";
289	1					3	$self->[INPUT_REGEXP] .= substr($self->[FRAMING_BUFFER], 0, 1);
290	1					1	substr($self->[FRAMING_BUFFER], 0, 1) = '';
291							}
292	0					0	elsif (DEBUG) {
293							warn "decided prior suspicious newline is okay\n";
294							}
295
296							# Regardless, whatever is in INPUT_REGEXP is now a complete
297							# newline. End autodetection, post-process the found newline,
298							# and loop to see if there are other lines in the buffer.
299	2					2	$self->[INPUT_REGEXP] = $self->[INPUT_REGEXP];
300	2					2	$self->[AUTODETECT_STATE] = AUTO_STATE_DONE;
301	2					2	next LINE;
302							}
303
304	0					0	die "consistency error: AUTODETECT_STATE = $self->[AUTODETECT_STATE]";
305							}
306
307	67					104	return;
308							}
309
310							# New behavior. First translate system newlines ("\n") into whichever
311							# newlines are supposed to be sent. Second, add a trailing newline if
312							# one doesn't already exist. Since the referenced output list is
313							# supposed to contain one line per element, we also do a split and
314							# join. Bleah. ... why isn't the code doing what the comment says?
315
316							sub _handle_put_data {
317	31			31		23	my ($self, $line) = @_;
318
319	31					82	return $line . $self->[OUTPUT_LITERAL];
320							}
321
322
323							1;
324
325							__END__