File Coverage

blib/lib/Data/Transform/Line.pm
Criterion Covered Total %
statement 82 84 97.6
branch 44 46 95.6
condition 15 18 83.3
subroutine 9 9 100.0
pod 3 3 100.0
total 153 160 95.6


line stmt bran cond sub pod time code
1             # vim: ts=2 sw=2 expandtab
2             package Data::Transform::Line;
3 2     2   2230 use strict;
  2         4  
  2         69  
4              
5 2     2   668 use Data::Transform;
  2         4  
  2         44  
6              
7 2     2   9 use vars qw($VERSION @ISA);
  2         2  
  2         117  
8             $VERSION = '0.01';
9             @ISA = qw(Data::Transform);
10              
11 2     2   10 use Carp qw(carp croak);
  2         3  
  2         2289  
12              
13             =head1 NAME
14              
15             Data::Transform::Line - serialize and parse terminated records (lines)
16              
17             =head1 SYNOPSIS
18              
19             #!perl
20              
21             use POE qw(Wheel::FollowTail Filter::Line);
22              
23             POE::Session->create(
24             inline_states => {
25             _start => sub {
26             $_[HEAP]{tailor} = POE::Wheel::FollowTail->new(
27             Filename => "/var/log/system.log",
28             InputEvent => "got_log_line",
29             Filter => POE::Filter::Line->new(),
30             );
31             },
32             got_log_line => sub {
33             print "Log: $_[ARG0]\n";
34             }
35             }
36             );
37              
38             POE::Kernel->run();
39             exit;
40              
41             =head1 DESCRIPTION
42              
43             Data::Transform::Line parses stream data into terminated records. The
44             default parser interprets newlines as the record terminator, and the
45             default serializer appends network newlines (CR/LF, or "\x0D\x0A") to
46             outbound records.
47              
48             Data::Transform::Line supports a number of other ways to parse lines.
49             Constructor parameters may specify literal newlines, regular
50             expressions, or that the filter should detect newlines on its own.
51              
52             =head1 PUBLIC FILTER METHODS
53              
54             Data::Transform::Line's new() method has some interesting parameters.
55              
56             =cut
57              
58             sub DEBUG () { 0 }
59              
60             sub INPUT_BUFFER () { 0 }
61             sub FRAMING_BUFFER () { 1 }
62             sub INPUT_REGEXP () { 2 }
63             sub OUTPUT_LITERAL () { 3 }
64             sub AUTODETECT_STATE () { 4 }
65              
66             sub AUTO_STATE_DONE () { 0x00 }
67             sub AUTO_STATE_FIRST () { 0x01 }
68             sub AUTO_STATE_SECOND () { 0x02 }
69              
70             =head2 new
71              
72             new() accepts a list of named parameters.
73              
74             C may be used to parse records that are terminated by
75             some literal string. For example, Data::Transform::Line may be used to
76             parse and emit C-style lines, which are terminated with an ASCII NUL:
77              
78             my $c_line_filter = Data::Transform::Line->new(
79             InputLiteral => chr(0),
80             OutputLiteral => chr(0),
81             );
82              
83             C allows a filter to put() records with a different
84             record terminator than it parses. This can be useful in applications
85             that must translate record terminators.
86              
87             C is a shorthand for the common case where the input and
88             output literals are identical. The previous example may be written
89             as:
90              
91             my $c_line_filter = Data::Transform::Line->new(
92             Literal => chr(0),
93             );
94              
95             An application can also allow Data::Transform::Line to figure out which
96             newline to use. This is done by specifying C to be
97             undef:
98              
99             my $whichever_line_filter = Data::Transform::Line->new(
100             InputLiteral => undef,
101             OutputLiteral => "\n",
102             );
103              
104             C may be used in place of C to recognize
105             line terminators based on a regular expression. In this example,
106             input is terminated by two or more consecutive newlines. On output,
107             the paragraph separator is "---" on a line by itself.
108              
109             my $paragraph_filter = Data::Transform::Line->new(
110             InputRegexp => "([\x0D\x0A]{2,})",
111             OutputLiteral => "\n---\n",
112             );
113              
114             =cut
115              
116             sub new {
117 16     16 1 10766 my $type = shift;
118              
119 16 100 100     356 croak "$type requires an even number of parameters" if @_ and @_ & 1;
120 15         39 my %params = @_;
121              
122 15 100 66     175 croak "$type cannot have both Regexp and Literal line endings" if (
123             defined $params{Regexp} and defined $params{Literal}
124             );
125              
126 14         18 my ($input_regexp, $output_literal);
127 14         15 my $autodetect = AUTO_STATE_DONE;
128              
129             # Literal newline for both incoming and outgoing. Every other known
130             # parameter conflicts with this one.
131 14 100       28 if (defined $params{Literal}) {
132 6 100       129 croak "A defined Literal must have a nonzero length"
133             unless length($params{Literal});
134 5         10 $input_regexp = quotemeta $params{Literal};
135 5         7 $output_literal = $params{Literal};
136 5 100 100     33 if ( exists $params{InputLiteral } or # undef means something
      100        
137             defined $params{InputRegexp } or
138             defined $params{OutputLiteral } ) {
139 3         412 croak "$type cannot have Literal with any other parameter";
140             }
141              
142             } else { # Input and output are specified separately, then.
143              
144             # Input can be either a literal or a regexp. The regexp may be
145             # compiled or not; we don't rightly care at this point.
146 8 100       518 if (exists $params{InputLiteral}) {
    100          
147 5         22 $input_regexp = $params{InputLiteral};
148              
149             # InputLiteral is defined. Turn it into a regexp and be done.
150             # Otherwise we will autodetect it.
151 5 100 66     16 if (defined($input_regexp) and length($input_regexp)) {
152 2         4 $input_regexp = quotemeta $input_regexp;
153             }
154             else {
155 3         6 $autodetect = AUTO_STATE_FIRST;
156 3         5 $input_regexp = '';
157             }
158              
159 5 100       167 croak "$type cannot have both InputLiteral and InputRegexp"
160             if defined $params{InputRegexp};
161             }
162             elsif (defined $params{InputRegexp}) {
163 2         4 $input_regexp = $params{InputRegexp};
164             # unreachable
165             #croak "$type cannot have both InputLiteral and InputRegexp"
166             # if defined $params{InputLiteral};
167             }
168             else {
169 1         2 $input_regexp = "(\\x0D\\x0A?|\\x0A\\x0D?)";
170             }
171              
172 7 100       13 if (defined $params{OutputLiteral}) {
173 6         9 $output_literal = $params{OutputLiteral};
174             }
175             else {
176 1         2 $output_literal = "\x0D\x0A";
177             }
178             }
179              
180 9         19 delete @params{qw(Literal InputLiteral OutputLiteral InputRegexp)};
181 9 50       21 carp("$type ignores unknown parameters: ", join(', ', sort keys %params))
182             if scalar keys %params;
183              
184 9         35 my $self = bless [
185             [], # INPUT_BUFFER
186             '', # FRAMING_BUFFER
187             $input_regexp, # INPUT_REGEXP
188             $output_literal, # OUTPUT_LITERAL
189             $autodetect, # AUTODETECT_STATE
190             ], $type;
191              
192 9         9 DEBUG and warn join ':', @$self;
193              
194 9         31 $self;
195             }
196              
197             sub clone {
198 3     3 1 1297 my $self = shift;
199              
200 3         12 my $new = bless [
201             [],
202             '',
203             $self->[INPUT_REGEXP],
204             $self->[OUTPUT_LITERAL],
205             $self->[AUTODETECT_STATE],
206             ];
207              
208 3         14 return bless $new, ref $self;
209             }
210              
211             sub get_pending {
212 7     7 1 18 my $self = shift;
213 7         8 my @ret = @{$self->[INPUT_BUFFER]};
  7         15  
214 7 100       21 if (length $self->[FRAMING_BUFFER]) {
215 3         5 unshift @ret, $self->[FRAMING_BUFFER];
216             }
217 7 100       34 return @ret ? [ @ret ] : undef;
218             }
219              
220             # get() is inherited from Data::Transform.
221             # get_one_start() is inherited from Data::Transform.
222             # get_one() is inherited from Data::Transform.
223              
224             sub _handle_get_data {
225 110     110   123 my ($self, $data) = @_;
226              
227 110 100       191 if (defined $data) {
228 42         61 $self->[FRAMING_BUFFER] .= $data;
229             }
230             # Process as many newlines an we can find.
231 110         109 LINE: while (1) {
232              
233             # Autodetect is done, or it never started. Parse some buffer!
234 112 100       228 unless ($self->[AUTODETECT_STATE]) {
235 100         81 DEBUG and warn unpack 'H*', $self->[INPUT_REGEXP];
236             last LINE
237 100 100       604 unless $self->[FRAMING_BUFFER] =~ s/^(.*?)$self->[INPUT_REGEXP]//s;
238 36         43 DEBUG and warn "got line: <<", unpack('H*', $1), ">>\n";
239              
240 36         123 return $1;
241             }
242              
243             # Waiting for the first line ending. Look for a generic newline.
244 12 100       25 if ($self->[AUTODETECT_STATE] & AUTO_STATE_FIRST) {
245             last LINE
246 6 100       31 unless $self->[FRAMING_BUFFER] =~ s/^(.*?)(\x0D\x0A?|\x0A\x0D?)//;
247              
248 3         8 my $line = $1;
249              
250             # The newline can be complete under two conditions. First: If
251             # it's two characters. Second: If there's more data in the
252             # framing buffer. Loop around in case there are more lines.
253 3 100 66     17 if ( (length($2) == 2) or
254             (length $self->[FRAMING_BUFFER])
255             ) {
256 1         2 DEBUG and warn "detected complete newline after line: <<$1>>\n";
257 1         3 $self->[INPUT_REGEXP] = $2;
258 1         2 $self->[AUTODETECT_STATE] = AUTO_STATE_DONE;
259             }
260              
261             # The regexp has matched a potential partial newline. Save it,
262             # and move to the next state. There is no more data in the
263             # framing buffer, so we're done.
264             else {
265 2         2 DEBUG and warn "detected suspicious newline after line: <<$1>>\n";
266 2         5 $self->[INPUT_REGEXP] = $2;
267 2         3 $self->[AUTODETECT_STATE] = AUTO_STATE_SECOND;
268             }
269              
270 3         10 return $line;
271             }
272              
273             # Waiting for the second line beginning. Bail out if we don't
274             # have anything in the framing buffer.
275 6 50       13 if ($self->[AUTODETECT_STATE] & AUTO_STATE_SECOND) {
276 6 100       23 return unless length $self->[FRAMING_BUFFER];
277              
278             # Test the first character to see if it completes the previous
279             # potentially partial newline.
280 2 100       11 if (
    100          
281             substr($self->[FRAMING_BUFFER], 0, 1) eq
282             ( $self->[INPUT_REGEXP] eq "\x0D" ? "\x0A" : "\x0D" )
283             ) {
284              
285             # Combine the first character with the previous newline, and
286             # discard the newline from the buffer. This is two statements
287             # for backward compatibility.
288 1         2 DEBUG and warn "completed newline after line: <<$1>>\n";
289 1         3 $self->[INPUT_REGEXP] .= substr($self->[FRAMING_BUFFER], 0, 1);
290 1         4 substr($self->[FRAMING_BUFFER], 0, 1) = '';
291             }
292 0         0 elsif (DEBUG) {
293             warn "decided prior suspicious newline is okay\n";
294             }
295              
296             # Regardless, whatever is in INPUT_REGEXP is now a complete
297             # newline. End autodetection, post-process the found newline,
298             # and loop to see if there are other lines in the buffer.
299 2         4 $self->[INPUT_REGEXP] = $self->[INPUT_REGEXP];
300 2         4 $self->[AUTODETECT_STATE] = AUTO_STATE_DONE;
301 2         3 next LINE;
302             }
303              
304 0         0 die "consistency error: AUTODETECT_STATE = $self->[AUTODETECT_STATE]";
305             }
306              
307 67         197 return;
308             }
309              
310             # New behavior. First translate system newlines ("\n") into whichever
311             # newlines are supposed to be sent. Second, add a trailing newline if
312             # one doesn't already exist. Since the referenced output list is
313             # supposed to contain one line per element, we also do a split and
314             # join. Bleah. ... why isn't the code doing what the comment says?
315              
316             sub _handle_put_data {
317 31     31   42 my ($self, $line) = @_;
318              
319 31         126 return $line . $self->[OUTPUT_LITERAL];
320             }
321              
322              
323             1;
324              
325             __END__