File Coverage

blib/lib/Alvis/Pipeline.pm
Criterion Covered Total %
statement 31 31 100.0
branch 1 2 50.0
condition n/a
subroutine 9 9 100.0
pod 1 2 50.0
total 42 44 95.4


line stmt bran cond sub pod time code
1             # $Id: Pipeline.pm,v 1.23 2006/08/31 15:35:26 mike Exp $
2              
3             package Alvis::Pipeline;
4              
5 5     5   128367 use 5.008;
  5         26  
  5         230  
6 5     5   35 use strict;
  5         8  
  5         232  
7 5     5   32 use warnings;
  5         17  
  5         202  
8              
9 5     5   2732 use Alvis::Logger;
  5         16  
  5         143  
10 5     5   2942 use Alvis::Pipeline::Read;
  5         20  
  5         180  
11 5     5   3610 use Alvis::Pipeline::Write;
  5         17  
  5         1406  
12              
13             our $VERSION = '0.11';
14              
15              
16             =head1 NAME
17              
18             Alvis::Pipeline - Perl extension for passing XML documents along the Alvis pipeline
19              
20             =head1 SYNOPSIS
21              
22             use Alvis::Pipeline;
23             $in = new Alvis::Pipeline::Read(host => "harvester.alvis.info",
24             port => 16716,
25             spooldir => "/home/alvis/spool");
26             $out = new Alvis::Pipeline::Write(port => 29168);
27             while ($xml = $in->read(1)) {
28             $transformed = process($xml);
29             $out->write($transformed);
30             }
31              
32             =head1 DESCRIPTION
33              
34             This module provides a simple means for components in the Alvis
35             pipeline to pass documents between themselves without needing to know
36             about the underlying transfer protocol. Pipe objects may be created
37             either for reading or writing; components in the middle of the
38             pipeline will create one of each. Pipes support exactly one method,
39             which is either C<read()> or C<write()> depending on the type of the
40             pipe. The granularity of reading and writing is
41             the XML document; neither smaller fragments nor larger aggregates can
42             be transferred.
43              
44             The documents expected to pass through this pipeline are those
45             representing documents acquired for, and being analysed by, Alvis.
46             These documents are expressed as XML constructed according to the
47             specifications described in the Metadata Format for Enriched
48             Documents. However, while this is the motivating example pipeline
49             that led to the creation of this module, there is no reason why other
50             kinds of documents should not also be passed through a pipeline using
51             this software.
52              
53             The pipeline protocol is described below, to facilitate the
54             development of independent implementations in other languages.
55              
56             =head1 METHODS
57              
58             =head2 new()
59              
60             $in = new Alvis::Pipeline::Read(host => "harvester.alvis.info",
61             port => 16716,
62             spooldir => "/home/alvis/spool");
63             $out = new Alvis::Pipeline::Write(port => 29168);
64              
65             Creates a new pipeline, either for reading or for writing. Any number
66             of I<name>-I<value> pairs may be passed as parameters. Among these,
67             most are optional but some are mandatory:
68              
69             =over 4
70              
71             =item *
72              
73             Read-pipes must specify both the C<host> and C<port> of the component
74             that they will read from, and C<spooldir>,
75             a directory that is writable to the user the process is running as.
76             (When files become available by being written down a write-pipe, they
77             are immediately read in the background, then stored in the
78             specified spool directory until picked up by a reader.)
79              
80             =item *
81              
82             Pipes may specify C<loglevel> [default 0]: higher levels
83             providing some commentary on under-the-hood behaviour.
84              
85             =back
86              
87             =head2 option()
88              
89             $old = $pipe->option("foo");
90             $pipe->option(bar => 23);
91              
92             Can be used to set the value for a specific option, or to retrieve its
93             value.
94              
95             =head2 read()
96              
97             # Read-pipes only
98             $xml = $in->read($block);
99              
100             Reads an XML document from the specified inbound pipe, and returns it
101             as a string. If there is no document ready to read, it
102             either returns an undefined value (if no argument is provided, or if
103             the argument is false) or blocks if the argument is provided and true.
104             C<read()> throws an exception if an error occurs.
105              
106             Once a document has been read in this way, it will no longer be
107             available for subsequent C<read()>s, so a sequence of C<read()> calls
108             will read all the available records one at a time.
109              
110             Once a document has been read, it is the responsibility of the reader
111             to process it and pass it on to the next component in the pipeline.
112             If something catastrophic happens, and the record is lost, then an
113             out-of-band mechanism may be used to request a new copy of the record
114             from the writer. The C<Alvis::Pipeline> module does not directly
115             support such requests; they are considered to be application-level and
116             therefore not appropriate for this low-level module to deal with.
117              
118             (As a matter of application design, we offer the observation that, in
119             Alvis, the C<id> attribute on the top-level element specifies the
120             identity of the record, and should remain unchanged even if the record
121             itself is updated; so any out-of-band request for records to be
122             re-sent should do so by specifying the IDs of the required records.)
123              
124             =head2 write()
125              
126             # Write-pipes only
127             $out->write($xmlDocument);
128              
129             Writes an XML document to the specified outbound pipe. The document
130             may be passed in either as a DOM tree (C<XML::LibXML::Document>) or a
131             string containing the text of the document. Throws an exception if an
132             error occurs.
133              
134             This method returns only when the record has been successfully
135             transferred to the receiver at the other end of the pipeline; so the
136             sender is then able to forget about the transferred document, which is now the
137             responsibility of the next component in the pipeline.
138              
139             =head2 close()
140              
141             $pipe->close();
142              
143             Closes a pipe, after which no further reading or writing may be done
144             on it. This is important for read-pipes, as it frees up the Internet
145             port that the server is listening on.
146              
147             =head1 PIPELINE PROTOCOL
148              
149             Because the pipeline is unidirectional, it is very simple: there is no
150             back-channel by which a downstream component can talk to an upstream
151             one, and the protocol consists entirely of wrappings for the documents
152             that are sent downstream.
153              
154             Each document packet consists of the following, in order:
155              
156             =over 4
157              
158             =item 1
159              
160             The magic literal string C<Alvis::Pipeline>,
161             followed by a single newline character.
162              
163             =item 2
164              
165             Decimal-rendered protocol version-number (currently 1),
166             followed by a single newline character.
167              
168             =item 3
169              
170             Decimal-rendered integer byte-count,
171             followed by a single newline character.
172             Note that the protocol counts I<bytes> rather than
173             I<characters>: these two counts can be different when
174             non-ASCII character sets such as UTF-8 are used.
175              
176             =item 4
177              
178             The XML document itself (or other binary object),
179             of the length specified.
180              
181             =item 5
182              
183             The magic literal string C<--end-->,
184             followed by a single newline character.
185              
186             =back
187              
188             For example, the simple document
189              
190            
191             Brachiosaurus
192            
193              
194             would be sent as the following packet:
195              
196             Alvis::Pipeline
197             1
198             55
199            
200             Brachiosaurus
201            
202             --end--
203              
204             This packaging allows the downstream component to locate object
205             boundaries and to consistency-check the stream.
206              
207             =head1 SEE ALSO
208              
209             I
210             Milestone M3.2 - Month 12 (December 2004).>
211             Includes a useful overview of the Alvis processing pipeline.
212             http://www.miketaylor.org.uk/alvis/t3-2/m3-2.html
213              
214             =head1 AUTHOR
215              
216             Mike Taylor, E<lt>mike@indexdata.comE<gt>
217              
218             =head1 COPYRIGHT AND LICENSE
219              
220             Copyright (C) 2005 by Index Data ApS.
221              
222             This library is free software; you can redistribute it and/or modify
223             it under the same terms as Perl itself, either Perl version 5.8.4 or,
224             at your option, any later version of Perl 5 you may have available.
225              
226             =cut
227              
228              
229             # Instantiation setup code shared by both subclasses
230             sub _setopts {
231 7     7   20 my $this = shift();
232 7         42 my(%opts) = @_;
233              
234 7         122 my $loglevel = delete $opts{loglevel};
235 7         283 $this->{logger} = new Alvis::Logger(level => $loglevel);
236 7         47 $this->{opts} = \%opts;
237             }
238              
239              
240             sub option {
241 3     3 1 6 my $this = shift();
242 3         18 my($key, $newval) = @_;
243              
244 3         8 my $val = $this->{opts}->{$key};
245 3 50       9 $this->{opts}->{$key} = $newval if defined $newval;
246 3         21000757 return $val;
247             }
248              
249              
250             sub log {
251 69     69 0 122 my $this = shift();
252 69         92 my $level = shift();
253 69         378 $this->{logger}->log($level, $$, ": ", @_);
254             }
255              
256              
257             1;