line
stmt
bran
cond
sub
pod
time
code
1
#-*-perl-*-
2
#---------------------------------------------------------------------------
3
# Copyright (C) 2004-2012 Joerg Tiedemann
4
#
5
# This program is free software; you can redistribute it and/or modify
6
# it under the terms of the GNU General Public License as published by
7
# the Free Software Foundation; either version 2 of the License, or
8
# (at your option) any later version.
9
#
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU General Public License for more details.
14
#
15
# You should have received a copy of the GNU General Public License
16
# along with this program; if not, write to the Free Software
17
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18
#---------------------------------------------------------------------------
19
20
=head1 NAME
21
22
Uplug - a toolbox for processing (parallel) text corpora
23
24
=head1 SYNOPSIS
25
26
$module = 'pre/basic';
27
%args = ( '-in' => $input_file_name,
28
'-ci' => $input_char_encoding );
29
30
my $uplug=Uplug->new($module, %args); # create a new uplug module
31
$uplug->load(); # load it
32
$uplug->run(); # and run it
33
34
=head1 DESCRIPTION
35
36
This library provides the main methods for loading Uplug modules and running them. Configuration files describe the module and its parameters (see L). Each module may contain a number of sub-modules. Each of them can usually calls the uplug scripts provided in the package.
37
38
=head1 USAGE
39
40
More information on how to use the Uplug toolkit with the provided modules can be found here:
41
L
42
43
Add-ons and language-specific modules can be downloaded from the Uplug project website at bitbucket: L
44
45
46
=cut
47
48
49
package Uplug;
50
51
require 5.005;
52
53
5
5
258331
use strict;
5
12
5
194
54
5
5
4809
use IO::File;
5
77119
5
751
55
5
5
12842
use POSIX qw(tmpnam);
5
47401
5
34
56
5
5
12262
use Uplug::Config;
5
36
5
1535
57
5
5
71
use File::Basename;
5
11
5
488
58
59
5
5
27
use FindBin qw($Bin);
5
10
5
1304
60
61
5
5
27
use vars qw($VERSION $AUTHOR $DEBUG);
5
7
5
435
62
5
5
25
use vars qw(@TempFiles);
5
8
5
478
63
64
65
$VERSION = '0.3.8';
66
$AUTHOR = 'Joerg Tiedemann';
67
$DEBUG = 0;
68
69
#-----------------------------------------------------------------------
70
BEGIN{
71
5
5
117
setpgrp(0,0); # become leader of the process group
72
5
19430
$SIG{HUP}=sub{die "# Uplug.pm: hangup";};
0
0
73
}
74
75
END{
76
5
5
4354652
local $SIG{HUP}='IGNORE'; # ignore HANGUP signal for right now
77
5
243
kill ('HUP',-$$); # kill child processes before you die
78
}
79
#-----------------------------------------------------------------------
80
81
82
=head1 Class methods
83
84
=head2 Constructor
85
86
$uplug = new Uplug ( $module, %args )
87
88
Construct a new Uplug object for the given Uplug module ($module refers to a configuration file). Module arguments are specified in C<%args> and depend on the module. For more information about specific Uplug modules, use the Uplug startup script:
89
90
uplug -h module-name
91
92
=cut
93
94
95
96
sub new{
97
0
0
0
my $class=shift;
98
0
my $configfile=shift;
99
100
0
my $self={};
101
0
bless $self,$class;
102
103
0
$self->{CONFIGFILE} = $configfile;
104
0
$self->{CONFIG} = &ReadConfig($configfile,@_);
105
106
0
0
mkdir 'data',0755 if (! -d 'data');
107
0
0
mkdir 'data/runtime',0755 if (! -d 'data/runtime');
108
109
0
$self->{RUNTIMEDIR} = 'data/runtime/'.$$;
110
0
0
mkdir $self->{RUNTIMEDIR},0755 if (! -d $self->{RUNTIMEDIR});
111
112
0
return $self;
113
}
114
115
116
##---------------------------------------------------------------------
117
## DESTROY: clean up! remove all temporary files and directories!
118
119
sub DESTROY{
120
0
0
my $self=shift;
121
0
0
if ($DEBUG){exit;}
0
122
0
unlink $self->{MODULE};
123
0
0
if (ref($self->{TEMPFILES}) eq 'ARRAY'){
124
0
unlink @{$self->{TEMPFILES}};
0
125
}
126
0
rmdir $self->{RUNTIMEDIR};
127
}
128
129
=head2 C
130
131
$uplug->load()
132
133
Load the module given in the constructor and all its sub-modules. This also creates temporary configuration files with adjusted parameters in C.
134
135
=cut
136
137
##---------------------------------------------------------------------
138
## load module configurations
139
## * create runtime config files the module and all submodules
140
141
sub load{
142
0
0
1
my $self=shift;
143
144
0
my $count=1;
145
0
my $runtime = $self->{RUNTIMEDIR}.'/';
146
0
$runtime .= basename($self->{CONFIGFILE});
147
0
while (-e $runtime.$count){$count++;}
0
148
0
$self->{MODULE} = $runtime.$count;
149
0
&WriteConfig($self->{MODULE},$self->{CONFIG});
150
0
$self->loadSubMods();
151
0
$self->data($self->output()); # my own data is available
152
}
153
154
=head2 C
155
156
$uplug->run()
157
158
Run all commands specified in all sub-modules. Pipeline commands will be constructed according to the sequence of sub-modules and the specifications in the Uplug configuration files. The will be simply executed as external system calls.
159
160
=cut
161
162
163
##---------------------------------------------------------------------
164
## run the Uplug module (and all its submodules)
165
## * get the system command
166
## * split it up into separate system calls
167
## * run the system calls and print elapsed time/call
168
169
sub run{
170
0
0
1
my $self=shift;
171
0
my $cmd=$self->command();
172
0
my @seq=split(/;/,$cmd); # split command sequence
173
0
my $start=time();
174
0
for (@seq){
175
0
my $time=time();
176
0
print STDERR "$_\n---------------------------------------------\n";
177
0
0
if (my $sig=system ($_)){
178
0
print STDERR "# Uplug.pm: Got signal $? from child process:\n";
179
0
print STDERR "# $_\n";
180
0
return 0;
181
}
182
0
$time=time()-$time;
183
0
my ($sec,$min,$hour,$mday,$mon,$year)=gmtime($time);
184
0
printf STDERR
185
" processing time: %2d:%2d:%2d:%2d:%2d:%2d\n",
186
$year-70,$mon,$mday-1,$hour,$min,$sec;
187
}
188
0
$start=time()-$start;
189
0
my ($sec,$min,$hour,$mday,$mon,$year)=gmtime($start);
190
0
printf STDERR
191
" total processing time: %2d:%2d:%2d:%2d:%2d:%2d\n",
192
$year-70,$mon,$mday-1,$hour,$min,$sec;
193
}
194
195
=head1 Class-internal methods
196
197
=head2 C
198
199
Load all sub-modules and adjust input and output according to the configuration files and the current pipe-line. Output streams will be used as input streams with the same name for the next sub-module. This method tries to find possible pipelines for combining commands.
200
201
=cut
202
203
204
##---------------------------------------------------------------------
205
## create config files for all sub-modules
206
## * modify input/output according to the data in the module sequence
207
## * check if I can use pipes (stdout -> stdin)
208
## * expand loops
209
210
sub loadSubMods{
211
0
0
1
my $self=shift;
212
213
0
my $submod=&GetParam($self->{CONFIG},'module','submodules');
214
0
my $loop=&GetParam($self->{CONFIG},'module','loop');
215
0
my ($loopstart,$loopend)=split(/:/,$loop);
216
0
my $iter=&GetParam($self->{CONFIG},'module','iterations');
217
218
0
0
if (ref($submod) eq 'ARRAY'){
219
0
$self->{SUBMOD}=[]; # initialize sub-module array
220
0
my $count=1; # iteration counter
221
222
0
my $input=$self->input; # my input will be
223
0
my $data=$self->data($input); # the initial data collection
224
225
0
my $stdout; # is defined if previous module produces STDOUT
226
0
my $i=0; # sub-module number
227
0
my $n=0; # module number in the sequence
228
0
while ($i<@$submod){
229
0
0
0
if ((defined $iter) and ($count>$iter)){last;}
0
230
0
my ($conf,@par)=split(/\s+/,$submod->[$i]);
231
0
0
0
$i++ && next unless (-e &FindConfig($conf)); # skip modules without config
232
0
$self->{SUBMOD}->[$n]=Uplug->new($conf,@par); # check also params
233
0
$self->{SUBMOD}->[$n]->input($data); # change input
234
235
## check if stdout in last module but no stdin now
236
## --> if yes: broken pipe!
237
238
0
my $broken=0;
239
0
my $stdin=$self->{SUBMOD}->[$n]->stdin();
240
241
0
0
0
if ($stdout and (not $stdin)){
0
0
242
0
$broken = 1;
243
}
244
245
## otherwise if STDIN and STDOUT:
246
## check if any output file is in use
247
## if yes --> broken pipe
248
249
elsif ($stdin and $stdout){
250
0
my $out=$self->{SUBMOD}->[$n]->output();
251
0
0
if (ref($out) eq 'HASH'){
252
0
for (keys %$out){
253
0
0
0
if ((exists $out->{file}) and
254
$self->FileInUse($out->{file})){
255
0
$broken=1;
256
0
last;
257
}
258
}
259
}
260
}
261
262
## if pipe is broken:
263
## * save to temp file if no file given
264
## * delete 'stdout' flag from config file
265
266
0
0
if ($broken){
267
0
0
if (not &GetParam($self->{SUBMOD}->[$n-1]->{CONFIG},
268
'output',$stdout,'file')){
269
0
my $tmpfile=$self->NewTempFile();
270
0
&SetParam($self->{SUBMOD}->[$n-1]->{CONFIG},
271
$tmpfile,'output',$stdout,'file');
272
0
&SetParam($data,$tmpfile,'output',$stdout,'file');
273
0
$self->{SUBMOD}->[$n-1]->load();
274
}
275
0
&SetParam($self->{SUBMOD}->[$n-1]->{CONFIG},
276
undef,'module','stdout');
277
}
278
279
## change input data according to available data-spec
280
## load the current module
281
282
0
$self->{SUBMOD}->[$n]->load(); # load module
283
284
0
$stdout=$self->{SUBMOD}->[$n]->stdout();
285
0
my $new=$self->{SUBMOD}->[$n]->data(); # get new output
286
0
$data=$self->data($new); # set new data
287
288
## jump back to the loop start
289
## (if a loop is defiend)
290
291
0
0
0
if ((defined $loopend) and ($i==$loopend)){
292
0
$count++;
293
0
$i=$loopstart-1;
294
}
295
0
$i++;$n++;
0
296
}
297
298
# if there is at least one submodule:
299
# my output should be the one produced by the last submodule
300
301
0
0
if (@$submod){
302
0
my $output=$self->output;
303
0
$self->{SUBMOD}->[-1]->output($output);
304
0
my $data=$self->data($output);
305
}
306
0
$self->data($data);
307
}
308
}
309
310
=head2 C
311
312
$cmd = $uplug->command()
313
314
Return a sequence of system commands for the entire pipeline. Commands are separated by ';'. System command may include several pipelines through STDIN/STDOUT.
315
316
=cut
317
318
319
##---------------------------------------------------------------------
320
## return the system command to be called for this Uplug module
321
## (including all sub-modules, pipes, ...)
322
323
sub command{
324
0
0
1
my $self=shift;
325
0
my $stdout=shift;
326
327
0
0
if (ref($self->{SUBMOD}) eq 'ARRAY'){
328
0
my $cmd;
329
330
0
my $loop=&GetParam($self->{CONFIG},'module','loop');
331
0
my ($loopstart,$loopend)=split(/:/,$loop);
332
0
my $iter=&GetParam($self->{CONFIG},'module','iterations');
333
0
my $count=0;
334
335
0
for my $s (@{$self->{SUBMOD}}){
0
336
0
my $c=$s->command($cmd,$stdout);
337
0
my $stdin=$s->stdin();
338
0
0
0
if ($stdout and $stdin){
0
0
339
0
$cmd.=' | '.$c;
340
}
341
0
elsif ($cmd){$cmd.=';'.$c;}
342
else{$cmd=$c;}
343
0
$stdout=$s->stdout;
344
}
345
0
return $cmd;
346
}
347
0
my $bin=&GetParam($self->{CONFIG},'module','location');
348
0
my $cmd=&GetParam($self->{CONFIG},'module','program');
349
0
0
if (-f $bin.'/'.$cmd){$cmd=$bin.'/'.$cmd;}
0
350
# if (-f $Bin.'/'.$cmd){$cmd=$Bin.'/'.$cmd;}
351
0
$cmd.=' -i '.$self->{MODULE};
352
353
0
0
if ($DEBUG){
354
0
$cmd='perl -d:DProf '.$cmd;
355
}
356
357
0
return $cmd;
358
}
359
360
=head2 C
361
362
Change the input settings in a particular configuration.
363
364
=cut
365
366
367
368
##---------------------------------------------------------------------
369
## change input settings in the module configuraton
370
## (only for the ones that exist already)
371
## and write changes to the physical config file
372
373
sub input{
374
0
0
1
my $self=shift;
375
0
my ($input)=@_;
376
0
0
if (ref($input) eq 'HASH'){
377
0
foreach (keys %$input){
378
0
0
if (&GetParam($self->{CONFIG},'input',$_)){
379
0
&SetParam($self->{CONFIG},$input->{$_},'input',$_);
380
}
381
0
$self->{DATA}->{$_}=$input->{$_};
382
}
383
0
0
if (exists $self->{MODULE}){
384
0
&WriteConfig($self->{MODULE},$self->{CONFIG});
385
}
386
}
387
0
return &GetParam($self->{CONFIG},'input');
388
}
389
390
=head2 C
391
392
Change the output settings in a particular configuration.
393
394
=cut
395
396
##---------------------------------------------------------------------
397
## change output settings in the module configuraton
398
## (only for the ones that exist already)
399
## and write changes to the physical config file
400
401
sub output{
402
0
0
1
my $self=shift;
403
404
0
my ($output)=@_;
405
0
0
if (ref($output) eq 'HASH'){
406
0
foreach (keys %$output){
407
0
0
if (&GetParam($self->{CONFIG},'output',$_)){
408
0
&SetParam($self->{CONFIG},$output->{$_},'output',$_);
409
}
410
0
$self->{DATA}->{$_}=$output->{$_};
411
}
412
0
$self->load();
413
}
414
0
return &GetParam($self->{CONFIG},'output');
415
}
416
417
=head2 C
418
419
Set/return data files available in the current pipeline.
420
421
=cut
422
423
424
##---------------------------------------------------------------------
425
## set/return available data
426
## (here we store al kinds of data available in the module sequence)
427
428
sub data{
429
0
0
1
my $self=shift;
430
0
my ($data)=@_;
431
0
0
if (ref($data) eq 'HASH'){
432
0
foreach (keys %$data){
433
0
$self->{DATA}->{$_}=$data->{$_};
434
}
435
}
436
0
0
if (ref($self->{DATA}) eq 'HASH'){ # save open files
437
0
for my $d (keys %{$self->{DATA}}){ # (to check pipe-conflicts)
0
438
0
0
if (exists $self->{DATA}->{$d}->{file}){
439
0
$self->{FILES}->{$self->{DATA}->{$d}->{file}}=1;
440
}
441
}
442
}
443
0
return $self->{DATA};
444
}
445
446
=head2 C
447
448
Check whether their is an input stream that can read from STDIN.
449
450
=cut
451
452
453
##---------------------------------------------------------------------
454
# stdin: returns input name if there is one that reads from stdin
455
# (looks at {module => {stdin => '...'}}
456
# and the definition of the input stream (check 'file' attr))
457
# returns undef if no input defined that reads from STDIN
458
459
sub stdin{
460
0
0
1
my $self=shift;
461
0
my $in=&GetParam($self->{CONFIG},'module','stdin');
462
0
0
if (&GetParam($self->{CONFIG},'input',$in)){
463
0
0
if (not &GetParam($self->{CONFIG},'input',$in,'file')){
464
0
return $in;
465
}
466
}
467
0
return undef;
468
}
469
470
=head2 C
471
472
Check whether their is an output stream that can write to STDOUT.
473
474
=cut
475
476
##---------------------------------------------------------------------
477
# stdout: same as stdin but for STDOUT
478
479
sub stdout{
480
0
0
1
my $self=shift;
481
0
my $out=&GetParam($self->{CONFIG},'module','stdout');
482
0
0
if (&GetParam($self->{CONFIG},'output',$out)){
483
0
0
if (not &GetParam($self->{CONFIG},'output',$out,'file')){
484
0
return $out;
485
}
486
}
487
0
return undef;
488
}
489
490
=head2 FileInUse
491
492
Checks whether a particular file is already in use in the current pipeline (avoids writing over files that a command still reads from).
493
494
=cut
495
496
sub FileInUse{
497
0
0
1
my $self=shift;
498
0
return $self->{FILES}->{$_[0]};
499
}
500
501
=head2 C
502
503
Return a new temporary file (in data/runtime).
504
505
=cut
506
507
##---------------------------------------------------------------------
508
## return a temporary file name (and touch it)
509
#
510
# TODO: use File::Temp instead
511
512
sub NewTempFile{
513
0
0
1
my $self=shift;
514
0
my $count=0;
515
0
my $temp = $self->{RUNTIMEDIR}.'/.temp';
516
0
while (-e $temp.$count){
517
0
$count++;
518
}
519
0
$temp.=$count;
520
0
open F,">$temp";close F;
0
521
0
push (@{$self->{TEMPFILES}},$temp);
0
522
0
return $temp;
523
}
524
525
526
527
1;
528
529
__END__