| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package HPCI; |
|
2
|
|
|
|
|
|
|
### HPCI.pm ################################################################### |
|
3
|
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
### INCLUDES ################################################################## |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# safe Perl |
|
7
|
18
|
|
|
18
|
|
4168585
|
use warnings; |
|
|
18
|
|
|
|
|
49
|
|
|
|
18
|
|
|
|
|
717
|
|
|
8
|
18
|
|
|
18
|
|
108
|
use strict; |
|
|
18
|
|
|
|
|
45
|
|
|
|
18
|
|
|
|
|
443
|
|
|
9
|
18
|
|
|
18
|
|
132
|
use Carp; |
|
|
18
|
|
|
|
|
49
|
|
|
|
18
|
|
|
|
|
1027
|
|
|
10
|
18
|
|
|
18
|
|
8603
|
use Module::Load; |
|
|
18
|
|
|
|
|
18182
|
|
|
|
18
|
|
|
|
|
115
|
|
|
11
|
18
|
|
|
18
|
|
10299
|
use Module::Load::Conditional qw(can_load); |
|
|
18
|
|
|
|
|
398939
|
|
|
|
18
|
|
|
|
|
1299
|
|
|
12
|
18
|
|
|
18
|
|
22511
|
use List::MoreUtils qw(uniq); |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# Registry of extra roles.  Each element is an arrayref holding the
# arguments of one add_extra_role() call, in call order.
our @extra_roles;

# HPCI->add_extra_role( $cluster, $level, $role )
#
# Class method: records one extra-role entry.  The invocant (the HPCI class
# name) is dropped and the remaining arguments are appended to @extra_roles
# as a fresh arrayref.
sub add_extra_role {
    my ( $class, @entry ) = @_;    # $class is the HPCI class name, unused
    push @extra_roles, \@entry;
}
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# get_extra_roles( $target_cluster, $target_level )
#
# Returns the list of roles registered in @extra_roles whose level equals
# $target_level and whose cluster is either 'ALL' or equal to
# $target_cluster.  A role stored as a reference is expanded as an array;
# any other value is returned as a single element.  Order follows the
# registration order.
sub get_extra_roles {
    my ( $target_cluster, $target_level ) = @_;

    return map {
        my ( undef, undef, $roles ) = @$_;
        ref($roles) ? @$roles : $roles;
    }
    grep {
        my ( $cluster, $level ) = @$_;
        ( $cluster eq 'ALL' || $cluster eq $target_cluster )
            && $level eq $target_level;
    } @extra_roles;
}
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Attribute defaults accumulated by add_default_attrs(); group() copies
# them as the starting point for every new group.
my $default_attrs = {};

# HPCI->add_default_attrs( \%attrs )  or  HPCI->add_default_attrs( %attrs )
#
# Class method: merges the given attributes into the stored defaults using
# _merge_hash.  Accepts either one hashref or a key/value list.
sub add_default_attrs {
    my ( $class, @attrs ) = @_;    # $class is the HPCI class name, unused
    my $newhash = ref( $attrs[0] ) eq 'HASH' ? $attrs[0] : {@attrs};
    _merge_hash( $default_attrs, $newhash );
}
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# _merge_hash( $target, $new, $path )
#
# Recursively merges the hash %$new into the hash %$target, in place.
#
#   $target - hashref that receives the values
#   $new    - hashref supplying the values
#   $path   - optional arrayref of the keys descended through so far; it is
#             only used to build the error message
#
# For each key of %$new: if the new value is a hashref, or the value already
# in %$target is a hashref, the two are merged recursively (creating an
# empty hash in %$target when the key is missing or undefined); otherwise
# the new value simply replaces the old one.
#
# Croaks when either side of a (sub-)merge is not a plain hashref, naming
# the offending key path.
sub _merge_hash {
    my ( $target, $new, $path ) = @_;
    $path ||= [];
    croak "not a hash when merging attribute hash{" . join( '}{', @$path ) . "}"
        unless ref($target) eq 'HASH' && ref($new) eq 'HASH';

    # Iterate over a key list rather than with each(): a croak from the
    # recursive call would otherwise leave %$new's internal iterator
    # half-way through, and the next each() on that hash would silently
    # skip the keys already visited.
    for my $k ( keys %$new ) {
        my $v = $new->{$k};
        if ( ref($v) eq 'HASH'
            || ( exists $target->{$k} && ref( $target->{$k} ) eq 'HASH' ) )
        {
            $target->{$k} //= {};
            _merge_hash( $target->{$k}, $v, [ @$path, $k ] );
        }
        else {
            $target->{$k} = $v;
        }
    }
    return;
}
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# explist( @items )
#
# Flattens its arguments by one level: an unblessed array reference
# contributes its elements, any other defined value contributes itself,
# and undefined arguments are dropped.
sub explist {
    my @flat;
    for my $item (@_) {
        if ( ref($item) eq 'ARRAY' ) {
            push @flat, @$item;
        }
        elsif ( defined $item ) {
            push @flat, $item;
        }
    }
    return @flat;
}
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# keylist( $attrs_hashref, ... )
#
# Removes the env_keys entry (an arrayref, if present) from each hashref
# given and returns the combined key list with duplicates removed.  When a
# key occurs more than once only its *last* occurrence is kept, so for
# keylist( $default, $args ) the result is:
#   [ keys that are only in default, in the order given in default ]
#   [ then the keys that are in args, in the order given in args ]
# A key repeated within a single list is not an error; the earlier
# occurrence(s) are simply ignored.
sub keylist {
    my @newest_first;
    for my $attrs (@_) {
        my $env_keys = delete $attrs->{env_keys};

        # Build the combined list back to front so that uniq (which keeps
        # the first occurrence) retains the last occurrence of each key.
        unshift @newest_first, reverse @$env_keys if defined $env_keys;
    }
    return ( reverse uniq @newest_first );
}
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# HPCI->group( \%attrs )  or  HPCI->group( %attrs )
#
# Class method: assembles the attribute set for a new group and returns the
# object built by HPCD::<cluster>::Group->new( \%attributes ).
#
# The attribute hash is built by successive merges, so later sources
# override earlier ones:
#   1. the stored default attributes;
#   2. for each key returned by keylist(), in that order, the matching
#      env_key_specific entry from the defaults and then from the caller;
#   3. the cluster_specific entry for the selected cluster, first from the
#      attributes accumulated so far, then from the caller;
#   4. the caller's own attributes.
#
# The cluster is taken from the caller's attributes, falling back to the
# attributes accumulated in steps 1-2.
#
# Croaks if the arguments are neither a single hashref nor an even-sized
# list, or if no cluster attribute is found.
sub group {
    shift;    # get rid of HPCI class name

    # A caller-supplied hashref is shallow-copied: env_keys and
    # cluster_specific are deleted from $args below, and that must not
    # strip them from a hash the caller may want to reuse.
    my $args =
        scalar(@_) == 1 && ref( $_[0] ) eq 'HASH' ? +{ %{ $_[0] } }
      : scalar(@_) % 2 == 0                       ? +{@_}
      :   croak("HPCI->group() requires a hashref or a hash in list form");

    # copy the default attributes as a start
    my $use_args = {};
    _merge_hash( $use_args, $default_attrs );

    # pull out the env_keys (if any)
    my @keys = keylist( $use_args, $args );
    my @key_specific = map { $_->{env_key_specific} // () } $use_args, $args;

    # merge any specified env_key list that has a value available
    for my $key (@keys) {
        for my $key_spec (@key_specific) {
            if ( my $spec_args = $key_spec->{$key} ) {
                _merge_hash( $use_args, $spec_args );
            }
        }
    }

    my $cluster = $args->{cluster} // $use_args->{cluster}
        // croak("HPCI->group() requires a cluster key in the argument hash");

    # cluster_specific is removed from both sets so that it is not passed
    # on to the driver; only the entry for the chosen cluster is merged.
    for my $arg_set ( $use_args, $args ) {
        if ( my $spec_args = delete $arg_set->{cluster_specific} ) {
            _merge_hash( $use_args, $spec_args->{$cluster} // {} );
        }
    }

    # the caller's explicit attributes are merged last
    _merge_hash( $use_args, $args );

    my $clmod = "HPCD::${cluster}::Group";
    load $clmod;
    return $clmod->new($use_args);
}
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# $obj->_trigger_mkdir( $dir )
#
# Calls $dir->mkpath and passes one "Created directory: ..." message to
# $self->info for each value that mkpath returns.
#
#   $self - an object with a log (must provide an info method)
#   $dir  - a Path::Class::Dir object
sub _trigger_mkdir {
    my ( $self, $dir ) = @_;
    for my $created ( $dir->mkpath ) {
        $self->info("Created directory: $created");
    }
}
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 NAME |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
HPCI |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 VERSION |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Version 0.53 |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=cut |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
our $VERSION = '0.53';

# Result of the attempt to load the optional module HPCI::LocalConfig
# (true when the module was loaded).
our $LocalConfigFound =
    can_load( modules => { 'HPCI::LocalConfig' => undef } );

# A failure whose message starts with "Could not find or check module " is
# ignored silently; any other load error is reported on STDERR.
unless ($LocalConfigFound) {
    my $load_error = $Module::Load::Conditional::ERROR;
    my $worth_reporting = defined $load_error
        && $load_error !~ /^Could not find or check module /;
    if ($worth_reporting) {
        print STDERR "Conditional load of HPCI::LocalConfig failed.  Error is:\n";
        print STDERR "$load_error\n";
    }
}
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
use HPCI; |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
my $group = HPCI->group( |
|
161
|
|
|
|
|
|
|
cluster => ($ENV{HPCI_CLUSTER} // 'uni'), |
|
162
|
|
|
|
|
|
|
... |
|
163
|
|
|
|
|
|
|
); |
|
164
|
|
|
|
|
|
|
$group->stage( |
|
165
|
|
|
|
|
|
|
name => 'analysis_A', |
|
166
|
|
|
|
|
|
|
command => '...' |
|
167
|
|
|
|
|
|
|
); |
|
168
|
|
|
|
|
|
|
$group->stage( |
|
169
|
|
|
|
|
|
|
name => 'analysis_B', |
|
170
|
|
|
|
|
|
|
command => '...' |
|
171
|
|
|
|
|
|
|
); |
|
172
|
|
|
|
|
|
|
$group->stage( |
|
173
|
|
|
|
|
|
|
name => 'analysis_C', |
|
174
|
|
|
|
|
|
|
command => '...' |
|
175
|
|
|
|
|
|
|
); |
|
176
|
|
|
|
|
|
|
$group->stage( |
|
177
|
|
|
|
|
|
|
name => 'report', |
|
178
|
|
|
|
|
|
|
command => '...' |
|
179
|
|
|
|
|
|
|
); |
|
180
|
|
|
|
|
|
|
$group->add_deps( |
|
181
|
|
|
|
|
|
|
pre_reqs => [ qw(analysis_A analysis_B analysis_C) ], |
|
182
|
|
|
|
|
|
|
dep => 'report' |
|
183
|
|
|
|
|
|
|
); |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
my $status_info = $group->execute; |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
my $exit_status = 0; |
|
188
|
|
|
|
|
|
|
for my $stage ( qw(analysis_A analysis_B analysis_C report) ) { |
|
189
|
|
|
|
|
|
|
if (my $stat = $status_info->{$stage}[-1]{exit_status}) { |
|
190
|
|
|
|
|
|
|
$exit_status ||= $stat; |
|
191
|
|
|
|
|
|
|
print STDERR "Stage $stage failed, status $stat!\n"; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
exit($exit_status); # 0 only if all stages completed without error |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 OVERVIEW |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
HPCI (High Performance Computing Interface) provides an interface to |
|
200
|
|
|
|
|
|
|
a range of types of computer aggregations (clusters, clouds, ...). |
|
201
|
|
|
|
|
|
|
(The rest of this document will use I<cluster> henceforth to refer |
|
202
|
|
|
|
|
|
|
to any type of aggregation that is supported by HPCI.) |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
A cluster is defined as a software interface that allows running |
|
205
|
|
|
|
|
|
|
multiple programs on separate compute elements (nodes). |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
HPCI uses an HPCD (High Performance Computing Driver) module |
|
208
|
|
|
|
|
|
|
to translate its standard interface into the appropriate access |
|
209
|
|
|
|
|
|
|
mechanisms for the type of cluster that is selected. (If you have |
|
210
|
|
|
|
|
|
|
used the DBI/DBD modules for accessing databases, this will seem |
|
211
|
|
|
|
|
|
|
very familiar.) |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
The goal of this HPCI/HPCD split is to allow users to write |
|
214
|
|
|
|
|
|
|
programs that make use of cluster facilities in a portable manner. |
|
215
|
|
|
|
|
|
|
If there is a reason to run the same program using a different |
|
216
|
|
|
|
|
|
|
type of cluster, it should only require changing the cluster |
|
217
|
|
|
|
|
|
|
definition attributes provided to one parent object creation; the |
|
218
|
|
|
|
|
|
|
rest of the code need not know or care about the changed cluster type. |
|
219
|
|
|
|
|
|
|
Programs which are likely to be run on different cluster types will |
|
220
|
|
|
|
|
|
|
usually be written to get the cluster attribute information from |
|
221
|
|
|
|
|
|
|
a configuration file, or command line arguments - so the program |
|
222
|
|
|
|
|
|
|
itself need not change at all. |
|
223
|
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Running a program on different types of clusters can happen for a |
|
225
|
|
|
|
|
|
|
number of reasons. An organization might have access to multiple |
|
226
|
|
|
|
|
|
|
types of cluster, such as an in-house cluster plus an external cloud. |
|
227
|
|
|
|
|
|
|
Scholarly research often shares programs either to allow similar |
|
228
|
|
|
|
|
|
|
research, or to validate existing research results. |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
HPCD modules can provide cluster-specific extensions. That can |
|
231
|
|
|
|
|
|
|
either be a different kind of functionality, or it can be as simple |
|
232
|
|
|
|
|
|
|
as allowing the terminology familiar to users of that cluster type |
|
233
|
|
|
|
|
|
|
to be used in place of the generic terminology provided by HPCI. |
|
234
|
|
|
|
|
|
|
However, using such extensions makes it harder to move to a |
|
235
|
|
|
|
|
|
|
different cluster type. So, actually making use of such extensions |
|
236
|
|
|
|
|
|
|
must be considered carefully. |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=head1 The life cycle of a B<group> |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
A B<group> is the main mechanism for using HPCI. It is an object that |
|
241
|
|
|
|
|
|
|
manages a group of computation steps (called B<stage>s), distributing them |
|
242
|
|
|
|
|
|
|
across the cluster and keeping track of various housekeeping details like |
|
243
|
|
|
|
|
|
|
when each stage can be run, checking for the result of each completed stage |
|
244
|
|
|
|
|
|
|
run, deciding whether a failure should cause a stage to be retried or to |
|
245
|
|
|
|
|
|
|
prevent other stages from being executed, and collecting the status for each |
|
246
|
|
|
|
|
|
|
stage. |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
The life cycle of running a group of commands on a cluster is: |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=over 4 |
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item create group |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
A B<group> object is created using the HPCI "class method" B<group>. |
|
255
|
|
|
|
|
|
|
HPCI isn't really a class, it just appears to be one. Its B<group> |
|
256
|
|
|
|
|
|
|
"class method" actually delegates creation of a group object to |
|
257
|
|
|
|
|
|
|
the HPCD module that is indicated by the I<cluster> attribute |
|
258
|
|
|
|
|
|
|
and it returns a cluster-specific group object that supports the |
|
259
|
|
|
|
|
|
|
HPCI interface. |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=item create stages |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
A B<stage> is created for each command that is to be executed on a |
|
264
|
|
|
|
|
|
|
separate node of the cluster. This is created using the B<group> |
|
265
|
|
|
|
|
|
|
object's method B<stage>. |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=item define dependency ordering between the stages |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
An important reason for running a group of jobs on a cluster is the |
|
270
|
|
|
|
|
|
|
ability to use multiple computers to run portions of the computation |
|
271
|
|
|
|
|
|
|
at the same time, rather than having them compete for the resources |
|
272
|
|
|
|
|
|
|
of a single computer. However, often some stages will depend |
|
273
|
|
|
|
|
|
|
upon the output of other stages. Such a dependent stage cannot |
|
274
|
|
|
|
|
|
|
start executing until all pre-requisite stages have completed. |
|
275
|
|
|
|
|
|
|
Specifying such dependency requirements is done with the B<group> |
|
276
|
|
|
|
|
|
|
method B<add_deps>. |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=item execution |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Finally, the B<group> method B<execute> will run the entire set |
|
281
|
|
|
|
|
|
|
of stages. It does not return until all stages have completed (or |
|
282
|
|
|
|
|
|
|
have been skipped). Each stage will normally be run once, however |
|
283
|
|
|
|
|
|
|
it is possible for some stages to be retried under some |
|
284
|
|
|
|
|
|
|
failure conditions. |
|
285
|
|
|
|
|
|
|
A failure of one stage (after retry possibilities have been exhausted) |
|
286
|
|
|
|
|
|
|
can be a trigger for |
|
287
|
|
|
|
|
|
|
completely skipping the execution of other stages. Each separate |
|
288
|
|
|
|
|
|
|
execution of a stage (original or retry) is managed with an internal object |
|
289
|
|
|
|
|
|
|
called a job - but a user program won't see job objects directly. |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
As many stages as possible are run simultaneously. This is limited by |
|
292
|
|
|
|
|
|
|
the specified dependencies, by cluster-specific driver limits, and by |
|
293
|
|
|
|
|
|
|
user-specified limits on concurrent execution. |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=back |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
The objects that calling code deals with directly are a group object to |
|
298
|
|
|
|
|
|
|
manage a group of stages, and a stage object for each separately run job. |
|
299
|
|
|
|
|
|
|
Internally, there are also job objects for each retry of a stage, and a |
|
300
|
|
|
|
|
|
|
log object for logging the execution process (alternately, the user can |
|
301
|
|
|
|
|
|
|
provide their own Log4Perl compatible log object for HPCI to use - this may be |
|
302
|
|
|
|
|
|
|
of use if you wish to merge logging of multiple groups and/or of other |
|
303
|
|
|
|
|
|
|
processing within your program together in a single log). |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
There are also some facilities to provide local customization of the standard |
|
306
|
|
|
|
|
|
|
usage of HPCI (see "Local Customization" below). |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=head1 Output Tree Layout |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
There are a number of output files and directories created during a group execution. |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
The default layout of these is: |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
<base_dir> "." |
|
315
|
|
|
|
|
|
|
<group_dir> <base_dir>/<name>-<YYYYMMDD-hhmmss> |
|
316
|
|
|
|
|
|
|
<log> <group_dir>/<name>.log |
|
317
|
|
|
|
|
|
|
<stage_dir> <group_dir>/<stage_name> |
|
318
|
|
|
|
|
|
|
<script_file> <stage_dir>/script.sh |
|
319
|
|
|
|
|
|
|
<job_dir> <stage_dir>/<retry_number> |
|
320
|
|
|
|
|
|
|
stdout |
|
321
|
|
|
|
|
|
|
stderr |
|
322
|
|
|
|
|
|
|
final_retry symlink to final <job_dir> |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
Many of these files/directories can be re-assigned to different |
|
325
|
|
|
|
|
|
|
location using group or stage attributes - shown above is the |
|
326
|
|
|
|
|
|
|
default layout. Commonly, you will specifically use the I<base_dir> |
|
327
|
|
|
|
|
|
|
attribute to choose a location other than the current directory for |
|
328
|
|
|
|
|
|
|
placing the tree; or else use the I<group_dir> attribute if you want |
|
329
|
|
|
|
|
|
|
to choose a location that does not create a sub-directory for you. |
|
330
|
|
|
|
|
|
|
(If this is an already existing directory that is being re-used you |
|
331
|
|
|
|
|
|
|
may end up with a mixture of old and new contents that are hard to |
|
332
|
|
|
|
|
|
|
figure out.) |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=over 4 |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=item base_dir |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
The top level of all the generated output. It defaults to ".", |
|
339
|
|
|
|
|
|
|
but can be specified explicitly when the group is created with |
|
340
|
|
|
|
|
|
|
the attribute B<base_dir>. |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item group_dir |
|
343
|
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
By default, a new directory is created under B<base_dir>. Its name |
|
345
|
|
|
|
|
|
|
is I<name>-I<YYYYMMDD>-I<hhmmss> - the name of the group along with |
|
346
|
|
|
|
|
|
|
a timestamp of when the execution started. This can be over-ridden |
|
347
|
|
|
|
|
|
|
when the group is created by providing the group attribute B<group_dir>. |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
=item log |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
The automatically provided log is written to the file I<"group.log"> |
|
352
|
|
|
|
|
|
|
directly under I<group_dir>. This logs information about the |
|
353
|
|
|
|
|
|
|
execution of the entire group of stages. See B<Logging Attributes |
|
354
|
|
|
|
|
|
|
of group object> below for ways of changing the default setting. |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=item stage_dir |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Each stage creates a sub-directory beneath I<group_dir> with the |
|
359
|
|
|
|
|
|
|
same name as the stage. An alternate name can be used by providing |
|
360
|
|
|
|
|
|
|
the B<dir> attribute when the stage object is created. |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=item script_file |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
The script created to be executed on the cluster node. This wraps |
|
365
|
|
|
|
|
|
|
the specified command with additional logic to pass on environment |
|
366
|
|
|
|
|
|
|
and config info, and to set output redirection. It is called |
|
367
|
|
|
|
|
|
|
"script.sh" and placed in I<stage_dir>. |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=item job_dir |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
A sub-directory is created under I<stage_dir> for each attempt to |
|
372
|
|
|
|
|
|
|
run the command. Usually, there will only be a single attempt. |
|
373
|
|
|
|
|
|
|
However, if the cluster driver provides mechanisms for detecting |
|
374
|
|
|
|
|
|
|
recoverable issues and then retries a command there can be more |
|
375
|
|
|
|
|
|
|
than one attempt; or alternately, if a pre-requisite stage |
|
376
|
|
|
|
|
|
|
fails there might be no attempt made (in that case, though, |
|
377
|
|
|
|
|
|
|
the entire I<stage_dir> directory would not even get created). |
|
378
|
|
|
|
|
|
|
These directories are simply named with the retry number ("0", |
|
379
|
|
|
|
|
|
|
"1", ...). |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=item stdout/stderr |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
Within each I<job_dir>, the files "stdout" and "stderr" collect |
|
384
|
|
|
|
|
|
|
the standard output and standard error output from that (re)try |
|
385
|
|
|
|
|
|
|
attempt to run the command. |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=item final_retry |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
A symlink named "final_retry" is created within I<stage_dir> that |
|
390
|
|
|
|
|
|
|
points to the I<job_dir> of the final (re)try. Since you often |
|
391
|
|
|
|
|
|
|
don't care as much about the initial run tries as you do about the |
|
392
|
|
|
|
|
|
|
last one, this symlink provides a consistent access path to that |
|
393
|
|
|
|
|
|
|
final retry. |
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=back |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=head1 HPCI "Class" Methods |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
You can pretend that B<HPCI> is a class with one primary class |
|
400
|
|
|
|
|
|
|
method named B<group>. |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
There are a few other class methods used for localization purposes; they |
|
403
|
|
|
|
|
|
|
are described below in "Local Customization". |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head2 B<group> method |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
The B<group> method creates and returns a group object, which |
|
408
|
|
|
|
|
|
|
you can treat like a B<HPCI::Group> object. (In fact, it really |
|
409
|
|
|
|
|
|
|
returns an object of class B<HPCD::I<cluster>::Group>, but if you |
|
410
|
|
|
|
|
|
|
ignore that fact then you can trivially have your program run on |
|
411
|
|
|
|
|
|
|
some other cluster type.) |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=head2 B<group> object |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
The description of attributes and methods for the B<group> object given here describe |
|
416
|
|
|
|
|
|
|
the generic attributes and how they are treated for all cluster types. |
|
417
|
|
|
|
|
|
|
Individual cluster drivers can modify this behaviour and can provide |
|
418
|
|
|
|
|
|
|
additional attributes and methods for cluster-specific purposes. |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head3 Cluster-Related Attributes of B<group> object |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
The one necessary attribute is B<cluster>. For some specific |
|
423
|
|
|
|
|
|
|
cluster types there may be additional attributes required for |
|
424
|
|
|
|
|
|
|
connecting to the cluster software (authentication, usage |
|
425
|
|
|
|
|
|
|
class info, etc.). |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=over 4 |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item cluster |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
The B<cluster> attribute specifies which type of cluster is to be used. |
|
432
|
|
|
|
|
|
|
This is the only required attribute. (Some cluster types may have |
|
433
|
|
|
|
|
|
|
additional attributes that are required for specifying connection |
|
434
|
|
|
|
|
|
|
and authentication info.) |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item cluster_specific |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
The attribute B<cluster_specific> is optional. If provided, it should |
|
439
|
|
|
|
|
|
|
contain a hashref of hashrefs. If the value specified for the I<cluster> |
|
440
|
|
|
|
|
|
|
attribute is present as a key in the B<cluster_specific> |
|
441
|
|
|
|
|
|
|
hash, the corresponding value will be used as a set of attribute values |
|
442
|
|
|
|
|
|
|
when the group is created. Its elements will replace or augment any values |
|
443
|
|
|
|
|
|
|
for the same attribute name provided to the group method. This will normally be |
|
444
|
|
|
|
|
|
|
used if the program can be dynamically configured for different cluster |
|
445
|
|
|
|
|
|
|
types, and there are different arg settings required for the different |
|
446
|
|
|
|
|
|
|
types of cluster. |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=back |
|
449
|
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head3 Basic Attributes of B<group> object |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=over 4 |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=item name |
|
455
|
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
The B<name> you give to a group is used for creating the directory |
|
457
|
|
|
|
|
|
|
where output is stored, and also in log messages. A default name |
|
458
|
|
|
|
|
|
|
"default_group_name" is provided if you do not specify an explicit |
|
459
|
|
|
|
|
|
|
name. Using the default name is adequate in simple programs which |
|
460
|
|
|
|
|
|
|
only create one group, but for more complicated programs giving |
|
461
|
|
|
|
|
|
|
separate names to each group is necessary to easily identify the |
|
462
|
|
|
|
|
|
|
output of each group. The value of B<name> may also be used by |
|
463
|
|
|
|
|
|
|
the cluster-specific driver to provide an identifier name (or the |
|
464
|
|
|
|
|
|
|
basis of one) to the underlying cluster, if it needs one. |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=item stage_defaults |
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
The attribute B<stage_defaults> is optional. If provided, it should |
|
469
|
|
|
|
|
|
|
contain a hashref. This hash will be used as default values for |
|
470
|
|
|
|
|
|
|
every stage created by this group. |
|
471
|
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=back |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=head3 Directory Layout Attributes of B<group> object |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
=over 4 |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=item base_dir |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
If none of the other directory layout attributes are used to |
|
481
|
|
|
|
|
|
|
over-ride this, this attribute specifies the directory in which |
|
482
|
|
|
|
|
|
|
all output directories and files will be created. This is |
|
483
|
|
|
|
|
|
|
usually an existing directory; it defaults to the current |
|
484
|
|
|
|
|
|
|
directory ".". |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=item group_dir |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
This directory is usually created to contain the outputs of the |
|
489
|
|
|
|
|
|
|
group execution. By default, it is directly under B<base_dir> with |
|
490
|
|
|
|
|
|
|
a name that consists of the group name attribute and a timestamp |
|
491
|
|
|
|
|
|
|
(e.g. "T_Definition-20150521-153256"). |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
If you provide an explicit value for this parameter, then it |
|
494
|
|
|
|
|
|
|
should not be an existing directory containing previous results. |
|
495
|
|
|
|
|
|
|
(If it is, the log file will be appended to the previous one, but |
|
496
|
|
|
|
|
|
|
the stage directories will over-write equivalently named directories |
|
497
|
|
|
|
|
|
|
and files that are created in this run, while leaving unchanged any |
|
498
|
|
|
|
|
|
|
that did not recur, so you'll have a mix of old and new contents.) |
|
499
|
|
|
|
|
|
|
The names of files and directories created under B<group_dir> are |
|
500
|
|
|
|
|
|
|
chosen to be consistent and easy to find automatically. |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=back |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head3 Logging Attributes of B<group> object |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
An HPCI group logs its activities using a Log::Log4perl logger. |
|
507
|
|
|
|
|
|
|
The logger can either be provided by the caller, or else HPCI will |
|
508
|
|
|
|
|
|
|
create its own. |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=over 4 |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=item log |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
This is a Log::Log4perl::Logger object. If it is provided as an |
|
515
|
|
|
|
|
|
|
attribute to the B<group> creation call, it will be used as it is, |
|
516
|
|
|
|
|
|
|
and the other logging attributes will be ignored. |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
If it is not provided by the user, a new Log::Log4perl::Logger |
|
519
|
|
|
|
|
|
|
object will be created using the attributes below to define where |
|
520
|
|
|
|
|
|
|
it is logged to. This created logger will send all log entries to |
|
521
|
|
|
|
|
|
|
a file, as well as sending all info and higher log entries to stderr. |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item log_path |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
If this attribute is provided (and the B<log> attribute is not |
|
526
|
|
|
|
|
|
|
provided) it will be used as the full pathname of a file where the |
|
527
|
|
|
|
|
|
|
log will be written. If it is not provided, it will use the path |
|
528
|
|
|
|
|
|
|
B<log_dir>/B<log_file> by default. |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=item log_dir |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
If neither B<log> nor B<log_path> is provided, this attribute can |
|
533
|
|
|
|
|
|
|
be used to specify the directory where the log file is to be written. |
|
534
|
|
|
|
|
|
|
By default, it uses B<group_dir>. |
|
535
|
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=item log_file |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
If neither B<log> nor B<log_path> is provided, this attribute can |
|
539
|
|
|
|
|
|
|
be used to specify the file name to be written in the log directory. |
|
540
|
|
|
|
|
|
|
By default, it uses the constant name "group.log". |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=item log_level |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
You can provide this attribute to change the default log level setting from "info" to any of I<debug info warn error fatal>. |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
=item log_no_stderr, log_no_file |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
Normally, the default log is written to both stderr and to the log file. |
|
549
|
|
|
|
|
|
|
Either of those can be suppressed by setting the corresponding attribute to a true value. |
|
550
|
|
|
|
|
|
|
These attributes have no effect if the user provides their own logger instead of using the default one. |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=back |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head3 Operational Attributes of B<group> object |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=over 4 |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=item max_concurrent |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
This attribute specifies the maximum number of stages that will |
|
561
|
|
|
|
|
|
|
be executing at one time. The default setting of 0 allows as |
|
562
|
|
|
|
|
|
|
many stages as possible (all those that are not waiting for a |
|
563
|
|
|
|
|
|
|
pre-requisite stage to complete) to run at the same time. |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
=item status |
|
566
|
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
This attribute is set internally while stages are executed. |
|
568
|
|
|
|
|
|
|
It contains the final result status from each stage run that |
|
569
|
|
|
|
|
|
|
has completed. The B<execute> method returns this value when |
|
570
|
|
|
|
|
|
|
execution completes, so you will usually not need to access it |
|
571
|
|
|
|
|
|
|
explicitly yourself. |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
This value is a hashref (indexed by stage name). The values are |
|
574
|
|
|
|
|
|
|
arrayrefs (indexed by run number 0..n). For each run, there is |
|
575
|
|
|
|
|
|
|
a hash. The key B<exit_status> contains the exit status of the run. |
|
576
|
|
|
|
|
|
|
If the stage was never run, B<exit_status> instead contains a text |
|
577
|
|
|
|
|
|
|
message listing the reason that it was skipped. |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=back |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head3 Environment Passing Attributes of B<group> object |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
You can set up a set of environment variables that will be provided to |
|
584
|
|
|
|
|
|
|
all stages. (You can also set variables that are only for individual |
|
585
|
|
|
|
|
|
|
stages - if so, they will modify any set you provide in the group.) |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
See B<HPCI::Env> for a description of these. |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=head3 Method B<stage> of B<group> object |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
The method B<stage> is used to create a new stage object. |
|
592
|
|
|
|
|
|
|
Its characteristics are described below. |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
The B<group> object keeps track of all B<stage> objects created |
|
595
|
|
|
|
|
|
|
within that group so that they can all be managed properly when the |
|
596
|
|
|
|
|
|
|
B<execute> method is invoked. |
|
597
|
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
=head3 Method B<add_deps> of B<group> object |
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
The method B<add_deps> is used to specify pre-requisite/dependent |
|
601
|
|
|
|
|
|
|
relationships. It takes either a hashref or a list containing |
|
602
|
|
|
|
|
|
|
pairs. One of the keys must be either B<pre_req> or B<pre_reqs>, |
|
603
|
|
|
|
|
|
|
another must be either B<dep> or B<deps>. |
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
The value for each of these keys can be either a scalar, or an arrayref |
|
606
|
|
|
|
|
|
|
of scalar values. A scalar value can be either a B<stage> object (a reference), |
|
607
|
|
|
|
|
|
|
the exact name of a stage object (a string), or a pattern that matches |
|
608
|
|
|
|
|
|
|
the name of zero or more stages (a regexp). |
|
609
|
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
HPCI will ensure that the stage or all of the stages specified for pre_req |
|
611
|
|
|
|
|
|
|
or pre_reqs have completed execution before any of the dep (or deps) stages |
|
612
|
|
|
|
|
|
|
is allowed to start executing. |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
The plural forms are provided for convenience - often the output |
|
615
|
|
|
|
|
|
|
file from one preparation stage is required by many others, or the |
|
616
|
|
|
|
|
|
|
output from many processing stages is needed by a stage that merges |
|
617
|
|
|
|
|
|
|
results into a summary report. Rather than having to loop over the |
|
618
|
|
|
|
|
|
|
pre_reqs and deps and calling B<add_deps> individually for every |
|
619
|
|
|
|
|
|
|
individual dependency, a single call will handle the entire combination. |
|
620
|
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
Allowing a regexp to match no stages at all makes it possible to write |
|
622
|
|
|
|
|
|
|
an add_deps call for stages that are optional - no dependency will be |
|
623
|
|
|
|
|
|
|
added if the optional stage was not created this run. |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
While it is recommended for code readability that you use the singular |
|
626
|
|
|
|
|
|
|
form (B<dep> or B<pre_req>) if you are providing a single stage, and the |
|
627
|
|
|
|
|
|
|
plural form (B<deps> or B<pre_reqs>) if you are providing a list of |
|
628
|
|
|
|
|
|
|
stages, either can be used. |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
The B<add_deps> method can be called multiple times. HPCI will |
|
631
|
|
|
|
|
|
|
accumulate the dependencies appropriately. |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
It is an error to provide a sequence of dependencies that form |
|
634
|
|
|
|
|
|
|
a cycle in which a stage directly or indirectly has itself as a |
|
635
|
|
|
|
|
|
|
pre-requisite. (Such a stage could never run. HPCI will detect |
|
636
|
|
|
|
|
|
|
when all remaining stages are blocked by pre-requisites and abort, |
|
637
|
|
|
|
|
|
|
but that might be after numerous stages have already been executed.) |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=head3 Method B<execute> of B<group> object |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
The B<execute> method is the final goal of building the group. |
|
642
|
|
|
|
|
|
|
It schedules the execution of individual stages. It waits for |
|
643
|
|
|
|
|
|
|
pre-requisites before running a stage. It provides for re-running |
|
644
|
|
|
|
|
|
|
a stage if a soft failure has occurred that allows a retry. If a |
|
645
|
|
|
|
|
|
|
failure that cannot be retried occurs, it can skip scheduling dependent |
|
646
|
|
|
|
|
|
|
stages, or even stop scheduling all new stages. |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=head2 Stage Object |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head3 Attributes |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=over 4 |
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=item name |
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
A unique B<name> attribute must be provided for stages. It is a string. |
|
657
|
|
|
|
|
|
|
There is no default value provided. |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
=item command |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
The B<command> attribute must be provided before the group is |
|
662
|
|
|
|
|
|
|
executed. It can either be provided as a string attribute when the |
|
663
|
|
|
|
|
|
|
stage is created, or by using one of |
|
664
|
|
|
|
|
|
|
the command-setting methods provided by the stage class. |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
See B<HPCI::Stage> for more details about the command setting |
|
667
|
|
|
|
|
|
|
methods. |
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=item dir |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
The B<dir> attribute is optional. It specifies the directory |
|
672
|
|
|
|
|
|
|
in which files related to the stage are placed. By default, |
|
673
|
|
|
|
|
|
|
it is I<group_dir>/I<stage_name>. You will usually not need to |
|
674
|
|
|
|
|
|
|
change this. |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item cluster |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
The B<cluster> attribute is automatically passed on from the B<group> |
|
679
|
|
|
|
|
|
|
to each B<stage>. You are not likely to need this. |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=item group |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
The B<group> that created a stage is automatically passed on (as a weak |
|
684
|
|
|
|
|
|
|
reference) to the stage. You are not likely to need to use this attribute |
|
685
|
|
|
|
|
|
|
in user code. |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=item resources_required |
|
688
|
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=item retry_resources_required |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
The B<resources_required> and B<retry_resources_required> are used to |
|
692
|
|
|
|
|
|
|
define resources that will be required by the stage when it executes. |
|
693
|
|
|
|
|
|
|
These attributes are somewhat cluster specific - each cluster has |
|
694
|
|
|
|
|
|
|
its own set of requirements for how a job submission must specify |
|
695
|
|
|
|
|
|
|
the sort of resources that it will require. |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
The B<resources_required> attribute is a hash, specifying the |
|
698
|
|
|
|
|
|
|
value for each resource that is to be considered. |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
The B<retry_resources_required> attribute is also a hash. For |
|
701
|
|
|
|
|
|
|
each resource, you can specify an array of values. If the cluster |
|
702
|
|
|
|
|
|
|
driver is able to detect that a run failed because the resource |
|
703
|
|
|
|
|
|
|
was inadequate, it will retry the run with the next larger value |
|
704
|
|
|
|
|
|
|
from this list. |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
See B<HPCI::Stage> for more details about resources. |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=item force_retries |
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
This attribute specifies an integer number of times to retry the |
|
711
|
|
|
|
|
|
|
stage before concluding that it has actually failed. You might use |
|
712
|
|
|
|
|
|
|
this if your cluster has some nodes that work differently from |
|
713
|
|
|
|
|
|
|
others and a stage might fail on one type of node but succeed on |
|
714
|
|
|
|
|
|
|
another. |
|
715
|
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
These retries are done after any cluster-specific retry mechanisms |
|
717
|
|
|
|
|
|
|
have been used. |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
The default value for this attribute is 0 (zero), giving no forced |
|
720
|
|
|
|
|
|
|
retries unless you specifically ask for them. |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
=item failure_action ('abort_group', 'abort_deps'*, or 'ignore') |
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
Specifies the action to take if this stage fails (terminates with |
|
725
|
|
|
|
|
|
|
a non-zero status). |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
There are three string values that it can have: |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
=over 4 |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item - abort_deps (default) |
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
If the stage fails, then any stages which depend upon it |
|
734
|
|
|
|
|
|
|
(recursively) are not run. The group continues executing until |
|
735
|
|
|
|
|
|
|
all stages which are not dependent upon this stage (including those |
|
736
|
|
|
|
|
|
|
that have not yet been initiated) complete execution. |
|
737
|
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
=item - abort_group |
|
739
|
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
If the stage fails, then no other stages are started. The group |
|
741
|
|
|
|
|
|
|
simply waits until stages that have already been started complete |
|
742
|
|
|
|
|
|
|
and then returns. |
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=item - ignore |
|
745
|
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
Execution continues unchanged, any dependent stages will be run when they are |
|
747
|
|
|
|
|
|
|
no longer blocked. |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=back |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
=item abort_group_on_failure abort_deps_on_failure ignore_failure |
|
752
|
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
As an alternative to providing a value to the failure_action attribute |
|
754
|
|
|
|
|
|
|
when you create a stage, you can instead provide one of the pseudo-attributes |
|
755
|
|
|
|
|
|
|
'abort_group_on_failure', 'abort_deps_on_failure', or 'ignore_failure' with |
|
756
|
|
|
|
|
|
|
a true value to specify 'abort_group', 'abort_deps', or 'ignore' respectively. |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=item state |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
The B<state> is mostly an internal attribute but after the group has |
|
761
|
|
|
|
|
|
|
finished execution you can use this to check whether the stage was |
|
762
|
|
|
|
|
|
|
run successfully. After execution, B<state> will either be 'pass' or |
|
763
|
|
|
|
|
|
|
'fail'. |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item Environment passing attributes |
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
You can set up a set of environment variables that will be provided to |
|
768
|
|
|
|
|
|
|
this stage. It will use the set defined for the group as a basis (if such a set was |
|
769
|
|
|
|
|
|
|
defined for the group), but that set can be changed for individual stages |
|
770
|
|
|
|
|
|
|
or you can have no group default and only provide a set to specific stages |
|
771
|
|
|
|
|
|
|
as needed. See B<HPCI::Env> for further details. |
|
772
|
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=back |
|
774
|
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=head3 Methods |
|
776
|
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=head4 command creation |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
There are a number of helper methods to assist in building different |
|
780
|
|
|
|
|
|
|
types of commands to be provided for the B<command> attribute. |
|
781
|
|
|
|
|
|
|
See B<HPCI::Stage> for details. |
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=head1 Local Configuration |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
TODO: write this section |
|
786
|
|
|
|
|
|
|
- describe the HPCI::LocalConfig module |
|
787
|
|
|
|
|
|
|
- describe the mechanism for adding extra roles to group, stage, etc. |
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
=head1 Additional |
|
790
|
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
This is an early public release of HPCI, and at present, there are |
|
792
|
|
|
|
|
|
|
only two drivers available. |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Only one cluster type is directly included within the HPCI package. |
|
795
|
|
|
|
|
|
|
The cluster type B<HPCD::uni> runs on a "cluster" of only one |
|
796
|
|
|
|
|
|
|
machine. It simply uses fork to submit individual stages and has |
|
797
|
|
|
|
|
|
|
facility for retries and timeouts. This is the default cluster |
|
798
|
|
|
|
|
|
|
type used for testing, as it will work natively on all types of |
|
799
|
|
|
|
|
|
|
Unix systems. It is also possible to use this driver as a fallback, |
|
800
|
|
|
|
|
|
|
in cases where the only available "real" cluster is not accessible |
|
801
|
|
|
|
|
|
|
for some reason. |
|
802
|
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
Additionally, there is the B<HPCD::SGE> driver available on CPAN. |
|
804
|
|
|
|
|
|
|
It has seen heavy use within Boutros Lab. |
|
805
|
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
Now that these packages have been released, it is likely new |
|
807
|
|
|
|
|
|
|
cluster drivers will be written. People interested in developing |
|
808
|
|
|
|
|
|
|
drivers for additional cluster types should contact the authors |
|
809
|
|
|
|
|
|
|
of this package to co-ordinate releases, features needed, etc. at |
|
810
|
|
|
|
|
|
|
B<mailto:BoutrosLabSoftware@oicr.on.ca>. |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
Additionally, you may wish to subscribe to the email list mentioned |
|
813
|
|
|
|
|
|
|
at B<https://lists.oicr.on.ca/mailman/listinfo/hpci-discuss>. |
|
814
|
|
|
|
|
|
|
This is expected to be a low volume discussion group, although the |
|
815
|
|
|
|
|
|
|
future will tell what the actual volume will be. |
|
816
|
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
As additional capabilities of new cluster types are addressed, and as |
|
818
|
|
|
|
|
|
|
different control needs used at other organizations are identified, |
|
819
|
|
|
|
|
|
|
this interface will surely change. As far as possible, such changes |
|
820
|
|
|
|
|
|
|
will be done in an upwardly compatible manner, but until a few more |
|
821
|
|
|
|
|
|
|
drivers have been integrated there is the possibility of changes |
|
822
|
|
|
|
|
|
|
that are not fully backward compatible. Watch the release notes |
|
823
|
|
|
|
|
|
|
for warnings of such issues. At some point there will be a 1.0.0 |
|
824
|
|
|
|
|
|
|
release, at which point this expectation of (limited) incompatible |
|
825
|
|
|
|
|
|
|
future change will be dropped. After that point, incompatible |
|
826
|
|
|
|
|
|
|
changes will only be made for critical reasons. |
|
827
|
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
The reasons for separate distribution of cluster-specific HPCD |
|
829
|
|
|
|
|
|
|
packages are fairly obvious: |
|
830
|
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=over 4 |
|
832
|
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
=item - |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
The maintainers of the HPCI package do not have access to every |
|
836
|
|
|
|
|
|
|
possible cluster type, and it is unlikely that anyone will have access |
|
837
|
|
|
|
|
|
|
to all supported cluster types from one location, so the driver |
|
838
|
|
|
|
|
|
|
modules will need to be tested separately anyhow. |
|
839
|
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=item - |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
A user of HPCI is equally not going to have need to access every |
|
843
|
|
|
|
|
|
|
type of cluster that exists, so they will probably prefer to only |
|
844
|
|
|
|
|
|
|
download the driver modules that they actually need. |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=back |
|
847
|
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
849
|
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
=over 4 |
|
851
|
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=item HPCI::Group |
|
853
|
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
Describes the interface common to all B<HPCI Group> |
|
855
|
|
|
|
|
|
|
objects, regardless of the particular type of cluster that |
|
856
|
|
|
|
|
|
|
is actually being used to run the stages. In the future, the |
|
857
|
|
|
|
|
|
|
common interface may change somewhat as support for additional |
|
858
|
|
|
|
|
|
|
cluster types is added and a better understanding of the common |
|
859
|
|
|
|
|
|
|
features is achieved. |
|
860
|
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
=item HPCI::Stage |
|
862
|
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
Describes the interface common to stage object returned |
|
864
|
|
|
|
|
|
|
by all B<HPCI Stage> objects, regardless of the |
|
865
|
|
|
|
|
|
|
particular type of cluster that is actually being used to |
|
866
|
|
|
|
|
|
|
run the stages. The common interface may change somewhat |
|
867
|
|
|
|
|
|
|
as support for additional cluster types is added and a better |
|
868
|
|
|
|
|
|
|
understanding of the common features is achieved. |
|
869
|
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
=item HPCI::Logger |
|
871
|
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
Describes the logger parameters in more detail. |
|
873
|
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=item HPCI::Env |
|
875
|
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
Describes the environment passing parameters in more detail. |
|
877
|
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=item HPCD::I<$cluster>::Group |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
Describes the group interface unique to a specific type of cluster, |
|
881
|
|
|
|
|
|
|
including any limitations or extensions to the generic interface. |
|
882
|
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
=item HPCD::I<$cluster>::Stage |
|
884
|
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
Describes the stage interface unique to a specific type of cluster, |
|
886
|
|
|
|
|
|
|
including any limitations or extensions to the generic interface. |
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
=back |
|
889
|
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
=head1 AUTHOR |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
Christopher Lalansingh - Boutros Lab |
|
893
|
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
John Macdonald - Boutros Lab |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
Paul Boutros, PhD, PI - Boutros Lab |
|
899
|
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
The Ontario Institute for Cancer Research |
|
901
|
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
=cut |
|
903
|
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
1; |
|
905
|
|
|
|
|
|
|
|