line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package HPCI; |
2
|
|
|
|
|
|
|
### HPCI.pm ################################################################### |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
### INCLUDES ################################################################## |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# safe Perl |
7
|
18
|
|
|
18
|
|
4168585
|
use warnings; |
|
18
|
|
|
|
|
49
|
|
|
18
|
|
|
|
|
717
|
|
8
|
18
|
|
|
18
|
|
108
|
use strict; |
|
18
|
|
|
|
|
45
|
|
|
18
|
|
|
|
|
443
|
|
9
|
18
|
|
|
18
|
|
132
|
use Carp; |
|
18
|
|
|
|
|
49
|
|
|
18
|
|
|
|
|
1027
|
|
10
|
18
|
|
|
18
|
|
8603
|
use Module::Load; |
|
18
|
|
|
|
|
18182
|
|
|
18
|
|
|
|
|
115
|
|
11
|
18
|
|
|
18
|
|
10299
|
use Module::Load::Conditional qw(can_load); |
|
18
|
|
|
|
|
398939
|
|
|
18
|
|
|
|
|
1299
|
|
12
|
18
|
|
|
18
|
|
22511
|
use List::MoreUtils qw(uniq); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @extra_roles; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
sub add_extra_role {
    # Invoked as a class method:
    #     HPCI->add_extra_role( $cluster, $level, $role );
    # The leading class name is discarded; the remaining arguments are
    # stored together as one entry at the end of @extra_roles.
    my ( $class, @role_spec ) = @_;
    push @extra_roles, \@role_spec;
}
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub get_extra_roles {
    # Return the roles registered in @extra_roles that apply to the given
    # cluster and level.  An entry applies when its cluster is the literal
    # string 'ALL' or equals $target_cluster, and its level equals
    # $target_level.  A role stored as a reference is expanded as an array;
    # anything else is returned as a single item.
    my ( $target_cluster, $target_level ) = @_;

    my @matching;
    foreach my $entry (@extra_roles) {
        my ( $cluster, $level, $roles ) = @{$entry};

        if ( $cluster ne 'ALL' && $cluster ne $target_cluster ) {
            next;
        }
        if ( $level ne $target_level ) {
            next;
        }

        if ( ref $roles ) {
            push @matching, @{$roles};
        }
        else {
            push @matching, $roles;
        }
    }

    return @matching;
}
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $default_attrs = {}; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
sub add_default_attrs {
    # Invoked as a class method with either a single hashref or a flat
    # key/value list.  The attributes are merged (recursively, through
    # _merge_hash) into the file-level $default_attrs.
    my ( $class, @attrs ) = @_;

    my $incoming = ( ref( $attrs[0] ) eq 'HASH' ) ? $attrs[0] : {@attrs};

    _merge_hash( $default_attrs, $incoming );
}
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub _merge_hash {
    # Recursively merge the hash %$new into the hash %$target, in place.
    #
    #   $target - hashref that receives the merged values
    #   $new    - hashref supplying the values to merge in
    #   $path   - arrayref of the keys descended through so far; only used
    #             to build the error message (defaults to an empty list)
    #
    # For each key of %$new: when either the new value or the existing
    # target value is a hash, the two are merged recursively; otherwise the
    # new value replaces whatever the target held.  Croaks if, at any
    # level, one side is a hash and the other is not.
    my ( $target, $new, $path ) = @_;
    $path ||= [];

    croak "not a hash when merging attribute hash{" . join( '}{', @$path ) . "}"
        unless ref($target) eq 'HASH' && ref($new) eq 'HASH';

    # Iterate over a snapshot of the keys rather than using each(): a croak
    # raised by the recursive call would otherwise abandon the hash's
    # internal iterator part-way through, and the next merge from the same
    # source hash would silently skip the keys already visited.
    for my $k ( keys %$new ) {
        my $v = $new->{$k};

        if ( ref($v) eq 'HASH'
            || ( exists $target->{$k} && ref( $target->{$k} ) eq 'HASH' ) )
        {
            # Create the target sub-hash on demand, then descend.
            $target->{$k} //= {};
            _merge_hash( $target->{$k}, $v, [ @$path, $k ] );
        }
        else {
            $target->{$k} = $v;
        }
    }

    return;
}
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
sub explist {
    # Flatten one level of the argument list: array references are
    # replaced by their elements, undefined arguments are dropped, and
    # every other defined argument is passed through unchanged.
    my @flat;

    for my $item (@_) {
        next unless defined $item;

        if ( ref($item) eq 'ARRAY' ) {
            push @flat, @{$item};
        }
        else {
            push @flat, $item;
        }
    }

    return @flat;
}
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# get the env_keys, in original order, but use *LAST* instance |
70
|
|
|
|
|
|
|
# that retains the order specified in either default or args |
71
|
|
|
|
|
|
|
# but lets the relative order in args take precedence for keys |
72
|
|
|
|
|
|
|
# that are in both |
73
|
|
|
|
|
|
|
# |
74
|
|
|
|
|
|
|
# So, the order is: |
75
|
|
|
|
|
|
|
# [ keys that are only in default in the order they were specified in default ] |
76
|
|
|
|
|
|
|
# [ then keys that are in args in the order they were specified in args ] |
77
|
|
|
|
|
|
|
# No complaint is made if the same key is specified twice in either default |
78
|
|
|
|
|
|
|
# or args, the earlier one(s) are simply ignored. |
79
|
|
|
|
|
|
|
sub keylist {
    # Collect the env_keys lists from every hashref argument, in argument
    # order.  The env_keys entry is deleted from each hash as it is read;
    # a hash with no such entry contributes nothing.
    my @collected = map { @{ delete( $_->{env_keys} ) // [] } } @_;

    # Keep only the LAST occurrence of each key while preserving relative
    # order: reverse the list, keep first occurrences, reverse back.
    my @last_wins = uniq( reverse @collected );

    return ( reverse @last_wins );
}
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub group {
    # Build and return a cluster-specific group object.
    #
    # Invoked as HPCI->group(...) with either a single hashref or a flat
    # key/value list of attributes.  The effective attribute hash is built
    # up in this order (later merges override earlier ones):
    #   1. a copy of the defaults registered in $default_attrs
    #   2. env_key_specific sections selected by the env_keys list
    #   3. cluster_specific sections for the chosen cluster
    #   4. the caller's own arguments
    # Finally the driver module HPCD::<cluster>::Group is loaded and its
    # constructor is called with the merged hash.
    #
    # Note that env_keys and cluster_specific are deleted from the caller's
    # argument hash while it is being processed.
    my $pkg = shift;

    my $args;
    if ( scalar(@_) == 1 && ref( $_[0] ) eq 'HASH' ) {
        $args = shift;
    }
    elsif ( scalar(@_) % 2 == 0 ) {
        $args = {@_};
    }
    else {
        croak("HPCI->group() requires a hashref or a hash in list form");
    }

    # Start from a copy of the default attributes.
    my $use_args = {};
    _merge_hash( $use_args, $default_attrs );

    # Pull out the env_keys (if any), plus every env_key_specific table
    # that is present in the defaults or in the caller's arguments.
    my @keys = keylist( $use_args, $args );

    my @key_specific;
    for my $source ( $use_args, $args ) {
        my $per_key = $source->{env_key_specific};
        push @key_specific, $per_key if defined $per_key;
    }

    # Merge in the section for each selected env key that has one.
    for my $key (@keys) {
        for my $per_key (@key_specific) {
            my $overrides = $per_key->{$key} or next;
            _merge_hash( $use_args, $overrides );
        }
    }

    # The caller's cluster setting wins over one found in the defaults.
    my $cluster = $args->{cluster};
    $cluster //= $use_args->{cluster};
    croak("HPCI->group() requires a cluster key in the argument hash")
        unless defined $cluster;

    # Remove cluster_specific from both hashes and merge in the section
    # (if any) that belongs to the chosen cluster.
    for my $source ( $use_args, $args ) {
        my $by_cluster = delete $source->{cluster_specific};
        next unless $by_cluster;
        _merge_hash( $use_args, $by_cluster->{$cluster} // {} );
    }

    # The caller's explicit arguments are merged last.
    _merge_hash( $use_args, $args );

    my $clmod = "HPCD::${cluster}::Group";
    load($clmod);
    return $clmod->new($use_args);
}
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub _trigger_mkdir {
    # $self must provide an info() logging method; $dir must provide a
    # mkpath() method (the original comment says it is a Path::Class::Dir
    # object).  One info message is logged for every value that mkpath
    # returns.
    my ( $self, $dir ) = @_;

    for my $created ( $dir->mkpath ) {
        $self->info("Created directory: $created");
    }
}
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 NAME |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
HPCI |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 VERSION |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Version 0.53 |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=cut |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
our $VERSION = '0.53';

# Try to load the optional site customization module HPCI::LocalConfig
# and record whether that succeeded.
our $LocalConfigFound = can_load( modules => { 'HPCI::LocalConfig' => undef } );

# A module that simply is not installed is not reported; any other load
# error is printed to STDERR.
unless ($LocalConfigFound) {
    my $load_error = $Module::Load::Conditional::ERROR;
    if ( defined $load_error
        && $load_error !~ /^Could not find or check module / )
    {
        print STDERR "Conditional load of HPCI::LocalConfig failed. Error is:\n";
        print STDERR "$load_error\n";
    }
}
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head1 SYNOPSIS |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
use HPCI; |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
my $group = HPCI->group( |
161
|
|
|
|
|
|
|
cluster => ($ENV{HPCI_CLUSTER} // 'uni'), |
162
|
|
|
|
|
|
|
... |
163
|
|
|
|
|
|
|
); |
164
|
|
|
|
|
|
|
$group->stage( |
165
|
|
|
|
|
|
|
name => 'analysis_A', |
166
|
|
|
|
|
|
|
command => '...' |
167
|
|
|
|
|
|
|
); |
168
|
|
|
|
|
|
|
$group->stage( |
169
|
|
|
|
|
|
|
name => 'analysis_B', |
170
|
|
|
|
|
|
|
command => '...' |
171
|
|
|
|
|
|
|
); |
172
|
|
|
|
|
|
|
$group->stage( |
173
|
|
|
|
|
|
|
name => 'analysis_C', |
174
|
|
|
|
|
|
|
command => '...' |
175
|
|
|
|
|
|
|
); |
176
|
|
|
|
|
|
|
$group->stage( |
177
|
|
|
|
|
|
|
name => 'report', |
178
|
|
|
|
|
|
|
command => '...' |
179
|
|
|
|
|
|
|
); |
180
|
|
|
|
|
|
|
$group->add_deps( |
181
|
|
|
|
|
|
|
pre_reqs => [ qw(analysis_A analysis_B analysis_C) ], |
182
|
|
|
|
|
|
|
dep => 'report' |
183
|
|
|
|
|
|
|
); |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
my $status_info = $group->execute; |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
my $exit_status = 0; |
188
|
|
|
|
|
|
|
for my $stage ( qw(analysis_A analysis_B analysis_C report) ) { |
189
|
|
|
|
|
|
|
if (my $stat = $status_info->{$stage}[-1]{exit_status}) { |
190
|
|
|
|
|
|
|
$exit_status ||= $stat; |
191
|
|
|
|
|
|
|
print STDERR "Stage $stage failed, status $stat!\n"; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
exit($exit_status); # zero only if all stages completed without error |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
=head1 OVERVIEW |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
HPCI (High Performance Computing Interface) provides an interface to |
200
|
|
|
|
|
|
|
a range of types of computer aggregations (clusters, clouds, ...). |
201
|
|
|
|
|
|
|
(The rest of this document will use I<cluster> henceforth to refer |
202
|
|
|
|
|
|
|
to any type of aggregation that is supported by HPCI.) |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
A cluster is defined as a software interface that allows running |
205
|
|
|
|
|
|
|
multiple programs on separate compute elements (nodes). |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
HPCI uses an HPCD (High Performance Computing Driver) module |
208
|
|
|
|
|
|
|
to translate its standard interface into the appropriate access |
209
|
|
|
|
|
|
|
mechanisms for the type of cluster that is selected. (If you have |
210
|
|
|
|
|
|
|
used the DBI/DBD modules for accessing databases, this will seem |
211
|
|
|
|
|
|
|
very familiar.) |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
The goal of this HPCI/HPCD split is to allow users to write |
214
|
|
|
|
|
|
|
programs that make use of cluster facilities in a portable manner. |
215
|
|
|
|
|
|
|
If there is a reason to run the same program using a different |
216
|
|
|
|
|
|
|
type of cluster, it should only require changing the cluster |
217
|
|
|
|
|
|
|
definition attributes provided to one parent object creation; the |
218
|
|
|
|
|
|
|
rest of the code need not know or care about the changed cluster type. |
219
|
|
|
|
|
|
|
Programs which are likely to be run on different cluster types will |
220
|
|
|
|
|
|
|
usually be written to get the cluster attribute information from |
221
|
|
|
|
|
|
|
a configuration file, or command line arguments - so the program |
222
|
|
|
|
|
|
|
itself need not change at all. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Running a program on different types of clusters can happen for a |
225
|
|
|
|
|
|
|
number of reasons. An organization might have access to multiple |
226
|
|
|
|
|
|
|
types of cluster, such as an in-house cluster plus an external cloud. |
227
|
|
|
|
|
|
|
Scholarly research often shares programs, either to allow similar |
228
|
|
|
|
|
|
|
research, or to validate existing research results. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
HPCD modules can provide cluster-specific extensions. That can |
231
|
|
|
|
|
|
|
either be a different kind of functionality, or it can be as simple |
232
|
|
|
|
|
|
|
as allowing the terminology familiar to users of that cluster type |
233
|
|
|
|
|
|
|
to be used in place of the generic terminology provided by HPCI. |
234
|
|
|
|
|
|
|
However, using such extensions makes it harder to move to a |
235
|
|
|
|
|
|
|
different cluster type. So, actually making use of such extensions |
236
|
|
|
|
|
|
|
must be considered carefully. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=head1 The life cycle of a B<group> |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
A B<group> is the main mechanism for using HPCI. It is an object that |
241
|
|
|
|
|
|
|
manages a group of computation steps (called B<stage>s), distributing them |
242
|
|
|
|
|
|
|
across the cluster and keeping track of various housekeeping details like |
243
|
|
|
|
|
|
|
when each stage can be run, checking for the result of each completed stage |
244
|
|
|
|
|
|
|
run, deciding whether a failure should cause a stage to be retried or to |
245
|
|
|
|
|
|
|
prevent other stages from being executed, and collecting the status for each |
246
|
|
|
|
|
|
|
stage. |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
The life cycle of running a group of commands on a cluster is: |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=over 4 |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
=item create group |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
A B<group> object is created using the HPCI "class method" B<group>. |
255
|
|
|
|
|
|
|
HPCI isn't really a class, it just appears to be one. Its B<group> |
256
|
|
|
|
|
|
|
"class method" actually delegates creation of a group object to |
257
|
|
|
|
|
|
|
the HPCD module that is indicated by the I<cluster> attribute |
258
|
|
|
|
|
|
|
and it returns a cluster-specific group object that supports the |
259
|
|
|
|
|
|
|
HPCI interface. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=item create stages |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
A B<stage> is created for each command that is to be executed on a |
264
|
|
|
|
|
|
|
separate node of the cluster. This is created using the B<group> |
265
|
|
|
|
|
|
|
object's method B<stage>. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=item define dependency ordering between the stages |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
An important reason for running a group of jobs on a cluster is the |
270
|
|
|
|
|
|
|
ability to use multiple computers to run portions of the computation |
271
|
|
|
|
|
|
|
at the same time, rather than having them compete for the resources |
272
|
|
|
|
|
|
|
of a single computer. However, often some stages will depend |
273
|
|
|
|
|
|
|
upon the output of other stages. Such a dependent stage cannot |
274
|
|
|
|
|
|
|
start executing until all pre-requisite stages have completed. |
275
|
|
|
|
|
|
|
Specifying such dependency requirements is done with the B<group> |
276
|
|
|
|
|
|
|
method B<add_deps>. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=item execution |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
Finally, the B<group> method B<execute> will run the entire set |
281
|
|
|
|
|
|
|
of stages. It does not return until all stages have completed (or |
282
|
|
|
|
|
|
|
have been skipped). Each stage will normally be run once, however |
283
|
|
|
|
|
|
|
it is possible for some stages to be retried under some |
284
|
|
|
|
|
|
|
failure conditions. |
285
|
|
|
|
|
|
|
A failure of one stage (after retry possibilities have been exhausted) |
286
|
|
|
|
|
|
|
can be a trigger for |
287
|
|
|
|
|
|
|
completely skipping the execution of other stages. Each separate |
288
|
|
|
|
|
|
|
execution of a stage (original or retry) is managed with an internal object |
289
|
|
|
|
|
|
|
called a job - but a user program won't see job objects directly. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
As many stages as possible are run simultaneously. This is limited by |
292
|
|
|
|
|
|
|
the specified dependencies, by cluster-specific driver limits, and by |
293
|
|
|
|
|
|
|
user-specified limits on concurrent execution. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=back |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
The objects that calling code deals with directly are a group object to |
298
|
|
|
|
|
|
|
manage a group of stages, and a stage object for each separately run job. |
299
|
|
|
|
|
|
|
Internally, there are also job objects for each retry of a stage, and a |
300
|
|
|
|
|
|
|
log object for logging the execution process (alternately, the user can |
301
|
|
|
|
|
|
|
provide their own Log4Perl compatible log object for HPCI to use - this may be |
302
|
|
|
|
|
|
|
of use if you wish to merge logging of multiple groups and/or of other |
303
|
|
|
|
|
|
|
processing within your program together in a single log). |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
There are also some facilities to provide local customization of the standard |
306
|
|
|
|
|
|
|
usage of HPCI (see "Local Customization" below). |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=head1 Output Tree Layout |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
There are a number of output files and directories created during a group execution. |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
The default layout of these is: |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
<base_dir> "." |
315
|
|
|
|
|
|
|
<group_dir> <base_dir>/<name>-<YYYYMMDD-hhmmss> |
316
|
|
|
|
|
|
|
<log> <group_dir>/<name>.log |
317
|
|
|
|
|
|
|
<stage_dir> <group_dir>/<stage_name> |
318
|
|
|
|
|
|
|
<script_file> <stage_dir>/script.sh |
319
|
|
|
|
|
|
|
<job_dir> <stage_dir>/<retry_number> |
320
|
|
|
|
|
|
|
stdout |
321
|
|
|
|
|
|
|
stderr |
322
|
|
|
|
|
|
|
final_retry symlink to final <job_dir> |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
Many of these files/directories can be re-assigned to different |
325
|
|
|
|
|
|
|
locations using group or stage attributes - shown above is the |
326
|
|
|
|
|
|
|
default layout. Commonly, you will specifically use the I<base_dir> |
327
|
|
|
|
|
|
|
attribute to choose a location other than the current directory for |
328
|
|
|
|
|
|
|
placing the tree; or else use the I<group_dir> attribute if you want |
329
|
|
|
|
|
|
|
to choose a location that does not create a sub-directory for you. |
330
|
|
|
|
|
|
|
(If this is an already existing directory that is being re-used you |
331
|
|
|
|
|
|
|
may end up with a mixture of old and new contents that are hard to |
332
|
|
|
|
|
|
|
figure out.) |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=over 4 |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=item base_dir |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
The top level of all the generated output. It defaults to ".", |
339
|
|
|
|
|
|
|
but can be specified explicitly when the group is created with |
340
|
|
|
|
|
|
|
the attribute B<base_dir>. |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=item group_dir |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
By default, a new directory is created under B<base_dir>. Its name |
345
|
|
|
|
|
|
|
is I<name>-I<YYYYMMDD>-I<hhmmss> - the name of the group along with |
346
|
|
|
|
|
|
|
a timestamp of when the execution started. This can be over-ridden |
347
|
|
|
|
|
|
|
when the group is created by providing the group attribute B<group_dir>. |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
=item log |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
The automatically provided log is written to the file I<"group.log"> |
352
|
|
|
|
|
|
|
directly under I<group_dir>. This logs information about the |
353
|
|
|
|
|
|
|
execution of the entire group of stages. See B<Logging Attributes |
354
|
|
|
|
|
|
|
of group object> below for ways of changing the default setting. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=item stage_dir |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Each stage creates a sub-directory beneath I<group_dir> with the |
359
|
|
|
|
|
|
|
same name as the stage. An alternate name can be used by providing |
360
|
|
|
|
|
|
|
the B<dir> attribute when the stage object is created. |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=item script_file |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
The script created to be executed on the cluster node. This wraps |
365
|
|
|
|
|
|
|
the specified command with additional logic to pass on environment |
366
|
|
|
|
|
|
|
and config info, and to set output redirection. It is called |
367
|
|
|
|
|
|
|
"script.sh" and placed in I<stage_dir>. |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=item job_dir |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
A sub-directory is created under I<stage_dir> for each attempt to |
372
|
|
|
|
|
|
|
run the command. Usually, there will only be a single attempt. |
373
|
|
|
|
|
|
|
However, if the cluster driver provides mechanisms for detecting |
374
|
|
|
|
|
|
|
recoverable issues and then retries a command there can be more |
375
|
|
|
|
|
|
|
than one attempt; or alternately, if a pre-requisite stage |
376
|
|
|
|
|
|
|
fails there might be no attempt made (in that case, though, |
377
|
|
|
|
|
|
|
the entire I<stage_dir> directory would not even get created). |
378
|
|
|
|
|
|
|
These directories are simply named with the retry number ("0", |
379
|
|
|
|
|
|
|
"1", ...). |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=item stdout/stderr |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
Within each I<job_dir>, the files "stdout" and "stderr" collect |
384
|
|
|
|
|
|
|
the standard output and standard error output from that (re)try |
385
|
|
|
|
|
|
|
attempt to run the command. |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=item final_retry |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
A symlink named "final_retry" is created within I<stage_dir> that |
390
|
|
|
|
|
|
|
points to the I<job_dir> of the final (re)try. Since you often |
391
|
|
|
|
|
|
|
don't care as much about the initial run tries as you do about the |
392
|
|
|
|
|
|
|
last one, this symlink provides a consistent access path to that |
393
|
|
|
|
|
|
|
final retry. |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=back |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=head1 HPCI "Class" Methods |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
You can pretend that B<HPCI> is a class with one primary class |
400
|
|
|
|
|
|
|
method named B<group>. |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
There are a few other class methods used for localization purposes; they |
403
|
|
|
|
|
|
|
are described below in "Local Customization". |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head2 B<group> method |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
The B<group> method creates and returns a group object, which |
408
|
|
|
|
|
|
|
you can treat like a B<HPCI::Group> object. (In fact, it really |
409
|
|
|
|
|
|
|
returns an object of class B<HPCD::I<cluster>::Group>, but if you |
410
|
|
|
|
|
|
|
ignore that fact then you can trivially have your program run on |
411
|
|
|
|
|
|
|
some other cluster type.) |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
=head2 B<group> object |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
The descriptions of attributes and methods for the B<group> object given here describe |
416
|
|
|
|
|
|
|
the generic attributes and how they are treated for all cluster types. |
417
|
|
|
|
|
|
|
Individual cluster drivers can modify this behaviour and can provide |
418
|
|
|
|
|
|
|
additional attributes and methods for cluster-specific purposes. |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head3 Cluster-Related Attributes of B<group> object |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
The one necessary attribute is B<cluster>. For some specific |
423
|
|
|
|
|
|
|
cluster types there may be additional attributes required for |
424
|
|
|
|
|
|
|
connecting to the cluster software (authentication, usage |
425
|
|
|
|
|
|
|
class info, etc.). |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=over 4 |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=item cluster |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
The B<cluster> attribute specifies which type of cluster is to be used. |
432
|
|
|
|
|
|
|
This is the only required attribute. (Some cluster types may have |
433
|
|
|
|
|
|
|
additional attributes that are required for specifying connection |
434
|
|
|
|
|
|
|
and authentication info.) |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item cluster_specific |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
The attribute B<cluster_specific> is optional. If provided, it should |
439
|
|
|
|
|
|
|
contain a hashref of hashrefs. If the value specified for the I<cluster> |
440
|
|
|
|
|
|
|
attribute is present as a key in the B<cluster_specific> |
441
|
|
|
|
|
|
|
hash, the corresponding value will be used as a set of attribute values |
442
|
|
|
|
|
|
|
when the group is created. Its elements will replace or augment any values |
443
|
|
|
|
|
|
|
for the same attribute name provided to the group method. This will normally be |
444
|
|
|
|
|
|
|
used if the program can be dynamically configured for different cluster |
445
|
|
|
|
|
|
|
types, and there are different arg settings required for the different |
446
|
|
|
|
|
|
|
types of cluster. |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=back |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head3 Basic Attributes of B<group> object |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=over 4 |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=item name |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
The B<name> you give to a group is used for creating the directory |
457
|
|
|
|
|
|
|
where output is stored, and also in log messages. A default name |
458
|
|
|
|
|
|
|
"default_group_name" is provided if you do not specify an explicit |
459
|
|
|
|
|
|
|
name. Using the default name is adequate in simple programs which |
460
|
|
|
|
|
|
|
only create one group, but for more complicated programs giving |
461
|
|
|
|
|
|
|
separate names to each group is necessary to easily identify the |
462
|
|
|
|
|
|
|
output of each group. The value of B<name> may also be used by |
463
|
|
|
|
|
|
|
the cluster-specific driver to provide an identifier name (or the |
464
|
|
|
|
|
|
|
basis of one) to the underlying cluster, if it needs one. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=item stage_defaults |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
The attribute B<stage_defaults> is optional. If provided, it should |
469
|
|
|
|
|
|
|
contain a hashref. This hash will be used as default values for |
470
|
|
|
|
|
|
|
every stage created by this group. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=back |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=head3 Directory Layout Attributes of B<group> object |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
=over 4 |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=item base_dir |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
If none of the other directory layout attributes are used to |
481
|
|
|
|
|
|
|
over-ride this, this attribute specifies the directory in which |
482
|
|
|
|
|
|
|
all output directories and files will be created. This is |
483
|
|
|
|
|
|
|
usually an existing directory; it defaults to the current |
484
|
|
|
|
|
|
|
directory ".". |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=item group_dir |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
This directory is usually created to contain the outputs of the |
489
|
|
|
|
|
|
|
group execution. By default, it is directly under B<base_dir> with |
490
|
|
|
|
|
|
|
a name that consists of the group name attribute and a timestamp |
491
|
|
|
|
|
|
|
(e.g. "T_Definition-20150521-153256"). |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
If you provide an explicit value for this parameter, then it |
494
|
|
|
|
|
|
|
should not be an existing directory containing previous results. |
495
|
|
|
|
|
|
|
(If it is, the log file will be appended to the previous one, but |
496
|
|
|
|
|
|
|
the stage directories will over-write equivalently named directories |
497
|
|
|
|
|
|
|
and files that are created in this run, while leaving unchanged any |
498
|
|
|
|
|
|
|
that did not recur, so you'll have a mix of old and new contents.) |
499
|
|
|
|
|
|
|
The names of files and directories created under B<group_dir> are |
500
|
|
|
|
|
|
|
chosen to be consistent and easy to find automatically. |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=back |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head3 Logging Attributes of B<group> object |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
An HPCI group logs its activities using a Log::Log4perl logger. |
507
|
|
|
|
|
|
|
The logger can either be provided by the caller, or else HPCI will |
508
|
|
|
|
|
|
|
create its own. |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=over 4 |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=item log |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
This is a Log::Log4perl::Logger object. If it is provided as an |
515
|
|
|
|
|
|
|
attribute to the B<group> creation call, it will be used as it is, |
516
|
|
|
|
|
|
|
and the other logging attributes will be ignored. |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
If it is not provided by the user, a new Log::Log4perl::Logger |
519
|
|
|
|
|
|
|
object will be created using the attributes below to define where |
520
|
|
|
|
|
|
|
it is logged to. This created logger will send all log entries to |
521
|
|
|
|
|
|
|
a file, as well as sending all info and higher log entries to stderr. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=item log_path |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
If this attribute is provided (and the B<log> attribute is not |
526
|
|
|
|
|
|
|
provided) it will be used as the full pathname of a file where the |
527
|
|
|
|
|
|
|
log will be written. If it is not provided, it will use the path |
528
|
|
|
|
|
|
|
B<log_dir>/B<log_file> by default. |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=item log_dir |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
If neither B<log> nor B<log_path> is provided, this attribute can |
533
|
|
|
|
|
|
|
be used to specify the directory where the log file is to be written. |
534
|
|
|
|
|
|
|
By default, it uses B<group_dir>. |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=item log_file |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
If neither B<log> nor B<log_path> is provided, this attribute can |
539
|
|
|
|
|
|
|
be used to specify the file name to be written in the log directory. |
540
|
|
|
|
|
|
|
By default, it uses the constant name "group.log". |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=item log_level |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
You can provide this attribute to change the default log level setting from "info" to any of I<debug info warn error fatal>. |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
=item log_no_stderr, log_no_file |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
Normally, the default log is written to both stderr and to the log file. |
549
|
|
|
|
|
|
|
Either of those can be suppressed by setting the corresponding attribute to a true value. |
550
|
|
|
|
|
|
|
These attributes have no effect if the user provides their own logger instead of using the default one. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=back |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=head3 Operational Attributes of B<group> object |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=over 4 |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=item max_concurrent |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
This attribute specifies the maximum number of stages that will |
561
|
|
|
|
|
|
|
be executing at one time. The default setting of 0 allows as |
562
|
|
|
|
|
|
|
many stages as possible (all those that are not waiting for a |
563
|
|
|
|
|
|
|
pre-requisite stage to complete) to run at the same time. |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
=item status |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
This attribute is set internally while stages are executed. |
568
|
|
|
|
|
|
|
It contains the final result status from each stage run that |
569
|
|
|
|
|
|
|
has completed. The B<execute> method returns this value when |
570
|
|
|
|
|
|
|
execution completes, so you will usually not need to access it |
571
|
|
|
|
|
|
|
explicitly yourself. |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
This value is a hashref (indexed by stage name). The values are |
574
|
|
|
|
|
|
|
arrayrefs (indexed by run number 0..n). For each run, there is |
575
|
|
|
|
|
|
|
a hash. The key B<exit_status> contains the exit status of the run. |
576
|
|
|
|
|
|
|
If the stage was never run, B<exit_status> instead contains a text |
577
|
|
|
|
|
|
|
message listing the reason that it was skipped. |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=back |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head3 Environment Passing Attributes of B<group> object |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
You can set up a set of environment variables that will be provided to |
584
|
|
|
|
|
|
|
all stages. (You can also set variables that are only for individual |
585
|
|
|
|
|
|
|
stages - if so, they will modify any set you provide in the group.) |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
See B<HPCI::Env> for a description of these. |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=head3 Method B<stage> of B<group> object |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
The method B<stage> is used to create a new stage object. |
592
|
|
|
|
|
|
|
Its characteristics are described below. |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
The B<group> object keeps track of all B<stage> objects created |
595
|
|
|
|
|
|
|
within that group so that they can all be managed properly when the |
596
|
|
|
|
|
|
|
B<execute> method is invoked. |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
=head3 Method B<add_deps> of B<group> object |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
The method B<add_deps> is used to specify pre-requisite/dependent |
601
|
|
|
|
|
|
|
relationships. It takes either a hashref or a list containing |
602
|
|
|
|
|
|
|
pairs. One of the keys must be either B<pre_req> or B<pre_reqs>, |
603
|
|
|
|
|
|
|
another must be either B<dep> or B<deps>. |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
The value for each of these keys can be either a scalar, or an arrayref |
606
|
|
|
|
|
|
|
of scalar values. A scalar value can be either a B<stage> object (a reference), |
607
|
|
|
|
|
|
|
the exact name of a stage object (a string), or a pattern that matches |
608
|
|
|
|
|
|
|
the name of zero or more stages (a regexp). |
609
|
|
|
|
|
|
|
|
610
|
|
|
|
|
|
|
HPCI will ensure that the stage or all of the stages specified for pre_req |
611
|
|
|
|
|
|
|
or pre_reqs have completed execution before any of the dep (or deps) stages |
612
|
|
|
|
|
|
|
is allowed to start executing. |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
The plural forms are provided for convenience - often the output |
615
|
|
|
|
|
|
|
file from one preparation stage is required by many others, or the |
616
|
|
|
|
|
|
|
output from many processing stages is needed by a stage that merges |
617
|
|
|
|
|
|
|
results into a summary report. Rather than having to loop over the |
618
|
|
|
|
|
|
|
pre_reqs and deps and calling B<add_deps> individually for every |
619
|
|
|
|
|
|
|
individual dependency, a single call will handle the entire combination. |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
Allowing a regexp to match no stages at all makes it possible to write |
622
|
|
|
|
|
|
|
an add_deps call for stages that are optional - no dependency will be |
623
|
|
|
|
|
|
|
added if the optional stage was not created this run. |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
While it is recommended for code readability that you use the singular |
626
|
|
|
|
|
|
|
form (B<dep> or B<pre_req>) if you are providing a single stage, and the |
627
|
|
|
|
|
|
|
plural form (B<deps> or B<pre_reqs>) if you are providing a list of |
628
|
|
|
|
|
|
|
stages, either can be used. |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
The B<add_deps> method can be called multiple times. HPCI will |
631
|
|
|
|
|
|
|
accumulate the dependencies appropriately. |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
It is an error to provide a sequence of dependencies that form |
634
|
|
|
|
|
|
|
a cycle in which a stage directly or indirectly has itself as a |
635
|
|
|
|
|
|
|
pre-requisite. (Such a stage could never run. HPCI will detect |
636
|
|
|
|
|
|
|
when all remaining stages are blocked by pre-requisites and abort, |
637
|
|
|
|
|
|
|
but that might be after numerous stages have already been executed.) |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=head3 Method B<execute> of B<group> object |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
The B<execute> method is the final goal of building the group. |
642
|
|
|
|
|
|
|
It schedules the execution of individual stages. It waits for |
643
|
|
|
|
|
|
|
pre-requisites before running a stage. It provides for re-running |
644
|
|
|
|
|
|
|
a stage if a soft failure has occurred that allows a retry. If a |
645
|
|
|
|
|
|
|
failure that cannot be retried occurs, it can skip scheduling dependent |
646
|
|
|
|
|
|
|
stages, or even stop scheduling all new stages. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=head2 Stage Object |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
=head3 Attributes |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=over 4 |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
=item name |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
A unique B<name> attribute must be provided for stages. It is a string. |
657
|
|
|
|
|
|
|
There is no default value provided. |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
=item command |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
The B<command> attribute must be provided before the group is |
662
|
|
|
|
|
|
|
executed. It can either be provided as a string attribute when the |
663
|
|
|
|
|
|
|
stage is created, or by using one of |
664
|
|
|
|
|
|
|
the command-setting methods provided by the stage class. |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
See B<HPCI::Stage> for more details about the command setting |
667
|
|
|
|
|
|
|
methods. |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=item dir |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
The B<dir> attribute is optional. It specifies the directory |
672
|
|
|
|
|
|
|
in which files related to the stage are placed. By default, |
673
|
|
|
|
|
|
|
it is I<group_dir>/I<stage_name>. You will usually not need to |
674
|
|
|
|
|
|
|
change this. |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item cluster |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
The B<cluster> attribute is automatically passed on from the B<group> |
679
|
|
|
|
|
|
|
to each B<stage>. You are not likely to need this. |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=item group |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
The B<group> that created a stage is automatically passed on (as a weak |
684
|
|
|
|
|
|
|
reference) to the stage. You are not likely to need to use this attribute |
685
|
|
|
|
|
|
|
in user code. |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=item resources_required |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=item retry_resources_required |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
The B<resources_required> and B<retry_resources_required> are used to |
692
|
|
|
|
|
|
|
define resources that will be required by the stage when it executes. |
693
|
|
|
|
|
|
|
These attributes are somewhat cluster specific - each cluster has |
694
|
|
|
|
|
|
|
its own set of requirements for how a job submission must specify |
695
|
|
|
|
|
|
|
the sort of resources that it will require. |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
The B<resources_required> attribute is a hash, specifying the |
698
|
|
|
|
|
|
|
value for each resource that is to be considered. |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
The B<retry_resources_required> attribute is also a hash. For |
701
|
|
|
|
|
|
|
each resource, you can specify an array of values. If the cluster |
702
|
|
|
|
|
|
|
driver is able to detect that a run failed because the resource |
703
|
|
|
|
|
|
|
was inadequate, it will retry the run with the next larger value |
704
|
|
|
|
|
|
|
from this list. |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
See B<HPCI::Stage> for more details about resources. |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=item force_retries |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
This attribute specifies an integer number of times to retry the |
711
|
|
|
|
|
|
|
stage before concluding that it has actually failed. You might use |
712
|
|
|
|
|
|
|
this if your cluster has some nodes that work differently from |
713
|
|
|
|
|
|
|
others and a stage might fail on one type of node but succeed on |
714
|
|
|
|
|
|
|
another. |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
These retries are done after any cluster-specific retry mechanisms |
717
|
|
|
|
|
|
|
have been used. |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
The default value for this attribute is 0 (zero), giving no forced |
720
|
|
|
|
|
|
|
retries unless you specifically ask for them. |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
=item failure_action ('abort_group', 'abort_deps'*, or 'ignore') |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
Specifies the action to take if this stage fails (terminates with |
725
|
|
|
|
|
|
|
a non-zero status). |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
There are three string values that it can have: |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
=over 4 |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=item - abort_deps (default) |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
If the stage fails, then any stages which depend upon it |
734
|
|
|
|
|
|
|
(recursively) are not run. The group continues executing until |
735
|
|
|
|
|
|
|
all stages which are not dependent upon this stage (including those |
736
|
|
|
|
|
|
|
that have not yet been initiated) complete execution. |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
=item - abort_group |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
If the stage fails, then no other stages are started. The group |
741
|
|
|
|
|
|
|
simply waits until stages that have already been started complete |
742
|
|
|
|
|
|
|
and then returns. |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=item - ignore |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
Execution continues unchanged, any dependent stages will be run when they are |
747
|
|
|
|
|
|
|
no longer blocked. |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
=back |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
=item abort_group_on_failure abort_deps_on_failure ignore_failure |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
As an alternative to providing a value to the failure_action attribute |
754
|
|
|
|
|
|
|
when you create a stage, you can instead provide one of the pseudo-attributes |
755
|
|
|
|
|
|
|
'abort_group_on_failure', 'abort_deps_on_failure', or 'ignore_failure' with |
756
|
|
|
|
|
|
|
a true value to specify 'abort_group', 'abort_deps', or 'ignore' respectively. |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=item state |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
The B<state> is mostly an internal attribute but after the group has |
761
|
|
|
|
|
|
|
finished execution you can use this to check whether the stage was |
762
|
|
|
|
|
|
|
run successfully. After execution, B<state> will either be 'pass' or |
763
|
|
|
|
|
|
|
'fail'. |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
=item Environment passing attributes |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
You can set up a set of environment variables that will be provided to |
768
|
|
|
|
|
|
|
this stage. It will use the set defined for the group as a basis (if such a set was |
769
|
|
|
|
|
|
|
defined for the group), but that set can be changed for individual stages |
770
|
|
|
|
|
|
|
or you can have no group default and only provide a set to specific stages |
771
|
|
|
|
|
|
|
as needed. See B<HPCI::Env> for further details. |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=back |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
=head3 Methods |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=head4 command creation |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
There are a number of helper methods to assist in building different |
780
|
|
|
|
|
|
|
types of commands to be provided for the B<command> attribute. |
781
|
|
|
|
|
|
|
See B<HPCI::Stage> for details. |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=head1 Local Configuration |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
TODO: write this section |
786
|
|
|
|
|
|
|
- describe the HPCI::LocalConfig module |
787
|
|
|
|
|
|
|
- describe the mechanism for adding extra roles to group, stage, etc. |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
=head1 Additional |
790
|
|
|
|
|
|
|
|
791
|
|
|
|
|
|
|
This is an early public release of HPCI, and at present, there are |
792
|
|
|
|
|
|
|
only two drivers available. |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Only one cluster type is directly included within the HPCI package. |
795
|
|
|
|
|
|
|
The cluster type B<HPCD::uni> runs on a "cluster" of only one |
796
|
|
|
|
|
|
|
machine. It simply uses fork to submit individual stages and has |
797
|
|
|
|
|
|
|
facility for retries and timeouts. This is the default cluster |
798
|
|
|
|
|
|
|
type used for testing, as it will work natively on all types of |
799
|
|
|
|
|
|
|
Unix systems. It is also possible to use this driver as a fallback, |
800
|
|
|
|
|
|
|
in cases where the only available "real" cluster is not accessible |
801
|
|
|
|
|
|
|
for some reason. |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
Additionally, there is the B<HPCD::SGE> driver available on CPAN. |
804
|
|
|
|
|
|
|
It has seen heavy use within Boutros Lab. |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
Now that these packages have been released, it is likely new |
807
|
|
|
|
|
|
|
cluster drivers will be written. People interested in developing |
808
|
|
|
|
|
|
|
drivers for additional cluster types should contact the authors |
809
|
|
|
|
|
|
|
of this package to co-ordinate releases, features needed, etc. at |
810
|
|
|
|
|
|
|
B<mailto:BoutrosLabSoftware@oicr.on.ca>. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
Additionally, you may wish to subscribe to the email list mentioned |
813
|
|
|
|
|
|
|
at B<https://lists.oicr.on.ca/mailman/listinfo/hpci-discuss>. |
814
|
|
|
|
|
|
|
This is expected to be a low volume discussion group, although the |
815
|
|
|
|
|
|
|
future will tell what the actual volume will be. |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
As additional capabilities of new cluster types are addressed, and as |
818
|
|
|
|
|
|
|
different control needs used at other organizations are identified, |
819
|
|
|
|
|
|
|
this interface will surely change. As far as possible, such changes |
820
|
|
|
|
|
|
|
will be done in an upwardly compatible manner, but until a few more |
821
|
|
|
|
|
|
|
drivers have been integrated there is the possibility of changes |
822
|
|
|
|
|
|
|
that are not fully backward compatible. Watch the release notes |
823
|
|
|
|
|
|
|
for warnings of such issues. At some point there will be a 1.0.0 |
824
|
|
|
|
|
|
|
release, at which point this expectation of (limited) incompatible |
825
|
|
|
|
|
|
|
future change will be dropped. After that point, incompatible |
826
|
|
|
|
|
|
|
changes will only be made for critical reasons. |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
The reasons for separate distribution of cluster-specific HPCD |
829
|
|
|
|
|
|
|
packages are fairly obvious: |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=over 4 |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
=item - |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
The maintainers of the HPCI package do not have access to every |
836
|
|
|
|
|
|
|
possible cluster type, and it is unlikely that anyone will have access |
837
|
|
|
|
|
|
|
to all supported cluster types from one location, so the driver |
838
|
|
|
|
|
|
|
modules will need to be tested separately anyhow. |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=item - |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
A user of HPCI is equally not going to have need to access every |
843
|
|
|
|
|
|
|
type of cluster that exists, so they will probably prefer to only |
844
|
|
|
|
|
|
|
download the driver modules that they actually need. |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=back |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
=head1 SEE ALSO |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
=over 4 |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=item HPCI::Group |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
Describes the interface common to all B<HPCI Group> |
855
|
|
|
|
|
|
|
objects, regardless of the particular type of cluster that |
856
|
|
|
|
|
|
|
is actually being used to run the stages. In the future, the |
857
|
|
|
|
|
|
|
common interface may change somewhat as support for additional |
858
|
|
|
|
|
|
|
cluster types is added and a better understanding of the common |
859
|
|
|
|
|
|
|
features is achieved. |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
=item HPCI::Stage |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
Describes the interface common to stage object returned |
864
|
|
|
|
|
|
|
by all B<HPCI Stage> objects, regardless of the |
865
|
|
|
|
|
|
|
particular type of cluster that is actually being used to |
866
|
|
|
|
|
|
|
run the stages. The common interface may change somewhat |
867
|
|
|
|
|
|
|
as support for additional cluster types is added and a better |
868
|
|
|
|
|
|
|
understanding of the common features is achieved. |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
=item HPCI::Logger |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
Describes the logger parameters in more detail. |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=item HPCI::Env |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
Describes the environment passing parameters in more detail. |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=item HPCD::I<$cluster>::Group |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
Describes the group interface unique to a specific type of cluster, |
881
|
|
|
|
|
|
|
including any limitations or extensions to the generic interface. |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
=item HPCD::I<$cluster>::Stage |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
Describes the stage interface unique to a specific type of cluster, |
886
|
|
|
|
|
|
|
including any limitations or extensions to the generic interface. |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
=back |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
=head1 AUTHOR |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
Christopher Lalansingh - Boutros Lab |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
John Macdonald - Boutros Lab |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
Paul Boutros, PhD, PI - Boutros Lab |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
The Ontario Institute for Cancer Research |
901
|
|
|
|
|
|
|
|
902
|
|
|
|
|
|
|
=cut |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
1; |
905
|
|
|
|
|
|
|
|