| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
|
2
|
|
|
|
|
|
|
#ABSTRACT: Summarise Slurm workload by combining sinfo node states with squeue job pressure |
|
3
|
|
|
|
|
|
|
#PODNAME: slurm-load |
|
4
|
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
8144
|
use v5.12; |
|
|
2
|
|
|
|
|
7
|
|
|
6
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
94
|
|
|
7
|
2
|
|
|
2
|
|
1317
|
use Getopt::Long; |
|
|
2
|
|
|
|
|
28438
|
|
|
|
2
|
|
|
|
|
9
|
|
|
8
|
2
|
|
|
2
|
|
928
|
use FindBin qw($RealBin); |
|
|
2
|
|
|
|
|
2225
|
|
|
|
2
|
|
|
|
|
192
|
|
|
9
|
2
|
|
|
2
|
|
1161
|
use Term::ANSIColor qw(:constants); |
|
|
2
|
|
|
|
|
19974
|
|
|
|
2
|
|
|
|
|
1644
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
2
|
50
|
|
|
|
151137
|
if (-e "$RealBin/../dist.ini") { |
|
|
0
|
|
|
|
|
0
|
|
|
12
|
2
|
50
|
|
|
|
10
|
say STDERR "[dev mode] Using local lib" if ($ENV{"DEBUG"}); |
|
13
|
2
|
|
|
2
|
|
901
|
use lib "$RealBin/../lib"; |
|
|
2
|
|
|
|
|
1196
|
|
|
|
2
|
|
|
|
|
11
|
|
|
14
|
|
|
|
|
|
|
} |
|
15
|
|
|
|
|
|
|
|
|
16
|
2
|
|
|
2
|
|
937
|
use NBI::Slurm; |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
115272
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
2
|
|
|
|
|
4
|
my $opt_tab = 0; |
|
19
|
2
|
|
|
|
|
3
|
my $opt_user; |
|
20
|
2
|
|
|
|
|
4
|
my $opt_help = 0; |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
GetOptions( |
|
23
|
|
|
|
|
|
|
't|tab' => \$opt_tab, |
|
24
|
|
|
|
|
|
|
'u|user=s' => \$opt_user, |
|
25
|
0
|
|
|
0
|
|
0
|
'version' => sub { say "slurm-load v", $NBI::Slurm::VERSION; exit 0 }, |
|
|
0
|
|
|
|
|
0
|
|
|
26
|
2
|
50
|
|
|
|
21
|
'h|help' => \$opt_help, |
|
27
|
|
|
|
|
|
|
) or usage(1); |
|
28
|
|
|
|
|
|
|
|
|
29
|
2
|
100
|
|
|
|
1404
|
usage(0) if $opt_help; |
|
30
|
|
|
|
|
|
|
|
|
31
|
1
|
|
|
|
|
3
|
for my $cmd (qw(sinfo squeue)) { |
|
32
|
2
|
50
|
|
|
|
6
|
unless (_has_command($cmd)) { |
|
33
|
0
|
|
|
|
|
0
|
say STDERR RED, "Error: ", RESET, "$cmd not found in PATH. Are you in a Slurm cluster?"; |
|
34
|
0
|
|
|
|
|
0
|
exit 1; |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
} |
|
37
|
|
|
|
|
|
|
|
|
38
|
1
|
|
|
|
|
23
|
my $partitions = load_sinfo(); |
|
39
|
1
|
|
|
|
|
14
|
my $jobs = load_squeue($opt_user); |
|
40
|
1
|
|
|
|
|
6
|
my $rows = build_rows($partitions, $jobs); |
|
41
|
|
|
|
|
|
|
|
|
42
|
1
|
50
|
|
|
|
3
|
if ($opt_tab) { |
|
43
|
1
|
|
|
|
|
4
|
print_tsv($rows); |
|
44
|
|
|
|
|
|
|
} else { |
|
45
|
0
|
|
|
|
|
0
|
print_table($rows); |
|
46
|
|
|
|
|
|
|
} |
|
47
|
|
|
|
|
|
|
|
|
48
|
1
|
|
|
|
|
117
|
exit 0; |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub load_sinfo { |
|
51
|
1
|
|
|
1
|
|
13
|
my %partitions; |
|
52
|
1
|
|
|
|
|
9653
|
my @lines = `sinfo --noheader --format='%P|%D|%T' 2>/dev/null`; |
|
53
|
1
|
50
|
|
|
|
27
|
if ($? != 0) { |
|
54
|
0
|
|
|
|
|
0
|
die "ERROR slurm-load: sinfo failed\n"; |
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
|
|
57
|
1
|
|
|
|
|
9
|
for my $line (@lines) { |
|
58
|
5
|
|
|
|
|
14
|
chomp $line; |
|
59
|
5
|
50
|
|
|
|
41
|
next unless $line =~ /\S/; |
|
60
|
5
|
|
|
|
|
25
|
my ($partition, $nodes, $state) = split /\|/, $line, 3; |
|
61
|
5
|
50
|
33
|
|
|
40
|
next unless defined $partition && defined $nodes && defined $state; |
|
|
|
|
33
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
|
63
|
5
|
|
|
|
|
12
|
$partition =~ s/\*$//; |
|
64
|
5
|
|
|
|
|
32
|
my $bucket = normalise_node_state($state); |
|
65
|
|
|
|
|
|
|
|
|
66
|
5
|
|
|
|
|
33
|
$partitions{$partition}{partition} = $partition; |
|
67
|
5
|
|
|
|
|
12
|
$partitions{$partition}{total} += $nodes; |
|
68
|
5
|
|
|
|
|
32
|
$partitions{$partition}{$bucket} += $nodes; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
1
|
|
|
|
|
10
|
return \%partitions; |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub load_squeue { |
|
74
|
1
|
|
|
1
|
|
39
|
my ($user) = @_; |
|
75
|
1
|
|
|
|
|
7
|
my %jobs; |
|
76
|
|
|
|
|
|
|
|
|
77
|
1
|
|
|
|
|
6
|
my $cmd = "squeue --noheader --format='%P|%t|%D'"; |
|
78
|
1
|
50
|
|
|
|
4
|
$cmd .= " --user='$user'" if defined $user; |
|
79
|
1
|
|
|
|
|
6986
|
my @lines = `$cmd 2>/dev/null`; |
|
80
|
1
|
50
|
|
|
|
23
|
if ($? != 0) { |
|
81
|
0
|
|
|
|
|
0
|
die "ERROR slurm-load: squeue failed\n"; |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
|
|
84
|
1
|
|
|
|
|
11
|
for my $line (@lines) { |
|
85
|
3
|
|
|
|
|
8
|
chomp $line; |
|
86
|
3
|
50
|
|
|
|
27
|
next unless $line =~ /\S/; |
|
87
|
3
|
|
|
|
|
21
|
my ($partition, $state, $nodes) = split /\|/, $line, 3; |
|
88
|
3
|
50
|
33
|
|
|
45
|
next unless defined $partition && defined $state && defined $nodes; |
|
|
|
|
33
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
|
90
|
3
|
|
|
|
|
7
|
$partition =~ s/\*$//; |
|
91
|
3
|
|
|
|
|
16
|
my $bucket = normalise_job_state($state); |
|
92
|
|
|
|
|
|
|
|
|
93
|
3
|
|
|
|
|
27
|
$jobs{$partition}{partition} = $partition; |
|
94
|
3
|
|
|
|
|
9
|
$jobs{$partition}{"jobs_$bucket"}++; |
|
95
|
3
|
|
|
|
|
12
|
$jobs{$partition}{"nodes_$bucket"} += $nodes; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
1
|
|
|
|
|
5
|
return \%jobs; |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
sub build_rows { |
|
102
|
1
|
|
|
1
|
|
4
|
my ($partitions, $jobs) = @_; |
|
103
|
1
|
|
|
|
|
16
|
my @header = qw(Partition Total Up Idle Mix Alloc Down JobsR JobsPD RunNodes PendNodes Load); |
|
104
|
1
|
|
|
|
|
4
|
my @rows = (\@header); |
|
105
|
1
|
|
|
|
|
12
|
my %totals = ( |
|
106
|
|
|
|
|
|
|
partition => 'TOTAL', |
|
107
|
|
|
|
|
|
|
total => 0, |
|
108
|
|
|
|
|
|
|
up => 0, |
|
109
|
|
|
|
|
|
|
idle => 0, |
|
110
|
|
|
|
|
|
|
mix => 0, |
|
111
|
|
|
|
|
|
|
alloc => 0, |
|
112
|
|
|
|
|
|
|
down => 0, |
|
113
|
|
|
|
|
|
|
jobs_running => 0, |
|
114
|
|
|
|
|
|
|
jobs_pending => 0, |
|
115
|
|
|
|
|
|
|
nodes_running => 0, |
|
116
|
|
|
|
|
|
|
nodes_pending => 0, |
|
117
|
|
|
|
|
|
|
); |
|
118
|
|
|
|
|
|
|
|
|
119
|
1
|
|
|
|
|
5
|
for my $partition (sort { |
|
120
|
1
|
0
|
|
|
|
9
|
load_pct($partitions->{$b}) <=> load_pct($partitions->{$a}) || $a cmp $b |
|
121
|
1
|
|
|
|
|
19
|
} keys %{$partitions}) { |
|
122
|
2
|
|
50
|
|
|
3
|
my $node = $partitions->{$partition} || {}; |
|
123
|
2
|
|
50
|
|
|
4
|
my $job = $jobs->{$partition} || {}; |
|
124
|
|
|
|
|
|
|
|
|
125
|
2
|
|
50
|
|
|
5
|
my $total = $node->{total} || 0; |
|
126
|
2
|
|
100
|
|
|
8
|
my $down = $node->{down} || 0; |
|
127
|
2
|
|
50
|
|
|
8
|
my $idle = $node->{idle} || 0; |
|
128
|
2
|
|
100
|
|
|
54
|
my $mix = $node->{mix} || 0; |
|
129
|
2
|
|
100
|
|
|
7
|
my $alloc = $node->{alloc} || 0; |
|
130
|
2
|
|
|
|
|
23
|
my $up = $total - $down; |
|
131
|
2
|
|
50
|
|
|
4
|
my $jobs_running = $job->{jobs_running} || 0; |
|
132
|
2
|
|
100
|
|
|
5
|
my $jobs_pending = $job->{jobs_pending} || 0; |
|
133
|
2
|
|
50
|
|
|
3
|
my $nodes_running = $job->{nodes_running} || 0; |
|
134
|
2
|
|
100
|
|
|
9
|
my $nodes_pending = $job->{nodes_pending} || 0; |
|
135
|
2
|
|
|
|
|
3
|
my $load = sprintf('%d%%', load_pct($node)); |
|
136
|
|
|
|
|
|
|
|
|
137
|
2
|
|
|
|
|
4
|
push @rows, [ |
|
138
|
|
|
|
|
|
|
$partition, |
|
139
|
|
|
|
|
|
|
$total, |
|
140
|
|
|
|
|
|
|
$up, |
|
141
|
|
|
|
|
|
|
$idle, |
|
142
|
|
|
|
|
|
|
$mix, |
|
143
|
|
|
|
|
|
|
$alloc, |
|
144
|
|
|
|
|
|
|
$down, |
|
145
|
|
|
|
|
|
|
$jobs_running, |
|
146
|
|
|
|
|
|
|
$jobs_pending, |
|
147
|
|
|
|
|
|
|
$nodes_running, |
|
148
|
|
|
|
|
|
|
$nodes_pending, |
|
149
|
|
|
|
|
|
|
$load, |
|
150
|
|
|
|
|
|
|
]; |
|
151
|
|
|
|
|
|
|
|
|
152
|
2
|
|
|
|
|
3
|
$totals{total} += $total; |
|
153
|
2
|
|
|
|
|
3
|
$totals{up} += $up; |
|
154
|
2
|
|
|
|
|
1
|
$totals{idle} += $idle; |
|
155
|
2
|
|
|
|
|
3
|
$totals{mix} += $mix; |
|
156
|
2
|
|
|
|
|
3
|
$totals{alloc} += $alloc; |
|
157
|
2
|
|
|
|
|
2
|
$totals{down} += $down; |
|
158
|
2
|
|
|
|
|
2
|
$totals{jobs_running} += $jobs_running; |
|
159
|
2
|
|
|
|
|
3
|
$totals{jobs_pending} += $jobs_pending; |
|
160
|
2
|
|
|
|
|
1
|
$totals{nodes_running} += $nodes_running; |
|
161
|
2
|
|
|
|
|
4
|
$totals{nodes_pending} += $nodes_pending; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
1
|
50
|
|
|
|
5
|
if (@rows > 1) { |
|
165
|
|
|
|
|
|
|
push @rows, [ |
|
166
|
|
|
|
|
|
|
$totals{partition}, |
|
167
|
|
|
|
|
|
|
$totals{total}, |
|
168
|
|
|
|
|
|
|
$totals{up}, |
|
169
|
|
|
|
|
|
|
$totals{idle}, |
|
170
|
|
|
|
|
|
|
$totals{mix}, |
|
171
|
|
|
|
|
|
|
$totals{alloc}, |
|
172
|
|
|
|
|
|
|
$totals{down}, |
|
173
|
|
|
|
|
|
|
$totals{jobs_running}, |
|
174
|
|
|
|
|
|
|
$totals{jobs_pending}, |
|
175
|
|
|
|
|
|
|
$totals{nodes_running}, |
|
176
|
|
|
|
|
|
|
$totals{nodes_pending}, |
|
177
|
1
|
|
|
|
|
7
|
sprintf('%d%%', load_pct(\%totals)), |
|
178
|
|
|
|
|
|
|
]; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
1
|
|
|
|
|
9
|
return \@rows; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
sub load_pct { |
|
185
|
5
|
|
|
5
|
|
7
|
my ($row) = @_; |
|
186
|
5
|
|
50
|
|
|
15
|
my $total = $row->{total} || 0; |
|
187
|
5
|
|
100
|
|
|
11
|
my $down = $row->{down} || 0; |
|
188
|
5
|
|
|
|
|
8
|
my $up = $total - $down; |
|
189
|
5
|
50
|
|
|
|
9
|
return 0 if $up <= 0; |
|
190
|
5
|
|
100
|
|
|
17
|
my $busy = ($row->{mix} || 0) + ($row->{alloc} || 0); |
|
|
|
|
100
|
|
|
|
|
|
191
|
5
|
|
|
|
|
25
|
return ($busy / $up) * 100; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub normalise_node_state { |
|
195
|
5
|
|
|
5
|
|
9
|
my ($state) = @_; |
|
196
|
5
|
|
50
|
|
|
14
|
$state = lc($state // ''); |
|
197
|
5
|
|
|
|
|
18
|
$state =~ s/[^a-z].*$//; |
|
198
|
|
|
|
|
|
|
|
|
199
|
5
|
100
|
|
|
|
21
|
return 'idle' if $state =~ /^idle/; |
|
200
|
3
|
100
|
|
|
|
12
|
return 'mix' if $state =~ /^mix/; |
|
201
|
2
|
100
|
|
|
|
11
|
return 'alloc' if $state =~ /^alloc/; |
|
202
|
1
|
50
|
|
|
|
11
|
return 'down' if $state =~ /^(down|drain|drng|drained|fail)/; |
|
203
|
0
|
|
|
|
|
0
|
return 'alloc'; |
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub normalise_job_state { |
|
207
|
3
|
|
|
3
|
|
7
|
my ($state) = @_; |
|
208
|
3
|
|
50
|
|
|
9
|
$state = uc($state // ''); |
|
209
|
|
|
|
|
|
|
|
|
210
|
3
|
100
|
|
|
|
10
|
return 'pending' if $state eq 'PD'; |
|
211
|
2
|
50
|
|
|
|
18
|
return 'running' if $state =~ /^(R|CG|CF|SI|SO|ST)$/; |
|
212
|
0
|
|
|
|
|
0
|
return 'running'; |
|
213
|
|
|
|
|
|
|
} |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
sub print_tsv { |
|
216
|
1
|
|
|
1
|
|
3
|
my ($rows) = @_; |
|
217
|
1
|
|
|
|
|
10
|
for my $row (@{$rows}) { |
|
|
1
|
|
|
|
|
4
|
|
|
218
|
4
|
|
|
|
|
5
|
say join("\t", @{$row}); |
|
|
4
|
|
|
|
|
26
|
|
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub print_table { |
|
223
|
0
|
|
|
0
|
|
0
|
my ($rows) = @_; |
|
224
|
0
|
|
|
|
|
0
|
my @widths; |
|
225
|
0
|
|
|
|
|
0
|
for my $row (@{$rows}) { |
|
|
0
|
|
|
|
|
0
|
|
|
226
|
0
|
|
|
|
|
0
|
for my $i (0 .. $#{$row}) { |
|
|
0
|
|
|
|
|
0
|
|
|
227
|
0
|
|
0
|
|
|
0
|
my $len = length($row->[$i] // ''); |
|
228
|
0
|
0
|
0
|
|
|
0
|
$widths[$i] = $len if !defined $widths[$i] || $len > $widths[$i]; |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
|
|
232
|
0
|
|
|
|
|
0
|
for my $idx (0 .. $#{$rows}) { |
|
|
0
|
|
|
|
|
0
|
|
|
233
|
0
|
|
|
|
|
0
|
my $row = $rows->[$idx]; |
|
234
|
0
|
|
|
|
|
0
|
my @parts; |
|
235
|
0
|
|
|
|
|
0
|
for my $i (0 .. $#{$row}) { |
|
|
0
|
|
|
|
|
0
|
|
|
236
|
0
|
0
|
|
|
|
0
|
my $fmt = $i == 0 ? "%-*s" : "%*s"; |
|
237
|
0
|
|
|
|
|
0
|
push @parts, sprintf($fmt, $widths[$i], $row->[$i]); |
|
238
|
|
|
|
|
|
|
} |
|
239
|
|
|
|
|
|
|
|
|
240
|
0
|
0
|
|
|
|
0
|
if ($idx == 0) { |
|
241
|
0
|
|
|
|
|
0
|
say BOLD, join(" ", @parts), RESET; |
|
242
|
0
|
|
|
|
|
0
|
my @rule = map { '-' x $_ } @widths; |
|
|
0
|
|
|
|
|
0
|
|
|
243
|
0
|
|
|
|
|
0
|
say join(" ", @rule); |
|
244
|
0
|
|
|
|
|
0
|
next; |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
|
|
247
|
0
|
0
|
|
|
|
0
|
if ($row->[0] eq 'TOTAL') { |
|
248
|
0
|
|
|
|
|
0
|
say BOLD, join(" ", @parts), RESET; |
|
249
|
|
|
|
|
|
|
} else { |
|
250
|
0
|
|
|
|
|
0
|
say join(" ", @parts); |
|
251
|
|
|
|
|
|
|
} |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
sub _has_command { |
|
256
|
2
|
|
|
2
|
|
7
|
my ($cmd) = @_; |
|
257
|
2
|
|
|
|
|
7774
|
my $path = `command -v $cmd 2>/dev/null`; |
|
258
|
2
|
|
33
|
|
|
164
|
return $? == 0 && $path =~ /\S/; |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
sub usage { |
|
262
|
1
|
|
|
1
|
|
2
|
my ($exit_code) = @_; |
|
263
|
1
|
|
|
|
|
49
|
print STDERR <<'END'; |
|
264
|
|
|
|
|
|
|
slurm-load - Summarise Slurm workload by partition |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
Usage: |
|
267
|
|
|
|
|
|
|
slurm-load [options] |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
Options: |
|
270
|
|
|
|
|
|
|
-t, --tab Print TSV instead of aligned text |
|
271
|
|
|
|
|
|
|
-u, --user USER Restrict squeue job counts to one user |
|
272
|
|
|
|
|
|
|
--version Show version and exit |
|
273
|
|
|
|
|
|
|
-h, --help Show this help and exit |
|
274
|
|
|
|
|
|
|
END |
|
275
|
1
|
|
|
|
|
139
|
exit $exit_code; |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
__END__ |