-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjob_report.sh
executable file
·348 lines (294 loc) · 6.69 KB
/
job_report.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#!/bin/bash
# job_report.sh
# Script to report information on array jobs running on Odyssey
#
usage() {
  # Print the command-line help text to stdout.
  # The heredoc delimiter is quoted so the text is emitted verbatim; the
  # trailing blank line of the original message is kept.
  # Options are described in the form the argument parser actually accepts
  # (--dir and --exclude take their value as the NEXT argument).
  cat <<'EOF'
usage: [SACCT_ARGS=args] job_report.sh [sacct_args] [--array] [--dir <job_dir>] [--exclude <jobids>] [--verbose] [--help]
sacct_args: Add arguments for /usr/bin/sacct by passing arguments inline
They can also be passed by setting SACCT_ARGS as an environment variable
--array: Flag to signal that jobs to report on are from sbatch --array
--dir <job_dir>: value should be the directory containing the .err and .out files generated by SLURM
--exclude <jobids>: Comma separated list of job ids to leave out of the report
--verbose: Add this flag to see more information on failed runs
--help: Print this message and exit

EOF
}
# source external scripts for additional functionality
# NOTE(review): handle_completed calls _ext_handle_passed_and_failed, which is
# not defined in this file -- presumably this script provides it; confirm.
EXT_SCRIPT="${HOME}/reports/job_report_ext.sh"
if [[ -r "$EXT_SCRIPT" ]]; then
  # shellcheck source=/dev/null
  source "$EXT_SCRIPT"
else
  echo "warning: cannot read $EXT_SCRIPT; extension functions are unavailable" >&2
fi

#### GLOBALS ####
SACCT=/usr/bin/sacct
# SACCT_ARGS may arrive from the environment as one whitespace separated
# string. Split it into a real array (one option per element) so it no longer
# depends on unquoted expansion further down. read returns non-zero at EOF
# when -d '' is used, hence the '|| true'.
read -r -d '' -a SACCT_ARGS <<< "${SACCT_ARGS[*]:-}" || true
SACCT_ARGS+=(-XP --noheader) # Add required SACCT arguments for parsing

# Keep the format and indexes aligned
export SACCT_FORMAT='jobid,state,partition,submit,start,end'
# Indexes into one '|' separated sacct row (same order as SACCT_FORMAT)
JOBID=0     # jobid field; also the jobid part once jobid_jobstep is split on '_'
STATE=1     # Job state
PARTITION=2 # Where is the job running?
SUBMIT=3    # Submit time
START=4     # Start time
END=5       # End time
# Index into a jobid_jobstep value after splitting it on '_'
JOBSTEP=1   # get the jobstep from jobid_jobstep
#### Helper functions for printing ####
pretty_print_tabs() {
  # print a tab separated list of jobs in five columns
  # Arguments: the items to print, one per argument
  # Outputs:   every item preceded by a tab, a newline after each fifth item
  #            and one closing newline (so a call with no items prints an
  #            empty line)
  local -r per_row=5
  local item
  local count=0
  for item in "$@"; do
    # The item is passed as data, never as the printf format, so a '%' or a
    # backslash inside it is printed literally.
    printf '\t%s' "$item"
    count=$((count + 1))
    if ((count % per_row == 0)); then
      printf '\n'
    fi
  done
  printf '\n'
}
pretty_print_commas() {
  # print a comma separated list of jobs
  # helpful for knowing which jobs to rerun
  # Arguments: the items to join, one per argument
  # Outputs:   the items joined by ',' followed by a newline (just a newline
  #            when called without arguments)
  # "$*" joins the arguments with the first character of IFS; IFS is local,
  # so the caller's field splitting is not affected.
  local IFS=','
  printf '%s\n' "$*"
}
print_sorted_jobs() {
  # sort and print a list of jobs
  # Arguments: job ids of the form jobid_jobstep
  # Outputs:   the jobstep parts, numerically sorted and de-duplicated, as one
  #            comma separated line (via pretty_print_commas)
  local job step
  local -a parts
  local -a steps=()
  local -a sorted=()
  for job in "$@"; do
    IFS='_' read -r -a parts <<< "$job"
    step=${parts[1]:-}
    # Ids without a jobstep contribute nothing; dropping them BEFORE sorting
    # keeps an empty value from being merged with a real step 0 by sort -nu.
    if [[ -n "$step" ]]; then
      steps+=("$step")
    fi
  done
  if ((${#steps[@]} > 0)); then
    while IFS= read -r step; do
      sorted+=("$step")
    done < <(printf '%s\n' "${steps[@]}" | sort -nu)
  fi
  pretty_print_commas "${sorted[@]}"
}
get_sorted_jobs() {
  # get the list of jobs
  # Arguments: sacct rows ('|' separated), one row per argument
  # Globals:   JOBID (read) - index of the job id field within a row
  # Outputs:   the job ids (the part before any '_jobstep'), numerically
  #            sorted and de-duplicated, as one comma separated line
  local run id
  local -a fields
  local -a ids=()
  local -a sorted=()
  for run in "$@"; do
    # Each argument is parsed as a whole row, so a state that contains
    # spaces (e.g. "CANCELLED by 42") cannot be mistaken for extra rows.
    IFS='|' read -r -a fields <<< "$run"
    id=${fields[JOBID]:-}
    id=${id%%_*} # strip the _jobstep suffix of array jobs
    if [[ -n "$id" ]]; then
      ids+=("$id")
    fi
  done
  if ((${#ids[@]} > 0)); then
    while IFS= read -r id; do
      sorted+=("$id")
    done < <(printf '%s\n' "${ids[@]}" | sort -nu)
  fi
  pretty_print_commas "${sorted[@]}"
}
convertsecs() {
  # Turn a number of seconds ($1) into a zero padded HH:MM:SS string.
  # h, m and s are assigned in a single arithmetic command and, as before,
  # are left behind as global variables.
  ((h = $1 / 3600, m = ($1 % 3600) / 60, s = $1 % 60))
  printf '%02d:%02d:%02d\n' $h $m $s
}
run_times() {
  # use the SUBMIT, START, and END times from sacct to calculate
  # average wall time and run time for a set of jobs
  # Arguments: sacct rows ('|' separated), one row per argument
  # Globals:   SUBMIT, START, END (read) - field indexes within a row
  # Outputs:   two lines, average run time (end - start) and average wall
  #            time (end - submit), formatted by convertsecs; nothing when no
  #            row has usable timestamps
  local run
  local -a fields
  local submit_s start_s end_s
  local sum_elapsed=0
  local sum_wall_time=0
  local counted=0

  for run in "$@"; do
    IFS='|' read -r -a fields <<< "$run"
    # A row whose timestamps date cannot parse is skipped instead of
    # poisoning the sums; date reports the problem on stderr.
    submit_s=$(date --date="${fields[SUBMIT]}" +%s) || continue
    start_s=$(date --date="${fields[START]}" +%s) || continue
    end_s=$(date --date="${fields[END]}" +%s) || continue
    sum_elapsed=$((sum_elapsed + end_s - start_s))
    sum_wall_time=$((sum_wall_time + end_s - submit_s))
    counted=$((counted + 1))
  done

  # Nothing to average: avoid dividing by zero.
  if ((counted == 0)); then
    return 0
  fi

  echo " Avg Run Time: $(convertsecs $((sum_elapsed / counted)))"
  echo " Avg Wall Time: $(convertsecs $((sum_wall_time / counted)))"
}
#### Run STATE handler functions ####
handle_completed() {
  # Report on COMPLETED jobs; the arguments are the matching sacct rows.
  # Only produces output when VERBOSE is 1: the average times, whatever
  # _ext_handle_passed_and_failed (not defined in this file) prints, and a
  # blank line.
  runs=($@)

  # Quiet mode: nothing to print.
  [ $VERBOSE -eq 1 ] || return 0

  run_times ${runs[@]}
  _ext_handle_passed_and_failed ${runs[@]}
  echo
}
handle_failed() {
  # Report jobs that need to be rerun.
  # Arguments: sacct rows ('|' separated), one row per argument
  # Globals:   JOBID (read)    - index of the job id field within a row
  #            WORK_DIR (read) - directory holding the batch_<jobid>.err files
  #            VERBOSE (read)  - 1 to also print each job's .err contents
  #            ARRAY (read)    - 1 to list array job steps, comma separated
  # Outputs:   optional per-job error text, then "Rerun these jobs:" followed
  #            by the job list and a blank line
  local -r prefix=batch_
  local output_dir=$WORK_DIR
  local run jobid output_errs
  local -a fields
  local -a list=()

  for run in "$@"; do
    IFS='|' read -r -a fields <<< "$run"
    jobid=${fields[JOBID]}
    list+=("$jobid")
    if [[ "$VERBOSE" -eq 1 ]]; then
      # Quoted throughout so a WORK_DIR containing spaces still works.
      output_errs="${output_dir}/${prefix}${jobid}.err"
      printf ' Job %s Failed:\t' "$jobid"
      if [[ -e "$output_errs" ]]; then
        printf ' %s\n' "$(< "$output_errs")"
      else
        echo " Output removed"
      fi
    fi
  done

  echo "Rerun these jobs:"
  if [[ "$ARRAY" -eq 1 ]]; then
    print_sorted_jobs "${list[@]}"
  else
    pretty_print_tabs "${list[@]}"
  fi
  echo ""
}
handle_running() {
  # List the RUNNING jobs; the arguments are the matching sacct rows.
  # Prints nothing unless VERBOSE is 1. With ARRAY set to 1 the list goes
  # through print_sorted_jobs, otherwise through pretty_print_tabs.
  local printer=pretty_print_tabs
  runs=($@)

  # Quiet mode: nothing to print.
  [ $VERBOSE -eq 1 ] || return 0

  # Collect the job id field of every row.
  list=()
  for run in ${runs[@]}; do
    IFS='|' read -ra split <<< "$run"
    list+=(${split[$JOBID]})
  done

  echo "Running jobs: "
  if [ $ARRAY -eq 1 ]; then
    printer=print_sorted_jobs
  fi
  $printer ${list[@]}
  echo
}
handle_pending() {
  # List the PENDING jobs; the arguments are the matching sacct rows.
  # The job list is printed only when VERBOSE is 1; the closing blank line is
  # printed in every case.
  runs=($@)
  list=()

  # Collect the job id field of every row.
  for run in ${runs[@]}; do
    IFS='|' read -ra split <<< "$run"
    list+=(${split[$JOBID]})
  done

  [ $VERBOSE -eq 1 ] && {
    echo "Pending jobs: "
    pretty_print_tabs ${list[@]}
  }
  echo
}
handle_other() {
  # List jobs whose state is not tracked by one of the other handlers.
  # Arguments: sacct rows ('|' separated), one row per argument
  # Globals:   JOBID, STATE (read) - field indexes within a row
  # Outputs:   one "jobid: state" entry per row, handed to pretty_print_tabs
  local run
  local -a fields
  local -a list=()
  for run in "$@"; do
    IFS='|' read -r -a fields <<< "$run"
    list+=("${fields[JOBID]}: ${fields[STATE]}")
  done
  # Quoted so every "jobid: state" pair is passed on as ONE argument, even
  # when the state itself contains spaces (e.g. "CANCELLED by 42").
  pretty_print_tabs "${list[@]}"
}
#### MAIN ####
# Parse the command line, query sacct, bucket the returned rows by job state
# and print a count per state (plus details from the handlers when --verbose
# is given).
ARRAY=0
WORK_DIR=$PWD
VERBOSE=0
EXCLUDE=0
EXCLUDED=()

while (($# > 0)); do
  case "$1" in
    --array)
      ARRAY=1
      ;;
    --dir)
      shift
      if [[ -z "${1:-}" ]]; then
        usage
        exit 1
      fi
      WORK_DIR=$1
      ;;
    --exclude)
      shift
      if [[ -z "${1:-}" ]]; then
        usage
        exit 1
      fi
      EXCLUDE=1
      IFS=',' read -r -a EXCLUDED <<< "$1"
      ;;
    --help)
      usage
      exit 1
      ;;
    --verbose)
      VERBOSE=1
      ;;
    *)
      # Anything unrecognised is forwarded to sacct.
      SACCT_ARGS+=("$1")
      ;;
  esac
  shift
done

COMPLETED=()
FAILED=()
TIMEOUT=()
RUNNING=()
PENDING=()
OTHER=()

# An entry of SACCT_ARGS may hold several whitespace separated options (the
# value inherited from the environment, for instance), so flatten the list
# into one option per element before calling sacct. read returns non-zero at
# EOF when -d '' is used, hence the '|| true'.
read -r -d '' -a sacct_args <<< "${SACCT_ARGS[*]}" || true

echo "Finding jobs using: $SACCT ${sacct_args[*]}"
if ! sacct_output=$("$SACCT" "${sacct_args[@]}"); then
  echo "error: $SACCT failed" >&2
  exit 1
fi

# One array element per sacct row. Splitting on newlines only keeps rows
# whose state contains spaces (e.g. "CANCELLED by 42") in one piece.
all=()
while IFS= read -r line; do
  if [[ -n "$line" ]]; then
    all+=("$line")
  fi
done <<< "$sacct_output"

if ((${#all[@]} == 0)); then
  echo "No jobs found with these sacct args"
  exit 1
fi
echo "Jobs: $(get_sorted_jobs "${all[@]}")"

for run in "${all[@]}"; do
  IFS='|' read -r -a split <<< "$run" # split the sacct line by '|'
  state=${split[STATE]}

  if ((EXCLUDE == 1)); then
    # don't process excluded jobs
    # An --exclude entry must equal the whole job id (jobid_jobstep) or its
    # jobid part; a substring match would also drop unrelated jobs (excluding
    # 1234 would hide job 123).
    full_id=${split[JOBID]}
    jobid=${full_id%%_*}
    skip=0
    for excluded in "${EXCLUDED[@]}"; do
      if [[ "$excluded" == "$jobid" || "$excluded" == "$full_id" ]]; then
        skip=1
        break
      fi
    done
    if ((skip == 1)); then
      continue
    fi
  fi

  case "$state" in
    COMPLETED) COMPLETED+=("$run") ;;
    FAILED) FAILED+=("$run") ;;
    TIMEOUT) TIMEOUT+=("$run") ;;
    RUNNING) RUNNING+=("$run") ;;
    PENDING) PENDING+=("$run") ;;
    *) OTHER+=("$run") ;;
  esac
done

echo "${#COMPLETED[@]} COMPLETED jobs"
if ((${#COMPLETED[@]} > 0 && VERBOSE == 1)); then
  handle_completed "${COMPLETED[@]}"
fi

echo "${#FAILED[@]} FAILED jobs"
if ((${#FAILED[@]} > 0 && VERBOSE == 1)); then
  handle_failed "${FAILED[@]}"
fi

echo "${#TIMEOUT[@]} TIMEOUT jobs"
if ((${#TIMEOUT[@]} > 0 && VERBOSE == 1)); then
  # Timed out jobs are reported the same way as failed ones.
  handle_failed "${TIMEOUT[@]}"
fi

echo "${#RUNNING[@]} RUNNING jobs"
if ((${#RUNNING[@]} > 0 && VERBOSE == 1)); then
  handle_running "${RUNNING[@]}"
fi

echo "${#PENDING[@]} PENDING jobs"
if ((${#PENDING[@]} > 0 && VERBOSE == 1)); then
  handle_pending "${PENDING[@]}"
fi

if ((${#OTHER[@]} > 0)); then
  echo "${#OTHER[@]} jobs with untracked status"
  if ((VERBOSE == 1)); then
    handle_other "${OTHER[@]}"
  fi
fi
exit 0