-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkill-runner-procs
59 lines (52 loc) · 2.62 KB
/
kill-runner-procs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/bash
#
# Clean up orphaned processes left over from canceled jobs as an attempt to work around
# https://gitlab.com/gitlab-org/gitlab-runner/issues/3101. The approach taken herein is
# modeled after https://gitlab.com/gitlab-org/gitlab-runner/issues/3101#note_72077695
# and https://gitlab.com/gitlab-org/gitlab-runner/issues/3031#note_71905742.
#
# This script is used in conjunction with a GitLab Runner pre-build-script that emits
# a file containing information about each running job.
# See https://gitlab.invenia.ca/infrastructure/EC2v2/blob/master/gitlab/runner/gitlab-ci-runners.yml
set -e

# Usage: kill-runner-procs <token>
#   <token>  value sent as the PRIVATE-TOKEN header when querying the GitLab jobs API.
# Exits 1 (with a message on stderr) when no token is given.
TOKEN="$1"
if [ -z "$TOKEN" ]; then
  echo "No runner token provided to kill-runner-procs" >&2
  exit 1
fi

for file in /mnt/builds/running_build_*; do
  # When no job files exist the glob is left unexpanded; skip the literal pattern
  # instead of failing on a nonexistent path and aborting under `set -e`.
  [ -f "$file" ] || continue

  # The running builds produce a file called running_build_<id>, where id is the unique
  # job ID assigned by GitLab, containing the project ID and the job ID separated by a
  # space. We need these to pass to the GitLab jobs API to get the status of the job
  # we're looking at.
  PROJECT=""
  JOB=""
  # `read` returns non-zero when the file lacks a trailing newline (or is unreadable);
  # the fields it managed to parse are still assigned, so the status is ignored here
  # and the values are validated just below.
  read -r PROJECT JOB _ < "$file" || true

  # An empty or non-numeric job ID must never reach the process match further down,
  # and an empty project would produce a meaningless API URL.
  if [ -z "$PROJECT" ] || ! [[ "$JOB" =~ ^[0-9]+$ ]]; then
    echo "Skipping malformed job file: $file" >&2
    continue
  fi

  URL="https://gitlab.invenia.ca/api/v4/projects/$PROJECT/jobs/$JOB"

  # A response that jq cannot parse should not abort the whole run and leave the
  # remaining job files unprocessed; treat it as "status unknown" for this job.
  if ! STATUS=$(curl -s -g --header "PRIVATE-TOKEN: $TOKEN" "$URL" | jq -r '.status'); then
    echo "Unable to determine status of job $JOB in project $PROJECT" >&2
    STATUS=""
  fi

  # Attempt to kill any GitLab Runner processes for canceled jobs
  if [ "$STATUS" = "canceled" ]; then
    # Only numeric entries under /proc are processes (this skips e.g. /proc/self).
    for pdir in /proc/[0-9]*; do
      if [ ! -d "$pdir" ] || [ ! -f "$pdir/environ" ]; then
        continue
      fi

      # environ is a NUL-separated list of NAME=value entries. Match one whole entry
      # (-z: NUL-delimited records, -x: entire record, -F: literal) so that job 12
      # does not also match processes belonging to job 123. Read errors (process
      # exited, permission denied) are expected and silenced.
      if ! grep -qzFx -- "CI_JOB_ID=$JOB" "$pdir/environ" 2>/dev/null; then
        continue
      fi

      # Kill the process group, thereby killing the process as well. Canceling the
      # job itself via the GitLab UI kills the login shell process owned by root,
      # so we only need to kill the child processes owned by gitlab-runner, which
      # should (hopefully) all be in the same process group. Note however that by
      # killing the group, we're changing the state of the directory over which
      # we're iterating, so a process seen here may already be gone.
      PID=${pdir##*/}
      # ps pads its output with spaces; strip them. The pipeline's status is tr's,
      # so a vanished process simply yields an empty PGID.
      PGID=$(ps -o pgrp= -p "$PID" 2>/dev/null | tr -d '[:space:]') || PGID=""
      if [ -z "$PGID" ]; then
        # The process exited (most likely its group was killed in an earlier
        # iteration); there is nothing left to kill.
        continue
      fi

      pkill -9 -g "$PGID" || echo "Unable to kill process group"
    done
  fi

  # Clean up files for completed jobs
  case "$STATUS" in
    success | failed | canceled)
      rm -f -- "$file"
      ;;
  esac

  # Sleep for a couple of seconds to ensure we don't send too many API requests too quickly
  sleep 2
done