generated from actions/hello-world-docker-action
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathentrypoint.sh
executable file
·430 lines (377 loc) · 16 KB
/
entrypoint.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#!/usr/bin/env bash
#
# This file is part of the Kepler project
# See LICENSE file for license information.
# shellcheck disable=SC1000-SC9999
set -o pipefail
# ---------------------------------------------------------------------------
# Runtime configuration
#
# Each setting is taken from the corresponding INPUT_* environment variable
# and falls back to the default shown here when that variable is unset or
# empty. DEBUG is the exception: it is read from DEBUG itself.
# ---------------------------------------------------------------------------

# --- AWS credentials and placement ---
AWS_ACCESS_KEY_ID="${INPUT_AWS_ACCESS_KEY_ID:-}"
AWS_SECRET_ACCESS_KEY="${INPUT_AWS_SECRET_ACCESS_KEY:-}"
REGION="${INPUT_AWS_REGION:-us-east-2}" # Region in which the instance is launched
SECURITY_GROUP_ID="${INPUT_SECURITY_GROUP_ID:-}"
KEY_NAME="${INPUT_KEY_NAME:-}" # Name of the key pair to attach to the instance
KEY_NAME_OPT=""                # AWS CLI option derived from KEY_NAME further down

# --- Instance shape ---
# Default AMI: Ubuntu Server 20.04 LTS (HVM), SSD Volume Type, x86_64
AMI_ID="${INPUT_AMI_ID:-ami-0e4d0bb9670ea8db0}"
# Instance type naming, e.g. c6i.metal: "c" = compute, "6" = 6th generation,
# "i" = Intel, "metal" = bare metal
INSTANCE_TYPE="${INPUT_INSTANCE_TYPE:-t2.micro}"
ROOT_VOLUME_SIZE="${INPUT_ROOT_VOLUME_SIZE:-20}"       # Root volume size in GB
SPOT_INSTANCE_ONLY="${INPUT_SPOT_INSTANCE_ONLY:-true}" # "true": never fall back to on-demand
INSTANCE_ID="${INPUT_INSTANCE_ID:-}"                   # ID of the created instance
# Hours after which the instance is terminated (0 disables auto-termination)
AUTO_TERMINATE_HOURS="${INPUT_AUTO_TERMINATE_HOURS:-6}"

# --- GitHub runner registration ---
GITHUB_TOKEN="${INPUT_GITHUB_TOKEN:-}"
GITHUB_REPO="${INPUT_GITHUB_REPO:-sustainable-computing-io/kepler-model-server}"
RUNNER_NAME="${INPUT_RUNNER_NAME:-}" # Name of the runner to create

# --- Optional S3 bucket used to store the model ---
CREATE_S3_BUCKET="${INPUT_CREATE_S3_BUCKET:-false}" # Whether to create the bucket
BUCKET_NAME="${INPUT_BUCKET_NAME:-}"                # Name of the S3 bucket

# --- Diagnostics ---
DEBUG="${DEBUG:-false}" # "true" turns on debug messages and shell tracing
if [ "$DEBUG" == "true" ]; then
  set -x
fi

# GITHUB_REPO has the form "<organization>/<repository>"; split it into its
# two components.
ORG_NAME=$(cut -d'/' -f1 <<< "$GITHUB_REPO")
REPO_NAME=$(cut -d'/' -f2 <<< "$GITHUB_REPO")
#######################################
# Print a debug message to stderr when debug mode is enabled.
# Globals:   DEBUG (read) - messages are printed only when it equals "true"
# Arguments: the message words; they are joined with single spaces
# Outputs:   "DEBUG: <message>" on stderr
# Returns:   always 0, so that a disabled debug call never becomes the
#            (failing) exit status of the function that ends with it
#######################################
debug() {
  if [ "${DEBUG:-false}" == "true" ]; then
    echo "DEBUG: $*" 1>&2
  fi
  return 0
}
#######################################
# Print a message to stderr.
# Arguments: the message words; they are joined with single spaces
# Outputs:   the message followed by a newline, on stderr
#######################################
echoerr() {
  # printf instead of echo: echo would treat a message such as "-n" or "-e"
  # as an option and print nothing useful.
  printf '%s\n' "$*" 1>&2
}
# Turn the optional key pair name into the AWS CLI option string that the
# run-instances calls append to their command line. Without a key name the
# option string keeps its current value.
if [ -n "$KEY_NAME" ]; then
  KEY_NAME_OPT="--key-name $KEY_NAME"
else
  debug "KEY_NAME is not set"
fi
#######################################
# Request a self-hosted runner registration token for the repository from the
# GitHub REST API and store it in RUNNER_TOKEN.
# https://docs.github.com/en/rest/reference/actions#create-a-registration-token-for-a-repository
# Globals:   GITHUB_TOKEN, ORG_NAME, REPO_NAME (read); RUNNER_TOKEN (written)
# Arguments: none
# Outputs:   error messages on stderr
# Exits:     with status 1 when GITHUB_TOKEN is empty or no usable token
#            comes back from the API
#######################################
get_github_runner_token() {
  local response

  if [ -z "$GITHUB_TOKEN" ]; then
    echoerr "GITHUB_TOKEN is not set"
    exit 1
  fi

  response=$(curl -s -X POST \
    -H "Authorization: token ${GITHUB_TOKEN}" \
    -H "Accept: application/vnd.github.v3+json" \
    "https://api.github.com/repos/${ORG_NAME}/${REPO_NAME}/actions/runners/registration-token")

  # "// empty" turns a missing/null .token (e.g. an API error document) into
  # an empty string instead of the literal text "null".
  RUNNER_TOKEN=$(printf '%s' "$response" | jq -r '.token // empty')

  # The token is a credential: log only its length, never its value.
  debug "runner token length: ${#RUNNER_TOKEN}"

  # Reject empty and implausibly short values (fewer than 5 characters).
  if [ "${#RUNNER_TOKEN}" -lt 5 ]; then
    echoerr "Failed to get runner token"
    exit 1
  fi
}
#######################################
# Verify that both AWS credential variables are populated and export them so
# that child processes (the aws CLI) can see them.
# Globals:   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (read, exported)
# Arguments: none
# Outputs:   "<NAME> is not set" on stderr for the first empty variable
# Exits:     with status 1 when either variable is empty
#######################################
prep_aws_cred() {
  local cred_var

  # Checked in this order so the access key id is reported first.
  for cred_var in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
    if [ -z "${!cred_var}" ]; then
      echoerr "${cred_var} is not set"
      exit 1
    fi
  done

  export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY
}
#######################################
# Validate and complete the settings needed before an instance is created.
# Globals:   SECURITY_GROUP_ID, CREATE_S3_BUCKET, REPO_NAME, INSTANCE_TYPE (read)
#            BUCKET_NAME, RUNNER_NAME (filled in when empty)
# Arguments: none
# Exits:     with status 1 when SECURITY_GROUP_ID is empty (the credential
#            and token helpers called here may exit as well)
#######################################
prep_create() {
  local stamp_format="%Y%m%d%H%M%S"

  # A security group is mandatory.
  [ -n "$SECURITY_GROUP_ID" ] || {
    echoerr "SECURITY_GROUP_ID is not set"
    exit 1
  }

  # Derive a bucket name only when a bucket was requested and none was given.
  if [ "$CREATE_S3_BUCKET" == "true" ] && [ -z "$BUCKET_NAME" ]; then
    BUCKET_NAME="${REPO_NAME}-${INSTANCE_TYPE}-$(date +"$stamp_format")"
  fi

  prep_aws_cred
  get_github_runner_token

  # Fall back to a timestamped runner name.
  if [ -z "$RUNNER_NAME" ]; then
    RUNNER_NAME="self-hosted-runner-$(date +"$stamp_format")"
  fi
}
#######################################
# Schedule automatic termination of an instance through a CloudWatch Events
# (EventBridge) rule named "terminate-<instance id>".
# Globals:   REGION (read)
#            AWS_ACCOUNT_ID (read; looked up through STS when unset or empty)
# Arguments: $1 - ID of the instance to terminate
#            $2 - hours from now; 0 disables scheduling
# Outputs:   whatever the aws CLI prints; diagnostics on stderr
# Returns:   0 when nothing had to be scheduled, 1 when the hours value is
#            not a non-negative integer or the account ID cannot be determined
#######################################
schedule_termination() {
  local instance_id=$1
  local hours=$2
  local account_id termination_time
  local rule_name="terminate-${instance_id}"
  local role_name="AutoTerminateEC2Role"

  # A non-numeric value would make the integer comparison below error out.
  case "$hours" in
    '' | *[!0-9]*)
      echoerr "Invalid auto-termination hours: ${hours}"
      return 1
      ;;
  esac
  if [ "$hours" -le 0 ]; then
    return 0
  fi

  # The ARNs below embed the account ID. Resolve it before any resource is
  # created so that a lookup failure does not leave a dangling rule behind.
  account_id="${AWS_ACCOUNT_ID:-}"
  if [ -z "$account_id" ]; then
    account_id=$(aws sts get-caller-identity --query Account --output text)
  fi
  if [ -z "$account_id" ]; then
    echoerr "Failed to determine AWS account ID; auto-termination not scheduled"
    return 1
  fi

  # Termination time in UTC (relies on GNU date's -d option).
  termination_time=$(date -u -d "+${hours} hours" +"%Y-%m-%dT%H:%M:%SZ")

  # Create the event rule.
  # NOTE(review): EventBridge *rule* schedule expressions are documented as
  # cron()/rate(); the one-time at() form appears to belong to EventBridge
  # Scheduler. Confirm that this call is accepted.
  aws events put-rule \
    --name "$rule_name" \
    --schedule-expression "at(${termination_time})" \
    --state ENABLED \
    --region "${REGION}"

  # Create the IAM role the rule target assumes, unless it already exists.
  # get-role is used purely as an existence probe, so its output is dropped.
  if ! aws iam get-role --role-name "$role_name" > /dev/null 2>&1; then
    aws iam create-role \
      --role-name "$role_name" \
      --assume-role-policy-document '{
        "Version": "2012-10-17",
        "Statement": [{
          "Effect": "Allow",
          "Principal": {
            "Service": "events.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }]
      }'

    # Allow the role to terminate EC2 instances.
    aws iam put-role-policy \
      --role-name "$role_name" \
      --policy-name "TerminateEC2Policy" \
      --policy-document '{
        "Version": "2012-10-17",
        "Statement": [{
          "Effect": "Allow",
          "Action": "ec2:TerminateInstances",
          "Resource": "*"
        }]
      }'
  fi

  # Attach the target to the rule, in the same region the rule was created in.
  # NOTE(review): the target Arn points at the rule itself and
  # RunCommandParameters carries a "RunCommand" list; the PutTargets reference
  # documents RunCommandTargets instead. Verify this payload against the API
  # before relying on auto-termination.
  aws events put-targets \
    --rule "$rule_name" \
    --region "${REGION}" \
    --targets "[{
      \"Id\": \"TerminateInstance\",
      \"Arn\": \"arn:aws:events:${REGION}:${account_id}:rule/${rule_name}\",
      \"RoleArn\": \"arn:aws:iam::${account_id}:role/${role_name}\",
      \"Input\": \"{\\\"instance_ids\\\": [\\\"${instance_id}\\\"]}\",
      \"RunCommandParameters\": {
        \"RunCommand\": [
          \"aws\",
          \"ec2\",
          \"terminate-instances\",
          \"--instance-ids\",
          \"${instance_id}\",
          \"--region\",
          \"${REGION}\"
        ]
      }
    }]"

  debug "Instance $instance_id scheduled for termination at $termination_time UTC"
}
#######################################
# Write the instance user data (bootstrap) script to ./user_data.sh. The
# generated script installs curl and jq, downloads the GitHub Actions runner,
# registers it against the repository and starts it.
# Globals:   INSTANCE_TYPE, RUNNER_NAME, RUNNER_TOKEN, GITHUB_REPO (read;
#            their values are baked into the generated script)
# Arguments: none
# Outputs:   creates/overwrites user_data.sh in the current directory
#######################################
create_uesr_data() {
  # Encode user data so it can be passed as an argument to the AWS CLI
  # FIXME: it appears that the user data is not passed to the instance
  # ENCODED_USER_DATA=$(echo "$USER_DATA" | base64 | tr -d \\n)

  # The heredoc delimiter is deliberately unquoted so the settings above are
  # substituted while the script is generated. Everything that has to reach
  # the instance verbatim is escaped with a backslash - including the "$"
  # signs inside the commented-out kernel module lines, which would otherwise
  # be expanded (and the apt pipeline executed) on the machine running this
  # script.
  cat << EOF > user_data.sh
#!/bin/bash
export RUNNER_LABEL="${INSTANCE_TYPE}"
if command -v yum &> /dev/null; then
# add rhel to the label
export RUNNER_LABEL="rhel,${INSTANCE_TYPE}"
yum install -y curl jq libicu
else
# add ubuntu to the label
export RUNNER_LABEL="ubuntu,${INSTANCE_TYPE}"
apt-get update
apt-get install -y curl jq
fi
# install the latest kernel modeules and enable rapl. it doesn seem uname -r works in user data script...
# export KERNEL_VERSION=\$(apt list --installed 2>&1 | grep 'linux-image-' | awk -F'/' '{print \$1}' |grep "\." | cut -d'-' -f3-)
# echo "installing kernel modules for version \$KERNEL_VERSION"
# apt install linux-modules-\${KERNEL_VERSION} linux-modules-extra-\${KERNEL_VERSION} -y
# modprobe intel_rapl_common
# Create a folder
mkdir /tmp/actions-runner && cd /tmp/actions-runner
# Download the latest runner package
curl -o actions-runner-linux-x64-2.311.0.tar.gz -L "https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz"
# Optional: Validate the hash
# echo "29fc8cf2dab4c195bb147384e7e2c94cfd4d4022c793b346a6175435265aa278 actions-runner-linux-x64-2.311.0.tar.gz" | shasum -a 256 -c
# Extract the installer
tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
# Create the runner and start the configuration experience
# there is a bug in the github instruction. The config script does not work with sudo unless we set RUNNER_ALLOW_RUNASROOT=true
export RUNNER_ALLOW_RUNASROOT=true
./config.sh --replace --unattended --name "${RUNNER_NAME}" --url "https://github.com/${GITHUB_REPO}" --token "${RUNNER_TOKEN}" --labels "\${RUNNER_LABEL}"
# Last step, run it!
./run.sh
EOF
}
#######################################
# Request a single spot instance, bootstrapped with ./user_data.sh.
# Globals:   AMI_ID, INSTANCE_TYPE, SECURITY_GROUP_ID, REGION,
#            ROOT_VOLUME_SIZE, KEY_NAME_OPT (read)
#            INSTANCE_JSON (written: stdout of "aws ec2 run-instances")
# Arguments: none
# Outputs:   aws CLI diagnostics on stderr
# Returns:   the exit status of the aws CLI
#######################################
run_spot_instance() {
  local block_devices
  block_devices='[{"DeviceName": "/dev/sda1","Ebs": { "VolumeSize": '"${ROOT_VOLUME_SIZE}"', "DeleteOnTermination": true } }]'

  # stderr is deliberately NOT captured: INSTANCE_JSON must hold only the JSON
  # document (or nothing on failure) so that it can be parsed, while CLI
  # errors stay visible in the job log.
  #
  # SECURITY_GROUP_ID and KEY_NAME_OPT are intentionally left unquoted:
  # KEY_NAME_OPT is either empty or the two words "--key-name <name>", and a
  # space-separated list of security group IDs keeps working as before.
  # shellcheck disable=SC2086
  INSTANCE_JSON=$(aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --count 1 \
    --instance-type "$INSTANCE_TYPE" \
    --security-group-ids $SECURITY_GROUP_ID \
    --region "${REGION}" \
    --instance-market-options 'MarketType=spot' \
    --block-device-mappings "$block_devices" \
    $KEY_NAME_OPT \
    --user-data file://user_data.sh)
}
#######################################
# Launch a single on-demand instance, bootstrapped with ./user_data.sh.
# Globals:   AMI_ID, INSTANCE_TYPE, SECURITY_GROUP_ID, REGION,
#            ROOT_VOLUME_SIZE, KEY_NAME_OPT (read)
#            INSTANCE_JSON (written: stdout of "aws ec2 run-instances")
# Arguments: none
# Outputs:   aws CLI diagnostics on stderr
# Returns:   the exit status of the aws CLI
#######################################
run_on_demand_instance() {
  local block_devices
  block_devices='[{"DeviceName": "/dev/sda1","Ebs": { "VolumeSize": '"${ROOT_VOLUME_SIZE}"', "DeleteOnTermination": true } }]'

  # Single-valued settings are quoted so they are never globbed or split.
  # SECURITY_GROUP_ID and KEY_NAME_OPT are intentionally left unquoted:
  # KEY_NAME_OPT is either empty or the two words "--key-name <name>", and a
  # space-separated list of security group IDs keeps working as before.
  # shellcheck disable=SC2086
  INSTANCE_JSON=$(aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --count 1 \
    --instance-type "$INSTANCE_TYPE" \
    --security-group-ids $SECURITY_GROUP_ID \
    --region "${REGION}" \
    --block-device-mappings "$block_devices" \
    $KEY_NAME_OPT \
    --user-data file://user_data.sh)
}
#######################################
# Create the S3 bucket when bucket creation was requested.
# Globals:   CREATE_S3_BUCKET, BUCKET_NAME, REGION (read)
# Arguments: none
# Returns:   0 when no bucket was requested, otherwise the aws CLI status
#######################################
create_s3_bucket() {
  local -a location_args=()

  if [ "$CREATE_S3_BUCKET" != "true" ]; then
    return 0
  fi

  # us-east-1 is the default bucket location and S3 rejects an explicit
  # LocationConstraint for it; every other region needs one.
  if [ "${REGION}" != "us-east-1" ]; then
    location_args=(--create-bucket-configuration "LocationConstraint=${REGION}")
  fi

  aws s3api create-bucket --bucket "${BUCKET_NAME}" --region "${REGION}" "${location_args[@]}"
}
#######################################
# Delete the S3 bucket named by BUCKET_NAME; a no-op when no name is set.
# Globals:   BUCKET_NAME, REGION (read)
# Arguments: none
# Returns:   0 when there is nothing to delete, otherwise the aws CLI status
#######################################
delete_s3_bucket() {
  if [ -n "$BUCKET_NAME" ]; then
    aws s3api delete-bucket --bucket "${BUCKET_NAME}" --region "${REGION}"
  fi
}
#######################################
# Terminate the instance and clean up what was created for it: the
# "terminate-<instance id>" event rule with its target, the shared IAM role
# (only when no termination rule is left) and the S3 bucket.
# Globals:   INSTANCE_ID, REGION (read)
# Arguments: none
# Returns:   the status of delete_s3_bucket
#######################################
terminate_instance() {
  # The rule name is derived from the global INSTANCE_ID; this function has
  # no instance argument of its own.
  local rule_name="terminate-${INSTANCE_ID}"
  local role_name="AutoTerminateEC2Role"
  local remaining_rules

  # Remove the event rule and its target if the rule exists. A rule cannot be
  # deleted while it still has targets, hence the order.
  if aws events list-rules --name-prefix "$rule_name" --region "${REGION}" 2> /dev/null \
    | grep -qF -- "$rule_name"; then
    debug "Removing CloudWatch Event targets for rule $rule_name"
    aws events remove-targets --rule "$rule_name" --ids "TerminateInstance" --region "${REGION}"
    debug "Deleting CloudWatch Event rule $rule_name"
    aws events delete-rule --name "$rule_name" --region "${REGION}"
  fi

  # The IAM role is shared by every "terminate-*" rule. It is referenced by
  # the rule targets, not by the rules themselves, so "still in use" is
  # decided by whether any termination rule is left.
  # NOTE(review): only REGION is inspected; rules in other regions that use
  # the same (global) role would not be noticed - confirm this is acceptable.
  if aws iam get-role --role-name "$role_name" > /dev/null 2>&1; then
    remaining_rules=$(aws events list-rules --name-prefix "terminate-" --region "${REGION}" \
      --query 'Rules[].Name' --output text)
    if [ -z "$remaining_rules" ]; then
      debug "Removing IAM role policy"
      aws iam delete-role-policy --role-name "$role_name" --policy-name "TerminateEC2Policy"
      debug "Deleting IAM role $role_name"
      aws iam delete-role --role-name "$role_name"
    else
      debug "IAM role $role_name is still in use by other rules, skipping deletion"
    fi
  fi

  # Terminate instance
  aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --region "${REGION}"

  # Delete S3 bucket
  delete_s3_bucket
}
#######################################
# Look up the public IP address of the instance.
# Globals:   INSTANCE_ID, REGION (read); INSTANCE_IP (written)
# Arguments: none
# Returns:   the exit status of the aws CLI
#######################################
get_instance_ip() {
  local ip_query='Reservations[0].Instances[0].PublicIpAddress'

  INSTANCE_IP=$(
    aws ec2 describe-instances \
      --instance-ids "$INSTANCE_ID" \
      --region "${REGION}" \
      --query "$ip_query" \
      --output text
  )
}
#######################################
# Launch the runner instance and publish its details as action outputs.
# Tries a spot instance up to three times and, unless SPOT_INSTANCE_ONLY is
# "true", falls back to an on-demand instance.
# Globals:   SPOT_INSTANCE_ONLY, AUTO_TERMINATE_HOURS, REGION, RUNNER_NAME,
#            BUCKET_NAME, INSTANCE_JSON, INSTANCE_IP (read)
#            GITHUB_OUTPUT (read; outputs go to stdout when it is unset)
#            INSTANCE_ID (written)
# Arguments: none
# Outputs:   appends instance_id, runner_name, instance_ip, bucket_name and,
#            with auto-termination enabled, termination_time to GITHUB_OUTPUT
# Exits:     with status 1 when no instance could be created or the instance
#            never reaches the "status ok" state
#######################################
create_runner() {
  local attempt
  local auto_terminate="false"
  local output_file="${GITHUB_OUTPUT:-/dev/stdout}"
  # "// empty" yields an empty string instead of the text "null" when the
  # response carries no instance ID.
  local id_filter='.Instances[0].InstanceId // empty'

  # Auto-termination is enabled only for a positive integer number of hours.
  if [[ "$AUTO_TERMINATE_HOURS" =~ ^[0-9]+$ ]] && [ "$AUTO_TERMINATE_HOURS" -gt 0 ]; then
    auto_terminate="true"
  fi

  # GitHub Runner setup script
  create_uesr_data

  # Try the spot request up to three times.
  for attempt in 1 2 3; do
    run_spot_instance
    INSTANCE_ID=$(printf '%s' "$INSTANCE_JSON" | jq -r "$id_filter")
    if [ -n "$INSTANCE_ID" ]; then
      break
    fi
    debug "Failed to create instance, retrying"
  done

  # No spot capacity: fall back to on-demand when that is allowed.
  if [ -z "$INSTANCE_ID" ]; then
    if [ "$SPOT_INSTANCE_ONLY" == "true" ]; then
      rm -f user_data.sh
      echoerr "SPOT_INSTANCE_ONLY is set to true, exiting"
      exit 1
    fi
    debug "Failed to create spot instance, creating on-demand instance instead"
    run_on_demand_instance
    INSTANCE_ID=$(printf '%s' "$INSTANCE_JSON" | jq -r "$id_filter")
    if [ -z "$INSTANCE_ID" ]; then
      rm -f user_data.sh
      echoerr "Failed to create on-demand instance"
      exit 1
    fi
  fi

  # The user data file embeds the runner registration token and is only
  # needed for the launch request, so it is removed on every path.
  rm -f user_data.sh

  # Schedule auto-termination if enabled
  if [ "$auto_terminate" == "true" ]; then
    schedule_termination "$INSTANCE_ID" "$AUTO_TERMINATE_HOURS"
  fi

  # Wait for instance to become ready
  if ! aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID" --region "${REGION}"; then
    echoerr "Instance failed to become ready. Terminating instance."
    terminate_instance
    exit 1
  fi

  create_s3_bucket
  get_instance_ip

  # Publish the results as GitHub Actions step outputs.
  {
    echo "instance_id=$INSTANCE_ID"
    echo "runner_name=$RUNNER_NAME"
    echo "instance_ip=$INSTANCE_IP"
    echo "bucket_name=$BUCKET_NAME"
    if [ "$auto_terminate" == "true" ]; then
      echo "termination_time=$(date -u -d "+${AUTO_TERMINATE_HOURS} hours" +"%Y-%m-%dT%H:%M:%SZ")"
    fi
  } >> "$output_file"
}
#######################################
# Print the names of the self-hosted runners registered for the repository,
# one per line.
# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28
# Globals:   GITHUB_TOKEN, ORG_NAME, REPO_NAME (read)
# Arguments: none
# Outputs:   runner names on stdout
#######################################
list_runner() {
  local runners_url="https://api.github.com/repos/${ORG_NAME}/${REPO_NAME}/actions/runners"
  local -a request_headers=(
    -H "Authorization: Bearer ${GITHUB_TOKEN}"
    -H "Accept: application/vnd.github.v3+json"
    -H "X-GitHub-Api-Version: 2022-11-28"
  )

  curl -s -X GET "${request_headers[@]}" "$runners_url" \
    | jq -r '.runners[] | .name'
}
#######################################
# Remove the runner named RUNNER_NAME from the repository.
# The REST API deletes runners by numeric ID only, so the runner list is
# fetched first and the ID is looked up by name.
# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28
# Globals:   GITHUB_TOKEN, ORG_NAME, REPO_NAME, RUNNER_NAME (read)
#            RUNNERS, RUNNER_ID (written)
# Arguments: none
# Outputs:   the DELETE response on stdout; diagnostics on stderr
# Returns:   0 when no runner with that name is listed (nothing to remove),
#            otherwise the exit status of the DELETE request
#######################################
unregister_runner() {
  local runners_url="https://api.github.com/repos/${ORG_NAME}/${REPO_NAME}/actions/runners"

  RUNNERS=$(curl -s -X GET -H "Authorization: Bearer ${GITHUB_TOKEN}" \
    -H "Accept: application/vnd.github.v3+json" \
    -H "X-GitHub-Api-Version: 2022-11-28" \
    "$runners_url")

  # The name is handed to jq as data (--arg) rather than spliced into the
  # filter text, so quotes or spaces in it cannot break or alter the filter.
  # ".runners[]?" tolerates a response without a runner list.
  RUNNER_ID=$(printf '%s' "$RUNNERS" \
    | jq -r --arg name "$RUNNER_NAME" '.runners[]? | select(.name == $name) | .id')
  debug "runner id: " "$RUNNER_ID"

  # Without an ID the DELETE would be sent to the collection URL; skip it.
  if [ -z "$RUNNER_ID" ]; then
    echoerr "Runner ${RUNNER_NAME} not found, nothing to unregister"
    return 0
  fi

  curl -L -X DELETE -H "Authorization: Bearer ${GITHUB_TOKEN}" \
    -H "Accept: application/vnd.github.v3+json" \
    -H "X-GitHub-Api-Version: 2022-11-28" \
    "${runners_url}/${RUNNER_ID}"
}
# Entry point.
# The action comes from the INPUT_ACTION environment variable passed by the
# workflow; when that is not set, the first command line argument is used
# instead, which allows local testing.
ACTION="${INPUT_ACTION:-$1}"
if [ -z "$ACTION" ]; then
  echoerr "ACTION is not set"
  exit 1
fi

case "$ACTION" in
  create)
    # Start from a clean slate: any INSTANCE_ID / RUNNER_NAME supplied by the
    # caller is discarded before the instance and runner are created.
    INSTANCE_ID=""
    RUNNER_NAME=""
    prep_create
    create_runner
    ;;
  terminate)
    if [ -z "${INSTANCE_ID}" ]; then
      echoerr "Instance ID is not set"
      exit 1
    fi
    prep_aws_cred
    terminate_instance
    ;;
  unregister)
    if [ -z "${RUNNER_NAME}" ]; then
      echoerr "Runner name is not set"
      exit 1
    fi
    unregister_runner
    ;;
  list)
    list_runner
    ;;
  *)
    # Quoted so an odd value (spaces, glob characters) is reported verbatim.
    echoerr "Invalid action:${ACTION}"
    echoerr "Usage: $0 {create|terminate|unregister|list}"
    exit 1
    ;;
esac