-
Notifications
You must be signed in to change notification settings - Fork 26
224 lines (198 loc) · 7.87 KB
/
stackhpc.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# End-to-end CI: deploys the appliance on an OpenStack cloud, runs the test
# playbooks, reimages the login/control nodes and re-checks the cluster.
name: Test deployment and reimage on OpenStack
on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  push:
    branches:
      - main
    # Filters are evaluated in order; a later match overrides an earlier one.
    # Start from "everything", drop paths which cannot affect a deployment,
    # then re-include the individual files which can.
    paths:
      - '**'
      - '!dev/**'
      - 'dev/setup-env.sh'
      - '!docs/**'
      - '!README.md'
      - '!.gitignore'
      # Ignore changes to other workflows, but run when this workflow itself
      # changes. Filters match full file paths, so the directory needs a glob
      # and the re-include needs the file extension.
      - '!.github/workflows/**'
      - '.github/workflows/stackhpc.yml'
  pull_request:
    # Keep this list identical to the `push` filter above.
    paths:
      - '**'
      - '!dev/**'
      - 'dev/setup-env.sh'
      - '!docs/**'
      - '!README.md'
      - '!.gitignore'
      - '!.github/workflows/**'
      - '.github/workflows/stackhpc.yml'
jobs:
  # Deploys a Slurm cluster with OpenTofu + Ansible for each OS version in the
  # matrix, runs the CI playbooks against it, reimages the login and control
  # nodes, re-checks the cluster and finally tears the infrastructure down.
  openstack:
    name: openstack-ci
    concurrency:
      # One group per workflow + branch/PR + OS version, so a newer push
      # cancels the in-progress run for the same ref and OS only.
      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}
      cancel-in-progress: true
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false  # allow other matrix jobs to continue even if one fails
      matrix:
        os_version:
          - RL8
          - RL9
    env:
      # Quoted so the value stays a string rather than a YAML boolean.
      ANSIBLE_FORCE_COLOR: 'True'
      OS_CLOUD: openstack
      TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
      CI_CLOUD: ${{ vars.CI_CLOUD }}  # default from repo settings
      TF_VAR_os_version: ${{ matrix.os_version }}
    steps:
      - uses: actions/checkout@v4

      - name: Override CI_CLOUD if PR label is present
        if: ${{ github.event_name == 'pull_request' }}
        env:
          # Label names are user-controlled text: hand them to the script via
          # the environment instead of expanding them into the script source.
          PR_LABELS_JSON: ${{ toJSON(github.event.pull_request.labels) }}
        run: |
          # Iterate over the label names, one per line
          labels=$(jq -r '.[].name' <<< "$PR_LABELS_JSON")
          echo "$labels"
          while IFS= read -r label; do
            if [[ $label == CI_CLOUD=* ]]; then
              # Keep only the value after 'CI_CLOUD=' and export it to
              # all subsequent steps of this job
              echo "CI_CLOUD=${label#CI_CLOUD=}" >> "$GITHUB_ENV"
            fi
          done <<< "$labels"

      - name: Record settings for CI cloud
        run: |
          echo "CI_CLOUD: ${CI_CLOUD}"

      - name: Setup ssh
        env:
          # Secret name is derived from the selected cloud, e.g. <CI_CLOUD>_SSH_KEY.
          SSH_PRIVATE_KEY: ${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}
        run: |
          # No `set -x` here: tracing would write the private key to the job log.
          mkdir -p ~/.ssh
          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
          chmod 0600 ~/.ssh/id_rsa
        shell: bash

      - name: Add bastion's ssh key to known_hosts
        run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
        shell: bash

      - name: Install ansible etc
        run: dev/setup-env.sh

      - name: Install OpenTofu
        uses: opentofu/setup-opentofu@v1
        with:
          tofu_version: '1.6.2'

      - name: Initialise OpenTofu
        run: tofu init
        working-directory: ${{ github.workspace }}/environments/.stackhpc/tofu

      - name: Write clouds.yaml
        env:
          # Secret name is derived from the selected cloud, e.g. <CI_CLOUD>_CLOUDS_YAML.
          CLOUDS_YAML: ${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}
        run: |
          mkdir -p ~/.config/openstack/
          # Written verbatim from the environment so quotes, '$' or backticks
          # in the secret are not interpreted by the shell.
          printf '%s\n' "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml
        shell: bash

      - name: Setup environment-specific inventory/tofu inputs
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible-playbook ansible/adhoc/generate-passwords.yml
          echo vault_testuser_password: "$TESTUSER_PASSWORD" > "$APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml"
        env:
          TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

      - name: Provision nodes using fat image
        id: provision_servers
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          cd "$APPLIANCES_ENVIRONMENT_ROOT/tofu"
          tofu apply -auto-approve -var-file="${CI_CLOUD}.tfvars"

      # Only runs when the provisioning step itself failed, so a partially
      # created cluster is not left behind.
      - name: Delete infrastructure if provisioning failed
        if: failure() && steps.provision_servers.outcome == 'failure'
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          cd "$APPLIANCES_ENVIRONMENT_ROOT/tofu"
          tofu destroy -auto-approve -var-file="${CI_CLOUD}.tfvars"

      - name: Configure cluster
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible all -m wait_for_connection
          ansible-playbook -v ansible/site.yml
          ansible-playbook -v ansible/ci/check_slurm.yml

      - name: Run MPI-based tests
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible-playbook -vv ansible/adhoc/hpctests.yml

      # - name: Run EESSI tests
      #   run: |
      #     . venv/bin/activate
      #     . environments/.stackhpc/activate
      #     ansible-playbook -vv ansible/ci/check_eessi.yml

      - name: Confirm Open Ondemand is up (via SOCKS proxy)
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate

          # load ansible variables into shell:
          ansible-playbook ansible/ci/output_vars.yml \
            -e output_vars_hosts=openondemand \
            -e output_vars_path="$APPLIANCES_ENVIRONMENT_ROOT/vars.txt" \
            -e output_vars_items=bastion_ip,bastion_user,openondemand_servername
          source "$APPLIANCES_ENVIRONMENT_ROOT/vars.txt"

          # setup ssh proxying:
          sudo apt-get --yes install proxychains
          echo proxychains installed
          ssh -v -fN -D 9050 "${bastion_user}@${bastion_ip}"
          echo port 9050 forwarded

          # check OOD server returns 200:
          statuscode=$(proxychains wget \
            --quiet \
            --spider \
            --server-response \
            --no-check-certificate \
            --http-user=testuser \
            --http-password="${TESTUSER_PASSWORD}" "https://${openondemand_servername}" \
            2>&1)
          (echo "$statuscode" | grep "200 OK") || (echo "$statuscode" && exit 1)
        env:
          TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

      # - name: Build environment-specific compute image
      #   id: packer_build
      #   run: |
      #     . venv/bin/activate
      #     . environments/.stackhpc/activate
      #     cd packer/
      #     packer init
      #     PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
      #     ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs

      # - name: Test reimage of compute nodes to new environment-specific image (via slurm)
      #   run: |
      #     . venv/bin/activate
      #     . environments/.stackhpc/activate
      #     ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
      #     ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
      #     ansible-playbook -v ansible/ci/check_slurm.yml

      - name: Test reimage of login and control nodes (via rebuild adhoc)
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
          ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
          ansible-playbook -v ansible/site.yml
          ansible-playbook -v ansible/ci/check_slurm.yml

      - name: Check sacct state survived reimage
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml

      - name: Check MPI-based tests are shown in Grafana
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          ansible-playbook -vv ansible/ci/check_grafana.yml

      # Runs on success or cancellation only; after a failed test step the
      # cluster is not destroyed here.
      # NOTE(review): presumably deliberate, to allow debugging - confirm.
      - name: Delete infrastructure
        if: ${{ success() || cancelled() }}
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate
          cd "$APPLIANCES_ENVIRONMENT_ROOT/tofu"
          tofu destroy -auto-approve -var-file="${CI_CLOUD}.tfvars"

      # - name: Delete images
      #   run: |
      #     . venv/bin/activate
      #     . environments/.stackhpc/activate
      #     ansible-playbook -vv ansible/ci/delete_images.yml