Skip to content

Commit

Permalink
Several updates and fixes (version, process, doc)
Browse files Browse the repository at this point in the history
fix typo on SLURM_VERSION SLURM_VERSION
update slurm to version 24.05
add documentation on how to create openssl private key for the ssh server
add missing cgroup.conf and disable cgroup usage

from: @akram
moved patch over from old kincl/openshift-hpc-batch-demo repo
  • Loading branch information
kincl committed Aug 30, 2024
1 parent 39f5408 commit c0b9580
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 16 deletions.
32 changes: 29 additions & 3 deletions slurm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ oc apply -k manifests/
oc -n slurm-system create secret generic munge-key --from-file=munge.key=<(dd status=none if=/dev/urandom bs=1 count=128)
```

### Build images
### Create image builds and build images

```
oc new-build --name munge --binary
Expand Down Expand Up @@ -54,13 +54,39 @@ sleep 10

## containerssh

TODO: replace authconfig python with OPA example: https://github.com/ContainerSSH/examples/tree/main/opa/containerssh
```
oc new-project containerssh
```

Set your SSH public key in the authentication app (the key is currently hardcoded in `app.py`):

```
sed -i -e "s#ssh-rsa.*#$(cat ~/.ssh/id_rsa.pub)#" manifests/containerssh/authconfig/app.py
```

TODO: Automate or script this part
Generate host key and create a secret for it

```
openssl genrsa -out mykey.pem 4096
openssl rsa -in mykey.pem -pubout > mykey.pub
oc create secret generic -n containerssh containerssh-hostkey --from-file=host.key=mykey.pem
```
```
$ oc apply -k manifests/containerssh
$ openssl genrsa | kubectl create secret generic -n containerssh containerssh-hostkey --from-file=host.key=/dev/stdin
```

## Checks

Check that the pods in the `slurm-system` and `containerssh` projects are up and running:

```
oc get pods -n containerssh
oc get pods -n slurm-system
```

You are now ready for the demo and can return to [the main demo README](../README.md).

### Login

```
Expand All @@ -70,4 +96,4 @@ containerssh LoadBalancer 172.30.147.21 a9cf4fabd2d9b49659d8af106ea30536-1
$ ssh -o HostKeyAlgorithms=+ssh-rsa -o PubkeyAcceptedAlgorithms=+ssh-rsa -p 2222 jason@a9cf4fabd2d9b49659d8af106ea30536-1220395463.us-east-2.elb.amazonaws.com
bash-5.1$
```
```
2 changes: 1 addition & 1 deletion slurm/images/slurm/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM ghcr.io/naps-product-sa/openshift-batch/munge:latest as build
ARG SLURM_VERSION=23.02.4
ARG SLURM_VERSION=24.05.3

RUN dnf install --enablerepo=crb -y xz gcc procps-ng bzip2 perl git autoconf automake libtool diffutils jansson \
openssl-devel bzip2-devel zlib-devel jansson-devel http-parser-devel json-c-devel perl-devel
Expand Down
1 change: 1 addition & 0 deletions slurm/manifests/configs/cgroup.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CgroupPlugin=disabled
2 changes: 1 addition & 1 deletion slurm/manifests/configs/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ SlurmctldDebug=debug
#SlurmctldLogFile=
SlurmctldParameters=cloud_reg_addrs
SlurmdDebug=debug
#SlurmdLogFile=
SlurmdLogFile=/tmp/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
Expand Down
8 changes: 4 additions & 4 deletions slurm/manifests/containerssh/configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ kubernetes:
serviceAccountName: slurm
initContainers:
- name: fix-munge-paths
image: ghcr.io/naps-product-sa/openshift-batch/munge:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/munge:latest
command:
- sh
- -c
Expand All @@ -40,7 +40,7 @@ kubernetes:
- name: munge-socket
mountPath: /run/munge
- name: add-jobs
image: ghcr.io/naps-product-sa/openshift-batch/login:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/login:latest
command:
- sh
- -c
Expand Down Expand Up @@ -69,7 +69,7 @@ kubernetes:
mountPath: /home
containers:
- name: shell
image: ghcr.io/naps-product-sa/openshift-batch/login:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/login:latest
securityContext:
runAsUser: 1000
runAsGroup: 1000
Expand All @@ -82,7 +82,7 @@ kubernetes:
- name: shared
mountPath: /home
- name: munge
image: ghcr.io/naps-product-sa/openshift-batch/munge:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/munge:latest
command:
- /sbin/munged
- -F
Expand Down
5 changes: 5 additions & 0 deletions slurm/manifests/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,8 @@ configMapGenerator:
- configs/slurm.conf
options:
disableNameSuffixHash: true
labels:
app: slurm
- name: cgroup-conf
files:
- configs/cgroup.conf
12 changes: 9 additions & 3 deletions slurm/manifests/statefulset-compute.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ spec:
topologyKey: "kubernetes.io/hostname"
initContainers:
- name: fix-munge-paths
image: ghcr.io/naps-product-sa/openshift-batch/munge:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/munge:latest
command:
- sh
- -c
Expand All @@ -75,7 +75,7 @@ spec:
mountPath: /run/munge
containers:
- name: slurm
image: ghcr.io/naps-product-sa/openshift-batch/slurm:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/slurm:latest
command:
- /sbin/slurmd
- -D
Expand Down Expand Up @@ -104,10 +104,13 @@ spec:
- name: slurm-conf
mountPath: /etc/slurm.conf
subPath: slurm.conf
- name: cgroup-conf
mountPath: /etc/cgroup.conf
subPath: cgroup.conf
- name: shared
mountPath: /home
- name: munge
image: ghcr.io/naps-product-sa/openshift-batch/munge:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/munge:latest
command:
- /bin/bash
- /scripts/munged-container.sh
Expand All @@ -131,6 +134,9 @@ spec:
- name: slurm-conf
configMap:
name: slurm-conf
- name: cgroup-conf
configMap:
name: cgroup-conf

- name: shared
persistentVolumeClaim:
Expand Down
14 changes: 10 additions & 4 deletions slurm/manifests/statefulset-head.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ spec:
fsGroupChangePolicy: Always
initContainers:
- name: fix-munge-paths
image: ghcr.io/naps-product-sa/openshift-batch/munge:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/munge:latest
command:
- sh
- -c
Expand All @@ -49,7 +49,7 @@ spec:
mountPath: /run/munge
containers:
- name: slurm
image: ghcr.io/naps-product-sa/openshift-batch/slurm:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/slurm:latest
command:
- /sbin/slurmctld
- -D
Expand All @@ -67,7 +67,7 @@ spec:
- name: shared
mountPath: /home
- name: exporter
image: ghcr.io/naps-product-sa/openshift-batch/slurm:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/slurm:latest
command:
- /usr/bin/prometheus-slurm-exporter
securityContext:
Expand All @@ -79,8 +79,11 @@ spec:
- name: slurm-conf
mountPath: /etc/slurm.conf
subPath: slurm.conf
- name: cgroup-conf
mountPath: /etc/cgroup.conf
subPath: cgroup.conf
- name: munge
image: ghcr.io/naps-product-sa/openshift-batch/munge:latest
image: image-registry.openshift-image-registry.svc:5000/slurm-system/munge:latest
command:
- /sbin/munged
- -F
Expand All @@ -98,6 +101,9 @@ spec:
- name: slurm-conf
configMap:
name: slurm-conf
- name: cgroup-conf
configMap:
name: cgroup-conf

- name: munge-key
emptyDir: {}
Expand Down

0 comments on commit c0b9580

Please sign in to comment.