ministryofjustice · Emterry · May 30, 2024 · May 30, 2024 · May 30, 2024 · May 30, 2024
@@ -1,3 +1,19 @@
 # Declares a kubernetes_manifest resource whose manifest is the decoded content
 # of src/kubernetes/nvidia-device-plugin.yml. The path is relative, so it is
 # resolved from the directory Terraform is run in.
 resource "kubernetes_manifest" "nvidia_device_plugin" {
   manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
 }
+
+# Declares a kubernetes_manifest resource from the decoded content of
+# src/kubernetes/nvidia-gpu-slicing.yml.
+# NOTE(review): presumably the ConfigMap holding the GPU time-slicing
+# configuration - confirm against the file.
+resource "kubernetes_manifest" "nvidia_gpu_slicing" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml"))
+}
+
+# Declares a kubernetes_manifest resource from the decoded content of
+# src/kubernetes/nvidia-cluster-role.yml.
+resource "kubernetes_manifest" "nvidia_cluster_role" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role.yml"))
+}
+
+# Declares a kubernetes_manifest resource from the decoded content of
+# src/kubernetes/nvidia-cluster-role-binding.yml.
+resource "kubernetes_manifest" "nvidia_cluster_role_binding" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role-binding.yml"))
+}
+
+# Declares a kubernetes_manifest resource from the decoded content of
+# src/kubernetes/nvidia-service-account.yml.
+resource "kubernetes_manifest" "nvidia_service_account" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-service-account.yml"))
+}
@@ -0,0 +1,13 @@
+---
+# Binds the nvidia-device-plugin-daemonset-role ClusterRole to the
+# nvidia-device-plugin-daemonset-service-account ServiceAccount in kube-system.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: nvidia-device-plugin-daemonset-role-binding
+# The role being granted.
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: nvidia-device-plugin-daemonset-role
+# The identities receiving the role.
+subjects:
+  - kind: ServiceAccount
+    namespace: kube-system
+    name: nvidia-device-plugin-daemonset-service-account
@@ -0,0 +1,9 @@
+---
+# Cluster-wide, read-only access to Node objects.
+# NOTE(review): presumably needed so config-manager can read the node label
+# named by NODE_LABEL in the DaemonSet - confirm.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: nvidia-device-plugin-daemonset-role
+rules:
+  - resources:
+      - nodes
+    # The empty string selects the core API group.
+    apiGroups:
+      - ""
+    verbs:
+      - get
+      - list
+      - watch
@@ -44,20 +44,105 @@ spec:
       # be rescheduled after a failure.
       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
       priorityClassName: "system-node-critical"
+      # All containers in the pod share one PID namespace. The sidecar below is
+      # configured (SEND_SIGNAL / PROCESS_TO_SIGNAL) to signal the
+      # nvidia-device-plugin process, which lives in a different container.
+      shareProcessNamespace: true
+      # Identity the pod's containers use when talking to the API server.
+      serviceAccountName: nvidia-device-plugin-daemonset-service-account
+      initContainers:
+        # Runs config-manager once (ONESHOT=true) before the main containers start.
+        # NOTE(review): presumably it selects a file from CONFIG_FILE_SRCDIR based
+        # on the NODE_LABEL value of node NODE_NAME and writes it to
+        # CONFIG_FILE_DST - confirm against the k8s-device-plugin documentation.
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          name: nvidia-device-plugin-init
+          command: ["config-manager"]
+          env:
+            - name: ONESHOT
+              value: "true"
+            # NOTE(review): empty KUBECONFIG presumably means in-cluster
+            # credentials of the pod's service account are used - confirm.
+            - name: KUBECONFIG
+              value: ""
+            # Name of the node this pod is scheduled on (downward API).
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: "spec.nodeName"
+            - name: NODE_LABEL
+              value: "nvidia.com/device-plugin.config"
+            # Directory where the available-configs ConfigMap volume is mounted.
+            - name: CONFIG_FILE_SRCDIR
+              value: "/available-configs"
+            # Same path the plugin container reads through CONFIG_FILE.
+            - name: CONFIG_FILE_DST
+              value: "/config/config.yaml"
+            - name: DEFAULT_CONFIG
+              value: ""
+            - name: FALLBACK_STRATEGIES
+              value: "named,single"
+            # Init containers finish before the plugin container starts, so there
+            # is no process to signal at this point.
+            - name: SEND_SIGNAL
+              value: "false"
+            - name: SIGNAL
+              value: ""
+            - name: PROCESS_TO_SIGNAL
+              value: ""
+          volumeMounts:
+            - name: available-configs
+              mountPath: /available-configs
+            - name: config
+              mountPath: /config
+          # Same restricted context as the sidecar and plugin containers: this
+          # container runs the same config-manager binary as the sidecar and
+          # needs no additional privileges.
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
       containers:
+        # Long-running config-manager (ONESHOT=false) that runs next to the plugin.
+        # NOTE(review): presumably it re-writes CONFIG_FILE_DST when the node label
+        # changes and then sends SIGNAL to PROCESS_TO_SIGNAL - confirm against the
+        # k8s-device-plugin documentation.
+        - name: nvidia-device-plugin-sidecar
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          command:
+            - config-manager
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+          env:
+            - name: ONESHOT
+              value: "false"
+            - name: KUBECONFIG
+              value: ""
+            # Name of the node this pod is scheduled on (downward API).
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: NODE_LABEL
+              value: nvidia.com/device-plugin.config
+            - name: CONFIG_FILE_SRCDIR
+              value: /available-configs
+            - name: CONFIG_FILE_DST
+              value: /config/config.yaml
+            - name: DEFAULT_CONFIG
+              value: ""
+            - name: FALLBACK_STRATEGIES
+              value: "named,single"
+            - name: SEND_SIGNAL
+              value: "true"
+            - name: SIGNAL
+              value: "1"  # SIGHUP
+            - name: PROCESS_TO_SIGNAL
+              value: nvidia-device-plugin
+          volumeMounts:
+            - mountPath: /available-configs
+              name: available-configs
+            - mountPath: /config
+              name: config
         - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
           name: nvidia-device-plugin-ctr
           env:
             - name: FAIL_ON_INIT_ERROR
               value: "false"
+            # Same path the config-manager init/sidecar containers are given as
+            # CONFIG_FILE_DST, on the shared "config" volume.
+            - name: CONFIG_FILE
+              value: /config/config.yaml
           securityContext:
             allowPrivilegeEscalation: false
             capabilities:
               drop: ["ALL"]
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            # ConfigMap-backed directory holding the candidate configurations.
+            - name: available-configs
+              mountPath: /available-configs
+            # emptyDir shared with the config-manager containers.
+            - name: config
+              mountPath: /config
       volumes:
         # Host directory exposed to the plugin container at the same path.
         # NOTE(review): presumably the kubelet's device-plugin socket directory.
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        # Exposes the keys of the nvidia-device-plugin-daemonset ConfigMap as files.
+        - name: available-configs
+          configMap:
+            name: nvidia-device-plugin-daemonset
+            # Read-only for everyone: 292 decimal == 0444 octal (r--r--r--).
+            # Written in decimal on purpose: a bare 444 is parsed as decimal 444,
+            # which is 0674 octal (rw-rwxr--), and octal literals are not handled
+            # uniformly across YAML/JSON parsers.
+            defaultMode: 292
+        # Scratch space shared by the config-manager containers (writers of
+        # /config/config.yaml) and the plugin container (reader via CONFIG_FILE).
+        - name: config
+          emptyDir: {}
@@ -0,0 +1,16 @@
+---
+# ConfigMap mounted by the device-plugin DaemonSet as its "available-configs"
+# volume; each key under `data` becomes one file in that directory.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+data:
+  # Single configuration, stored under the key "any". It requests time-slicing
+  # of the nvidia.com/gpu resource with 4 replicas and migStrategy "none".
+  # NOTE(review): presumably picked up through the "single" fallback strategy
+  # when a node carries no nvidia.com/device-plugin.config label - confirm.
+  any: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      timeSlicing:
+        resources:
+        - name: nvidia.com/gpu
+          replicas: 4
@@ -0,0 +1,6 @@
+---
+# Identity referenced by the device-plugin DaemonSet (serviceAccountName) and
+# by the nvidia-device-plugin-daemonset-role-binding ClusterRoleBinding.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: kube-system
+  name: nvidia-device-plugin-daemonset-service-account