From 042bc8989e2b8d48b96b9499f6335d74df400bad Mon Sep 17 00:00:00 2001 From: Anthony Fitzroy Date: Fri, 31 May 2024 15:57:07 +0000 Subject: [PATCH] nvidia-device-plugin --- .../cluster/kubernetes-manifests.tf | 6 +- .../src/kubernetes/nvidia-device-plugin.yml | 272 +++++++++--------- 2 files changed, 139 insertions(+), 139 deletions(-) diff --git a/terraform/aws/analytical-platform-production/cluster/kubernetes-manifests.tf b/terraform/aws/analytical-platform-production/cluster/kubernetes-manifests.tf index 33272e54ce..9b2ad08da9 100644 --- a/terraform/aws/analytical-platform-production/cluster/kubernetes-manifests.tf +++ b/terraform/aws/analytical-platform-production/cluster/kubernetes-manifests.tf @@ -1,6 +1,6 @@ -# resource "kubernetes_manifest" "nvidia_device_plugin" { -# manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml")) -# } +resource "kubernetes_manifest" "nvidia_device_plugin" { + manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml")) +} resource "kubernetes_manifest" "nvidia_gpu_slicing" { manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml")) diff --git a/terraform/aws/analytical-platform-production/cluster/src/kubernetes/nvidia-device-plugin.yml b/terraform/aws/analytical-platform-production/cluster/src/kubernetes/nvidia-device-plugin.yml index 4cd046d303..dec09181fa 100644 --- a/terraform/aws/analytical-platform-production/cluster/src/kubernetes/nvidia-device-plugin.yml +++ b/terraform/aws/analytical-platform-production/cluster/src/kubernetes/nvidia-device-plugin.yml @@ -1,136 +1,136 @@ -# # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# --- -# apiVersion: apps/v1 -# kind: DaemonSet -# metadata: -# name: nvidia-device-plugin-daemonset -# namespace: kube-system -# spec: -# selector: -# matchLabels: -# name: nvidia-device-plugin-ds -# updateStrategy: -# type: RollingUpdate -# template: -# metadata: -# labels: -# name: nvidia-device-plugin-ds -# spec: -# # @ministryofjustice/analytical-platform: We added this nodeSelector -# nodeSelector: -# gpu-compute: "true" -# tolerations: -# - key: nvidia.com/gpu -# operator: Exists -# effect: NoSchedule -# # @ministryofjustice/analytical-platform: We added this toleration -# - key: gpu-compute -# operator: Exists -# effect: NoSchedule -# # Mark this pod as a critical add-on; when enabled, the critical add-on -# # scheduler reserves resources for critical add-on pods so that they can -# # be rescheduled after a failure. -# # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ -# priorityClassName: "system-node-critical" -# shareProcessNamespace: true -# serviceAccountName: nvidia-device-plugin-daemonset-service-account -# initContainers: -# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 -# name: nvidia-device-plugin-init -# command: ["config-manager"] -# env: -# - name: ONESHOT -# value: "true" -# - name: NODE_NAME -# valueFrom: -# fieldRef: -# fieldPath: "spec.nodeName" -# - name: NODE_LABEL -# value: "nvidia.com/device-plugin.config" -# - name: CONFIG_FILE_SRCDIR -# value: "/available-configs" -# - name: CONFIG_FILE_DST -# value: "/config/config.yaml" -# - name: FALLBACK_STRATEGIES -# value: named,single -# - name: SEND_SIGNAL -# value: "false" -# volumeMounts: -# - name: available-configs -# mountPath: /available-configs -# - name: config -# mountPath: /config -# containers: -# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 -# name: nvidia-device-plugin-sidecar -# command: ["config-manager"] -# env: -# - name: ONESHOT -# value: "false" -# - name: NODE_NAME -# valueFrom: -# fieldRef: -# fieldPath: "spec.nodeName" -# - name: NODE_LABEL -# value: "nvidia.com/device-plugin.config" -# - name: CONFIG_FILE_SRCDIR -# value: "/available-configs" -# - name: CONFIG_FILE_DST -# value: "/config/config.yaml" -# - name: FALLBACK_STRATEGIES -# value: named,single -# - name: SEND_SIGNAL -# value: "true" -# - name: SIGNAL -# value: "1" # SIGHUP -# - name: PROCESS_TO_SIGNAL -# value: "nvidia-device-plugin" -# volumeMounts: -# - name: available-configs -# mountPath: /available-configs -# - name: config -# mountPath: /config -# securityContext: -# allowPrivilegeEscalation: false -# capabilities: -# drop: ["ALL"] -# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 -# name: nvidia-device-plugin-ctr -# env: -# - name: FAIL_ON_INIT_ERROR -# value: "false" -# - name: CONFIG_FILE -# value: /config/config.yaml -# securityContext: -# allowPrivilegeEscalation: false -# capabilities: -# drop: ["ALL"] -# volumeMounts: -# - name: device-plugin -# mountPath: /var/lib/kubelet/device-plugins -# - name: available-configs -# mountPath: /available-configs -# - name: config -# mountPath: /config -# volumes: -# - name: device-plugin -# hostPath: -# path: /var/lib/kubelet/device-plugins -# - name: available-configs -# configMap: -# name: nvidia-device-plugin-daemonset -# defaultMode: 444 -# - name: config -# emptyDir: {} +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-device-plugin-ds + spec: + # @ministryofjustice/analytical-platform: We added this nodeSelector + nodeSelector: + gpu-compute: "true" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + # @ministryofjustice/analytical-platform: We added this toleration + - key: gpu-compute + operator: Exists + effect: NoSchedule + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + shareProcessNamespace: true + serviceAccountName: nvidia-device-plugin-daemonset-service-account + initContainers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + name: nvidia-device-plugin-init + command: ["config-manager"] + env: + - name: ONESHOT + value: "true" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: "spec.nodeName" + - name: NODE_LABEL + value: "nvidia.com/device-plugin.config" + - name: CONFIG_FILE_SRCDIR + value: "/available-configs" + - name: CONFIG_FILE_DST + value: "/config/config.yaml" + - name: FALLBACK_STRATEGIES + value: named,single + - name: SEND_SIGNAL + value: "false" + volumeMounts: + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + name: nvidia-device-plugin-sidecar + command: ["config-manager"] + env: + - name: ONESHOT + value: "false" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: "spec.nodeName" + - name: NODE_LABEL + value: "nvidia.com/device-plugin.config" + - name: CONFIG_FILE_SRCDIR + value: "/available-configs" + - name: CONFIG_FILE_DST + value: "/config/config.yaml" + - name: FALLBACK_STRATEGIES + value: named,single + - name: SEND_SIGNAL + value: "true" + - name: SIGNAL + value: "1" # SIGHUP + - name: PROCESS_TO_SIGNAL + value: "nvidia-device-plugin" + volumeMounts: + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + name: nvidia-device-plugin-ctr + env: + - name: FAIL_ON_INIT_ERROR + value: "false" + - name: CONFIG_FILE + value: /config/config.yaml + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: available-configs + configMap: + name: nvidia-device-plugin-daemonset + defaultMode: 444 + - name: config + emptyDir: {}