Re-adding nvidia-device-plugin to production cluster #4436

Merged (1 commit) on May 31, 2024
@@ -1,6 +1,6 @@
# resource "kubernetes_manifest" "nvidia_device_plugin" {
#   manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
# }
resource "kubernetes_manifest" "nvidia_device_plugin" {
  manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
}

resource "kubernetes_manifest" "nvidia_gpu_slicing" {
  manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml"))
src/kubernetes/nvidia-device-plugin.yml
@@ -1,136 +1,136 @@
# # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
# ---
# apiVersion: apps/v1
# kind: DaemonSet
# metadata:
# name: nvidia-device-plugin-daemonset
# namespace: kube-system
# spec:
# selector:
# matchLabels:
# name: nvidia-device-plugin-ds
# updateStrategy:
# type: RollingUpdate
# template:
# metadata:
# labels:
# name: nvidia-device-plugin-ds
# spec:
# # @ministryofjustice/analytical-platform: We added this nodeSelector
# nodeSelector:
# gpu-compute: "true"
# tolerations:
# - key: nvidia.com/gpu
# operator: Exists
# effect: NoSchedule
# # @ministryofjustice/analytical-platform: We added this toleration
# - key: gpu-compute
# operator: Exists
# effect: NoSchedule
# # Mark this pod as a critical add-on; when enabled, the critical add-on
# # scheduler reserves resources for critical add-on pods so that they can
# # be rescheduled after a failure.
# # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
# priorityClassName: "system-node-critical"
# shareProcessNamespace: true
# serviceAccountName: nvidia-device-plugin-daemonset-service-account
# initContainers:
# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
# name: nvidia-device-plugin-init
# command: ["config-manager"]
# env:
# - name: ONESHOT
# value: "true"
# - name: NODE_NAME
# valueFrom:
# fieldRef:
# fieldPath: "spec.nodeName"
# - name: NODE_LABEL
# value: "nvidia.com/device-plugin.config"
# - name: CONFIG_FILE_SRCDIR
# value: "/available-configs"
# - name: CONFIG_FILE_DST
# value: "/config/config.yaml"
# - name: FALLBACK_STRATEGIES
# value: named,single
# - name: SEND_SIGNAL
# value: "false"
# volumeMounts:
# - name: available-configs
# mountPath: /available-configs
# - name: config
# mountPath: /config
# containers:
# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
# name: nvidia-device-plugin-sidecar
# command: ["config-manager"]
# env:
# - name: ONESHOT
# value: "false"
# - name: NODE_NAME
# valueFrom:
# fieldRef:
# fieldPath: "spec.nodeName"
# - name: NODE_LABEL
# value: "nvidia.com/device-plugin.config"
# - name: CONFIG_FILE_SRCDIR
# value: "/available-configs"
# - name: CONFIG_FILE_DST
# value: "/config/config.yaml"
# - name: FALLBACK_STRATEGIES
# value: named,single
# - name: SEND_SIGNAL
# value: "true"
# - name: SIGNAL
# value: "1" # SIGHUP
# - name: PROCESS_TO_SIGNAL
# value: "nvidia-device-plugin"
# volumeMounts:
# - name: available-configs
# mountPath: /available-configs
# - name: config
# mountPath: /config
# securityContext:
# allowPrivilegeEscalation: false
# capabilities:
# drop: ["ALL"]
# - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
# name: nvidia-device-plugin-ctr
# env:
# - name: FAIL_ON_INIT_ERROR
# value: "false"
# - name: CONFIG_FILE
# value: /config/config.yaml
# securityContext:
# allowPrivilegeEscalation: false
# capabilities:
# drop: ["ALL"]
# volumeMounts:
# - name: device-plugin
# mountPath: /var/lib/kubelet/device-plugins
# - name: available-configs
# mountPath: /available-configs
# - name: config
# mountPath: /config
# volumes:
# - name: device-plugin
# hostPath:
# path: /var/lib/kubelet/device-plugins
# - name: available-configs
# configMap:
# name: nvidia-device-plugin-daemonset
# defaultMode: 444
# - name: config
# emptyDir: {}
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      # @ministryofjustice/analytical-platform: We added this nodeSelector
      nodeSelector:
        gpu-compute: "true"
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
        # @ministryofjustice/analytical-platform: We added this toleration
        - key: gpu-compute
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      shareProcessNamespace: true
      serviceAccountName: nvidia-device-plugin-daemonset-service-account
      initContainers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          name: nvidia-device-plugin-init
          command: ["config-manager"]
          env:
            - name: ONESHOT
              value: "true"
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: "spec.nodeName"
            - name: NODE_LABEL
              value: "nvidia.com/device-plugin.config"
            - name: CONFIG_FILE_SRCDIR
              value: "/available-configs"
            - name: CONFIG_FILE_DST
              value: "/config/config.yaml"
            - name: FALLBACK_STRATEGIES
              value: named,single
            - name: SEND_SIGNAL
              value: "false"
          volumeMounts:
            - name: available-configs
              mountPath: /available-configs
            - name: config
              mountPath: /config
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          name: nvidia-device-plugin-sidecar
          command: ["config-manager"]
          env:
            - name: ONESHOT
              value: "false"
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: "spec.nodeName"
            - name: NODE_LABEL
              value: "nvidia.com/device-plugin.config"
            - name: CONFIG_FILE_SRCDIR
              value: "/available-configs"
            - name: CONFIG_FILE_DST
              value: "/config/config.yaml"
            - name: FALLBACK_STRATEGIES
              value: named,single
            - name: SEND_SIGNAL
              value: "true"
            - name: SIGNAL
              value: "1" # SIGHUP
            - name: PROCESS_TO_SIGNAL
              value: "nvidia-device-plugin"
          volumeMounts:
            - name: available-configs
              mountPath: /available-configs
            - name: config
              mountPath: /config
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          name: nvidia-device-plugin-ctr
          env:
            - name: FAIL_ON_INIT_ERROR
              value: "false"
            - name: CONFIG_FILE
              value: /config/config.yaml
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: available-configs
              mountPath: /available-configs
            - name: config
              mountPath: /config
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: available-configs
          configMap:
            name: nvidia-device-plugin-daemonset
            defaultMode: 444
        - name: config
          emptyDir: {}
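Once this DaemonSet is running again, each node labelled gpu-compute: "true" should advertise nvidia.com/gpu in its allocatable resources (kubectl describe node will show it, multiplied by the replica count if time-slicing is configured). A minimal smoke-test pod under those assumptions is sketched below; the pod name and CUDA image tag are illustrative and not part of this PR:

```yaml
# Hypothetical smoke test; names and image tag are illustrative, not taken from this PR.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  nodeSelector:
    gpu-compute: "true"                 # matches the nodeSelector used by the DaemonSet
  tolerations:                          # mirror the DaemonSet tolerations in case the GPU nodes carry either taint
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: gpu-compute
      operator: Exists
      effect: NoSchedule
  containers:
    - name: cuda
      image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04   # illustrative image/tag
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1             # schedulable once the plugin has registered GPUs on the node
```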