Skip to content

Commit

Permalink
add time slicing manifest to gpu resource in development cluster (#4419)
Browse files Browse the repository at this point in the history
* add slicing manifest

* update gpu-slicing

* linter fix

---------

Co-authored-by: Emterry <emma.terry@digital.justice.gov.uk>
Co-authored-by: Emterry <123941245+Emterry@users.noreply.github.com>
  • Loading branch information
3 people authored May 30, 2024
1 parent 358ac78 commit 3635d2a
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Applies the manifest decoded from src/kubernetes/nvidia-device-plugin.yml.
# The path is anchored to path.module so it resolves relative to this module's
# directory rather than the process working directory.
resource "kubernetes_manifest" "nvidia_device_plugin" {
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-device-plugin.yml"))
}

# Applies the manifest decoded from src/kubernetes/nvidia-gpu-slicing.yml.
# The path is anchored to path.module so it resolves relative to this module's
# directory rather than the process working directory.
resource "kubernetes_manifest" "nvidia_gpu_slicing" {
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-gpu-slicing.yml"))
}

# Applies the manifest decoded from src/kubernetes/nvidia-cluster-role.yml.
# The path is anchored to path.module so it resolves relative to this module's
# directory rather than the process working directory.
resource "kubernetes_manifest" "nvidia_cluster_role" {
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-cluster-role.yml"))
}

# Applies the manifest decoded from src/kubernetes/nvidia-cluster-role-binding.yml.
# The path is anchored to path.module so it resolves relative to this module's
# directory rather than the process working directory.
resource "kubernetes_manifest" "nvidia_cluster_role_binding" {
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-cluster-role-binding.yml"))
}

# Applies the manifest decoded from src/kubernetes/nvidia-service-account.yml.
# The path is anchored to path.module so it resolves relative to this module's
# directory rather than the process working directory.
resource "kubernetes_manifest" "nvidia_service_account" {
  manifest = yamldecode(file("${path.module}/src/kubernetes/nvidia-service-account.yml"))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Grants the ClusterRole "nvidia-device-plugin-daemonset-role" to the
# ServiceAccount "nvidia-device-plugin-daemonset-service-account" in the
# kube-system namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-device-plugin-daemonset-role-binding
# Role being granted.
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-device-plugin-daemonset-role
# Identities receiving the role.
subjects:
  - kind: ServiceAccount
    name: nvidia-device-plugin-daemonset-service-account
    namespace: kube-system
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Read-only (get/list/watch) access to Node objects in the core API group.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-device-plugin-daemonset-role
rules:
  - apiGroups:
      - ""  # empty string selects the core API group
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,105 @@ spec:
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
# The sidecar below is configured to signal a process named
# "nvidia-device-plugin" (SEND_SIGNAL / PROCESS_TO_SIGNAL), which runs in a
# different container; presumably that is why the pod shares one process
# namespace.
shareProcessNamespace: true
serviceAccountName: nvidia-device-plugin-daemonset-service-account
initContainers:
  # One-shot config-manager run (ONESHOT="true", signalling disabled).
  # Presumably writes the selected config to CONFIG_FILE_DST before the main
  # containers start; confirm against the config-manager documentation.
  - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
    name: nvidia-device-plugin-init
    command: ["config-manager"]
    env:
      - name: ONESHOT
        value: "true"
      - name: KUBECONFIG
        value: ""
      # Downward API: name of the node this pod is scheduled on.
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: "spec.nodeName"
      - name: NODE_LABEL
        value: "nvidia.com/device-plugin.config"
      - name: CONFIG_FILE_SRCDIR
        value: "/available-configs"
      - name: CONFIG_FILE_DST
        value: "/config/config.yaml"
      - name: DEFAULT_CONFIG
        value: ""
      - name: FALLBACK_STRATEGIES
        value: "named,single"
      - name: SEND_SIGNAL
        value: "false"
      - name: SIGNAL
        value: ""
      - name: PROCESS_TO_SIGNAL
        value: ""
    volumeMounts:
      - name: available-configs
        mountPath: /available-configs
      - name: config
        mountPath: /config
containers:
  # Long-running config-manager (ONESHOT="false") with the same source and
  # destination settings as the init container, plus signalling enabled.
  - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
    name: nvidia-device-plugin-sidecar
    command: ["config-manager"]
    env:
      - name: ONESHOT
        value: "false"
      - name: KUBECONFIG
        value: ""
      # Downward API: name of the node this pod is scheduled on.
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: "spec.nodeName"
      - name: NODE_LABEL
        value: "nvidia.com/device-plugin.config"
      - name: CONFIG_FILE_SRCDIR
        value: "/available-configs"
      - name: CONFIG_FILE_DST
        value: "/config/config.yaml"
      - name: DEFAULT_CONFIG
        value: ""
      - name: FALLBACK_STRATEGIES
        value: "named,single"
      - name: SEND_SIGNAL
        value: "true"
      - name: SIGNAL
        value: "1"  # SIGHUP
      - name: PROCESS_TO_SIGNAL
        value: "nvidia-device-plugin"
    volumeMounts:
      - name: available-configs
        mountPath: /available-configs
      - name: config
        mountPath: /config
    securityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop: ["ALL"]
  # The device plugin itself; reads its configuration from CONFIG_FILE on the
  # shared "config" volume.
  - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
    name: nvidia-device-plugin-ctr
    env:
      - name: FAIL_ON_INIT_ERROR
        value: "false"
      - name: CONFIG_FILE
        value: "/config/config.yaml"
    securityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop: ["ALL"]
    volumeMounts:
      - name: device-plugin
        mountPath: /var/lib/kubelet/device-plugins
      - name: available-configs
        mountPath: /available-configs
      - name: config
        mountPath: /config
volumes:
  - name: device-plugin
    hostPath:
      path: /var/lib/kubelet/device-plugins
  - name: available-configs
    configMap:
      name: nvidia-device-plugin-daemonset
      # Mode bits are interpreted as DECIMAL unless written in octal notation.
      # The previous value 444 (decimal) equals octal 0674 (rw-rwxr--).
      # 292 decimal == 0444 octal (r--r--r--), and is unambiguous for every
      # YAML/JSON parser.
      defaultMode: 292
  # Scratch volume shared by all three containers at /config.
  - name: config
    emptyDir: {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# Device-plugin configuration consumed via the "available-configs" volume.
# It holds a single entry, "any", whose embedded config enables time slicing
# for the nvidia.com/gpu resource with 4 replicas.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-system
  name: nvidia-device-plugin-daemonset
data:
  any: |-
    version: v1
    flags:
      migStrategy: none
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# ServiceAccount in kube-system named
# "nvidia-device-plugin-daemonset-service-account".
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: kube-system
  name: nvidia-device-plugin-daemonset-service-account

0 comments on commit 3635d2a

Please sign in to comment.