Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add time slicing manifest to gpu resource in development cluster #4419

Merged
merged 4 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Applies the Kubernetes object decoded from
# src/kubernetes/nvidia-device-plugin.yml.
# NOTE(review): the path has no path.module prefix, so it presumably resolves
# against the directory Terraform is run from — confirm.
resource "kubernetes_manifest" "nvidia_device_plugin" {
  manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
}

# Applies the Kubernetes object decoded from
# src/kubernetes/nvidia-gpu-slicing.yml.
# NOTE(review): presumably the time-slicing ConfigMap that the device plugin
# mounts as its "available-configs" volume — confirm against the YAML file.
resource "kubernetes_manifest" "nvidia_gpu_slicing" {
  manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml"))
}

# Applies the Kubernetes object decoded from
# src/kubernetes/nvidia-cluster-role.yml.
resource "kubernetes_manifest" "nvidia_cluster_role" {
  manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role.yml"))
}

# Applies the Kubernetes object decoded from
# src/kubernetes/nvidia-cluster-role-binding.yml.
resource "kubernetes_manifest" "nvidia_cluster_role_binding" {
  manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role-binding.yml"))
}

# Applies the Kubernetes object decoded from
# src/kubernetes/nvidia-service-account.yml.
resource "kubernetes_manifest" "nvidia_service_account" {
  manifest = yamldecode(file("src/kubernetes/nvidia-service-account.yml"))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Binds the nvidia-device-plugin-daemonset-role ClusterRole to the
# nvidia-device-plugin-daemonset-service-account ServiceAccount in kube-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-device-plugin-daemonset-role-binding
subjects:
  - kind: ServiceAccount
    name: nvidia-device-plugin-daemonset-service-account
    namespace: kube-system
roleRef:
  kind: ClusterRole
  name: nvidia-device-plugin-daemonset-role
  apiGroup: rbac.authorization.k8s.io
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Cluster-wide read-only access (get/list/watch) to Node objects.
# NOTE(review): presumably needed so config-manager can read the node's
# nvidia.com/device-plugin.config label — confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-device-plugin-daemonset-role
rules:
  # The empty string selects the core API group.
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch"]
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,105 @@ spec:
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
# Containers in the pod share one process namespace; the sidecar container is
# configured with PROCESS_TO_SIGNAL "nvidia-device-plugin", a process that
# runs in a different container of this pod.
shareProcessNamespace: true
# ServiceAccount the pod runs as.
serviceAccountName: nvidia-device-plugin-daemonset-service-account
initContainers:
  # One-shot config-manager run (ONESHOT "true", SEND_SIGNAL "false").
  # NOTE(review): presumably picks a config from /available-configs based on
  # the node's NODE_LABEL value and writes it to CONFIG_FILE_DST before the
  # main containers start — confirm against the config-manager docs.
  - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
    name: nvidia-device-plugin-init
    command: ["config-manager"]
    env:
      - name: ONESHOT
        value: "true"
      - name: KUBECONFIG
        value: ""
      # Node name injected from the pod's own spec via the downward API.
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: "spec.nodeName"
      - name: NODE_LABEL
        value: "nvidia.com/device-plugin.config"
      - name: CONFIG_FILE_SRCDIR
        value: "/available-configs"
      - name: CONFIG_FILE_DST
        value: "/config/config.yaml"
      - name: DEFAULT_CONFIG
        value: ""
      - name: FALLBACK_STRATEGIES
        value: named,single
      - name: SEND_SIGNAL
        value: "false"
      - name: SIGNAL
        value: ""
      - name: PROCESS_TO_SIGNAL
        value: ""
    volumeMounts:
      - name: available-configs
        mountPath: /available-configs
      - name: config
        mountPath: /config
containers:
  # Long-running config-manager (ONESHOT "false") with SEND_SIGNAL enabled,
  # targeting the "nvidia-device-plugin" process with signal 1.
  # NOTE(review): presumably rewrites /config/config.yaml when the node label
  # changes and signals the plugin to reload; signalling a process in another
  # container relies on shareProcessNamespace — confirm.
  - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
    name: nvidia-device-plugin-sidecar
    command: ["config-manager"]
    env:
      - name: ONESHOT
        value: "false"
      - name: KUBECONFIG
        value: ""
      # Node name injected from the pod's own spec via the downward API.
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: "spec.nodeName"
      - name: NODE_LABEL
        value: "nvidia.com/device-plugin.config"
      - name: CONFIG_FILE_SRCDIR
        value: "/available-configs"
      - name: CONFIG_FILE_DST
        value: "/config/config.yaml"
      - name: DEFAULT_CONFIG
        value: ""
      - name: FALLBACK_STRATEGIES
        value: named,single
      - name: SEND_SIGNAL
        value: "true"
      - name: SIGNAL
        value: "1"  # SIGHUP
      - name: PROCESS_TO_SIGNAL
        value: "nvidia-device-plugin"
    volumeMounts:
      - name: available-configs
        mountPath: /available-configs
      - name: config
        mountPath: /config
    securityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop: ["ALL"]
  # The device plugin container; CONFIG_FILE points at the file in the shared
  # "config" volume that the config-manager containers use as CONFIG_FILE_DST.
  - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
    name: nvidia-device-plugin-ctr
    env:
      - name: FAIL_ON_INIT_ERROR
        value: "false"
      - name: CONFIG_FILE
        value: /config/config.yaml
    securityContext:
      allowPrivilegeEscalation: false
      capabilities:
        drop: ["ALL"]
    volumeMounts:
      - name: device-plugin
        mountPath: /var/lib/kubelet/device-plugins
      - name: available-configs
        mountPath: /available-configs
      - name: config
        mountPath: /config
volumes:
  # Host directory mounted into the plugin container at the same path.
  - name: device-plugin
    hostPath:
      path: /var/lib/kubelet/device-plugins
  # Config files come from the nvidia-device-plugin-daemonset ConfigMap,
  # one file per ConfigMap key.
  - name: available-configs
    configMap:
      name: nvidia-device-plugin-daemonset
      # 292 is decimal for octal 0444 (read-only for everyone). A bare 444
      # is read as decimal 444, i.e. octal 0674, which is not the intent.
      defaultMode: 292
  # Scratch volume shared by the init container, sidecar and plugin; it holds
  # the file at /config/config.yaml.
  - name: config
    emptyDir: {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# Device-plugin configuration mounted by the DaemonSet as its
# "available-configs" volume. The single key "any" holds one plugin config
# that enables time-slicing with 4 replicas of the nvidia.com/gpu resource.
# NOTE(review): with only one key present, this config is presumably selected
# through the "single" entry in FALLBACK_STRATEGIES — confirm.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
data:
  any: |-
    version: v1
    flags:
      migStrategy: none
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# ServiceAccount referenced by the device-plugin pod spec
# (serviceAccountName) and by the ClusterRoleBinding subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nvidia-device-plugin-daemonset-service-account
  namespace: kube-system
Loading