Skip to content

Commit

Permalink
update gpu-slicing
Browse files Browse the repository at this point in the history
  • Loading branch information
Emterry committed May 30, 2024
1 parent 23885da commit 682b75c
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@ resource "kubernetes_manifest" "nvidia_device_plugin" {
manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
}

# Decodes src/kubernetes/nvidia-gpu-slicing.yml and applies the result as a
# single Kubernetes manifest.
# NOTE(review): the path is relative, so it presumably resolves against the
# directory Terraform is run from - confirm before moving this file.
resource "kubernetes_manifest" "nvidia_gpu_slicing" {
  manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml"))
}

# ClusterRole manifest, read from src/kubernetes/nvidia-cluster-role.yml and
# decoded from YAML before being handed to the provider.
resource "kubernetes_manifest" "nvidia_cluster_role" {
  manifest = yamldecode(
    file("src/kubernetes/nvidia-cluster-role.yml")
  )
}

# ClusterRoleBinding manifest, read from
# src/kubernetes/nvidia-cluster-role-binding.yml and decoded from YAML.
resource "kubernetes_manifest" "nvidia_cluster_role_binding" {
  manifest = yamldecode(
    file("src/kubernetes/nvidia-cluster-role-binding.yml")
  )
}

# ServiceAccount manifest, read from
# src/kubernetes/nvidia-service-account.yml and decoded from YAML.
resource "kubernetes_manifest" "nvidia_service_account" {
  manifest = yamldecode(
    file("src/kubernetes/nvidia-service-account.yml")
  )
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the ClusterRole "nvidia-device-plugin-daemonset-role" to the
# "nvidia-device-plugin-daemonset-service-account" ServiceAccount in the
# kube-system namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-device-plugin-daemonset-role-binding
# The role being granted.
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-device-plugin-daemonset-role
# Who receives the role.
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: nvidia-device-plugin-daemonset-service-account
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Cluster-wide, read-only access (get/list/watch) to Node objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-device-plugin-daemonset-role
rules:
  - apiGroups:
      - ""  # the core API group
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,105 @@ spec:
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: system-node-critical
# ServiceAccount the pod's containers run under.
serviceAccountName: nvidia-device-plugin-daemonset-service-account
# All containers in the pod share one PID namespace. The sidecar is configured
# with PROCESS_TO_SIGNAL=nvidia-device-plugin, a process that lives in a
# different container, so it presumably relies on this setting.
shareProcessNamespace: true
# Init step: runs config-manager with ONESHOT=true and signalling disabled
# (SEND_SIGNAL=false). It mounts the available-configs volume at
# CONFIG_FILE_SRCDIR and the shared config volume that holds CONFIG_FILE_DST.
# NOTE(review): presumably it picks the config named by the node label in
# NODE_LABEL and writes it to CONFIG_FILE_DST - confirm against the
# config-manager documentation.
initContainers:
- name: nvidia-device-plugin-init
  image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
  command:
  - config-manager
  volumeMounts:
  - mountPath: /available-configs
    name: available-configs
  - mountPath: /config
    name: config
  env:
  - name: ONESHOT
    value: 'true'
  - name: KUBECONFIG
    value: ''
  # Name of the node this pod is scheduled on (downward API).
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
  - name: NODE_LABEL
    value: nvidia.com/device-plugin.config
  - name: CONFIG_FILE_SRCDIR
    value: /available-configs
  - name: CONFIG_FILE_DST
    value: /config/config.yaml
  - name: DEFAULT_CONFIG
    value: ''
  - name: FALLBACK_STRATEGIES
    value: 'named,single'
  - name: SEND_SIGNAL
    value: 'false'
  - name: SIGNAL
    value: ''
  - name: PROCESS_TO_SIGNAL
    value: ''
containers:
# Sidecar: runs config-manager with ONESHOT=false and signalling enabled
# (SEND_SIGNAL=true, SIGNAL=1, PROCESS_TO_SIGNAL=nvidia-device-plugin).
# NOTE(review): presumably it keeps watching the node label in NODE_LABEL,
# rewrites CONFIG_FILE_DST on change and then signals the plugin process -
# confirm against the config-manager documentation.
- name: nvidia-device-plugin-sidecar
  image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
  command:
  - config-manager
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop:
      - ALL
  volumeMounts:
  - mountPath: /available-configs
    name: available-configs
  - mountPath: /config
    name: config
  env:
  - name: ONESHOT
    value: 'false'
  - name: KUBECONFIG
    value: ''
  # Name of the node this pod is scheduled on (downward API).
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
  - name: NODE_LABEL
    value: nvidia.com/device-plugin.config
  - name: CONFIG_FILE_SRCDIR
    value: /available-configs
  - name: CONFIG_FILE_DST
    value: /config/config.yaml
  - name: DEFAULT_CONFIG
    value: ''
  - name: FALLBACK_STRATEGIES
    value: 'named,single'
  - name: SEND_SIGNAL
    value: 'true'
  - name: SIGNAL
    value: '1'  # SIGHUP
  - name: PROCESS_TO_SIGNAL
    value: nvidia-device-plugin
# Main device-plugin container. It is pointed at its configuration through
# CONFIG_FILE, which lives on the shared "config" volume mounted at /config.
- image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
  name: nvidia-device-plugin-ctr
  env:
  - name: FAIL_ON_INIT_ERROR
    value: "false"
  # CONFIG_FILE_DST is deliberately not set here. It is a config-manager
  # setting (the init and sidecar containers set it to /config/config.yaml),
  # and the previous value was a repository path that does not exist inside
  # the container.
  - name: CONFIG_FILE
    value: /config/config.yaml
  securityContext:
    allowPrivilegeEscalation: false
    capabilities:
      drop: ["ALL"]
  volumeMounts:
  # Host directory shared with the kubelet's device-plugin directory.
  - name: device-plugin
    mountPath: /var/lib/kubelet/device-plugins
  - name: available-configs
    mountPath: /available-configs
  - name: config
    mountPath: /config
# Volumes shared by the containers of this pod.
volumes:
# Host directory mounted into the plugin container at the same path.
- name: device-plugin
  hostPath:
    path: /var/lib/kubelet/device-plugins
# Config files projected from the "nvidia-device-plugin-daemonset" ConfigMap.
- name: available-configs
  configMap:
    name: nvidia-device-plugin-daemonset
    # 292 decimal == 0444 octal (read-only for owner, group and others).
    # A bare 444 is read as a decimal number, i.e. octal 0674, which is not
    # the read-only mode the digits suggest.
    defaultMode: 292
# Scratch space mounted at /config by the init, sidecar and plugin containers.
- name: config
  emptyDir: {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# ServiceAccount "nvidia-device-plugin-daemonset-service-account" in the
# kube-system namespace.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: nvidia-device-plugin-daemonset-service-account

0 comments on commit 682b75c

Please sign in to comment.