From 23885da6ba37145365fec176001faa211a9cb855 Mon Sep 17 00:00:00 2001 From: BrianEllwood Date: Thu, 30 May 2024 07:49:18 +0000 Subject: [PATCH 1/3] add slicing manefest --- .../cluster/kubernetes-manifests.tf | 4 ++++ .../src/kubernetes/nvidia-device-plugin.yml | 2 ++ .../src/kubernetes/nvidia-gpu-slicing.yml | 16 ++++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml diff --git a/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf b/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf index a8c3c449d4..bcea36bd07 100644 --- a/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf +++ b/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf @@ -1,3 +1,7 @@ resource "kubernetes_manifest" "nvidia_device_plugin" { manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml")) } + +# resource "kubernetes_manifest" "nvidia_gpu_slicing" { +# manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml")) +# } diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml index d916338afd..a520d13303 100644 --- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml @@ -50,6 +50,8 @@ spec: env: - name: FAIL_ON_INIT_ERROR value: "false" + - name: CONFIG_FILE_DST + value: "src/kubernetes/nvidia-gpu-slicing.yml" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml new file mode 100644 index 0000000000..0814992a0c --- /dev/null +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +data: + any: |- + version: v1 + flags: + migStrategy: none + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 4 From 682b75c514c7bcec330320b70c2e6fd9d29df668 Mon Sep 17 00:00:00 2001 From: Emterry Date: Thu, 30 May 2024 13:00:27 +0000 Subject: [PATCH 2/3] update gpu-slicing --- .../cluster/kubernetes-manifests.tf | 18 +++- .../nvidia-cluster-role-binding.yml | 12 +++ .../src/kubernetes/nvidia-cluster-role.yml | 8 ++ .../src/kubernetes/nvidia-device-plugin.yml | 87 ++++++++++++++++++- .../src/kubernetes/nvidia-service-account.yml | 5 ++ 5 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml create mode 100644 terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml create mode 100644 terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml diff --git a/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf b/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf index bcea36bd07..9b2ad08da9 100644 --- a/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf +++ b/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf @@ -2,6 +2,18 @@ resource "kubernetes_manifest" "nvidia_device_plugin" { manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml")) } -# resource "kubernetes_manifest" "nvidia_gpu_slicing" { -# manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml")) -# } +resource "kubernetes_manifest" "nvidia_gpu_slicing" { + manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml")) +} + +resource "kubernetes_manifest" "nvidia_cluster_role" { + manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role.yml")) +} + +resource "kubernetes_manifest" "nvidia_cluster_role_binding" { + manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role-binding.yml")) +} + +resource "kubernetes_manifest" "nvidia_service_account" { + manifest = yamldecode(file("src/kubernetes/nvidia-service-account.yml")) +} diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml new file mode 100644 index 0000000000..568c7e8b3f --- /dev/null +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: nvidia-device-plugin-daemonset-role-binding +subjects: + - kind: ServiceAccount + name: nvidia-device-plugin-daemonset-service-account + namespace: kube-system +roleRef: + kind: ClusterRole + name: nvidia-device-plugin-daemonset-role + apiGroup: rbac.authorization.k8s.io diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml new file mode 100644 index 0000000000..0da53b18df --- /dev/null +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml @@ -0,0 +1,8 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-device-plugin-daemonset-role +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml index a520d13303..ae2ad7f3ad 100644 --- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml @@ -44,14 +44,87 @@ spec: # be rescheduled after a failure. # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" + shareProcessNamespace: true + serviceAccountName: nvidia-device-plugin-daemonset-service-account + initContainers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + name: nvidia-device-plugin-init + command: ["config-manager"] + env: + - name: ONESHOT + value: "true" + - name: KUBECONFIG + value: "" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: "spec.nodeName" + - name: NODE_LABEL + value: "nvidia.com/device-plugin.config" + - name: CONFIG_FILE_SRCDIR + value: "/available-configs" + - name: CONFIG_FILE_DST + value: "/config/config.yaml" + - name: DEFAULT_CONFIG + value: "" + - name: FALLBACK_STRATEGIES + value: named,single + - name: SEND_SIGNAL + value: "false" + - name: SIGNAL + value: "" + - name: PROCESS_TO_SIGNAL + value: "" + volumeMounts: + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + name: nvidia-device-plugin-sidecar + command: ["config-manager"] + env: + - name: ONESHOT + value: "false" + - name: KUBECONFIG + value: "" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: "spec.nodeName" + - name: NODE_LABEL + value: "nvidia.com/device-plugin.config" + - name: CONFIG_FILE_SRCDIR + value: "/available-configs" + - name: CONFIG_FILE_DST + value: "/config/config.yaml" + - name: DEFAULT_CONFIG + value: "" + - name: FALLBACK_STRATEGIES + value: named,single + - name: SEND_SIGNAL + value: "true" + - name: SIGNAL + value: "1" # SIGHUP + - name: PROCESS_TO_SIGNAL + value: "nvidia-device-plugin" + volumeMounts: + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR value: "false" - - name: CONFIG_FILE_DST - value: "src/kubernetes/nvidia-gpu-slicing.yml" + - name: CONFIG_FILE + value: /config/config.yaml securityContext: allowPrivilegeEscalation: false capabilities: @@ -59,7 +132,17 @@ spec: volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins + - name: available-configs + configMap: + name: nvidia-device-plugin-daemonset + defaultMode: 444 + - name: config + emptyDir: {} diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml new file mode 100644 index 0000000000..4c112f6bbb --- /dev/null +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-device-plugin-daemonset-service-account + namespace: kube-system From abb3be7804c22bab24b3719d6a9314c4afce921c Mon Sep 17 00:00:00 2001 From: Emterry Date: Thu, 30 May 2024 14:47:53 +0100 Subject: [PATCH 3/3] linter fix --- .../nvidia-cluster-role-binding.yml | 1 + .../src/kubernetes/nvidia-cluster-role.yml | 1 + .../src/kubernetes/nvidia-device-plugin.yml | 114 +++++++++--------- .../src/kubernetes/nvidia-service-account.yml | 1 + 4 files changed, 60 insertions(+), 57 deletions(-) diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml index 568c7e8b3f..50f4513568 100644 --- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml index 0da53b18df..17c895bf45 100644 --- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml @@ -1,3 +1,4 @@ +--- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml index ae2ad7f3ad..edd96fce7a 100644 --- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml @@ -47,68 +47,68 @@ spec: shareProcessNamespace: true serviceAccountName: nvidia-device-plugin-daemonset-service-account initContainers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 - name: nvidia-device-plugin-init - command: ["config-manager"] - env: - - name: ONESHOT - value: "true" - - name: KUBECONFIG - value: "" - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: "spec.nodeName" - - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" - - name: CONFIG_FILE_SRCDIR - value: "/available-configs" - - name: CONFIG_FILE_DST - value: "/config/config.yaml" - - name: DEFAULT_CONFIG - value: "" - - name: FALLBACK_STRATEGIES - value: named,single - - name: SEND_SIGNAL - value: "false" - - name: SIGNAL - value: "" - - name: PROCESS_TO_SIGNAL - value: "" - volumeMounts: - - name: available-configs - mountPath: /available-configs - - name: config - mountPath: /config + - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + name: nvidia-device-plugin-init + command: ["config-manager"] + env: + - name: ONESHOT + value: "true" + - name: KUBECONFIG + value: "" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: "spec.nodeName" + - name: NODE_LABEL + value: "nvidia.com/device-plugin.config" + - name: CONFIG_FILE_SRCDIR + value: "/available-configs" + - name: CONFIG_FILE_DST + value: "/config/config.yaml" + - name: DEFAULT_CONFIG + value: "" + - name: FALLBACK_STRATEGIES + value: named,single + - name: SEND_SIGNAL + value: "false" + - name: SIGNAL + value: "" + - name: PROCESS_TO_SIGNAL + value: "" + volumeMounts: + - name: available-configs + mountPath: /available-configs + - name: config + mountPath: /config containers: - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 name: nvidia-device-plugin-sidecar command: ["config-manager"] env: - - name: ONESHOT - value: "false" - - name: KUBECONFIG - value: "" - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: "spec.nodeName" - - name: NODE_LABEL - value: "nvidia.com/device-plugin.config" - - name: CONFIG_FILE_SRCDIR - value: "/available-configs" - - name: CONFIG_FILE_DST - value: "/config/config.yaml" - - name: DEFAULT_CONFIG - value: "" - - name: FALLBACK_STRATEGIES - value: named,single - - name: SEND_SIGNAL - value: "true" - - name: SIGNAL - value: "1" # SIGHUP - - name: PROCESS_TO_SIGNAL - value: "nvidia-device-plugin" + - name: ONESHOT + value: "false" + - name: KUBECONFIG + value: "" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: "spec.nodeName" + - name: NODE_LABEL + value: "nvidia.com/device-plugin.config" + - name: CONFIG_FILE_SRCDIR + value: "/available-configs" + - name: CONFIG_FILE_DST + value: "/config/config.yaml" + - name: DEFAULT_CONFIG + value: "" + - name: FALLBACK_STRATEGIES + value: named,single + - name: SEND_SIGNAL + value: "true" + - name: SIGNAL + value: "1" # SIGHUP + - name: PROCESS_TO_SIGNAL + value: "nvidia-device-plugin" volumeMounts: - name: available-configs mountPath: /available-configs diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml index 4c112f6bbb..8ad61ac98c 100644 --- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml +++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml @@ -1,3 +1,4 @@ +--- apiVersion: v1 kind: ServiceAccount metadata: