diff --git a/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf b/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf
index a8c3c449d4..9b2ad08da9 100644
--- a/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf
+++ b/terraform/aws/analytical-platform-development/cluster/kubernetes-manifests.tf
@@ -1,3 +1,19 @@
 resource "kubernetes_manifest" "nvidia_device_plugin" {
   manifest = yamldecode(file("src/kubernetes/nvidia-device-plugin.yml"))
 }
+
+resource "kubernetes_manifest" "nvidia_gpu_slicing" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-gpu-slicing.yml"))
+}
+
+resource "kubernetes_manifest" "nvidia_cluster_role" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role.yml"))
+}
+
+resource "kubernetes_manifest" "nvidia_cluster_role_binding" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-cluster-role-binding.yml"))
+}
+
+resource "kubernetes_manifest" "nvidia_service_account" {
+  manifest = yamldecode(file("src/kubernetes/nvidia-service-account.yml"))
+}
diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml
new file mode 100644
index 0000000000..50f4513568
--- /dev/null
+++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role-binding.yml
@@ -0,0 +1,13 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: nvidia-device-plugin-daemonset-role-binding
+subjects:
+  - kind: ServiceAccount
+    name: nvidia-device-plugin-daemonset-service-account
+    namespace: kube-system
+roleRef:
+  kind: ClusterRole
+  name: nvidia-device-plugin-daemonset-role
+  apiGroup: rbac.authorization.k8s.io
diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml
new file mode 100644
index 0000000000..17c895bf45
--- /dev/null
+++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-cluster-role.yml
@@ -0,0 +1,9 @@
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: nvidia-device-plugin-daemonset-role
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml
index d916338afd..edd96fce7a 100644
--- a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml
+++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-device-plugin.yml
@@ -44,12 +44,87 @@ spec:
       # be rescheduled after a failure.
       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
       priorityClassName: "system-node-critical"
+      shareProcessNamespace: true  # presumably so the sidecar can signal the nvidia-device-plugin process (see PROCESS_TO_SIGNAL)
+      serviceAccountName: nvidia-device-plugin-daemonset-service-account
+      initContainers:
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          name: nvidia-device-plugin-init
+          command: ["config-manager"]
+          env:
+            - name: ONESHOT
+              value: "true"
+            - name: KUBECONFIG
+              value: ""
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: "spec.nodeName"
+            - name: NODE_LABEL
+              value: "nvidia.com/device-plugin.config"
+            - name: CONFIG_FILE_SRCDIR
+              value: "/available-configs"
+            - name: CONFIG_FILE_DST
+              value: "/config/config.yaml"
+            - name: DEFAULT_CONFIG
+              value: ""
+            - name: FALLBACK_STRATEGIES
+              value: named,single
+            - name: SEND_SIGNAL
+              value: "false"
+            - name: SIGNAL
+              value: ""
+            - name: PROCESS_TO_SIGNAL
+              value: ""
+          volumeMounts:
+            - name: available-configs
+              mountPath: /available-configs
+            - name: config
+              mountPath: /config
       containers:
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          name: nvidia-device-plugin-sidecar
+          command: ["config-manager"]
+          env:
+            - name: ONESHOT
+              value: "false"
+            - name: KUBECONFIG
+              value: ""
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: "spec.nodeName"
+            - name: NODE_LABEL
+              value: "nvidia.com/device-plugin.config"
+            - name: CONFIG_FILE_SRCDIR
+              value: "/available-configs"
+            - name: CONFIG_FILE_DST
+              value: "/config/config.yaml"
+            - name: DEFAULT_CONFIG
+              value: ""
+            - name: FALLBACK_STRATEGIES
+              value: named,single
+            - name: SEND_SIGNAL
+              value: "true"
+            - name: SIGNAL
+              value: "1"  # SIGHUP
+            - name: PROCESS_TO_SIGNAL
+              value: "nvidia-device-plugin"
+          volumeMounts:
+            - name: available-configs
+              mountPath: /available-configs
+            - name: config
+              mountPath: /config
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
         - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
           name: nvidia-device-plugin-ctr
           env:
             - name: FAIL_ON_INIT_ERROR
               value: "false"
+            - name: CONFIG_FILE
+              value: /config/config.yaml  # same path the config-manager containers write (CONFIG_FILE_DST)
           securityContext:
             allowPrivilegeEscalation: false
             capabilities:
@@ -57,7 +132,17 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: available-configs
+              mountPath: /available-configs
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: available-configs
+          configMap:
+            name: nvidia-device-plugin-daemonset
+            defaultMode: 292  # decimal for octal 0444 (r--r--r--); a bare 444 is read as decimal, i.e. octal 0674
+        - name: config
+          emptyDir: {}
diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml
new file mode 100644
index 0000000000..0814992a0c
--- /dev/null
+++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-gpu-slicing.yml
@@ -0,0 +1,16 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+data:
+  any: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      timeSlicing:
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 4
diff --git a/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml
new file mode 100644
index 0000000000..8ad61ac98c
--- /dev/null
+++ b/terraform/aws/analytical-platform-development/cluster/src/kubernetes/nvidia-service-account.yml
@@ -0,0 +1,6 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: nvidia-device-plugin-daemonset-service-account
+  namespace: kube-system