diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index 2ca810cc..9e0ddb77 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -20,7 +20,11 @@ modelServers:
       default: "vllm/vllm-openai:v0.6.3.post1"
       cpu: "substratusai/vllm:v0.6.3.post1-cpu"
       google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
-      gh200: "drikster80/vllm-gh200-openai:v0.6.3.post1"
+      nvidia-gpu: "vllm/vllm-openai:v0.6.3.post1"
+      # TODO (samos123) switch to the official image when it is available.
+      # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
+      # Source: https://github.com/drikster80/vllm/tree/gh200-docker
+      gh200: "substratusai/vllm-gh200-openai:v0.6.3.post1"
   OLlama:
     images:
       default: "ollama/ollama:latest"
diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index 3e52c3c2..386092f2 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
@@ -161,6 +161,25 @@ catalog:
     # You can also use nvidia-gpu-a100-80gb:8
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
+  llama-3.1-70b-instruct-fp8-gh200:
+    enabled: true
+    features: [TextGeneration]
+    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=32768
+      - --max-num-batched-token=32768
+      - --max-num-seqs=1024
+      - --gpu-memory-utilization=0.9
+      - --enable-prefix-caching
+      - --enable-chunked-prefill=false
+      - --disable-log-requests
+      - --kv-cache-dtype=fp8
+      - --enforce-eager
+    resourceProfile: nvidia-gpu-gh200:1
+    targetRequests: 1024
   llama-3.1-70b-instruct-awq-int4-gh200:
     enabled: false
     features: [TextGeneration]
diff --git a/manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml b/manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml
index 5c595d7d..dd2ef872 100644
--- a/manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml
+++ b/manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml
@@ -1,3 +1,4 @@
+# Source: models/templates/models.yaml
 apiVersion: kubeai.org/v1
 kind: Model
 metadata:
@@ -13,5 +14,4 @@ spec:
     - --enable-prefix-caching
     - --disable-log-requests
   targetRequests: 50
-  minReplicas: 1
-  resourceProfile: nvidia-gpu-gh200:1
\ No newline at end of file
+  resourceProfile: nvidia-gpu-gh200:1
diff --git a/manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml b/manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml
new file mode 100644
index 00000000..6e690e3a
--- /dev/null
+++ b/manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml
@@ -0,0 +1,24 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: llama-3.1-70b-instruct-fp8-gh200
+spec:
+  features: [TextGeneration]
+  owner:
+  url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+  engine: VLLM
+  args:
+    - --max-model-len=32768
+    - --max-num-batched-token=32768
+    - --max-num-seqs=1024
+    - --gpu-memory-utilization=0.9
+    - --enable-prefix-caching
+    - --enable-chunked-prefill=false
+    - --disable-log-requests
+    - --kv-cache-dtype=fp8
+    - --enforce-eager
+  env:
+    VLLM_ATTENTION_BACKEND: FLASHINFER
+  targetRequests: 1024
+  resourceProfile: nvidia-gpu-gh200:1