diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index 2ca810cc..9e0ddb77 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -20,7 +20,11 @@ modelServers:
       default: "vllm/vllm-openai:v0.6.3.post1"
       cpu: "substratusai/vllm:v0.6.3.post1-cpu"
       google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
-      gh200: "drikster80/vllm-gh200-openai:v0.6.3.post1"
+      nvidia-gpu: "vllm/vllm-openai:v0.6.3.post1"
+      # TODO (samos123) switch to the official image when it is available.
+      # Note this is simply a clone of drikster80/vllm-gh200-openai:v0.6.3.post1.
+      # Source: https://github.com/drikster80/vllm/tree/gh200-docker
+      gh200: "substratusai/vllm-gh200-openai:v0.6.3.post1"
   OLlama:
     images:
       default: "ollama/ollama:latest"
diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index a02cf28e..aabee642 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
@@ -161,6 +161,27 @@ catalog:
     # You can also use nvidia-gpu-a100-80gb:8
     resourceProfile: nvidia-gpu-h100:8
     targetRequests: 500
+  # NOTE(review): this key is identical to the entry that follows it. Duplicate
+  # mapping keys are invalid YAML; give one of the two entries a distinct name.
+  llama-3.1-70b-instruct-fp8-gh200:
+    enabled: true
+    features: [TextGeneration]
+    url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
+    engine: VLLM
+    env:
+      VLLM_ATTENTION_BACKEND: FLASHINFER
+    args:
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --max-num-seqs=1024
+      - --gpu-memory-utilization=0.9
+      - --enable-prefix-caching
+      - --enable-chunked-prefill=false
+      - --disable-log-requests
+      - --kv-cache-dtype=fp8
+      - --enforce-eager
+    resourceProfile: nvidia-gpu-gh200:1
+    targetRequests: 1024
   llama-3.1-70b-instruct-fp8-gh200:
     enabled: true
     features: [TextGeneration]