opendatahub-io · openshift-merge-bot · Jan 10, 2025 · Dec 11, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/internal/controller/constants/constants.go b/internal/controller/constants/constants.go
@@ -73,6 +73,7 @@ const (
 	DefaultStorageConfig             = "storage-config"
 	IntervalValue                    = "1m"
 	RequestRateInterval              = "5m"
+	GPUKVCacheSamplingInterval       = "24h"
 	OvmsImageName                    = "openvino_model_server"
 	TgisImageName                    = "text-generation-inference"
 	VllmImageName                    = "vllm"

diff --git a/internal/controller/constants/runtime-metrics.go b/internal/controller/constants/runtime-metrics.go
@@ -241,13 +241,13 @@ const (
 					}
 				]
 			},
-{
+			{
 				"title": "Average response time (ms)",
 				"type": "MEAN_LATENCY",
 				"queries": [
 					{
 						"title": "Average e2e latency",
-						"query": "sum by (model_name) (rate(e2e_request_latency_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000) / sum by (model_name) (rate(e2e_request_latency_seconds_count{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000)"
+						"query": "(rate(e2e_request_latency_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000) / (rate(e2e_request_latency_seconds_count{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000)"
 					}
 				]
 			},
@@ -270,6 +270,82 @@ const (
 						"query":  "sum(container_memory_working_set_bytes{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'})/sum(kube_pod_resource_limit{resource='memory', pod=~'${MODEL_NAME}-predictor-.*', namespace='${NAMESPACE}'})"
 					}
 				]
+			},
+			{
+				"title": "GPU cache usage over time",
+				"type": "KV_CACHE",
+				"queries": [
+					{
+						"title": "GPU cache usage over time",
+						"query": "sum_over_time(gpu_cache_usage_perc{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${KV_CACHE_SAMPLING_RATE}])"
+					}
+				]
+			},
+			{
+				"title": "Current running, waiting, and max requests count",
+				"type": "CURRENT_REQUESTS",
+				"queries": [
+					{
+						"title": "Requests waiting",
+						"query":  "num_requests_waiting{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}"
+					},
+					{
+						"title": "Requests running",
+						"query":  "num_requests_running{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}"
+					},
+					{
+						"title": "Max requests",
+						"query":  "num_request_max{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}"
+					}
+				]
+			},
+			{
+				"title": "Tokens count",
+				"type": "TOKENS_COUNT",
+				"queries": [
+					{
+						"title": "Total prompts token",
+						"query":  "round(rate(prompt_tokens_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]))"
+					},
+					{
+						"title": "Total generation token",
+						"query":  "round(rate(generation_tokens_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]))"
+					}
+				]
+			},
+			{
+				"title": "Time to first token",
+				"type": "TIME_TO_FIRST_TOKEN",
+				"queries": [
+					{
+						"title": "Time to first token",
+						"query": "rate(time_to_first_token_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])"
+					}
+				]
+			},
+			{
+				"title": "Time per output token",
+				"type": "TIME_PER_OUTPUT_TOKEN",
+				"queries": [
+					{
+						"title": "Time per output token",
+						"query": "rate(time_per_output_token_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])"
+					}
+				]
+			},
+			{
+				"title": "Requests outcomes",
+				"type": "REQUEST_OUTCOMES",
+				"queries": [
+					{
+						"title": "Number of successful incoming requests",
+						"query": "round(sum(increase(request_success_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${REQUEST_RATE_INTERVAL}])))"
+					},
+					{
+						"title": "Number of failed incoming requests",
+						"query": "round(sum(increase(request_failure_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${REQUEST_RATE_INTERVAL}])))"
+					}
+				]
 			}
 		]
     }`

diff --git a/internal/controller/utils/utils.go b/internal/controller/utils/utils.go
@@ -428,6 +428,7 @@ func SubstituteVariablesInQueries(data string, namespace string, name string) st
 		"${NAMESPACE}", namespace,
 		"${MODEL_NAME}", name,
 		"${RATE_INTERVAL}", constants.IntervalValue,
-		"${REQUEST_RATE_INTERVAL}", constants.RequestRateInterval)
+		"${REQUEST_RATE_INTERVAL}", constants.RequestRateInterval,
+		"${KV_CACHE_SAMPLING_RATE}", constants.GPUKVCacheSamplingInterval)
 	return replacer.Replace(data)
 }