Add k6 benchmark for chat threads

substratusai · Dec 13, 2024 · 83433cc · 83433cc
1 parent 47cce1b
commit 83433cc
Show file tree

Hide file tree

Showing 11 changed files with 22,741 additions and 3 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,6 +1,7 @@
 # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
 # Ignore build and test binaries.
 bin/
+benchmarks/
 charts/
 components/
 docs/

diff --git a/benchmarks/chat/.gitignore b/benchmarks/chat/.gitignore
@@ -0,0 +1 @@
+ShareGPT_V3_unfiltered_cleaned_split.json
diff --git a/benchmarks/chat/Makefile b/benchmarks/chat/Makefile
@@ -0,0 +1,5 @@
+# Download the ShareGPT dataset. Because the target is the file itself, make
+# skips the download when the file is already present.
+ShareGPT_V3_unfiltered_cleaned_split.json:
+	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# prepare-message-threads names a command, not a file it produces, so mark it
+# phony: it must always run, even if a file or directory with that name exists.
+.PHONY: prepare-message-threads
+# Run the preparation script once the dataset is available.
+prepare-message-threads: ShareGPT_V3_unfiltered_cleaned_split.json
+	python prepare-message-threads.py
diff --git a/benchmarks/chat/k6.js b/benchmarks/chat/k6.js
@@ -0,0 +1,87 @@
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+
+const model_addr = __ENV.MODEL_ADDR;
+const model_id = __ENV.MODEL_ID;
+const timePerToken = new Trend('time_per_token', true);
+const tokens = new Counter('tokens');
+const new_tokens = new Counter('new_tokens');
+const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;
+
+const messageThreads = JSON.parse(open("message-threads.json"))
+
+export const options = {
+    thresholds: {
+        http_req_failed: ['rate==0'],
+    },
+    scenarios: {
+        chat: {
+            executor: 'shared-iterations',
+            // Number of VUs to run concurrently.
+            vus: 20,
+            // Total number of script iterations to execute across all VUs (b/c using 'shared-iterations' executor).
+            iterations: 200,
+            maxDuration: '120s',
+        },
+    },
+};
+
+export default function run() {
+    const headers = { 'Content-Type': 'application/json' };
+    const msgThread = messageThreads[scenario.iterationInTest % messageThreads.length];
+    var payload = {
+        "messages": [],
+        "temperature": 0,
+        "model": `${model_id}`,
+        "max_tokens": max_new_tokens
+    };
+
+    // console.log(`Message thread: ${JSON.stringify(msgThread)}`);
+
+    // Iterate over all the messages in the thread, appending the completions to the same payload.
+    for (let i = 0; i < msgThread["userMessages"].length; i++) {
+        payload.messages.push({
+            "role": "user",
+            "content": msgThread["userMessages"][i]
+        });
+        //console.log(`Payload: ${JSON.stringify(payload)}`);
+
+        const res = http.post(`http://${model_addr}/v1/chat/completions`, JSON.stringify(payload), {
+            headers,
+        });
+        if (res.status >= 400 && res.status < 500) {
+            return;
+        }
+
+        check(res, {
+            'Post status is 200': (res) => res.status === 200,
+        });
+        const duration = res.timings.duration;
+
+        if (res.status === 200) {
+            // console.log(`Status: ${res.status}`);
+            const body = res.json();
+
+            const completion_tokens = body.usage.completion_tokens;
+            const prompt_tokens = body.usage.prompt_tokens;
+            const latency_ms_per_token = duration / completion_tokens;
+
+            new_tokens.add(completion_tokens);
+            input_tokens.add(prompt_tokens);
+            timePerToken.add(latency_ms_per_token);
+            tokens.add(completion_tokens + prompt_tokens);
+
+            const msg0 = body.choices[0].message;
+            payload.messages.push({
+                "role": msg0.role,
+                "content": msg0.content
+            });
+        } else {
+            console.log(`Error Status: ${res.status}`);
+            console.log(`Response: ${res.body}`);
+        }
+    }
+}
diff --git a/benchmarks/chat/k8s/pod.yaml b/benchmarks/chat/k8s/pod.yaml
@@ -0,0 +1,18 @@
+# Pod running the grafana/k6 image with the "chat-benchmark" ConfigMap mounted
+# at /work. Presumably the benchmark is started by exec'ing into the pod, as
+# the container itself only sleeps; confirm against the benchmark docs.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: chat-benchmark
+spec:
+  restartPolicy: Never
+  containers:
+    - name: k6
+      image: grafana/k6
+      # Keep the container idle instead of launching a k6 run on start.
+      command: ["sleep", "infinity"]
+      # NOTE(review): the disabled args below reference /config/k6.js, but the
+      # only volume is mounted at /work; confirm the path before re-enabling.
+      #args: ["run", "/config/k6.js"] #, "--http-debug"]
+      volumeMounts:
+        - name: work
+          mountPath: /work
+  volumes:
+    # Files under /work come from the "chat-benchmark" ConfigMap.
+    - name: work
+      configMap:
+        name: chat-benchmark