From 6d0642cc9676e570ff3fdaaed4dfa13b7fb12695 Mon Sep 17 00:00:00 2001 From: Stefan Bogdan Date: Tue, 12 Nov 2024 04:40:27 +0100 Subject: [PATCH] update README, docker-compose.yaml and change performance graph colours (#39) --- README.md | 34 ++++++------------- .../scripts/python/performance-graphs.py | 6 ++-- docker-compose.yml | 8 +++-- 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index b51c495..a3194eb 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ This repo contains a tool for benchmarking Weaviate performance. * 📊 results and context can be found in the [Weaviate documentation](https://weaviate.io/developers/weaviate/current/benchmarks/) * 💬 discuss the results on our [Slack channel](https://join.slack.com/t/weaviate/shared_invite/zt-goaoifjr-o8FuVz9b1HLzhlUfyfddhw) or [Twitter](https://twitter.com/weaviate_io) -### ANN benchmark +## ANN benchmark There are two components you will need to run for the benchmarks: @@ -16,42 +16,30 @@ There are two components you will need to run for the benchmarks: You can run both as containers on the same machine via Docker compose. -For replicating our benchmarks we recommend setting up two separate machines. +For replicating our benchmarks we recommend setting up the following machine: -| Machine description | CPU type | CPUs | Memory | Disk size | Disk type | Misc. | +| Machine name | CPU type | CPUs | Memory | Disk size | Disk type | Misc. 
| | --- | --- | --- | --- | --- | --- | --- | -| Machine to run Weaviate | c2 | 30 | 120GB | 500GB | SSD | [Ubuntu 22.04 with Docker-compose](https://gist.github.com/bobvanluijt/04f6d97916244a7de59fead84ef63cd4) | -| Machine to run benchmark script | N2 | 8 | 64GB | 500GB | SSD | [Ubuntu 22.04 with Docker-compose](https://gist.github.com/bobvanluijt/04f6d97916244a7de59fead84ef63cd4) | +| `n4-highmem-16` | N4 | 16 | 128GB | 512GB | Hyperdisk Balanced | Debian 12 (bookworm) with [Docker and Compose V2](https://gist.github.com/StefanBogdan/821d18bbc5f18978643adff508749cf0) | -#### Prepare the Weaviate machine +### Run tests -Clone this repo and cd into it `$ git clone https://github.com/semi-technologies/weaviate-benchmarking && cd weaviate-benchmarking` - -Run the following command to spin up Weaviate: `$ docker-compose up weaviate -d` - -Copy the IP address and amount of CPU cores this machine has. - -#### Prepare the benchmark machine - -Check if the Weaviate machine is available: `curl http://{IP OF WEAVIATE INSTANCE}/v1/meta`. Note that the instance runs on port `8080`, e.g., `http://10.128.15.12:8080/v1/meta`. -You will also need to allow port `50051` for gRPC and `21121` for metrics. You can verify this via `nc -zv localhost 50051`. - -Clone this repo and cd into it `$ git clone https://github.com/semi-technologies/weaviate-benchmarking && cd weaviate-benchmarking` +Clone this repo and cd into it `$ git clone https://github.com/weaviate/weaviate-benchmarking && cd weaviate-benchmarking` Download the files into a datasets folder as outlined below. 
```sh mkdir datasets && \ - curl -o ./datasets/dbpedia-100k-openai-ada002-angular.hdf5 https://storage.googleapis.com/ann-datasets/custom/dbpedia-100k-openai-ada002-angular.hdf5 \ - curl -o ./datasets/deep-image-96-angular.hdf5 https://ann-benchmarks.com/deep-image-96-angular.hdf5 && \ - curl -o ./datasets/mnist-784-euclidean.hdf5 https://ann-benchmarks.com/mnist-784-euclidean.hdf5 && \ - curl -o ./datasets/gist-960-euclidean.hdf5 https://ann-benchmarks.com/gist-960-euclidean.hdf5 + curl -o ./datasets/dbpedia-openai-1000k-angular.hdf5 https://storage.googleapis.com/ann-datasets/ann-benchmarks/dbpedia-openai-1000k-angular.hdf5 && \ + curl -o ./datasets/snowflake-msmarco-arctic-embed-m-v1.5-angular.hdf5 https://storage.googleapis.com/ann-datasets/custom/snowflake-msmarco-arctic-embed-m-v1.5-angular.hdf5 && \ + curl -o ./datasets/sift-128-euclidean.hdf5 http://ann-benchmarks.com/sift-128-euclidean.hdf5 && \ + curl -o ./datasets/sphere-10M-meta-dpr.hdf5 https://storage.googleapis.com/ann-datasets/custom/sphere-10M-meta-dpr.hdf5 ``` Run a single performance test on an [ann-benchmarks](https://ann-benchmarks.com/) hdf5 dataset. ```sh -DATASET=./datasets/dbpedia-100k-openai-ada002-angular.hdf5 DISTANCE=cosine docker compose run benchmarker +DATASET=./datasets/dbpedia-openai-1000k-angular.hdf5 DISTANCE=cosine docker compose up --abort-on-container-exit ``` For more details on additional configuration options see the help options. 
diff --git a/benchmarker/scripts/python/performance-graphs.py b/benchmarker/scripts/python/performance-graphs.py index e76bc2d..8741444 100644 --- a/benchmarker/scripts/python/performance-graphs.py +++ b/benchmarker/scripts/python/performance-graphs.py @@ -4,8 +4,6 @@ import glob import json import argparse -import seaborn as sns -import matplotlib.ticker as tkr import matplotlib.pyplot as plt import pandas as pd @@ -40,8 +38,8 @@ def create_plot(results_df: pd.DataFrame, mode='light'): # Set custom colors for limits color_map = { - 100: '#098f73', - 10: '#2b17e7' + 100: '#61bd73', + 10: '#fc3988' } # Configure plot style based on mode diff --git a/docker-compose.yml b/docker-compose.yml index fe1262e..4511421 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,8 +7,8 @@ services: dockerfile: Dockerfile command: > /app/benchmarker ann-benchmark - -v ${DATASET:-./datasets/dbpedia-100k-openai-ada002-angular.hdf5 } - -d ${DISTANCE:-cosine} + --vectors ${DATASET:-./datasets/dbpedia-100k-openai-ada002-angular.hdf5 } + --distance ${DISTANCE:-cosine} --grpcOrigin ${GRPC_ORIGIN:-weaviate:50051} --httpOrigin ${HTTP_ORIGIN:-weaviate:8080} volumes: @@ -22,12 +22,14 @@ services: - '8080' - --scheme - http - image: docker.io/semitechnologies/weaviate:1.25.8 + image: docker.io/semitechnologies/weaviate:1.27.1 ports: - 8080:8080 - 50051:50051 - 2112:2112 restart: on-failure:0 + volumes: + - "$PWD/weaviate-data:/var/lib/weaviate" environment: QUERY_DEFAULTS_LIMIT: 25 AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'