Add filtered BEIR benchmark dataset using Cohere embeddings (#34)
* Add filtered BEIR benchmark dataset using Cohere embeddings

* Update requirements.txt
trengrj authored Dec 12, 2024
1 parent 324966c commit 8f6a1c1
Showing 2 changed files with 142 additions and 14 deletions.
57 changes: 43 additions & 14 deletions benchmarker/requirements.txt
@@ -1,17 +1,46 @@
-contourpy==1.2.1
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.3.1
+attrs==24.2.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+contourpy==1.3.1
 cycler==0.12.1
-fonttools==4.51.0
-kiwisolver==1.4.5
-matplotlib==3.8.4
-numpy==1.26.4
-packaging==24.0
-pandas==2.2.2
-pillow==10.3.0
-pyparsing==3.1.2
+datasets==3.2.0
+dill==0.3.8
+faiss-cpu==1.9.0.post1
+filelock==3.16.1
+fonttools==4.55.3
+frozenlist==1.5.0
+fsspec==2024.9.0
+h5py==3.12.1
+huggingface-hub==0.26.5
+idna==3.10
+joblib==1.4.2
+jsonlines==4.0.0
+kiwisolver==1.4.7
+matplotlib==3.9.3
+multidict==6.1.0
+multiprocess==0.70.16
+nltk==3.9.1
+numpy==2.2.0
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+propcache==0.2.1
+pyarrow==18.1.0
+pyparsing==3.2.0
 python-dateutil==2.9.0.post0
-pytz==2024.1
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
 seaborn==0.13.2
-six==1.16.0
-tzdata==2024.1
-faiss-cpu
-h5py
+six==1.17.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+xxhash==3.5.0
+yarl==1.18.3
99 changes: 99 additions & 0 deletions benchmarker/scripts/python/generate-filtered-beir-dataset.py
@@ -0,0 +1,99 @@
import json
import os
import random

import faiss
import h5py
import numpy as np
import pandas as pd
from datasets import load_dataset

# When True, each test query is assigned a category it does not belong to,
# so the generated filters exclude the query's own source dataset.
NEGATIVE_FILTERS = False

datasets = [
    'arguana', 'cqadupstack-android', 'cqadupstack-english', 'cqadupstack-gaming',
    'cqadupstack-gis', 'cqadupstack-mathematica', 'cqadupstack-physics',
    'cqadupstack-programmers', 'cqadupstack-stats', 'cqadupstack-text',
    'cqadupstack-unix', 'cqadupstack-webmasters', 'cqadupstack-wordpress',
    'fiqa', 'nfcorpus', 'quora', 'robust04', 'scidocs', 'scifact',
    'trec-covid', 'trec-news', 'webis-touche2020'
]

def process_dataset(dataset_name):
    # Load up to the first 50,000 corpus embeddings for one BEIR dataset.
    df_corpus = load_dataset("Cohere/beir-embed-english-v3", f"{dataset_name}-corpus", split="train")
    embeddings = df_corpus[:50000]['emb']
    dataset_names = [dataset_name] * len(embeddings)
    return pd.DataFrame({'embedding': embeddings, 'dataset': dataset_names})

all_data = pd.concat([process_dataset(dataset) for dataset in datasets], ignore_index=True)
all_data['embedding'] = all_data['embedding'].apply(lambda x: np.array(x, dtype=np.float32))
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out the last 10,000 shuffled vectors as test queries.
train_size = len(all_data) - 10000
train_data = all_data[:train_size]
test_data = all_data[train_size:]

dataset_to_id = {dataset: idx for idx, dataset in enumerate(datasets)}

def generate_json_properties(category_id):
    return json.dumps({"category": str(category_id)})

def get_random_different_category(original_category, num_categories):
    categories = list(range(num_categories))
    categories.remove(original_category)
    return random.choice(categories)

train_categories = np.array([dataset_to_id[dataset] for dataset in train_data['dataset']], dtype=np.int64)
test_categories = np.array([dataset_to_id[dataset] for dataset in test_data['dataset']], dtype=np.int64)

if NEGATIVE_FILTERS:
    test_categories = np.array([get_random_different_category(cat, len(datasets)) for cat in test_categories], dtype=np.int64)

train_properties = [generate_json_properties(cat) for cat in train_categories]
test_properties = [generate_json_properties(cat) for cat in test_categories]

# Generate one equality filter per test query, matching its assigned category
filters = []
for value in test_categories:
    filter_data = {
        "path": ["category"],
        "valueText": str(value),
        "operation": "Equal"
    }
    filters.append(json.dumps(filter_data))

train_embeddings = np.vstack(train_data['embedding'].values)
test_embeddings = np.vstack(test_data['embedding'].values)

# Build a brute-force inner-product (dot) Faiss index over the train vectors
dimensions = train_embeddings.shape[1]
index = faiss.IndexFlatIP(dimensions)
index.add(train_embeddings)

# Compute exact ground-truth neighbors for each test query, restricting the
# search to train vectors whose category matches the query's filter
neighbors_data = np.zeros((len(filters), 100), dtype=np.int64)
for i, filter_data in enumerate(filters):
    print(f"Brute force query {i + 1}/{len(filters)}")
    json_filter = json.loads(filter_data)
    category = int(json_filter["valueText"])
    train_indices = np.where(train_categories == category)[0]
    selector = faiss.IDSelectorArray(train_indices)
    search_params = faiss.SearchParameters(sel=selector)
    D, I = index.search(test_embeddings[i].reshape(1, -1), 100, params=search_params)
    neighbors_data[i] = I[0]

filename = "beir-cohere-dot-filtered-negative.hdf5" if NEGATIVE_FILTERS else "beir-cohere-dot-filtered.hdf5"

with h5py.File(filename, 'w') as hf:
    hf.create_dataset("train", data=train_embeddings)
    hf.create_dataset("test", data=test_embeddings)
    hf.create_dataset("train_categories", data=train_categories)
    hf.create_dataset("test_categories", data=test_categories)
    hf.create_dataset("train_properties", data=np.array(train_properties, dtype=h5py.special_dtype(vlen=str)))
    hf.create_dataset("test_properties", data=np.array(test_properties, dtype=h5py.special_dtype(vlen=str)))
    hf.create_dataset("filters", data=np.array(filters, dtype=h5py.special_dtype(vlen=str)))
    hf.create_dataset("neighbors", data=neighbors_data)

# Print file size and summary statistics
file_size = os.path.getsize(filename)
print(f"File size: {file_size / (1024 * 1024):.2f} MB")
print(f"Train dimensions: {train_embeddings.shape}")
print(f"Test dimensions: {test_embeddings.shape}")
print(f"Neighbors dimensions: {neighbors_data.shape}")
print(f"Number of unique categories: {len(np.unique(np.concatenate([train_categories, test_categories])))}")
print(f"Sample filter: {filters[0]}")
print(f"Sample train property: {train_properties[0]}")
print(f"Sample test property: {test_properties[0]}")



