Add filtered BEIR benchmark dataset using Cohere embeddings (#34)
* Add filtered BEIR benchmark dataset using Cohere embeddings
* Update requirements.txt
Showing 2 changed files with 142 additions and 14 deletions.
requirements.txt
43 changes: 43 additions & 14 deletions
@@ -1,17 +1,46 @@
-contourpy==1.2.1
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aiosignal==1.3.1
+attrs==24.2.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+contourpy==1.3.1
 cycler==0.12.1
-fonttools==4.51.0
-kiwisolver==1.4.5
-matplotlib==3.8.4
-numpy==1.26.4
-packaging==24.0
-pandas==2.2.2
-pillow==10.3.0
-pyparsing==3.1.2
+datasets==3.2.0
+dill==0.3.8
+faiss-cpu==1.9.0.post1
+filelock==3.16.1
+fonttools==4.55.3
+frozenlist==1.5.0
+fsspec==2024.9.0
+h5py==3.12.1
+huggingface-hub==0.26.5
+idna==3.10
+joblib==1.4.2
+jsonlines==4.0.0
+kiwisolver==1.4.7
+matplotlib==3.9.3
+multidict==6.1.0
+multiprocess==0.70.16
+nltk==3.9.1
+numpy==2.2.0
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+propcache==0.2.1
+pyarrow==18.1.0
+pyparsing==3.2.0
 python-dateutil==2.9.0.post0
-pytz==2024.1
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
 seaborn==0.13.2
-six==1.16.0
-tzdata==2024.1
-faiss-cpu
-h5py
+six==1.17.0
+tqdm==4.67.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+xxhash==3.5.0
+yarl==1.18.3
benchmarker/scripts/python/generate-filtered-beir-dataset.py
99 changes: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
import pandas as pd
import numpy as np
from datasets import load_dataset
import h5py
import faiss, json, random
import os

# When True, each test query is filtered to a category different from the
# dataset it actually came from.
NEGATIVE_FILTERS = False

datasets = ['arguana', 'cqadupstack-android', 'cqadupstack-english', 'cqadupstack-gaming', 'cqadupstack-gis', 'cqadupstack-mathematica', 'cqadupstack-physics', 'cqadupstack-programmers', 'cqadupstack-stats', 'cqadupstack-text', 'cqadupstack-unix', 'cqadupstack-webmasters', 'cqadupstack-wordpress', 'fiqa', 'nfcorpus', 'quora', 'robust04', 'scidocs', 'scifact', 'trec-covid', 'trec-news', 'webis-touche2020']

def process_dataset(dataset_name):
    # Load up to the first 50,000 Cohere embed-english-v3 corpus embeddings for
    # one BEIR dataset and tag each vector with its source dataset.
    df_corpus = load_dataset("Cohere/beir-embed-english-v3", f"{dataset_name}-corpus", split="train")
    embeddings = df_corpus[:50000]['emb']
    dataset_names = [dataset_name] * len(embeddings)
    return pd.DataFrame({'embedding': embeddings, 'dataset': dataset_names})

all_data = pd.concat([process_dataset(dataset) for dataset in datasets], ignore_index=True)
all_data['embedding'] = all_data['embedding'].apply(lambda x: np.array(x, dtype=np.float32))
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

# The last 10,000 shuffled vectors become the test queries; the rest form the train set.
train_size = len(all_data) - 10000
train_data = all_data[:train_size]
test_data = all_data[train_size:]

dataset_to_id = {dataset: idx for idx, dataset in enumerate(datasets)}

def generate_json_properties(category_id):
    return json.dumps({"category": str(category_id)})

def get_random_different_category(original_category, num_categories):
    categories = list(range(num_categories))
    categories.remove(original_category)
    return random.choice(categories)

train_categories = np.array([dataset_to_id[dataset] for dataset in train_data['dataset']], dtype=np.int64)
test_categories = np.array([dataset_to_id[dataset] for dataset in test_data['dataset']], dtype=np.int64)

if NEGATIVE_FILTERS:
    test_categories = np.array([get_random_different_category(cat, len(datasets)) for cat in test_categories], dtype=np.int64)

train_properties = [generate_json_properties(cat) for cat in train_categories]
test_properties = [generate_json_properties(cat) for cat in test_categories]

# Generate one equality filter per test query
filters = []
for value in test_categories:
    filter_data = {
        "path": ["category"],
        "valueText": str(value),
        "operation": "Equal"
    }
    filters.append(json.dumps(filter_data))

train_embeddings = np.vstack(train_data['embedding'].values)
test_embeddings = np.vstack(test_data['embedding'].values)

# Build Faiss index
dimensions = train_embeddings.shape[1]
index = faiss.IndexFlatIP(dimensions)
index.add(train_embeddings)

neighbors_data = np.zeros((len(filters), 100), dtype=np.int64)
for i, filter_data in enumerate(filters):
    print(f"Brute force query {i + 1}/{len(filters)}")
    json_filter = json.loads(filter_data)
    category = int(json_filter["valueText"])
    # Restrict the exact search to train vectors whose category matches the query's filter.
    train_indices = np.where(train_categories == category)[0]
    selector = faiss.IDSelectorArray(train_indices)
    search_params = faiss.SearchParameters(sel=selector)
    D, I = index.search(test_embeddings[i].reshape(1, -1), 100, params=search_params)
    neighbors_data[i] = I[0]

filename = "beir-cohere-dot-filtered-negative.hdf5" if NEGATIVE_FILTERS else "beir-cohere-dot-filtered.hdf5"

with h5py.File(filename, 'w') as hf:
    hf.create_dataset("train", data=train_embeddings)
    hf.create_dataset("test", data=test_embeddings)
    hf.create_dataset("train_categories", data=train_categories)
    hf.create_dataset("test_categories", data=test_categories)
    hf.create_dataset("train_properties", data=np.array(train_properties, dtype=h5py.special_dtype(vlen=str)))
    hf.create_dataset("test_properties", data=np.array(test_properties, dtype=h5py.special_dtype(vlen=str)))
    hf.create_dataset("filters", data=np.array(filters, dtype=h5py.special_dtype(vlen=str)))
    hf.create_dataset("neighbors", data=neighbors_data)

# Print file size and some information
file_size = os.path.getsize(filename)
print(f"File size: {file_size / (1024 * 1024):.2f} MB")
print(f"Train dimensions: {train_embeddings.shape}")
print(f"Test dimensions: {test_embeddings.shape}")
print(f"Neighbors dimensions: {neighbors_data.shape}")
print(f"Number of unique categories: {len(np.unique(np.concatenate([train_categories, test_categories])))}")
print(f"Sample filter: {filters[0]}")
print(f"Sample train property: {train_properties[0]}")
print(f"Sample test property: {test_properties[0]}")
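As a quick sanity check, the generated file can be inspected with h5py. The sketch below is not part of the commit; it assumes the script was run with NEGATIVE_FILTERS = False (so the output is named beir-cohere-dot-filtered.hdf5) and verifies that the precomputed ground-truth neighbors of the first test query all carry the category named in that query's filter.

# Minimal verification sketch (not part of this commit).
import json

import h5py
import numpy as np

with h5py.File("beir-cohere-dot-filtered.hdf5", "r") as hf:
    train_categories = hf["train_categories"][:]  # category id per train vector
    neighbors = hf["neighbors"][:]                 # (10000, 100) ids into "train"
    filters = hf["filters"][:]                     # one JSON filter string per test query

# h5py returns variable-length strings as bytes; json.loads accepts bytes directly.
first_filter = json.loads(filters[0])
category = int(first_filter["valueText"])

# Every ground-truth neighbor of the first query must come from the filtered category.
assert np.all(train_categories[neighbors[0]] == category)
print("Neighbors of query 0 all match filter category", category)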