Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make monarch text embeddings #46

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,6 @@ dmypy.json
stagedb/*
*.rej
tests/input/dbs/go-nucleus-chroma/*/*.bin
tests/output/*
tests/output/*data/
stage_db/
data/
4 changes: 4 additions & 0 deletions notebooks/command-line/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
stagedb
*kg*.tsv
metadata.yaml
*kg.tar.gz
225 changes: 225 additions & 0 deletions notebooks/command-line/Index-monarch-kg.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Install dependencies into the kernel's own environment.\n",
"# %pip (rather than !pip) guarantees the install targets the running kernel,\n",
"# not whatever 'pip' happens to be first on the shell PATH.\n",
"%pip install git+https://github.com/monarch-initiative/curate-gpt.git\n",
"%pip install huggingface_hub pyyaml pandas pyarrow"
],
"metadata": {
"collapsed": false
},
"id": "6ccb0b14fb5a11a1"
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [],
"source": [
"# HfApi / create_repo: Hugging Face Hub client, used below to publish the artifacts.\n",
"# yaml: used to serialize the venomx metadata file.\n",
"from huggingface_hub import HfApi, create_repo\n",
"import yaml"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-08-01T11:21:14.843863Z",
"start_time": "2024-08-01T11:21:14.838745Z"
}
},
"id": "105b0e6972a9e087"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Download the current Monarch KG release and unpack it into ./data.\n",
"# NOTE(review): 'latest' is a moving target -- the metadata cell below records\n",
"# version 2024-07-12; confirm the downloaded build actually matches it.\n",
"!wget https://data.monarchinitiative.org/monarch-kg/latest/monarch-kg.tar.gz\n",
"!tar -xvzf monarch-kg.tar.gz"
],
"metadata": {
"collapsed": false
},
"id": "fb9336dad1877366"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Index the KG node table into a Chroma store at ./stagedb, collection 'monarch_kg',\n",
"# using the OpenAI embedding backend ('-m openai:').\n",
"# NOTE(review): presumably requires OPENAI_API_KEY in the environment -- confirm.\n",
"!curategpt index -p stagedb -c monarch_kg -m openai: data/monarch-kg_nodes.tsv"
],
"metadata": {
"collapsed": false
},
"id": "f47fce4b73e51127"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"import pandas as pd\n",
"from curate_gpt import ChromaDBAdapter\n",
"\n",
"def fetch_embeddings_from_chromadb(path, collection):\n",
"    \"\"\"Return the embedding vectors for every record in the given Chroma collection.\n",
"\n",
"    Raises ValueError if the collection contains no embeddings.\n",
"    \"\"\"\n",
"    # Initialize the database adapter\n",
"    db = ChromaDBAdapter(path)\n",
"\n",
"    # Chroma returns the vectors under results['embeddings'] when they are\n",
"    # explicitly included. (The previous version searched results['metadatas']\n",
"    # for an 'embeddings' key, which Chroma never populates with vectors --\n",
"    # and 'metadatas' is None here because it is not in `include`.)\n",
"    collection_obj = db.client.get_collection(name=collection)\n",
"    results = collection_obj.get(include=[\"embeddings\"])\n",
"\n",
"    embeddings = results.get(\"embeddings\")\n",
"    if embeddings is None or len(embeddings) == 0:\n",
"        raise ValueError(f\"No embeddings found in collection: {collection}\")\n",
"    return embeddings\n",
"\n",
"def export_embeddings_to_parquet(path, collection, output_file):\n",
"    \"\"\"Fetch all embeddings from a Chroma collection and write them to a Parquet file,\n",
"    one row per record.\"\"\"\n",
"    embeddings = fetch_embeddings_from_chromadb(path, collection)\n",
"\n",
"    # Convert embeddings to DataFrame (one column per vector dimension)\n",
"    df_embeddings = pd.DataFrame(embeddings)\n",
"\n",
"    df_embeddings.to_parquet(output_file, engine='pyarrow')\n",
"    print(f\"Embeddings have been successfully exported to {output_file}\")\n",
"\n",
"path_to_chromadb = './stagedb'\n",
"collection_name = 'monarch_kg'\n",
"output_parquet_file = 'monarch_text_embeddings.parquet'\n",
"\n",
"export_embeddings_to_parquet(path_to_chromadb, collection_name, output_parquet_file)"
],
"metadata": {
"collapsed": false
},
"id": "4c04eeafb792a7bd"
},
{
"cell_type": "code",
"execution_count": 21,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Metadata saved to ./metadata.yaml\n"
]
}
],
"source": [
"# Generate metadata in venomx format\n",
"# NOTE(review): the download cell above fetches 'latest' -- confirm it really is\n",
"# the 2024-07-12 build recorded below before publishing.\n",
"# NOTE(review): 'text-embedding-ada-002' is assumed to be what the 'openai:'\n",
"# backend used during indexing -- verify against the curategpt configuration.\n",
"metadata = {\n",
" 'description': 'Embeddings of the Monarch KG nodes, generated using curategpt and the nodes.tsv file from the Monarch KG version 2024-07-12',\n",
" 'model': {\n",
" 'name': 'text-embedding-ada-002'\n",
" },\n",
" 'dataset': {\n",
" 'name': 'Monarch KG 2024-07-12',\n",
" 'url': 'https://data.monarchinitiative.org/monarch-kg/2024-07-12/'\n",
" }\n",
"}\n",
"\n",
"# Save the metadata to a YAML file\n",
"metadata_file_path = './metadata.yaml'\n",
"with open(metadata_file_path, 'w') as f:\n",
" yaml.dump(metadata, f)\n",
"\n",
"print(f\"Metadata saved to {metadata_file_path}\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-08-01T11:21:25.245096Z",
"start_time": "2024-08-01T11:21:25.230167Z"
}
},
"id": "e4573dbb4c2cc72b"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Upload the parquet embeddings, the venomx metadata, and this notebook to Hugging Face.\n",
"import os\n",
"\n",
"repo_id = \"biomedical-translator/monarch_kg_node_text_embeddings\"\n",
"# exist_ok=True makes this cell re-runnable: create_repo raises if the repo already exists.\n",
"create_repo(repo_id, repo_type=\"dataset\", exist_ok=True)\n",
"\n",
"# Must match the actual filename on disk ('Index-monarch-kg.ipynb'); the previous\n",
"# lowercase name caused the recorded 'not a file on the local file system' error.\n",
"this_notebook_path = \"Index-monarch-kg.ipynb\"\n",
"\n",
"api = HfApi()\n",
"files_to_upload = [output_parquet_file, metadata_file_path, this_notebook_path]\n",
"\n",
"# Fail fast with a clear message instead of deep inside huggingface_hub internals.\n",
"missing = [f for f in files_to_upload if not os.path.isfile(f)]\n",
"if missing:\n",
"    raise FileNotFoundError(f\"Cannot upload, files not found: {missing}\")\n",
"\n",
"for file in files_to_upload:\n",
"    api.upload_file(\n",
"        path_or_fileobj=file,\n",
"        path_in_repo=file,\n",
"        repo_id=repo_id,\n",
"        repo_type=\"dataset\"\n",
"    )\n",
"\n",
"print(f\"Files uploaded to Hugging Face in repository: {repo_id}\")"
],
"metadata": {
"collapsed": false
},
"id": "d3fcdcba15078167"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading