Skip to content

Commit

Permalink
huggingface download/upload
Browse files Browse the repository at this point in the history
  • Loading branch information
iQuxLE committed Nov 28, 2024
1 parent 18df810 commit 9347157
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 95 deletions.
106 changes: 56 additions & 50 deletions src/curategpt/agents/huggingface_agent.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
import logging
import tempfile
from contextlib import ExitStack
from dataclasses import dataclass
from pathlib import Path
from typing import Dict

import pandas as pd
import yaml
from huggingface_hub import HfApi, create_repo
from huggingface_hub import HfApi, create_repo, get_token

logger = logging.getLogger(__name__)
HF_DOWNLOAD_PATH = Path(__file__).resolve().parents[4]
HF_DOWNLOAD_PATH = HF_DOWNLOAD_PATH / "hf_download"

@dataclass
class HuggingFaceAgent:

api: HfApi = None

def __post_init__(self):
    """
    Finish dataclass initialisation by preparing the Hugging Face client and token.

    An ``HfApi`` instance passed to the constructor is kept; a new one is
    only created when the ``api`` field was left at its ``None`` default.
    The access token is looked up once via ``get_token()`` and stored on
    ``self.token`` for use by the other methods of this agent.
    """
    # Previously the field was overwritten unconditionally, which silently
    # discarded any client supplied by the caller.
    if self.api is None:
        self.api = HfApi()
    # set export HUGGING_FACE_HUB_TOKEN="your_access_token"
    # NOTE(review): get_token() presumably yields None when no token is
    # configured - confirm against the huggingface_hub documentation.
    self.token = get_token()

def upload(self, objects, metadata, repo_id, private=False, **kwargs):
"""
Upload an entire collection to a Hugging Face repository.
Expand All @@ -18,18 +33,30 @@ def upload(self, objects, metadata, repo_id, private=False, **kwargs):
:param private: Whether the repository should be private.
:param kwargs: Additional arguments such as batch size or metadata options.
"""
venomx_metadata = self._transform_metadata_to_venomx(metadata)

embedding_file = "embeddings.parquet"
metadata_file = "metadata.yaml"
pd.DataFrame(objects).to_parquet(embedding_file)
with open(metadata_file, "w") as f:
yaml.dump(venomx_metadata, f)
try:
df = pd.DataFrame(data=[(obj[0], obj[2]['_embeddings'], obj[2]['document']) for obj in objects])
except Exception as e:
raise ValueError(f"Creation of Dataframe not successful: {e}") from e
with ExitStack() as stack:
tmp_parquet = stack.enter_context(tempfile.NamedTemporaryFile(suffix=".parquet", delete=True))
tmp_yaml = stack.enter_context(tempfile.NamedTemporaryFile(suffix=".yaml", delete=True))

embedding_path = tmp_parquet.name
metadata_path = tmp_yaml.name

df.to_parquet(path=embedding_path, index=False)
with open(metadata_path, "w") as f:
yaml.dump(metadata.model_dump(), f)

self._create_repo(repo_id, private=private)
self._upload_files(repo_id, {
embedding_file: embedding_file,
metadata_file: metadata_file,
})
self._create_repo(repo_id, private=private)

self._upload_files(repo_id, {
embedding_path : repo_id + "/" + embedding_file,
metadata_path : repo_id + "/" + metadata_file
})

def _create_repo(self, repo_id: str, private: bool = False):
"""
Expand All @@ -39,23 +66,23 @@ def _create_repo(self, repo_id: str, private: bool = False):
:param private: Whether the repository is private.
"""
try:
create_repo(repo_id=repo_id, repo_type="dataset", private=private)
create_repo(repo_id=repo_id, token=self.token, repo_type="dataset", private=private)
logger.info(f"Repository {repo_id} created successfully on Hugging Face.")
except Exception as e:
logger.error(f"Failed to create repository {repo_id} on Hugging Face: {e}")
raise


def _upload_files(self, repo_id: str, files: Dict[str, str]):
"""
Upload files to a Hugging Face repository.
:param repo_id: The repository ID on Hugging Face.
:param files: A dictionary with local file paths as keys and paths in the repository as values.
"""
api = HfApi()
try:
for local_path, repo_path in files.items():
api.upload_file(
self.api.upload_file(
path_or_fileobj=local_path,
path_in_repo=repo_path,
repo_id=repo_id,
Expand All @@ -66,42 +93,21 @@ def _upload_files(self, repo_id: str, files: Dict[str, str]):
logger.error(f"Failed to upload files to {repo_id} on Hugging Face: {e}")
raise

def _transform_metadata_to_venomx(self, metadata):
"""
Transform metadata from ChromaDB format to VenomX format.
def cached_download(
    self,
    repo_id: str,
    repo_type: str,
    filename: str
):
    """
    Fetch a single file from a Hugging Face repository.

    Delegates to ``self.api.hf_hub_download`` and authenticates with the
    token stored on this agent.

    :param repo_id: The repository ID on Hugging Face.
    :param repo_type: The repository type to pass through (e.g. "dataset").
    :param filename: Path of the file inside the repository.
    :return: The value returned by ``hf_hub_download`` (used by callers as
        the local path of the downloaded file).
    """
    fetch = self.api.hf_hub_download
    return fetch(
        repo_id=repo_id,
        repo_type=repo_type,
        filename=filename,
        token=self.token,
    )



:param metadata: Metadata object from store
:return: A dictionary formatted according to VenomX
"""

prefixes = metadata.prefixes if hasattr(metadata,
'prefixes') and metadata.prefixes else {}

venomx_metadata = {
"description": metadata.description or "No description provided",
"prefixes": prefixes,
"model": {
"name": metadata.model or "unknown",
# Default to a known model if not specified
},
"model_input_method": {
"description": "Simple pass through of labels only",
"fields": ["rdfs:label"]
# Adjust fields based on actual data structure if needed
},
"dataset": {
"name": metadata.name or "Unknown Dataset",
"url": metadata.source or "Unknown URL"
# Adjust based on available metadata
}
}

# Enrich VenomX format with annotations if available
if metadata.annotations:
venomx_metadata["annotations"] = metadata.annotations

# Include any additional fields from metadata that are relevant
if hasattr(metadata, 'extra_fields') and metadata.extra_fields:
venomx_metadata.update(metadata.extra_fields)

return venomx_metadata
102 changes: 97 additions & 5 deletions src/curategpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from curategpt.evaluation.splitter import stratify_collection
from curategpt.extract import AnnotatedObject
from curategpt.extract.basic_extractor import BasicExtractor
from curategpt.store import get_store
from curategpt.store import Metadata, get_store
from curategpt.store.schema_proxy import SchemaProxy
from curategpt.utils.vectordb_operations import match_collections
from curategpt.wrappers import BaseWrapper, get_wrapper
Expand Down Expand Up @@ -2356,7 +2356,8 @@ def _text_lookup(obj: Dict):
collection=collection,
model=model,
venomx=venomx,
object_type="OntologyClass",
object_type="OntologyClass"

)

e = time.time()
Expand Down Expand Up @@ -2416,9 +2417,7 @@ def load_embeddings(path, collection, append, embedding_format, model, file_or_u
"""
Index embeddings from a local file or URL into a ChromaDB collection.
"""
# TODO: not tested (its not really the usecase of the adapters, at least you wanna have
# ids or belonging objects? otherwise insert will calc that on the fly
# consider using add (working for chroma at least) and add to objects ?
# TODO: not tested
# Check if file_or_url is a URL
if file_or_url.startswith("http://") or file_or_url.startswith("https://"):
print(f"Downloading file from URL: {file_or_url}")
Expand Down Expand Up @@ -2466,6 +2465,7 @@ def upload_embeddings(path, collection, repo_id, private, adapter, database_type
try:
objects = list(db.fetch_all_objects_memory_safe(collection=collection))
metadata = db.collection_metadata(collection)
print(metadata)
except Exception as e:
print(f"Error accessing collection '{collection}' from database: {e}")
return
Expand All @@ -2482,6 +2482,98 @@ def upload_embeddings(path, collection, repo_id, private, adapter, database_type
print(f"Error uploading collection to {repo_id}: {e}")


@embeddings.command(name="download")
@path_option
@collection_option
@click.option(
    "--repo-id",
    required=True,
    help="Repository ID on Hugging Face, e.g., 'biomedical-translator/[repo_name]'.",
)
@click.option(
    "--embeddings-filename",
    "-ef",
    type=str,
    default="embeddings.parquet",
    show_default=True,
    help="Name of the parquet file that holds the embeddings.",
)
@click.option(
    "--metadata-filename",
    "-mf",
    type=str,
    required=False,
    default="metadata.yaml",
    show_default=True,
    help="Name of the YAML file that holds the collection metadata.",
)
@click.option("--adapter", default="huggingface", help="Adapter to use for downloading embeddings.")
@database_type_option
def download_embeddings(path, collection, repo_id, embeddings_filename, metadata_filename, adapter, database_type):
    """
    Download a dataset (e.g. from Hugging Face) and insert it into a collection.

    The embeddings parquet file and the metadata YAML file are fetched from
    the repository, converted into store objects and a ``Metadata`` object,
    and handed to ``insert_from_huggingface`` on the store.

    Example:

        curategpt embeddings download --repo-id biomedical-translator/my_repo --collection my_collection --embeddings-filename embeddings.parquet

        curategpt embeddings download --repo-id iQuxLE/hpo_label_embeddings --collection hf_d_collection --embeddings-filename embeddings.parquet
    """
    db = get_store(database_type, path)

    if adapter == "huggingface":
        agent = HuggingFaceAgent()
    else:
        raise ValueError(
            f"Unsupported adapter: {adapter} " f"currently only huggingface adapter is supported"
        )

    # Both files are needed further down; fail with a clear message instead of
    # an AttributeError on a None download path.
    if not embeddings_filename or not metadata_filename:
        raise ValueError("Both an embeddings filename and a metadata filename are required.")

    # The upload command stores files under a "<repo_id>/" prefix inside the
    # repository, so the same prefix is used to locate them again.
    remote_embeddings = repo_id + "/" + embeddings_filename
    remote_metadata = repo_id + "/" + metadata_filename

    try:
        # Go through the agent for both files so the stored token is used for
        # the metadata file as well (needed for private repositories).
        parquet_download = agent.cached_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=remote_embeddings,
        )
        metadata_download = agent.cached_download(
            repo_id=repo_id,
            repo_type="dataset",
            filename=remote_metadata,
        )
    except Exception as e:
        # Nothing can be inserted without the files: report and stop here
        # rather than continuing with missing download paths.
        click.echo(f"Error meanwhile downloading: {e}")
        return

    if not str(parquet_download).endswith(".parquet"):
        raise ValueError(f"Expected a .parquet embeddings file, got: {parquet_download}")
    if not str(metadata_download).endswith(".yaml"):
        raise ValueError(f"Expected a .yaml metadata file, got: {metadata_download}")

    df = pd.read_parquet(Path(parquet_download))
    if df.shape[1] < 3:
        raise ValueError(
            f"Expected at least 3 columns (metadata, embeddings, document) in {parquet_download}, "
            f"found {df.shape[1]}"
        )
    # Columns are addressed by position: metadata, embeddings, document.
    store_objects = [
        {"metadata": meta, "embeddings": emb, "document": doc}
        for meta, emb, doc in zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2])
    ]

    # populate venomx from file
    with open(metadata_download, "r") as infile:
        _meta = yaml.safe_load(infile)
    try:
        venomx_data = _meta.pop("venomx", None)
        venomx_obj = Index(**venomx_data) if venomx_data else None
        metadata_obj = Metadata(
            **_meta,
            venomx=venomx_obj
        )
    except Exception as e:
        raise ValueError(
            f"Error parsing metadata file: {e}. Downloaded metadata is not in the correct format.") from e

    db.insert_from_huggingface(collection=collection, objs=store_objects, venomx=metadata_obj)


@main.group()
def view():
"Virtual store/wrapper"
Expand Down
Loading

0 comments on commit 9347157

Please sign in to comment.