Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prep for v0.2.3 release #125

Merged
merged 2 commits on Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,826 changes: 1,382 additions & 1,444 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "curategpt"
version = "0.2.2"
version = "0.2.3"
description = "CurateGPT"
authors = ["Chris Mungall <cjmungall@lbl.gov>", "Carlo Kroll <ckroll95@gmail.com>", "Harshad Hegde <hhegde@lbl.gov>", "J. Harry Caufield <jhc@lbl.gov>"]
license = "BSD-3"
Expand Down
9 changes: 4 additions & 5 deletions src/curategpt/store/db_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,7 @@ def collection_metadata(
:return:
"""

def set_collection_metadata(
self, collection_name: Optional[str], metadata: Metadata, **kwargs
):
def set_collection_metadata(self, collection_name: Optional[str], metadata: Metadata, **kwargs):
"""
Set the metadata for a collection.

Expand Down Expand Up @@ -488,6 +486,7 @@ def dump_then_load(self, collection: str = None, target: "DBAdapter" = None):
"""
raise NotImplementedError

def insert_from_huggingface(self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None, **kwargs):
def insert_from_huggingface(
self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None, **kwargs
):
raise NotImplementedError

140 changes: 76 additions & 64 deletions src/curategpt/store/duckdb_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,9 @@ def _process_objects(

if collection not in self.list_collection_names():
self._create_table_if_not_exists(
collection, self.vec_dimension, venomx=updated_venomx,
collection,
self.vec_dimension,
venomx=updated_venomx,
)

# if collection already exists, update metadata here
Expand Down Expand Up @@ -409,17 +411,17 @@ def _process_objects(
self.create_index(collection)

def insert_from_huggingface(
self,
objs: Union[OBJECT, Iterable[OBJECT]],
collection: str = None,
batch_size: int = None,
text_field: Union[str, Callable] = None,
venomx: Optional[Metadata] = None,
object_type: Optional[str] = None,
distance: Optional[str] = None,
vec_dimension: Optional[int] = None,
method: str = "insert",
**kwargs,
self,
objs: Union[OBJECT, Iterable[OBJECT]],
collection: str = None,
batch_size: int = None,
text_field: Union[str, Callable] = None,
venomx: Optional[Metadata] = None,
object_type: Optional[str] = None,
distance: Optional[str] = None,
vec_dimension: Optional[int] = None,
method: str = "insert",
**kwargs,
):
collection = self._get_collection(collection)
model = None
Expand All @@ -434,7 +436,9 @@ def insert_from_huggingface(
vec_dimension = self._get_embedding_dimension(model)

except Exception as e:
raise KeyError(f"Metadata from {collection} is not compatible with the current version of CurateGPT") from e
raise KeyError(
f"Metadata from {collection} is not compatible with the current version of CurateGPT"
) from e

updated_venomx = self.update_or_create_venomx(
venomx.venomx,
Expand All @@ -446,9 +450,11 @@ def insert_from_huggingface(
)
if collection not in self.list_collection_names():
self._create_table_if_not_exists(
collection, vec_dimension, venomx=updated_venomx,
collection,
vec_dimension,
venomx=updated_venomx,
)
updated_venomx.venomx.id = collection # prevent name error
updated_venomx.venomx.id = collection # prevent name error
self.set_collection_metadata(collection_name=collection, metadata=updated_venomx)
if batch_size is None:
batch_size = 100000
Expand All @@ -464,11 +470,17 @@ def insert_from_huggingface(

for next_objs in chunk(objs, batch_size):
next_objs = list(next_objs)
ids = [item['metadata']['id'] for item in next_objs]
ids = [item["metadata"]["id"] for item in next_objs]
metadatas = [self._dict(o) for o in next_objs]
documents = [item['document'] for item in next_objs]
embeddings = [item['embeddings'].tolist() if isinstance(item['embeddings'], np.ndarray)
else item['embeddings'] for item in next_objs]
documents = [item["document"] for item in next_objs]
embeddings = [
(
item["embeddings"].tolist()
if isinstance(item["embeddings"], np.ndarray)
else item["embeddings"]
)
for item in next_objs
]
try:
self.conn.execute("BEGIN TRANSACTION;")
self.conn.executemany(
Expand All @@ -484,67 +496,65 @@ def insert_from_huggingface(
finally:
self.create_index(collection)



def update_or_create_venomx(
self,
venomx: Optional[Index],
collection: str,
model: str,
distance: str,
object_type: str,
embeddings_dimension: Optional[int],
self,
venomx: Optional[Index],
collection: str,
model: str,
distance: str,
object_type: str,
embeddings_dimension: Optional[int],
) -> Metadata:
"""
Updates an existing Index instance (venomx) with additional values or creates a new one if none is provided.
"""
# If venomx already exists, update its nested fields (as e.g. vec_dimension would not be given)
if venomx:
new_embedding_model = Model(name=model)
updated_index = venomx.model_copy(update={ # given venomx comes as venomx=Index()
"embedding_model": new_embedding_model,
"embeddings_dimensions": embeddings_dimension,
})

venomx = Metadata(
venomx=updated_index,
hnsw_space=distance,
object_type=object_type
updated_index = venomx.model_copy(
update={ # given venomx comes as venomx=Index()
"embedding_model": new_embedding_model,
"embeddings_dimensions": embeddings_dimension,
}
)

venomx = Metadata(venomx=updated_index, hnsw_space=distance, object_type=object_type)

else:
if distance is None:
distance = self.distance_metric
venomx = self.populate_venomx(collection, model, distance, object_type, embeddings_dimension)
venomx = self.populate_venomx(
collection, model, distance, object_type, embeddings_dimension
)

return venomx

@staticmethod
def populate_venomx(
collection: Optional[str],
model: Optional[str],
distance: str,
object_type: str,
embeddings_dimension: int,
collection: Optional[str],
model: Optional[str],
distance: str,
object_type: str,
embeddings_dimension: int,
) -> Metadata:
"""
Populate venomx with data currently given when inserting
Populate venomx with data currently given when inserting

:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:return:
"""
:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:return:
"""
venomx = Metadata(
venomx=Index(
id=collection,
embedding_model=Model(name=model),
embeddings_dimensions=embeddings_dimension,
),
hnsw_space=distance,
object_type=object_type
object_type=object_type,
)
return venomx

Expand Down Expand Up @@ -764,15 +774,17 @@ def update_collection_metadata(self, collection: str, **kwargs):
raise ValueError("Collection name must be provided.")
metadata = self.collection_metadata(collection)
current_venomx = {**kwargs}
if metadata is None: # should not be possible
logger.warning(f"No existing metadata found for collection {collection}. Initializing new metadata.")
if metadata is None: # should not be possible
logger.warning(
f"No existing metadata found for collection {collection}. Initializing new metadata."
)
metadata = Metadata(venomx=Index(**current_venomx))
else:
metadata_dict = metadata.model_dump(exclude_none=True)
# Check if the existing venomx has an embedding model and if it matches the one in kwargs
if 'venomx' in metadata_dict and metadata_dict['venomx'].get('embedding_model'):
existing_model_name = metadata_dict['venomx']['embedding_model'].get('name')
new_model_name = current_venomx.get('embedding_model', {}).get('name')
if "venomx" in metadata_dict and metadata_dict["venomx"].get("embedding_model"):
existing_model_name = metadata_dict["venomx"]["embedding_model"].get("name")
new_model_name = current_venomx.get("embedding_model", {}).get("name")

if new_model_name and existing_model_name and new_model_name != existing_model_name:
raise ValueError(
Expand All @@ -781,10 +793,10 @@ def update_collection_metadata(self, collection: str, **kwargs):
)

# Merge current_venomx (from kwargs) into the nested venomx dictionary
if 'venomx' in metadata_dict and isinstance(metadata_dict['venomx'], dict):
metadata_dict['venomx'].update(current_venomx)
if "venomx" in metadata_dict and isinstance(metadata_dict["venomx"], dict):
metadata_dict["venomx"].update(current_venomx)
else:
metadata_dict['venomx'] = current_venomx
metadata_dict["venomx"] = current_venomx
# Reconstruct the Metadata object from the updated dictionary
metadata = Metadata(**metadata_dict)
updated_metadata_dict = metadata.model_dump(exclude_none=True)
Expand All @@ -799,9 +811,7 @@ def update_collection_metadata(self, collection: str, **kwargs):
)
return metadata

def set_collection_metadata(
self, collection_name: Optional[str], metadata: Metadata, **kwargs
):
def set_collection_metadata(self, collection_name: Optional[str], metadata: Metadata, **kwargs):
"""
Set the metadata for the collection
:param collection_name:
Expand All @@ -816,7 +826,9 @@ def set_collection_metadata(

if metadata:
if metadata.venomx.id != collection_name:
raise ValueError(f"venomx.id: {metadata.venomx.id} must match collection_name {collection_name}")
raise ValueError(
f"venomx.id: {metadata.venomx.id} must match collection_name {collection_name}"
)

new_model = metadata.venomx.embedding_model.name

Expand Down
5 changes: 3 additions & 2 deletions src/curategpt/store/duckdb_connection_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def _kill_process(pid: int) -> None:
"""Kill a process if it's holding the database lock."""
try:
import psutil

if psutil.pid_exists(pid):
process = psutil.Process(pid)
process.terminate()
Expand Down Expand Up @@ -65,11 +66,11 @@ def connect(self) -> duckdb.DuckDBPyConnection:
- Now safely open the fixed database normally

"""
wal_path = Path(self.path + '.wal')
wal_path = Path(self.path + ".wal")
if wal_path.exists():
logger.info("Found WAL file, attempting recovery...")
try:
temp_conn = duckdb.connect(':memory:')
temp_conn = duckdb.connect(":memory:")
self._load_vss_extensions(temp_conn)
temp_conn.execute(f"ATTACH '{self.path}' AS main_db")
temp_conn.execute("CHECKPOINT;")
Expand Down
47 changes: 21 additions & 26 deletions src/curategpt/store/in_memory_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,8 @@ def insert(self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None,
"""
self._insert(objs, collection, **kwargs)


def _insert(
self,
objs: Union[OBJECT, Iterable[OBJECT]],
collection: str = None,
venomx: Metadata = None
self, objs: Union[OBJECT, Iterable[OBJECT]], collection: str = None, venomx: Metadata = None
):
collection_obj = self._get_collection_object(collection)
if venomx is None:
Expand All @@ -121,33 +117,35 @@ def _insert(

@staticmethod
def populate_venomx(
collection: Optional[str],
model: Optional[str] = None,
distance: str = None,
object_type: str = None,
embeddings_dimension: int = None,
index_fields: Optional[Union[List[str], Tuple[str]]] = None,
collection: Optional[str],
model: Optional[str] = None,
distance: str = None,
object_type: str = None,
embeddings_dimension: int = None,
index_fields: Optional[Union[List[str], Tuple[str]]] = None,
) -> Metadata:
"""
Populate venomx with data currently given when inserting
Populate venomx with data currently given when inserting

:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:param index_fields:
:return:
"""
:param collection:
:param model:
:param distance:
:param object_type:
:param embeddings_dimension:
:param index_fields:
:return:
"""
venomx = Metadata(
venomx=Index(
id=collection,
embedding_model=Model(name=model),
embeddings_dimensions=embeddings_dimension,
embedding_input_method=ModelInputMethod(fields=index_fields) if index_fields else None
embedding_input_method=(
ModelInputMethod(fields=index_fields) if index_fields else None
),
),
hnsw_space=distance,
object_type=object_type
object_type=object_type,
)
return venomx

Expand Down Expand Up @@ -200,9 +198,7 @@ def collection_metadata(
cm.object_count = len(collection_obj.objects)
return cm

def set_collection_metadata(
self, collection_name: Optional[str], metadata: Metadata, **kwargs
):
def set_collection_metadata(self, collection_name: Optional[str], metadata: Metadata, **kwargs):
"""
Set the metadata for a collection.

Expand All @@ -215,7 +211,6 @@ def set_collection_metadata(
# raise ValueError(f"venomx.id: {metadata.venomx.id} must match collection_name {collection_name} and should not be changed")
collection_obj.metadata = metadata.model_dump(exclude_none=True)


def update_collection_metadata(self, collection_name: str, **kwargs) -> Metadata:
"""
Update the metadata for a collection.
Expand Down
Loading
Loading