Skip to content

Commit

Permalink
Fix deletion of documents with no chunks (#1771)
Browse files Browse the repository at this point in the history
* Fix deletion of documents with no chunks

* Fix user auth on delete document

* Add offset and limit

* Fix deletion logic, clean up compose
  • Loading branch information
NolanTrem authored Jan 7, 2025
1 parent a624438 commit 518138d
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 63 deletions.
10 changes: 5 additions & 5 deletions py/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ services:
image: pgvector/pgvector:pg16
profiles: [postgres]
environment:
- POSTGRES_USER=${R2R_POSTGRES_USER:-${POSTGRES_USER:-postgres}} # Eventually get rid of POSTGRES_USER, but for now keep it for backwards compatibility
- POSTGRES_PASSWORD=${R2R_POSTGRES_PASSWORD:-${POSTGRES_PASSWORD:-postgres}} # Eventually get rid of POSTGRES_PASSWORD, but for now keep it for backwards compatibility
- POSTGRES_HOST=${R2R_POSTGRES_HOST:-${POSTGRES_HOST:-postgres}} # Eventually get rid of POSTGRES_HOST, but for now keep it for backwards compatibility
- POSTGRES_PORT=${R2R_POSTGRES_PORT:-${POSTGRES_PORT:-5432}} # Eventually get rid of POSTGRES_PORT, but for now keep it for backwards compatibility
- POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-${POSTGRES_MAX_CONNECTIONS:-1024}} # Eventually get rid of POSTGRES_MAX_CONNECTIONS, but for now keep it for backwards compatibility
- POSTGRES_USER=${R2R_POSTGRES_USER:-postgres}
- POSTGRES_PASSWORD=${R2R_POSTGRES_PASSWORD:-postgres}
- POSTGRES_HOST=${R2R_POSTGRES_HOST:-postgres}
- POSTGRES_PORT=${R2R_POSTGRES_PORT:-5432}
- POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
- PGPORT=${R2R_POSTGRES_PORT:-5432}
volumes:
- postgres_data:/var/lib/postgresql/data
Expand Down
2 changes: 1 addition & 1 deletion py/core/base/providers/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class Config:
"provider": "r2r",
"excluded_parsers": ["mp4"],
"chunking_strategy": "recursive",
"chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
"chunk_enrichment_settings": ChunkEnrichmentSettings(),
"extra_parsers": {},
"audio_transcription_model": "openai/whisper-1",
"vision_img_prompt_name": "vision_img",
Expand Down
2 changes: 1 addition & 1 deletion py/core/configs/r2r_azure.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ batch_size = 256

[embedding]
provider = "litellm"
base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
base_dimension = 512

[file]
Expand Down
5 changes: 4 additions & 1 deletion py/core/main/api/v3/documents_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,7 +1307,10 @@ async def delete_document_by_filter(
auth_user=Depends(self.providers.auth.auth_wrapper()),
) -> WrappedBooleanResponse:
"""
Delete documents based on provided filters. Allowed operators include `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, and `nin`. Deletion requests are limited to a user's own documents.
Delete documents based on provided filters. Allowed operators
include: `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`,
`ilike`, `in`, and `nin`. Deletion requests are limited to a
user's own documents.
"""

filters_dict = {
Expand Down
15 changes: 0 additions & 15 deletions py/core/main/services/ingestion_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,21 +711,6 @@ def _parse_user_data(user_data) -> User:
) from e
return User.from_dict(user_data)

@staticmethod
def _parse_chunk_enrichment_settings(
    chunk_enrichment_settings: dict,
) -> ChunkEnrichmentSettings:
    """Normalize chunk-enrichment settings into a ChunkEnrichmentSettings.

    Accepts either an already-parsed dict or a JSON-encoded string.
    Raises ValueError (chained from json.JSONDecodeError) when a string
    payload is not valid JSON.
    """
    # Fast path: non-string input is assumed to already be a mapping.
    if not isinstance(chunk_enrichment_settings, str):
        return ChunkEnrichmentSettings.from_dict(chunk_enrichment_settings)
    try:
        decoded = json.loads(chunk_enrichment_settings)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Invalid chunk enrichment settings format: {chunk_enrichment_settings}"
        ) from e
    return ChunkEnrichmentSettings.from_dict(decoded)

@staticmethod
def parse_ingest_file_input(data: dict) -> dict:
return {
Expand Down
90 changes: 50 additions & 40 deletions py/core/main/services/management_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,30 +114,19 @@ def transform_chunk_id_to_id(
return transformed
return filters

# 1. (Optional) Validate the input filters based on your rules.
# E.g., check if filters is not empty, allowed fields, etc.
# validate_filters(filters)

# 2. Transform filters if needed.
# For example, if `chunk_id` is used, map it to `id`, or similar transformations.
# Transform filters if needed.
transformed_filters = transform_chunk_id_to_id(filters)

# 3. First, find out which chunks match these filters *before* deleting, so we know which docs are affected.
# You can do a list operation on chunks to see which chunk IDs and doc IDs would be hit.
# Find chunks that match the filters before deleting
interim_results = (
await self.providers.database.chunks_handler.list_chunks(
filters=transformed_filters,
offset=0,
limit=1_000, # Arbitrary large limit or pagination logic
limit=1_000,
include_vectors=False,
)
)

if interim_results["page_info"]["total_entries"] == 0:
raise R2RException(
status_code=404, message="No entries found for deletion."
)

results = interim_results["results"]
while interim_results["page_info"]["total_entries"] == 1_000:
# If we hit the limit, we need to paginate to get all results
Expand All @@ -151,43 +140,65 @@ def transform_chunk_id_to_id(
)
)
results.extend(interim_results["results"])
matched_chunk_docs = {UUID(chunk["document_id"]) for chunk in results}

# If no chunks match, raise or return a no-op result
if not matched_chunk_docs:
return {
"success": False,
"message": "No chunks match the given filters.",
}
document_ids = set()
owner_id = None

if "$and" in filters:
for condition in filters["$and"]:
if "owner_id" in condition and "$eq" in condition["owner_id"]:
owner_id = condition["owner_id"]["$eq"]
elif (
"document_id" in condition
and "$eq" in condition["document_id"]
):
document_ids.add(UUID(condition["document_id"]["$eq"]))
elif "document_id" in filters:
doc_id = filters["document_id"]
if isinstance(doc_id, str):
document_ids.add(UUID(doc_id))
elif isinstance(doc_id, UUID):
document_ids.add(doc_id)
elif isinstance(doc_id, dict) and "$eq" in doc_id:
value = doc_id["$eq"]
document_ids.add(
UUID(value) if isinstance(value, str) else value
)

# 4. Delete the matching chunks from the database.
# Delete matching chunks from the database
delete_results = await self.providers.database.chunks_handler.delete(
transformed_filters
)

# 5. From `delete_results`, extract the document_ids that were affected.
# The delete_results should map chunk_id to details including `document_id`.
# Extract the document_ids that were affected.
affected_doc_ids = {
UUID(info["document_id"])
for info in delete_results.values()
if info.get("document_id")
}
document_ids.update(affected_doc_ids)

# 6. For each affected document, check if the document still has any chunks left.
# Check if the document still has any chunks left
docs_to_delete = []
for doc_id in affected_doc_ids:
remaining = await self.providers.database.chunks_handler.list_document_chunks(
document_id=doc_id,
offset=0,
limit=1, # Just need to know if there's at least one left
include_vectors=False,
for doc_id in document_ids:
documents_overview_response = await self.providers.database.documents_handler.get_documents_overview(
offset=0, limit=1, filter_document_ids=[doc_id]
)
# If no remaining chunks, we should delete the document.
if remaining["total_entries"] == 0:
docs_to_delete.append(doc_id)
if not documents_overview_response["results"]:
raise R2RException(
status_code=404, message="Document not found"
)

document = documents_overview_response["results"][0]

if owner_id and str(document.owner_id) != owner_id:
raise R2RException(
status_code=404,
message="Document not found or insufficient permissions",
)
docs_to_delete.append(doc_id)

# 7. Delete documents that no longer have associated chunks.
# Also update graphs if needed (entities/relationships).
# Delete documents that no longer have associated chunks
for doc_id in docs_to_delete:
# Delete related entities & relationships if needed:
await self.providers.database.graphs_handler.entities.delete(
Expand All @@ -204,7 +215,6 @@ def transform_chunk_id_to_id(
document_id=doc_id
)

# 8. Return a summary of what happened.
return {
"success": True,
"deleted_chunks_count": len(delete_results),
Expand Down Expand Up @@ -681,10 +691,10 @@ async def summarize_collection(
),
)

collection_summary = response.choices[0].message.content
if not collection_summary:
if collection_summary := response.choices[0].message.content:
return collection_summary
else:
raise ValueError("Expected a generated response.")
return collection_summary

@telemetry_event("AddPrompt")
async def add_prompt(
Expand Down

0 comments on commit 518138d

Please sign in to comment.