Skip to content

Commit

Permalink
Fix deletion of documents with no chunks (#1771)
Browse files Browse the repository at this point in the history
* Fix deletion of documents with no chunks

* Fix user auth on delete document

* Add offset and limit

* Fix deletion logic, clean up compose
  • Loading branch information
NolanTrem authored Jan 7, 2025
1 parent a624438 commit 518138d
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 63 deletions.
10 changes: 5 additions & 5 deletions py/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ services:
image: pgvector/pgvector:pg16
profiles: [postgres]
environment:
- POSTGRES_USER=${R2R_POSTGRES_USER:-${POSTGRES_USER:-postgres}} # Eventually get rid of POSTGRES_USER, but for now keep it for backwards compatibility
- POSTGRES_PASSWORD=${R2R_POSTGRES_PASSWORD:-${POSTGRES_PASSWORD:-postgres}} # Eventually get rid of POSTGRES_PASSWORD, but for now keep it for backwards compatibility
- POSTGRES_HOST=${R2R_POSTGRES_HOST:-${POSTGRES_HOST:-postgres}} # Eventually get rid of POSTGRES_HOST, but for now keep it for backwards compatibility
- POSTGRES_PORT=${R2R_POSTGRES_PORT:-${POSTGRES_PORT:-5432}} # Eventually get rid of POSTGRES_PORT, but for now keep it for backwards compatibility
- POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-${POSTGRES_MAX_CONNECTIONS:-1024}} # Eventually get rid of POSTGRES_MAX_CONNECTIONS, but for now keep it for backwards compatibility
- POSTGRES_USER=${R2R_POSTGRES_USER:-postgres}
- POSTGRES_PASSWORD=${R2R_POSTGRES_PASSWORD:-postgres}
- POSTGRES_HOST=${R2R_POSTGRES_HOST:-postgres}
- POSTGRES_PORT=${R2R_POSTGRES_PORT:-5432}
- POSTGRES_MAX_CONNECTIONS=${R2R_POSTGRES_MAX_CONNECTIONS:-1024}
- PGPORT=${R2R_POSTGRES_PORT:-5432}
volumes:
- postgres_data:/var/lib/postgresql/data
Expand Down
2 changes: 1 addition & 1 deletion py/core/base/providers/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class Config:
"provider": "r2r",
"excluded_parsers": ["mp4"],
"chunking_strategy": "recursive",
"chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
"chunk_enrichment_settings": ChunkEnrichmentSettings(),
"extra_parsers": {},
"audio_transcription_model": "openai/whisper-1",
"vision_img_prompt_name": "vision_img",
Expand Down
2 changes: 1 addition & 1 deletion py/core/configs/r2r_azure.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ batch_size = 256

[embedding]
provider = "litellm"
base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
base_dimension = 512

[file]
Expand Down
5 changes: 4 additions & 1 deletion py/core/main/api/v3/documents_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,7 +1307,10 @@ async def delete_document_by_filter(
auth_user=Depends(self.providers.auth.auth_wrapper()),
) -> WrappedBooleanResponse:
"""
Delete documents based on provided filters. Allowed operators include `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, and `nin`. Deletion requests are limited to a user's own documents.
Delete documents based on provided filters. Allowed operators
include: `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`,
`ilike`, `in`, and `nin`. Deletion requests are limited to a
user's own documents.
"""

filters_dict = {
Expand Down
15 changes: 0 additions & 15 deletions py/core/main/services/ingestion_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,21 +711,6 @@ def _parse_user_data(user_data) -> User:
) from e
return User.from_dict(user_data)

@staticmethod
def _parse_chunk_enrichment_settings(
    chunk_enrichment_settings: dict,
) -> ChunkEnrichmentSettings:
    """Normalize chunk-enrichment settings into a ChunkEnrichmentSettings.

    Accepts either an already-parsed dict or a JSON-encoded string.
    Raises ValueError (chained from json.JSONDecodeError) when a string
    payload is not valid JSON.
    """
    # Fast path: non-string input is assumed to already be a mapping.
    if not isinstance(chunk_enrichment_settings, str):
        return ChunkEnrichmentSettings.from_dict(chunk_enrichment_settings)
    try:
        decoded = json.loads(chunk_enrichment_settings)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Invalid chunk enrichment settings format: {chunk_enrichment_settings}"
        ) from e
    return ChunkEnrichmentSettings.from_dict(decoded)

@staticmethod
def parse_ingest_file_input(data: dict) -> dict:
return {
Expand Down
90 changes: 50 additions & 40 deletions py/core/main/services/management_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,30 +114,19 @@ def transform_chunk_id_to_id(
return transformed
return filters

# 1. (Optional) Validate the input filters based on your rules.
# E.g., check if filters is not empty, allowed fields, etc.
# validate_filters(filters)

# 2. Transform filters if needed.
# For example, if `chunk_id` is used, map it to `id`, or similar transformations.
# Transform filters if needed.
transformed_filters = transform_chunk_id_to_id(filters)

# 3. First, find out which chunks match these filters *before* deleting, so we know which docs are affected.
# You can do a list operation on chunks to see which chunk IDs and doc IDs would be hit.
# Find chunks that match the filters before deleting
interim_results = (
await self.providers.database.chunks_handler.list_chunks(
filters=transformed_filters,
offset=0,
limit=1_000, # Arbitrary large limit or pagination logic
limit=1_000,
include_vectors=False,
)
)

if interim_results["page_info"]["total_entries"] == 0:
raise R2RException(
status_code=404, message="No entries found for deletion."
)

results = interim_results["results"]
while interim_results["page_info"]["total_entries"] == 1_000:
# If we hit the limit, we need to paginate to get all results
Expand All @@ -151,43 +140,65 @@ def transform_chunk_id_to_id(
)
)
results.extend(interim_results["results"])
matched_chunk_docs = {UUID(chunk["document_id"]) for chunk in results}

# If no chunks match, raise or return a no-op result
if not matched_chunk_docs:
return {
"success": False,
"message": "No chunks match the given filters.",
}
document_ids = set()
owner_id = None

if "$and" in filters:
for condition in filters["$and"]:
if "owner_id" in condition and "$eq" in condition["owner_id"]:
owner_id = condition["owner_id"]["$eq"]
elif (
"document_id" in condition
and "$eq" in condition["document_id"]
):
document_ids.add(UUID(condition["document_id"]["$eq"]))
elif "document_id" in filters:
doc_id = filters["document_id"]
if isinstance(doc_id, str):
document_ids.add(UUID(doc_id))
elif isinstance(doc_id, UUID):
document_ids.add(doc_id)
elif isinstance(doc_id, dict) and "$eq" in doc_id:
value = doc_id["$eq"]
document_ids.add(
UUID(value) if isinstance(value, str) else value
)

# 4. Delete the matching chunks from the database.
# Delete matching chunks from the database
delete_results = await self.providers.database.chunks_handler.delete(
transformed_filters
)

# 5. From `delete_results`, extract the document_ids that were affected.
# The delete_results should map chunk_id to details including `document_id`.
# Extract the document_ids that were affected.
affected_doc_ids = {
UUID(info["document_id"])
for info in delete_results.values()
if info.get("document_id")
}
document_ids.update(affected_doc_ids)

# 6. For each affected document, check if the document still has any chunks left.
# Check if the document still has any chunks left
docs_to_delete = []
for doc_id in affected_doc_ids:
remaining = await self.providers.database.chunks_handler.list_document_chunks(
document_id=doc_id,
offset=0,
limit=1, # Just need to know if there's at least one left
include_vectors=False,
for doc_id in document_ids:
documents_overview_response = await self.providers.database.documents_handler.get_documents_overview(
offset=0, limit=1, filter_document_ids=[doc_id]
)
# If no remaining chunks, we should delete the document.
if remaining["total_entries"] == 0:
docs_to_delete.append(doc_id)
if not documents_overview_response["results"]:
raise R2RException(
status_code=404, message="Document not found"
)

document = documents_overview_response["results"][0]

if owner_id and str(document.owner_id) != owner_id:
raise R2RException(
status_code=404,
message="Document not found or insufficient permissions",
)
docs_to_delete.append(doc_id)

# 7. Delete documents that no longer have associated chunks.
# Also update graphs if needed (entities/relationships).
# Delete documents that no longer have associated chunks
for doc_id in docs_to_delete:
# Delete related entities & relationships if needed:
await self.providers.database.graphs_handler.entities.delete(
Expand All @@ -204,7 +215,6 @@ def transform_chunk_id_to_id(
document_id=doc_id
)

# 8. Return a summary of what happened.
return {
"success": True,
"deleted_chunks_count": len(delete_results),
Expand Down Expand Up @@ -681,10 +691,10 @@ async def summarize_collection(
),
)

collection_summary = response.choices[0].message.content
if not collection_summary:
if collection_summary := response.choices[0].message.content:
return collection_summary
else:
raise ValueError("Expected a generated response.")
return collection_summary

@telemetry_event("AddPrompt")
async def add_prompt(
Expand Down

0 comments on commit 518138d

Please sign in to comment.