Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
  • Loading branch information
emrgnt-cmplxty committed Dec 10, 2024
1 parent 05a9d5f commit c25dbb0
Show file tree
Hide file tree
Showing 6 changed files with 302 additions and 22 deletions.
51 changes: 51 additions & 0 deletions .github/actions/run-sdk-documents-tests/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Composite action that runs the SDK document integration tests one at a time.
# Every step runs from ./py and invokes tests/integration/runner_documents.py
# with the name of a single test function.
name: 'Run SDK Documents Tests'
description: 'Runs SDK documents tests for R2R'
runs:
  using: "composite"
  steps:
    - name: Create document (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_create_document

    - name: List documents (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_list_documents

    - name: Retrieve document (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_retrieve_document

    - name: Download document (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_download_document

    - name: List collections (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_list_document_collections

    - name: Extract document (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_extract_document

    - name: List entities (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_list_entities

    - name: Delete document (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_delete_document

    - name: Delete document by filters (SDK)
      working-directory: ./py
      shell: bash
      run: poetry run python tests/integration/runner_documents.py test_delete_document_by_filter

2 changes: 1 addition & 1 deletion py/core/configs/r2r_azure.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ batch_size = 256

[embedding]
provider = "litellm"
base_model = "openai/text-embedding-3-small" # continue with `openai` for embeddings, due to server rate limit on azure
base_model = "azure/text-embedding-3-small" # now uses `azure` for embeddings (previously `openai`, due to server rate limit on azure)
base_dimension = 512

[file]
Expand Down
1 change: 0 additions & 1 deletion py/core/main/api/v3/documents_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,6 @@ async def delete_document_by_filter(
filters_dict = {
"$and": [{"owner_id": {"$eq": str(auth_user.id)}}, filters]
}
print("filters_dict = ", filters_dict)
await self.services["management"].delete(filters=filters_dict)

return GenericBooleanResponse(success=True) # type: ignore
Expand Down
2 changes: 1 addition & 1 deletion py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "r2r"
readme = "README.md"
version = "3.3.7"
version = "3.3.8"

description = "SciPhi R2R"
authors = ["Owen Colegrove <owen@sciphi.ai>"]
Expand Down
19 changes: 0 additions & 19 deletions py/sdk/v3/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,25 +152,6 @@ async def download(
raise ValueError("Expected BytesIO response")
return response

# async def download(
# self,
# id: str | UUID,
# ) -> BytesIO:
# """
# Download a document's file content.

# Args:
# id (Union[str, UUID]): ID of document to download

# Returns:
# BytesIO: File content as a binary stream
# """
# return await self.client._make_request(
# "GET",
# f"documents/{str(id)}/download",
# version="v3",
# )

async def delete(
self,
id: str | UUID,
Expand Down
249 changes: 249 additions & 0 deletions py/tests/integration/runner_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
import argparse
import sys
import time

from r2r import R2RClient, R2RException


def compare_result_fields(result, expected_fields):
    """Check selected fields of ``result`` and exit with status 1 on the first mismatch.

    Args:
        result: Mapping-like object indexed by field name.
        expected_fields: Maps each field name to either a literal expected
            value or a callable predicate that receives the actual value.

    Returns:
        None when every listed field passes its check.

    Raises:
        SystemExit: With code 1 as soon as one field fails; a short report is
            printed first.
    """
    for name, expectation in expected_fields.items():
        actual = result[name]
        uses_predicate = callable(expectation)

        if uses_predicate:
            failed = not expectation(actual)
        else:
            failed = actual != expectation
        if not failed:
            continue

        print(f"Test failed: Incorrect {name}")
        if uses_predicate:
            # A predicate has no single expected value to display.
            print(f"Expected {name} to satisfy the condition")
        else:
            print(f"Expected {name}:", expectation)
        print(f"Actual {name}:", actual)
        sys.exit(1)


def test_create_document():
    """Ingest the sample Aristotle text file through the SDK.

    Calls ``client.documents.create`` with ``run_with_orchestration=False``
    and exits with status 1 when the response's ``"results"`` entry is falsy.
    Relies on the module-level ``client``.
    """
    print("Testing: Ingest sample file SDK")
    response = client.documents.create(
        file_path="core/examples/data/aristotle.txt",
        run_with_orchestration=False,
    )

    if response["results"]:
        print("Ingestion successful")
        print("~" * 100)
        return

    print("Ingestion test failed")
    sys.exit(1)


def test_list_documents():
    """Verify that the document listing contains the expected sample document.

    A listed document matches when every key of the expected record equals
    the value returned by ``doc.get(key)``. If nothing matches, each listed
    document is dumped together with the actual/expected value pairs and the
    process exits with status 1. Relies on the module-level ``client``.
    """
    listed = client.documents.list()["results"]
    expected = {
        "id": "db02076e-989a-59cd-98d5-e24e15a0bd27",
        "title": "aristotle.txt",
        "document_type": "txt",
        "ingestion_status": "success",
        "extraction_status": "pending",
        "collection_ids": ["122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"],
    }

    def _matches(candidate):
        # True when the candidate agrees with the expected record on every key.
        return all(
            candidate.get(key) == wanted for key, wanted in expected.items()
        )

    found = False
    for entry in listed:
        if _matches(entry):
            found = True
            break

    if found:
        print("Document overview test passed")
        print("~" * 100)
        return

    # No match: print what was listed next to what was expected.
    for entry in listed:
        print(entry)
        for key, wanted in expected.items():
            print(entry.get(key))
            print(wanted)
    sys.exit(1)


def test_retrieve_document():
    """Retrieve the sample document by id and confirm the returned id matches.

    Exits with status 1 when the ``"id"`` in the response's ``"results"``
    differs from the requested id. Relies on the module-level ``client``.
    """
    print("Testing: Retrieve a specific document")
    target_id = "db02076e-989a-59cd-98d5-e24e15a0bd27"
    response = client.documents.retrieve(id=target_id)
    retrieved = response["results"]

    if retrieved["id"] != target_id:
        print("Failed to retrieve the correct document.")
        sys.exit(1)

    print("Retrieve document test passed")
    print("~" * 100)


def test_download_document():
    """Download the sample document and require non-empty content.

    Calls ``client.documents.download`` for the fixed sample document id and
    exits with status 1 when no usable content comes back. Relies on the
    module-level ``client``.

    The emptiness check is made on the downloaded bytes, not on the returned
    stream object: the result is read with ``getvalue()`` (an ``io.BytesIO``
    style stream), and such a stream object is truthy even when it holds zero
    bytes, so testing the stream alone would let an empty download pass.

    Raises:
        SystemExit: With code 1 when the response is missing or empty.
    """
    print("Testing: Download document content")
    document_id = "db02076e-989a-59cd-98d5-e24e15a0bd27"
    content = client.documents.download(id=document_id)

    # Keep the guard against a missing response, then inspect the payload.
    data = content.getvalue() if content else b""
    if not data:
        print("Failed to download document content.")
        sys.exit(1)

    print("Content length:", len(data))
    # The payload could be decoded (e.g. as UTF-8) here for manual inspection.

    print("Download document test passed")
    print("~" * 100)


def test_delete_document():
    """Create a throwaway document, delete it by id, and confirm it is gone.

    The test fails (exit status 1) when the delete response does not report
    success, when the document can still be retrieved afterwards, or when the
    follow-up retrieve raises an ``R2RException`` whose status code is not
    404. Relies on the module-level ``client``.
    """
    print("Testing: Delete a specific document")

    # A fresh raw-text document gives the test something safe to delete.
    created = client.documents.create(
        raw_text="This is a temporary doc", run_with_orchestration=False
    )["results"]
    print("Created document:", created)
    temp_id = created["document_id"]

    deletion = client.documents.delete(id=temp_id)["results"]
    if not deletion["success"]:
        print("Failed to delete the document.")
        sys.exit(1)

    # A 404 from retrieve is the expected outcome once the document is gone.
    try:
        lingering = client.documents.retrieve(temp_id)
    except R2RException as err:
        if err.status_code != 404:
            print("Unexpected error after deletion:", err)
            sys.exit(1)
    else:
        print("retrieve result:", lingering)
        print("Document still exists after deletion.")
        sys.exit(1)

    print("Delete document test passed")
    print("~" * 100)


def test_delete_document_by_filter():
    """Create a document tagged with metadata, delete it via a filter, and confirm removal.

    The document is created with metadata ``{"to_delete": "yes"}`` and then
    deleted with an ``$eq`` filter on that same field. The test fails (exit
    status 1) when the delete response does not report success, when the
    document can still be retrieved, or when the follow-up retrieve raises an
    ``R2RException`` whose status code is not 404. Relies on the module-level
    ``client``.
    """
    print("Testing: Delete documents by filter")

    # Metadata marker that the filter below selects on.
    marker = {"to_delete": "yes"}
    created = client.documents.create(
        raw_text="Document to be filtered out",
        metadata=marker,
        run_with_orchestration=False,
    )["results"]
    tagged_id = created["document_id"]

    selector = {"to_delete": {"$eq": "yes"}}
    outcome = client.documents.delete_by_filter(selector)["results"]
    if not outcome["success"]:
        print("Failed to delete documents by filter.")
        sys.exit(1)

    # A 404 from retrieve is the expected outcome once the document is gone.
    try:
        client.documents.retrieve(tagged_id)
    except R2RException as err:
        if err.status_code != 404:
            print("Unexpected error after filter-based deletion:", err)
            sys.exit(1)
    else:
        print("Document still exists after filter-based deletion.")
        sys.exit(1)

    print("Delete by filter test passed")
    print("~" * 100)


def test_list_document_collections():
    """List the collections that contain the sample document.

    Only the shape is checked: the response's ``"results"`` must be a list,
    otherwise the process exits with status 1. The printed banner labels this
    call as superuser-only. Relies on the module-level ``client``.
    """
    print("Testing: List collections containing a document (superuser-only)")
    sample_id = "db02076e-989a-59cd-98d5-e24e15a0bd27"
    response = client.documents.list_collections(id=sample_id)
    found = response["results"]

    if isinstance(found, list):
        print("List document collections test passed")
        print("~" * 100)
        return

    print("Failed to list document collections.")
    sys.exit(1)


def test_extract_document():
    """Run entity/relationship extraction on the sample document.

    Calls ``client.documents.extract`` with ``run_type="run"`` and
    ``run_with_orchestration=False``, and exits with status 1 when the
    response's ``"results"`` has no ``"message"`` entry. Relies on the
    module-level ``client``.
    """
    print("Testing: Extract entities and relationships")
    sample_id = "db02076e-989a-59cd-98d5-e24e15a0bd27"

    # Only the real run is exercised; the run_type="estimate" path is not
    # checked here. NOTE(review): the original note says the run requires
    # superuser access and a document that is ready - confirm.
    response = client.documents.extract(
        id=sample_id, run_type="run", run_with_orchestration=False
    )
    outcome = response["results"]

    if "message" in outcome:
        print("Entity extraction test passed")
        print("~" * 100)
        return

    print("Failed to run entity extraction.")
    sys.exit(1)


def test_list_entities():
    """List the entities of the sample document and check the response shape.

    Only the type is validated: ``"results"`` must be a list (an empty list
    is accepted), otherwise the process exits with status 1. Relies on the
    module-level ``client``.
    """
    print("Testing: List entities for a document")
    sample_id = "db02076e-989a-59cd-98d5-e24e15a0bd27"
    response = client.documents.list_entities(id=sample_id)
    found = response["results"]

    if isinstance(found, list):
        print("List entities test passed")
        print("~" * 100)
        return

    print("Failed to list entities.")
    sys.exit(1)


def test_list_relationships():
    """List the relationships of the sample document and check the response shape.

    Only the type is validated: ``"results"`` must be a list, otherwise the
    process exits with status 1. Relies on the module-level ``client``.
    """
    print("Testing: List relationships for a document")
    sample_id = "db02076e-989a-59cd-98d5-e24e15a0bd27"
    response = client.documents.list_relationships(id=sample_id)
    found = response["results"]

    if isinstance(found, list):
        print("List relationships test passed")
        print("~" * 100)
        return

    print("Failed to list relationships.")
    sys.exit(1)


def test_search_documents():
    """Search documents for a fixed query and check that a result set is returned.

    Calls ``client.documents.search`` in ``"custom"`` mode with a limit of 5
    and exits with status 1 when the response has no ``"results"`` key. The
    content of the results is not inspected. Relies on the module-level
    ``client``.
    """
    print("Testing: Search documents")
    response = client.documents.search(
        query="Aristotle philosophy",
        search_mode="custom",
        search_settings={"limit": 5},
    )

    if "results" in response:
        print("Document search test passed")
        print("~" * 100)
        return

    print("Failed to search documents.")
    sys.exit(1)


def create_client(base_url):
    """Build an ``R2RClient`` for the server at ``base_url`` and return it."""
    sdk_client = R2RClient(base_url)
    return sdk_client


if __name__ == "__main__":
    # CLI entry point: run exactly one test function from this module, chosen
    # by name, against the R2R server at --base-url.
    parser = argparse.ArgumentParser(description="R2R SDK Integration Tests")
    parser.add_argument("test_function", help="Test function to run")
    parser.add_argument(
        "--base-url",
        default="http://localhost:7272",
        help="Base URL for the R2R client",
    )
    args = parser.parse_args()

    # Resolve the requested test up front so that a misspelled or
    # non-callable name yields a usage error (exit status 2) instead of a
    # bare KeyError/TypeError traceback.
    test_function = args.test_function
    selected_test = globals().get(test_function)
    if not callable(selected_test):
        parser.error(f"Unknown test function: {test_function!r}")

    # The test functions read ``client`` as a module-level global; assigning
    # it here at module scope makes it visible to them.
    client = create_client(args.base_url)

    selected_test()


# if __name__ == "__main__":
# if len(sys.argv) < 2:
# print("Please specify a test function to run")
# sys.exit(1)

# test_function = sys.argv[1]
# globals()[test_function]()

0 comments on commit c25dbb0

Please sign in to comment.