diff --git a/.github/workflows/build-cluster-docker.yml b/.github/workflows/build-cluster-docker.yml
new file mode 100644
index 000000000..19f8a56ec
--- /dev/null
+++ b/.github/workflows/build-cluster-docker.yml
@@ -0,0 +1,51 @@
+name: Build and Publish Graph Clustering Docker Image
+
+on:
+ workflow_dispatch:
+
+env:
+ REGISTRY_BASE: ragtoriches
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout Repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.12'
+
+ - name: Install toml package
+ run: pip install toml
+
+ - name: Set image name
+ id: version
+ run: |
+ echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/cluster-prod" >> $GITHUB_OUTPUT
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Docker Auth
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
+ password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}
+
+ - name: Build and push image
+ uses: docker/build-push-action@v5
+ with:
+ context: ./services/clustering
+ file: ./services/clustering/Dockerfile.clustering
+ platforms: linux/amd64,linux/arm64
+ push: true
+ tags: ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
+ provenance: false
+ sbom: false
+
+ - name: Verify manifest
+ run: |
+ docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
diff --git a/py/compose.full.yaml b/py/compose.full.yaml
index be496f0a0..ab78561e5 100644
--- a/py/compose.full.yaml
+++ b/py/compose.full.yaml
@@ -260,8 +260,6 @@ services:
unstructured:
image: ${UNSTRUCTURED_IMAGE:-ragtoriches/unst-prod}
- ports:
- - "${R2R_UNSTRUCTURED_PORT:-7275}:7275"
networks:
- r2r-network
healthcheck:
@@ -270,6 +268,18 @@ services:
timeout: 5s
retries: 5
+ graph_clustering:
+ image: ${GRAPH_CLUSTERING_IMAGE:-ragtoriches/clustering-prod}
+ ports:
+ - "${R2R_GRAPH_CLUSTERING_PORT:-7276}:7276"
+ networks:
+ - r2r-network
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:7276/health"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+
r2r:
image: ${R2R_IMAGE:-ragtoriches/prod:latest}
build:
@@ -342,13 +352,17 @@ services:
# Unstructured
- UNSTRUCTURED_API_KEY=${UNSTRUCTURED_API_KEY:-}
- UNSTRUCTURED_API_URL=${UNSTRUCTURED_API_URL:-https://api.unstructured.io/general/v0/general}
- - UNSTRUCTURED_LOCAL_URL=${UNSTRUCTURED_LOCAL_URL:-http://unstructured:7275}
+ - UNSTRUCTURED_SERVICE_URL=${UNSTRUCTURED_SERVICE_URL:-http://unstructured:7275}
- UNSTRUCTURED_NUM_WORKERS=${UNSTRUCTURED_NUM_WORKERS:-10}
# Hatchet
- HATCHET_CLIENT_TLS_STRATEGY=none
- HATCHET_CLIENT_GRPC_MAX_RECV_MESSAGE_LENGTH=${HATCHET_CLIENT_GRPC_MAX_RECV_MESSAGE_LENGTH:-134217728}
- HATCHET_CLIENT_GRPC_MAX_SEND_MESSAGE_LENGTH=${HATCHET_CLIENT_GRPC_MAX_SEND_MESSAGE_LENGTH:-134217728}
+
+ # Graspologic clustering service
+ - CLUSTERING_SERVICE_URL=http://graph_clustering:7276
+
command: >
sh -c '
if [ -z "$${HATCHET_CLIENT_TOKEN}" ]; then
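Note: the unstructured service is no longer published on a host port; it and the new graph_clustering service are reachable only inside r2r-network, and r2r addresses them by service name (http://unstructured:7275, http://graph_clustering:7276). As a hedged sketch, this is how another container on the same network could probe the clustering service's health endpoint, assuming the /health route in the healthcheck above returns HTTP 200; the helper name is illustrative and the retry values mirror the compose healthcheck:

    import asyncio

    import httpx

    async def wait_for_clustering_service(
        url: str = "http://graph_clustering:7276/health",
        retries: int = 5,
        interval: float = 10.0,
    ) -> bool:
        # Mirrors the compose healthcheck: up to 5 attempts, 10s apart, 5s timeout.
        async with httpx.AsyncClient(timeout=5.0) as client:
            for _ in range(retries):
                try:
                    if (await client.get(url)).status_code == 200:
                        return True
                except httpx.HTTPError:
                    pass
                await asyncio.sleep(interval)
        return False

    # asyncio.run(wait_for_clustering_service())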
diff --git a/py/core/configs/full.toml b/py/core/configs/full.toml
index 83fb1b2c6..9b139e499 100644
--- a/py/core/configs/full.toml
+++ b/py/core/configs/full.toml
@@ -2,6 +2,10 @@
provider = "litellm"
concurrent_request_limit = 128
+[database]
+ [database.graph_creation_settings]
+ clustering_mode = "remote"
+
[ingestion]
provider = "unstructured_local"
strategy = "auto"
diff --git a/py/core/configs/full_azure.toml b/py/core/configs/full_azure.toml
index 56cb81fab..76fc7cd24 100644
--- a/py/core/configs/full_azure.toml
+++ b/py/core/configs/full_azure.toml
@@ -10,10 +10,9 @@ concurrent_request_limit = 128
[agent.generation_config]
model = "azure/gpt-4o"
-# KG settings
-batch_size = 256
-
+[database]
[database.graph_creation_settings]
+ clustering_mode = "remote"
generation_config = { model = "azure/gpt-4o-mini" }
[database.graph_entity_deduplication_settings]
diff --git a/py/core/configs/full_local_llm.toml b/py/core/configs/full_local_llm.toml
index b5329beb6..b77fc9a87 100644
--- a/py/core/configs/full_local_llm.toml
+++ b/py/core/configs/full_local_llm.toml
@@ -22,6 +22,7 @@ concurrent_request_limit = 1
provider = "postgres"
[database.graph_creation_settings]
+ clustering_mode = "remote"
graph_entity_description_prompt = "graphrag_entity_description"
entity_types = [] # if empty, all entities are extracted
relation_types = [] # if empty, all relations are extracted
diff --git a/py/core/main/services/kg_service.py b/py/core/main/services/kg_service.py
index a141ad63c..8af1979ba 100644
--- a/py/core/main/services/kg_service.py
+++ b/py/core/main/services/kg_service.py
@@ -581,6 +581,7 @@ async def kg_clustering(
"generation_config": generation_config,
"leiden_params": leiden_params,
"logger": logger,
+ "clustering_mode": self.config.database.graph_creation_settings.clustering_mode,
}
),
state=None,
diff --git a/py/core/pipes/kg/clustering.py b/py/core/pipes/kg/clustering.py
index 8f392fe07..7c5842cd2 100644
--- a/py/core/pipes/kg/clustering.py
+++ b/py/core/pipes/kg/clustering.py
@@ -44,6 +44,7 @@ async def cluster_kg(
self,
collection_id: UUID,
leiden_params: dict,
+ clustering_mode: str,
):
"""
- Clusters the knowledge graph relationships into communities using hierarchical Leiden algorithm. Uses graspologic library.
+ Clusters the knowledge graph relationships into communities using the hierarchical Leiden algorithm, either locally via the graspologic library or via the remote clustering service, depending on clustering_mode.
@@ -52,6 +53,7 @@ async def cluster_kg(
num_communities = await self.database_provider.graph_handler.perform_graph_clustering(
collection_id=collection_id,
leiden_params=leiden_params,
+ clustering_mode=clustering_mode,
)
return {
@@ -72,8 +74,10 @@ async def _run_logic( # type: ignore
collection_id = input.message.get("collection_id", None)
leiden_params = input.message["leiden_params"]
+ clustering_mode = input.message["clustering_mode"]
yield await self.cluster_kg(
collection_id=collection_id,
leiden_params=leiden_params,
+ clustering_mode=clustering_mode,
)
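Note that _run_logic indexes input.message["clustering_mode"] directly, so every producer of this message must now include the key (the kg_service.py change above does). A sketch of the payload shape, with illustrative leiden_params values:

    from uuid import uuid4

    # Illustrative message for the KG clustering pipe; keys mirror the
    # accesses in _run_logic above. "clustering_mode" is mandatory:
    # input.message["clustering_mode"] raises KeyError if it is missing.
    message = {
        "collection_id": uuid4(),
        "leiden_params": {"max_cluster_size": 1000, "resolution": 1.0},
        "clustering_mode": "remote",  # or "local"
    }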
diff --git a/py/core/pipes/kg/community_summary.py b/py/core/pipes/kg/community_summary.py
index 70780eed0..55cf9c061 100644
--- a/py/core/pipes/kg/community_summary.py
+++ b/py/core/pipes/kg/community_summary.py
@@ -261,6 +261,7 @@ async def _run_logic( # type: ignore
generation_config = input.message["generation_config"]
max_summary_input_length = input.message["max_summary_input_length"]
collection_id = input.message.get("collection_id", None)
+ clustering_mode = input.message.get("clustering_mode", None)
community_summary_jobs = []
logger = input.message.get("logger", logging.getLogger())
@@ -295,16 +296,23 @@ async def _run_logic( # type: ignore
relationship_ids_cache={},
leiden_params=leiden_params,
collection_id=collection_id,
+ clustering_mode=clustering_mode,
)
)
# Organize clusters
- clusters: dict[Any] = {}
+ clusters: dict[Any, list[Any]] = {}
for item in community_clusters:
- cluster_id = item.cluster
+ cluster_id = (
+ item["cluster"]
+ if clustering_mode == "remote"
+ else item.cluster
+ )
if cluster_id not in clusters:
clusters[cluster_id] = []
- clusters[cluster_id].append(item.node)
+ clusters[cluster_id].append(
+ item["node"] if clustering_mode == "remote" else item.node
+ )
# Now, process the clusters
for _, nodes in clusters.items():
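The remote service returns plain JSON dicts while local graspologic results expose attributes, hence the branching above. A hedged sketch of factoring that shape difference into a single accessor; the helper name is an assumption, not part of this PR:

    from typing import Any

    def community_item_fields(item: Any, clustering_mode: str) -> tuple[Any, Any]:
        # Remote results arrive as JSON dicts ({"node": ..., "cluster": ...});
        # local graspologic results expose .node / .cluster attributes.
        if clustering_mode == "remote":
            return item["cluster"], item["node"]
        return item.cluster, item.node

    # cluster_id, node = community_item_fields(item, clustering_mode)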
diff --git a/py/core/providers/database/graph.py b/py/core/providers/database/graph.py
index 67aef893a..3cf51c31f 100644
--- a/py/core/providers/database/graph.py
+++ b/py/core/providers/database/graph.py
@@ -2,12 +2,14 @@
import datetime
import json
import logging
+import os
import time
from enum import Enum
from typing import Any, AsyncGenerator, Optional, Tuple, Union
from uuid import UUID
import asyncpg
+import httpx
from asyncpg.exceptions import UndefinedTableError, UniqueViolationError
from fastapi import HTTPException
@@ -2128,26 +2130,14 @@ async def perform_graph_clustering(
self,
collection_id: UUID,
leiden_params: dict[str, Any],
+ clustering_mode: str,
) -> Tuple[int, Any]:
"""
- Leiden clustering algorithm to cluster the knowledge graph relationships into communities.
-
- Available parameters and defaults:
- max_cluster_size: int = 1000,
- starting_communities: Optional[dict[str, int]] = None,
- extra_forced_iterations: int = 0,
- resolution: int | float = 1.0,
- randomness: int | float = 0.001,
- use_modularity: bool = True,
- random_seed: Optional[int] = None,
- weight_attribute: str = "weight",
- is_weighted: Optional[bool] = None,
- weight_default: int| float = 1.0,
- check_directed: bool = True,
+ Runs Leiden clustering over the collection's relationships, either locally or via the external clustering service, depending on clustering_mode.
"""
offset = 0
- page_size = 1000 # Increased batch size for efficiency
+ page_size = 1000
all_relationships = []
while True:
relationships, count = await self.relationships.get(
@@ -2167,19 +2157,241 @@ async def perform_graph_clustering(
break
relationship_ids_cache = await self._get_relationship_ids_cache(
- relationships
+ all_relationships
)
logger.info(
f"Clustering over {len(all_relationships)} relationships for {collection_id} with settings: {leiden_params}"
)
+
return await self._cluster_and_add_community_info(
- relationships=relationships,
+ relationships=all_relationships,
relationship_ids_cache=relationship_ids_cache,
leiden_params=leiden_params,
collection_id=collection_id,
+ clustering_mode=clustering_mode,
+ )
+
+ async def _call_clustering_service(
+ self, relationships: list[Relationship], leiden_params: dict[str, Any]
+ ) -> list[dict]:
+ """
+ Calls the external Graspologic clustering service, sending relationships and parameters.
+ Expects a response with 'communities' field.
+ """
+ # Convert relationships to a JSON-friendly format
+ rel_data = []
+ for r in relationships:
+ rel_data.append(
+ {
+ "id": str(r.id),
+ "subject": r.subject,
+ "object": r.object,
+ "weight": r.weight if r.weight is not None else 1.0,
+ }
+ )
+
+ endpoint = os.environ.get("CLUSTERING_SERVICE_URL")
+ if not endpoint:
+ raise ValueError("CLUSTERING_SERVICE_URL not set.")
+
+ url = f"{endpoint}/cluster"
+
+ payload = {"relationships": rel_data, "leiden_params": leiden_params}
+
+ async with httpx.AsyncClient() as client:
+ response = await client.post(url, json=payload, timeout=3600)
+ response.raise_for_status()
+
+ data = response.json()
+ communities = data.get("communities", [])
+ return communities
+
+ async def _create_graph_and_cluster(
+ self,
+ relationships: list[Relationship],
+ leiden_params: dict[str, Any],
+ clustering_mode: str = "remote",
+ ) -> Any:
+ """
+ Create a graph and cluster it. If clustering_mode='local', use hierarchical_leiden locally.
+ If clustering_mode='remote', call the external service.
+ """
+
+ if clustering_mode == "remote":
+ logger.info("Sending request to external clustering service...")
+ communities = await self._call_clustering_service(
+ relationships, leiden_params
+ )
+ logger.info("Received communities from clustering service.")
+ return communities
+ else:
+ # Local mode: run hierarchical_leiden directly
+ G = self.nx.Graph()
+ for relationship in relationships:
+ G.add_edge(
+ relationship.subject,
+ relationship.object,
+ weight=relationship.weight,
+ id=relationship.id,
+ )
+
+ logger.info(
+ f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges"
+ )
+ return await self._compute_leiden_communities(G, leiden_params)
+
+ async def _cluster_and_add_community_info(
+ self,
+ relationships: list[Relationship],
+ relationship_ids_cache: dict[str, list[int]],
+ leiden_params: dict[str, Any],
+ collection_id: Optional[UUID] = None,
+ clustering_mode: str = "local",
+ ) -> Tuple[int, Any]:
+
+ # clear if there is any old information
+ conditions = []
+ if collection_id is not None:
+ conditions.append("collection_id = $1")
+
+ await asyncio.sleep(0.1)
+
+ start_time = time.time()
+
+ logger.info(f"Creating graph and clustering for {collection_id}")
+
+ hierarchical_communities = await self._create_graph_and_cluster(
+ relationships=relationships,
+ leiden_params=leiden_params,
+ clustering_mode=clustering_mode,
+ )
+
+ logger.info(
+ f"Computing Leiden communities completed, time {time.time() - start_time:.2f} seconds."
+ )
+
+ logger.info(
+ f"Cached {len(relationship_ids_cache)} relationship ids, time {time.time() - start_time:.2f} seconds."
+ )
+
+ # If remote: hierarchical_communities is a list of dicts like:
+ # [{"node": str, "cluster": int, "level": int}, ...]
+ # If local: hierarchical_communities is the returned structure from hierarchical_leiden (list of named tuples)
+
+ if clustering_mode == "remote":
+ if not hierarchical_communities:
+ num_communities = 0
+ else:
+ num_communities = (
+ max(item["cluster"] for item in hierarchical_communities)
+ + 1
+ )
+ else:
+ # Local mode: hierarchical_leiden returns a list of
+ # HierarchicalCluster entries exposing a .cluster attribute.
+ if not hierarchical_communities:
+ num_communities = 0
+ else:
+ num_communities = (
+ max(item.cluster for item in hierarchical_communities) + 1
+ )
+
+ logger.info(
+ f"Generated {num_communities} communities, time {time.time() - start_time:.2f} seconds."
)
+ return num_communities, hierarchical_communities
+
+ async def _get_relationship_ids_cache(
+ self, relationships: list[Relationship]
+ ) -> dict[str, list[int]]:
+ relationship_ids_cache: dict[str, list[int]] = {}
+ for relationship in relationships:
+ if relationship.subject is not None:
+ relationship_ids_cache.setdefault(relationship.subject, [])
+ if relationship.id is not None:
+ relationship_ids_cache[relationship.subject].append(
+ relationship.id
+ )
+ if relationship.object is not None:
+ relationship_ids_cache.setdefault(relationship.object, [])
+ if relationship.id is not None:
+ relationship_ids_cache[relationship.object].append(
+ relationship.id
+ )
+
+ return relationship_ids_cache
+
async def get_entity_map(
self, offset: int, limit: int, document_id: UUID
) -> dict[str, dict[str, list[dict[str, Any]]]]:
@@ -2430,98 +2642,22 @@ def parse_filter(fd: dict) -> str:
return parse_filter(filter_dict)
- async def _create_graph_and_cluster(
- self, relationships: list[Relationship], leiden_params: dict[str, Any]
- ) -> Any:
-
- G = self.nx.Graph()
- for relationship in relationships:
- G.add_edge(
- relationship.subject,
- relationship.object,
- weight=relationship.weight,
- id=relationship.id,
- )
-
- logger.info(f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges")
-
- return await self._compute_leiden_communities(G, leiden_params)
-
- async def _cluster_and_add_community_info(
- self,
- relationships: list[Relationship],
- relationship_ids_cache: dict[str, list[int]],
- leiden_params: dict[str, Any],
- collection_id: Optional[UUID] = None,
- ) -> Tuple[int, Any]:
-
- # clear if there is any old information
- conditions = []
- if collection_id is not None:
- conditions.append("collection_id = $1")
-
- await asyncio.sleep(0.1)
-
- start_time = time.time()
-
- logger.info(f"Creating graph and clustering for {collection_id}")
-
- hierarchical_communities = await self._create_graph_and_cluster(
- relationships=relationships,
- leiden_params=leiden_params,
- )
-
- logger.info(
- f"Computing Leiden communities completed, time {time.time() - start_time:.2f} seconds."
- )
- def relationship_ids(node: str) -> list[int]:
- return relationship_ids_cache.get(node, [])
- logger.info(
- f"Cached {len(relationship_ids_cache)} relationship ids, time {time.time() - start_time:.2f} seconds."
- )
-
- num_communities = (
- max(item.cluster for item in hierarchical_communities) + 1
- )
-
- logger.info(
- f"Generated {num_communities} communities, time {time.time() - start_time:.2f} seconds."
- )
-
- return num_communities, hierarchical_communities
-
- async def _get_relationship_ids_cache(
- self, relationships: list[Relationship]
- ) -> dict[str, list[int]]:
-
- # caching the relationship ids
- relationship_ids_cache = dict[str, list[int | UUID]]()
- for relationship in relationships:
- if (
- relationship.subject not in relationship_ids_cache
- and relationship.subject is not None
- ):
- relationship_ids_cache[relationship.subject] = []
- if (
- relationship.object not in relationship_ids_cache
- and relationship.object is not None
- ):
- relationship_ids_cache[relationship.object] = []
- if (
- relationship.subject is not None
- and relationship.id is not None
- ):
- relationship_ids_cache[relationship.subject].append(
- relationship.id
- )
- if relationship.object is not None and relationship.id is not None:
- relationship_ids_cache[relationship.object].append(
- relationship.id
- )
+ # logger.info(f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges")
- return relationship_ids_cache # type: ignore
async def _compute_leiden_communities(
self,
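The clustering service itself (built from ./services/clustering in the workflow above) is not part of this diff. A minimal, hypothetical sketch of a compatible /cluster endpoint, assuming FastAPI plus graspologic's hierarchical_leiden and the request/response shapes used by _call_clustering_service (relationships with id/subject/object/weight in; a "communities" list of {node, cluster, level} dicts out):

    # Hypothetical sketch of the clustering service; not the actual code
    # behind ragtoriches/clustering-prod.
    import networkx as nx
    from fastapi import FastAPI
    from graspologic.partition import hierarchical_leiden
    from pydantic import BaseModel, Field

    app = FastAPI()

    class RelationshipIn(BaseModel):
        id: str
        subject: str
        object: str
        weight: float = 1.0

    class ClusterRequest(BaseModel):
        relationships: list[RelationshipIn]
        leiden_params: dict = Field(default_factory=dict)

    @app.get("/health")
    def health() -> dict:
        return {"status": "ok"}

    @app.post("/cluster")
    def cluster(request: ClusterRequest) -> dict:
        # Rebuild the graph exactly as the local branch of
        # _create_graph_and_cluster does, then run Leiden here.
        graph = nx.Graph()
        for rel in request.relationships:
            graph.add_edge(rel.subject, rel.object, weight=rel.weight, id=rel.id)
        partitions = hierarchical_leiden(graph, **request.leiden_params)
        return {
            "communities": [
                {"node": p.node, "cluster": p.cluster, "level": p.level}
                for p in partitions
            ]
        }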
diff --git a/py/core/providers/ingestion/unstructured/base.py b/py/core/providers/ingestion/unstructured/base.py
index b4dcb6abc..f6a661186 100644
--- a/py/core/providers/ingestion/unstructured/base.py
+++ b/py/core/providers/ingestion/unstructured/base.py
@@ -147,11 +147,11 @@ def __init__(
else:
try:
self.local_unstructured_url = os.environ[
- "UNSTRUCTURED_LOCAL_URL"
+ "UNSTRUCTURED_SERVICE_URL"
]
except KeyError as e:
raise ValueError(
- "UNSTRUCTURED_LOCAL_URL environment variable is not set"
+ "UNSTRUCTURED_SERVICE_URL environment variable is not set"
) from e
self.client = httpx.AsyncClient()
diff --git a/py/core/providers/orchestration/hatchet.py b/py/core/providers/orchestration/hatchet.py
index 13cca0a89..e894c1b28 100644
--- a/py/core/providers/orchestration/hatchet.py
+++ b/py/core/providers/orchestration/hatchet.py
@@ -1,5 +1,6 @@
import asyncio
import logging
+import threading
from typing import Any, Callable, Optional
from core.base import OrchestrationConfig, OrchestrationProvider, Workflow
@@ -55,6 +56,23 @@ async def start_worker(self):
asyncio.create_task(self.worker.async_start())
+ # # Instead of using asyncio.create_task, run the worker in a separate thread
+ # def start_worker(self):
+ # if not self.worker:
+ # raise ValueError(
+ # "Worker not initialized. Call get_worker() first."
+ # )
+
+ # def run_worker():
+ # # Create a new event loop for this thread
+ # loop = asyncio.new_event_loop()
+ # asyncio.set_event_loop(loop)
+ # loop.run_until_complete(self.worker.async_start())
+ # loop.run_forever() # If needed, or just run_until_complete for one task
+
+ # thread = threading.Thread(target=run_worker, daemon=True)
+ # thread.start()
+
async def run_workflow(
self,
workflow_name: str,
diff --git a/py/r2r.toml b/py/r2r.toml
index 11693d300..1c5a1a44d 100644
--- a/py/r2r.toml
+++ b/py/r2r.toml
@@ -45,6 +45,7 @@ default_collection_description = "Your default collection."
batch_size = 256
[database.graph_creation_settings]
+ clustering_mode = "local"
graph_entity_description_prompt = "graphrag_entity_description"
entity_types = [] # if empty, all entities are extracted
relation_types = [] # if empty, all relations are extracted
diff --git a/py/shared/abstractions/kg.py b/py/shared/abstractions/kg.py
index adfb46c57..2e4be0c08 100644
--- a/py/shared/abstractions/kg.py
+++ b/py/shared/abstractions/kg.py
@@ -33,6 +33,11 @@ def __str__(self):
class KGCreationSettings(R2RSerializable):
"""Settings for knowledge graph creation."""
+ clustering_mode: str = Field(
+ default="local",
+ description="Whether to use remote clustering for graph creation.",
+ )
+
graphrag_relationships_extraction_few_shot: str = Field(
default="graphrag_relationships_extraction_few_shot",
description="The prompt to use for knowledge graph extraction.",
diff --git a/py/tests/conftest.py b/py/tests/conftest.py
deleted file mode 100644
index 5c66ce9f6..000000000
--- a/py/tests/conftest.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# tests//conftest.py
-import os
-import random
-import uuid
-from uuid import UUID
-
-import pytest
-
-from core import (
- AppConfig,
- AuthConfig,
- BCryptConfig,
- CompletionConfig,
- DatabaseConfig,
- EmailConfig,
- EmbeddingConfig,
- PersistentLoggingConfig,
- SqlitePersistentLoggingProvider,
- Vector,
- VectorEntry,
-)
-from core.base import (
- DocumentResponse,
- DocumentType,
- IngestionConfig,
- IngestionStatus,
- KGExtractionStatus,
- OrchestrationConfig,
- VectorQuantizationType,
-)
-from core.providers import (
- BCryptProvider,
- ConsoleMockEmailProvider,
- LiteLLMCompletionProvider,
- LiteLLMEmbeddingProvider,
- PostgresDBProvider,
- R2RAuthProvider,
-)
-from core.providers.ingestion import R2RIngestionConfig, R2RIngestionProvider
-from core.providers.orchestration import SimpleOrchestrationProvider
-
-
-# Vectors
-@pytest.fixture(scope="function")
-def dimension():
- return 128
-
-
-@pytest.fixture(scope="function")
-def num_entries():
- return 100
-
-
-@pytest.fixture(scope="function")
-def sample_entries(dimension, num_entries):
- def generate_random_vector_entry(
- id_value: int, dimension: int
- ) -> VectorEntry:
- vector_data = [random.random() for _ in range(dimension)]
- metadata = {"key": f"value_id_{id_value}", "raw_key": id_value}
- return VectorEntry(
- chunk_id=uuid.uuid4(),
- document_id=uuid.uuid4(),
- user_id=uuid.uuid4(),
- collection_ids=[uuid.uuid4()],
- vector=Vector(data=vector_data),
- text=f"Sample text for id_{id_value}",
- metadata=metadata,
- )
-
- return [
- generate_random_vector_entry(i, dimension) for i in range(num_entries)
- ]
-
-
-@pytest.fixture(scope="function")
-def project_name():
- collection_id = uuid.uuid4()
-
- return f"test_collection_{collection_id.hex}"
-
-
-@pytest.fixture(scope="function")
-def app_config(project_name):
-
- return AppConfig(project_name=project_name)
-
-
-# Crypto
-@pytest.fixture(scope="function")
-def crypto_config(app_config):
- return BCryptConfig(app=app_config)
-
-
-@pytest.fixture(scope="function")
-def crypto_provider(crypto_config, app_config):
- return BCryptProvider(crypto_config)
-
-
-# Postgres
-@pytest.fixture(scope="function")
-def db_config(app_config):
- return DatabaseConfig.create(provider="postgres", app=app_config)
-
-
-@pytest.fixture(scope="function")
-async def postgres_db_provider(
- db_config, dimension, crypto_provider, sample_entries, app_config
-):
- db = PostgresDBProvider(
- db_config, dimension=dimension, crypto_provider=crypto_provider
- )
- # await db.create_tables(embedding_dimension, vector_quantization_type)
- await db.initialize()
- await db.upsert_entries(sample_entries)
-
- # upsert into documents_overview
- document_info = DocumentResponse(
- id=UUID("9fbe403b-c11c-5aae-8ade-ef22980c3ad1"),
- collection_ids=[UUID("122fdf6a-e116-546b-a8f6-e4cb2e2c0a09")],
- owner_id=UUID("00000000-0000-0000-0000-000000000003"),
- document_type=DocumentType.PDF,
- metadata={},
- title="Test Document for KG",
- version="1.0",
- size_in_bytes=1024,
- ingestion_status=IngestionStatus.PENDING,
- extraction_status=KGExtractionStatus.PENDING,
- )
- await db.upsert_documents_overview(document_info)
- yield db
- # Teardown
- # TODO - Add teardown methods
- # await db.delete_project(db.project_name)
-
-
-@pytest.fixture(scope="function")
-def db_config_temporary(project_name, app_config):
- return DatabaseConfig.create(
- provider="postgres", project_name=project_name, app=app_config
- )
-
-
-@pytest.fixture(scope="function")
-async def temporary_postgres_db_provider(
- db_config_temporary, dimension, crypto_provider, sample_entries
-):
- db = PostgresDBProvider(
- db_config_temporary,
- dimension=dimension,
- crypto_provider=crypto_provider,
- )
- await db.initialize()
- await db.upsert_entries(sample_entries)
- try:
- yield db
- finally:
- await db.close()
- # db.vector.close()
-
-
-# Auth
-@pytest.fixture(scope="function")
-def auth_config(app_config):
- return AuthConfig(
- secret_key="test_secret_key",
- access_token_lifetime_in_minutes=15,
- refresh_token_lifetime_in_days=1,
- require_email_verification=False,
- app=app_config,
- )
-
-
-@pytest.fixture(scope="function")
-def email_provider(app_config):
- return ConsoleMockEmailProvider(
- EmailConfig(provider="console_mock", app=app_config)
- )
-
-
-@pytest.fixture(scope="function")
-async def r2r_auth_provider(
- auth_config,
- crypto_provider,
- temporary_postgres_db_provider,
- email_provider,
-):
- auth_provider = R2RAuthProvider(
- auth_config,
- crypto_provider,
- temporary_postgres_db_provider,
- email_provider,
- )
- await auth_provider.initialize()
- yield auth_provider
-
-
-# Embeddings
-@pytest.fixture
-def litellm_provider(app_config):
- config = EmbeddingConfig(
- provider="litellm",
- base_model="text-embedding-3-small",
- base_dimension=1536,
- app=app_config,
- )
- return LiteLLMEmbeddingProvider(config)
-
-
-# Embeddings
-@pytest.fixture
-def litellm_provider_128(app_config):
- config = EmbeddingConfig(
- provider="litellm",
- base_model="text-embedding-3-small",
- base_dimension=128,
- app=app_config,
- )
- return LiteLLMEmbeddingProvider(config)
-
-
-# LLM provider
-@pytest.fixture
-def litellm_completion_provider(app_config):
- config = CompletionConfig(provider="litellm", app=app_config)
- return LiteLLMCompletionProvider(config)
-
-
-# Logging
-@pytest.fixture(scope="function")
-async def local_logging_provider(app_config):
- unique_id = str(uuid.uuid4())
- logging_path = f"test_{unique_id}.sqlite"
- provider = SqlitePersistentLoggingProvider(
- PersistentLoggingConfig(logging_path=logging_path, app=app_config)
- )
- await provider.initialize()
- yield provider
- await provider.close()
- if os.path.exists(logging_path):
- os.remove(logging_path)
-
-
-@pytest.fixture(scope="function")
-def embedding_dimension():
- return 128
-
-
-@pytest.fixture(scope="function")
-def vector_quantization_type():
- return VectorQuantizationType.FP32
-
-
-@pytest.fixture(scope="function")
-def orchestration_config(app_config):
- return OrchestrationConfig(provider="simple", app=app_config)
-
-
-@pytest.fixture
-def orchestration_provider(orchestration_config):
- return SimpleOrchestrationProvider(orchestration_config)
-
-
-@pytest.fixture
-def ingestion_config(app_config):
- return IngestionConfig(
- app=app_config # , chunk_enrichment_settings=enrichment_settings
- )
-
-
-@pytest.fixture
-def r2r_ingestion_provider(app_config):
- return R2RIngestionProvider(R2RIngestionConfig(app=app_config))
-
-
-# @pytest.fixture(scope="function")
-# async def postgres_logging_provider(app_config):
-# unique_id = str(uuid.uuid4())
-# logging_path = f"test_{unique_id}.sqlite"
-# provider = PostgresPersistentLoggingProvider(
-# PersistentLoggingConfig(logging_path=logging_path, app=app_config)
-# )
-# await provider.initialize()
-# yield provider
-# await provider.close()
-# if os.path.exists(logging_path):
-# os.remove(logging_path)
diff --git a/py/tests/core/agent/__init__.py b/py/tests/core/agent/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/core/pipelines/__init__.py b/py/tests/core/pipelines/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/core/pipelines/test_pipeline_logic.py b/py/tests/core/pipelines/test_pipeline_logic.py
deleted file mode 100644
index 47ea7aed1..000000000
--- a/py/tests/core/pipelines/test_pipeline_logic.py
+++ /dev/null
@@ -1,280 +0,0 @@
-import asyncio
-from typing import Any, AsyncGenerator
-
-import pytest
-
-from core import AsyncPipe, AsyncPipeline
-
-
-class MultiplierPipe(AsyncPipe):
- def __init__(self, multiplier=1, delay=0, name="multiplier_pipe"):
- super().__init__(config=self.PipeConfig(name=name), logging_provider=1)
- self.multiplier = multiplier
- self.delay = delay
-
- async def _run_logic(
- self,
- input: AsyncGenerator[Any, None],
- state,
- run_id=None,
- *args,
- **kwargs,
- ) -> AsyncGenerator[Any, None]:
- async for item in input.message:
- if self.delay > 0:
- await asyncio.sleep(self.delay) # Simulate processing delay
- if isinstance(item, list):
- processed = [x * self.multiplier for x in item]
- elif isinstance(item, int):
- processed = item * self.multiplier
- else:
- raise ValueError(f"Unsupported type: {type(item)}")
- yield processed
-
-
-class FanOutPipe(AsyncPipe):
- def __init__(self, multiplier=1, delay=0, name="fan_out_pipe"):
- super().__init__(config=self.PipeConfig(name=name), logging_provider=1)
- self.multiplier = multiplier
- self.delay = delay
-
- async def _run_logic(
- self,
- input: AsyncGenerator[Any, None],
- state,
- run_id=None,
- *args,
- **kwargs,
- ) -> AsyncGenerator[Any, None]:
- inputs = []
- async for item in input.message:
- inputs.append(item)
- for it in range(self.multiplier):
- if self.delay > 0:
- await asyncio.sleep(self.delay)
- yield [(it + 1) * ele for ele in inputs]
-
-
-class FanInPipe(AsyncPipe):
- def __init__(self, delay=0, name="fan_in_pipe"):
- super().__init__(config=self.PipeConfig(name=name), logging_provider=1)
- self.delay = delay
-
- async def _run_logic(
- self,
- input: AsyncGenerator[Any, None],
- state,
- run_id=None,
- *args,
- **kwargs,
- ) -> AsyncGenerator[Any, None]:
- total_sum = 0
- async for batch in input.message:
- if self.delay > 0:
- await asyncio.sleep(self.delay) # Simulate processing delay
- total_sum += sum(
- batch
- ) # Assuming batch is iterable and contains numeric values
- yield total_sum
-
-
-@pytest.fixture
-def pipe_factory():
- def create_pipe(type, **kwargs):
- if type == "multiplier":
- return MultiplierPipe(**kwargs)
- elif type == "fan_out":
- return FanOutPipe(**kwargs)
- elif type == "fan_in":
- return FanInPipe(**kwargs)
- else:
- raise ValueError("Unsupported pipe type")
-
- return create_pipe
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("multiplier, delay, name", [(2, 0.1, "pipe")])
-async def test_single_multiplier(pipe_factory, multiplier, delay, name):
- pipe = pipe_factory(
- "multiplier", multiplier=multiplier, delay=delay, name=name
- )
-
- async def input_generator():
- for i in [1, 2, 3]:
- yield i
-
- pipeline = AsyncPipeline(logging_provider=1)
- pipeline.add_pipe(pipe)
-
- result = []
- for output in await pipeline.run(input_generator()):
- result.append(output)
-
- expected_result = [i * multiplier for i in [1, 2, 3]]
- assert (
- result == expected_result
- ), "Pipeline output did not match expected multipliers"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "multiplier_a, delay_a, name_a, multiplier_b, delay_b, name_b",
- [(2, 0.1, "pipe_a", 2, 0.1, "pipe_b")],
-)
-async def test_double_multiplier(
- pipe_factory, multiplier_a, delay_a, name_a, multiplier_b, delay_b, name_b
-):
- pipe_a = pipe_factory(
- "multiplier", multiplier=multiplier_a, delay=delay_a, name=name_a
- )
- pipe_b = pipe_factory(
- "multiplier", multiplier=multiplier_b, delay=delay_b, name=name_b
- )
-
- async def input_generator():
- for i in [1, 2, 3]:
- yield i
-
- pipeline = AsyncPipeline(logging_provider=1)
- pipeline.add_pipe(pipe_a)
- pipeline.add_pipe(pipe_b)
-
- result = []
- for output in await pipeline.run(input_generator()):
- result.append(output)
-
- expected_result = [i * multiplier_a * multiplier_b for i in [1, 2, 3]]
- assert (
- result == expected_result
- ), "Pipeline output did not match expected multipliers"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("multiplier, delay, name", [(3, 0.1, "pipe")])
-async def test_fan_out(pipe_factory, multiplier, delay, name):
- pipe = pipe_factory(
- "fan_out", multiplier=multiplier, delay=delay, name=name
- )
-
- async def input_generator():
- for i in [1, 2, 3]:
- yield i
-
- pipeline = AsyncPipeline(logging_provider=1)
- pipeline.add_pipe(pipe)
-
- result = []
- for output in await pipeline.run(input_generator()):
- result.append(output)
-
- expected_result = [
- [i + 1, 2 * (i + 1), 3 * (i + 1)] for i in range(multiplier)
- ]
- assert (
- result == expected_result
- ), "Pipeline output did not match expected multipliers"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "multiplier_a, delay_a, name_a, multiplier_b, delay_b, name_b",
- [
- (2, 0.1, "pipe_a", 2, 0.1, "pipe_b"),
- (4, 0.1, "pipe_a", 3, 0.1, "pipe_b"),
- ],
-)
-async def multiply_then_fan_out(
- pipe_factory, multiplier_a, delay_a, name_a, multiplier_b, delay_b, name_b
-):
- pipe_a = pipe_factory(
- "multiplier", multiplier=multiplier_a, delay=delay_a, name=name_a
- )
- pipe_b = pipe_factory(
- "fan_out", multiplier=multiplier_b, delay=delay_b, name=name_b
- )
-
- async def input_generator():
- for i in [1, 2, 3]:
- yield i
-
- pipeline = AsyncPipeline(logging_provider=1)
- pipeline.add_pipe(pipe_a)
- pipeline.add_pipe(pipe_b)
-
- result = []
- async for output in await pipeline.run(input_generator()):
- result.append(output)
-
- expected_result = [[i * multiplier_a] async for i in input_generator()]
- assert (
- result[0] == expected_result
- ), "Pipeline output did not match expected multipliers"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("multiplier, delay, name", [(3, 0.1, "pipe")])
-async def test_fan_in_sum(pipe_factory, multiplier, delay, name):
- # Create fan-out to generate multiple streams
- fan_out_pipe = pipe_factory(
- "fan_out", multiplier=multiplier, delay=delay, name=f"{name}_a"
- )
- # Summing fan-in pipe
- fan_in_sum_pipe = pipe_factory("fan_in", delay=delay, name=f"{name}_b")
-
- async def input_generator():
- for i in [1, 2, 3]:
- yield i
-
- pipeline = AsyncPipeline(logging_provider=1)
- pipeline.add_pipe(fan_out_pipe)
- pipeline.add_pipe(fan_in_sum_pipe)
-
- result = await pipeline.run(input_generator())
-
- # Calculate expected results based on the multiplier and the sum of inputs
- expected_result = sum(
- sum(j * i for j in [1, 2, 3]) for i in range(1, multiplier + 1)
- )
- assert (
- result[0] == expected_result
- ), "Pipeline output did not match expected sums"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
- "multiplier_a, delay_a, name_a, multiplier_b, delay_b, name_b",
- [
- (3, 0.1, "pipe_a", 2, 0.1, "pipe_b"),
- (4, 0.1, "pipe_a", 3, 0.1, "pipe_b"),
- ],
-)
-async def test_fan_out_then_multiply(
- pipe_factory, multiplier_a, delay_a, name_a, multiplier_b, delay_b, name_b
-):
- pipe_a = pipe_factory(
- "multiplier", multiplier=multiplier_a, delay=delay_a, name=name_a
- )
- pipe_b = pipe_factory(
- "fan_out", multiplier=multiplier_b, delay=delay_b, name=name_b
- )
- pipe_c = pipe_factory("fan_in", delay=0.1, name="pipe_c")
-
- async def input_generator():
- for i in [1, 2, 3]:
- yield i
-
- pipeline = AsyncPipeline(logging_provider=1)
- pipeline.add_pipe(pipe_a)
- pipeline.add_pipe(pipe_b)
- pipeline.add_pipe(pipe_c)
-
- result = await pipeline.run(input_generator())
-
- expected_result = sum(
- sum(j * i * multiplier_a for j in [1, 2, 3])
- for i in range(1, multiplier_b + 1)
- )
- assert (
- result[0] == expected_result
- ), "Pipeline output did not match expected multipliers"
diff --git a/py/tests/core/pipes/__init__.py b/py/tests/core/pipes/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/core/pipes/test_kg_community_summary_pipe.py b/py/tests/core/pipes/test_kg_community_summary_pipe.py
deleted file mode 100644
index 9786b37db..000000000
--- a/py/tests/core/pipes/test_kg_community_summary_pipe.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import random
-import uuid
-
-import pytest
-
-from core.base import AsyncPipe, Community, Entity, KGExtraction, Relationship
-from core.pipes.kg.community_summary import KGCommunitySummaryPipe
-from shared.abstractions.vector import VectorQuantizationType
-
-
-@pytest.fixture(scope="function")
-def kg_pipeline_config():
- return AsyncPipe.PipeConfig(name="kg_community_summary_pipe")
-
-
-@pytest.fixture(scope="function")
-def kg_community_summary_pipe(
- postgres_db_provider,
- litellm_completion_provider,
- litellm_provider,
- kg_pipeline_config,
- local_logging_provider,
-):
- return KGCommunitySummaryPipe(
- postgres_db_provider,
- litellm_completion_provider,
- litellm_provider,
- kg_pipeline_config,
- logging_provider=local_logging_provider,
- )
-
-
-@pytest.fixture(scope="function")
-def max_summary_input_length():
- return 65536
-
-
-@pytest.fixture(scope="function")
-def collection_id():
- return uuid.UUID("122fdf6a-e116-546b-a8f6-e4cb2e2c0a09")
-
-
-@pytest.fixture(scope="function")
-def document_id():
- return uuid.UUID("9fbe403b-c11c-5aae-8ade-ef22980c3ad1")
-
-
-@pytest.fixture(scope="function")
-def chunk_ids():
- return [
- uuid.UUID("32ff6daf-6e67-44fa-b2a9-19384f5d9d19"),
- uuid.UUID("42ff6daf-6e67-44fa-b2a9-19384f5d9d19"),
- ]
-
-
-@pytest.fixture(scope="function")
-def embedding_dimension():
- return 512
-
-
-@pytest.fixture(scope="function")
-def vector_quantization_type():
- return VectorQuantizationType.FP32
-
-
-@pytest.fixture(scope="function")
-def embedding_vectors(embedding_dimension):
- random.seed(42)
- return [
- [random.random() for _ in range(embedding_dimension)] for _ in range(2)
- ]
-
-
-@pytest.fixture(scope="function")
-def entities_raw_list(document_id, chunk_ids):
- return [
- Entity(
- name="Entity1",
- description="Description1",
- category="Category1",
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr1": "value1", "attr2": "value2"},
- ),
- Entity(
- name="Entity2",
- description="Description2",
- category="Category2",
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr3": "value3", "attr4": "value4"},
- ),
- ]
-
-
-@pytest.fixture(scope="function")
-def entities_list(chunk_ids, document_id, embedding_vectors):
- return [
- Entity(
- id=1,
- name="Entity1",
- description="Description1",
- chunk_ids=chunk_ids,
- document_id=document_id,
- description_embedding=embedding_vectors[0],
- ),
- Entity(
- id=2,
- name="Entity2",
- description="Description2",
- chunk_ids=chunk_ids,
- document_id=document_id,
- description_embedding=embedding_vectors[1],
- ),
- ]
-
-
-@pytest.fixture(scope="function")
-def relationships_raw_list(embedding_vectors, chunk_ids, document_id):
- return [
- Relationship(
- id=1,
- subject="Entity1",
- predicate="predicate1",
- object="object1",
- weight=1.0,
- description="description1",
- embedding=embedding_vectors[0],
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr1": "value1", "attr2": "value2"},
- ),
- Relationship(
- id=2,
- subject="Entity2",
- predicate="predicate2",
- object="object2",
- weight=1.0,
- description="description2",
- embedding=embedding_vectors[1],
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr3": "value3", "attr4": "value4"},
- ),
- ]
-
-
-@pytest.mark.asyncio
-async def test_community_summary_prompt(
- kg_community_summary_pipe,
- entities_list,
- relationships_raw_list,
- max_summary_input_length,
-):
- summary = await kg_community_summary_pipe.community_summary_prompt(
- entities_list, relationships_raw_list, max_summary_input_length
- )
- expected_summary = """
- Entity: Entity1
- Descriptions:
- 1,Description1
- Relationships:
- 1,Entity1,object1,predicate1,description1
-
- Entity: Entity2
- Descriptions:
- 2,Description2
- Relationships:
- 2,Entity2,object2,predicate2,description2
- """
- # "\n Entity: Entity1\n Descriptions: \n 1,Description1\n Relationships: \n 1,Entity1,object1,predicate1,description1\n \n Entity: Entity2\n Descriptions: \n 2,Description2\n Relationships: \n 2,Entity2,object2,predicate2,description2\n "
- assert summary.strip() == expected_summary.strip()
diff --git a/py/tests/core/providers/auth/test_auth_provider.py b/py/tests/core/providers/auth/test_auth_provider.py
deleted file mode 100644
index fc44165de..000000000
--- a/py/tests/core/providers/auth/test_auth_provider.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# tests/providers/auth/test_r2r_auth_provider.py
-import pytest
-
-from core.base import R2RException
-
-
-@pytest.mark.asyncio
-async def test_register_and_login(r2r_auth_provider):
- email = "test@example.com"
- password = "password123"
- user = await r2r_auth_provider.register(email, password)
- assert user.email == email
- tokens = await r2r_auth_provider.login(email, password)
- assert "access_token" in tokens
- assert "refresh_token" in tokens
-
-
-@pytest.mark.asyncio
-async def test_invalid_login(r2r_auth_provider):
- email = "test@example.com"
- password = "password123"
- await r2r_auth_provider.register(email, password)
- with pytest.raises(R2RException):
- await r2r_auth_provider.login(email, "wrong_password")
-
-
-@pytest.mark.asyncio
-async def test_refresh_access_token(r2r_auth_provider):
- email = "test@example.com"
- password = "password123"
- await r2r_auth_provider.register(email, password)
- tokens = await r2r_auth_provider.login(email, password)
- new_tokens = await r2r_auth_provider.refresh_access_token(
- tokens["refresh_token"].token
- )
- assert "access_token" in new_tokens
- assert "refresh_token" in new_tokens
-
-
-@pytest.mark.asyncio
-async def test_change_password(r2r_auth_provider):
- email = "test@example.com"
- password = "password123"
- new_password = "new_password456"
- user = await r2r_auth_provider.register(email, password)
- await r2r_auth_provider.change_password(user, password, new_password)
- tokens = await r2r_auth_provider.login(email, new_password)
- assert "access_token" in tokens
- assert "refresh_token" in tokens
-
-
-@pytest.mark.asyncio
-async def test_logout(r2r_auth_provider):
- email = "test@example.com"
- password = "password123"
- await r2r_auth_provider.register(email, password)
- tokens = await r2r_auth_provider.login(email, password)
- await r2r_auth_provider.logout(tokens["access_token"].token)
- with pytest.raises(R2RException):
- await r2r_auth_provider.decode_token(tokens["access_token"].token)
diff --git a/py/tests/core/providers/database/__init__.py b/py/tests/core/providers/database/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/core/providers/database/relational/test_collection_db.py b/py/tests/core/providers/database/relational/test_collection_db.py
deleted file mode 100644
index 276e2bfeb..000000000
--- a/py/tests/core/providers/database/relational/test_collection_db.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from uuid import UUID
-
-import pytest
-
-from core.base import DocumentResponse, DocumentType, R2RException
-from core.base.api.models import CollectionResponse
-
-
-@pytest.mark.asyncio
-async def test_create_collection(temporary_postgres_db_provider):
- collection = await temporary_postgres_db_provider.create_collection(
- "Test Collection", "Test Description"
- )
- assert isinstance(collection, CollectionResponse)
- assert collection.name == "Test Collection"
- assert collection.description == "Test Description"
-
-
-@pytest.mark.asyncio
-async def test_get_collection(temporary_postgres_db_provider):
- created_collection = (
- await temporary_postgres_db_provider.create_collection(
- "Test Collection", "Test Description"
- )
- )
- retrieved_collection = await temporary_postgres_db_provider.get_collection(
- created_collection.collection_id
- )
- assert retrieved_collection == created_collection
-
-
-@pytest.mark.asyncio
-async def test_update_collection(temporary_postgres_db_provider):
- created_collection = (
- await temporary_postgres_db_provider.create_collection(
- "Test Collection", "Test Description"
- )
- )
- updated_collection = (
- await temporary_postgres_db_provider.update_collection(
- created_collection.collection_id,
- name="Updated Collection",
- description="Updated Description",
- )
- )
- assert updated_collection.name == "Updated Collection"
- assert updated_collection.description == "Updated Description"
-
-
-@pytest.mark.asyncio
-async def test_delete_collection(temporary_postgres_db_provider):
- created_collection = (
- await temporary_postgres_db_provider.create_collection(
- "Test Collection", "Test Description"
- )
- )
- await temporary_postgres_db_provider.delete_collection_relational(
- created_collection.collection_id
- )
- with pytest.raises(R2RException):
- await temporary_postgres_db_provider.delete_collection_relational(
- created_collection.collection_id
- )
-
- # await temporary_postgres_db_provider.delete_collection_vector(
- # created_collection.collection_id
- # )
- # with pytest.raises(R2RException):
- # await temporary_postgres_db_provider.delete_collection_vector(
- # created_collection.collection_id
- # )
-
-
-@pytest.mark.asyncio
-async def test_list_collections(temporary_postgres_db_provider):
- await temporary_postgres_db_provider.create_collection(
- "Collection 1", "Description 1"
- )
- await temporary_postgres_db_provider.create_collection(
- "Collection 2", "Description 2"
- )
- collections = await temporary_postgres_db_provider.list_collections()
- assert len(collections["results"]) >= 2
- assert collections["total_entries"] >= 2
-
-
-@pytest.mark.asyncio
-async def test_get_collections_by_ids(temporary_postgres_db_provider):
- collection1 = await temporary_postgres_db_provider.create_collection(
- "Collection 1", "Description 1"
- )
- collection2 = await temporary_postgres_db_provider.create_collection(
- "Collection 2", "Description 2"
- )
- collections = await temporary_postgres_db_provider.get_collections_by_ids(
- [collection1.collection_id, collection2.collection_id]
- )
- assert len(collections) == 2
- assert collections[0].collection_id == collection1.collection_id
- assert collections[1].collection_id == collection2.collection_id
-
-
-@pytest.mark.asyncio
-async def test_assign_and_remove_document_from_collection(
- temporary_postgres_db_provider,
-):
- collection = await temporary_postgres_db_provider.create_collection(
- "Test Collection", "Test Description"
- )
- document_id = UUID("00000000-0000-0000-0000-000000000001")
- await temporary_postgres_db_provider.upsert_documents_overview(
- DocumentResponse(
- id=document_id,
- collection_ids=[],
- user_id=UUID("00000000-0000-0000-0000-000000000002"),
- document_type=DocumentType.PDF,
- metadata={},
- version="v1",
- size_in_bytes=0,
- )
- )
- await temporary_postgres_db_provider.assign_document_to_collection_relational(
- document_id, collection.collection_id
- )
- await temporary_postgres_db_provider.assign_document_to_collection_vector(
- document_id, collection.collection_id
- )
- document_collections = (
- await temporary_postgres_db_provider.document_collections(document_id)
- )
- assert len(document_collections["results"]) == 1
- assert (
- document_collections["results"][0].collection_id
- == collection.collection_id
- )
-
- await temporary_postgres_db_provider.remove_document_from_collection_relational(
- document_id, collection.collection_id
- )
- await temporary_postgres_db_provider.remove_document_from_collection_vector(
- document_id, collection.collection_id
- )
- document_collections = (
- await temporary_postgres_db_provider.document_collections(document_id)
- )
- assert len(document_collections["results"]) == 0
-
-
-@pytest.mark.asyncio
-async def test_get_collections_for_user(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- collection1 = await temporary_postgres_db_provider.create_collection(
- "Collection 1", "Description 1"
- )
- collection2 = await temporary_postgres_db_provider.create_collection(
- "Collection 2", "Description 2"
- )
- await temporary_postgres_db_provider.add_user_to_collection(
- user.id, collection1.collection_id
- )
- await temporary_postgres_db_provider.add_user_to_collection(
- user.id, collection2.collection_id
- )
- user_collections = (
- await temporary_postgres_db_provider.get_collections_for_user(user.id)
- )
- assert len(user_collections["results"]) == 2
- assert user_collections["total_entries"] == 2
diff --git a/py/tests/core/providers/database/relational/test_document_db.py b/py/tests/core/providers/database/relational/test_document_db.py
deleted file mode 100644
index 99764527a..000000000
--- a/py/tests/core/providers/database/relational/test_document_db.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# tests/providers/database/relational/test_document_db.py
-from uuid import UUID
-
-import pytest
-
-from core.base import (
- DocumentResponse,
- DocumentType,
- IngestionStatus,
- KGEnrichmentStatus,
- KGExtractionStatus,
-)
-
-# @pytest.mark.asyncio
-# async def test_create_table(temporary_postgres_db_provider):
-# await temporary_postgres_db_provider.create_tables()
-# # Verify that the table is created with the expected columns and constraints
-# # You can execute a query to check the table structure or use a database inspection tool
-
-
-@pytest.mark.asyncio
-async def test_upsert_documents_overview(temporary_postgres_db_provider):
- document_info = DocumentResponse(
- id=UUID("00000000-0000-0000-0000-000000000001"),
- collection_ids=[UUID("00000000-0000-0000-0000-000000000002")],
- user_id=UUID("00000000-0000-0000-0000-000000000003"),
- document_type=DocumentType.PDF,
- metadata={},
- title="Test Document",
- version="1.0",
- size_in_bytes=1024,
- ingestion_status=IngestionStatus.PENDING,
- extraction_status=KGExtractionStatus.PENDING,
- )
- await temporary_postgres_db_provider.upsert_documents_overview(
- document_info
- )
-
- # Verify that the document is inserted correctly
- result = await temporary_postgres_db_provider.get_documents_overview(
- filter_document_ids=[document_info.id]
- )
- assert len(result["results"]) == 1
- inserted_document = result["results"][0]
- assert inserted_document.id == document_info.id
- assert inserted_document.collection_ids == document_info.collection_ids
- assert inserted_document.user_id == document_info.user_id
- assert inserted_document.document_type == document_info.document_type
- assert inserted_document.metadata == document_info.metadata
- assert inserted_document.title == document_info.title
- assert inserted_document.version == document_info.version
- assert inserted_document.size_in_bytes == document_info.size_in_bytes
- assert inserted_document.ingestion_status == document_info.ingestion_status
- assert (
- inserted_document.extraction_status == document_info.extraction_status
- )
-
- # Update the document and verify the changes
- document_info.title = "Updated Test Document"
- document_info.ingestion_status = IngestionStatus.SUCCESS
- await temporary_postgres_db_provider.upsert_documents_overview(
- document_info
- )
-
- result = await temporary_postgres_db_provider.get_documents_overview(
- filter_document_ids=[document_info.id]
- )
- assert len(result["results"]) == 1
- updated_document = result["results"][0]
- assert updated_document.title == "Updated Test Document"
- assert updated_document.ingestion_status == IngestionStatus.SUCCESS
-
-
-@pytest.mark.asyncio
-async def test_delete_from_documents_overview(temporary_postgres_db_provider):
- document_info = DocumentResponse(
- id=UUID("00000000-0000-0000-0000-000000000001"),
- collection_ids=[UUID("00000000-0000-0000-0000-000000000002")],
- user_id=UUID("00000000-0000-0000-0000-000000000003"),
- document_type=DocumentType.PDF,
- metadata={},
- title="Test Document",
- version="1.0",
- size_in_bytes=1024,
- ingestion_status=IngestionStatus.PENDING,
- extraction_status=KGExtractionStatus.PENDING,
- )
- await temporary_postgres_db_provider.upsert_documents_overview(
- document_info
- )
-
- await temporary_postgres_db_provider.delete_from_documents_overview(
- document_info.id
- )
-
- # Verify that the document is deleted
- result = await temporary_postgres_db_provider.get_documents_overview(
- filter_document_ids=[document_info.id]
- )
- assert len(result["results"]) == 0
-
-
-@pytest.mark.asyncio
-async def test_get_documents_overview(temporary_postgres_db_provider):
- document_info1 = DocumentResponse(
- id=UUID("00000000-0000-0000-0000-000000000001"),
- collection_ids=[UUID("00000000-0000-0000-0000-000000000002")],
- user_id=UUID("00000000-0000-0000-0000-000000000003"),
- document_type=DocumentType.PDF,
- metadata={},
- title="Test Document 1",
- version="1.0",
- size_in_bytes=1024,
- ingestion_status=IngestionStatus.PENDING,
- extraction_status=KGExtractionStatus.PENDING,
- )
- document_info2 = DocumentResponse(
- id=UUID("00000000-0000-0000-0000-000000000004"),
- collection_ids=[UUID("00000000-0000-0000-0000-000000000002")],
- user_id=UUID("00000000-0000-0000-0000-000000000003"),
- document_type=DocumentType.DOCX,
- metadata={},
- title="Test Document 2",
- version="1.0",
- size_in_bytes=2048,
- ingestion_status=IngestionStatus.SUCCESS,
- extraction_status=KGExtractionStatus.PENDING,
- )
- await temporary_postgres_db_provider.upsert_documents_overview(
- [document_info1, document_info2]
- )
-
- # Test filtering by user ID
- result = await temporary_postgres_db_provider.get_documents_overview(
- filter_user_ids=[UUID("00000000-0000-0000-0000-000000000003")]
- )
- assert len(result["results"]) == 2
- assert result["total_entries"] == 2
-
- # Test filtering by document ID
- result = await temporary_postgres_db_provider.get_documents_overview(
- filter_document_ids=[UUID("00000000-0000-0000-0000-000000000001")]
- )
- assert len(result["results"]) == 1
- assert result["results"][0].id == UUID(
- "00000000-0000-0000-0000-000000000001"
- )
-
- # Test filtering by collection ID
- result = await temporary_postgres_db_provider.get_documents_overview(
- filter_collection_ids=[UUID("00000000-0000-0000-0000-000000000002")]
- )
- assert len(result["results"]) == 2
- assert result["total_entries"] == 2
-
- # Test pagination
- result = await temporary_postgres_db_provider.get_documents_overview(
- offset=1, limit=1
- )
- assert len(result["results"]) == 1
- assert result["total_entries"] == 2
diff --git a/py/tests/core/providers/database/relational/test_relational_db_provider.py b/py/tests/core/providers/database/relational/test_relational_db_provider.py
deleted file mode 100644
index 32a4d1f1b..000000000
--- a/py/tests/core/providers/database/relational/test_relational_db_provider.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# tests/providers/database/test_relational_db_provider.py
-import pytest
-
-from core.providers.database import PostgresDBProvider
-
-
-@pytest.mark.asyncio
-async def test_relational_db_initialization(postgres_db_provider):
- assert isinstance(postgres_db_provider, PostgresDBProvider)
- # assert postgres_db_provider.relational is not None
diff --git a/py/tests/core/providers/database/relational/test_user_db.py b/py/tests/core/providers/database/relational/test_user_db.py
deleted file mode 100644
index 3d4cb8aa4..000000000
--- a/py/tests/core/providers/database/relational/test_user_db.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# tests/providers/database/relational/test_user_db.py
-from datetime import datetime, timedelta
-from uuid import UUID
-
-import pytest
-
-from core.base.api.models import User
-
-
-@pytest.mark.asyncio
-async def test_create_user(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- assert isinstance(user, User)
- assert user.email == "test@example.com"
-
-
-@pytest.mark.asyncio
-async def test_get_user_by_id(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- retrieved_user = await temporary_postgres_db_provider.get_user_by_id(
- user.id
- )
- assert retrieved_user == user
-
-
-@pytest.mark.asyncio
-async def test_get_user_by_email(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- retrieved_user = await temporary_postgres_db_provider.get_user_by_email(
- "test@example.com"
- )
- assert retrieved_user == user
-
-
-@pytest.mark.asyncio
-async def test_delete_user(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- await temporary_postgres_db_provider.delete_user_relational(user.id)
- await temporary_postgres_db_provider.delete_user_vector(user.id)
-    # A deleted user must no longer be retrievable; a bare try/except here
-    # would swallow the failure, so assert that the lookup raises.
-    with pytest.raises(Exception):
-        await temporary_postgres_db_provider.get_user_by_id(user.id)
-
-
-@pytest.mark.asyncio
-async def test_update_user(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- updated_user = User(
- id=user.id,
- email="updated@example.com",
- is_superuser=True,
- is_active=False,
- is_verified=True,
- name="Updated Name",
- profile_picture="updated_picture.jpg",
- bio="Updated bio",
- collection_ids=[],
- )
- result = await temporary_postgres_db_provider.update_user(updated_user)
- assert result.email == updated_user.email
-
-
-@pytest.mark.asyncio
-async def test_update_user_password(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- await temporary_postgres_db_provider.update_user_password(
- user.id, "new_password"
- )
- # Additional assertions can be added based on the expected behavior
-
-
-@pytest.mark.asyncio
-async def test_get_all_users(temporary_postgres_db_provider):
- await temporary_postgres_db_provider.create_user(
- "test1@example.com", "password"
- )
- await temporary_postgres_db_provider.create_user(
- "test2@example.com", "password"
- )
- users = await temporary_postgres_db_provider.get_all_users()
- assert len(users) >= 2
- assert any(user.email == "test1@example.com" for user in users)
- assert any(user.email == "test2@example.com" for user in users)
-
-
-@pytest.mark.asyncio
-async def test_store_and_verify_verification_code(
- temporary_postgres_db_provider,
-):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- verification_code = "123456"
- expiry = datetime.utcnow() + timedelta(hours=1)
- await temporary_postgres_db_provider.store_verification_code(
- user.id, verification_code, expiry
- )
- await temporary_postgres_db_provider.verify_user(verification_code)
- updated_user = await temporary_postgres_db_provider.get_user_by_id(user.id)
- assert updated_user.is_verified
-
-
-@pytest.mark.asyncio
-async def test_store_and_get_reset_token(temporary_postgres_db_provider):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- reset_token = "reset_token"
- expiry = datetime.utcnow() + timedelta(hours=1)
- await temporary_postgres_db_provider.store_reset_token(
- user.id, reset_token, expiry
- )
- user_id = await temporary_postgres_db_provider.get_user_id_by_reset_token(
- reset_token
- )
- assert user_id == user.id
-
-
-@pytest.mark.asyncio
-async def test_add_and_remove_user_from_collection(
- temporary_postgres_db_provider,
-):
- user = await temporary_postgres_db_provider.create_user(
- "test@example.com", "password"
- )
- collection_id = UUID("00000000-0000-0000-0000-000000000001")
- await temporary_postgres_db_provider.add_user_to_collection(
- user.id, collection_id
- )
- updated_user = await temporary_postgres_db_provider.get_user_by_id(user.id)
- assert collection_id in updated_user.collection_ids
- await temporary_postgres_db_provider.remove_user_from_collection(
- user.id, collection_id
- )
- updated_user = await temporary_postgres_db_provider.get_user_by_id(user.id)
- assert collection_id not in updated_user.collection_ids
diff --git a/py/tests/core/providers/database/test_prompt_handler.py b/py/tests/core/providers/database/test_prompt_handler.py
deleted file mode 100644
index cfe130550..000000000
--- a/py/tests/core/providers/database/test_prompt_handler.py
+++ /dev/null
@@ -1,207 +0,0 @@
-from datetime import timedelta
-
-import pytest
-
-from core.base import PromptHandler
-from core.providers.database.prompt import PostgresPromptHandler
-
-
-# Additional fixtures for prompt testing
-@pytest.fixture(scope="function")
-def prompt_handler_config(app_config):
- return {"cache_ttl": timedelta(hours=1), "max_cache_size": 100}
-
-
-@pytest.fixture(scope="function")
-async def prompt_handler(
- postgres_db_provider, prompt_handler_config, app_config
-):
- handler = PostgresPromptHandler(
- project_name=app_config.project_name,
- connection_manager=postgres_db_provider.connection_manager,
- **prompt_handler_config,
- )
- await handler.create_tables()
- yield handler
- # Cleanup will happen via postgres_db_provider fixture
-
-
-@pytest.fixture(scope="function")
-def sample_prompt():
- return {
- "name": "test_prompt",
- "template": "This is a test prompt with {input_var}",
- "input_types": {"input_var": "string"},
- }
-
-
-# Tests
-@pytest.mark.asyncio
-async def test_prompt_handler_initialization(prompt_handler):
- """Test that prompt handler initializes properly"""
- assert isinstance(prompt_handler, PromptHandler)
-
-
-@pytest.mark.asyncio
-async def test_add_and_get_prompt(prompt_handler, sample_prompt):
- """Test adding a prompt and retrieving it"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- result = await prompt_handler.get_prompt(sample_prompt["name"])
- assert result == sample_prompt["template"]
-
-
-@pytest.mark.asyncio
-async def test_get_prompt_with_inputs(prompt_handler, sample_prompt):
- """Test getting a prompt with input variables"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- test_input = "test value"
- result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs={"input_var": test_input}
- )
- assert result == sample_prompt["template"].format(input_var=test_input)
-
-
-@pytest.mark.asyncio
-async def test_prompt_cache_behavior(prompt_handler, sample_prompt):
- """Test that caching works as expected"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- # First call should hit database
- test_input = {"input_var": "cache test"}
- first_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs=test_input
- )
-
- # Second call should hit cache
- second_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs=test_input
- )
-
- # Results should be the same
- assert first_result == second_result
-
- # Modify the template directly in the database
- new_template = "Modified template {input_var}"
- await prompt_handler._update_prompt_impl(
- name=sample_prompt["name"], template=new_template
- )
-
- # Third call should get the new value since we invalidate cache on update
- third_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs=test_input
- )
-
- # Verify the change is reflected
- assert third_result == new_template.format(**test_input)
- assert third_result != first_result
-
- # Test bypass_cache explicitly
- bypass_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs=test_input, bypass_cache=True
- )
- assert bypass_result == new_template.format(**test_input)
-
-
-@pytest.mark.asyncio
-async def test_message_payload_creation(prompt_handler, sample_prompt):
- """Test creation of message payloads"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- payload = await prompt_handler.get_message_payload(
- system_prompt_name=sample_prompt["name"],
- system_inputs={"input_var": "system context"},
- task_prompt_name=sample_prompt["name"],
- task_inputs={"input_var": "task context"},
- )
-
- assert len(payload) == 2
- assert payload[0]["role"] == "system"
- assert payload[1]["role"] == "user"
- assert "system context" in payload[0]["content"]
- assert "task context" in payload[1]["content"]
-
-
-@pytest.mark.asyncio
-async def test_get_all_prompts(prompt_handler, sample_prompt):
- """Test retrieving all stored prompts"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- all_prompts = await prompt_handler.get_all_prompts()
- assert len(all_prompts) >= 1
- assert sample_prompt["name"] in all_prompts
- assert (
- all_prompts[sample_prompt["name"]]["template"]
- == sample_prompt["template"]
- )
-
-
-@pytest.mark.asyncio
-async def test_delete_prompt(prompt_handler, sample_prompt):
- """Test deleting a prompt"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- await prompt_handler.delete_prompt(sample_prompt["name"])
-
- with pytest.raises(ValueError):
- await prompt_handler.get_prompt(sample_prompt["name"])
-
-
-@pytest.mark.asyncio
-async def test_prompt_bypass_cache(prompt_handler, sample_prompt):
- """Test bypassing the cache"""
- await prompt_handler.add_prompt(**sample_prompt)
-
- # First call to cache the result
- test_input = {"input_var": "bypass test"}
- first_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs=test_input
- )
-
- # Update template
- new_template = "Updated template {input_var}"
- await prompt_handler._update_prompt_impl(
- name=sample_prompt["name"], template=new_template
- )
-
- # Get with bypass_cache=True should return new template
- bypass_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs=test_input, bypass_cache=True
- )
-
- assert bypass_result != first_result
- assert bypass_result == new_template.format(**test_input)
-
-
-@pytest.mark.asyncio
-async def test_prompt_update(prompt_handler, sample_prompt):
- """Test updating an existing prompt"""
- # Add initial prompt
- await prompt_handler.add_prompt(**sample_prompt)
- initial_result = await prompt_handler.get_prompt(sample_prompt["name"])
- assert initial_result == sample_prompt["template"]
-
- # Update template
- updated_template = "This is an updated prompt with {input_var}!"
- await prompt_handler.update_prompt(
- name=sample_prompt["name"], template=updated_template
- )
-
- # Test immediate result
- updated_result = await prompt_handler.get_prompt(sample_prompt["name"])
- assert updated_result == updated_template
-
- # Test with cache bypass to ensure database update
- db_result = await prompt_handler.get_prompt(
- sample_prompt["name"], bypass_cache=True
- )
- assert db_result == updated_template
-
- # Test with input formatting
- formatted_result = await prompt_handler.get_prompt(
- sample_prompt["name"], inputs={"input_var": "test"}
- )
- assert formatted_result == "This is an updated prompt with test!"
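
Taken together, the cache tests above pin down the handler's contract: reads are cached, update_prompt invalidates the cached entry, and bypass_cache forces a database read. A condensed sketch of that contract, using only calls exercised in these tests (the fixture wiring for prompt_handler is assumed):

    async def demo_prompt_cache(prompt_handler):
        # prompt_handler is assumed to be a PostgresPromptHandler wired as above
        await prompt_handler.add_prompt(
            name="greeting", template="Hello {name}", input_types={"name": "string"}
        )
        first = await prompt_handler.get_prompt("greeting", inputs={"name": "Ada"})  # cached after this read
        await prompt_handler.update_prompt(name="greeting", template="Hi {name}")  # invalidates the cache entry
        fresh = await prompt_handler.get_prompt(
            "greeting", inputs={"name": "Ada"}, bypass_cache=True  # force a database read
        )
        assert first == "Hello Ada" and fresh == "Hi Ada"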
diff --git a/py/tests/core/providers/database/test_vector_db_provider.py b/py/tests/core/providers/database/test_vector_db_provider.py
deleted file mode 100644
index 237bb97e0..000000000
--- a/py/tests/core/providers/database/test_vector_db_provider.py
+++ /dev/null
@@ -1,289 +0,0 @@
-import pytest
-
-from core.providers.database import PostgresDBProvider
-from r2r import ChunkSearchSettings
-
-
-@pytest.mark.asyncio
-async def test_vector_db_initialization(postgres_db_provider):
- assert isinstance(postgres_db_provider, PostgresDBProvider)
- # assert postgres_db_provider is not None
-
-
-@pytest.mark.asyncio
-async def test_search_equality_filter(postgres_db_provider, sample_entries):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=10, filters={"key": {"$eq": "value_id_0"}}
- ),
- )
- assert len(results) == 1
- assert results[0].metadata["key"] == "value_id_0"
-
-
-@pytest.mark.asyncio
-async def test_search_not_equal_filter(postgres_db_provider, sample_entries):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=100, filters={"key": {"$ne": "value_id_0"}}
- ),
- )
- assert len(results) == 99
- assert all(r.metadata["key"] != "value_id_0" for r in results)
-
-
-@pytest.mark.asyncio
-async def test_search_greater_than_filter(
- postgres_db_provider, sample_entries
-):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=100, filters={"raw_key": {"$gt": 50}}
- ),
- )
- assert len(results) == 49
- assert all(int(r.text.split("_")[-1]) > 50 for r in results)
-
-
-@pytest.mark.asyncio
-async def test_search_less_than_or_equal_filter(
- postgres_db_provider, sample_entries
-):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=10,
- filters={"raw_key": {"$lte": 20}},
- ef_search=100, # TODO - Better understand why we need to set this to search the entire database.
- ),
- ) # TODO - Why is this number not always 10?
- assert len(results) == 10
-
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=100, filters={"raw_key": {"$lte": 20}}
- ),
- )
- assert len(results) == 21
- assert all(int(r.text.split("_")[-1]) <= 20 for r in results)
-
-
-@pytest.mark.asyncio
-async def test_search_in_filter(postgres_db_provider, sample_entries):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=10,
- filters={"key": {"$in": ["value_id_0", "value_id_1"]}},
- ),
- )
- assert len(results) == 2
- assert all(
- r.metadata["key"] in ["value_id_0", "value_id_1"] for r in results
- )
-
-
-@pytest.mark.asyncio
-async def test_search_complex_and_filter(postgres_db_provider, sample_entries):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=10,
- filters={
- "$and": [
- {"key": {"$eq": "value_id_0"}},
- {"raw_key": {"$lt": 50}},
- ]
- },
- ),
- )
- assert len(results) == 1
- assert results[0].metadata["key"] == "value_id_0"
- assert int(results[0].text.split("_")[-1]) < 50
-
-
-@pytest.mark.asyncio
-async def test_search_complex_or_filter(postgres_db_provider, sample_entries):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=11,
- ef_search=100, # TODO - Better understand why we need to set this to search the entire database.
- filters={
- "$or": [
- {"key": {"$eq": "value_id_0"}},
- {"raw_key": {"$gte": 90}},
- ]
- },
- ),
- )
- assert len(results) == 11
- assert any(r.metadata["key"] == "value_id_0" for r in results)
- assert any(int(r.text.split("_")[-1]) >= 90 for r in results)
-
-
-@pytest.mark.asyncio
-async def test_search_nested_and_or_filters(
- postgres_db_provider, sample_entries
-):
- query_vector = sample_entries[0]
- results = await postgres_db_provider.semantic_search(
- query_vector.vector.data,
- ChunkSearchSettings(
- search_limit=10,
- ef_search=100, # TODO - Better understand why we need to set this to search the entire database.
- filters={
- "$and": [
- {"key": {"$eq": "value_id_0"}},
- {
- "$or": [
- {"key": {"$in": ["value_id_0", "value_id_1"]}},
- {"raw_key": {"$gt": 98}},
- ]
- },
- ]
- },
- ),
- )
- assert len(results) == 1
- assert results[0].metadata["key"] == "value_id_0"
- assert results[0].text == "Sample text for id_0"
-
-
-@pytest.mark.asyncio
-async def test_delete_equality(temporary_postgres_db_provider, sample_entries):
- deleted_ids = await temporary_postgres_db_provider.delete(
- {"key": {"$eq": "value_id_0"}}
- )
- assert len(deleted_ids) == 1
- remaining = await temporary_postgres_db_provider.semantic_search(
- sample_entries[0].vector.data,
- ChunkSearchSettings(search_limit=100),
- )
- assert len(remaining) == 99
- assert all(r.metadata["key"] != "value_id_0" for r in remaining)
-
-
-@pytest.mark.asyncio
-async def test_delete_greater_than(
- temporary_postgres_db_provider, sample_entries
-):
- deleted_ids = await temporary_postgres_db_provider.delete(
- {"raw_key": {"$gt": 90}}
- )
- assert len(deleted_ids) == 9
- remaining = await temporary_postgres_db_provider.semantic_search(
- sample_entries[0].vector.data,
- ChunkSearchSettings(search_limit=100),
- )
- assert len(remaining) == 91
- assert all(int(r.text.split("_")[-1]) <= 90 for r in remaining)
-
-
-@pytest.mark.asyncio
-async def test_delete_in(temporary_postgres_db_provider, sample_entries):
- deleted_ids = await temporary_postgres_db_provider.delete(
- {"key": {"$in": ["value_id_0", "value_id_1"]}}
- )
- assert len(deleted_ids) == 2
- remaining = await temporary_postgres_db_provider.semantic_search(
- sample_entries[0].vector.data,
- ChunkSearchSettings(search_limit=100),
- )
- assert len(remaining) == 98
- assert all(
- r.metadata["key"] not in ["value_id_0", "value_id_1"]
- for r in remaining
- )
-
-
-@pytest.mark.asyncio
-async def test_delete_complex_and(
- temporary_postgres_db_provider, sample_entries
-):
- deleted_ids = await temporary_postgres_db_provider.delete(
- {
- "$and": [
- {"key": {"$eq": "value_id_0"}},
- {"raw_key": {"$lt": 50}},
- ]
- }
- )
- assert len(deleted_ids) == 1
- remaining = await temporary_postgres_db_provider.semantic_search(
- sample_entries[0].vector.data,
- ChunkSearchSettings(search_limit=100),
- )
- assert len(remaining) == 99
- assert not any(
- r.metadata["key"] == "value_id_0" and int(r.text.split("_")[-1]) < 50
- for r in remaining
- )
-
-
-@pytest.mark.asyncio
-async def test_delete_complex_or(
- temporary_postgres_db_provider, sample_entries
-):
- deleted_ids = await temporary_postgres_db_provider.delete(
- {
- "$or": [
- {"key": {"$eq": "value_id_0"}},
- {"raw_key": {"$gte": 90}},
- ]
- }
- )
- assert len(deleted_ids) == 11
- remaining = await temporary_postgres_db_provider.semantic_search(
- sample_entries[0].vector.data,
- ChunkSearchSettings(search_limit=100),
- )
- assert len(remaining) == 89
- assert all(
- r.metadata["key"] != "value_id_0" and int(r.text.split("_")[-1]) < 90
- for r in remaining
- )
-
-
-@pytest.mark.asyncio
-async def test_delete_nested_and_or(
- temporary_postgres_db_provider, sample_entries
-):
- deleted_ids = await temporary_postgres_db_provider.delete(
- {
- "$and": [
- {"key": {"$eq": "value_id_0"}},
- {
- "$or": [
- {"key": {"$in": ["value_id_0", "value_id_1"]}},
- {"raw_key": {"$gt": 98}},
- ]
- },
- ]
- }
- )
- assert len(deleted_ids) == 1
- remaining = await temporary_postgres_db_provider.semantic_search(
- sample_entries[0].vector.data,
- ChunkSearchSettings(search_limit=100),
- )
- assert len(remaining) == 99
- assert not any(
- r.metadata["key"] == "value_id_0"
- and (
- r.metadata["key"] in ["value_id_0", "value_id_1"]
- or int(r.text.split("_")[-1]) > 98
- )
- for r in remaining
- )
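
For reference, the search and delete tests in this file all drive the same Mongo-style filter grammar; restricted to the operators these tests actually exercise, the shape is:

    # Field comparisons: {field: {operator: value}} with $eq, $ne, $gt, $gte, $lt, $lte, $in
    eq_filter = {"key": {"$eq": "value_id_0"}}
    range_filter = {"raw_key": {"$lte": 20}}
    membership = {"key": {"$in": ["value_id_0", "value_id_1"]}}

    # $and / $or take lists of sub-filters and may nest arbitrarily
    combined = {
        "$and": [
            {"key": {"$eq": "value_id_0"}},
            {"$or": [{"raw_key": {"$gt": 98}}, {"key": {"$ne": "value_id_1"}}]},
        ]
    }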
diff --git a/py/tests/core/providers/database/test_vector_index_logic.py b/py/tests/core/providers/database/test_vector_index_logic.py
deleted file mode 100644
index eae97edf9..000000000
--- a/py/tests/core/providers/database/test_vector_index_logic.py
+++ /dev/null
@@ -1,139 +0,0 @@
-from uuid import uuid4
-
-import pytest
-
-from shared.abstractions.vector import (
- IndexArgsHNSW,
- IndexArgsIVFFlat,
- IndexMeasure,
- IndexMethod,
- VectorTableName,
-)
-
-
-@pytest.mark.asyncio
-async def test_index_lifecycle(postgres_db_provider):
- """Test the full lifecycle of index operations"""
-
- # Create an index
- index_name = f"test_index_{uuid4().hex[:8]}"
- await postgres_db_provider.create_index(
- table_name=VectorTableName.CHUNKS,
- index_measure=IndexMeasure.cosine_distance,
- index_method=IndexMethod.hnsw,
- index_name=index_name,
- concurrently=False, # Changed to avoid isolation level issues
- )
-
- # List indices and verify our index exists
- indices = await postgres_db_provider.list_indices(VectorTableName.CHUNKS)
- print("indices = ", indices)
- assert indices, "No indices returned"
- assert any(index["name"] == index_name for index in indices)
-
- # # Select the index for use
- # await postgres_db_provider.select_index(
- # index_name, VectorTableName.CHUNKS
- # )
-
- # Delete the index
- await postgres_db_provider.delete_index(
- index_name,
- table_name=VectorTableName.CHUNKS,
- concurrently=False, # Consistent with creation
- )
-
- # Verify index was deleted
- indices_after = await postgres_db_provider.list_indices(
- VectorTableName.CHUNKS
- )
- assert not any(index["name"] == index_name for index in indices_after)
-
-
-@pytest.mark.asyncio
-async def test_multiple_index_types(postgres_db_provider):
- """Test creating and managing multiple types of indices"""
-
- # Create HNSW index
- hnsw_name = f"hnsw_index_{uuid4().hex[:8]}"
- await postgres_db_provider.create_index(
- table_name=VectorTableName.CHUNKS,
- index_measure=IndexMeasure.cosine_distance,
- index_method=IndexMethod.hnsw,
- index_name=hnsw_name,
- index_arguments=IndexArgsHNSW(m=16, ef_construction=64),
- concurrently=False, # Changed to avoid isolation level issues
- )
-
- # Create IVF-Flat index
- ivf_name = f"ivf_index_{uuid4().hex[:8]}"
- await postgres_db_provider.create_index(
- table_name=VectorTableName.CHUNKS,
- index_measure=IndexMeasure.cosine_distance,
- index_method=IndexMethod.ivfflat,
- index_name=ivf_name,
- index_arguments=IndexArgsIVFFlat(n_lists=100),
- concurrently=False, # Changed to avoid isolation level issues
- )
-
- # List indices and verify both exist
- indices = await postgres_db_provider.list_indices(VectorTableName.CHUNKS)
- assert any(index["name"] == hnsw_name for index in indices)
- assert any(index["name"] == ivf_name for index in indices)
-
- # Clean up
- await postgres_db_provider.delete_index(
- hnsw_name, table_name=VectorTableName.CHUNKS, concurrently=False
- )
- await postgres_db_provider.delete_index(
- ivf_name, table_name=VectorTableName.CHUNKS, concurrently=False
- )
-
-
-@pytest.mark.asyncio
-async def test_index_operations_invalid_inputs(postgres_db_provider):
- """Test error handling for invalid index operations"""
-
- # Try to list indices for invalid table
- with pytest.raises(Exception):
- await postgres_db_provider.list_indices("invalid_table")
-
- # Try to delete non-existent index
- with pytest.raises(Exception):
- await postgres_db_provider.delete_index(
- "nonexistent_index", VectorTableName.CHUNKS
- )
-
- # Try to select non-existent index
- # with pytest.raises(Exception):
- # await postgres_db_provider.select_index(
- # "nonexistent_index", VectorTableName.CHUNKS
- # )
-
-
-@pytest.mark.asyncio
-async def test_index_persistence(
- postgres_db_provider, temporary_postgres_db_provider
-):
- """Test that indices persist and are usable between connections"""
-
- # Create index using first connection
- index_name = f"persist_test_{uuid4().hex[:8]}"
- await postgres_db_provider.create_index(
- table_name=VectorTableName.CHUNKS,
- index_measure=IndexMeasure.cosine_distance,
- index_method=IndexMethod.hnsw,
- index_name=index_name,
- concurrently=False, # Changed to avoid isolation level issues
- )
-
- # Verify index exists using second connection
- indices = await temporary_postgres_db_provider.list_indices(
- VectorTableName.CHUNKS
- )
- assert any(index["name"] == index_name for index in indices)
-
- # Clean up
- await postgres_db_provider.delete_index(
- index_name, table_name=VectorTableName.CHUNKS, concurrently=False
- )
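
Condensing the lifecycle these tests walk through into one sketch (the provider fixture is assumed; arguments are exactly those used above):

    from uuid import uuid4

    from shared.abstractions.vector import IndexMeasure, IndexMethod, VectorTableName

    async def index_lifecycle_sketch(db):
        # db is assumed to be a PostgresDBProvider as in the deleted fixtures
        name = f"sketch_index_{uuid4().hex[:8]}"
        await db.create_index(
            table_name=VectorTableName.CHUNKS,
            index_measure=IndexMeasure.cosine_distance,
            index_method=IndexMethod.hnsw,
            index_name=name,
            concurrently=False,  # plain CREATE INDEX, matching the tests above
        )
        indices = await db.list_indices(VectorTableName.CHUNKS)
        assert any(ix["name"] == name for ix in indices)
        await db.delete_index(name, table_name=VectorTableName.CHUNKS, concurrently=False)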
diff --git a/py/tests/core/providers/email/test_email_providers.py b/py/tests/core/providers/email/test_email_providers.py
deleted file mode 100644
index df744eb71..000000000
--- a/py/tests/core/providers/email/test_email_providers.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import pytest
-
-from core.base.providers.email import EmailConfig
-from core.providers.email import SendGridEmailProvider
-
-
-@pytest.fixture(scope="function")
-def sendgrid_config(app_config):
- return EmailConfig(
- provider="sendgrid",
- sendgrid_api_key="your_sendgrid_api_key",
- from_email="support@example.com", # Ensure this email is verified in your SendGrid account
- app=app_config,
- )
-
-
-@pytest.fixture
-def sendgrid_provider(sendgrid_config):
- return SendGridEmailProvider(sendgrid_config)
-
-
-class TestSendGridEmailProvider:
- @pytest.mark.asyncio
- async def test_send_email_basic(self, sendgrid_provider):
- await sendgrid_provider.send_email(
- to_email="example@example.com", # Replace with your email address
- subject="Test Email",
- body="This is a test email sent from the test_send_email_basic test case.",
- )
- # If your send_email method returns a response, you can add assertions here
-
- @pytest.mark.asyncio
- async def test_send_email_with_template(self, sendgrid_provider):
- await sendgrid_provider.send_email(
- to_email="example@example.com", # Replace with your email address
- template_id="template_id", # Replace with your SendGrid template ID
- dynamic_template_data={"first_name": "Example"},
- )
- # Add assertions if applicable
-
- @pytest.mark.asyncio
- async def test_send_verification_email(self, sendgrid_provider):
- await sendgrid_provider.send_verification_email(
- to_email="example@example.com", # Replace with your email address
- verification_code="123456",
- )
- # Add assertions if applicable
-
- @pytest.mark.asyncio
- async def test_send_verification_email_with_template(
- self, sendgrid_provider
- ):
- await sendgrid_provider.send_verification_email(
- to_email="example@example.com", # Replace with your email address
- verification_code="123456",
- )
- # Add assertions if applicable
-
- @pytest.mark.asyncio
- async def test_send_verification_email_with_template_and_dynamic_data(
- self, sendgrid_provider
- ):
- await sendgrid_provider.send_verification_email(
- to_email="example@example.com", # Replace with your email address
- verification_code="123456",
- dynamic_template_data={"name": "User"},
- frontend_url="http://localhost:3000/auth",
- )
- # Add assertions if applicable
-
- @pytest.mark.asyncio
- async def test_send_email_failure(self, sendgrid_provider):
- # Intentionally use an invalid email to simulate a failure
- with pytest.raises(RuntimeError):
- await sendgrid_provider.send_email(
- to_email="invalid-email-address", # Invalid email address
- subject="Test Email",
- body="This should fail.",
- )
diff --git a/py/tests/core/providers/embedding/conftest.py b/py/tests/core/providers/embedding/conftest.py
deleted file mode 100644
index 7ce2b40bc..000000000
--- a/py/tests/core/providers/embedding/conftest.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import asyncio
-
-import pytest
-
-from core import EmbeddingConfig
-from core.providers import OpenAIEmbeddingProvider
-
-
-@pytest.fixture(scope="session", autouse=True)
-def event_loop_policy():
- asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
-
-
-@pytest.fixture(scope="function", autouse=True)
-async def cleanup_tasks():
- yield
- tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
-    for task in tasks:
-        task.cancel()
- await asyncio.gather(*tasks, return_exceptions=True)
-
-
-@pytest.fixture
-def openai_provider(app_config):
- config = EmbeddingConfig(
- provider="openai",
- base_model="text-embedding-ada-002",
- base_dimension=1536,
- app=app_config,
- )
- return OpenAIEmbeddingProvider(config)
diff --git a/py/tests/core/providers/embedding/test_litellm_embedding_provider.py b/py/tests/core/providers/embedding/test_litellm_embedding_provider.py
deleted file mode 100644
index 508e080f7..000000000
--- a/py/tests/core/providers/embedding/test_litellm_embedding_provider.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import asyncio
-import contextlib
-
-import pytest
-
-from core import EmbeddingConfig
-from core.providers import LiteLLMEmbeddingProvider
-
-
-@pytest.fixture(scope="session", autouse=True)
-def event_loop_policy():
- asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
-
-
-@pytest.fixture(scope="function", autouse=True)
-async def cleanup_tasks():
- yield
- tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
-    for task in tasks:
-        task.cancel()
- await asyncio.gather(*tasks, return_exceptions=True)
-
-
-@pytest.fixture
-def litellm_provider(app_config):
- config = EmbeddingConfig(
- provider="litellm",
- base_model="openai/text-embedding-3-small",
- base_dimension=1536,
- app=app_config,
- )
-
- return LiteLLMEmbeddingProvider(config)
-
-
-def test_litellm_initialization(litellm_provider):
- assert isinstance(litellm_provider, LiteLLMEmbeddingProvider)
- assert litellm_provider.base_model == "openai/text-embedding-3-small"
- assert litellm_provider.base_dimension == 1536
-
-
-def test_litellm_invalid_provider_initialization(app_config):
- with pytest.raises(ValueError):
- config = EmbeddingConfig(provider="invalid_provider", app=app_config)
- LiteLLMEmbeddingProvider(config)
-
-
-def test_litellm_get_embedding(litellm_provider):
- embedding = litellm_provider.get_embedding("test text")
- assert len(embedding) == 1536
- assert isinstance(embedding, list)
-
-
-@pytest.mark.asyncio
-async def test_litellm_async_get_embedding(litellm_provider):
- with contextlib.suppress(asyncio.CancelledError):
- embedding = await litellm_provider.async_get_embedding("test text")
- assert len(embedding) == 1536
- assert isinstance(embedding, list)
-
-
-def test_litellm_get_embeddings(litellm_provider):
- embeddings = litellm_provider.get_embeddings(["text1", "text2"])
- assert len(embeddings) == 2
- assert all(len(emb) == 1536 for emb in embeddings)
-
-
-@pytest.mark.asyncio
-async def test_litellm_async_get_embeddings(litellm_provider):
- with contextlib.suppress(asyncio.CancelledError):
- embeddings = await litellm_provider.async_get_embeddings(
- ["text1", "text2"]
- )
- assert len(embeddings) == 2
- assert all(len(emb) == 1536 for emb in embeddings)
-
-
-def test_litellm_rerank_model_not_supported(app_config):
- config = EmbeddingConfig(
- provider="litellm",
- base_model="openai/text-embedding-3-small",
- base_dimension=1536,
- rerank_model="some-model",
- app=app_config,
- )
- with pytest.raises(
- ValueError, match="does not support separate reranking"
- ):
- LiteLLMEmbeddingProvider(config)
-
-
-def test_litellm_unsupported_stage(app_config):
- config = EmbeddingConfig(
- provider="litellm",
- base_model="openai/text-embedding-3-small",
- base_dimension=1536,
- app=app_config,
- )
- provider = LiteLLMEmbeddingProvider(config)
- with pytest.raises(
- ValueError, match="LiteLLMEmbeddingProvider only supports search stage"
- ):
- provider.get_embedding(
- "test", stage=LiteLLMEmbeddingProvider.PipeStage.RERANK
- )
-
-
-@pytest.mark.asyncio
-async def test_litellm_async_unsupported_stage(app_config):
- config = EmbeddingConfig(
- provider="litellm",
- base_model="openai/text-embedding-3-small",
- base_dimension=1536,
- app=app_config,
- )
- provider = LiteLLMEmbeddingProvider(config)
- with pytest.raises(
- ValueError, match="LiteLLMEmbeddingProvider only supports search stage"
- ):
- await provider.async_get_embedding(
- "test", stage=LiteLLMEmbeddingProvider.PipeStage.RERANK
- )
-
-
-def test_litellm_get_embedding_error(mocker, litellm_provider):
- mocker.patch.object(
- litellm_provider, "get_embedding", side_effect=Exception("Test error")
- )
- with pytest.raises(Exception, match="Test error"):
- litellm_provider.get_embedding("test")
-
-
-@pytest.mark.asyncio
-async def test_litellm_async_get_embedding_error(mocker, litellm_provider):
- mocker.patch.object(
- litellm_provider,
- "async_get_embedding",
- side_effect=Exception("Test error"),
- )
- with pytest.raises(Exception, match="Test error"):
- await litellm_provider.async_get_embedding("test")
diff --git a/py/tests/core/providers/file/test_file_provider.py b/py/tests/core/providers/file/test_file_provider.py
deleted file mode 100644
index e636197c1..000000000
--- a/py/tests/core/providers/file/test_file_provider.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import io
-import uuid
-
-import pytest
-
-
-@pytest.mark.asyncio
-async def test_store_and_retrieve_file(postgres_db_provider):
- document_id = uuid.uuid4()
- file_name = "test_file.txt"
- file_content = io.BytesIO(b"Test file content")
- file_type = "text/plain"
-
- await postgres_db_provider.store_file(
- document_id, file_name, file_content, file_type
- )
- retrieved_file = await postgres_db_provider.retrieve_file(document_id)
-
- assert retrieved_file is not None
- assert retrieved_file[0] == file_name
- assert retrieved_file[1].read() == b"Test file content"
- assert retrieved_file[2] == len(b"Test file content")
-
-
-@pytest.mark.asyncio
-async def test_delete_file(postgres_db_provider):
- document_id = uuid.uuid4()
- file_name = "test_file.txt"
- file_content = io.BytesIO(b"Test file content")
- file_type = "text/plain"
-
- await postgres_db_provider.store_file(
- document_id, file_name, file_content, file_type
- )
- deleted = await postgres_db_provider.delete_file(document_id)
-
- assert deleted is True
- with pytest.raises(Exception):
- await postgres_db_provider.retrieve_file(document_id)
-
-
-@pytest.mark.asyncio
-async def test_get_files_overview(postgres_db_provider):
- document_ids = [uuid.uuid4() for _ in range(5)]
- file_names = [f"test_file_{i}.txt" for i in range(5)]
- file_contents = [
- io.BytesIO(f"Test file content {i}".encode()) for i in range(5)
- ]
- file_type = "text/plain"
-
- for document_id, file_name, file_content in zip(
- document_ids, file_names, file_contents
- ):
- await postgres_db_provider.store_file(
- document_id, file_name, file_content, file_type
- )
-
- files_overview = await postgres_db_provider.get_files_overview(limit=3)
-
- assert len(files_overview) == 3
- assert all(file["document_id"] in document_ids for file in files_overview)
- assert all(file["file_name"] in file_names for file in files_overview)
-
- filtered_files_overview = await postgres_db_provider.get_files_overview(
- filter_document_ids=[document_ids[0], document_ids[1]],
- filter_file_names=[file_names[0]],
- )
-
- assert len(filtered_files_overview) == 1
- assert filtered_files_overview[0]["document_id"] == document_ids[0]
- assert filtered_files_overview[0]["file_name"] == file_names[0]
diff --git a/py/tests/core/providers/ingestion/test_contextual_embedding.py b/py/tests/core/providers/ingestion/test_contextual_embedding.py
deleted file mode 100644
index 97d8fb047..000000000
--- a/py/tests/core/providers/ingestion/test_contextual_embedding.py
+++ /dev/null
@@ -1,229 +0,0 @@
-import random
-from uuid import UUID
-
-import pytest
-
-from core.base import (
- DocumentType,
- IngestionConfig,
- IngestionStatus,
- RawChunk,
- Vector,
- VectorEntry,
- VectorType,
-)
-from core.main.abstractions import R2RProviders
-from core.main.services.ingestion_service import IngestionService
-from core.providers.ingestion import R2RIngestionConfig, R2RIngestionProvider
-from core.providers.orchestration import SimpleOrchestrationProvider
-from shared.abstractions.ingestion import (
- ChunkEnrichmentSettings,
- ChunkEnrichmentStrategy,
-)
-from shared.api.models.auth.responses import User
-
-
-@pytest.fixture
-def sample_document_id():
- return UUID("12345678-1234-5678-1234-567812345678")
-
-
-@pytest.fixture
-def sample_user():
- return User(
- id=UUID("87654321-8765-4321-8765-432187654321"),
- email="test@example.com",
- is_superuser=True,
- )
-
-
-@pytest.fixture
-def collection_ids():
- return [UUID("12345678-1234-5678-1234-567812345678")]
-
-
-@pytest.fixture
-def chunk_ids():
- return [
- UUID("fce959df-46a2-4983-aa8b-dd1f93777e02"),
- UUID("9a85269c-84cd-4dff-bf21-7bd09974f668"),
- UUID("4b1199b2-2b96-4198-9ded-954c900a23dd"),
- ]
-
-
-@pytest.fixture
-def sample_chunks(sample_document_id, sample_user, collection_ids, chunk_ids):
- return [
- VectorEntry(
- chunk_id=chunk_ids[0],
- document_id=sample_document_id,
- user_id=sample_user.id,
- collection_ids=collection_ids,
- vector=Vector(
- data=[random.random() for _ in range(128)],
- type=VectorType.FIXED,
- length=128,
- ),
- text="This is the first chunk of text.",
- metadata={"chunk_order": 0},
- ),
- VectorEntry(
- chunk_id=chunk_ids[1],
- document_id=sample_document_id,
- user_id=sample_user.id,
- collection_ids=collection_ids,
- vector=Vector(
- data=[random.random() for _ in range(128)],
- type=VectorType.FIXED,
- length=128,
- ),
- text="This is the second chunk with different content.",
- metadata={"chunk_order": 1},
- ),
- VectorEntry(
- chunk_id=chunk_ids[2],
- document_id=sample_document_id,
- user_id=sample_user.id,
- collection_ids=collection_ids,
- vector=Vector(
- data=[random.random() for _ in range(128)],
- type=VectorType.FIXED,
- length=128,
- ),
- text="And this is the third chunk with more information.",
- metadata={"chunk_order": 2},
- ),
- ]
-
-
-@pytest.fixture
-def enrichment_settings():
- return ChunkEnrichmentSettings(
- enable_chunk_enrichment=True,
- strategies=[
- ChunkEnrichmentStrategy.NEIGHBORHOOD,
- ChunkEnrichmentStrategy.SEMANTIC,
- ],
- backward_chunks=1,
- forward_chunks=1,
- semantic_neighbors=2,
- semantic_similarity_threshold=0.7,
- )
-
-
-@pytest.fixture
-def r2r_ingestion_provider(app_config):
- return R2RIngestionProvider(R2RIngestionConfig(app=app_config))
-
-
-@pytest.fixture
-def orchestration_provider(orchestration_config):
- return SimpleOrchestrationProvider(orchestration_config)
-
-
-@pytest.fixture
-def r2r_providers(
- r2r_ingestion_provider,
- postgres_db_provider,
- litellm_provider_128,
- r2r_auth_provider,
- litellm_completion_provider,
- orchestration_provider,
- local_logging_provider,
-):
- return R2RProviders(
- ingestion=r2r_ingestion_provider,
- database=postgres_db_provider,
- embedding=litellm_provider_128,
- auth=r2r_auth_provider,
- llm=litellm_completion_provider,
- orchestration=orchestration_provider,
- logging=local_logging_provider,
- )
-
-
-@pytest.fixture
-def ingestion_config(app_config, enrichment_settings):
- return IngestionConfig(
- app=app_config, chunk_enrichment_settings=enrichment_settings
- )
-
-
-@pytest.fixture
-async def ingestion_service(r2r_providers, ingestion_config):
- # You'll need to mock your dependencies here
- service = IngestionService(
- providers=r2r_providers,
- config=ingestion_config,
- pipes=[],
- pipelines=[],
- agents=[],
- run_manager=None,
- logging_connection=None,
- )
- return service
-
-
-@pytest.mark.asyncio
-async def test_chunk_enrichment_basic(
-    sample_chunks, ingestion_service, sample_document_id, sample_user
-):
- # Test basic chunk enrichment functionality
-
-    # Ingest chunks via ingress; this just adds the document info to the table
- await ingestion_service.ingest_chunks_ingress(
- document_id=sample_document_id,
- chunks=sample_chunks,
- metadata={},
- user=sample_user,
- )
-
- # upsert entries
- await ingestion_service.providers.database.upsert_entries(sample_chunks)
-
- # enrich chunks
- await ingestion_service.chunk_enrichment(sample_document_id)
-
- # document chunks
- list_document_chunks = (
- await ingestion_service.providers.database.list_document_chunks(
- sample_document_id
- )
- )
-
- assert len(list_document_chunks["results"]) == len(sample_chunks)
-
- for document_chunk in list_document_chunks["results"]:
- assert (
- document_chunk["metadata"]["chunk_enrichment_status"] == "success"
- )
- assert (
- document_chunk["metadata"]["original_text"]
- == sample_chunks[document_chunk["metadata"]["chunk_order"]].text
- )
-
-
-# Other tests
-# TODO: Implement in services/test_ingestion_service.py
-
-# test_enriched_chunk_content:
-# Ingests chunks, enriches them, then verifies each chunk in DB has metadata containing both 'original_text' and 'chunk_enrichment_status' (success/failed)
-
-# test_neighborhood_strategy:
-# Tests _get_enriched_chunk_text() on middle chunk (idx 1) with NEIGHBORHOOD strategy to verify it incorporates text from chunks before/after it
-
-# test_semantic_strategy:
-# Sets ChunkEnrichmentStrategy.SEMANTIC, ingests chunks, then enriches them using semantic similarity to find and incorporate related chunks' content
-
-# test_error_handling:
-# Attempts chunk_enrichment() with non-existent UUID('00000000-0000-0000-0000-000000000000') to verify proper exception handling
-
-# test_empty_chunks:
-# Attempts to ingest_chunks_ingress() with empty chunks list to verify it raises appropriate exception rather than processing empty data
-
-# test_concurrent_processing:
-# Creates 200 RawChunks ("Chunk number {0-199}"), ingests and enriches them all to verify concurrent processing handles large batch correctly
-
-# test_vector_storage:
-# Ingests chunks, enriches them, then verifies get_document_vectors() returns vectors with correct structure including vector data and chunk_id fields
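
Of the follow-up tests sketched above, the error-handling case is the most mechanical; a minimal version might look like the following (it assumes chunk_enrichment raises for a document id that was never ingested, which the comment implies but this file does not verify):

    from uuid import UUID

    import pytest

    @pytest.mark.asyncio
    async def test_error_handling(ingestion_service):
        # Non-existent document id from the comment above; the concrete
        # exception type is an assumption, so the check is kept broad.
        missing_id = UUID("00000000-0000-0000-0000-000000000000")
        with pytest.raises(Exception):
            await ingestion_service.chunk_enrichment(missing_id)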
diff --git a/py/tests/core/providers/kg/__init__.py b/py/tests/core/providers/kg/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/core/providers/kg/test_kg_logic.py b/py/tests/core/providers/kg/test_kg_logic.py
deleted file mode 100644
index 59d4d4596..000000000
--- a/py/tests/core/providers/kg/test_kg_logic.py
+++ /dev/null
@@ -1,384 +0,0 @@
-# tests/core/providers/kg/test_kg_logic.py
-import random
-import uuid
-
-import pytest
-
-from core.base import Community, Entity, KGExtraction, Relationship
-from shared.abstractions.vector import VectorQuantizationType
-
-
-@pytest.fixture(scope="function")
-def collection_id():
- return uuid.UUID("122fdf6a-e116-546b-a8f6-e4cb2e2c0a09")
-
-
-@pytest.fixture(scope="function")
-def document_id():
- return uuid.UUID("9fbe403b-c11c-5aae-8ade-ef22980c3ad1")
-
-
-@pytest.fixture(scope="function")
-def chunk_ids():
- return [
- uuid.UUID("32ff6daf-6e67-44fa-b2a9-19384f5d9d19"),
- uuid.UUID("42ff6daf-6e67-44fa-b2a9-19384f5d9d19"),
- ]
-
-
-@pytest.fixture(scope="function")
-def embedding_dimension():
- return 128
-
-
-@pytest.fixture(scope="function")
-def vector_quantization_type():
- return VectorQuantizationType.FP32
-
-
-@pytest.fixture(scope="function")
-def embedding_vectors(embedding_dimension):
- random.seed(42)
- return [
- [random.random() for _ in range(embedding_dimension)] for _ in range(2)
- ]
-
-
-@pytest.fixture(scope="function")
-def entities_raw_list(document_id, chunk_ids):
- return [
- Entity(
- name="Entity1",
- description="Description1",
- category="Category1",
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr1": "value1", "attr2": "value2"},
- ),
- Entity(
- name="Entity2",
- description="Description2",
- category="Category2",
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr3": "value3", "attr4": "value4"},
- ),
- ]
-
-
-@pytest.fixture(scope="function")
-def entities_list(chunk_ids, document_id, embedding_vectors):
- return [
- Entity(
- name="Entity1",
- description="Description1",
- chunk_ids=chunk_ids,
- document_id=document_id,
- description_embedding=embedding_vectors[0],
- ),
- Entity(
- name="Entity2",
- description="Description2",
- chunk_ids=chunk_ids,
- document_id=document_id,
- description_embedding=embedding_vectors[1],
- ),
- ]
-
-
-@pytest.fixture(scope="function")
-def relationships_raw_list(embedding_vectors, chunk_ids, document_id):
- return [
- Relationship(
- subject="Entity1",
- predicate="predicate1",
- object="object1",
- weight=1.0,
- description="description1",
- embedding=embedding_vectors[0],
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr1": "value1", "attr2": "value2"},
- ),
- Relationship(
- subject="Entity2",
- predicate="predicate2",
- object="object2",
- weight=1.0,
- description="description2",
- embedding=embedding_vectors[1],
- chunk_ids=chunk_ids,
- document_id=document_id,
- attributes={"attr3": "value3", "attr4": "value4"},
- ),
- ]
-
-
-@pytest.fixture(scope="function")
-def communities_list(entities_list, relationships_raw_list):
- return [
- Community(
- name="Community1",
- description="Description1",
- entities=[entities_list[0]],
- relationships=[relationships_raw_list[0]],
- ),
- Community(
- name="Community2",
- description="Description2",
- entities=[entities_list[1]],
- relationships=[relationships_raw_list[1]],
- ),
- ]
-
-
-@pytest.fixture(scope="function")
-def community_table_info(collection_id):
- return [
- ("Entity1", 1, None, 0, False, [1, 2], collection_id),
- ("Entity2", 2, None, 0, False, [1, 2], collection_id),
- ]
-
-
-@pytest.fixture(scope="function")
-def kg_extractions(
- chunk_ids, entities_raw_list, relationships_raw_list, document_id
-):
- return [
- KGExtraction(
- relationships=relationships_raw_list,
- document_id=document_id,
- )
- ]
-
-
-@pytest.fixture(scope="function")
-def community_list(embedding_vectors, collection_id):
- return [
- Community(
- community_number=1,
- level=0,
- collection_id=collection_id,
- name="Community Report 1",
- summary="Summary of the community report",
- rating=8.0,
- rating_explanation="Rating explanation of the community report",
- findings=["Findings of the community report"],
- embedding=embedding_vectors[0],
- ),
- Community(
- community_number=2,
- level=0,
- collection_id=collection_id,
- name="Community Report",
- summary="Summary of the community report",
- rating=8.0,
- rating_explanation="Rating explanation of the community report",
- findings=["Findings of the community report"],
- embedding=embedding_vectors[1],
- ),
- ]
-
-
-@pytest.mark.asyncio
-async def test_create_tables(
- postgres_db_provider,
- collection_id,
- embedding_dimension,
- vector_quantization_type,
-):
- assert await postgres_db_provider.get_entities(collection_id) == {
- "entities": [],
- "total_entries": 0,
- }
- assert await postgres_db_provider.get_relationships(collection_id) == {
- "relationships": [],
- "total_entries": 0,
- }
- assert await postgres_db_provider.get_communities(collection_id) == {
- "communities": [],
- "total_entries": 0,
- }
-
-
-@pytest.mark.asyncio
-async def test_add_entities_raw(
- postgres_db_provider, entities_raw_list, collection_id
-):
- await postgres_db_provider.add_entities(
- entities_raw_list, table_name="chunk_entity"
- )
- entities = await postgres_db_provider.get_entities(
- collection_id, entity_table_name="chunk_entity"
- )
- assert entities["entities"][0].name == "Entity1"
- assert entities["entities"][1].name == "Entity2"
- assert len(entities["entities"]) == 2
- assert entities["total_entries"] == 2
-
-
-@pytest.mark.asyncio
-async def test_add_entities(
- postgres_db_provider, entities_list, collection_id
-):
- await postgres_db_provider.graph_handler.add_entities(
- entities_list, table_name="entity"
- )
- entities = await postgres_db_provider.get_entities(
- collection_id, entity_table_name="entity"
- )
- assert entities["entities"][0].name == "Entity1"
- assert entities["entities"][1].name == "Entity2"
- assert len(entities["entities"]) == 2
- assert entities["total_entries"] == 2
-
-
-@pytest.mark.asyncio
-async def test_add_relationships(
- postgres_db_provider, relationships_raw_list, collection_id
-):
- await postgres_db_provider.graph_handler.add_relationships(
- relationships_raw_list, table_name="relationship"
- )
- relationships = await postgres_db_provider.get_relationships(collection_id)
- assert relationships["relationships"][0].subject == "Entity1"
- assert relationships["relationships"][1].subject == "Entity2"
- assert len(relationships["relationships"]) == 2
- assert relationships["total_entries"] == 2
-
-
-@pytest.mark.asyncio
-async def test_get_entity_map(
- postgres_db_provider,
- entities_raw_list,
- relationships_raw_list,
- document_id,
-):
- await postgres_db_provider.add_entities(
- entities_raw_list, table_name="chunk_entity"
- )
- entity_map = await postgres_db_provider.get_entity_map(0, 2, document_id)
- assert entity_map["Entity1"]["entities"][0].name == "Entity1"
- assert entity_map["Entity2"]["entities"][0].name == "Entity2"
-
- await postgres_db_provider.graph_handler.add_relationships(
- relationships_raw_list
- )
- entity_map = await postgres_db_provider.get_entity_map(0, 2, document_id)
- assert entity_map["Entity1"]["entities"][0].name == "Entity1"
- assert entity_map["Entity2"]["entities"][0].name == "Entity2"
-
- assert entity_map["Entity1"]["relationships"][0].subject == "Entity1"
- assert entity_map["Entity2"]["relationships"][0].subject == "Entity2"
-
-
-@pytest.mark.asyncio
-async def test_upsert_embeddings(
- postgres_db_provider, collection_id, entities_list
-):
- table_name = "entity"
-
- entities_list_to_upsert = [
- (
- entity.name,
- entity.description,
- str(entity.description_embedding),
- entity.chunk_ids,
- entity.document_id,
- )
- for entity in entities_list
- ]
-
- await postgres_db_provider.add_entities(
- entities_list_to_upsert, table_name
- )
-
- entities = await postgres_db_provider.get_entities(
- collection_id, entity_table_name=table_name
- )
- assert entities["entities"][0].name == "Entity1"
- assert entities["entities"][1].name == "Entity2"
-
-
-@pytest.mark.asyncio
-async def test_get_all_relationships(
- postgres_db_provider, collection_id, relationships_raw_list
-):
- await postgres_db_provider.graph_handler.add_relationships(
- relationships_raw_list
- )
- relationships = await postgres_db_provider.get_relationships(collection_id)
- assert relationships["relationships"][0].subject == "Entity1"
- assert relationships["relationships"][1].subject == "Entity2"
- assert len(relationships["relationships"]) == 2
-
-
-@pytest.mark.asyncio
-async def test_get_communities(
- postgres_db_provider, collection_id, community_list
-):
- await postgres_db_provider.add_community(community_list[0])
- await postgres_db_provider.add_community(community_list[1])
- communities = await postgres_db_provider.get_communities(collection_id)
- assert communities["communities"][0].name == "Community Report 1"
- assert len(communities["communities"]) == 2
- assert communities["total_entries"] == 2
-
-
-@pytest.fixture(scope="function")
-def leiden_params_1():
- return {
- "resolution": 1.0,
- "max_cluster_size": 1000,
- "random_seed": 42,
- }
-
-
-@pytest.mark.asyncio
-async def test_perform_graph_clustering(
- postgres_db_provider,
- collection_id,
- leiden_params_1,
- entities_list,
- relationships_raw_list,
-):
-
-    # add entities and relationships
- await postgres_db_provider.add_entities(entities_list, table_name="entity")
- await postgres_db_provider.graph_handler.add_relationships(
- relationships_raw_list, table_name="relationship"
- )
-
- num_communities = await postgres_db_provider.perform_graph_clustering(
- collection_id, leiden_params_1
- )
- assert num_communities
-
-
-@pytest.mark.asyncio
-async def test_get_community_details(
- postgres_db_provider,
- entities_list,
- relationships_raw_list,
- collection_id,
- community_list,
- community_table_info,
-):
-
- await postgres_db_provider.add_entities(entities_list, table_name="entity")
- await postgres_db_provider.graph_handler.add_relationships(
- relationships_raw_list, table_name="relationship"
- )
- await postgres_db_provider.add_community_info(community_table_info)
- await postgres_db_provider.add_community(community_list[0])
-
- community_level, entities, relationships = (
- await postgres_db_provider.get_community_details(
- community_number=1, collection_id=collection_id
- )
- )
-
- assert community_level == 0
- # TODO: change these to objects
- assert entities[0].name == "Entity1"
- assert relationships[0].subject == "Entity1"
diff --git a/py/tests/core/providers/llm/test_litellm_llm_provider.py b/py/tests/core/providers/llm/test_litellm_llm_provider.py
deleted file mode 100644
index 00c916a34..000000000
--- a/py/tests/core/providers/llm/test_litellm_llm_provider.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import asyncio
-import contextlib
-
-import pytest
-
-from core import CompletionConfig, GenerationConfig
-from core.providers import LiteLLMCompletionProvider
-
-
-def test_litellm_initialization(litellm_completion_provider):
- assert isinstance(litellm_completion_provider, LiteLLMCompletionProvider)
-
-
-def test_litellm_invalid_provider_initialization():
- with pytest.raises(ValueError):
- config = CompletionConfig(provider="invalid_provider")
- LiteLLMCompletionProvider(config)
-
-
-@pytest.mark.asyncio
-async def test_litellm_async_completion(litellm_completion_provider):
- generation_config = GenerationConfig(model="gpt-3.5-turbo")
- messages = [{"role": "user", "content": "Hello!"}]
-
- with contextlib.suppress(asyncio.CancelledError):
- response = await litellm_completion_provider.aget_completion(
- messages, generation_config
- )
- assert len(response.choices) > 0
diff --git a/py/tests/core/providers/logging/conftest.py b/py/tests/core/providers/logging/conftest.py
deleted file mode 100644
index 56cecfb80..000000000
--- a/py/tests/core/providers/logging/conftest.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import asyncio
-
-import pytest
-
-
-@pytest.fixture(scope="session", autouse=True)
-def event_loop_policy():
- asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
-
-
-@pytest.fixture(scope="function", autouse=True)
-async def cleanup_tasks():
- yield
- tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
- [task.cancel() for task in tasks]
- await asyncio.gather(*tasks, return_exceptions=True)
diff --git a/py/tests/core/providers/logging/test_chat_logging_provider.py b/py/tests/core/providers/logging/test_chat_logging_provider.py
deleted file mode 100644
index abffbc341..000000000
--- a/py/tests/core/providers/logging/test_chat_logging_provider.py
+++ /dev/null
@@ -1,311 +0,0 @@
-import pytest
-
-from core import Message
-
-
-@pytest.mark.asyncio
-async def test_create_conversation(local_logging_provider):
- conversation_id = await local_logging_provider.create_conversation()
- assert isinstance(conversation_id, str)
- assert len(conversation_id) > 0
-
-
-@pytest.mark.asyncio
-async def test_add_message(local_logging_provider):
- conversation_id = await local_logging_provider.create_conversation()
- message_id = await local_logging_provider.add_message(
- conversation_id, Message(role="user", content="Hello")
- )
- assert isinstance(message_id, str)
- assert len(message_id) > 0
-
-
-@pytest.mark.asyncio
-async def test_get_conversation(local_logging_provider):
- conversation_id = await local_logging_provider.create_conversation()
- messages = [
- Message(role="user", content="Hello"),
- Message(role="assistant", content="Hi there!"),
- ]
- for message in messages:
- await local_logging_provider.add_message(conversation_id, message)
-
- retrieved_messages = await local_logging_provider.get_conversation(
- conversation_id
- )
- assert len(retrieved_messages) == len(messages)
- for original, retrieved in zip(messages, retrieved_messages):
- assert original.role == retrieved[1].role
- assert original.content == retrieved[1].content
-
-
-@pytest.mark.asyncio
-async def test_edit_message(local_logging_provider):
- conversation_id = await local_logging_provider.create_conversation()
- message_id = await local_logging_provider.add_message(
- conversation_id, Message(role="user", content="Hello")
- )
- new_message_id, new_branch_id = await local_logging_provider.edit_message(
- message_id, "Hello, edited"
- )
- assert isinstance(new_message_id, str)
- assert len(new_message_id) > 0
- assert isinstance(new_branch_id, str)
- assert len(new_branch_id) > 0
-
- retrieved_messages = await local_logging_provider.get_conversation(
- conversation_id, new_branch_id
- )
- assert len(retrieved_messages) == 1
- assert retrieved_messages[0][1].content == "Hello, edited"
-
-
-# @pytest.mark.asyncio
-# async def test_branches_overview(local_logging_provider):
-# conversation_id = await local_logging_provider.create_conversation()
-# message_id = await local_logging_provider.add_message(
-# conversation_id, Message(role="user", content="Hello")
-# )
-# await local_logging_provider.edit_message(message_id, "Hello, edited")
-
-# branches = await local_logging_provider.branches_overview(conversation_id)
-# assert len(branches) == 2
-# assert branches[0]["branch_point_id"] is None
-# assert branches[1]["branch_point_id"] == message_id
-
-
-@pytest.mark.asyncio
-async def test_get_next_and_prev_branch(local_logging_provider):
- conversation_id = await local_logging_provider.create_conversation()
- message_id = await local_logging_provider.add_message(
- conversation_id, Message(role="user", content="Hello")
- )
- _, branch_id_1 = await local_logging_provider.edit_message(
- message_id, "Hello, edited 1"
- )
- _, branch_id_2 = await local_logging_provider.edit_message(
- message_id, "Hello, edited 2"
- )
-
- next_branch = await local_logging_provider.get_next_branch(branch_id_1)
- assert next_branch == branch_id_2
-
- prev_branch = await local_logging_provider.get_prev_branch(branch_id_2)
- assert prev_branch == branch_id_1
-
-
-@pytest.mark.asyncio
-async def test_branch_at_message(local_logging_provider):
- conversation_id = await local_logging_provider.create_conversation()
- message_id_1 = await local_logging_provider.add_message(
- conversation_id, Message(role="user", content="Hello")
- )
- message_id_2 = await local_logging_provider.add_message(
- conversation_id,
- Message(role="assistant", content="Hi there!"),
- message_id_1,
- )
-
- branch_id = await local_logging_provider.branch_at_message(message_id_1)
- assert isinstance(branch_id, str)
- assert len(branch_id) > 0
-
- retrieved_messages = await local_logging_provider.get_conversation(
- conversation_id, branch_id
- )
- assert len(retrieved_messages) == 1
- assert retrieved_messages[0][1].content == "Hello"
-
-
-@pytest.mark.asyncio
-async def test_edit_message_in_middle(local_logging_provider):
- # Create a conversation with multiple messages
- conversation_id = await local_logging_provider.create_conversation()
-
- # Add initial messages
- message_id_1 = await local_logging_provider.add_message(
- conversation_id, Message(role="user", content="Hello")
- )
- message_id_2 = await local_logging_provider.add_message(
- conversation_id,
- Message(role="assistant", content="Hi there!"),
- message_id_1,
- )
- message_id_3 = await local_logging_provider.add_message(
- conversation_id,
- Message(role="user", content="How are you?"),
- message_id_2,
- )
- message_id_4 = await local_logging_provider.add_message(
- conversation_id,
- Message(role="assistant", content="I'm doing well, thanks!"),
- message_id_3,
- )
-
- # Edit message 2
- new_message_id, new_branch_id = await local_logging_provider.edit_message(
- message_id_2, "Greetings!"
- )
-
- # Retrieve messages in the new branch
- retrieved_messages = await local_logging_provider.get_conversation(
- conversation_id, new_branch_id
- )
-
- print("retrieved_messages = ", retrieved_messages)
- # Verify that messages after the edited message are not present
- assert len(retrieved_messages) == 2
- assert retrieved_messages[0][1].content == "Hello"
- assert retrieved_messages[0][1].role == "user"
- assert retrieved_messages[1][1].content == "Greetings!"
- assert retrieved_messages[1][1].role == "assistant"
-
-
-@pytest.mark.asyncio
-async def test_multiple_branches_from_same_message(local_logging_provider):
- # Create a conversation with initial messages
- conversation_id = await local_logging_provider.create_conversation()
- message_id_1 = await local_logging_provider.add_message(
- conversation_id, Message(role="user", content="Tell me a joke.")
- )
- message_id_2 = await local_logging_provider.add_message(
- conversation_id,
- Message(
- role="assistant", content="Why did the chicken cross the road?"
- ),
- message_id_1,
- )
-
- # Create first branch
- new_message_id_1, new_branch_id_1 = (
- await local_logging_provider.edit_message(
- message_id_2, "Knock, knock!"
- )
- )
-
- # Create second branch
- new_message_id_2, new_branch_id_2 = (
- await local_logging_provider.edit_message(
- message_id_2,
- "What do you call a bear with no teeth? A gummy bear!",
- )
- )
-
- # Retrieve messages for the first new branch
- retrieved_messages_1 = await local_logging_provider.get_conversation(
- conversation_id, new_branch_id_1
- )
-
- # Retrieve messages for the second new branch
- retrieved_messages_2 = await local_logging_provider.get_conversation(
- conversation_id, new_branch_id_2
- )
-
- # Verify first branch messages
- assert len(retrieved_messages_1) == 2
- print("retrieved_messages_1[0] = ", retrieved_messages_1[0])
- assert retrieved_messages_1[0][1].content == "Tell me a joke."
- assert retrieved_messages_1[1][1].content == "Knock, knock!"
-
- # Verify second branch messages
- assert len(retrieved_messages_2) == 2
- assert retrieved_messages_2[0][1].content == "Tell me a joke."
- assert (
- retrieved_messages_2[1][1].content
- == "What do you call a bear with no teeth? A gummy bear!"
- )
-
-
-@pytest.mark.asyncio
-async def test_navigate_between_branches(local_logging_provider):
- # Create a conversation and add a message
- conversation_id = await local_logging_provider.create_conversation()
- message_id = await local_logging_provider.add_message(
- conversation_id,
- Message(role="user", content="What's the weather like?"),
- )
-
- # Create multiple branches by editing the message
- _, branch_id_1 = await local_logging_provider.edit_message(
- message_id, "What's the weather in New York?"
- )
- _, branch_id_2 = await local_logging_provider.edit_message(
- message_id, "What's the weather in London?"
- )
- _, branch_id_3 = await local_logging_provider.edit_message(
- message_id, "What's the weather in Tokyo?"
- )
-
- # Test navigating between branches
- next_branch = await local_logging_provider.get_next_branch(branch_id_1)
- assert next_branch == branch_id_2
-
- next_branch = await local_logging_provider.get_next_branch(branch_id_2)
- assert next_branch == branch_id_3
-
- prev_branch = await local_logging_provider.get_prev_branch(branch_id_3)
- assert prev_branch == branch_id_2
-
- prev_branch = await local_logging_provider.get_prev_branch(branch_id_2)
- assert prev_branch == branch_id_1
-
-
-# @pytest.mark.asyncio
-# async def test_messages_at_branch_point(local_logging_provider):
-# # Create a conversation with initial messages
-# conversation_id = await local_logging_provider.create_conversation()
-# user_message_id = await local_logging_provider.add_message(
-# conversation_id, Message(role="user", content="What's the capital of France?")
-# )
-# assistant_message_id = await local_logging_provider.add_message(
-# conversation_id, Message(role="assistant", content="The capital of France is Paris."), user_message_id
-# )
-
-# # Create multiple branches by editing the assistant's message
-# _, branch_id_1 = await local_logging_provider.edit_message(
-# assistant_message_id, "It's Paris."
-# )
-# _, branch_id_2 = await local_logging_provider.edit_message(
-# assistant_message_id, "Paris is the capital city of France."
-# )
-
-# # List all branches
-# branches = await local_logging_provider.branches_overview(conversation_id)
-
-# # Collect messages at the branching point
-# messages_at_branch_point = []
-# for branch in branches:
-# print('branch = ', branch)
-# if branch["branch_point_id"] == assistant_message_id:
-# # Get the message content at the branching point
-# content = Message.parse_raw(branch["content"]).content
-# messages_at_branch_point.append(content)
-
-# # Verify that all alternative messages are available
-# assert len(messages_at_branch_point) == 2
-# assert "It's Paris." in messages_at_branch_point
-# assert "Paris is the capital city of France." in messages_at_branch_point
-
-
-@pytest.mark.asyncio
-async def test_delete_branch(local_logging_provider):
- # Create a conversation and branches
- conversation_id = await local_logging_provider.create_conversation()
- message_id = await local_logging_provider.add_message(
- conversation_id,
- Message(role="user", content="Explain quantum physics."),
- )
- _, branch_id = await local_logging_provider.edit_message(
- message_id, "Explain quantum physics in simple terms."
- )
-
- # Delete the conversation, which should remove its branches as well
- await local_logging_provider.delete_conversation(conversation_id)
-
- # Try to retrieve the deleted branch
- retrieved_messages = await local_logging_provider.get_conversation(
- conversation_id, branch_id
- )
-
- # Verify that the branch no longer exists
- assert retrieved_messages == []
diff --git a/py/tests/core/providers/logging/test_logging_provider.py b/py/tests/core/providers/logging/test_logging_provider.py
deleted file mode 100644
index aa9441ae3..000000000
--- a/py/tests/core/providers/logging/test_logging_provider.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import logging
-import os
-import uuid
-from uuid import UUID
-
-import pytest
-
-from core import (
- PersistentLoggingConfig,
- SqlitePersistentLoggingProvider,
- generate_id,
-)
-
-logger = logging.getLogger()
-
-
-@pytest.mark.asyncio
-async def test_logging(local_logging_provider):
- run_id = generate_id()
- await local_logging_provider.log(run_id, "key", "value")
- logs = await local_logging_provider.get_logs([run_id])
- assert len(logs) == 1
- assert logs[0]["key"] == "key"
- assert logs[0]["value"] == "value"
-
-
-@pytest.mark.asyncio
-async def test_multiple_log_entries(local_logging_provider):
- run_ids = [generate_id() for _ in range(3)]
- entries = [
- (run_id, f"key_{i}", f"value_{i}") for i, run_id in enumerate(run_ids)
- ]
- for run_id, key, value in entries:
- await local_logging_provider.log(run_id, key, value)
-
- logs = await local_logging_provider.get_logs(run_ids)
- assert len(logs) == 3, f"Expected 3 logs, got {len(logs)}"
-
- for log in logs:
- run_id = log.get("run_id")
- assert run_id is not None, f"Log entry is missing 'run_id': {log}"
-
- if isinstance(run_id, str):
- run_id = UUID(run_id)
-
- matching_entries = [entry for entry in entries if entry[0] == run_id]
- assert (
- len(matching_entries) == 1
- ), f"No matching entry found for run_id {run_id}"
-
- selected_entry = matching_entries[0]
- assert log["key"] == selected_entry[1]
- assert log["value"] == selected_entry[2]
-
- # Additional check to ensure all entries were logged
- logged_ids = set(
- (
- UUID(log["run_id"])
- if isinstance(log["run_id"], str)
- else log["run_id"]
- )
- for log in logs
- )
- entry_ids = set(entry[0] for entry in entries)
- assert (
- logged_ids == entry_ids
- ), f"Mismatch between logged IDs {logged_ids} and entry IDs {entry_ids}"
-
-
-@pytest.mark.asyncio
-async def test_log_retrieval_limit(local_logging_provider):
- run_ids = []
- for i in range(10):
- run_ids.append(generate_id())
- await local_logging_provider.log(run_ids[-1], f"key_{i}", f"value_{i}")
-
- logs = await local_logging_provider.get_logs(run_ids[:5])
- assert len(logs) == 5
-
-
-@pytest.mark.asyncio
-async def test_specific_run_type_retrieval(local_logging_provider):
- run_id_0, run_id_1 = generate_id(), generate_id()
-
- await local_logging_provider.log(run_id_0, "run_type", "RETRIEVAL")
- await local_logging_provider.log(run_id_0, "key_0", "value_0")
- await local_logging_provider.log(run_id_1, "run_type", "MANAGEMENT")
- await local_logging_provider.log(run_id_1, "key_1", "value_1")
-
- # Log info for both run IDs
- await local_logging_provider.info_log(run_id_0, "RETRIEVAL", uuid.uuid4())
- await local_logging_provider.info_log(run_id_1, "MANAGEMENT", uuid.uuid4())
-
- run_info = await local_logging_provider.get_info_logs(
- run_type_filter="RETRIEVAL"
- )
- assert (
- len(run_info) == 1
- ), f"Expected 1 'RETRIEVAL' log, got {len(run_info)}"
- assert (
- run_info[0].run_id == run_id_0
- ), f"Expected run_id {run_id_0}, got {run_info[0].run_id}"
-
- logs = await local_logging_provider.get_logs(
- [run.run_id for run in run_info]
- )
- assert len(logs) == 2, f"Expected 2 logs for run_id_0, got {len(logs)}"
- assert all(
- log["run_id"] == run_id_0 for log in logs
- ), "All logs should be for run_id_0"
- assert any(
- log["key"] == "run_type" and log["value"] == "RETRIEVAL"
- for log in logs
- ), "Should have a 'RETRIEVAL' log"
- assert any(
- log["key"] == "key_0" and log["value"] == "value_0" for log in logs
- ), "Should have a 'key_0' log"
-
-
-@pytest.mark.asyncio
-async def test_info_logging(local_logging_provider):
- run_id = generate_id()
- user_id = uuid.uuid4()
- run_type = "RETRIEVAL"
- await local_logging_provider.info_log(run_id, run_type, user_id)
- info_logs = await local_logging_provider.get_info_logs()
- assert len(info_logs) == 1
- assert info_logs[0].run_id == run_id
- assert info_logs[0].run_type == run_type
- assert info_logs[0].user_id == user_id
-
-
-@pytest.mark.asyncio
-async def test_get_info_logs_with_user_filter(local_logging_provider):
- user_id_1, user_id_2 = uuid.uuid4(), uuid.uuid4()
- await local_logging_provider.info_log(
- generate_id(), "RETRIEVAL", user_id_1
- )
- await local_logging_provider.info_log(
- generate_id(), "MANAGEMENT", user_id_2
- )
-
- info_logs = await local_logging_provider.get_info_logs(
- user_ids=[user_id_1]
- )
- assert len(info_logs) == 1
- assert info_logs[0].user_id == user_id_1
-
- info_logs = await local_logging_provider.get_info_logs(
- run_type_filter="MANAGEMENT", user_ids=[user_id_2]
- )
- assert len(info_logs) == 1
- assert info_logs[0].user_id == user_id_2
- assert info_logs[0].run_type == "MANAGEMENT"
diff --git a/py/tests/core/providers/logging/test_postgress_logging_provider.py b/py/tests/core/providers/logging/test_postgress_logging_provider.py
deleted file mode 100644
index be5c41718..000000000
--- a/py/tests/core/providers/logging/test_postgress_logging_provider.py
+++ /dev/null
@@ -1,311 +0,0 @@
-import pytest
-
-from core import Message
-
-
-@pytest.mark.asyncio
-async def test_create_conversation(postgres_db_provider):
- conversation_id = await postgres_db_provider.create_conversation()
- assert isinstance(conversation_id, str)
- assert len(conversation_id) > 0
-
-
-@pytest.mark.asyncio
-async def test_add_message(postgres_db_provider):
- conversation_id = await postgres_db_provider.create_conversation()
- message_id = await postgres_db_provider.add_message(
- conversation_id, Message(role="user", content="Hello")
- )
- assert isinstance(message_id, str)
- assert len(message_id) > 0
-
-
-# @pytest.mark.asyncio
-# async def test_get_conversation(postgres_logging_provider):
-# conversation_id = await postgres_logging_provider.create_conversation()
-# messages = [
-# Message(role="user", content="Hello"),
-# Message(role="assistant", content="Hi there!"),
-# ]
-# for message in messages:
-# await postgres_logging_provider.add_message(conversation_id, message)
-
-# retrieved_messages = await postgres_logging_provider.get_conversation(
-# conversation_id
-# )
-# assert len(retrieved_messages) == len(messages)
-# for original, retrieved in zip(messages, retrieved_messages):
-# assert original.role == retrieved[1].role
-# assert original.content == retrieved[1].content
-
-
-# @pytest.mark.asyncio
-# async def test_edit_message(postgres_logging_provider):
-# conversation_id = await postgres_logging_provider.create_conversation()
-# message_id = await postgres_logging_provider.add_message(
-# conversation_id, Message(role="user", content="Hello")
-# )
-# new_message_id, new_branch_id = await postgres_logging_provider.edit_message(
-# message_id, "Hello, edited"
-# )
-# assert isinstance(new_message_id, str)
-# assert len(new_message_id) > 0
-# assert isinstance(new_branch_id, str)
-# assert len(new_branch_id) > 0
-
-# retrieved_messages = await postgres_logging_provider.get_conversation(
-# conversation_id, new_branch_id
-# )
-# assert len(retrieved_messages) == 1
-# assert retrieved_messages[0][1].content == "Hello, edited"
-
-
-# # @pytest.mark.asyncio
-# # async def test_branches_overview(postgres_logging_provider):
-# # conversation_id = await postgres_logging_provider.create_conversation()
-# # message_id = await postgres_logging_provider.add_message(
-# # conversation_id, Message(role="user", content="Hello")
-# # )
-# # await postgres_logging_provider.edit_message(message_id, "Hello, edited")
-
-# # branches = await postgres_logging_provider.branches_overview(conversation_id)
-# # assert len(branches) == 2
-# # assert branches[0]["branch_point_id"] is None
-# # assert branches[1]["branch_point_id"] == message_id
-
-
-# @pytest.mark.asyncio
-# async def test_get_next_and_prev_branch(postgres_logging_provider):
-# conversation_id = await postgres_logging_provider.create_conversation()
-# message_id = await postgres_logging_provider.add_message(
-# conversation_id, Message(role="user", content="Hello")
-# )
-# _, branch_id_1 = await postgres_logging_provider.edit_message(
-# message_id, "Hello, edited 1"
-# )
-# _, branch_id_2 = await postgres_logging_provider.edit_message(
-# message_id, "Hello, edited 2"
-# )
-
-# next_branch = await postgres_logging_provider.get_next_branch(branch_id_1)
-# assert next_branch == branch_id_2
-
-# prev_branch = await postgres_logging_provider.get_prev_branch(branch_id_2)
-# assert prev_branch == branch_id_1
-
-
-# @pytest.mark.asyncio
-# async def test_branch_at_message(postgres_logging_provider):
-# conversation_id = await postgres_logging_provider.create_conversation()
-# message_id_1 = await postgres_logging_provider.add_message(
-# conversation_id, Message(role="user", content="Hello")
-# )
-# message_id_2 = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(role="assistant", content="Hi there!"),
-# message_id_1,
-# )
-
-# branch_id = await postgres_logging_provider.branch_at_message(message_id_1)
-# assert isinstance(branch_id, str)
-# assert len(branch_id) > 0
-
-# retrieved_messages = await postgres_logging_provider.get_conversation(
-# conversation_id, branch_id
-# )
-# assert len(retrieved_messages) == 1
-# assert retrieved_messages[0][1].content == "Hello"
-
-
-# @pytest.mark.asyncio
-# async def test_edit_message_in_middle(postgres_logging_provider):
-# # Create a conversation with multiple messages
-# conversation_id = await postgres_logging_provider.create_conversation()
-
-# # Add initial messages
-# message_id_1 = await postgres_logging_provider.add_message(
-# conversation_id, Message(role="user", content="Hello")
-# )
-# message_id_2 = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(role="assistant", content="Hi there!"),
-# message_id_1,
-# )
-# message_id_3 = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(role="user", content="How are you?"),
-# message_id_2,
-# )
-# message_id_4 = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(role="assistant", content="I'm doing well, thanks!"),
-# message_id_3,
-# )
-
-# # Edit message 2
-# new_message_id, new_branch_id = await postgres_logging_provider.edit_message(
-# message_id_2, "Greetings!"
-# )
-
-# # Retrieve messages in the new branch
-# retrieved_messages = await postgres_logging_provider.get_conversation(
-# conversation_id, new_branch_id
-# )
-
-# print("retrieved_messages = ", retrieved_messages)
-# # Verify that messages after the edited message are not present
-# assert len(retrieved_messages) == 2
-# assert retrieved_messages[0][1].content == "Hello"
-# assert retrieved_messages[0][1].role == "user"
-# assert retrieved_messages[1][1].content == "Greetings!"
-# assert retrieved_messages[1][1].role == "assistant"
-
-
-# @pytest.mark.asyncio
-# async def test_multiple_branches_from_same_message(postgres_logging_provider):
-# # Create a conversation with initial messages
-# conversation_id = await postgres_logging_provider.create_conversation()
-# message_id_1 = await postgres_logging_provider.add_message(
-# conversation_id, Message(role="user", content="Tell me a joke.")
-# )
-# message_id_2 = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(
-# role="assistant", content="Why did the chicken cross the road?"
-# ),
-# message_id_1,
-# )
-
-# # Create first branch
-# new_message_id_1, new_branch_id_1 = (
-# await postgres_logging_provider.edit_message(
-# message_id_2, "Knock, knock!"
-# )
-# )
-
-# # Create second branch
-# new_message_id_2, new_branch_id_2 = (
-# await postgres_logging_provider.edit_message(
-# message_id_2,
-# "What do you call a bear with no teeth? A gummy bear!",
-# )
-# )
-
-# # Retrieve messages for the first new branch
-# retrieved_messages_1 = await postgres_logging_provider.get_conversation(
-# conversation_id, new_branch_id_1
-# )
-
-# # Retrieve messages for the second new branch
-# retrieved_messages_2 = await postgres_logging_provider.get_conversation(
-# conversation_id, new_branch_id_2
-# )
-
-# # Verify first branch messages
-# assert len(retrieved_messages_1) == 2
-# print("retrieved_messages_1[0] = ", retrieved_messages_1[0])
-# assert retrieved_messages_1[0][1].content == "Tell me a joke."
-# assert retrieved_messages_1[1][1].content == "Knock, knock!"
-
-# # Verify second branch messages
-# assert len(retrieved_messages_2) == 2
-# assert retrieved_messages_2[0][1].content == "Tell me a joke."
-# assert (
-# retrieved_messages_2[1][1].content
-# == "What do you call a bear with no teeth? A gummy bear!"
-# )
-
-
-# @pytest.mark.asyncio
-# async def test_navigate_between_branches(postgres_logging_provider):
-# # Create a conversation and add a message
-# conversation_id = await postgres_logging_provider.create_conversation()
-# message_id = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(role="user", content="What's the weather like?"),
-# )
-
-# # Create multiple branches by editing the message
-# _, branch_id_1 = await postgres_logging_provider.edit_message(
-# message_id, "What's the weather in New York?"
-# )
-# _, branch_id_2 = await postgres_logging_provider.edit_message(
-# message_id, "What's the weather in London?"
-# )
-# _, branch_id_3 = await postgres_logging_provider.edit_message(
-# message_id, "What's the weather in Tokyo?"
-# )
-
-# # Test navigating between branches
-# next_branch = await postgres_logging_provider.get_next_branch(branch_id_1)
-# assert next_branch == branch_id_2
-
-# next_branch = await postgres_logging_provider.get_next_branch(branch_id_2)
-# assert next_branch == branch_id_3
-
-# prev_branch = await postgres_logging_provider.get_prev_branch(branch_id_3)
-# assert prev_branch == branch_id_2
-
-# prev_branch = await postgres_logging_provider.get_prev_branch(branch_id_2)
-# assert prev_branch == branch_id_1
-
-
-# # @pytest.mark.asyncio
-# # async def test_messages_at_branch_point(postgres_logging_provider):
-# # # Create a conversation with initial messages
-# # conversation_id = await postgres_logging_provider.create_conversation()
-# # user_message_id = await postgres_logging_provider.add_message(
-# # conversation_id, Message(role="user", content="What's the capital of France?")
-# # )
-# # assistant_message_id = await postgres_logging_provider.add_message(
-# # conversation_id, Message(role="assistant", content="The capital of France is Paris."), user_message_id
-# # )
-
-# # # Create multiple branches by editing the assistant's message
-# # _, branch_id_1 = await postgres_logging_provider.edit_message(
-# # assistant_message_id, "It's Paris."
-# # )
-# # _, branch_id_2 = await postgres_logging_provider.edit_message(
-# # assistant_message_id, "Paris is the capital city of France."
-# # )
-
-# # # List all branches
-# # branches = await postgres_logging_provider.branches_overview(conversation_id)
-
-# # # Collect messages at the branching point
-# # messages_at_branch_point = []
-# # for branch in branches:
-# # print('branch = ', branch)
-# # if branch["branch_point_id"] == assistant_message_id:
-# # # Get the message content at the branching point
-# # content = Message.parse_raw(branch["content"]).content
-# # messages_at_branch_point.append(content)
-
-# # # Verify that all alternative messages are available
-# # assert len(messages_at_branch_point) == 2
-# # assert "It's Paris." in messages_at_branch_point
-# # assert "Paris is the capital city of France." in messages_at_branch_point
-
-
-# @pytest.mark.asyncio
-# async def test_delete_branch(postgres_logging_provider):
-# # Create a conversation and branches
-# conversation_id = await postgres_logging_provider.create_conversation()
-# message_id = await postgres_logging_provider.add_message(
-# conversation_id,
-# Message(role="user", content="Explain quantum physics."),
-# )
-# _, branch_id = await postgres_logging_provider.edit_message(
-# message_id, "Explain quantum physics in simple terms."
-# )
-
-# # Delete the branch (assuming a delete_branch method exists)
-# await postgres_logging_provider.delete_conversation(conversation_id)
-
-# # Try to retrieve the deleted branch
-# retrieved_messages = await postgres_logging_provider.get_conversation(
-# conversation_id, branch_id
-# )
-
-# # Verify that the branch no longer exists
-# assert retrieved_messages == []
diff --git a/py/tests/core/services/test_ingestion_service.py b/py/tests/core/services/test_ingestion_service.py
deleted file mode 100644
index c386ce901..000000000
--- a/py/tests/core/services/test_ingestion_service.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# from uuid import UUID
-
-# import pytest
-
-# from core.base import RawChunk
-# from core.main.services.ingestion_service import IngestionService
-# from core.main.abstractions import R2RProviders
-
-
-# @pytest.fixture
-# def r2r_providers(
-# r2r_ingestion_provider,
-# postgres_db_provider,
-# litellm_provider_128,
-# r2r_auth_provider,
-# litellm_completion_provider,
-# orchestration_provider,
-# local_logging_provider,
-# ):
-# return R2RProviders(
-# ingestion=r2r_ingestion_provider,
-# database=postgres_db_provider,
-# embedding=litellm_provider_128,
-# auth=r2r_auth_provider,
-# llm=litellm_completion_provider,
-# orchestration=orchestration_provider,
-# logging=local_logging_provider
-# )
-
-
-# @pytest.fixture
-# async def ingestion_service(r2r_providers, ingestion_config):
-# # You'll need to mock your dependencies here
-# service = IngestionService(
-# providers=r2r_providers,
-# config=ingestion_config,
-# pipes=[],
-# pipelines=[],
-# agents=[],
-# run_manager=None,
-# logging_connection=None,
-# )
-# return service
-
-# @pytest.fixture
-# def sample_document_id():
-# return UUID("12345678-1234-5678-1234-567812345678")
-
-
-# @pytest.fixture
-# def sample_chunks():
-# return [
-# RawChunk(
-# text="This is the first chunk of text.",
-# metadata={"chunk_order": 1},
-# ),
-# RawChunk(
-# text="This is the second chunk with different content.",
-# metadata={"chunk_order": 2},
-# ),
-# RawChunk(
-# text="And this is the third chunk with more information.",
-# metadata={"chunk_order": 3},
-# ),
-# ]
-
-
-# async def test_ingest_chunks_ingress_success(
-# ingestion_service, sample_document_id, sample_chunks
-# ):
-# """Test successful ingestion of chunks"""
-# result = await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=sample_chunks,
-# metadata={"title": "Test Document"},
-# user="test_user",
-# )
-
-# assert result is not None
-# # Add assertions based on your expected return type
-
-
-# async def test_ingest_chunks_ingress_empty_chunks(
-# ingestion_service, sample_document_id
-# ):
-# """Test handling of empty chunks list"""
-# with pytest.raises(ValueError):
-# await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=[],
-# metadata={},
-# user_id="test_user",
-# )
-
-
-# async def test_ingest_chunks_ingress_invalid_metadata(
-# ingestion_service, sample_document_id, sample_chunks
-# ):
-# """Test handling of invalid metadata"""
-# with pytest.raises(TypeError):
-# await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=sample_chunks,
-# metadata=None, # Invalid metadata
-# user_id="test_user",
-# )
-
-
-# async def test_ingest_chunks_ingress_large_document(
-# ingestion_service, sample_document_id
-# ):
-# """Test ingestion of a large number of chunks"""
-# large_chunks = [
-# RawChunk(text=f"Chunk number {i}", metadata={"chunk_order": i})
-# for i in range(1000)
-# ]
-
-# result = await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=large_chunks,
-# metadata={"title": "Large Document"},
-# user_id="test_user",
-# )
-
-# assert result is not None
-# # Add assertions for large document handling
-
-
-# async def test_ingest_chunks_ingress_duplicate_chunk_orders(
-# ingestion_service, sample_document_id
-# ):
-# """Test handling of chunks with duplicate chunk orders"""
-# duplicate_chunks = [
-# RawChunk(text="First chunk", metadata={"chunk_order": 1}),
-# RawChunk(
-# text="Second chunk",
-# metadata={"chunk_order": 1}, # Duplicate chunk_order
-# ),
-# ]
-
-# with pytest.raises(ValueError):
-# await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=duplicate_chunks,
-# metadata={},
-# user_id="test_user",
-# )
-
-
-# async def test_ingest_chunks_ingress_invalid_user(
-# ingestion_service, sample_document_id, sample_chunks
-# ):
-# """Test handling of invalid user ID"""
-# with pytest.raises(ValueError):
-# await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=sample_chunks,
-# metadata={},
-# user_id="", # Invalid user ID
-# )
-
-
-# async def test_ingest_chunks_ingress_metadata_validation(
-# ingestion_service, sample_document_id, sample_chunks
-# ):
-# """Test metadata validation"""
-# test_cases = [
-# ({"title": "Valid title"}, True),
-# ({"title": ""}, False),
-# ({"invalid_key": "value"}, False),
-# (
-# {},
-# True,
-# ), # Empty metadata might be valid depending on your requirements
-# ]
-
-# for metadata, should_succeed in test_cases:
-# if should_succeed:
-# result = await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=sample_chunks,
-# metadata=metadata,
-# user_id="test_user",
-# )
-# assert result is not None
-# else:
-# with pytest.raises((ValueError, TypeError)):
-# await ingestion_service.ingest_chunks_ingress(
-# document_id=sample_document_id,
-# chunks=sample_chunks,
-# metadata=metadata,
-# user_id="test_user",
-# )
-
-
-# async def test_ingest_chunks_ingress_concurrent_requests(
-# ingestion_service, sample_chunks
-# ):
-# """Test handling of concurrent ingestion requests"""
-# import asyncio
-
-# document_ids = [
-# UUID("12345678-1234-5678-1234-56781234567" + str(i)) for i in range(5)
-# ]
-
-# async def ingest_document(doc_id):
-# return await ingestion_service.ingest_chunks_ingress(
-# document_id=doc_id,
-# chunks=sample_chunks,
-# metadata={"title": f"Document {doc_id}"},
-# user_id="test_user",
-# )
-
-# results = await asyncio.gather(
-# *[ingest_document(doc_id) for doc_id in document_ids]
-# )
-
-# assert len(results) == len(document_ids)
-# for result in results:
-# assert result is not None
diff --git a/py/tests/integration/runner_retrieval.py b/py/tests/integration/runner_retrieval.py
index e3ba07def..82c20700a 100644
--- a/py/tests/integration/runner_retrieval.py
+++ b/py/tests/integration/runner_retrieval.py
@@ -71,6 +71,35 @@ def test_rag_query():
print("~" * 100)
+def test_rag_with_filter():
+ print("Testing: RAG query")
+ # Just do a standard RAG query without streaming
+ client.documents.create(
+ raw_text="Aristotle was a Greek philosopher who studied under Plato, his core contributions to philosophy were in logic.",
+ metadata={"tier": "test"},
+ )
+ resp = client.retrieval.rag(
+ query="What were aristotle's contributions to philosophy?",
+ rag_generation_config={"stream": False, "max_tokens": 100},
+ search_settings={
+ "filters": {"metadata.tier": {"$eq": "test"}},
+ "use_semantic_search": True,
+ "limit": 3,
+ },
+ )["results"]
+
+ print("response:", resp)
+ # # Check response structure
+ # if isinstance(resp, dict):
+ # assert_http_error("answer" in resp and "sources" in resp, "RAG response missing 'answer' or 'sources'")
+ # else:
+ # # Unexpected streaming or different type
+ # print("Expected dict response for non-streaming RAG")
+ # sys.exit(1)
+ print("RAG query test passed")
+ print("~" * 100)
+
+
def test_rag_stream_query():
print("Testing: RAG query with streaming")
# Streamed responses come as an async generator from the SDK
diff --git a/py/tests/main/api/__init__.py b/py/tests/main/api/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/main/app/__init__.py b/py/tests/main/app/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/main/assembly/__init__.py b/py/tests/main/assembly/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/main/sdk/__init__.py b/py/tests/main/sdk/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/main/services/__init__.py b/py/tests/main/services/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/regression/__init__.py b/py/tests/regression/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/regression/expected_outputs/test_document_management.json b/py/tests/regression/expected_outputs/test_document_management.json
deleted file mode 100644
index 45c1b15d6..000000000
--- a/py/tests/regression/expected_outputs/test_document_management.json
+++ /dev/null
@@ -1,1026 +0,0 @@
-{
- "ingest_sample_files": {
- "results": [
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "77ba1665-2646-457c-a892-da3c791f5e88",
- "document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "ae150ce4-1f91-4fa6-affb-7dd8ab1ad063",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "1d733a99-2c79-4d3e-bb0c-a0d49b26dc37",
- "document_id": "52e12576-090f-59db-91f4-6d4b2e29ae6c"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "c7d0b32c-00d9-4f0c-9f0d-240f86056665",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "da16c185-57bf-4f30-bc8a-598e127824a1",
- "document_id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "460abc23-60de-4d11-8f10-81079d7d4990",
- "document_id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "aa01d8e2-c866-4e60-8190-3c7894afc263",
- "document_id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "6da2d33b-7b00-4cd3-9ede-9748ca5fd936",
- "document_id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "b16a158c-85da-4290-92ae-6f0a4881c279",
- "document_id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "fcdb9219-9d4f-4de2-aeb0-4fc688ffb7b4",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "082de59f-0252-4f2e-b038-90253b8066f5",
- "document_id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "8864c244-3372-4196-80c2-0d9dd667c3fd",
- "document_id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "2fe08ebb-7157-452c-811c-7ede2b05e750",
- "document_id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "78ea4936-8bf6-4539-9db3-68f0e5b5bbe5",
- "document_id": "d421207a-d799-5806-8d67-46b2005b15d4"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "ad23dd8d-f0c1-4641-a6a2-e909ede1b3d1",
- "document_id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "c100a64b-c975-4a5e-bde4-7f1a0718b851",
- "document_id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c"
- }
- ]
- },
- "reingest_sample_file": {
- "results": [
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "d0c84b68-407c-4329-b7ce-fb32df9f196b",
- "document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "8134708a-ac7e-436c-977a-4ba1434aeb94",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "fcf12fcf-092c-43c6-85fa-3c02b62231a1",
- "document_id": "52e12576-090f-59db-91f4-6d4b2e29ae6c"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "ab4f1bc7-7d62-47ee-9e21-6b4fab7ccfb7",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "ef38bf7b-5688-4bb4-b861-6e06dd4603dc",
- "document_id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "8bb837ed-d586-4d8b-8f80-e69e6d848dba",
- "document_id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "873adc8b-c630-49e8-9859-b49e2588e72c",
- "document_id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "e2552c6f-884d-42eb-b279-ae881d6a8338",
- "document_id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "ff2a206b-f9cb-4369-ad7f-be5f930ab6f0",
- "document_id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "2c26fd9d-e35b-4597-821e-c1e4031df1f8",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "1352946d-1a09-4534-ad29-94e6679ee4cf",
- "document_id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "36191afd-7628-4faa-aa7c-4da16cf4ee46",
- "document_id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "07acc8f2-48d6-4e1c-8932-3f0c594105bf",
- "document_id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "d0f06ec0-265e-45c8-a768-04411f96a54c",
- "document_id": "d421207a-d799-5806-8d67-46b2005b15d4"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "56187544-b442-4e5d-8338-bce2881e0d16",
- "document_id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb"
- },
- {
- "message": "Ingestion task queued successfully.",
- "task_id": "5a1d66ee-976c-4dd5-a19e-8e20c1470339",
- "document_id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c"
- }
- ]
- },
- "documents_overview": {
- "results": [
- {
- "id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_3.html",
- "version": "v0",
- "size_in_bytes": 166556,
- "ingestion_status": "pending",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:27.535113Z",
- "updated_at": "2024-09-20T22:31:28.791381Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "png",
- "metadata": {
- "version": "v0"
- },
- "title": "screen_shot.png",
- "version": "v0",
- "size_in_bytes": 1055688,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:27.522881Z",
- "updated_at": "2024-09-20T22:26:27.522885Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "yc_companies.txt",
- "version": "v0",
- "size_in_bytes": 62948,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.639855Z",
- "updated_at": "2024-09-20T22:31:28.766869Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "test.txt",
- "version": "v0",
- "size_in_bytes": 28,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.525053Z",
- "updated_at": "2024-09-20T22:31:28.806823Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle.txt",
- "version": "v0",
- "size_in_bytes": 97804,
- "ingestion_status": "pending",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.521911Z",
- "updated_at": "2024-09-20T22:31:28.835132Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_2.html",
- "version": "v0",
- "size_in_bytes": 166816,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.517088Z",
- "updated_at": "2024-09-20T22:31:28.836602Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "lyft_2021.pdf",
- "version": "v0",
- "size_in_bytes": 1920404,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.513642Z",
- "updated_at": "2024-09-20T22:26:26.513644Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_5.html",
- "version": "v0",
- "size_in_bytes": 165040,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.511219Z",
- "updated_at": "2024-09-20T22:31:28.786697Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "graphrag.pdf",
- "version": "v0",
- "size_in_bytes": 2287544,
- "ingestion_status": "pending",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.504459Z",
- "updated_at": "2024-09-20T22:31:28.767620Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_4.html",
- "version": "v0",
- "size_in_bytes": 157484,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.528586Z",
- "updated_at": "2024-09-20T22:31:28.789182Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle_v2.txt",
- "version": "v0",
- "size_in_bytes": 3380,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.524513Z",
- "updated_at": "2024-09-20T22:31:28.766240Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "got.txt",
- "version": "v0",
- "size_in_bytes": 12656,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.517526Z",
- "updated_at": "2024-09-20T22:31:28.765359Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "uber_2021.pdf",
- "version": "v0",
- "size_in_bytes": 2507312,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.515504Z",
- "updated_at": "2024-09-20T22:31:28.782778Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample2.mp3",
- "version": "v0",
- "size_in_bytes": 96608,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.514341Z",
- "updated_at": "2024-09-20T22:31:28.764706Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample.mp3",
- "version": "v0",
- "size_in_bytes": 162228,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.511994Z",
- "updated_at": "2024-09-20T22:31:28.770493Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_1.html",
- "version": "v0",
- "size_in_bytes": 175340,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:21.855243Z",
- "updated_at": "2024-09-20T22:31:28.769460Z",
- "ingestion_attempt_number": null
- }
- ],
- "total_entries": 16
- },
- "document_chunks_test": {
- "results": [
- {
- "fragment_id": "67e7ab57-eaa0-57d8-9276-da273abcdabd",
- "chunk_id": "286b3218-517c-50bf-b8ea-1262e8ec6b42",
- "document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "text": "June 2021 A few days ago, on the way home from school, my nine year old son",
- "metadata": {
- "version": "v0",
- "chunk_order": 0,
- "document_type": "html",
- "unstructured_filetype": "text/html",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_text_as_html": "
June 2021 A few days ago, on the way home from school, my nine year old son |
",
- "partitioned_by_unstructured": true,
- "unstructured_is_continuation": true
- }
- }
- ],
- "total_entries": 1
- },
- "update_document_test": {
- "results": {
- "message": "Update task queued successfully.",
- "task_id": "1828ac12-0804-47cf-9623-8110324a52b1",
- "document_ids": [
- "9fbe403b-c11c-5aae-8ade-ef22980c3ad1"
- ]
- }
- },
- "rerun_documents_overview_test_1": {
- "results": [
- {
- "id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_3.html",
- "version": "v0",
- "size_in_bytes": 166556,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:27.535113Z",
- "updated_at": "2024-09-20T22:31:28.791381Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "png",
- "metadata": {
- "version": "v0"
- },
- "title": "screen_shot.png",
- "version": "v0",
- "size_in_bytes": 1055688,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:27.522881Z",
- "updated_at": "2024-09-20T22:31:28.779863Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "yc_companies.txt",
- "version": "v0",
- "size_in_bytes": 62948,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.639855Z",
- "updated_at": "2024-09-20T22:31:28.766869Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "test.txt",
- "version": "v0",
- "size_in_bytes": 28,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.525053Z",
- "updated_at": "2024-09-20T22:31:28.806823Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "title": "aristotle_v2.txt",
- "version": "v1"
- },
- "title": "aristotle_v2.txt",
- "version": "v1",
- "size_in_bytes": 2534,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.521911Z",
- "updated_at": "2024-09-20T22:31:45.500072Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_2.html",
- "version": "v0",
- "size_in_bytes": 166816,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.517088Z",
- "updated_at": "2024-09-20T22:31:28.836602Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "lyft_2021.pdf",
- "version": "v0",
- "size_in_bytes": 1920404,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.513642Z",
- "updated_at": "2024-09-20T22:31:28.773388Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_5.html",
- "version": "v0",
- "size_in_bytes": 165040,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.511219Z",
- "updated_at": "2024-09-20T22:31:28.786697Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "graphrag.pdf",
- "version": "v0",
- "size_in_bytes": 2287544,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.504459Z",
- "updated_at": "2024-09-20T22:31:28.767620Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_4.html",
- "version": "v0",
- "size_in_bytes": 157484,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.528586Z",
- "updated_at": "2024-09-20T22:31:28.789182Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle_v2.txt",
- "version": "v0",
- "size_in_bytes": 3380,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.524513Z",
- "updated_at": "2024-09-20T22:31:28.766240Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "got.txt",
- "version": "v0",
- "size_in_bytes": 12656,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.517526Z",
- "updated_at": "2024-09-20T22:31:28.765359Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "uber_2021.pdf",
- "version": "v0",
- "size_in_bytes": 2507312,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.515504Z",
- "updated_at": "2024-09-20T22:31:28.782778Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample2.mp3",
- "version": "v0",
- "size_in_bytes": 96608,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.514341Z",
- "updated_at": "2024-09-20T22:31:28.764706Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample.mp3",
- "version": "v0",
- "size_in_bytes": 162228,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.511994Z",
- "updated_at": "2024-09-20T22:31:28.770493Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_1.html",
- "version": "v0",
- "size_in_bytes": 175340,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:21.855243Z",
- "updated_at": "2024-09-20T22:31:28.769460Z",
- "ingestion_attempt_number": null
- }
- ],
- "total_entries": 16
- },
- "delete_document_test": {
- "results": {}
- },
- "rerun_documents_overview_test_2": {
- "results": [
- {
- "id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_3.html",
- "version": "v0",
- "size_in_bytes": 166556,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:27.535113Z",
- "updated_at": "2024-09-20T22:31:28.791381Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "png",
- "metadata": {
- "version": "v0"
- },
- "title": "screen_shot.png",
- "version": "v0",
- "size_in_bytes": 1055688,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:27.522881Z",
- "updated_at": "2024-09-20T22:31:28.779863Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "yc_companies.txt",
- "version": "v0",
- "size_in_bytes": 62948,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.639855Z",
- "updated_at": "2024-09-20T22:31:28.766869Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "test.txt",
- "version": "v0",
- "size_in_bytes": 28,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.525053Z",
- "updated_at": "2024-09-20T22:31:28.806823Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "title": "aristotle_v2.txt",
- "version": "v1"
- },
- "title": "aristotle_v2.txt",
- "version": "v1",
- "size_in_bytes": 2534,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.521911Z",
- "updated_at": "2024-09-20T22:31:45.500072Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_2.html",
- "version": "v0",
- "size_in_bytes": 166816,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.517088Z",
- "updated_at": "2024-09-20T22:31:28.836602Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "lyft_2021.pdf",
- "version": "v0",
- "size_in_bytes": 1920404,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.513642Z",
- "updated_at": "2024-09-20T22:31:28.773388Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_5.html",
- "version": "v0",
- "size_in_bytes": 165040,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.511219Z",
- "updated_at": "2024-09-20T22:31:28.786697Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "graphrag.pdf",
- "version": "v0",
- "size_in_bytes": 2287544,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:26.504459Z",
- "updated_at": "2024-09-20T22:31:28.767620Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_4.html",
- "version": "v0",
- "size_in_bytes": 157484,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.528586Z",
- "updated_at": "2024-09-20T22:31:28.789182Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle_v2.txt",
- "version": "v0",
- "size_in_bytes": 3380,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.524513Z",
- "updated_at": "2024-09-20T22:31:28.766240Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "got.txt",
- "version": "v0",
- "size_in_bytes": 12656,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.517526Z",
- "updated_at": "2024-09-20T22:31:28.765359Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "uber_2021.pdf",
- "version": "v0",
- "size_in_bytes": 2507312,
- "ingestion_status": "parsing",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.515504Z",
- "updated_at": "2024-09-20T22:31:28.782778Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample2.mp3",
- "version": "v0",
- "size_in_bytes": 96608,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.514341Z",
- "updated_at": "2024-09-20T22:31:28.764706Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "collection_ids": [],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample.mp3",
- "version": "v0",
- "size_in_bytes": 162228,
- "ingestion_status": "success",
- "restructuring_status": "pending",
- "created_at": "2024-09-20T22:26:22.511994Z",
- "updated_at": "2024-09-20T22:31:28.770493Z",
- "ingestion_attempt_number": null
- }
- ],
- "total_entries": 15
- },
- "rerun_document_chunks_test": {
- "results": "{\"detail\":{\"message\":\"An error 'list index out of range' occurred during document_chunks_app\",\"error\":\"list index out of range\",\"error_type\":\"IndexError\"}}"
- }
-}
diff --git a/py/tests/regression/expected_outputs/test_group_management.json b/py/tests/regression/expected_outputs/test_group_management.json
deleted file mode 100644
index b84b01657..000000000
--- a/py/tests/regression/expected_outputs/test_group_management.json
+++ /dev/null
@@ -1,168 +0,0 @@
-{
- "create_groups": {
- "group_1": {
- "results": {
- "group_id": "3b24e22c-843d-4cd1-8183-6f244211542b",
- "name": "Test Group 1 90248f87-cf4f-42ac-9f10-c68716e46ccb",
- "description": "A test group for permissions",
- "created_at": "2024-08-14T01:01:19.662617+00:00",
- "updated_at": "2024-08-14T01:01:19.662617+00:00"
- }
- },
- "group_2": {
- "results": {
- "group_id": "3a2bfab3-5527-47a1-a901-a463e9b06971",
- "name": "Test Group 2 99212e13-64dc-456a-93d3-3e2afc97f3be",
- "description": "Another test group for permissions",
- "created_at": "2024-08-14T01:01:19.803314+00:00",
- "updated_at": "2024-08-14T01:01:19.803314+00:00"
- }
- }
- },
- "add_users_to_groups": {
- "user_1": {
- "results": {
- "email": "user1_62951675-cbc1-40dd-b005-3758adfc4c68@example.com",
- "id": "a34207e5-9b1a-5fe3-8a02-9ff56464b112",
- "group_ids": [],
- "hashed_password": "$2b$12$tVPpwO.Tr5KR/gtWm1UPDuq6.pXY3rlO7YonLYR0XcwwJkhgVI96O",
- "is_superuser": false,
- "is_active": true,
- "is_verified": false,
- "verification_code_expiry": null,
- "name": null,
- "bio": null,
- "profile_picture": null,
- "created_at": "2024-08-14T01:01:20.181505Z",
- "updated_at": "2024-08-14T01:01:20.181505Z"
- }
- },
- "user_2": {
- "results": {
- "email": "user2_16f40e51-174a-4b06-b308-1160b701cda7@example.com",
- "id": "4404314d-12a1-5299-9f7a-adfac07a5a3b",
- "group_ids": [],
- "hashed_password": "$2b$12$oqVl4mg7sL6UfUO2ziz4suSSOOBwYJisag.E.pHNWxd0YMde57e0y",
- "is_superuser": false,
- "is_active": true,
- "is_verified": false,
- "verification_code_expiry": null,
- "name": null,
- "bio": null,
- "profile_picture": null,
- "created_at": "2024-08-14T01:01:20.651635Z",
- "updated_at": "2024-08-14T01:01:20.651635Z"
- }
- }
- },
- "group_based_document_access": {
- "error": "An error ''PostgresVectorDBProvider' object has no attribute 'assign_document_to_group'' occurred during ingest_files_app"
- },
- "admin_ingest_documents": {
- "error": "An error ''PostgresVectorDBProvider' object has no attribute 'assign_document_to_group'' occurred during ingest_files_app"
- },
- "user_ingest_and_search": {
- "user_1_ingest": {
- "results": {
- "processed_documents": [
- {
- "id": "55d7b67e-c717-5e89-a956-61580475199d",
- "group_ids": [],
- "user_id": "a34207e5-9b1a-5fe3-8a02-9ff56464b112",
- "type": "txt",
- "metadata": {},
- "title": "user1_document.txt",
- "version": "v0",
- "size_in_bytes": 15,
- "status": "success",
- "created_at": "2024-08-13T18:01:23.617547",
- "updated_at": "2024-08-13T18:01:23.617547"
- }
- ],
- "failed_documents": [],
- "skipped_documents": []
- }
- },
- "user_1_search": {
- "results": {
- "vector_search_results": [
- {
- "fragment_id": "ced2d47c-524c-58d3-8cc3-0d474312fb00",
- "chunk_id": "b4451d80-760d-5e0f-93bc-cc0a89b1630d",
- "document_id": "55d7b67e-c717-5e89-a956-61580475199d",
- "user_id": "a34207e5-9b1a-5fe3-8a02-9ff56464b112",
- "group_ids": [],
- "score": 0.577595285558281,
- "text": "user1_document",
- "metadata": {
- "text": "user1_document",
- "title": "user1_document.txt",
- "associatedQuery": "document"
- }
- },
- {
- "fragment_id": "2b30d8c0-d037-5ca7-9961-08e2a13a25cd",
- "chunk_id": "eae05bf5-f732-53b3-80e5-a6e39d5a23d3",
- "document_id": "653c933c-867d-5588-b6cd-54d9412a8ffa",
- "user_id": "a34207e5-9b1a-5fe3-8a02-9ff56464b112",
- "group_ids": [],
- "score": 0.487762882211468,
- "text": "user1_document_group",
- "metadata": {
- "text": "user1_document_group",
- "title": "user1_document_group.txt",
- "user_id": "a34207e5-9b1a-5fe3-8a02-9ff56464b112",
- "associatedQuery": "document"
- }
- }
- ],
- "graph_search_results": []
- }
- },
- "user_2_ingest": {
- "results": {
- "processed_documents": [
- {
- "id": "6a7d57a8-0bab-55df-8674-a94b1ecd6492",
- "group_ids": [],
- "user_id": "4404314d-12a1-5299-9f7a-adfac07a5a3b",
- "type": "txt",
- "metadata": {},
- "title": "user2_document.txt",
- "version": "v0",
- "size_in_bytes": 15,
- "status": "success",
- "created_at": "2024-08-13T18:01:25.577549",
- "updated_at": "2024-08-13T18:01:25.577549"
- }
- ],
- "failed_documents": [],
- "skipped_documents": []
- }
- },
- "user_2_search": {
- "results": {
- "vector_search_results": [
- {
- "fragment_id": "4f3e93df-099c-58a7-a5cf-c40ba5ae76c1",
- "chunk_id": "838aa00a-2d5a-588c-9aa1-2553ae514024",
- "document_id": "6a7d57a8-0bab-55df-8674-a94b1ecd6492",
- "user_id": "4404314d-12a1-5299-9f7a-adfac07a5a3b",
- "group_ids": [],
- "score": 0.530904515656706,
- "text": "user2_document",
- "metadata": {
- "text": "user2_document",
- "title": "user2_document.txt",
- "associatedQuery": "document"
- }
- }
- ],
- "graph_search_results": []
- }
- }
- },
- "cleanup": {
- "status": "cleanup completed"
- }
-}
diff --git a/py/tests/regression/expected_outputs/test_observability.json b/py/tests/regression/expected_outputs/test_observability.json
deleted file mode 100644
index cef9c1e32..000000000
--- a/py/tests/regression/expected_outputs/test_observability.json
+++ /dev/null
@@ -1,381 +0,0 @@
-{
- "users_overview": {
- "results": [
- {
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "num_files": 15,
- "total_size_in_bytes": 5067615,
- "document_ids": [
- "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "2f576170-c4f9-5141-a910-a0924f341de4",
- "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
- "d421207a-d799-5806-8d67-46b2005b15d4",
- "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "5b1bd54f-4d70-56b9-a017-a618bc75f94c"
- ]
- }
- ]
- },
- "logs": {
- "results": [
- {
- "run_id": "ad9ece10-fe2f-49e4-835e-1b274dae8167",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:03",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "698aa5cd-dc80-478f-9bb4-ae92c5c344a1",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:03",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "1477b2aa-6ff1-4548-ba07-0a9fb889bf18",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:03",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "0291e48c-2884-46b9-ba95-0ac1515ed60b",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:02",
- "user_id": "f36f0a8a-9f12-5979-9416-1fbb7d28a80b"
- },
- {
- "run_id": "b6284a00-2285-44e8-9ed6-dc617b298948",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:02",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "3b57e745-0d33-463c-947a-cb88287e9584",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:02",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "95a75f58-dbcd-47cb-9883-c035aaf8ac48",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:01",
- "user_id": "f36f0a8a-9f12-5979-9416-1fbb7d28a80b"
- },
- {
- "run_id": "1032bdaa-324f-482b-ab80-5177a214e981",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:01",
- "user_id": "f36f0a8a-9f12-5979-9416-1fbb7d28a80b"
- },
- {
- "run_id": "bc78145e-70d3-47d7-b13c-890e4ea0d098",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:26:01",
- "user_id": "f36f0a8a-9f12-5979-9416-1fbb7d28a80b"
- },
- {
- "run_id": "02a6c397-b69f-4f53-836e-d9ef69dbec10",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:59",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "dfefe355-b78f-463a-ad0b-e4651f20f9f4",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:55",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "4fa823b5-f440-4f5a-88ca-434731fa45c9",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:54",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "c0cf5191-78f2-4e18-b5b7-2df62307fd23",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:53",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "d24c3c19-8354-418c-9b7a-d95cb624a235",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:52",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "4df1f53b-db11-4919-ae20-8f1d7d7eb6fb",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:52",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "1d2ec5de-df04-4ea9-9ef3-8a05e0f36fc9",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:52",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "f997102d-4d2a-4897-9b4a-0049d25a9ada",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:25:10",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "ee0dc7eb-c204-44b7-82a4-d07d59b67741",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:24:58",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "730e188e-6946-4a83-ad77-0861e94c1f29",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:24:46",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "b2c15b09-f69b-444a-bb7d-dc1939e7eba4",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:57",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "86c24884-a353-4298-8183-2037beb139aa",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:57",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "59ea5208-72e9-44f8-8cfe-fe4ce2423f7c",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:57",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "1272f2f6-56ef-411c-a4ce-9437e3bff5dc",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:57",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "19228cf4-34e4-4102-a6e1-2656c136af4b",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:56",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "bdcee52d-9726-4531-8a00-b31244563a9d",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:56",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "3d68fc91-b9a6-405f-b344-e9d825093e8e",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:55",
- "user_id": "4b17bfba-12b5-5c3a-98a7-fcaaab561791"
- },
- {
- "run_id": "d8689825-7f2f-4b70-8114-31fdae4e86fd",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:55",
- "user_id": "4b17bfba-12b5-5c3a-98a7-fcaaab561791"
- },
- {
- "run_id": "0f128c75-25c2-4754-8af9-88d7d63de866",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:55",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "5b61a591-a94a-4f9e-8233-0c82a4f3faa4",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:55",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "d8c3ccb0-0be0-4d59-b7f5-76d043eeb0bc",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:54",
- "user_id": "4b17bfba-12b5-5c3a-98a7-fcaaab561791"
- },
- {
- "run_id": "c84e61f9-1318-4fa0-b174-6a00f59727a9",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:54",
- "user_id": "4b17bfba-12b5-5c3a-98a7-fcaaab561791"
- },
- {
- "run_id": "3b05eccc-3bd4-4b0a-813d-60fdfbc06cad",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:51",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "e81699a2-b47b-4fdf-8d97-b6b48e394533",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:47",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "d41ad353-cd3d-4cbc-b141-010c499acebb",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:45",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "0129829c-55e9-4fee-b396-7e548c5ead4e",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:45",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "81704be9-5286-4373-9f84-839f1c25933e",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:44",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "d09b1ca5-a684-4a0b-9f56-f198d5bdf619",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:44",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "047b36d3-43b1-4239-b65a-07ec1265439a",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:43",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "979b9d23-4617-4754-b220-67936965b60d",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:23:08",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "4bfd3253-60c7-46d1-b309-fb7b816ce2ec",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:22:28",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "75434e64-bd07-4b2d-a11a-5c00adc6753e",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:21:39",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "523a1049-f5ad-4b95-bde4-d3c4a94927aa",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:20:48",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "0fd21ec5-f305-4e48-b75d-7ddb29534dce",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:19:38",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "ace1ffb9-4cd4-4029-8c8b-2e5b25b504a5",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:19:38",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "a926c1be-405e-44fd-ac6a-1e8a0cb92512",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:19:38",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- },
- {
- "run_id": "748191c0-5ae1-424b-8c19-1c35a5428e68",
- "run_type": "UNSPECIFIED",
- "entries": [],
- "timestamp": "2024-08-14T00:16:07",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- }
- ]
- },
- "analytics": {
- "results": {
- "filtered_logs": {
- "search_latencies": [
- {
- "run_id": "c0cf5191-78f2-4e18-b5b7-2df62307fd23",
- "key": "search_latency",
- "value": "0.45",
- "timestamp": "2024-08-14 00:25:53"
- },
- {
- "run_id": "d41ad353-cd3d-4cbc-b141-010c499acebb",
- "key": "search_latency",
- "value": "0.46",
- "timestamp": "2024-08-14 00:23:45"
- }
- ]
- },
- "search_latencies": {
- "Mean": 0.455,
- "Median": 0.455,
- "Mode": null,
- "Standard Deviation": 0.007,
- "Variance": 0.0
- }
- }
- }
-}
diff --git a/py/tests/regression/expected_outputs/test_retrieval.json b/py/tests/regression/expected_outputs/test_retrieval.json
deleted file mode 100644
index 85b5249f4..000000000
--- a/py/tests/regression/expected_outputs/test_retrieval.json
+++ /dev/null
@@ -1,725 +0,0 @@
-{
- "search": {
- "results": {
- "vector_search_results": [
- {
- "fragment_id": "392ab9b4-c4bc-5894-8edf-332fcd9245bb",
- "chunk_id": "cd49a88d-92e5-59f1-8331-3d3d3ecb7f3a",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.2381485473557875,
- "text": "Finance Leases\n\n2022 2023 2024 2025 2026 Thereafter Total undiscounted lease payments Less: imputed interest\n\n$\n\n280 $ 312 264 214 198 2,067 3,335 (1,506) 1,829 $\n\n140 60 34 9 \u2014 1 244 (10) 234\n\nTotal lease liabilities\n\n$\n\nAs of December 31, 2021, we had additional operating leases and finance leases, primarily for corporate offices and servers, that have not yet commenced of $421 million and $19 million, respectively. These operating and finance leases will commence between fiscal year 2022 and fiscal year 2023 with lease terms of 2 years to 13 years.\n\nMission Bay 1 & 2\n\nIn 2015, we entered into a joint venture (\u201cJV\u201d) agreement with a real estate developer (\u201cJV Partner\u201d) to develop land (\u201cthe Land\u201d) in San Francisco to construct our new headquarters (the \u201cHeadquarters\u201d). The Headquarters consists of two adjacent office buildings totaling approximately 423,000 rentable square feet. In connection with the JV arrangement, we acquired a 49% interest in the JV, the principal asset of which was the Land.",
- "metadata": {
- "version": "v0",
- "chunk_order": 759,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 109,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "fd8508db-c444-5ed9-afce-67340354fb1e",
- "chunk_id": "7f16fa20-9bc1-5841-ba74-95cdbb27e9fb",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.21556836366653442,
- "text": "(c)\n\nCommuting and Corporate Housing Expenses. During your Employment, the Company shall cover the cost of your reasonable and substantiated expenses for travel between your primary residence and the Company\u2019s headquarters in San Francisco and corporate housing in the San Francisco Bay Area, up to a pre-tax maximum of $200,000 per year in the aggregate. All expense reimbursements shall be made in accordance with the Company\u2019s expense reimbursement policy.\n\n(d)",
- "metadata": {
- "version": "v0",
- "chunk_order": 971,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 170,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "e74d31b4-5de3-581c-abaf-8d28f48f924b",
- "chunk_id": "f4aa1be1-c0fa-5edd-a536-d5af7f023b31",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.21189823746681213,
- "text": "\u201cPlatform\u201d has the meaning set forth in Section 9.01(d).\n\n\u201cPrime Rate\u201d means the rate of interest the rate of interest published by the Wall Street Journal, from time to time, as the prime rate.\n\nThe Prime Rate is a reference rate and does not necessarily represent the lowest or best rate actually charged to any customer. The Administrative Agent or any other Lender may make commercial loans or other loans at rates of interest at, above or below the Prime Rate.\n\n\u201cPrincipal Office\u201d for each of the Administrative Agent and any Issuing Bank, means the office of the Administrative Agent and such Issuing Bank as set forth in Section 9.01(a), or such other office or office of a third party or sub-agent, as appropriate, as such Person may from time to time designate to Borrower and each Lender upon two Business Days\u2019 written notice.",
- "metadata": {
- "version": "v0",
- "chunk_order": 1210,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 205,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "76d010b2-1498-531a-bf89-66aa17331203",
- "chunk_id": "f31920df-e1db-5a2c-9b8f-9c7b845a21c1",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.19467422366142273,
- "text": "45-2647441 (I.R.S. Employer Identification No.)\n\n1515 3rd Street San Francisco, California 94158 (Address of principal executive offices, including zip code) (415) 612-8582 (Registrant\u2019s telephone number, including area code) ____________________________________________\n\nSecurities registered pursuant to Section 12(b) of the Act:\n\nTitle of each class Common Stock, par value $0.00001 per share",
- "metadata": {
- "version": "v0",
- "chunk_order": 1,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 1,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "c96ee688-6e36-5abb-b066-d87779be1cf6",
- "chunk_id": "eb08b70d-2e82-5de3-90ee-98537a761ea8",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.19264961779117584,
- "text": "Item 2. Properties.\n\nOur corporate headquarters are located in San Francisco, California, and consist of approximately 420,000 square feet under lease agreements through May 31,\n\n2030. We maintain additional offices in multiple locations in the U.S. and internationally in Montreal, Canada, Munich, Germany and Minsk, Belarus.\n\nWe lease all of our facilities and do not own any real property. We believe our facilities are adequate and suitable for our current needs and that, should it be\n\nneeded, suitable additional or alternative space will be available to accommodate our operations.\n\n53\n\nItem 3. Legal Proceedings.\n\nSee discussion under the heading Legal Proceedings in Note 9 to the consolidated financial statements included in Part II, Item 8 of this report.\n\nItem 4. Mine Safety Disclosures.\n\nNot applicable.\n\n54\n\nPART II\n\nItem 5. Market for Registrant\u2019s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities.",
- "metadata": {
- "version": "v0",
- "chunk_order": 434,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 53,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "2abca1b8-f005-59dd-9716-adf883ec3aca",
- "chunk_id": "edd4f1f9-f6c8-5341-a1e7-ce57cac7f2fb",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.19248970195712467,
- "text": "(b)\n\nSolely for purposes of Article II and related definitional provisions to the extent used therein, the applicable amount of any\n\ncurrency (other than dollars) for purposes of the Loan Documents shall be such Dollar Equivalent amount as determined by the Administrative Agent and notified to the applicable Issuing Bank and the Borrower in accordance with Section 1.06(a). Amounts denominated in a Permitted Foreign Currency will be converted to dollars for the purposes of calculating the Senior Secured Net Leverage Ratio at the Exchange Rate as of the date of calculation.",
- "metadata": {
- "version": "v0",
- "chunk_order": 1266,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 216,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "85474903-20cc-58e6-ad3c-a1b64de77557",
- "chunk_id": "b69b89e5-48e1-526e-ba04-c9f5c0c56fa6",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.1894184407418502,
- "text": "Interest, net of amount capitalized Income taxes, net of refunds\n\n$\n\nNon-cash investing and financing activities:\n\nConversion of redeemable convertible preferred stock to common stock upon initial public offering Conversion of convertible notes to common stock upon initial public offering Conversion of convertible notes to common stock related to Careem Finance lease obligations Common stock issued in connection with acquisitions Ownership interest received in exchange for divestitures Issuance of Careem Notes including the holdback amount\n\nThe accompanying notes are an integral part of these consolidated financial statements.\n\n81\n\n2019",
- "metadata": {
- "version": "v0",
- "chunk_order": 590,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 83,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "a2f05c5a-0d43-538d-b4d0-ffd29d215437",
- "chunk_id": "f1cb0bd8-0721-59ab-9e39-110efccf33dd",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.18724104762077332,
- "text": "(1) the rate, or methodology for this rate, and conventions for this rate selected or recommendedby the Relevant Governmental Body\n\nfor determining compounded SOFR; provided that:\n\n(2) if, and to the extent that, the Administrative Agent determines that Compounded SOFRcannot\n\nbe determined in accordance with clause (1) above, then the rate, or methodology for this rate, and conventions for this rate that the Administrative Agent determines are substantially consistent with prevailing market convention for determining Compounded SOFR for U.S. dollar-denominated syndicated credit facilities at such time (as a result of amendment or as originally executed);",
- "metadata": {
- "version": "v0",
- "chunk_order": 1102,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 186,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "2654f646-222c-50af-bd1c-c7311e6a9dc9",
- "chunk_id": "b25d210b-1b58-578a-b038-34f76d77f377",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.18122072927828292,
- "text": "\u201cCanadian BA Rate Borrowing\u201d refers to a Borrowing bearing interest at a rate determined by reference to the Canadian BA Rate.\n\n10\n\n\u201cCanadian BA Rate Loan\u201d refers to a Loan bearing interest at a rate determined by reference to the Canadian BA Rate.\n\n\u201cCanadian Dollars\u201d means the lawful currency of Canada.\n\n\u201cCapital Lease Obligations\u201d of any Person means the obligations of such Person to pay rent or other amounts under any lease of (or",
- "metadata": {
- "version": "v0",
- "chunk_order": 1085,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 182,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "fragment_id": "309f729d-78eb-569a-837c-50367c20e898",
- "chunk_id": "dfc368a6-efaf-5f4d-a20b-0fd6059a5f35",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.1806427240371704,
- "text": "principal payments on finance lease obligations for $35.5 million.\n\nCash provided by financing activities was $512.6 million for the year ended December 31, 2020, which primarily consisted of proceeds from issuance of our 2025\n\nNotes of $734.1 million offset by the purchase of the Capped Calls for $132.7 million.\n\nLiquidity and Capital Resources",
- "metadata": {
- "version": "v0",
- "chunk_order": 531,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 71,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- }
- ],
- "graph_search_results": null
- }
- },
- "basic_rag": {
- "results": {
- "completion": {
- "id": "chatcmpl-A9ggQHI4VJvvkrhyrkBwUtQQ26Ab2",
- "choices": [
- {
- "finish_reason": "stop",
- "index": 0,
- "logprobs": null,
- "message": {
- "content": "Uber did not make a profit in 2020. Instead, Uber reported a net loss attributable to Uber Technologies, Inc. of $6.8 billion for the year ended December 31, 2020 [3].",
- "refusal": null,
- "role": "assistant",
- "function_call": null,
- "tool_calls": null
- }
- }
- ],
- "created": 1726872390,
- "model": "gpt-4o-2024-05-13",
- "object": "chat.completion",
- "service_tier": null,
- "system_fingerprint": "fp_3537616b13",
- "usage": {
- "completion_tokens": 45,
- "prompt_tokens": 2320,
- "total_tokens": 2365,
- "completion_tokens_details": {
- "reasoning_tokens": 0
- }
- }
- },
- "search_results": {
- "vector_search_results": [
- {
- "fragment_id": "07aa09c5-81a8-5a48-953a-532064a446f8",
- "chunk_id": "d3060c36-85dc-5e8d-b8ff-cfe4c1753ccc",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.7445549521115464,
- "text": "Revenue was $17.5 billion, or up 57% year-over-year, reflecting the overall growth in our Delivery business and an increase in Freight revenue attributable to the acquisition of Transplace in the fourth quarter of 2021 as well as growth in the number of shippers and carriers on the network combined with an increase in volumes with our top shippers.\n\nNet loss attributable to Uber Technologies, Inc. was $496 million, a 93% improvement year-over-year, driven by a $1.6 billion pre-tax gain on the sale of our ATG Business to Aurora, a $1.6 billion pre-tax net benefit relating to Uber\u2019s equity investments, as well as reductions in our fixed cost structure and increased variable cost efficiencies. Net loss attributable to Uber Technologies, Inc. also included $1.2 billion of stock-based compensation expense.",
- "metadata": {
- "version": "v0",
- "chunk_order": 445,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 53,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "2044e305-c042-5f0d-b05d-a2b97181f7a8",
- "chunk_id": "4329441a-5faf-5e9d-801f-ebd753ee1bd3",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.7071289420127869,
- "text": "Total costs and expenses Loss from operations\n\nInterest expense Other income (expense), net Loss before income taxes and loss from equity method investments Provision for (benefit from) income taxes Loss from equity method investments Net loss including non-controlling interests\n\nLess: net loss attributable to non-controlling interests, net of tax\n\n100 %\n\n46 % 16 % 32 % 20 % 24 % 5 % 144 % (44)% (4)% (15)% (62)% (2)% \u2014 % (61)% \u2014 % (61)%\n\n100 %\n\n54 % 11 % 27 % 12 % 13 % 5 % 122 % (22)% (3)% 19 % (6)% (3)% \u2014 % (3)% \u2014 % (3)%\n\nNet loss attributable to Uber Technologies, Inc.\n\n(1)\n\nTotals of percentage of revenues may not foot due to rounding.\n\nComparison of the Years Ended December 31, 2020 and 2021\n\nRevenue\n\nYear Ended December 31,\n\n(In millions, except percentages)\n\n2020\n\n2021\n\n2020 to 2021 % Change\n\nRevenue\n\n$\n\n11,139 $\n\n17,455\n\n57 %",
- "metadata": {
- "version": "v0",
- "chunk_order": 463,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 57,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "3840834b-7c74-5417-9252-9080e609fb2f",
- "chunk_id": "cf934fe1-926d-5525-a230-30946961cf28",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6809690201571295,
- "text": "Year Ended December 31, 2020\n\nRevenue Costs and expenses Cost of revenue, exclusive of depreciation and amortization shown separately below Operations and support Sales and marketing Research and development General and administrative Depreciation and amortization\n\nTotal costs and expenses Loss from operations\n\nInterest expense Other income (expense), net Loss before income taxes and loss from equity method investments Provision for (benefit from) income taxes Loss from equity method investments Net loss including non-controlling interests\n\nLess: net loss attributable to non-controlling interests, net of tax\n\n$\n\n13,000 $\n\n6,061 2,302 4,626 4,836 3,299 472 21,596 (8,596) (559) 722 (8,433) 45 (34) (8,512) (6) (8,506) $\n\n11,139 $\n\n5,154 1,819 3,583 2,205 2,666 575 16,002 (4,863) (458) (1,625) (6,946) (192) (34) (6,788) (20) (6,768) $\n\nNet loss attributable to Uber Technologies, Inc. Net loss per share attributable to Uber Technologies, Inc. common stockholders:\n\n$\n\nBasic\n\n$\n\n(6.81) $\n\n(3.86) $",
- "metadata": {
- "version": "v0",
- "chunk_order": 574,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 77,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "e2a37b27-0644-59e4-9746-37d48592a299",
- "chunk_id": "6b86ac2f-ce33-5126-83e6-a8731ea677c8",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6805637085605776,
- "text": "Less: net loss attributable to non-controlling interests, net of tax\n\n$\n\n11,139 $\n\n5,154 1,819 3,583 2,205 2,666 575 16,002 (4,863) (458) (1,625) (6,946) (192) (34) (6,788) (20) (6,768) $\n\n17,455\n\n9,351 1,877 4,789 2,054 2,316 902 21,289 (3,834) (483) 3,292 (1,025) (492) (37) (570) (74) (496)\n\nNet loss attributable to Uber Technologies, Inc.\n\n$\n\n54\n\nThe following table sets forth the components of our consolidated statements of operations for each of the periods presented as a percentage of revenue\n\n(1)\n\n:\n\nYear Ended December 31, 2021 2020\n\nRevenue Costs and expenses Cost of revenue, exclusive of depreciation and amortization shown separately below Operations and support Sales and marketing Research and development General and administrative Depreciation and amortization",
- "metadata": {
- "version": "v0",
- "chunk_order": 462,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 56,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "de30c3c9-cdfd-5872-bdaf-4859bef5c3a8",
- "chunk_id": "33bc6d8b-9fdc-5df7-be1d-fa7de176a0b5",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6537506580352783,
- "text": "The Uber Service activities are performed to satisfy our sole performance obligation in the transaction, which is to connect Drivers and Merchants with end-\n\nusers to facilitate the completion of a successful transaction.\n\nIn 2020, we began charging Mobility end-users a fee to use the platform in certain markets. In these transactions, in addition to a performance obligation to Drivers, we also have a performance obligation to end-users, which is to connect end-users to Drivers in the marketplace. We recognize revenue when a trip is complete. We present revenue on a net basis for these transactions, as we do not control the service provided by Drivers to end-users. For the years ended December 31, 2020 and 2021, we recognized total revenue of $323 million and $336 million, respectively, associated with these fees charged to end-users.",
- "metadata": {
- "version": "v0",
- "chunk_order": 642,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 90,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "4b6ee3eb-daca-5930-bafe-946cad56cdcc",
- "chunk_id": "eb2bc121-0b00-5f70-8eb6-549e1fb1ed72",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6387766599655151,
- "text": "Other income (expense), net\n\n$\n\nDuring the year ended December 31, 2020, gain on business divestitures, net represented a $154 million gain on the sale of our Uber Eats India operations to Zomato recognized in the first quarter of 2020 and a $77 million gain on the sale of our European Freight Business to sennder GmbH (\u201cSennder\u201d) recognized in the fourth quarter of 2020, partially offset by a $27 million loss on the sale of our JUMP operations to Lime recognized in the second quarter of 2020.\n\n(1)\n\nDuring the year ended December 31, 2021, gain on business divestitures, net represented a $1.6 billion gain on the sale of our ATG Business to Aurora",
- "metadata": {
- "version": "v0",
- "chunk_order": 799,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 118,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "af5f93d3-8b2f-5e71-a358-0dd56c2f68ac",
- "chunk_id": "acf12622-2e6e-5234-9768-ba448294a81d",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6321083903312683,
- "text": "2019\n\n100.0 %\n\n60.2 17.6 41.6 22.5 32.8 174.7 (74.7) \u2014 2.8 (71.9) 0.1 (72.0)%\n\n2019 to 2020 % Change\n\n(35) %\n\nsecond quarter of 2021. These increases were offset by investments in driver supply by increasing driver incentives recorded as a reduction to revenue by $942.9 million in 2021 as compared to the prior year as rider demand outpaced driver supply during certain periods of the pandemic recovery in 2021. Revenue in 2020 was also higher in the first quarter of 2020 prior to the implementation of shelter-in-place orders and other travel restrictions across North America beginning March 2020.\n\nWe expect to see continued recovery in demand for our platform and the resulting positive impacts on revenue as there are more widespread immunity levels, more communities reopen and other restrictive travel and social distancing measures in response to COVID-19 are eased. However, we cannot predict the impact of COVID variants and the longer term impact of the pandemic on consumer behavior.\n\nCost of Revenue\n\n2021",
- "metadata": {
- "version": "v0",
- "chunk_order": 493,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 63,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "d5379124-e7ff-509f-b47f-a79152eec2d4",
- "chunk_id": "2562b865-e4df-5376-9e70-927be9afbb7e",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.63012705682194,
- "text": "For additional discussion, see the risk factor titled \u201c\u2014If we are unable to attract or maintain a critical mass of Drivers, consumers, merchants, shippers, and carriers, whether as a result of competition or other factors, our platform will become less appealing to platform users, and our financial results would be adversely impacted.\u201d included in Part I, Item 1A of this Annual Report on Form 10-K as well our 2021 ESG Report and our 2021 People and Culture Report. The information in these reports is not a part of this Form 10-K.\n\nAdditional Information\n\nWe were founded in 2009 and incorporated as Ubercab, Inc., a Delaware corporation, in July 2010. In February 2011, we changed our name to Uber\n\nTechnologies, Inc. Our principal executive offices are located at 1515 3rd Street, San Francisco, California 94158, and our telephone number is (415) 612-8582.\n\n10",
- "metadata": {
- "version": "v0",
- "chunk_order": 77,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 12,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "98a93be5-13ba-5bd6-9a18-e7ceef0fae88",
- "chunk_id": "8ab931e3-8f47-5598-90b8-928f387ec256",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6285917009601995,
- "text": "Uber Technologies, Inc. (\u201cUber,\u201d \u201cwe,\u201d \u201cour,\u201d or \u201cus\u201d) was incorporated in Delaware in July 2010, and is headquartered in San Francisco, California. Uber is a technology platform that uses a massive network, leading technology, operational excellence and product expertise to power movement from point A to point B. Uber develops and operates proprietary technology applications supporting a variety of offerings on its platform (\u201cplatform(s)\u201d or \u201cPlatform(s)\u201d). Uber connects consumers (\u201cRider(s)\u201d) with independent providers of ride services (\u201cMobility Driver(s)\u201d) for ridesharing services, and connects Riders and other consumers (\u201cEaters\u201d) with restaurants, grocers and other stores (collectively, \u201cMerchants\u201d) with delivery service providers (\u201cCouriers\u201d) for meal preparation, grocery and other delivery services. Riders and Eaters are collectively referred to as \u201cend-user(s)\u201d or \u201cconsumer(s).\u201d Mobility Drivers and Couriers are collectively referred to as \u201cDriver(s).\u201d Uber also connects consumers with public",
- "metadata": {
- "version": "v0",
- "chunk_order": 592,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 84,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "fragment_id": "d87084d1-c52a-5a4b-96ad-9fc1cb98bfc5",
- "chunk_id": "8361bf60-bce2-56c2-b982-376a75e47d58",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.6283430678639979,
- "text": "Year Ended December 31, 2020\n\n1,000 49 1,189 (27) \u2014 (138) (34) 8,939 (4) 3,824\n\n247 125 2,628 (527) (891) (224) 38 1,379 (92) (4,327)\n\n8,209 34 12,067 $\n\n12,067 (349) 7,391 $\n\n332 $ 133\n\n412 $ 82\n\n14,224 4,229 \u2014 251 9 \u2014 \u2014\n\n\u2014 \u2014 \u2014 196 3,898 171 1,634\n\n2021\n\n675 107 1,484 (27) (307) (226) 101 1,780 (69) 65\n\n7,391 349 7,805\n\n449 87\n\n\u2014 \u2014 232 184 1,868 1,018 \u2014\n\nUBER TECHNOLOGIES, INC.\n\nNOTES TO CONSOLIDATED FINANCIAL STATEMENTS\n\nNote 1 \u2013 Description of Business and Summary of Significant Accounting Policies\n\nDescription of Business",
- "metadata": {
- "version": "v0",
- "chunk_order": 591,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 83,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- }
- ],
- "graph_search_results": null
- }
- }
- },
- "hybrid_rag": {
- "results": {
- "completion": {
- "id": "chatcmpl-A9ggSfpp9sr3LZdfzBYse7JjT8eCK",
- "choices": [
- {
- "finish_reason": "stop",
- "index": 0,
- "logprobs": null,
- "message": {
- "content": "Jon Snow is Ned Stark\u2019s bastard son. Since Catelyn is not his mother, he is not a proper member of the Stark family, and he often feels himself an outsider. He is also a highly capable swordsman and thinker, with a knack for piercing observations [1].",
- "refusal": null,
- "role": "assistant",
- "function_call": null,
- "tool_calls": null
- }
- }
- ],
- "created": 1726872392,
- "model": "gpt-4o-2024-05-13",
- "object": "chat.completion",
- "service_tier": null,
- "system_fingerprint": "fp_3537616b13",
- "usage": {
- "completion_tokens": 57,
- "prompt_tokens": 1810,
- "total_tokens": 1867,
- "completion_tokens_details": {
- "reasoning_tokens": 0
- }
- }
- },
- "search_results": {
- "vector_search_results": [
- {
- "fragment_id": "7cbdab86-1689-5779-81bd-62f7eb3ab36d",
- "chunk_id": "866f85a0-b3d6-5fc5-9ca0-dbd2373eac58",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.26240772008895874,
- "text": "Eddard (Ned) Stark\n\nThe Lord of Winterfell and new Hand of the King. A devoted father and dutiful lord, he is best characterized by his strong sense of honor, and he strives to always do what is right, regardless of his personal feelings.\n\nCatelyn (Cat) Tully\n\nNed\u2019s wife and Lady Stark of Winterfell. She is intelligent, strong, and fiercely devoted to her family, leading her to seek out the person responsible for trying to kill her son Bran.\n\nDaenerys Stormborn Targaryen\n\nThe Dothraki khaleesi (queen) and Targaryen princess. She and her brother are the only surviving members of the Targaryen family, and she grows from a frightened girl to a confident ruler, while still maintaining her kindness, over the course of the novel.\n\nJon Snow\n\nNed Stark\u2019s bastard son. Since Catelyn is not his mother, he is not a proper member of the Stark family, and he often feels himself an outsider. He is also a highly capable swordsman and thinker, with a knack for piercing observations.",
- "metadata": {
- "version": "v0",
- "chunk_order": 0,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "8e563fb1-4665-53a8-8a83-63a1f88e2aea",
- "chunk_id": "f6bc23b5-bc80-5e49-9b55-25e9abe97073",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.2610799748027318,
- "text": "Jeor Mormont (Commander Mormont)\n\nLord Commander of the Night\u2019s Watch at Castle Black. Commander Mormont is tough, old, and wise, and his men call him \u201cThe Old Bear.\u201d\n\nMaester Aemon\n\nThe chief man of learning at Castle Black. Despite his blind white eyes, Maester Aemon sees and speaks the truth in cryptic ways. Though few people realize it, Aemon is one of the few surviving members of the Targaryen family, but he has always put his vows to the Night\u2019s Watch ahead of any family loyalties.\n\nSamwell (Sam) Tarly\n\nA new recruit to the Night\u2019s Watch who is fat and cowardly but very smart. Sam loves to read and eat but hates to fight, and he quickly becomes one of Jon Snow\u2019s closest companions at the Wall.\n\nSer Allister Thorne\n\nCastle Black\u2019s resentful master-at-arms. He hard on the new recruits to the Night\u2019s Watch and seems to enjoy making them suffer, causing Jon to rebel against him. During Robert\u2019s rebellion against the former king, he was a Targaryen loyalist.",
- "metadata": {
- "version": "v0",
- "chunk_order": 7,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "eaf48cfe-592e-55fa-9f07-613a4f221c45",
- "chunk_id": "fdf6127b-e623-58bc-a50b-b7e7b040c03a",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.2572833588977643,
- "text": "Varys (The Spider)\n\nThe Red Keep\u2019s master of whispers and a eunuch. His role in the court is to run a network of spies and keep the king informed, and he often uses what he knows to manipulate those around him, including the king.\n\nRobert Baratheon\n\nThe corpulent king of Westeros. He loves to fight, drink, and sleep with women, and he hates the duties of ruling. He and Ned are long-time friends, and he was engaged to Ned\u2019s sister until she died.\n\nSer Jorah Mormont\n\nAn exiled knight who serves unofficially as Daenerys\u2019s chief advisor. Though he was exiled by Ned Stark for selling slaves, he is intelligent, valiant, and a great fighter. He swears allegiance to Viserys as true king of Westeros, but he also feeds information about the Targaryens back to Varys.",
- "metadata": {
- "version": "v0",
- "chunk_order": 3,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "3cb1c2db-01e4-5ea8-a39f-31f5949637f8",
- "chunk_id": "02b64e7c-5aa5-5380-8fa0-3d8b64866aa8",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.20903720205800558,
- "text": "Aerys II Targaryen\n\nKing of Westeros before Robert Baratheon. He was known as The Mad King because of his cruelty. Aerys murdered Ned\u2019s older brother, Brandon Stark, in the Red Keep\u2019s throne room. At the end of the war that followed, Jaime Lannister slew Aerys in the same room.\n\nRhaegar Targaryen\n\nThe heir to Aerys and older brother of Daenerys and Viserys. Rhaegar kidnapped Lyanna Stark, Robert\u2019s betrothed, helping to set in motion the events that led to Robert\u2019s Rebellion. The war effectively ended when Robert slew Rhaegar with his warhammer on the Trident River.\n\nJon Arryn\n\nThe recently deceased Lord of the Eyrie and Hand of the King. Jon Arryn fostered Ned Stark and Robert Baratheon at the Eyrie. When Robert became king, Jon Arryn served as his Hand until his murder.",
- "metadata": {
- "version": "v0",
- "chunk_order": 10,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "ac15f806-8723-5fe7-832d-ed0427bd3550",
- "chunk_id": "416b07ed-cdd6-51fd-8f54-4164c0160860",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.19556865096092224,
- "text": "\u201cSanctions\u201d means economic or financial sanctions or trade embargoes imposed, administered or enforced from time to time by (a)\n\nthe U.S. government, including those administered by the Office of Foreign Assets Control of the U.S. Department of the Treasury or the U.S. Department of State, or (b) the United Nations Security Council, the European Union, any European Union member state, Her Majesty\u2019s Treasury of the United Kingdom or other relevant sanctions authority.\n\n\u201cSARON\u201d means, with respect to any Business Day, a rate per annum equal to the Swiss Average Rate Overnight for such Business\n\nDay published by the SARON Administrator on the SARON Administrator\u2019s Website.\n\n\u201cSARON Administrator\u201d means the SIX Swiss Exchange AG (or any successor administrator of the Swiss Average Rate",
- "metadata": {
- "version": "v0",
- "chunk_order": 1221,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 208,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "5d09c80f-ba83-5204-a6b3-f08831e150b0",
- "chunk_id": "f86a905c-8d82-52ff-ad72-a800ca3af6f4",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.19119779765605927,
- "text": "Illyrio Mopatis\n\nAn obese merchant from the Free Cities who helps Daenerys and Viserys Targaryen. Illyrio is very rich and very well-informed. He is quick to please, especially when there is a possibility that his kindness will help him avoid trouble or gain greater fortune in the future.\n\nSer Barristan Selmy\n\nLord Commander of the Kingsguard. He has served kings Jaehaerys, Aerys II, and Robert. Though he has grown old, Barristan \u201cThe Bold\u201d is a formidable fighter. He is, and has always been, an honorable knight.\n\nRenly Baratheon\n\nThe youngest of the three Baratheon brothers. Renly is lighthearted and opportunistic, and unexpectedly ambitious. He serves on Robert\u2019s royal council.\n\nStannis Baratheon\n\nThe middle brother of the three Baratheons. Stannis does not appear in A Game of Thrones, but as the brother of the king, he is a potential heir to the throne. Stannis does not seem to be well-liked.",
- "metadata": {
- "version": "v0",
- "chunk_order": 8,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "b47050a2-9906-5922-b6d4-52e4dedb499f",
- "chunk_id": "d436c7f7-d7c7-509e-a383-94a94360e601",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.18726881596221867,
- "text": "121",
- "metadata": {
- "version": "v0",
- "chunk_order": 1719,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 293,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "d143b410-f9a4-5f3b-bb46-fb412eda8201",
- "chunk_id": "37f940e2-18f4-50f5-93aa-cec422fc9211",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.18609081208705902,
- "text": "Sandor (The Hound) Clegane\n\nPrince Joff\u2019s unofficial bodyguard. Proud that he is not a knight, The Hound appears to have no scruples whatsoever and does what Joffrey orders, however cruel or unjust, without question. His face is scarred on one side by extensive burning inflicted by his brother, Gregor.\n\nRobb Stark\n\nThe eldest Stark son and thus heir to Ned Stark. Though just fourteen, he is mature beyond his age as well as being brave and dutiful like his father.\n\nMaester Luwin\n\nCounselor to Ned, Catelyn, and Robb. Luwin is old and wise, and his advice proves indispensible to the Starks.\n\nTheon Greyjoy\n\nThe Starks\u2019s ward and Robb\u2019s best friend. Ned Stark took the young Theon, now nineteen, as a ward after putting down a rebellion led by the Greyjoy family, and Theon consequently grew up with the Stark children as something like a brother.",
- "metadata": {
- "version": "v0",
- "chunk_order": 5,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "09f57c70-e7c6-548b-897f-fb8e9aba31c8",
- "chunk_id": "a021aa95-14d9-5301-9252-b06bcb852956",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.17496788948965758,
- "text": "Shared and Shared Saver Rides enables unrelated parties traveling along similar routes to benefit from a discounted fare at the cost of possibly longer travel times. With a Shared or Shared Saver Ride, when the first rider requests a ride, our algorithms use the first rider\u2019s destination and attempt to match them with other riders traveling along a similar route. If a match between riders is made, our algorithms re-route the driver to include the pick-up location of the matched rider on the active route. For Shared and Shared Saver Rides, drivers earn a fixed amount based on a number of factors, including the time and distance of the ride, the base fare charged to riders and the level of rider demand. We determine the rider fare based on the predicted time and distance of the ride, the level of rider demand and the likelihood of being able to match additional riders along the given route, and such fare is quoted to the riders prior to their commitment to the ride. The fare charged to the riders is decoupled",
- "metadata": {
- "version": "v0",
- "chunk_order": 276,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 36,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "fragment_id": "dbd5427c-f5ef-5fa6-83ae-a4a8ddbb48c2",
- "chunk_id": "14b08757-0819-5105-af37-509686dd6d01",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [],
- "score": 0.16958434879779816,
- "text": "s, drivers, and the communities they serve.",
- "metadata": {
- "version": "v0",
- "chunk_order": 77,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 13,
- "partitioned_by_unstructured": true,
- "unstructured_is_continuation": true,
- "associated_query": "Who is Jon Snow?"
- }
- }
- ],
- "graph_search_results": null
- }
- }
- },
- "streaming_rag": {
- "results": {
- "completion": {
- "choices": [
- {
- "message": {
- "content": "[{\"fragment_id\": \"94684f2d-fe60-5ba3-b1e8-0a921841bac9\", \"chunk_id\": \"fde39a49-00fc-5622-addd-13eb9c3bad4b\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6675721804840684, \"text\": \"Total Stockholders\\u2019 Equity (Deficit) 1,676,163\\n\\n5,184\\n\\n\\u2014\\n\\n(26,298)\\n\\n28,637\\n\\n(1) 721,710 (2,038) (1,009,359) 1,393,998\\n\\nLyft, Inc. Consolidated Statements of Cash Flows (in thousands)\\n\\n2021\\n\\nCash flows from operating activities Net loss Adjustments to reconcile net loss to net cash used in operating activities\\n\\n$\\n\\n(1,009,359)\\n\\nDepreciation and amortization Stock-based compensation Amortization of premium on marketable securities Accretion of discount on marketable securities Amortization of debt discount and issuance costs Deferred income tax from convertible senior notes Loss on sale and disposal of assets, net Gain on divestiture Other Changes in operating assets and liabilities, net effects of acquisition\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 572, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 82, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"0d5c5803-8846-59d4-8ae3-3696b718f162\", \"chunk_id\": \"fa3d2549-593a-5a80-88a2-b2d031d79771\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6647443571534382, \"text\": \"79\\n\\n2019 3,615,960\\n\\n2,176,469 636,116 1,505,640 814,122 1,186,093 6,318,440 (2,702,480) \\u2014 102,595 (2,599,885) 2,356 (2,602,241)\\n\\n(11.44)\\n\\n227,498\\n\\n81,321 75,212 971,941 72,046 398,791\\n\\nLyft, Inc. Consolidated Statements of Comprehensive Loss (in thousands)\\n\\nNet loss Other comprehensive income (loss)\\n\\n$\\n\\nYear Ended December 31, 2020 (1,752,857) $\\n\\n2021 (1,009,359) $\\n\\nForeign currency translation adjustment Unrealized gain (loss) on marketable securities, net of taxes\\n\\nOther comprehensive income (loss)\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 567, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 79, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"5d663f0a-c9a9-580f-818b-5ca0a1ca73f2\", \"chunk_id\": \"12916d2c-0691-528c-86aa-6784c1f35c55\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6628832616140414, \"text\": \"Overview\\n\\nLyft, Inc (the \\u201cCompany\\u201d or \\u201cLyft\\u201d) started a movement to revolutionize transportation. In 2012, we launched our peer-to-peer marketplace for on-demand ridesharing and have continued to pioneer innovations aligned with our mission. 
Today, Lyft is one of the largest multimodal transportation networks in the United States and Canada.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 16, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 5, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"7133acca-f147-5f43-b2e0-71228282fda0\", \"chunk_id\": \"212d8290-564d-5039-93cc-00cea31a1771\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6603014862717096, \"text\": \"Revenues from Contracts with Customers (ASC 606)\\n\\nWe generate substantially all our revenue from our ridesharing marketplace that connects drivers and riders. We recognize revenue from fees paid by drivers for use of our Lyft Platform offerings in accordance with ASC 606 as described in Note 2 of the notes to our consolidated financial statements. Drivers enter into terms of service (\\u201cToS\\u201d) with us in order to use our Lyft Driver App.\\n\\n58\\n\\n2019 to 2020 % Change\\n\\n19.0% (1.8)% (6.7)% 2.3%\\n\\nWe provide a service to drivers to complete a successful transportation service for riders. This service includes on-demand lead generation that assists drivers to find, receive and fulfill on-demand requests from riders seeking transportation services and related collection activities using our Lyft Platform. As a result, our single performance obligation in the transaction is to connect drivers with riders to facilitate the completion of a successful transportation service for riders.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 459, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 58, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"1a76991f-fa85-59b2-b522-700d47b2d809\", \"chunk_id\": \"0773cd62-b39f-517f-b6f1-be788b38374d\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6477107388714619, \"text\": \"Corporate Information\\n\\nWe were incorporated in 2007 as Bounder Web, Inc., a Delaware corporation. In 2008, we changed our name to Zimride, Inc. We founded Lyft in 2012 and\\n\\nchanged our name to Lyft, Inc. in 2013 when we sold the assets related to our Zimride operations.\\n\\n13\\n\\nAvailable Information\\n\\nOur website is located at www.lyft.com, and our investor relations website is located at investor.lyft.com. Copies of our Annual Report on Form 10-K, Quarterly Reports on Form 10-Q, Current Reports on Form 8-K and amendments to these reports filed or furnished pursuant to Section 13(a) or 15(d) of the Exchange Act, as amended, are available free of charge on our investor relations website as soon as reasonably practicable after we file such material electronically with or furnish it to the Securities and Exchange Commission (the \\u201cSEC\\u201d). 
The SEC also maintains a website that contains our SEC filings at www.sec.gov.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 82, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 13, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"17673edc-6fb7-577d-9bca-457c5745382d\", \"chunk_id\": \"bde94416-baaa-573a-9bc7-86ddf28535b1\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6406270265579269, \"text\": \"We generate substantially all of our revenue from our ridesharing marketplace that connects drivers and riders. We collect service fees and commissions from drivers for their use of our ridesharing marketplace. As drivers accept more rider leads and complete more rides, we earn more revenue. We also generate revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by making our ridesharing marketplace available to organizations through our Lyft Business offerings, such as our Concierge and Corporate Business Travel programs. In the second quarter of 2021, we began generating revenues from licensing and data access agreements, primarily with third-party autonomous vehicle companies.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 20, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 5, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"8f6299b7-4582-5bac-8c74-7ca57714aefa\", \"chunk_id\": \"310e9e1f-25d3-5287-a905-5446f661d6da\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6394975757149678, \"text\": \"Revenue Recognition\\n\\nThe Company generates its revenue from its multimodal transportation networks that offer access to a variety of transportation options through the Lyft Platform and mobile-based applications. Substantially all of the Company\\u2019s revenue is generated from its ridesharing marketplace that connects drivers and riders and is recognized in accordance with Accounting Standards Codification Topic 606 (\\u201cASC 606\\u201d). In addition, the Company generates revenue in accordance with ASC 606 from licensing and data access, primarily with third-party autonomous vehicle companies. 
The Company also generates rental revenue from Flexdrive, its network of Light Vehicles and Lyft Rentals, which is recognized in accordance with Accounting Standards Codification Topic 842 (\\u201cASC 842\\u201d).\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 591, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 86, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"ff837ea0-0062-59ca-bb4f-aa7a1c9cecd0\", \"chunk_id\": \"41e4db8a-0478-5015-8263-cde0618ec626\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6308713775353904, \"text\": \"Light Vehicle Rider and Lyft Rentals Renter Incentives\\n\\nIncentives offered to Light Vehicle riders and Lyft Rentals renters were not material for the years ended December 31, 2021 and 2020.\\n\\nFor the years ended December 31, 2021, 2020 and 2019, in relation to the driver, rider, Light Vehicle riders and Lyft Rentals renters incentive programs, the Company recorded $1.3 billion, $390.8 million and $560.3 million as a reduction to revenue and $64.7 million, $135.0 million and $381.5 million as sales and marketing expense, respectively.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 611, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 89, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"c24d144b-c22d-5c08-876f-a03e43620aa4\", \"chunk_id\": \"2a4caab0-6193-5263-8eab-c7763e8f38e8\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6295160430381335, \"text\": \"Software Development Costs\\n\\nThe Company incurs costs related to developing the Lyft Platform and related support systems. The Company capitalizes development costs related to the Lyft Platform and related support systems once the preliminary project stage is complete and it is probable that the project will be completed and the software will be used to perform the function intended. The Company capitalized $16.2 million and $12.8 million of software development costs during the year ended December 31, 2021 and 2020, respectively. For the year ended December 31, 2019, capitalized software development costs was not material.\\n\\nInsurance Reserves\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 649, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 94, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"fragment_id\": \"83985ff5-653f-53eb-b137-f616b4292f51\", \"chunk_id\": \"0919e3d5-03b0-5d54-b5f3-7f6ad4534412\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [], \"score\": 0.6223346365889701, \"text\": \"32.1\\u2020\\n\\nCertifications of Principal Executive Officer and Principal Financial Officer pursuant to 18 U.S.C. 
Section 1350, as adopted pursuant to Section 906 of the Sarbanes-Oxley Act of 2002.\\n\\n101\\n\\nThe following financial information from Lyft, Inc.\\u2019s Annual Report on Form 10-K for the fiscal year ended December 31, 2021 formatted in Inline XBRL (eXtensible Business Reporting Language): (i) Consolidated Statements of Operations for the fiscal years ended December 31, 2021, 2020 and 2019; (ii) Consolidated Statements of Comprehensive Income (Loss) for the fiscal years ended December 31, 2021, 2020, and 2019; (iii) Consolidated Balance Sheets as of December 31, 2021 and 2020; (iv) Consolidated Statements of Cash Flows for the fiscal years ended December 31, 2021, 2020, and 2019; (v) Consolidated Statements of Redeemable Convertible Preferred Stock and Stockholders\\u2019 Equity for the fiscal years ended December 31, 2021, 2020, and 2019; and (vi) Notes to the Consolidated Financial Statements.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 817, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 127, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}]Lyft's profit in 2020 was a net loss of $1,752,857,000 [2]."
- }
- }
- ]
- }
- }
- }
-}
diff --git a/py/tests/regression/expected_outputs/test_user_management.json b/py/tests/regression/expected_outputs/test_user_management.json
deleted file mode 100644
index 174311e69..000000000
--- a/py/tests/regression/expected_outputs/test_user_management.json
+++ /dev/null
@@ -1,165 +0,0 @@
-{
- "register_user": {
- "results": {
- "id": "14466e62-fd0a-5ad3-85b6-6965d7f6d336",
- "email": "test_4f3ba5b7@example.com",
- "is_active": true,
- "is_superuser": false,
- "created_at": "2024-09-20T22:48:31.575356Z",
- "updated_at": "2024-09-20T22:48:31.575356Z",
- "is_verified": false,
- "collection_ids": [],
- "hashed_password": "$2b$12$91hUOrNh1OhZZODomjxa0Oc/hGNUVwxFh45CXXBBOfhQP98y.pvFm",
- "verification_code_expiry": null,
- "name": null,
- "bio": null,
- "profile_picture": null
- }
- },
- "login_user": {
- "results": {
- "access_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzRmM2JhNWI3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI2ODc2MTExLjgzOTksInRva2VuX3R5cGUiOiJhY2Nlc3MifQ.vW27WohcKi9ipcTC04i6rSg42ZH8NmnXFYUgfXcGAw0",
- "token_type": "access"
- },
- "refresh_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzRmM2JhNWI3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI3NDc3MzExLCJ0b2tlbl90eXBlIjoicmVmcmVzaCJ9.NUBrdq0PFM8TbDN4EGEefvke07HMLp0T4W6zUbOxt7w",
- "token_type": "refresh"
- }
- }
- },
- "user_info": {
- "results": {
- "id": "14466e62-fd0a-5ad3-85b6-6965d7f6d336",
- "email": "test_4f3ba5b7@example.com",
- "is_active": true,
- "is_superuser": false,
- "created_at": "2024-09-20T22:48:31.575356Z",
- "updated_at": "2024-09-20T22:48:31.575356Z",
- "is_verified": true,
- "collection_ids": [],
- "hashed_password": "$2b$12$91hUOrNh1OhZZODomjxa0Oc/hGNUVwxFh45CXXBBOfhQP98y.pvFm",
- "verification_code_expiry": null,
- "name": null,
- "bio": null,
- "profile_picture": null
- }
- },
- "change_password": {
- "results": {
- "message": "Password changed successfully"
- }
- },
- "update_profile": {
- "results": {
- "id": "14466e62-fd0a-5ad3-85b6-6965d7f6d336",
- "email": "test_4f3ba5b7@example.com",
- "is_active": true,
- "is_superuser": false,
- "created_at": "2024-09-20T22:48:31.575356Z",
- "updated_at": "2024-09-20T22:48:32.394281Z",
- "is_verified": true,
- "collection_ids": [],
- "hashed_password": null,
- "verification_code_expiry": null,
- "name": "John Doe",
- "bio": "R2R enthusiast",
- "profile_picture": null
- }
- },
- "refresh_token": {
- "results": {
- "access_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzRmM2JhNWI3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI2ODc2MTEyLjQyMjYzNywidG9rZW5fdHlwZSI6ImFjY2VzcyJ9.09C6tAMd3WhrEmsqPdFsWx7lxsp0abGSdWGessddSw0",
- "token_type": "access"
- },
- "refresh_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzRmM2JhNWI3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI3NDc3MzEyLCJ0b2tlbl90eXBlIjoicmVmcmVzaCJ9.yrm_ncSzAScPpdlqYHD2WWhy1AcwuO9CGHcrNLrTNGY",
- "token_type": "refresh"
- }
- }
- },
- "superuser_test": {
- "results": [
- {
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "email": "admin@example.com",
- "is_superuser": true,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-09-20T22:26:10.261930Z",
- "updated_at": "2024-09-20T22:26:10.261930Z",
- "collection_ids": [],
- "num_files": 15,
- "total_size_in_bytes": 8767226,
- "document_ids": [
- "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "d421207a-d799-5806-8d67-46b2005b15d4",
- "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
- "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "2f576170-c4f9-5141-a910-a0924f341de4",
- "3e157b3a-8469-51db-90d9-52e7d896b49b"
- ]
- },
- {
- "user_id": "a0a60c19-7672-5537-a71c-07a771bc4fa3",
- "email": "test_40669f1c@example.com",
- "is_superuser": false,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-09-20T22:48:01.453932Z",
- "updated_at": "2024-09-20T22:48:01.453932Z",
- "collection_ids": [],
- "num_files": 0,
- "total_size_in_bytes": 0,
- "document_ids": []
- },
- {
- "user_id": "14466e62-fd0a-5ad3-85b6-6965d7f6d336",
- "email": "test_4f3ba5b7@example.com",
- "is_superuser": false,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-09-20T22:48:31.575356Z",
- "updated_at": "2024-09-20T22:48:32.394281Z",
- "collection_ids": [],
- "num_files": 0,
- "total_size_in_bytes": 0,
- "document_ids": []
- },
- {
- "user_id": "a5e4ef83-b9ae-56e0-952b-4116f487b000",
- "email": "test_c8a21611@example.com",
- "is_superuser": false,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-09-20T22:48:14.196002Z",
- "updated_at": "2024-09-20T22:48:14.196002Z",
- "collection_ids": [],
- "num_files": 0,
- "total_size_in_bytes": 0,
- "document_ids": []
- }
- ],
- "total_entries": 4
- },
- "logout": {
- "results": {
- "message": "Logged out successfully"
- }
- },
- "delete_account": {
- "results": {
- "message": "User account c8c03ba0-6d1d-524c-be39-7fb6e01b4298 deleted successfully."
- }
- }
-}
diff --git a/py/tests/regression/observed_outputs/test_document_management.json b/py/tests/regression/observed_outputs/test_document_management.json
deleted file mode 100644
index adee70843..000000000
--- a/py/tests/regression/observed_outputs/test_document_management.json
+++ /dev/null
@@ -1,1114 +0,0 @@
-{
- "ingest_sample_files": {
- "results": [
- {
- "task_id": "198fb971-1064-4e37-9572-d121efb0b176",
- "message": "Ingestion task queued successfully.",
- "document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
- },
- {
- "task_id": "670b99fb-872f-4da7-8e65-c74f2837a6f7",
- "message": "Ingestion task queued successfully.",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a"
- },
- {
- "task_id": "94b1892a-61e2-4e7c-a7e3-97045dde745a",
- "message": "Ingestion task queued successfully.",
- "document_id": "52e12576-090f-59db-91f4-6d4b2e29ae6c"
- },
- {
- "task_id": "2be661b4-c957-4fa0-9a34-d0a527397a00",
- "message": "Ingestion task queued successfully.",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b"
- },
- {
- "task_id": "ac76f6dc-3108-480c-b0d0-94027f48171c",
- "message": "Ingestion task queued successfully.",
- "document_id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9"
- },
- {
- "task_id": "ed6cdd45-1167-4606-baf8-9b214a0a361a",
- "message": "Ingestion task queued successfully.",
- "document_id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526"
- },
- {
- "task_id": "defd4b64-85ed-4b1b-80f4-70eb794bbd0e",
- "message": "Ingestion task queued successfully.",
- "document_id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9"
- },
- {
- "task_id": "aeabf522-4d93-4ee2-904a-a10a60653d8c",
- "message": "Ingestion task queued successfully.",
- "document_id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5"
- },
- {
- "task_id": "5a1b2e8b-6cd7-4678-9f8a-547d573b116a",
- "message": "Ingestion task queued successfully.",
- "document_id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c"
- },
- {
- "task_id": "ad7c0ae7-160b-494a-8c69-338ecf2e40af",
- "message": "Ingestion task queued successfully.",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4"
- },
- {
- "task_id": "684e00f2-ba98-471d-94c5-ad216d8fcd7e",
- "message": "Ingestion task queued successfully.",
- "document_id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc"
- },
- {
- "task_id": "17da503a-a5eb-48a5-a4e6-05bc10b505c0",
- "message": "Ingestion task queued successfully.",
- "document_id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1"
- },
- {
- "task_id": "afe2476a-143e-4d37-8f84-5f21c8c6ef95",
- "message": "Ingestion task queued successfully.",
- "document_id": "db02076e-989a-59cd-98d5-e24e15a0bd27"
- },
- {
- "task_id": "d43f2806-6028-435c-8347-f36b6d583755",
- "message": "Ingestion task queued successfully.",
- "document_id": "d421207a-d799-5806-8d67-46b2005b15d4"
- },
- {
- "task_id": "decb58f6-72e9-4cbd-9afc-41263924c8b5",
- "message": "Ingestion task queued successfully.",
- "document_id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb"
- },
- {
- "task_id": "4f39ddbf-d54f-4aee-9a02-2e408dd09853",
- "message": "Ingestion task queued successfully.",
- "document_id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c"
- }
- ]
- },
- "reingest_sample_file": {
- "results": [
- {
- "task_id": "f9914db6-ee89-4a9e-9479-4460d08970f8",
- "message": "Ingestion task queued successfully.",
- "document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
- },
- {
- "task_id": "effba8cc-5046-47b3-92e6-283bdfc3d774",
- "message": "Ingestion task queued successfully.",
- "document_id": "716fea3a-826b-5b27-8e59-ffbd1a35455a"
- },
- {
- "task_id": "c819d364-ca3f-422e-bcb8-e73b1f1dd852",
- "message": "Ingestion task queued successfully.",
- "document_id": "52e12576-090f-59db-91f4-6d4b2e29ae6c"
- },
- {
- "task_id": "5e23b793-ac30-466b-a546-d6ab1e540072",
- "message": "Ingestion task queued successfully.",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b"
- },
- {
- "task_id": "b5852f4c-fa9a-4577-968a-f92c6413c373",
- "message": "Ingestion task queued successfully.",
- "document_id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9"
- },
- {
- "task_id": "994c5682-4b70-4437-9d40-64ffbffcf98c",
- "message": "Ingestion task queued successfully.",
- "document_id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526"
- },
- {
- "task_id": "6ba376a6-dbc3-4f0e-ba49-53d8857f19db",
- "message": "Ingestion task queued successfully.",
- "document_id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9"
- },
- {
- "task_id": "024c7668-6d65-4863-8857-15c83ee77d79",
- "message": "Ingestion task queued successfully.",
- "document_id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5"
- },
- {
- "task_id": "4eff848c-c19c-41d0-b165-99ed2b5b8dc6",
- "message": "Ingestion task queued successfully.",
- "document_id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c"
- },
- {
- "task_id": "cd7e79a0-f2f0-4107-bea1-fade73db3e8c",
- "message": "Ingestion task queued successfully.",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4"
- },
- {
- "task_id": "723ee10a-14a0-44ad-9b4e-e4cbbf7866e0",
- "message": "Ingestion task queued successfully.",
- "document_id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc"
- },
- {
- "task_id": "92b12722-7473-4966-ac3f-4d4840c18c90",
- "message": "Ingestion task queued successfully.",
- "document_id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1"
- },
- {
- "task_id": "51b3c064-f2d7-47e6-9473-23466ef89f82",
- "message": "Ingestion task queued successfully.",
- "document_id": "db02076e-989a-59cd-98d5-e24e15a0bd27"
- },
- {
- "task_id": "47c344c0-7f12-4686-93ee-49e9de249658",
- "message": "Ingestion task queued successfully.",
- "document_id": "d421207a-d799-5806-8d67-46b2005b15d4"
- },
- {
- "task_id": "90915451-6cd8-4d99-bdc9-412c88a24087",
- "message": "Ingestion task queued successfully.",
- "document_id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb"
- },
- {
- "task_id": "575a325e-87ec-45ca-b7ec-cf1c42ad0de0",
- "message": "Ingestion task queued successfully.",
- "document_id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c"
- }
- ]
- },
- "documents_overview": {
- "results": [
- {
- "id": "db02076e-989a-59cd-98d5-e24e15a0bd27",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle.txt",
- "version": "v0",
- "size_in_bytes": 97804,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.380069Z",
- "updated_at": "2024-10-03T22:45:59.380071Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "uber_2021.pdf",
- "version": "v0",
- "size_in_bytes": 2507312,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.349783Z",
- "updated_at": "2024-10-03T22:45:59.349785Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "png",
- "metadata": {
- "version": "v0"
- },
- "title": "screen_shot.png",
- "version": "v0",
- "size_in_bytes": 1055688,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.346612Z",
- "updated_at": "2024-10-03T22:45:59.346615Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample.mp3",
- "version": "v0",
- "size_in_bytes": 162228,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.340378Z",
- "updated_at": "2024-10-03T22:45:59.340381Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_3.html",
- "version": "v0",
- "size_in_bytes": 166556,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.317633Z",
- "updated_at": "2024-10-03T22:45:59.317635Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_4.html",
- "version": "v0",
- "size_in_bytes": 157484,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.303531Z",
- "updated_at": "2024-10-03T22:45:59.303532Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "yc_companies.txt",
- "version": "v0",
- "size_in_bytes": 62948,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.303007Z",
- "updated_at": "2024-10-03T22:45:59.303009Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample2.mp3",
- "version": "v0",
- "size_in_bytes": 96608,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.301022Z",
- "updated_at": "2024-10-03T22:45:59.301027Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_2.html",
- "version": "v0",
- "size_in_bytes": 166816,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.298646Z",
- "updated_at": "2024-10-03T22:45:59.298647Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_1.html",
- "version": "v0",
- "size_in_bytes": 175340,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.297885Z",
- "updated_at": "2024-10-03T22:45:59.297886Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "graphrag.pdf",
- "version": "v0",
- "size_in_bytes": 2287544,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.296971Z",
- "updated_at": "2024-10-03T22:45:59.296972Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle_v2.txt",
- "version": "v0",
- "size_in_bytes": 3380,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.296497Z",
- "updated_at": "2024-10-03T22:45:59.296499Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "lyft_2021.pdf",
- "version": "v0",
- "size_in_bytes": 1920404,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.294005Z",
- "updated_at": "2024-10-03T22:45:59.294006Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_5.html",
- "version": "v0",
- "size_in_bytes": 165040,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.292811Z",
- "updated_at": "2024-10-03T22:45:59.292813Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "got.txt",
- "version": "v0",
- "size_in_bytes": 12656,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.289998Z",
- "updated_at": "2024-10-03T22:45:59.290001Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "test.txt",
- "version": "v0",
- "size_in_bytes": 28,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T21:49:17.961611Z",
- "updated_at": "2024-10-03T22:45:59.376115Z",
- "ingestion_attempt_number": null
- }
- ],
- "total_entries": 16
- },
- "document_chunks_test": {
- "results": [
- {
- "chunk_id": "bcd08cd0-1551-5ee2-ad08-551ae15e5ed1",
- "document_id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "text": "\n\nA Project of One's Own\n\nJune 2021A few days ago, on the way home from school, my nine year old son\ntold me he couldn't wait to get home to write more of the story he\nwas working on. This made me as happy as anything I've heard him\nsay \u2014 not just because he was excited about his story, but because\nhe'd discovered this way of working. Working on a project of your\nown is as different from ordinary work as skating is from walking.\nIt's more fun, but also much more productive.What proportion of great work has been done by people who were\nskating in this sense? If not all of it, certainly a lot.There is something special about working on a project of your own.\nI wouldn't say exactly that you're happier. A better word would be\nexcited, or engaged. You're happy when things are going well, but\noften they aren't. When I'm writing an essay, most of the time I'm\nworried and puzzled: worried that the essay will turn out badly,\nand puzzled because I'm groping for some idea that I can't see\nclearly enough. Will I be able to pin it down with words? In the\nend I usually can, if I take long enough, but I'm never sure; the\nfirst few attempts often fail.You have moments of happiness when things work out, but they don't\nlast long, because then you're on to the next problem. So why do\nit at all? Because to the kind of people who like working this way,\nnothing else feels as right. You feel as if you're an animal in its\nnatural habitat, doing what you were meant to do \u2014 not always\nhappy, maybe, but awake and alive.Many kids experience the excitement of working on projects of their\nown. The hard part is making this converge with the work you do as\nan adult. And our customs make it harder. We treat \"playing\" and\n\"hobbies\" as qualitatively different from \"work\". It's not clear\nto a kid building a treehouse that there's a direct (though long)\nroute from that to architecture or engineering. And instead of\npointing out the route, we conceal it, by implicitly treating the\nstuff kids do as different from real work.\n[1]Instead of telling kids that their treehouses could be on the path\nto the work they do as adults, we tell them the path goes through\nschool. And unfortunately schoolwork tends to be very different from\nworking on projects of one's own. It's usually neither a project,\nnor one's own. So as school gets more serious, working on projects\nof one's own is something that survives, if at all, as a thin thread\noff to the side.It's a bit sad to think of all the high school kids turning their\nbacks on building treehouses and sitting in class dutifully learning\nabout Darwin or Newton to pass some exam, when the work that made\nDarwin and Newton famous was actually closer in spirit to building\ntreehouses than studying for exams.If I had to choose between my kids getting good grades and\nworking on ambitious projects of their own, I'd pick\nthe projects. And not because I'm an indulgent parent, but because\nI've been on the other end and I know which has more predictive\nvalue. When I was picking startups for Y Combinator, I didn't care\nabout applicants' grades. But if they'd worked on projects of their\nown, I wanted to hear all about those.\n[2]It may be inevitable that school is the way it is. 
I'm not saying\nwe have to redesign it (though I'm not saying we don't), just that\nwe should understand what it does to our attitudes to work \u2014 that\nit steers us toward the dutiful plodding kind of work, often using\ncompetition as bait, and away from skating.There are occasionally times when schoolwork becomes a project of\none's own. Whenever I had to write a paper, that would become a\nproject of my own \u2014 except in English classes, ironically, because\nthe things one has to write in English classes are so\nbogus. And\nwhen I got to college and started taking CS classes, the programs\nI had to write became projects of my own. Whenever I was writing\nor programming, I was usually skating, and that has been true ever\nsince.So where exactly is the edge of projects of one's own? That's an\ninteresting question, partly because the answer is so complicated,\nand partly because there's so much at stake. There turn out to be\ntwo senses in which work can be one's own: 1) that you're doing it\nvoluntarily, rather than merely because someone told you to, and\n2) that you're doing it by yourself.The edge of the former is quite sharp. People who care a lot about\ntheir work are usually very sensitive to the difference between\npulling, and being pushed, and work tends to fall into one category\nor the other. But the test isn't simply whether you're told to do\nsomething. You can choose to do something you're told to do. Indeed,\nyou can own it far more thoroughly than the person who told you to\ndo it.For example, math homework is for most people something they're\ntold to do. But for my father, who was a mathematician, it wasn't.\nMost of us think of the problems in a math book as a way to test\nor develop our knowledge of the material explained in each section.\nBut to my father the problems were the part that mattered, and the\ntext was merely a sort of annotation. Whenever he got a new math\nbook it was to him like being given a puzzle: here was a new set\nof problems to solve, and he'd immediately set about solving all\nof them.The other sense of a project being one's own \u2014 working on it by\noneself \u2014 has a much softer edge. It shades gradually into\ncollaboration. And interestingly, it shades into collaboration in\ntwo different ways. One way to collaborate is to share a single\nproject. For example, when two mathematicians collaborate on a proof\nthat takes shape in the course of a conversation between them. The\nother way is when multiple people work on separate projects of their\nown that fit together like a jigsaw puzzle. For example, when one\nperson writes the text of a book and another does the graphic design.\n[3]These two paths into collaboration can of course be combined. But\nunder the right conditions, the excitement of working on a project\nof one's own can be preserved for quite a while before disintegrating\ninto the turbulent flow of work in a large organization. Indeed,\nthe history of successful organizations is partly the history of\ntechniques for preserving that excitement.\n[4]The team that made the original Macintosh were a great example of\nthis phenomenon. People like Burrell Smith and Andy Hertzfeld and\nBill Atkinson and Susan Kare were not just following orders. They\nwere not tennis balls hit by Steve Jobs, but rockets let loose by\nSteve Jobs. 
There was a lot of collaboration between them, but\nthey all seem to have individually felt the excitement of\nworking on a project of one's own.In Andy Hertzfeld's book on the Macintosh, he describes how they'd\ncome back into the office after dinner and work late into the night.\nPeople who've never experienced the thrill of working on a project\nthey're excited about can't distinguish this kind of working long\nhours from the kind that happens in sweatshops and boiler rooms,\nbut they're at opposite ends of the spectrum. That's why it's a\nmistake to insist dogmatically on \"work/life balance.\" Indeed, the\nmere expression \"work/life\" embodies a mistake: it assumes work and\nlife are distinct. For those to whom the word \"work\" automatically\nimplies the dutiful plodding kind, they are. But for the skaters,\nthe relationship between work and life would be better represented\nby a dash than a slash. I wouldn't want to work on anything that I didn't\nwant to take over my life.Of course, it's easier to achieve this level of motivation when\nyou're making something like the Macintosh. It's easy for something\nnew to feel like a project of your own. That's one of the reasons\nfor the tendency programmers have to rewrite things that don't need\nrewriting, and to write their own versions of things that already\nexist. This sometimes alarms managers, and measured by total number\nof characters typed, it's rarely the optimal solution. But it's not\nalways driven simply by arrogance or cluelessness.\nWriting code from scratch is also much more rewarding \u2014 so much\nmore rewarding that a good programmer can end up net ahead, despite\nthe shocking waste of characters. Indeed, it may be one of the\nadvantages of capitalism that it encourages such rewriting. A company\nthat needs software to do something can't use the software already\nwritten to do it at another company, and thus has to write their\nown, which often turns out better.\n[5]The natural alignment between skating and solving new problems is\none of the reasons the payoffs from startups are so high. Not only\nis the market price of unsolved problems higher, you also get a\ndiscount on productivity when you work on them. In fact, you get a\ndouble increase in productivity: when you're doing a clean-sheet\ndesign, it's easier to recruit skaters, and they get to spend all\ntheir time skating.Steve Jobs knew a thing or two about skaters from having watched\nSteve Wozniak. If you can find the right people, you only have to\ntell them what to do at the highest level. They'll handle the\ndetails. Indeed, they insist on it. For a project to feel like your\nown, you must have sufficient autonomy. You can't be working to\norder, or slowed down\nby bureaucracy.One way to ensure autonomy is not to have a boss at all. There are\ntwo ways to do that: to be the boss yourself, and to work on projects\noutside of work. Though they're at opposite ends of the scale\nfinancially, startups and open source projects have a lot in common,\nincluding the fact that they're often run by skaters. And indeed,\nthere's a wormhole from one end of the scale to the other: one of\nthe best ways to discover\nstartup ideas is to work on a project\njust for fun.If your projects are the kind that make money, it's easy to work\non them. It's harder when they're not. And the hardest part, usually,\nis morale. That's where adults have it harder than kids. 
Kids just\nplunge in and build their treehouse without worrying about whether\nthey're wasting their time, or how it compares to other treehouses.\nAnd frankly we could learn a lot from kids here. The high standards\nmost grownups have for \"real\" work do not always serve us well.The most important phase in a project of one's own is at the\nbeginning: when you go from thinking it might be cool to do x to\nactually doing x. And at that point high standards are not merely\nuseless but positively harmful. There are a few people who start\ntoo many new projects, but far more, I suspect, who are deterred\nby fear of failure from starting projects that would have succeeded\nif they had.But if we couldn't benefit as kids from the knowledge that our\ntreehouses were on the path to grownup projects, we can at least\nbenefit as grownups from knowing that our projects are on a path\nthat stretches back to treehouses. Remember that careless confidence\nyou had as a kid when starting something new? That would be a\npowerful thing to recapture.If it's harder as adults to retain that kind of confidence, we at\nleast tend to be more aware of what we're doing. Kids bounce, or\nare herded, from one kind of work to the next, barely realizing\nwhat's happening to them. Whereas we know more about different types\nof work and have more control over which we do. Ideally we can have\nthe best of both worlds: to be deliberate in choosing to work on\nprojects of our own, and carelessly confident in starting new ones.\nNotes[1]\n\"Hobby\" is a curious word. Now it means work that isn't real\nwork \u2014 work that one is not to be judged by \u2014 but originally it just\nmeant an obsession in a fairly general sense (even a political\nopinion, for example) that one metaphorically rode as a child rides\na hobby-horse. It's hard to say if its recent, narrower meaning is\na change for the better or the worse. For sure there are lots of\nfalse positives \u2014 lots of projects that end up being important but\nare dismissed initially as mere hobbies. But on the other hand, the\nconcept provides valuable cover for projects in the early, ugly\nduckling phase.[2]\nTiger parents, as parents so often do, are fighting the last\nwar. Grades mattered more in the old days when the route to success\nwas to acquire\ncredentials\nwhile ascending some predefined ladder.\nBut it's just as well that their tactics are focused on grades. How\nawful it would be if they invaded the territory of projects, and\nthereby gave their kids a distaste for this kind of work by forcing\nthem to do it. Grades are already a grim, fake world, and aren't\nharmed much by parental interference, but working on one's own\nprojects is a more delicate, private thing that could be damaged\nvery easily.[3]\nThe complicated, gradual edge between working on one's own\nprojects and collaborating with others is one reason there is so\nmuch disagreement about the idea of the \"lone genius.\" In practice\npeople collaborate (or not) in all kinds of different ways, but the\nidea of the lone genius is definitely not a myth. There's a core\nof truth to it that goes with a certain way of working.[4]\nCollaboration is powerful too. The optimal organization would\ncombine collaboration and ownership in such a way as to do the least\ndamage to each. 
Interestingly, companies and university departments\napproach this ideal from opposite directions: companies insist on\ncollaboration, and occasionally also manage both to recruit skaters\nand allow them to skate, and university departments insist on the\nability to do independent research (which is by custom treated as\nskating, whether it is or not), and the people they hire collaborate\nas much as they choose.[5]\nIf a company could design its software in such a way that the\nbest newly arrived programmers always got a clean sheet, it could\nhave a kind of eternal youth. That might not be impossible. If you\nhad a software backbone defining a game with sufficiently clear\nrules, individual programmers could write their own players.\nThanks to Trevor Blackwell, Paul Buchheit, Andy Hertzfeld, Jessica\nLivingston, and Peter Norvig for reading drafts of this.\n\n\n \n\n\n\n \n\n",
- "metadata": {
- "version": "v0",
- "chunk_id": 0,
- "chunk_order": 0,
- "document_type": "html",
- "partitioned_by_unstructured": true
- }
- }
- ],
- "total_entries": 1
- },
- "update_document_test": {
- "results": {
- "task_id": "6d6bb051-88c0-46dd-a0cd-8e9739cab04d",
- "message": "Update task queued successfully.",
- "document_ids": [
- "9fbe403b-c11c-5aae-8ade-ef22980c3ad1"
- ]
- }
- },
- "rerun_documents_overview_test_1": {
- "results": [
- {
- "id": "db02076e-989a-59cd-98d5-e24e15a0bd27",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle.txt",
- "version": "v0",
- "size_in_bytes": 97804,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.380069Z",
- "updated_at": "2024-10-03T22:45:59.380071Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "uber_2021.pdf",
- "version": "v0",
- "size_in_bytes": 2507312,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.349783Z",
- "updated_at": "2024-10-03T22:45:59.349785Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "png",
- "metadata": {
- "version": "v0"
- },
- "title": "screen_shot.png",
- "version": "v0",
- "size_in_bytes": 1055688,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.346612Z",
- "updated_at": "2024-10-03T22:45:59.346615Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample.mp3",
- "version": "v0",
- "size_in_bytes": 162228,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.340378Z",
- "updated_at": "2024-10-03T22:45:59.340381Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_3.html",
- "version": "v0",
- "size_in_bytes": 166556,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.317633Z",
- "updated_at": "2024-10-03T22:45:59.317635Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_4.html",
- "version": "v0",
- "size_in_bytes": 157484,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.303531Z",
- "updated_at": "2024-10-03T22:45:59.303532Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "yc_companies.txt",
- "version": "v0",
- "size_in_bytes": 62948,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.303007Z",
- "updated_at": "2024-10-03T22:45:59.303009Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample2.mp3",
- "version": "v0",
- "size_in_bytes": 96608,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.301022Z",
- "updated_at": "2024-10-03T22:45:59.301027Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_2.html",
- "version": "v0",
- "size_in_bytes": 166816,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.298646Z",
- "updated_at": "2024-10-03T22:45:59.298647Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_1.html",
- "version": "v0",
- "size_in_bytes": 175340,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.297885Z",
- "updated_at": "2024-10-03T22:45:59.297886Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "graphrag.pdf",
- "version": "v0",
- "size_in_bytes": 2287544,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.296971Z",
- "updated_at": "2024-10-03T22:45:59.296972Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle_v2.txt",
- "version": "v0",
- "size_in_bytes": 3380,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.296497Z",
- "updated_at": "2024-10-03T22:45:59.296499Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "lyft_2021.pdf",
- "version": "v0",
- "size_in_bytes": 1920404,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.294005Z",
- "updated_at": "2024-10-03T22:45:59.294006Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_5.html",
- "version": "v0",
- "size_in_bytes": 165040,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.292811Z",
- "updated_at": "2024-10-03T22:45:59.292813Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "got.txt",
- "version": "v0",
- "size_in_bytes": 12656,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.289998Z",
- "updated_at": "2024-10-03T22:45:59.290001Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "test.txt",
- "version": "v0",
- "size_in_bytes": 28,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T21:49:17.961611Z",
- "updated_at": "2024-10-03T22:45:59.376115Z",
- "ingestion_attempt_number": null
- }
- ],
- "total_entries": 16
- },
- "delete_document_test": {
- "results": {}
- },
- "rerun_documents_overview_test_2": {
- "results": [
- {
- "id": "db02076e-989a-59cd-98d5-e24e15a0bd27",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle.txt",
- "version": "v0",
- "size_in_bytes": 97804,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.380069Z",
- "updated_at": "2024-10-03T22:45:59.380071Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "uber_2021.pdf",
- "version": "v0",
- "size_in_bytes": 2507312,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.349783Z",
- "updated_at": "2024-10-03T22:45:59.349785Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "png",
- "metadata": {
- "version": "v0"
- },
- "title": "screen_shot.png",
- "version": "v0",
- "size_in_bytes": 1055688,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.346612Z",
- "updated_at": "2024-10-03T22:45:59.346615Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "52e12576-090f-59db-91f4-6d4b2e29ae6c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample.mp3",
- "version": "v0",
- "size_in_bytes": 162228,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.340378Z",
- "updated_at": "2024-10-03T22:45:59.340381Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_3.html",
- "version": "v0",
- "size_in_bytes": 166556,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.317633Z",
- "updated_at": "2024-10-03T22:45:59.317635Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_4.html",
- "version": "v0",
- "size_in_bytes": 157484,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.303531Z",
- "updated_at": "2024-10-03T22:45:59.303532Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "yc_companies.txt",
- "version": "v0",
- "size_in_bytes": 62948,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.303007Z",
- "updated_at": "2024-10-03T22:45:59.303009Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "mp3",
- "metadata": {
- "version": "v0"
- },
- "title": "sample2.mp3",
- "version": "v0",
- "size_in_bytes": 96608,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.301022Z",
- "updated_at": "2024-10-03T22:45:59.301027Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_2.html",
- "version": "v0",
- "size_in_bytes": 166816,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.298646Z",
- "updated_at": "2024-10-03T22:45:59.298647Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "graphrag.pdf",
- "version": "v0",
- "size_in_bytes": 2287544,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.296971Z",
- "updated_at": "2024-10-03T22:45:59.296972Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "aristotle_v2.txt",
- "version": "v0",
- "size_in_bytes": 3380,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.296497Z",
- "updated_at": "2024-10-03T22:45:59.296499Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "pdf",
- "metadata": {
- "version": "v0"
- },
- "title": "lyft_2021.pdf",
- "version": "v0",
- "size_in_bytes": 1920404,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.294005Z",
- "updated_at": "2024-10-03T22:45:59.294006Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "html",
- "metadata": {
- "version": "v0"
- },
- "title": "pg_essay_5.html",
- "version": "v0",
- "size_in_bytes": 165040,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.292811Z",
- "updated_at": "2024-10-03T22:45:59.292813Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "got.txt",
- "version": "v0",
- "size_in_bytes": 12656,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T22:45:59.289998Z",
- "updated_at": "2024-10-03T22:45:59.290001Z",
- "ingestion_attempt_number": null
- },
- {
- "id": "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "type": "txt",
- "metadata": {
- "version": "v0"
- },
- "title": "test.txt",
- "version": "v0",
- "size_in_bytes": 28,
- "ingestion_status": "success",
- "kg_extraction_status": "pending",
- "created_at": "2024-10-03T21:49:17.961611Z",
- "updated_at": "2024-10-03T22:45:59.376115Z",
- "ingestion_attempt_number": null
- }
- ],
- "total_entries": 15
- },
- "rerun_document_chunks_test": {
- "results": "{\"detail\":{\"message\":\"No chunks found for the given document ID.\",\"error_type\":\"R2RException\"}}"
- }
-}
diff --git a/py/tests/regression/observed_outputs/test_group_management.json b/py/tests/regression/observed_outputs/test_group_management.json
deleted file mode 100644
index 195daa6fc..000000000
--- a/py/tests/regression/observed_outputs/test_group_management.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "create_groups": {
- "error": "'R2RClient' object has no attribute 'create_group'"
- },
- "add_users_to_groups": {
- "error": "'R2RClient' object has no attribute 'add_user_to_group'"
- },
- "group_based_document_access": {
- "error": "'TestGroupManagement' object has no attribute 'group_id_1'"
- },
- "admin_ingest_documents": {
- "error": "'TestGroupManagement' object has no attribute 'group_id_1'"
- },
- "user_ingest_and_search": {
- "error": "'TestGroupManagement' object has no attribute 'group_id_1'"
- },
- "cleanup": {
- "error": "'R2RClient' object has no attribute 'delete_group'"
- }
-}
diff --git a/py/tests/regression/observed_outputs/test_observability.json b/py/tests/regression/observed_outputs/test_observability.json
deleted file mode 100644
index 195861ff8..000000000
--- a/py/tests/regression/observed_outputs/test_observability.json
+++ /dev/null
@@ -1,180 +0,0 @@
-{
- "users_overview": {
- "results": [
- {
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "email": "admin@example.com",
- "is_superuser": true,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-10-03T16:58:58.104181Z",
- "updated_at": "2024-10-03T16:58:58.104181Z",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "num_files": 15,
- "total_size_in_bytes": 8862496,
- "document_ids": [
- "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "2f576170-c4f9-5141-a910-a0924f341de4",
- "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "d421207a-d799-5806-8d67-46b2005b15d4",
- "db02076e-989a-59cd-98d5-e24e15a0bd27",
- "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "52e12576-090f-59db-91f4-6d4b2e29ae6c"
- ]
- },
- {
- "user_id": "45c36a22-6c43-56a7-b319-9abaae3f1bc0",
- "email": "test_6b09cf57@example.com",
- "is_superuser": false,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-10-03T23:00:58.216612Z",
- "updated_at": "2024-10-03T23:00:59.019061Z",
- "collection_ids": [
- "20efadf0-b06f-5685-a969-ba2800d8fee4"
- ],
- "num_files": 0,
- "total_size_in_bytes": 0,
- "document_ids": []
- }
- ],
- "total_entries": 2
- },
- "logs": {
- "results": [
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "run_type": "MANAGEMENT",
- "entries": [
- {
- "key": "search_latency",
- "value": "0.25",
- "timestamp": "2024-10-03 22:54:00"
- },
- {
- "key": "search_latency",
- "value": "0.28",
- "timestamp": "2024-10-03 22:54:21"
- },
- {
- "key": "completion_record",
- "value": "{\"message_id\": \"10400ac0-cfdc-5bf4-a3db-a18a1fa0cca5\", \"message_type\": \"assistant\", \"timestamp\": \"2024-10-03T22:44:59.813045\", \"feedback\": null, \"score\": null, \"completion_start_time\": \"2024-10-03T22:54:21.484798\", \"completion_end_time\": \"2024-10-03T22:54:22.505957\", \"search_query\": \"What was Uber's profit in 2020?\", \"search_results\": {\"vector_search_results\": [{\"chunk_id\": \"328e5142-bd6c-5553-b5a0-8fdbd72ee6c6\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.7446624344989735, \"text\": \"Revenue was $17.5 billion, or up 57% year-over-year, reflecting the overall growth in our Delivery business and an increase in Freight revenue attributable to the acquisition of Transplace in the fourth quarter of 2021 as well as growth in the number of shippers and carriers on the network combined with an increase in volumes with our top shippers.\\n\\nNet loss attributable to Uber Technologies, Inc. was $496 million, a 93% improvement year-over-year, driven by a $1.6 billion pre-tax gain on the sale of our ATG Business to Aurora, a $1.6 billion pre-tax net benefit relating to Uber\\u2019s equity investments, as well as reductions in our fixed cost structure and increased variable cost efficiencies. Net loss attributable to Uber Technologies, Inc. also included $1.2 billion of stock-based compensation expense.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 445, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 53, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"a0b5c2f6-7dcd-5865-b2c6-0b3cd2189e57\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.7071749146476451, \"text\": \"Total costs and expenses Loss from operations\\n\\nInterest expense Other income (expense), net Loss before income taxes and loss from equity method investments Provision for (benefit from) income taxes Loss from equity method investments Net loss including non-controlling interests\\n\\nLess: net loss attributable to non-controlling interests, net of tax\\n\\n100 %\\n\\n46 % 16 % 32 % 20 % 24 % 5 % 144 % (44)% (4)% (15)% (62)% (2)% \\u2014 % (61)% \\u2014 % (61)%\\n\\n100 %\\n\\n54 % 11 % 27 % 12 % 13 % 5 % 122 % (22)% (3)% 19 % (6)% (3)% \\u2014 % (3)% \\u2014 % (3)%\\n\\nNet loss attributable to Uber Technologies, Inc.\\n\\n(1)\\n\\nTotals of percentage of revenues may not foot due to rounding.\\n\\nComparison of the Years Ended December 31, 2020 and 2021\\n\\nRevenue\\n\\nYear Ended December 31,\\n\\n(In millions, except percentages)\\n\\n2020\\n\\n2021\\n\\n2020 to 2021 % Change\\n\\nRevenue\\n\\n$\\n\\n11,139 $\\n\\n17,455\\n\\n57 %\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 463, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 57, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"500bf649-b2a8-521b-bdb2-78cdc342531f\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": 
[\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6810148751433258, \"text\": \"Year Ended December 31, 2020\\n\\nRevenue Costs and expenses Cost of revenue, exclusive of depreciation and amortization shown separately below Operations and support Sales and marketing Research and development General and administrative Depreciation and amortization\\n\\nTotal costs and expenses Loss from operations\\n\\nInterest expense Other income (expense), net Loss before income taxes and loss from equity method investments Provision for (benefit from) income taxes Loss from equity method investments Net loss including non-controlling interests\\n\\nLess: net loss attributable to non-controlling interests, net of tax\\n\\n$\\n\\n13,000 $\\n\\n6,061 2,302 4,626 4,836 3,299 472 21,596 (8,596) (559) 722 (8,433) 45 (34) (8,512) (6) (8,506) $\\n\\n11,139 $\\n\\n5,154 1,819 3,583 2,205 2,666 575 16,002 (4,863) (458) (1,625) (6,946) (192) (34) (6,788) (20) (6,768) $\\n\\nNet loss attributable to Uber Technologies, Inc. Net loss per share attributable to Uber Technologies, Inc. common stockholders:\\n\\n$\\n\\nBasic\\n\\n$\\n\\n(6.81) $\\n\\n(3.86) $\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 574, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 77, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"90b1f17b-a97f-5552-9951-fbc6df634039\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6806196963602422, \"text\": \"Less: net loss attributable to non-controlling interests, net of tax\\n\\n$\\n\\n11,139 $\\n\\n5,154 1,819 3,583 2,205 2,666 575 16,002 (4,863) (458) (1,625) (6,946) (192) (34) (6,788) (20) (6,768) $\\n\\n17,455\\n\\n9,351 1,877 4,789 2,054 2,316 902 21,289 (3,834) (483) 3,292 (1,025) (492) (37) (570) (74) (496)\\n\\nNet loss attributable to Uber Technologies, Inc.\\n\\n$\\n\\n54\\n\\nThe following table sets forth the components of our consolidated statements of operations for each of the periods presented as a percentage of revenue\\n\\n(1)\\n\\n:\\n\\nYear Ended December 31, 2021 2020\\n\\nRevenue Costs and expenses Cost of revenue, exclusive of depreciation and amortization shown separately below Operations and support Sales and marketing Research and development General and administrative Depreciation and amortization\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 462, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 56, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"845a2b04-70ee-5a70-91fa-44016677fd92\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6537216512130718, \"text\": \"The Uber Service activities are performed to satisfy our sole performance obligation in the transaction, which is to connect Drivers and Merchants with end-\\n\\nusers to facilitate the completion of a successful transaction.\\n\\nIn 2020, we began charging Mobility end-users a fee to use the platform in certain markets. 
In these transactions, in addition to a performance obligation to Drivers, we also have a performance obligation to end-users, which is to connect end-users to Drivers in the marketplace. We recognize revenue when a trip is complete. We present revenue on a net basis for these transactions, as we do not control the service provided by Drivers to end-users. For the years ended December 31, 2020 and 2021, we recognized total revenue of $323 million and $336 million, respectively, associated with these fees charged to end-users.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 642, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 90, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"1739d713-3fb6-534f-8ddb-7ff9cd6484c7\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.638846836158823, \"text\": \"Other income (expense), net\\n\\n$\\n\\nDuring the year ended December 31, 2020, gain on business divestitures, net represented a $154 million gain on the sale of our Uber Eats India operations to Zomato recognized in the first quarter of 2020 and a $77 million gain on the sale of our European Freight Business to sennder GmbH (\\u201cSennder\\u201d) recognized in the fourth quarter of 2020, partially offset by a $27 million loss on the sale of our JUMP operations to Lime recognized in the second quarter of 2020.\\n\\n(1)\\n\\nDuring the year ended December 31, 2021, gain on business divestitures, net represented a $1.6 billion gain on the sale of our ATG Business to Aurora\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 799, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 118, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"70e9089c-56e0-52f7-80ea-ad66fe1f9a79\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6322252592771936, \"text\": \"2019\\n\\n100.0 %\\n\\n60.2 17.6 41.6 22.5 32.8 174.7 (74.7) \\u2014 2.8 (71.9) 0.1 (72.0)%\\n\\n2019 to 2020 % Change\\n\\n(35) %\\n\\nsecond quarter of 2021. These increases were offset by investments in driver supply by increasing driver incentives recorded as a reduction to revenue by $942.9 million in 2021 as compared to the prior year as rider demand outpaced driver supply during certain periods of the pandemic recovery in 2021. Revenue in 2020 was also higher in the first quarter of 2020 prior to the implementation of shelter-in-place orders and other travel restrictions across North America beginning March 2020.\\n\\nWe expect to see continued recovery in demand for our platform and the resulting positive impacts on revenue as there are more widespread immunity levels, more communities reopen and other restrictive travel and social distancing measures in response to COVID-19 are eased. 
However, we cannot predict the impact of COVID variants and the longer term impact of the pandemic on consumer behavior.\\n\\nCost of Revenue\\n\\n2021\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 493, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 63, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"5425859b-cbfa-54e4-9729-5f92c6f61efc\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6301008528290666, \"text\": \"For additional discussion, see the risk factor titled \\u201c\\u2014If we are unable to attract or maintain a critical mass of Drivers, consumers, merchants, shippers, and carriers, whether as a result of competition or other factors, our platform will become less appealing to platform users, and our financial results would be adversely impacted.\\u201d included in Part I, Item 1A of this Annual Report on Form 10-K as well our 2021 ESG Report and our 2021 People and Culture Report. The information in these reports is not a part of this Form 10-K.\\n\\nAdditional Information\\n\\nWe were founded in 2009 and incorporated as Ubercab, Inc., a Delaware corporation, in July 2010. In February 2011, we changed our name to Uber\\n\\nTechnologies, Inc. Our principal executive offices are located at 1515 3rd Street, San Francisco, California 94158, and our telephone number is (415) 612-8582.\\n\\n10\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 77, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 12, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"9dae5d7c-4bcd-52f0-bdfc-a9e327c56069\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6285498210400674, \"text\": \"Uber Technologies, Inc. (\\u201cUber,\\u201d \\u201cwe,\\u201d \\u201cour,\\u201d or \\u201cus\\u201d) was incorporated in Delaware in July 2010, and is headquartered in San Francisco, California. Uber is a technology platform that uses a massive network, leading technology, operational excellence and product expertise to power movement from point A to point B. Uber develops and operates proprietary technology applications supporting a variety of offerings on its platform (\\u201cplatform(s)\\u201d or \\u201cPlatform(s)\\u201d). Uber connects consumers (\\u201cRider(s)\\u201d) with independent providers of ride services (\\u201cMobility Driver(s)\\u201d) for ridesharing services, and connects Riders and other consumers (\\u201cEaters\\u201d) with restaurants, grocers and other stores (collectively, \\u201cMerchants\\u201d) with delivery service providers (\\u201cCouriers\\u201d) for meal preparation, grocery and other delivery services. 
Riders and Eaters are collectively referred to as \\u201cend-user(s)\\u201d or \\u201cconsumer(s).\\u201d Mobility Drivers and Couriers are collectively referred to as \\u201cDriver(s).\\u201d Uber also connects consumers with public\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 592, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 84, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}, {\"chunk_id\": \"9bba73a7-4ebf-51f2-8a55-553a93d2ac41\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.628432135926722, \"text\": \"Year Ended December 31, 2020\\n\\n1,000 49 1,189 (27) \\u2014 (138) (34) 8,939 (4) 3,824\\n\\n247 125 2,628 (527) (891) (224) 38 1,379 (92) (4,327)\\n\\n8,209 34 12,067 $\\n\\n12,067 (349) 7,391 $\\n\\n332 $ 133\\n\\n412 $ 82\\n\\n14,224 4,229 \\u2014 251 9 \\u2014 \\u2014\\n\\n\\u2014 \\u2014 \\u2014 196 3,898 171 1,634\\n\\n2021\\n\\n675 107 1,484 (27) (307) (226) 101 1,780 (69) 65\\n\\n7,391 349 7,805\\n\\n449 87\\n\\n\\u2014 \\u2014 232 184 1,868 1,018 \\u2014\\n\\nUBER TECHNOLOGIES, INC.\\n\\nNOTES TO CONSOLIDATED FINANCIAL STATEMENTS\\n\\nNote 1 \\u2013 Description of Business and Summary of Significant Accounting Policies\\n\\nDescription of Business\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 591, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 83, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Uber's profit in 2020?\"}}], \"graph_search_results\": null}, \"llm_response\": {\"id\": \"chatcmpl-AEP09aw4hSfgVVU9Rl7pJxBUUidjO\", \"choices\": [{\"finish_reason\": \"stop\", \"index\": 0, \"logprobs\": null, \"message\": {\"content\": \"Uber's profit in 2020 was not a profit but a net loss. The net loss attributable to Uber Technologies, Inc. for the year ended December 31, 2020, was $6,768 million [3].\", \"refusal\": null, \"role\": \"assistant\", \"function_call\": null, \"tool_calls\": null}}], \"created\": 1727996061, \"model\": \"gpt-4o-2024-08-06\", \"object\": \"chat.completion\", \"service_tier\": null, \"system_fingerprint\": \"fp_e5e4913e83\", \"usage\": {\"completion_tokens\": 47, \"prompt_tokens\": 2320, \"total_tokens\": 2367, \"completion_tokens_details\": {\"audio_tokens\": null, \"reasoning_tokens\": 0}, \"prompt_tokens_details\": {\"audio_tokens\": null, \"cached_tokens\": 2176}}}}",
- "timestamp": "2024-10-03 22:54:22"
- },
- {
- "key": "completion_record",
- "value": "{\"message_id\": \"3b3d27e8-f949-52e2-85d0-00ac4709d44d\", \"message_type\": \"assistant\", \"timestamp\": \"2024-10-03T22:44:59.813045\", \"feedback\": null, \"score\": null, \"completion_start_time\": \"2024-10-03T22:54:22.800521\", \"completion_end_time\": \"2024-10-03T22:54:23.828972\", \"search_query\": \"Who is John Snow?\", \"search_results\": {\"vector_search_results\": [{\"chunk_id\": \"c08344bb-1740-5330-a6e1-00b558a0008c\", \"document_id\": \"e797da22-8c5d-54e5-bed5-a55954cf6bf9\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.20639122052297343, \"text\": \"\\n\\nAn NFT That Saves Lives\\n\\nMay 2021Noora Health, a nonprofit I've\\nsupported for years, just launched\\na new NFT. It has a dramatic name, Save Thousands of Lives,\\nbecause that's what the proceeds will do.Noora has been saving lives for 7 years. They run programs in\\nhospitals in South Asia to teach new mothers how to take care of\\ntheir babies once they get home. They're in 165 hospitals now. And\\nbecause they know the numbers before and after they start at a new\\nhospital, they can measure the impact they have. It is massive.\\nFor every 1000 live births, they save 9 babies.This number comes from a study\\nof 133,733 families at 28 different\\nhospitals that Noora conducted in collaboration with the Better\\nBirth team at Ariadne Labs, a joint center for health systems\\ninnovation at Brigham and Women\\u2019s Hospital and Harvard T.H. Chan\\nSchool of Public Health.Noora is so effective that even if you measure their costs in the\\nmost conservative way, by dividing their entire budget by the number\\nof lives saved, the cost of saving a life is the lowest I've seen.\\n$1,235.For this NFT, they're going to issue a public report tracking how\\nthis specific tranche of money is spent, and estimating the number\\nof lives saved as a result.NFTs are a new territory, and this way of using them is especially\\nnew, but I'm excited about its potential. And I'm excited to see\\nwhat happens with this particular auction, because unlike an NFT\\nrepresenting something that has already happened,\\nthis NFT gets better as the price gets higher.The reserve price was about $2.5 million, because that's what it\\ntakes for the name to be accurate: that's what it costs to save\\n2000 lives. But the higher the price of this NFT goes, the more\\nlives will be saved. What a sentence to be able to write.\\n\\n\\n \\n\\n\\n\\n \\n\\n\", \"metadata\": {\"version\": \"v0\", \"chunk_id\": 0, \"chunk_order\": 0, \"document_type\": \"html\", \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"996675d0-381f-5b26-b4db-5dcc72babdc2\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.17490996867954944, \"text\": \"Shared and Shared Saver Rides enables unrelated parties traveling along similar routes to benefit from a discounted fare at the cost of possibly longer travel times. With a Shared or Shared Saver Ride, when the first rider requests a ride, our algorithms use the first rider\\u2019s destination and attempt to match them with other riders traveling along a similar route. If a match between riders is made, our algorithms re-route the driver to include the pick-up location of the matched rider on the active route. 
For Shared and Shared Saver Rides, drivers earn a fixed amount based on a number of factors, including the time and distance of the ride, the base fare charged to riders and the level of rider demand. We determine the rider fare based on the predicted time and distance of the ride, the level of rider demand and the likelihood of being able to match additional riders along the given route, and such fare is quoted to the riders prior to their commitment to the ride. The fare charged to the riders is decoupled\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 276, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 36, \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"2ff890d6-cb3f-5c17-88c0-5194b98ba56e\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.16959259872524757, \"text\": \"s, drivers, and the communities they serve.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 77, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 13, \"partitioned_by_unstructured\": true, \"unstructured_is_continuation\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"b5e169c0-9779-5e30-a644-7bdf8308d8a5\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.16769101216250615, \"text\": \"Our Proprietary Data-Driven Technology Platform\\n\\nOur robust technology platform powers the millions of rides and connections that we facilitate every day and provides insights that drive our platform in real-time. We leverage historical data to continuously improve experiences for drivers and riders on our platform. Our platform analyzes large datasets covering the ride lifecycle, from when drivers go online and riders request rides, to when they match, which route to take and any feedback given after the rides. Utilizing machine learning capabilities to predict future behavior based on many years of historical data and use cases, we employ various levers to balance supply and demand in the marketplace, creating increased driver earnings while maintaining strong service levels for riders. We also leverage our data science and algorithms to inform our product development.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 42, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 8, \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"2d5c5f3b-571b-5a4a-a8ce-e07922823f78\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.16550283637625984, \"text\": \"Several Swiss administrative bodies have issued decisions in which they classify Drivers as employees of Uber Switzerland, Rasier Operations B.V. or of Uber B.V. for social security or regulatory purposes. We are challenging each of them before the Social Security and Administrative Tribunals. 
In April 2021, a ruling was made that Uber Switzerland could not be held liable for social security contributions. The litigations with regards to Uber B.V. and Raiser Operations B.V. are still pending for years 2014 to 2019. In January 2022, the Social Security Tribunal of Zurich reclassified drivers who have used the App in 2014 as dependent workers of Uber BV and Rasier Operations BV from a social security standpoint, but this ruling has been appealed before the Federal Tribunal and has no impact on our current operations. The ultimate resolution of the social security matters for the other two entities is uncertain and the amount accrued for this matter is recorded within accrued and other current liabilities on the\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 855, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 130, \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"d0449e9c-80cb-5873-bb89-ada360f473cf\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.15934575684717212, \"text\": \"Universal Vaccine Access Campaign - mobilizes a coalition of partners to provide rides to and from COVID-19 vaccination sites for low-income, underinsured and at-risk communities;\\n\\nDisaster Response - provides rides to access vital services both leading up to and in the wake of disasters and other local emergencies when roads are safe to do so; and\\n\\nVoting Access - provides rides to the polls during Federal elections, with a focus on supporting individuals who traditionally face barriers to voting, such as seniors, veterans and communities of color.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 80, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 13, \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"123a19db-e2ed-5112-9fbd-19afb707ffcb\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.1561906933784496, \"text\": \"COVID-19\\n\\nIn March 2020, the World Health Organization declared the outbreak of coronavirus (\\u201cCOVID-19\\u201d) a pandemic. The COVID-19 pandemic has rapidly changed market and economic conditions globally, impacting Drivers, Merchants, consumers and business partners, as well as our business, results of operations, financial position, and cash flows. Various governmental restrictions, including the declaration of a federal National Emergency, multiple cities\\u2019 and states\\u2019 declarations of states of emergency, school and business closings, quarantines, restrictions on travel, limitations on social or public gatherings, and other measures have, and may continue to have, an adverse impact on our business and operations, including, for example, by reducing the global demand for Mobility rides. 
Furthermore, we are experiencing and expect to continue to experience Driver supply constraints, and such supply constraints have been and may continue to be impacted by concerns regarding the COVID-19 pandemic.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 426, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 51, \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"00983240-785a-5f53-ba0c-2f848e6f29bd\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.1539415625368621, \"text\": \"nsumers with public transportation networks. Uber uses this same network, technology, operational excellence and product expertise to connect shippers with carriers in the freight industry. Uber is also developing technologies that will provide new solutions to solve everyday problems.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 593, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 84, \"partitioned_by_unstructured\": true, \"unstructured_is_continuation\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"37b90a06-bb7d-5146-b667-18f63610ad8c\", \"document_id\": \"d421207a-d799-5806-8d67-46b2005b15d4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.15079258666171103, \"text\": \"https://www.ycombinator.com/companies/watsi\\n\\nhttps://www.ycombinator.com/companies/movley\\n\\nhttps://www.ycombinator.com/companies/heypurple\\n\\nhttps://www.ycombinator.com/companies/pointhound\\n\\nhttps://www.ycombinator.com/companies/reworkd\\n\\nhttps://www.ycombinator.com/companies/shoobs\\n\\nhttps://www.ycombinator.com/companies/strada\\n\\nhttps://www.ycombinator.com/companies/sweep\\n\\nhttps://www.ycombinator.com/companies/terminal\\n\\nhttps://www.ycombinator.com/companies/sante\\n\\nhttps://www.ycombinator.com/companies/sprx\\n\\nhttps://www.ycombinator.com/companies/sails-co\\n\\nhttps://www.ycombinator.com/companies/dyspatch\\n\\nhttps://www.ycombinator.com/companies/orbio-earth\\n\\nhttps://www.ycombinator.com/companies/epsilon\\n\\nhttps://www.ycombinator.com/companies/new-story\\n\\nhttps://www.ycombinator.com/companies/hatchet-2\\n\\nhttps://www.ycombinator.com/companies/epsilla\\n\\nhttps://www.ycombinator.com/companies/resend\\n\\nhttps://www.ycombinator.com/companies/teamnote\\n\\nhttps://www.ycombinator.com/companies/thread-2\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 19, \"document_type\": \"txt\", \"unstructured_filetype\": \"text/plain\", \"unstructured_languages\": [\"eng\"], \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}, {\"chunk_id\": \"c775cf38-8737-59e8-96fc-4e403041eade\", \"document_id\": \"3e157b3a-8469-51db-90d9-52e7d896b49b\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.1487142056993126, \"text\": \"COVID-19 Response Initiatives\\n\\nWe continue to prioritize the health and safety of our consumers, Drivers and Merchants, our employees and the communities we serve and continue to believe we will play an important role in the economic recovery of cities 
around the globe. We are focused on navigating the challenges presented by COVID-19 through preserving our liquidity and managing our cash flow by taking preemptive action to enhance our ability to meet our short-term liquidity needs. The pandemic has reduced the demand for our Mobility offering globally, while accelerating the growth of our Delivery offerings. We have responded to the COVID-19 pandemic by launching new, or expanding existing, services or features on an expedited basis, particularly those related to delivery of food and other goods.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 427, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 51, \"partitioned_by_unstructured\": true, \"associated_query\": \"Who is John Snow?\"}}], \"graph_search_results\": null}, \"llm_response\": {\"id\": \"chatcmpl-AEP0BjYF7baSJqZHlIb8v7SK3o0hs\", \"choices\": [{\"finish_reason\": \"stop\", \"index\": 0, \"logprobs\": null, \"message\": {\"content\": \"The provided context does not contain any information about John Snow. Therefore, I am unable to provide an answer based on the given context.\", \"refusal\": null, \"role\": \"assistant\", \"function_call\": null, \"tool_calls\": null}}], \"created\": 1727996063, \"model\": \"gpt-4o-2024-08-06\", \"object\": \"chat.completion\", \"service_tier\": null, \"system_fingerprint\": \"fp_e5e4913e83\", \"usage\": {\"completion_tokens\": 27, \"prompt_tokens\": 1904, \"total_tokens\": 1931, \"completion_tokens_details\": {\"audio_tokens\": null, \"reasoning_tokens\": 0}, \"prompt_tokens_details\": {\"audio_tokens\": null, \"cached_tokens\": 1664}}}}",
- "timestamp": "2024-10-03 22:54:23"
- },
- {
- "key": "search_latency",
- "value": "0.38",
- "timestamp": "2024-10-03 22:56:17"
- },
- {
- "key": "search_latency",
- "value": "0.32",
- "timestamp": "2024-10-03 22:58:09"
- },
- {
- "key": "search_latency",
- "value": "0.24",
- "timestamp": "2024-10-03 22:59:09"
- },
- {
- "key": "search_latency",
- "value": "0.27",
- "timestamp": "2024-10-03 22:59:34"
- },
- {
- "key": "search_latency",
- "value": "0.19",
- "timestamp": "2024-10-03 23:00:08"
- },
- {
- "key": "search_latency",
- "value": "0.20",
- "timestamp": "2024-10-03 23:00:17"
- }
- ],
- "timestamp": "2024-10-03T23:01:50",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220"
- }
- ]
- },
- "analytics": {
- "results": {
- "analytics_data": {
- "search_latencies": {
- "Mean": 0.266,
- "Median": 0.26,
- "Mode": null,
- "Standard Deviation": 0.062,
- "Variance": 0.004
- }
- },
- "filtered_logs": {
- "search_latencies": [
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.20",
- "timestamp": "2024-10-03 23:00:17"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.19",
- "timestamp": "2024-10-03 23:00:08"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.27",
- "timestamp": "2024-10-03 22:59:34"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.24",
- "timestamp": "2024-10-03 22:59:09"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.32",
- "timestamp": "2024-10-03 22:58:09"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.38",
- "timestamp": "2024-10-03 22:56:17"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.28",
- "timestamp": "2024-10-03 22:54:21"
- },
- {
- "run_id": "f90976fd-f4c2-5fb8-85f5-60339936bd32",
- "key": "search_latency",
- "value": "0.25",
- "timestamp": "2024-10-03 22:54:00"
- }
- ]
- }
- }
- }
-}
diff --git a/py/tests/regression/observed_outputs/test_retrieval.json b/py/tests/regression/observed_outputs/test_retrieval.json
deleted file mode 100644
index 60f097cbc..000000000
--- a/py/tests/regression/observed_outputs/test_retrieval.json
+++ /dev/null
@@ -1,767 +0,0 @@
-{
- "search": {
- "results": {
- "vector_search_results": [
- {
- "chunk_id": "0484dba9-2b51-5012-9aad-e6efe7e6688f",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.24205637072844854,
- "text": "Finance Leases\n\n2022 2023 2024 2025 2026 Thereafter Total undiscounted lease payments Less: imputed interest\n\n$\n\n280 $ 312 264 214 198 2,067 3,335 (1,506) 1,829 $\n\n140 60 34 9 \u2014 1 244 (10) 234\n\nTotal lease liabilities\n\n$\n\nAs of December 31, 2021, we had additional operating leases and finance leases, primarily for corporate offices and servers, that have not yet commenced of $421 million and $19 million, respectively. These operating and finance leases will commence between fiscal year 2022 and fiscal year 2023 with lease terms of 2 years to 13 years.\n\nMission Bay 1 & 2\n\nIn 2015, we entered into a joint venture (\u201cJV\u201d) agreement with a real estate developer (\u201cJV Partner\u201d) to develop land (\u201cthe Land\u201d) in San Francisco to construct our new headquarters (the \u201cHeadquarters\u201d). The Headquarters consists of two adjacent office buildings totaling approximately 423,000 rentable square feet. In connection with the JV arrangement, we acquired a 49% interest in the JV, the principal asset of which was the Land.",
- "metadata": {
- "version": "v0",
- "chunk_order": 759,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 109,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "afa9d545-b0fb-57d2-aa8e-47b874b5671e",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.21658311983447587,
- "text": "(c)\n\nCommuting and Corporate Housing Expenses. During your Employment, the Company shall cover the cost of your reasonable and substantiated expenses for travel between your primary residence and the Company\u2019s headquarters in San Francisco and corporate housing in the San Francisco Bay Area, up to a pre-tax maximum of $200,000 per year in the aggregate. All expense reimbursements shall be made in accordance with the Company\u2019s expense reimbursement policy.\n\n(d)",
- "metadata": {
- "version": "v0",
- "chunk_order": 971,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 170,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "57a92100-4201-5909-8794-229f3f111cf9",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.19862812165534216,
- "text": "(b)\n\nSolely for purposes of Article II and related definitional provisions to the extent used therein, the applicable amount of any\n\ncurrency (other than dollars) for purposes of the Loan Documents shall be such Dollar Equivalent amount as determined by the Administrative Agent and notified to the applicable Issuing Bank and the Borrower in accordance with Section 1.06(a). Amounts denominated in a Permitted Foreign Currency will be converted to dollars for the purposes of calculating the Senior Secured Net Leverage Ratio at the Exchange Rate as of the date of calculation.",
- "metadata": {
- "version": "v0",
- "chunk_order": 1266,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 216,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "0621a428-a8a1-505f-81c0-b7d6daceda9a",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.19844957003298025,
- "text": "45-2647441 (I.R.S. Employer Identification No.)\n\n1515 3rd Street San Francisco, California 94158 (Address of principal executive offices, including zip code) (415) 612-8582 (Registrant\u2019s telephone number, including area code) ____________________________________________\n\nSecurities registered pursuant to Section 12(b) of the Act:\n\nTitle of each class Common Stock, par value $0.00001 per share",
- "metadata": {
- "version": "v0",
- "chunk_order": 1,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 1,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "1163974b-141e-50b7-8d19-d8d3d9143410",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.19687992650986197,
- "text": "Item 2. Properties.\n\nOur corporate headquarters are located in San Francisco, California, and consist of approximately 420,000 square feet under lease agreements through May 31,\n\n2030. We maintain additional offices in multiple locations in the U.S. and internationally in Montreal, Canada, Munich, Germany and Minsk, Belarus.\n\nWe lease all of our facilities and do not own any real property. We believe our facilities are adequate and suitable for our current needs and that, should it be\n\nneeded, suitable additional or alternative space will be available to accommodate our operations.\n\n53\n\nItem 3. Legal Proceedings.\n\nSee discussion under the heading Legal Proceedings in Note 9 to the consolidated financial statements included in Part II, Item 8 of this report.\n\nItem 4. Mine Safety Disclosures.\n\nNot applicable.\n\n54\n\nPART II\n\nItem 5. Market for Registrant\u2019s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities.",
- "metadata": {
- "version": "v0",
- "chunk_order": 434,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 53,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "a2660071-661b-5928-ad52-c4106ea95ae9",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.1938294561556151,
- "text": "Interest, net of amount capitalized Income taxes, net of refunds\n\n$\n\nNon-cash investing and financing activities:\n\nConversion of redeemable convertible preferred stock to common stock upon initial public offering Conversion of convertible notes to common stock upon initial public offering Conversion of convertible notes to common stock related to Careem Finance lease obligations Common stock issued in connection with acquisitions Ownership interest received in exchange for divestitures Issuance of Careem Notes including the holdback amount\n\nThe accompanying notes are an integral part of these consolidated financial statements.\n\n81\n\n2019",
- "metadata": {
- "version": "v0",
- "chunk_order": 590,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 83,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "8e93af52-b1cd-5c64-afa4-e3a7fcdf412b",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.19048490339697555,
- "text": "(1) the rate, or methodology for this rate, and conventions for this rate selected or recommendedby the Relevant Governmental Body\n\nfor determining compounded SOFR; provided that:\n\n(2) if, and to the extent that, the Administrative Agent determines that Compounded SOFRcannot\n\nbe determined in accordance with clause (1) above, then the rate, or methodology for this rate, and conventions for this rate that the Administrative Agent determines are substantially consistent with prevailing market convention for determining Compounded SOFR for U.S. dollar-denominated syndicated credit facilities at such time (as a result of amendment or as originally executed);",
- "metadata": {
- "version": "v0",
- "chunk_order": 1102,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 186,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "ccacb15d-aef0-5143-b448-380401c71cd1",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.18895512256160085,
- "text": "\u201cCanadian BA Rate Borrowing\u201d refers to a Borrowing bearing interest at a rate determined by reference to the Canadian BA Rate.\n\n10\n\n\u201cCanadian BA Rate Loan\u201d refers to a Loan bearing interest at a rate determined by reference to the Canadian BA Rate.\n\n\u201cCanadian Dollars\u201d means the lawful currency of Canada.\n\n\u201cCapital Lease Obligations\u201d of any Person means the obligations of such Person to pay rent or other amounts under any lease of (or",
- "metadata": {
- "version": "v0",
- "chunk_order": 1085,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 182,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "6a340f36-ef68-59dd-b8a7-a5f5d6cd6d00",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.18443752434477645,
- "text": "Property and equipment, net\n\n$\n\nWe capitalized $76 million and $55 million in internal-use software costs during the years ended December 31, 2020 and 2021, respectively, which is included in property and equipment, net on the consolidated balance sheets. Amortization of capitalized software development costs was $22 million, $55 million, and $69 million for the years ended December 31, 2019, 2020 and 2021, respectively.\n\nAmounts in construction in progress represent buildings, leasehold improvements, assets under construction, and other assets not placed in service.",
- "metadata": {
- "version": "v0",
- "chunk_order": 754,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 107,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- },
- {
- "chunk_id": "0a7e6a54-3804-5b5b-a9cd-a5b4a4753483",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.18329537710006516,
- "text": "principal payments on finance lease obligations for $35.5 million.\n\nCash provided by financing activities was $512.6 million for the year ended December 31, 2020, which primarily consisted of proceeds from issuance of our 2025\n\nNotes of $734.1 million offset by the purchase of the Capped Calls for $132.7 million.\n\nLiquidity and Capital Resources",
- "metadata": {
- "version": "v0",
- "chunk_order": 531,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 71,
- "partitioned_by_unstructured": true,
- "associated_query": "What is the capital of France?"
- }
- }
- ],
- "graph_search_results": null
- }
- },
- "basic_rag": {
- "results": {
- "completion": {
- "id": "chatcmpl-AEP09aw4hSfgVVU9Rl7pJxBUUidjO",
- "choices": [
- {
- "finish_reason": "stop",
- "index": 0,
- "logprobs": null,
- "message": {
- "content": "Uber's profit in 2020 was not a profit but a net loss. The net loss attributable to Uber Technologies, Inc. for the year ended December 31, 2020, was $6,768 million [3].",
- "refusal": null,
- "role": "assistant",
- "function_call": null,
- "tool_calls": null
- }
- }
- ],
- "created": 1727996061,
- "model": "gpt-4o-2024-08-06",
- "object": "chat.completion",
- "service_tier": null,
- "system_fingerprint": "fp_e5e4913e83",
- "usage": {
- "completion_tokens": 47,
- "prompt_tokens": 2320,
- "total_tokens": 2367,
- "completion_tokens_details": {
- "audio_tokens": null,
- "reasoning_tokens": 0
- },
- "prompt_tokens_details": {
- "audio_tokens": null,
- "cached_tokens": 2176
- }
- }
- },
- "search_results": {
- "vector_search_results": [
- {
- "chunk_id": "328e5142-bd6c-5553-b5a0-8fdbd72ee6c6",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.7446624344989735,
- "text": "Revenue was $17.5 billion, or up 57% year-over-year, reflecting the overall growth in our Delivery business and an increase in Freight revenue attributable to the acquisition of Transplace in the fourth quarter of 2021 as well as growth in the number of shippers and carriers on the network combined with an increase in volumes with our top shippers.\n\nNet loss attributable to Uber Technologies, Inc. was $496 million, a 93% improvement year-over-year, driven by a $1.6 billion pre-tax gain on the sale of our ATG Business to Aurora, a $1.6 billion pre-tax net benefit relating to Uber\u2019s equity investments, as well as reductions in our fixed cost structure and increased variable cost efficiencies. Net loss attributable to Uber Technologies, Inc. also included $1.2 billion of stock-based compensation expense.",
- "metadata": {
- "version": "v0",
- "chunk_order": 445,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 53,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "a0b5c2f6-7dcd-5865-b2c6-0b3cd2189e57",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.7071749146476451,
- "text": "Total costs and expenses Loss from operations\n\nInterest expense Other income (expense), net Loss before income taxes and loss from equity method investments Provision for (benefit from) income taxes Loss from equity method investments Net loss including non-controlling interests\n\nLess: net loss attributable to non-controlling interests, net of tax\n\n100 %\n\n46 % 16 % 32 % 20 % 24 % 5 % 144 % (44)% (4)% (15)% (62)% (2)% \u2014 % (61)% \u2014 % (61)%\n\n100 %\n\n54 % 11 % 27 % 12 % 13 % 5 % 122 % (22)% (3)% 19 % (6)% (3)% \u2014 % (3)% \u2014 % (3)%\n\nNet loss attributable to Uber Technologies, Inc.\n\n(1)\n\nTotals of percentage of revenues may not foot due to rounding.\n\nComparison of the Years Ended December 31, 2020 and 2021\n\nRevenue\n\nYear Ended December 31,\n\n(In millions, except percentages)\n\n2020\n\n2021\n\n2020 to 2021 % Change\n\nRevenue\n\n$\n\n11,139 $\n\n17,455\n\n57 %",
- "metadata": {
- "version": "v0",
- "chunk_order": 463,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 57,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "500bf649-b2a8-521b-bdb2-78cdc342531f",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.6810148751433258,
- "text": "Year Ended December 31, 2020\n\nRevenue Costs and expenses Cost of revenue, exclusive of depreciation and amortization shown separately below Operations and support Sales and marketing Research and development General and administrative Depreciation and amortization\n\nTotal costs and expenses Loss from operations\n\nInterest expense Other income (expense), net Loss before income taxes and loss from equity method investments Provision for (benefit from) income taxes Loss from equity method investments Net loss including non-controlling interests\n\nLess: net loss attributable to non-controlling interests, net of tax\n\n$\n\n13,000 $\n\n6,061 2,302 4,626 4,836 3,299 472 21,596 (8,596) (559) 722 (8,433) 45 (34) (8,512) (6) (8,506) $\n\n11,139 $\n\n5,154 1,819 3,583 2,205 2,666 575 16,002 (4,863) (458) (1,625) (6,946) (192) (34) (6,788) (20) (6,768) $\n\nNet loss attributable to Uber Technologies, Inc. Net loss per share attributable to Uber Technologies, Inc. common stockholders:\n\n$\n\nBasic\n\n$\n\n(6.81) $\n\n(3.86) $",
- "metadata": {
- "version": "v0",
- "chunk_order": 574,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 77,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "90b1f17b-a97f-5552-9951-fbc6df634039",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.6806196963602422,
- "text": "Less: net loss attributable to non-controlling interests, net of tax\n\n$\n\n11,139 $\n\n5,154 1,819 3,583 2,205 2,666 575 16,002 (4,863) (458) (1,625) (6,946) (192) (34) (6,788) (20) (6,768) $\n\n17,455\n\n9,351 1,877 4,789 2,054 2,316 902 21,289 (3,834) (483) 3,292 (1,025) (492) (37) (570) (74) (496)\n\nNet loss attributable to Uber Technologies, Inc.\n\n$\n\n54\n\nThe following table sets forth the components of our consolidated statements of operations for each of the periods presented as a percentage of revenue\n\n(1)\n\n:\n\nYear Ended December 31, 2021 2020\n\nRevenue Costs and expenses Cost of revenue, exclusive of depreciation and amortization shown separately below Operations and support Sales and marketing Research and development General and administrative Depreciation and amortization",
- "metadata": {
- "version": "v0",
- "chunk_order": 462,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 56,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "845a2b04-70ee-5a70-91fa-44016677fd92",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.6537216512130718,
- "text": "The Uber Service activities are performed to satisfy our sole performance obligation in the transaction, which is to connect Drivers and Merchants with end-\n\nusers to facilitate the completion of a successful transaction.\n\nIn 2020, we began charging Mobility end-users a fee to use the platform in certain markets. In these transactions, in addition to a performance obligation to Drivers, we also have a performance obligation to end-users, which is to connect end-users to Drivers in the marketplace. We recognize revenue when a trip is complete. We present revenue on a net basis for these transactions, as we do not control the service provided by Drivers to end-users. For the years ended December 31, 2020 and 2021, we recognized total revenue of $323 million and $336 million, respectively, associated with these fees charged to end-users.",
- "metadata": {
- "version": "v0",
- "chunk_order": 642,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 90,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "1739d713-3fb6-534f-8ddb-7ff9cd6484c7",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.638846836158823,
- "text": "Other income (expense), net\n\n$\n\nDuring the year ended December 31, 2020, gain on business divestitures, net represented a $154 million gain on the sale of our Uber Eats India operations to Zomato recognized in the first quarter of 2020 and a $77 million gain on the sale of our European Freight Business to sennder GmbH (\u201cSennder\u201d) recognized in the fourth quarter of 2020, partially offset by a $27 million loss on the sale of our JUMP operations to Lime recognized in the second quarter of 2020.\n\n(1)\n\nDuring the year ended December 31, 2021, gain on business divestitures, net represented a $1.6 billion gain on the sale of our ATG Business to Aurora",
- "metadata": {
- "version": "v0",
- "chunk_order": 799,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 118,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "70e9089c-56e0-52f7-80ea-ad66fe1f9a79",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.6322252592771936,
- "text": "2019\n\n100.0 %\n\n60.2 17.6 41.6 22.5 32.8 174.7 (74.7) \u2014 2.8 (71.9) 0.1 (72.0)%\n\n2019 to 2020 % Change\n\n(35) %\n\nsecond quarter of 2021. These increases were offset by investments in driver supply by increasing driver incentives recorded as a reduction to revenue by $942.9 million in 2021 as compared to the prior year as rider demand outpaced driver supply during certain periods of the pandemic recovery in 2021. Revenue in 2020 was also higher in the first quarter of 2020 prior to the implementation of shelter-in-place orders and other travel restrictions across North America beginning March 2020.\n\nWe expect to see continued recovery in demand for our platform and the resulting positive impacts on revenue as there are more widespread immunity levels, more communities reopen and other restrictive travel and social distancing measures in response to COVID-19 are eased. However, we cannot predict the impact of COVID variants and the longer term impact of the pandemic on consumer behavior.\n\nCost of Revenue\n\n2021",
- "metadata": {
- "version": "v0",
- "chunk_order": 493,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 63,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "5425859b-cbfa-54e4-9729-5f92c6f61efc",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.6301008528290666,
- "text": "For additional discussion, see the risk factor titled \u201c\u2014If we are unable to attract or maintain a critical mass of Drivers, consumers, merchants, shippers, and carriers, whether as a result of competition or other factors, our platform will become less appealing to platform users, and our financial results would be adversely impacted.\u201d included in Part I, Item 1A of this Annual Report on Form 10-K as well our 2021 ESG Report and our 2021 People and Culture Report. The information in these reports is not a part of this Form 10-K.\n\nAdditional Information\n\nWe were founded in 2009 and incorporated as Ubercab, Inc., a Delaware corporation, in July 2010. In February 2011, we changed our name to Uber\n\nTechnologies, Inc. Our principal executive offices are located at 1515 3rd Street, San Francisco, California 94158, and our telephone number is (415) 612-8582.\n\n10",
- "metadata": {
- "version": "v0",
- "chunk_order": 77,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 12,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "9dae5d7c-4bcd-52f0-bdfc-a9e327c56069",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.6285498210400674,
- "text": "Uber Technologies, Inc. (\u201cUber,\u201d \u201cwe,\u201d \u201cour,\u201d or \u201cus\u201d) was incorporated in Delaware in July 2010, and is headquartered in San Francisco, California. Uber is a technology platform that uses a massive network, leading technology, operational excellence and product expertise to power movement from point A to point B. Uber develops and operates proprietary technology applications supporting a variety of offerings on its platform (\u201cplatform(s)\u201d or \u201cPlatform(s)\u201d). Uber connects consumers (\u201cRider(s)\u201d) with independent providers of ride services (\u201cMobility Driver(s)\u201d) for ridesharing services, and connects Riders and other consumers (\u201cEaters\u201d) with restaurants, grocers and other stores (collectively, \u201cMerchants\u201d) with delivery service providers (\u201cCouriers\u201d) for meal preparation, grocery and other delivery services. Riders and Eaters are collectively referred to as \u201cend-user(s)\u201d or \u201cconsumer(s).\u201d Mobility Drivers and Couriers are collectively referred to as \u201cDriver(s).\u201d Uber also connects consumers with public",
- "metadata": {
- "version": "v0",
- "chunk_order": 592,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 84,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- },
- {
- "chunk_id": "9bba73a7-4ebf-51f2-8a55-553a93d2ac41",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.628432135926722,
- "text": "Year Ended December 31, 2020\n\n1,000 49 1,189 (27) \u2014 (138) (34) 8,939 (4) 3,824\n\n247 125 2,628 (527) (891) (224) 38 1,379 (92) (4,327)\n\n8,209 34 12,067 $\n\n12,067 (349) 7,391 $\n\n332 $ 133\n\n412 $ 82\n\n14,224 4,229 \u2014 251 9 \u2014 \u2014\n\n\u2014 \u2014 \u2014 196 3,898 171 1,634\n\n2021\n\n675 107 1,484 (27) (307) (226) 101 1,780 (69) 65\n\n7,391 349 7,805\n\n449 87\n\n\u2014 \u2014 232 184 1,868 1,018 \u2014\n\nUBER TECHNOLOGIES, INC.\n\nNOTES TO CONSOLIDATED FINANCIAL STATEMENTS\n\nNote 1 \u2013 Description of Business and Summary of Significant Accounting Policies\n\nDescription of Business",
- "metadata": {
- "version": "v0",
- "chunk_order": 591,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 83,
- "partitioned_by_unstructured": true,
- "associated_query": "What was Uber's profit in 2020?"
- }
- }
- ],
- "graph_search_results": null
- }
- }
- },
- "hybrid_rag": {
- "results": {
- "completion": {
- "id": "chatcmpl-AEP0BjYF7baSJqZHlIb8v7SK3o0hs",
- "choices": [
- {
- "finish_reason": "stop",
- "index": 0,
- "logprobs": null,
- "message": {
- "content": "The provided context does not contain any information about Jon Snow. Therefore, I am unable to provide an answer based on the given context.",
- "refusal": null,
- "role": "assistant",
- "function_call": null,
- "tool_calls": null
- }
- }
- ],
- "created": 1727996063,
- "model": "gpt-4o-2024-08-06",
- "object": "chat.completion",
- "service_tier": null,
- "system_fingerprint": "fp_e5e4913e83",
- "usage": {
- "completion_tokens": 27,
- "prompt_tokens": 1904,
- "total_tokens": 1931,
- "completion_tokens_details": {
- "audio_tokens": null,
- "reasoning_tokens": 0
- },
- "prompt_tokens_details": {
- "audio_tokens": null,
- "cached_tokens": 1664
- }
- }
- },
- "search_results": {
- "vector_search_results": [
- {
- "chunk_id": "c08344bb-1740-5330-a6e1-00b558a0008c",
- "document_id": "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.20639122052297343,
- "text": "\n\nAn NFT That Saves Lives\n\nMay 2021Noora Health, a nonprofit I've\nsupported for years, just launched\na new NFT. It has a dramatic name, Save Thousands of Lives,\nbecause that's what the proceeds will do.Noora has been saving lives for 7 years. They run programs in\nhospitals in South Asia to teach new mothers how to take care of\ntheir babies once they get home. They're in 165 hospitals now. And\nbecause they know the numbers before and after they start at a new\nhospital, they can measure the impact they have. It is massive.\nFor every 1000 live births, they save 9 babies.This number comes from a study\nof 133,733 families at 28 different\nhospitals that Noora conducted in collaboration with the Better\nBirth team at Ariadne Labs, a joint center for health systems\ninnovation at Brigham and Women\u2019s Hospital and Harvard T.H. Chan\nSchool of Public Health.Noora is so effective that even if you measure their costs in the\nmost conservative way, by dividing their entire budget by the number\nof lives saved, the cost of saving a life is the lowest I've seen.\n$1,235.For this NFT, they're going to issue a public report tracking how\nthis specific tranche of money is spent, and estimating the number\nof lives saved as a result.NFTs are a new territory, and this way of using them is especially\nnew, but I'm excited about its potential. And I'm excited to see\nwhat happens with this particular auction, because unlike an NFT\nrepresenting something that has already happened,\nthis NFT gets better as the price gets higher.The reserve price was about $2.5 million, because that's what it\ntakes for the name to be accurate: that's what it costs to save\n2000 lives. But the higher the price of this NFT goes, the more\nlives will be saved. What a sentence to be able to write.\n\n\n \n\n\n\n \n\n",
- "metadata": {
- "version": "v0",
- "chunk_id": 0,
- "chunk_order": 0,
- "document_type": "html",
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "996675d0-381f-5b26-b4db-5dcc72babdc2",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.17490996867954944,
- "text": "Shared and Shared Saver Rides enables unrelated parties traveling along similar routes to benefit from a discounted fare at the cost of possibly longer travel times. With a Shared or Shared Saver Ride, when the first rider requests a ride, our algorithms use the first rider\u2019s destination and attempt to match them with other riders traveling along a similar route. If a match between riders is made, our algorithms re-route the driver to include the pick-up location of the matched rider on the active route. For Shared and Shared Saver Rides, drivers earn a fixed amount based on a number of factors, including the time and distance of the ride, the base fare charged to riders and the level of rider demand. We determine the rider fare based on the predicted time and distance of the ride, the level of rider demand and the likelihood of being able to match additional riders along the given route, and such fare is quoted to the riders prior to their commitment to the ride. The fare charged to the riders is decoupled",
- "metadata": {
- "version": "v0",
- "chunk_order": 276,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 36,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "2ff890d6-cb3f-5c17-88c0-5194b98ba56e",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.16959259872524757,
- "text": "s, drivers, and the communities they serve.",
- "metadata": {
- "version": "v0",
- "chunk_order": 77,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 13,
- "partitioned_by_unstructured": true,
- "unstructured_is_continuation": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "b5e169c0-9779-5e30-a644-7bdf8308d8a5",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.16769101216250615,
- "text": "Our Proprietary Data-Driven Technology Platform\n\nOur robust technology platform powers the millions of rides and connections that we facilitate every day and provides insights that drive our platform in real-time. We leverage historical data to continuously improve experiences for drivers and riders on our platform. Our platform analyzes large datasets covering the ride lifecycle, from when drivers go online and riders request rides, to when they match, which route to take and any feedback given after the rides. Utilizing machine learning capabilities to predict future behavior based on many years of historical data and use cases, we employ various levers to balance supply and demand in the marketplace, creating increased driver earnings while maintaining strong service levels for riders. We also leverage our data science and algorithms to inform our product development.",
- "metadata": {
- "version": "v0",
- "chunk_order": 42,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 8,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "2d5c5f3b-571b-5a4a-a8ce-e07922823f78",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.16550283637625984,
- "text": "Several Swiss administrative bodies have issued decisions in which they classify Drivers as employees of Uber Switzerland, Rasier Operations B.V. or of Uber B.V. for social security or regulatory purposes. We are challenging each of them before the Social Security and Administrative Tribunals. In April 2021, a ruling was made that Uber Switzerland could not be held liable for social security contributions. The litigations with regards to Uber B.V. and Raiser Operations B.V. are still pending for years 2014 to 2019. In January 2022, the Social Security Tribunal of Zurich reclassified drivers who have used the App in 2014 as dependent workers of Uber BV and Rasier Operations BV from a social security standpoint, but this ruling has been appealed before the Federal Tribunal and has no impact on our current operations. The ultimate resolution of the social security matters for the other two entities is uncertain and the amount accrued for this matter is recorded within accrued and other current liabilities on the",
- "metadata": {
- "version": "v0",
- "chunk_order": 855,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 130,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "d0449e9c-80cb-5873-bb89-ada360f473cf",
- "document_id": "2f576170-c4f9-5141-a910-a0924f341de4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.15934575684717212,
- "text": "Universal Vaccine Access Campaign - mobilizes a coalition of partners to provide rides to and from COVID-19 vaccination sites for low-income, underinsured and at-risk communities;\n\nDisaster Response - provides rides to access vital services both leading up to and in the wake of disasters and other local emergencies when roads are safe to do so; and\n\nVoting Access - provides rides to the polls during Federal elections, with a focus on supporting individuals who traditionally face barriers to voting, such as seniors, veterans and communities of color.",
- "metadata": {
- "version": "v0",
- "chunk_order": 80,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 13,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "123a19db-e2ed-5112-9fbd-19afb707ffcb",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.1561906933784496,
- "text": "COVID-19\n\nIn March 2020, the World Health Organization declared the outbreak of coronavirus (\u201cCOVID-19\u201d) a pandemic. The COVID-19 pandemic has rapidly changed market and economic conditions globally, impacting Drivers, Merchants, consumers and business partners, as well as our business, results of operations, financial position, and cash flows. Various governmental restrictions, including the declaration of a federal National Emergency, multiple cities\u2019 and states\u2019 declarations of states of emergency, school and business closings, quarantines, restrictions on travel, limitations on social or public gatherings, and other measures have, and may continue to have, an adverse impact on our business and operations, including, for example, by reducing the global demand for Mobility rides. Furthermore, we are experiencing and expect to continue to experience Driver supply constraints, and such supply constraints have been and may continue to be impacted by concerns regarding the COVID-19 pandemic.",
- "metadata": {
- "version": "v0",
- "chunk_order": 426,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 51,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "00983240-785a-5f53-ba0c-2f848e6f29bd",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.1539415625368621,
- "text": "nsumers with public transportation networks. Uber uses this same network, technology, operational excellence and product expertise to connect shippers with carriers in the freight industry. Uber is also developing technologies that will provide new solutions to solve everyday problems.",
- "metadata": {
- "version": "v0",
- "chunk_order": 593,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 84,
- "partitioned_by_unstructured": true,
- "unstructured_is_continuation": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "37b90a06-bb7d-5146-b667-18f63610ad8c",
- "document_id": "d421207a-d799-5806-8d67-46b2005b15d4",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.15079258666171103,
- "text": "https://www.ycombinator.com/companies/watsi\n\nhttps://www.ycombinator.com/companies/movley\n\nhttps://www.ycombinator.com/companies/heypurple\n\nhttps://www.ycombinator.com/companies/pointhound\n\nhttps://www.ycombinator.com/companies/reworkd\n\nhttps://www.ycombinator.com/companies/shoobs\n\nhttps://www.ycombinator.com/companies/strada\n\nhttps://www.ycombinator.com/companies/sweep\n\nhttps://www.ycombinator.com/companies/terminal\n\nhttps://www.ycombinator.com/companies/sante\n\nhttps://www.ycombinator.com/companies/sprx\n\nhttps://www.ycombinator.com/companies/sails-co\n\nhttps://www.ycombinator.com/companies/dyspatch\n\nhttps://www.ycombinator.com/companies/orbio-earth\n\nhttps://www.ycombinator.com/companies/epsilon\n\nhttps://www.ycombinator.com/companies/new-story\n\nhttps://www.ycombinator.com/companies/hatchet-2\n\nhttps://www.ycombinator.com/companies/epsilla\n\nhttps://www.ycombinator.com/companies/resend\n\nhttps://www.ycombinator.com/companies/teamnote\n\nhttps://www.ycombinator.com/companies/thread-2",
- "metadata": {
- "version": "v0",
- "chunk_order": 19,
- "document_type": "txt",
- "unstructured_filetype": "text/plain",
- "unstructured_languages": [
- "eng"
- ],
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- },
- {
- "chunk_id": "c775cf38-8737-59e8-96fc-4e403041eade",
- "document_id": "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "score": 0.1487142056993126,
- "text": "COVID-19 Response Initiatives\n\nWe continue to prioritize the health and safety of our consumers, Drivers and Merchants, our employees and the communities we serve and continue to believe we will play an important role in the economic recovery of cities around the globe. We are focused on navigating the challenges presented by COVID-19 through preserving our liquidity and managing our cash flow by taking preemptive action to enhance our ability to meet our short-term liquidity needs. The pandemic has reduced the demand for our Mobility offering globally, while accelerating the growth of our Delivery offerings. We have responded to the COVID-19 pandemic by launching new, or expanding existing, services or features on an expedited basis, particularly those related to delivery of food and other goods.",
- "metadata": {
- "version": "v0",
- "chunk_order": 427,
- "document_type": "pdf",
- "unstructured_filetype": "application/pdf",
- "unstructured_languages": [
- "eng"
- ],
- "unstructured_page_number": 51,
- "partitioned_by_unstructured": true,
- "associated_query": "Who is Jon Snow?"
- }
- }
- ],
- "graph_search_results": null
- }
- }
- },
- "streaming_rag": {
- "results": {
- "completion": {
- "choices": [
- {
- "message": {
- "content": "[{\"chunk_id\": \"31e7a71c-0f89-5b27-972e-89bb8eb1415a\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6675316889565188, \"text\": \"Total Stockholders\\u2019 Equity (Deficit) 1,676,163\\n\\n5,184\\n\\n\\u2014\\n\\n(26,298)\\n\\n28,637\\n\\n(1) 721,710 (2,038) (1,009,359) 1,393,998\\n\\nLyft, Inc. Consolidated Statements of Cash Flows (in thousands)\\n\\n2021\\n\\nCash flows from operating activities Net loss Adjustments to reconcile net loss to net cash used in operating activities\\n\\n$\\n\\n(1,009,359)\\n\\nDepreciation and amortization Stock-based compensation Amortization of premium on marketable securities Accretion of discount on marketable securities Amortization of debt discount and issuance costs Deferred income tax from convertible senior notes Loss on sale and disposal of assets, net Gain on divestiture Other Changes in operating assets and liabilities, net effects of acquisition\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 572, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 82, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"ab62fbab-c5f8-5b3d-ab2e-2484c77c81fb\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6647442183050991, \"text\": \"79\\n\\n2019 3,615,960\\n\\n2,176,469 636,116 1,505,640 814,122 1,186,093 6,318,440 (2,702,480) \\u2014 102,595 (2,599,885) 2,356 (2,602,241)\\n\\n(11.44)\\n\\n227,498\\n\\n81,321 75,212 971,941 72,046 398,791\\n\\nLyft, Inc. Consolidated Statements of Comprehensive Loss (in thousands)\\n\\nNet loss Other comprehensive income (loss)\\n\\n$\\n\\nYear Ended December 31, 2020 (1,752,857) $\\n\\n2021 (1,009,359) $\\n\\nForeign currency translation adjustment Unrealized gain (loss) on marketable securities, net of taxes\\n\\nOther comprehensive income (loss)\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 567, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 79, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"66efee73-7df1-5786-b56b-3b0a6f9bf390\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6629626355789533, \"text\": \"Overview\\n\\nLyft, Inc (the \\u201cCompany\\u201d or \\u201cLyft\\u201d) started a movement to revolutionize transportation. In 2012, we launched our peer-to-peer marketplace for on-demand ridesharing and have continued to pioneer innovations aligned with our mission. 
Today, Lyft is one of the largest multimodal transportation networks in the United States and Canada.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 16, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 5, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"f043e24f-5d5d-531e-973a-277e65f3b10e\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6602276170118095, \"text\": \"Revenues from Contracts with Customers (ASC 606)\\n\\nWe generate substantially all our revenue from our ridesharing marketplace that connects drivers and riders. We recognize revenue from fees paid by drivers for use of our Lyft Platform offerings in accordance with ASC 606 as described in Note 2 of the notes to our consolidated financial statements. Drivers enter into terms of service (\\u201cToS\\u201d) with us in order to use our Lyft Driver App.\\n\\n58\\n\\n2019 to 2020 % Change\\n\\n19.0% (1.8)% (6.7)% 2.3%\\n\\nWe provide a service to drivers to complete a successful transportation service for riders. This service includes on-demand lead generation that assists drivers to find, receive and fulfill on-demand requests from riders seeking transportation services and related collection activities using our Lyft Platform. As a result, our single performance obligation in the transaction is to connect drivers with riders to facilitate the completion of a successful transportation service for riders.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 459, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 58, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"095cb246-80ec-5c35-96b4-ba902851e0e7\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6476712260834182, \"text\": \"Corporate Information\\n\\nWe were incorporated in 2007 as Bounder Web, Inc., a Delaware corporation. In 2008, we changed our name to Zimride, Inc. We founded Lyft in 2012 and\\n\\nchanged our name to Lyft, Inc. in 2013 when we sold the assets related to our Zimride operations.\\n\\n13\\n\\nAvailable Information\\n\\nOur website is located at www.lyft.com, and our investor relations website is located at investor.lyft.com. Copies of our Annual Report on Form 10-K, Quarterly Reports on Form 10-Q, Current Reports on Form 8-K and amendments to these reports filed or furnished pursuant to Section 13(a) or 15(d) of the Exchange Act, as amended, are available free of charge on our investor relations website as soon as reasonably practicable after we file such material electronically with or furnish it to the Securities and Exchange Commission (the \\u201cSEC\\u201d). 
The SEC also maintains a website that contains our SEC filings at www.sec.gov.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 82, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 13, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"0da7d65c-a0e7-541f-a404-71f32346d988\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6405097674715796, \"text\": \"We generate substantially all of our revenue from our ridesharing marketplace that connects drivers and riders. We collect service fees and commissions from drivers for their use of our ridesharing marketplace. As drivers accept more rider leads and complete more rides, we earn more revenue. We also generate revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by making our ridesharing marketplace available to organizations through our Lyft Business offerings, such as our Concierge and Corporate Business Travel programs. In the second quarter of 2021, we began generating revenues from licensing and data access agreements, primarily with third-party autonomous vehicle companies.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 20, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 5, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"f7cdb289-ca94-5e40-909d-7a01a8a5d378\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6394687509853404, \"text\": \"Revenue Recognition\\n\\nThe Company generates its revenue from its multimodal transportation networks that offer access to a variety of transportation options through the Lyft Platform and mobile-based applications. Substantially all of the Company\\u2019s revenue is generated from its ridesharing marketplace that connects drivers and riders and is recognized in accordance with Accounting Standards Codification Topic 606 (\\u201cASC 606\\u201d). In addition, the Company generates revenue in accordance with ASC 606 from licensing and data access, primarily with third-party autonomous vehicle companies. 
The Company also generates rental revenue from Flexdrive, its network of Light Vehicles and Lyft Rentals, which is recognized in accordance with Accounting Standards Codification Topic 842 (\\u201cASC 842\\u201d).\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 591, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 86, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"14c06942-0f82-5a5f-9936-03919f6dac96\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6308834176729083, \"text\": \"Light Vehicle Rider and Lyft Rentals Renter Incentives\\n\\nIncentives offered to Light Vehicle riders and Lyft Rentals renters were not material for the years ended December 31, 2021 and 2020.\\n\\nFor the years ended December 31, 2021, 2020 and 2019, in relation to the driver, rider, Light Vehicle riders and Lyft Rentals renters incentive programs, the Company recorded $1.3 billion, $390.8 million and $560.3 million as a reduction to revenue and $64.7 million, $135.0 million and $381.5 million as sales and marketing expense, respectively.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 611, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 89, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"e0f5bd80-c3d2-58d6-a310-e04fa1618a5a\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6294196468937618, \"text\": \"Software Development Costs\\n\\nThe Company incurs costs related to developing the Lyft Platform and related support systems. The Company capitalizes development costs related to the Lyft Platform and related support systems once the preliminary project stage is complete and it is probable that the project will be completed and the software will be used to perform the function intended. The Company capitalized $16.2 million and $12.8 million of software development costs during the year ended December 31, 2021 and 2020, respectively. For the year ended December 31, 2019, capitalized software development costs was not material.\\n\\nInsurance Reserves\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 649, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 94, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}, {\"chunk_id\": \"ab302f43-5dcf-5e04-82ee-754565cd1cda\", \"document_id\": \"2f576170-c4f9-5141-a910-a0924f341de4\", \"user_id\": \"2acb499e-8428-543b-bd85-0d9098718220\", \"collection_ids\": [\"122fdf6a-e116-546b-a8f6-e4cb2e2c0a09\"], \"score\": 0.6209709459253888, \"text\": \"32.1\\u2020\\n\\nCertifications of Principal Executive Officer and Principal Financial Officer pursuant to 18 U.S.C. 
Section 1350, as adopted pursuant to Section 906 of the Sarbanes-Oxley Act of 2002.\\n\\n101\\n\\nThe following financial information from Lyft, Inc.\\u2019s Annual Report on Form 10-K for the fiscal year ended December 31, 2021 formatted in Inline XBRL (eXtensible Business Reporting Language): (i) Consolidated Statements of Operations for the fiscal years ended December 31, 2021, 2020 and 2019; (ii) Consolidated Statements of Comprehensive Income (Loss) for the fiscal years ended December 31, 2021, 2020, and 2019; (iii) Consolidated Balance Sheets as of December 31, 2021 and 2020; (iv) Consolidated Statements of Cash Flows for the fiscal years ended December 31, 2021, 2020, and 2019; (v) Consolidated Statements of Redeemable Convertible Preferred Stock and Stockholders\\u2019 Equity for the fiscal years ended December 31, 2021, 2020, and 2019; and (vi) Notes to the Consolidated Financial Statements.\", \"metadata\": {\"version\": \"v0\", \"chunk_order\": 817, \"document_type\": \"pdf\", \"unstructured_filetype\": \"application/pdf\", \"unstructured_languages\": [\"eng\"], \"unstructured_page_number\": 127, \"partitioned_by_unstructured\": true, \"associated_query\": \"What was Lyft's profit in 2020?\"}}]Lyft's profit in 2020 was not a profit but a net loss. According to the provided context, Lyft reported a net loss of $1,752,857,000 for the year ended December 31, 2020 [2]."
- }
- }
- ]
- }
- }
- }
-}
diff --git a/py/tests/regression/observed_outputs/test_user_management.json b/py/tests/regression/observed_outputs/test_user_management.json
deleted file mode 100644
index bdccd1240..000000000
--- a/py/tests/regression/observed_outputs/test_user_management.json
+++ /dev/null
@@ -1,131 +0,0 @@
-{
- "register_user": {
- "results": {
- "id": "45c36a22-6c43-56a7-b319-9abaae3f1bc0",
- "email": "test_6b09cf57@example.com",
- "is_active": true,
- "is_superuser": false,
- "created_at": "2024-10-03T23:00:58.216612Z",
- "updated_at": "2024-10-03T23:00:58.216612Z",
- "is_verified": false,
- "collection_ids": [],
- "hashed_password": "$2b$12$TSiUhA7qXile9sAIwsde0uDFXuAitx5l7d79zq7H8pQitGgl1lmMi",
- "verification_code_expiry": null,
- "name": null,
- "bio": null,
- "profile_picture": null
- }
- },
- "login_user": {
- "results": {
- "access_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzZiMDljZjU3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI4MDAwMDU4LjUxMjY4NCwidG9rZW5fdHlwZSI6ImFjY2VzcyJ9.EnGuTyqILBZFTZdI22mYXjuNs0KsC03Mu7wDh8I6VUE",
- "token_type": "access"
- },
- "refresh_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzZiMDljZjU3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI4NjAxMjU4LCJ0b2tlbl90eXBlIjoicmVmcmVzaCJ9.twvIBFxz_I8nODyNcWW8keJvHL7PEFBieqLm18pxol8",
- "token_type": "refresh"
- }
- }
- },
- "user_info": {
- "results": {
- "id": "45c36a22-6c43-56a7-b319-9abaae3f1bc0",
- "email": "test_6b09cf57@example.com",
- "is_active": true,
- "is_superuser": false,
- "created_at": "2024-10-03T23:00:58.216612Z",
- "updated_at": "2024-10-03T23:00:58.216612Z",
- "is_verified": true,
- "collection_ids": [
- "20efadf0-b06f-5685-a969-ba2800d8fee4"
- ],
- "hashed_password": "$2b$12$TSiUhA7qXile9sAIwsde0uDFXuAitx5l7d79zq7H8pQitGgl1lmMi",
- "verification_code_expiry": null,
- "name": null,
- "bio": null,
- "profile_picture": null
- }
- },
- "change_password": {
- "results": {
- "message": "Password changed successfully"
- }
- },
- "update_profile": {
- "results": "AuthMethods.update_user() missing 1 required positional argument: 'user_id'"
- },
- "refresh_token": {
- "results": {
- "access_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzZiMDljZjU3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI4MDAwMDU5LjAzNjY0MywidG9rZW5fdHlwZSI6ImFjY2VzcyJ9.jqD5fmtx_FP6OGA8N2cjIWx9lD-MBVqP1EnY564ecVM",
- "token_type": "access"
- },
- "refresh_token": {
- "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0XzZiMDljZjU3QGV4YW1wbGUuY29tIiwiZXhwIjoxNzI4NjAxMjU5LCJ0b2tlbl90eXBlIjoicmVmcmVzaCJ9.dQKCMP8-irRaZ0NiP7Pkh6aQdHDwG_ZTyIDiLFZV9vw",
- "token_type": "refresh"
- }
- }
- },
- "superuser_test": {
- "results": [
- {
- "user_id": "2acb499e-8428-543b-bd85-0d9098718220",
- "email": "admin@example.com",
- "is_superuser": true,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-10-03T16:58:58.104181Z",
- "updated_at": "2024-10-03T16:58:58.104181Z",
- "collection_ids": [
- "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
- ],
- "num_files": 15,
- "total_size_in_bytes": 8862496,
- "document_ids": [
- "5b1bd54f-4d70-56b9-a017-a618bc75f94c",
- "30f950f0-c692-57c5-b6ec-ff78ccf5ccdc",
- "2f576170-c4f9-5141-a910-a0924f341de4",
- "3e157b3a-8469-51db-90d9-52e7d896b49b",
- "451adbbd-e24e-5c9b-80a8-f424b6c7accb",
- "d421207a-d799-5806-8d67-46b2005b15d4",
- "db02076e-989a-59cd-98d5-e24e15a0bd27",
- "716fea3a-826b-5b27-8e59-ffbd1a35455a",
- "7b0f40c5-2ace-5781-ae35-ead99ddee8c5",
- "c3291abf-8a4e-5d9d-80fd-232ef6fd8526",
- "e797da22-8c5d-54e5-bed5-a55954cf6bf9",
- "7c1105fc-8f62-5a8f-ac81-fe88f3ec9e4c",
- "57eec3df-cf68-5559-a80d-ae3fb55b9af1",
- "01d514a4-5cb1-5c86-be8c-0c5c312a02c9",
- "52e12576-090f-59db-91f4-6d4b2e29ae6c"
- ]
- },
- {
- "user_id": "45c36a22-6c43-56a7-b319-9abaae3f1bc0",
- "email": "test_6b09cf57@example.com",
- "is_superuser": false,
- "is_active": true,
- "is_verified": true,
- "created_at": "2024-10-03T23:00:58.216612Z",
- "updated_at": "2024-10-03T23:00:59.019061Z",
- "collection_ids": [
- "20efadf0-b06f-5685-a969-ba2800d8fee4"
- ],
- "num_files": 0,
- "total_size_in_bytes": 0,
- "document_ids": []
- }
- ],
- "total_entries": 2
- },
- "logout": {
- "results": {
- "message": "Logged out successfully"
- }
- },
- "delete_account": {
- "results": {
- "message": "User account f7495bfb-58e1-539e-8b8d-6cb61f821c2b deleted successfully."
- }
- }
-}
diff --git a/py/tests/regression/runner.py b/py/tests/regression/runner.py
deleted file mode 100644
index 2e9d577ac..000000000
--- a/py/tests/regression/runner.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import argparse
-import importlib
-import os
-
-from colorama import Fore, Style, init
-from test_cases.base import BaseTest, RegressionTest
-
-# TODO: need to import this from the package, not from the local directory
-from r2r import R2RClient
-
-
-class RegressionTestRunner:
- def __init__(
- self,
- check_only: bool = False,
- update_expected: bool = False,
- base_url: str = "http://localhost:7272",
- ):
- self.client = R2RClient(base_url=base_url)
- self.tests: list[BaseTest] = []
- self.test_order = [
- "TestDocumentManagement",
- "TestRetrieval",
- "TestUserManagement",
- "TestObservability",
- "TestGroupManagement",
- ]
- self.check_only = check_only
- self.update_expected = update_expected
-
- if not check_only:
- self.outputs_dir = os.path.join(
- os.path.dirname(__file__),
- (
- "expected_outputs"
- if self.update_expected
- else "observed_outputs"
- ),
- )
- os.makedirs(self.outputs_dir, exist_ok=True)
-
- def load_tests(self):
- test_dir = os.path.join(os.path.dirname(__file__), "test_cases")
- for class_name in self.test_order:
- # Convert camel case to snake case
- snake_case = "".join(
- ["_" + c.lower() if c.isupper() else c for c in class_name]
- ).lstrip("_")
- filename = f"test_{snake_case[5:]}.py" # Remove "test_" prefix
- if filename in os.listdir(test_dir):
- module_name = f"tests.regression.test_cases.{filename[:-3]}"
- module = importlib.import_module(module_name)
- test_class = getattr(module, class_name)
- self.tests.append(test_class(self.client))
-
- def run_all(self) -> bool:
- for test in self.tests:
- print(
- f"{Fore.CYAN}Running test suite: {test.__class__.__name__}{Style.RESET_ALL}"
- )
- test.run_and_save_outputs(self.outputs_dir)
-
- return self.compare_all() if not self.update_expected else True
-
- def compare_all(self) -> bool:
- all_passed = True
- expected_outputs_dir = os.path.join(
- os.path.dirname(__file__), "expected_outputs"
- )
- observed_outputs_dir = os.path.join(
- os.path.dirname(__file__), "observed_outputs"
- )
- print(
- f"\n{Fore.CYAN}Comparing results for test suites:{Style.RESET_ALL}"
- )
- for test in self.tests:
- if test.compare_outputs(
- observed_outputs_dir, expected_outputs_dir
- ):
- print(
- f"{Fore.GREEN}{test.__class__.__name__} ✓{Style.RESET_ALL}"
- )
- else:
- print(
- f"{Fore.RED}{test.__class__.__name__} ✗{Style.RESET_ALL}"
- )
- all_passed = False
- return all_passed
-
- def update_all_expected_outputs(self):
- for test in self.tests:
- print(
- f"{Fore.YELLOW}Updating expected output for test suite: {test.__class__.__name__}{Style.RESET_ALL}"
- )
- test.update_expected_outputs(self.outputs_dir)
-
-
-def main():
- parser = argparse.ArgumentParser(description="Run regression tests")
- parser.add_argument(
- "--check-only",
- action="store_true",
- help="Run in check mode (compare existing outputs without running tests)",
- )
- parser.add_argument(
- "--update-expected",
- action="store_true",
- help="Run in update mode (update expected outputs)",
- )
- args = parser.parse_args()
-
- runner = RegressionTestRunner(args.check_only, args.update_expected)
- runner.load_tests()
-
- if args.check_only:
- print(f"{Fore.CYAN}Running in check-only mode{Style.RESET_ALL}")
- success = runner.compare_all()
- elif runner.update_expected:
- print(f"{Fore.YELLOW}Updating expected outputs{Style.RESET_ALL}")
- runner.run_all() # Run tests to generate outputs
- runner.update_all_expected_outputs()
- if os.environ.get("CHECK_UPDATED_OUTPUTS", "").lower() != "true":
- success = runner.compare_all()
- else:
- success = True
- else:
- print(f"{Fore.CYAN}Running all tests{Style.RESET_ALL}")
- success = runner.run_all()
-
- if success:
- print(f"\n{Fore.GREEN}All tests passed successfully!{Style.RESET_ALL}")
- else:
- print(
- f"\n{Fore.RED}Some tests failed. Please check the output above for details.{Style.RESET_ALL}"
- )
- exit(1)
-
-
-if __name__ == "__main__":
- main()
diff --git a/py/tests/regression/test_cases/__init__.py b/py/tests/regression/test_cases/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/regression/test_cases/base.py b/py/tests/regression/test_cases/base.py
deleted file mode 100644
index afcf57647..000000000
--- a/py/tests/regression/test_cases/base.py
+++ /dev/null
@@ -1,273 +0,0 @@
-import json
-import os
-import re
-from typing import Any, Callable, Optional
-
-from colorama import Fore, Style
-from deepdiff import DeepDiff
-
-# TODO: need to import this from the package, not from the local directory
-from r2r import R2RClient
-
-
-def _to_snake_case(name: str) -> str:
- pattern = re.compile(r"(?<!^)(?=[A-Z])")
- return pattern.sub("_", name).lower()
-
-
-class RegressionTest:
- def __init__(
- self,
- name: str,
- test_function: Callable[[R2RClient], Any],
- expected_output: dict[str, Any],
- exclude_paths: list[str] = [],
- ):
- self.name = name
- self.test_function = test_function
- self.expected_output = expected_output
- self.exclude_paths = exclude_paths
-
- def run(self, client: R2RClient) -> bool:
- result = self._run_test(client)
- return self._compare_output(result, self.expected_output)
-
- def update_expected_output(self, client: R2RClient):
- result = self._run_test(client)
- self._save_expected_output(result)
-
- def _run_test(self, client: R2RClient) -> dict[str, Any]:
- return self.test_function(client)
-
- def _load_expected_output(self) -> dict[str, Any]:
- with open(self.expected_output_file, "r") as f:
- return json.load(f)
-
- def _save_expected_output(self, output: dict[str, Any]):
- with open(self.expected_output_file, "w") as f:
- json.dump(output, f, indent=2)
-
- def _compare_output(
- self, actual: dict[str, Any], expected: dict[str, Any]
- ) -> bool:
- diff = self._custom_diff(expected, actual)
- if diff:
- print(f"\nTest {self.name} failed. Differences found:")
- print(json.dumps(diff, indent=2))
- return False
- return True
-
- def _custom_diff(
- self, expected: dict[str, Any], actual: dict[str, Any]
- ) -> dict[str, Any]:
- diff = {}
-
- expected_results = expected.get("results", {})
- actual_results = actual.get("results", {})
-
- if "completion" in expected_results and "completion" in actual_results:
- # Custom comparison for content field
- expected_completion = self._get_completion_content(
- expected_results
- )
- actual_completion = self._get_completion_content(actual_results)
- if (
- expected_completion
- and actual_completion
- and not self._fuzzy_content_match(
- expected_completion, actual_completion
- )
- ):
- diff["content_mismatch"] = {
- "expected": expected_completion,
- "actual": actual_completion,
- }
-
- # Use DeepDiff for the rest, ignoring specified fields
- try:
- deep_diff = DeepDiff(
- expected_results,
- actual_results,
- ignore_order=True,
- exclude_paths=self.exclude_paths,
- )
- if deep_diff:
- diff["other_differences"] = self._serialize_deep_diff(
- deep_diff
- )
- # Print the specific fields that are different
- for change_type, changes in deep_diff.items():
- if change_type == "values_changed":
- for path, change in changes.items():
- print(f"Field '{path}' changed:")
- print(f" Expected: {change['old_value']}")
- print(f" Actual: {change['new_value']}")
-
- except Exception as e:
- diff["deepdiff_error"] = (
- f"Error in DeepDiff comparison: {str(e)}"
- )
- return diff
- elif (
- "completion" in expected_results or "completion" in actual_results
- ):
- diff["content_mismatch"] = {
- "expected_results": expected_results,
- "actual_results": actual_results,
- }
- else:
- deep_diff = DeepDiff(
- expected_results,
- actual_results,
- ignore_order=True,
- exclude_paths=self.exclude_paths,
- )
-
- if deep_diff:
- diff["other_differences"] = self._serialize_deep_diff(
- deep_diff
- )
- # Print the specific fields that are different
- for change_type, changes in deep_diff.items():
- if change_type == "values_changed":
- for path, change in changes.items():
- print(f"Field '{path}' changed:")
- print(f" Expected: {change['old_value']}")
- print(f" Actual: {change['new_value']}")
-
- return self._serialize_deep_diff(deep_diff)
-
- return diff
-
- def _serialize_deep_diff(self, deep_diff):
- if isinstance(deep_diff, dict):
- serializable_diff = {}
- for key, value in deep_diff.items():
- if isinstance(value, (dict, list)):
- serializable_diff[key] = self._serialize_deep_diff(value)
- elif isinstance(value, (int, float, str, bool, type(None))):
- serializable_diff[key] = value
- else:
- serializable_diff[key] = str(value)
- return serializable_diff
- elif isinstance(deep_diff, list):
- return [self._serialize_deep_diff(item) for item in deep_diff]
- elif isinstance(deep_diff, (int, float, str, bool, type(None))):
- return deep_diff
- else:
- return str(deep_diff)
-
- def _get_completion_content(self, data: dict[str, Any]) -> Optional[str]:
- try:
- return data["completion"]["choices"][0]["message"]["content"]
- except (KeyError, IndexError):
- return None
-
- def _fuzzy_content_match(
- self, expected: str, actual: str, threshold: float = 0.6
- ) -> bool:
- expected_words = set(re.findall(r"\w+", expected.lower()))
- actual_words = set(re.findall(r"\w+", actual.lower()))
- common_words = expected_words.intersection(actual_words)
- similarity = len(common_words) / max(
- len(expected_words), len(actual_words)
- )
- return similarity >= threshold
-
-
-class BaseTest:
- def __init__(self, client: R2RClient):
- self.client = client
- self.expected_outputs_file = os.path.join(
- os.path.dirname(os.path.dirname(__file__)),
- "expected_outputs",
- f"{_to_snake_case(self.__class__.__name__)}.json",
- )
- self.exclude_paths_map = {}
-
- def run_and_save_outputs(self, actual_outputs_dir: str):
- actual_outputs = {}
- for test_name, test_func in self.get_test_cases().items():
- snake_case_name = _to_snake_case(test_name)
- print(f" Running test: {snake_case_name}")
- result = test_func(self.client)
- actual_outputs[snake_case_name] = result
-
- actual_outputs_file = os.path.join(
- actual_outputs_dir,
- f"{_to_snake_case(self.__class__.__name__)}.json",
- )
- with open(actual_outputs_file, "w") as f:
- json.dump(actual_outputs, f, indent=2)
-
- def compare_outputs(
- self, observed_outputs_dir: str, expected_outputs_dir: str
- ) -> bool:
- all_passed = True
- expected_outputs_file = os.path.join(
- expected_outputs_dir,
- f"{_to_snake_case(self.__class__.__name__)}.json",
- )
- observed_outputs_file = os.path.join(
- observed_outputs_dir,
- f"{_to_snake_case(self.__class__.__name__)}.json",
- )
-
- with open(expected_outputs_file, "r") as f:
- expected_outputs = json.load(f)
-
- with open(observed_outputs_file, "r") as f:
- observed_outputs = json.load(f)
-
- for test_name in self.get_test_cases().keys():
- snake_case_name = _to_snake_case(test_name)
- exclude_paths = self.exclude_paths_map.get(snake_case_name, [])
- regression_test = RegressionTest(
- test_name,
- lambda x: x,
- expected_outputs.get(snake_case_name, {}),
- exclude_paths,
- )
- if regression_test._compare_output(
- observed_outputs.get(snake_case_name, {}),
- expected_outputs.get(snake_case_name, {}),
- ):
- print(
- f"{Fore.GREEN} Test {snake_case_name} passed ✓{Style.RESET_ALL}"
- )
- else:
- print(
- f"{Fore.RED} Test {snake_case_name} failed ✗{Style.RESET_ALL}"
- )
- all_passed = False
-
- return all_passed
-
- def update_expected_outputs(self, actual_outputs_dir: str):
- actual_outputs_file = os.path.join(
- actual_outputs_dir,
- f"{_to_snake_case(self.__class__.__name__)}.json",
- )
- with open(actual_outputs_file, "r") as f:
- actual_outputs = json.load(f)
-
- with open(self.expected_outputs_file, "w") as f:
- json.dump(actual_outputs, f, indent=2)
-
- def _load_expected_outputs(self) -> dict[str, Any]:
- if os.path.exists(self.expected_outputs_file):
- with open(self.expected_outputs_file, "r") as f:
- return json.load(f)
- return {}
-
- def set_exclude_paths(self, test_name: str, exclude_paths: list[str] = []):
- self.exclude_paths_map[_to_snake_case(test_name)] = exclude_paths
-
- def get_test_cases(self) -> dict[str, callable]:
- raise NotImplementedError(
- "Subclasses must implement get_test_cases method"
- )
diff --git a/py/tests/regression/test_cases/test_document_management.py b/py/tests/regression/test_cases/test_document_management.py
deleted file mode 100644
index 0c7bee8c3..000000000
--- a/py/tests/regression/test_cases/test_document_management.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import os
-import time
-
-from core import R2RException
-from tests.regression.test_cases.base import BaseTest
-
-
-class TestDocumentManagement(BaseTest):
- CHUNKS_FILE_ID = "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
- UPDATE_FILE_ID = "9fbe403b-c11c-5aae-8ade-ef22980c3ad1"
- DELETE_FILE_ID = "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
-
- def __init__(self, client):
- super().__init__(client)
- exclude_paths = [f"root[{i}]['created_at']" for i in range(20)] + [
- f"root[{i}]['updated_at']" for i in range(20)
- ]
-
- self.set_exclude_paths("documents_overview", exclude_paths)
- self.set_exclude_paths("rerun_documents_overview", exclude_paths)
-
- def get_test_cases(self):
- return {
- "ingest_sample_files": lambda client: self.ingest_sample_files_test(
- client
- ),
- "reingest_sample_file": lambda client: self.ingest_sample_files_test(
- client, do_sleep=False
- ),
- "documents_overview": lambda client: self.documents_overview_test(
- client
- ),
- "document_chunks_test": lambda client: self.document_chunks_test(
- client
- ),
- "update_document_test": lambda client: self.update_document_test(
- client
- ),
- "rerun_documents_overview_test_1": lambda client: self.documents_overview_test(
- client
- ),
- "delete_document_test": lambda client: self.delete_document_test(
- client
- ),
- "rerun_documents_overview_test_2": lambda client: self.documents_overview_test(
- client
- ),
- "rerun_document_chunks_test": lambda client: self.document_chunks_test(
- client
- ),
- }
-
- def ingest_sample_files_test(self, client, do_sleep=True):
- file_path = os.path.abspath(__file__)
- data_path = os.path.join(
- os.path.dirname(file_path),
- "..",
- "..",
- "..",
- "core",
- "examples",
- "data",
- )
- try:
- result = client.ingest_files(
- [
- os.path.join(data_path, file_name)
- for file_name in os.listdir(data_path)
- ]
- )
- if do_sleep:
- time.sleep(300)
- return result
- except R2RException as e:
- return {"results": str(e)}
-
- def documents_overview_test(self, client):
- try:
- return client.documents_overview()
- except R2RException as e:
- return {"results": str(e)}
-
- def document_chunks_test(self, client):
- try:
- # Now delete the file
- chunks_response = client.list_document_chunks(
- TestDocumentManagement.CHUNKS_FILE_ID
- )
- return chunks_response
- except R2RException as e:
- return {"results": str(e)}
-
- def update_document_test(self, client):
- try:
- # Now update the file
- file_path = os.path.join(
- os.path.dirname(__file__),
- "..",
- "..",
- "..",
- "core",
- "examples",
- "data",
- "aristotle_v2.txt",
- )
- update_response = client.update_files(
- [file_path], [TestDocumentManagement.UPDATE_FILE_ID]
- )
- time.sleep(20)
- return update_response
- except R2RException as e:
- return {"results": str(e)}
-
- def delete_document_test(self, client):
- try:
- # Now delete the file
- delete_response = client.delete(
- {"document_id": {"$eq": TestDocumentManagement.DELETE_FILE_ID}}
- )
- return delete_response
- except R2RException as e:
- return {"results": str(e)}
diff --git a/py/tests/regression/test_cases/test_graph_creation.py b/py/tests/regression/test_cases/test_graph_creation.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/py/tests/regression/test_cases/test_group_management.py b/py/tests/regression/test_cases/test_group_management.py
deleted file mode 100644
index 543c4e667..000000000
--- a/py/tests/regression/test_cases/test_group_management.py
+++ /dev/null
@@ -1,169 +0,0 @@
-import uuid
-
-from tests.regression.test_cases.base import BaseTest
-
-
-class TestGroupManagement(BaseTest):
- def __init__(self, client):
- super().__init__(client)
- self.admin_email = "admin@example.com"
- self.admin_password = "change_me_immediately"
- self.user_password = "test_password"
-
- keys_to_ignore = ["group_id", "created_at", "updated_at"]
- self.set_exclude_paths(
- "create_groups_test",
- [f"root['group_1']['results']['{key}']" for key in keys_to_ignore]
- + [
- f"root['group_1']['results']['{key}']"
- for key in keys_to_ignore
- ],
- )
-
- keys_to_ignore += ["hashed_password"]
- self.set_exclude_paths(
- "add_users_to_groups_test",
- [f"root['user_1']['results']['{key}']" for key in keys_to_ignore]
- + [
- f"root['group_1']['results']['{key}']"
- for key in keys_to_ignore
- ],
- )
-
- def get_test_cases(self):
- return {
- "create_groups": self.create_groups_test,
- "add_users_to_groups": self.add_users_to_groups_test,
- "group_based_document_access": self.group_based_document_access_test,
- "admin_ingest_documents": self.admin_ingest_documents_test,
- "user_ingest_and_search": self.user_ingest_and_search_test,
- "cleanup": self.cleanup_test,
- }
-
- def create_groups_test(self, client):
- try:
- client.login(self.admin_email, self.admin_password)
- group_name_1 = f"Test Group 1 {uuid.uuid4()}"
- group_name_2 = f"Test Group 2 {uuid.uuid4()}"
- group_1 = client.create_group(
- group_name_1, "A test group for permissions"
- )
- group_2 = client.create_group(
- group_name_2, "Another test group for permissions"
- )
- self.group_id_1 = group_1["results"]["group_id"]
- self.group_id_2 = group_2["results"]["group_id"]
- return {"group_1": group_1, "group_2": group_2}
- except Exception as e:
- return {"error": str(e)}
-
- def add_users_to_groups_test(self, client):
- try:
- self.user_email_1 = f"user1_{uuid.uuid4()}@example.com"
- self.user_email_2 = f"user2_{uuid.uuid4()}@example.com"
- user_1 = client.register(self.user_email_1, self.user_password)
- user_2 = client.register(self.user_email_2, self.user_password)
- self.user_id_1 = user_1["results"]["id"]
- self.user_id_2 = user_2["results"]["id"]
-
- client.add_user_to_group(self.user_id_1, self.group_id_1)
- client.add_user_to_group(self.user_id_2, self.group_id_2)
- client.add_user_to_group(self.user_id_2, self.group_id_1)
-
- return {"user_1": user_1, "user_2": user_2}
- except Exception as e:
- return {"error": str(e)}
-
- def group_based_document_access_test(self, client):
- try:
- # Admin ingests a document for group 1
- client.login(self.admin_email, self.admin_password)
- admin_ingest = client.ingest_files(
- file_paths=["tests/regression/test_data/test_document.txt"],
- metadatas=[{"group_ids": [str(self.group_id_1)]}],
- )
-
- # User 1 searches for documents
- client.login(self.user_email_1, self.user_password)
- user_1_search = client.search("test document")
-
- # User 2 searches for documents
- client.login(self.user_email_2, self.user_password)
- user_2_search = client.search("test document")
-
- return {
- "admin_ingest": admin_ingest,
- "user_1_search": user_1_search,
- "user_2_search": user_2_search,
- }
- except Exception as e:
- return {"error": str(e)}
-
- def admin_ingest_documents_test(self, client):
- try:
- client.login(self.admin_email, self.admin_password)
-
- # Admin ingests a document for group 1
- admin_ingest_group1 = client.ingest_files(
- file_paths=[
- "tests/regression/test_data/admin_document_group1.txt"
- ],
- metadatas=[{"group_ids": [str(self.group_id_1)]}],
- )
-
- # Admin ingests a document for user 1
- admin_ingest_user1 = client.ingest_files(
- file_paths=[
- "tests/regression/test_data/admin_document_user1.txt"
- ]
- )
-
- return {
- "admin_ingest_group1": admin_ingest_group1,
- "admin_ingest_user1": admin_ingest_user1,
- }
- except Exception as e:
- return {"error": str(e)}
-
- def user_ingest_and_search_test(self, client):
- try:
- # User 1 actions
- client.login(self.user_email_1, self.user_password)
- user_1_ingest = client.ingest_files(
- file_paths=["tests/regression/test_data/user1_document.txt"]
- )
- user_1_ingest_group = client.ingest_files(
- file_paths=[
- "tests/regression/test_data/user1_document_group.txt"
- ],
- metadatas=[{"group_ids": [str(self.group_id_1)]}],
- )
-
- user_1_search = client.search("document")
-
- # User 2 actions
- client.login(self.user_email_2, self.user_password)
- user_2_ingest = client.ingest_files(
- file_paths=["tests/regression/test_data/user2_document.txt"]
- )
- user_2_search = client.search("document")
-
- return {
- "user_1_ingest": user_1_ingest,
- "user_1_search": user_1_search,
- "user_2_ingest": user_2_ingest,
- "user_2_search": user_2_search,
- }
- except Exception as e:
- return {"error": str(e)}
-
- def cleanup_test(self, client):
- try:
- client.login(self.admin_email, self.admin_password)
- client.delete_group(self.group_id_1)
- client.delete_group(self.group_id_2)
- client.delete_user(self.user_id_1, self.user_password)
- client.delete_user(self.user_id_2, self.user_password)
- return {"status": "cleanup completed"}
- except Exception as e:
- return {"error": str(e)}
diff --git a/py/tests/regression/test_cases/test_observability.py b/py/tests/regression/test_cases/test_observability.py
deleted file mode 100644
index a65071434..000000000
--- a/py/tests/regression/test_cases/test_observability.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from tests.regression.test_cases.base import BaseTest
-
-
-class TestObservability(BaseTest):
- def __init__(self, client):
- super().__init__(client)
- # Add exclude_paths as needed
-
- def get_test_cases(self):
- return {
- "users_overview": lambda client: self.users_overview_test(client),
- "logs": lambda client: self.logs_test(client),
- "analytics": lambda client: self.analytics_test(client),
- }
-
- def users_overview_test(self, client):
- return client.users_overview()
-
- def logs_test(self, client):
- return client.logs()
-
- def analytics_test(self, client):
- return client.analytics(
- {"search_latencies": "search_latency"},
- {"search_latencies": ["basic_statistics", "search_latency"]},
- )
diff --git a/py/tests/regression/test_cases/test_retrieval.py b/py/tests/regression/test_cases/test_retrieval.py
deleted file mode 100644
index 98de4b0e3..000000000
--- a/py/tests/regression/test_cases/test_retrieval.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from tests.regression.test_cases.base import BaseTest
-
-
-class TestRetrieval(BaseTest):
- RAG_EXCLUSIONS = (
- [
- f"root['completion']['{field}']"
- for field in [
- "id",
- "system_fingerprint",
- "usage",
- "created",
- ]
- ]
- + ["root['completion']['choices'][0]['message']['content']"]
- + [f"root['search_results'][{i}]['score']" for i in range(10)]
- )
-
- def __init__(self, client):
- super().__init__(client)
- # ignore scores since math epsilon fails for nested floats, exact text is sufficient.
- self.set_exclude_paths(
- "search",
- [
- f"root['vector_search_results'][{i}]['score']"
- for i in range(10)
- ],
- )
- self.set_exclude_paths("basic_rag", TestRetrieval.RAG_EXCLUSIONS)
- self.set_exclude_paths("hybrid_rag", TestRetrieval.RAG_EXCLUSIONS)
- self.set_exclude_paths("streaming_rag", TestRetrieval.RAG_EXCLUSIONS)
-
- def get_test_cases(self):
- return {
- "search": lambda client: self.search_test(client),
- "basic_rag": lambda client: self.basic_rag_test(client),
- "hybrid_rag": lambda client: self.hybrid_rag_test(client),
- "streaming_rag": lambda client: self.streaming_rag_test(client),
- }
-
- def search_test(self, client):
- try:
- return client.search("What is the capital of France?")
- except Exception as e:
- return {"results": str(e)}
-
- def hybrid_search_test(self, client):
- try:
- return client.search(
- "What is the capital of France?", {"use_hybrid_search": True}
- )
- except Exception as e:
- return {"results": str(e)}
-
- def basic_rag_test(self, client):
- try:
- return client.rag("What was Uber's profit in 2020?")
- except Exception as e:
- return {"results": str(e)}
-
- def hybrid_rag_test(self, client):
- try:
- return client.rag("Who is Jon Snow?", {"use_hybrid_search": True})
- except Exception as e:
- return {"results": str(e)}
-
- def streaming_rag_test(self, client):
- try:
- response = client.rag(
- "What was Lyft's profit in 2020?",
- rag_generation_config={"stream": True},
- )
- return {
- "results": {
- "completion": {
- "choices": [
- {
- "message": {
- "content": f"{''.join([chunk for chunk in response])}"
- }
- }
- ]
- }
- }
- }
- except Exception as e:
- return {"results": str(e)}
diff --git a/py/tests/regression/test_cases/test_user_management.py b/py/tests/regression/test_cases/test_user_management.py
deleted file mode 100644
index c405a7511..000000000
--- a/py/tests/regression/test_cases/test_user_management.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import uuid
-
-from tests.regression.test_cases.base import BaseTest
-
-
-class TestUserManagement(BaseTest):
- VARIABLE_USER_FIELDS = [
- "id",
- "email",
- "created_at",
- "updated_at",
- "hashed_password",
- "access_token",
- "refresh_token",
- ]
-
- def __init__(self, client):
- super().__init__(client)
- self.set_exclude_paths(
- "register_user",
- [
- f"root['{field}']"
- for field in TestUserManagement.VARIABLE_USER_FIELDS
- ],
- )
- self.set_exclude_paths(
- "login_user",
- [
- f"root['{field}']"
- for field in ["access_token", "refresh_token"]
- ],
- )
- self.set_exclude_paths(
- "user_info",
- [
- f"root['{field}']"
- for field in TestUserManagement.VARIABLE_USER_FIELDS
- ],
- )
- self.set_exclude_paths(
- "change_password",
- [
- f"root['{field}']"
- for field in TestUserManagement.VARIABLE_USER_FIELDS
- ],
- )
- self.set_exclude_paths(
- "update_profile",
- [
- f"root['{field}']"
- for field in TestUserManagement.VARIABLE_USER_FIELDS
- ],
- )
- self.set_exclude_paths(
- "refresh_token",
- [
- f"root['{field}']"
- for field in TestUserManagement.VARIABLE_USER_FIELDS
- ],
- )
- self.user_id_string = str(uuid.uuid4()).split("-")[0]
- self.user_id_string_2 = str(uuid.uuid4()).split("-")[0]
-
- def get_test_cases(self):
- return {
- "register_user": lambda client: self.register_user_test(client),
- "login_user": lambda client: self.login_user_test(client),
- "user_info": lambda client: self.user_info_test(client),
- "change_password": lambda client: self.change_password_test(
- client
- ),
- # "reset_password": lambda client: self.reset_password_test(client),
- "update_profile": lambda client: self.update_profile_test(client),
- "refresh_token": lambda client: self.refresh_token_test(client),
- "superuser_test": lambda client: self.superuser_test(client),
- "logout": lambda client: self.logout_test(client),
- "delete_account": lambda client: self.delete_user_test(client),
- "login_user": lambda client: self.login_user_test(client),
- "refresh_token": lambda client: self.refresh_token_test(client),
- }
-
- def register_user_test(self, client):
- try:
- email = f"test_{self.user_id_string}@example.com"
- password = "password123"
- user = client.register(email, password)
- self.user = user
- return user
- except Exception as e:
- return {"results": str(e)}
-
- def login_user_test(self, client):
- try:
- email = f"test_{self.user_id_string}@example.com"
- password = "password123"
- login = client.login(email, password)
- return login
- except Exception as e:
- return {"results": str(e)}
-
- def user_info_test(self, client):
- try:
- return client.user()
- except Exception as e:
- return {"results": str(e)}
-
- def change_password_test(self, client):
- try:
- return client.change_password("password123", "new_password")
- except Exception as e:
- return {"results": str(e)}
-
- # def reset_password_test(self, client):
- # try:
- # reset_request = client.request_password_reset("test@example.com")
- # # In a real scenario, we'd need to get the reset token from the email
- # reset_token = "mock_reset_token"
- # return client.confirm_password_reset(reset_token, "new_password")
- # except Exception as e:
- # return {"results": str(e)}
-
- def update_profile_test(self, client):
- try:
- return client.update_user(name="John Doe", bio="R2R enthusiast")
- except Exception as e:
- return {"results": str(e)}
-
- def delete_user_test(self, client):
- try:
- email = f"test_{self.user_id_string_2}@example.com"
- password = "password123"
- user = client.register(email, password)
-
- return client.delete_user(user["results"]["id"], "password123")
- except Exception as e:
- return {"results": str(e)}
-
- def refresh_token_test(self, client):
- try:
- return client.refresh_access_token()
- except Exception as e:
- return {"results": str(e)}
-
- def superuser_test(self, client):
- try:
- # Login as admin
- client.login("admin@example.com", "change_me_immediately")
- return client.users_overview()
- except Exception as e:
- return {"results": str(e)}
-
- def logout_test(self, client):
- try:
- return client.logout()
- except Exception as e:
- return {"results": str(e)}
diff --git a/py/tests/regression/test_data/admin_document_group1.txt b/py/tests/regression/test_data/admin_document_group1.txt
deleted file mode 100644
index 4c2f5aba5..000000000
--- a/py/tests/regression/test_data/admin_document_group1.txt
+++ /dev/null
@@ -1 +0,0 @@
-admin_document_group1
diff --git a/py/tests/regression/test_data/admin_document_user1.txt b/py/tests/regression/test_data/admin_document_user1.txt
deleted file mode 100644
index 2010bbe57..000000000
--- a/py/tests/regression/test_data/admin_document_user1.txt
+++ /dev/null
@@ -1 +0,0 @@
-admin_document_user1
diff --git a/py/tests/regression/test_data/test_document.txt b/py/tests/regression/test_data/test_document.txt
deleted file mode 100644
index 402428f6c..000000000
--- a/py/tests/regression/test_data/test_document.txt
+++ /dev/null
@@ -1 +0,0 @@
-this is a test document
diff --git a/py/tests/regression/test_data/user1_document.txt b/py/tests/regression/test_data/user1_document.txt
deleted file mode 100644
index 2eb90e11e..000000000
--- a/py/tests/regression/test_data/user1_document.txt
+++ /dev/null
@@ -1 +0,0 @@
-user1_document
diff --git a/py/tests/regression/test_data/user1_document_group.txt b/py/tests/regression/test_data/user1_document_group.txt
deleted file mode 100644
index 1c6e16b61..000000000
--- a/py/tests/regression/test_data/user1_document_group.txt
+++ /dev/null
@@ -1 +0,0 @@
-user1_document_group
diff --git a/py/tests/regression/test_data/user2_document.txt b/py/tests/regression/test_data/user2_document.txt
deleted file mode 100644
index 79b842e10..000000000
--- a/py/tests/regression/test_data/user2_document.txt
+++ /dev/null
@@ -1 +0,0 @@
-user2_document
diff --git a/services/clustering/Dockerfile.clustering b/services/clustering/Dockerfile.clustering
new file mode 100644
index 000000000..47af2fabe
--- /dev/null
+++ b/services/clustering/Dockerfile.clustering
@@ -0,0 +1,25 @@
+FROM python:3.12-slim
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc g++ musl-dev curl libffi-dev \
+ && apt-get clean && rm -rf /var/lib/apt/lists/* \
+ && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+# Add Rust to PATH
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# Install graspologic and other dependencies
+RUN pip install --no-cache-dir fastapi uvicorn networkx "graspologic[leiden]" future pydantic==2.8.2
+
+COPY main.py .
+
+EXPOSE 7276
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7276"]
diff --git a/services/clustering/main.py b/services/clustering/main.py
new file mode 100644
index 000000000..ddf7fb31c
--- /dev/null
+++ b/services/clustering/main.py
@@ -0,0 +1,88 @@
+import logging
+from typing import List
+
+import networkx as nx
+from fastapi import FastAPI, HTTPException
+
+# Requires networkx and graspologic installed with the "leiden" extra
+# (see Dockerfile.clustering).
+from graspologic.partition import hierarchical_leiden
+from pydantic import BaseModel
+
+app = FastAPI()
+logger = logging.getLogger("graspologic_service")
+logger.setLevel(logging.INFO)
+
+class Relationship(BaseModel):
+ id: str
+ subject: str
+ object: str
+ weight: float = 1.0
+
+class LeidenParams(BaseModel):
+ # Parameters forwarded to graspologic's hierarchical_leiden; defaults
+ # mirror the library's, except random_seed, which is pinned so repeated
+ # runs over the same graph yield the same partition.
+ resolution: float = 1.0
+ randomness: float = 0.001
+ max_cluster_size: int = 1000
+ extra_forced_iterations: int = 0
+ use_modularity: bool = True
+ random_seed: int = 7272
+ weight_attribute: str = "weight"
+
+class ClusterRequest(BaseModel):
+ relationships: List[Relationship]
+ leiden_params: LeidenParams
+
+class CommunityAssignment(BaseModel):
+ node: str
+ cluster: int
+ level: int
+
+class ClusterResponse(BaseModel):
+ communities: List[CommunityAssignment]
+
+
+@app.post("/cluster", response_model=ClusterResponse)
+def cluster_graph(request: ClusterRequest):
+ try:
+ # Build graph from relationships
+ G = nx.Graph()
+ for rel in request.relationships:
+ G.add_edge(rel.subject, rel.object, weight=rel.weight, id=rel.id)
+
+ # Run hierarchical Leiden clustering. graspologic returns a flat list
+ # of assignments, one entry per (node, level) pair, each carrying
+ # node, cluster, and level attributes.
+ logger.info("Starting Leiden clustering")
+ communities = hierarchical_leiden(
+ G,
+ resolution=request.leiden_params.resolution,
+ randomness=request.leiden_params.randomness,
+ max_cluster_size=request.leiden_params.max_cluster_size,
+ extra_forced_iterations=request.leiden_params.extra_forced_iterations,
+ use_modularity=request.leiden_params.use_modularity,
+ random_seed=request.leiden_params.random_seed,
+ weight_attribute=request.leiden_params.weight_attribute,
+ )
+ logger.info("Leiden clustering complete")
+
+ # Map each assignment onto the response model.
+ assignments = [
+ CommunityAssignment(
+ node=c.node, cluster=c.cluster, level=c.level
+ )
+ for c in communities
+ ]
+
+ return ClusterResponse(communities=assignments)
+ except Exception as e:
+ logger.error(f"Error clustering graph: {e}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+def health():
+ return {"status": "ok"}