up docs (#1445)
shreyaspimpalgaonkar authored Oct 22, 2024
1 parent eaeab06 commit 5dd3a9d
Showing 13 changed files with 119 additions and 14 deletions.
4 changes: 4 additions & 0 deletions docs/api-reference/endpoint/completion.mdx
@@ -0,0 +1,4 @@
---
title: 'Completion'
openapi: 'GET /v2/completion'
---
4 changes: 4 additions & 0 deletions docs/api-reference/endpoint/deduplicate_entities.mdx
@@ -0,0 +1,4 @@
---
title: Deduplicate Entities
openapi: 'POST /v2/deduplicate_entities'
---
4 changes: 4 additions & 0 deletions docs/api-reference/endpoint/delete_entities_and_triples.mdx
@@ -0,0 +1,4 @@
---
title: 'Delete graph for collection'
openapi: 'DELETE /v2/delete_graph_for_collection'
---
2 changes: 1 addition & 1 deletion docs/api-reference/openapi.json

Large diffs are not rendered by default.

44 changes: 40 additions & 4 deletions docs/cookbooks/graphrag.mdx
@@ -290,14 +290,50 @@ If you are using R2R Full, you can log into the hatchet dashboard on http://loca
![Hatchet Dashboard](../images/kg_extraction_progress.png)


This step will create a knowledge graph with nodes and relationships. You can get the entities and relationships in the graph using our dashboard on http://localhost:7273 or by calling the following API endpoints:
This step will create a knowledge graph with nodes and relationships. You can inspect the entities and relationships in the graph using our dashboard on http://localhost:7273 or by calling the following API endpoints, which hit /v2/entities and /v2/triples respectively. We will use the `entity_level=document` query parameter to get the entities and triples at the document level, and the default collection id `122fdf6a-e116-546b-a8f6-e4cb2e2c0a09` in the examples below.

- Entities: [Entities](http://localhost:7272/v2/entities?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09&entity_level=document)
- Triples: [Triples](http://localhost:7272/v2/triples?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09&entity_level=document)
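As a sketch (assuming a local R2R server on http://localhost:7272, as used elsewhere in this cookbook), the same document-level queries can be assembled in Python. Only URL construction is shown, so nothing is sent until you call `requests.get` yourself:

```python
# Sketch: building the /v2/entities and /v2/triples query URLs.
# Assumes a local R2R server on http://localhost:7272; the helper only
# builds the request URL so it can be inspected before sending.
from urllib.parse import urlencode

BASE_URL = "http://localhost:7272"
COLLECTION_ID = "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"  # default collection id

def build_graph_url(endpoint: str, entity_level: str = "document") -> str:
    """Build a /v2/{endpoint} URL scoped to the default collection."""
    params = urlencode({"collection_id": COLLECTION_ID, "entity_level": entity_level})
    return f"{BASE_URL}/v2/{endpoint}?{params}"

entities_url = build_graph_url("entities")
triples_url = build_graph_url("triples")
print(entities_url)
# -> http://localhost:7272/v2/entities?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09&entity_level=document
```

Fetching the results is then e.g. `requests.get(entities_url).json()`.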


## Entity Deduplication

Note that the entities and triples are created at the document level. This means that if you have multiple documents with the same entity, the entity will be duplicated for each document.

To deduplicate the entities, you can run the `deduplicate-entities` endpoint. This endpoint merges duplicate entities and deletes the redundant copies.

<Tabs>
<Tab title="CLI">
```bash
r2r deduplicate-entities --collection-id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09

# Example Response
[{'message': 'Deduplication task queued successfully.', 'task_id': 'd9dae1bb-5862-4a16-abaf-5297024df390'}]
```
</Tab>

<Tab title="SDK">
```python
from r2r import R2RClient

client = R2RClient("http://localhost:7272")
client.deduplicate_entities(collection_id="122fdf6a-e116-546b-a8f6-e4cb2e2c0a09")

# Example Response
[{'message': 'Deduplication task queued successfully.', 'task_id': 'd9dae1bb-5862-4a16-abaf-5297024df390'}]
```
</Tab>
</Tabs>

You can check the status of the deduplication task on the Hatchet dashboard at http://localhost:7274. Once it completes, query the endpoints with `entity_level=collection` to see the deduplicated entities and triples.

- Entities: [Entities](http://localhost:7272/v2/entities?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09&entity_level=collection)
- Triples: [Triples](http://localhost:7272/v2/triples?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09&entity_level=collection)

- Entities: [Entities](http://localhost:7272/v2/entities?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09)
- Triples: [Triples](http://localhost:7272/v2/triples?collection_id=122fdf6a-e116-546b-a8f6-e4cb2e2c0a09)

## Graph Enrichment

Now we have a searchable graph, but this graph is not enriched yet. We need to perform the graph enrichment step.
Now we have a searchable graph, but it is not enriched yet: it does not contain any community-level information. We will now run the enrichment step.

The graph enrichment step performs hierarchical Leiden clustering to create communities, and embeds their descriptions. These embeddings will be used later in the local search stage of the pipeline. For more detail on the algorithm, see the blog post [here](https://www.sciphi.ai/blog/graphrag).
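R2R's enrichment uses hierarchical Leiden clustering; as a rough illustration of the idea (not R2R's actual implementation), here is a minimal sketch that groups a toy entity graph into communities, using networkx's greedy modularity algorithm as a stand-in:

```python
# Illustrative sketch only: R2R's enrichment step uses hierarchical Leiden
# clustering; here networkx's greedy modularity communities stands in to
# show what "grouping entities into communities" means on a toy graph.
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

# Toy entity graph: two tightly-knit clusters joined by a single weak bridge.
g = nx.Graph()
g.add_edges_from([
    ("a", "b"), ("b", "c"), ("a", "c"),  # cluster 1
    ("x", "y"), ("y", "z"), ("x", "z"),  # cluster 2
    ("c", "x"),                          # bridge between clusters
])

communities = [set(c) for c in greedy_modularity_communities(g)]
print(communities)  # two communities: {'a', 'b', 'c'} and {'x', 'y', 'z'}
```

In the real pipeline each community then gets a generated description, whose embedding powers local search.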

10 changes: 10 additions & 0 deletions docs/documentation/cli/graph.mdx
@@ -32,6 +32,16 @@ r2r create-graph --collection-id my-collection --run --kg-creation-settings '{"k
</Accordion>
</AccordionGroup>

### Deduplicate Entities

Deduplicate entities in a collection using the `deduplicate-entities` command:

```bash
r2r deduplicate-entities --collection-id my-collection --run --deduplication-settings '{"key": "value"}'
```



### Enrich Graph

Enrich an existing knowledge graph using the `enrich-graph` command:
4 changes: 3 additions & 1 deletion docs/mint.json
@@ -317,10 +317,12 @@
"group": "Knowledge Graph",
"pages": [
"api-reference/endpoint/create_graph",
"api-reference/endpoint/deduplicate_entities",
"api-reference/endpoint/enrich_graph",
"api-reference/endpoint/entities",
"api-reference/endpoint/triples",
"api-reference/endpoint/communities"
"api-reference/endpoint/communities",
"api-reference/endpoint/delete_entities_and_triples"
]
},
{
8 changes: 6 additions & 2 deletions py/core/main/api/ingestion_router.py
@@ -341,11 +341,11 @@ async def create_vector_index_app(
),
index_method: IndexMethod = Body(
default=IndexMethod.hnsw,
description="The type of vector index to create.",
description="The type of vector index to create. Supported values are 'hnsw' and 'ivfflat'.",
),
measure: IndexMeasure = Body(
default=IndexMeasure.cosine_distance,
description="The measure for the index.",
description="The distance measure corresponding to the vector index. Used for calculating the distance between vectors during search.",
),
index_arguments: Optional[
Union[IndexArgsIVFFlat, IndexArgsHNSW]
@@ -363,6 +363,10 @@
),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
) -> WrappedCreateVectorIndexResponse:
"""
Create a vector index for a given table.
"""

logger.info(
f"Creating vector index for {table_name} with method {index_method}, measure {measure}, concurrently {concurrently}"
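To make the parameters above concrete, here is a sketch of a request body this endpoint could accept, following the parameter descriptions in the diff. The `table_name` value and the HNSW argument keys are assumptions for illustration, not confirmed API surface:

```python
# Sketch of a create_vector_index request body, based on the parameter
# descriptions in the diff above. Field values marked "assumed" are
# illustrative guesses, not confirmed API surface.
payload = {
    "table_name": "vectors",          # table to index (assumed value)
    "index_method": "hnsw",           # supported: 'hnsw' or 'ivfflat'
    "measure": "cosine_distance",     # distance measure used during search
    "index_arguments": {"m": 16, "ef_construction": 64},  # HNSW tuning (assumed keys)
    "concurrently": True,             # build without locking the table
}
print(payload["index_method"])  # -> hnsw
```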
20 changes: 20 additions & 0 deletions py/core/main/api/kg_router.py
@@ -379,3 +379,23 @@ async def deduplicate_entities(
return await self.orchestration_provider.run_workflow( # type: ignore
"entity-deduplication", {"request": workflow_input}, {}
)

@self.router.delete("/delete_graph_for_collection")
@self.base_endpoint
async def delete_graph_for_collection(
collection_id: UUID = Body(..., description="Collection ID to delete graph for."),
cascade: bool = Body(default=False, description="Whether to cascade the deletion, and delete entities and triples belonging to the collection."),
auth_user=Depends(self.service.providers.auth.auth_wrapper),
):
"""
Delete the graph for a given collection. Note that this endpoint may delete a large amount of data created by the KG pipeline; the deletion is irreversible, and recreating the graph may be an expensive operation.

Notes:
The endpoint deletes all communities for a given collection. If the cascade flag is set to true, the endpoint also deletes all entities and triples associated with the collection.

WARNING: Setting this flag to true will delete entities and triples for documents that are shared across multiple collections. Do not set this flag unless you are absolutely sure that you want to delete the entities and triples for all documents in the collection.
"""
if not auth_user.is_superuser:
logger.warning("Implement permission checks here.")
return await self.service.delete_graph_for_collection(collection_id, cascade)
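A minimal sketch of how a client might assemble a call to this endpoint; the helper only builds the request pieces (method, URL, body) rather than sending them, and the host/port are assumed from the local setup used elsewhere in these docs:

```python
# Sketch: assembling a DELETE /v2/delete_graph_for_collection request.
# Host/port are assumed (local R2R setup); sending the request, e.g. with
# `requests`, is left to the caller. cascade=True also removes the
# collection's entities and triples, which is irreversible.
def build_delete_graph_request(collection_id: str, cascade: bool = False):
    """Return (method, url, json_body) for the delete-graph endpoint."""
    url = "http://localhost:7272/v2/delete_graph_for_collection"
    body = {"collection_id": collection_id, "cascade": cascade}
    return "DELETE", url, body

method, url, body = build_delete_graph_request(
    "122fdf6a-e116-546b-a8f6-e4cb2e2c0a09"
)
print(method, body["cascade"])  # -> DELETE False
```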
17 changes: 17 additions & 0 deletions py/core/main/services/kg_service.py
@@ -423,3 +423,20 @@ async def kg_entity_deduplication_summary(
)

return await _collect_results(deduplication_summary_results)


@telemetry_event("delete_communities")
async def delete_communities(
self,
collection_id: UUID,
community_numbers: list[int],
):
return await self.providers.kg.delete_communities(collection_id, community_numbers)


@telemetry_event("delete_entities_and_triples")
async def delete_entities_and_triples(
self,
document_id: UUID,
):
return await self.providers.kg.delete_entities_and_triples(document_id)
7 changes: 5 additions & 2 deletions py/core/providers/database/vector.py
@@ -534,8 +534,11 @@ async def create_index(
if table_name == VectorTableName.RAW_CHUNKS:
table_name_str = f"{self.project_name}.{VectorTableName.RAW_CHUNKS}" # TODO - Fix bug in vector table naming convention
col_name = "vec"
elif table_name == VectorTableName.ENTITIES:
table_name_str = f"{self.project_name}.{VectorTableName.ENTITIES}"
elif table_name == VectorTableName.ENTITIES_DOCUMENT:
table_name_str = f"{self.project_name}.{VectorTableName.ENTITIES_DOCUMENT}"
col_name = "description_embedding"
elif table_name == VectorTableName.ENTITIES_COLLECTION:
table_name_str = f"{self.project_name}.{VectorTableName.ENTITIES_COLLECTION}"
col_name = "description_embedding"
elif table_name == VectorTableName.COMMUNITIES:
table_name_str = (
3 changes: 2 additions & 1 deletion py/core/providers/kg/postgres.py
@@ -834,6 +834,7 @@ async def delete_graph_for_collection(
DELETE FROM {self._get_table_name("entity_raw")} WHERE document_id = ANY($1);
DELETE FROM {self._get_table_name("triple_raw")} WHERE document_id = ANY($1);
DELETE FROM {self._get_table_name("entity_embedding")} WHERE document_id = ANY($1);
DELETE FROM {self._get_table_name("entity_collection")} WHERE document_id = ANY($1);
"""

await self.execute_query(QUERY, [document_ids])
@@ -1301,4 +1302,4 @@ async def update_entity_descriptions(self, entities: list[Entity]):
for entity in entities
]

await self.execute_many(query, inputs) # type: ignore
await self.execute_many(query, inputs) # type: ignore
6 changes: 3 additions & 3 deletions py/shared/abstractions/vector.py
@@ -107,9 +107,9 @@ class VectorTableName(str, Enum):
# {r2r_project_name}.{r2r_project_name} due to a bug in the vector class.
"""

RAW_CHUNKS = "raw_chunks"
ENTITIES = "entity_embedding"
ENTITIES_DEDUPLICATED = "entity_deduplicated"
RAW_CHUNKS = "vectors"
ENTITIES_DOCUMENT = "entity_embedding"
ENTITIES_COLLECTION = "entity_collection"
# TODO: Add support for triples
# TRIPLES = "triple_raw"
COMMUNITIES = "community_report"
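For illustration, the `create_index` branches in `vector.py` above imply the following mapping from table name to embedding column. This is a sketch of that mapping as a plain dict, not code from the repository:

```python
# Mapping implied by the create_index branches in vector.py: each
# VectorTableName value resolves to a concrete table and the column
# holding its embeddings. Shown as a plain dict for illustration only.
TABLE_COLUMNS = {
    "vectors": "vec",                              # RAW_CHUNKS
    "entity_embedding": "description_embedding",   # ENTITIES_DOCUMENT
    "entity_collection": "description_embedding",  # ENTITIES_COLLECTION
}

for table, col in TABLE_COLUMNS.items():
    print(f"{table}.{col}")
```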
