Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

H-3162: Look for matching entities in database when inserting entities from Flow #6102

Merged
merged 13 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"javascriptreact",
"typescript",
"typescriptreact"
]
],
"vitest.disableWorkspaceWarning": true
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be great to address this as some point. Previously flagged here (internal link).

}
1 change: 1 addition & 0 deletions apps/hash-ai-worker-ts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ The service uses the following environment variables:

### Run the worker

- To use actions which require Google Cloud Platform, you must run `gcloud auth application-default login` before starting the worker.
- Ensure the environment variables above are set, either in `.env.local` or in your shell.
- Install dependencies:
- `yarn`
Expand Down
1 change: 1 addition & 0 deletions apps/hash-ai-worker-ts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"start:healthcheck": "wait-on --timeout 120000 http-get://localhost:4100/health",
"start:test": "NODE_ENV=test NODE_OPTIONS=--max-old-space-size=2048 node ./dist/main.js",
"start:test:healthcheck": "wait-on --timeout 120000 http-get://localhost:4100/health",
"temporal:clean": "temporal workflow terminate --query \"TaskQueue='ai'\" --reason=\"Batch terminate from CLI\"",
"test:unit": "vitest --run --exclude \"**/*.ai.test.ts\""
},
"dependencies": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ type ModelResponseArgs = {

const maximumIterations = 10;

const model: PermittedOpenAiModel = "gpt-4o";
const model: PermittedOpenAiModel = "gpt-4o-2024-08-06";

const callModel = async (
messages: OpenAI.ChatCompletionCreateParams["messages"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,11 @@ export const persistEntitiesAction: FlowActionActivity = async ({ inputs }) => {
> = {};

/**
* We could potentially parallelize the creation of (a) non-link entities and (b) link entities,
* We could potentially parallelize the creation of (a) non-link entities and then (b) link entities in batches,
* if performance of this function becomes an issue.
*
* We need to create the links after all the non-links as the ids of the links may change,
* if an existing entity is found to update rather than a new one with the localId being created.
*/
for (const unresolvedEntity of entitiesWithDependenciesSortedLast) {
const {
Expand Down Expand Up @@ -188,7 +191,9 @@ export const persistEntitiesAction: FlowActionActivity = async ({ inputs }) => {
if (!output) {
failedEntitiesByLocalId[unresolvedEntity.localEntityId] = {
proposedEntity: unresolvedEntity,
message: `No outputs returned when attempting to persist entity`,
message:
persistedEntityOutputs.message ??
`No outputs returned when attempting to persist entity`,
};
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ import { backOff } from "exponential-backoff";
import { getAiAssistantAccountIdActivity } from "../get-ai-assistant-account-id-activity.js";
import { extractErrorMessage } from "../infer-entities/shared/extract-validation-failure-details.js";
import { createInferredEntityNotification } from "../shared/create-inferred-entity-notification.js";
// import {
// findExistingEntity,
// findExistingLinkEntity,
// } from "../shared/find-existing-entity";
import {
findExistingEntity,
findExistingLinkEntity,
} from "../shared/find-existing-entity.js";
import { getFlowContext } from "../shared/get-flow-context.js";
import { graphApiClient } from "../shared/graph-api-client.js";
import { logProgress } from "../shared/log-progress.js";
import type { MatchedEntityUpdate } from "../shared/match-existing-entity.js";
import { createFileEntityFromUrl } from "./shared/create-file-entity-from-url.js";
import {
getEntityUpdate,
Expand Down Expand Up @@ -125,7 +126,7 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => {
: undefined;

let entity: Entity;
let existingEntity: Entity | undefined;
let matchedEntityUpdate: MatchedEntityUpdate<Entity> | null = null;
let operation: "create" | "update";

if (isFileEntity && fileUrl) {
Expand All @@ -143,95 +144,67 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => {
return {
code: StatusCode.Internal,
message: createFileEntityFromUrlStatus.message,
contents: [
{
outputs: [
{
outputName:
"persistedEntity" as OutputNameForAction<"persistEntity">,
payload: {
kind: "PersistedEntity",
value: { operation },
},
},
],
},
],
contents: [],
};
}

const { entity: updatedEntity } = createFileEntityFromUrlStatus;

entity = updatedEntity;
} else {
/**
* @todo: improve the logic for finding existing entities, to
* reduce the number of false positives.
*/
// existingEntity = await (linkData
// ? findExistingLinkEntity({
// actorId,
// graphApiClient,
// ownedById,
// linkData,
// includeDrafts: createEditionAsDraft,
// })
// : findExistingEntity({
// actorId,
// graphApiClient,
// ownedById,
// proposedEntity: proposedEntityWithResolvedLinks,
// includeDrafts: createEditionAsDraft,
// }));
matchedEntityUpdate = await (linkData
? /**
* @todo H-3883 ensure that the creation of a new link will not violate min/max links on an entity
*/
findExistingLinkEntity({
actorId,
graphApiClient,
ownedById,
linkData,
proposedEntity: proposedEntityWithResolvedLinks,
includeDrafts: createEditionAsDraft,
})
: findExistingEntity({
actorId,
graphApiClient,
ownedById,
proposedEntity: proposedEntityWithResolvedLinks,
includeDrafts: createEditionAsDraft,
}));

operation = existingEntity ? "update" : "create";
operation = matchedEntityUpdate ? "update" : "create";

try {
if (existingEntity) {
const { existingEntityIsDraft, isExactMatch, patchOperations } =
getEntityUpdate({
existingEntity,
newProperties: mergePropertyObjectAndMetadata(
properties,
undefined,
),
});

const serializedEntity = existingEntity.toJSON();

if (isExactMatch) {
return {
code: StatusCode.Ok,
contents: [
{
outputs: [
{
outputName:
"persistedEntity" as OutputNameForAction<"persistEntity">,
payload: {
kind: "PersistedEntity",
value: {
entity: serializedEntity,
existingEntity: serializedEntity,
operation: "already-exists-as-proposed",
},
},
},
],
},
],
};
}
if (matchedEntityUpdate) {
const { existingEntityIsDraft, patchOperations } = getEntityUpdate({
existingEntity: matchedEntityUpdate.existingEntity,
newPropertiesWithMetadata: mergePropertyObjectAndMetadata(
matchedEntityUpdate.newValues.properties,
matchedEntityUpdate.newValues.propertyMetadata,
),
});

/**
* In practice we don't reassign matchedEntityUpdate anywhere below it doesn't harm to make sure it will always
* be the same thing in the backOff function.
*/
const stableReferenceToMatchedEntity = matchedEntityUpdate;

entity = await backOff(
() =>
existingEntity.patch(
stableReferenceToMatchedEntity.existingEntity.patch(
graphApiClient,
{ actorId: webBotActorId },
{
...entityValues,
entityTypeIds:
stableReferenceToMatchedEntity.newValues.entityTypeIds,
draft: existingEntityIsDraft ? true : createEditionAsDraft,
propertyPatches: patchOperations,
provenance: {
...entityValues.provenance,
sources:
stableReferenceToMatchedEntity.newValues.editionSources,
},
},
),
{
Expand Down Expand Up @@ -267,30 +240,14 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => {
return {
code: StatusCode.Internal,
message: `Could not persist entity: ${extractErrorMessage(err)}`,
contents: [
{
outputs: [
{
outputName:
"persistedEntity" as OutputNameForAction<"persistEntity">,
payload: {
kind: "PersistedEntity",
value: {
existingEntity: existingEntity?.toJSON(),
operation,
},
},
},
],
},
],
contents: [],
};
}
}

const persistedEntity = {
entity: entity.toJSON(),
existingEntity: existingEntity?.toJSON(),
existingEntity: matchedEntityUpdate?.existingEntity.toJSON(),
operation,
} satisfies PersistedEntity;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,11 +277,7 @@ const baseDirectoryPath = path.join(
test(
"Extract links form text system prompt test",
async () => {
const models: LlmParams["model"][] = [
// "claude-3-5-sonnet-20240620",
"claude-3-haiku-20240307",
// "gpt-4o",
];
const models: LlmParams["model"][] = ["claude-3-haiku-20240307"];

await optimizeSystemPrompt({
models,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { EntityId } from "@local/hash-graph-types/entity";
import { sleep } from "@local/hash-isomorphic-utils/sleep";
import dedent from "dedent";

import { logger } from "../../../shared/activity-logger.js";
Expand Down Expand Up @@ -194,6 +195,9 @@ export const deduplicateEntities = async (params: {
}

logger.error(`Error deduplicating entities: ${llmResponse.status}`);

await sleep(2_000);

return deduplicateEntities(params);
}

Expand Down
Loading
Loading