diff --git a/.vscode/settings.json b/.vscode/settings.json index a07ac294966..8f15f045f0a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,6 @@ "javascriptreact", "typescript", "typescriptreact" - ] + ], + "vitest.disableWorkspaceWarning": true } diff --git a/apps/hash-ai-worker-ts/README.md b/apps/hash-ai-worker-ts/README.md index fd9d5ee4767..c5b73a43c03 100644 --- a/apps/hash-ai-worker-ts/README.md +++ b/apps/hash-ai-worker-ts/README.md @@ -25,6 +25,7 @@ The service uses the following environment variables: ### Run the worker +- To use actions which require Google Cloud Platform, you must run `gcloud auth application-default login` before starting the worker. - Ensure the environment variables above are set, either in `.env.local` or in your shell. - Install dependencies: - `yarn` diff --git a/apps/hash-ai-worker-ts/package.json b/apps/hash-ai-worker-ts/package.json index e78a761becc..b1c9872955e 100644 --- a/apps/hash-ai-worker-ts/package.json +++ b/apps/hash-ai-worker-ts/package.json @@ -38,6 +38,7 @@ "start:healthcheck": "wait-on --timeout 120000 http-get://localhost:4100/health", "start:test": "NODE_ENV=test NODE_OPTIONS=--max-old-space-size=2048 node ./dist/main.js", "start:test:healthcheck": "wait-on --timeout 120000 http-get://localhost:4100/health", + "temporal:clean": "temporal workflow terminate --query \"TaskQueue='ai'\" --reason=\"Batch terminate from CLI\"", "test:unit": "vitest --run --exclude \"**/*.ai.test.ts\"" }, "dependencies": { diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts index e0c9286d872..815dcb04660 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts @@ -151,7 +151,7 @@ type ModelResponseArgs = { const maximumIterations = 10; -const model: PermittedOpenAiModel = 
"gpt-4o"; +const model: PermittedOpenAiModel = "gpt-4o-2024-08-06"; const callModel = async ( messages: OpenAI.ChatCompletionCreateParams["messages"], diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts index a576730fedc..668d04b4010 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts @@ -73,8 +73,11 @@ export const persistEntitiesAction: FlowActionActivity = async ({ inputs }) => { > = {}; /** - * We could potentially parallelize the creation of (a) non-link entities and (b) link entities, + * We could potentially parallelize the creation of (a) non-link entities and then (b) link entities in batches, * if performance of this function becomes an issue. + * + * We need to create the links after all the non-links as the ids of the links may change, + * if an existing entity is found to update rather than a new one with the localId being created. */ for (const unresolvedEntity of entitiesWithDependenciesSortedLast) { const { @@ -188,7 +191,9 @@ export const persistEntitiesAction: FlowActionActivity = async ({ inputs }) => { if (!output) { failedEntitiesByLocalId[unresolvedEntity.localEntityId] = { proposedEntity: unresolvedEntity, - message: `No outputs returned when attempting to persist entity`, + message: + persistedEntityOutputs.message ?? 
+ `No outputs returned when attempting to persist entity`, }; continue; } diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts index 389f2d2b609..1968d38bd18 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts @@ -27,13 +27,14 @@ import { backOff } from "exponential-backoff"; import { getAiAssistantAccountIdActivity } from "../get-ai-assistant-account-id-activity.js"; import { extractErrorMessage } from "../infer-entities/shared/extract-validation-failure-details.js"; import { createInferredEntityNotification } from "../shared/create-inferred-entity-notification.js"; -// import { -// findExistingEntity, -// findExistingLinkEntity, -// } from "../shared/find-existing-entity"; +import { + findExistingEntity, + findExistingLinkEntity, +} from "../shared/find-existing-entity.js"; import { getFlowContext } from "../shared/get-flow-context.js"; import { graphApiClient } from "../shared/graph-api-client.js"; import { logProgress } from "../shared/log-progress.js"; +import type { MatchedEntityUpdate } from "../shared/match-existing-entity.js"; import { createFileEntityFromUrl } from "./shared/create-file-entity-from-url.js"; import { getEntityUpdate, @@ -125,7 +126,7 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => { : undefined; let entity: Entity; - let existingEntity: Entity | undefined; + let matchedEntityUpdate: MatchedEntityUpdate | null = null; let operation: "create" | "update"; if (isFileEntity && fileUrl) { @@ -143,20 +144,7 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => { return { code: StatusCode.Internal, message: createFileEntityFromUrlStatus.message, - contents: [ - { - outputs: [ - { - outputName: - "persistedEntity" as 
OutputNameForAction<"persistEntity">, - payload: { - kind: "PersistedEntity", - value: { operation }, - }, - }, - ], - }, - ], + contents: [], }; } @@ -164,74 +152,59 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => { entity = updatedEntity; } else { - /** - * @todo: improve the logic for finding existing entities, to - * reduce the number of false positives. - */ - // existingEntity = await (linkData - // ? findExistingLinkEntity({ - // actorId, - // graphApiClient, - // ownedById, - // linkData, - // includeDrafts: createEditionAsDraft, - // }) - // : findExistingEntity({ - // actorId, - // graphApiClient, - // ownedById, - // proposedEntity: proposedEntityWithResolvedLinks, - // includeDrafts: createEditionAsDraft, - // })); + matchedEntityUpdate = await (linkData + ? /** + * @todo H-3883 ensure that the creation of a new link will not violate min/max links on an entity + */ + findExistingLinkEntity({ + actorId, + graphApiClient, + ownedById, + linkData, + proposedEntity: proposedEntityWithResolvedLinks, + includeDrafts: createEditionAsDraft, + }) + : findExistingEntity({ + actorId, + graphApiClient, + ownedById, + proposedEntity: proposedEntityWithResolvedLinks, + includeDrafts: createEditionAsDraft, + })); - operation = existingEntity ? "update" : "create"; + operation = matchedEntityUpdate ? 
"update" : "create"; try { - if (existingEntity) { - const { existingEntityIsDraft, isExactMatch, patchOperations } = - getEntityUpdate({ - existingEntity, - newProperties: mergePropertyObjectAndMetadata( - properties, - undefined, - ), - }); - - const serializedEntity = existingEntity.toJSON(); - - if (isExactMatch) { - return { - code: StatusCode.Ok, - contents: [ - { - outputs: [ - { - outputName: - "persistedEntity" as OutputNameForAction<"persistEntity">, - payload: { - kind: "PersistedEntity", - value: { - entity: serializedEntity, - existingEntity: serializedEntity, - operation: "already-exists-as-proposed", - }, - }, - }, - ], - }, - ], - }; - } + if (matchedEntityUpdate) { + const { existingEntityIsDraft, patchOperations } = getEntityUpdate({ + existingEntity: matchedEntityUpdate.existingEntity, + newPropertiesWithMetadata: mergePropertyObjectAndMetadata( + matchedEntityUpdate.newValues.properties, + matchedEntityUpdate.newValues.propertyMetadata, + ), + }); + + /** + * In practice we don't reassign matchedEntityUpdate anywhere below it doesn't harm to make sure it will always + * be the same thing in the backOff function. + */ + const stableReferenceToMatchedEntity = matchedEntityUpdate; entity = await backOff( () => - existingEntity.patch( + stableReferenceToMatchedEntity.existingEntity.patch( graphApiClient, { actorId: webBotActorId }, { - ...entityValues, + entityTypeIds: + stableReferenceToMatchedEntity.newValues.entityTypeIds, draft: existingEntityIsDraft ? 
true : createEditionAsDraft, propertyPatches: patchOperations, + provenance: { + ...entityValues.provenance, + sources: + stableReferenceToMatchedEntity.newValues.editionSources, + }, }, ), { @@ -267,30 +240,14 @@ export const persistEntityAction: FlowActionActivity = async ({ inputs }) => { return { code: StatusCode.Internal, message: `Could not persist entity: ${extractErrorMessage(err)}`, - contents: [ - { - outputs: [ - { - outputName: - "persistedEntity" as OutputNameForAction<"persistEntity">, - payload: { - kind: "PersistedEntity", - value: { - existingEntity: existingEntity?.toJSON(), - operation, - }, - }, - }, - ], - }, - ], + contents: [], }; } } const persistedEntity = { entity: entity.toJSON(), - existingEntity: existingEntity?.toJSON(), + existingEntity: matchedEntityUpdate?.existingEntity.toJSON(), operation, } satisfies PersistedEntity; diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent/choose-relevant-links-from-content.optimize.ai.test.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent/choose-relevant-links-from-content.optimize.ai.test.ts index 5db2adeb782..7950771f079 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent/choose-relevant-links-from-content.optimize.ai.test.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent/choose-relevant-links-from-content.optimize.ai.test.ts @@ -277,11 +277,7 @@ const baseDirectoryPath = path.join( test( "Extract links form text system prompt test", async () => { - const models: LlmParams["model"][] = [ - // "claude-3-5-sonnet-20240620", - "claude-3-haiku-20240307", - // "gpt-4o", - ]; + const models: LlmParams["model"][] = ["claude-3-haiku-20240307"]; await optimizeSystemPrompt({ models, diff --git 
a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/shared/deduplicate-entities.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/shared/deduplicate-entities.ts index 2862f683f5f..5eec37fbbac 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/shared/deduplicate-entities.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/shared/deduplicate-entities.ts @@ -1,4 +1,5 @@ import type { EntityId } from "@local/hash-graph-types/entity"; +import { sleep } from "@local/hash-isomorphic-utils/sleep"; import dedent from "dedent"; import { logger } from "../../../shared/activity-logger.js"; @@ -194,6 +195,9 @@ export const deduplicateEntities = async (params: { } logger.error(`Error deduplicating entities: ${llmResponse.status}`); + + await sleep(2_000); + return deduplicateEntities(params); } diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/graph-requests.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/graph-requests.ts index 2c8c3f71008..87b78dd106a 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/graph-requests.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/graph-requests.ts @@ -1,6 +1,5 @@ import { typedEntries } from "@local/advanced-types/typed-entries"; import type { GraphApi } from "@local/hash-graph-client"; -import type { Entity } from "@local/hash-graph-sdk/entity"; import type { AccountId } from "@local/hash-graph-types/account"; import type { EntityId, @@ -11,6 +10,7 @@ import { currentTimeInstantTemporalAxes, zeroedGraphResolveDepths, } from "@local/hash-isomorphic-utils/graph-queries"; +import { deduplicateSources } from "@local/hash-isomorphic-utils/provenance"; import { mapGraphApiSubgraphToSubgraph } from "@local/hash-isomorphic-utils/subgraph-mapping"; import type { EntityRootType } from "@local/hash-subgraph"; import { @@ -19,7 
+19,8 @@ import { } from "@local/hash-subgraph"; import { getRoots } from "@local/hash-subgraph/stdlib"; import isEqual from "lodash.isequal"; -import isMatch from "lodash.ismatch"; + +import type { ExistingEntityForMatching } from "../../shared/match-existing-entity.js"; /** * @todo: move the primitive node helper methods from the Node API into a shared @@ -82,30 +83,74 @@ export const getLatestEntityById = async (params: { export const getEntityUpdate = ({ existingEntity, - newProperties, + newPropertiesWithMetadata, }: { - existingEntity: Entity; - newProperties: T; + existingEntity: ExistingEntityForMatching; + newPropertiesWithMetadata: T; }) => { const patchOperations: PropertyPatchOperation[] = []; - const isExactMatch = isMatch(existingEntity.properties, newProperties); - - if (!isExactMatch) { - for (const [key, property] of typedEntries(newProperties.value)) { - // @todo better handle property objects, will currently overwrite the entire object if there are any differences - if (!isEqual(existingEntity.properties[key], property)) { - patchOperations.push({ - op: existingEntity.properties[key] ? "replace" : "add", - path: [key], - property, - }); - } + let isExactMatch = true; + + for (const [key, propertyWithMetadata] of typedEntries( + newPropertiesWithMetadata.value, + )) { + if (!existingEntity.properties[key]) { + isExactMatch = false; } + + const newPropertySources = + propertyWithMetadata.metadata?.provenance?.sources; + + const existingPropertySources = + existingEntity.propertiesMetadata.value[key]?.metadata?.provenance + ?.sources; + + let sourcesToApply = newPropertySources; + + /** + * This equality check is comparing the value of the properties object on the existingEntity + * with PropertyWithMetadata["value"] on the new input, + * and will always return false for array or object values (because the first is the value only, the second contains metadata). 
+ * @todo H-3900: better handle property objects + */ + if (isEqual(existingEntity.properties[key], propertyWithMetadata.value)) { + /** + * If the values are equal, we can merge the sources from the existing and new properties, + * to capture the fact that we have now seen the value in multiple places. + * This only works for primitive values (see comment above about isEqual check). + */ + sourcesToApply = deduplicateSources([ + ...(existingPropertySources ?? []), + ...(newPropertySources ?? []), + ]); + } else { + isExactMatch = false; + } + + const clonedProperty = JSON.parse( + JSON.stringify(propertyWithMetadata), + ) as typeof propertyWithMetadata; + + if (sourcesToApply?.length) { + clonedProperty.metadata ??= {}; + clonedProperty.metadata.provenance ??= {}; + clonedProperty.metadata.provenance.sources = sourcesToApply; + } + + patchOperations.push({ + op: existingEntity.properties[key] ? "replace" : "add", + path: [key], + /** + * @todo H-3900: consider merging property objects (e.g. if existingEntity has one nested field defined) + * - the entire object will currently be overwritten with the new input. 
+ */ + property: clonedProperty, + }); } const existingEntityIsDraft = !!extractDraftIdFromEntityId( - existingEntity.metadata.recordId.entityId, + existingEntity.entityId, ); return { diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts index bf5d6054c38..b51df30f734 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts @@ -406,7 +406,7 @@ export const writeGoogleSheetAction: FlowActionActivity<{ const { existingEntityIsDraft, isExactMatch, patchOperations } = getEntityUpdate({ existingEntity, - newProperties: fileProperties, + newPropertiesWithMetadata: fileProperties, }); if (isExactMatch) { diff --git a/apps/hash-ai-worker-ts/src/activities/shared/find-existing-entity.ts b/apps/hash-ai-worker-ts/src/activities/shared/find-existing-entity.ts index 5720dc4b2b0..359744b389b 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/find-existing-entity.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/find-existing-entity.ts @@ -16,7 +16,11 @@ import { generateVersionedUrlMatchingFilter, zeroedGraphResolveDepths, } from "@local/hash-isomorphic-utils/graph-queries"; -import { mapGraphApiSubgraphToSubgraph } from "@local/hash-isomorphic-utils/subgraph-mapping"; +import { deduplicateSources } from "@local/hash-isomorphic-utils/provenance"; +import { + mapGraphApiEntityToEntity, + mapGraphApiSubgraphToSubgraph, +} from "@local/hash-isomorphic-utils/subgraph-mapping"; import type { EntityTypeRootType } from "@local/hash-subgraph"; import { extractEntityUuidFromEntityId, @@ -28,7 +32,10 @@ import { logger } from "./activity-logger.js"; import type { DereferencedEntityType } from "./dereference-entity-type.js"; import { dereferenceEntityType } from "./dereference-entity-type.js"; import { createEntityEmbeddings } 
from "./embeddings.js"; -import { getEntityByFilter } from "./get-entity-by-filter.js"; +import { + type MatchedEntityUpdate, + matchExistingEntity, +} from "./match-existing-entity.js"; export const findExistingEntity = async ({ actorId, @@ -42,9 +49,12 @@ export const findExistingEntity = async ({ dereferencedEntityTypes?: DereferencedEntityType[]; graphApiClient: GraphApi; ownedById: OwnedById; - proposedEntity: Pick; + proposedEntity: Pick< + ProposedEntity, + "entityTypeIds" | "properties" | "propertyMetadata" | "provenance" + >; includeDrafts: boolean; -}): Promise => { +}): Promise | null> => { const entityTypes: DereferencedEntityType[] = dereferencedEntityTypes ?? (await graphApiClient @@ -194,25 +204,30 @@ export const findExistingEntity = async ({ }) .filter((filter: T): filter is NonNullable => filter !== null); - let existingEntity: Entity | undefined; + let potentialMatches: Entity[] | undefined; if (semanticDistanceFilters.length > 0) { - existingEntity = await getEntityByFilter({ - actorId, - graphApiClient, - filter: { - all: [ - ...existingEntityBaseAllFilter, - { - any: semanticDistanceFilters, - }, - ], - }, - includeDrafts, - }); + potentialMatches = await graphApiClient + .getEntities(actorId, { + filter: { + all: [ + ...existingEntityBaseAllFilter, + { + any: semanticDistanceFilters, + }, + ], + }, + temporalAxes: currentTimeInstantTemporalAxes, + includeDrafts, + }) + .then(({ data: response }) => + response.entities + .slice(0, 3) + .map((entity) => mapGraphApiEntityToEntity(entity, actorId)), + ); } - if (!existingEntity) { + if (!potentialMatches?.length) { // If we didn't find a match on individual properties, try matching on the entire properties object const propertyObjectEmbedding = embeddings.find( (embedding) => !embedding.property, @@ -221,100 +236,221 @@ export const findExistingEntity = async ({ if (!propertyObjectEmbedding) { logger.error(`Could not find embedding for properties object – skipping`); } else { - existingEntity = 
await getEntityByFilter({ - actorId, - graphApiClient, - filter: { - all: [ - ...existingEntityBaseAllFilter, - { - cosineDistance: [ - { path: ["embedding"] }, - { - parameter: propertyObjectEmbedding.embedding, - }, - { parameter: maximumSemanticDistance }, - ], - }, - ], - }, - includeDrafts, - }); + potentialMatches = await graphApiClient + .getEntities(actorId, { + filter: { + all: [ + ...existingEntityBaseAllFilter, + { + cosineDistance: [ + { path: ["embedding"] }, + { + parameter: propertyObjectEmbedding.embedding, + }, + { parameter: maximumSemanticDistance }, + ], + }, + ], + }, + temporalAxes: currentTimeInstantTemporalAxes, + includeDrafts, + }) + .then(({ data: response }) => + response.entities + .slice(0, 3) + .map((entity) => mapGraphApiEntityToEntity(entity, actorId)), + ); } } - return existingEntity; + if (!potentialMatches?.length) { + return null; + } + + const match = await matchExistingEntity({ + isLink: false, + entities: { + newEntity: { + ...proposedEntity, + propertiesMetadata: proposedEntity.propertyMetadata, + editionSources: proposedEntity.provenance.sources ?? 
[], + }, + potentialMatches, + }, + }); + + return match; }; export const findExistingLinkEntity = async ({ actorId, graphApiClient, + includeDrafts, linkData, ownedById, - includeDrafts, + proposedEntity, }: { actorId: AccountId; graphApiClient: GraphApi; + includeDrafts: boolean; linkData: LinkData; ownedById: OwnedById; - includeDrafts: boolean; -}) => { - return await getEntityByFilter({ - actorId, - graphApiClient, - filter: { - all: [ - { equal: [{ path: ["archived"] }, { parameter: false }] }, - { - equal: [ - { path: ["ownedById"] }, - { - parameter: ownedById, - }, - ], - }, - { - equal: [ - { - path: ["leftEntity", "ownedById"], - }, - { - parameter: extractOwnedByIdFromEntityId(linkData.leftEntityId), - }, - ], - }, - { - equal: [ - { - path: ["leftEntity", "uuid"], - }, - { - parameter: extractEntityUuidFromEntityId(linkData.leftEntityId), - }, - ], - }, - { - equal: [ - { - path: ["rightEntity", "ownedById"], - }, - { - parameter: extractOwnedByIdFromEntityId(linkData.rightEntityId), - }, - ], - }, - { - equal: [ - { - path: ["rightEntity", "uuid"], - }, - { - parameter: extractEntityUuidFromEntityId(linkData.rightEntityId), - }, - ], + proposedEntity: Pick< + ProposedEntity, + "entityTypeIds" | "properties" | "propertyMetadata" | "provenance" + >; +}): Promise | null> => { + const linksWithOverlappingTypes = await graphApiClient + .getEntities(actorId, { + filter: { + all: [ + { equal: [{ path: ["archived"] }, { parameter: false }] }, + { + any: proposedEntity.entityTypeIds.map((entityTypeId) => ({ + equal: [ + { path: ["type", "versionedUrl"] }, + { parameter: entityTypeId }, + ], + })), + }, + { + equal: [ + { path: ["ownedById"] }, + { + parameter: ownedById, + }, + ], + }, + { + equal: [ + { + path: ["leftEntity", "ownedById"], + }, + { + parameter: extractOwnedByIdFromEntityId(linkData.leftEntityId), + }, + ], + }, + { + equal: [ + { + path: ["leftEntity", "uuid"], + }, + { + parameter: extractEntityUuidFromEntityId(linkData.leftEntityId), + }, + 
], + }, + { + equal: [ + { + path: ["rightEntity", "ownedById"], + }, + { + parameter: extractOwnedByIdFromEntityId(linkData.rightEntityId), + }, + ], + }, + { + equal: [ + { + path: ["rightEntity", "uuid"], + }, + { + parameter: extractEntityUuidFromEntityId( + linkData.rightEntityId, + ), + }, + ], + }, + ], + }, + temporalAxes: currentTimeInstantTemporalAxes, + includeDrafts, + }) + .then(({ data }) => + data.entities.map((entity) => mapGraphApiEntityToEntity(entity, actorId)), + ); + + if (!linksWithOverlappingTypes.length) { + return null; + } + + const newInputHasNoProperties = + Object.keys(proposedEntity.properties).length === 0; + + if (newInputHasNoProperties) { + const newInputTypeSet = new Set(proposedEntity.entityTypeIds); + + /** + * If the new input has no properties, we look for an existing link with the same type(s) which also has no properties. + * If we find it, we will take it as a match, on the basis that the only meaningful information present (types) matches. + * We'll merge the sources listed for the edition to capture the fact that we inferred this link from multiple sources. + */ + const potentialMatchWithNoProperties = linksWithOverlappingTypes.find( + (entity) => { + if (Object.keys(entity.properties).length !== 0) { + return false; + } + + const potentialMatchTypeSet = new Set(entity.metadata.entityTypeIds); + + return ( + newInputTypeSet.size === potentialMatchTypeSet.size && + newInputTypeSet.isSupersetOf(potentialMatchTypeSet) + ); + }, + ); + + if (potentialMatchWithNoProperties) { + return { + existingEntity: potentialMatchWithNoProperties, + newValues: { + entityTypeIds: proposedEntity.entityTypeIds, + propertyMetadata: proposedEntity.propertyMetadata, + editionSources: deduplicateSources([ + ...(proposedEntity.provenance.sources ?? []), + ...(potentialMatchWithNoProperties.metadata.provenance.edition + .sources ?? 
[]), + ]), + properties: {}, }, - ], + }; + } else { + /** + * If all the existing links either have some properties or don't have the exact same set of types, + * we'll err on the safe side and not pick one to apply the new input as an update to. + */ + return null; + } + } + + /** + * If we've reached here, the input has some properties + */ + const potentialMatchesWithProperties = linksWithOverlappingTypes.filter( + (entity) => Object.keys(entity.properties).length > 0, + ); + + if (!potentialMatchesWithProperties.length) { + /** + * If none of the existing links have property values, + * we'll err on the safe side and not pick one to apply the new input as an update to. + */ + return null; + } + + const match = await matchExistingEntity({ + isLink: false, + entities: { + newEntity: { + ...proposedEntity, + propertiesMetadata: proposedEntity.propertyMetadata, + editionSources: proposedEntity.provenance.sources ?? [], + }, + potentialMatches: potentialMatchesWithProperties, }, - includeDrafts, }); + + return match; }; diff --git a/apps/hash-ai-worker-ts/src/activities/shared/match-existing-entity.optimize.ai.test.ts b/apps/hash-ai-worker-ts/src/activities/shared/match-existing-entity.optimize.ai.test.ts new file mode 100644 index 00000000000..dd10d661cad --- /dev/null +++ b/apps/hash-ai-worker-ts/src/activities/shared/match-existing-entity.optimize.ai.test.ts @@ -0,0 +1,438 @@ +import "../../shared/testing-utilities/mock-get-flow-context.js"; + +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import { typedEntries } from "@local/advanced-types/typed-entries"; +import type { ValueMetadata } from "@local/hash-graph-client"; +import type { + EntityId, + EntityUuid, + PropertyMetadataObject, +} from "@local/hash-graph-types/entity"; +import { brandPropertyObject } from "@local/hash-graph-types/entity"; +import type { OwnedById } from "@local/hash-graph-types/web"; +import { generateUuid } from "@local/hash-isomorphic-utils/generate-uuid"; 
+import { + blockProtocolDataTypes, + systemDataTypes, + systemEntityTypes, + systemPropertyTypes, +} from "@local/hash-isomorphic-utils/ontology-type-ids"; +import { stringifyPropertyValue } from "@local/hash-isomorphic-utils/stringify-property-value"; +import type { PersonProperties } from "@local/hash-isomorphic-utils/system-types/shared"; +import { entityIdFromComponents } from "@local/hash-subgraph"; +import { test } from "vitest"; + +import type { LlmParams } from "./get-llm-response/types.js"; +import type { MatchExistingEntityParams } from "./match-existing-entity.js"; +import { + matchExistingEntity, + matchExistingEntitySystemPrompt, +} from "./match-existing-entity.js"; +import { optimizeSystemPrompt } from "./optimize-system-prompt.js"; +import type { MetricDefinition } from "./optimize-system-prompt/types.js"; + +const emptyMetadataObject: PropertyMetadataObject = { + value: {}, +}; + +const testOwnedById = generateUuid() as OwnedById; + +const generateEntityId = () => + entityIdFromComponents(testOwnedById, generateUuid() as EntityUuid); + +type MatchExistingEntityTest = { + testName: string; + isLink: boolean; + inputData: MatchExistingEntityParams; + expectedMatchEntityId: EntityId | null; +}; + +const billGatesUuid = generateEntityId(); +const williamHenryGatesUuid = generateEntityId(); +const popGatesUuid = generateEntityId(); +const williamGatesBasketballUuid = generateEntityId(); + +const ceoStarted2020Uuid = generateEntityId(); +const ceoStarted2022Uuid = generateEntityId(); + +const matchTestData: MatchExistingEntityTest[] = [ + { + testName: "Person – Match expected", + expectedMatchEntityId: billGatesUuid, + isLink: false, + inputData: { + potentialMatches: [ + { + entityId: billGatesUuid, + metadata: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + provenance: { edition: { sources: [] } }, + }, + propertiesMetadata: emptyMetadataObject, + properties: brandPropertyObject({ + 
"https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "Bill Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "An American businessman and philanthropist best known for co-founding the software company Microsoft Corporation.", + }), + }, + { + entityId: popGatesUuid, + metadata: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + provenance: { edition: { sources: [] } }, + }, + propertiesMetadata: emptyMetadataObject, + properties: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "William Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "A professional basketball player, he was the first African American player signed to the National Basketball League.", + }), + }, + { + metadata: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + provenance: { edition: { sources: [] } }, + }, + entityId: williamGatesBasketballUuid, + propertiesMetadata: emptyMetadataObject, + properties: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "William Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "An American former college basketball player, subject of the 1994 documentary film Hoop Dreams.", + }), + }, + ], + newEntity: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + editionSources: [], + propertiesMetadata: { + value: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + { + metadata: { + dataTypeId: blockProtocolDataTypes.text.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + { + metadata: { + dataTypeId: blockProtocolDataTypes.text.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + }), + }, + properties: brandPropertyObject({ + 
"https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "William Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "He founded Microsoft in Albuquerque, New Mexico.", + }), + }, + }, + }, + { + testName: "Person – No match expected", + expectedMatchEntityId: null, + isLink: false, + inputData: { + potentialMatches: [ + { + entityId: williamHenryGatesUuid, + metadata: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + provenance: { edition: { sources: [] } }, + }, + propertiesMetadata: emptyMetadataObject, + properties: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "William Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "He founded Microsoft in Albuquerque, New Mexico.", + }), + }, + { + entityId: billGatesUuid, + metadata: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + provenance: { edition: { sources: [] } }, + }, + propertiesMetadata: emptyMetadataObject, + properties: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "Bill Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "An American businessman and philanthropist best known for co-founding the software company Microsoft Corporation.", + }), + }, + { + entityId: popGatesUuid, + metadata: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + provenance: { edition: { sources: [] } }, + }, + propertiesMetadata: emptyMetadataObject, + properties: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "William Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "A professional basketball player, he was the first African American player signed to the National Basketball League.", + }), + }, + ], + newEntity: { + entityTypeIds: [systemEntityTypes.person.entityTypeId], + 
editionSources: [], + propertiesMetadata: { + value: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + { + metadata: { + dataTypeId: blockProtocolDataTypes.text.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + { + metadata: { + dataTypeId: blockProtocolDataTypes.text.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + }), + }, + properties: brandPropertyObject({ + "https://blockprotocol.org/@blockprotocol/types/property-type/name/": + "William Gates", + "https://blockprotocol.org/@blockprotocol/types/property-type/description/": + "An American former college basketball player, subject of the 1994 documentary film Hoop Dreams. He never played professionally.", + }), + }, + }, + }, + { + testName: "Worked At – Match expected", + isLink: true, + expectedMatchEntityId: ceoStarted2020Uuid, + inputData: { + potentialMatches: [ + { + entityId: ceoStarted2020Uuid, + metadata: { + entityTypeIds: [ + "https://hash.ai/@hash/types/entity-type/worked-at/v/1", + ], + provenance: { edition: { sources: [] } }, + }, + properties: { + [systemPropertyTypes.appliesFrom.propertyTypeBaseUrl]: "2024-02-11", + [systemPropertyTypes.role.propertyTypeBaseUrl]: "CEO", + }, + propertiesMetadata: emptyMetadataObject, + }, + { + entityId: ceoStarted2022Uuid, + metadata: { + entityTypeIds: [ + "https://hash.ai/@hash/types/entity-type/worked-at/v/1", + ], + provenance: { edition: { sources: [] } }, + }, + properties: { + [systemPropertyTypes.role.propertyTypeBaseUrl]: "CEO", + }, + propertiesMetadata: emptyMetadataObject, + }, + ], + newEntity: { + editionSources: [], + entityTypeIds: [ + "https://hash.ai/@hash/types/entity-type/worked-at/v/1", + ], + properties: { + [systemPropertyTypes.appliesFrom.propertyTypeBaseUrl]: "2024-02-11", + }, + propertiesMetadata: { + value: brandPropertyObject({ + 
[systemPropertyTypes.appliesFrom.propertyTypeBaseUrl]: { + metadata: { + dataTypeId: systemDataTypes.date.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + }), + }, + }, + }, + }, + { + testName: "Worked At – No match expected", + expectedMatchEntityId: null, + isLink: true, + inputData: { + potentialMatches: [ + { + entityId: generateEntityId(), + metadata: { + entityTypeIds: [ + "https://hash.ai/@hash/types/entity-type/worked-at/v/1", + ], + provenance: { edition: { sources: [] } }, + }, + properties: { + [systemPropertyTypes.role.propertyTypeBaseUrl]: "CIO", + }, + propertiesMetadata: emptyMetadataObject, + }, + { + entityId: ceoStarted2022Uuid, + metadata: { + entityTypeIds: [ + "https://hash.ai/@hash/types/entity-type/worked-at/v/1", + ], + provenance: { edition: { sources: [] } }, + }, + properties: { + [systemPropertyTypes.role.propertyTypeBaseUrl]: "CEO", + [systemPropertyTypes.appliesFrom.propertyTypeBaseUrl]: "2011-04-22", + }, + propertiesMetadata: emptyMetadataObject, + }, + ], + newEntity: { + editionSources: [], + entityTypeIds: [ + "https://hash.ai/@hash/types/entity-type/worked-at/v/1", + ], + properties: { + [systemPropertyTypes.role.propertyTypeBaseUrl]: "CEO", + [systemPropertyTypes.appliesFrom.propertyTypeBaseUrl]: "2024-02-11", + }, + propertiesMetadata: { + value: brandPropertyObject({ + [systemPropertyTypes.role.propertyTypeBaseUrl]: { + metadata: { + dataTypeId: blockProtocolDataTypes.text.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + [systemPropertyTypes.appliesFrom.propertyTypeBaseUrl]: { + metadata: { + dataTypeId: systemDataTypes.date.dataTypeId, + provenance: { + sources: [], + }, + } satisfies ValueMetadata, + }, + }), + }, + }, + }, + }, +]; + +const metrics: MetricDefinition[] = matchTestData.map( + (testItem): MetricDefinition => { + return { + name: testItem.testName, + description: "", + executeMetric: async ({ testingParams }) => { + const { inputData, 
expectedMatchEntityId } = testItem; + + const match = await matchExistingEntity({ + entities: inputData, + isLink: testItem.isLink, + previousError: null, + testingParams, + }); + + const reportedMatchId = match?.existingEntity.entityId ?? null; + + const score = reportedMatchId === expectedMatchEntityId ? 1 : 0; + + let naturalLanguageReport = ""; + if (!score) { + if (!expectedMatchEntityId) { + naturalLanguageReport = `No match was expected, but a match with entityId ${reportedMatchId} found.`; + } else if (reportedMatchId) { + naturalLanguageReport = `Expected match with entityId ${expectedMatchEntityId} but LLM matched with entityId ${reportedMatchId}.`; + } else { + naturalLanguageReport = `Expected match with entityId ${expectedMatchEntityId} but no match found.`; + } + } else { + naturalLanguageReport = `Correctly matched with entityId ${expectedMatchEntityId}.`; + } + + const mergedProperties = match?.newValues.properties; + const inputProperties = inputData.newEntity.properties; + + const propertiesWithMergedValues = typedEntries( + mergedProperties ?? 
{}, + ).filter(([key, value]) => { + return inputProperties[key] !== value; + }); + + const additionalInfo = { + propertiesWithMergedValues: propertiesWithMergedValues.map( + ([key, value]) => `${key}: ${stringifyPropertyValue(value)}`, + ), + }; + + return { + score, + testingParams, + naturalLanguageReport, + additionalInfo, + }; + }, + }; + }, +); + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const baseDirectoryPath = path.join( + __dirname, + "/var/match-existing-entity-test", +); + +test( + "Match new entity with existing entity", + async () => { + const models: LlmParams["model"][] = [ + "claude-3-haiku-20240307", + "gpt-4o-2024-08-06", + ]; + + await optimizeSystemPrompt({ + attemptsPerPrompt: 8, + models, + initialSystemPrompt: matchExistingEntitySystemPrompt, + directoryPath: baseDirectoryPath, + metrics, + promptIterations: 1, + }); + }, + { + timeout: 30 * 60 * 1000, + }, +); diff --git a/apps/hash-ai-worker-ts/src/activities/shared/match-existing-entity.ts b/apps/hash-ai-worker-ts/src/activities/shared/match-existing-entity.ts new file mode 100644 index 00000000000..94f5d8be2fb --- /dev/null +++ b/apps/hash-ai-worker-ts/src/activities/shared/match-existing-entity.ts @@ -0,0 +1,664 @@ +import type { VersionedUrl } from "@blockprotocol/type-system"; +import { typedEntries } from "@local/advanced-types/typed-entries"; +import type { SourceProvenance } from "@local/hash-graph-client"; +import type { Entity } from "@local/hash-graph-sdk/entity"; +import type { + EntityId, + PropertyMetadataObject, + PropertyObject, +} from "@local/hash-graph-types/entity"; +import { isValueMetadata } from "@local/hash-graph-types/entity"; +import { deduplicateSources } from "@local/hash-isomorphic-utils/provenance"; +import { sleep } from "@local/hash-isomorphic-utils/sleep"; +import { stringifyPropertyValue } from "@local/hash-isomorphic-utils/stringify-property-value"; +import dedent from "dedent"; + +import { logger 
} from "./activity-logger.js"; +import { getFlowContext } from "./get-flow-context.js"; +import { getLlmResponse } from "./get-llm-response.js"; +import { getToolCallsFromLlmAssistantMessage } from "./get-llm-response/llm-message.js"; +import type { LlmParams, LlmToolDefinition } from "./get-llm-response/types.js"; +import { graphApiClient } from "./graph-api-client.js"; + +export const matchExistingEntitySystemPrompt = ` +You are managing a database of entities, which may be any type of thing. + +You are processing a new report of an entity, and have to decide if it matches an entity that's already in the database. + +You are given one or more entities from the database which _may_ represent the same thing as the new entity input. +You are told the type(s) of each entity. + +You must: +1. Decide which one, if any, of the existing entities match the new input +2. If there is a match, provide: + - the id of the existing entity that matches the new input + - merged values for properties which are suitable for merging (e.g. descriptions which incorporate both the old and new description) + +There may not be a match. Err on the side of caution when deciding if one entity is the same as another. + +Bear in mind that you may encounter entities which are named similarly but actually refer to different entities, + or which are named slightly differently but refer to the same entity, for example: +1. Nintendo of Europe is not the same as Nintendo of America, and neither are the same as Nintendo Co Ltd. +2. Whereas Nintendo Co Ltd. is the same as Nintendo (they refer to the same entity) +3. The Playstation 5 is not the same as the Playstation 4 + +If the user specifies that the entity represents a link (relationship) between two entities, the properties shown will be the attributes of the link. +Use the properties to determine if the new link is the same as the existing link. For example: +1. 
An 'employed-by' link which has the same 'startDate' property as another can be judged to be the same link, + as long as it doesn't specify any other properties which have different values (e.g. 'jobRole') +2. But an 'invested-in' link which has the same 'investmentDate' property but _different_ 'investmentAmount' properties is likely to be a different link. + +If you are not certain there is a match among the existing entities, provide 'null' as the 'matchedEntityId'. + +Where a property is present on both the new and the old entity, and it is a text field suitable for merging (e.g. a description), +you should provide a new value that combines both the old and new. + +If a property is NOT present on both the new and old entity, or is not suitable for combining (e.g. short values, numbers), +do not return them. Only return mergedProperties where you have written a new value based on the old and new. +If in doubt, don't rewrite properties. The intention behind merging them is to preserve useful information from the old value, +which is only likely to apply to longer, descriptive text fields. 
+ + + + + "https://hash.ai/@hash/types/entity-type/business-location/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/address/": "123 Main St, Seattle, WA" + "https://blockprotocol.org/@blockprotocol/types/property-type/business-name/": "Joe's Coffee" + "https://blockprotocol.org/@blockprotocol/types/property-type/opening-date/": "2022-01-15" + + + + + location123 + "https://hash.ai/@hash/types/entity-type/business-location/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/address/" + 123 Main Street, Seattle, Washington + + + "https://blockprotocol.org/@blockprotocol/types/property-type/business-name/" + Joe's Coffee Shop + + + "https://blockprotocol.org/@blockprotocol/types/property-type/opening-date/" + 2022-01-15 + + + + + location456 + "https://hash.ai/@hash/types/entity-type/business-location/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/address/" + 123 Main St, Seattle, WA + + + "https://blockprotocol.org/@blockprotocol/types/property-type/business-name/" + Joe's Coffee + + + "https://blockprotocol.org/@blockprotocol/types/property-type/opening-date/" + 2020-03-01 + + + + + location789 + "https://hash.ai/@hash/types/entity-type/business-location/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/address/" + 123 Main St, Portland, OR + + + "https://blockprotocol.org/@blockprotocol/types/property-type/business-name/" + Joe's Coffee + + + "https://blockprotocol.org/@blockprotocol/types/property-type/opening-date/" + 2022-01-15 + + + + + + This is a match with location123 despite multiple similar entries. + location456 has the same address but a different opening date, suggesting it's a previous business at the same location. + location789 is a different branch in Portland. + location123 matches both the address and opening date, and the slight variation in business name format is mergeable. 
+ + + + + + + "https://hash.ai/@hash/types/entity-type/investment/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/amount/": "1000000" + "https://blockprotocol.org/@blockprotocol/types/property-type/date/": "2024-03-15" + "https://blockprotocol.org/@blockprotocol/types/property-type/investor/": "Acme Ventures" + + + + + investment456 + "https://hash.ai/@hash/types/entity-type/investment/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/amount/" + 1000000 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/date/" + 2024-03-15 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/investor/" + Beta Capital + + + + + investment789 + "https://hash.ai/@hash/types/entity-type/investment/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/amount/" + 1000000 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/investor/" + Acme Ventures + + + + + investment101 + "https://hash.ai/@hash/types/entity-type/investment/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/amount/" + 1000000 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/date/" + 2024-03-15 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/investor/" + Acme Venture Partners + + + + + + This is a match with investment101 because it matches the amount, date, and investor (Acme Venture Partners is the same as Acme Ventures). + investment456 has a different investor, and investment789 is missing the date field which is crucial for identifying a specific investment. 
+ + + + + + + "https://hash.ai/@hash/types/entity-type/software-release/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/version/": "2.0.0" + "https://blockprotocol.org/@blockprotocol/types/property-type/release-notes/": "Major update including performance improvements and bug fixes" + "https://blockprotocol.org/@blockprotocol/types/property-type/release-date/": "2024-06-15" + "https://blockprotocol.org/@blockprotocol/types/property-type/platform/": "Linux" + + + + + release789 + "https://hash.ai/@hash/types/entity-type/software-release/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/version/" + 2.0.0 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/release-notes/" + Performance improvements and critical bug fixes + + + "https://blockprotocol.org/@blockprotocol/types/property-type/release-date/" + 2024-06-15 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/platform/" + Windows + + + + + release790 + "https://hash.ai/@hash/types/entity-type/software-release/v/1" + + + "https://blockprotocol.org/@blockprotocol/types/property-type/version/" + 2.0.0 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/release-notes/" + This release focuses on performance optimizations and fixes several critical bugs + + + "https://blockprotocol.org/@blockprotocol/types/property-type/release-date/" + 2024-06-15 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/platform/" + MacOS + + + + + release791 + "https://hash.ai/@hash/types/entity-type/software-release/v/1" + + + + "https://blockprotocol.org/@blockprotocol/types/property-type/release-date/" + 2024-06-15 + + + "https://blockprotocol.org/@blockprotocol/types/property-type/platform/" + Linux + + + + + + This is not a match with any existing entity. + release789 is for Windows, despite having the same release date and version number. + release790 is for MacOS, despite having the same release date and version number. 
+ release791 is for Linux and shares the release date, but there is no version number. A patch release could have been released on the same date. + The presence of the 'platform' and 'version' properties in these entities indicate that they are important for distinguishing between them. + + + +`; + +export type ExistingEntityReport = { + matchedEntityId: EntityId | null; + mergedProperties?: Record; +}; + +const toolName = "reportExistingEntityFinding"; + +const generateMatchExistingEntityTool = ( + propertyNames: string[], +): LlmToolDefinition => ({ + name: toolName, + description: dedent(` + If an existing entity matches the new input, provide the id of the existing entity and merged versions of properties which are suitable for merging. + If there is no match, provide null. + `), + inputSchema: { + type: "object", + additionalProperties: false, + properties: { + matchedEntityId: { oneOf: [{ type: "null" }, { type: "string" }] }, + mergedProperties: { + type: "object", + additionalProperties: false, + description: + "The properties where you are merging the old and new value together. 
Do not include any properties which are not being merged – don't include any which don't appear in both the new and old entity, or which you are not changing from their new value.", + properties: Object.fromEntries( + propertyNames.map((propertyName) => [ + propertyName, + { type: "string" }, + ]), + ), + }, + }, + required: ["matchedEntityId"], + }, +}); + +export type NewEntityInput = { + entityTypeIds: VersionedUrl[]; + properties: PropertyObject; + propertiesMetadata: PropertyMetadataObject; + editionSources: SourceProvenance[]; +}; + +export type ExistingEntityForMatching = Pick< + Entity, + "entityId" | "properties" | "propertiesMetadata" +> & { + metadata: Pick & { + provenance: { + edition: Pick; + }; + }; +}; + +export type MatchExistingEntityParams< + T extends ExistingEntityForMatching = ExistingEntityForMatching, +> = { + newEntity: NewEntityInput; + potentialMatches: T[]; +}; + +const generateMatchExistingEntityUserMessage = ({ + isLink, + newEntity, + potentialMatches, + previousError, +}: MatchExistingEntityParams & { + isLink: boolean; + previousError: string | null; +}): string => { + return `${ + isLink + ? `This is a link entity, which creates a relationship between two other entities. +The properties shown for the new link and the potential matches are the attributes of the links. The new link entity is:` + : "The new entity is:" + } + + + ${newEntity.entityTypeIds.join("\n")} + + ${Object.entries(newEntity.properties) + .map( + ([baseUrl, value]) => ` + ${baseUrl} + ${stringifyPropertyValue(value)} + `, + ) + .join("\n")} + + + +The potential matches are: +${potentialMatches + .map( + (potentialMatch) => ` + + ${potentialMatch.entityId} + ${newEntity.entityTypeIds.join("\n")} + + ${typedEntries(potentialMatch.properties) + .map( + ([baseUrl, value]) => + ` + ${baseUrl} + ${stringifyPropertyValue(value)} + ${typeof newEntity.properties[baseUrl] === "string" ? 
"Maybe" : "No"} + `, + ) + .join("\n")} + + +`, + ) + .join("\n")} + +Do any of the potential matches match the new entity? +If so, please provide the entityId of the match, and merged versions of properties which are suitable for merging. + +Remember to pay close attention to each property you are provided with. +Differences in some properties between the new entity and the potential matches will make it clear that the entities do NOT match, e.g. because they refer to different dates, versions, roles, values, actors involved, etc. +Check differences in property values to see if they are significant. + +${previousError ? `Your previous response had an error – please do not repeat it: ${previousError}` : ""}`; +}; + +const defaultModel: LlmParams["model"] = "gpt-4o-2024-08-06"; + +export type MatchedEntityUpdate = { + existingEntity: T; + /** + * If a match is found, the values that should be used when creating the entity update. + */ + newValues: { + /** + * The merged entityTypeIds of the new input and the matched entity. + */ + entityTypeIds: VersionedUrl[]; + /** + * The properties which the new entity has changed (introduced or updated). + * Where appropriate, the value for a property may be a merged version of the old and new values, + * e.g. for long text fields such as description (where both the new and old value may contain useful, relevant + */ + properties: PropertyObject; + /** + * The metadata for the changed properties. + * If a property value is the result of merging the old and new value, the sources will also be merged. + * e.g. if the old value came from news.com, and the new value came from wikipedia.com, the merged metadata will list both sources. + */ + propertyMetadata: PropertyMetadataObject; + /** + * The sources for the new entity edition. + * This will be a deduplicated list of sources from the new entity and the matched entity. 
+ */ + editionSources: SourceProvenance[]; + }; +} | null; + +/** + * Given one or more entities which may be a match for a new entity, identify if any of them are a match. + * + * If a match is found, it will return the changed properties object and property object metadata. + * This may not represent ALL properties of the existing entity, only those which the new entity has changed (added or + * updated). + */ +export const matchExistingEntity = async ({ + entities, + isLink, + previousError = null, + testingParams, +}: { + entities: MatchExistingEntityParams; + isLink: boolean; + previousError?: string | null; + /** + * Optional parameters for optimization purposes, allowing to overwrite the system prompt and model used. + */ + testingParams?: { + model?: LlmParams["model"]; + systemPrompt?: string; + }; +}): Promise> => { + const { newEntity, potentialMatches } = entities; + + const { flowEntityId, userAuthentication, stepId, webId } = + await getFlowContext(); + + const tool = generateMatchExistingEntityTool( + Object.keys(newEntity.properties), + ); + + const userMessage = generateMatchExistingEntityUserMessage({ + isLink, + newEntity, + potentialMatches, + previousError, + }); + + const llmResponse = await getLlmResponse( + { + systemPrompt: + testingParams?.systemPrompt ?? matchExistingEntitySystemPrompt, + tools: [tool], + toolChoice: toolName, + messages: [ + { + role: "user", + content: [ + { + type: "text", + text: userMessage, + }, + ], + }, + ], + model: testingParams?.model ?? defaultModel, + }, + { + customMetadata: { + stepId, + taskName: "match-existing-entity", + }, + userAccountId: userAuthentication.actorId, + graphApiClient, + incurredInEntities: [{ entityId: flowEntityId }], + webId, + }, + ); + + if (llmResponse.status !== "ok") { + if (llmResponse.status === "aborted") { + return null; + } + + logger.error( + `Error matching existing entity: [${llmResponse.status}]: ${"message" in llmResponse ? 
llmResponse.message : "No message provided"}`, + ); + + await sleep(2_000); + + return matchExistingEntity({ + entities, + isLink, + previousError: "message" in llmResponse ? llmResponse.message : null, + testingParams, + }); + } + + const { message } = llmResponse; + + const toolCalls = getToolCallsFromLlmAssistantMessage({ message }); + + const firstToolCall = toolCalls[0]; + + if (!firstToolCall || toolCalls.length > 1) { + return matchExistingEntity({ + entities, + isLink, + previousError: "You must make exactly one tool call", + testingParams, + }); + } + + const { matchedEntityId, mergedProperties } = + firstToolCall.input as ExistingEntityReport; + + if (!matchedEntityId) { + return null; + } + + const match = potentialMatches.find( + (potentialMatch) => potentialMatch.entityId === matchedEntityId, + ); + + if (!match) { + return matchExistingEntity({ + entities, + isLink, + previousError: `You supplied an entity id ${matchedEntityId} which was not in the list of potential matches`, + testingParams, + }); + } + + const changedPropertiesWithMergedValues = JSON.parse( + JSON.stringify(newEntity.properties), + ) as typeof newEntity.properties; + + const metadataForChangedProperties = JSON.parse( + JSON.stringify(newEntity.propertiesMetadata), + ) as typeof newEntity.propertiesMetadata; + + for (const [baseUrl, valueFromNewEntity] of typedEntries( + newEntity.properties, + )) { + const mergedValue = mergedProperties?.[baseUrl]; + + const newValue = mergedValue ?? valueFromNewEntity; + + /** + * This is overwriting the old value with the new in all cases. + * For nested property objects, we may wish to attempt to merge them, e.g. + * if one nested 'address' property has 'Street' defined in the old entity and 'City' in the new, we could take both. + * + * But this may not always be appropriate. The 'Street' and 'City' may refer to different addresses. + * It's not clear we can automatically determine if a property object should be merged or not. 
+ * + * @todo H-3900 tracks handling property objects better in flows + */ + changedPropertiesWithMergedValues[baseUrl] = newValue; + + const existingMetadataForProperty = match.propertiesMetadata.value[baseUrl]; + + const newMetadataForProperty = newEntity.propertiesMetadata.value[baseUrl]; + + if (!newMetadataForProperty) { + throw new Error( + `No metadata provided for property changed at ${baseUrl}`, + ); + } + + if (mergedValue || newValue === match.properties[baseUrl]) { + const existingSources = + existingMetadataForProperty && + isValueMetadata(existingMetadataForProperty) + ? (existingMetadataForProperty.metadata.provenance?.sources ?? []) + : []; + + const newSources = isValueMetadata(newMetadataForProperty) + ? (newMetadataForProperty.metadata.provenance?.sources ?? []) + : []; + + const mergedSources = deduplicateSources([ + ...existingSources, + ...newSources, + ]); + + if (!isValueMetadata(newMetadataForProperty)) { + throw new Error( + `Expected metadata to be a value metadata for property at ${baseUrl}, received: ${JSON.stringify( + newMetadataForProperty, + )}`, + ); + } + + metadataForChangedProperties.value[baseUrl] = { + metadata: { + ...newMetadataForProperty.metadata, + provenance: { + sources: mergedSources, + }, + }, + }; + } else { + metadataForChangedProperties.value[baseUrl] = newMetadataForProperty; + } + } + + const mergedEditionSources = deduplicateSources([ + ...newEntity.editionSources, + ...(match.metadata.provenance.edition.sources ?? 
[]), + ]); + + const matchWithMergedValues = { + entityTypeIds: [ + ...new Set([...newEntity.entityTypeIds, ...match.metadata.entityTypeIds]), + ], + editionSources: mergedEditionSources, + properties: changedPropertiesWithMergedValues, + propertyMetadata: metadataForChangedProperties, + }; + + return { + existingEntity: match, + newValues: matchWithMergedValues, + }; +}; diff --git a/apps/hash-ai-worker-ts/src/activities/shared/openai-client.ts b/apps/hash-ai-worker-ts/src/activities/shared/openai-client.ts index d0f5d7a154f..bba8d553ad6 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/openai-client.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/openai-client.ts @@ -2,12 +2,11 @@ import OpenAI from "openai"; const permittedOpenAiModels = [ "gpt-3.5-turbo-1106", - "gpt-4-1106-preview", "gpt-4-0125-preview", "gpt-4-turbo", "gpt-4", - "gpt-4o", "gpt-4o-2024-08-06", + "gpt-4o-mini-2024-07-18", ] as const; export type PermittedOpenAiModel = (typeof permittedOpenAiModels)[number]; @@ -20,12 +19,11 @@ export const isPermittedOpenAiModel = ( /** @see https://platform.openai.com/docs/models */ export const modelToContextWindow: Record = { "gpt-3.5-turbo-1106": 16_385, - "gpt-4-1106-preview": 128_000, "gpt-4-0125-preview": 128_000, "gpt-4-turbo": 128_000, "gpt-4": 8_192, - "gpt-4o": 128_000, "gpt-4o-2024-08-06": 128_000, + "gpt-4o-mini-2024-07-18": 128_000, }; export const isPermittedModel = ( diff --git a/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt.ts b/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt.ts index 06514a8afb4..c3083907349 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt.ts @@ -93,8 +93,8 @@ const saveSummaryToCSV = (params: { "Iteration", "System Prompt", "Overall Score", - ...models.map((model) => `"Average Score for ${model}"`), - ...metrics.map((metric) => `"Average Score for 
${metric.name}"`), + ...models.map((model) => model), + ...metrics.map((metric) => metric.name), ]; const rows = results.map( @@ -132,9 +132,9 @@ const saveSummaryToCSV = (params: { return [ iteration.toString(), escapeCSV(systemPrompt), - overallScore.toString(), - ...modelAverageScores.map((score) => score.toString()), - ...metricAverageScores.map((score) => score.toString()), + overallScore.toFixed(2), + ...modelAverageScores.map((score) => score.toFixed(2)), + ...metricAverageScores.map((score) => score.toFixed(2)), ]; }, ); diff --git a/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt/improve-system-prompt.ts b/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt/improve-system-prompt.ts index 699d51b6ccf..71ff31fafab 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt/improve-system-prompt.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/optimize-system-prompt/improve-system-prompt.ts @@ -31,6 +31,9 @@ const improveSystemPromptSystemPrompt = dedent(` Carefully examine all the metric results. Your task is to propose a new system prompt that will improve the performance of the LLM model across all metrics. Be creative, you can propose a completely new prompt or a slight modification of the previous prompt. + + If the original prompt includes examples, try iterations which also include the examples, or different examples. + Prompts with full, clear examples are generally better than prompts without examples. 
`); const proposeSystemPromptToolDefinition: LlmToolDefinition<"proposeSystemPrompt"> = diff --git a/apps/hash-api/src/graph/ensure-system-graph-is-initialized/migrate-ontology-types/migrations/007-create-api-usage-tracking.migration.ts b/apps/hash-api/src/graph/ensure-system-graph-is-initialized/migrate-ontology-types/migrations/007-create-api-usage-tracking.migration.ts index 8fceab0c3d4..8c560ab2800 100644 --- a/apps/hash-api/src/graph/ensure-system-graph-is-initialized/migrate-ontology-types/migrations/007-create-api-usage-tracking.migration.ts +++ b/apps/hash-api/src/graph/ensure-system-graph-is-initialized/migrate-ontology-types/migrations/007-create-api-usage-tracking.migration.ts @@ -285,12 +285,6 @@ const migrate: MigrationFunction = async ({ */ const initialServices = [ /** @see https://openai.com/pricing */ - { - serviceName: "OpenAI", - featureName: "gpt-4-1106-preview", - inputUnitCost: 0.00001, // price per input token - outputUnitCost: 0.00003, // price per output token - }, { serviceName: "OpenAI", featureName: "gpt-4", @@ -315,24 +309,18 @@ const migrate: MigrationFunction = async ({ inputUnitCost: 0.00001, outputUnitCost: 0.00003, }, - { - serviceName: "OpenAI", - featureName: "gpt-4o", - inputUnitCost: 0.000005, - outputUnitCost: 0.000015, - }, - { - serviceName: "OpenAI", - featureName: "gpt-4o-2024-05-13", - inputUnitCost: 0.000005, - outputUnitCost: 0.000015, - }, { serviceName: "OpenAI", featureName: "gpt-4o-2024-08-06", inputUnitCost: 0.0000025, outputUnitCost: 0.00001, }, + { + serviceName: "OpenAI", + featureName: "gpt-4o-mini-2024-07-18", + inputUnitCost: 0.00000015, + outputUnitCost: 0.00000016, + }, /** @see https://www.anthropic.com/api */ { serviceName: "Anthropic", @@ -340,12 +328,6 @@ const migrate: MigrationFunction = async ({ inputUnitCost: 0.000015, outputUnitCost: 0.000075, }, - { - serviceName: "Anthropic", - featureName: "claude-3-sonnet-20240229", - inputUnitCost: 0.000003, - outputUnitCost: 0.000015, - }, { serviceName: 
"Anthropic", featureName: "claude-3-5-sonnet-20240620", @@ -358,24 +340,6 @@ const migrate: MigrationFunction = async ({ inputUnitCost: 0.00000025, outputUnitCost: 0.00000125, }, - { - serviceName: "Anthropic", - featureName: "claude-2.1", - inputUnitCost: 0.000008, - outputUnitCost: 0.000024, - }, - { - serviceName: "Anthropic", - featureName: "claude-2.0", - inputUnitCost: 0.000008, - outputUnitCost: 0.000024, - }, - { - serviceName: "Anthropic", - featureName: "claude-instant-1.2", - inputUnitCost: 0.0000008, - outputUnitCost: 0.0000024, - }, ]; const hashOrg = await getOrgByShortname(context, authentication, { diff --git a/apps/hash-frontend/next.config.js b/apps/hash-frontend/next.config.js index d256af525fd..bab805dd5f8 100644 --- a/apps/hash-frontend/next.config.js +++ b/apps/hash-frontend/next.config.js @@ -161,17 +161,15 @@ export default withSentryConfig( "@tldraw/ui", ], - experimental: { - // These are introduced in the monorepo by the Temporal packages, and despite them not being part of the - // frontend dependency tree, they are not shaken and are included in the generated lambdas - // https://github.com/orgs/vercel/discussions/103#discussioncomment-5427097 - outputFileTracingExcludes: { - "*": [ - "node_modules/@swc/core-linux-x64-gnu", - "node_modules/@swc/core-linux-x64-musl", - "node_modules/@esbuild/linux-x64", - ], - }, + // These are introduced in the monorepo by the Temporal packages, and despite them not being part of the + // frontend dependency tree, they are not shaken and are included in the generated lambdas + // https://github.com/orgs/vercel/discussions/103#discussioncomment-5427097 + outputFileTracingExcludes: { + "*": [ + "node_modules/@swc/core-linux-x64-gnu", + "node_modules/@swc/core-linux-x64-musl", + "node_modules/@esbuild/linux-x64", + ], }, webpack: (webpackConfig, { isServer }) => { diff --git a/apps/hash-frontend/src/pages/shared/accept-draft-entity-button.tsx 
b/apps/hash-frontend/src/pages/shared/accept-draft-entity-button.tsx index daa1912a07b..93ee0d226ec 100644 --- a/apps/hash-frontend/src/pages/shared/accept-draft-entity-button.tsx +++ b/apps/hash-frontend/src/pages/shared/accept-draft-entity-button.tsx @@ -68,6 +68,9 @@ const getRightOrLeftEntitySx = (params: { background: ({ palette }) => palette.blue[20], }; +/** + * @todo H-3883 ensure that the un-drafting of a link will not violate min/max links on an entity + */ export const AcceptDraftEntityButton: FunctionComponent< { draftEntity: Entity; diff --git a/apps/hashdotdev/next.config.js b/apps/hashdotdev/next.config.js index b9fe80dda4d..4d79b6552bd 100644 --- a/apps/hashdotdev/next.config.js +++ b/apps/hashdotdev/next.config.js @@ -238,17 +238,15 @@ const nextConfig = { ]; }, - experimental: { - // These are introduced in the monorepo by the Temporal packages, and despite them not being part of the - // frontend dependency tree, they are not shaken and are included in the generated lambdas - // https://github.com/orgs/vercel/discussions/103#discussioncomment-5427097 - outputFileTracingExcludes: { - "*": [ - "node_modules/@swc/core-linux-x64-gnu", - "node_modules/@swc/core-linux-x64-musl", - "node_modules/@esbuild/linux-x64", - ], - }, + // These are introduced in the monorepo by the Temporal packages, and despite them not being part of the + // frontend dependency tree, they are not shaken and are included in the generated lambdas + // https://github.com/orgs/vercel/discussions/103#discussioncomment-5427097 + outputFileTracingExcludes: { + "*": [ + "node_modules/@swc/core-linux-x64-gnu", + "node_modules/@swc/core-linux-x64-musl", + "node_modules/@esbuild/linux-x64", + ], }, }; diff --git a/libs/@local/advanced-types/src/brand.ts b/libs/@local/advanced-types/src/brand.ts index 74488aa5c0b..d128998d9fe 100644 --- a/libs/@local/advanced-types/src/brand.ts +++ b/libs/@local/advanced-types/src/brand.ts @@ -6,6 +6,7 @@ type BrandedBase> = Base & { /** The unique name 
for the branded type */ readonly "#kind": Kind; }; + /** * The type-branding type to support nominal (name based) types */ diff --git a/libs/@local/graph/types/typescript/src/entity.ts b/libs/@local/graph/types/typescript/src/entity.ts index 0ba579f4a73..19df3dc7565 100644 --- a/libs/@local/graph/types/typescript/src/entity.ts +++ b/libs/@local/graph/types/typescript/src/entity.ts @@ -108,6 +108,26 @@ export type PropertyObject = { [key: BaseUrl]: Property; }; +/** + * Takes a property object which has unbranded string keys, such as those found in the generated system types + * in @local/hash-isomorphic-utils/system-types/*, and returns an object where the keys are branded as BaseUrls. + * + * Useful when wanting to strongly type a property object and pass it to a function that expends an object with branded keys. + * + * @todo consider updating the system type TS generation code to brand the keys automatically + */ +export type BrandedPropertyObject> = + T & { + [K in keyof T as Brand]: T[K]; + }; + +// Helper function to create branded objects +export const brandPropertyObject = >( + obj: T, +): BrandedPropertyObject => { + return obj as BrandedPropertyObject; +}; + export type EntityProperties = { entityTypeIds: [VersionedUrl, ...VersionedUrl[]]; properties: PropertyObject; diff --git a/libs/@local/hash-isomorphic-utils/src/provenance.ts b/libs/@local/hash-isomorphic-utils/src/provenance.ts new file mode 100644 index 00000000000..7bad6478723 --- /dev/null +++ b/libs/@local/hash-isomorphic-utils/src/provenance.ts @@ -0,0 +1,115 @@ +import type { SourceProvenance } from "@local/hash-graph-client"; + +import { generateUuid } from "./generate-uuid.js"; + +/** + * Deduplicate a list of sources, merging values as appropriate (see inline comments). + */ +export const deduplicateSources = ( + sources: SourceProvenance[], +): SourceProvenance[] => { + const sourcesByIdentifier = new Map(); + + for (const source of sources) { + const sourceKey = source.location?.uri ?? 
source.location?.name; + + if (!sourceKey) { + /** + * The source has nothing to usefully identify it by. + * This shouldn't happen, but we'll preserve it in case it has for some reason. + */ + sourcesByIdentifier.set(generateUuid(), source); + continue; + } + + const existingSource = sourcesByIdentifier.get(sourceKey); + + if (!existingSource) { + sourcesByIdentifier.set(sourceKey, source); + continue; + } + + if ( + existingSource.entityId && + source.entityId && + existingSource.entityId !== source.entityId + ) { + /** + * The sources have different entityIds – we'll keep both. + * The merging should happen at the entity level, elsewhere, if these are indeed the same source. + */ + sourcesByIdentifier.set(sourceKey, source); + continue; + } + + if (existingSource.type !== source.type) { + /** + * The sources have different types for some reason, even though they have the same id – we'll keep both. + */ + sourcesByIdentifier.set(sourceKey, source); + continue; + } + + const clonedSource = JSON.parse( + JSON.stringify(existingSource), + ) as typeof existingSource; + + if (source.entityId) { + clonedSource.entityId = source.entityId; + } + + clonedSource.authors = + (existingSource.authors ?? source.authors) + ? [ + ...new Set([ + ...(existingSource.authors ?? []), + ...(source.authors ?? []), + ]), + ] + : undefined; + + /** + * In practice we know that location is defined, because the sourceKey is defined and derived from location, + * but we might as well be explicit in case the sourceKey logic above changes. + */ + clonedSource.location ??= {}; + + /** + * These values may be undefined or empty strings. + * Set them if they're falsy in the first encountered source. 
+ */ + if (!clonedSource.location.uri) { + clonedSource.location.uri = source.location?.uri; + } + if (!clonedSource.location.name) { + clonedSource.location.name = source.location?.name; + } + if (!clonedSource.firstPublished) { + clonedSource.firstPublished = source.firstPublished; + } + + if (!clonedSource.lastUpdated) { + clonedSource.lastUpdated = source.lastUpdated; + } else if ( + source.lastUpdated && + /** lastUpdated is an ISO String */ + source.lastUpdated > clonedSource.lastUpdated + ) { + clonedSource.lastUpdated = source.lastUpdated; + } + + if (!clonedSource.loadedAt) { + clonedSource.loadedAt = source.loadedAt; + } else if ( + source.loadedAt && + /** loadedAt is an ISO String */ + source.loadedAt > clonedSource.loadedAt + ) { + clonedSource.loadedAt = source.loadedAt; + } + + sourcesByIdentifier.set(sourceKey, clonedSource); + } + + return Array.from(sourcesByIdentifier.values()); +};