Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update npm package llamaindex to v0.8.29 #5154

Merged
merged 6 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/hash-ai-worker-ts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"googleapis": "133.0.0",
"is-docker": "3.0.0",
"jsdom": "24.1.3",
"llamaindex": "0.2.10",
"llamaindex": "0.8.30",
"lodash.debounce": "4.0.8",
"lodash.isequal": "4.5.0",
"lodash.ismatch": "4.4.0",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,63 +1,67 @@
import { createWriteStream } from "node:fs";
import { mkdir, rm, unlink } from "node:fs/promises";
import fs from "node:fs/promises";
import { Readable } from "node:stream";
import { finished } from "node:stream/promises";
import stream from "node:stream/promises";
import type { ReadableStream } from "node:stream/web";

import {
PDFReader,
SimpleDocumentStore,
SimpleIndexStore,
SimpleVectorStore,
VectorStoreIndex,
} from "llamaindex";
import { PDFReader, VectorStoreIndex } from "llamaindex";
import md5 from "md5";

import { logger } from "../../../../shared/activity-logger.js";
import type { SimpleStorageContext } from "./simple-storage-context.js";
import {
generateSimpleStorageContextFilePaths,
persistSimpleStorageContext,
retrieveSimpleStorageContext,
createStorageContext,
persistStorageContext,
} from "./simple-storage-context.js";

/**
 * Reports whether `path` can be accessed by the current process.
 *
 * Resolves to `true` when `fs.access` succeeds with `F_OK`, and to `false`
 * when it rejects for any reason (the rejection itself is discarded).
 */
const fileExists = async (path: string) =>
  fs.access(path, fs.constants.F_OK).then(
    () => true,
    () => false,
  );

export const indexPdfFile = async (params: {
fileUrl: string;
}): Promise<{ vectorStoreIndex: VectorStoreIndex }> => {
const { fileUrl } = params;

const hashedUrl = md5(fileUrl);

const { simpleStorageContext } = await retrieveSimpleStorageContext({
const storageContext = await createStorageContext({
hash: hashedUrl,
});

if (!simpleStorageContext) {
logger.info("No existing storage context found. Creating new one...");
const filePath = `${storageContext.directory}/file.pdf`;
const exists = await fileExists(filePath);

const response = await fetch(fileUrl);
let vectorStoreIndex;

if (!response.ok || !response.body) {
throw new Error(`Failed to fetch ${fileUrl}: ${response.statusText}`);
}
if (exists) {
logger.info("Retrieved existing storage context");

const { directoryPath } = generateSimpleStorageContextFilePaths({
hash: hashedUrl,
vectorStoreIndex = await VectorStoreIndex.init({
storageContext,
});
} else {
logger.info("File has not been indexed yet. Downloading...");

await mkdir(directoryPath, { recursive: true });
const response = await fetch(fileUrl);

const filePath = `${directoryPath}/file.pdf`;
if (!response.ok || !response.body) {
throw new Error(`Failed to fetch ${fileUrl}: ${response.statusText}`);
}

try {
const fileStream = createWriteStream(filePath);
await finished(
await stream.finished(
Readable.fromWeb(response.body as ReadableStream<Uint8Array>).pipe(
fileStream,
),
);
} catch (error) {
await unlink(filePath);
await fs.unlink(filePath);
throw new Error(
`Failed to write file to file system: ${(error as Error).message}`,
);
Expand All @@ -69,45 +73,26 @@ export const indexPdfFile = async (params: {

logger.info(`Loaded PDF File as ${documents.length} documents`);

const storageContext: SimpleStorageContext = {
vectorStore: new SimpleVectorStore(),
docStore: new SimpleDocumentStore(),
indexStore: new SimpleIndexStore(),
};

const vectorStoreIndex = await VectorStoreIndex.fromDocuments(documents, {
vectorStoreIndex = await VectorStoreIndex.fromDocuments(documents, {
storageContext,
});

logger.info(
`Indexed PDF File successfully as ${documents.length} documents`,
);

if (process.env.NODE_ENV === "development") {
/**
* In development, cache the storage context for faster iteration
* when testing the same PDF file.
*/
await persistSimpleStorageContext({
hash: hashedUrl,
simpleStorageContext: storageContext,
await persistStorageContext({
storageContext,
});
} else {
/**
* In production, remove the PDF file from disk once it's been
* indexed in the simple vector store.
*/
await rm(filePath);
await fs.rm(filePath);
}

return { vectorStoreIndex };
}

logger.info("Retrieved existing storage context");

const vectorStoreIndex = await VectorStoreIndex.init({
storageContext: simpleStorageContext,
});

return { vectorStoreIndex };
};
Original file line number Diff line number Diff line change
@@ -1,98 +1,70 @@
import { access, mkdir } from "node:fs/promises";
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";

import type { Subtype } from "@local/advanced-types/subtype";
import type { StorageContext } from "llamaindex";
import {
SimpleDocumentStore,
SimpleIndexStore,
SimpleVectorStore,
} from "llamaindex";

/**
 * A `StorageContext` whose `docStore`, `indexStore` and `vectorStore` are
 * typed as the `Simple*` store classes imported from `llamaindex`.
 *
 * NOTE(review): `Subtype` comes from `@local/advanced-types`; presumably it
 * checks that the second argument is assignable to the first — confirm.
 */
export type SimpleStorageContext = Subtype<
StorageContext,
{
docStore: SimpleDocumentStore;
indexStore: SimpleIndexStore;
vectorStore: SimpleVectorStore;
}
>;
import { type StorageContext, storageContextFromDefaults } from "llamaindex";
indietyp marked this conversation as resolved.
Show resolved Hide resolved

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const baseFilePath = path.join(__dirname, "/var/tmp_files");

export const generateSimpleStorageContextFilePaths = (params: {
hash: string;
}) => {
const { hash } = params;
/**
 * A llamaindex `StorageContext` extended with the path of the directory it is
 * associated with on disk.
 */
export interface Storage extends StorageContext {
// Path of the directory that belongs to this storage context.
directory: string;
}

const directoryPath = `${baseFilePath}/storage/${hash}`;
const directory = ({ hash }: { hash?: string }) => {
// if no hash is provided, use a random one
const segment = hash ?? Math.random().toString(36).substring(7);
indietyp marked this conversation as resolved.
Show resolved Hide resolved
Fixed Show fixed Hide fixed

return {
directoryPath,
vectorStorePath: `${directoryPath}/vector-store.json`,
docStorePath: `${directoryPath}/doc-store.json`,
indexStorePath: `${directoryPath}/index-store.json`,
};
return `${baseFilePath}/storage/${segment}`;
};

export const retrieveSimpleStorageContext = async (params: {
hash: string;
}): Promise<{ simpleStorageContext: SimpleStorageContext | undefined }> => {
const { hash } = params;

const { directoryPath, vectorStorePath, docStorePath, indexStorePath } =
generateSimpleStorageContextFilePaths({ hash });
export const createStorageContext = async ({ hash }: { hash?: string }) => {
const directoryPath = directory({ hash });

try {
// Check directory exists
await access(directoryPath);

await Promise.all([
access(vectorStorePath),
// access(docStorePath),
access(indexStorePath),
]);
await fs.mkdir(directoryPath, { recursive: true });
} catch (error: unknown) {
if ((error as NodeJS.ErrnoException).code !== "EEXIST") {
// eslint-disable-next-line no-console
console.info(
indietyp marked this conversation as resolved.
Show resolved Hide resolved
`Unable to create directory ${directoryPath}: ${(error as Error).message}`,
);
}
}

const simpleStorageContext: SimpleStorageContext = {
vectorStore: await SimpleVectorStore.fromPersistPath(vectorStorePath),
docStore: await SimpleDocumentStore.fromPersistPath(docStorePath),
indexStore: await SimpleIndexStore.fromPersistPath(indexStorePath),
};
const context = await storageContextFromDefaults({
persistDir: directoryPath,
storeImages: false,
});

return { simpleStorageContext };
} catch (error) {
// eslint-disable-next-line no-console
console.error(
`Failed to retrieve storage context: ${(error as Error).message}`,
);
return { simpleStorageContext: undefined };
}
return { ...context, directory: directoryPath };
};

export const persistSimpleStorageContext = async (params: {
hash: string;
simpleStorageContext: SimpleStorageContext;
}) => {
const { hash, simpleStorageContext } = params;
/**
 * Normalises a value that may or may not already be a promise.
 *
 * A `Promise` instance is handed back untouched; anything else is wrapped
 * with `Promise.resolve`.
 */
const ensurePromise = <T>(value: T | Promise<T>): Promise<T> => {
  if (value instanceof Promise) {
    return value;
  }

  return Promise.resolve(value);
};

const { vectorStore, docStore, indexStore } = simpleStorageContext;
export const persistStorageContext = ({
storageContext: { vectorStores, docStore, indexStore },
}: {
storageContext: StorageContext;
}) => {
const promises: Promise<void>[] = [];

const { directoryPath, vectorStorePath, docStorePath, indexStorePath } =
generateSimpleStorageContextFilePaths({ hash });
/**
 * Queues a `persist()` call for `store` when the store exposes one.
 *
 * Stores that are `undefined` or have no `persist` member are skipped.
 * The resulting promise (or a resolved promise for synchronous
 * implementations) is appended to `promises`.
 */
const pushPersist = (store: object | undefined) => {
  if (store && "persist" in store) {
    // Call `persist` as a method of the store. Copying it into a local and
    // invoking it bare would drop the `this` receiver, which breaks any
    // implementation that reads its own state inside `persist`.
    const persistable = store as { persist: () => Promise<void> | void };
    promises.push(ensurePromise(persistable.persist()));
  }
};

try {
await access(directoryPath);
} catch {
// If the directory does not exist, create it recursively
await mkdir(directoryPath, { recursive: true });
for (const store of Object.values(vectorStores)) {
pushPersist(store);
}

await vectorStore.persist(vectorStorePath);
/** @todo: figure out why this doesn't get created */
await docStore.persist(docStorePath);
await indexStore.persist(indexStorePath);
pushPersist(docStore);
pushPersist(indexStore);

return Promise.all(promises);
};
Loading
Loading