Skip to content

Commit

Permalink
Update npm package llamaindex to v0.8.29 (#5154)
Browse files Browse the repository at this point in the history
Co-authored-by: hash-worker[bot] <180894564+hash-worker[bot]@users.noreply.github.com>
Co-authored-by: Bilal Mahmoud <bmahmoud@mpi-cbg.de>
Co-authored-by: Ciaran Morinan <37743469+CiaranMn@users.noreply.github.com>
  • Loading branch information
3 people authored Dec 18, 2024
1 parent 40138b2 commit 8de9549
Show file tree
Hide file tree
Showing 4 changed files with 869 additions and 429 deletions.
2 changes: 1 addition & 1 deletion apps/hash-ai-worker-ts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"googleapis": "133.0.0",
"is-docker": "3.0.0",
"jsdom": "24.1.3",
"llamaindex": "0.2.10",
"llamaindex": "0.8.30",
"lodash.debounce": "4.0.8",
"lodash.isequal": "4.5.0",
"lodash.ismatch": "4.4.0",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,63 +1,67 @@
import { createWriteStream } from "node:fs";
import { mkdir, rm, unlink } from "node:fs/promises";
import fs from "node:fs/promises";
import { Readable } from "node:stream";
import { finished } from "node:stream/promises";
import stream from "node:stream/promises";
import type { ReadableStream } from "node:stream/web";

import {
PDFReader,
SimpleDocumentStore,
SimpleIndexStore,
SimpleVectorStore,
VectorStoreIndex,
} from "llamaindex";
import { PDFReader, VectorStoreIndex } from "llamaindex";
import md5 from "md5";

import { logger } from "../../../../shared/activity-logger.js";
import type { SimpleStorageContext } from "./simple-storage-context.js";
import {
generateSimpleStorageContextFilePaths,
persistSimpleStorageContext,
retrieveSimpleStorageContext,
createStorageContext,
persistStorageContext,
} from "./simple-storage-context.js";

/**
 * Checks whether something exists at `path`.
 *
 * Uses `fs.access` with `F_OK`, i.e. only visibility of the entry is tested,
 * not read/write permission. Any failure of that call is reported as `false`.
 *
 * @param path - File system path to probe.
 * @returns `true` if the access check succeeds, `false` otherwise.
 */
const fileExists = async (path: string) => {
  let reachable = true;

  try {
    await fs.access(path, fs.constants.F_OK);
  } catch {
    // Every access failure is treated the same way: the entry is "not there".
    reachable = false;
  }

  return reachable;
};

export const indexPdfFile = async (params: {
fileUrl: string;
}): Promise<{ vectorStoreIndex: VectorStoreIndex }> => {
const { fileUrl } = params;

const hashedUrl = md5(fileUrl);

const { simpleStorageContext } = await retrieveSimpleStorageContext({
const storageContext = await createStorageContext({
hash: hashedUrl,
});

if (!simpleStorageContext) {
logger.info("No existing storage context found. Creating new one...");
const filePath = `${storageContext.directory}/file.pdf`;
const exists = await fileExists(filePath);

const response = await fetch(fileUrl);
let vectorStoreIndex;

if (!response.ok || !response.body) {
throw new Error(`Failed to fetch ${fileUrl}: ${response.statusText}`);
}
if (exists) {
logger.info("Retrieved existing storage context");

const { directoryPath } = generateSimpleStorageContextFilePaths({
hash: hashedUrl,
vectorStoreIndex = await VectorStoreIndex.init({
storageContext,
});
} else {
logger.info("File has not been indexed yet. Downloading...");

await mkdir(directoryPath, { recursive: true });
const response = await fetch(fileUrl);

const filePath = `${directoryPath}/file.pdf`;
if (!response.ok || !response.body) {
throw new Error(`Failed to fetch ${fileUrl}: ${response.statusText}`);
}

try {
const fileStream = createWriteStream(filePath);
await finished(
await stream.finished(
Readable.fromWeb(response.body as ReadableStream<Uint8Array>).pipe(
fileStream,
),
);
} catch (error) {
await unlink(filePath);
await fs.unlink(filePath);
throw new Error(
`Failed to write file to file system: ${(error as Error).message}`,
);
Expand All @@ -69,45 +73,26 @@ export const indexPdfFile = async (params: {

logger.info(`Loaded PDF File as ${documents.length} documents`);

const storageContext: SimpleStorageContext = {
vectorStore: new SimpleVectorStore(),
docStore: new SimpleDocumentStore(),
indexStore: new SimpleIndexStore(),
};

const vectorStoreIndex = await VectorStoreIndex.fromDocuments(documents, {
vectorStoreIndex = await VectorStoreIndex.fromDocuments(documents, {
storageContext,
});

logger.info(
`Indexed PDF File successfully as ${documents.length} documents`,
);

if (process.env.NODE_ENV === "development") {
/**
* In development, cache the storage context for faster iteration
* when testing the same PDF file.
*/
await persistSimpleStorageContext({
hash: hashedUrl,
simpleStorageContext: storageContext,
await persistStorageContext({
storageContext,
});
} else {
/**
* In production, remove the PDF file from disk once it's been
* indexed in the simple vector store.
*/
await rm(filePath);
await fs.rm(filePath);
}

return { vectorStoreIndex };
}

logger.info("Retrieved existing storage context");

const vectorStoreIndex = await VectorStoreIndex.init({
storageContext: simpleStorageContext,
});

return { vectorStoreIndex };
};
Original file line number Diff line number Diff line change
@@ -1,98 +1,68 @@
import { access, mkdir } from "node:fs/promises";
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";

import type { Subtype } from "@local/advanced-types/subtype";
import type { StorageContext } from "llamaindex";
import {
SimpleDocumentStore,
SimpleIndexStore,
SimpleVectorStore,
} from "llamaindex";
import { type StorageContext, storageContextFromDefaults } from "llamaindex";

export type SimpleStorageContext = Subtype<
StorageContext,
{
docStore: SimpleDocumentStore;
indexStore: SimpleIndexStore;
vectorStore: SimpleVectorStore;
}
>;
import { logger } from "../../../../shared/activity-logger.js";

// This file is an ES module (it uses `import.meta.url`), so the current file
// and directory paths are derived from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Root folder for on-disk storage: `<directory of this module>/var/tmp_files`.
// `path.join` normalises the leading "/" of the second segment, so the result
// stays underneath `__dirname` rather than becoming an absolute `/var/...` path.
const baseFilePath = path.join(__dirname, "/var/tmp_files");

export const generateSimpleStorageContextFilePaths = (params: {
hash: string;
}) => {
const { hash } = params;

const directoryPath = `${baseFilePath}/storage/${hash}`;
/**
 * A llamaindex `StorageContext` extended with a `directory` path.
 */
export interface Storage extends StorageContext {
/** File system directory associated with this storage context. */
directory: string;
}

return {
directoryPath,
vectorStorePath: `${directoryPath}/vector-store.json`,
docStorePath: `${directoryPath}/doc-store.json`,
indexStorePath: `${directoryPath}/index-store.json`,
};
/**
 * Builds the storage directory path for a given hash:
 * `<baseFilePath>/storage/<hash>`.
 *
 * @param params.hash - Identifier used as the final path segment.
 * @returns The directory path as a string; nothing is created on disk here.
 */
const directory = (params: { hash: string }) =>
  `${baseFilePath}/storage/${params.hash}`;

export const retrieveSimpleStorageContext = async (params: {
hash: string;
}): Promise<{ simpleStorageContext: SimpleStorageContext | undefined }> => {
const { hash } = params;

const { directoryPath, vectorStorePath, docStorePath, indexStorePath } =
generateSimpleStorageContextFilePaths({ hash });
export const createStorageContext = async ({ hash }: { hash: string }) => {
const directoryPath = directory({ hash });

try {
// Check directory exists
await access(directoryPath);

await Promise.all([
access(vectorStorePath),
// access(docStorePath),
access(indexStorePath),
]);
await fs.mkdir(directoryPath, { recursive: true });
} catch (error: unknown) {
if ((error as NodeJS.ErrnoException).code !== "EEXIST") {
logger.info(
`Unable to create directory ${directoryPath}: ${(error as Error).message}`,
);
}
}

const simpleStorageContext: SimpleStorageContext = {
vectorStore: await SimpleVectorStore.fromPersistPath(vectorStorePath),
docStore: await SimpleDocumentStore.fromPersistPath(docStorePath),
indexStore: await SimpleIndexStore.fromPersistPath(indexStorePath),
};
const context = await storageContextFromDefaults({
persistDir: directoryPath,
storeImages: false,
});

return { simpleStorageContext };
} catch (error) {
// eslint-disable-next-line no-console
console.error(
`Failed to retrieve storage context: ${(error as Error).message}`,
);
return { simpleStorageContext: undefined };
}
return { ...context, directory: directoryPath };
};

export const persistSimpleStorageContext = async (params: {
hash: string;
simpleStorageContext: SimpleStorageContext;
}) => {
const { hash, simpleStorageContext } = params;
/**
 * Normalises a value that may or may not already be a promise.
 *
 * @param value - Either a plain value or a promise of one.
 * @returns The very same promise when `value` is a `Promise` instance,
 *          otherwise a promise resolved with `value`.
 */
const ensurePromise = <T>(value: T | Promise<T>): Promise<T> => {
  if (value instanceof Promise) {
    // Already a promise: hand it back untouched (identity is preserved).
    return value;
  }

  return Promise.resolve(value);
};

const { vectorStore, docStore, indexStore } = simpleStorageContext;
export const persistStorageContext = ({
storageContext: { vectorStores, docStore, indexStore },
}: {
storageContext: StorageContext;
}) => {
const promises: Promise<void>[] = [];

const { directoryPath, vectorStorePath, docStorePath, indexStorePath } =
generateSimpleStorageContextFilePaths({ hash });
/**
 * Queues a store's `persist()` call, if the store offers one.
 *
 * Stores without a callable `persist` member (or `undefined` stores) are
 * skipped. The result of `persist()` is wrapped with `ensurePromise` and
 * appended to the enclosing `promises` array.
 *
 * @param store - Any store object, or `undefined`.
 */
const pushPersist = (store: object | undefined) => {
  if (store && "persist" in store && typeof store.persist === "function") {
    const persist = store.persist as () => Promise<void> | void;
    // Call with the store as receiver. Invoking the extracted function bare
    // would run it with `this === undefined`, which breaks any class-based
    // store whose `persist` reads its own fields.
    promises.push(ensurePromise(persist.call(store)));
  }
};

try {
await access(directoryPath);
} catch {
// If the directory does not exist, create it recursively
await mkdir(directoryPath, { recursive: true });
for (const store of Object.values(vectorStores)) {
pushPersist(store);
}

await vectorStore.persist(vectorStorePath);
/** @todo: figure out why this doesn't get created */
await docStore.persist(docStorePath);
await indexStore.persist(indexStorePath);
pushPersist(docStore);
pushPersist(indexStore);

return Promise.all(promises);
};
Loading

0 comments on commit 8de9549

Please sign in to comment.