-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfileProcessor.js
81 lines (72 loc) · 2.51 KB
/
fileProcessor.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
import { CSVLoader } from "langchain/document_loaders/fs/csv";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { JSONLoader } from "langchain/document_loaders/fs/json";
import { DocxLoader } from "langchain/document_loaders/fs/docx";
import { EPubLoader } from "langchain/document_loaders/fs/epub";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { OpenAIEmbeddings } from "@langchain/openai";
import { SupabaseVectorStore } from "@langchain/community/vectorstores/supabase";
import fs from "fs";
import path from "path";
import { createClient } from "@supabase/supabase-js";
import dotenv from "dotenv";
// Run dotenv before any environment variable is read below.
dotenv.config();

// Folder that is walked for files to ingest.
const dataFolderPath = "./data";

// Both Supabase settings are mandatory; fail at start-up when either is absent or empty.
const { SUPABASE_PRIVATE_KEY: privateKey, SUPABASE_URL: url } = process.env;
if (!privateKey) {
  throw new Error(`Expected env var SUPABASE_PRIVATE_KEY`);
}
if (!url) {
  throw new Error(`Expected env var SUPABASE_URL`);
}

// Supabase client shared by every vector-store write in this script.
const client = createClient(url, privateKey);

// Maps a lower-cased file extension (without the dot) to the loader class used for it.
const loaders = {
  pdf: PDFLoader,
  csv: CSVLoader,
  txt: TextLoader,
  json: JSONLoader,
  docx: DocxLoader,
  epub: EPubLoader,
};
/**
 * Ingests one file: loads it with the loader registered for its extension,
 * splits the documents into overlapping chunks and writes them to the
 * Supabase vector store (table "documents") using OpenAI embeddings.
 *
 * Files whose extension has no registered loader are reported and skipped.
 * Any failure while loading, splitting or storing is logged instead of
 * thrown, so a single bad file does not stop a batch run.
 *
 * @param {string} filePath - Path of the file to ingest.
 * @returns {Promise<void>} Settles once the file was stored, skipped, or its
 *   error was logged; it never rejects because of a processing failure.
 */
const processFile = async (filePath) => {
  const fileExtension = path.extname(filePath).substring(1).toLowerCase();

  // Own-property lookup only: an extension such as "constructor" must not
  // resolve to a member inherited from Object.prototype.
  const loaderClass = Object.hasOwn(loaders, fileExtension)
    ? loaders[fileExtension]
    : undefined;
  if (!loaderClass) {
    console.error(`No loader found for file extension: ${fileExtension}`);
    return;
  }

  try {
    const loader = new loaderClass(filePath);
    const docs = await loader.load();

    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1500,
      chunkOverlap: 200,
    });
    const docOutput = await splitter.splitDocuments(docs);

    // The returned store instance is not needed; only the write matters here.
    await SupabaseVectorStore.fromDocuments(docOutput, new OpenAIEmbeddings(), {
      client,
      tableName: "documents",
      queryName: "match_documents",
    });
    console.log(`Processed file: ${filePath}`);
  } catch (err) {
    console.error(`Error processing file: ${filePath}`, err);
  }
};
/**
 * Walks a folder depth-first and hands every regular file to `processFile`,
 * one at a time; subfolders are traversed recursively.
 *
 * An entry that cannot be inspected (for example a dangling symlink or a
 * file removed while the walk is running) is logged and skipped so the rest
 * of the tree is still processed. Failing to list `folderPath` itself is not
 * swallowed and rejects the returned promise.
 *
 * @param {string} folderPath - Folder to walk.
 * @returns {Promise<void>} Resolves when every reachable file was handed to `processFile`.
 */
const readAllFiles = async (folderPath) => {
  const entryNames = await fs.promises.readdir(folderPath);

  for (const entryName of entryNames) {
    const entryPath = path.join(folderPath, entryName);

    let stats;
    try {
      // stat (not lstat) so symlinked files and folders are followed.
      stats = await fs.promises.stat(entryPath);
    } catch (err) {
      console.error(`Error reading file info: ${entryPath}`, err);
      continue;
    }

    if (stats.isFile()) {
      await processFile(entryPath);
    } else if (stats.isDirectory()) {
      await readAllFiles(entryPath); // Recursive call for subdirectories
    }
  }
};
// Script entry point: ingest everything under the data folder.
readAllFiles(dataFolderPath)
  .then(() => {
    console.log("Finished processing all files");
  })
  .catch((err) => {
    // Without this handler a traversal failure (e.g. the data folder is
    // missing) would surface only as an unhandled promise rejection.
    console.error("Failed to process files", err);
    process.exitCode = 1;
  });