From 77af43d2393766b6fddc429e6d8c80fd287f02c6 Mon Sep 17 00:00:00 2001 From: sinedied Date: Mon, 11 Dec 2023 17:11:03 +0100 Subject: [PATCH] refactor(indexer): add format handler facilities --- .../indexer/src/lib/document-processor.ts | 28 ++++++++++++------- packages/indexer/src/lib/formats/index.ts | 1 + packages/indexer/src/lib/formats/text.ts | 6 ++++ 3 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 packages/indexer/src/lib/formats/text.ts diff --git a/packages/indexer/src/lib/document-processor.ts b/packages/indexer/src/lib/document-processor.ts index be6fce46..f5ee7a25 100644 --- a/packages/indexer/src/lib/document-processor.ts +++ b/packages/indexer/src/lib/document-processor.ts @@ -1,7 +1,7 @@ import { type BaseLogger } from 'pino'; import { getBlobNameFromFile } from './blob-storage.js'; import { type ContentPage, type ContentSection, type Section } from './document.js'; -import { extractTextFromPdf } from './formats/index.js'; +import { extractText, extractTextFromPdf } from './formats/index.js'; const SENTENCE_ENDINGS = new Set(['.', '!', '?']); const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']); @@ -10,7 +10,13 @@ const SENTENCE_SEARCH_LIMIT = 100; const SECTION_OVERLAP = 100; export class DocumentProcessor { - constructor(private logger: BaseLogger) {} + formatHandlers = new Map<string, (data: Buffer) => Promise<ContentPage[]>>(); + + constructor(private logger: BaseLogger) { + this.registerFormatHandler('text/plain', extractText); + this.registerFormatHandler('text/markdown', extractText); + this.registerFormatHandler('application/pdf', extractTextFromPdf); + } async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) { const pages = await this.extractText(data, type); @@ -19,19 +25,21 @@ export class DocumentProcessor { return { filename, type, category, sections }; } + private registerFormatHandler(type: string, handler: (data: Buffer) => Promise<ContentPage[]>) { + this.formatHandlers.set(type, handler); + 
} + private async extractText(data: Buffer, type: string): Promise<ContentPage[]> { const pages: ContentPage[] = []; - if (type === 'text/plain' || type === 'text/markdown') { - const text = data.toString('utf8'); - pages.push({ content: text, offset: 0, page: 0 }); - } else if (type === 'application/pdf') { - const pdfContent = await extractTextFromPdf(data); - pages.push(...pdfContent); - } else { - // You can add support for other file types here + + const formatHandler = this.formatHandlers.get(type); + if (!formatHandler) { throw new Error(`Unsupported file type: ${type}`); } + const contentPages = await formatHandler(data); + pages.push(...contentPages); + return pages; } diff --git a/packages/indexer/src/lib/formats/index.ts b/packages/indexer/src/lib/formats/index.ts index ce20a117..9015d7f3 100644 --- a/packages/indexer/src/lib/formats/index.ts +++ b/packages/indexer/src/lib/formats/index.ts @@ -1 +1,2 @@ export * from './pdf.js'; +export * from './text.js'; diff --git a/packages/indexer/src/lib/formats/text.ts b/packages/indexer/src/lib/formats/text.ts new file mode 100644 index 00000000..81298559 --- /dev/null +++ b/packages/indexer/src/lib/formats/text.ts @@ -0,0 +1,6 @@ +import { type ContentPage } from '../document.js'; + +export async function extractText(data: Buffer): Promise<ContentPage[]> { + const text = data.toString('utf8'); + return [{ content: text, offset: 0, page: 0 }]; +}