Skip to content

Commit

Permalink
refactor(indexer): add format handler facilities
Browse files: browse the repository at this point in the history
  • Loading branch information
sinedied committed Dec 12, 2023
1 parent 01cb3e3 commit 081659a
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 10 deletions.
28 changes: 18 additions & 10 deletions packages/indexer/src/lib/document-processor.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { type BaseLogger } from 'pino';
import { getBlobNameFromFile } from './blob-storage.js';
import { type ContentPage, type ContentSection, type Section } from './document.js';
import { extractTextFromPdf } from './formats/index.js';
import { extractText, extractTextFromPdf } from './formats/index.js';

const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
Expand All @@ -10,7 +10,13 @@ const SENTENCE_SEARCH_LIMIT = 100;
const SECTION_OVERLAP = 100;

export class DocumentProcessor {
constructor(private logger: BaseLogger) {}
formatHandlers = new Map<string, (data: Buffer) => Promise<ContentPage[]>>();

/**
 * Creates a processor and registers the built-in format handlers:
 * plain text and Markdown both use `extractText`, PDF uses `extractTextFromPdf`.
 *
 * @param logger Logger stored on the instance as a private member.
 */
constructor(private logger: BaseLogger) {
  // Table of [MIME type, handler] pairs, registered in declaration order.
  const builtInHandlers: Array<[string, (data: Buffer) => Promise<ContentPage[]>]> = [
    ['text/plain', extractText],
    ['text/markdown', extractText],
    ['application/pdf', extractTextFromPdf],
  ];

  for (const [mimeType, handler] of builtInHandlers) {
    this.registerFormatHandler(mimeType, handler);
  }
}

async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) {
const pages = await this.extractText(data, type);
Expand All @@ -19,19 +25,21 @@ export class DocumentProcessor {
return { filename, type, category, sections };
}

/**
 * Associates a content type with the function used to extract its pages.
 * Registering the same type again replaces the previous handler.
 *
 * @param type Content type string used as the lookup key (e.g. `text/plain`).
 * @param handler Async function turning raw file data into content pages.
 */
private registerFormatHandler(
  type: string,
  handler: (data: Buffer) => Promise<ContentPage[]>,
): void {
  this.formatHandlers.set(type, handler);
}

private async extractText(data: Buffer, type: string): Promise<ContentPage[]> {
const pages: ContentPage[] = [];
if (type === 'text/plain' || type === 'text/markdown') {
const text = data.toString('utf8');
pages.push({ content: text, offset: 0, page: 0 });
} else if (type === 'application/pdf') {
const pdfContent = await extractTextFromPdf(data);
pages.push(...pdfContent);
} else {
// You can add support for other file types here

const formatHandler = this.formatHandlers.get(type);
if (!formatHandler) {
throw new Error(`Unsupported file type: ${type}`);
}

const contentPages = await formatHandler(data);
pages.push(...contentPages);

return pages;
}

Expand Down
1 change: 1 addition & 0 deletions packages/indexer/src/lib/formats/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export * from './pdf.js';
export * from './text.js';
6 changes: 6 additions & 0 deletions packages/indexer/src/lib/formats/text.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import { type ContentPage } from '../document.js';

/**
 * Extracts the content of a plain-text (or Markdown) file as a single page.
 *
 * The data is decoded as UTF-8. A leading byte order mark, if present, is
 * removed so it does not end up as an invisible character at the start of
 * the extracted content.
 *
 * @param data Raw file contents.
 * @returns An array holding exactly one page with `offset` 0 and `page` 0.
 */
export async function extractText(data: Buffer): Promise<ContentPage[]> {
  const decoded = data.toString('utf8');
  // Buffer#toString keeps the UTF-8 BOM (decoded as U+FEFF); drop it explicitly.
  const content = decoded.charCodeAt(0) === 0xfeff ? decoded.slice(1) : decoded;
  return [{ content, offset: 0, page: 0 }];
}

0 comments on commit 081659a

Please sign in to comment.