Skip to content

Commit

Permalink
feat(indexer): add pdf support (closes #90)
Browse files Browse the repository at this point in the history
  • Loading branch information
sinedied committed Dec 4, 2023
1 parent 672a5c7 commit 56d5ac4
Show file tree
Hide file tree
Showing 9 changed files with 465 additions and 76 deletions.
369 changes: 346 additions & 23 deletions package-lock.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion packages/indexer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
"fastify-cli": "^5.7.0",
"fastify-plugin": "^4.0.0",
"mime": "^3.0.0",
"openai": "^4.4.0"
"openai": "^4.4.0",
"pdfjs-dist": "^4.0.189"
},
"devDependencies": {
"@types/mime": "^3.0.1",
Expand Down
81 changes: 31 additions & 50 deletions packages/indexer/src/lib/document-processor.ts
Original file line number Diff line number Diff line change
@@ -1,33 +1,10 @@
import { type BaseLogger } from 'pino';
import { getBlobNameFromFile } from './blob-storage.js';
import { type ContentPage, type ContentSection, type Section } from './document.js';
import { extractTextFromPdf } from './formats/index.js';

export interface Document {
filename: string;
type: string;
category: string;
sections: Section[];
}

export interface Section {
id: string;
content: string;
category: string;
sourcepage: string;
sourcefile: string;
embedding?: number[];
}

export interface ContentPage {
content: string;
offset: number;
page: number;
}

export interface ContentSection {
content: string;
page: number;
}

const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
const MAX_SECTION_LENGTH = 1000;
const SENTENCE_SEARCH_LIMIT = 100;
const SECTION_OVERLAP = 100;
Expand All @@ -37,7 +14,7 @@ export class DocumentProcessor {

async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) {
const pages = await this.extractText(data, type);
const contentSections = this.splitText(filename, pages);
const contentSections = this.splitPages(filename, pages);
const sections = await this.createSections(filename, contentSections, category);
return { filename, type, category, sections };
}
Expand All @@ -47,8 +24,11 @@ export class DocumentProcessor {
if (type === 'text/plain' || type === 'text/markdown') {
const text = data.toString('utf8');
pages.push({ content: text, offset: 0, page: 0 });
} else if (type === 'application/pdf') {
const pdfContent = await extractTextFromPdf(data);
pages.push(...pdfContent);
} else {
// TODO: support other file types (PDF...)
// You can add support for other file types here
throw new Error(`Unsupported file type: ${type}`);
}

Expand All @@ -59,9 +39,9 @@ export class DocumentProcessor {
const fileId = filenameToId(filename);
const sections: Section[] = [];

for (const [index, { content }] of contentSections.entries()) {
for (const [index, { content, page }] of contentSections.entries()) {
const section: Section = {
id: `${fileId}-section-${index}`,
id: `${fileId}-page-${page}-section-${index}`,
content,
category: category,
sourcepage: getBlobNameFromFile(filename),
Expand All @@ -73,24 +53,25 @@ export class DocumentProcessor {
return sections;
}

// TODO: use langchain splitters: https://js.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/code_splitter
private splitText(filename: string, pages: ContentPage[]) {
const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
const WORDS_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);

private splitPages(filename: string, pages: ContentPage[]): ContentSection[] {
this.logger.debug(`Splitting '${filename}' into sections`);

const findPage = (pages: ContentPage[], offset: number) =>
pages.findIndex((page, index, array) => {
const nextPage = array[index + 1];
return !nextPage || (offset >= page.offset && offset < nextPage.offset);
});
const findPage = (offset: number): number => {
const pageCount = pages.length;
for (let i = 0; i < pageCount - 1; i++) {
if (offset >= pages[i].offset && offset < pages[i + 1].offset) {
return pages[i].page;
}
}
return pages[pageCount - 1].page;
};

const contentSections: ContentSection[] = [];
const allText = pages.map((p) => p.content).join('');
const allText = pages.map((page) => page.content).join('');
const length = allText.length;
let start = 0;
let end = length;

while (start + SECTION_OVERLAP < length) {
let lastWord = -1;
end = start + MAX_SECTION_LENGTH;
Expand All @@ -104,17 +85,17 @@ export class DocumentProcessor {
end - start - MAX_SECTION_LENGTH < SENTENCE_SEARCH_LIMIT &&
!SENTENCE_ENDINGS.has(allText[end])
) {
if (WORDS_BREAKS.has(allText[end])) {
if (WORD_BREAKS.has(allText[end])) {
lastWord = end;
}
end += 1;
}
if (end < length && !SENTENCE_ENDINGS.has(allText[end]) && lastWord > 0) {
end = lastWord; // Fall back to at least keeping a whole word
}
}
if (end < length) {
end += 1;
if (end < length) {
end += 1;
}
}

// Try to find the start of the sentence or at least a whole word boundary
Expand All @@ -124,7 +105,7 @@ export class DocumentProcessor {
start > end - MAX_SECTION_LENGTH - 2 * SENTENCE_SEARCH_LIMIT &&
!SENTENCE_ENDINGS.has(allText[start])
) {
if (WORDS_BREAKS.has(allText[start])) {
if (WORD_BREAKS.has(allText[start])) {
lastWord = start;
}
start -= 1;
Expand All @@ -137,14 +118,14 @@ export class DocumentProcessor {
}

const sectionText = allText.slice(start, end);
contentSections.push({ content: sectionText, page: findPage(pages, start) });
contentSections.push({ page: findPage(start), content: sectionText });

const lastTableStart = sectionText.lastIndexOf('<table');
if (lastTableStart > 2 * SENTENCE_SEARCH_LIMIT && lastTableStart > sectionText.lastIndexOf('</table')) {
// If the section ends with an unclosed table, we need to start the next section with the table.
// If table starts inside SENTENCE_SEARCH_LIMIT, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
// If last table starts inside SECTION_OVERLAP, keep overlapping
const page = findPage(pages, start);
const page = findPage(start);
this.logger.debug(
`Section ends with unclosed table, starting next section with the table at page ${page} offset ${start} table start ${lastTableStart}`,
);
Expand All @@ -155,7 +136,7 @@ export class DocumentProcessor {
}

if (start + SECTION_OVERLAP < end) {
contentSections.push({ content: allText.slice(start, end), page: findPage(pages, start) });
contentSections.push({ content: allText.slice(start, end), page: findPage(start) });
}

return contentSections;
Expand Down
26 changes: 26 additions & 0 deletions packages/indexer/src/lib/document.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
 * A file that has been split into sections for indexing.
 * This is the shape of the object returned by `DocumentProcessor.createDocumentFromFile`.
 */
export interface Document {
/** Name of the file the document was created from. */
filename: string;
/** Content type of the file; the processor accepts `text/plain`, `text/markdown` and `application/pdf`. */
type: string;
/** Category supplied by the caller; the same value is copied onto every section. */
category: string;
/** Sections the file's text was split into. */
sections: Section[];
}

/**
 * One chunk of a document's text, together with the metadata stored alongside it.
 */
export interface Section {
/** Identifier built by the document processor as `<fileId>-page-<page>-section-<index>`. */
id: string;
/** Text of the section. */
content: string;
/** Category of the document this section belongs to. */
category: string;
/** Set by the document processor from `getBlobNameFromFile(filename)`. */
sourcepage: string;
/** NOTE(review): presumably the name of the originating file; the assignment is not visible here, confirm. */
sourcefile: string;
/** Optional numeric vector. NOTE(review): presumably the embedding of `content`, filled in after section creation; confirm. */
embedding?: number[];
}

/**
 * Text extracted from a single page of a source file.
 */
export interface ContentPage {
/** Text of the page. */
content: string;
/**
 * Position at which this page starts within the text obtained by concatenating the
 * `content` of all pages in order; the splitter uses it to map a character position back to a page.
 */
offset: number;
/** Page number: the PDF extractor numbers pages from 1, plain text and markdown use a single page 0. */
page: number;
}

/**
 * A slice of a file's concatenated page text produced by the splitter, before it is
 * turned into a `Section`.
 */
export interface ContentSection {
/** Text of the slice. */
content: string;
/** Page number of the page that contains the first character of the slice. */
page: number;
}
1 change: 1 addition & 0 deletions packages/indexer/src/lib/formats/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './pdf.js';
33 changes: 33 additions & 0 deletions packages/indexer/src/lib/formats/pdf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import * as pdfjs from 'pdfjs-dist';
import { type TextItem } from 'pdfjs-dist/types/src/display/api.js';
import { type ContentPage } from '../document.js';

/**
 * Extracts the text of every page of a PDF document.
 *
 * Text runs are concatenated in the order pdf.js reports them; a line break is
 * inserted whenever the vertical position of a run differs from the previous
 * one, and every page's text is terminated with a trailing line break.
 *
 * @param data Raw bytes of the PDF file.
 * @returns One entry per page, numbered from 1. `offset` is the index of the
 *   page's first character within the concatenation of all returned `content`
 *   strings, so `pages.map((p) => p.content).join('')` can be indexed with it.
 * @throws Whatever pdf.js rejects with when the document cannot be loaded or read.
 */
export async function extractTextFromPdf(data: Buffer): Promise<ContentPage[]> {
  const pages: ContentPage[] = [];
  const pdf = await pdfjs.getDocument(new Uint8Array(data)).promise;
  let offset = 0;

  for (let i = 1; i <= pdf.numPages; i++) {
    const page = await pdf.getPage(i);
    const textContent = await page.getTextContent();
    // `undefined` (rather than 0) marks "no previous run", so that a run located
    // at y = 0 is still recognised as a line of its own.
    let previousY: number | undefined;
    let text = '';

    for (const item of textContent.items) {
      // Items without a `str` property carry no text
      if (!('str' in item)) {
        continue;
      }

      const textItem = item as TextItem;
      const y = textItem.transform[5];
      if (previousY !== undefined && y !== previousY) {
        // If the Y coordinate changes, we're on a new line
        text += '\n';
      }

      text += textItem.str;
      previousY = y;
    }

    // The offset must advance by the length of what is actually stored (including
    // the trailing line break), otherwise page offsets drift by one character per
    // page relative to the concatenated text.
    const content = text + '\n';
    pages.push({ content, offset, page: i });
    offset += content.length;
  }

  return pages;
}
1 change: 1 addition & 0 deletions packages/indexer/src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ export * from './util/index.js';
export * from './cli.js';
export * from './blob-storage.js';
export * from './document-processor.js';
export * from './document.js';
export * from './indexer.js';
export * from './model-limits.js';
3 changes: 2 additions & 1 deletion packages/indexer/src/lib/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import { encoding_for_model, type TiktokenModel } from '@dqbd/tiktoken';
import { type AzureClients } from '../plugins/azure.js';
import { type OpenAiService } from '../plugins/openai.js';
import { wait } from './util/index.js';
import { DocumentProcessor, type Section } from './document-processor.js';
import { DocumentProcessor } from './document-processor.js';
import { MODELS_SUPPORTED_BATCH_SIZE } from './model-limits.js';
import { BlobStorage } from './blob-storage.js';
import { type Section } from './document.js';

export interface IndexFileOptions {
useVectors?: boolean;
Expand Down
24 changes: 23 additions & 1 deletion packages/indexer/test.http
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ DELETE {{api_host}}/indexes/test

###

# Index a file
# Index a text file
POST {{api_host}}/indexes/test/files
Accept: */*
Content-Type: multipart/form-data; boundary=Boundary
Expand All @@ -43,5 +43,27 @@ Content-Disposition: form-data; name="options"

###

# Index a pdf file
POST {{api_host}}/indexes/test/files
Accept: */*
Content-Type: multipart/form-data; boundary=Boundary

--Boundary
Content-Disposition: form-data; name="file"; filename="test.pdf"
Content-Type: application/pdf

< ../../data/support.pdf
--Boundary
Content-Disposition: form-data; name="options"

{
"category": "test-category",
"wait": true,
"useVectors": true
}
--Boundary--

###

# Delete a file
DELETE {{api_host}}/indexes/test/files/sample.txt

0 comments on commit 56d5ac4

Please sign in to comment.