feat(indexer): add pdf support (closes #90)

Azure-Samples · Nov 21, 2023 · efb07f0 · efb07f0
1 parent 5ad179f
commit efb07f0
Show file tree

Hide file tree

Showing 9 changed files with 465 additions and 76 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/packages/indexer/package.json b/packages/indexer/package.json
@@ -40,7 +40,8 @@
     "fastify-cli": "^5.7.0",
     "fastify-plugin": "^4.0.0",
     "mime": "^3.0.0",
-    "openai": "^4.4.0"
+    "openai": "^4.4.0",
+    "pdfjs-dist": "^4.0.189"
   },
   "devDependencies": {
     "@types/mime": "^3.0.1",

diff --git a/packages/indexer/src/lib/document-processor.ts b/packages/indexer/src/lib/document-processor.ts
@@ -1,33 +1,10 @@
 import { type BaseLogger } from 'pino';
 import { getBlobNameFromFile } from './blob-storage.js';
+import { type ContentPage, type ContentSection, type Section } from './document.js';
+import { extractTextFromPdf } from './formats/index.js';
 
-export interface Document {
-  filename: string;
-  type: string;
-  category: string;
-  sections: Section[];
-}
-
-export interface Section {
-  id: string;
-  content: string;
-  category: string;
-  sourcepage: string;
-  sourcefile: string;
-  embedding?: number[];
-}
-
-export interface ContentPage {
-  content: string;
-  offset: number;
-  page: number;
-}
-
-export interface ContentSection {
-  content: string;
-  page: number;
-}
-
+const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
+const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
 const MAX_SECTION_LENGTH = 1000;
 const SENTENCE_SEARCH_LIMIT = 100;
 const SECTION_OVERLAP = 100;
@@ -37,7 +14,7 @@ export class DocumentProcessor {
 
   async createDocumentFromFile(filename: string, data: Buffer, type: string, category: string) {
     const pages = await this.extractText(data, type);
-    const contentSections = this.splitText(filename, pages);
+    const contentSections = this.splitPages(filename, pages);
     const sections = await this.createSections(filename, contentSections, category);
     return { filename, type, category, sections };
   }
@@ -47,8 +24,11 @@ export class DocumentProcessor {
     if (type === 'text/plain' || type === 'text/markdown') {
       const text = data.toString('utf8');
       pages.push({ content: text, offset: 0, page: 0 });
+    } else if (type === 'application/pdf') {
+      const pdfContent = await extractTextFromPdf(data);
+      pages.push(...pdfContent);
     } else {
-      // TODO: support other file types (PDF...)
+      // You can add support for other file types here
       throw new Error(`Unsupported file type: ${type}`);
     }
 
@@ -59,9 +39,9 @@ export class DocumentProcessor {
     const fileId = filenameToId(filename);
     const sections: Section[] = [];
 
-    for (const [index, { content }] of contentSections.entries()) {
+    for (const [index, { content, page }] of contentSections.entries()) {
       const section: Section = {
-        id: `${fileId}-section-${index}`,
+        id: `${fileId}-page-${page}-section-${index}`,
         content,
         category: category,
         sourcepage: getBlobNameFromFile(filename),
@@ -73,24 +53,25 @@ export class DocumentProcessor {
     return sections;
   }
 
-  // TODO: use langchain splitters: https://js.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/code_splitter
-  private splitText(filename: string, pages: ContentPage[]) {
-    const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
-    const WORDS_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
-
+  private splitPages(filename: string, pages: ContentPage[]): ContentSection[] {
     this.logger.debug(`Splitting '${filename}' into sections`);
 
-    const findPage = (pages: ContentPage[], offset: number) =>
-      pages.findIndex((page, index, array) => {
-        const nextPage = array[index + 1];
-        return !nextPage || (offset >= page.offset && offset < nextPage.offset);
-      });
+    const findPage = (offset: number): number => {
+      const pageCount = pages.length;
+      for (let i = 0; i < pageCount - 1; i++) {
+        if (offset >= pages[i].offset && offset < pages[i + 1].offset) {
+          return pages[i].page;
+        }
+      }
+      return pages[pageCount - 1].page;
+    };
 
     const contentSections: ContentSection[] = [];
-    const allText = pages.map((p) => p.content).join('');
+    const allText = pages.map((page) => page.content).join('');
     const length = allText.length;
     let start = 0;
     let end = length;
+
     while (start + SECTION_OVERLAP < length) {
       let lastWord = -1;
       end = start + MAX_SECTION_LENGTH;
@@ -104,17 +85,17 @@ export class DocumentProcessor {
           end - start - MAX_SECTION_LENGTH < SENTENCE_SEARCH_LIMIT &&
           !SENTENCE_ENDINGS.has(allText[end])
         ) {
-          if (WORDS_BREAKS.has(allText[end])) {
+          if (WORD_BREAKS.has(allText[end])) {
             lastWord = end;
           }
           end += 1;
         }
         if (end < length && !SENTENCE_ENDINGS.has(allText[end]) && lastWord > 0) {
           end = lastWord; // Fall back to at least keeping a whole word
         }
-      }
-      if (end < length) {
-        end += 1;
+        if (end < length) {
+          end += 1;
+        }
       }
 
       // Try to find the start of the sentence or at least a whole word boundary
@@ -124,7 +105,7 @@ export class DocumentProcessor {
         start > end - MAX_SECTION_LENGTH - 2 * SENTENCE_SEARCH_LIMIT &&
         !SENTENCE_ENDINGS.has(allText[start])
       ) {
-        if (WORDS_BREAKS.has(allText[start])) {
+        if (WORD_BREAKS.has(allText[start])) {
           lastWord = start;
         }
         start -= 1;
@@ -137,14 +118,14 @@ export class DocumentProcessor {
       }
 
       const sectionText = allText.slice(start, end);
-      contentSections.push({ content: sectionText, page: findPage(pages, start) });
+      contentSections.push({ page: findPage(start), content: sectionText });
 
       const lastTableStart = sectionText.lastIndexOf('<table');
       if (lastTableStart > 2 * SENTENCE_SEARCH_LIMIT && lastTableStart > sectionText.lastIndexOf('</table')) {
         // If the section ends with an unclosed table, we need to start the next section with the table.
         // If table starts inside SENTENCE_SEARCH_LIMIT, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
         // If last table starts inside SECTION_OVERLAP, keep overlapping
-        const page = findPage(pages, start);
+        const page = findPage(start);
         this.logger.debug(
           `Section ends with unclosed table, starting next section with the table at page ${page} offset ${start} table start ${lastTableStart}`,
         );
@@ -155,7 +136,7 @@ export class DocumentProcessor {
     }
 
     if (start + SECTION_OVERLAP < end) {
-      contentSections.push({ content: allText.slice(start, end), page: findPage(pages, start) });
+      contentSections.push({ content: allText.slice(start, end), page: findPage(start) });
     }
 
     return contentSections;

diff --git a/packages/indexer/src/lib/document.ts b/packages/indexer/src/lib/document.ts
@@ -0,0 +1,26 @@
+export interface Document { // Same fields as the object built by DocumentProcessor.createDocumentFromFile
+  filename: string;
+  type: string; // Compared against MIME types such as 'text/plain' and 'application/pdf' during text extraction
+  category: string;
+  sections: Section[];
+}
+/** One chunk of a document's text, as built by DocumentProcessor.createSections. */
+export interface Section {
+  id: string; // Built as `${fileId}-page-${page}-section-${index}`
+  content: string;
+  category: string;
+  sourcepage: string; // Set from getBlobNameFromFile(filename)
+  sourcefile: string;
+  embedding?: number[]; // NOTE(review): presumably only set when vectors are requested - confirm in indexer.ts
+}
+/** Text extracted from one page of a source file, before it is split into sections. */
+export interface ContentPage {
+  content: string;
+  offset: number; // Compared by splitPages against indexes into the concatenation of all pages' content
+  page: number; // 0 for plain text/markdown input; 1-based page number for PDF input
+}
+/** A slice of the concatenated page text, tagged with a page number. */
+export interface ContentSection {
+  content: string;
+  page: number; // Page found for the section's start offset by splitPages
+}
diff --git a/packages/indexer/src/lib/formats/index.ts b/packages/indexer/src/lib/formats/index.ts
@@ -0,0 +1 @@
+export * from './pdf.js';
diff --git a/packages/indexer/src/lib/formats/pdf.ts b/packages/indexer/src/lib/formats/pdf.ts
@@ -0,0 +1,33 @@
+import * as pdfjs from 'pdfjs-dist';
+import { type TextItem } from 'pdfjs-dist/types/src/display/api.js';
+import { type ContentPage } from '../document.js';
+
+/**
+ * Extracts the text of every page of a PDF document.
+ * @param data Raw bytes of the PDF file.
+ * @returns One entry per page (1-based); `offset` is where `content` starts in all pages' content joined together.
+ */
+export async function extractTextFromPdf(data: Buffer): Promise<ContentPage[]> {
+  const pages: ContentPage[] = [];
+  const pdf = await pdfjs.getDocument(new Uint8Array(data)).promise;
+  let offset = 0;
+  for (let i = 1; i <= pdf.numPages; i++) {
+    const page = await pdf.getPage(i);
+    const { items } = await page.getTextContent();
+    let previousY: number | undefined; // undefined until the first text item, so a real Y of 0 is not read as "none"
+    let content = '';
+    for (const item of items) {
+      if (!('str' in item)) continue; // Skip items that carry no text
+      const { str, transform }: TextItem = item;
+      // A change of Y coordinate means this item starts a new line
+      if (previousY !== undefined && transform[5] !== previousY) content += '\n';
+      content += str;
+      previousY = transform[5];
+    }
+    content += '\n';
+    pages.push({ content, offset, page: i });
+    // Count the trailing newline too, so offsets match positions in the concatenated content
+    offset += content.length;
+  }
+  return pages;
+}
diff --git a/packages/indexer/src/lib/index.ts b/packages/indexer/src/lib/index.ts
@@ -2,5 +2,6 @@ export * from './util/index.js';
 export * from './cli.js';
 export * from './blob-storage.js';
 export * from './document-processor.js';
+export * from './document.js';
 export * from './indexer.js';
 export * from './model-limits.js';
diff --git a/packages/indexer/src/lib/indexer.ts b/packages/indexer/src/lib/indexer.ts
@@ -4,9 +4,10 @@ import { encoding_for_model, type TiktokenModel } from '@dqbd/tiktoken';
 import { type AzureClients } from '../plugins/azure.js';
 import { type OpenAiService } from '../plugins/openai.js';
 import { wait } from './util/index.js';
-import { DocumentProcessor, type Section } from './document-processor.js';
+import { DocumentProcessor } from './document-processor.js';
 import { MODELS_SUPPORTED_BATCH_SIZE } from './model-limits.js';
 import { BlobStorage } from './blob-storage.js';
+import { type Section } from './document.js';
 
 export interface IndexFileOptions {
   useVectors?: boolean;

diff --git a/packages/indexer/test.http b/packages/indexer/test.http
@@ -20,7 +20,7 @@ DELETE {{api_host}}/indexes/test
 
 ###
 
-# Index a file
+# Index a text file
 POST {{api_host}}/indexes/test/files
 Accept: */*
 Content-Type: multipart/form-data; boundary=Boundary
@@ -43,5 +43,27 @@ Content-Disposition: form-data; name="options"
 
 ###
 
+# Index a pdf file
+POST {{api_host}}/indexes/test/files
+Accept: */*
+Content-Type: multipart/form-data; boundary=Boundary
+
+--Boundary
+Content-Disposition: form-data; name="file"; filename="test.pdf"
+Content-Type: application/pdf
+
+< ../../data/support.pdf
+--Boundary
+Content-Disposition: form-data; name="options"
+
+{
+  "category": "test-category",
+  "wait": true,
+  "useVectors": true
+}
+--Boundary--
+
+###
+
 # Delete a file
 DELETE {{api_host}}/indexes/test/files/sample.txt