From c3e4de904bac268d770d7a1c1b6b23fdd9fddf7e Mon Sep 17 00:00:00 2001
From: Anthony Shaw
Date: Thu, 14 Mar 2024 13:22:08 +1100
Subject: [PATCH] Add ideographic and fullwidth punctuation to splitter

---
 packages/indexer/src/lib/document-processor.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/indexer/src/lib/document-processor.ts b/packages/indexer/src/lib/document-processor.ts
index 9f1aae89..074b31a9 100644
--- a/packages/indexer/src/lib/document-processor.ts
+++ b/packages/indexer/src/lib/document-processor.ts
@@ -2,8 +2,8 @@ import { type BaseLogger } from 'pino';
 import { getBlobNameFromFile } from './blob-storage.js';
 import { type ContentPage, type ContentSection, type Section } from './document.js';
 
-const SENTENCE_ENDINGS = new Set(['.', '!', '?']);
-const WORD_BREAKS = new Set([',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
+const SENTENCE_ENDINGS = new Set(['.', '。', '．', '!', '?', '‼', '⁇', '⁈', '⁉']);
+const WORD_BREAKS = new Set([',', '、', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']);
 const MAX_SECTION_LENGTH = 1000;
 const SENTENCE_SEARCH_LIMIT = 100;
 const SECTION_OVERLAP = 100;