From 4ec34ed40c7fc6093da251005ac7baa5965b0e91 Mon Sep 17 00:00:00 2001
From: LittlePea13
Date: Fri, 2 Aug 2024 13:37:38 +0200
Subject: [PATCH] fix sentence splitter offset

---
 relik/inference/data/window/manager.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/relik/inference/data/window/manager.py b/relik/inference/data/window/manager.py
index ebe87c5..ffd049a 100644
--- a/relik/inference/data/window/manager.py
+++ b/relik/inference/data/window/manager.py
@@ -5,6 +5,7 @@
 from relik.inference.data.splitters.blank_sentence_splitter import BlankSentenceSplitter
 from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
 from relik.inference.data.splitters.window_based_splitter import WindowSentenceSplitter
+from relik.inference.data.splitters.spacy_sentence_splitter import SpacySentenceSplitter
 from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
 from relik.reader.data.relik_reader_sample import RelikReaderSample
 from relik.inference.data.objects import AnnotationType, TaskType
@@ -227,10 +228,10 @@ def _merge_doc_windows(self, windows: List[RelikReaderSample]) -> RelikReaderSam
         if len(windows) == 1:
             return self._normalize_single_window(windows[0])
 
-        if not isinstance(self.splitter, WindowSentenceSplitter):
-            # here we don't really need to merge windows, just normalize them
-            # TODO: check if we need to merge windows in this case
-            return [self._normalize_single_window(w) for w in windows]
+        # if not isinstance(self.splitter, WindowSentenceSplitter):
+        #     # here we don't really need to merge windows, just normalize them
+        #     # TODO: check if we need to merge windows in this case
+        #     return [self._normalize_single_window(w) for w in windows]
 
         if len(windows) > 0 and getattr(windows[0], "offset", None) is not None:
             windows = sorted(windows, key=(lambda x: x.offset))
@@ -507,7 +508,10 @@ def _merge_window_pair(
         ) = self._merge_predictions(window1, window2)
 
         # merge text, take into account overlapping chars
-        m_text = window1.text[: window2.offset] + window2.text
+        if isinstance(self.splitter, SpacySentenceSplitter):
+            m_text = window1.text[: window2.offset] + " " + window2.text
+        else:
+            m_text = window1.text[: window2.offset] + window2.text
 
         merging_output.update(
             dict(