From 40633c9e92bbba581c3a13c4ff03ddbae449d4ae Mon Sep 17 00:00:00 2001 From: Kevin Kaiser Date: Tue, 28 May 2019 11:49:33 +0200 Subject: [PATCH 1/2] fixes #81, lowering keywords and search sentence now works on a per char basis to return correct span_info --- flashtext/keyword.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/flashtext/keyword.py b/flashtext/keyword.py index f358c77..4c06093 100644 --- a/flashtext/keyword.py +++ b/flashtext/keyword.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import os import string import io @@ -80,11 +81,11 @@ def __contains__(self, word): >>> # True """ - if not self.case_sensitive: - word = word.lower() current_dict = self.keyword_trie_dict len_covered = 0 for char in word: + if not self.case_sensitive: + char = char.lower() if char in current_dict: current_dict = current_dict[char] len_covered += 1 @@ -108,11 +109,11 @@ def __getitem__(self, word): >>> keyword_processor['Big Apple'] >>> # New York """ - if not self.case_sensitive: - word = word.lower() current_dict = self.keyword_trie_dict len_covered = 0 for char in word: + if not self.case_sensitive: + char = char.lower() if char in current_dict: current_dict = current_dict[char] len_covered += 1 @@ -141,10 +142,10 @@ def __setitem__(self, keyword, clean_name=None): clean_name = keyword if keyword and clean_name: - if not self.case_sensitive: - keyword = keyword.lower() current_dict = self.keyword_trie_dict for letter in keyword: + if not self.case_sensitive: + letter = letter.lower() current_dict = current_dict.setdefault(letter, {}) if self._keyword not in current_dict: status = True @@ -166,11 +167,11 @@ def __delitem__(self, keyword): """ status = False if keyword: - if not self.case_sensitive: - keyword = keyword.lower() current_dict = self.keyword_trie_dict character_trie_list = [] for letter in keyword: + if not self.case_sensitive: + letter = letter.lower() if letter in current_dict: character_trie_list.append((letter, current_dict)) current_dict = current_dict[letter] @@ -471,8 +472,6 @@ def extract_keywords(self, sentence, span_info=False): if not sentence: # if sentence is empty or none just return empty list return keywords_extracted - if not self.case_sensitive: - sentence = sentence.lower() current_dict = self.keyword_trie_dict sequence_start_pos = 0 sequence_end_pos = 0 @@ -481,6 +480,8 @@ def extract_keywords(self, sentence, span_info=False): sentence_len = len(sentence) while idx < sentence_len: char = sentence[idx] + if not self.case_sensitive: + char = char.lower() # when we reach a character that might denote word end if char not in self.non_word_boundaries: @@ -502,6 +503,8 @@ def extract_keywords(self, sentence, span_info=False): idy = idx + 1 while idy < sentence_len: inner_char = sentence[idy] + if not self.case_sensitive: + inner_char = inner_char.lower() if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: # update longest sequence found longest_sequence_found = current_dict_continued[self._keyword] @@ -540,6 +543,8 @@ def extract_keywords(self, sentence, span_info=False): idy = idx + 1 while idy < sentence_len: char = sentence[idy] + if not self.case_sensitive: + char = char.lower() if char not in self.non_word_boundaries: break idy += 1 @@ -582,8 +587,6 @@ def replace_keywords(self, sentence): return sentence new_sentence = [] orig_sentence = sentence - if not self.case_sensitive: - sentence = sentence.lower() current_word = '' current_dict = self.keyword_trie_dict current_white_space = '' @@ -592,6 +595,8 @@ def replace_keywords(self, sentence): sentence_len = len(sentence) while idx < sentence_len: char = sentence[idx] + if not self.case_sensitive: + char = char.lower() current_word += orig_sentence[idx] # when we reach whitespace if char not in self.non_word_boundaries: @@ -614,6 +619,8 @@ def replace_keywords(self, sentence): idy = idx + 1 while idy < sentence_len: inner_char = sentence[idy] + if not self.case_sensitive: + inner_char = inner_char.lower() current_word_continued += orig_sentence[idy] if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued: # update longest sequence found @@ -662,6 +669,8 @@ def replace_keywords(self, sentence): idy = idx + 1 while idy < sentence_len: char = sentence[idy] + if not self.case_sensitive: + char = char.lower() current_word += orig_sentence[idy] if char not in self.non_word_boundaries: break From 81a3f0a7c4494b85496a7f503f52e8a37fe1f0e9 Mon Sep 17 00:00:00 2001 From: Kevin Kaiser Date: Sun, 3 May 2020 09:52:31 +0200 Subject: [PATCH 2/2] added test case --- test/keyword_extractor_test_cases.json | 10 ++++++++++ test/test_kp_extract_span.py | 6 ++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/keyword_extractor_test_cases.json b/test/keyword_extractor_test_cases.json index 6280771..edca626 100644 --- a/test/keyword_extractor_test_cases.json +++ b/test/keyword_extractor_test_cases.json @@ -476,5 +476,15 @@ "explanation": "", "keywords": ["spring framework"], "keywords_case_sensitive": ["spring framework"] + }, + { + "sentence": "İ love Big Apple and Bay Area.", + "keyword_dict": { + "İ love": ["İ love"], + "Big Apple": ["Big Apple"] + }, + "explanation": "Lowering keywords per character for correct span_info", + "keywords": ["İ love", "Big Apple"], + "keywords_case_sensitive": ["İ love", "Big Apple"] } ] diff --git a/test/test_kp_extract_span.py b/test/test_kp_extract_span.py index 2b9f7a4..d549d18 100644 --- a/test/test_kp_extract_span.py +++ b/test/test_kp_extract_span.py @@ -19,7 +19,6 @@ def test_extract_keywords(self): """For each of the test case initialize a new KeywordProcessor. Add the keywords the test case to KeywordProcessor. Extract keywords and check if they match the expected result for the test case. - """ for test_id, test_case in enumerate(self.test_cases): keyword_processor = KeywordProcessor() @@ -27,16 +26,15 @@ def test_extract_keywords(self): keyword_processor.add_keywords_from_list(test_case['keyword_dict'][key]) keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'], span_info=True) for kwd in keywords_extracted: - # returned keyword lowered should match the sapn from sentence + # returned keyword lowered should match the span from sentence self.assertEqual( - kwd[0].lower(), test_case['sentence'].lower()[kwd[1]:kwd[2]], + kwd[0].lower(), test_case['sentence'][kwd[1]:kwd[2]].lower(), "keywords span don't match the expected results for test case: {}".format(test_id)) def test_extract_keywords_case_sensitive(self): """For each of the test case initialize a new KeywordProcessor. Add the keywords the test case to KeywordProcessor. Extract keywords and check if they match the expected result for the test case. - """ for test_id, test_case in enumerate(self.test_cases): keyword_processor = KeywordProcessor(case_sensitive=True)