From 6f787291ca76d7f03476bfdac70fc62697b586dc Mon Sep 17 00:00:00 2001 From: Riddhimaan-Senapati <114703025+Riddhimaan-Senapati@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:40:10 +0530 Subject: [PATCH 1/2] Fixed issue #17397 by updating the markdown format in MarkdownReader --- .../llama_index/readers/file/markdown/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py index 827412aab96c5..2c88f7b358a42 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py @@ -76,7 +76,7 @@ def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str] def remove_images(self, content: str) -> str: """Remove images in markdown content.""" - pattern = r"!{1}\[\[(.*)\]\]" + pattern = r"!\[.*?\]\(.*?\)" return re.sub(pattern, "", content) def remove_hyperlinks(self, content: str) -> str: From 655032844fa4c79090fe375030f1ff42c7b5d5d7 Mon Sep 17 00:00:00 2001 From: Riddhimaan-Senapati Date: Thu, 9 Jan 2025 18:45:48 +0530 Subject: [PATCH 2/2] made the required changes as detailed by the reviewer --- .../llama_index/readers/file/markdown/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py index 2c88f7b358a42..94ef75684dfae 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py @@ -73,11 +73,11 @@ def 
markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str] ) for key, value in markdown_tups ] - + def remove_images(self, content: str) -> str: - """Remove images in markdown content.""" - pattern = r"!\[.*?\]\(.*?\)" - return re.sub(pattern, "", content) + """Remove images in markdown content but keep the description.""" + pattern = r"!\[(.*?)\]\((.*?)\)" + return re.sub(pattern, r"\1", content) def remove_hyperlinks(self, content: str) -> str: """Remove hyperlinks in markdown content."""