refactor: Refactor markdown_to_tups method to better handle multi-level headers (#17508)
run-llama · Jan 23, 2025 · 91d2107 · 91d2107
1 parent 2d35a0f
commit 91d2107
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 36 deletions.
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/markdown/base.py
@@ -5,7 +5,6 @@
 """
 
 import re
-from pathlib import Path
 from fsspec import AbstractFileSystem
 from fsspec.implementations.local import LocalFileSystem
 from typing import Any, Dict, List, Optional, Tuple
@@ -26,50 +25,75 @@ def __init__(
         *args: Any,
         remove_hyperlinks: bool = True,
         remove_images: bool = True,
+        separator: str = " ",
         **kwargs: Any,
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)
         self._remove_hyperlinks = remove_hyperlinks
         self._remove_images = remove_images
+        self._separator = separator
 
     def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
-        """Convert a markdown file to a dictionary.
-
-        The keys are the headers and the values are the text under each header.
-
-        """
+        """Convert a markdown file to a list of tuples containing header and text."""
         markdown_tups: List[Tuple[Optional[str], str]] = []
         lines = markdown_text.split("\n")
 
-        current_header = None
         current_lines = []
         in_code_block = False
-
+        headers = {}
         for line in lines:
+            # Toggle code block state
             if line.startswith("```"):
-                # This is the end of a code block if we are already in it, and vice versa.
                 in_code_block = not in_code_block
 
-            header_match = re.match(r"^#+\s", line)
-            if not in_code_block and header_match:
-                # Upon first header, skip if current text chunk is empty
-                if current_header is not None or len(current_lines) > 0:
-                    markdown_tups.append((current_header, "\n".join(current_lines)))
-
-                current_header = line
-                current_lines.clear()
-            else:
+            if in_code_block:
                 current_lines.append(line)
-
-        # Append final text chunk
-        markdown_tups.append((current_header, "\n".join(current_lines)))
+                continue
+            # Process headers only when not in a code block
+            else:
+                line = line.strip()
+                if not line:
+                    continue
+
+                header_match = re.match(r"^(#+)\s+(.*)", line)
+                if header_match:
+                    if current_lines and not headers:
+                        # Add content before first header
+                        markdown_tups.append((None, "\n".join(current_lines)))
+                        current_lines.clear()
+                    # Extract header level and text
+                    header_level = len(
+                        header_match.group(1)
+                    )  # number of '#' indicates level
+                    current_header = header_match.group(2)  # the header text
+                    if headers.get(header_level):
+                        # Add previous section to the list before switching header
+                        markdown_tups.append(
+                            (
+                                self._separator.join(headers.values()),
+                                "\n".join(current_lines),
+                            )
+                        )
+                        # remove all headers with level greater than current header
+                        headers = {k: v for k, v in headers.items() if k < header_level}
+                        current_lines.clear()
+
+                    headers[header_level] = current_header
+                else:
+                    current_lines.append(line)
+
+        # Append the last section
+        if current_lines or headers:
+            markdown_tups.append(
+                (self._separator.join(headers.values()), "\n".join(current_lines))
+            )
 
         # Postprocess the tuples before returning
         return [
             (
-                key if key is None else re.sub(r"#", "", key).strip(),
-                re.sub(r"<.*?>", "", value),
+                key.strip() if key else None,  # Clean up header (strip whitespace)
+                re.sub(r"<.*?>", "", value),  # Remove HTML tags
             )
             for key, value in markdown_tups
         ]
@@ -90,7 +114,7 @@ def _init_parser(self) -> Dict:
 
     def parse_tups(
         self,
-        filepath: Path,
+        filepath: str,
         errors: str = "ignore",
         fs: Optional[AbstractFileSystem] = None,
     ) -> List[Tuple[Optional[str], str]]:
@@ -106,19 +130,19 @@ def parse_tups(
 
     def load_data(
         self,
-        file: Path,
+        file: str,
         extra_info: Optional[Dict] = None,
         fs: Optional[AbstractFileSystem] = None,
     ) -> List[Document]:
         """Parse file into string."""
         tups = self.parse_tups(file, fs=fs)
         results = []
-        # TODO: don't include headers right now
-        for header, value in tups:
+
+        for header, text in tups:
             if header is None:
-                results.append(Document(text=value, metadata=extra_info or {}))
+                results.append(Document(text=text, metadata=extra_info or {}))
             else:
                 results.append(
-                    Document(text=f"\n\n{header}\n{value}", metadata=extra_info or {})
+                    Document(text=f"\n\n{header}\n{text}", metadata=extra_info or {})
                 )
         return results
diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
@@ -51,7 +51,7 @@ license = "MIT"
 maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
 name = "llama-index-readers-file"
 readme = "README.md"
-version = "0.4.3"
+version = "0.4.4"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"

diff --git a/llama-index-integrations/readers/llama-index-readers-file/tests/test_markdown.py b/llama-index-integrations/readers/llama-index-readers-file/tests/test_markdown.py
@@ -1,4 +1,4 @@
-from llama_index.readers.file import MarkdownReader
+from llama_index.readers.file.markdown.base import MarkdownReader
 
 
 def test_parse_markdown_starting_with_header() -> None:
@@ -18,7 +18,7 @@ def test_parse_markdown_with_text_before_first_header() -> None:
 def test_parse_markdown_with_empty_lines_before_first_header() -> None:
     reader = MarkdownReader()
     markdown_text = "\n\n\n# ABC\ndef"
-    expected_tups = [(None, "\n\n"), ("ABC", "def")]
+    expected_tups = [("ABC", "def")]
     assert reader.markdown_to_tups(markdown_text) == expected_tups
 
 
@@ -55,21 +55,108 @@ def test_parse_markdown_with_headers_in_code_block() -> None:
 def test_parse_empty_markdown() -> None:
     reader = MarkdownReader()
     markdown_text = ""
-    expected_tups = [(None, "")]
+    expected_tups = []
     assert reader.markdown_to_tups(markdown_text) == expected_tups
 
 
 def test_parse_omits_trailing_newline_before_new_header() -> None:
     reader = MarkdownReader()
 
     markdown_text = ("\n" * 4) + "# ABC\nabc"
-    expected_tups = [(None, "\n" * 3), ("ABC", "abc")]
+    expected_tups = [("ABC", "abc")]
     assert reader.markdown_to_tups(markdown_text) == expected_tups
 
     markdown_text = ("\n" * 4) + "# ABC\nabc\n"
-    expected_tups = [(None, "\n" * 3), ("ABC", "abc\n")]
+    expected_tups = [("ABC", "abc")]
     assert reader.markdown_to_tups(markdown_text) == expected_tups
 
     markdown_text = "\n" * 4
-    expected_tups = [(None, "\n" * 4)]
+    expected_tups = []
+    assert reader.markdown_to_tups(markdown_text) == expected_tups
+
+
+def test_multiple_class_titles_parse() -> None:
+    reader = MarkdownReader()
+    markdown_text = """
+# Main Title (Level 1)
+
+## Section 1: Introduction (Level 2)
+
+### Subsection 1.1: Background (Level 3)
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus lacinia arcu eget nulla fermentum, et suscipit justo volutpat.
+
+### Subsection 1.2: Objective (Level 3)
+
+Curabitur non nulla sit amet nisl tempus convallis quis ac lectus. Integer posuere erat a ante venenatis dapibus posuere velit aliquet.
+
+## Section 2: Methodology (Level 2)
+
+### Subsection 2.1: Approach (Level 3)
+
+Mauris blandit aliquet elit, eget tincidunt nibh pulvinar a. Pellentesque in ipsum id orci porta dapibus.
+
+### Subsection 2.2: Tools and Techniques (Level 3)
+
+Donec rutrum congue leo eget malesuada. Vivamus suscipit tortor eget felis porttitor volutpat.
+
+#### Sub-subsection 2.2.1: Tool 1 (Level 4)
+
+Donec sollicitudin molestie malesuada.
+
+#### Sub-subsection 2.2.2: Tool 2 (Level 4)
+
+Proin eget tortor risus. Cras ultricies ligula sed magna dictum porta.
+
+## Section 3: Results (Level 2)
+
+### Subsection 3.1: Data Analysis (Level 3)
+
+Sed porttitor lectus nibh. Donec rutrum congue leo eget malesuada.
+
+### Subsection 3.2: Findings (Level 3)
+
+Curabitur arcu erat, accumsan id imperdiet et, porttitor at sem.
+    """
+    expected_tups = [
+        (
+            "Main Title (Level 1) Section 1: Introduction (Level 2) Subsection 1.1: Background (Level 3)",
+            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus lacinia arcu eget nulla fermentum, et suscipit justo volutpat.",
+        ),
+        (
+            "Main Title (Level 1) Section 1: Introduction (Level 2) Subsection 1.2: Objective (Level 3)",
+            "Curabitur non nulla sit amet nisl tempus convallis quis ac lectus. Integer posuere erat a ante venenatis dapibus posuere velit aliquet.",
+        ),
+        (
+            "Main Title (Level 1) Section 2: Methodology (Level 2) Subsection 2.1: Approach (Level 3)",
+            "Mauris blandit aliquet elit, eget tincidunt nibh pulvinar a. Pellentesque in ipsum id orci porta dapibus.",
+        ),
+        (
+            "Main Title (Level 1) Section 2: Methodology (Level 2) Subsection 2.2: Tools and Techniques (Level 3) Sub-subsection 2.2.1: Tool 1 (Level 4)",
+            "Donec rutrum congue leo eget malesuada. Vivamus suscipit tortor eget felis porttitor volutpat.\nDonec sollicitudin molestie malesuada.",
+        ),
+        (
+            "Main Title (Level 1) Section 2: Methodology (Level 2) Subsection 2.2: Tools and Techniques (Level 3) Sub-subsection 2.2.2: Tool 2 (Level 4)",
+            "Proin eget tortor risus. Cras ultricies ligula sed magna dictum porta.",
+        ),
+        (
+            "Main Title (Level 1) Section 3: Results (Level 2) Subsection 3.1: Data Analysis (Level 3)",
+            "Sed porttitor lectus nibh. Donec rutrum congue leo eget malesuada.",
+        ),
+        (
+            "Main Title (Level 1) Section 3: Results (Level 2) Subsection 3.2: Findings (Level 3)",
+            "Curabitur arcu erat, accumsan id imperdiet et, porttitor at sem.",
+        ),
+    ]
+
+    assert reader.markdown_to_tups(markdown_text) == expected_tups
+
+
+def test_blank_lines_in_markdown() -> None:
+    reader = MarkdownReader()
+    markdown_text = """
+
+
+    """
+    expected_tups = []
     assert reader.markdown_to_tups(markdown_text) == expected_tups