Skip to content

Commit

Permalink
refactor: Refactor markdown_to_tups method to better handle multi-lev… (
Browse files Browse the repository at this point in the history
  • Loading branch information
minglu7 authored Jan 23, 2025
1 parent 2d35a0f commit 91d2107
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"""

import re
from pathlib import Path
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem
from typing import Any, Dict, List, Optional, Tuple
Expand All @@ -26,50 +25,75 @@ def __init__(
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
separator: str = " ",
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._separator = separator

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
    """Convert markdown text to a list of ``(header_path, text)`` tuples.

    The text is split on ATX headers (``#``, ``##``, ...). Each tuple's
    first element is the chain of currently open headers, from the
    shallowest to the deepest level, joined with ``self._separator``; it is
    ``None`` for text that appears before the first header. The second
    element is the section body with blank lines dropped, surrounding
    whitespace stripped from every line outside fenced code blocks, and
    anything matching ``<.*?>`` removed.

    A section is emitted when a header arrives whose level is the same as,
    or shallower than, a header that is still open. Text written directly
    under a parent header therefore stays with the first child section that
    follows it.

    Args:
        markdown_text: Raw markdown content.

    Returns:
        The sections in document order. Empty or whitespace-only input
        yields an empty list.
    """
    markdown_tups: List[Tuple[Optional[str], str]] = []
    current_lines: List[str] = []
    # Open headers keyed by level. Insertion order is always ascending by
    # level, because deeper-or-equal levels are dropped before a new header
    # is added.
    headers: Dict[int, str] = {}
    in_code_block = False

    for raw_line in markdown_text.split("\n"):
        # A line starting with a fence toggles the code-block state.
        if raw_line.startswith("```"):
            in_code_block = not in_code_block

        if in_code_block:
            # Keep code verbatim; '#' lines in here are not headers.
            current_lines.append(raw_line)
            continue

        line = raw_line.strip()
        if not line:
            continue

        header_match = re.match(r"^(#+)\s+(.*)", line)
        if header_match is None:
            current_lines.append(line)
            continue

        header_level = len(header_match.group(1))  # number of '#' characters

        if current_lines and not headers:
            # Content that precedes the very first header.
            markdown_tups.append((None, "\n".join(current_lines)))
            current_lines.clear()

        # Close the running section when the new header is a sibling or an
        # ancestor of any open header. Checking only for an identical level
        # would miss skipped levels (e.g. "## A" followed by "# B").
        if any(level >= header_level for level in headers):
            markdown_tups.append(
                (self._separator.join(headers.values()), "\n".join(current_lines))
            )
            headers = {
                level: title
                for level, title in headers.items()
                if level < header_level
            }
            current_lines.clear()

        headers[header_level] = header_match.group(2)

    # Append the last section.
    if current_lines or headers:
        markdown_tups.append(
            (self._separator.join(headers.values()), "\n".join(current_lines))
        )

    # Postprocess: an empty header path becomes None; strip HTML-like tags.
    return [
        (
            key.strip() if key else None,
            re.sub(r"<.*?>", "", value),
        )
        for key, value in markdown_tups
    ]
Expand All @@ -90,7 +114,7 @@ def _init_parser(self) -> Dict:

def parse_tups(
self,
filepath: Path,
filepath: str,
errors: str = "ignore",
fs: Optional[AbstractFileSystem] = None,
) -> List[Tuple[Optional[str], str]]:
Expand All @@ -106,19 +130,19 @@ def parse_tups(

def load_data(
    self,
    file: str,
    extra_info: Optional[Dict] = None,
    fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
    """Load a markdown file and return one ``Document`` per parsed section.

    Args:
        file: Location of the markdown file, forwarded to ``parse_tups``.
        extra_info: Optional metadata attached to every document; an empty
            dict is used when it is ``None``.
        fs: Optional filesystem forwarded to ``parse_tups``.

    Returns:
        One ``Document`` per ``(header, text)`` tuple from ``parse_tups``.
        A section with a header has its text prefixed with
        ``"\\n\\n{header}\\n"``; a section without one carries only its text.
    """
    tups = self.parse_tups(file, fs=fs)
    results = []

    for header, text in tups:
        if header is None:
            results.append(Document(text=text, metadata=extra_info or {}))
        else:
            results.append(
                Document(text=f"\n\n{header}\n{text}", metadata=extra_info or {})
            )
    return results
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ license = "MIT"
maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
name = "llama-index-readers-file"
readme = "README.md"
version = "0.4.3"
version = "0.4.4"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from llama_index.readers.file import MarkdownReader
from llama_index.readers.file.markdown.base import MarkdownReader


def test_parse_markdown_starting_with_header() -> None:
Expand All @@ -18,7 +18,7 @@ def test_parse_markdown_with_text_before_first_header() -> None:
def test_parse_markdown_with_empty_lines_before_first_header() -> None:
    """Blank lines ahead of the first header do not form a section."""
    reader = MarkdownReader()
    markdown_text = "\n\n\n# ABC\ndef"
    expected_tups = [("ABC", "def")]
    assert reader.markdown_to_tups(markdown_text) == expected_tups


Expand Down Expand Up @@ -55,21 +55,108 @@ def test_parse_markdown_with_headers_in_code_block() -> None:
def test_parse_empty_markdown() -> None:
    """Empty markdown yields no tuples at all."""
    reader = MarkdownReader()
    markdown_text = ""
    expected_tups = []
    assert reader.markdown_to_tups(markdown_text) == expected_tups


def test_parse_omits_trailing_newline_before_new_header() -> None:
    """Leading and trailing blank lines never show up in the parsed tuples."""
    reader = MarkdownReader()

    # Blank lines before the first header are dropped.
    markdown_text = ("\n" * 4) + "# ABC\nabc"
    expected_tups = [("ABC", "abc")]
    assert reader.markdown_to_tups(markdown_text) == expected_tups

    # A trailing newline after the last section is dropped as well.
    markdown_text = ("\n" * 4) + "# ABC\nabc\n"
    expected_tups = [("ABC", "abc")]
    assert reader.markdown_to_tups(markdown_text) == expected_tups

    # Input made only of newlines produces nothing.
    markdown_text = "\n" * 4
    expected_tups = []
    assert reader.markdown_to_tups(markdown_text) == expected_tups


def test_multiple_class_titles_parse() -> None:
    """Nested headers are joined into one space-separated header path."""
    document = """
# Main Title (Level 1)
## Section 1: Introduction (Level 2)
### Subsection 1.1: Background (Level 3)
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus lacinia arcu eget nulla fermentum, et suscipit justo volutpat.
### Subsection 1.2: Objective (Level 3)
Curabitur non nulla sit amet nisl tempus convallis quis ac lectus. Integer posuere erat a ante venenatis dapibus posuere velit aliquet.
## Section 2: Methodology (Level 2)
### Subsection 2.1: Approach (Level 3)
Mauris blandit aliquet elit, eget tincidunt nibh pulvinar a. Pellentesque in ipsum id orci porta dapibus.
### Subsection 2.2: Tools and Techniques (Level 3)
Donec rutrum congue leo eget malesuada. Vivamus suscipit tortor eget felis porttitor volutpat.
#### Sub-subsection 2.2.1: Tool 1 (Level 4)
Donec sollicitudin molestie malesuada.
#### Sub-subsection 2.2.2: Tool 2 (Level 4)
Proin eget tortor risus. Cras ultricies ligula sed magna dictum porta.
## Section 3: Results (Level 2)
### Subsection 3.1: Data Analysis (Level 3)
Sed porttitor lectus nibh. Donec rutrum congue leo eget malesuada.
### Subsection 3.2: Findings (Level 3)
Curabitur arcu erat, accumsan id imperdiet et, porttitor at sem.
"""
    # Each entry pairs the full header path with the body text of that section.
    expected_sections = [
        (
            "Main Title (Level 1) Section 1: Introduction (Level 2) Subsection 1.1: Background (Level 3)",
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus lacinia arcu eget nulla fermentum, et suscipit justo volutpat.",
        ),
        (
            "Main Title (Level 1) Section 1: Introduction (Level 2) Subsection 1.2: Objective (Level 3)",
            "Curabitur non nulla sit amet nisl tempus convallis quis ac lectus. Integer posuere erat a ante venenatis dapibus posuere velit aliquet.",
        ),
        (
            "Main Title (Level 1) Section 2: Methodology (Level 2) Subsection 2.1: Approach (Level 3)",
            "Mauris blandit aliquet elit, eget tincidunt nibh pulvinar a. Pellentesque in ipsum id orci porta dapibus.",
        ),
        (
            "Main Title (Level 1) Section 2: Methodology (Level 2) Subsection 2.2: Tools and Techniques (Level 3) Sub-subsection 2.2.1: Tool 1 (Level 4)",
            "Donec rutrum congue leo eget malesuada. Vivamus suscipit tortor eget felis porttitor volutpat.\nDonec sollicitudin molestie malesuada.",
        ),
        (
            "Main Title (Level 1) Section 2: Methodology (Level 2) Subsection 2.2: Tools and Techniques (Level 3) Sub-subsection 2.2.2: Tool 2 (Level 4)",
            "Proin eget tortor risus. Cras ultricies ligula sed magna dictum porta.",
        ),
        (
            "Main Title (Level 1) Section 3: Results (Level 2) Subsection 3.1: Data Analysis (Level 3)",
            "Sed porttitor lectus nibh. Donec rutrum congue leo eget malesuada.",
        ),
        (
            "Main Title (Level 1) Section 3: Results (Level 2) Subsection 3.2: Findings (Level 3)",
            "Curabitur arcu erat, accumsan id imperdiet et, porttitor at sem.",
        ),
    ]

    parsed_sections = MarkdownReader().markdown_to_tups(document)
    assert parsed_sections == expected_sections


def test_blank_lines_in_markdown() -> None:
    """Markdown consisting only of a line break parses to an empty list."""
    blank_document = """
"""
    parsed_sections = MarkdownReader().markdown_to_tups(blank_document)
    assert parsed_sections == []

0 comments on commit 91d2107

Please sign in to comment.