Skip to content

Commit

Permalink
improve CODE_BLOCK_PATTERN for a more robust code match (#571)
Browse files Browse the repository at this point in the history
* improve CODE_BLOCK_PATTERN for more robust match

* improve and add tests

* Add support for \r\n

* Updated the regex to support indented code blocks (per the Markdown spec). Added test cases for both.

* Update formatting

---------

Co-authored-by: Adam Fourney <adamfo@microsoft.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
  • Loading branch information
3 people authored Nov 21, 2023
1 parent 19c7da2 commit d22664f
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 7 deletions.
16 changes: 12 additions & 4 deletions autogen/code_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,15 @@
DEFAULT_MODEL = "gpt-4"
FAST_MODEL = "gpt-3.5-turbo"
# Regular expression for finding a code block
CODE_BLOCK_PATTERN = r"```(\w*)\n(.*?)\n```"
# ```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*``` Matches multi-line code blocks.
# The [ \t]* matches the potential spaces before language name.
# The (\w+)? matches the language, where the ? indicates it is optional.
# The [ \t]* matches the potential spaces (not newlines) after language name.
# The \r?\n makes sure there is a linebreak after ```.
# The (.*?) matches the code itself (non-greedy).
# The \r?\n makes sure there is a linebreak before ```.
# The [ \t]* matches the potential spaces before closing ``` (the spec allows indentation).
CODE_BLOCK_PATTERN = r"```[ \t]*(\w+)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```"
WORKING_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extensions")
UNKNOWN = "unknown"
TIMEOUT_MSG = "Timeout"
Expand Down Expand Up @@ -59,6 +67,8 @@ def infer_lang(code):
return UNKNOWN


# TODO: In the future, move to better support https://spec.commonmark.org/0.30/#fenced-code-blocks,
# perhaps by using a full Markdown parser.
def extract_code(
text: Union[str, List], pattern: str = CODE_BLOCK_PATTERN, detect_single_line_code: bool = False
) -> List[Tuple[str, str]]:
Expand All @@ -83,10 +93,8 @@ def extract_code(
return match if match else [(UNKNOWN, text)]

# Extract both multi-line and single-line code blocks; the two patterns are combined with the | operator
# `{3}(\w+)?\s*([\s\S]*?)`{3}: Matches multi-line code blocks.
# The (\w+)? matches the language, where the ? indicates it is optional.
# `([^`]+)`: Matches inline code.
code_pattern = re.compile(r"`{3}(\w+)?\s*([\s\S]*?)`{3}|`([^`]+)`")
code_pattern = re.compile(CODE_BLOCK_PATTERN + r"|`([^`]+)`")
code_blocks = code_pattern.findall(text)

# Extract the individual code blocks and languages from the matched groups
Expand Down
76 changes: 73 additions & 3 deletions test/test_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ def test_extract_code():
""",
detect_single_line_code=True,
)
print(codeblocks2)

assert codeblocks2 == codeblocks
# import pdb; pdb.set_trace()

Expand All @@ -207,9 +209,77 @@ def scrape(url):
title, text = scrape(url)
print(f"Title: {title}")
print(f"Text: {text}")
```
"""
)
print(codeblocks)
assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"

codeblocks = extract_code(
"""
Example:
``` python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
Test:
``` python
url = "https://en.wikipedia.org/wiki/Web_scraping"
title, text = scrape(url)
print(f"Title: {title}")
print(f"Text: {text}")
```
"""
)
print(codeblocks)
assert len(codeblocks) == 2 and codeblocks[0][0] == "python" and codeblocks[1][0] == "python"

# Check for indented code blocks
codeblocks = extract_code(
"""
Example:
```python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
"""
)
print(codeblocks)
assert len(codeblocks) == 1 and codeblocks[0][0] == "python"

# Check for codeblocks with \r\n
codeblocks = extract_code(
"""
Example:
``` python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
""".replace(
"\n", "\r\n"
)
)
print(codeblocks)
assert len(codeblocks) == 1 and codeblocks[0][0] == "python"

codeblocks = extract_code("no code block")
assert len(codeblocks) == 1 and codeblocks[0] == (UNKNOWN, "no code block")

Expand Down Expand Up @@ -348,7 +418,7 @@ def test_non_dict_in_list(self):

if __name__ == "__main__":
# test_infer_lang()
# test_extract_code()
test_execute_code()
test_extract_code()
# test_execute_code()
# test_find_code()
unittest.main()
# unittest.main()

0 comments on commit d22664f

Please sign in to comment.