monarch-initiative · caufieldjh · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/src/curate_gpt/agents/mapping_agent.py b/src/curate_gpt/agents/mapping_agent.py
@@ -13,6 +13,7 @@
 from pydantic import BaseModel, ConfigDict
 
 from curate_gpt.agents.base_agent import BaseAgent
+from curate_gpt.formatters.format_utils import remove_formatting
 from curate_gpt.store.db_adapter import SEARCH_RESULT
 from curate_gpt.utils.tokens import estimate_num_tokens, max_tokens_by_model
 
@@ -139,10 +140,10 @@ def match(
                     raise ValueError(f"Prompt too long: {prompt}.")
                 kb_results.pop()
         response = model.prompt(prompt)
+
         # Need to remove Markdown formatting here or it won't parse as JSON
-        response_text = response.text()
-        if response_text.startswith("```json"):
-            response_text = response_text[7:-3]
+        response_text = remove_formatting(text=response.text(), expect_format="json")
+
         mappings = []
         try:
             for m in json.loads(response_text):
@@ -160,6 +161,7 @@ def match(
                     )
                 )
         except json.decoder.JSONDecodeError:
+            # This will happen if the response is still not valid JSON
             # This returns an empty set of mappings, but the prompt and response text are retained
             return MappingSet(mappings=mappings, prompt=prompt, response_text=response_text)
 

diff --git a/src/curate_gpt/extract/basic_extractor.py b/src/curate_gpt/extract/basic_extractor.py
@@ -10,6 +10,8 @@
 import yaml
 from pydantic import ConfigDict
 
+from curate_gpt.formatters.format_utils import remove_formatting
+
 from ..utils.tokens import estimate_num_tokens, max_tokens_by_model
 from .extractor import AnnotatedObject, Extractor
 
@@ -87,6 +89,7 @@ def deserialize(self, text: str, format=None, **kwargs) -> AnnotatedObject:
         if format == "yaml":
             return self.deserialize_yaml(text, **kwargs)
         logger.debug(f"Parsing {text}")
+        text = remove_formatting(text=text, expect_format="json")
         try:
             obj = json.loads(text)
             if isinstance(obj, str):
@@ -108,12 +111,7 @@ def deserialize(self, text: str, format=None, **kwargs) -> AnnotatedObject:
 
     def deserialize_yaml(self, text: str, multiple=False) -> AnnotatedObject:
         logger.debug(f"Parsing YAML: {text}")
-        if "```" in text:
-            logger.debug("Removing code block")
-            text = text.split("```")[1]
-            text = text.strip()
-        if text.startswith("yaml"):
-            text = text[4:]
+        text = remove_formatting(text=text, expect_format="yaml")
         try:
             if multiple:
                 obj = yaml.safe_load_all(text)

diff --git a/src/curate_gpt/formatters/format_utils.py b/src/curate_gpt/formatters/format_utils.py
@@ -27,3 +27,16 @@ def object_as_yaml(obj: Dict) -> str:
     :return:
     """
     return yaml.dump({k: v for k, v in obj.items() if v}, sort_keys=False)
+
+
+def remove_formatting(text: str, expect_format: str = "") -> str:
+    """
+    Remove markdown formatting from text if present.
+
+    :param text:
+    :param expect_format: The expected format of the text, e.g., "json" (optional)
+    :return:
+    """
+    if text.startswith("```" + expect_format):
+        return text[3 + len(expect_format) : -3]
+    return text