Skip to content

Commit

Permalink
improved prompt to avoid multiple extractions failing to return correct json objects
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Sep 9, 2024
1 parent c9e53ec commit 3d680df
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def read_git_requirements(file):

setup(
name='thepipe_api',
version='1.3.2',
version='1.3.4',
author='Emmett McFarlane',
author_email='emmett@thepi.pe',
description='AI-native extractor, powered by multimodal LLMs.',
Expand Down
12 changes: 10 additions & 2 deletions thepipe/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import os
from openai import OpenAI

DEFAULT_EXTRACTION_PROMPT = "Extract structured information from the above document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but use your reasoning to always fill in every column as best you can. Always immediately return valid JSON."
DEFAULT_EXTRACTION_PROMPT = "Extract structured information from the given document according to the following schema: {schema}. Immediately return valid JSON formatted data. If there is missing data, you may use null, but use your reasoning to always fill in every column as best you can. Always immediately return valid JSON."
DEFAULT_AI_MODEL = os.getenv("DEFAULT_AI_MODEL", "gpt-4o-mini")

def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]:
Expand Down Expand Up @@ -59,13 +59,21 @@ def extract_from_chunk(chunk: Chunk, chunk_index: int, schema: str, ai_model: st
base_url=os.environ["LLM_SERVER_BASE_URL"],
api_key=os.environ["LLM_SERVER_API_KEY"],
)

corrected_extraction_prompt = extraction_prompt.replace("{schema}", schema)
if multiple_extractions:
corrected_extraction_prompt += """\nIf there are multiple extractions, return each JSON dictionary in a list under the key "extraction". The list should contain each extraction dict (according to the schema) and the entire list should be set to the "extraction" key. Immediately return this extraction JSON object with the "extraction" key mapping to a list containing all the extracted data."""
else:
corrected_extraction_prompt += """\nImmediately return the JSON dictionary."""

messages = [
chunk.to_message(host_images=host_images),
{
"role": "user",
"content": extraction_prompt.replace("{schema}", schema)
"content": corrected_extraction_prompt,
},
]

response = openrouter_client.chat.completions.create(
model=ai_model,
messages=messages,
Expand Down

0 comments on commit 3d680df

Please sign in to comment.