diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..88a388b9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+ - repo: https://github.com/psf/black
+ rev: 24.2.0
+ hooks:
+ - id: black
+
diff --git a/AnthropicEvaluator.py b/AnthropicEvaluator.py
new file mode 100644
index 00000000..bed2297d
--- /dev/null
+++ b/AnthropicEvaluator.py
@@ -0,0 +1,86 @@
+import os
+from LLMNeedleHaystackTester import LLMNeedleHaystackTester
+from anthropic import AsyncAnthropic, Anthropic
+
+
+class AnthropicEvaluator(LLMNeedleHaystackTester):
+ def __init__(self, **kwargs):
+ if "anthropic_api_key" not in kwargs and not os.getenv("ANTHROPIC_API_KEY"):
+ raise ValueError(
+ "Either anthropic_api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env"
+ )
+
+ if "model_name" not in kwargs:
+ raise ValueError("model_name must be supplied with init")
+ elif "claude" not in kwargs["model_name"]:
+ raise ValueError(
+ "If the model provider is 'Anthropic', the model name must include 'claude'. "
+ "See https://docs.anthropic.com/claude/reference/selecting-a-model for more details on Anthropic models"
+ )
+
+ if "evaluation_method" not in kwargs:
+ print(
+ "since evaluation method is not specified , default method substring_match will be used for evaluation"
+ )
+ elif kwargs["evaluation_method"] not in ("gpt4", "substring_match"):
+ raise ValueError("evaluation_method must be 'substring_match' or 'gpt4'")
+ elif (
+ kwargs["evaluation_method"] == "gpt4"
+ and "openai_api_key" not in kwargs
+ and not os.getenv("OPENAI_API_KEY")
+ ):
+ raise ValueError(
+ "if evaluation_method is gpt4 , openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env"
+ )
+ else:
+ # Pop the key so it is not forwarded to LLMNeedleHaystackTester.__init__, which does not accept it
+ self.openai_api_key = kwargs.pop(
+ "openai_api_key", os.getenv("OPENAI_API_KEY")
+ )
+
+ self.anthropic_api_key = kwargs.pop(
+ "anthropic_api_key", os.getenv("ANTHROPIC_API_KEY")
+ )
+ self.model_name = kwargs["model_name"]
+ self.model_to_test_description = kwargs.pop("model_name")
+ self.model_to_test = AsyncAnthropic(api_key=self.anthropic_api_key)
+ self.tokenizer = Anthropic().get_tokenizer()
+
+ super().__init__(**kwargs)
+
+ def get_encoding(self, context):
+ return self.tokenizer.encode(context).ids
+
+ def get_decoding(self, encoded_context):
+ return self.tokenizer.decode(encoded_context)
+
+ def get_prompt(self, context):
+ return [
+ {
+ "role": "user",
+ "content": f"{context}\n\n {self.retrieval_question} Don't give information outside the document or repeat your findings",
+ },
+ {
+ "role": "assistant",
+ "content": "Here is the most relevant sentence in the context:",
+ },
+ ]
+
+ async def get_response_from_model(self, prompt):
+ response = await self.model_to_test.messages.create(
+ model=self.model_name,
+ messages=prompt,
+ system="You are a helpful AI bot that answers questions for a user. Keep your response short and direct",
+ max_tokens=300,
+ temperature=0,
+ )
+ return response.content[0].text
+
+
+if __name__ == "__main__":
+ # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
+ ht = AnthropicEvaluator(
+ model_name="claude-2.1", evaluation_method="substring_match"
+ )
+
+ ht.start_test()
diff --git a/Anthropic_prompt.txt b/Anthropic_prompt.txt
deleted file mode 100644
index cee594a6..00000000
--- a/Anthropic_prompt.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-You are a helpful AI bot that answers questions for a user. Keep your response short and direct
-
-Human:
-{context}
-
-
-{retrieval_question} Don't give information outside the document or repeat your findings
-
-Assistant: Here is the most relevant sentence in the context:
\ No newline at end of file
diff --git a/LLMNeedleHaystackTester.py b/LLMNeedleHaystackTester.py
index ef6cfc7b..16bba6d1 100644
--- a/LLMNeedleHaystackTester.py
+++ b/LLMNeedleHaystackTester.py
@@ -1,53 +1,56 @@
from dotenv import load_dotenv
-import os
-import tiktoken
+from pathlib import Path
import glob
import json
from langchain.evaluation import load_evaluator
from langchain.chat_models import ChatOpenAI
-from anthropic import AsyncAnthropic, Anthropic
-from dotenv import load_dotenv
+
import numpy as np
-from openai import AsyncOpenAI
+
import asyncio
from asyncio import Semaphore
from datetime import datetime, timezone
import time
+from abc import ABC, abstractmethod
+
load_dotenv()
-class LLMNeedleHaystackTester:
+
+class LLMNeedleHaystackTester(ABC):
"""
This class is used to test the LLM Needle Haystack.
"""
- def __init__(self,
- needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
- haystack_dir="PaulGrahamEssays",
- retrieval_question="What is the best thing to do in San Francisco?",
- results_version = 1,
- context_lengths_min = 1000,
- context_lengths_max = 200000,
- context_lengths_num_intervals = 35,
- context_lengths = None,
- document_depth_percent_min = 0,
- document_depth_percent_max = 100,
- document_depth_percent_intervals = 35,
- document_depth_percents = None,
- document_depth_percent_interval_type = "linear",
- model_provider = "OpenAI",
- openai_api_key=None,
- anthropic_api_key = None,
- model_name='gpt-4-1106-preview',
- num_concurrent_requests = 1,
- save_results = True,
- save_contexts = True,
- final_context_length_buffer = 200,
- seconds_to_sleep_between_completions = None,
- print_ongoing_status = True):
- """
+
+ def __init__(
+ self,
+ needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
+ haystack_dir="PaulGrahamEssays",
+ retrieval_question="What is the best thing to do in San Francisco?",
+ substr_validation_words=["dolores", "sandwich"],
+ results_version=1,
+ context_lengths_min=1000,
+ context_lengths_max=200000,
+ context_lengths_num_intervals=35,
+ context_lengths=None,
+ document_depth_percent_min=0,
+ document_depth_percent_max=100,
+ document_depth_percent_intervals=35,
+ document_depth_percents=None,
+ document_depth_percent_interval_type="linear",
+ num_concurrent_requests=1,
+ save_results=True,
+ save_contexts=True,
+ final_context_length_buffer=200,
+ seconds_to_sleep_between_completions=None,
+ print_ongoing_status=True,
+ evaluation_method="gpt4",
+ ):
+ """
:param needle: The needle to be found in the haystack. Default is None.
:param haystack_dir: The directory of text files to use as background context (or a haystack) in which the needle is to be found. Default is Paul Graham Essays.
:param retrieval_question: The question which with to prompt the model to do the retrieval.
+ :param substr_validation_words: If you choose substring evaluation of the LLM response, the presence of these keywords is checked to determine whether the response is correct
:param results_version: In case you would like to try the same combination of model, context length, and depth % multiple times, change the results version other than 1
:param num_concurrent_requests: Due to volume, this object is set up to run concurrent requests, default = 1. Be careful of rate limits.
:param save_results: Whether or not you would like to save your contexts to file. Warning: These will get long! Default = True
@@ -62,19 +65,14 @@ def __init__(self,
:param document_depth_percent_intervals: The number of intervals for the document depth percent. Default is 35.
:param document_depth_percents: The depth percentages of the document. Default is None.
:param document_depth_percent_interval_type: The type of interval for the document depth percent. Must be either 'linear' or 'sigmoid'. Default is 'linear'.
- :param model_provider: The provider of the model. Must be either 'OpenAI' or 'Anthropic'. Default is 'OpenAI'.
- :param openai_api_key: The API key for OpenAI. Default is None.
- :param anthropic_api_key: The API key for Anthropic. Default is None.
- :param model_name: The name of the model. Default is 'gpt-4-1106-preview'.
:param seconds_to_sleep_between_completions: The number of seconds to sleep between completions. Default is None.
:param print_ongoing_status: Whether or not to print the ongoing status. Default is True.
+ :param evaluation_method: Choose between 'gpt4' (scores the response 1, 3, 5, 7, or 10) and simple substring matching ('substring_match'). Default is 'gpt4'
"""
- if not needle or not haystack_dir or not retrieval_question:
- raise ValueError("Needle, haystack, and retrieval_question must be provided.")
-
self.needle = needle
self.haystack_dir = haystack_dir
self.retrieval_question = retrieval_question
+ self.substr_validation_words = substr_validation_words
self.results_version = results_version
self.num_concurrent_requests = num_concurrent_requests
self.save_results = save_results
@@ -82,73 +80,87 @@ def __init__(self,
self.save_contexts = save_contexts
self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions
self.print_ongoing_status = print_ongoing_status
- self.model_provider = model_provider
self.testing_results = []
+ self.evaluation_method = evaluation_method
if context_lengths is None:
- if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None:
- raise ValueError("Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied.")
+ if (
+ context_lengths_min is None
+ or context_lengths_max is None
+ or context_lengths_num_intervals is None
+ ):
+ raise ValueError(
+ "Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied."
+ )
else:
- self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int)
+ self.context_lengths = np.round(
+ np.linspace(
+ context_lengths_min,
+ context_lengths_max,
+ num=context_lengths_num_intervals,
+ endpoint=True,
+ )
+ ).astype(int)
else:
self.context_lengths = context_lengths
if document_depth_percents is None:
- if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None:
- raise ValueError("Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied.")
+ if (
+ document_depth_percent_min is None
+ or document_depth_percent_max is None
+ or document_depth_percent_intervals is None
+ ):
+ raise ValueError(
+ "Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied."
+ )
else:
- if document_depth_percent_interval_type == 'linear':
- self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int)
- elif document_depth_percent_interval_type == 'sigmoid':
- self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)]
+ if document_depth_percent_interval_type == "linear":
+ self.document_depth_percents = np.round(
+ np.linspace(
+ document_depth_percent_min,
+ document_depth_percent_max,
+ num=document_depth_percent_intervals,
+ endpoint=True,
+ )
+ ).astype(int)
+ elif document_depth_percent_interval_type == "sigmoid":
+ self.document_depth_percents = [
+ self.logistic(x)
+ for x in np.linspace(
+ document_depth_percent_min,
+ document_depth_percent_max,
+ document_depth_percent_intervals,
+ )
+ ]
else:
self.document_depth_percents = document_depth_percents
if document_depth_percent_interval_type not in [None, "linear", "sigmoid"]:
- raise ValueError("document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution give a list of ints in via document_depth_percent_intervals")
-
- if model_provider not in ["OpenAI", "Anthropic"]:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
- if model_provider == "Anthropic" and "claude" not in model_name:
- raise ValueError("If the model provider is 'Anthropic', the model name must include 'claude'. See https://docs.anthropic.com/claude/reference/selecting-a-model for more details on Anthropic models")
-
- self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
- self.model_name = model_name
-
- if not self.openai_api_key and not os.getenv('OPENAI_API_KEY'):
- raise ValueError("Either openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env. Used for evaluation model")
- else:
- self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
-
- self.anthropic_api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY')
+ raise ValueError(
+ "document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution give a list of ints in via document_depth_percent_intervals"
+ )
- if self.model_provider == "Anthropic":
- if not self.anthropic_api_key and not os.getenv('ANTHROPIC_API_KEY'):
- raise ValueError("Either anthropic_api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env.")
- else:
- self.anthropic_api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY')
-
- if not self.model_name:
- raise ValueError("model_name must be provided.")
-
- if model_provider == "OpenAI":
- self.model_to_test = AsyncOpenAI(api_key=self.openai_api_key)
- self.enc = tiktoken.encoding_for_model(self.model_name)
- elif model_provider == "Anthropic":
- self.model_to_test = AsyncAnthropic(api_key=self.anthropic_api_key)
- self.enc = Anthropic().get_tokenizer()
-
- self.model_to_test_description = model_name
- self.evaluation_model = ChatOpenAI(model="gpt-4", temperature=0, openai_api_key = self.openai_api_key)
-
- def logistic(self, x, L=100, x0=50, k=.1):
+ if evaluation_method == "gpt4":
+ self.evaluation_model = ChatOpenAI(
+ model="gpt-4", temperature=0, openai_api_key=self.openai_api_key
+ )
+
+ if evaluation_method == "substring_match" and not all(
+ word.lower() in needle.lower() for word in substr_validation_words
+ ):
+ raise ValueError(
+ "You choose substring evaluation method but some of the words in substr_validation_words is not in the needle you provided"
+ f"\n\nneedle: {needle}"
+ f"\nsubstr_validation_words: {substr_validation_words}"
+ )
+
+ def logistic(self, x, L=100, x0=50, k=0.1):
if x == 0:
return 0
if x == 100:
return 100
return np.round(L / (1 + np.exp(-k * (x - x0))), 3)
-
+
async def bound_evaluate_and_log(self, sem, *args):
async with sem:
await self.evaluate_and_log(*args)
@@ -166,28 +178,13 @@ async def run_test(self):
# Wait for all tasks to complete
await asyncio.gather(*tasks)
- def generate_prompt(self, context):
- if self.model_provider == "Anthropic":
- with open('Anthropic_prompt.txt', 'r') as file:
- prompt = file.read()
- return prompt.format(retrieval_question=self.retrieval_question, context=context)
- elif self.model_provider == "OpenAI":
- # Generate the prompt for the Anthropic model
- # Replace the following line with the appropriate prompt structure
- return [
- {
- "role": "system",
- "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
- },
- {
- "role": "user",
- "content": context
- },
- {
- "role": "user",
- "content": f"{self.retrieval_question} Don't give information outside the document or repeat your findings"
- }
- ]
+ @abstractmethod
+ def get_prompt(self, context):
+ pass
+
+ @abstractmethod
+ async def get_response_from_model(self, prompt):
+ pass
async def evaluate_and_log(self, context_length, depth_percent):
# Checks to see if you've already checked a length/percent/version.
@@ -200,27 +197,12 @@ async def evaluate_and_log(self, context_length, depth_percent):
context = await self.generate_context(context_length, depth_percent)
# Prepare your message to send to the model you're going to evaluate
- prompt = self.generate_prompt(context)
+ prompt = self.get_prompt(context)
test_start_time = time.time()
# Go see if the model can answer the question to pull out your random fact
- if self.model_provider == "OpenAI":
- response = await self.model_to_test.chat.completions.create(
- model=self.model_name,
- messages=prompt,
- max_tokens=300,
- temperature=0
- )
- response = response.choices[0].message.content
- elif self.model_provider == "Anthropic":
- response = await self.model_to_test.completions.create(
- model=self.model_name,
- max_tokens_to_sample=300,
- prompt=prompt,
- temperature=0
- )
- response = response.completion
+ response = await self.get_response_from_model(prompt)
test_end_time = time.time()
test_elapsed_time = test_end_time - test_start_time
@@ -228,50 +210,53 @@ async def evaluate_and_log(self, context_length, depth_percent):
# Compare the response to the actual needle you placed
score = self.evaluate_response(response)
-
results = {
# 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
- 'model' : self.model_to_test_description,
- 'context_length' : int(context_length),
- 'depth_percent' : float(depth_percent),
- 'version' : self.results_version,
- 'needle' : self.needle,
- 'model_response' : response,
- 'score' : score,
- 'test_duration_seconds' : test_elapsed_time,
- 'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
+ "model": self.model_to_test_description,
+ "context_length": int(context_length),
+ "depth_percent": float(depth_percent),
+ "version": self.results_version,
+ "needle": self.needle,
+ "model_response": response,
+ "score": score,
+ "test_duration_seconds": test_elapsed_time,
+ "test_timestamp_utc": datetime.now(timezone.utc).strftime(
+ "%Y-%m-%d %H:%M:%S%z"
+ ),
}
self.testing_results.append(results)
if self.print_ongoing_status:
- print (f"-- Test Summary -- ")
- print (f"Duration: {test_elapsed_time:.1f} seconds")
- print (f"Context: {context_length} tokens")
- print (f"Depth: {depth_percent}%")
- print (f"Score: {score}")
- print (f"Response: {response}\n")
+ print("-- Test Summary -- ")
+ print(f"Duration: {test_elapsed_time:.1f} seconds")
+ print(f"Context: {context_length} tokens")
+ print(f"Depth: {depth_percent}%")
+ print(f"Score: {score}")
+ print(f"Response: {response}\n")
- context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
+ context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent * 100)}'
if self.save_contexts:
- results['file_name'] : context_file_location
+ results["file_name"] = context_file_location
# Save the context to file for retesting
- if not os.path.exists('contexts'):
- os.makedirs('contexts')
+ contexts_dir = Path("contexts")
+ contexts_dir.mkdir(parents=True, exist_ok=True)
+
+ context_file_path = contexts_dir / f"{context_file_location}_context.txt"
+ context_file_path.write_text(context)
- with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
- f.write(context)
-
if self.save_results:
- # Save the context to file for retesting
- if not os.path.exists('results'):
- os.makedirs('results')
+ # Ensure the 'results' directory exists
+ results_dir = Path("results")
+ results_dir.mkdir(parents=True, exist_ok=True)
- # Save the result to file for retesting
- with open(f'results/{context_file_location}_results.json', 'w') as f:
- json.dump(results, f)
+ # Define the file path for the results file
+ results_file_path = results_dir / f"{context_file_location}_results.json"
+
+ # Serialize the results dictionary to a JSON formatted string and write to the file
+ results_file_path.write_text(json.dumps(results))
if self.seconds_to_sleep_between_completions:
await asyncio.sleep(self.seconds_to_sleep_between_completions)
@@ -281,20 +266,24 @@ def result_exists(self, context_length, depth_percent):
Checks to see if a result has already been evaluated or not
"""
- results_dir = 'results/'
- if not os.path.exists(results_dir):
+ results_dir = Path("results")
+ if not results_dir.exists():
return False
-
- for filename in os.listdir(results_dir):
- if filename.endswith('.json'):
- with open(os.path.join(results_dir, filename), 'r') as f:
- result = json.load(f)
- context_length_met = result['context_length'] == context_length
- depth_percent_met = result['depth_percent'] == depth_percent
- version_met = result.get('version', 1) == self.results_version
- model_met = result['model'] == self.model_name
- if context_length_met and depth_percent_met and version_met and model_met:
- return True
+
+ for filepath in results_dir.glob("*.json"):
+ with filepath.open("r") as f:
+ result = json.load(f)
+ context_length_met = result["context_length"] == context_length
+ depth_percent_met = result["depth_percent"] == depth_percent
+ version_met = result.get("version", 1) == self.results_version
+ model_met = result["model"] == self.model_name
+ if (
+ context_length_met
+ and depth_percent_met
+ and version_met
+ and model_met
+ ):
+ return True
return False
async def generate_context(self, context_length, depth_percent):
@@ -310,26 +299,17 @@ async def generate_context(self, context_length, depth_percent):
context = self.insert_needle(context, depth_percent, context_length)
return context
-
- def encode_text_to_tokens(self, text):
- if self.model_provider == "OpenAI":
- return self.enc.encode(text)
- elif self.model_provider == "Anthropic":
- # Assuming you have a different encoder for Anthropic
- return self.enc.encode(text).ids
- else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
+
def insert_needle(self, context, depth_percent, context_length):
- tokens_needle = self.encode_text_to_tokens(self.needle)
- tokens_context = self.encode_text_to_tokens(context)
+ tokens_needle = self.get_encoding(self.needle)
+ tokens_context = self.get_encoding(context)
# Reducing the context length by 150 buffer. This is to account for system message, the user question, and response.
context_length -= self.final_context_length_buffer
# If your context + needle are longer than the context length (which it will be), then reduce tokens from the context by the needle length
if len(tokens_context) + len(tokens_needle) > context_length:
- tokens_context = tokens_context[:context_length - len(tokens_needle)]
+ tokens_context = tokens_context[: context_length - len(tokens_needle)]
if depth_percent == 100:
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
@@ -342,8 +322,8 @@ def insert_needle(self, context, depth_percent, context_length):
tokens_new_context = tokens_context[:insertion_point]
# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
- period_tokens = self.encode_text_to_tokens('.')
-
+ period_tokens = self.get_encoding(".")
+
# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
@@ -354,10 +334,16 @@ def insert_needle(self, context, depth_percent, context_length):
tokens_new_context += tokens_needle + tokens_context[insertion_point:]
# Convert back to a string and return it
- new_context = self.decode_tokens(tokens_new_context)
+ new_context = self.get_decoding(tokens_new_context)
return new_context
def evaluate_response(self, response):
+ if self.evaluation_method == "gpt4":
+ return self.evaluate_response_gpt4(response)
+ else:
+ return self.evaluate_response_substring_match(response)
+
+ def evaluate_response_gpt4(self, response):
accuracy_criteria = {
"accuracy": """
Score 1: The answer is completely unrelated to the reference.
@@ -379,24 +365,31 @@ def evaluate_response(self, response):
eval_result = evaluator.evaluate_strings(
# The models response
prediction=response,
-
# The actual answer
reference=self.needle,
-
# The question asked
input=self.retrieval_question,
)
- return int(eval_result['score'])
+ return int(eval_result["score"])
- def get_context_length_in_tokens(self, context):
- if self.model_provider == "OpenAI":
- return len(self.enc.encode(context))
- elif self.model_provider == "Anthropic":
- # Assuming you have a different encoder for Anthropic
- return len(self.enc.encode(context).ids)
+ def evaluate_response_substring_match(self, response):
+ response_lower = response.lower()
+ if all(word in response_lower for word in self.substr_validation_words):
+ return 1
else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
+ return 0
+
+ @abstractmethod
+ def get_encoding(self, context):
+ pass
+
+ @abstractmethod
+ def get_decoding(self, encoded_context):
+ pass
+
+ def get_context_length_in_tokens(self, context):
+ return len(self.get_encoding(context))
def read_context_files(self):
context = ""
@@ -404,53 +397,38 @@ def read_context_files(self):
while self.get_context_length_in_tokens(context) < max_context_length:
for file in glob.glob(f"{self.haystack_dir}/*.txt"):
- with open(file, 'r') as f:
+ with open(file, "r") as f:
context += f.read()
return context
- def get_tokens_from_context(self, context):
- if self.model_provider == "OpenAI":
- return self.enc.encode(context)
- elif self.model_provider == "Anthropic":
- # Assuming you have a different encoder for Anthropic
- return self.enc.encode(context).ids
- else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
- def decode_tokens(self, tokens, context_length=None):
- if self.model_provider == "OpenAI":
- return self.enc.decode(tokens[:context_length])
- elif self.model_provider == "Anthropic":
- # Assuming you have a different decoder for Anthropic
- return self.enc.decode(tokens[:context_length])
- else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
def encode_and_trim(self, context, context_length):
- tokens = self.get_tokens_from_context(context)
- if len(tokens) > context_length:
- context = self.decode_tokens(tokens, context_length)
- return context
-
+ encoded_context = self.get_encoding(context)
+ return self.get_decoding(encoded_context[:context_length])
+
def get_results(self):
return self.testing_results
-
+
def print_start_test_summary(self):
- print ("\n")
- print ("Starting Needle In A Haystack Testing...")
- print (f"- Model: {self.model_name}")
- print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
- print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
- print (f"- Needle: {self.needle.strip()}")
- print ("\n\n")
+ print("\n")
+ print("Starting Needle In A Haystack Testing...")
+ print(f"- Model: {self.model_name}")
+ print(
+ f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}"
+ )
+ print(
+ f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%"
+ )
+ print(f"- Needle: {self.needle.strip()}")
+ print("\n\n")
def start_test(self):
if self.print_ongoing_status:
self.print_start_test_summary()
asyncio.run(self.run_test())
+
if __name__ == "__main__":
# Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
ht = LLMNeedleHaystackTester()
- ht.start_test()
\ No newline at end of file
+ ht.start_test()
diff --git a/OpenAIEvaluator.py b/OpenAIEvaluator.py
new file mode 100644
index 00000000..b0eef38e
--- /dev/null
+++ b/OpenAIEvaluator.py
@@ -0,0 +1,65 @@
+import os
+import tiktoken
+from LLMNeedleHaystackTester import LLMNeedleHaystackTester
+from openai import AsyncOpenAI
+
+
+class OpenAIEvaluator(LLMNeedleHaystackTester):
+ def __init__(self, **kwargs):
+ if "openai_api_key" not in kwargs and not os.getenv("OPENAI_API_KEY"):
+ raise ValueError(
+ "Either openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env"
+ )
+
+ if "model_name" not in kwargs:
+ raise ValueError(
+ "model_name must be supplied with init, accepted model_names are 'gpt-4-1106-preview'"
+ )
+ elif kwargs["model_name"] not in ["gpt-4-1106-preview"]:
+ raise ValueError("Model name must be in this list (gpt-4-1106-preview)")
+
+ if "evaluation_method" not in kwargs:
+ print(
+ "since evaluation method is not specified , 'gpt4' will be used for evaluation"
+ )
+ elif kwargs["evaluation_method"] not in ("gpt4", "substring_match"):
+ raise ValueError("evaluation_method must be 'substring_match' or 'gpt4'")
+
+ self.openai_api_key = kwargs.pop("openai_api_key", os.getenv("OPENAI_API_KEY"))
+ self.model_name = kwargs["model_name"]
+ self.model_to_test_description = kwargs.pop("model_name")
+ self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+ self.model_to_test = AsyncOpenAI(api_key=self.openai_api_key)
+
+ super().__init__(**kwargs)
+
+ def get_encoding(self, context):
+ return self.tokenizer.encode(context)
+
+ def get_decoding(self, encoded_context):
+ return self.tokenizer.decode(encoded_context)
+
+ def get_prompt(self, context):
+ return [
+ {
+ "role": "system",
+ "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct",
+ },
+ {"role": "user", "content": context},
+ {
+ "role": "user",
+ "content": f"{self.retrieval_question} Don't give information outside the document or repeat your findings",
+ },
+ ]
+
+ async def get_response_from_model(self, prompt):
+ response = await self.model_to_test.chat.completions.create(
+ model=self.model_name, messages=prompt, max_tokens=300, temperature=0
+ )
+ return response.choices[0].message.content
+
+
+if __name__ == "__main__":
+ # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
+ ht = OpenAIEvaluator(model_name="gpt-4-1106-preview", evaluation_method="gpt4")
+ ht.start_test()
diff --git a/README.md b/README.md
index 21c4e609..96ae4b85 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,24 @@ A simple 'needle in a haystack' analysis to test in-context retrieval ability of
Get the behind the scenes on the [overview video](https://youtu.be/KwRRuiCCdmc).
-![GPT-4-128 Context Testing](img/NeedleHaystackCodeSnippet.png)
```
-git clone https://github.com/gkamradt/LLMTest_NeedleInAHaystack.git
+$ git clone https://github.com/prabha-git/LLMTest_NeedleInAHaystack.git
+$ cd LLMTest_NeedleInAHaystack
+$ python -m venv venv
+$ source venv/bin/activate
+$ pip install -r requirements.txt
+$ export OPENAI_API_KEY=<>
+$ python
+
+>>> from OpenAIEvaluator import OpenAIEvaluator
+>>> openai_ht = OpenAIEvaluator(model_name='gpt-4-1106-preview', evaluation_method='gpt4')
+>>> openai_ht.start_test()
+
+Starting Needle In A Haystack Testing...
+- Model: gpt-4-1106-preview
+- Context Lengths: 35, Min: 1000, Max: 200000
+- Document Depths: 35, Min: 0%, Max: 100%
+- Needle: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.
```
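+
+To run the same test against an Anthropic model instead, a sketch (assuming `ANTHROPIC_API_KEY` is set in your environment; see `AnthropicEvaluator.py`) looks like this:
+```
+>>> from AnthropicEvaluator import AnthropicEvaluator
+>>> anthropic_ht = AnthropicEvaluator(model_name='claude-2.1', evaluation_method='substring_match')
+>>> anthropic_ht.start_test()
+```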
## The Test
@@ -35,7 +49,6 @@ The key parameters:
* `document_depth_percent_max` - The ending point of your document depths. Should be int < 100
* `document_depth_percent_intervals` - The number of iterations to do between your min/max points
* `document_depth_percent_interval_type` - Determines the distribution of depths to iterate over. 'linear' or 'sigmoid
-* `model_provider` - 'OpenAI' or 'Anthropic'
* `model_name` - The name of the model you'd like to test. Should match the exact value which needs to be passed to the api. Ex: `gpt-4-1106-preview`
* `save_results` - Whether or not you'd like to save your results to file. They will be temporarily saved in the object regardless. True/False
* `save_contexts` - Whether or not you'd like to save your contexts to file. **Warning** these will get very long. True/False
@@ -49,6 +62,24 @@ Other Parameters:
* `final_context_length_buffer` - The amount of context to take off each input to account for system messages and output tokens. This can be more intelligent but using a static value for now. Default 200 tokens.
* `seconds_to_sleep_between_completions` - Default: None, set # of seconds if you'd like to slow down your requests
* `print_ongoing_status` - Default: True, whether or not to print the status of test as they complete
+* `evaluation_method` - Default: `gpt4`. Choose between `gpt4` and simple substring matching (`substring_match`) to evaluate the model's response
+* `substr_validation_words` - Default: `['dolores', 'sandwich']`. If you choose substring evaluation of the LLM response, the presence of these keywords is checked to determine whether the response is correct
+
+
+
+#### Note on Evaluation Method (`evaluation_method`):
+
+There are two options for evaluation: `gpt4` and `substring_match`.
+
+- `gpt4`: This is the default. It uses the GPT-4 model to assess responses on a scale from 1 to 10. This method is particularly effective when dealing with a broad topic (a "large needle"), where a few hardcoded keywords may not be enough to judge the accuracy and relevance of the response.
+```
+ - Score 1: The response is completely unrelated to the reference.
+ - Score 3: The response has some relevance but does not fully align with the reference.
+ - Score 5: The response is moderately relevant but includes inaccuracies.
+ - Score 7: The response aligns well with the reference but has minor omissions.
+ - Score 10: The response is entirely accurate and aligns perfectly with the reference.
+```
+- `substring_match`: This approach is suitable for "small needles", where a predefined list of keywords can effectively determine whether the response retrieves the essential information. It provides a binary score of either 0 or 1. Opting for this method can also reduce GPT-4 API evaluation costs.
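+
+For example, a substring-match run against the default needle might look like the sketch below (if you supply your own needle, every word in `substr_validation_words` must also appear in that needle):
+```
+>>> from OpenAIEvaluator import OpenAIEvaluator
+>>> ht = OpenAIEvaluator(model_name='gpt-4-1106-preview', evaluation_method='substring_match', substr_validation_words=['dolores', 'sandwich'])
+>>> ht.start_test()
+```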
## Results Visualization
`LLMNeedleInHaystackVisualization.ipynb` holds the code to make the pivot table visualization. The pivot table was then transferred to Google Slides for custom annotations and formatting. See the [google slides version](https://docs.google.com/presentation/d/15JEdEBjm32qBbqeYM6DK6G-3mUJd7FAJu-qEzj8IYLQ/edit?usp=sharing). See an overview of how this viz was created [here](https://twitter.com/GregKamradt/status/1729573848893579488).
diff --git a/requirements.txt b/requirements.txt
index 309c34c1..f9255207 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
-anthropic==0.7.5
+anthropic==0.16.0
anyio==3.7.1
attrs==23.1.0
certifi==2023.11.17
@@ -27,6 +27,7 @@ mypy-extensions==1.0.0
numpy==1.26.2
openai==1.3.5
packaging==23.2
+pre-commit==3.6.2
pydantic==2.5.2
pydantic_core==2.14.5
python-dotenv==1.0.0