diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..88a388b9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 24.2.0
+    hooks:
+      - id: black
+
diff --git a/AnthropicEvaluator.py b/AnthropicEvaluator.py
new file mode 100644
index 00000000..bed2297d
--- /dev/null
+++ b/AnthropicEvaluator.py
@@ -0,0 +1,88 @@
+import os
+from LLMNeedleHaystackTester import LLMNeedleHaystackTester
+from anthropic import AsyncAnthropic, Anthropic
+
+
+class AnthropicEvaluator(LLMNeedleHaystackTester):
+    def __init__(self, **kwargs):
+        if "anthropic_api_key" not in kwargs and not os.getenv("ANTHROPIC_API_KEY"):
+            raise ValueError(
+                "Either anthropic_api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env"
+            )
+
+        if "model_name" not in kwargs:
+            raise ValueError("model_name must be supplied with init")
+        elif "claude" not in kwargs["model_name"]:
+            raise ValueError(
+                "If the model provider is 'Anthropic', the model name must include 'claude'. "
+                "See https://docs.anthropic.com/claude/reference/selecting-a-model for more details on Anthropic models"
+            )
+
+        if "evaluation_method" not in kwargs:
+            print(
+                "evaluation_method was not specified; the default method substring_match will be used for evaluation"
+            )
+            # Keep the behaviour consistent with the message above; the base class
+            # would otherwise default to the gpt4 evaluator.
+            kwargs["evaluation_method"] = "substring_match"
+        elif kwargs["evaluation_method"] not in ("gpt4", "substring_match"):
+            raise ValueError("evaluation_method must be 'substring_match' or 'gpt4'")
+        elif (
+            kwargs["evaluation_method"] == "gpt4"
+            and "openai_api_key" not in kwargs
+            and not os.getenv("OPENAI_API_KEY")
+        ):
+            raise ValueError(
+                "If evaluation_method is 'gpt4', openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env"
+            )
+        else:
+            self.openai_api_key = kwargs.get(
+                "openai_api_key", os.getenv("OPENAI_API_KEY")
+            )
+
+        self.anthropic_api_key = kwargs.pop(
+            "anthropic_api_key", os.getenv("ANTHROPIC_API_KEY")
+        )
+        self.model_name = kwargs["model_name"]
+        self.model_to_test_description = kwargs.pop("model_name")
+        self.model_to_test = AsyncAnthropic(api_key=self.anthropic_api_key)
+        self.tokenizer = Anthropic().get_tokenizer()
+
+        super().__init__(**kwargs)
+
+    def get_encoding(self, context):
+        return self.tokenizer.encode(context).ids
+
+    def get_decoding(self, encoded_context):
+        return self.tokenizer.decode(encoded_context)
+
+    def get_prompt(self, context):
+        return [
+            {
+                "role": "user",
+                "content": f"{context}\n\n {self.retrieval_question} Don't give information outside the document or repeat your findings",
+            },
+            {
+                "role": "assistant",
+                "content": "Here is the most relevant sentence in the context:",
+            },
+        ]
+
+    async def get_response_from_model(self, prompt):
+        response = await self.model_to_test.messages.create(
+            model=self.model_name,
+            messages=prompt,
+            system="You are a helpful AI bot that answers questions for a user. Keep your response short and direct",
+            max_tokens=300,
+            temperature=0,
+        )
+        return response.content[0].text
+
+
+if __name__ == "__main__":
+    # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
+    ht = AnthropicEvaluator(
+        model_name="claude-2.1", evaluation_method="substring_match"
+    )
+
+    ht.start_test()
diff --git a/Anthropic_prompt.txt b/Anthropic_prompt.txt
deleted file mode 100644
index cee594a6..00000000
--- a/Anthropic_prompt.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-You are a helpful AI bot that answers questions for a user. 
Keep your response short and direct - -Human: -{context} - - -{retrieval_question} Don't give information outside the document or repeat your findings - -Assistant: Here is the most relevant sentence in the context: \ No newline at end of file diff --git a/LLMNeedleHaystackTester.py b/LLMNeedleHaystackTester.py index ef6cfc7b..16bba6d1 100644 --- a/LLMNeedleHaystackTester.py +++ b/LLMNeedleHaystackTester.py @@ -1,53 +1,56 @@ from dotenv import load_dotenv -import os -import tiktoken +from pathlib import Path import glob import json from langchain.evaluation import load_evaluator from langchain.chat_models import ChatOpenAI -from anthropic import AsyncAnthropic, Anthropic -from dotenv import load_dotenv + import numpy as np -from openai import AsyncOpenAI + import asyncio from asyncio import Semaphore from datetime import datetime, timezone import time +from abc import ABC, abstractmethod + load_dotenv() -class LLMNeedleHaystackTester: + +class LLMNeedleHaystackTester(ABC): """ This class is used to test the LLM Needle Haystack. """ - def __init__(self, - needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n", - haystack_dir="PaulGrahamEssays", - retrieval_question="What is the best thing to do in San Francisco?", - results_version = 1, - context_lengths_min = 1000, - context_lengths_max = 200000, - context_lengths_num_intervals = 35, - context_lengths = None, - document_depth_percent_min = 0, - document_depth_percent_max = 100, - document_depth_percent_intervals = 35, - document_depth_percents = None, - document_depth_percent_interval_type = "linear", - model_provider = "OpenAI", - openai_api_key=None, - anthropic_api_key = None, - model_name='gpt-4-1106-preview', - num_concurrent_requests = 1, - save_results = True, - save_contexts = True, - final_context_length_buffer = 200, - seconds_to_sleep_between_completions = None, - print_ongoing_status = True): - """ + + def __init__( + self, + needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n", + haystack_dir="PaulGrahamEssays", + retrieval_question="What is the best thing to do in San Francisco?", + substr_validation_words=["dolores", "sandwich"], + results_version=1, + context_lengths_min=1000, + context_lengths_max=200000, + context_lengths_num_intervals=35, + context_lengths=None, + document_depth_percent_min=0, + document_depth_percent_max=100, + document_depth_percent_intervals=35, + document_depth_percents=None, + document_depth_percent_interval_type="linear", + num_concurrent_requests=1, + save_results=True, + save_contexts=True, + final_context_length_buffer=200, + seconds_to_sleep_between_completions=None, + print_ongoing_status=True, + evaluation_method="gpt4", + ): + """ :param needle: The needle to be found in the haystack. Default is None. :param haystack_dir: The directory of text files to use as background context (or a haystack) in which the needle is to be found. Default is Paul Graham Essays. :param retrieval_question: The question which with to prompt the model to do the retrieval. 
+ :param substr_validation_words: If you choose substring evaluation of LLM response, presence of these list of keywords are verified to determine if the LLM respone is correct or not :param results_version: In case you would like to try the same combination of model, context length, and depth % multiple times, change the results version other than 1 :param num_concurrent_requests: Due to volume, this object is set up to run concurrent requests, default = 1. Be careful of rate limits. :param save_results: Whether or not you would like to save your contexts to file. Warning: These will get long! Default = True @@ -62,19 +65,14 @@ def __init__(self, :param document_depth_percent_intervals: The number of intervals for the document depth percent. Default is 35. :param document_depth_percents: The depth percentages of the document. Default is None. :param document_depth_percent_interval_type: The type of interval for the document depth percent. Must be either 'linear' or 'sigmoid'. Default is 'linear'. - :param model_provider: The provider of the model. Must be either 'OpenAI' or 'Anthropic'. Default is 'OpenAI'. - :param openai_api_key: The API key for OpenAI. Default is None. - :param anthropic_api_key: The API key for Anthropic. Default is None. - :param model_name: The name of the model. Default is 'gpt-4-1106-preview'. :param seconds_to_sleep_between_completions: The number of seconds to sleep between completions. Default is None. :param print_ongoing_status: Whether or not to print the ongoing status. Default is True. + :param evaluation_method: Choose between gpt to evaluate (get the score 1,3,5,7,10) else using simple substring matching , default is gpt4 """ - if not needle or not haystack_dir or not retrieval_question: - raise ValueError("Needle, haystack, and retrieval_question must be provided.") - self.needle = needle self.haystack_dir = haystack_dir self.retrieval_question = retrieval_question + self.substr_validation_words = substr_validation_words self.results_version = results_version self.num_concurrent_requests = num_concurrent_requests self.save_results = save_results @@ -82,73 +80,87 @@ def __init__(self, self.save_contexts = save_contexts self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions self.print_ongoing_status = print_ongoing_status - self.model_provider = model_provider self.testing_results = [] + self.evaluation_method = evaluation_method if context_lengths is None: - if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None: - raise ValueError("Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied.") + if ( + context_lengths_min is None + or context_lengths_max is None + or context_lengths_num_intervals is None + ): + raise ValueError( + "Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied." 
+ ) else: - self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int) + self.context_lengths = np.round( + np.linspace( + context_lengths_min, + context_lengths_max, + num=context_lengths_num_intervals, + endpoint=True, + ) + ).astype(int) else: self.context_lengths = context_lengths if document_depth_percents is None: - if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None: - raise ValueError("Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied.") + if ( + document_depth_percent_min is None + or document_depth_percent_max is None + or document_depth_percent_intervals is None + ): + raise ValueError( + "Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied." + ) else: - if document_depth_percent_interval_type == 'linear': - self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int) - elif document_depth_percent_interval_type == 'sigmoid': - self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)] + if document_depth_percent_interval_type == "linear": + self.document_depth_percents = np.round( + np.linspace( + document_depth_percent_min, + document_depth_percent_max, + num=document_depth_percent_intervals, + endpoint=True, + ) + ).astype(int) + elif document_depth_percent_interval_type == "sigmoid": + self.document_depth_percents = [ + self.logistic(x) + for x in np.linspace( + document_depth_percent_min, + document_depth_percent_max, + document_depth_percent_intervals, + ) + ] else: self.document_depth_percents = document_depth_percents if document_depth_percent_interval_type not in [None, "linear", "sigmoid"]: - raise ValueError("document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution give a list of ints in via document_depth_percent_intervals") - - if model_provider not in ["OpenAI", "Anthropic"]: - raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'") - - if model_provider == "Anthropic" and "claude" not in model_name: - raise ValueError("If the model provider is 'Anthropic', the model name must include 'claude'. See https://docs.anthropic.com/claude/reference/selecting-a-model for more details on Anthropic models") - - self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY') - self.model_name = model_name - - if not self.openai_api_key and not os.getenv('OPENAI_API_KEY'): - raise ValueError("Either openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env. Used for evaluation model") - else: - self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY') - - self.anthropic_api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY') + raise ValueError( + "document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. 
If you'd like your own distribution give a list of ints in via document_depth_percent_intervals" + ) - if self.model_provider == "Anthropic": - if not self.anthropic_api_key and not os.getenv('ANTHROPIC_API_KEY'): - raise ValueError("Either anthropic_api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env.") - else: - self.anthropic_api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY') - - if not self.model_name: - raise ValueError("model_name must be provided.") - - if model_provider == "OpenAI": - self.model_to_test = AsyncOpenAI(api_key=self.openai_api_key) - self.enc = tiktoken.encoding_for_model(self.model_name) - elif model_provider == "Anthropic": - self.model_to_test = AsyncAnthropic(api_key=self.anthropic_api_key) - self.enc = Anthropic().get_tokenizer() - - self.model_to_test_description = model_name - self.evaluation_model = ChatOpenAI(model="gpt-4", temperature=0, openai_api_key = self.openai_api_key) - - def logistic(self, x, L=100, x0=50, k=.1): + if evaluation_method == "gpt4": + self.evaluation_model = ChatOpenAI( + model="gpt-4", temperature=0, openai_api_key=self.openai_api_key + ) + + if evaluation_method == "substring_match" and not all( + word.lower() in needle.lower() for word in substr_validation_words + ): + raise ValueError( + "You choose substring evaluation method but some of the words in substr_validation_words is not in the needle you provided" + f"\n\nneedle: {needle}" + f"\nsubstr_validation_words: {substr_validation_words}" + ) + + def logistic(self, x, L=100, x0=50, k=0.1): if x == 0: return 0 if x == 100: return 100 return np.round(L / (1 + np.exp(-k * (x - x0))), 3) - + async def bound_evaluate_and_log(self, sem, *args): async with sem: await self.evaluate_and_log(*args) @@ -166,28 +178,13 @@ async def run_test(self): # Wait for all tasks to complete await asyncio.gather(*tasks) - def generate_prompt(self, context): - if self.model_provider == "Anthropic": - with open('Anthropic_prompt.txt', 'r') as file: - prompt = file.read() - return prompt.format(retrieval_question=self.retrieval_question, context=context) - elif self.model_provider == "OpenAI": - # Generate the prompt for the Anthropic model - # Replace the following line with the appropriate prompt structure - return [ - { - "role": "system", - "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct" - }, - { - "role": "user", - "content": context - }, - { - "role": "user", - "content": f"{self.retrieval_question} Don't give information outside the document or repeat your findings" - } - ] + @abstractmethod + def get_prompt(self, context): + pass + + @abstractmethod + async def get_response_from_model(self, prompt): + pass async def evaluate_and_log(self, context_length, depth_percent): # Checks to see if you've already checked a length/percent/version. 
@@ -200,27 +197,12 @@ async def evaluate_and_log(self, context_length, depth_percent): context = await self.generate_context(context_length, depth_percent) # Prepare your message to send to the model you're going to evaluate - prompt = self.generate_prompt(context) + prompt = self.get_prompt(context) test_start_time = time.time() # Go see if the model can answer the question to pull out your random fact - if self.model_provider == "OpenAI": - response = await self.model_to_test.chat.completions.create( - model=self.model_name, - messages=prompt, - max_tokens=300, - temperature=0 - ) - response = response.choices[0].message.content - elif self.model_provider == "Anthropic": - response = await self.model_to_test.completions.create( - model=self.model_name, - max_tokens_to_sample=300, - prompt=prompt, - temperature=0 - ) - response = response.completion + response = await self.get_response_from_model(prompt) test_end_time = time.time() test_elapsed_time = test_end_time - test_start_time @@ -228,50 +210,53 @@ async def evaluate_and_log(self, context_length, depth_percent): # Compare the reponse to the actual needle you placed score = self.evaluate_response(response) - results = { # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large. - 'model' : self.model_to_test_description, - 'context_length' : int(context_length), - 'depth_percent' : float(depth_percent), - 'version' : self.results_version, - 'needle' : self.needle, - 'model_response' : response, - 'score' : score, - 'test_duration_seconds' : test_elapsed_time, - 'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z') + "model": self.model_to_test_description, + "context_length": int(context_length), + "depth_percent": float(depth_percent), + "version": self.results_version, + "needle": self.needle, + "model_response": response, + "score": score, + "test_duration_seconds": test_elapsed_time, + "test_timestamp_utc": datetime.now(timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S%z" + ), } self.testing_results.append(results) if self.print_ongoing_status: - print (f"-- Test Summary -- ") - print (f"Duration: {test_elapsed_time:.1f} seconds") - print (f"Context: {context_length} tokens") - print (f"Depth: {depth_percent}%") - print (f"Score: {score}") - print (f"Response: {response}\n") + print("-- Test Summary -- ") + print(f"Duration: {test_elapsed_time:.1f} seconds") + print(f"Context: {context_length} tokens") + print(f"Depth: {depth_percent}%") + print(f"Score: {score}") + print(f"Response: {response}\n") - context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}' + context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent * 100)}' if self.save_contexts: - results['file_name'] : context_file_location + results["file_name"] = context_file_location # Save the context to file for retesting - if not os.path.exists('contexts'): - os.makedirs('contexts') + contexts_dir = Path("contexts") + contexts_dir.mkdir(parents=True, exist_ok=True) + + context_file_path = contexts_dir / f"{context_file_location}_context.txt" + context_file_path.write_text(context) - with open(f'contexts/{context_file_location}_context.txt', 'w') as f: - f.write(context) - if self.save_results: - # Save the context to file for retesting - if not os.path.exists('results'): - os.makedirs('results') + # Ensure the 'results' directory exists + 
results_dir = Path("results") + results_dir.mkdir(parents=True, exist_ok=True) - # Save the result to file for retesting - with open(f'results/{context_file_location}_results.json', 'w') as f: - json.dump(results, f) + # Define the file path for the results file + results_file_path = results_dir / f"{context_file_location}_results.json" + + # Serialize the results dictionary to a JSON formatted string and write to the file + results_file_path.write_text(json.dumps(results)) if self.seconds_to_sleep_between_completions: await asyncio.sleep(self.seconds_to_sleep_between_completions) @@ -281,20 +266,24 @@ def result_exists(self, context_length, depth_percent): Checks to see if a result has already been evaluated or not """ - results_dir = 'results/' - if not os.path.exists(results_dir): + results_dir = Path("results") + if not results_dir.exists(): return False - - for filename in os.listdir(results_dir): - if filename.endswith('.json'): - with open(os.path.join(results_dir, filename), 'r') as f: - result = json.load(f) - context_length_met = result['context_length'] == context_length - depth_percent_met = result['depth_percent'] == depth_percent - version_met = result.get('version', 1) == self.results_version - model_met = result['model'] == self.model_name - if context_length_met and depth_percent_met and version_met and model_met: - return True + + for filepath in results_dir.glob("*.json"): + with filepath.open("r") as f: + result = json.load(f) + context_length_met = result["context_length"] == context_length + depth_percent_met = result["depth_percent"] == depth_percent + version_met = result.get("version", 1) == self.results_version + model_met = result["model"] == self.model_name + if ( + context_length_met + and depth_percent_met + and version_met + and model_met + ): + return True return False async def generate_context(self, context_length, depth_percent): @@ -310,26 +299,17 @@ async def generate_context(self, context_length, depth_percent): context = self.insert_needle(context, depth_percent, context_length) return context - - def encode_text_to_tokens(self, text): - if self.model_provider == "OpenAI": - return self.enc.encode(text) - elif self.model_provider == "Anthropic": - # Assuming you have a different encoder for Anthropic - return self.enc.encode(text).ids - else: - raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'") - + def insert_needle(self, context, depth_percent, context_length): - tokens_needle = self.encode_text_to_tokens(self.needle) - tokens_context = self.encode_text_to_tokens(context) + tokens_needle = self.get_encoding(self.needle) + tokens_context = self.get_encoding(context) # Reducing the context length by 150 buffer. This is to account for system message, the user question, and response. 
context_length -= self.final_context_length_buffer # If your context + needle are longer than the context length (which it will be), then reduce tokens from the context by the needle length if len(tokens_context) + len(tokens_needle) > context_length: - tokens_context = tokens_context[:context_length - len(tokens_needle)] + tokens_context = tokens_context[: context_length - len(tokens_needle)] if depth_percent == 100: # If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end @@ -342,8 +322,8 @@ def insert_needle(self, context, depth_percent, context_length): tokens_new_context = tokens_context[:insertion_point] # We want to make sure that we place our needle at a sentence break so we first see what token a '.' is - period_tokens = self.encode_text_to_tokens('.') - + period_tokens = self.get_encoding(".") + # Then we iteration backwards until we find the first period while tokens_new_context and tokens_new_context[-1] not in period_tokens: insertion_point -= 1 @@ -354,10 +334,16 @@ def insert_needle(self, context, depth_percent, context_length): tokens_new_context += tokens_needle + tokens_context[insertion_point:] # Convert back to a string and return it - new_context = self.decode_tokens(tokens_new_context) + new_context = self.get_decoding(tokens_new_context) return new_context def evaluate_response(self, response): + if self.evaluation_method == "gpt4": + return self.evaluate_response_gpt4(response) + else: + return self.evaluate_response_substring_match(response) + + def evaluate_response_gpt4(self, response): accuracy_criteria = { "accuracy": """ Score 1: The answer is completely unrelated to the reference. @@ -379,24 +365,31 @@ def evaluate_response(self, response): eval_result = evaluator.evaluate_strings( # The models response prediction=response, - # The actual answer reference=self.needle, - # The question asked input=self.retrieval_question, ) - return int(eval_result['score']) + return int(eval_result["score"]) - def get_context_length_in_tokens(self, context): - if self.model_provider == "OpenAI": - return len(self.enc.encode(context)) - elif self.model_provider == "Anthropic": - # Assuming you have a different encoder for Anthropic - return len(self.enc.encode(context).ids) + def evaluate_response_substring_match(self, response): + response_lower = response.lower() + if all(word in response_lower for word in self.substr_validation_words): + return 1 else: - raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'") + return 0 + + @abstractmethod + def get_encoding(self, context): + pass + + @abstractmethod + def get_decoding(self, encoded_context): + pass + + def get_context_length_in_tokens(self, context): + return len(self.get_encoding(context)) def read_context_files(self): context = "" @@ -404,53 +397,38 @@ def read_context_files(self): while self.get_context_length_in_tokens(context) < max_context_length: for file in glob.glob(f"{self.haystack_dir}/*.txt"): - with open(file, 'r') as f: + with open(file, "r") as f: context += f.read() return context - def get_tokens_from_context(self, context): - if self.model_provider == "OpenAI": - return self.enc.encode(context) - elif self.model_provider == "Anthropic": - # Assuming you have a different encoder for Anthropic - return self.enc.encode(context).ids - else: - raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'") - - def decode_tokens(self, tokens, context_length=None): - if self.model_provider == "OpenAI": - return 
self.enc.decode(tokens[:context_length]) - elif self.model_provider == "Anthropic": - # Assuming you have a different decoder for Anthropic - return self.enc.decode(tokens[:context_length]) - else: - raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'") - def encode_and_trim(self, context, context_length): - tokens = self.get_tokens_from_context(context) - if len(tokens) > context_length: - context = self.decode_tokens(tokens, context_length) - return context - + encoded_context = self.get_encoding(context) + return self.get_decoding(encoded_context[:context_length]) + def get_results(self): return self.testing_results - + def print_start_test_summary(self): - print ("\n") - print ("Starting Needle In A Haystack Testing...") - print (f"- Model: {self.model_name}") - print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}") - print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%") - print (f"- Needle: {self.needle.strip()}") - print ("\n\n") + print("\n") + print("Starting Needle In A Haystack Testing...") + print(f"- Model: {self.model_name}") + print( + f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}" + ) + print( + f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%" + ) + print(f"- Needle: {self.needle.strip()}") + print("\n\n") def start_test(self): if self.print_ongoing_status: self.print_start_test_summary() asyncio.run(self.run_test()) + if __name__ == "__main__": # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info ht = LLMNeedleHaystackTester() - ht.start_test() \ No newline at end of file + ht.start_test() diff --git a/OpenAIEvaluator.py b/OpenAIEvaluator.py new file mode 100644 index 00000000..b0eef38e --- /dev/null +++ b/OpenAIEvaluator.py @@ -0,0 +1,65 @@ +import os +import tiktoken +from LLMNeedleHaystackTester import LLMNeedleHaystackTester +from openai import AsyncOpenAI + + +class OpenAIEvaluator(LLMNeedleHaystackTester): + def __init__(self, **kwargs): + if "openai_api_key" not in kwargs and not os.getenv("OPENAI_API_KEY"): + raise ValueError( + "Either openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env" + ) + + if "model_name" not in kwargs: + raise ValueError( + "model_name must be supplied with init, accepted model_names are 'gpt-4-1106-preview'" + ) + elif kwargs["model_name"] not in ["gpt-4-1106-preview"]: + raise ValueError("Model name must be in this list (gpt-4-1106-preview)") + + if "evaluation_method" not in kwargs: + print( + "since evaluation method is not specified , 'gpt4' will be used for evaluation" + ) + elif kwargs["evaluation_method"] not in ("gpt4", "substring_match"): + raise ValueError("evaluation_method must be 'substring_match' or 'gpt4'") + + self.openai_api_key = kwargs.pop("openai_api_key", os.getenv("OPENAI_API_KEY")) + self.model_name = kwargs["model_name"] + self.model_to_test_description = kwargs.pop("model_name") + self.tokenizer = tiktoken.encoding_for_model(self.model_name) + self.model_to_test = AsyncOpenAI(api_key=self.openai_api_key) + + super().__init__(**kwargs) + + def get_encoding(self, context): + return self.tokenizer.encode(context) + + def get_decoding(self, encoded_context): + return self.tokenizer.decode(encoded_context) + + def 
get_prompt(self, context):
+        return [
+            {
+                "role": "system",
+                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct",
+            },
+            {"role": "user", "content": context},
+            {
+                "role": "user",
+                "content": f"{self.retrieval_question} Don't give information outside the document or repeat your findings",
+            },
+        ]
+
+    async def get_response_from_model(self, prompt):
+        response = await self.model_to_test.chat.completions.create(
+            model=self.model_name, messages=prompt, max_tokens=300, temperature=0
+        )
+        return response.choices[0].message.content
+
+
+if __name__ == "__main__":
+    # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
+    ht = OpenAIEvaluator(model_name="gpt-4-1106-preview", evaluation_method="gpt4")
+    ht.start_test()
diff --git a/README.md b/README.md
index 21c4e609..96ae4b85 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,25 @@ A simple 'needle in a haystack' analysis to test in-context retrieval ability of
 Get the behind the scenes on the [overview video](https://youtu.be/KwRRuiCCdmc).
 
-![GPT-4-128 Context Testing](img/NeedleHaystackCodeSnippet.png)
 ```
-git clone https://github.com/gkamradt/LLMTest_NeedleInAHaystack.git
+$ git clone https://github.com/prabha-git/LLMTest_NeedleInAHaystack.git
+$ cd LLMTest_NeedleInAHaystack
+$ python -m venv venv
+$ source venv/bin/activate
+$ pip install -r requirements.txt
+$ export OPENAI_API_KEY=<your_openai_api_key>
+$ python
+
+>>> from OpenAIEvaluator import OpenAIEvaluator
+>>> openai_ht = OpenAIEvaluator(model_name='gpt-4-1106-preview', evaluation_method='gpt4')
+>>> openai_ht.start_test()
+
+Starting Needle In A Haystack Testing...
+- Model: gpt-4-1106-preview
+- Context Lengths: 35, Min: 1000, Max: 200000
+- Document Depths: 35, Min: 0%, Max: 100%
+- Needle: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.
 ```
 
 ## The Test
@@ -35,7 +50,6 @@ The key parameters:
 * `document_depth_percent_max` - The ending point of your document depths. Should be int < 100
 * `document_depth_percent_intervals` - The number of iterations to do between your min/max points
 * `document_depth_percent_interval_type` - Determines the distribution of depths to iterate over. 'linear' or 'sigmoid
-* `model_provider` - 'OpenAI' or 'Anthropic'
 * `model_name` - The name of the model you'd like to test. Should match the exact value which needs to be passed to the api. Ex: `gpt-4-1106-preview`
 * `save_results` - Whether or not you'd like to save your results to file. They will be temporarily saved in the object regardless. True/False
 * `save_contexts` - Whether or not you'd like to save your contexts to file. **Warning** these will get very long. True/False
@@ -49,6 +63,24 @@ Other Parameters:
 * `final_context_length_buffer` - The amount of context to take off each input to account for system messages and output tokens. This can be more intelligent but using a static value for now. Default 200 tokens.
 * `seconds_to_sleep_between_completions` - Default: None, set # of seconds if you'd like to slow down your requests
 * `print_ongoing_status` - Default: True, whether or not to print the status of test as they complete
+* `evaluation_method` - Default: `gpt4`. Choose between `gpt4` and simple substring matching (`substring_match`) to evaluate the model's response
+* `substr_validation_words` - Default: `['dolores', 'sandwich']`. If you choose substring evaluation of the LLM response, the presence of these keywords in the response is checked to determine whether the response is correct
+
+
+
+#### Note on Evaluation Method (`evaluation_method`):
+
+There are two options for evaluation: `gpt4` and `substring_match`.
+
+- `gpt4`: This is the default, utilizing the GPT-4 model to assess responses with a scoring range from 1 to 10. This method is particularly effective when dealing with a broad topic (a large needle), where a few hardcoded keywords may not be sufficient to evaluate the accuracy and relevance of the response.
+```
+ - Score 1: The response is completely unrelated to the reference.
+ - Score 3: The response has some relevance but does not fully align with the reference.
+ - Score 5: The response is moderately relevant but includes inaccuracies.
+ - Score 7: The response aligns well with the reference but has minor omissions.
+ - Score 10: The response is entirely accurate and aligns perfectly with the reference.
+```
+- `substring_match`: This approach is suitable for "small needles", where a predefined list of keywords can effectively determine whether the response retrieves the essential information. It provides a binary score of either 0 or 1. Opting for this method can also reduce GPT-4 API evaluation costs.
 
 ## Results Visualization
 `LLMNeedleInHaystackVisualization.ipynb` holds the code to make the pivot table visualization. The pivot table was then transferred to Google Slides for custom annotations and formatting. See the [google slides version](https://docs.google.com/presentation/d/15JEdEBjm32qBbqeYM6DK6G-3mUJd7FAJu-qEzj8IYLQ/edit?usp=sharing). See an overview of how this viz was created [here](https://twitter.com/GregKamradt/status/1729573848893579488).
diff --git a/requirements.txt b/requirements.txt
index 309c34c1..f9255207 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 aiohttp==3.9.1
 aiosignal==1.3.1
 annotated-types==0.6.0
-anthropic==0.7.5
+anthropic==0.16.0
 anyio==3.7.1
 attrs==23.1.0
 certifi==2023.11.17
@@ -27,6 +27,7 @@ mypy-extensions==1.0.0
 numpy==1.26.2
 openai==1.3.5
 packaging==23.2
+pre-commit==3.6.2
 pydantic==2.5.2
 pydantic_core==2.14.5
 python-dotenv==1.0.0
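
With the classes added above, a run that uses the cheaper substring-match evaluation against a custom needle looks roughly like the sketch below. The needle, question, validation words, and interval settings are illustrative values rather than repo defaults, and `OPENAI_API_KEY` is assumed to be set in the environment:

```
from OpenAIEvaluator import OpenAIEvaluator

# Illustrative run: a made-up needle/question pair scored by substring matching.
ht = OpenAIEvaluator(
    model_name="gpt-4-1106-preview",
    evaluation_method="substring_match",
    needle="\nThe secret ingredient in the lasagna is smoked paprika.\n",
    retrieval_question="What is the secret ingredient in the lasagna?",
    # Every validation word must appear (case-insensitively) in the needle,
    # otherwise the base class raises a ValueError at init time.
    substr_validation_words=["smoked", "paprika"],
    context_lengths_min=1000,
    context_lengths_max=16000,
    context_lengths_num_intervals=5,
    document_depth_percent_intervals=5,
    save_contexts=False,
)
ht.start_test()
```

Each test is scored 1 if every word in `substr_validation_words` appears (case-insensitively) in the model's response and 0 otherwise, so the separate GPT-4 evaluator is never called.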
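Because `LLMNeedleHaystackTester` is now an abstract base class, supporting another provider comes down to implementing its four abstract methods (`get_encoding`, `get_decoding`, `get_prompt`, `get_response_from_model`) and setting the attributes the base class reads before calling `super().__init__()`. A minimal sketch, with a hypothetical `MyProviderEvaluator` and `tiktoken`'s `cl100k_base` encoding standing in for the provider's real tokenizer:

```
import os
import tiktoken
from LLMNeedleHaystackTester import LLMNeedleHaystackTester


class MyProviderEvaluator(LLMNeedleHaystackTester):
    """Hypothetical evaluator sketch for an arbitrary chat-completions provider."""

    def __init__(self, **kwargs):
        # The base class reads these attributes (result bookkeeping and logging).
        self.model_name = kwargs["model_name"]
        self.model_to_test_description = kwargs.pop("model_name")
        # Only needed when evaluation_method="gpt4" is used.
        self.openai_api_key = kwargs.pop("openai_api_key", os.getenv("OPENAI_API_KEY"))
        self.tokenizer = tiktoken.get_encoding("cl100k_base")  # stand-in tokenizer
        super().__init__(**kwargs)

    def get_encoding(self, context):
        return self.tokenizer.encode(context)

    def get_decoding(self, encoded_context):
        return self.tokenizer.decode(encoded_context)

    def get_prompt(self, context):
        # Shape this however the target provider expects; a plain string is fine
        # as long as get_response_from_model knows how to send it.
        return f"{context}\n\n{self.retrieval_question}"

    async def get_response_from_model(self, prompt):
        # Call the provider's async completion API here and return the reply text.
        raise NotImplementedError
```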