diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..88a388b9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+ - repo: https://github.com/psf/black
+ rev: 24.2.0
+ hooks:
+ - id: black
+
diff --git a/AnthropicEvaluator.py b/AnthropicEvaluator.py
new file mode 100644
index 00000000..bed2297d
--- /dev/null
+++ b/AnthropicEvaluator.py
@@ -0,0 +1,86 @@
+import os
+from LLMNeedleHaystackTester import LLMNeedleHaystackTester
+from anthropic import AsyncAnthropic, Anthropic
+
+
+class AnthropicEvaluator(LLMNeedleHaystackTester):
+ def __init__(self, **kwargs):
+ if "anthropic_api_key" not in kwargs and not os.getenv("ANTHROPIC_API_KEY"):
+ raise ValueError(
+ "Either anthropic_api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env"
+ )
+
+ if "model_name" not in kwargs:
+ raise ValueError("model_name must be supplied with init")
+ elif "claude" not in kwargs["model_name"]:
+ raise ValueError(
+ "If the model provider is 'Anthropic', the model name must include 'claude'. "
+ "See https://docs.anthropic.com/claude/reference/selecting-a-model for more details on Anthropic models"
+ )
+
+ if "evaluation_method" not in kwargs:
+ print(
+ "since evaluation method is not specified , default method substring_match will be used for evaluation"
+ )
+ elif kwargs["evaluation_method"] not in ("gpt4", "substring_match"):
+ raise ValueError("evaluation_method must be 'substring_match' or 'gpt4'")
+ elif (
+ kwargs["evaluation_method"] == "gpt4"
+ and "openai_api_key" not in kwargs
+ and not os.getenv("OPENAI_API_KEY")
+ ):
+ raise ValueError(
+ "if evaluation_method is gpt4 , openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env"
+ )
+ else:
+ # Pop the key so it is not forwarded to LLMNeedleHaystackTester.__init__, which does not accept it
+ self.openai_api_key = kwargs.pop(
+ "openai_api_key", os.getenv("OPENAI_API_KEY")
+ )
+
+ self.anthropic_api_key = kwargs.pop(
+ "anthropic_api_key", os.getenv("ANTHROPIC_API_KEY")
+ )
+ self.model_name = kwargs["model_name"]
+ self.model_to_test_description = kwargs.pop("model_name")
+ self.model_to_test = AsyncAnthropic(api_key=self.anthropic_api_key)
+ self.tokenizer = Anthropic().get_tokenizer()
+
+ super().__init__(**kwargs)
+
+ def get_encoding(self, context):
+ return self.tokenizer.encode(context).ids
+
+ def get_decoding(self, encoded_context):
+ return self.tokenizer.decode(encoded_context)
+
+ def get_prompt(self, context):
+ return [
+ {
+ "role": "user",
+ "content": f"{context}\n\n {self.retrieval_question} Don't give information outside the document or repeat your findings",
+ },
+ {
+ "role": "assistant",
+ "content": "Here is the most relevant sentence in the context:",
+ },
+ ]
+
+ async def get_response_from_model(self, prompt):
+ response = await self.model_to_test.messages.create(
+ model=self.model_name,
+ messages=prompt,
+ system="You are a helpful AI bot that answers questions for a user. Keep your response short and direct",
+ max_tokens=300,
+ temperature=0,
+ )
+ return response.content[0].text
+
+
+if __name__ == "__main__":
+ # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
+ ht = AnthropicEvaluator(
+ model_name="claude-2.1", evaluation_method="substring_match"
+ )
+
+ ht.start_test()
diff --git a/Anthropic_prompt.txt b/Anthropic_prompt.txt
deleted file mode 100644
index cee594a6..00000000
--- a/Anthropic_prompt.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-You are a helpful AI bot that answers questions for a user. Keep your response short and direct
-
-Human:
-{context}
-
-
-{retrieval_question} Don't give information outside the document or repeat your findings
-
-Assistant: Here is the most relevant sentence in the context:
\ No newline at end of file
diff --git a/LLMNeedleHaystackTester.py b/LLMNeedleHaystackTester.py
index ef6cfc7b..16bba6d1 100644
--- a/LLMNeedleHaystackTester.py
+++ b/LLMNeedleHaystackTester.py
@@ -1,53 +1,56 @@
from dotenv import load_dotenv
-import os
-import tiktoken
+from pathlib import Path
import glob
import json
from langchain.evaluation import load_evaluator
from langchain.chat_models import ChatOpenAI
-from anthropic import AsyncAnthropic, Anthropic
-from dotenv import load_dotenv
+
import numpy as np
-from openai import AsyncOpenAI
+
import asyncio
from asyncio import Semaphore
from datetime import datetime, timezone
import time
+from abc import ABC, abstractmethod
+
load_dotenv()
-class LLMNeedleHaystackTester:
+
+class LLMNeedleHaystackTester(ABC):
"""
This class is used to test the LLM Needle Haystack.
"""
- def __init__(self,
- needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
- haystack_dir="PaulGrahamEssays",
- retrieval_question="What is the best thing to do in San Francisco?",
- results_version = 1,
- context_lengths_min = 1000,
- context_lengths_max = 200000,
- context_lengths_num_intervals = 35,
- context_lengths = None,
- document_depth_percent_min = 0,
- document_depth_percent_max = 100,
- document_depth_percent_intervals = 35,
- document_depth_percents = None,
- document_depth_percent_interval_type = "linear",
- model_provider = "OpenAI",
- openai_api_key=None,
- anthropic_api_key = None,
- model_name='gpt-4-1106-preview',
- num_concurrent_requests = 1,
- save_results = True,
- save_contexts = True,
- final_context_length_buffer = 200,
- seconds_to_sleep_between_completions = None,
- print_ongoing_status = True):
- """
+
+ def __init__(
+ self,
+ needle="\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
+ haystack_dir="PaulGrahamEssays",
+ retrieval_question="What is the best thing to do in San Francisco?",
+ substr_validation_words=["dolores", "sandwich"],
+ results_version=1,
+ context_lengths_min=1000,
+ context_lengths_max=200000,
+ context_lengths_num_intervals=35,
+ context_lengths=None,
+ document_depth_percent_min=0,
+ document_depth_percent_max=100,
+ document_depth_percent_intervals=35,
+ document_depth_percents=None,
+ document_depth_percent_interval_type="linear",
+ num_concurrent_requests=1,
+ save_results=True,
+ save_contexts=True,
+ final_context_length_buffer=200,
+ seconds_to_sleep_between_completions=None,
+ print_ongoing_status=True,
+ evaluation_method="gpt4",
+ ):
+ """
:param needle: The needle to be found in the haystack. Default is None.
:param haystack_dir: The directory of text files to use as background context (or a haystack) in which the needle is to be found. Default is Paul Graham Essays.
:param retrieval_question: The question which with to prompt the model to do the retrieval.
+ :param substr_validation_words: If you choose substring evaluation of the LLM response, the presence of these keywords is checked to determine whether the response is correct
:param results_version: In case you would like to try the same combination of model, context length, and depth % multiple times, change the results version other than 1
:param num_concurrent_requests: Due to volume, this object is set up to run concurrent requests, default = 1. Be careful of rate limits.
:param save_results: Whether or not you would like to save your contexts to file. Warning: These will get long! Default = True
@@ -62,19 +65,14 @@ def __init__(self,
:param document_depth_percent_intervals: The number of intervals for the document depth percent. Default is 35.
:param document_depth_percents: The depth percentages of the document. Default is None.
:param document_depth_percent_interval_type: The type of interval for the document depth percent. Must be either 'linear' or 'sigmoid'. Default is 'linear'.
- :param model_provider: The provider of the model. Must be either 'OpenAI' or 'Anthropic'. Default is 'OpenAI'.
- :param openai_api_key: The API key for OpenAI. Default is None.
- :param anthropic_api_key: The API key for Anthropic. Default is None.
- :param model_name: The name of the model. Default is 'gpt-4-1106-preview'.
:param seconds_to_sleep_between_completions: The number of seconds to sleep between completions. Default is None.
:param print_ongoing_status: Whether or not to print the ongoing status. Default is True.
+ :param evaluation_method: Choose between 'gpt4' (scores the response 1, 3, 5, 7, or 10) and simple substring matching ('substring_match'). Default is 'gpt4'
"""
- if not needle or not haystack_dir or not retrieval_question:
- raise ValueError("Needle, haystack, and retrieval_question must be provided.")
-
self.needle = needle
self.haystack_dir = haystack_dir
self.retrieval_question = retrieval_question
+ self.substr_validation_words = substr_validation_words
self.results_version = results_version
self.num_concurrent_requests = num_concurrent_requests
self.save_results = save_results
@@ -82,73 +80,87 @@ def __init__(self,
self.save_contexts = save_contexts
self.seconds_to_sleep_between_completions = seconds_to_sleep_between_completions
self.print_ongoing_status = print_ongoing_status
- self.model_provider = model_provider
self.testing_results = []
+ self.evaluation_method = evaluation_method
if context_lengths is None:
- if context_lengths_min is None or context_lengths_max is None or context_lengths_num_intervals is None:
- raise ValueError("Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied.")
+ if (
+ context_lengths_min is None
+ or context_lengths_max is None
+ or context_lengths_num_intervals is None
+ ):
+ raise ValueError(
+ "Either context_lengths_min, context_lengths_max, context_lengths_intervals need to be filled out OR the context_lengths_list needs to be supplied."
+ )
else:
- self.context_lengths = np.round(np.linspace(context_lengths_min, context_lengths_max, num=context_lengths_num_intervals, endpoint=True)).astype(int)
+ self.context_lengths = np.round(
+ np.linspace(
+ context_lengths_min,
+ context_lengths_max,
+ num=context_lengths_num_intervals,
+ endpoint=True,
+ )
+ ).astype(int)
else:
self.context_lengths = context_lengths
if document_depth_percents is None:
- if document_depth_percent_min is None or document_depth_percent_max is None or document_depth_percent_intervals is None:
- raise ValueError("Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied.")
+ if (
+ document_depth_percent_min is None
+ or document_depth_percent_max is None
+ or document_depth_percent_intervals is None
+ ):
+ raise ValueError(
+ "Either document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals need to be filled out OR the document_depth_percents needs to be supplied."
+ )
else:
- if document_depth_percent_interval_type == 'linear':
- self.document_depth_percents = np.round(np.linspace(document_depth_percent_min, document_depth_percent_max, num=document_depth_percent_intervals, endpoint=True)).astype(int)
- elif document_depth_percent_interval_type == 'sigmoid':
- self.document_depth_percents = [self.logistic(x) for x in np.linspace(document_depth_percent_min, document_depth_percent_max, document_depth_percent_intervals)]
+ if document_depth_percent_interval_type == "linear":
+ self.document_depth_percents = np.round(
+ np.linspace(
+ document_depth_percent_min,
+ document_depth_percent_max,
+ num=document_depth_percent_intervals,
+ endpoint=True,
+ )
+ ).astype(int)
+ elif document_depth_percent_interval_type == "sigmoid":
+ self.document_depth_percents = [
+ self.logistic(x)
+ for x in np.linspace(
+ document_depth_percent_min,
+ document_depth_percent_max,
+ document_depth_percent_intervals,
+ )
+ ]
else:
self.document_depth_percents = document_depth_percents
if document_depth_percent_interval_type not in [None, "linear", "sigmoid"]:
- raise ValueError("document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution give a list of ints in via document_depth_percent_intervals")
-
- if model_provider not in ["OpenAI", "Anthropic"]:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
- if model_provider == "Anthropic" and "claude" not in model_name:
- raise ValueError("If the model provider is 'Anthropic', the model name must include 'claude'. See https://docs.anthropic.com/claude/reference/selecting-a-model for more details on Anthropic models")
-
- self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
- self.model_name = model_name
-
- if not self.openai_api_key and not os.getenv('OPENAI_API_KEY'):
- raise ValueError("Either openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env. Used for evaluation model")
- else:
- self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
-
- self.anthropic_api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY')
+ raise ValueError(
+ "document_depth_percent_interval_type must be either None, 'linear' or 'sigmoid'. If you'd like your own distribution give a list of ints in via document_depth_percent_intervals"
+ )
- if self.model_provider == "Anthropic":
- if not self.anthropic_api_key and not os.getenv('ANTHROPIC_API_KEY'):
- raise ValueError("Either anthropic_api_key must be supplied with init, or ANTHROPIC_API_KEY must be in env.")
- else:
- self.anthropic_api_key = anthropic_api_key or os.getenv('ANTHROPIC_API_KEY')
-
- if not self.model_name:
- raise ValueError("model_name must be provided.")
-
- if model_provider == "OpenAI":
- self.model_to_test = AsyncOpenAI(api_key=self.openai_api_key)
- self.enc = tiktoken.encoding_for_model(self.model_name)
- elif model_provider == "Anthropic":
- self.model_to_test = AsyncAnthropic(api_key=self.anthropic_api_key)
- self.enc = Anthropic().get_tokenizer()
-
- self.model_to_test_description = model_name
- self.evaluation_model = ChatOpenAI(model="gpt-4", temperature=0, openai_api_key = self.openai_api_key)
-
- def logistic(self, x, L=100, x0=50, k=.1):
+ if evaluation_method == "gpt4":
+ self.evaluation_model = ChatOpenAI(
+ model="gpt-4", temperature=0, openai_api_key=self.openai_api_key
+ )
+
+ if evaluation_method == "substring_match" and not all(
+ word.lower() in needle.lower() for word in substr_validation_words
+ ):
+ raise ValueError(
+ "You choose substring evaluation method but some of the words in substr_validation_words is not in the needle you provided"
+ f"\n\nneedle: {needle}"
+ f"\nsubstr_validation_words: {substr_validation_words}"
+ )
+
+ def logistic(self, x, L=100, x0=50, k=0.1):
if x == 0:
return 0
if x == 100:
return 100
return np.round(L / (1 + np.exp(-k * (x - x0))), 3)
-
+
async def bound_evaluate_and_log(self, sem, *args):
async with sem:
await self.evaluate_and_log(*args)
@@ -166,28 +178,13 @@ async def run_test(self):
# Wait for all tasks to complete
await asyncio.gather(*tasks)
- def generate_prompt(self, context):
- if self.model_provider == "Anthropic":
- with open('Anthropic_prompt.txt', 'r') as file:
- prompt = file.read()
- return prompt.format(retrieval_question=self.retrieval_question, context=context)
- elif self.model_provider == "OpenAI":
- # Generate the prompt for the Anthropic model
- # Replace the following line with the appropriate prompt structure
- return [
- {
- "role": "system",
- "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
- },
- {
- "role": "user",
- "content": context
- },
- {
- "role": "user",
- "content": f"{self.retrieval_question} Don't give information outside the document or repeat your findings"
- }
- ]
+ @abstractmethod
+ def get_prompt(self, context):
+ pass
+
+ @abstractmethod
+ async def get_response_from_model(self, prompt):
+ pass
async def evaluate_and_log(self, context_length, depth_percent):
# Checks to see if you've already checked a length/percent/version.
@@ -200,27 +197,12 @@ async def evaluate_and_log(self, context_length, depth_percent):
context = await self.generate_context(context_length, depth_percent)
# Prepare your message to send to the model you're going to evaluate
- prompt = self.generate_prompt(context)
+ prompt = self.get_prompt(context)
test_start_time = time.time()
# Go see if the model can answer the question to pull out your random fact
- if self.model_provider == "OpenAI":
- response = await self.model_to_test.chat.completions.create(
- model=self.model_name,
- messages=prompt,
- max_tokens=300,
- temperature=0
- )
- response = response.choices[0].message.content
- elif self.model_provider == "Anthropic":
- response = await self.model_to_test.completions.create(
- model=self.model_name,
- max_tokens_to_sample=300,
- prompt=prompt,
- temperature=0
- )
- response = response.completion
+ response = await self.get_response_from_model(prompt)
test_end_time = time.time()
test_elapsed_time = test_end_time - test_start_time
@@ -228,50 +210,53 @@ async def evaluate_and_log(self, context_length, depth_percent):
# Compare the response to the actual needle you placed
score = self.evaluate_response(response)
-
results = {
# 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
- 'model' : self.model_to_test_description,
- 'context_length' : int(context_length),
- 'depth_percent' : float(depth_percent),
- 'version' : self.results_version,
- 'needle' : self.needle,
- 'model_response' : response,
- 'score' : score,
- 'test_duration_seconds' : test_elapsed_time,
- 'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
+ "model": self.model_to_test_description,
+ "context_length": int(context_length),
+ "depth_percent": float(depth_percent),
+ "version": self.results_version,
+ "needle": self.needle,
+ "model_response": response,
+ "score": score,
+ "test_duration_seconds": test_elapsed_time,
+ "test_timestamp_utc": datetime.now(timezone.utc).strftime(
+ "%Y-%m-%d %H:%M:%S%z"
+ ),
}
self.testing_results.append(results)
if self.print_ongoing_status:
- print (f"-- Test Summary -- ")
- print (f"Duration: {test_elapsed_time:.1f} seconds")
- print (f"Context: {context_length} tokens")
- print (f"Depth: {depth_percent}%")
- print (f"Score: {score}")
- print (f"Response: {response}\n")
+ print("-- Test Summary -- ")
+ print(f"Duration: {test_elapsed_time:.1f} seconds")
+ print(f"Context: {context_length} tokens")
+ print(f"Depth: {depth_percent}%")
+ print(f"Score: {score}")
+ print(f"Response: {response}\n")
- context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
+ context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent * 100)}'
if self.save_contexts:
- results['file_name'] : context_file_location
+ results["file_name"] = context_file_location
# Save the context to file for retesting
- if not os.path.exists('contexts'):
- os.makedirs('contexts')
+ contexts_dir = Path("contexts")
+ contexts_dir.mkdir(parents=True, exist_ok=True)
+
+ context_file_path = contexts_dir / f"{context_file_location}_context.txt"
+ context_file_path.write_text(context)
- with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
- f.write(context)
-
if self.save_results:
- # Save the context to file for retesting
- if not os.path.exists('results'):
- os.makedirs('results')
+ # Ensure the 'results' directory exists
+ results_dir = Path("results")
+ results_dir.mkdir(parents=True, exist_ok=True)
- # Save the result to file for retesting
- with open(f'results/{context_file_location}_results.json', 'w') as f:
- json.dump(results, f)
+ # Define the file path for the results file
+ results_file_path = results_dir / f"{context_file_location}_results.json"
+
+ # Serialize the results dictionary to a JSON formatted string and write to the file
+ results_file_path.write_text(json.dumps(results))
if self.seconds_to_sleep_between_completions:
await asyncio.sleep(self.seconds_to_sleep_between_completions)
@@ -281,20 +266,24 @@ def result_exists(self, context_length, depth_percent):
Checks to see if a result has already been evaluated or not
"""
- results_dir = 'results/'
- if not os.path.exists(results_dir):
+ results_dir = Path("results")
+ if not results_dir.exists():
return False
-
- for filename in os.listdir(results_dir):
- if filename.endswith('.json'):
- with open(os.path.join(results_dir, filename), 'r') as f:
- result = json.load(f)
- context_length_met = result['context_length'] == context_length
- depth_percent_met = result['depth_percent'] == depth_percent
- version_met = result.get('version', 1) == self.results_version
- model_met = result['model'] == self.model_name
- if context_length_met and depth_percent_met and version_met and model_met:
- return True
+
+ for filepath in results_dir.glob("*.json"):
+ with filepath.open("r") as f:
+ result = json.load(f)
+ context_length_met = result["context_length"] == context_length
+ depth_percent_met = result["depth_percent"] == depth_percent
+ version_met = result.get("version", 1) == self.results_version
+ model_met = result["model"] == self.model_name
+ if (
+ context_length_met
+ and depth_percent_met
+ and version_met
+ and model_met
+ ):
+ return True
return False
async def generate_context(self, context_length, depth_percent):
@@ -310,26 +299,17 @@ async def generate_context(self, context_length, depth_percent):
context = self.insert_needle(context, depth_percent, context_length)
return context
-
- def encode_text_to_tokens(self, text):
- if self.model_provider == "OpenAI":
- return self.enc.encode(text)
- elif self.model_provider == "Anthropic":
- # Assuming you have a different encoder for Anthropic
- return self.enc.encode(text).ids
- else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
+
def insert_needle(self, context, depth_percent, context_length):
- tokens_needle = self.encode_text_to_tokens(self.needle)
- tokens_context = self.encode_text_to_tokens(context)
+ tokens_needle = self.get_encoding(self.needle)
+ tokens_context = self.get_encoding(context)
# Reducing the context length by 150 buffer. This is to account for system message, the user question, and response.
context_length -= self.final_context_length_buffer
# If your context + needle are longer than the context length (which it will be), then reduce tokens from the context by the needle length
if len(tokens_context) + len(tokens_needle) > context_length:
- tokens_context = tokens_context[:context_length - len(tokens_needle)]
+ tokens_context = tokens_context[: context_length - len(tokens_needle)]
if depth_percent == 100:
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
@@ -342,8 +322,8 @@ def insert_needle(self, context, depth_percent, context_length):
tokens_new_context = tokens_context[:insertion_point]
# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
- period_tokens = self.encode_text_to_tokens('.')
-
+ period_tokens = self.get_encoding(".")
+
# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
@@ -354,10 +334,16 @@ def insert_needle(self, context, depth_percent, context_length):
tokens_new_context += tokens_needle + tokens_context[insertion_point:]
# Convert back to a string and return it
- new_context = self.decode_tokens(tokens_new_context)
+ new_context = self.get_decoding(tokens_new_context)
return new_context
def evaluate_response(self, response):
+ if self.evaluation_method == "gpt4":
+ return self.evaluate_response_gpt4(response)
+ else:
+ return self.evaluate_response_substring_match(response)
+
+ def evaluate_response_gpt4(self, response):
accuracy_criteria = {
"accuracy": """
Score 1: The answer is completely unrelated to the reference.
@@ -379,24 +365,31 @@ def evaluate_response(self, response):
eval_result = evaluator.evaluate_strings(
# The models response
prediction=response,
-
# The actual answer
reference=self.needle,
-
# The question asked
input=self.retrieval_question,
)
- return int(eval_result['score'])
+ return int(eval_result["score"])
- def get_context_length_in_tokens(self, context):
- if self.model_provider == "OpenAI":
- return len(self.enc.encode(context))
- elif self.model_provider == "Anthropic":
- # Assuming you have a different encoder for Anthropic
- return len(self.enc.encode(context).ids)
+ def evaluate_response_substring_match(self, response):
+ response_lower = response.lower()
+ if all(word in response_lower for word in self.substr_validation_words):
+ return 1
else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
+ return 0
+
+ @abstractmethod
+ def get_encoding(self, context):
+ pass
+
+ @abstractmethod
+ def get_decoding(self, encoded_context):
+ pass
+
+ def get_context_length_in_tokens(self, context):
+ return len(self.get_encoding(context))
def read_context_files(self):
context = ""
@@ -404,53 +397,38 @@ def read_context_files(self):
while self.get_context_length_in_tokens(context) < max_context_length:
for file in glob.glob(f"{self.haystack_dir}/*.txt"):
- with open(file, 'r') as f:
+ with open(file, "r") as f:
context += f.read()
return context
- def get_tokens_from_context(self, context):
- if self.model_provider == "OpenAI":
- return self.enc.encode(context)
- elif self.model_provider == "Anthropic":
- # Assuming you have a different encoder for Anthropic
- return self.enc.encode(context).ids
- else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
- def decode_tokens(self, tokens, context_length=None):
- if self.model_provider == "OpenAI":
- return self.enc.decode(tokens[:context_length])
- elif self.model_provider == "Anthropic":
- # Assuming you have a different decoder for Anthropic
- return self.enc.decode(tokens[:context_length])
- else:
- raise ValueError("model_provider must be either 'OpenAI' or 'Anthropic'")
-
def encode_and_trim(self, context, context_length):
- tokens = self.get_tokens_from_context(context)
- if len(tokens) > context_length:
- context = self.decode_tokens(tokens, context_length)
- return context
-
+ encoded_context = self.get_encoding(context)
+ return self.get_decoding(encoded_context[:context_length])
+
def get_results(self):
return self.testing_results
-
+
def print_start_test_summary(self):
- print ("\n")
- print ("Starting Needle In A Haystack Testing...")
- print (f"- Model: {self.model_name}")
- print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
- print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
- print (f"- Needle: {self.needle.strip()}")
- print ("\n\n")
+ print("\n")
+ print("Starting Needle In A Haystack Testing...")
+ print(f"- Model: {self.model_name}")
+ print(
+ f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}"
+ )
+ print(
+ f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%"
+ )
+ print(f"- Needle: {self.needle.strip()}")
+ print("\n\n")
def start_test(self):
if self.print_ongoing_status:
self.print_start_test_summary()
asyncio.run(self.run_test())
+
if __name__ == "__main__":
# Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
ht = LLMNeedleHaystackTester()
- ht.start_test()
\ No newline at end of file
+ ht.start_test()
diff --git a/OpenAIEvaluator.py b/OpenAIEvaluator.py
new file mode 100644
index 00000000..b0eef38e
--- /dev/null
+++ b/OpenAIEvaluator.py
@@ -0,0 +1,65 @@
+import os
+import tiktoken
+from LLMNeedleHaystackTester import LLMNeedleHaystackTester
+from openai import AsyncOpenAI
+
+
+class OpenAIEvaluator(LLMNeedleHaystackTester):
+ def __init__(self, **kwargs):
+ if "openai_api_key" not in kwargs and not os.getenv("OPENAI_API_KEY"):
+ raise ValueError(
+ "Either openai_api_key must be supplied with init, or OPENAI_API_KEY must be in env"
+ )
+
+ if "model_name" not in kwargs:
+ raise ValueError(
+ "model_name must be supplied with init, accepted model_names are 'gpt-4-1106-preview'"
+ )
+ elif kwargs["model_name"] not in ["gpt-4-1106-preview"]:
+ raise ValueError("Model name must be in this list (gpt-4-1106-preview)")
+
+ if "evaluation_method" not in kwargs:
+ print(
+ "since evaluation method is not specified , 'gpt4' will be used for evaluation"
+ )
+ elif kwargs["evaluation_method"] not in ("gpt4", "substring_match"):
+ raise ValueError("evaluation_method must be 'substring_match' or 'gpt4'")
+
+ self.openai_api_key = kwargs.pop("openai_api_key", os.getenv("OPENAI_API_KEY"))
+ self.model_name = kwargs["model_name"]
+ self.model_to_test_description = kwargs.pop("model_name")
+ self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+ self.model_to_test = AsyncOpenAI(api_key=self.openai_api_key)
+
+ super().__init__(**kwargs)
+
+ def get_encoding(self, context):
+ return self.tokenizer.encode(context)
+
+ def get_decoding(self, encoded_context):
+ return self.tokenizer.decode(encoded_context)
+
+ def get_prompt(self, context):
+ return [
+ {
+ "role": "system",
+ "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct",
+ },
+ {"role": "user", "content": context},
+ {
+ "role": "user",
+ "content": f"{self.retrieval_question} Don't give information outside the document or repeat your findings",
+ },
+ ]
+
+ async def get_response_from_model(self, prompt):
+ response = await self.model_to_test.chat.completions.create(
+ model=self.model_name, messages=prompt, max_tokens=300, temperature=0
+ )
+ return response.choices[0].message.content
+
+
+if __name__ == "__main__":
+ # Tons of defaults set, check out the LLMNeedleHaystackTester's init for more info
+ ht = OpenAIEvaluator(model_name="gpt-4-1106-preview", evaluation_method="gpt4")
+ ht.start_test()
diff --git a/README.md b/README.md
index 21c4e609..96ae4b85 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,24 @@ A simple 'needle in a haystack' analysis to test in-context retrieval ability of
Get the behind the scenes on the [overview video](https://youtu.be/KwRRuiCCdmc).
-![GPT-4-128 Context Testing](img/NeedleHaystackCodeSnippet.png)
```
-git clone https://github.com/gkamradt/LLMTest_NeedleInAHaystack.git
+$ git clone https://github.com/prabha-git/LLMTest_NeedleInAHaystack.git
+$ cd LLMTest_NeedleInAHaystack
+$ python -m venv venv
+$ source venv/bin/activate
+$ pip install -r requirements.txt
+$ export OPENAI_API_KEY=<>
+$ python
+
+>>> from OpenAIEvaluator import OpenAIEvaluator
+>>> openai_ht = OpenAIEvaluator(model_name='gpt-4-1106-preview', evaluation_method='gpt4')
+>>> openai_ht.start_test()
+
+Starting Needle In A Haystack Testing...
+- Model: gpt-4-1106-preview
+- Context Lengths: 35, Min: 1000, Max: 200000
+- Document Depths: 35, Min: 0%, Max: 100%
+- Needle: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.
```
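+
+To run the same test against an Anthropic model instead, a sketch (assuming `ANTHROPIC_API_KEY` is set in your environment; see `AnthropicEvaluator.py`) looks like this:
+```
+>>> from AnthropicEvaluator import AnthropicEvaluator
+>>> anthropic_ht = AnthropicEvaluator(model_name='claude-2.1', evaluation_method='substring_match')
+>>> anthropic_ht.start_test()
+```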
## The Test
@@ -35,7 +49,6 @@ The key parameters:
* `document_depth_percent_max` - The ending point of your document depths. Should be int < 100
* `document_depth_percent_intervals` - The number of iterations to do between your min/max points
* `document_depth_percent_interval_type` - Determines the distribution of depths to iterate over. 'linear' or 'sigmoid
-* `model_provider` - 'OpenAI' or 'Anthropic'
* `model_name` - The name of the model you'd like to test. Should match the exact value which needs to be passed to the api. Ex: `gpt-4-1106-preview`
* `save_results` - Whether or not you'd like to save your results to file. They will be temporarily saved in the object regardless. True/False
* `save_contexts` - Whether or not you'd like to save your contexts to file. **Warning** these will get very long. True/False
@@ -49,6 +62,24 @@ Other Parameters:
* `final_context_length_buffer` - The amount of context to take off each input to account for system messages and output tokens. This can be more intelligent but using a static value for now. Default 200 tokens.
* `seconds_to_sleep_between_completions` - Default: None, set # of seconds if you'd like to slow down your requests
* `print_ongoing_status` - Default: True, whether or not to print the status of test as they complete
+* `evaluation_method` - Default: `gpt4`. Choose between `gpt4` and simple substring matching (`substring_match`) to evaluate the model's response
+* `substr_validation_words` - Default: `['dolores', 'sandwich']`. If you choose substring evaluation of the LLM response, the presence of these keywords is checked to determine whether the response is correct
+
+
+
+#### Note on Evaluation Method (`evaluation_method`):
+
+There are two options for evaluation: `gpt4` and `substring_match`.
+
+- `gpt4`: This is the default. It uses the GPT-4 model to assess responses on a scale from 1 to 10. This method is particularly effective when dealing with a broad topic (a "large needle"), where a few hardcoded keywords may not be enough to judge the accuracy and relevance of the response.
+```
+ - Score 1: The response is completely unrelated to the reference.
+ - Score 3: The response has some relevance but does not fully align with the reference.
+ - Score 5: The response is moderately relevant but includes inaccuracies.
+ - Score 7: The response aligns well with the reference but has minor omissions.
+ - Score 10: The response is entirely accurate and aligns perfectly with the reference.
+```
+- `substring_match`: This approach is suitable for "small needles", where a predefined list of keywords can effectively determine whether the response retrieves the essential information. It provides a binary score of either 0 or 1. Opting for this method can also reduce GPT-4 API evaluation costs.
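+
+For example, a substring-match run against the default needle might look like the sketch below (if you supply your own needle, every word in `substr_validation_words` must also appear in that needle):
+```
+>>> from OpenAIEvaluator import OpenAIEvaluator
+>>> ht = OpenAIEvaluator(model_name='gpt-4-1106-preview', evaluation_method='substring_match', substr_validation_words=['dolores', 'sandwich'])
+>>> ht.start_test()
+```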
## Results Visualization
`LLMNeedleInHaystackVisualization.ipynb` holds the code to make the pivot table visualization. The pivot table was then transferred to Google Slides for custom annotations and formatting. See the [google slides version](https://docs.google.com/presentation/d/15JEdEBjm32qBbqeYM6DK6G-3mUJd7FAJu-qEzj8IYLQ/edit?usp=sharing). See an overview of how this viz was created [here](https://twitter.com/GregKamradt/status/1729573848893579488).
diff --git a/requirements.txt b/requirements.txt
index 309c34c1..f9255207 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
-anthropic==0.7.5
+anthropic==0.16.0
anyio==3.7.1
attrs==23.1.0
certifi==2023.11.17
@@ -27,6 +27,7 @@ mypy-extensions==1.0.0
numpy==1.26.2
openai==1.3.5
packaging==23.2
+pre-commit==3.6.2
pydantic==2.5.2
pydantic_core==2.14.5
python-dotenv==1.0.0