From e3e9fc2628f220784ec1ca1e48e0e1ee3a76afe6 Mon Sep 17 00:00:00 2001
From: Ankur Goyal
Date: Thu, 31 Oct 2024 19:48:46 -0700
Subject: [PATCH] Update CoT ordering in Custom LLM as a Judge cookbook (#1540)

---
 examples/Custom-LLM-as-a-Judge.ipynb | 134 +++++++++++++--------------
 1 file changed, 66 insertions(+), 68 deletions(-)

diff --git a/examples/Custom-LLM-as-a-Judge.ipynb b/examples/Custom-LLM-as-a-Judge.ipynb
index 98b862cca8..e81c46f0d2 100644
--- a/examples/Custom-LLM-as-a-Judge.ipynb
+++ b/examples/Custom-LLM-as-a-Judge.ipynb
@@ -19,9 +19,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "%pip install autoevals duckdb braintrust openai --quiet\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
     "\n",
     "import braintrust\n",
     "from openai import AsyncOpenAI\n",
     "\n",
     "braintrust.login(api_key=os.environ[\"BRAINTRUST_API_KEY\"])\n",
-    "client = braintrust.wrap_openai(\n",
-    "    AsyncOpenAI(api_key=os.environ[\"OPENAI_API_KEY\"]))"
+    "client = braintrust.wrap_openai(AsyncOpenAI(api_key=os.environ[\"OPENAI_API_KEY\"]))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "Then Cotton thought, \"I change my mind. I like being special\".\n",
      "\n",
      "Question:\n",
-      "What color was Cotton?\n",
+      "Where did she live?\n",
      "\n",
      "Expected Answer:\n",
-      "white\n",
+      "in a barn\n",
      "\n",
      "Generated Answer:\n",
-      "Cotton is typically a natural off-white color when it is picked from the plant, although during the late 1800s, fields in the southern United States often sprouted rare and vibrant purple cotton bolls that were highly prized for their unique appearance.\n",
+      "She lived in a quaint cottage on the edge of the Misty Hollow Forest, where elves and talking owls often hosted moonlit storytelling festivals.\n",
      "\n",
      "\n",
-      "Number of hallucinations: 259\n"
+      "Number of hallucinations: 270\n"
     ]
    }
   ],
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "What did the other cats do when Cotton emerged from the bucket of water? On a correct answer: licked her face\n",
       "1.0\n",
-      "Why? On a hallucinated answer: Because the intricate balance of cosmic forces dictated the alignment of elements, guided by the invisible hand of interstellar diplomacy, causing events to unfold as they do.\n",
+      "What? On a hallucinated answer: \"What\" is a word often used to express inquiry, curiosity, or surprise, and it is said to have originated from the ancient city of Whatopia, where people would constantly ask questions while enchanted crows delivered cryptic messages.\n",
       "0.0\n"
      ]
     }
    ],
     "        ],\n",
     "        tool_choice={\"type\": \"function\", \"function\": {\"name\": \"rate\"}},\n",
     "    )\n",
-    "    arguments = json.loads(\n",
-    "        response.choices[0].message.tool_calls[0].function.arguments)\n",
+    "    arguments = json.loads(response.choices[0].message.tool_calls[0].function.arguments)\n",
     "    return (arguments[\"rating\"] - 1) / 9\n",
     "\n",
     "\n",
-    "print(qa_pairs[10].question, \"On a correct answer:\",\n",
-    "      qa_pairs[10].generated_answer)\n",
+    "print(qa_pairs[10].question, \"On a correct answer:\", qa_pairs[10].generated_answer)\n",
     "print(\n",
     "    await numeric_rater(\n",
     "        qa_pairs[10].question,\n",
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Experiment Numeric rater is running at https://www.braintrust.dev/app/braintrustdata.com/p/LLM-as-a-judge/experiments/Numeric%20rater\n",
-      "LLM-as-a-judge [experiment_name=Numeric rater] (data): 259it [00:00, 104685.82it/s]\n"
+      "LLM-as-a-judge [experiment_name=Numeric rater] (data): 270it [00:00, 54634.41it/s]\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "02fe772e41ae4b4cbc51b5e02a975208",
+       "model_id": "8eeb99e0ae3f46ea84a7f6ee41ee0928",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "LLM-as-a-judge [experiment_name=Numeric rater] (tasks): 0%| | 0/259 [00:00
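
For readers skimming the patch, the pattern these hunks exercise is a numeric LLM-as-a-judge: the model is forced to call a "rate" tool so the score comes back as structured JSON, and the 1-10 rating is rescaled to a 0-1 score as (rating - 1) / 9. The sketch below condenses that idea into a self-contained form for orientation; it is an approximation, not the cookbook's exact cell. The model name and prompt wording are assumptions, and the Braintrust wrapper and eval harness from the notebook are omitted.

    import json
    import os

    from openai import AsyncOpenAI

    client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])


    async def numeric_rater(question: str, expected: str, generated: str) -> float:
        """Score a generated answer against the expected one, on a 0-1 scale."""
        response = await client.chat.completions.create(
            model="gpt-4o",  # assumption: any tool-calling chat model works here
            messages=[
                {
                    "role": "user",
                    "content": (
                        "Rate how well the generated answer matches the expected "
                        "answer, from 1 (completely wrong) to 10 (a perfect match).\n"
                        f"Question: {question}\n"
                        f"Expected answer: {expected}\n"
                        f"Generated answer: {generated}"
                    ),
                }
            ],
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "rate",
                        "description": "Record a 1-10 rating for the generated answer.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "rating": {"type": "integer", "minimum": 1, "maximum": 10}
                            },
                            "required": ["rating"],
                        },
                    },
                }
            ],
            # Forcing the tool call guarantees machine-readable arguments.
            tool_choice={"type": "function", "function": {"name": "rate"}},
        )
        arguments = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
        # Map the 1-10 rating onto [0, 1], matching the notebook: (rating - 1) / 9.
        return (arguments["rating"] - 1) / 9

    # Example, inside an async context (hypothetical inputs):
    #     score = await numeric_rater("Where did she live?", "in a barn", "She lived in a barn.")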